Imported Upstream version 2.4.1 (upstream/2.4.1)

Change-Id: I0b584f569cb0e0f4eac13cdb79e110c2dbc34bfc
author: Yonghee Han <onstudy@samsung.com> 2016-07-27 16:40:17 +0900
committer: Yonghee Han <onstudy@samsung.com> 2016-07-27 00:53:56 -0700
commit: 3158f4a51894e46ecb593bffbfd12824e1d6534a (patch)
tree: 2bef7f0238e687c5de65f48b5995ee124a95d157 /docs
parent: a3b133b0ea0696e42fd876b9a803e28bc6ef5299 (diff)
download: qemu-3158f4a51894e46ecb593bffbfd12824e1d6534a.tar.gz
qemu-3158f4a51894e46ecb593bffbfd12824e1d6534a.tar.bz2
qemu-3158f4a51894e46ecb593bffbfd12824e1d6534a.zip
19 files changed, 2856 insertions, 214 deletions
diff --git a/docs/aio_notify.promela b/docs/aio_notify.promela
index ad3f6f08b..fccc7ee1c 100644
--- a/docs/aio_notify.promela
+++ b/docs/aio_notify.promela
@@ -1,5 +1,5 @@
 /*
- * This model describes the interaction between aio_set_dispatching()
+ * This model describes the interaction between ctx->notify_me
  * and aio_notify().
  *
  * Author: Paolo Bonzini <pbonzini@redhat.com>
@@ -14,57 +14,53 @@
  *     spin -a docs/aio_notify.promela
  *     gcc -O2 pan.c
  *     ./a.out -a
+ *
+ * To verify it (with a bug planted in the model):
+ *     spin -a -DBUG docs/aio_notify.promela
+ *     gcc -O2 pan.c
+ *     ./a.out -a
  */
 
 #define MAX   4
 #define LAST  (1 << (MAX - 1))
 #define FINAL ((LAST << 1) - 1)
 
-bool dispatching;
+bool notify_me;
 bool event;
 
-int req, done;
+int req;
+int done;
 
 active proctype waiter()
 {
-     int fetch, blocking;
+    int fetch;
 
-     do
-        :: done != FINAL -> {
-            // Computing "blocking" is separate from execution of the
-            // "bottom half"
-            blocking = (req == 0);
-
-            // This is our "bottom half"
-            atomic { fetch = req; req = 0; }
-            done = done | fetch;
-
-            // Wait for a nudge from the other side
-            do
-                :: event == 1 -> { event = 0; break; }
-                :: !blocking  -> break;
-            od;
+    do
+        :: true -> {
+            notify_me++;
 
-            dispatching = 1;
+            if
+#ifndef BUG
+                :: (req > 0) -> skip;
+#endif
+                :: else ->
+                    // Wait for a nudge from the other side
+                    do
+                        :: event == 1 -> { event = 0; break; }
+                    od;
+            fi;
 
-            // If you are simulating this model, you may want to add
-            // something like this here:
-            //
-            //      int foo; foo++; foo++; foo++;
-            //
-            // This only wastes some time and makes it more likely
-            // that the notifier process hits the "fast path".
+            notify_me--;
 
-            dispatching = 0;
+            atomic { fetch = req; req = 0; }
+            done = done | fetch;
         }
-        :: else -> break;
     od
 }
 
 active proctype notifier()
 {
     int next = 1;
-    int sets = 0;
 
     do
         :: next <= LAST -> {
@@ -74,8 +70,8 @@ active proctype notifier()
 
             // aio_notify
             if
-                :: dispatching == 0 -> sets++; event = 1;
-                :: else             -> skip;
+                :: notify_me == 1 -> event = 1;
+                :: else           -> printf("Skipped event_notifier_set\n"); skip;
             fi;
 
             // Test both synchronous and asynchronous delivery
@@ -86,19 +82,12 @@ active proctype notifier()
                 :: 1 -> skip;
             fi;
         }
-        :: else -> break;
     od;
-    printf("Skipped %d event_notifier_set\n", MAX - sets);
 }
 
-#define p (done == FINAL)
-
-never  {
-    do
-        :: 1                      // after an arbitrarily long prefix
-        :: p -> break             // p becomes true
-    od;
-    do
-        :: !p -> accept: break    // it then must remains true forever after
-    od
+never { /* [] done < FINAL */
+accept_init:
+        do
+        :: done < FINAL -> skip;
+        od;
 }
diff --git a/docs/aio_notify_accept.promela b/docs/aio_notify_accept.promela
new file mode 100644
index 000000000..9cef2c955
--- /dev/null
+++ b/docs/aio_notify_accept.promela
@@ -0,0 +1,152 @@
+/*
+ * This model describes the interaction between ctx->notified
+ * and ctx->notifier.
+ *
+ * Author: Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This file is in the public domain.  If you really want a license,
+ * the WTFPL will do.
+ *
+ * To verify the buggy version:
+ *     spin -a -DBUG1 docs/aio_notify_accept.promela
+ *     gcc -O2 pan.c
+ *     ./a.out -a -f
+ * (or -DBUG2)
+ *
+ * To verify the fixed version:
+ *     spin -a docs/aio_notify_accept.promela
+ *     gcc -O2 pan.c
+ *     ./a.out -a -f
+ *
+ * Add -DCHECK_REQ to test an alternative invariant and the
+ * "notify_me" optimization.
+ */
+
+int notify_me;
+bool notified;
+bool event;
+bool req;
+bool notifier_done;
+
+#ifdef CHECK_REQ
+#define USE_NOTIFY_ME 1
+#else
+#define USE_NOTIFY_ME 0
+#endif
+
+#ifdef BUG
+#error Please define BUG1 or BUG2 instead.
+#endif
+
+active proctype notifier()
+{
+    do
+        :: true -> {
+            req = 1;
+            if
+               :: !USE_NOTIFY_ME || notify_me ->
+#if defined BUG1
+                   /* CHECK_REQ does not detect this bug! */
+                   notified = 1;
+                   event = 1;
+#elif defined BUG2
+                   if
+                      :: !notified -> event = 1;
+                      :: else -> skip;
+                   fi;
+                   notified = 1;
+#else
+                   event = 1;
+                   notified = 1;
+#endif
+               :: else -> skip;
+            fi
+        }
+        :: true -> break;
+    od;
+    notifier_done = 1;
+}
+
+#define AIO_POLL                                                    \
+    notify_me++;                                                    \
+    if                                                              \
+        :: !req -> {                                                \
+            if                                                      \
+                :: event -> skip;                                   \
+            fi;                                                     \
+        }                                                           \
+        :: else -> skip;                                            \
+    fi;                                                             \
+    notify_me--;                                                    \
+                                                                    \
+    atomic { old = notified; notified = 0; }                        \
+    if                                                              \
+       :: old -> event = 0;                                         \
+       :: else -> skip;                                             \
+    fi;                                                             \
+                                                                    \
+    req = 0;
+
+active proctype waiter()
+{
+    bool old;
+
+    do
+       :: true -> AIO_POLL;
+    od;
+}
+
+/* Same as waiter(), but disappears after a while.  */
+active proctype temporary_waiter()
+{
+    bool old;
+
+    do
+       :: true -> AIO_POLL;
+       :: true -> break;
+    od;
+}
+
+#ifdef CHECK_REQ
+never {
+    do
+        :: req -> goto accept_if_req_not_eventually_false;
+        :: true -> skip;
+    od;
+
+accept_if_req_not_eventually_false:
+    if
+        :: req -> goto accept_if_req_not_eventually_false;
+    fi;
+    assert(0);
+}
+
+#else
+/* There must be infinitely many transitions of event as long
+ * as the notifier does not exit.
+ *
+ * If event stayed always true, the waiters would be busy looping.
+ * If event stayed always false, the waiters would be sleeping
+ * forever.
+ */
+never {
+    do
+        :: !event    -> goto accept_if_event_not_eventually_true;
+        :: event     -> goto accept_if_event_not_eventually_false;
+        :: true      -> skip;
+    od;
+
+accept_if_event_not_eventually_true:
+    if
+        :: !event && notifier_done  -> do :: true -> skip; od;
+        :: !event && !notifier_done -> goto accept_if_event_not_eventually_true;
+    fi;
+    assert(0);
+
+accept_if_event_not_eventually_false:
+    if
+        :: event     -> goto accept_if_event_not_eventually_false;
+    fi;
+    assert(0);
+}
+#endif
diff --git a/docs/aio_notify_bug.promela b/docs/aio_notify_bug.promela
new file mode 100644
index 000000000..b3bfca1ca
--- /dev/null
+++ b/docs/aio_notify_bug.promela
@@ -0,0 +1,140 @@
+/*
+ * This model describes a bug in aio_notify.  If ctx->notifier is
+ * cleared too late, a wakeup could be lost.
+ *
+ * Author: Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This file is in the public domain.  If you really want a license,
+ * the WTFPL will do.
+ *
+ * To verify the buggy version:
+ *     spin -a -DBUG docs/aio_notify_bug.promela
+ *     gcc -O2 pan.c
+ *     ./a.out -a -f
+ *
+ * To verify the fixed version:
+ *     spin -a docs/aio_notify_bug.promela
+ *     gcc -O2 pan.c
+ *     ./a.out -a -f
+ *
+ * Add -DCHECK_REQ to test an alternative invariant and the
+ * "notify_me" optimization.
+ */
+
+int notify_me;
+bool event;
+bool req;
+bool notifier_done;
+
+#ifdef CHECK_REQ
+#define USE_NOTIFY_ME 1
+#else
+#define USE_NOTIFY_ME 0
+#endif
+
+active proctype notifier()
+{
+    do
+        :: true -> {
+            req = 1;
+            if
+               :: !USE_NOTIFY_ME || notify_me -> event = 1;
+               :: else -> skip;
+            fi
+        }
+        :: true -> break;
+    od;
+    notifier_done = 1;
+}
+
+#ifdef BUG
+#define AIO_POLL                                                    \
+    notify_me++;                                                    \
+    if                                                              \
+        :: !req -> {                                                \
+            if                                                      \
+                :: event -> skip;                                   \
+            fi;                                                     \
+        }                                                           \
+        :: else -> skip;                                            \
+    fi;                                                             \
+    notify_me--;                                                    \
+                                                                    \
+    req = 0;                                                        \
+    event = 0;
+#else
+#define AIO_POLL                                                    \
+    notify_me++;                                                    \
+    if                                                              \
+        :: !req -> {                                                \
+            if                                                      \
+                :: event -> skip;                                   \
+            fi;                                                     \
+        }                                                           \
+        :: else -> skip;                                            \
+    fi;                                                             \
+    notify_me--;                                                    \
+                                                                    \
+    event = 0;                                                      \
+    req = 0;
+#endif
+
+active proctype waiter()
+{
+    do
+       :: true -> AIO_POLL;
+    od;
+}
+
+/* Same as waiter(), but disappears after a while.  */
+active proctype temporary_waiter()
+{
+    do
+       :: true -> AIO_POLL;
+       :: true -> break;
+    od;
+}
+
+#ifdef CHECK_REQ
+never {
+    do
+        :: req -> goto accept_if_req_not_eventually_false;
+        :: true -> skip;
+    od;
+
+accept_if_req_not_eventually_false:
+    if
+        :: req -> goto accept_if_req_not_eventually_false;
+    fi;
+    assert(0);
+}
+
+#else
+/* There must be infinitely many transitions of event as long
+ * as the notifier does not exit.
+ *
+ * If event stayed always true, the waiters would be busy looping.
+ * If event stayed always false, the waiters would be sleeping
+ * forever.
+ */
+never {
+    do
+        :: !event    -> goto accept_if_event_not_eventually_true;
+        :: event     -> goto accept_if_event_not_eventually_false;
+        :: true      -> skip;
+    od;
+
+accept_if_event_not_eventually_true:
+    if
+        :: !event && notifier_done  -> do :: true -> skip; od;
+        :: !event && !notifier_done -> goto accept_if_event_not_eventually_true;
+    fi;
+    assert(0);
+
+accept_if_event_not_eventually_false:
+    if
+        :: event     -> goto accept_if_event_not_eventually_false;
+    fi;
+    assert(0);
+}
+#endif
diff --git a/docs/atomics.txt b/docs/atomics.txt
index 6f2997bc6..ef285e3c2 100644
--- a/docs/atomics.txt
+++ b/docs/atomics.txt
@@ -281,7 +281,7 @@ note that the other barrier may actually be in a driver that runs in
 the guest!
 
 For the purposes of pairing, smp_read_barrier_depends() and smp_rmb()
-both count as read barriers.  A read barriers shall pair with a write
+both count as read barriers.  A read barrier shall pair with a write
 barrier or a full barrier; a write barrier shall pair with a read
 barrier or a full barrier.  A full barrier can pair with anything.
 For example:
@@ -294,7 +294,7 @@ For example:
                              smp_rmb();
                              y = a;
 
-Note that the "writing" thread are accessing the variables in the
+Note that the "writing" thread is accessing the variables in the
 opposite order as the "reading" thread.  This is expected: stores
 before the write barrier will normally match the loads after the
 read barrier, and vice versa.  The same is true for more than 2
diff --git a/docs/bitmaps.md b/docs/bitmaps.md
new file mode 100644
index 000000000..fa87f077f
--- /dev/null
+++ b/docs/bitmaps.md
@@ -0,0 +1,352 @@
+<!--
+Copyright 2015 John Snow <jsnow@redhat.com> and Red Hat, Inc.
+All rights reserved.
+
+This file is licensed via The FreeBSD Documentation License, the full text of
+which is included at the end of this document.
+-->
+
+# Dirty Bitmaps and Incremental Backup
+
+* Dirty Bitmaps are objects that track which data needs to be backed up for the
+  next incremental backup.
+
+* Dirty bitmaps can be created at any time and attached to any node
+  (not just complete drives.)
+
+## Dirty Bitmap Names
+
+* A dirty bitmap's name is unique to the node, but bitmaps attached to different
+  nodes can share the same name.
+
+## Bitmap Modes
+
+* A Bitmap can be "frozen," which means that it is currently in-use by a backup
+  operation and cannot be deleted, renamed, written to, reset,
+  etc.
+
+## Basic QMP Usage
+
+### Supported Commands ###
+
+* block-dirty-bitmap-add
+* block-dirty-bitmap-remove
+* block-dirty-bitmap-clear
+
+### Creation
+
+* To create a new bitmap, enabled, on the drive with id=drive0:
+
+```json
+{ "execute": "block-dirty-bitmap-add",
+  "arguments": {
+    "node": "drive0",
+    "name": "bitmap0"
+  }
+}
+```
+
+* This bitmap will have a default granularity that matches the cluster size of
+  its associated drive, if available, clamped to the range [4KiB, 64KiB].
+  The current default for qcow2 is 64KiB.
+
+* To create a new bitmap that tracks changes in 32KiB segments:
+
+```json
+{ "execute": "block-dirty-bitmap-add",
+  "arguments": {
+    "node": "drive0",
+    "name": "bitmap0",
+    "granularity": 32768
+  }
+}
+```
+
+### Deletion
+
+* Bitmaps that are frozen cannot be deleted.
+
+* Deleting the bitmap does not impact any other bitmaps attached to the same
+  node, nor does it affect any backups already created from this node.
+
+* Because bitmaps are only unique to the node to which they are attached,
+  you must specify the node/drive name here, too.
+
+```json
+{ "execute": "block-dirty-bitmap-remove",
+  "arguments": {
+    "node": "drive0",
+    "name": "bitmap0"
+  }
+}
+```
+
+### Resetting
+
+* Resetting a bitmap will clear all information it holds.
+
+* An incremental backup created from an empty bitmap will copy no data,
+  as if nothing has changed.
+
+```json
+{ "execute": "block-dirty-bitmap-clear",
+  "arguments": {
+    "node": "drive0",
+    "name": "bitmap0"
+  }
+}
+```
+
+## Transactions (Not yet implemented)
+
+* Transactional commands are forthcoming in a future version,
+  and are not yet available for use. This section serves as
+  documentation of intent for their design and usage.
+
+### Justification
+
+Bitmaps can be safely modified when the VM is paused or halted by using
+the basic QMP commands. For instance, you might perform the following actions:
+
+1. Boot the VM in a paused state.
+2. Create a full drive backup of drive0.
+3. Create a new bitmap attached to drive0.
+4. Resume execution of the VM.
+5. Incremental backups are ready to be created.
+
+At this point, the bitmap and drive backup would be correctly in sync,
+and incremental backups made from this point forward would be correctly aligned
+to the full drive backup.
+
+This is not particularly useful if we decide we want to start incremental
+backups after the VM has been running for a while, for which we will need to
+perform actions such as the following:
+
+1. Boot the VM and begin execution.
+2. Using a single transaction, perform the following operations:
+    * Create bitmap0.
+    * Create a full drive backup of drive0.
+3. Incremental backups are now ready to be created.
+
+### Supported Bitmap Transactions
+
+* block-dirty-bitmap-add
+* block-dirty-bitmap-clear
+
+The usages are identical to their respective QMP commands, but see below
+for examples.
+
+### Example: New Incremental Backup
+
+As outlined in the justification, perhaps we want to create a new incremental
+backup chain attached to a drive.
+
+```json
+{ "execute": "transaction",
+  "arguments": {
+    "actions": [
+      {"type": "block-dirty-bitmap-add",
+       "data": {"node": "drive0", "name": "bitmap0"} },
+      {"type": "drive-backup",
+       "data": {"device": "drive0", "target": "/path/to/full_backup.img",
+                "sync": "full", "format": "qcow2"} }
+    ]
+  }
+}
+```
+
+### Example: New Incremental Backup Anchor Point
+
+Maybe we just want to create a new full backup with an existing bitmap and
+want to reset the bitmap to track the new chain.
+
+```json
+{ "execute": "transaction",
+  "arguments": {
+    "actions": [
+      {"type": "block-dirty-bitmap-clear",
+       "data": {"node": "drive0", "name": "bitmap0"} },
+      {"type": "drive-backup",
+       "data": {"device": "drive0", "target": "/path/to/new_full_backup.img",
+                "sync": "full", "format": "qcow2"} }
+    ]
+  }
+}
+```
+
+## Incremental Backups
+
+The star of the show.
+
+**Nota Bene!** Only incremental backups of entire drives are supported for now.
+So despite the fact that you can attach a bitmap to any arbitrary node, bitmaps
+are currently only useful when attached to the root node. This is because
+drive-backup only supports drives/devices instead of arbitrary nodes.
+
+### Example: First Incremental Backup
+
+1. Create a full backup and sync it to the dirty bitmap, as in the transactional
+examples above; or with the VM offline, manually create a full copy and then
+create a new bitmap before the VM begins execution.
+
+    * Let's assume the full backup is named 'full_backup.img'.
+    * Let's assume the bitmap you created is 'bitmap0' attached to 'drive0'.
+
+2. Create a destination image for the incremental backup that utilizes the
+full backup as a backing image.
+
+    * Let's assume it is named 'incremental.0.img'.
+
+    ```sh
+    # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2
+    ```
+
+3. Issue the incremental backup command:
+
+    ```json
+    { "execute": "drive-backup",
+      "arguments": {
+        "device": "drive0",
+        "bitmap": "bitmap0",
+        "target": "incremental.0.img",
+        "format": "qcow2",
+        "sync": "incremental",
+        "mode": "existing"
+      }
+    }
+    ```
+
+### Example: Second Incremental Backup
+
+1. Create a new destination image for the incremental backup that points to the
+   previous one, e.g.: 'incremental.1.img'
+
+    ```sh
+    # qemu-img create -f qcow2 incremental.1.img -b incremental.0.img -F qcow2
+    ```
+
+2. Issue a new incremental backup command. The only difference here is that we
+   have changed the target image below.
+
+    ```json
+    { "execute": "drive-backup",
+      "arguments": {
+        "device": "drive0",
+        "bitmap": "bitmap0",
+        "target": "incremental.1.img",
+        "format": "qcow2",
+        "sync": "incremental",
+        "mode": "existing"
+      }
+    }
+    ```
+
+## Errors
+
+* In the event of an error that occurs after a backup job is successfully
+  launched, either by a direct QMP command or a QMP transaction, the user
+  will receive a BLOCK_JOB_COMPLETED event with a failure message, accompanied
+  by a BLOCK_JOB_ERROR event.
+
+* In the case of a job being cancelled, the user will receive a
+  BLOCK_JOB_CANCELLED event instead of a pair of COMPLETED and ERROR events.
+
+* In either case, the incremental backup data contained within the bitmap is
+  safely rolled back, and the data within the bitmap is not lost. The image
+  file created for the failed attempt can be safely deleted.
+
+* Once the underlying problem is fixed (e.g. more storage space is freed up),
+  you can simply retry the incremental backup command with the same bitmap.
+
+### Example
+
+1. Create a target image:
+
+    ```sh
+    # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2
+    ```
+
+2. Attempt to create an incremental backup via QMP:
+
+    ```json
+    { "execute": "drive-backup",
+      "arguments": {
+        "device": "drive0",
+        "bitmap": "bitmap0",
+        "target": "incremental.0.img",
+        "format": "qcow2",
+        "sync": "incremental",
+        "mode": "existing"
+      }
+    }
+    ```
+
+3. Receive an event notifying us of failure:
+
+    ```json
+    { "timestamp": { "seconds": 1424709442, "microseconds": 844524 },
+      "data": { "speed": 0, "offset": 0, "len": 67108864,
+                "error": "No space left on device",
+                "device": "drive0", "type": "backup" },
+      "event": "BLOCK_JOB_COMPLETED" }
+    ```
+
+4. Delete the failed incremental, and re-create the image.
+
+    ```sh
+    # rm incremental.0.img
+    # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2
+    ```
+
+5. Retry the command after fixing the underlying problem,
+   such as freeing up space on the backup volume:
+
+    ```json
+    { "execute": "drive-backup",
+      "arguments": {
+        "device": "drive0",
+        "bitmap": "bitmap0",
+        "target": "incremental.0.img",
+        "format": "qcow2",
+        "sync": "incremental",
+        "mode": "existing"
+      }
+    }
+    ```
+
+6. Receive confirmation that the job completed successfully:
+
+    ```json
+    { "timestamp": { "seconds": 1424709668, "microseconds": 526525 },
+      "data": { "device": "drive0", "type": "backup",
+                "speed": 0, "len": 67108864, "offset": 67108864},
+      "event": "BLOCK_JOB_COMPLETED" }
+    ```
+
+<!--
+The FreeBSD Documentation License
+
+Redistribution and use in source (Markdown) and 'compiled' forms (SGML, HTML,
+PDF, PostScript, RTF and so forth) with or without modification, are permitted
+provided that the following conditions are met:
+
+Redistributions of source code (Markdown) must retain the above copyright
+notice, this list of conditions and the following disclaimer of this file
+unmodified.
+
+Redistributions in compiled form (transformed to other DTDs, converted to PDF,
+PostScript, RTF and other formats) must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation and/or
+other materials provided with the distribution.
+
+THIS DOCUMENTATION IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR  PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS  BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+THIS DOCUMENTATION, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+-->
diff --git a/docs/memory-hotplug.txt b/docs/memory-hotplug.txt
index f70571df0..56bdd0a47 100644
--- a/docs/memory-hotplug.txt
+++ b/docs/memory-hotplug.txt
@@ -4,9 +4,7 @@ QEMU memory hotplug
 This document explains how to use the memory hotplug feature in QEMU,
 which is present since v2.1.0.
 
-Please, note that memory hotunplug is not supported yet. This means
-that you're able to add memory, but you're not able to remove it.
-Also, proper guest support is required for memory hotplug to work.
+Guest support is required for memory hotplug to work.
 
 Basic RAM hotplug
 -----------------
@@ -74,3 +72,22 @@ comes from regular RAM, 1GB is a 1GB hugepage page and 256MB is from
    -device pc-dimm,id=dimm1,memdev=mem1 \
    -object memory-backend-file,id=mem2,size=256M,mem-path=/mnt/hugepages-2MB \
    -device pc-dimm,id=dimm2,memdev=mem2
+
+
+RAM hot-unplug
+---------------
+
+In order to be able to hot unplug a pc-dimm device, QEMU has to be told the ids
+of the pc-dimm device and its memory backend object. The ids were assigned when
+you hot plugged the memory.
+
+Two monitor commands are used to hot unplug memory:
+
+ - "device_del": deletes a front-end pc-dimm device
+ - "object_del": deletes a memory backend object
+
+For example, assuming that the pc-dimm device with id "dimm1" exists, and its memory
+backend is "mem1", the following commands try to remove it.
+
+  (qemu) device_del dimm1
+  (qemu) object_del mem1
diff --git a/docs/migration.txt b/docs/migration.txt
index 0492a4547..f6df4beb2 100644
--- a/docs/migration.txt
+++ b/docs/migration.txt
@@ -257,6 +257,7 @@ const VMStateDescription vmstate_ide_drive_pio_state = {
     .minimum_version_id = 1,
     .pre_save = ide_drive_pio_pre_save,
     .post_load = ide_drive_pio_post_load,
+    .needed = ide_drive_pio_state_needed,
     .fields = (VMStateField[]) {
         VMSTATE_INT32(req_nb_sectors, IDEState),
         VMSTATE_VARRAY_INT32(io_buffer, IDEState, io_buffer_total_len, 1,
@@ -279,13 +280,9 @@ const VMStateDescription vmstate_ide_drive = {
         .... several fields ....
         VMSTATE_END_OF_LIST()
     },
-    .subsections = (VMStateSubsection []) {
-        {
-            .vmsd = &vmstate_ide_drive_pio_state,
-            .needed = ide_drive_pio_state_needed,
-        }, {
-            /* empty */
-        }
+    .subsections = (const VMStateDescription*[]) {
+        &vmstate_ide_drive_pio_state,
+        NULL
     }
 };
 
diff --git a/docs/multi-thread-compression.txt b/docs/multi-thread-compression.txt
new file mode 100644
index 000000000..3d477c3bd
--- /dev/null
+++ b/docs/multi-thread-compression.txt
@@ -0,0 +1,149 @@
+Use multiple thread (de)compression in live migration
+=====================================================
+Copyright (C) 2015 Intel Corporation
+Author: Liang Li <liang.z.li@intel.com>
+
+This work is licensed under the terms of the GNU GPLv2 or later. See
+the COPYING file in the top-level directory.
+
+Contents:
+=========
+* Introduction
+* When to use
+* Performance
+* Usage
+* TODO
+
+Introduction
+============
+Instead of sending the guest memory directly, this solution will
+compress the RAM page before sending; after receiving, the data will
+be decompressed. Using compression in live migration can help
+to reduce the data transferred by about 60%; this is very useful when the
+bandwidth is limited. The total migration time can also be reduced by
+about 70% in a typical case. In addition to this, the VM downtime can be
+reduced by about 50%. The benefit depends on data compressibility in the VM.
+
+The process of compression will consume additional CPU cycles, and the
+extra CPU cycles will increase the migration time. On the other hand,
+the amount of data transferred will decrease; this factor can reduce
+the total migration time. If the process of the compression is quick
+enough, then the total migration time can be reduced, and multiple
+thread compression can be used to accelerate the compression process.
+
+The decompression speed of Zlib is at least 4 times as quick as
+compression. If the source and destination CPUs have equal speed,
+keeping the compression thread count 4 times the decompression
+thread count can avoid resource waste.
+
+Compression level can be used to control the compression speed and the
+compression ratio. High compression ratio will take more time, level 0
+stands for no compression, level 1 stands for the best compression
+speed, and level 9 stands for the best compression ratio. Users can
+select a level number between 0 and 9.
+
+
+When to use the multiple thread compression in live migration
+=============================================================
+Compression of data will consume extra CPU cycles; so in a system with
+high overhead of CPU, avoid using this feature. When the network
+bandwidth is very limited and the CPU resource is adequate, use of
+multiple thread compression will be very helpful. If both the CPU and
+the network bandwidth are adequate, use of multiple thread compression
+can still help to reduce the migration time.
+
+Performance
+===========
+Test environment:
+
+CPU: Intel(R) Xeon(R) CPU E5-2680 0 @ 2.70GHz
+Socket Count: 2
+RAM: 128G
+NIC: Intel I350 (10/100/1000Mbps)
+Host OS: CentOS 7 64-bit
+Guest OS: RHEL 6.5 64-bit
+Parameter: qemu-system-x86_64 -enable-kvm -smp 4 -m 4096
+ /share/ia32e_rhel6u5.qcow -monitor stdio
+
+There is no additional application running on the guest when doing
+the test.
+
+
+Speed limit: 1000Gb/s
+---------------------------------------------------------------
+                    | original  | compress thread: 8
+                    |   way     | decompress thread: 2
+                    |           | compression level: 1
+---------------------------------------------------------------
+total time(msec):   |   3333    |  1833
+---------------------------------------------------------------
+downtime(msec):     |    100    |   27
+---------------------------------------------------------------
+transferred ram(kB):|  363536   | 107819
+---------------------------------------------------------------
+throughput(mbps):   |  893.73   | 482.22
+---------------------------------------------------------------
+total ram(kB):      |  4211524  | 4211524
+---------------------------------------------------------------
+
+There is an application running on the guest which writes random numbers
+to RAM block areas periodically.
+
+Speed limit: 1000Gb/s
+---------------------------------------------------------------
+                    | original  | compress thread: 8
+                    |   way     | decompress thread: 2
+                    |           | compression level: 1
+---------------------------------------------------------------
+total time(msec):   |   37369   | 15989
+---------------------------------------------------------------
+downtime(msec):     |    337    |  173
+---------------------------------------------------------------
+transferred ram(kB):|  4274143  | 1699824
+---------------------------------------------------------------
+throughput(mbps):   |  936.99   | 870.95
+---------------------------------------------------------------
+total ram(kB):      |  4211524  | 4211524
+---------------------------------------------------------------
+
+Usage
+=====
+1. Verify both the source and destination QEMU are able
+to support the multiple thread compression migration:
+    {qemu} info migrate_capabilities
+    {qemu} ... compress: off ...
+
+2. Activate compression on the source:
+    {qemu} migrate_set_capability compress on
+
+3. Set the compression thread count on source:
+    {qemu} migrate_set_parameter compress_threads 12
+
+4. Set the compression level on the source:
+    {qemu} migrate_set_parameter compress_level 1
+
+5. Set the decompression thread count on destination:
+    {qemu} migrate_set_parameter decompress_threads 3
+
+6. Start outgoing migration:
+    {qemu} migrate -d tcp:destination.host:4444
+    {qemu} info migrate
+    Capabilities: ... compress: on
+    ...
+
+The following are the default settings:
+    compress: off
+    compress_threads: 8
+    decompress_threads: 2
+    compress_level: 1 (which means best speed)
+
+So, only the first two steps are required to use the multiple
+thread compression in migration. You can do more if the default
+settings are not appropriate.
+
+TODO
+====
+Some faster (de)compression methods such as LZ4 and Quicklz can help
+to reduce the CPU consumption when doing (de)compression. If using
+these faster (de)compression methods, fewer (de)compression threads
+are needed when doing the migration.
diff --git a/docs/multiseat.txt b/docs/multiseat.txt
index b963665ef..ebf244693 100644
--- a/docs/multiseat.txt
+++ b/docs/multiseat.txt
@@ -2,8 +2,8 @@
 multiseat howto (with some multihead coverage)
 ==============================================
 
-host side
----------
+host devices
+------------
 
 First you must compile qemu with a user interface supporting
 multihead/multiseat and input event routing.  Right now this
@@ -41,6 +41,19 @@ The "display=video2" sets up the input routing.  Any input coming from
 the window which belongs to the video.2 display adapter will be routed
 to these input devices.
 
+Starting with qemu 2.4 and linux kernel 4.1 you can also use virtio
+for the input devices, using this ...
+
+	-device pci-bridge,addr=12.0,chassis_nr=2,id=head.2 \
+	-device secondary-vga,bus=head.2,addr=02.0,id=video.2 \
+	-device virtio-keyboard-pci,bus=head.2,addr=03.0,display=video.2 \
+	-device virtio-tablet-pci,bus=head.2,addr=04.0,display=video.2
+
+... instead of xhci and usb hid devices.
+
+host ui
+-------
+
 The sdl2 ui will start up with two windows, one for each display
 device.  The gtk ui will start with a single window and each display
 in a separate tab.  You can either simply switch tabs to switch heads,
@@ -106,6 +119,26 @@ the devices attached to the seat.
 Background info is here:
   http://www.freedesktop.org/wiki/Software/systemd/multiseat/
 
+
+guest side with pci-bridge-seat
+-------------------------------
+
+Qemu version 2.4 and newer has a new pci-bridge-seat device which
+can be used instead of pci-bridge.  Just swap the device name in the
+qemu command line above.  The only difference between the two devices
+is the pci id.  We can match the pci id instead of the device path
+with a nice generic rule now, which simplifies the guest
+configuration:
+
+    [root@fedora ~]# cat /etc/udev/rules.d/70-qemu-pci-bridge-seat.rules
+    SUBSYSTEM=="pci", ATTR{vendor}=="0x1b36", ATTR{device}=="0x000a", \
+            TAG+="seat", ENV{ID_AUTOSEAT}="1"
+
+A patch with this rule has been submitted to upstream udev/systemd, was
+accepted and should be included in the next systemd release (222).
+So, if your guest has this or a newer version, multiseat will work just
+fine without any manual guest configuration.
+
 Enjoy!
 
 --
diff --git a/docs/pci_expander_bridge.txt b/docs/pci_expander_bridge.txt
new file mode 100644
index 000000000..d7913fb4a
--- /dev/null
+++ b/docs/pci_expander_bridge.txt
@@ -0,0 +1,58 @@
+PCI EXPANDER BRIDGE (PXB)
+=========================
+
+Description
+===========
+PXB is a "light-weight" host bridge in the same PCI domain
+as the main host bridge whose purpose is to enable
+the main host bridge to support multiple PCI root buses.
+It is implemented only for i440fx and can be placed only
+on bus 0 (pci.0).
+
+As opposed to PCI-2-PCI bridge's secondary bus, PXB's bus
+is a primary bus and can be associated with a NUMA node
+(different from the main host bridge) allowing the guest OS
+to recognize the proximity of a pass-through device to
+other resources such as RAM and CPUs.
+
+Usage
+=====
+A detailed command line would be:
+
+[qemu-bin + storage options]
+-m 2G
+-object memory-backend-ram,size=1024M,policy=bind,host-nodes=0,id=ram-node0 -numa node,nodeid=0,cpus=0,memdev=ram-node0
+-object memory-backend-ram,size=1024M,policy=bind,host-nodes=1,id=ram-node1 -numa node,nodeid=1,cpus=1,memdev=ram-node1
+-device pxb,id=bridge1,bus=pci.0,numa_node=1,bus_nr=4 -netdev user,id=nd -device e1000,bus=bridge1,addr=0x4,netdev=nd
+-device pxb,id=bridge2,bus=pci.0,numa_node=0,bus_nr=8 -device e1000,bus=bridge2,addr=0x3
+-device pxb,id=bridge3,bus=pci.0,bus_nr=40 -drive if=none,id=drive0,file=[img] -device virtio-blk-pci,drive=drive0,scsi=off,bus=bridge3,addr=1
+
+Here you have:
+ - 2 NUMA nodes for the guest, 0 and 1. (both mapped to the same NUMA node in host, but you can and should put it in different host NUMA nodes)
+ - a pxb host bridge attached to NUMA 1 with an e1000 behind it
+ - a pxb host bridge attached to NUMA 0 with an e1000 behind it
+ - a pxb host bridge not attached to any NUMA with a hard drive behind it.
+
+Limitations
+===========
+Please observe that we specified the bus "pci.0" for the second and third pxb.
+This is because when no bus is given, another pxb can be selected by QEMU as the
+default bus; however, PXBs can be placed only under the root bus.
+
+Implementation
+==============
+The PXB is composed of:
+- HostBridge (TYPE_PXB_HOST)
+  The host bridge allows registering and querying the PXB's PCI root bus in QEMU.
+- PXBDev(TYPE_PXB_DEVICE)
+  It is a regular PCI Device that resides on the piix host-bridge bus and its bus uses the same PCI domain.
+  However, the bus behind is exposed through ACPI as a primary PCI bus and starts a new PCI hierarchy.
+  The interrupts from devices behind the PXB are routed through this device the same as if it were a
+  PCI-2-PCI bridge. The _PRT follows the i440fx model.
+- PCIBridgeDev(TYPE_PCI_BRIDGE_DEV)
+  Created automatically as part of init sequence.
+  When adding a device to PXB it is attached to the bridge for two reasons:
+  - Using the bridge will enable hotplug support
+  - All the devices behind the bridge will use bridge's IO/MEM windows compacting
+    the PCI address space.
+
diff --git a/docs/qapi-code-gen.txt b/docs/qapi-code-gen.txt
index 8313ba6af..61b5be47f 100644
--- a/docs/qapi-code-gen.txt
+++ b/docs/qapi-code-gen.txt
@@ -1,61 +1,193 @@
 = How to use the QAPI code generator =
 
-QAPI is a native C API within QEMU which provides management-level
-functionality to internal/external users. For external
-users/processes, this interface is made available by a JSON-based
-QEMU Monitor protocol that is provided by the QMP server.
-
-To map QMP-defined interfaces to the native C QAPI implementations,
-a JSON-based schema is used to define types and function
-signatures, and a set of scripts is used to generate types/signatures,
-and marshaling/dispatch code. The QEMU Guest Agent also uses these
-scripts, paired with a separate schema, to generate
-marshaling/dispatch code for the guest agent server running in the
-guest.
+Copyright IBM Corp. 2011
+Copyright (C) 2012-2015 Red Hat, Inc.
 
-This document will describe how the schemas, scripts, and resulting
-code are used.
+This work is licensed under the terms of the GNU GPL, version 2 or
+later. See the COPYING file in the top-level directory.
 
+== Introduction ==
 
-== QMP/Guest agent schema ==
-
-This file defines the types, commands, and events used by QMP.  It should
-fully describe the interface used by QMP.
+QAPI is a native C API within QEMU which provides management-level
+functionality to internal and external users. For external
+users/processes, this interface is made available by a JSON-based wire
+format for the QEMU Monitor Protocol (QMP) for controlling qemu, as
+well as the QEMU Guest Agent (QGA) for communicating with the guest.
+The remainder of this document uses "Client JSON Protocol" when
+referring to the wire contents of a QMP or QGA connection.
 
-This file is designed to be loosely based on JSON although it's technically
-executable Python.  While dictionaries are used, they are parsed as
-OrderedDicts so that ordering is preserved.
+To map Client JSON Protocol interfaces to the native C QAPI
+implementations, a JSON-based schema is used to define types and
+function signatures, and a set of scripts is used to generate types,
+signatures, and marshaling/dispatch code. This document will describe
+how the schemas, scripts, and resulting code are used.
 
-There are two basic syntaxes used, type definitions and command definitions.
 
-The first syntax defines a type and is represented by a dictionary.  There are
-three kinds of user-defined types that are supported: complex types,
-enumeration types and union types.
+== QMP/Guest agent schema ==
 
-Generally speaking, types definitions should always use CamelCase for the type
-names. Command names should be all lower case with words separated by a hyphen.
+A QAPI schema file is designed to be loosely based on JSON
+(http://www.ietf.org/rfc/rfc7159.txt) with changes for quoting style
+and the use of comments; a QAPI schema file is then parsed by a python
+code generation program.  A valid QAPI schema consists of a series of
+top-level expressions, with no commas between them.  Where
+dictionaries (JSON objects) are used, they are parsed as python
+OrderedDicts so that ordering is preserved (for predictable layout of
+generated C structs and parameter lists).  Ordering doesn't matter
+between top-level expressions or the keys within an expression, but
+does matter within dictionary values for 'data' and 'returns' members
+of a single expression.  QAPI schema input is written using 'single
+quotes' instead of JSON's "double quotes" (in contrast, Client JSON
+Protocol uses no comments, and while input accepts 'single quotes' as
+an extension, output is strict JSON using only "double quotes").  As
+in JSON, trailing commas are not permitted in arrays or dictionaries.
+Input must be ASCII (although QMP supports full Unicode strings, the
+QAPI parser does not).  At present, there is no place where a QAPI
+schema requires the use of JSON numbers or null.
+
+Comments are allowed; anything between an unquoted # and the following
+newline is ignored.  Although there is not yet a documentation
+generator, a form of stylized comments has developed for consistently
+documenting details about an expression and when it was added to the
+schema.  The documentation is delimited between two lines of ##, then
+the first line names the expression, an optional overview is provided,
+then individual documentation about each member of 'data' is provided,
+and finally, a 'Since: x.y.z' tag lists the release that introduced
+the expression.  Optional fields are tagged with the phrase
+'#optional', often with their default value; and extensions added
+after the expression was first released are also given a '(since
+x.y.z)' comment.  For example:
+
+    ##
+    # @BlockStats:
+    #
+    # Statistics of a virtual block device or a block backing device.
+    #
+    # @device: #optional If the stats are for a virtual block device, the name
+    #          corresponding to the virtual block device.
+    #
+    # @stats:  A @BlockDeviceStats for the device.
+    #
+    # @parent: #optional This describes the file block device if it has one.
+    #
+    # @backing: #optional This describes the backing block device if it has one.
+    #           (Since 2.0)
+    #
+    # Since: 0.14.0
+    ##
+    { 'struct': 'BlockStats',
+      'data': {'*device': 'str', 'stats': 'BlockDeviceStats',
+               '*parent': 'BlockStats',
+               '*backing': 'BlockStats'} }
+
+The schema sets up a series of types, as well as commands and events
+that will use those types.  Forward references are allowed: the parser
+scans in two passes, where the first pass learns all type names, and
+the second validates the schema and generates the code.  This allows
+the definition of complex structs that can have mutually recursive
+types, and allows for indefinite nesting of Client JSON Protocol that
+satisfies the schema.  A type name should not be defined more than
+once.  It is permissible for the schema to contain additional types
+not used by any commands or events in the Client JSON Protocol, for
+the side effect of generated C code used internally.
+
+There are seven top-level expressions recognized by the parser:
+'include', 'command', 'struct', 'enum', 'union', 'alternate', and
+'event'.  There are several groups of types: simple types (a number of
+built-in types, such as 'int' and 'str'; as well as enumerations),
+complex types (structs and two flavors of unions), and alternate types
+(a choice between other types).  The 'command' and 'event' expressions
+can refer to existing types by name, or list an anonymous type as a
+dictionary. Listing a type name inside an array refers to a
+single-dimension array of that type; multi-dimension arrays are not
+directly supported (although an array of a complex struct that
+contains an array member is possible).
+
+Types, commands, and events share a common namespace.  Therefore,
+generally speaking, type definitions should always use CamelCase for
+user-defined type names, while built-in types are lowercase. Type
+definitions should not end in 'Kind', as this namespace is used for
+creating implicit C enums for visiting union types.  Command names,
+and field names within a type, should be all lower case with words
+separated by a hyphen.  However, some existing older commands and
+complex types use underscore; when extending such expressions,
+consistency is preferred over blindly avoiding underscore.  Event
+names should be ALL_CAPS with words separated by underscore.  The
+special string '**' appears for some commands that manually perform
+their own type checking rather than relying on the type-safe code
+produced by the qapi code generators.
+
+Any name (command, event, type, field, or enum value) beginning with
+"x-" is marked experimental, and may be withdrawn or changed
+incompatibly in a future release.  Downstream vendors may add
+extensions; such extensions should begin with a prefix matching
+"__RFQDN_" (for the reverse-fully-qualified-domain-name of the
+vendor), even if the rest of the name uses dash (example:
+__com.redhat_drive-mirror).  Other than downstream extensions (with
+leading underscore and the use of dots), all names should begin with a
+letter, and contain only ASCII letters, digits, dash, and underscore.
+It is okay to reuse names that match C keywords; the generator will
+rename a field named "default" in the QAPI to "q_default" in the
+generated C code.
+
+In the rest of this document, usage lines are given for each
+expression type, with literal strings written in lower case and
+placeholders written in capitals.  If a literal string includes a
+prefix of '*', that key/value pair can be omitted from the expression.
+For example, a usage statement that includes '*base':STRUCT-NAME
+means that an expression has an optional key 'base', which if present
+must have a value that forms a struct name.
+
+
+=== Built-in Types ===
+
+The following types are built-in to the parser:
+  'str' - arbitrary UTF-8 string
+  'int' - 64-bit signed integer (although the C code may place further
+          restrictions on acceptable range)
+  'number' - floating point number
+  'bool' - JSON value of true or false
+  'int8', 'int16', 'int32', 'int64' - like 'int', but enforce maximum
+                                      bit size
+  'uint8', 'uint16', 'uint32', 'uint64' - unsigned counterparts
+  'size' - like 'uint64', but allows scaled suffix from command line
+           visitor
 
 
 === Includes ===
 
+Usage: { 'include': STRING }
+
 The QAPI schema definitions can be modularized using the 'include' directive:
 
- { 'include': 'path/to/file.json'}
+ { 'include': 'path/to/file.json' }
 
 The directive is evaluated recursively, and include paths are relative to the
-file using the directive. Multiple includes of the same file are safe.
+file using the directive. Multiple includes of the same file are
+safe.  No other keys should appear in the expression, and the include
+value should be a string.
+
+As a matter of style, it is a good idea to have all files be
+self-contained, but at the moment, nothing prevents an included file
+from making a forward reference to a type that is only introduced by
+an outer file.  The parser may be made stricter in the future to
+prevent incomplete include files.
 
 
-=== Complex types ===
+=== Struct types ===
 
-A complex type is a dictionary containing a single key whose value is a
-dictionary.  This corresponds to a struct in C or an Object in JSON.  An
-example of a complex type is:
+Usage: { 'struct': STRING, 'data': DICT, '*base': STRUCT-NAME }
 
- { 'type': 'MyType',
+A struct is a dictionary containing a single 'data' key whose
+value is a dictionary.  This corresponds to a struct in C or an Object
+in JSON. Each value of the 'data' dictionary must be the name of a
+type, or a one-element array containing a type name.  An example of a
+struct is:
+
+ { 'struct': 'MyType',
    'data': { 'member1': 'str', 'member2': 'int', '*member3': 'str' } }
 
-The use of '*' as a prefix to the name means the member is optional.
+The use of '*' as a prefix to the name means the member is optional in
+the corresponding JSON protocol usage.
 
 The default initialization value of an optional argument should not be changed
 between versions of QEMU unless the new default maintains backward
@@ -84,13 +216,13 @@ A structure that is used in both input and output of various commands
 must consider the backwards compatibility constraints of both directions
 of use.
 
-A complex type definition can specify another complex type as its base.
+A struct definition can specify another struct as its base.
 In this case, the fields of the base type are included as top-level fields
-of the new complex type's dictionary in the QMP wire format. An example
-definition is:
+of the new struct's dictionary in the Client JSON Protocol wire
+format. An example definition is:
 
- { 'type': 'BlockdevOptionsGenericFormat', 'data': { 'file': 'str' } }
- { 'type': 'BlockdevOptionsGenericCOWFormat',
+ { 'struct': 'BlockdevOptionsGenericFormat', 'data': { 'file': 'str' } }
+ { 'struct': 'BlockdevOptionsGenericCOWFormat',
    'base': 'BlockdevOptionsGenericFormat',
    'data': { '*backing': 'str' } }
 
@@ -100,97 +232,158 @@ both fields like this:
  { "file": "/some/place/my-image",
    "backing": "/some/place/my-backing-file" }
 
+
 === Enumeration types ===
 
-An enumeration type is a dictionary containing a single key whose value is a
-list of strings.  An example enumeration is:
+Usage: { 'enum': STRING, 'data': ARRAY-OF-STRING }
+
+An enumeration type is a dictionary containing a single 'data' key
+whose value is a list of strings.  An example enumeration is:
 
  { 'enum': 'MyEnum', 'data': [ 'value1', 'value2', 'value3' ] }
 
+Nothing prevents an empty enumeration, although it is probably not
+useful.  The list of strings should be lower case; if an enum name
+represents multiple words, use '-' between words.  The string 'max' is
+not allowed as an enum value, and values should not be repeated.
+
+The enumeration values are passed as strings over the Client JSON
+Protocol, but are encoded as C enum integral values in generated code.
+While the C code starts numbering at 0, it is better to use explicit
+comparisons to enum values than implicit comparisons to 0; the C code
+will also include a generated enum member ending in _MAX for tracking
+the size of the enum, useful when using common functions for
+converting between strings and enum values.  Since the wire format
+always passes by name, it is acceptable to reorder or add new
+enumeration members in any location without breaking clients of Client
+JSON Protocol; however, removing enum values would break
+compatibility.  For any struct that has a field that will only contain
+a finite set of string values, using an enum type for that field is
+better than open-coding the field to be type 'str'.
+
+
 === Union types ===
 
-Union types are used to let the user choose between several different data
-types.  A union type is defined using a dictionary as explained in the
-following paragraphs.
+Usage: { 'union': STRING, 'data': DICT }
+or:    { 'union': STRING, 'data': DICT, 'base': STRUCT-NAME,
+         'discriminator': ENUM-MEMBER-OF-BASE }
 
+Union types are used to let the user choose between several different
+variants for an object.  There are two flavors: simple (no
+discriminator or base) and flat (both discriminator and base).  A union
+type is defined using a data dictionary as explained in the following
+paragraphs.
 
-A simple union type defines a mapping from discriminator values to data types
-like in this example:
+A simple union type defines a mapping from automatic discriminator
+values to data types like in this example:
 
- { 'type': 'FileOptions', 'data': { 'filename': 'str' } }
- { 'type': 'Qcow2Options',
+ { 'struct': 'FileOptions', 'data': { 'filename': 'str' } }
+ { 'struct': 'Qcow2Options',
    'data': { 'backing-file': 'str', 'lazy-refcounts': 'bool' } }
 
  { 'union': 'BlockdevOptions',
    'data': { 'file': 'FileOptions',
              'qcow2': 'Qcow2Options' } }
 
-In the QMP wire format, a simple union is represented by a dictionary that
-contains the 'type' field as a discriminator, and a 'data' field that is of the
-specified data type corresponding to the discriminator value:
+In the Client JSON Protocol, a simple union is represented by a
+dictionary that contains the 'type' field as a discriminator, and a
+'data' field that is of the specified data type corresponding to the
+discriminator value, as in these examples:
 
+ { "type": "file", "data" : { "filename": "/some/place/my-image" } }
  { "type": "qcow2", "data" : { "backing-file": "/some/place/my-image",
                                "lazy-refcounts": true } }
 
+The generated C code uses a struct containing a union. Additionally,
+an implicit C enum 'NameKind' is created, corresponding to the union
+'Name', for accessing the various branches of the union.  No branch of
+the union can be named 'max', as this would collide with the implicit
+enum.  The value for each branch can be of any type.
 
-A union definition can specify a complex type as its base. In this case, the
-fields of the complex type are included as top-level fields of the union
-dictionary in the QMP wire format. An example definition is:
 
- { 'type': 'BlockdevCommonOptions', 'data': { 'readonly': 'bool' } }
- { 'union': 'BlockdevOptions',
-   'base': 'BlockdevCommonOptions',
-   'data': { 'raw': 'RawOptions',
-             'qcow2': 'Qcow2Options' } }
+A flat union definition specifies a struct as its base, and
+avoids nesting on the wire.  All branches of the union must be
+complex types, and the top-level fields of the union dictionary on
+the wire will be a combination of fields from both the base type and the
+appropriate branch type (when merging two dictionaries, there must be
+no keys in common).  The 'discriminator' field must be the name of an
+enum-typed member of the base struct.
 
-And it looks like this on the wire:
-
- { "type": "qcow2",
-   "readonly": false,
-   "data" : { "backing-file": "/some/place/my-image",
-              "lazy-refcounts": true } }
-
-
-Flat union types avoid the nesting on the wire. They are used whenever a
-specific field of the base type is declared as the discriminator ('type' is
-then no longer generated). The discriminator must be of enumeration type.
-The above example can then be modified as follows:
+The following example enhances the above simple union example by
+adding a common field 'readonly', renaming the discriminator to
+something more applicable, and reducing the number of {} required on
+the wire:
 
  { 'enum': 'BlockdevDriver', 'data': [ 'raw', 'qcow2' ] }
- { 'type': 'BlockdevCommonOptions',
+ { 'struct': 'BlockdevCommonOptions',
    'data': { 'driver': 'BlockdevDriver', 'readonly': 'bool' } }
  { 'union': 'BlockdevOptions',
    'base': 'BlockdevCommonOptions',
    'discriminator': 'driver',
-   'data': { 'raw': 'RawOptions',
+   'data': { 'file': 'FileOptions',
              'qcow2': 'Qcow2Options' } }
 
-Resulting in this JSON object:
+Resulting in these JSON objects:
+
+ { "driver": "file", "readonly": true,
+   "filename": "/some/place/my-image" }
+ { "driver": "qcow2", "readonly": false,
+   "backing-file": "/some/place/my-image", "lazy-refcounts": true }
+
+Notice that in a flat union, the discriminator name is controlled by
+the user, but because it must map to a base member with enum type, the
+code generator can ensure that branches exist for all values of the
+enum (although the order of the keys need not match the declaration of
+the enum).  In the resulting generated C data types, a flat union is
+represented as a struct with the base member fields included directly,
+and then a union of structures for each branch of the union.
+
+A simple union can always be re-written as a flat union where the base
+class has a single member named 'type', and where each branch of the
+union has a struct with a single member named 'data'.  That is,
 
- { "driver": "qcow2",
-   "readonly": false,
-   "backing-file": "/some/place/my-image",
-   "lazy-refcounts": true }
+ { 'union': 'Simple', 'data': { 'one': 'str', 'two': 'int' } }
 
+is identical on the wire to:
 
-A special type of unions are anonymous unions. They don't form a dictionary in
-the wire format but allow the direct use of different types in their place. As
-they aren't structured, they don't have any explicit discriminator but use
-the (QObject) data type of their value as an implicit discriminator. This means
-that they are restricted to using only one discriminator value per QObject
-type. For example, you cannot have two different complex types in an anonymous
-union, or two different integer types.
+ { 'enum': 'Enum', 'data': ['one', 'two'] }
+ { 'struct': 'Base', 'data': { 'type': 'Enum' } }
+ { 'struct': 'Branch1', 'data': { 'data': 'str' } }
+ { 'struct': 'Branch2', 'data': { 'data': 'int' } }
+ { 'union': 'Flat', 'base': 'Base', 'discriminator': 'type',
+   'data': { 'one': 'Branch1', 'two': 'Branch2' } }
 
-Anonymous unions are declared using an empty dictionary as their discriminator.
-The discriminator values never appear on the wire, they are only used in the
-generated C code. Anonymous unions cannot have a base type.
 
- { 'union': 'BlockRef',
-   'discriminator': {},
+=== Alternate types ===
+
+Usage: { 'alternate': STRING, 'data': DICT }
+
+An alternate type is one that allows a choice between two or more JSON
+data types (string, integer, number, or object, but currently not
+array) on the wire.  The definition is similar to a simple union type,
+where each branch of the union names a QAPI type.  For example:
+
+ { 'alternate': 'BlockRef',
    'data': { 'definition': 'BlockdevOptions',
              'reference': 'str' } }
 
-This example allows using both of the following example objects:
+Just like for a simple union, an implicit C enum 'NameKind' is created
+to enumerate the branches for the alternate 'Name'.
+
+Unlike a union, the discriminator string is never passed on the wire
+for the Client JSON Protocol.  Instead, the value's JSON type serves
+as an implicit discriminator, which in turn means that an alternate
+can only express a choice between types represented differently in
+JSON.  If a branch is typed as the 'bool' built-in, the alternate
+accepts true and false; if it is typed as any of the various numeric
+built-ins, it accepts a JSON number; if it is typed as a 'str'
+built-in or named enum type, it accepts a JSON string; and if it is
+typed as a complex type (struct or union), it accepts a JSON object.
+Two different complex types, for instance, aren't permitted, because
+both are represented as a JSON object.
+
+The example alternate declaration above allows using both of the
+following example objects:
 
  { "file": "my_existing_block_device_id" }
  { "file": { "driver": "file",
@@ -200,23 +393,95 @@ This example allows using both of the following example objects:
 
 === Commands ===
 
-Commands are defined by using a list containing three members.  The first
-member is the command name, the second member is a dictionary containing
-arguments, and the third member is the return type.
-
-An example command is:
+Usage: { 'command': STRING, '*data': COMPLEX-TYPE-NAME-OR-DICT,
+         '*returns': TYPE-NAME-OR-DICT,
+         '*gen': false, '*success-response': false }
+
+Commands are defined by using a dictionary containing several members,
+where three members are most common.  The 'command' member is a
+mandatory string, and determines the "execute" value passed in a
+Client JSON Protocol command exchange.
+
+The 'data' argument maps to the "arguments" dictionary passed in as
+part of a Client JSON Protocol command.  The 'data' member is optional
+and defaults to {} (an empty dictionary).  If present, it must be the
+string name of a complex type, a one-element array containing the name
+of a complex type, or a dictionary that declares an anonymous type
+with the same semantics as a 'struct' expression, with one exception
+noted below when 'gen' is used.
+
+The 'returns' member describes what will appear in the "return" field
+of a Client JSON Protocol reply on successful completion of a command.
+The member is optional from the command declaration; if absent, the
+"return" field will be an empty dictionary.  If 'returns' is present,
+it must be the string name of a complex or built-in type, a
+one-element array containing the name of a complex or built-in type,
+or a dictionary that declares an anonymous type with the same
+semantics as a 'struct' expression, with one exception noted below
+when 'gen' is used.  Although it is permitted to have the 'returns'
+member name a built-in type or an array of built-in types, any command
+that does this cannot be extended to return additional information in
+the future; thus, new commands should strongly consider returning a
+dictionary-based type or an array of dictionaries, even if the
+dictionary only contains one field at the present.
+
+All commands in Client JSON Protocol use a dictionary to report
+failure, with no way to specify that in QAPI.  Where the error return
+is different than the usual GenericError class in order to help the
+client react differently to certain error conditions, it is worth
+documenting this in the comments before the command declaration.
+
+Some example commands:
+
+ { 'command': 'my-first-command',
+   'data': { 'arg1': 'str', '*arg2': 'str' } }
+ { 'struct': 'MyType', 'data': { '*value': 'str' } }
+ { 'command': 'my-second-command',
+   'returns': [ 'MyType' ] }
+
+which would validate this Client JSON Protocol transaction:
+
+ => { "execute": "my-first-command",
+      "arguments": { "arg1": "hello" } }
+ <= { "return": { } }
+ => { "execute": "my-second-command" }
+ <= { "return": [ { "value": "one" }, { } ] }
+
+In rare cases, QAPI cannot express a type-safe representation of a
+corresponding Client JSON Protocol command.  In these cases, if the
+command expression includes the key 'gen' with boolean value false,
+then the 'data' or 'returns' member that intends to bypass generated
+type-safety and do its own manual validation should use an inline
+dictionary definition, with a value of '**' rather than a valid type
+name for the keys that the generated code will not validate.  Please
+try to avoid adding new commands that rely on this, and instead use
+type-safe unions.  For an example of bypass usage:
+
+ { 'command': 'netdev_add',
+   'data': {'type': 'str', 'id': 'str', '*props': '**'},
+   'gen': false }
+
+Normally, the QAPI schema is used to describe synchronous exchanges,
+where a response is expected.  But in some cases, the action of a
+command is expected to change state in a way that a successful
+response is not possible (although the command will still return a
+normal dictionary error on failure).  When a successful reply is not
+possible, the command expression should include the optional key
+'success-response' with boolean value false.  So far, only QGA makes
+use of this field.
 
- { 'command': 'my-command',
-   'data': { 'arg1': 'str', '*arg2': 'str' },
-   'returns': 'str' }
 
 === Events ===
 
-Events are defined with the keyword 'event'.  When 'data' is also specified,
-additional info will be included in the event.  Finally there will be C API
-generated in qapi-event.h; when called by QEMU code, a message with timestamp
-will be emitted on the wire.  If timestamp is -1, it means failure to retrieve
-host time.
+Usage: { 'event': STRING, '*data': COMPLEX-TYPE-NAME-OR-DICT }
+
+Events are defined with the keyword 'event'.  It is not allowed to
+name an event 'MAX', since the generator also produces a C enumeration
+of all event names with a generated _MAX value at the end.  When
+'data' is also specified, additional info will be included in the
+event, with similar semantics to a 'struct' expression.  Finally there
+will be C API generated in qapi-event.h; when called by QEMU code, a
+message with timestamp will be emitted on the wire.
 
 An example event is:
 
@@ -234,9 +499,9 @@ Resulting in this JSON object:
 
 Schemas are fed into 3 scripts to generate all the code/files that, paired
 with the core QAPI libraries, comprise everything required to take JSON
-commands read in by a QMP/guest agent server, unmarshal the arguments into
+commands read in by a Client JSON Protocol server, unmarshal the arguments into
 the underlying C types, call into the corresponding C function, and map the
-response back to a QMP/guest agent response to be returned to the user.
+response back to a Client JSON Protocol response to be returned to the user.
 
 As an example, we'll use the following schema, which describes a single
 complex user-defined type (which will produce a C struct, along with a list
@@ -245,7 +510,7 @@ case we want to accept/return a list of this type with a command), and a
 command which takes that type as a parameter and returns the same type:
 
     $ cat example-schema.json
-    { 'type': 'UserDefOne',
+    { 'struct': 'UserDefOne',
       'data': { 'integer': 'int', 'string': 'str' } }
 
     { 'command': 'my-command',
@@ -271,7 +536,7 @@ created code.
 Example:
 
     $ python scripts/qapi-types.py --output-dir="qapi-generated" \
-    --prefix="example-" --input-file=example-schema.json
+    --prefix="example-" example-schema.json
     $ cat qapi-generated/example-qapi-types.c
 [Uninteresting stuff omitted...]
 
@@ -311,7 +576,7 @@ Example:
     #ifndef EXAMPLE_QAPI_TYPES_H
     #define EXAMPLE_QAPI_TYPES_H
 
-[Builtin types omitted...]
+[Built-in types omitted...]
 
     typedef struct UserDefOne UserDefOne;
 
@@ -324,7 +589,7 @@ Example:
         struct UserDefOneList *next;
     } UserDefOneList;
 
-[Functions on builtin types omitted...]
+[Functions on built-in types omitted...]
 
     struct UserDefOne
     {
@@ -358,7 +623,7 @@ $(prefix)qapi-visit.h: declarations for previously mentioned visitor
 Example:
 
     $ python scripts/qapi-visit.py --output-dir="qapi-generated"
-    --prefix="example-" --input-file=example-schema.json
+    --prefix="example-" example-schema.json
     $ cat qapi-generated/example-qapi-visit.c
 [Uninteresting stuff omitted...]
 
@@ -415,15 +680,13 @@ Example:
     out:
         error_propagate(errp, err);
     }
-    $ python scripts/qapi-commands.py --output-dir="qapi-generated" \
-    --prefix="example-" --input-file=example-schema.json
     $ cat qapi-generated/example-qapi-visit.h
 [Uninteresting stuff omitted...]
 
     #ifndef EXAMPLE_QAPI_VISIT_H
     #define EXAMPLE_QAPI_VISIT_H
 
-[Visitors for builtin types omitted...]
+[Visitors for built-in types omitted...]
 
     void visit_type_UserDefOne(Visitor *m, UserDefOne **obj, const char *name, Error **errp);
     void visit_type_UserDefOneList(Visitor *m, UserDefOneList **obj, const char *name, Error **errp);
@@ -450,7 +713,7 @@ $(prefix)qmp-commands.h: Function prototypes for the QMP commands
 Example:
 
     $ python scripts/qapi-commands.py --output-dir="qapi-generated"
-    --prefix="example-" --input-file=example-schema.json
+    --prefix="example-" example-schema.json
     $ cat qapi-generated/example-qmp-marshal.c
 [Uninteresting stuff omitted...]
 
@@ -541,7 +804,7 @@ $(prefix)qapi-event.c - Implementation of functions to send an event
 Example:
 
     $ python scripts/qapi-event.py --output-dir="qapi-generated"
-    --prefix="example-" --input-file=example-schema.json
+    --prefix="example-" example-schema.json
     $ cat qapi-generated/example-qapi-event.c
 [Uninteresting stuff omitted...]
 
diff --git a/docs/qmp/qmp-events.txt b/docs/qmp/qmp-events.txt
index d759d1974..d92cc4833 100644
--- a/docs/qmp/qmp-events.txt
+++ b/docs/qmp/qmp-events.txt
@@ -31,21 +31,27 @@ Example:
 BLOCK_IMAGE_CORRUPTED
 ---------------------
 
-Emitted when a disk image is being marked corrupt.
+Emitted when a disk image is being marked corrupt. The image can be
+identified by its device or node name. The 'device' field is always
+present for compatibility reasons, but it can be empty ("") if the
+image does not have a device name associated.
 
 Data:
 
-- "device": Device name (json-string)
-- "msg":    Informative message (e.g., reason for the corruption) (json-string)
-- "offset": If the corruption resulted from an image access, this is the access
-            offset into the image (json-int)
-- "size":   If the corruption resulted from an image access, this is the access
-            size (json-int)
+- "device":    Device name (json-string)
+- "node-name": Node name (json-string, optional)
+- "msg":       Informative message (e.g., reason for the corruption)
+               (json-string)
+- "offset":    If the corruption resulted from an image access, this
+               is the host's access offset into the image
+               (json-int, optional)
+- "size":      If the corruption resulted from an image access, this
+               is the access size (json-int, optional)
 
 Example:
 
 { "event": "BLOCK_IMAGE_CORRUPTED",
-    "data": { "device": "ide0-hd0",
+    "data": { "device": "ide0-hd0", "node-name": "node0",
         "msg": "Prevented active L1 table overwrite", "offset": 196608,
         "size": 65536 },
     "timestamp": { "seconds": 1378126126, "microseconds": 966463 } }
@@ -226,6 +232,23 @@ Example:
 { "event": "GUEST_PANICKED",
      "data": { "action": "pause" } }
 
+MEM_UNPLUG_ERROR
+----------------
+Emitted when a memory hot unplug error occurs.
+
+Data:
+
+- "device": device name (json-string)
+- "msg": Informative message (e.g., reason for the error) (json-string)
+
+Example:
+
+{ "event": "MEM_UNPLUG_ERROR",
+  "data": { "device": "dimm1",
+            "msg": "acpi: device unplug for unsupported device"
+  },
+  "timestamp": { "seconds": 1265044230, "microseconds": 450486 } }
+
 NIC_RX_FILTER_CHANGED
 ---------------------
 
@@ -450,6 +473,20 @@ Example:
 { "timestamp": {"seconds": 1290688046, "microseconds": 417172},
   "event": "SPICE_MIGRATE_COMPLETED" }
 
+MIGRATION
+---------
+
+Emitted when a migration event happens.
+
+Data:
+
+ - "status": migration status
+     See MigrationStatus in ~/qapi-schema.json for possible values
+
+Example:
+
+{"timestamp": {"seconds": 1432121972, "microseconds": 744001},
+ "event": "MIGRATION", "data": {"status": "completed"}}
 
 STOP
 ----
diff --git a/docs/qmp/qmp-spec.txt b/docs/qmp/qmp-spec.txt
index 22568c644..4c28cd943 100644
--- a/docs/qmp/qmp-spec.txt
+++ b/docs/qmp/qmp-spec.txt
@@ -1,10 +1,21 @@
                       QEMU Machine Protocol Specification
 
+0. About This Document
+======================
+
+Copyright (C) 2009-2015 Red Hat, Inc.
+
+This work is licensed under the terms of the GNU GPL, version 2 or
+later. See the COPYING file in the top-level directory.
+
 1. Introduction
 ===============
 
-This document specifies the QEMU Machine Protocol (QMP), a JSON-based protocol
-which is available for applications to operate QEMU at the machine-level.
+This document specifies the QEMU Machine Protocol (QMP), a JSON-based
+protocol which is available for applications to operate QEMU at the
+machine-level.  It is also in use by the QEMU Guest Agent (QGA), which
+is available for host applications to interact with the guest
+operating system.
 
 2. Protocol Specification
 =========================
@@ -18,14 +29,27 @@ following format:
 
     json-DATA-STRUCTURE-NAME
 
-Where DATA-STRUCTURE-NAME is any valid JSON data structure, as defined by
-the JSON standard:
+Where DATA-STRUCTURE-NAME is any valid JSON data structure, as defined
+by the JSON standard:
+
+http://www.ietf.org/rfc/rfc7159.txt
 
-http://www.ietf.org/rfc/rfc4627.txt
+The protocol is always encoded in UTF-8 except for synchronization
+bytes (documented below); although thanks to json-string escape
+sequences, the server will reply using only the strict ASCII subset.
 
-For convenience, json-object members and json-array elements mentioned in
-this document will be in a certain order. However, in real protocol usage
-they can be in ANY order, thus no particular order should be assumed.
+For convenience, json-object members mentioned in this document will
+be in a certain order. However, in real protocol usage they can be in
+ANY order, thus no particular order should be assumed. On the other
+hand, use of json-array elements presumes that preserving order is
+important unless specifically documented otherwise.  Repeating a key
+within a json-object gives unpredictable results.
+
+Also for convenience, the server will accept an extension of
+'single-quoted' strings in place of the usual "double-quoted"
+json-string, and both input forms of strings understand an additional
+escape sequence of "\'" for a single quote. The server will only use
+double quoting on output.
 
 2.1 General Definitions
 -----------------------
@@ -52,7 +76,16 @@ The greeting message format is:
 - The "version" member contains the Server's version information (the format
   is the same of the query-version command)
 - The "capabilities" member specify the availability of features beyond the
-  baseline specification
+  baseline specification; the order of elements in this array has no
+  particular significance, so a client must search the entire array
+  when looking for a particular capability
+
+2.2.1 Capabilities
+------------------
+
+As of the date this document was last revised, no server or client
+capability strings have been defined.
+
 
 2.3 Issuing Commands
 --------------------
@@ -65,10 +98,14 @@ The format for command execution is:
 
 - The "execute" member identifies the command to be executed by the Server
 - The "arguments" member is used to pass any arguments required for the
-  execution of the command, it is optional when no arguments are required
+  execution of the command, it is optional when no arguments are
+  required. Each command documents what contents will be considered
+  valid when handling the json-argument
 - The "id" member is a transaction identification associated with the
   command execution, it is optional and will be part of the response if
-  provided
+  provided. The "id" member can be any json-value, although most
+  clients merely use a json-number incremented for each successive
+  command
 
 2.4 Commands Responses
 ----------------------
@@ -81,13 +118,15 @@ of a command execution: success or error.
 
 The format of a success response is:
 
-{ "return": json-object, "id": json-value }
+{ "return": json-value, "id": json-value }
 
  Where,
 
-- The "return" member contains the command returned data, which is defined
-  in a per-command basis or an empty json-object if the command does not
-  return data
+- The "return" member contains the data returned by the command, which
+  is defined on a per-command basis (usually a json-object or
+  json-array of json-objects, but sometimes a json-number, json-string,
+  or json-array of json-strings); it is an empty json-object if the
+  command does not return data
 - The "id" member contains the transaction identification associated
   with the command execution if issued by the Client
 
@@ -114,7 +153,8 @@ if provided by the client.
 -----------------------
 
 As a result of state changes, the Server may send messages unilaterally
-to the Client at any time. They are called "asynchronous events".
+to the Client at any time, when not in the middle of any other
+response. They are called "asynchronous events".
 
 The format of asynchronous events is:
 
@@ -126,13 +166,27 @@ The format of asynchronous events is:
 - The "event" member contains the event's name
 - The "data" member contains event specific data, which is defined in a
   per-event basis, it is optional
-- The "timestamp" member contains the exact time of when the event occurred
-  in the Server. It is a fixed json-object with time in seconds and
-  microseconds
+- The "timestamp" member contains the exact time of when the event
+  occurred in the Server. It is a fixed json-object with time in
+  seconds and microseconds relative to the Unix Epoch (1 Jan 1970); if
+  there is a failure to retrieve host time, both members of the
+  timestamp will be set to -1.
 
 For a listing of supported asynchronous events, please, refer to the
 qmp-events.txt file.
 
+2.5 QGA Synchronization
+-----------------------
+
+When using QGA, an additional synchronization feature is built into
+the protocol.  If the Client sends a raw 0xFF sentinel byte (not valid
+JSON), then the Server will reset its state and discard all pending
+data prior to the sentinel.  Conversely, if the Client makes use of
+the 'guest-sync-delimited' command, the Server will send a raw 0xFF
+sentinel byte prior to its response, to aid the Client in discarding
+any data prior to the sentinel.
+
+
 3. QMP Examples
 ===============
 
@@ -145,32 +199,37 @@ This section provides some examples of real QMP usage, in all of them
 S: { "QMP": { "version": { "qemu": { "micro": 50, "minor": 6, "major": 1 },
      "package": ""}, "capabilities": []}}
 
-3.2 Simple 'stop' execution
+3.2 Client QMP negotiation
+--------------------------
+C: { "execute": "qmp_capabilities" }
+S: { "return": {}}
+
+3.3 Simple 'stop' execution
 ---------------------------
 
 C: { "execute": "stop" }
 S: { "return": {} }
 
-3.3 KVM information
+3.4 KVM information
 -------------------
 
 C: { "execute": "query-kvm", "id": "example" }
 S: { "return": { "enabled": true, "present": true }, "id": "example"}
 
-3.4 Parsing error
+3.5 Parsing error
 ------------------
 
 C: { "execute": }
 S: { "error": { "class": "GenericError", "desc": "Invalid JSON syntax" } }
 
-3.5 Powerdown event
+3.6 Powerdown event
 -------------------
 
 S: { "timestamp": { "seconds": 1258551470, "microseconds": 802384 },
     "event": "POWERDOWN" }
 
 4. Capabilities Negotiation
-----------------------------
+===========================
 
 When a Client successfully establishes a connection, the Server is in
 Capabilities Negotiation mode.
@@ -189,7 +248,7 @@ effect, all commands (except qmp_capabilities) are allowed and asynchronous
 messages are delivered.
 
 5 Compatibility Considerations
-------------------------------
+==============================
 
 All protocol changes or new features which modify the protocol format in an
 incompatible way are disabled by default and will be advertised by the
@@ -213,12 +272,16 @@ However, Clients must not assume any particular:
 - Amount of errors generated by a command, that is, new errors can be added
   to any existing command in newer versions of the Server
 
+Any command or field name beginning with "x-" is deemed experimental,
+and may be withdrawn or changed in an incompatible manner in a future
+release.
+
 Of course, the Server does guarantee to send valid JSON.  But apart from
 this, a Client should be "conservative in what they send, and liberal in
 what they accept".
 
 6. Downstream extension of QMP
-------------------------------
+==============================
 
 We recommend that downstream consumers of QEMU do *not* modify QMP.
 Management tools should be able to support both upstream and downstream
diff --git a/docs/specs/acpi_mem_hotplug.txt b/docs/specs/acpi_mem_hotplug.txt
index 12909940c..3df3620ce 100644
--- a/docs/specs/acpi_mem_hotplug.txt
+++ b/docs/specs/acpi_mem_hotplug.txt
@@ -2,7 +2,7 @@ QEMU<->ACPI BIOS memory hotplug interface
 --------------------------------------
 
 ACPI BIOS GPE.3 handler is dedicated for notifying OS about memory hot-add
-events.
+and hot-remove events.
 
 Memory hot-plug interface (IO port 0xa00-0xa17, 1-4 byte access):
 ---------------------------------------------------------------
@@ -19,7 +19,9 @@ Memory hot-plug interface (IO port 0xa00-0xa17, 1-4 byte access):
               1: Device insert event, used to distinguish device for which
                  no device check event to OSPM was issued.
                  It's valid only when bit 1 is set.
-              2-7: reserved and should be ignored by OSPM
+              2: Device remove event, used to distinguish device for which
+                 no device eject request to OSPM was issued.
+              3-7: reserved and should be ignored by OSPM
       [0x15-0x17] reserved
 
   write access:
@@ -31,14 +33,62 @@ Memory hot-plug interface (IO port 0xa00-0xa17, 1-4 byte access):
       [0xc-0x13] reserved, writes into it are ignored
       [0x14] Memory device control fields
           bits:
-              0: reserved, OSPM must clear it before writing to register
+              0: reserved, OSPM must clear it before writing to register.
+                 Due to a bug in versions prior to 2.4, this field isn't cleared
+                 when other fields are written. Keep it reserved and don't
+                 try to reuse it.
               1: if set to 1 clears device insert event, set by OSPM
                  after it has emitted device check event for the
                  selected memory device
-              2-7: reserved, OSPM must clear them before writing to register
+              2: if set to 1 clears device remove event, set by OSPM
+                 after it has emitted device eject request for the
+                 selected memory device
+              3: if set to 1 initiates device eject, set by OSPM when it
+                 triggers memory device removal and calls _EJ0 method
+              4-7: reserved, OSPM must clear them before writing to register
 
 Selecting memory device slot beyond present range has no effect on platform:
    - write accesses to memory hot-plug registers not documented above are
      ignored
    - read accesses to memory hot-plug registers not documented above return
      all bits set to 1.
+
+Memory hot remove process diagram:
+----------------------------------
+ +-------------+     +-----------------------+      +------------------+     
+ |  1. QEMU    |     | 2. QEMU               |      |3. QEMU           |     
+ |  device_del +---->+ device unplug request +----->+Send SCI to guest,|     
+ |             |     |         cb            |      |return control to |     
+ +-------------+     +-----------------------+      |management        |     
+                                                    +------------------+     
+                                                                             
+ +---------------------------------------------------------------------+     
+                                                                             
+ +---------------------+              +-------------------------+            
+ | OSPM:               | remove event | OSPM:                   |            
+ | send Eject Request, |              | Scan memory devices     |            
+ | clear remove event  +<-------------+ for event flags         |            
+ |                     |              |                         |            
+ +---------------------+              +-------------------------+            
+           |                                                                 
+           |                                                                 
+ +---------v--------+            +-----------------------+                   
+ | Guest OS:        |  success   | OSPM:                 |                   
+ | process Ejection +----------->+ Execute _EJ0 method,  |                   
+ | request          |            | set eject bit in flags|                   
+ +------------------+            +-----------------------+                   
+           |failure                         |                                
+           v                                v                                
+ +------------------------+      +-----------------------+                   
+ | OSPM:                  |      | QEMU:                 |                   
+ | set OST event & status |      | call device unplug cb |                   
+ | fields                 |      |                       |                   
+ +------------------------+      +-----------------------+                   
+          |                                  |                               
+          v                                  v                               
+ +------------------+              +-------------------+                     
+ |QEMU:             |              |QEMU:              |                     
+ |Send OST QMP event|              |Send device deleted|                     
+ |                  |              |QMP event          |                     
+ +------------------+              |                   |                     
+                                   +-------------------+
diff --git a/docs/specs/fw_cfg.txt b/docs/specs/fw_cfg.txt
index 6accd924b..74351dd18 100644
--- a/docs/specs/fw_cfg.txt
+++ b/docs/specs/fw_cfg.txt
@@ -203,3 +203,24 @@ completes fully overwriting the item's data.
 
 NOTE: This function is deprecated, and will be completely removed
 starting with QEMU v2.4.
+
+== Externally Provided Items ==
+
+As of v2.4, "file" fw_cfg items (i.e., items with selector keys above
+FW_CFG_FILE_FIRST, and with a corresponding entry in the fw_cfg file
+directory structure) may be inserted via the QEMU command line, using
+the following syntax:
+
+    -fw_cfg [name=]<item_name>,file=<path>
+
+where <item_name> is the fw_cfg item name, and <path> is the location
+on the host file system of a file containing the data to be inserted.
+
+NOTE: Users *SHOULD* choose item names beginning with the prefix "opt/"
+when using the "-fw_cfg" command line option, to avoid conflicting with
+item names used internally by QEMU. For instance:
+
+    -fw_cfg name=opt/my_item_name,file=./my_blob.bin
+
+Similarly, QEMU developers *SHOULD NOT* use item names prefixed with
+"opt/" when inserting items programmatically, e.g. via fw_cfg_add_file().
diff --git a/docs/specs/pci-ids.txt b/docs/specs/pci-ids.txt
index c6732fe00..0adcb89aa 100644
--- a/docs/specs/pci-ids.txt
+++ b/docs/specs/pci-ids.txt
@@ -45,7 +45,9 @@ PCI devices (other than virtio):
 1b36:0003  PCI Dual-port 16550A adapter (docs/specs/pci-serial.txt)
 1b36:0004  PCI Quad-port 16550A adapter (docs/specs/pci-serial.txt)
 1b36:0005  PCI test device (docs/specs/pci-testdev.txt)
+1b36:0006  PCI Rocker Ethernet switch device
 1b36:0007  PCI SD Card Host Controller Interface (SDHCI)
+1b36:000a  PCI-PCI bridge (multiseat)
 
 All these devices are documented in docs/specs.
 
diff --git a/docs/specs/ppc-spapr-hotplug.txt b/docs/specs/ppc-spapr-hotplug.txt
new file mode 100644
index 000000000..46e07196b
--- /dev/null
+++ b/docs/specs/ppc-spapr-hotplug.txt
@@ -0,0 +1,305 @@
+= sPAPR Dynamic Reconfiguration =
+
+sPAPR/"pseries" guests make use of a facility called dynamic-reconfiguration
+to handle hotplugging of dynamic "physical" resources like PCI cards, or
+"logical"/paravirtual resources like memory, CPUs, and "physical"
+host-bridges, which are generally managed by the host/hypervisor and provided
+to guests as virtualized resources. The specifics of dynamic-reconfiguration
+are documented extensively in PAPR+ v2.7, Section 13.1. This document
+provides a summary of that information as it applies to the implementation
+within QEMU.
+
+== Dynamic-reconfiguration Connectors ==
+
+To manage hotplug/unplug of these resources, a firmware abstraction known as
+a Dynamic Resource Connector (DRC) is used to assign a particular dynamic
+resource to the guest, and provide an interface for the guest to manage
+configuration/removal of the resource associated with it.
+
+== Device-tree description of DRCs ==
+
+A set of 4 Open Firmware device tree array properties are used to describe
+the name/index/power-domain/type of each DRC allocated to a guest at
+boot-time. There may be multiple sets of these arrays, rooted at different
+paths in the device tree depending on the type of resource the DRCs manage.
+
+In some cases, the DRCs themselves may be provided by a dynamic resource,
+such as the DRCs managing PCI slots on a hotplugged PHB. In this case the
+arrays would be fetched as part of the device tree retrieval interfaces
+for hotplugged resources described under "Guest->Host interface".
+
+The array properties are described below. Each entry/element in an array
+describes the DRC identified by the element in the corresponding position
+of ibm,drc-indexes:
+
+ibm,drc-names:
+  first 4-bytes: BE-encoded integer denoting the number of entries
+  each entry: a NULL-terminated <name> string encoded as a byte array
+
+  <name> values for logical/virtual resources are defined in PAPR+ v2.7,
+  Section 13.5.2.4, and basically consist of the type of the resource
+  followed by a space and a numerical value that's unique across resources
+  of that type.
+
+  <name> values for "physical" resources such as PCI or VIO devices are
+  defined as being "location codes", which are the "location labels" of
+  each encapsulating device, starting from the chassis down to the
+  individual slot for the device, concatenated by a hyphen. This provides
+  a mapping of resources to a physical location in a chassis for debugging
+  purposes. For QEMU, this mapping is less important, so we assign a
+  location code that conforms to naming specifications, but is simply a
+  location label for the slot by itself to simplify the implementation.
+  The naming convention for location labels is documented in detail in
+  PAPR+ v2.7, Section 12.3.1.5, and in our case amounts to using "C<n>"
+  for PCI/VIO device slots, where <n> is unique across all PCI/VIO
+  device slots.
+
+ibm,drc-indexes:
+  first 4-bytes: BE-encoded integer denoting the number of entries
+  each 4-byte entry: BE-encoded <index> integer that is unique across all DRCs
+    in the machine
+
+  <index> is arbitrary, but in the case of QEMU we try to maintain the
+  convention used to assign them to pSeries guests on pHyp:
+
+    bit[31:28]: integer encoding of <type>, where <type> is:
+                  1 for CPU resource
+                  2 for PHB resource
+                  3 for VIO resource
+                  4 for PCI resource
+                  8 for Memory resource
+    bit[27:0]: integer encoding of <id>, where <id> is unique across
+                 all resources of specified type
+
+ibm,drc-power-domains:
+  first 4-bytes: BE-encoded integer denoting the number of entries
+  each 4-byte entry: 32-bit, BE-encoded <index> integer that specifies the
+    power domain the resource will be assigned to. In the case of QEMU
+    we associate all resources with a "live insertion" domain, where the
+    power is assumed to be managed automatically. The integer value for
+    this domain is a special value of -1.
+
+
+ibm,drc-types:
+  first 4-bytes: BE-encoded integer denoting the number of entries
+  each entry: a NULL-terminated <type> string encoded as a byte array
+
+  <type> is assigned as follows:
+    "CPU" for a CPU
+    "PHB" for a physical host-bridge
+    "SLOT" for a VIO slot
+    "28" for a PCI slot
+    "MEM" for memory resource
+
+== Guest->Host interface to manage dynamic resources ==
+
+Each DRC is given a globally unique DRC Index, and resources associated with
+a particular DRC are configured/managed by the guest via a number of RTAS
+calls which reference individual DRCs based on the DRC index. This can be
+considered the guest->host interface.
+
+rtas-set-power-level:
+  arg[0]: integer identifying power domain
+  arg[1]: new power level for the domain, 0-100
+  output[0]: status, 0 on success
+  output[1]: power level after command
+
+  Set the power level for a specified power domain
+
+rtas-get-power-level:
+  arg[0]: integer identifying power domain
+  output[0]: status, 0 on success
+  output[1]: current power level
+
+  Get the power level for a specified power domain
+
+rtas-set-indicator:
+  arg[0]: integer identifying sensor/indicator type
+  arg[1]: index of sensor, for DR-related sensors this is generally the
+          DRC index
+  arg[2]: desired sensor value
+  output[0]: status, 0 on success
+
+  Set the state of an indicator or sensor. For the purpose of this document we
+  focus on the indicator/sensor types associated with a DRC. The types are:
+
+    9001: isolation-state, controls/indicates whether a device has been made
+          accessible to a guest
+
+          supported sensor values:
+            0: isolate, device is made inaccessible to guest OS
+            1: unisolate, device is made available to guest OS
+
+    9002: dr-indicator, controls "visual" indicator associated with device
+
+          supported sensor values:
+            0: inactive, resource may be safely removed
+            1: active, resource is in use and cannot be safely removed
+            2: identify, used to visually identify slot for interactive hotplug
+            3: action, in most cases, used in the same manner as identify
+
+    9003: allocation-state, generally only used for "logical" DR resources to
+          request the allocation/deallocation of a resource prior to acquiring
+          it via isolation-state->unisolate, or after releasing it via
+          isolation-state->isolate, respectively. For "physical" DR (like PCI
+          hotplug/unplug) the pre-allocation of the resource is implied and
+          this sensor is unused.
+
+          supported sensor values:
+            0: unusable, tell firmware/system the resource can be
+               unallocated/reclaimed and added back to the system resource pool
+            1: usable, request the resource be allocated/reserved for use by
+               guest OS
+            2: exchange, used to allocate a spare resource to use for fail-over
+               in certain situations. unused in QEMU
+            3: recover, used to reclaim a previously allocated resource that's
+               not currently allocated to the guest OS. unused in QEMU
+
+rtas-get-sensor-state:
+  arg[0]: integer identifying sensor/indicator type
+  arg[1]: index of sensor, for DR-related sensors this is generally the
+          DRC index
+  output[0]: status, 0 on success
+
+  Used to read an indicator or sensor value.
+
+  For DR-related operations, the only noteworthy sensor is dr-entity-sense,
+  which has a type value of 9003, as allocation-state does in the case of
+  rtas-set-indicator. The semantics/encodings of the sensor values are distinct
+  however:
+
+  supported sensor values for dr-entity-sense (9003) sensor:
+    0: empty,
+         for physical resources: DRC/slot is empty
+         for logical resources: unused
+    1: present,
+         for physical resources: DRC/slot is populated with a device/resource
+         for logical resources: resource has been allocated to the DRC
+    2: unusable,
+         for physical resources: unused
+         for logical resources: DRC has no resource allocated to it
+    3: exchange,
+         for physical resources: unused
+         for logical resources: resource available for exchange (see
+           allocation-state sensor semantics above)
+    4: recovery,
+         for physical resources: unused
+         for logical resources: resource available for recovery (see
+           allocation-state sensor semantics above)
+
+rtas-ibm-configure-connector:
+  arg[0]: guest physical address of 4096-byte work area buffer
+  arg[1]: 0, or address of additional 4096-byte work area buffer. only non-zero
+          if a prior RTAS response indicated a need for additional memory
+  output[0]: status:
+               0: completed transmittal of device-tree node
+               1: instruct guest to prepare for next DT sibling node
+               2: instruct guest to prepare for next DT child node
+               3: instruct guest to prepare for next DT property
+               4: instruct guest to ascend to parent DT node
+               5: instruct guest to provide additional work-area buffer
+                  via arg[1]
+            990x: instruct guest that operation took too long and to try
+                  again later
+
+  Used to fetch an OF device-tree description of the resource associated with
+  a particular DRC. The DRC index is encoded in the first 4-bytes of the first
+  work area buffer.
+
+  Work area layout, using 4-byte offsets:
+    wa[0]: DRC index of the DRC to fetch device-tree nodes from
+    wa[1]: 0 (hard-coded)
+    wa[2]: for next-sibling/next-child response:
+             wa offset of null-terminated string denoting the new node's name
+           for next-property response:
+             wa offset of null-terminated string denoting new property's name
+    wa[3]: for next-property response (unused otherwise):
+             byte-length of new property's value
+    wa[4]: for next-property response (unused otherwise):
+             new property's value, encoded as an OFDT-compatible byte array
+
+== hotplug/unplug events ==
+
+For most DR operations, the hypervisor will issue host->guest add/remove events
+using the EPOW/check-exception notification framework, where the host issues a
+check-exception interrupt, then provides an RTAS event log via an
+rtas-check-exception call issued by the guest in response. This framework is
+documented by PAPR+ v2.7, and is already in use by QEMU for generating powerdown
+requests via EPOW events.
+
+For DR, this framework has been extended to include hotplug events, which were
+previously unneeded due to direct manipulation of DR-related guest userspace
+tools by host-level management such as an HMC. This level of management is not
+applicable to PowerKVM, hence the reason for extending the notification
+framework to support hotplug events.
+
+Note that these events are not yet formally part of the PAPR+ specification,
+but support for this format has already been implemented in DR-related
+guest tools such as powerpc-utils/librtas, as well as kernel patches that have
+been submitted to handle in-kernel processing of memory/cpu-related hotplug
+events[1], and is planned for formal inclusion in the PAPR+ specification. The
+hotplug-specific payload is implemented in QEMU as follows (with all values
+encoded in big-endian format):
+
+struct rtas_event_log_v6_hp {
+#define SECTION_ID_HOTPLUG              0x4850 /* HP */
+    struct section_header {
+        uint16_t section_id;            /* set to SECTION_ID_HOTPLUG */
+        uint16_t section_length;        /* sizeof(rtas_event_log_v6_hp),
+                                         * plus the length of the DRC name
+                                         * if a DRC name identifier is
+                                         * specified for hotplug_identifier
+                                         */
+        uint8_t section_version;        /* version 1 */
+        uint8_t section_subtype;        /* unused */
+        uint16_t creator_component_id;  /* unused */
+    } hdr;
+#define RTAS_LOG_V6_HP_TYPE_CPU         1
+#define RTAS_LOG_V6_HP_TYPE_MEMORY      2
+#define RTAS_LOG_V6_HP_TYPE_SLOT        3
+#define RTAS_LOG_V6_HP_TYPE_PHB         4
+#define RTAS_LOG_V6_HP_TYPE_PCI         5
+    uint8_t hotplug_type;               /* type of resource/device */
+#define RTAS_LOG_V6_HP_ACTION_ADD       1
+#define RTAS_LOG_V6_HP_ACTION_REMOVE    2
+    uint8_t hotplug_action;             /* action (add/remove) */
+#define RTAS_LOG_V6_HP_ID_DRC_NAME      1
+#define RTAS_LOG_V6_HP_ID_DRC_INDEX     2
+#define RTAS_LOG_V6_HP_ID_DRC_COUNT     3
+    uint8_t hotplug_identifier;         /* type of the resource identifier,
+                                         * which serves as the discriminator
+                                         * for the 'drc' union field below
+                                         */
+    uint8_t reserved;
+    union {
+        uint32_t index;                 /* DRC index of resource to take action
+                                         * on
+                                         */
+        uint32_t count;                 /* number of DR resources to take
+                                         * action on (guest chooses which)
+                                         */
+        char name[1];                   /* string representing the name of the
+                                         * DRC to take action on
+                                         */
+    } drc;
+} QEMU_PACKED;
+
+== ibm,lrdr-capacity ==
+
+ibm,lrdr-capacity is a property in the /rtas device tree node that identifies
+the dynamic reconfiguration capabilities of the guest. It is a triple
+consisting of <phys>, <size> and <maxcpus>.
+
+  <phys>, encoded in BE format, represents the maximum address in bytes and
+  hence the maximum memory that can be allocated to the guest.
+
+  <size>, encoded in BE format, represents the size increments in which
+  memory can be hot-plugged to the guest.
+
+  <maxcpus>, a BE-encoded integer, represents the maximum number of
+  processors that the guest can have.
+
+pseries guests use this property to note the maximum allowed CPUs for the
+guest.
+
+[1] http://thread.gmane.org/gmane.linux.ports.ppc.embedded/75350/focus=106867
diff --git a/docs/specs/rocker.txt b/docs/specs/rocker.txt
new file mode 100644
index 000000000..1c743515c
--- /dev/null
+++ b/docs/specs/rocker.txt
@@ -0,0 +1,1014 @@
+Rocker Network Switch Register Programming Guide
+Copyright (c) Scott Feldman <sfeldma@gmail.com>
+Copyright (c) Neil Horman <nhorman@tuxdriver.com>
+Version 0.11, 12/29/2014
+
+LICENSE
+=======
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+SECTION 1: Introduction
+=======================
+
+Overview
+--------
+
+This document describes the hardware/software interface for the Rocker switch
+device.  The intended audience is authors of OS drivers and device emulation
+software.
+
+Notations and Conventions
+-------------------------
+
+o In register descriptions, [n:m] indicates a range from bit n to bit m,
+inclusive.
+o Use of leading 0x indicates a hexadecimal number.
+o Use of leading 0b indicates a binary number.
+o The use of RSVD or Reserved indicates that a bit or field is reserved for
+future use.
+o Field width is in bytes, unless otherwise noted.
+o Registers are (R) read-only, (R/W) read/write, (W) write-only, or (COR) clear
+on read
+o TLV values in network-byte-order are designated with (N).
+
+
+SECTION 2: PCI Configuration Registers
+======================================
+
+PCI Configuration Space
+-----------------------
+
+Each switch instance registers as a PCI device with PCI configuration space:
+
+	offset	width	description		value
+	---------------------------------------------
+	0x0	2	Vendor ID		0x1b36
+	0x2	2	Device ID		0x0006
+	0x4	4	Command/Status
+	0x8	1	Revision ID		0x01
+	0x9	3	Class code		0x2800
+	0xC	1	Cache line size
+	0xD	1	Latency timer
+	0xE	1	Header type
+	0xF	1	Built-in self test
+	0x10	4	Base address low
+	0x14	4	Base address high
+	0x18-28		Reserved
+	0x2C	2	Subsystem vendor ID	*
+	0x2E	2	Subsystem ID		*
+	0x30-38		Reserved
+	0x3C	1	Interrupt line
+	0x3D	1	Interrupt pin		0x00
+	0x3E	1	Min grant		0x00
+	0x3F	1	Max latency		0x00
+	0x40	1	TRDY timeout
+	0x41	1	Retry count
+	0x42	2	Reserved
+
+
+* Assigned by sub-system implementation
+
+SECTION 3: Memory-Mapped Register Space
+=======================================
+
+There are two memory-mapped BARs.  BAR0 maps device register space and is
+0x2000 in size.  BAR1 maps MSI-X vector and PBA tables and is also 0x2000 in
+size, allowing for 256 MSI-X vectors.
+
+All registers are 4 or 8 bytes long.  It is assumed host software will access 4
+byte registers with one 4-byte access, and 8 byte registers with either two
+4-byte accesses or a single 8-byte access.  In the case of two 4-byte accesses,
+access must be lower and then upper 4-bytes, in that order.
+
+BAR0 device register space is organized as follows:
+
+	offset		description
+	------------------------------------------------------
+	0x0000-0x000f	Bogus registers to catch misbehaving
+			drivers.  Writes do nothing.  Reads
+			back as 0xDEADBABE.
+	0x0010-0x00ff	Test registers
+	0x0300-0x03ff	General purpose registers
+	0x1000-0x1fff	Descriptor control
+
+Holes in register space are reserved.  Writes to reserved registers do nothing.
+Reads to reserved registers read back as 0.
+
+No fancy stuff like write-combining is enabled on any of the registers.
+
+BAR1 MSI-X register space is organized as follows:
+
+	offset		description
+	------------------------------------------------------
+	0x0000-0x0fff	MSI-X vector table (256 vectors total)
+	0x1000-0x1fff	MSI-X PBA table
+
+
+SECTION 4: Interrupts, DMA, and Endianness
+==========================================
+
+PCI Interrupts
+--------------
+
+The device supports only MSI-X interrupts.  BAR1 memory-mapped region contains
+the MSI-X vector and PBA tables, with support for up to 256 MSI-X vectors.
+
+The vector assignment is:
+
+	vector		description
+	-----------------------------------------------------
+	0		Command descriptor ring completion
+	1		Event descriptor ring completion
+	2		Test operation completion
+	3		RSVD
+	4-255		Tx and Rx descriptor ring completion
+			  Tx vector is even
+			  Rx vector is odd
+
+A MSI-X vector table entry is 16 bytes:
+
+	field		offset	width	description
+	-------------------------------------------------------------
+	lower_addr	0x0	4	[31:2] message address[31:2]
+					[1:0] Rsvd (4 byte alignment
+						    required)
+	upper_addr	0x4	4	[31:15] Rsvd
+					[14:0] message address[46:32]
+	data		0x8	4	message data[31:0]
+	control		0xc	4	[31:1] Rsvd
+					[0] mask (0 = enable,
+						  1 = masked)
+
+Software should install the Interrupt Service Routine (ISR) before any ports
+are enabled or any commands are issued on the command ring.
+
+DMA Operations
+--------------
+
+DMA operations are used for packet DMA to/from the CPU, command and event
+processing.  Command processing includes statistical counters and table dumps,
+table insertion/deletion, and more.  Event processing provides an async
+notification method for device-originating events.  Each DMA operation has a
+set of control registers to manage a descriptor ring.  The descriptor rings are
+allocated from contiguous host DMA-able memory and registers specify the ring's
+base address, size and current head and tail indices.  Software always writes
+the head, and hardware always writes the tail.
+
+The higher-order bit of DMA_DESC_COMP_ERR is used to mark hardware completion
+of a descriptor.  Software will clear this bit when posting a descriptor to the
+ring, and hardware will set this bit when the descriptor is complete.
+
+Descriptor ring sizes must be a power of 2 and range from 2 to 64K entries.
+Descriptor rings' base address must be 8-byte aligned.  Descriptors must be
+packed within ring.  Each descriptor in each ring must also be aligned on an 8
+byte boundary.  Each descriptor ring will have these registers:
+
+	DMA_DESC_xxx_BASE_ADDR, offset 0x1000 + (x * 32), 64-bit, (R/W)
+	DMA_DESC_xxx_SIZE, offset 0x1008 + (x * 32), 32-bit, (R/W)
+	DMA_DESC_xxx_HEAD, offset 0x100c + (x * 32), 32-bit, (R/W)
+	DMA_DESC_xxx_TAIL, offset 0x1010 + (x * 32), 32-bit, (R)
+	DMA_DESC_xxx_CTRL, offset 0x1014 + (x * 32), 32-bit, (W)
+	DMA_DESC_xxx_CREDITS, offset 0x1018 + (x * 32), 32-bit, (R/W)
+	DMA_DESC_xxx_RSVD1, offset 0x101c + (x * 32), 32-bit, (R/W)
+
+Where x is descriptor ring index:
+
+	index		ring
+	--------------------
+	0		CMD
+	1		EVENT
+	2		TX (port 0)
+	3		RX (port 0)
+	4		TX (port 1)
+	5		RX (port 1)
+	.
+	.
+	.
+	124		TX (port 61)
+	125		RX (port 61)
+	126		Resv
+	127		Resv
+
+Writing BASE_ADDR or SIZE will reset HEAD and TAIL to zero.  HEAD cannot be
+written past TAIL.  To do so would wrap the ring.  An empty ring is when HEAD
+== TAIL.  A full ring is when HEAD is one position behind TAIL.  Both HEAD and
+TAIL increment and modulo wrap at the ring size.
+
+CTRL register bits:
+
+	bit	name		description
+	------------------------------------------------------------------------
+	[0]	CTRL_RESET	Reset the descriptor ring
+	[31:1]	Reserved
+
+All descriptor types share some common fields:
+
+	field			width	description
+	-------------------------------------------------------------------
+	DMA_DESC_BUF_ADDR	8	Phys addr of desc payload, 8-byte
+					aligned
+	DMA_DESC_COOKIE		8	Desc cookie for completion matching,
+					upper-most bit is reserved
+	DMA_DESC_BUF_SIZE	2	Desc payload size in bytes
+	DMA_DESC_TLV_SIZE	2	Desc payload total size in bytes
+					used for TLVs.  Must be <=
+					DMA_DESC_BUF_SIZE.
+	DMA_DESC_COMP_ERR	2	Completion status of associated
+					desc payload.  High order bit is
+					clear on new descs, toggled by
+					hw for completed items.
+
+To support forward- and backward-compatibility, descriptor and completion
+payloads are specified in TLV format.  Fields are packed with Type=field name,
+Length=field length, and Value=field value.  Software will ignore unknown fields
+filled in by the switch.  Likewise, the switch will ignore unknown fields
+filled in by software.
+
+Descriptor payload buffer is 8-byte aligned and TLVs are 8-byte aligned.  The
+value within a TLV is also 8-byte aligned.  The (packed, 8 byte) TLV header is:
+
+	field	width	description
+	-----------------------------
+	type	4	TLV type
+	len	2	TLV value length
+	pad	2	Reserved
+
+The alignment requirements for descriptors and TLVs are to avoid unaligned
+access exceptions in software.  Note that the payload for each TLV is also
+8 byte aligned.
+
+Figure 1 shows an example descriptor buffer with two TLVs.
+
+                  <------- 8 bytes ------->
+
+  8-byte  +––––+  +–––––––––––+–––––+–––––+                     +–+
+  align           |   type    | len | pad |    TLV#1 hdr          |
+                  +–––––––––––+–––––+–––––+    (len=22)           |
+                  |                       |                       |
+                  |  value                |    TLV#1 value        |
+                  |                       |    (padded to 8-byte  |
+                  |                 +–––––+     alignment)        |
+                  |                 |/////|                       |
+   8-byte +––––+  +–––––––––––+–––––––––––+                       |
+   align          |   type    | len | pad |    TLV#2 hdr    DESC_BUF_SIZE
+                  +–––––+–––––+–––––+–––––+    (len=2)            |
+                  |value|/////////////////|    TLV#2 value        |
+                  +–––––+/////////////////|                       |
+                  |///////////////////////|                       |
+                  |///////////////////////|                       |
+                  |///////////////////////|                       |
+                  |////////unused/////////|                       |
+                  |////////space//////////|                       |
+                  |///////////////////////|                       |
+                  |///////////////////////|                       |
+                  |///////////////////////|                       |
+                  +–––––––––––––––––––––––+                     +–+
+
+				fig. 1
+
+TLVs can be nested within the NEST TLV type.
+
+Interrupt credits
+^^^^^^^^^^^^^^^^^
+
+MSI-X vectors used for descriptor ring completions use a credit mechanism for
+efficient device, PCIe bus, OS and driver operations.  Each descriptor ring has
+a credit count which represents the number of outstanding descriptors to be
+processed by the driver.  As the device marks descriptors complete, the credit
+count is incremented.  As the driver processes those outstanding descriptors,
+it returns credits back to the device.  This way, the device knows the driver's
+progress and can make decisions about when to fire the next interrupt or not.
+When the credit count is zero, and the first descriptors are posted for the
+driver, a single interrupt is fired.  Once the interrupt is fired, the
+interrupt is disabled (auto-masked*).  In response to the interrupt, the driver
+will process descriptors and PIO write a returned credit value for that
+descriptor ring.  If the driver returns all credits (the driver caught up with
+the device and there is no outstanding work), then the interrupt is unmasked,
+but not fired.  If only partial credits are returned, the interrupt remains
+masked but the device generates an interrupt, signaling the driver that more
+outstanding work is available.
+
+(* this masking is unrelated to the MSI-X interrupt mask register)
+
+Endianness
+----------
+
+Device registers are hard-coded to little-endian (LE).  The driver should
+convert to/from host endianness to LE for device register accesses.
+
+Descriptors are LE.  Descriptor buffer TLVs will have LE type and length
+fields, but the value field can either be LE or network-byte-order, depending
+on context.  TLV values containing network packet data will be in network-byte
+order.  A TLV value containing a field or mask used to compare against network
+packet data is network-byte order.  For example, flow match fields (and masks)
+are network-byte-order since they're matched directly, byte-by-byte, against
+network packet data.  All non-network-packet TLV multi-byte values will be LE.
+
+TLV values in network-byte-order are designated with (N).
+
+
+SECTION 5: Test Registers
+=========================
+
+Rocker has several test registers to support troubleshooting register access,
+interrupt generation, and DMA operations:
+
+	TEST_REG, offset 0x0010, 32-bit (R/W)
+	TEST_REG64, offset 0x0018, 64-bit (R/W)
+	TEST_IRQ, offset 0x0020, 32-bit (R/W)
+	TEST_DMA_ADDR, offset 0x0028, 64-bit (R/W)
+	TEST_DMA_SIZE, offset 0x0030, 32-bit (R/W)
+	TEST_DMA_CTRL, offset 0x0034, 32-bit (R/W)
+
+Reads to TEST_REG and TEST_REG64 will read a value equal to twice the last
+value written to the register.  The 32-bit and 64-bit versions are for testing
+32-bit and 64-bit host accesses.
+
+A vector can be written to TEST_IRQ and the device will generate an interrupt
+for that vector.
+
+To test basic DMA operations, allocate a DMA-able host buffer and put the
+buffer address into TEST_DMA_ADDR and size into TEST_DMA_SIZE.  Then, write to
+TEST_DMA_CTRL to manipulate the buffer contents.  TEST_DMA_CTRL operations are:
+
+	operation		value	description
+	-----------------------------------------------------------
+	TEST_DMA_CTRL_CLEAR	1	clear buffer
+	TEST_DMA_CTRL_FILL	2	fill buffer bytes with 0x96
+	TEST_DMA_CTRL_INVERT	4	invert bytes in buffer
+
+Various buffer addresses and sizes should be tested to verify no address boundary
+issue exists.  In particular, buffers that start on odd-8-byte boundary and/or
+span multiple PAGE sizes should be tested.
+
+
+SECTION 6: Ports
+================
+
+Physical and Logical Ports
+------------------------------------
+
+The switch supports up to 62 physical (front-panel) ports.  Register
+PORT_PHYS_COUNT returns the actual number of physical ports available:
+
+	PORT_PHYS_COUNT, offset 0x0304, 32-bit, (R)
+
+In addition to front-panel ports, the switch supports logical ports for
+tunnels.
+
+Front-panel ports and logical tunnel ports are mapped into a single 32-bit port
+space.  A special CPU port is assigned port 0.  The front-panel ports are
+mapped to ports 1-62.  A special loopback port is assigned port 63.  Logical
+tunnel ports are assigned ports 0x00010000-0x0001ffff.
+To summarize the port assignments:
+
+	port			mapping
+	-------------------------------------------------------
+	0			CPU port (for packets to/from host CPU)
+	1-62			front-panel physical ports
+	63			loopback port
+	64-0x0000ffff		RSVD
+	0x00010000-0x0001ffff	logical tunnel ports
+	0x00020000-0xffffffff	RSVD
+
+Physical Port Mode
+------------------
+
+Switch front-panel ports operate in a mode.  Currently, the only mode is
+OF-DPA.  OF-DPA[1] mode is based on OpenFlow Data Plane Abstraction (OF-DPA)
+Abstract Switch Specification, Version 1.0, from Broadcom Corporation.  To
+set/get the mode for front-panel ports, see port settings, below.
+
+Port Settings
+-------------
+
+Link status for all front-panel ports is available via PORT_PHYS_LINK_STATUS:
+
+	PORT_PHYS_LINK_STATUS, offset 0x0310, 64-bit, (R)
+
+	Value is port bitmap.  Bits 0 and 63 always read 0.  Bits 1-62
+	read 1 for link UP and 0 for link DOWN for respective front-panel ports.
+
+Other properties for front-panel ports are available via DMA CMD descriptors:
+
+	Get PORT_SETTINGS descriptor:
+
+		field		width	description
+		----------------------------------------------
+		PORT_SETTINGS	2	CMD_GET
+		PPORT		4	Physical port #
+
+	Get PORT_SETTINGS completion:
+
+		field		width	description
+		----------------------------------------------
+		PPORT		4	Physical port #
+		SPEED		4	Current port interface speed, in Mbps
+		DUPLEX		1	1 = Full, 0 = Half
+		AUTONEG		1	1 = enabled, 0 = disabled
+		MACADDR		6	Port MAC address
+		MODE		1	0 = OF-DPA
+		LEARNING	1	MAC address learning on port
+						1 = enabled
+						0 = disabled
+		PHYS_NAME	<var>	Physical port name (string)
+
+	Set PORT_SETTINGS descriptor:
+
+		field		width	description
+		----------------------------------------------
+		PORT_SETTINGS	2	CMD_SET
+		PPORT		4	Physical port #
+		SPEED		4	Port interface speed, in Mbps
+		DUPLEX		1	1 = Full, 0 = Half
+		AUTONEG		1	1 = enabled, 0 = disabled
+		MACADDR		6	Port MAC address
+		MODE		1	0 = OF-DPA
+
+Port Enable
+-----------
+
+Front-panel ports are initially disabled, which means port ingress and egress
+packets will be dropped.  To enable or disable a port, use PORT_PHYS_ENABLE:
+
+	PORT_PHYS_ENABLE: offset 0x0318, 64-bit, (R/W)
+
+	Value is bitmap of first 64 ports.  Bits 0 and 63 are ignored
+	and always read as 0.  Write 1 to enable port; write 0 to disable it.
+	Default is 0.
+
+
+SECTION 7: Switch Control
+=========================
+
+This section covers switch-wide register settings.
+
+Control
+-------
+
+This register is used for low level control of the switch.
+
+	CONTROL: offset 0x0300, 32-bit, (W)
+
+	bit	name		description
+	------------------------------------------------------------------------
+	[0]	CONTROL_RESET	If set, device will perform reset
+	[31:1]	Reserved
+
+Switch ID
+---------
+
+The switch has a SWITCH_ID to be used by software to uniquely identify the
+switch:
+
+	SWITCH_ID: offset 0x0320, 64-bit, (R)
+
+	Value is opaque to switch software and no special encoding is implied.
+
+
+SECTION 8: Events
+=================
+
+Non-I/O asynchronous events from the device are notified to the host using the
+event ring.  The TLV structure for events is:
+
+	field		width	description
+	---------------------------------------------------
+	TYPE		4	Event type, one of:
+					1: LINK_CHANGED
+					2: MAC_VLAN_SEEN
+	INFO		<nest>	Event info (details below)
+
+Link Changed Event
+------------------
+
+When link status changes on a physical port, this event is generated.
+
+	field		width	description
+	---------------------------------------------------
+	INFO		<nest>
+	  PPORT		4	Physical port
+	  LINKUP	1	Link status:
+					0: down
+					1: up
+
+MAC VLAN Seen Event
+-------------------
+
+When a packet ingresses on a port and the source MAC/VLAN isn't known to the
+device, the device will generate this event.  In response to the event, the
+driver should install to the device the MAC/VLAN on the port into the bridge
+table.  Once installed, the MAC/VLAN is known on the port and this event will
+no longer be generated.
+
+	field		width	description
+	---------------------------------------------------
+	INFO		<nest>
+	  PPORT		4	Physical port
+	  MAC		6	MAC address
+	  VLAN		2	VLAN ID
+
+
+SECTION 9: CPU Packet Processing
+================================
+
+Ingress packets directed to the host CPU for further processing are delivered
+in the DMA RX ring.  Likewise, host CPU originating packets destined to egress
+on switch ports are scheduled by software using the DMA TX ring.
+
+Tx Packet Processing
+--------------------
+
+Software schedules packets for egress on switch ports using the DMA TX ring.  A
+TX descriptor buffer describes the packet location and size in host DMA-able
+memory, the destination port, and any hardware-offload functions (such as L3
+payload checksum offload).  Software then bumps the descriptor head to signal
+hardware of new Tx work.  In response, hardware will DMA read Tx descriptors up
+to head, DMA read descriptor buffer and packet data, perform offloading
+functions, and finally frame packet on wire (network).  Once packet processing
+is complete, hardware will writeback status to descriptor(s) to signal to
+software that Tx is complete and software resources (e.g. skb) backing packet
+can be released.
+
+Figure 2 shows an example 3-fragment packet queued with one Tx descriptor.  A
+TLV is used for each packet fragment.
+
+	                                           pkt frag 1
+	                                           +–––––––+  +–+
+	                                       +–––+       |    |
+	                         desc buf      |   |       |    |
+	                        +––––––––+     |   |       |    |
+	        Tx ring     +–––+        +–––––+   |       |    |
+	      +–––––––––+   |   |  TLVs  |         +–––––––+    |
+	      |         +–––+   +––––––––+         pkt frag 2   |
+	      | desc 0  |       |        +–––––+   +–––––––+    |
+	      +–––––––––+       |  TLVs  |     +–––+       |    |
+	head+–+         |       +––––––––+         |       |    |
+	      | desc 1  |       |        +–––––+   +–––––––+    |pkt
+	      +–––––––––+       |  TLVs  |     |                |
+	      |         |       +––––––––+     |   pkt frag 3   |
+	      |         |                      |   +–––––––+    |
+	      +–––––––––+                      +–––+       |    |
+	      |         |                          |       |    |
+	      |         |                          |       |    |
+	      +–––––––––+                          |       |    |
+	      |         |                          |       |    |
+	      |         |                          |       |    |
+	      +–––––––––+                          |       |    |
+	      |         |                          +–––––––+  +–+
+	      |         |
+	      +–––––––––+
+
+				fig 2.
+
+The TLVs for Tx descriptor buffer are:
+
+	field			width	description
+	---------------------------------------------------------------------
+	PPORT			4	Destination physical port #
+	TX_OFFLOAD		1	Hardware offload modes:
+					  0: no offload
+					  1: insert IP csum (ipv4 only)
+					  2: insert TCP/UDP csum
+					  3: L3 csum calc and insert
+                        	             into csum offset (TX_L3_CSUM_OFF)
+                 	                    16-bit 1's complement csum value.
+                                	     IPv4 pseudo-header and IP
+                        	             already calculated by OS
+                  	                   and inserted.
+					  4: TSO (TCP Segmentation Offload)
+	TX_L3_CSUM_OFF		2	For L3 csum offload mode, the offset,
+					from the beginning of the packet,
+					of the csum field in the L3 header
+	TX_TSO_MSS		2	For TSO offload mode, the
+					Maximum Segment Size in bytes
+        TX_TSO_HDR_LEN		2	For TSO offload mode, the
+					length of ethernet, IP, and
+					TCP/UDP headers, including IP
+					and TCP options.
+	TX_FRAGS		<array>	Packet fragments
+	  TX_FRAG		<nest>	Packet fragment
+	    TX_FRAG_ADDR	8	DMA address of packet fragment
+	    TX_FRAG_LEN		2	Packet fragment length
+
+Possible status return codes in descriptor on completion are:
+
+	DESC_COMP_ERR	reason
+	--------------------------------------------------------------------
+	0		OK
+	-ROCKER_ENXIO	address or data read err on desc buf or packet
+			fragment
+	-ROCKER_EINVAL	bad pport or TSO or csum offloading error
+	-ROCKER_ENOMEM	no memory for internal staging tx fragment
+
+Rx Packet Processing
+--------------------
+
+Packets ingressing on switch ports that are not forwarded by the switch but
+rather directed to the host CPU for further processing are delivered in the DMA
+RX ring.  Rx descriptor buffers are allocated by software and placed on the
+ring.  Hardware will fill Rx descriptor buffers with packet data, write the
+completion, and signal to software that a new packet is ready.  Since Rx packet
+size is not known a-priori, the Rx descriptor buffer must be allocated for
+worst-case packet size.  A single Rx descriptor will contain the entire Rx
+packet data in one RX_FRAG.  Other Rx TLVs describe any hardware offloads
+performed on the packet, such as checksum validation.
+
+The TLVs for Rx descriptor buffer are:
+
+	field		width	description
+	---------------------------------------------------
+	PPORT		4	Source physical port #
+	RX_FLAGS	2	Packet parsing flags:
+				  (1 << 0): IPv4 packet
+				  (1 << 1): IPv6 packet
+				  (1 << 2): csum calculated
+				  (1 << 3): IPv4 csum good
+				  (1 << 4): IP fragment
+				  (1 << 5): TCP packet
+				  (1 << 6): UDP packet
+				  (1 << 7): TCP/UDP csum good
+				  (1 << 8): Offload forward
+	RX_CSUM		2	IP calculated checksum:
+				  IPv4: IP payload csum
+				  IPv6: header and payload csum
+				(Only valid if RX_FLAGS:csum calc is set)
+	RX_FRAG_ADDR	8	DMA address of packet fragment
+	RX_FRAG_MAX_LEN	2	Packet maximum fragment length
+	RX_FRAG_LEN	2	Actual packet fragment length after receive
+
+Offload forward RX_FLAG indicates the device has already forwarded the packet
+so the host CPU should not also forward the packet.
+
+Possible status return codes in descriptor on completion are:
+
+	DESC_COMP_ERR	reason
+	--------------------------------------------------------------------
+	0		OK
+	-ROCKER_ENXIO	address or data read err on desc buf
+	-ROCKER_ENOMEM	no memory for internal staging desc buf
+	-ROCKER_EMSGSIZE Rx descriptor buffer wasn't big enough to contain
+			packet data TLV and other TLVs.
+
+
+SECTION 10: OF-DPA Mode
+======================
+
+OF-DPA mode allows the switch to offload flow packet processing functions to
+hardware.  An OpenFlow controller would communicate with an OpenFlow agent
+installed on the switch.  The OpenFlow agent would (directly or indirectly)
+communicate with the Rocker switch driver, which in turn would program switch
+hardware with flow functionality, as defined in OF-DPA.  The block diagram is:
+
+		+–––––––––––––––----–––+
+		|        OF            |
+		|  Remote Controller   |
+		+––––––––+––----–––––––+
+		         |
+		         |
+		+––––––––+–––––––––+
+		|       OF         |
+		|   Local Agent    |
+		+––––––––––––––––––+
+		|                  |
+		|   Rocker Driver  |
+		+––––––––––––––––––+
+		    <this spec>
+		+––––––––––––––––––+
+		|                  |
+		|   Rocker Switch  |
+		+––––––––––––––––––+
+
+To participate in flow functions, ports must be configured for OF-DPA mode
+during switch initialization.
+
+OF-DPA Flow Table Interface
+---------------------------
+
+There are commands to add, modify, delete, and get stats of flow table entries.
+The commands are issued using the DMA CMD descriptor ring.  The following
+commands are defined:
+
+	CMD_ADD:		add an entry to flow table
+	CMD_MOD:		modify an entry in flow table
+	CMD_DEL:		delete an entry from flow table
+	CMD_GET_STATS:		get stats for flow entry
+
+TLVs for add and modify commands are:
+
+	field			width	description
+	----------------------------------------------------
+	OF_DPA_CMD		2	CMD_[ADD|MOD]
+	OF_DPA_TBL		2	Flow table ID
+					  0: ingress port
+					  10: vlan
+					  20: termination mac
+					  30: unicast routing
+					  40: multicast routing
+					  50: bridging
+					  60: ACL policy
+	OF_DPA_PRIORITY		4	Flow priority
+	OF_DPA_HARDTIME		4	Hard timeout for flow
+	OF_DPA_IDLETIME		4	Idle timeout for flow
+	OF_DPA_COOKIE		8	Cookie
+
+Additional TLVs based on flow table ID:
+
+Table ID 0: ingress port
+
+	field			width	description
+	----------------------------------------------------
+	OF_DPA_IN_PPORT		4	ingress physical port number
+	OF_DPA_GOTO_TBL		2	goto table ID; zero to drop
+
+Table ID 10: vlan
+
+	field			width	description
+	----------------------------------------------------
+	OF_DPA_IN_PPORT		4	ingress physical port number
+	OF_DPA_VLAN_ID		2 (N)	vlan ID
+	OF_DPA_VLAN_ID_MASK	2 (N)	vlan ID mask
+	OF_DPA_GOTO_TBL		2	goto table ID; zero to drop
+	OF_DPA_NEW_VLAN_ID	2 (N)	new vlan ID
+
+Table ID 20: termination mac
+
+	field			width	description
+	----------------------------------------------------
+	OF_DPA_IN_PPORT		4	ingress physical port number
+	OF_DPA_IN_PPORT_MASK	4	ingress physical port number mask
+	OF_DPA_ETHERTYPE	2 (N)	must be either 0x0800 or 0x86dd
+	OF_DPA_DST_MAC		6 (N)	destination MAC
+	OF_DPA_DST_MAC_MASK	6 (N)	destination MAC mask
+	OF_DPA_VLAN_ID		2 (N)	vlan ID
+	OF_DPA_VLAN_ID_MASK	2 (N)	vlan ID mask
+	OF_DPA_GOTO_TBL		2	only acceptable values are
+					unicast or multicast routing
+					table IDs
+	OF_DPA_OUT_PPORT	2	if specified, must be
+					controller, set zero otherwise
+
+Table ID 30: unicast routing
+
+	field			width	description
+	----------------------------------------------------
+	OF_DPA_ETHERTYPE	2 (N)	must be either 0x0800 or 0x86dd
+	OF_DPA_DST_IP		4 (N)	destination IPv4 address.
+					Must be unicast address
+	OF_DPA_DST_IP_MASK	4 (N)	IP mask.  Must be prefix mask
+	OF_DPA_DST_IPV6		16 (N)	destination IPv6 address.
+					Must be unicast address
+	OF_DPA_DST_IPV6_MASK	16 (N)	IPv6 mask. Must be prefix mask
+	OF_DPA_GOTO_TBL		2	goto table ID; zero to drop
+	OF_DPA_GROUP_ID		4	data for GROUP action must
+					be an L3 Unicast group entry
+
+Table ID 40: multicast routing
+
+	field			width	description
+	----------------------------------------------------
+	OF_DPA_ETHERTYPE	2 (N)	must be either 0x0800 or 0x86dd
+	OF_DPA_VLAN_ID		2 (N)	vlan ID
+	OF_DPA_SRC_IP		4 (N)	source IPv4. Optional,
+					can contain IPv4 address,
+					must be completely masked
+					if not used
+	OF_DPA_SRC_IP_MASK	4 (N)	IP Mask
+	OF_DPA_DST_IP		4 (N)	destination IPv4 address.
+					Must be multicast address
+	OF_DPA_SRC_IPV6		16 (N)	source IPv6 Address. Optional.
+					Can contain IPv6 address,
+					must be completely masked
+					if not used
+	OF_DPA_SRC_IPV6_MASK	16 (N)	IPv6 mask.
+	OF_DPA_DST_IPV6		16 (N)	destination IPv6 Address. Must
+					be multicast address
+	OF_DPA_GOTO_TBL		2	goto table ID; zero to drop
+	OF_DPA_GROUP_ID		4	data for GROUP action must
+					be an L3 multicast group entry
+
+Table ID 50: bridging
+
+	field			width	description
+	----------------------------------------------------
+	OF_DPA_VLAN_ID		2 (N)	vlan ID
+	OF_DPA_TUNNEL_ID	4	tunnel ID
+	OF_DPA_DST_MAC		6 (N)	destination MAC
+	OF_DPA_DST_MAC_MASK	6 (N)	destination MAC mask
+	OF_DPA_GOTO_TBL		2	goto table ID; zero to drop
+	OF_DPA_GROUP_ID		4	data for GROUP action must
+					be an L2 Interface, L2
+					Multicast, L2 Flood,
+					or L2 Overlay group entry
+					as appropriate
+	OF_DPA_TUNNEL_LPORT	4	unicast Tenant Bridging
+					flows specify a tunnel
+					logical port ID
+	OF_DPA_OUT_PPORT	2	data for OUTPUT action,
+					restricted to CONTROLLER,
+					set to 0 otherwise
+
+Table ID 60: acl policy
+
+	field			width	description
+	----------------------------------------------------
+	OF_DPA_IN_PPORT		4	ingress physical port number
+	OF_DPA_IN_PPORT_MASK	4	ingress physical port number mask
+	OF_DPA_ETHERTYPE	2 (N)	ethertype
+	OF_DPA_VLAN_ID		2 (N)	vlan ID
+	OF_DPA_VLAN_ID_MASK	2 (N)	vlan ID mask
+	OF_DPA_VLAN_PCP		2 (N)	vlan Priority Code Point
+	OF_DPA_VLAN_PCP_MASK	2 (N)	vlan Priority Code Point mask
+	OF_DPA_SRC_MAC		6 (N)	source MAC
+	OF_DPA_SRC_MAC_MASK	6 (N)	source MAC mask
+	OF_DPA_DST_MAC		6 (N)	destination MAC
+	OF_DPA_DST_MAC_MASK	6 (N)	destination MAC mask
+	OF_DPA_TUNNEL_ID	4	tunnel ID
+	OF_DPA_SRC_IP		4 (N)	source IPv4. Optional,
+					can contain IPv4 address,
+					must be completely masked
+					if not used
+	OF_DPA_SRC_IP_MASK	4 (N)	IP Mask
+	OF_DPA_DST_IP		4 (N)	destination IPv4 address.
+					Must be multicast address
+	OF_DPA_DST_IP_MASK	4 (N)	IP Mask
+	OF_DPA_SRC_IPV6		16 (N)	source IPv6 Address. Optional.
+					Can contain IPv6 address,
+					must be completely masked
+					if not used
+	OF_DPA_SRC_IPV6_MASK	16 (N)	IPv6 mask
+	OF_DPA_DST_IPV6		16 (N)	destination IPv6 Address. Must
+					be multicast address.
+	OF_DPA_DST_IPV6_MASK	16 (N)	IPv6 mask
+	OF_DPA_SRC_ARP_IP	4 (N)	source IPv4 address in the ARP
+					payload.  Only used if ethertype
+					== 0x0806.
+	OF_DPA_SRC_ARP_IP_MASK	4 (N)	IP Mask
+	OF_DPA_IP_PROTO		1	IP protocol
+	OF_DPA_IP_PROTO_MASK	1	IP protocol mask
+	OF_DPA_IP_DSCP		1	DSCP
+	OF_DPA_IP_DSCP_MASK	1	DSCP mask
+	OF_DPA_IP_ECN		1	ECN
+	OF_DPA_IP_ECN_MASK	1	ECN mask
+	OF_DPA_L4_SRC_PORT	2 (N)	L4 source port, only for
+					TCP, UDP, or SCTP
+	OF_DPA_L4_SRC_PORT_MASK	2 (N)	L4 source port mask
+	OF_DPA_L4_DST_PORT	2 (N)	L4 destination port, only for
+					TCP, UDP, or SCTP
+	OF_DPA_L4_DST_PORT_MASK	2 (N)	L4 destination port mask
+	OF_DPA_ICMP_TYPE	1	ICMP type, only if IP
+					protocol is 1
+	OF_DPA_ICMP_TYPE_MASK	1	ICMP type mask
+	OF_DPA_ICMP_CODE	1	ICMP code
+	OF_DPA_ICMP_CODE_MASK	1	ICMP code mask
+	OF_DPA_IPV6_LABEL	4 (N)	IPv6 flow label
+	OF_DPA_IPV6_LABEL_MASK	4 (N)	IPv6 flow label mask
+	OF_DPA_GROUP_ID		4	data for GROUP action
+	OF_DPA_QUEUE_ID_ACTION	1	write the queue ID
+	OF_DPA_NEW_QUEUE_ID	1	queue ID
+	OF_DPA_VLAN_PCP_ACTION	1	write the VLAN priority
+	OF_DPA_NEW_VLAN_PCP	1	VLAN priority
+	OF_DPA_IP_DSCP_ACTION	1	write the DSCP
+	OF_DPA_NEW_IP_DSCP	1	new DSCP
+	OF_DPA_TUNNEL_LPORT	4	restrict to valid tunnel
+					logical port, set to 0
+					otherwise.
+	OF_DPA_OUT_PPORT	2	data for OUTPUT action,
+					restricted to CONTROLLER,
+					set to 0 otherwise
+	OF_DPA_CLEAR_ACTIONS	4	if 1 packets matching flow are
+					dropped (all other instructions
+					ignored)
+
+TLVs for flow delete and get stats commands are:
+
+	field			width	description
+	---------------------------------------------------
+	OF_DPA_CMD		2	CMD_[DEL|GET_STATS]
+	OF_DPA_COOKIE		8	Cookie
+
+On completion of get stats command, the descriptor buffer is written back with
+the following TLVs:
+
+	field			width	description
+	---------------------------------------------------
+	OF_DPA_STAT_DURATION	4	Flow duration
+	OF_DPA_STAT_RX_PKTS	8	Received packets
+	OF_DPA_STAT_TX_PKTS	8	Transmit packets
+
+Possible status return codes in descriptor on completion are:
+
+	DESC_COMP_ERR	command			reason
+	--------------------------------------------------------------------
+	0		all			OK
+	-ROCKER_EFAULT	all			head or tail index outside
+						of ring
+	-ROCKER_ENXIO	all			address or data read err on
+						desc buf
+	-ROCKER_EMSGSIZE GET_STATS		cmd descriptor buffer wasn't
+						big enough to contain write-back
+						TLVs
+	-ROCKER_EINVAL	all			invalid parameters passed in
+	-ROCKER_EEXIST	ADD			entry already exists
+	-ROCKER_ENOSPC	ADD			no space left in flow table
+	-ROCKER_ENOENT	MOD|DEL|GET_STATS	cookie invalid
+
+Group Table Interface
+---------------------
+
+There are commands to add, modify, delete, and get stats of group table
+entries.  The commands are issued using the DMA CMD descriptor ring.  The
+following commands are defined:
+
+	CMD_ADD:		add an entry to group table
+	CMD_MOD:		modify an entry in group table
+	CMD_DEL:		delete an entry from group table
+	CMD_GET_STATS:		get stats for group entry
+
+TLVs for add and modify commands are:
+
+	field			width	description
+	-----------------------------------------------------------
+	FLOW_GROUP_CMD		2	CMD_[ADD|MOD]
+	FLOW_GROUP_ID		2	Flow group ID
+	FLOW_GROUP_TYPE		1	Group type:
+					  0: L2 interface
+					  1: L2 rewrite
+					  2: L3 unicast
+					  3: L2 multicast
+					  4: L2 flood
+					  5: L3 interface
+					  6: L3 multicast
+					  7: L3 ECMP
+					  8: L2 overlay
+	FLOW_VLAN_ID		2	Vlan ID (types 0, 3, 4, 6)
+	FLOW_L2_PORT		2	Port (type 0)
+	FLOW_INDEX		4	Index (all types but 0)
+	FLOW_OVERLAY_TYPE	1	Overlay sub-type (type 8):
+					  0: Flood unicast tunnel
+					  1: Flood multicast tunnel
+					  2: Multicast unicast tunnel
+					  3: Multicast multicast tunnel
+	FLOW_GROUP_ACTION		nest
+	  FLOW_GROUP_ID		2	next group ID in chain (all
+					types except 0)
+	  FLOW_OUT_PORT		4	egress port (types 0, 8)
+	  FLOW_POP_VLAN_TAG	1	strip outer VLAN tag (type 1
+					only)
+	  FLOW_VLAN_ID		2	(types 1, 5)
+	  FLOW_SRC_MAC		6	(types 1, 2, 5)
+	  FLOW_DST_MAC		6	(types 1, 2)
+
+TLVs for group delete and get stats commands are:
+
+	field			width	description
+	-----------------------------------------------------------
+	FLOW_GROUP_CMD		2	CMD_[DEL|GET_STATS]
+	FLOW_GROUP_ID		2	Flow group ID
+
+On completion of get stats command, the descriptor buffer is written back with
+the following TLVs:
+
+	field			width	description
+	---------------------------------------------------
+	FLOW_GROUP_ID		2	Flow group ID
+	FLOW_STAT_DURATION	4	Flow duration
+	FLOW_STAT_REF_COUNT	4	Flow reference count
+	FLOW_STAT_BUCKET_COUNT	4	Flow bucket count
+
+Possible status return codes in descriptor on completion are:
+
+	DESC_COMP_ERR	command			reason
+	--------------------------------------------------------------------
+	0		all			OK
+	-ROCKER_EFAULT	all			head or tail index outside
+						of ring
+	-ROCKER_ENXIO	all			address or data read err on
+						desc buf
+	-ROCKER_ENOSPC	GET_STATS		cmd descriptor buffer wasn't
+						big enough to contain write-back
+						TLVs
+	-ROCKER_EINVAL	ADD|MOD			invalid parameters passed in
+	-ROCKER_EEXIST	ADD			entry already exists
+	-ROCKER_ENOSPC	ADD			no space left in group table
+	-ROCKER_ENOENT	MOD|DEL|GET_STATS	group ID invalid
+	-ROCKER_EBUSY	DEL			group reference count non-zero
+	-ROCKER_ENODEV	ADD			next group ID doesn't exist
+
+
+
+References
+==========
+
+[1] OpenFlow Data Plane Abstraction (OF-DPA) Abstract Switch Specification,
+Version 1.0, from Broadcom Corporation, February 21, 2014.
diff --git a/docs/writing-qmp-commands.txt b/docs/writing-qmp-commands.txt
index f3df2066a..ab1fdd36b 100644
--- a/docs/writing-qmp-commands.txt
+++ b/docs/writing-qmp-commands.txt
@@ -598,7 +598,7 @@ stored in its "value" member. In our example, the "value" member is a pointer
 to an TimerAlarmMethod instance.
 
 Notice that the "current" variable is used as "true" only in the first
-interation of the loop. That's because the alarm timer method in use is the
+iteration of the loop. That's because the alarm timer method in use is the
 first element of the alarm_timers array. Also notice that QAPI lists are handled
 by hand and we return the head of the list.
author	Yonghee Han <onstudy@samsung.com>	2016-07-27 16:40:17 +0900
committer	Yonghee Han <onstudy@samsung.com>	2016-07-27 00:53:56 -0700
commit	3158f4a51894e46ecb593bffbfd12824e1d6534a (patch)
tree	2bef7f0238e687c5de65f48b5995ee124a95d157 /docs
parent	a3b133b0ea0696e42fd876b9a803e28bc6ef5299 (diff)
download	qemu-3158f4a51894e46ecb593bffbfd12824e1d6534a.tar.gz qemu-3158f4a51894e46ecb593bffbfd12824e1d6534a.tar.bz2 qemu-3158f4a51894e46ecb593bffbfd12824e1d6534a.zip