summaryrefslogtreecommitdiff
path: root/block
diff options
context:
space:
mode:
authorJens Axboe <axboe@fb.com>2017-02-17 14:06:45 -0700
committerJens Axboe <axboe@fb.com>2017-02-17 14:06:45 -0700
commit6010720da8aab51f33beee63b73cf88016e9b250 (patch)
treea4c5a7f645998e86a1f49cb05f8e0c4e51448294 /block
parent2fe1e8a7b2f4dcac3fcb07ff06b0ae7396201fd6 (diff)
parent8a9ae523282f324989850fcf41312b42a2fb9296 (diff)
downloadlinux-rpi-6010720da8aab51f33beee63b73cf88016e9b250.tar.gz
linux-rpi-6010720da8aab51f33beee63b73cf88016e9b250.tar.bz2
linux-rpi-6010720da8aab51f33beee63b73cf88016e9b250.zip
Merge branch 'for-4.11/block' into for-4.11/linus-merge
Signed-off-by: Jens Axboe <axboe@fb.com>
Diffstat (limited to 'block')
-rw-r--r--block/Kconfig19
-rw-r--r--block/Kconfig.iosched50
-rw-r--r--block/Makefile5
-rw-r--r--block/bio.c6
-rw-r--r--block/blk-cgroup.c22
-rw-r--r--block/blk-core.c32
-rw-r--r--block/blk-exec.c3
-rw-r--r--block/blk-flush.c12
-rw-r--r--block/blk-ioc.c12
-rw-r--r--block/blk-merge.c4
-rw-r--r--block/blk-mq-debugfs.c756
-rw-r--r--block/blk-mq-sched.c481
-rw-r--r--block/blk-mq-sched.h142
-rw-r--r--block/blk-mq-sysfs.c235
-rw-r--r--block/blk-mq-tag.c190
-rw-r--r--block/blk-mq-tag.h10
-rw-r--r--block/blk-mq.c555
-rw-r--r--block/blk-mq.h77
-rw-r--r--block/blk-tag.c1
-rw-r--r--block/blk-throttle.c6
-rw-r--r--block/blk.h26
-rw-r--r--block/cfq-iosched.c10
-rw-r--r--block/deadline-iosched.c2
-rw-r--r--block/elevator.c253
-rw-r--r--block/mq-deadline.c555
-rw-r--r--block/noop-iosched.c2
-rw-r--r--block/opal_proto.h452
-rw-r--r--block/partitions/efi.c17
-rw-r--r--block/sed-opal.c2488
29 files changed, 5782 insertions, 641 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 8bf114a3858a..1aef809affae 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -147,6 +147,25 @@ config BLK_WBT_MQ
Multiqueue currently doesn't have support for IO scheduling,
enabling this option is recommended.
+config BLK_DEBUG_FS
+ bool "Block layer debugging information in debugfs"
+ default y
+ depends on DEBUG_FS
+ ---help---
+ Include block layer debugging information in debugfs. This information
+ is mostly useful for kernel developers, but it doesn't incur any cost
+ at runtime.
+
+ Unless you are building a kernel for a tiny system, you should
+ say Y here.
+
+config BLK_SED_OPAL
+ bool "Logic for interfacing with Opal enabled SEDs"
+ ---help---
+ Builds Logic for interfacing with Opal enabled controllers.
+ Enabling this option enables users to setup/unlock/lock
+ Locking ranges for SED devices using the Opal protocol.
+
menu "Partition Types"
source "block/partitions/Kconfig"
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 421bef9c4c48..0715ce93daef 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -63,6 +63,56 @@ config DEFAULT_IOSCHED
default "cfq" if DEFAULT_CFQ
default "noop" if DEFAULT_NOOP
+config MQ_IOSCHED_DEADLINE
+ tristate "MQ deadline I/O scheduler"
+ default y
+ ---help---
+ MQ version of the deadline IO scheduler.
+
+config MQ_IOSCHED_NONE
+ bool
+ default y
+
+choice
+ prompt "Default single-queue blk-mq I/O scheduler"
+ default DEFAULT_SQ_NONE
+ help
+ Select the I/O scheduler which will be used by default for blk-mq
+ managed block devices with a single queue.
+
+ config DEFAULT_SQ_DEADLINE
+ bool "MQ Deadline" if MQ_IOSCHED_DEADLINE=y
+
+ config DEFAULT_SQ_NONE
+ bool "None"
+
+endchoice
+
+config DEFAULT_SQ_IOSCHED
+ string
+ default "mq-deadline" if DEFAULT_SQ_DEADLINE
+ default "none" if DEFAULT_SQ_NONE
+
+choice
+ prompt "Default multi-queue blk-mq I/O scheduler"
+ default DEFAULT_MQ_NONE
+ help
+ Select the I/O scheduler which will be used by default for blk-mq
+ managed block devices with multiple queues.
+
+ config DEFAULT_MQ_DEADLINE
+ bool "MQ Deadline" if MQ_IOSCHED_DEADLINE=y
+
+ config DEFAULT_MQ_NONE
+ bool "None"
+
+endchoice
+
+config DEFAULT_MQ_IOSCHED
+ string
+ default "mq-deadline" if DEFAULT_MQ_DEADLINE
+ default "none" if DEFAULT_MQ_NONE
+
endmenu
endif
diff --git a/block/Makefile b/block/Makefile
index a827f988c4e6..6ba1b1bc9529 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
- blk-mq-sysfs.o blk-mq-cpumap.o ioctl.o \
+ blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
badblocks.o partitions/
@@ -18,6 +18,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
+obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o
obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o
@@ -25,3 +26,5 @@ obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o
obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o
obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o
obj-$(CONFIG_BLK_WBT) += blk-wbt.o
+obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o
+obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o
diff --git a/block/bio.c b/block/bio.c
index 2b375020fc49..d3c26d1cb1da 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1403,7 +1403,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
bio_set_flag(bio, BIO_USER_MAPPED);
/*
- * subtle -- if __bio_map_user() ended up bouncing a bio,
+ * subtle -- if bio_map_user_iov() ended up bouncing a bio,
* it would normally disappear when its bi_end_io is run.
* however, we need it for the unmap, so grab an extra
* reference to it
@@ -1445,8 +1445,8 @@ static void __bio_unmap_user(struct bio *bio)
* bio_unmap_user - unmap a bio
* @bio: the bio being unmapped
*
- * Unmap a bio previously mapped by bio_map_user(). Must be called with
- * a process context.
+ * Unmap a bio previously mapped by bio_map_user_iov(). Must be called from
+ * process context.
*
* bio_unmap_user() may sleep.
*/
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 8ba0af780e88..fb59a3edc778 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1223,7 +1223,10 @@ int blkcg_activate_policy(struct request_queue *q,
if (blkcg_policy_enabled(q, pol))
return 0;
- blk_queue_bypass_start(q);
+ if (q->mq_ops)
+ blk_mq_freeze_queue(q);
+ else
+ blk_queue_bypass_start(q);
pd_prealloc:
if (!pd_prealloc) {
pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
@@ -1261,7 +1264,10 @@ pd_prealloc:
spin_unlock_irq(q->queue_lock);
out_bypass_end:
- blk_queue_bypass_end(q);
+ if (q->mq_ops)
+ blk_mq_unfreeze_queue(q);
+ else
+ blk_queue_bypass_end(q);
if (pd_prealloc)
pol->pd_free_fn(pd_prealloc);
return ret;
@@ -1284,7 +1290,11 @@ void blkcg_deactivate_policy(struct request_queue *q,
if (!blkcg_policy_enabled(q, pol))
return;
- blk_queue_bypass_start(q);
+ if (q->mq_ops)
+ blk_mq_freeze_queue(q);
+ else
+ blk_queue_bypass_start(q);
+
spin_lock_irq(q->queue_lock);
__clear_bit(pol->plid, q->blkcg_pols);
@@ -1304,7 +1314,11 @@ void blkcg_deactivate_policy(struct request_queue *q,
}
spin_unlock_irq(q->queue_lock);
- blk_queue_bypass_end(q);
+
+ if (q->mq_ops)
+ blk_mq_unfreeze_queue(q);
+ else
+ blk_queue_bypass_end(q);
}
EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
diff --git a/block/blk-core.c b/block/blk-core.c
index 61ba08c58b64..b2df55a65250 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -39,6 +39,7 @@
#include "blk.h"
#include "blk-mq.h"
+#include "blk-mq-sched.h"
#include "blk-wbt.h"
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
@@ -134,6 +135,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
rq->cmd = rq->__cmd;
rq->cmd_len = BLK_MAX_CDB;
rq->tag = -1;
+ rq->internal_tag = -1;
rq->start_time = jiffies;
set_start_time_ns(rq);
rq->part = NULL;
@@ -525,12 +527,14 @@ void blk_set_queue_dying(struct request_queue *q)
else {
struct request_list *rl;
+ spin_lock_irq(q->queue_lock);
blk_queue_for_each_rl(rl, q) {
if (rl->rq_pool) {
wake_up(&rl->wait[BLK_RW_SYNC]);
wake_up(&rl->wait[BLK_RW_ASYNC]);
}
}
+ spin_unlock_irq(q->queue_lock);
}
}
EXPORT_SYMBOL_GPL(blk_set_queue_dying);
@@ -1033,29 +1037,13 @@ static bool blk_rq_should_init_elevator(struct bio *bio)
* Flush requests do not use the elevator so skip initialization.
* This allows a request to share the flush and elevator data.
*/
- if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA))
+ if (op_is_flush(bio->bi_opf))
return false;
return true;
}
/**
- * rq_ioc - determine io_context for request allocation
- * @bio: request being allocated is for this bio (can be %NULL)
- *
- * Determine io_context to use for request allocation for @bio. May return
- * %NULL if %current->io_context doesn't exist.
- */
-static struct io_context *rq_ioc(struct bio *bio)
-{
-#ifdef CONFIG_BLK_CGROUP
- if (bio && bio->bi_ioc)
- return bio->bi_ioc;
-#endif
- return current->io_context;
-}
-
-/**
* __get_request - get a free request
* @rl: request list to allocate from
* @op: operation and flags
@@ -1655,7 +1643,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
return BLK_QC_T_NONE;
}
- if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) {
+ if (op_is_flush(bio->bi_opf)) {
spin_lock_irq(q->queue_lock);
where = ELEVATOR_INSERT_FLUSH;
goto get_rq;
@@ -1894,7 +1882,7 @@ generic_make_request_checks(struct bio *bio)
* drivers without flush support don't have to worry
* about them.
*/
- if ((bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) &&
+ if (op_is_flush(bio->bi_opf) &&
!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA);
if (!nr_sectors) {
@@ -2143,7 +2131,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
if (q->mq_ops) {
if (blk_queue_io_stat(q))
blk_account_io_start(rq, true);
- blk_mq_insert_request(rq, false, true, false);
+ blk_mq_sched_insert_request(rq, false, true, false, false);
return 0;
}
@@ -2159,7 +2147,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
*/
BUG_ON(blk_queued_rq(rq));
- if (rq->cmd_flags & (REQ_PREFLUSH | REQ_FUA))
+ if (op_is_flush(rq->cmd_flags))
where = ELEVATOR_INSERT_FLUSH;
add_acct_request(q, rq, where);
@@ -3270,7 +3258,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
/*
* rq is already accounted, so use raw insert
*/
- if (rq->cmd_flags & (REQ_PREFLUSH | REQ_FUA))
+ if (op_is_flush(rq->cmd_flags))
__elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH);
else
__elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE);
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 3ecb00a6cf45..ed1f10165268 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -9,6 +9,7 @@
#include <linux/sched/sysctl.h>
#include "blk.h"
+#include "blk-mq-sched.h"
/*
* for max sense size
@@ -65,7 +66,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
* be reused after dying flag is set
*/
if (q->mq_ops) {
- blk_mq_insert_request(rq, at_head, true, false);
+ blk_mq_sched_insert_request(rq, at_head, true, false, false);
return;
}
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 20b7c7a02f1c..4427896641ac 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -74,6 +74,7 @@
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
+#include "blk-mq-sched.h"
/* FLUSH/FUA sequences */
enum {
@@ -391,9 +392,10 @@ static void mq_flush_data_end_io(struct request *rq, int error)
* the comment in flush_end_io().
*/
spin_lock_irqsave(&fq->mq_flush_lock, flags);
- if (blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error))
- blk_mq_run_hw_queue(hctx, true);
+ blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error);
spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
+
+ blk_mq_run_hw_queue(hctx, true);
}
/**
@@ -453,9 +455,9 @@ void blk_insert_flush(struct request *rq)
*/
if ((policy & REQ_FSEQ_DATA) &&
!(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
- if (q->mq_ops) {
- blk_mq_insert_request(rq, false, true, false);
- } else
+ if (q->mq_ops)
+ blk_mq_sched_insert_request(rq, false, true, false, false);
+ else
list_add_tail(&rq->queuelist, &q->queue_head);
return;
}
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 381cb50a673c..fe186a9eade9 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -43,8 +43,10 @@ static void ioc_exit_icq(struct io_cq *icq)
if (icq->flags & ICQ_EXITED)
return;
- if (et->ops.elevator_exit_icq_fn)
- et->ops.elevator_exit_icq_fn(icq);
+ if (et->uses_mq && et->ops.mq.exit_icq)
+ et->ops.mq.exit_icq(icq);
+ else if (!et->uses_mq && et->ops.sq.elevator_exit_icq_fn)
+ et->ops.sq.elevator_exit_icq_fn(icq);
icq->flags |= ICQ_EXITED;
}
@@ -383,8 +385,10 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
hlist_add_head(&icq->ioc_node, &ioc->icq_list);
list_add(&icq->q_node, &q->icq_list);
- if (et->ops.elevator_init_icq_fn)
- et->ops.elevator_init_icq_fn(icq);
+ if (et->uses_mq && et->ops.mq.init_icq)
+ et->ops.mq.init_icq(icq);
+ else if (!et->uses_mq && et->ops.sq.elevator_init_icq_fn)
+ et->ops.sq.elevator_init_icq_fn(icq);
} else {
kmem_cache_free(et->icq_cache, icq);
icq = ioc_lookup_icq(ioc, q);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 182398cb1524..6aa43dec5af4 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -763,8 +763,8 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
{
struct elevator_queue *e = q->elevator;
- if (e->type->ops.elevator_allow_rq_merge_fn)
- if (!e->type->ops.elevator_allow_rq_merge_fn(q, rq, next))
+ if (!e->uses_mq && e->type->ops.sq.elevator_allow_rq_merge_fn)
+ if (!e->type->ops.sq.elevator_allow_rq_merge_fn(q, rq, next))
return 0;
return attempt_merge(q, rq, next);
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
new file mode 100644
index 000000000000..5cd2b435a9f5
--- /dev/null
+++ b/block/blk-mq-debugfs.c
@@ -0,0 +1,756 @@
+/*
+ * Copyright (C) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/debugfs.h>
+
+#include <linux/blk-mq.h>
+#include "blk-mq.h"
+#include "blk-mq-tag.h"
+
+struct blk_mq_debugfs_attr {
+ const char *name;
+ umode_t mode;
+ const struct file_operations *fops;
+};
+
+static struct dentry *block_debugfs_root;
+
+static int blk_mq_debugfs_seq_open(struct inode *inode, struct file *file,
+ const struct seq_operations *ops)
+{
+ struct seq_file *m;
+ int ret;
+
+ ret = seq_open(file, ops);
+ if (!ret) {
+ m = file->private_data;
+ m->private = inode->i_private;
+ }
+ return ret;
+}
+
+static int hctx_state_show(struct seq_file *m, void *v)
+{
+ struct blk_mq_hw_ctx *hctx = m->private;
+
+ seq_printf(m, "0x%lx\n", hctx->state);
+ return 0;
+}
+
+static int hctx_state_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, hctx_state_show, inode->i_private);
+}
+
+static const struct file_operations hctx_state_fops = {
+ .open = hctx_state_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int hctx_flags_show(struct seq_file *m, void *v)
+{
+ struct blk_mq_hw_ctx *hctx = m->private;
+
+ seq_printf(m, "0x%lx\n", hctx->flags);
+ return 0;
+}
+
+static int hctx_flags_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, hctx_flags_show, inode->i_private);
+}
+
+static const struct file_operations hctx_flags_fops = {
+ .open = hctx_flags_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int blk_mq_debugfs_rq_show(struct seq_file *m, void *v)
+{
+ struct request *rq = list_entry_rq(v);
+
+ seq_printf(m, "%p {.cmd_type=%u, .cmd_flags=0x%x, .rq_flags=0x%x, .tag=%d, .internal_tag=%d}\n",
+ rq, rq->cmd_type, rq->cmd_flags, (unsigned int)rq->rq_flags,
+ rq->tag, rq->internal_tag);
+ return 0;
+}
+
+static void *hctx_dispatch_start(struct seq_file *m, loff_t *pos)
+{
+ struct blk_mq_hw_ctx *hctx = m->private;
+
+ spin_lock(&hctx->lock);
+ return seq_list_start(&hctx->dispatch, *pos);
+}
+
+static void *hctx_dispatch_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct blk_mq_hw_ctx *hctx = m->private;
+
+ return seq_list_next(v, &hctx->dispatch, pos);
+}
+
+static void hctx_dispatch_stop(struct seq_file *m, void *v)
+{
+ struct blk_mq_hw_ctx *hctx = m->private;
+
+ spin_unlock(&hctx->lock);
+}
+
+static const struct seq_operations hctx_dispatch_seq_ops = {
+ .start = hctx_dispatch_start,
+ .next = hctx_dispatch_next,
+ .stop = hctx_dispatch_stop,
+ .show = blk_mq_debugfs_rq_show,
+};
+
+static int hctx_dispatch_open(struct inode *inode, struct file *file)
+{
+ return blk_mq_debugfs_seq_open(inode, file, &hctx_dispatch_seq_ops);
+}
+
+static const struct file_operations hctx_dispatch_fops = {
+ .open = hctx_dispatch_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int hctx_ctx_map_show(struct seq_file *m, void *v)
+{
+ struct blk_mq_hw_ctx *hctx = m->private;
+
+ sbitmap_bitmap_show(&hctx->ctx_map, m);
+ return 0;
+}
+
+static int hctx_ctx_map_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, hctx_ctx_map_show, inode->i_private);
+}
+
+static const struct file_operations hctx_ctx_map_fops = {
+ .open = hctx_ctx_map_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static void blk_mq_debugfs_tags_show(struct seq_file *m,
+ struct blk_mq_tags *tags)
+{
+ seq_printf(m, "nr_tags=%u\n", tags->nr_tags);
+ seq_printf(m, "nr_reserved_tags=%u\n", tags->nr_reserved_tags);
+ seq_printf(m, "active_queues=%d\n",
+ atomic_read(&tags->active_queues));
+
+ seq_puts(m, "\nbitmap_tags:\n");
+ sbitmap_queue_show(&tags->bitmap_tags, m);
+
+ if (tags->nr_reserved_tags) {
+ seq_puts(m, "\nbreserved_tags:\n");
+ sbitmap_queue_show(&tags->breserved_tags, m);
+ }
+}
+
+static int hctx_tags_show(struct seq_file *m, void *v)
+{
+ struct blk_mq_hw_ctx *hctx = m->private;
+ struct request_queue *q = hctx->queue;
+
+ mutex_lock(&q->sysfs_lock);
+ if (hctx->tags)
+ blk_mq_debugfs_tags_show(m, hctx->tags);
+ mutex_unlock(&q->sysfs_lock);
+
+ return 0;
+}
+
+static int hctx_tags_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, hctx_tags_show, inode->i_private);
+}
+
+static const struct file_operations hctx_tags_fops = {
+ .open = hctx_tags_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int hctx_tags_bitmap_show(struct seq_file *m, void *v)
+{
+ struct blk_mq_hw_ctx *hctx = m->private;
+ struct request_queue *q = hctx->queue;
+
+ mutex_lock(&q->sysfs_lock);
+ if (hctx->tags)
+ sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m);
+ mutex_unlock(&q->sysfs_lock);
+ return 0;
+}
+
+static int hctx_tags_bitmap_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, hctx_tags_bitmap_show, inode->i_private);
+}
+
+static const struct file_operations hctx_tags_bitmap_fops = {
+ .open = hctx_tags_bitmap_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int hctx_sched_tags_show(struct seq_file *m, void *v)
+{
+ struct blk_mq_hw_ctx *hctx = m->private;
+ struct request_queue *q = hctx->queue;
+
+ mutex_lock(&q->sysfs_lock);
+ if (hctx->sched_tags)
+ blk_mq_debugfs_tags_show(m, hctx->sched_tags);
+ mutex_unlock(&q->sysfs_lock);
+
+ return 0;
+}
+
+static int hctx_sched_tags_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, hctx_sched_tags_show, inode->i_private);
+}
+
+static const struct file_operations hctx_sched_tags_fops = {
+ .open = hctx_sched_tags_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int hctx_sched_tags_bitmap_show(struct seq_file *m, void *v)
+{
+ struct blk_mq_hw_ctx *hctx = m->private;
+ struct request_queue *q = hctx->queue;
+
+ mutex_lock(&q->sysfs_lock);
+ if (hctx->sched_tags)
+ sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m);
+ mutex_unlock(&q->sysfs_lock);
+ return 0;
+}
+
+static int hctx_sched_tags_bitmap_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, hctx_sched_tags_bitmap_show, inode->i_private);
+}
+
+static const struct file_operations hctx_sched_tags_bitmap_fops = {
+ .open = hctx_sched_tags_bitmap_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int hctx_io_poll_show(struct seq_file *m, void *v)
+{
+ struct blk_mq_hw_ctx *hctx = m->private;
+
+ seq_printf(m, "considered=%lu\n", hctx->poll_considered);
+ seq_printf(m, "invoked=%lu\n", hctx->poll_invoked);
+ seq_printf(m, "success=%lu\n", hctx->poll_success);
+ return 0;
+}
+
+static int hctx_io_poll_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, hctx_io_poll_show, inode->i_private);
+}
+
+static ssize_t hctx_io_poll_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct seq_file *m = file->private_data;
+ struct blk_mq_hw_ctx *hctx = m->private;
+
+ hctx->poll_considered = hctx->poll_invoked = hctx->poll_success = 0;
+ return count;
+}
+
+static const struct file_operations hctx_io_poll_fops = {
+ .open = hctx_io_poll_open,
+ .read = seq_read,
+ .write = hctx_io_poll_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
+{
+ seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu",
+ stat->nr_samples, stat->mean, stat->min, stat->max);
+}
+
+static int hctx_stats_show(struct seq_file *m, void *v)
+{
+ struct blk_mq_hw_ctx *hctx = m->private;
+ struct blk_rq_stat stat[2];
+
+ blk_stat_init(&stat[BLK_STAT_READ]);
+ blk_stat_init(&stat[BLK_STAT_WRITE]);
+
+ blk_hctx_stat_get(hctx, stat);
+
+ seq_puts(m, "read: ");
+ print_stat(m, &stat[BLK_STAT_READ]);
+ seq_puts(m, "\n");
+
+ seq_puts(m, "write: ");
+ print_stat(m, &stat[BLK_STAT_WRITE]);
+ seq_puts(m, "\n");
+ return 0;
+}
+
+static int hctx_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, hctx_stats_show, inode->i_private);
+}
+
+static ssize_t hctx_stats_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct seq_file *m = file->private_data;
+ struct blk_mq_hw_ctx *hctx = m->private;
+ struct blk_mq_ctx *ctx;
+ int i;
+
+ hctx_for_each_ctx(hctx, ctx, i) {
+ blk_stat_init(&ctx->stat[BLK_STAT_READ]);
+ blk_stat_init(&ctx->stat[BLK_STAT_WRITE]);
+ }
+ return count;
+}
+
+static const struct file_operations hctx_stats_fops = {
+ .open = hctx_stats_open,
+ .read = seq_read,
+ .write = hctx_stats_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int hctx_dispatched_show(struct seq_file *m, void *v)
+{
+ struct blk_mq_hw_ctx *hctx = m->private;
+ int i;
+
+ seq_printf(m, "%8u\t%lu\n", 0U, hctx->dispatched[0]);
+
+ for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER - 1; i++) {
+ unsigned int d = 1U << (i - 1);
+
+ seq_printf(m, "%8u\t%lu\n", d, hctx->dispatched[i]);
+ }
+
+ seq_printf(m, "%8u+\t%lu\n", 1U << (i - 1), hctx->dispatched[i]);
+ return 0;
+}
+
+static int hctx_dispatched_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, hctx_dispatched_show, inode->i_private);
+}
+
+static ssize_t hctx_dispatched_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct seq_file *m = file->private_data;
+ struct blk_mq_hw_ctx *hctx = m->private;
+ int i;
+
+ for (i = 0; i < BLK_MQ_MAX_DISPATCH_ORDER; i++)
+ hctx->dispatched[i] = 0;
+ return count;
+}
+
+static const struct file_operations hctx_dispatched_fops = {
+ .open = hctx_dispatched_open,
+ .read = seq_read,
+ .write = hctx_dispatched_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int hctx_queued_show(struct seq_file *m, void *v)
+{
+ struct blk_mq_hw_ctx *hctx = m->private;
+
+ seq_printf(m, "%lu\n", hctx->queued);
+ return 0;
+}
+
+static int hctx_queued_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, hctx_queued_show, inode->i_private);
+}
+
+static ssize_t hctx_queued_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct seq_file *m = file->private_data;
+ struct blk_mq_hw_ctx *hctx = m->private;
+
+ hctx->queued = 0;
+ return count;
+}
+
+static const struct file_operations hctx_queued_fops = {
+ .open = hctx_queued_open,
+ .read = seq_read,
+ .write = hctx_queued_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int hctx_run_show(struct seq_file *m, void *v)
+{
+ struct blk_mq_hw_ctx *hctx = m->private;
+
+ seq_printf(m, "%lu\n", hctx->run);
+ return 0;
+}
+
+static int hctx_run_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, hctx_run_show, inode->i_private);
+}
+
+static ssize_t hctx_run_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct seq_file *m = file->private_data;
+ struct blk_mq_hw_ctx *hctx = m->private;
+
+ hctx->run = 0;
+ return count;
+}
+
+static const struct file_operations hctx_run_fops = {
+ .open = hctx_run_open,
+ .read = seq_read,
+ .write = hctx_run_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int hctx_active_show(struct seq_file *m, void *v)
+{
+ struct blk_mq_hw_ctx *hctx = m->private;
+
+ seq_printf(m, "%d\n", atomic_read(&hctx->nr_active));
+ return 0;
+}
+
+static int hctx_active_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, hctx_active_show, inode->i_private);
+}
+
+static const struct file_operations hctx_active_fops = {
+ .open = hctx_active_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos)
+{
+ struct blk_mq_ctx *ctx = m->private;
+
+ spin_lock(&ctx->lock);
+ return seq_list_start(&ctx->rq_list, *pos);
+}
+
+static void *ctx_rq_list_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct blk_mq_ctx *ctx = m->private;
+
+ return seq_list_next(v, &ctx->rq_list, pos);
+}
+
+static void ctx_rq_list_stop(struct seq_file *m, void *v)
+{
+ struct blk_mq_ctx *ctx = m->private;
+
+ spin_unlock(&ctx->lock);
+}
+
+static const struct seq_operations ctx_rq_list_seq_ops = {
+ .start = ctx_rq_list_start,
+ .next = ctx_rq_list_next,
+ .stop = ctx_rq_list_stop,
+ .show = blk_mq_debugfs_rq_show,
+};
+
+static int ctx_rq_list_open(struct inode *inode, struct file *file)
+{
+ return blk_mq_debugfs_seq_open(inode, file, &ctx_rq_list_seq_ops);
+}
+
+static const struct file_operations ctx_rq_list_fops = {
+ .open = ctx_rq_list_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int ctx_dispatched_show(struct seq_file *m, void *v)
+{
+ struct blk_mq_ctx *ctx = m->private;
+
+ seq_printf(m, "%lu %lu\n", ctx->rq_dispatched[1], ctx->rq_dispatched[0]);
+ return 0;
+}
+
+static int ctx_dispatched_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, ctx_dispatched_show, inode->i_private);
+}
+
+static ssize_t ctx_dispatched_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct seq_file *m = file->private_data;
+ struct blk_mq_ctx *ctx = m->private;
+
+ ctx->rq_dispatched[0] = ctx->rq_dispatched[1] = 0;
+ return count;
+}
+
+static const struct file_operations ctx_dispatched_fops = {
+ .open = ctx_dispatched_open,
+ .read = seq_read,
+ .write = ctx_dispatched_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int ctx_merged_show(struct seq_file *m, void *v)
+{
+ struct blk_mq_ctx *ctx = m->private;
+
+ seq_printf(m, "%lu\n", ctx->rq_merged);
+ return 0;
+}
+
+static int ctx_merged_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, ctx_merged_show, inode->i_private);
+}
+
+static ssize_t ctx_merged_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct seq_file *m = file->private_data;
+ struct blk_mq_ctx *ctx = m->private;
+
+ ctx->rq_merged = 0;
+ return count;
+}
+
+static const struct file_operations ctx_merged_fops = {
+ .open = ctx_merged_open,
+ .read = seq_read,
+ .write = ctx_merged_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int ctx_completed_show(struct seq_file *m, void *v)
+{
+ struct blk_mq_ctx *ctx = m->private;
+
+ seq_printf(m, "%lu %lu\n", ctx->rq_completed[1], ctx->rq_completed[0]);
+ return 0;
+}
+
+static int ctx_completed_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, ctx_completed_show, inode->i_private);
+}
+
+static ssize_t ctx_completed_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct seq_file *m = file->private_data;
+ struct blk_mq_ctx *ctx = m->private;
+
+ ctx->rq_completed[0] = ctx->rq_completed[1] = 0;
+ return count;
+}
+
+static const struct file_operations ctx_completed_fops = {
+ .open = ctx_completed_open,
+ .read = seq_read,
+ .write = ctx_completed_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
+ {"state", 0400, &hctx_state_fops},
+ {"flags", 0400, &hctx_flags_fops},
+ {"dispatch", 0400, &hctx_dispatch_fops},
+ {"ctx_map", 0400, &hctx_ctx_map_fops},
+ {"tags", 0400, &hctx_tags_fops},
+ {"tags_bitmap", 0400, &hctx_tags_bitmap_fops},
+ {"sched_tags", 0400, &hctx_sched_tags_fops},
+ {"sched_tags_bitmap", 0400, &hctx_sched_tags_bitmap_fops},
+ {"io_poll", 0600, &hctx_io_poll_fops},
+ {"stats", 0600, &hctx_stats_fops},
+ {"dispatched", 0600, &hctx_dispatched_fops},
+ {"queued", 0600, &hctx_queued_fops},
+ {"run", 0600, &hctx_run_fops},
+ {"active", 0400, &hctx_active_fops},
+};
+
+static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
+ {"rq_list", 0400, &ctx_rq_list_fops},
+ {"dispatched", 0600, &ctx_dispatched_fops},
+ {"merged", 0600, &ctx_merged_fops},
+ {"completed", 0600, &ctx_completed_fops},
+};
+
+int blk_mq_debugfs_register(struct request_queue *q, const char *name)
+{
+ if (!block_debugfs_root)
+ return -ENOENT;
+
+ q->debugfs_dir = debugfs_create_dir(name, block_debugfs_root);
+ if (!q->debugfs_dir)
+ goto err;
+
+ if (blk_mq_debugfs_register_hctxs(q))
+ goto err;
+
+ return 0;
+
+err:
+ blk_mq_debugfs_unregister(q);
+ return -ENOMEM;
+}
+
+void blk_mq_debugfs_unregister(struct request_queue *q)
+{
+ debugfs_remove_recursive(q->debugfs_dir);
+ q->mq_debugfs_dir = NULL;
+ q->debugfs_dir = NULL;
+}
+
+static int blk_mq_debugfs_register_ctx(struct request_queue *q,
+ struct blk_mq_ctx *ctx,
+ struct dentry *hctx_dir)
+{
+ struct dentry *ctx_dir;
+ char name[20];
+ int i;
+
+ snprintf(name, sizeof(name), "cpu%u", ctx->cpu);
+ ctx_dir = debugfs_create_dir(name, hctx_dir);
+ if (!ctx_dir)
+ return -ENOMEM;
+
+ for (i = 0; i < ARRAY_SIZE(blk_mq_debugfs_ctx_attrs); i++) {
+ const struct blk_mq_debugfs_attr *attr;
+
+ attr = &blk_mq_debugfs_ctx_attrs[i];
+ if (!debugfs_create_file(attr->name, attr->mode, ctx_dir, ctx,
+ attr->fops))
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int blk_mq_debugfs_register_hctx(struct request_queue *q,
+ struct blk_mq_hw_ctx *hctx)
+{
+ struct blk_mq_ctx *ctx;
+ struct dentry *hctx_dir;
+ char name[20];
+ int i;
+
+ snprintf(name, sizeof(name), "%u", hctx->queue_num);
+ hctx_dir = debugfs_create_dir(name, q->mq_debugfs_dir);
+ if (!hctx_dir)
+ return -ENOMEM;
+
+ for (i = 0; i < ARRAY_SIZE(blk_mq_debugfs_hctx_attrs); i++) {
+ const struct blk_mq_debugfs_attr *attr;
+
+ attr = &blk_mq_debugfs_hctx_attrs[i];
+ if (!debugfs_create_file(attr->name, attr->mode, hctx_dir, hctx,
+ attr->fops))
+ return -ENOMEM;
+ }
+
+ hctx_for_each_ctx(hctx, ctx, i) {
+ if (blk_mq_debugfs_register_ctx(q, ctx, hctx_dir))
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+int blk_mq_debugfs_register_hctxs(struct request_queue *q)
+{
+ struct blk_mq_hw_ctx *hctx;
+ int i;
+
+ if (!q->debugfs_dir)
+ return -ENOENT;
+
+ q->mq_debugfs_dir = debugfs_create_dir("mq", q->debugfs_dir);
+ if (!q->mq_debugfs_dir)
+ goto err;
+
+ queue_for_each_hw_ctx(q, hctx, i) {
+ if (blk_mq_debugfs_register_hctx(q, hctx))
+ goto err;
+ }
+
+ return 0;
+
+err:
+ blk_mq_debugfs_unregister_hctxs(q);
+ return -ENOMEM;
+}
+
+void blk_mq_debugfs_unregister_hctxs(struct request_queue *q)
+{
+ debugfs_remove_recursive(q->mq_debugfs_dir);
+ q->mq_debugfs_dir = NULL;
+}
+
+void blk_mq_debugfs_init(void)
+{
+ block_debugfs_root = debugfs_create_dir("block", NULL);
+}
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
new file mode 100644
index 000000000000..114814ec3d49
--- /dev/null
+++ b/block/blk-mq-sched.c
@@ -0,0 +1,481 @@
+/*
+ * blk-mq scheduling framework
+ *
+ * Copyright (C) 2016 Jens Axboe
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/blk-mq.h>
+
+#include <trace/events/block.h>
+
+#include "blk.h"
+#include "blk-mq.h"
+#include "blk-mq-sched.h"
+#include "blk-mq-tag.h"
+#include "blk-wbt.h"
+
+void blk_mq_sched_free_hctx_data(struct request_queue *q,
+ void (*exit)(struct blk_mq_hw_ctx *))
+{
+ struct blk_mq_hw_ctx *hctx;
+ int i;
+
+ queue_for_each_hw_ctx(q, hctx, i) {
+ if (exit && hctx->sched_data)
+ exit(hctx);
+ kfree(hctx->sched_data);
+ hctx->sched_data = NULL;
+ }
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
+
+int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
+ int (*init)(struct blk_mq_hw_ctx *),
+ void (*exit)(struct blk_mq_hw_ctx *))
+{
+ struct blk_mq_hw_ctx *hctx;
+ int ret;
+ int i;
+
+ queue_for_each_hw_ctx(q, hctx, i) {
+ hctx->sched_data = kmalloc_node(size, GFP_KERNEL, hctx->numa_node);
+ if (!hctx->sched_data) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ if (init) {
+ ret = init(hctx);
+ if (ret) {
+ /*
+ * We don't want to give exit() a partially
+ * initialized sched_data. init() must clean up
+ * if it fails.
+ */
+ kfree(hctx->sched_data);
+ hctx->sched_data = NULL;
+ goto error;
+ }
+ }
+ }
+
+ return 0;
+error:
+ blk_mq_sched_free_hctx_data(q, exit);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_init_hctx_data);
+
+static void __blk_mq_sched_assign_ioc(struct request_queue *q,
+ struct request *rq, struct io_context *ioc)
+{
+ struct io_cq *icq;
+
+ spin_lock_irq(q->queue_lock);
+ icq = ioc_lookup_icq(ioc, q);
+ spin_unlock_irq(q->queue_lock);
+
+ if (!icq) {
+ icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
+ if (!icq)
+ return;
+ }
+
+ rq->elv.icq = icq;
+ if (!blk_mq_sched_get_rq_priv(q, rq)) {
+ rq->rq_flags |= RQF_ELVPRIV;
+ get_io_context(icq->ioc);
+ return;
+ }
+
+ rq->elv.icq = NULL;
+}
+
+static void blk_mq_sched_assign_ioc(struct request_queue *q,
+ struct request *rq, struct bio *bio)
+{
+ struct io_context *ioc;
+
+ ioc = rq_ioc(bio);
+ if (ioc)
+ __blk_mq_sched_assign_ioc(q, rq, ioc);
+}
+
+struct request *blk_mq_sched_get_request(struct request_queue *q,
+ struct bio *bio,
+ unsigned int op,
+ struct blk_mq_alloc_data *data)
+{
+ struct elevator_queue *e = q->elevator;
+ struct blk_mq_hw_ctx *hctx;
+ struct blk_mq_ctx *ctx;
+ struct request *rq;
+
+ blk_queue_enter_live(q);
+ ctx = blk_mq_get_ctx(q);
+ hctx = blk_mq_map_queue(q, ctx->cpu);
+
+ blk_mq_set_alloc_data(data, q, data->flags, ctx, hctx);
+
+ if (e) {
+ data->flags |= BLK_MQ_REQ_INTERNAL;
+
+ /*
+ * Flush requests are special and go directly to the
+ * dispatch list.
+ */
+ if (!op_is_flush(op) && e->type->ops.mq.get_request) {
+ rq = e->type->ops.mq.get_request(q, op, data);
+ if (rq)
+ rq->rq_flags |= RQF_QUEUED;
+ } else
+ rq = __blk_mq_alloc_request(data, op);
+ } else {
+ rq = __blk_mq_alloc_request(data, op);
+ if (rq)
+ data->hctx->tags->rqs[rq->tag] = rq;
+ }
+
+ if (rq) {
+ if (!op_is_flush(op)) {
+ rq->elv.icq = NULL;
+ if (e && e->type->icq_cache)
+ blk_mq_sched_assign_ioc(q, rq, bio);
+ }
+ data->hctx->queued++;
+ return rq;
+ }
+
+ blk_queue_exit(q);
+ return NULL;
+}
+
+void blk_mq_sched_put_request(struct request *rq)
+{
+ struct request_queue *q = rq->q;
+ struct elevator_queue *e = q->elevator;
+
+ if (rq->rq_flags & RQF_ELVPRIV) {
+ blk_mq_sched_put_rq_priv(rq->q, rq);
+ if (rq->elv.icq) {
+ put_io_context(rq->elv.icq->ioc);
+ rq->elv.icq = NULL;
+ }
+ }
+
+ if ((rq->rq_flags & RQF_QUEUED) && e && e->type->ops.mq.put_request)
+ e->type->ops.mq.put_request(rq);
+ else
+ blk_mq_finish_request(rq);
+}
+
+void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
+{
+ struct elevator_queue *e = hctx->queue->elevator;
+ LIST_HEAD(rq_list);
+
+ if (unlikely(blk_mq_hctx_stopped(hctx)))
+ return;
+
+ hctx->run++;
+
+ /*
+ * If we have previous entries on our dispatch list, grab them first for
+ * more fair dispatch.
+ */
+ if (!list_empty_careful(&hctx->dispatch)) {
+ spin_lock(&hctx->lock);
+ if (!list_empty(&hctx->dispatch))
+ list_splice_init(&hctx->dispatch, &rq_list);
+ spin_unlock(&hctx->lock);
+ }
+
+ /*
+ * Only ask the scheduler for requests, if we didn't have residual
+ * requests from the dispatch list. This is to avoid the case where
+ * we only ever dispatch a fraction of the requests available because
+ * of low device queue depth. Once we pull requests out of the IO
+ * scheduler, we can no longer merge or sort them. So it's best to
+ * leave them there for as long as we can. Mark the hw queue as
+ * needing a restart in that case.
+ */
+ if (!list_empty(&rq_list)) {
+ blk_mq_sched_mark_restart(hctx);
+ blk_mq_dispatch_rq_list(hctx, &rq_list);
+ } else if (!e || !e->type->ops.mq.dispatch_request) {
+ blk_mq_flush_busy_ctxs(hctx, &rq_list);
+ blk_mq_dispatch_rq_list(hctx, &rq_list);
+ } else {
+ do {
+ struct request *rq;
+
+ rq = e->type->ops.mq.dispatch_request(hctx);
+ if (!rq)
+ break;
+ list_add(&rq->queuelist, &rq_list);
+ } while (blk_mq_dispatch_rq_list(hctx, &rq_list));
+ }
+}
+
+void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx,
+ struct list_head *rq_list,
+ struct request *(*get_rq)(struct blk_mq_hw_ctx *))
+{
+ do {
+ struct request *rq;
+
+ rq = get_rq(hctx);
+ if (!rq)
+ break;
+
+ list_add_tail(&rq->queuelist, rq_list);
+ } while (1);
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_move_to_dispatch);
+
+bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio)
+{
+ struct request *rq;
+ int ret;
+
+ ret = elv_merge(q, &rq, bio);
+ if (ret == ELEVATOR_BACK_MERGE) {
+ if (!blk_mq_sched_allow_merge(q, rq, bio))
+ return false;
+ if (bio_attempt_back_merge(q, rq, bio)) {
+ if (!attempt_back_merge(q, rq))
+ elv_merged_request(q, rq, ret);
+ return true;
+ }
+ } else if (ret == ELEVATOR_FRONT_MERGE) {
+ if (!blk_mq_sched_allow_merge(q, rq, bio))
+ return false;
+ if (bio_attempt_front_merge(q, rq, bio)) {
+ if (!attempt_front_merge(q, rq))
+ elv_merged_request(q, rq, ret);
+ return true;
+ }
+ }
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
+
+bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
+{
+ struct elevator_queue *e = q->elevator;
+
+ if (e->type->ops.mq.bio_merge) {
+ struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
+ struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+
+ blk_mq_put_ctx(ctx);
+ return e->type->ops.mq.bio_merge(hctx, bio);
+ }
+
+ return false;
+}
+
+bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
+{
+ return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
+
+void blk_mq_sched_request_inserted(struct request *rq)
+{
+ trace_block_rq_insert(rq->q, rq);
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);
+
+bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, struct request *rq)
+{
+ if (rq->tag == -1) {
+ rq->rq_flags |= RQF_SORTED;
+ return false;
+ }
+
+ /*
+ * If we already have a real request tag, send directly to
+ * the dispatch list.
+ */
+ spin_lock(&hctx->lock);
+ list_add(&rq->queuelist, &hctx->dispatch);
+ spin_unlock(&hctx->lock);
+ return true;
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_bypass_insert);
+
+static void blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
+{
+ if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) {
+ clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+ if (blk_mq_hctx_has_pending(hctx))
+ blk_mq_run_hw_queue(hctx, true);
+ }
+}
+
+void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx)
+{
+ unsigned int i;
+
+ if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
+ blk_mq_sched_restart_hctx(hctx);
+ else {
+ struct request_queue *q = hctx->queue;
+
+ if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags))
+ return;
+
+ clear_bit(QUEUE_FLAG_RESTART, &q->queue_flags);
+
+ queue_for_each_hw_ctx(q, hctx, i)
+ blk_mq_sched_restart_hctx(hctx);
+ }
+}
+
+/*
+ * Add flush/fua to the queue. If we fail getting a driver tag, then
+ * punt to the requeue list. Requeue will re-invoke us from a context
+ * that's safe to block from.
+ */
+static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx,
+ struct request *rq, bool can_block)
+{
+ if (blk_mq_get_driver_tag(rq, &hctx, can_block)) {
+ blk_insert_flush(rq);
+ blk_mq_run_hw_queue(hctx, true);
+ } else
+ blk_mq_add_to_requeue_list(rq, true, true);
+}
+
+void blk_mq_sched_insert_request(struct request *rq, bool at_head,
+ bool run_queue, bool async, bool can_block)
+{
+ struct request_queue *q = rq->q;
+ struct elevator_queue *e = q->elevator;
+ struct blk_mq_ctx *ctx = rq->mq_ctx;
+ struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+
+ if (rq->tag == -1 && op_is_flush(rq->cmd_flags)) {
+ blk_mq_sched_insert_flush(hctx, rq, can_block);
+ return;
+ }
+
+ if (e && e->type->ops.mq.insert_requests) {
+ LIST_HEAD(list);
+
+ list_add(&rq->queuelist, &list);
+ e->type->ops.mq.insert_requests(hctx, &list, at_head);
+ } else {
+ spin_lock(&ctx->lock);
+ __blk_mq_insert_request(hctx, rq, at_head);
+ spin_unlock(&ctx->lock);
+ }
+
+ if (run_queue)
+ blk_mq_run_hw_queue(hctx, async);
+}
+
+void blk_mq_sched_insert_requests(struct request_queue *q,
+ struct blk_mq_ctx *ctx,
+ struct list_head *list, bool run_queue_async)
+{
+ struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+ struct elevator_queue *e = hctx->queue->elevator;
+
+ if (e && e->type->ops.mq.insert_requests)
+ e->type->ops.mq.insert_requests(hctx, list, false);
+ else
+ blk_mq_insert_requests(hctx, ctx, list);
+
+ blk_mq_run_hw_queue(hctx, run_queue_async);
+}
+
+static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
+ struct blk_mq_hw_ctx *hctx,
+ unsigned int hctx_idx)
+{
+ if (hctx->sched_tags) {
+ blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
+ blk_mq_free_rq_map(hctx->sched_tags);
+ hctx->sched_tags = NULL;
+ }
+}
+
+int blk_mq_sched_setup(struct request_queue *q)
+{
+ struct blk_mq_tag_set *set = q->tag_set;
+ struct blk_mq_hw_ctx *hctx;
+ int ret, i;
+
+ /*
+ * Default to 256, since we don't split into sync/async like the
+ * old code did. Additionally, this is a per-hw queue depth.
+ */
+ q->nr_requests = 2 * BLKDEV_MAX_RQ;
+
+ /*
+ * We're switching to using an IO scheduler, so setup the hctx
+ * scheduler tags and switch the request map from the regular
+ * tags to scheduler tags. First allocate what we need, so we
+ * can safely fail and fallback, if needed.
+ */
+ ret = 0;
+ queue_for_each_hw_ctx(q, hctx, i) {
+ hctx->sched_tags = blk_mq_alloc_rq_map(set, i, q->nr_requests, 0);
+ if (!hctx->sched_tags) {
+ ret = -ENOMEM;
+ break;
+ }
+ ret = blk_mq_alloc_rqs(set, hctx->sched_tags, i, q->nr_requests);
+ if (ret)
+ break;
+ }
+
+ /*
+ * If we failed, free what we did allocate
+ */
+ if (ret) {
+ queue_for_each_hw_ctx(q, hctx, i) {
+ if (!hctx->sched_tags)
+ continue;
+ blk_mq_sched_free_tags(set, hctx, i);
+ }
+
+ return ret;
+ }
+
+ return 0;
+}
+
+void blk_mq_sched_teardown(struct request_queue *q)
+{
+ struct blk_mq_tag_set *set = q->tag_set;
+ struct blk_mq_hw_ctx *hctx;
+ int i;
+
+ queue_for_each_hw_ctx(q, hctx, i)
+ blk_mq_sched_free_tags(set, hctx, i);
+}
+
+int blk_mq_sched_init(struct request_queue *q)
+{
+ int ret;
+
+#if defined(CONFIG_DEFAULT_SQ_NONE)
+ if (q->nr_hw_queues == 1)
+ return 0;
+#endif
+#if defined(CONFIG_DEFAULT_MQ_NONE)
+ if (q->nr_hw_queues > 1)
+ return 0;
+#endif
+
+ mutex_lock(&q->sysfs_lock);
+ ret = elevator_init(q, NULL);
+ mutex_unlock(&q->sysfs_lock);
+
+ return ret;
+}
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
new file mode 100644
index 000000000000..9478aaeb48c5
--- /dev/null
+++ b/block/blk-mq-sched.h
@@ -0,0 +1,142 @@
+#ifndef BLK_MQ_SCHED_H
+#define BLK_MQ_SCHED_H
+
+#include "blk-mq.h"
+#include "blk-mq-tag.h"
+
+int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
+ int (*init)(struct blk_mq_hw_ctx *),
+ void (*exit)(struct blk_mq_hw_ctx *));
+
+void blk_mq_sched_free_hctx_data(struct request_queue *q,
+ void (*exit)(struct blk_mq_hw_ctx *));
+
+struct request *blk_mq_sched_get_request(struct request_queue *q, struct bio *bio, unsigned int op, struct blk_mq_alloc_data *data);
+void blk_mq_sched_put_request(struct request *rq);
+
+void blk_mq_sched_request_inserted(struct request *rq);
+bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, struct request *rq);
+bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio);
+bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio);
+bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
+void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx);
+
+void blk_mq_sched_insert_request(struct request *rq, bool at_head,
+ bool run_queue, bool async, bool can_block);
+void blk_mq_sched_insert_requests(struct request_queue *q,
+ struct blk_mq_ctx *ctx,
+ struct list_head *list, bool run_queue_async);
+
+void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
+void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx,
+ struct list_head *rq_list,
+ struct request *(*get_rq)(struct blk_mq_hw_ctx *));
+
+int blk_mq_sched_setup(struct request_queue *q);
+void blk_mq_sched_teardown(struct request_queue *q);
+
+int blk_mq_sched_init(struct request_queue *q);
+
+static inline bool
+blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
+{
+ struct elevator_queue *e = q->elevator;
+
+ if (!e || blk_queue_nomerges(q) || !bio_mergeable(bio))
+ return false;
+
+ return __blk_mq_sched_bio_merge(q, bio);
+}
+
+static inline int blk_mq_sched_get_rq_priv(struct request_queue *q,
+ struct request *rq)
+{
+ struct elevator_queue *e = q->elevator;
+
+ if (e && e->type->ops.mq.get_rq_priv)
+ return e->type->ops.mq.get_rq_priv(q, rq);
+
+ return 0;
+}
+
+static inline void blk_mq_sched_put_rq_priv(struct request_queue *q,
+ struct request *rq)
+{
+ struct elevator_queue *e = q->elevator;
+
+ if (e && e->type->ops.mq.put_rq_priv)
+ e->type->ops.mq.put_rq_priv(q, rq);
+}
+
+static inline bool
+blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
+ struct bio *bio)
+{
+ struct elevator_queue *e = q->elevator;
+
+ if (e && e->type->ops.mq.allow_merge)
+ return e->type->ops.mq.allow_merge(q, rq, bio);
+
+ return true;
+}
+
+static inline void
+blk_mq_sched_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
+{
+ struct elevator_queue *e = hctx->queue->elevator;
+
+ if (e && e->type->ops.mq.completed_request)
+ e->type->ops.mq.completed_request(hctx, rq);
+
+ BUG_ON(rq->internal_tag == -1);
+
+ blk_mq_put_tag(hctx, hctx->sched_tags, rq->mq_ctx, rq->internal_tag);
+}
+
+static inline void blk_mq_sched_started_request(struct request *rq)
+{
+ struct request_queue *q = rq->q;
+ struct elevator_queue *e = q->elevator;
+
+ if (e && e->type->ops.mq.started_request)
+ e->type->ops.mq.started_request(rq);
+}
+
+static inline void blk_mq_sched_requeue_request(struct request *rq)
+{
+ struct request_queue *q = rq->q;
+ struct elevator_queue *e = q->elevator;
+
+ if (e && e->type->ops.mq.requeue_request)
+ e->type->ops.mq.requeue_request(rq);
+}
+
+static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
+{
+ struct elevator_queue *e = hctx->queue->elevator;
+
+ if (e && e->type->ops.mq.has_work)
+ return e->type->ops.mq.has_work(hctx);
+
+ return false;
+}
+
+static inline void blk_mq_sched_mark_restart(struct blk_mq_hw_ctx *hctx)
+{
+ if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) {
+ set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+ if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
+ struct request_queue *q = hctx->queue;
+
+ if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags))
+ set_bit(QUEUE_FLAG_RESTART, &q->queue_flags);
+ }
+ }
+}
+
+static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx)
+{
+ return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+}
+
+#endif
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index eacd3af72099..308b3f4fc310 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -122,123 +122,16 @@ static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj,
return res;
}
-static ssize_t blk_mq_sysfs_dispatched_show(struct blk_mq_ctx *ctx, char *page)
-{
- return sprintf(page, "%lu %lu\n", ctx->rq_dispatched[1],
- ctx->rq_dispatched[0]);
-}
-
-static ssize_t blk_mq_sysfs_merged_show(struct blk_mq_ctx *ctx, char *page)
-{
- return sprintf(page, "%lu\n", ctx->rq_merged);
-}
-
-static ssize_t blk_mq_sysfs_completed_show(struct blk_mq_ctx *ctx, char *page)
-{
- return sprintf(page, "%lu %lu\n", ctx->rq_completed[1],
- ctx->rq_completed[0]);
-}
-
-static ssize_t sysfs_list_show(char *page, struct list_head *list, char *msg)
-{
- struct request *rq;
- int len = snprintf(page, PAGE_SIZE - 1, "%s:\n", msg);
-
- list_for_each_entry(rq, list, queuelist) {
- const int rq_len = 2 * sizeof(rq) + 2;
-
- /* if the output will be truncated */
- if (PAGE_SIZE - 1 < len + rq_len) {
- /* backspacing if it can't hold '\t...\n' */
- if (PAGE_SIZE - 1 < len + 5)
- len -= rq_len;
- len += snprintf(page + len, PAGE_SIZE - 1 - len,
- "\t...\n");
- break;
- }
- len += snprintf(page + len, PAGE_SIZE - 1 - len,
- "\t%p\n", rq);
- }
-
- return len;
-}
-
-static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page)
-{
- ssize_t ret;
-
- spin_lock(&ctx->lock);
- ret = sysfs_list_show(page, &ctx->rq_list, "CTX pending");
- spin_unlock(&ctx->lock);
-
- return ret;
-}
-
-static ssize_t blk_mq_hw_sysfs_poll_show(struct blk_mq_hw_ctx *hctx, char *page)
-{
- return sprintf(page, "considered=%lu, invoked=%lu, success=%lu\n",
- hctx->poll_considered, hctx->poll_invoked,
- hctx->poll_success);
-}
-
-static ssize_t blk_mq_hw_sysfs_poll_store(struct blk_mq_hw_ctx *hctx,
- const char *page, size_t size)
-{
- hctx->poll_considered = hctx->poll_invoked = hctx->poll_success = 0;
-
- return size;
-}
-
-static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx,
- char *page)
-{
- return sprintf(page, "%lu\n", hctx->queued);
-}
-
-static ssize_t blk_mq_hw_sysfs_run_show(struct blk_mq_hw_ctx *hctx, char *page)
-{
- return sprintf(page, "%lu\n", hctx->run);
-}
-
-static ssize_t blk_mq_hw_sysfs_dispatched_show(struct blk_mq_hw_ctx *hctx,
- char *page)
-{
- char *start_page = page;
- int i;
-
- page += sprintf(page, "%8u\t%lu\n", 0U, hctx->dispatched[0]);
-
- for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER - 1; i++) {
- unsigned int d = 1U << (i - 1);
-
- page += sprintf(page, "%8u\t%lu\n", d, hctx->dispatched[i]);
- }
-
- page += sprintf(page, "%8u+\t%lu\n", 1U << (i - 1),
- hctx->dispatched[i]);
- return page - start_page;
-}
-
-static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx,
+static ssize_t blk_mq_hw_sysfs_nr_tags_show(struct blk_mq_hw_ctx *hctx,
char *page)
{
- ssize_t ret;
-
- spin_lock(&hctx->lock);
- ret = sysfs_list_show(page, &hctx->dispatch, "HCTX pending");
- spin_unlock(&hctx->lock);
-
- return ret;
+ return sprintf(page, "%u\n", hctx->tags->nr_tags);
}
-static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
+static ssize_t blk_mq_hw_sysfs_nr_reserved_tags_show(struct blk_mq_hw_ctx *hctx,
+ char *page)
{
- return blk_mq_tag_sysfs_show(hctx->tags, page);
-}
-
-static ssize_t blk_mq_hw_sysfs_active_show(struct blk_mq_hw_ctx *hctx, char *page)
-{
- return sprintf(page, "%u\n", atomic_read(&hctx->nr_active));
+ return sprintf(page, "%u\n", hctx->tags->nr_reserved_tags);
}
static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
@@ -259,121 +152,27 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
return ret;
}
-static void blk_mq_stat_clear(struct blk_mq_hw_ctx *hctx)
-{
- struct blk_mq_ctx *ctx;
- unsigned int i;
-
- hctx_for_each_ctx(hctx, ctx, i) {
- blk_stat_init(&ctx->stat[BLK_STAT_READ]);
- blk_stat_init(&ctx->stat[BLK_STAT_WRITE]);
- }
-}
-
-static ssize_t blk_mq_hw_sysfs_stat_store(struct blk_mq_hw_ctx *hctx,
- const char *page, size_t count)
-{
- blk_mq_stat_clear(hctx);
- return count;
-}
-
-static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
-{
- return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
- pre, (long long) stat->nr_samples,
- (long long) stat->mean, (long long) stat->min,
- (long long) stat->max);
-}
-
-static ssize_t blk_mq_hw_sysfs_stat_show(struct blk_mq_hw_ctx *hctx, char *page)
-{
- struct blk_rq_stat stat[2];
- ssize_t ret;
-
- blk_stat_init(&stat[BLK_STAT_READ]);
- blk_stat_init(&stat[BLK_STAT_WRITE]);
-
- blk_hctx_stat_get(hctx, stat);
-
- ret = print_stat(page, &stat[BLK_STAT_READ], "read :");
- ret += print_stat(page + ret, &stat[BLK_STAT_WRITE], "write:");
- return ret;
-}
-
-static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = {
- .attr = {.name = "dispatched", .mode = S_IRUGO },
- .show = blk_mq_sysfs_dispatched_show,
-};
-static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_merged = {
- .attr = {.name = "merged", .mode = S_IRUGO },
- .show = blk_mq_sysfs_merged_show,
-};
-static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_completed = {
- .attr = {.name = "completed", .mode = S_IRUGO },
- .show = blk_mq_sysfs_completed_show,
-};
-static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_rq_list = {
- .attr = {.name = "rq_list", .mode = S_IRUGO },
- .show = blk_mq_sysfs_rq_list_show,
-};
-
static struct attribute *default_ctx_attrs[] = {
- &blk_mq_sysfs_dispatched.attr,
- &blk_mq_sysfs_merged.attr,
- &blk_mq_sysfs_completed.attr,
- &blk_mq_sysfs_rq_list.attr,
NULL,
};
-static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_queued = {
- .attr = {.name = "queued", .mode = S_IRUGO },
- .show = blk_mq_hw_sysfs_queued_show,
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_tags = {
+ .attr = {.name = "nr_tags", .mode = S_IRUGO },
+ .show = blk_mq_hw_sysfs_nr_tags_show,
};
-static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_run = {
- .attr = {.name = "run", .mode = S_IRUGO },
- .show = blk_mq_hw_sysfs_run_show,
-};
-static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = {
- .attr = {.name = "dispatched", .mode = S_IRUGO },
- .show = blk_mq_hw_sysfs_dispatched_show,
-};
-static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_active = {
- .attr = {.name = "active", .mode = S_IRUGO },
- .show = blk_mq_hw_sysfs_active_show,
-};
-static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = {
- .attr = {.name = "pending", .mode = S_IRUGO },
- .show = blk_mq_hw_sysfs_rq_list_show,
-};
-static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = {
- .attr = {.name = "tags", .mode = S_IRUGO },
- .show = blk_mq_hw_sysfs_tags_show,
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_reserved_tags = {
+ .attr = {.name = "nr_reserved_tags", .mode = S_IRUGO },
+ .show = blk_mq_hw_sysfs_nr_reserved_tags_show,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = {
.attr = {.name = "cpu_list", .mode = S_IRUGO },
.show = blk_mq_hw_sysfs_cpus_show,
};
-static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = {
- .attr = {.name = "io_poll", .mode = S_IWUSR | S_IRUGO },
- .show = blk_mq_hw_sysfs_poll_show,
- .store = blk_mq_hw_sysfs_poll_store,
-};
-static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_stat = {
- .attr = {.name = "stats", .mode = S_IRUGO | S_IWUSR },
- .show = blk_mq_hw_sysfs_stat_show,
- .store = blk_mq_hw_sysfs_stat_store,
-};
static struct attribute *default_hw_ctx_attrs[] = {
- &blk_mq_hw_sysfs_queued.attr,
- &blk_mq_hw_sysfs_run.attr,
- &blk_mq_hw_sysfs_dispatched.attr,
- &blk_mq_hw_sysfs_pending.attr,
- &blk_mq_hw_sysfs_tags.attr,
+ &blk_mq_hw_sysfs_nr_tags.attr,
+ &blk_mq_hw_sysfs_nr_reserved_tags.attr,
&blk_mq_hw_sysfs_cpus.attr,
- &blk_mq_hw_sysfs_active.attr,
- &blk_mq_hw_sysfs_poll.attr,
- &blk_mq_hw_sysfs_stat.attr,
NULL,
};
@@ -455,6 +254,8 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
kobject_put(&hctx->kobj);
}
+ blk_mq_debugfs_unregister(q);
+
kobject_uevent(&q->mq_kobj, KOBJ_REMOVE);
kobject_del(&q->mq_kobj);
kobject_put(&q->mq_kobj);
@@ -504,6 +305,8 @@ int blk_mq_register_dev(struct device *dev, struct request_queue *q)
kobject_uevent(&q->mq_kobj, KOBJ_ADD);
+ blk_mq_debugfs_register(q, kobject_name(&dev->kobj));
+
queue_for_each_hw_ctx(q, hctx, i) {
ret = blk_mq_register_hctx(hctx);
if (ret)
@@ -529,6 +332,8 @@ void blk_mq_sysfs_unregister(struct request_queue *q)
if (!q->mq_sysfs_init_done)
return;
+ blk_mq_debugfs_unregister_hctxs(q);
+
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_unregister_hctx(hctx);
}
@@ -541,6 +346,8 @@ int blk_mq_sysfs_register(struct request_queue *q)
if (!q->mq_sysfs_init_done)
return ret;
+ blk_mq_debugfs_register_hctxs(q);
+
queue_for_each_hw_ctx(q, hctx, i) {
ret = blk_mq_register_hctx(hctx);
if (ret)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index dcf5ce3ba4bf..54c84363c1b2 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -90,113 +90,97 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
return atomic_read(&hctx->nr_active) < depth;
}
-static int __bt_get(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt)
+static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
+ struct sbitmap_queue *bt)
{
- if (!hctx_may_queue(hctx, bt))
+ if (!(data->flags & BLK_MQ_REQ_INTERNAL) &&
+ !hctx_may_queue(data->hctx, bt))
return -1;
return __sbitmap_queue_get(bt);
}
-static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt,
- struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags)
+unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
{
+ struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
+ struct sbitmap_queue *bt;
struct sbq_wait_state *ws;
DEFINE_WAIT(wait);
+ unsigned int tag_offset;
+ bool drop_ctx;
int tag;
- tag = __bt_get(hctx, bt);
+ if (data->flags & BLK_MQ_REQ_RESERVED) {
+ if (unlikely(!tags->nr_reserved_tags)) {
+ WARN_ON_ONCE(1);
+ return BLK_MQ_TAG_FAIL;
+ }
+ bt = &tags->breserved_tags;
+ tag_offset = 0;
+ } else {
+ bt = &tags->bitmap_tags;
+ tag_offset = tags->nr_reserved_tags;
+ }
+
+ tag = __blk_mq_get_tag(data, bt);
if (tag != -1)
- return tag;
+ goto found_tag;
if (data->flags & BLK_MQ_REQ_NOWAIT)
- return -1;
+ return BLK_MQ_TAG_FAIL;
- ws = bt_wait_ptr(bt, hctx);
+ ws = bt_wait_ptr(bt, data->hctx);
+ drop_ctx = data->ctx == NULL;
do {
prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE);
- tag = __bt_get(hctx, bt);
+ tag = __blk_mq_get_tag(data, bt);
if (tag != -1)
break;
/*
* We're out of tags on this hardware queue, kick any
* pending IO submits before going to sleep waiting for
- * some to complete. Note that hctx can be NULL here for
- * reserved tag allocation.
+ * some to complete.
*/
- if (hctx)
- blk_mq_run_hw_queue(hctx, false);
+ blk_mq_run_hw_queue(data->hctx, false);
/*
* Retry tag allocation after running the hardware queue,
* as running the queue may also have found completions.
*/
- tag = __bt_get(hctx, bt);
+ tag = __blk_mq_get_tag(data, bt);
if (tag != -1)
break;
- blk_mq_put_ctx(data->ctx);
+ if (data->ctx)
+ blk_mq_put_ctx(data->ctx);
io_schedule();
data->ctx = blk_mq_get_ctx(data->q);
data->hctx = blk_mq_map_queue(data->q, data->ctx->cpu);
- if (data->flags & BLK_MQ_REQ_RESERVED) {
- bt = &data->hctx->tags->breserved_tags;
- } else {
- hctx = data->hctx;
- bt = &hctx->tags->bitmap_tags;
- }
+ tags = blk_mq_tags_from_data(data);
+ if (data->flags & BLK_MQ_REQ_RESERVED)
+ bt = &tags->breserved_tags;
+ else
+ bt = &tags->bitmap_tags;
+
finish_wait(&ws->wait, &wait);
- ws = bt_wait_ptr(bt, hctx);
+ ws = bt_wait_ptr(bt, data->hctx);
} while (1);
- finish_wait(&ws->wait, &wait);
- return tag;
-}
-
-static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data)
-{
- int tag;
-
- tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx,
- data->hctx->tags);
- if (tag >= 0)
- return tag + data->hctx->tags->nr_reserved_tags;
-
- return BLK_MQ_TAG_FAIL;
-}
-
-static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data)
-{
- int tag;
-
- if (unlikely(!data->hctx->tags->nr_reserved_tags)) {
- WARN_ON_ONCE(1);
- return BLK_MQ_TAG_FAIL;
- }
-
- tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL,
- data->hctx->tags);
- if (tag < 0)
- return BLK_MQ_TAG_FAIL;
+ if (drop_ctx && data->ctx)
+ blk_mq_put_ctx(data->ctx);
- return tag;
-}
+ finish_wait(&ws->wait, &wait);
-unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
-{
- if (data->flags & BLK_MQ_REQ_RESERVED)
- return __blk_mq_get_reserved_tag(data);
- return __blk_mq_get_tag(data);
+found_tag:
+ return tag + tag_offset;
}
-void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
- unsigned int tag)
+void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags,
+ struct blk_mq_ctx *ctx, unsigned int tag)
{
- struct blk_mq_tags *tags = hctx->tags;
-
if (tag >= tags->nr_reserved_tags) {
const int real_tag = tag - tags->nr_reserved_tags;
@@ -312,11 +296,11 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set)
struct blk_mq_tags *tags = set->tags[i];
for (j = 0; j < tags->nr_tags; j++) {
- if (!tags->rqs[j])
+ if (!tags->static_rqs[j])
continue;
ret = set->ops->reinit_request(set->driver_data,
- tags->rqs[j]);
+ tags->static_rqs[j]);
if (ret)
goto out;
}
@@ -351,11 +335,6 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
}
-static unsigned int bt_unused_tags(const struct sbitmap_queue *bt)
-{
- return bt->sb.depth - sbitmap_weight(&bt->sb);
-}
-
static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
bool round_robin, int node)
{
@@ -411,19 +390,56 @@ void blk_mq_free_tags(struct blk_mq_tags *tags)
kfree(tags);
}
-int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth)
+int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
+ struct blk_mq_tags **tagsptr, unsigned int tdepth,
+ bool can_grow)
{
- tdepth -= tags->nr_reserved_tags;
- if (tdepth > tags->nr_tags)
+ struct blk_mq_tags *tags = *tagsptr;
+
+ if (tdepth <= tags->nr_reserved_tags)
return -EINVAL;
+ tdepth -= tags->nr_reserved_tags;
+
/*
- * Don't need (or can't) update reserved tags here, they remain
- * static and should never need resizing.
+ * If we are allowed to grow beyond the original size, allocate
+ * a new set of tags before freeing the old one.
*/
- sbitmap_queue_resize(&tags->bitmap_tags, tdepth);
+ if (tdepth > tags->nr_tags) {
+ struct blk_mq_tag_set *set = hctx->queue->tag_set;
+ struct blk_mq_tags *new;
+ bool ret;
+
+ if (!can_grow)
+ return -EINVAL;
+
+ /*
+ * We need some sort of upper limit, set it high enough that
+ * no valid use cases should require more.
+ */
+ if (tdepth > 16 * BLKDEV_MAX_RQ)
+ return -EINVAL;
+
+ new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, 0);
+ if (!new)
+ return -ENOMEM;
+ ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
+ if (ret) {
+ blk_mq_free_rq_map(new);
+ return -ENOMEM;
+ }
+
+ blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
+ blk_mq_free_rq_map(*tagsptr);
+ *tagsptr = new;
+ } else {
+ /*
+ * Don't need (or can't) update reserved tags here, they
+ * remain static and should never need resizing.
+ */
+ sbitmap_queue_resize(&tags->bitmap_tags, tdepth);
+ }
- blk_mq_tag_wakeup_all(tags, false);
return 0;
}
@@ -454,25 +470,3 @@ u32 blk_mq_unique_tag(struct request *rq)
(rq->tag & BLK_MQ_UNIQUE_TAG_MASK);
}
EXPORT_SYMBOL(blk_mq_unique_tag);
-
-ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
-{
- char *orig_page = page;
- unsigned int free, res;
-
- if (!tags)
- return 0;
-
- page += sprintf(page, "nr_tags=%u, reserved_tags=%u, "
- "bits_per_word=%u\n",
- tags->nr_tags, tags->nr_reserved_tags,
- 1U << tags->bitmap_tags.sb.shift);
-
- free = bt_unused_tags(&tags->bitmap_tags);
- res = bt_unused_tags(&tags->breserved_tags);
-
- page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res);
- page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues));
-
- return page - orig_page;
-}
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index d1662734dc53..63497423c5cd 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -16,6 +16,7 @@ struct blk_mq_tags {
struct sbitmap_queue breserved_tags;
struct request **rqs;
+ struct request **static_rqs;
struct list_head page_list;
};
@@ -24,11 +25,12 @@ extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int r
extern void blk_mq_free_tags(struct blk_mq_tags *tags);
extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
-extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
- unsigned int tag);
+extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags,
+ struct blk_mq_ctx *ctx, unsigned int tag);
extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
-extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
-extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth);
+extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
+ struct blk_mq_tags **tags,
+ unsigned int depth, bool can_grow);
extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
void *priv);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c3400b5444a7..489076e7ae15 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -32,6 +32,7 @@
#include "blk-mq-tag.h"
#include "blk-stat.h"
#include "blk-wbt.h"
+#include "blk-mq-sched.h"
static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);
@@ -39,9 +40,11 @@ static LIST_HEAD(all_q_list);
/*
* Check if any of the ctx's have pending work in this hardware queue
*/
-static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
+bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
{
- return sbitmap_any_bit_set(&hctx->ctx_map);
+ return sbitmap_any_bit_set(&hctx->ctx_map) ||
+ !list_empty_careful(&hctx->dispatch) ||
+ blk_mq_sched_has_work(hctx);
}
/*
@@ -167,8 +170,8 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
}
EXPORT_SYMBOL(blk_mq_can_queue);
-static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
- struct request *rq, unsigned int op)
+void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
+ struct request *rq, unsigned int op)
{
INIT_LIST_HEAD(&rq->queuelist);
/* csd/requeue_work/fifo_time is initialized before use */
@@ -213,53 +216,58 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
ctx->rq_dispatched[op_is_sync(op)]++;
}
+EXPORT_SYMBOL_GPL(blk_mq_rq_ctx_init);
-static struct request *
-__blk_mq_alloc_request(struct blk_mq_alloc_data *data, unsigned int op)
+struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
+ unsigned int op)
{
struct request *rq;
unsigned int tag;
tag = blk_mq_get_tag(data);
if (tag != BLK_MQ_TAG_FAIL) {
- rq = data->hctx->tags->rqs[tag];
+ struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
- if (blk_mq_tag_busy(data->hctx)) {
- rq->rq_flags = RQF_MQ_INFLIGHT;
- atomic_inc(&data->hctx->nr_active);
+ rq = tags->static_rqs[tag];
+
+ if (data->flags & BLK_MQ_REQ_INTERNAL) {
+ rq->tag = -1;
+ rq->internal_tag = tag;
+ } else {
+ if (blk_mq_tag_busy(data->hctx)) {
+ rq->rq_flags = RQF_MQ_INFLIGHT;
+ atomic_inc(&data->hctx->nr_active);
+ }
+ rq->tag = tag;
+ rq->internal_tag = -1;
}
- rq->tag = tag;
blk_mq_rq_ctx_init(data->q, data->ctx, rq, op);
return rq;
}
return NULL;
}
+EXPORT_SYMBOL_GPL(__blk_mq_alloc_request);
struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
unsigned int flags)
{
- struct blk_mq_ctx *ctx;
- struct blk_mq_hw_ctx *hctx;
+ struct blk_mq_alloc_data alloc_data = { .flags = flags };
struct request *rq;
- struct blk_mq_alloc_data alloc_data;
int ret;
ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
if (ret)
return ERR_PTR(ret);
- ctx = blk_mq_get_ctx(q);
- hctx = blk_mq_map_queue(q, ctx->cpu);
- blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
- rq = __blk_mq_alloc_request(&alloc_data, rw);
- blk_mq_put_ctx(ctx);
+ rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data);
- if (!rq) {
- blk_queue_exit(q);
+ blk_mq_put_ctx(alloc_data.ctx);
+ blk_queue_exit(q);
+
+ if (!rq)
return ERR_PTR(-EWOULDBLOCK);
- }
rq->__data_len = 0;
rq->__sector = (sector_t) -1;
@@ -319,10 +327,10 @@ out_queue_exit:
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
-static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
- struct blk_mq_ctx *ctx, struct request *rq)
+void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+ struct request *rq)
{
- const int tag = rq->tag;
+ const int sched_tag = rq->internal_tag;
struct request_queue *q = rq->q;
if (rq->rq_flags & RQF_MQ_INFLIGHT)
@@ -333,23 +341,31 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
- blk_mq_put_tag(hctx, ctx, tag);
+ if (rq->tag != -1)
+ blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
+ if (sched_tag != -1)
+ blk_mq_sched_completed_request(hctx, rq);
+ blk_mq_sched_restart_queues(hctx);
blk_queue_exit(q);
}
-void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
+static void blk_mq_finish_hctx_request(struct blk_mq_hw_ctx *hctx,
+ struct request *rq)
{
struct blk_mq_ctx *ctx = rq->mq_ctx;
ctx->rq_completed[rq_is_sync(rq)]++;
- __blk_mq_free_request(hctx, ctx, rq);
+ __blk_mq_finish_request(hctx, ctx, rq);
+}
+void blk_mq_finish_request(struct request *rq)
+{
+ blk_mq_finish_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq);
}
-EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request);
void blk_mq_free_request(struct request *rq)
{
- blk_mq_free_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq);
+ blk_mq_sched_put_request(rq);
}
EXPORT_SYMBOL_GPL(blk_mq_free_request);
@@ -467,6 +483,8 @@ void blk_mq_start_request(struct request *rq)
{
struct request_queue *q = rq->q;
+ blk_mq_sched_started_request(rq);
+
trace_block_rq_issue(q, rq);
rq->resid_len = blk_rq_bytes(rq);
@@ -515,6 +533,7 @@ static void __blk_mq_requeue_request(struct request *rq)
trace_block_rq_requeue(q, rq);
wbt_requeue(q->rq_wb, &rq->issue_stat);
+ blk_mq_sched_requeue_request(rq);
if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
if (q->dma_drain_size && blk_rq_bytes(rq))
@@ -549,13 +568,13 @@ static void blk_mq_requeue_work(struct work_struct *work)
rq->rq_flags &= ~RQF_SOFTBARRIER;
list_del_init(&rq->queuelist);
- blk_mq_insert_request(rq, true, false, false);
+ blk_mq_sched_insert_request(rq, true, false, false, true);
}
while (!list_empty(&rq_list)) {
rq = list_entry(rq_list.next, struct request, queuelist);
list_del_init(&rq->queuelist);
- blk_mq_insert_request(rq, false, false, false);
+ blk_mq_sched_insert_request(rq, false, false, false, true);
}
blk_mq_run_hw_queues(q, false);
@@ -639,7 +658,7 @@ struct blk_mq_timeout_data {
void blk_mq_rq_timed_out(struct request *req, bool reserved)
{
- struct blk_mq_ops *ops = req->q->mq_ops;
+ const struct blk_mq_ops *ops = req->q->mq_ops;
enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
/*
@@ -763,6 +782,12 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
continue;
el_ret = blk_try_merge(rq, bio);
+ if (el_ret == ELEVATOR_NO_MERGE)
+ continue;
+
+ if (!blk_mq_sched_allow_merge(q, rq, bio))
+ break;
+
if (el_ret == ELEVATOR_BACK_MERGE) {
if (bio_attempt_back_merge(q, rq, bio)) {
ctx->rq_merged++;
@@ -803,7 +828,7 @@ static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
* Process software queues that have been marked busy, splicing them
* to the for-dispatch
*/
-static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
+void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
{
struct flush_busy_ctx_data data = {
.hctx = hctx,
@@ -812,6 +837,7 @@ static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
}
+EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
static inline unsigned int queued_to_index(unsigned int queued)
{
@@ -821,6 +847,74 @@ static inline unsigned int queued_to_index(unsigned int queued)
return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
}
+bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
+ bool wait)
+{
+ struct blk_mq_alloc_data data = {
+ .q = rq->q,
+ .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
+ .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
+ };
+
+ if (rq->tag != -1) {
+done:
+ if (hctx)
+ *hctx = data.hctx;
+ return true;
+ }
+
+ rq->tag = blk_mq_get_tag(&data);
+ if (rq->tag >= 0) {
+ if (blk_mq_tag_busy(data.hctx)) {
+ rq->rq_flags |= RQF_MQ_INFLIGHT;
+ atomic_inc(&data.hctx->nr_active);
+ }
+ data.hctx->tags->rqs[rq->tag] = rq;
+ goto done;
+ }
+
+ return false;
+}
+
+static void blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
+ struct request *rq)
+{
+ if (rq->tag == -1 || rq->internal_tag == -1)
+ return;
+
+ blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag);
+ rq->tag = -1;
+
+ if (rq->rq_flags & RQF_MQ_INFLIGHT) {
+ rq->rq_flags &= ~RQF_MQ_INFLIGHT;
+ atomic_dec(&hctx->nr_active);
+ }
+}
+
+/*
+ * If we fail getting a driver tag because all the driver tags are already
+ * assigned and on the dispatch list, BUT the first entry does not have a
+ * tag, then we could deadlock. For that case, move entries with assigned
+ * driver tags to the front, leaving the set of tagged requests in the
+ * same order, and the untagged set in the same order.
+ */
+static bool reorder_tags_to_front(struct list_head *list)
+{
+ struct request *rq, *tmp, *first = NULL;
+
+ list_for_each_entry_safe_reverse(rq, tmp, list, queuelist) {
+ if (rq == first)
+ break;
+ if (rq->tag != -1) {
+ list_move(&rq->queuelist, list);
+ if (!first)
+ first = rq;
+ }
+ }
+
+ return first != NULL;
+}
+
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
{
struct request_queue *q = hctx->queue;
@@ -843,6 +937,20 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
struct blk_mq_queue_data bd;
rq = list_first_entry(list, struct request, queuelist);
+ if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
+ if (!queued && reorder_tags_to_front(list))
+ continue;
+
+ /*
+ * We failed getting a driver tag. Mark the queue(s)
+ * as needing a restart. Retry getting a tag again,
+ * in case the needed IO completed right before we
+ * marked the queue as needing a restart.
+ */
+ blk_mq_sched_mark_restart(hctx);
+ if (!blk_mq_get_driver_tag(rq, &hctx, false))
+ break;
+ }
list_del_init(&rq->queuelist);
bd.rq = rq;
@@ -855,6 +963,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
queued++;
break;
case BLK_MQ_RQ_QUEUE_BUSY:
+ blk_mq_put_driver_tag(hctx, rq);
list_add(&rq->queuelist, list);
__blk_mq_requeue_request(rq);
break;
@@ -885,7 +994,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
*/
if (!list_empty(list)) {
spin_lock(&hctx->lock);
- list_splice(list, &hctx->dispatch);
+ list_splice_init(list, &hctx->dispatch);
spin_unlock(&hctx->lock);
/*
@@ -896,47 +1005,17 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
* the requests in rq_list might get lost.
*
* blk_mq_run_hw_queue() already checks the STOPPED bit
- **/
- blk_mq_run_hw_queue(hctx, true);
+ *
+ * If RESTART is set, then let completion restart the queue
+ * instead of potentially looping here.
+ */
+ if (!blk_mq_sched_needs_restart(hctx))
+ blk_mq_run_hw_queue(hctx, true);
}
return ret != BLK_MQ_RQ_QUEUE_BUSY;
}
-/*
- * Run this hardware queue, pulling any software queues mapped to it in.
- * Note that this function currently has various problems around ordering
- * of IO. In particular, we'd like FIFO behaviour on handling existing
- * items on the hctx->dispatch list. Ignore that for now.
- */
-static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
-{
- LIST_HEAD(rq_list);
-
- if (unlikely(blk_mq_hctx_stopped(hctx)))
- return;
-
- hctx->run++;
-
- /*
- * Touch any software queue that has pending entries.
- */
- flush_busy_ctxs(hctx, &rq_list);
-
- /*
- * If we have previous entries on our dispatch list, grab them
- * and stuff them at the front for more fair dispatch.
- */
- if (!list_empty_careful(&hctx->dispatch)) {
- spin_lock(&hctx->lock);
- if (!list_empty(&hctx->dispatch))
- list_splice_init(&hctx->dispatch, &rq_list);
- spin_unlock(&hctx->lock);
- }
-
- blk_mq_dispatch_rq_list(hctx, &rq_list);
-}
-
static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
{
int srcu_idx;
@@ -946,11 +1025,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
rcu_read_lock();
- blk_mq_process_rq_list(hctx);
+ blk_mq_sched_dispatch_requests(hctx);
rcu_read_unlock();
} else {
srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
- blk_mq_process_rq_list(hctx);
+ blk_mq_sched_dispatch_requests(hctx);
srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
}
}
@@ -1006,8 +1085,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async)
int i;
queue_for_each_hw_ctx(q, hctx, i) {
- if ((!blk_mq_hctx_has_pending(hctx) &&
- list_empty_careful(&hctx->dispatch)) ||
+ if (!blk_mq_hctx_has_pending(hctx) ||
blk_mq_hctx_stopped(hctx))
continue;
@@ -1116,6 +1194,7 @@ void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
return;
+ blk_mq_stop_hw_queue(hctx);
kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
&hctx->delay_work, msecs_to_jiffies(msecs));
}
@@ -1135,8 +1214,8 @@ static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
list_add_tail(&rq->queuelist, &ctx->rq_list);
}
-static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
- struct request *rq, bool at_head)
+void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
+ bool at_head)
{
struct blk_mq_ctx *ctx = rq->mq_ctx;
@@ -1144,32 +1223,10 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
blk_mq_hctx_mark_pending(hctx, ctx);
}
-void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
- bool async)
-{
- struct blk_mq_ctx *ctx = rq->mq_ctx;
- struct request_queue *q = rq->q;
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
-
- spin_lock(&ctx->lock);
- __blk_mq_insert_request(hctx, rq, at_head);
- spin_unlock(&ctx->lock);
-
- if (run_queue)
- blk_mq_run_hw_queue(hctx, async);
-}
-
-static void blk_mq_insert_requests(struct request_queue *q,
- struct blk_mq_ctx *ctx,
- struct list_head *list,
- int depth,
- bool from_schedule)
+void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+ struct list_head *list)
{
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
-
- trace_block_unplug(q, depth, !from_schedule);
-
/*
* preemption doesn't flush plug list, so it's possible ctx->cpu is
* offline now
@@ -1185,8 +1242,6 @@ static void blk_mq_insert_requests(struct request_queue *q,
}
blk_mq_hctx_mark_pending(hctx, ctx);
spin_unlock(&ctx->lock);
-
- blk_mq_run_hw_queue(hctx, from_schedule);
}
static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
@@ -1222,9 +1277,10 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
BUG_ON(!rq->q);
if (rq->mq_ctx != this_ctx) {
if (this_ctx) {
- blk_mq_insert_requests(this_q, this_ctx,
- &ctx_list, depth,
- from_schedule);
+ trace_block_unplug(this_q, depth, from_schedule);
+ blk_mq_sched_insert_requests(this_q, this_ctx,
+ &ctx_list,
+ from_schedule);
}
this_ctx = rq->mq_ctx;
@@ -1241,8 +1297,9 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
* on 'ctx_list'. Do those.
*/
if (this_ctx) {
- blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
- from_schedule);
+ trace_block_unplug(this_q, depth, from_schedule);
+ blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list,
+ from_schedule);
}
}
@@ -1280,46 +1337,39 @@ insert_rq:
}
spin_unlock(&ctx->lock);
- __blk_mq_free_request(hctx, ctx, rq);
+ __blk_mq_finish_request(hctx, ctx, rq);
return true;
}
}
-static struct request *blk_mq_map_request(struct request_queue *q,
- struct bio *bio,
- struct blk_mq_alloc_data *data)
+static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
- struct blk_mq_hw_ctx *hctx;
- struct blk_mq_ctx *ctx;
- struct request *rq;
-
- blk_queue_enter_live(q);
- ctx = blk_mq_get_ctx(q);
- hctx = blk_mq_map_queue(q, ctx->cpu);
+ if (rq->tag != -1)
+ return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false);
- trace_block_getrq(q, bio, bio->bi_opf);
- blk_mq_set_alloc_data(data, q, 0, ctx, hctx);
- rq = __blk_mq_alloc_request(data, bio->bi_opf);
-
- data->hctx->queued++;
- return rq;
+ return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
}
static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
{
- int ret;
struct request_queue *q = rq->q;
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
struct blk_mq_queue_data bd = {
.rq = rq,
.list = NULL,
.last = 1
};
- blk_qc_t new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num);
+ struct blk_mq_hw_ctx *hctx;
+ blk_qc_t new_cookie;
+ int ret;
- if (blk_mq_hctx_stopped(hctx))
+ if (q->elevator)
goto insert;
+ if (!blk_mq_get_driver_tag(rq, &hctx, false))
+ goto insert;
+
+ new_cookie = request_to_qc_t(hctx, rq);
+
/*
* For OK queue, we are done. For error, kill it. Any other
* error (busy), just add it to our list as we previously
@@ -1341,7 +1391,7 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
}
insert:
- blk_mq_insert_request(rq, false, true, true);
+ blk_mq_sched_insert_request(rq, false, true, true, false);
}
/*
@@ -1352,8 +1402,8 @@ insert:
static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
{
const int is_sync = op_is_sync(bio->bi_opf);
- const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
- struct blk_mq_alloc_data data;
+ const int is_flush_fua = op_is_flush(bio->bi_opf);
+ struct blk_mq_alloc_data data = { .flags = 0 };
struct request *rq;
unsigned int request_count = 0, srcu_idx;
struct blk_plug *plug;
@@ -1374,9 +1424,14 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
return BLK_QC_T_NONE;
+ if (blk_mq_sched_bio_merge(q, bio))
+ return BLK_QC_T_NONE;
+
wb_acct = wbt_wait(q->rq_wb, bio, NULL);
- rq = blk_mq_map_request(q, bio, &data);
+ trace_block_getrq(q, bio, bio->bi_opf);
+
+ rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data);
if (unlikely(!rq)) {
__wbt_done(q->rq_wb, wb_acct);
return BLK_QC_T_NONE;
@@ -1384,12 +1439,15 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
wbt_track(&rq->issue_stat, wb_acct);
- cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
+ cookie = request_to_qc_t(data.hctx, rq);
if (unlikely(is_flush_fua)) {
+ blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio);
+ blk_mq_get_driver_tag(rq, NULL, true);
blk_insert_flush(rq);
- goto run_queue;
+ blk_mq_run_hw_queue(data.hctx, true);
+ goto done;
}
plug = current->plug;
@@ -1438,6 +1496,13 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
goto done;
}
+ if (q->elevator) {
+ blk_mq_put_ctx(data.ctx);
+ blk_mq_bio_to_request(rq, bio);
+ blk_mq_sched_insert_request(rq, false, true,
+ !is_sync || is_flush_fua, true);
+ goto done;
+ }
if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
/*
* For a SYNC request, send it to the hardware immediately. For
@@ -1445,7 +1510,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
* latter allows for merging opportunities and more efficient
* dispatching.
*/
-run_queue:
blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
}
blk_mq_put_ctx(data.ctx);
@@ -1460,10 +1524,10 @@ done:
static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
{
const int is_sync = op_is_sync(bio->bi_opf);
- const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA);
+ const int is_flush_fua = op_is_flush(bio->bi_opf);
struct blk_plug *plug;
unsigned int request_count = 0;
- struct blk_mq_alloc_data data;
+ struct blk_mq_alloc_data data = { .flags = 0 };
struct request *rq;
blk_qc_t cookie;
unsigned int wb_acct;
@@ -1483,9 +1547,14 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
} else
request_count = blk_plug_queued_count(q);
+ if (blk_mq_sched_bio_merge(q, bio))
+ return BLK_QC_T_NONE;
+
wb_acct = wbt_wait(q->rq_wb, bio, NULL);
- rq = blk_mq_map_request(q, bio, &data);
+ trace_block_getrq(q, bio, bio->bi_opf);
+
+ rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data);
if (unlikely(!rq)) {
__wbt_done(q->rq_wb, wb_acct);
return BLK_QC_T_NONE;
@@ -1493,12 +1562,15 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
wbt_track(&rq->issue_stat, wb_acct);
- cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
+ cookie = request_to_qc_t(data.hctx, rq);
if (unlikely(is_flush_fua)) {
+ blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio);
+ blk_mq_get_driver_tag(rq, NULL, true);
blk_insert_flush(rq);
- goto run_queue;
+ blk_mq_run_hw_queue(data.hctx, true);
+ goto done;
}
/*
@@ -1535,6 +1607,13 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
return cookie;
}
+ if (q->elevator) {
+ blk_mq_put_ctx(data.ctx);
+ blk_mq_bio_to_request(rq, bio);
+ blk_mq_sched_insert_request(rq, false, true,
+ !is_sync || is_flush_fua, true);
+ goto done;
+ }
if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
/*
* For a SYNC request, send it to the hardware immediately. For
@@ -1542,16 +1621,16 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
* latter allows for merging opportunities and more efficient
* dispatching.
*/
-run_queue:
blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
}
blk_mq_put_ctx(data.ctx);
+done:
return cookie;
}
-static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
- struct blk_mq_tags *tags, unsigned int hctx_idx)
+void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
+ unsigned int hctx_idx)
{
struct page *page;
@@ -1559,11 +1638,13 @@ static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
int i;
for (i = 0; i < tags->nr_tags; i++) {
- if (!tags->rqs[i])
+ struct request *rq = tags->static_rqs[i];
+
+ if (!rq)
continue;
- set->ops->exit_request(set->driver_data, tags->rqs[i],
+ set->ops->exit_request(set->driver_data, rq,
hctx_idx, i);
- tags->rqs[i] = NULL;
+ tags->static_rqs[i] = NULL;
}
}
@@ -1577,33 +1658,32 @@ static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
kmemleak_free(page_address(page));
__free_pages(page, page->private);
}
+}
+void blk_mq_free_rq_map(struct blk_mq_tags *tags)
+{
kfree(tags->rqs);
+ tags->rqs = NULL;
+ kfree(tags->static_rqs);
+ tags->static_rqs = NULL;
blk_mq_free_tags(tags);
}
-static size_t order_to_size(unsigned int order)
-{
- return (size_t)PAGE_SIZE << order;
-}
-
-static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
- unsigned int hctx_idx)
+struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
+ unsigned int hctx_idx,
+ unsigned int nr_tags,
+ unsigned int reserved_tags)
{
struct blk_mq_tags *tags;
- unsigned int i, j, entries_per_page, max_order = 4;
- size_t rq_size, left;
- tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
+ tags = blk_mq_init_tags(nr_tags, reserved_tags,
set->numa_node,
BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
if (!tags)
return NULL;
- INIT_LIST_HEAD(&tags->page_list);
-
- tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
+ tags->rqs = kzalloc_node(nr_tags * sizeof(struct request *),
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
set->numa_node);
if (!tags->rqs) {
@@ -1611,15 +1691,40 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
return NULL;
}
+ tags->static_rqs = kzalloc_node(nr_tags * sizeof(struct request *),
+ GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
+ set->numa_node);
+ if (!tags->static_rqs) {
+ kfree(tags->rqs);
+ blk_mq_free_tags(tags);
+ return NULL;
+ }
+
+ return tags;
+}
+
+static size_t order_to_size(unsigned int order)
+{
+ return (size_t)PAGE_SIZE << order;
+}
+
+int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
+ unsigned int hctx_idx, unsigned int depth)
+{
+ unsigned int i, j, entries_per_page, max_order = 4;
+ size_t rq_size, left;
+
+ INIT_LIST_HEAD(&tags->page_list);
+
/*
* rq_size is the size of the request plus driver payload, rounded
* to the cacheline size
*/
rq_size = round_up(sizeof(struct request) + set->cmd_size,
cache_line_size());
- left = rq_size * set->queue_depth;
+ left = rq_size * depth;
- for (i = 0; i < set->queue_depth; ) {
+ for (i = 0; i < depth; ) {
int this_order = max_order;
struct page *page;
int to_do;
@@ -1653,15 +1758,17 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
*/
kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
entries_per_page = order_to_size(this_order) / rq_size;
- to_do = min(entries_per_page, set->queue_depth - i);
+ to_do = min(entries_per_page, depth - i);
left -= to_do * rq_size;
for (j = 0; j < to_do; j++) {
- tags->rqs[i] = p;
+ struct request *rq = p;
+
+ tags->static_rqs[i] = rq;
if (set->ops->init_request) {
if (set->ops->init_request(set->driver_data,
- tags->rqs[i], hctx_idx, i,
+ rq, hctx_idx, i,
set->numa_node)) {
- tags->rqs[i] = NULL;
+ tags->static_rqs[i] = NULL;
goto fail;
}
}
@@ -1670,11 +1777,11 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
i++;
}
}
- return tags;
+ return 0;
fail:
- blk_mq_free_rq_map(set, tags, hctx_idx);
- return NULL;
+ blk_mq_free_rqs(set, tags, hctx_idx);
+ return -ENOMEM;
}
/*
@@ -1866,6 +1973,35 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
}
}
+static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
+{
+ int ret = 0;
+
+ set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
+ set->queue_depth, set->reserved_tags);
+ if (!set->tags[hctx_idx])
+ return false;
+
+ ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx,
+ set->queue_depth);
+ if (!ret)
+ return true;
+
+ blk_mq_free_rq_map(set->tags[hctx_idx]);
+ set->tags[hctx_idx] = NULL;
+ return false;
+}
+
+static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
+ unsigned int hctx_idx)
+{
+ if (set->tags[hctx_idx]) {
+ blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
+ blk_mq_free_rq_map(set->tags[hctx_idx]);
+ set->tags[hctx_idx] = NULL;
+ }
+}
+
static void blk_mq_map_swqueue(struct request_queue *q,
const struct cpumask *online_mask)
{
@@ -1894,17 +2030,15 @@ static void blk_mq_map_swqueue(struct request_queue *q,
hctx_idx = q->mq_map[i];
/* unmapped hw queue can be remapped after CPU topo changed */
- if (!set->tags[hctx_idx]) {
- set->tags[hctx_idx] = blk_mq_init_rq_map(set, hctx_idx);
-
+ if (!set->tags[hctx_idx] &&
+ !__blk_mq_alloc_rq_map(set, hctx_idx)) {
/*
* If tags initialization fail for some hctx,
* that hctx won't be brought online. In this
* case, remap the current ctx to hctx[0] which
* is guaranteed to always have tags allocated
*/
- if (!set->tags[hctx_idx])
- q->mq_map[i] = 0;
+ q->mq_map[i] = 0;
}
ctx = per_cpu_ptr(q->queue_ctx, i);
@@ -1927,10 +2061,9 @@ static void blk_mq_map_swqueue(struct request_queue *q,
* fallback in case of a new remap fails
* allocation
*/
- if (i && set->tags[i]) {
- blk_mq_free_rq_map(set, set->tags[i], i);
- set->tags[i] = NULL;
- }
+ if (i && set->tags[i])
+ blk_mq_free_map_and_requests(set, i);
+
hctx->tags = NULL;
continue;
}
@@ -2023,6 +2156,8 @@ void blk_mq_release(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
unsigned int i;
+ blk_mq_sched_teardown(q);
+
/* hctx kobj stays in hctx */
queue_for_each_hw_ctx(q, hctx, i) {
if (!hctx)
@@ -2097,10 +2232,8 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
struct blk_mq_hw_ctx *hctx = hctxs[j];
if (hctx) {
- if (hctx->tags) {
- blk_mq_free_rq_map(set, hctx->tags, j);
- set->tags[j] = NULL;
- }
+ if (hctx->tags)
+ blk_mq_free_map_and_requests(set, j);
blk_mq_exit_hctx(q, set, hctx, j);
free_cpumask_var(hctx->cpumask);
kobject_put(&hctx->kobj);
@@ -2181,6 +2314,14 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
mutex_unlock(&all_q_mutex);
put_online_cpus();
+ if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
+ int ret;
+
+ ret = blk_mq_sched_init(q);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+
return q;
err_hctxs:
@@ -2279,10 +2420,10 @@ static int blk_mq_queue_reinit_dead(unsigned int cpu)
* Now CPU1 is just onlined and a request is inserted into ctx1->rq_list
* and set bit0 in pending bitmap as ctx1->index_hw is still zero.
*
- * And then while running hw queue, flush_busy_ctxs() finds bit0 is set in
- * pending bitmap and tries to retrieve requests in hctx->ctxs[0]->rq_list.
- * But htx->ctxs[0] is a pointer to ctx0, so the request in ctx1->rq_list
- * is ignored.
+ * And then while running hw queue, blk_mq_flush_busy_ctxs() finds bit0 is set
+ * in pending bitmap and tries to retrieve requests in hctx->ctxs[0]->rq_list.
+ * But htx->ctxs[0] is a pointer to ctx0, so the request in ctx1->rq_list is
+ * ignored.
*/
static int blk_mq_queue_reinit_prepare(unsigned int cpu)
{
@@ -2296,17 +2437,15 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
int i;
- for (i = 0; i < set->nr_hw_queues; i++) {
- set->tags[i] = blk_mq_init_rq_map(set, i);
- if (!set->tags[i])
+ for (i = 0; i < set->nr_hw_queues; i++)
+ if (!__blk_mq_alloc_rq_map(set, i))
goto out_unwind;
- }
return 0;
out_unwind:
while (--i >= 0)
- blk_mq_free_rq_map(set, set->tags[i], i);
+ blk_mq_free_rq_map(set->tags[i]);
return -ENOMEM;
}
@@ -2430,10 +2569,8 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
{
int i;
- for (i = 0; i < nr_cpu_ids; i++) {
- if (set->tags[i])
- blk_mq_free_rq_map(set, set->tags[i], i);
- }
+ for (i = 0; i < nr_cpu_ids; i++)
+ blk_mq_free_map_and_requests(set, i);
kfree(set->mq_map);
set->mq_map = NULL;
@@ -2449,14 +2586,28 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
struct blk_mq_hw_ctx *hctx;
int i, ret;
- if (!set || nr > set->queue_depth)
+ if (!set)
return -EINVAL;
+ blk_mq_freeze_queue(q);
+ blk_mq_quiesce_queue(q);
+
ret = 0;
queue_for_each_hw_ctx(q, hctx, i) {
if (!hctx->tags)
continue;
- ret = blk_mq_tag_update_depth(hctx->tags, nr);
+ /*
+ * If we're using an MQ scheduler, just update the scheduler
+ * queue depth. This is similar to what the old code would do.
+ */
+ if (!hctx->sched_tags) {
+ ret = blk_mq_tag_update_depth(hctx, &hctx->tags,
+ min(nr, set->queue_depth),
+ false);
+ } else {
+ ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
+ nr, true);
+ }
if (ret)
break;
}
@@ -2464,6 +2615,9 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
if (!ret)
q->nr_requests = nr;
+ blk_mq_unfreeze_queue(q);
+ blk_mq_start_stopped_hw_queues(q, true);
+
return ret;
}
@@ -2649,7 +2803,10 @@ bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
blk_flush_plug_list(plug, false);
hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
- rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
+ if (!blk_qc_t_is_internal(cookie))
+ rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
+ else
+ rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
return __blk_mq_poll(hctx, rq);
}
@@ -2667,6 +2824,8 @@ void blk_mq_enable_hotplug(void)
static int __init blk_mq_init(void)
{
+ blk_mq_debugfs_init();
+
cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
blk_mq_hctx_notify_dead);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 63e9116cddbd..b52abd62b1b0 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -32,8 +32,32 @@ void blk_mq_free_queue(struct request_queue *q);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q);
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *, struct list_head *);
+void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
+bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx);
+bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
+ bool wait);
/*
+ * Internal helpers for allocating/freeing the request map
+ */
+void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
+ unsigned int hctx_idx);
+void blk_mq_free_rq_map(struct blk_mq_tags *tags);
+struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
+ unsigned int hctx_idx,
+ unsigned int nr_tags,
+ unsigned int reserved_tags);
+int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
+ unsigned int hctx_idx, unsigned int depth);
+
+/*
+ * Internal helpers for request insertion into sw queues
+ */
+void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
+ bool at_head);
+void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+ struct list_head *list);
+/*
* CPU hotplug helpers
*/
void blk_mq_enable_hotplug(void);
@@ -57,6 +81,40 @@ extern int blk_mq_sysfs_register(struct request_queue *q);
extern void blk_mq_sysfs_unregister(struct request_queue *q);
extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
+/*
+ * debugfs helpers
+ */
+#ifdef CONFIG_BLK_DEBUG_FS
+void blk_mq_debugfs_init(void);
+int blk_mq_debugfs_register(struct request_queue *q, const char *name);
+void blk_mq_debugfs_unregister(struct request_queue *q);
+int blk_mq_debugfs_register_hctxs(struct request_queue *q);
+void blk_mq_debugfs_unregister_hctxs(struct request_queue *q);
+#else
+static inline void blk_mq_debugfs_init(void)
+{
+}
+
+static inline int blk_mq_debugfs_register(struct request_queue *q,
+ const char *name)
+{
+ return 0;
+}
+
+static inline void blk_mq_debugfs_unregister(struct request_queue *q)
+{
+}
+
+static inline int blk_mq_debugfs_register_hctxs(struct request_queue *q)
+{
+ return 0;
+}
+
+static inline void blk_mq_debugfs_unregister_hctxs(struct request_queue *q)
+{
+}
+#endif
+
extern void blk_mq_rq_timed_out(struct request *req, bool reserved);
void blk_mq_release(struct request_queue *q);
@@ -103,6 +161,25 @@ static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data,
data->hctx = hctx;
}
+static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
+{
+ if (data->flags & BLK_MQ_REQ_INTERNAL)
+ return data->hctx->sched_tags;
+
+ return data->hctx->tags;
+}
+
+/*
+ * Internal helpers for request allocation/init/free
+ */
+void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
+ struct request *rq, unsigned int op);
+void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+ struct request *rq);
+void blk_mq_finish_request(struct request *rq);
+struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
+ unsigned int op);
+
static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
{
return test_bit(BLK_MQ_S_STOPPED, &hctx->state);
diff --git a/block/blk-tag.c b/block/blk-tag.c
index bae1decb6ec3..07cc329fa4b0 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -272,6 +272,7 @@ void blk_queue_end_tag(struct request_queue *q, struct request *rq)
list_del_init(&rq->queuelist);
rq->rq_flags &= ~RQF_QUEUED;
rq->tag = -1;
+ rq->internal_tag = -1;
if (unlikely(bqt->tag_index[tag] == NULL))
printk(KERN_ERR "%s: tag %d is missing\n",
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index a6bb4fe326c3..82fd0cc394eb 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -866,10 +866,12 @@ static void tg_update_disptime(struct throtl_grp *tg)
unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
struct bio *bio;
- if ((bio = throtl_peek_queued(&sq->queued[READ])))
+ bio = throtl_peek_queued(&sq->queued[READ]);
+ if (bio)
tg_may_dispatch(tg, bio, &read_wait);
- if ((bio = throtl_peek_queued(&sq->queued[WRITE])))
+ bio = throtl_peek_queued(&sq->queued[WRITE]);
+ if (bio)
tg_may_dispatch(tg, bio, &write_wait);
min_wait = min(read_wait, write_wait);
diff --git a/block/blk.h b/block/blk.h
index 041185e5f129..9a716b5925a4 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -167,7 +167,7 @@ static inline struct request *__elv_next_request(struct request_queue *q)
return NULL;
}
if (unlikely(blk_queue_bypass(q)) ||
- !q->elevator->type->ops.elevator_dispatch_fn(q, 0))
+ !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0))
return NULL;
}
}
@@ -176,16 +176,16 @@ static inline void elv_activate_rq(struct request_queue *q, struct request *rq)
{
struct elevator_queue *e = q->elevator;
- if (e->type->ops.elevator_activate_req_fn)
- e->type->ops.elevator_activate_req_fn(q, rq);
+ if (e->type->ops.sq.elevator_activate_req_fn)
+ e->type->ops.sq.elevator_activate_req_fn(q, rq);
}
static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq)
{
struct elevator_queue *e = q->elevator;
- if (e->type->ops.elevator_deactivate_req_fn)
- e->type->ops.elevator_deactivate_req_fn(q, rq);
+ if (e->type->ops.sq.elevator_deactivate_req_fn)
+ e->type->ops.sq.elevator_deactivate_req_fn(q, rq);
}
#ifdef CONFIG_FAIL_IO_TIMEOUT
@@ -264,6 +264,22 @@ void ioc_clear_queue(struct request_queue *q);
int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);
/**
+ * rq_ioc - determine io_context for request allocation
+ * @bio: request being allocated is for this bio (can be %NULL)
+ *
+ * Determine io_context to use for request allocation for @bio. May return
+ * %NULL if %current->io_context doesn't exist.
+ */
+static inline struct io_context *rq_ioc(struct bio *bio)
+{
+#ifdef CONFIG_BLK_CGROUP
+ if (bio && bio->bi_ioc)
+ return bio->bi_ioc;
+#endif
+ return current->io_context;
+}
+
+/**
* create_io_context - try to create task->io_context
* @gfp_mask: allocation mask
* @node: allocation node
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index c73a6fcaeb9d..f0f29ee731e1 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2749,9 +2749,11 @@ static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
if (!cfqg)
return NULL;
- for_each_cfqg_st(cfqg, i, j, st)
- if ((cfqq = cfq_rb_first(st)) != NULL)
+ for_each_cfqg_st(cfqg, i, j, st) {
+ cfqq = cfq_rb_first(st);
+ if (cfqq)
return cfqq;
+ }
return NULL;
}
@@ -3864,6 +3866,8 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
goto out;
}
+ /* cfq_init_cfqq() assumes cfqq->ioprio_class is initialized. */
+ cfqq->ioprio_class = IOPRIO_CLASS_NONE;
cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
cfq_init_prio_data(cfqq, cic);
cfq_link_cfqq_cfqg(cfqq, cfqg);
@@ -4837,7 +4841,7 @@ static struct elv_fs_entry cfq_attrs[] = {
};
static struct elevator_type iosched_cfq = {
- .ops = {
+ .ops.sq = {
.elevator_merge_fn = cfq_merge,
.elevator_merged_fn = cfq_merged_request,
.elevator_merge_req_fn = cfq_merged_requests,
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index 55e0bb6d7da7..05fc0ea25a98 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -439,7 +439,7 @@ static struct elv_fs_entry deadline_attrs[] = {
};
static struct elevator_type iosched_deadline = {
- .ops = {
+ .ops.sq = {
.elevator_merge_fn = deadline_merge,
.elevator_merged_fn = deadline_merged_request,
.elevator_merge_req_fn = deadline_merged_requests,
diff --git a/block/elevator.c b/block/elevator.c
index 40f0c04e5ad3..b2a55167f0c2 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -40,6 +40,7 @@
#include <trace/events/block.h>
#include "blk.h"
+#include "blk-mq-sched.h"
static DEFINE_SPINLOCK(elv_list_lock);
static LIST_HEAD(elv_list);
@@ -58,8 +59,10 @@ static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
struct request_queue *q = rq->q;
struct elevator_queue *e = q->elevator;
- if (e->type->ops.elevator_allow_bio_merge_fn)
- return e->type->ops.elevator_allow_bio_merge_fn(q, rq, bio);
+ if (e->uses_mq && e->type->ops.mq.allow_merge)
+ return e->type->ops.mq.allow_merge(q, rq, bio);
+ else if (!e->uses_mq && e->type->ops.sq.elevator_allow_bio_merge_fn)
+ return e->type->ops.sq.elevator_allow_bio_merge_fn(q, rq, bio);
return 1;
}
@@ -163,6 +166,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
kobject_init(&eq->kobj, &elv_ktype);
mutex_init(&eq->sysfs_lock);
hash_init(eq->hash);
+ eq->uses_mq = e->uses_mq;
return eq;
}
@@ -203,11 +207,12 @@ int elevator_init(struct request_queue *q, char *name)
}
/*
- * Use the default elevator specified by config boot param or
- * config option. Don't try to load modules as we could be running
- * off async and request_module() isn't allowed from async.
+ * Use the default elevator specified by config boot param for
+ * non-mq devices, or by config option. Don't try to load modules
+ * as we could be running off async and request_module() isn't
+ * allowed from async.
*/
- if (!e && *chosen_elevator) {
+ if (!e && !q->mq_ops && *chosen_elevator) {
e = elevator_get(chosen_elevator, false);
if (!e)
printk(KERN_ERR "I/O scheduler %s not found\n",
@@ -215,18 +220,32 @@ int elevator_init(struct request_queue *q, char *name)
}
if (!e) {
- e = elevator_get(CONFIG_DEFAULT_IOSCHED, false);
+ if (q->mq_ops && q->nr_hw_queues == 1)
+ e = elevator_get(CONFIG_DEFAULT_SQ_IOSCHED, false);
+ else if (q->mq_ops)
+ e = elevator_get(CONFIG_DEFAULT_MQ_IOSCHED, false);
+ else
+ e = elevator_get(CONFIG_DEFAULT_IOSCHED, false);
+
if (!e) {
printk(KERN_ERR
"Default I/O scheduler not found. " \
- "Using noop.\n");
+ "Using noop/none.\n");
e = elevator_get("noop", false);
}
}
- err = e->ops.elevator_init_fn(q, e);
- if (err)
+ if (e->uses_mq) {
+ err = blk_mq_sched_setup(q);
+ if (!err)
+ err = e->ops.mq.init_sched(q, e);
+ } else
+ err = e->ops.sq.elevator_init_fn(q, e);
+ if (err) {
+ if (e->uses_mq)
+ blk_mq_sched_teardown(q);
elevator_put(e);
+ }
return err;
}
EXPORT_SYMBOL(elevator_init);
@@ -234,8 +253,10 @@ EXPORT_SYMBOL(elevator_init);
void elevator_exit(struct elevator_queue *e)
{
mutex_lock(&e->sysfs_lock);
- if (e->type->ops.elevator_exit_fn)
- e->type->ops.elevator_exit_fn(e);
+ if (e->uses_mq && e->type->ops.mq.exit_sched)
+ e->type->ops.mq.exit_sched(e);
+ else if (!e->uses_mq && e->type->ops.sq.elevator_exit_fn)
+ e->type->ops.sq.elevator_exit_fn(e);
mutex_unlock(&e->sysfs_lock);
kobject_put(&e->kobj);
@@ -253,6 +274,7 @@ void elv_rqhash_del(struct request_queue *q, struct request *rq)
if (ELV_ON_HASH(rq))
__elv_rqhash_del(rq);
}
+EXPORT_SYMBOL_GPL(elv_rqhash_del);
void elv_rqhash_add(struct request_queue *q, struct request *rq)
{
@@ -262,6 +284,7 @@ void elv_rqhash_add(struct request_queue *q, struct request *rq)
hash_add(e->hash, &rq->hash, rq_hash_key(rq));
rq->rq_flags |= RQF_HASHED;
}
+EXPORT_SYMBOL_GPL(elv_rqhash_add);
void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
{
@@ -443,8 +466,10 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
return ELEVATOR_BACK_MERGE;
}
- if (e->type->ops.elevator_merge_fn)
- return e->type->ops.elevator_merge_fn(q, req, bio);
+ if (e->uses_mq && e->type->ops.mq.request_merge)
+ return e->type->ops.mq.request_merge(q, req, bio);
+ else if (!e->uses_mq && e->type->ops.sq.elevator_merge_fn)
+ return e->type->ops.sq.elevator_merge_fn(q, req, bio);
return ELEVATOR_NO_MERGE;
}
@@ -456,8 +481,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
*
* Returns true if we merged, false otherwise
*/
-static bool elv_attempt_insert_merge(struct request_queue *q,
- struct request *rq)
+bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
{
struct request *__rq;
bool ret;
@@ -495,8 +519,10 @@ void elv_merged_request(struct request_queue *q, struct request *rq, int type)
{
struct elevator_queue *e = q->elevator;
- if (e->type->ops.elevator_merged_fn)
- e->type->ops.elevator_merged_fn(q, rq, type);
+ if (e->uses_mq && e->type->ops.mq.request_merged)
+ e->type->ops.mq.request_merged(q, rq, type);
+ else if (!e->uses_mq && e->type->ops.sq.elevator_merged_fn)
+ e->type->ops.sq.elevator_merged_fn(q, rq, type);
if (type == ELEVATOR_BACK_MERGE)
elv_rqhash_reposition(q, rq);
@@ -508,10 +534,15 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
struct request *next)
{
struct elevator_queue *e = q->elevator;
- const int next_sorted = next->rq_flags & RQF_SORTED;
-
- if (next_sorted && e->type->ops.elevator_merge_req_fn)
- e->type->ops.elevator_merge_req_fn(q, rq, next);
+ bool next_sorted = false;
+
+ if (e->uses_mq && e->type->ops.mq.requests_merged)
+ e->type->ops.mq.requests_merged(q, rq, next);
+ else if (e->type->ops.sq.elevator_merge_req_fn) {
+ next_sorted = next->rq_flags & RQF_SORTED;
+ if (next_sorted)
+ e->type->ops.sq.elevator_merge_req_fn(q, rq, next);
+ }
elv_rqhash_reposition(q, rq);
@@ -528,8 +559,11 @@ void elv_bio_merged(struct request_queue *q, struct request *rq,
{
struct elevator_queue *e = q->elevator;
- if (e->type->ops.elevator_bio_merged_fn)
- e->type->ops.elevator_bio_merged_fn(q, rq, bio);
+ if (WARN_ON_ONCE(e->uses_mq))
+ return;
+
+ if (e->type->ops.sq.elevator_bio_merged_fn)
+ e->type->ops.sq.elevator_bio_merged_fn(q, rq, bio);
}
#ifdef CONFIG_PM
@@ -574,11 +608,15 @@ void elv_requeue_request(struct request_queue *q, struct request *rq)
void elv_drain_elevator(struct request_queue *q)
{
+ struct elevator_queue *e = q->elevator;
static int printed;
+ if (WARN_ON_ONCE(e->uses_mq))
+ return;
+
lockdep_assert_held(q->queue_lock);
- while (q->elevator->type->ops.elevator_dispatch_fn(q, 1))
+ while (e->type->ops.sq.elevator_dispatch_fn(q, 1))
;
if (q->nr_sorted && printed++ < 10) {
printk(KERN_ERR "%s: forced dispatching is broken "
@@ -653,7 +691,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where)
* rq cannot be accessed after calling
* elevator_add_req_fn.
*/
- q->elevator->type->ops.elevator_add_req_fn(q, rq);
+ q->elevator->type->ops.sq.elevator_add_req_fn(q, rq);
break;
case ELEVATOR_INSERT_FLUSH:
@@ -682,8 +720,11 @@ struct request *elv_latter_request(struct request_queue *q, struct request *rq)
{
struct elevator_queue *e = q->elevator;
- if (e->type->ops.elevator_latter_req_fn)
- return e->type->ops.elevator_latter_req_fn(q, rq);
+ if (e->uses_mq && e->type->ops.mq.next_request)
+ return e->type->ops.mq.next_request(q, rq);
+ else if (!e->uses_mq && e->type->ops.sq.elevator_latter_req_fn)
+ return e->type->ops.sq.elevator_latter_req_fn(q, rq);
+
return NULL;
}
@@ -691,8 +732,10 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq)
{
struct elevator_queue *e = q->elevator;
- if (e->type->ops.elevator_former_req_fn)
- return e->type->ops.elevator_former_req_fn(q, rq);
+ if (e->uses_mq && e->type->ops.mq.former_request)
+ return e->type->ops.mq.former_request(q, rq);
+ if (!e->uses_mq && e->type->ops.sq.elevator_former_req_fn)
+ return e->type->ops.sq.elevator_former_req_fn(q, rq);
return NULL;
}
@@ -701,8 +744,11 @@ int elv_set_request(struct request_queue *q, struct request *rq,
{
struct elevator_queue *e = q->elevator;
- if (e->type->ops.elevator_set_req_fn)
- return e->type->ops.elevator_set_req_fn(q, rq, bio, gfp_mask);
+ if (WARN_ON_ONCE(e->uses_mq))
+ return 0;
+
+ if (e->type->ops.sq.elevator_set_req_fn)
+ return e->type->ops.sq.elevator_set_req_fn(q, rq, bio, gfp_mask);
return 0;
}
@@ -710,16 +756,22 @@ void elv_put_request(struct request_queue *q, struct request *rq)
{
struct elevator_queue *e = q->elevator;
- if (e->type->ops.elevator_put_req_fn)
- e->type->ops.elevator_put_req_fn(rq);
+ if (WARN_ON_ONCE(e->uses_mq))
+ return;
+
+ if (e->type->ops.sq.elevator_put_req_fn)
+ e->type->ops.sq.elevator_put_req_fn(rq);
}
int elv_may_queue(struct request_queue *q, unsigned int op)
{
struct elevator_queue *e = q->elevator;
- if (e->type->ops.elevator_may_queue_fn)
- return e->type->ops.elevator_may_queue_fn(q, op);
+ if (WARN_ON_ONCE(e->uses_mq))
+ return 0;
+
+ if (e->type->ops.sq.elevator_may_queue_fn)
+ return e->type->ops.sq.elevator_may_queue_fn(q, op);
return ELV_MQUEUE_MAY;
}
@@ -728,14 +780,17 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
{
struct elevator_queue *e = q->elevator;
+ if (WARN_ON_ONCE(e->uses_mq))
+ return;
+
/*
* request is released from the driver, io must be done
*/
if (blk_account_rq(rq)) {
q->in_flight[rq_is_sync(rq)]--;
if ((rq->rq_flags & RQF_SORTED) &&
- e->type->ops.elevator_completed_req_fn)
- e->type->ops.elevator_completed_req_fn(q, rq);
+ e->type->ops.sq.elevator_completed_req_fn)
+ e->type->ops.sq.elevator_completed_req_fn(q, rq);
}
}
@@ -803,8 +858,8 @@ int elv_register_queue(struct request_queue *q)
}
kobject_uevent(&e->kobj, KOBJ_ADD);
e->registered = 1;
- if (e->type->ops.elevator_registered_fn)
- e->type->ops.elevator_registered_fn(q);
+ if (!e->uses_mq && e->type->ops.sq.elevator_registered_fn)
+ e->type->ops.sq.elevator_registered_fn(q);
}
return error;
}
@@ -891,9 +946,14 @@ EXPORT_SYMBOL_GPL(elv_unregister);
static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
{
struct elevator_queue *old = q->elevator;
- bool registered = old->registered;
+ bool old_registered = false;
int err;
+ if (q->mq_ops) {
+ blk_mq_freeze_queue(q);
+ blk_mq_quiesce_queue(q);
+ }
+
/*
* Turn on BYPASS and drain all requests w/ elevator private data.
* Block layer doesn't call into a quiesced elevator - all requests
@@ -901,42 +961,76 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
* using INSERT_BACK. All requests have SOFTBARRIER set and no
* merge happens either.
*/
- blk_queue_bypass_start(q);
+ if (old) {
+ old_registered = old->registered;
- /* unregister and clear all auxiliary data of the old elevator */
- if (registered)
- elv_unregister_queue(q);
+ if (old->uses_mq)
+ blk_mq_sched_teardown(q);
- spin_lock_irq(q->queue_lock);
- ioc_clear_queue(q);
- spin_unlock_irq(q->queue_lock);
+ if (!q->mq_ops)
+ blk_queue_bypass_start(q);
+
+ /* unregister and clear all auxiliary data of the old elevator */
+ if (old_registered)
+ elv_unregister_queue(q);
+
+ spin_lock_irq(q->queue_lock);
+ ioc_clear_queue(q);
+ spin_unlock_irq(q->queue_lock);
+ }
/* allocate, init and register new elevator */
- err = new_e->ops.elevator_init_fn(q, new_e);
- if (err)
- goto fail_init;
+ if (new_e) {
+ if (new_e->uses_mq) {
+ err = blk_mq_sched_setup(q);
+ if (!err)
+ err = new_e->ops.mq.init_sched(q, new_e);
+ } else
+ err = new_e->ops.sq.elevator_init_fn(q, new_e);
+ if (err)
+ goto fail_init;
- if (registered) {
err = elv_register_queue(q);
if (err)
goto fail_register;
- }
+ } else
+ q->elevator = NULL;
/* done, kill the old one and finish */
- elevator_exit(old);
- blk_queue_bypass_end(q);
+ if (old) {
+ elevator_exit(old);
+ if (!q->mq_ops)
+ blk_queue_bypass_end(q);
+ }
- blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
+ if (q->mq_ops) {
+ blk_mq_unfreeze_queue(q);
+ blk_mq_start_stopped_hw_queues(q, true);
+ }
+
+ if (new_e)
+ blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
+ else
+ blk_add_trace_msg(q, "elv switch: none");
return 0;
fail_register:
+ if (q->mq_ops)
+ blk_mq_sched_teardown(q);
elevator_exit(q->elevator);
fail_init:
/* switch failed, restore and re-register old elevator */
- q->elevator = old;
- elv_register_queue(q);
- blk_queue_bypass_end(q);
+ if (old) {
+ q->elevator = old;
+ elv_register_queue(q);
+ if (!q->mq_ops)
+ blk_queue_bypass_end(q);
+ }
+ if (q->mq_ops) {
+ blk_mq_unfreeze_queue(q);
+ blk_mq_start_stopped_hw_queues(q, true);
+ }
return err;
}
@@ -949,8 +1043,11 @@ static int __elevator_change(struct request_queue *q, const char *name)
char elevator_name[ELV_NAME_MAX];
struct elevator_type *e;
- if (!q->elevator)
- return -ENXIO;
+ /*
+ * Special case for mq, turn off scheduling
+ */
+ if (q->mq_ops && !strncmp(name, "none", 4))
+ return elevator_switch(q, NULL);
strlcpy(elevator_name, name, sizeof(elevator_name));
e = elevator_get(strstrip(elevator_name), true);
@@ -959,11 +1056,21 @@ static int __elevator_change(struct request_queue *q, const char *name)
return -EINVAL;
}
- if (!strcmp(elevator_name, q->elevator->type->elevator_name)) {
+ if (q->elevator &&
+ !strcmp(elevator_name, q->elevator->type->elevator_name)) {
elevator_put(e);
return 0;
}
+ if (!e->uses_mq && q->mq_ops) {
+ elevator_put(e);
+ return -EINVAL;
+ }
+ if (e->uses_mq && !q->mq_ops) {
+ elevator_put(e);
+ return -EINVAL;
+ }
+
return elevator_switch(q, e);
}
@@ -985,7 +1092,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
{
int ret;
- if (!q->elevator)
+ if (!(q->mq_ops || q->request_fn))
return count;
ret = __elevator_change(q, name);
@@ -999,24 +1106,34 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
ssize_t elv_iosched_show(struct request_queue *q, char *name)
{
struct elevator_queue *e = q->elevator;
- struct elevator_type *elv;
+ struct elevator_type *elv = NULL;
struct elevator_type *__e;
int len = 0;
- if (!q->elevator || !blk_queue_stackable(q))
+ if (!blk_queue_stackable(q))
return sprintf(name, "none\n");
- elv = e->type;
+ if (!q->elevator)
+ len += sprintf(name+len, "[none] ");
+ else
+ elv = e->type;
spin_lock(&elv_list_lock);
list_for_each_entry(__e, &elv_list, list) {
- if (!strcmp(elv->elevator_name, __e->elevator_name))
+ if (elv && !strcmp(elv->elevator_name, __e->elevator_name)) {
len += sprintf(name+len, "[%s] ", elv->elevator_name);
- else
+ continue;
+ }
+ if (__e->uses_mq && q->mq_ops)
+ len += sprintf(name+len, "%s ", __e->elevator_name);
+ else if (!__e->uses_mq && !q->mq_ops)
len += sprintf(name+len, "%s ", __e->elevator_name);
}
spin_unlock(&elv_list_lock);
+ if (q->mq_ops && q->elevator)
+ len += sprintf(name+len, "none");
+
len += sprintf(len+name, "\n");
return len;
}
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
new file mode 100644
index 000000000000..d93ec713fa62
--- /dev/null
+++ b/block/mq-deadline.c
@@ -0,0 +1,555 @@
+/*
+ * MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler,
+ * for the blk-mq scheduling framework
+ *
+ * Copyright (C) 2016 Jens Axboe <axboe@kernel.dk>
+ */
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/elevator.h>
+#include <linux/bio.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/compiler.h>
+#include <linux/rbtree.h>
+#include <linux/sbitmap.h>
+
+#include "blk.h"
+#include "blk-mq.h"
+#include "blk-mq-tag.h"
+#include "blk-mq-sched.h"
+
+/*
+ * See Documentation/block/deadline-iosched.txt
+ */
+static const int read_expire = HZ / 2; /* max time before a read is submitted. */
+static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
+static const int writes_starved = 2; /* max times reads can starve a write */
+static const int fifo_batch = 16; /* # of sequential requests treated as one
+ by the above parameters. For throughput. */
+
+struct deadline_data {
+ /*
+ * run time data
+ */
+
+ /*
+ * requests (deadline_rq s) are present on both sort_list and fifo_list
+ */
+ struct rb_root sort_list[2];
+ struct list_head fifo_list[2];
+
+ /*
+ * next in sort order. read, write or both are NULL
+ */
+ struct request *next_rq[2];
+ unsigned int batching; /* number of sequential requests made */
+ unsigned int starved; /* times reads have starved writes */
+
+ /*
+ * settings that change how the i/o scheduler behaves
+ */
+ int fifo_expire[2];
+ int fifo_batch;
+ int writes_starved;
+ int front_merges;
+
+ spinlock_t lock;
+ struct list_head dispatch;
+};
+
+static inline struct rb_root *
+deadline_rb_root(struct deadline_data *dd, struct request *rq)
+{
+ return &dd->sort_list[rq_data_dir(rq)];
+}
+
+/*
+ * get the request after `rq' in sector-sorted order
+ */
+static inline struct request *
+deadline_latter_request(struct request *rq)
+{
+ struct rb_node *node = rb_next(&rq->rb_node);
+
+ if (node)
+ return rb_entry_rq(node);
+
+ return NULL;
+}
+
+static void
+deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
+{
+ struct rb_root *root = deadline_rb_root(dd, rq);
+
+ elv_rb_add(root, rq);
+}
+
+static inline void
+deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
+{
+ const int data_dir = rq_data_dir(rq);
+
+ if (dd->next_rq[data_dir] == rq)
+ dd->next_rq[data_dir] = deadline_latter_request(rq);
+
+ elv_rb_del(deadline_rb_root(dd, rq), rq);
+}
+
+/*
+ * remove rq from rbtree and fifo.
+ */
+static void deadline_remove_request(struct request_queue *q, struct request *rq)
+{
+ struct deadline_data *dd = q->elevator->elevator_data;
+
+ list_del_init(&rq->queuelist);
+
+ /*
+ * We might not be on the rbtree, if we are doing an insert merge
+ */
+ if (!RB_EMPTY_NODE(&rq->rb_node))
+ deadline_del_rq_rb(dd, rq);
+
+ elv_rqhash_del(q, rq);
+ if (q->last_merge == rq)
+ q->last_merge = NULL;
+}
+
+static void dd_request_merged(struct request_queue *q, struct request *req,
+ int type)
+{
+ struct deadline_data *dd = q->elevator->elevator_data;
+
+ /*
+ * if the merge was a front merge, we need to reposition request
+ */
+ if (type == ELEVATOR_FRONT_MERGE) {
+ elv_rb_del(deadline_rb_root(dd, req), req);
+ deadline_add_rq_rb(dd, req);
+ }
+}
+
+static void dd_merged_requests(struct request_queue *q, struct request *req,
+ struct request *next)
+{
+ /*
+ * if next expires before rq, assign its expire time to rq
+ * and move into next position (next will be deleted) in fifo
+ */
+ if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
+ if (time_before((unsigned long)next->fifo_time,
+ (unsigned long)req->fifo_time)) {
+ list_move(&req->queuelist, &next->queuelist);
+ req->fifo_time = next->fifo_time;
+ }
+ }
+
+ /*
+ * kill knowledge of next, this one is a goner
+ */
+ deadline_remove_request(q, next);
+}
+
+/*
+ * move an entry to dispatch queue
+ */
+static void
+deadline_move_request(struct deadline_data *dd, struct request *rq)
+{
+ const int data_dir = rq_data_dir(rq);
+
+ dd->next_rq[READ] = NULL;
+ dd->next_rq[WRITE] = NULL;
+ dd->next_rq[data_dir] = deadline_latter_request(rq);
+
+ /*
+ * take it off the sort and fifo list
+ */
+ deadline_remove_request(rq->q, rq);
+}
+
+/*
+ * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
+ * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
+ */
+static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
+{
+ struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next);
+
+ /*
+ * rq is expired!
+ */
+ if (time_after_eq(jiffies, (unsigned long)rq->fifo_time))
+ return 1;
+
+ return 0;
+}
+
+/*
+ * deadline_dispatch_requests selects the best request according to
+ * read/write expire, fifo_batch, etc
+ */
+static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
+{
+ struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+ struct request *rq;
+ bool reads, writes;
+ int data_dir;
+
+ if (!list_empty(&dd->dispatch)) {
+ rq = list_first_entry(&dd->dispatch, struct request, queuelist);
+ list_del_init(&rq->queuelist);
+ goto done;
+ }
+
+ reads = !list_empty(&dd->fifo_list[READ]);
+ writes = !list_empty(&dd->fifo_list[WRITE]);
+
+ /*
+ * batches are currently reads XOR writes
+ */
+ if (dd->next_rq[WRITE])
+ rq = dd->next_rq[WRITE];
+ else
+ rq = dd->next_rq[READ];
+
+ if (rq && dd->batching < dd->fifo_batch)
+ /* we have a next request are still entitled to batch */
+ goto dispatch_request;
+
+ /*
+ * at this point we are not running a batch. select the appropriate
+ * data direction (read / write)
+ */
+
+ if (reads) {
+ BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
+
+ if (writes && (dd->starved++ >= dd->writes_starved))
+ goto dispatch_writes;
+
+ data_dir = READ;
+
+ goto dispatch_find_request;
+ }
+
+ /*
+ * there are either no reads or writes have been starved
+ */
+
+ if (writes) {
+dispatch_writes:
+ BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
+
+ dd->starved = 0;
+
+ data_dir = WRITE;
+
+ goto dispatch_find_request;
+ }
+
+ return NULL;
+
+dispatch_find_request:
+ /*
+ * we are not running a batch, find best request for selected data_dir
+ */
+ if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
+ /*
+ * A deadline has expired, the last request was in the other
+ * direction, or we have run out of higher-sectored requests.
+ * Start again from the request with the earliest expiry time.
+ */
+ rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
+ } else {
+ /*
+ * The last req was the same dir and we have a next request in
+ * sort order. No expired requests so continue on from here.
+ */
+ rq = dd->next_rq[data_dir];
+ }
+
+ dd->batching = 0;
+
+dispatch_request:
+ /*
+ * rq is the selected appropriate request.
+ */
+ dd->batching++;
+ deadline_move_request(dd, rq);
+done:
+ rq->rq_flags |= RQF_STARTED;
+ return rq;
+}
+
+static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
+{
+ struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+ struct request *rq;
+
+ spin_lock(&dd->lock);
+ rq = __dd_dispatch_request(hctx);
+ spin_unlock(&dd->lock);
+
+ return rq;
+}
+
+static void dd_exit_queue(struct elevator_queue *e)
+{
+ struct deadline_data *dd = e->elevator_data;
+
+ BUG_ON(!list_empty(&dd->fifo_list[READ]));
+ BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
+
+ kfree(dd);
+}
+
+/*
+ * initialize elevator private data (deadline_data).
+ */
+static int dd_init_queue(struct request_queue *q, struct elevator_type *e)
+{
+ struct deadline_data *dd;
+ struct elevator_queue *eq;
+
+ eq = elevator_alloc(q, e);
+ if (!eq)
+ return -ENOMEM;
+
+ dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
+ if (!dd) {
+ kobject_put(&eq->kobj);
+ return -ENOMEM;
+ }
+ eq->elevator_data = dd;
+
+ INIT_LIST_HEAD(&dd->fifo_list[READ]);
+ INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
+ dd->sort_list[READ] = RB_ROOT;
+ dd->sort_list[WRITE] = RB_ROOT;
+ dd->fifo_expire[READ] = read_expire;
+ dd->fifo_expire[WRITE] = write_expire;
+ dd->writes_starved = writes_starved;
+ dd->front_merges = 1;
+ dd->fifo_batch = fifo_batch;
+ spin_lock_init(&dd->lock);
+ INIT_LIST_HEAD(&dd->dispatch);
+
+ q->elevator = eq;
+ return 0;
+}
+
+static int dd_request_merge(struct request_queue *q, struct request **rq,
+ struct bio *bio)
+{
+ struct deadline_data *dd = q->elevator->elevator_data;
+ sector_t sector = bio_end_sector(bio);
+ struct request *__rq;
+
+ if (!dd->front_merges)
+ return ELEVATOR_NO_MERGE;
+
+ __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
+ if (__rq) {
+ BUG_ON(sector != blk_rq_pos(__rq));
+
+ if (elv_bio_merge_ok(__rq, bio)) {
+ *rq = __rq;
+ return ELEVATOR_FRONT_MERGE;
+ }
+ }
+
+ return ELEVATOR_NO_MERGE;
+}
+
+static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
+{
+ struct request_queue *q = hctx->queue;
+ struct deadline_data *dd = q->elevator->elevator_data;
+ int ret;
+
+ spin_lock(&dd->lock);
+ ret = blk_mq_sched_try_merge(q, bio);
+ spin_unlock(&dd->lock);
+
+ return ret;
+}
+
+/*
+ * add rq to rbtree and fifo
+ */
+static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
+ bool at_head)
+{
+ struct request_queue *q = hctx->queue;
+ struct deadline_data *dd = q->elevator->elevator_data;
+ const int data_dir = rq_data_dir(rq);
+
+ if (blk_mq_sched_try_insert_merge(q, rq))
+ return;
+
+ blk_mq_sched_request_inserted(rq);
+
+ if (blk_mq_sched_bypass_insert(hctx, rq))
+ return;
+
+ if (at_head || rq->cmd_type != REQ_TYPE_FS) {
+ if (at_head)
+ list_add(&rq->queuelist, &dd->dispatch);
+ else
+ list_add_tail(&rq->queuelist, &dd->dispatch);
+ } else {
+ deadline_add_rq_rb(dd, rq);
+
+ if (rq_mergeable(rq)) {
+ elv_rqhash_add(q, rq);
+ if (!q->last_merge)
+ q->last_merge = rq;
+ }
+
+ /*
+ * set expire time and add to fifo list
+ */
+ rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
+ list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
+ }
+}
+
+static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
+ struct list_head *list, bool at_head)
+{
+ struct request_queue *q = hctx->queue;
+ struct deadline_data *dd = q->elevator->elevator_data;
+
+ spin_lock(&dd->lock);
+ while (!list_empty(list)) {
+ struct request *rq;
+
+ rq = list_first_entry(list, struct request, queuelist);
+ list_del_init(&rq->queuelist);
+ dd_insert_request(hctx, rq, at_head);
+ }
+ spin_unlock(&dd->lock);
+}
+
+static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
+{
+ struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+
+ return !list_empty_careful(&dd->dispatch) ||
+ !list_empty_careful(&dd->fifo_list[0]) ||
+ !list_empty_careful(&dd->fifo_list[1]);
+}
+
+/*
+ * sysfs parts below
+ */
+static ssize_t
+deadline_var_show(int var, char *page)
+{
+ return sprintf(page, "%d\n", var);
+}
+
+static ssize_t
+deadline_var_store(int *var, const char *page, size_t count)
+{
+ char *p = (char *) page;
+
+ *var = simple_strtol(p, &p, 10);
+ return count;
+}
+
+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
+static ssize_t __FUNC(struct elevator_queue *e, char *page) \
+{ \
+ struct deadline_data *dd = e->elevator_data; \
+ int __data = __VAR; \
+ if (__CONV) \
+ __data = jiffies_to_msecs(__data); \
+ return deadline_var_show(__data, (page)); \
+}
+SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1);
+SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1);
+SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0);
+SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0);
+SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
+#undef SHOW_FUNCTION
+
+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
+{ \
+ struct deadline_data *dd = e->elevator_data; \
+ int __data; \
+ int ret = deadline_var_store(&__data, (page), count); \
+ if (__data < (MIN)) \
+ __data = (MIN); \
+ else if (__data > (MAX)) \
+ __data = (MAX); \
+ if (__CONV) \
+ *(__PTR) = msecs_to_jiffies(__data); \
+ else \
+ *(__PTR) = __data; \
+ return ret; \
+}
+STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
+STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
+STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
+STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0);
+STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
+#undef STORE_FUNCTION
+
+#define DD_ATTR(name) \
+ __ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \
+ deadline_##name##_store)
+
+static struct elv_fs_entry deadline_attrs[] = {
+ DD_ATTR(read_expire),
+ DD_ATTR(write_expire),
+ DD_ATTR(writes_starved),
+ DD_ATTR(front_merges),
+ DD_ATTR(fifo_batch),
+ __ATTR_NULL
+};
+
+static struct elevator_type mq_deadline = {
+ .ops.mq = {
+ .insert_requests = dd_insert_requests,
+ .dispatch_request = dd_dispatch_request,
+ .next_request = elv_rb_latter_request,
+ .former_request = elv_rb_former_request,
+ .bio_merge = dd_bio_merge,
+ .request_merge = dd_request_merge,
+ .requests_merged = dd_merged_requests,
+ .request_merged = dd_request_merged,
+ .has_work = dd_has_work,
+ .init_sched = dd_init_queue,
+ .exit_sched = dd_exit_queue,
+ },
+
+ .uses_mq = true,
+ .elevator_attrs = deadline_attrs,
+ .elevator_name = "mq-deadline",
+ .elevator_owner = THIS_MODULE,
+};
+
+static int __init deadline_init(void)
+{
+ return elv_register(&mq_deadline);
+}
+
+static void __exit deadline_exit(void)
+{
+ elv_unregister(&mq_deadline);
+}
+
+module_init(deadline_init);
+module_exit(deadline_exit);
+
+MODULE_AUTHOR("Jens Axboe");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MQ deadline IO scheduler");
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index a163c487cf38..2d1b15d89b45 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -92,7 +92,7 @@ static void noop_exit_queue(struct elevator_queue *e)
}
static struct elevator_type elevator_noop = {
- .ops = {
+ .ops.sq = {
.elevator_merge_req_fn = noop_merged_requests,
.elevator_dispatch_fn = noop_dispatch,
.elevator_add_req_fn = noop_add_request,
diff --git a/block/opal_proto.h b/block/opal_proto.h
new file mode 100644
index 000000000000..f40c9acf8895
--- /dev/null
+++ b/block/opal_proto.h
@@ -0,0 +1,452 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Authors:
+ * Rafael Antognolli <rafael.antognolli@intel.com>
+ * Scott Bauer <scott.bauer@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#include <linux/types.h>
+
+#ifndef _OPAL_PROTO_H
+#define _OPAL_PROTO_H
+
+/*
+ * These constant values come from:
+ * SPC-4 section
+ * 6.30 SECURITY PROTOCOL IN command / table 265.
+ */
+enum {
+ TCG_SECP_00 = 0,
+ TCG_SECP_01,
+};
+
+/*
+ * Token defs derived from:
+ * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00
+ * 3.2.2 Data Stream Encoding
+ */
+enum opal_response_token {
+ OPAL_DTA_TOKENID_BYTESTRING = 0xe0,
+ OPAL_DTA_TOKENID_SINT = 0xe1,
+ OPAL_DTA_TOKENID_UINT = 0xe2,
+ OPAL_DTA_TOKENID_TOKEN = 0xe3, /* actual token is returned */
+ OPAL_DTA_TOKENID_INVALID = 0X0
+};
+
+#define DTAERROR_NO_METHOD_STATUS 0x89
+#define GENERIC_HOST_SESSION_NUM 0x41
+
+#define TPER_SYNC_SUPPORTED 0x01
+
+#define TINY_ATOM_DATA_MASK 0x3F
+#define TINY_ATOM_SIGNED 0x40
+
+#define SHORT_ATOM_ID 0x80
+#define SHORT_ATOM_BYTESTRING 0x20
+#define SHORT_ATOM_SIGNED 0x10
+#define SHORT_ATOM_LEN_MASK 0xF
+
+#define MEDIUM_ATOM_ID 0xC0
+#define MEDIUM_ATOM_BYTESTRING 0x10
+#define MEDIUM_ATOM_SIGNED 0x8
+#define MEDIUM_ATOM_LEN_MASK 0x7
+
+#define LONG_ATOM_ID 0xe0
+#define LONG_ATOM_BYTESTRING 0x2
+#define LONG_ATOM_SIGNED 0x1
+
+/* Derived from TCG Core spec 2.01 Section:
+ * 3.2.2.1
+ * Data Type
+ */
+#define TINY_ATOM_BYTE 0x7F
+#define SHORT_ATOM_BYTE 0xBF
+#define MEDIUM_ATOM_BYTE 0xDF
+#define LONG_ATOM_BYTE 0xE3
+
+#define OPAL_INVAL_PARAM 12
+#define OPAL_MANUFACTURED_INACTIVE 0x08
+#define OPAL_DISCOVERY_COMID 0x0001
+
+#define LOCKING_RANGE_NON_GLOBAL 0x03
+/*
+ * User IDs used in the TCG storage SSCs
+ * Derived from: TCG_Storage_Architecture_Core_Spec_v2.01_r1.00
+ * Section: 6.3 Assigned UIDs
+ */
+#define OPAL_UID_LENGTH 8
+#define OPAL_METHOD_LENGTH 8
+#define OPAL_MSID_KEYLEN 15
+#define OPAL_UID_LENGTH_HALF 4
+
+/* Enum to index OPALUID array */
+enum opal_uid {
+ /* users */
+ OPAL_SMUID_UID,
+ OPAL_THISSP_UID,
+ OPAL_ADMINSP_UID,
+ OPAL_LOCKINGSP_UID,
+ OPAL_ENTERPRISE_LOCKINGSP_UID,
+ OPAL_ANYBODY_UID,
+ OPAL_SID_UID,
+ OPAL_ADMIN1_UID,
+ OPAL_USER1_UID,
+ OPAL_USER2_UID,
+ OPAL_PSID_UID,
+ OPAL_ENTERPRISE_BANDMASTER0_UID,
+ OPAL_ENTERPRISE_ERASEMASTER_UID,
+ /* tables */
+ OPAL_LOCKINGRANGE_GLOBAL,
+ OPAL_LOCKINGRANGE_ACE_RDLOCKED,
+ OPAL_LOCKINGRANGE_ACE_WRLOCKED,
+ OPAL_MBRCONTROL,
+ OPAL_MBR,
+ OPAL_AUTHORITY_TABLE,
+ OPAL_C_PIN_TABLE,
+ OPAL_LOCKING_INFO_TABLE,
+ OPAL_ENTERPRISE_LOCKING_INFO_TABLE,
+ /* C_PIN_TABLE object ID's */
+ OPAL_C_PIN_MSID,
+ OPAL_C_PIN_SID,
+ OPAL_C_PIN_ADMIN1,
+ /* half UID's (only first 4 bytes used) */
+ OPAL_HALF_UID_AUTHORITY_OBJ_REF,
+ OPAL_HALF_UID_BOOLEAN_ACE,
+ /* omitted optional parameter */
+ OPAL_UID_HEXFF,
+};
+
+#define OPAL_METHOD_LENGTH 8
+
+/* Enum for indexing the OPALMETHOD array */
+enum opal_method {
+ OPAL_PROPERTIES,
+ OPAL_STARTSESSION,
+ OPAL_REVERT,
+ OPAL_ACTIVATE,
+ OPAL_EGET,
+ OPAL_ESET,
+ OPAL_NEXT,
+ OPAL_EAUTHENTICATE,
+ OPAL_GETACL,
+ OPAL_GENKEY,
+ OPAL_REVERTSP,
+ OPAL_GET,
+ OPAL_SET,
+ OPAL_AUTHENTICATE,
+ OPAL_RANDOM,
+ OPAL_ERASE,
+};
+
+enum opal_token {
+ /* Boolean */
+ OPAL_TRUE = 0x01,
+ OPAL_FALSE = 0x00,
+ OPAL_BOOLEAN_EXPR = 0x03,
+ /* cellblocks */
+ OPAL_TABLE = 0x00,
+ OPAL_STARTROW = 0x01,
+ OPAL_ENDROW = 0x02,
+ OPAL_STARTCOLUMN = 0x03,
+ OPAL_ENDCOLUMN = 0x04,
+ OPAL_VALUES = 0x01,
+ /* authority table */
+ OPAL_PIN = 0x03,
+ /* locking tokens */
+ OPAL_RANGESTART = 0x03,
+ OPAL_RANGELENGTH = 0x04,
+ OPAL_READLOCKENABLED = 0x05,
+ OPAL_WRITELOCKENABLED = 0x06,
+ OPAL_READLOCKED = 0x07,
+ OPAL_WRITELOCKED = 0x08,
+ OPAL_ACTIVEKEY = 0x0A,
+ /* locking info table */
+ OPAL_MAXRANGES = 0x04,
+ /* mbr control */
+ OPAL_MBRENABLE = 0x01,
+ OPAL_MBRDONE = 0x02,
+ /* properties */
+ OPAL_HOSTPROPERTIES = 0x00,
+ /* atoms */
+ OPAL_STARTLIST = 0xf0,
+ OPAL_ENDLIST = 0xf1,
+ OPAL_STARTNAME = 0xf2,
+ OPAL_ENDNAME = 0xf3,
+ OPAL_CALL = 0xf8,
+ OPAL_ENDOFDATA = 0xf9,
+ OPAL_ENDOFSESSION = 0xfa,
+ OPAL_STARTTRANSACTON = 0xfb,
+ OPAL_ENDTRANSACTON = 0xfC,
+ OPAL_EMPTYATOM = 0xff,
+ OPAL_WHERE = 0x00,
+};
+
+/* Locking state for a locking range */
+enum opal_lockingstate {
+ OPAL_LOCKING_READWRITE = 0x01,
+ OPAL_LOCKING_READONLY = 0x02,
+ OPAL_LOCKING_LOCKED = 0x03,
+};
+
+/* Packets derived from:
+ * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00
+ * Secion: 3.2.3 ComPackets, Packets & Subpackets
+ */
+
+/* Comm Packet (header) for transmissions. */
+struct opal_compacket {
+ __be32 reserved0;
+ u8 extendedComID[4];
+ __be32 outstandingData;
+ __be32 minTransfer;
+ __be32 length;
+};
+
+/* Packet structure. */
+struct opal_packet {
+ __be32 tsn;
+ __be32 hsn;
+ __be32 seq_number;
+ __be16 reserved0;
+ __be16 ack_type;
+ __be32 acknowledgment;
+ __be32 length;
+};
+
+/* Data sub packet header */
+struct opal_data_subpacket {
+ u8 reserved0[6];
+ __be16 kind;
+ __be32 length;
+};
+
+/* header of a response */
+struct opal_header {
+ struct opal_compacket cp;
+ struct opal_packet pkt;
+ struct opal_data_subpacket subpkt;
+};
+
+#define FC_TPER 0x0001
+#define FC_LOCKING 0x0002
+#define FC_GEOMETRY 0x0003
+#define FC_ENTERPRISE 0x0100
+#define FC_DATASTORE 0x0202
+#define FC_SINGLEUSER 0x0201
+#define FC_OPALV100 0x0200
+#define FC_OPALV200 0x0203
+
+/*
+ * The Discovery 0 Header. As defined in
+ * Opal SSC Documentation
+ * Section: 3.3.5 Capability Discovery
+ */
+struct d0_header {
+ __be32 length; /* the length of the header 48 in 2.00.100 */
+ __be32 revision; /**< revision of the header 1 in 2.00.100 */
+ __be32 reserved01;
+ __be32 reserved02;
+ /*
+ * the remainder of the structure is vendor specific and will not be
+ * addressed now
+ */
+ u8 ignored[32];
+};
+
+/*
+ * TPer Feature Descriptor. Contains flags indicating support for the
+ * TPer features described in the OPAL specification. The names match the
+ * OPAL terminology
+ *
+ * code == 0x001 in 2.00.100
+ */
+struct d0_tper_features {
+ /*
+ * supported_features bits:
+ * bit 7: reserved
+ * bit 6: com ID management
+ * bit 5: reserved
+ * bit 4: streaming support
+ * bit 3: buffer management
+ * bit 2: ACK/NACK
+ * bit 1: async
+ * bit 0: sync
+ */
+ u8 supported_features;
+ /*
+ * bytes 5 through 15 are reserved, but we represent the first 3 as
+ * u8 to keep the other two 32bits integers aligned.
+ */
+ u8 reserved01[3];
+ __be32 reserved02;
+ __be32 reserved03;
+};
+
+/*
+ * Locking Feature Descriptor. Contains flags indicating support for the
+ * locking features described in the OPAL specification. The names match the
+ * OPAL terminology
+ *
+ * code == 0x0002 in 2.00.100
+ */
+struct d0_locking_features {
+ /*
+ * supported_features bits:
+ * bits 6-7: reserved
+ * bit 5: MBR done
+ * bit 4: MBR enabled
+ * bit 3: media encryption
+ * bit 2: locked
+ * bit 1: locking enabled
+ * bit 0: locking supported
+ */
+ u8 supported_features;
+ /*
+ * bytes 5 through 15 are reserved, but we represent the first 3 as
+ * u8 to keep the other two 32bits integers aligned.
+ */
+ u8 reserved01[3];
+ __be32 reserved02;
+ __be32 reserved03;
+};
+
+/*
+ * Geometry Feature Descriptor. Contains flags indicating support for the
+ * geometry features described in the OPAL specification. The names match the
+ * OPAL terminology
+ *
+ * code == 0x0003 in 2.00.100
+ */
+struct d0_geometry_features {
+ /*
+ * skip 32 bits from header, needed to align the struct to 64 bits.
+ */
+ u8 header[4];
+ /*
+ * reserved01:
+ * bits 1-6: reserved
+ * bit 0: align
+ */
+ u8 reserved01;
+ u8 reserved02[7];
+ __be32 logical_block_size;
+ __be64 alignment_granularity;
+ __be64 lowest_aligned_lba;
+};
+
+/*
+ * Enterprise SSC Feature
+ *
+ * code == 0x0100
+ */
+struct d0_enterprise_ssc {
+ __be16 baseComID;
+ __be16 numComIDs;
+ /* range_crossing:
+ * bits 1-6: reserved
+ * bit 0: range crossing
+ */
+ u8 range_crossing;
+ u8 reserved01;
+ __be16 reserved02;
+ __be32 reserved03;
+ __be32 reserved04;
+};
+
+/*
+ * Opal V1 feature
+ *
+ * code == 0x0200
+ */
+struct d0_opal_v100 {
+ __be16 baseComID;
+ __be16 numComIDs;
+};
+
+/*
+ * Single User Mode feature
+ *
+ * code == 0x0201
+ */
+struct d0_single_user_mode {
+ __be32 num_locking_objects;
+ /* reserved01:
+ * bit 0: any
+ * bit 1: all
+ * bit 2: policy
+ * bits 3-7: reserved
+ */
+ u8 reserved01;
+ u8 reserved02;
+ __be16 reserved03;
+ __be32 reserved04;
+};
+
+/*
+ * Additonal Datastores feature
+ *
+ * code == 0x0202
+ */
+struct d0_datastore_table {
+ __be16 reserved01;
+ __be16 max_tables;
+ __be32 max_size_tables;
+ __be32 table_size_alignment;
+};
+
+/*
+ * OPAL 2.0 feature
+ *
+ * code == 0x0203
+ */
+struct d0_opal_v200 {
+ __be16 baseComID;
+ __be16 numComIDs;
+ /* range_crossing:
+ * bits 1-6: reserved
+ * bit 0: range crossing
+ */
+ u8 range_crossing;
+ /* num_locking_admin_auth:
+ * not aligned to 16 bits, so use two u8.
+ * stored in big endian:
+ * 0: MSB
+ * 1: LSB
+ */
+ u8 num_locking_admin_auth[2];
+ /* num_locking_user_auth:
+ * not aligned to 16 bits, so use two u8.
+ * stored in big endian:
+ * 0: MSB
+ * 1: LSB
+ */
+ u8 num_locking_user_auth[2];
+ u8 initialPIN;
+ u8 revertedPIN;
+ u8 reserved01;
+ __be32 reserved02;
+};
+
+/* Union of features used to parse the discovery 0 response */
+struct d0_features {
+ __be16 code;
+ /*
+ * r_version bits:
+ * bits 4-7: version
+ * bits 0-3: reserved
+ */
+ u8 r_version;
+ u8 length;
+ u8 features[];
+};
+
+#endif /* _OPAL_PROTO_H */
diff --git a/block/partitions/efi.c b/block/partitions/efi.c
index bcd86e5cd546..39f70d968754 100644
--- a/block/partitions/efi.c
+++ b/block/partitions/efi.c
@@ -293,7 +293,7 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
if (!gpt)
return NULL;
- count = le32_to_cpu(gpt->num_partition_entries) *
+ count = (size_t)le32_to_cpu(gpt->num_partition_entries) *
le32_to_cpu(gpt->sizeof_partition_entry);
if (!count)
return NULL;
@@ -352,7 +352,7 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
gpt_header **gpt, gpt_entry **ptes)
{
u32 crc, origcrc;
- u64 lastlba;
+ u64 lastlba, pt_size;
if (!ptes)
return 0;
@@ -434,13 +434,20 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
goto fail;
}
+ /* Sanity check partition table size */
+ pt_size = (u64)le32_to_cpu((*gpt)->num_partition_entries) *
+ le32_to_cpu((*gpt)->sizeof_partition_entry);
+ if (pt_size > KMALLOC_MAX_SIZE) {
+ pr_debug("GUID Partition Table is too large: %llu > %lu bytes\n",
+ (unsigned long long)pt_size, KMALLOC_MAX_SIZE);
+ goto fail;
+ }
+
if (!(*ptes = alloc_read_gpt_entries(state, *gpt)))
goto fail;
/* Check the GUID Partition Entry Array CRC */
- crc = efi_crc32((const unsigned char *) (*ptes),
- le32_to_cpu((*gpt)->num_partition_entries) *
- le32_to_cpu((*gpt)->sizeof_partition_entry));
+ crc = efi_crc32((const unsigned char *) (*ptes), pt_size);
if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) {
pr_debug("GUID Partition Entry Array CRC check failed.\n");
diff --git a/block/sed-opal.c b/block/sed-opal.c
new file mode 100644
index 000000000000..d1c52ba4d62d
--- /dev/null
+++ b/block/sed-opal.c
@@ -0,0 +1,2488 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Authors:
+ * Scott Bauer <scott.bauer@intel.com>
+ * Rafael Antognolli <rafael.antognolli@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":OPAL: " fmt
+
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/genhd.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <uapi/linux/sed-opal.h>
+#include <linux/sed-opal.h>
+#include <linux/string.h>
+#include <linux/kdev_t.h>
+
+#include "opal_proto.h"
+
+#define IO_BUFFER_LENGTH 2048
+#define MAX_TOKS 64
+
+typedef int (*opal_step)(struct opal_dev *dev);
+
+enum opal_atom_width {
+ OPAL_WIDTH_TINY,
+ OPAL_WIDTH_SHORT,
+ OPAL_WIDTH_MEDIUM,
+ OPAL_WIDTH_LONG,
+ OPAL_WIDTH_TOKEN
+};
+
+/*
+ * On the parsed response, we don't store again the toks that are already
+ * stored in the response buffer. Instead, for each token, we just store a
+ * pointer to the position in the buffer where the token starts, and the size
+ * of the token in bytes.
+ */
+struct opal_resp_tok {
+ const u8 *pos;
+ size_t len;
+ enum opal_response_token type;
+ enum opal_atom_width width;
+ union {
+ u64 u;
+ s64 s;
+ } stored;
+};
+
+/*
+ * From the response header it's not possible to know how many tokens there are
+ * on the payload. So we hardcode that the maximum will be MAX_TOKS, and later
+ * if we start dealing with messages that have more than that, we can increase
+ * this number. This is done to avoid having to make two passes through the
+ * response, the first one counting how many tokens we have and the second one
+ * actually storing the positions.
+ */
+struct parsed_resp {
+ int num;
+ struct opal_resp_tok toks[MAX_TOKS];
+};
+
+struct opal_dev {
+ bool supported;
+
+ void *data;
+ sec_send_recv *send_recv;
+
+ const opal_step *funcs;
+ void **func_data;
+ int state;
+ struct mutex dev_lock;
+ u16 comid;
+ u32 hsn;
+ u32 tsn;
+ u64 align;
+ u64 lowest_lba;
+
+ size_t pos;
+ u8 cmd[IO_BUFFER_LENGTH];
+ u8 resp[IO_BUFFER_LENGTH];
+
+ struct parsed_resp parsed;
+ size_t prev_d_len;
+ void *prev_data;
+
+ struct list_head unlk_lst;
+};
+
+
+static const u8 opaluid[][OPAL_UID_LENGTH] = {
+ /* users */
+ [OPAL_SMUID_UID] =
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff },
+ [OPAL_THISSP_UID] =
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 },
+ [OPAL_ADMINSP_UID] =
+ { 0x00, 0x00, 0x02, 0x05, 0x00, 0x00, 0x00, 0x01 },
+ [OPAL_LOCKINGSP_UID] =
+ { 0x00, 0x00, 0x02, 0x05, 0x00, 0x00, 0x00, 0x02 },
+ [OPAL_ENTERPRISE_LOCKINGSP_UID] =
+ { 0x00, 0x00, 0x02, 0x05, 0x00, 0x01, 0x00, 0x01 },
+ [OPAL_ANYBODY_UID] =
+ { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x01 },
+ [OPAL_SID_UID] =
+ { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x06 },
+ [OPAL_ADMIN1_UID] =
+ { 0x00, 0x00, 0x00, 0x09, 0x00, 0x01, 0x00, 0x01 },
+ [OPAL_USER1_UID] =
+ { 0x00, 0x00, 0x00, 0x09, 0x00, 0x03, 0x00, 0x01 },
+ [OPAL_USER2_UID] =
+ { 0x00, 0x00, 0x00, 0x09, 0x00, 0x03, 0x00, 0x02 },
+ [OPAL_PSID_UID] =
+ { 0x00, 0x00, 0x00, 0x09, 0x00, 0x01, 0xff, 0x01 },
+ [OPAL_ENTERPRISE_BANDMASTER0_UID] =
+ { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x80, 0x01 },
+ [OPAL_ENTERPRISE_ERASEMASTER_UID] =
+ { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x84, 0x01 },
+
+ /* tables */
+
+ [OPAL_LOCKINGRANGE_GLOBAL] =
+ { 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x00, 0x01 },
+ [OPAL_LOCKINGRANGE_ACE_RDLOCKED] =
+ { 0x00, 0x00, 0x00, 0x08, 0x00, 0x03, 0xE0, 0x01 },
+ [OPAL_LOCKINGRANGE_ACE_WRLOCKED] =
+ { 0x00, 0x00, 0x00, 0x08, 0x00, 0x03, 0xE8, 0x01 },
+ [OPAL_MBRCONTROL] =
+ { 0x00, 0x00, 0x08, 0x03, 0x00, 0x00, 0x00, 0x01 },
+ [OPAL_MBR] =
+ { 0x00, 0x00, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00 },
+ [OPAL_AUTHORITY_TABLE] =
+ { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x00},
+ [OPAL_C_PIN_TABLE] =
+ { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x00},
+ [OPAL_LOCKING_INFO_TABLE] =
+ { 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x01 },
+ [OPAL_ENTERPRISE_LOCKING_INFO_TABLE] =
+ { 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x00 },
+
+ /* C_PIN_TABLE object ID's */
+
+ [OPAL_C_PIN_MSID] =
+ { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x84, 0x02},
+ [OPAL_C_PIN_SID] =
+ { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x01},
+ [OPAL_C_PIN_ADMIN1] =
+ { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x01, 0x00, 0x01},
+
+ /* half UID's (only first 4 bytes used) */
+
+ [OPAL_HALF_UID_AUTHORITY_OBJ_REF] =
+ { 0x00, 0x00, 0x0C, 0x05, 0xff, 0xff, 0xff, 0xff },
+ [OPAL_HALF_UID_BOOLEAN_ACE] =
+ { 0x00, 0x00, 0x04, 0x0E, 0xff, 0xff, 0xff, 0xff },
+
+ /* special value for omitted optional parameter */
+ [OPAL_UID_HEXFF] =
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+};
+
+/*
+ * TCG Storage SSC Methods.
+ * Derived from: TCG_Storage_Architecture_Core_Spec_v2.01_r1.00
+ * Section: 6.3 Assigned UIDs
+ */
+static const u8 opalmethod[][OPAL_UID_LENGTH] = {
+ [OPAL_PROPERTIES] =
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x01 },
+ [OPAL_STARTSESSION] =
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x02 },
+ [OPAL_REVERT] =
+ { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x02, 0x02 },
+ [OPAL_ACTIVATE] =
+ { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x02, 0x03 },
+ [OPAL_EGET] =
+ { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06 },
+ [OPAL_ESET] =
+ { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x07 },
+ [OPAL_NEXT] =
+ { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x08 },
+ [OPAL_EAUTHENTICATE] =
+ { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x0c },
+ [OPAL_GETACL] =
+ { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x0d },
+ [OPAL_GENKEY] =
+ { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x10 },
+ [OPAL_REVERTSP] =
+ { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x11 },
+ [OPAL_GET] =
+ { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x16 },
+ [OPAL_SET] =
+ { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x17 },
+ [OPAL_AUTHENTICATE] =
+ { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x1c },
+ [OPAL_RANDOM] =
+ { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x06, 0x01 },
+ [OPAL_ERASE] =
+ { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x08, 0x03 },
+};
+
+typedef int (cont_fn)(struct opal_dev *dev);
+
+static int end_opal_session_error(struct opal_dev *dev);
+
+struct opal_suspend_data {
+ struct opal_lock_unlock unlk;
+ u8 lr;
+ struct list_head node;
+};
+
+/*
+ * Derived from:
+ * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00
+ * Section: 5.1.5 Method Status Codes
+ */
+static const char * const opal_errors[] = {
+ "Success",
+ "Not Authorized",
+ "Unknown Error",
+ "SP Busy",
+ "SP Failed",
+ "SP Disabled",
+ "SP Frozen",
+ "No Sessions Available",
+ "Uniqueness Conflict",
+ "Insufficient Space",
+ "Insufficient Rows",
+ "Invalid Function",
+ "Invalid Parameter",
+ "Invalid Reference",
+ "Unknown Error",
+ "TPER Malfunction",
+ "Transaction Failure",
+ "Response Overflow",
+ "Authority Locked Out",
+};
+
+static const char *opal_error_to_human(int error)
+{
+ if (error == 0x3f)
+ return "Failed";
+
+ if (error >= ARRAY_SIZE(opal_errors) || error < 0)
+ return "Unknown Error";
+
+ return opal_errors[error];
+}
+
+static void print_buffer(const u8 *ptr, u32 length)
+{
+#ifdef DEBUG
+ print_hex_dump_bytes("OPAL: ", DUMP_PREFIX_OFFSET, ptr, length);
+ pr_debug("\n");
+#endif
+}
+
+static bool check_tper(const void *data)
+{
+ const struct d0_tper_features *tper = data;
+ u8 flags = tper->supported_features;
+
+ if (!(flags & TPER_SYNC_SUPPORTED)) {
+ pr_err("TPer sync not supported. flags = %d\n",
+ tper->supported_features);
+ return false;
+ }
+
+ return true;
+}
+
+static bool check_sum(const void *data)
+{
+ const struct d0_single_user_mode *sum = data;
+ u32 nlo = be32_to_cpu(sum->num_locking_objects);
+
+ if (nlo == 0) {
+ pr_err("Need at least one locking object.\n");
+ return false;
+ }
+
+ pr_debug("Number of locking objects: %d\n", nlo);
+
+ return true;
+}
+
+static u16 get_comid_v100(const void *data)
+{
+ const struct d0_opal_v100 *v100 = data;
+
+ return be16_to_cpu(v100->baseComID);
+}
+
+static u16 get_comid_v200(const void *data)
+{
+ const struct d0_opal_v200 *v200 = data;
+
+ return be16_to_cpu(v200->baseComID);
+}
+
+static int opal_send_cmd(struct opal_dev *dev)
+{
+ return dev->send_recv(dev->data, dev->comid, TCG_SECP_01,
+ dev->cmd, IO_BUFFER_LENGTH,
+ true);
+}
+
+static int opal_recv_cmd(struct opal_dev *dev)
+{
+ return dev->send_recv(dev->data, dev->comid, TCG_SECP_01,
+ dev->resp, IO_BUFFER_LENGTH,
+ false);
+}
+
+static int opal_recv_check(struct opal_dev *dev)
+{
+ size_t buflen = IO_BUFFER_LENGTH;
+ void *buffer = dev->resp;
+ struct opal_header *hdr = buffer;
+ int ret;
+
+ do {
+ pr_debug("Sent OPAL command: outstanding=%d, minTransfer=%d\n",
+ hdr->cp.outstandingData,
+ hdr->cp.minTransfer);
+
+ if (hdr->cp.outstandingData == 0 ||
+ hdr->cp.minTransfer != 0)
+ return 0;
+
+ memset(buffer, 0, buflen);
+ ret = opal_recv_cmd(dev);
+ } while (!ret);
+
+ return ret;
+}
+
+static int opal_send_recv(struct opal_dev *dev, cont_fn *cont)
+{
+ int ret;
+
+ ret = opal_send_cmd(dev);
+ if (ret)
+ return ret;
+ ret = opal_recv_cmd(dev);
+ if (ret)
+ return ret;
+ ret = opal_recv_check(dev);
+ if (ret)
+ return ret;
+ return cont(dev);
+}
+
+static void check_geometry(struct opal_dev *dev, const void *data)
+{
+ const struct d0_geometry_features *geo = data;
+
+ dev->align = geo->alignment_granularity;
+ dev->lowest_lba = geo->lowest_aligned_lba;
+}
+
+static int next(struct opal_dev *dev)
+{
+ opal_step func;
+ int error = 0;
+
+ do {
+ func = dev->funcs[dev->state];
+ if (!func)
+ break;
+
+ error = func(dev);
+ if (error) {
+ pr_err("Error on step function: %d with error %d: %s\n",
+ dev->state, error,
+ opal_error_to_human(error));
+
+ /* For each OPAL command we do a discovery0 then we
+ * start some sort of session.
+ * If we haven't passed state 1 then there was an error
+ * on discovery0 or during the attempt to start a
+ * session. Therefore we shouldn't attempt to terminate
+ * a session, as one has not yet been created.
+ */
+ if (dev->state > 1)
+ return end_opal_session_error(dev);
+ }
+ dev->state++;
+ } while (!error);
+
+ return error;
+}
+
+static int opal_discovery0_end(struct opal_dev *dev)
+{
+ bool found_com_id = false, supported = true, single_user = false;
+ const struct d0_header *hdr = (struct d0_header *)dev->resp;
+ const u8 *epos = dev->resp, *cpos = dev->resp;
+ u16 comid = 0;
+
+ print_buffer(dev->resp, be32_to_cpu(hdr->length));
+
+ epos += be32_to_cpu(hdr->length); /* end of buffer */
+ cpos += sizeof(*hdr); /* current position on buffer */
+
+ while (cpos < epos && supported) {
+ const struct d0_features *body =
+ (const struct d0_features *)cpos;
+
+ switch (be16_to_cpu(body->code)) {
+ case FC_TPER:
+ supported = check_tper(body->features);
+ break;
+ case FC_SINGLEUSER:
+ single_user = check_sum(body->features);
+ break;
+ case FC_GEOMETRY:
+ check_geometry(dev, body);
+ break;
+ case FC_LOCKING:
+ case FC_ENTERPRISE:
+ case FC_DATASTORE:
+ /* some ignored properties */
+ pr_debug("Found OPAL feature description: %d\n",
+ be16_to_cpu(body->code));
+ break;
+ case FC_OPALV100:
+ comid = get_comid_v100(body->features);
+ found_com_id = true;
+ break;
+ case FC_OPALV200:
+ comid = get_comid_v200(body->features);
+ found_com_id = true;
+ break;
+ case 0xbfff ... 0xffff:
+ /* vendor specific, just ignore */
+ break;
+ default:
+ pr_debug("OPAL Unknown feature: %d\n",
+ be16_to_cpu(body->code));
+
+ }
+ cpos += body->length + 4;
+ }
+
+ if (!supported) {
+ pr_debug("This device is not Opal enabled. Not Supported!\n");
+ return -EOPNOTSUPP;
+ }
+
+ if (!single_user)
+ pr_debug("Device doesn't support single user mode\n");
+
+
+ if (!found_com_id) {
+ pr_debug("Could not find OPAL comid for device. Returning early\n");
+ return -EOPNOTSUPP;;
+ }
+
+ dev->comid = comid;
+
+ return 0;
+}
+
+static int opal_discovery0(struct opal_dev *dev)
+{
+ int ret;
+
+ memset(dev->resp, 0, IO_BUFFER_LENGTH);
+ dev->comid = OPAL_DISCOVERY_COMID;
+ ret = opal_recv_cmd(dev);
+ if (ret)
+ return ret;
+ return opal_discovery0_end(dev);
+}
+
+static void add_token_u8(int *err, struct opal_dev *cmd, u8 tok)
+{
+ if (*err)
+ return;
+ if (cmd->pos >= IO_BUFFER_LENGTH - 1) {
+ pr_err("Error adding u8: end of buffer.\n");
+ *err = -ERANGE;
+ return;
+ }
+ cmd->cmd[cmd->pos++] = tok;
+}
+
+static void add_short_atom_header(struct opal_dev *cmd, bool bytestring,
+ bool has_sign, int len)
+{
+ u8 atom;
+ int err = 0;
+
+ atom = SHORT_ATOM_ID;
+ atom |= bytestring ? SHORT_ATOM_BYTESTRING : 0;
+ atom |= has_sign ? SHORT_ATOM_SIGNED : 0;
+ atom |= len & SHORT_ATOM_LEN_MASK;
+
+ add_token_u8(&err, cmd, atom);
+}
+
+static void add_medium_atom_header(struct opal_dev *cmd, bool bytestring,
+ bool has_sign, int len)
+{
+ u8 header0;
+
+ header0 = MEDIUM_ATOM_ID;
+ header0 |= bytestring ? MEDIUM_ATOM_BYTESTRING : 0;
+ header0 |= has_sign ? MEDIUM_ATOM_SIGNED : 0;
+ header0 |= (len >> 8) & MEDIUM_ATOM_LEN_MASK;
+ cmd->cmd[cmd->pos++] = header0;
+ cmd->cmd[cmd->pos++] = len;
+}
+
+static void add_token_u64(int *err, struct opal_dev *cmd, u64 number)
+{
+
+ size_t len;
+ int msb;
+ u8 n;
+
+ if (!(number & ~TINY_ATOM_DATA_MASK)) {
+ add_token_u8(err, cmd, number);
+ return;
+ }
+
+ msb = fls(number);
+ len = DIV_ROUND_UP(msb, 4);
+
+ if (cmd->pos >= IO_BUFFER_LENGTH - len - 1) {
+ pr_err("Error adding u64: end of buffer.\n");
+ *err = -ERANGE;
+ return;
+ }
+ add_short_atom_header(cmd, false, false, len);
+ while (len--) {
+ n = number >> (len * 8);
+ add_token_u8(err, cmd, n);
+ }
+}
+
+static void add_token_bytestring(int *err, struct opal_dev *cmd,
+ const u8 *bytestring, size_t len)
+{
+ size_t header_len = 1;
+ bool is_short_atom = true;
+
+ if (*err)
+ return;
+
+ if (len & ~SHORT_ATOM_LEN_MASK) {
+ header_len = 2;
+ is_short_atom = false;
+ }
+
+ if (len >= IO_BUFFER_LENGTH - cmd->pos - header_len) {
+ pr_err("Error adding bytestring: end of buffer.\n");
+ *err = -ERANGE;
+ return;
+ }
+
+ if (is_short_atom)
+ add_short_atom_header(cmd, true, false, len);
+ else
+ add_medium_atom_header(cmd, true, false, len);
+
+ memcpy(&cmd->cmd[cmd->pos], bytestring, len);
+ cmd->pos += len;
+
+}
+
+static int build_locking_range(u8 *buffer, size_t length, u8 lr)
+{
+ if (length > OPAL_UID_LENGTH) {
+ pr_err("Can't build locking range. Length OOB\n");
+ return -ERANGE;
+ }
+
+ memcpy(buffer, opaluid[OPAL_LOCKINGRANGE_GLOBAL], OPAL_UID_LENGTH);
+
+ if (lr == 0)
+ return 0;
+ buffer[5] = LOCKING_RANGE_NON_GLOBAL;
+ buffer[7] = lr;
+
+ return 0;
+}
+
+static int build_locking_user(u8 *buffer, size_t length, u8 lr)
+{
+ if (length > OPAL_UID_LENGTH) {
+ pr_err("Can't build locking range user, Length OOB\n");
+ return -ERANGE;
+ }
+
+ memcpy(buffer, opaluid[OPAL_USER1_UID], OPAL_UID_LENGTH);
+
+ buffer[7] = lr + 1;
+
+ return 0;
+}
+
+static void set_comid(struct opal_dev *cmd, u16 comid)
+{
+ struct opal_header *hdr = (struct opal_header *)cmd->cmd;
+
+ hdr->cp.extendedComID[0] = comid >> 8;
+ hdr->cp.extendedComID[1] = comid;
+ hdr->cp.extendedComID[2] = 0;
+ hdr->cp.extendedComID[3] = 0;
+}
+
+static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn)
+{
+ struct opal_header *hdr;
+ int err = 0;
+
+ add_token_u8(&err, cmd, OPAL_ENDOFDATA);
+ add_token_u8(&err, cmd, OPAL_STARTLIST);
+ add_token_u8(&err, cmd, 0);
+ add_token_u8(&err, cmd, 0);
+ add_token_u8(&err, cmd, 0);
+ add_token_u8(&err, cmd, OPAL_ENDLIST);
+
+ if (err) {
+ pr_err("Error finalizing command.\n");
+ return -EFAULT;
+ }
+
+ hdr = (struct opal_header *) cmd->cmd;
+
+ hdr->pkt.tsn = cpu_to_be32(tsn);
+ hdr->pkt.hsn = cpu_to_be32(hsn);
+
+ hdr->subpkt.length = cpu_to_be32(cmd->pos - sizeof(*hdr));
+ while (cmd->pos % 4) {
+ if (cmd->pos >= IO_BUFFER_LENGTH) {
+ pr_err("Error: Buffer overrun\n");
+ return -ERANGE;
+ }
+ cmd->cmd[cmd->pos++] = 0;
+ }
+ hdr->pkt.length = cpu_to_be32(cmd->pos - sizeof(hdr->cp) -
+ sizeof(hdr->pkt));
+ hdr->cp.length = cpu_to_be32(cmd->pos - sizeof(hdr->cp));
+
+ return 0;
+}
+
+static enum opal_response_token token_type(const struct parsed_resp *resp,
+ int n)
+{
+ const struct opal_resp_tok *tok;
+
+ if (n >= resp->num) {
+ pr_err("Token number doesn't exist: %d, resp: %d\n",
+ n, resp->num);
+ return OPAL_DTA_TOKENID_INVALID;
+ }
+
+ tok = &resp->toks[n];
+ if (tok->len == 0) {
+ pr_err("Token length must be non-zero\n");
+ return OPAL_DTA_TOKENID_INVALID;
+ }
+
+ return tok->type;
+}
+
+/*
+ * This function returns 0 in case of invalid token. One should call
+ * token_type() first to find out if the token is valid or not.
+ */
+static enum opal_token response_get_token(const struct parsed_resp *resp,
+ int n)
+{
+ const struct opal_resp_tok *tok;
+
+ if (n >= resp->num) {
+ pr_err("Token number doesn't exist: %d, resp: %d\n",
+ n, resp->num);
+ return 0;
+ }
+
+ tok = &resp->toks[n];
+ if (tok->len == 0) {
+ pr_err("Token length must be non-zero\n");
+ return 0;
+ }
+
+ return tok->pos[0];
+}
+
+static size_t response_parse_tiny(struct opal_resp_tok *tok,
+ const u8 *pos)
+{
+ tok->pos = pos;
+ tok->len = 1;
+ tok->width = OPAL_WIDTH_TINY;
+
+ if (pos[0] & TINY_ATOM_SIGNED) {
+ tok->type = OPAL_DTA_TOKENID_SINT;
+ } else {
+ tok->type = OPAL_DTA_TOKENID_UINT;
+ tok->stored.u = pos[0] & 0x3f;
+ }
+
+ return tok->len;
+}
+
+static size_t response_parse_short(struct opal_resp_tok *tok,
+ const u8 *pos)
+{
+ tok->pos = pos;
+ tok->len = (pos[0] & SHORT_ATOM_LEN_MASK) + 1;
+ tok->width = OPAL_WIDTH_SHORT;
+
+ if (pos[0] & SHORT_ATOM_BYTESTRING) {
+ tok->type = OPAL_DTA_TOKENID_BYTESTRING;
+ } else if (pos[0] & SHORT_ATOM_SIGNED) {
+ tok->type = OPAL_DTA_TOKENID_SINT;
+ } else {
+ u64 u_integer = 0;
+ int i, b = 0;
+
+ tok->type = OPAL_DTA_TOKENID_UINT;
+ if (tok->len > 9) {
+ pr_warn("uint64 with more than 8 bytes\n");
+ return -EINVAL;
+ }
+ for (i = tok->len - 1; i > 0; i--) {
+ u_integer |= ((u64)pos[i] << (8 * b));
+ b++;
+ }
+ tok->stored.u = u_integer;
+ }
+
+ return tok->len;
+}
+
+static size_t response_parse_medium(struct opal_resp_tok *tok,
+ const u8 *pos)
+{
+ tok->pos = pos;
+ tok->len = (((pos[0] & MEDIUM_ATOM_LEN_MASK) << 8) | pos[1]) + 2;
+ tok->width = OPAL_WIDTH_MEDIUM;
+
+ if (pos[0] & MEDIUM_ATOM_BYTESTRING)
+ tok->type = OPAL_DTA_TOKENID_BYTESTRING;
+ else if (pos[0] & MEDIUM_ATOM_SIGNED)
+ tok->type = OPAL_DTA_TOKENID_SINT;
+ else
+ tok->type = OPAL_DTA_TOKENID_UINT;
+
+ return tok->len;
+}
+
+static size_t response_parse_long(struct opal_resp_tok *tok,
+ const u8 *pos)
+{
+ tok->pos = pos;
+ tok->len = ((pos[1] << 16) | (pos[2] << 8) | pos[3]) + 4;
+ tok->width = OPAL_WIDTH_LONG;
+
+ if (pos[0] & LONG_ATOM_BYTESTRING)
+ tok->type = OPAL_DTA_TOKENID_BYTESTRING;
+ else if (pos[0] & LONG_ATOM_SIGNED)
+ tok->type = OPAL_DTA_TOKENID_SINT;
+ else
+ tok->type = OPAL_DTA_TOKENID_UINT;
+
+ return tok->len;
+}
+
+static size_t response_parse_token(struct opal_resp_tok *tok,
+ const u8 *pos)
+{
+ tok->pos = pos;
+ tok->len = 1;
+ tok->type = OPAL_DTA_TOKENID_TOKEN;
+ tok->width = OPAL_WIDTH_TOKEN;
+
+ return tok->len;
+}
+
+static int response_parse(const u8 *buf, size_t length,
+ struct parsed_resp *resp)
+{
+ const struct opal_header *hdr;
+ struct opal_resp_tok *iter;
+ int num_entries = 0;
+ int total;
+ size_t token_length;
+ const u8 *pos;
+
+ if (!buf)
+ return -EFAULT;
+
+ if (!resp)
+ return -EFAULT;
+
+ hdr = (struct opal_header *)buf;
+ pos = buf;
+ pos += sizeof(*hdr);
+
+ pr_debug("Response size: cp: %d, pkt: %d, subpkt: %d\n",
+ be32_to_cpu(hdr->cp.length),
+ be32_to_cpu(hdr->pkt.length),
+ be32_to_cpu(hdr->subpkt.length));
+
+ if (hdr->cp.length == 0 || hdr->pkt.length == 0 ||
+ hdr->subpkt.length == 0) {
+ pr_err("Bad header length. cp: %d, pkt: %d, subpkt: %d\n",
+ be32_to_cpu(hdr->cp.length),
+ be32_to_cpu(hdr->pkt.length),
+ be32_to_cpu(hdr->subpkt.length));
+ print_buffer(pos, sizeof(*hdr));
+ return -EINVAL;
+ }
+
+ if (pos > buf + length)
+ return -EFAULT;
+
+ iter = resp->toks;
+ total = be32_to_cpu(hdr->subpkt.length);
+ print_buffer(pos, total);
+ while (total > 0) {
+ if (pos[0] <= TINY_ATOM_BYTE) /* tiny atom */
+ token_length = response_parse_tiny(iter, pos);
+ else if (pos[0] <= SHORT_ATOM_BYTE) /* short atom */
+ token_length = response_parse_short(iter, pos);
+ else if (pos[0] <= MEDIUM_ATOM_BYTE) /* medium atom */
+ token_length = response_parse_medium(iter, pos);
+ else if (pos[0] <= LONG_ATOM_BYTE) /* long atom */
+ token_length = response_parse_long(iter, pos);
+ else /* TOKEN */
+ token_length = response_parse_token(iter, pos);
+
+ if (token_length == -EINVAL)
+ return -EINVAL;
+
+ pos += token_length;
+ total -= token_length;
+ iter++;
+ num_entries++;
+ }
+
+ if (num_entries == 0) {
+ pr_err("Couldn't parse response.\n");
+ return -EINVAL;
+ }
+ resp->num = num_entries;
+
+ return 0;
+}
+
+static size_t response_get_string(const struct parsed_resp *resp, int n,
+ const char **store)
+{
+ *store = NULL;
+ if (!resp) {
+ pr_err("Response is NULL\n");
+ return 0;
+ }
+
+ if (n > resp->num) {
+ pr_err("Response has %d tokens. Can't access %d\n",
+ resp->num, n);
+ return 0;
+ }
+
+ if (resp->toks[n].type != OPAL_DTA_TOKENID_BYTESTRING) {
+ pr_err("Token is not a byte string!\n");
+ return 0;
+ }
+
+ *store = resp->toks[n].pos + 1;
+ return resp->toks[n].len - 1;
+}
+
+static u64 response_get_u64(const struct parsed_resp *resp, int n)
+{
+ if (!resp) {
+ pr_err("Response is NULL\n");
+ return 0;
+ }
+
+ if (n > resp->num) {
+ pr_err("Response has %d tokens. Can't access %d\n",
+ resp->num, n);
+ return 0;
+ }
+
+ if (resp->toks[n].type != OPAL_DTA_TOKENID_UINT) {
+ pr_err("Token is not unsigned it: %d\n",
+ resp->toks[n].type);
+ return 0;
+ }
+
+ if (!(resp->toks[n].width == OPAL_WIDTH_TINY ||
+ resp->toks[n].width == OPAL_WIDTH_SHORT)) {
+ pr_err("Atom is not short or tiny: %d\n",
+ resp->toks[n].width);
+ return 0;
+ }
+
+ return resp->toks[n].stored.u;
+}
+
+static u8 response_status(const struct parsed_resp *resp)
+{
+ if (token_type(resp, 0) == OPAL_DTA_TOKENID_TOKEN &&
+ response_get_token(resp, 0) == OPAL_ENDOFSESSION) {
+ return 0;
+ }
+
+ if (resp->num < 5)
+ return DTAERROR_NO_METHOD_STATUS;
+
+ if (token_type(resp, resp->num - 1) != OPAL_DTA_TOKENID_TOKEN ||
+ token_type(resp, resp->num - 5) != OPAL_DTA_TOKENID_TOKEN ||
+ response_get_token(resp, resp->num - 1) != OPAL_ENDLIST ||
+ response_get_token(resp, resp->num - 5) != OPAL_STARTLIST)
+ return DTAERROR_NO_METHOD_STATUS;
+
+ return response_get_u64(resp, resp->num - 4);
+}
+
+/* Parses and checks for errors */
+static int parse_and_check_status(struct opal_dev *dev)
+{
+ int error;
+
+ print_buffer(dev->cmd, dev->pos);
+
+ error = response_parse(dev->resp, IO_BUFFER_LENGTH, &dev->parsed);
+ if (error) {
+ pr_err("Couldn't parse response.\n");
+ return error;
+ }
+
+ return response_status(&dev->parsed);
+}
+
+static void clear_opal_cmd(struct opal_dev *dev)
+{
+ dev->pos = sizeof(struct opal_header);
+ memset(dev->cmd, 0, IO_BUFFER_LENGTH);
+}
+
+static int start_opal_session_cont(struct opal_dev *dev)
+{
+ u32 hsn, tsn;
+ int error = 0;
+
+ error = parse_and_check_status(dev);
+ if (error)
+ return error;
+
+ hsn = response_get_u64(&dev->parsed, 4);
+ tsn = response_get_u64(&dev->parsed, 5);
+
+ if (hsn == 0 && tsn == 0) {
+ pr_err("Couldn't authenticate session\n");
+ return -EPERM;
+ }
+
+ dev->hsn = hsn;
+ dev->tsn = tsn;
+ return 0;
+}
+
+static void add_suspend_info(struct opal_dev *dev,
+ struct opal_suspend_data *sus)
+{
+ struct opal_suspend_data *iter;
+
+ list_for_each_entry(iter, &dev->unlk_lst, node) {
+ if (iter->lr == sus->lr) {
+ list_del(&iter->node);
+ kfree(iter);
+ break;
+ }
+ }
+ list_add_tail(&sus->node, &dev->unlk_lst);
+}
+
+static int end_session_cont(struct opal_dev *dev)
+{
+ dev->hsn = 0;
+ dev->tsn = 0;
+ return parse_and_check_status(dev);
+}
+
+static int finalize_and_send(struct opal_dev *dev, cont_fn cont)
+{
+ int ret;
+
+ ret = cmd_finalize(dev, dev->hsn, dev->tsn);
+ if (ret) {
+ pr_err("Error finalizing command buffer: %d\n", ret);
+ return ret;
+ }
+
+ print_buffer(dev->cmd, dev->pos);
+
+ return opal_send_recv(dev, cont);
+}
+
+static int gen_key(struct opal_dev *dev)
+{
+ const u8 *method;
+ u8 uid[OPAL_UID_LENGTH];
+ int err = 0;
+
+ clear_opal_cmd(dev);
+ set_comid(dev, dev->comid);
+
+ memcpy(uid, dev->prev_data, min(sizeof(uid), dev->prev_d_len));
+ method = opalmethod[OPAL_GENKEY];
+ kfree(dev->prev_data);
+ dev->prev_data = NULL;
+
+ add_token_u8(&err, dev, OPAL_CALL);
+ add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
+ add_token_bytestring(&err, dev, opalmethod[OPAL_GENKEY],
+ OPAL_UID_LENGTH);
+ add_token_u8(&err, dev, OPAL_STARTLIST);
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+
+ if (err) {
+ pr_err("Error building gen key command\n");
+ return err;
+
+ }
+ return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int get_active_key_cont(struct opal_dev *dev)
+{
+ const char *activekey;
+ size_t keylen;
+ int error = 0;
+
+ error = parse_and_check_status(dev);
+ if (error)
+ return error;
+ keylen = response_get_string(&dev->parsed, 4, &activekey);
+ if (!activekey) {
+ pr_err("%s: Couldn't extract the Activekey from the response\n",
+ __func__);
+ return OPAL_INVAL_PARAM;
+ }
+ dev->prev_data = kmemdup(activekey, keylen, GFP_KERNEL);
+
+ if (!dev->prev_data)
+ return -ENOMEM;
+
+ dev->prev_d_len = keylen;
+
+ return 0;
+}
+
+static int get_active_key(struct opal_dev *dev)
+{
+ u8 uid[OPAL_UID_LENGTH];
+ int err = 0;
+ u8 *lr;
+
+ clear_opal_cmd(dev);
+ set_comid(dev, dev->comid);
+ lr = dev->func_data[dev->state];
+
+ err = build_locking_range(uid, sizeof(uid), *lr);
+ if (err)
+ return err;
+
+ err = 0;
+ add_token_u8(&err, dev, OPAL_CALL);
+ add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
+ add_token_bytestring(&err, dev, opalmethod[OPAL_GET], OPAL_UID_LENGTH);
+ add_token_u8(&err, dev, OPAL_STARTLIST);
+ add_token_u8(&err, dev, OPAL_STARTLIST);
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, 3); /* startCloumn */
+ add_token_u8(&err, dev, 10); /* ActiveKey */
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, 4); /* endColumn */
+ add_token_u8(&err, dev, 10); /* ActiveKey */
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+ if (err) {
+ pr_err("Error building get active key command\n");
+ return err;
+ }
+
+ return finalize_and_send(dev, get_active_key_cont);
+}
+
+static int generic_lr_enable_disable(struct opal_dev *dev,
+ u8 *uid, bool rle, bool wle,
+ bool rl, bool wl)
+{
+ int err = 0;
+
+ add_token_u8(&err, dev, OPAL_CALL);
+ add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
+ add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH);
+
+ add_token_u8(&err, dev, OPAL_STARTLIST);
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, OPAL_VALUES);
+ add_token_u8(&err, dev, OPAL_STARTLIST);
+
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, 5); /* ReadLockEnabled */
+ add_token_u8(&err, dev, rle);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, 6); /* WriteLockEnabled */
+ add_token_u8(&err, dev, wle);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, OPAL_READLOCKED);
+ add_token_u8(&err, dev, rl);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, OPAL_WRITELOCKED);
+ add_token_u8(&err, dev, wl);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+ return err;
+}
+
+static inline int enable_global_lr(struct opal_dev *dev, u8 *uid,
+ struct opal_user_lr_setup *setup)
+{
+ int err;
+
+ err = generic_lr_enable_disable(dev, uid, !!setup->RLE, !!setup->WLE,
+ 0, 0);
+ if (err)
+ pr_err("Failed to create enable global lr command\n");
+ return err;
+}
+
+static int setup_locking_range(struct opal_dev *dev)
+{
+ u8 uid[OPAL_UID_LENGTH];
+ struct opal_user_lr_setup *setup;
+ u8 lr;
+ int err = 0;
+
+ clear_opal_cmd(dev);
+ set_comid(dev, dev->comid);
+
+ setup = dev->func_data[dev->state];
+ lr = setup->session.opal_key.lr;
+ err = build_locking_range(uid, sizeof(uid), lr);
+ if (err)
+ return err;
+
+ if (lr == 0)
+ err = enable_global_lr(dev, uid, setup);
+ else {
+ add_token_u8(&err, dev, OPAL_CALL);
+ add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
+ add_token_bytestring(&err, dev, opalmethod[OPAL_SET],
+ OPAL_UID_LENGTH);
+
+ add_token_u8(&err, dev, OPAL_STARTLIST);
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, OPAL_VALUES);
+ add_token_u8(&err, dev, OPAL_STARTLIST);
+
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, 3); /* Ranges Start */
+ add_token_u64(&err, dev, setup->range_start);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, 4); /* Ranges length */
+ add_token_u64(&err, dev, setup->range_length);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, 5); /*ReadLockEnabled */
+ add_token_u64(&err, dev, !!setup->RLE);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, 6); /*WriteLockEnabled*/
+ add_token_u64(&err, dev, !!setup->WLE);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+
+ }
+ if (err) {
+ pr_err("Error building Setup Locking range command.\n");
+ return err;
+
+ }
+
+ return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int start_generic_opal_session(struct opal_dev *dev,
+ enum opal_uid auth,
+ enum opal_uid sp_type,
+ const char *key,
+ u8 key_len)
+{
+ u32 hsn;
+ int err = 0;
+
+ if (key == NULL && auth != OPAL_ANYBODY_UID) {
+ pr_err("%s: Attempted to open ADMIN_SP Session without a Host" \
+ "Challenge, and not as the Anybody UID\n", __func__);
+ return OPAL_INVAL_PARAM;
+ }
+
+ clear_opal_cmd(dev);
+
+ set_comid(dev, dev->comid);
+ hsn = GENERIC_HOST_SESSION_NUM;
+
+ add_token_u8(&err, dev, OPAL_CALL);
+ add_token_bytestring(&err, dev, opaluid[OPAL_SMUID_UID],
+ OPAL_UID_LENGTH);
+ add_token_bytestring(&err, dev, opalmethod[OPAL_STARTSESSION],
+ OPAL_UID_LENGTH);
+ add_token_u8(&err, dev, OPAL_STARTLIST);
+ add_token_u64(&err, dev, hsn);
+ add_token_bytestring(&err, dev, opaluid[sp_type], OPAL_UID_LENGTH);
+ add_token_u8(&err, dev, 1);
+
+ switch (auth) {
+ case OPAL_ANYBODY_UID:
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+ break;
+ case OPAL_ADMIN1_UID:
+ case OPAL_SID_UID:
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, 0); /* HostChallenge */
+ add_token_bytestring(&err, dev, key, key_len);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, 3); /* HostSignAuth */
+ add_token_bytestring(&err, dev, opaluid[auth],
+ OPAL_UID_LENGTH);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+ break;
+ default:
+ pr_err("Cannot start Admin SP session with auth %d\n", auth);
+ return OPAL_INVAL_PARAM;
+ }
+
+ if (err) {
+ pr_err("Error building start adminsp session command.\n");
+ return err;
+ }
+
+ return finalize_and_send(dev, start_opal_session_cont);
+}
+
+static int start_anybodyASP_opal_session(struct opal_dev *dev)
+{
+ return start_generic_opal_session(dev, OPAL_ANYBODY_UID,
+ OPAL_ADMINSP_UID, NULL, 0);
+}
+
+static int start_SIDASP_opal_session(struct opal_dev *dev)
+{
+ int ret;
+ const u8 *key = dev->prev_data;
+ struct opal_key *okey;
+
+ if (!key) {
+ okey = dev->func_data[dev->state];
+ ret = start_generic_opal_session(dev, OPAL_SID_UID,
+ OPAL_ADMINSP_UID,
+ okey->key,
+ okey->key_len);
+ } else {
+ ret = start_generic_opal_session(dev, OPAL_SID_UID,
+ OPAL_ADMINSP_UID,
+ key, dev->prev_d_len);
+ kfree(key);
+ dev->prev_data = NULL;
+ }
+ return ret;
+}
+
+static inline int start_admin1LSP_opal_session(struct opal_dev *dev)
+{
+ struct opal_key *key = dev->func_data[dev->state];
+
+ return start_generic_opal_session(dev, OPAL_ADMIN1_UID,
+ OPAL_LOCKINGSP_UID,
+ key->key, key->key_len);
+}
+
+static int start_auth_opal_session(struct opal_dev *dev)
+{
+ u8 lk_ul_user[OPAL_UID_LENGTH];
+ int err = 0;
+
+ struct opal_session_info *session = dev->func_data[dev->state];
+ size_t keylen = session->opal_key.key_len;
+ u8 *key = session->opal_key.key;
+ u32 hsn = GENERIC_HOST_SESSION_NUM;
+
+ clear_opal_cmd(dev);
+ set_comid(dev, dev->comid);
+
+ if (session->sum) {
+ err = build_locking_user(lk_ul_user, sizeof(lk_ul_user),
+ session->opal_key.lr);
+ if (err)
+ return err;
+
+ } else if (session->who != OPAL_ADMIN1 && !session->sum) {
+ err = build_locking_user(lk_ul_user, sizeof(lk_ul_user),
+ session->who - 1);
+ if (err)
+ return err;
+ } else
+ memcpy(lk_ul_user, opaluid[OPAL_ADMIN1_UID], OPAL_UID_LENGTH);
+
+ add_token_u8(&err, dev, OPAL_CALL);
+ add_token_bytestring(&err, dev, opaluid[OPAL_SMUID_UID],
+ OPAL_UID_LENGTH);
+ add_token_bytestring(&err, dev, opalmethod[OPAL_STARTSESSION],
+ OPAL_UID_LENGTH);
+
+ add_token_u8(&err, dev, OPAL_STARTLIST);
+ add_token_u64(&err, dev, hsn);
+ add_token_bytestring(&err, dev, opaluid[OPAL_LOCKINGSP_UID],
+ OPAL_UID_LENGTH);
+ add_token_u8(&err, dev, 1);
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, 0);
+ add_token_bytestring(&err, dev, key, keylen);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, 3);
+ add_token_bytestring(&err, dev, lk_ul_user, OPAL_UID_LENGTH);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+
+ if (err) {
+ pr_err("Error building STARTSESSION command.\n");
+ return err;
+ }
+
+ return finalize_and_send(dev, start_opal_session_cont);
+}
+
+static int revert_tper(struct opal_dev *dev)
+{
+ int err = 0;
+
+ clear_opal_cmd(dev);
+ set_comid(dev, dev->comid);
+
+ add_token_u8(&err, dev, OPAL_CALL);
+ add_token_bytestring(&err, dev, opaluid[OPAL_ADMINSP_UID],
+ OPAL_UID_LENGTH);
+ add_token_bytestring(&err, dev, opalmethod[OPAL_REVERT],
+ OPAL_UID_LENGTH);
+ add_token_u8(&err, dev, OPAL_STARTLIST);
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+ if (err) {
+ pr_err("Error building REVERT TPER command.\n");
+ return err;
+ }
+
+ return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int internal_activate_user(struct opal_dev *dev)
+{
+ struct opal_session_info *session = dev->func_data[dev->state];
+ u8 uid[OPAL_UID_LENGTH];
+ int err = 0;
+
+ clear_opal_cmd(dev);
+ set_comid(dev, dev->comid);
+
+ memcpy(uid, opaluid[OPAL_USER1_UID], OPAL_UID_LENGTH);
+ uid[7] = session->who;
+
+ add_token_u8(&err, dev, OPAL_CALL);
+ add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
+ add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH);
+ add_token_u8(&err, dev, OPAL_STARTLIST);
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, OPAL_VALUES);
+ add_token_u8(&err, dev, OPAL_STARTLIST);
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, 5); /* Enabled */
+ add_token_u8(&err, dev, OPAL_TRUE);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+
+ if (err) {
+ pr_err("Error building Activate UserN command.\n");
+ return err;
+ }
+
+ return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int erase_locking_range(struct opal_dev *dev)
+{
+ struct opal_session_info *session;
+ u8 uid[OPAL_UID_LENGTH];
+ int err = 0;
+
+ clear_opal_cmd(dev);
+ set_comid(dev, dev->comid);
+ session = dev->func_data[dev->state];
+
+ if (build_locking_range(uid, sizeof(uid), session->opal_key.lr) < 0)
+ return -ERANGE;
+
+ add_token_u8(&err, dev, OPAL_CALL);
+ add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH);
+ add_token_bytestring(&err, dev, opalmethod[OPAL_ERASE],
+ OPAL_UID_LENGTH);
+ add_token_u8(&err, dev, OPAL_STARTLIST);
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+
+ if (err) {
+ pr_err("Error building Erase Locking Range Command.\n");
+ return err;
+ }
+ return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int set_mbr_done(struct opal_dev *dev)
+{
+ u8 mbr_done_tf = *(u8 *)dev->func_data[dev->state];
+ int err = 0;
+
+ clear_opal_cmd(dev);
+ set_comid(dev, dev->comid);
+
+ add_token_u8(&err, dev, OPAL_CALL);
+ add_token_bytestring(&err, dev, opaluid[OPAL_MBRCONTROL],
+ OPAL_UID_LENGTH);
+ add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH);
+ add_token_u8(&err, dev, OPAL_STARTLIST);
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, OPAL_VALUES);
+ add_token_u8(&err, dev, OPAL_STARTLIST);
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, 2); /* Done */
+ add_token_u8(&err, dev, mbr_done_tf); /* Done T or F */
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+
+ if (err) {
+ pr_err("Error Building set MBR Done command\n");
+ return err;
+ }
+
+ return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int set_mbr_enable_disable(struct opal_dev *dev)
+{
+ u8 mbr_en_dis = *(u8 *)dev->func_data[dev->state];
+ int err = 0;
+
+ clear_opal_cmd(dev);
+ set_comid(dev, dev->comid);
+
+ add_token_u8(&err, dev, OPAL_CALL);
+ add_token_bytestring(&err, dev, opaluid[OPAL_MBRCONTROL],
+ OPAL_UID_LENGTH);
+ add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH);
+ add_token_u8(&err, dev, OPAL_STARTLIST);
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, OPAL_VALUES);
+ add_token_u8(&err, dev, OPAL_STARTLIST);
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, 1);
+ add_token_u8(&err, dev, mbr_en_dis);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+
+ if (err) {
+ pr_err("Error Building set MBR done command\n");
+ return err;
+ }
+
+ return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int generic_pw_cmd(u8 *key, size_t key_len, u8 *cpin_uid,
+ struct opal_dev *dev)
+{
+ int err = 0;
+
+ clear_opal_cmd(dev);
+ set_comid(dev, dev->comid);
+
+ add_token_u8(&err, dev, OPAL_CALL);
+ add_token_bytestring(&err, dev, cpin_uid, OPAL_UID_LENGTH);
+ add_token_bytestring(&err, dev, opalmethod[OPAL_SET],
+ OPAL_UID_LENGTH);
+ add_token_u8(&err, dev, OPAL_STARTLIST);
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, OPAL_VALUES);
+ add_token_u8(&err, dev, OPAL_STARTLIST);
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, 3); /* PIN */
+ add_token_bytestring(&err, dev, key, key_len);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+
+ return err;
+}
+
+static int set_new_pw(struct opal_dev *dev)
+{
+ u8 cpin_uid[OPAL_UID_LENGTH];
+ struct opal_session_info *usr = dev->func_data[dev->state];
+
+
+ memcpy(cpin_uid, opaluid[OPAL_C_PIN_ADMIN1], OPAL_UID_LENGTH);
+
+ if (usr->who != OPAL_ADMIN1) {
+ cpin_uid[5] = 0x03;
+ if (usr->sum)
+ cpin_uid[7] = usr->opal_key.lr + 1;
+ else
+ cpin_uid[7] = usr->who;
+ }
+
+ if (generic_pw_cmd(usr->opal_key.key, usr->opal_key.key_len,
+ cpin_uid, dev)) {
+ pr_err("Error building set password command.\n");
+ return -ERANGE;
+ }
+
+ return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int set_sid_cpin_pin(struct opal_dev *dev)
+{
+ u8 cpin_uid[OPAL_UID_LENGTH];
+ struct opal_key *key = dev->func_data[dev->state];
+
+ memcpy(cpin_uid, opaluid[OPAL_C_PIN_SID], OPAL_UID_LENGTH);
+
+ if (generic_pw_cmd(key->key, key->key_len, cpin_uid, dev)) {
+ pr_err("Error building Set SID cpin\n");
+ return -ERANGE;
+ }
+ return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int add_user_to_lr(struct opal_dev *dev)
+{
+ u8 lr_buffer[OPAL_UID_LENGTH];
+ u8 user_uid[OPAL_UID_LENGTH];
+ struct opal_lock_unlock *lkul;
+ int err = 0;
+
+ clear_opal_cmd(dev);
+ set_comid(dev, dev->comid);
+
+ lkul = dev->func_data[dev->state];
+
+ memcpy(lr_buffer, opaluid[OPAL_LOCKINGRANGE_ACE_RDLOCKED],
+ OPAL_UID_LENGTH);
+
+ if (lkul->l_state == OPAL_RW)
+ memcpy(lr_buffer, opaluid[OPAL_LOCKINGRANGE_ACE_WRLOCKED],
+ OPAL_UID_LENGTH);
+
+ lr_buffer[7] = lkul->session.opal_key.lr;
+
+ memcpy(user_uid, opaluid[OPAL_USER1_UID], OPAL_UID_LENGTH);
+
+ user_uid[7] = lkul->session.who;
+
+ add_token_u8(&err, dev, OPAL_CALL);
+ add_token_bytestring(&err, dev, lr_buffer, OPAL_UID_LENGTH);
+ add_token_bytestring(&err, dev, opalmethod[OPAL_SET],
+ OPAL_UID_LENGTH);
+
+ add_token_u8(&err, dev, OPAL_STARTLIST);
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, OPAL_VALUES);
+
+ add_token_u8(&err, dev, OPAL_STARTLIST);
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, 3);
+
+ add_token_u8(&err, dev, OPAL_STARTLIST);
+
+
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_bytestring(&err, dev,
+ opaluid[OPAL_HALF_UID_AUTHORITY_OBJ_REF],
+ OPAL_UID_LENGTH/2);
+ add_token_bytestring(&err, dev, user_uid, OPAL_UID_LENGTH);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+
+
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_bytestring(&err, dev,
+ opaluid[OPAL_HALF_UID_AUTHORITY_OBJ_REF],
+ OPAL_UID_LENGTH/2);
+ add_token_bytestring(&err, dev, user_uid, OPAL_UID_LENGTH);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+
+
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_bytestring(&err, dev, opaluid[OPAL_HALF_UID_BOOLEAN_ACE],
+ OPAL_UID_LENGTH/2);
+ add_token_u8(&err, dev, 1);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+
+
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+
+ if (err) {
+ pr_err("Error building add user to locking range command.\n");
+ return err;
+ }
+
+ return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int lock_unlock_locking_range(struct opal_dev *dev)
+{
+ u8 lr_buffer[OPAL_UID_LENGTH];
+ const u8 *method;
+ struct opal_lock_unlock *lkul;
+ u8 read_locked = 1, write_locked = 1;
+ int err = 0;
+
+ clear_opal_cmd(dev);
+ set_comid(dev, dev->comid);
+
+ method = opalmethod[OPAL_SET];
+ lkul = dev->func_data[dev->state];
+ if (build_locking_range(lr_buffer, sizeof(lr_buffer),
+ lkul->session.opal_key.lr) < 0)
+ return -ERANGE;
+
+ switch (lkul->l_state) {
+ case OPAL_RO:
+ read_locked = 0;
+ write_locked = 1;
+ break;
+ case OPAL_RW:
+ read_locked = 0;
+ write_locked = 0;
+ break;
+ case OPAL_LK:
+ /* vars are initalized to locked */
+ break;
+ default:
+ pr_err("Tried to set an invalid locking state... returning to uland\n");
+ return OPAL_INVAL_PARAM;
+ }
+
+ add_token_u8(&err, dev, OPAL_CALL);
+ add_token_bytestring(&err, dev, lr_buffer, OPAL_UID_LENGTH);
+ add_token_bytestring(&err, dev, opalmethod[OPAL_SET], OPAL_UID_LENGTH);
+ add_token_u8(&err, dev, OPAL_STARTLIST);
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, OPAL_VALUES);
+ add_token_u8(&err, dev, OPAL_STARTLIST);
+
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, OPAL_READLOCKED);
+ add_token_u8(&err, dev, read_locked);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, OPAL_WRITELOCKED);
+ add_token_u8(&err, dev, write_locked);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+
+ if (err) {
+ pr_err("Error building SET command.\n");
+ return err;
+ }
+ return finalize_and_send(dev, parse_and_check_status);
+}
+
+
+static int lock_unlock_locking_range_sum(struct opal_dev *dev)
+{
+ u8 lr_buffer[OPAL_UID_LENGTH];
+ u8 read_locked = 1, write_locked = 1;
+ const u8 *method;
+ struct opal_lock_unlock *lkul;
+ int ret;
+
+ clear_opal_cmd(dev);
+ set_comid(dev, dev->comid);
+
+ method = opalmethod[OPAL_SET];
+ lkul = dev->func_data[dev->state];
+ if (build_locking_range(lr_buffer, sizeof(lr_buffer),
+ lkul->session.opal_key.lr) < 0)
+ return -ERANGE;
+
+ switch (lkul->l_state) {
+ case OPAL_RO:
+ read_locked = 0;
+ write_locked = 1;
+ break;
+ case OPAL_RW:
+ read_locked = 0;
+ write_locked = 0;
+ break;
+ case OPAL_LK:
+ /* vars are initalized to locked */
+ break;
+ default:
+ pr_err("Tried to set an invalid locking state.\n");
+ return OPAL_INVAL_PARAM;
+ }
+ ret = generic_lr_enable_disable(dev, lr_buffer, 1, 1,
+ read_locked, write_locked);
+
+ if (ret < 0) {
+ pr_err("Error building SET command.\n");
+ return ret;
+ }
+ return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int activate_lsp(struct opal_dev *dev)
+{
+ struct opal_lr_act *opal_act;
+ u8 user_lr[OPAL_UID_LENGTH];
+ u8 uint_3 = 0x83;
+ int err = 0, i;
+
+ clear_opal_cmd(dev);
+ set_comid(dev, dev->comid);
+
+ opal_act = dev->func_data[dev->state];
+
+ add_token_u8(&err, dev, OPAL_CALL);
+ add_token_bytestring(&err, dev, opaluid[OPAL_LOCKINGSP_UID],
+ OPAL_UID_LENGTH);
+ add_token_bytestring(&err, dev, opalmethod[OPAL_ACTIVATE],
+ OPAL_UID_LENGTH);
+
+
+ if (opal_act->sum) {
+ err = build_locking_range(user_lr, sizeof(user_lr),
+ opal_act->lr[0]);
+ if (err)
+ return err;
+
+ add_token_u8(&err, dev, OPAL_STARTLIST);
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, uint_3);
+ add_token_u8(&err, dev, 6);
+ add_token_u8(&err, dev, 0);
+ add_token_u8(&err, dev, 0);
+
+ add_token_u8(&err, dev, OPAL_STARTLIST);
+ add_token_bytestring(&err, dev, user_lr, OPAL_UID_LENGTH);
+ for (i = 1; i < opal_act->num_lrs; i++) {
+ user_lr[7] = opal_act->lr[i];
+ add_token_bytestring(&err, dev, user_lr, OPAL_UID_LENGTH);
+ }
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+
+ } else {
+ add_token_u8(&err, dev, OPAL_STARTLIST);
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+ }
+
+ if (err) {
+ pr_err("Error building Activate LockingSP command.\n");
+ return err;
+ }
+
+ return finalize_and_send(dev, parse_and_check_status);
+}
+
+static int get_lsp_lifecycle_cont(struct opal_dev *dev)
+{
+ u8 lc_status;
+ int error = 0;
+
+ error = parse_and_check_status(dev);
+ if (error)
+ return error;
+
+ lc_status = response_get_u64(&dev->parsed, 4);
+ /* 0x08 is Manufacured Inactive */
+ /* 0x09 is Manufactured */
+ if (lc_status != OPAL_MANUFACTURED_INACTIVE) {
+ pr_err("Couldn't determine the status of the Lifcycle state\n");
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
+/* Determine if we're in the Manufactured Inactive or Active state */
+static int get_lsp_lifecycle(struct opal_dev *dev)
+{
+ int err = 0;
+
+ clear_opal_cmd(dev);
+ set_comid(dev, dev->comid);
+
+ add_token_u8(&err, dev, OPAL_CALL);
+ add_token_bytestring(&err, dev, opaluid[OPAL_LOCKINGSP_UID],
+ OPAL_UID_LENGTH);
+ add_token_bytestring(&err, dev, opalmethod[OPAL_GET], OPAL_UID_LENGTH);
+
+ add_token_u8(&err, dev, OPAL_STARTLIST);
+ add_token_u8(&err, dev, OPAL_STARTLIST);
+
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, 3); /* Start Column */
+ add_token_u8(&err, dev, 6); /* Lifecycle Column */
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, 4); /* End Column */
+ add_token_u8(&err, dev, 6); /* Lifecycle Column */
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+
+ if (err) {
+ pr_err("Error Building GET Lifecycle Status command\n");
+ return err;
+ }
+
+ return finalize_and_send(dev, get_lsp_lifecycle_cont);
+}
+
+static int get_msid_cpin_pin_cont(struct opal_dev *dev)
+{
+ const char *msid_pin;
+ size_t strlen;
+ int error = 0;
+
+ error = parse_and_check_status(dev);
+ if (error)
+ return error;
+
+ strlen = response_get_string(&dev->parsed, 4, &msid_pin);
+ if (!msid_pin) {
+ pr_err("%s: Couldn't extract PIN from response\n", __func__);
+ return OPAL_INVAL_PARAM;
+ }
+
+ dev->prev_data = kmemdup(msid_pin, strlen, GFP_KERNEL);
+ if (!dev->prev_data)
+ return -ENOMEM;
+
+ dev->prev_d_len = strlen;
+
+ return 0;
+}
+
+static int get_msid_cpin_pin(struct opal_dev *dev)
+{
+ int err = 0;
+
+ clear_opal_cmd(dev);
+ set_comid(dev, dev->comid);
+
+
+ add_token_u8(&err, dev, OPAL_CALL);
+ add_token_bytestring(&err, dev, opaluid[OPAL_C_PIN_MSID],
+ OPAL_UID_LENGTH);
+ add_token_bytestring(&err, dev, opalmethod[OPAL_GET], OPAL_UID_LENGTH);
+
+ add_token_u8(&err, dev, OPAL_STARTLIST);
+ add_token_u8(&err, dev, OPAL_STARTLIST);
+
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, 3); /* Start Column */
+ add_token_u8(&err, dev, 3); /* PIN */
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, 4); /* End Column */
+ add_token_u8(&err, dev, 3); /* Lifecycle Column */
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+ add_token_u8(&err, dev, OPAL_ENDLIST);
+
+ if (err) {
+ pr_err("Error building Get MSID CPIN PIN command.\n");
+ return err;
+ }
+
+ return finalize_and_send(dev, get_msid_cpin_pin_cont);
+}
+
+static int build_end_opal_session(struct opal_dev *dev)
+{
+ int err = 0;
+
+ clear_opal_cmd(dev);
+
+ set_comid(dev, dev->comid);
+ add_token_u8(&err, dev, OPAL_ENDOFSESSION);
+ return err;
+}
+
+static int end_opal_session(struct opal_dev *dev)
+{
+ int ret = build_end_opal_session(dev);
+
+ if (ret < 0)
+ return ret;
+ return finalize_and_send(dev, end_session_cont);
+}
+
+static int end_opal_session_error(struct opal_dev *dev)
+{
+ const opal_step error_end_session[] = {
+ end_opal_session,
+ NULL,
+ };
+ dev->funcs = error_end_session;
+ dev->state = 0;
+ return next(dev);
+}
+
+static inline void setup_opal_dev(struct opal_dev *dev,
+ const opal_step *funcs)
+{
+ dev->state = 0;
+ dev->funcs = funcs;
+ dev->tsn = 0;
+ dev->hsn = 0;
+ dev->func_data = NULL;
+ dev->prev_data = NULL;
+}
+
+static int check_opal_support(struct opal_dev *dev)
+{
+ static const opal_step funcs[] = {
+ opal_discovery0,
+ NULL
+ };
+ int ret;
+
+ mutex_lock(&dev->dev_lock);
+ setup_opal_dev(dev, funcs);
+ ret = next(dev);
+ dev->supported = !ret;
+ mutex_unlock(&dev->dev_lock);
+ return ret;
+}
+
+struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv)
+{
+ struct opal_dev *dev;
+
+ dev = kmalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev)
+ return NULL;
+
+ INIT_LIST_HEAD(&dev->unlk_lst);
+ mutex_init(&dev->dev_lock);
+ dev->data = data;
+ dev->send_recv = send_recv;
+ if (check_opal_support(dev) != 0) {
+ pr_debug("Opal is not supported on this device\n");
+ kfree(dev);
+ return NULL;
+ }
+ return dev;
+}
+EXPORT_SYMBOL(init_opal_dev);
+
+static int opal_secure_erase_locking_range(struct opal_dev *dev,
+ struct opal_session_info *opal_session)
+{
+ void *data[3] = { NULL };
+ static const opal_step erase_funcs[] = {
+ opal_discovery0,
+ start_auth_opal_session,
+ get_active_key,
+ gen_key,
+ end_opal_session,
+ NULL,
+ };
+ int ret;
+
+ mutex_lock(&dev->dev_lock);
+ setup_opal_dev(dev, erase_funcs);
+
+ dev->func_data = data;
+ dev->func_data[1] = opal_session;
+ dev->func_data[2] = &opal_session->opal_key.lr;
+
+ ret = next(dev);
+ mutex_unlock(&dev->dev_lock);
+ return ret;
+}
+
+static int opal_erase_locking_range(struct opal_dev *dev,
+ struct opal_session_info *opal_session)
+{
+ void *data[3] = { NULL };
+ static const opal_step erase_funcs[] = {
+ opal_discovery0,
+ start_auth_opal_session,
+ erase_locking_range,
+ end_opal_session,
+ NULL,
+ };
+ int ret;
+
+ mutex_lock(&dev->dev_lock);
+ setup_opal_dev(dev, erase_funcs);
+
+ dev->func_data = data;
+ dev->func_data[1] = opal_session;
+ dev->func_data[2] = opal_session;
+
+ ret = next(dev);
+ mutex_unlock(&dev->dev_lock);
+ return ret;
+}
+
+static int opal_enable_disable_shadow_mbr(struct opal_dev *dev,
+ struct opal_mbr_data *opal_mbr)
+{
+ void *func_data[6] = { NULL };
+ static const opal_step mbr_funcs[] = {
+ opal_discovery0,
+ start_admin1LSP_opal_session,
+ set_mbr_done,
+ end_opal_session,
+ start_admin1LSP_opal_session,
+ set_mbr_enable_disable,
+ end_opal_session,
+ NULL,
+ };
+ int ret;
+
+ if (opal_mbr->enable_disable != OPAL_MBR_ENABLE &&
+ opal_mbr->enable_disable != OPAL_MBR_DISABLE)
+ return -EINVAL;
+
+ mutex_lock(&dev->dev_lock);
+ setup_opal_dev(dev, mbr_funcs);
+ dev->func_data = func_data;
+ dev->func_data[1] = &opal_mbr->key;
+ dev->func_data[2] = &opal_mbr->enable_disable;
+ dev->func_data[4] = &opal_mbr->key;
+ dev->func_data[5] = &opal_mbr->enable_disable;
+ ret = next(dev);
+ mutex_unlock(&dev->dev_lock);
+ return ret;
+}
+
+static int opal_save(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk)
+{
+ struct opal_suspend_data *suspend;
+
+ suspend = kzalloc(sizeof(*suspend), GFP_KERNEL);
+ if (!suspend)
+ return -ENOMEM;
+
+ suspend->unlk = *lk_unlk;
+ suspend->lr = lk_unlk->session.opal_key.lr;
+
+ mutex_lock(&dev->dev_lock);
+ setup_opal_dev(dev, NULL);
+ add_suspend_info(dev, suspend);
+ mutex_unlock(&dev->dev_lock);
+ return 0;
+}
+
+static int opal_add_user_to_lr(struct opal_dev *dev,
+ struct opal_lock_unlock *lk_unlk)
+{
+ void *func_data[3] = { NULL };
+ static const opal_step funcs[] = {
+ opal_discovery0,
+ start_admin1LSP_opal_session,
+ add_user_to_lr,
+ end_opal_session,
+ NULL
+ };
+ int ret;
+
+ if (lk_unlk->l_state != OPAL_RO &&
+ lk_unlk->l_state != OPAL_RW) {
+ pr_err("Locking state was not RO or RW\n");
+ return -EINVAL;
+ }
+ if (lk_unlk->session.who < OPAL_USER1 &&
+ lk_unlk->session.who > OPAL_USER9) {
+ pr_err("Authority was not within the range of users: %d\n",
+ lk_unlk->session.who);
+ return -EINVAL;
+ }
+ if (lk_unlk->session.sum) {
+ pr_err("%s not supported in sum. Use setup locking range\n",
+ __func__);
+ return -EINVAL;
+ }
+
+ mutex_lock(&dev->dev_lock);
+ setup_opal_dev(dev, funcs);
+ dev->func_data = func_data;
+ dev->func_data[1] = &lk_unlk->session.opal_key;
+ dev->func_data[2] = lk_unlk;
+ ret = next(dev);
+ mutex_unlock(&dev->dev_lock);
+ return ret;
+}
+
+static int opal_reverttper(struct opal_dev *dev, struct opal_key *opal)
+{
+ void *data[2] = { NULL };
+ static const opal_step revert_funcs[] = {
+ opal_discovery0,
+ start_SIDASP_opal_session,
+ revert_tper, /* controller will terminate session */
+ NULL,
+ };
+ int ret;
+
+ mutex_lock(&dev->dev_lock);
+ setup_opal_dev(dev, revert_funcs);
+ dev->func_data = data;
+ dev->func_data[1] = opal;
+ ret = next(dev);
+ mutex_unlock(&dev->dev_lock);
+ return ret;
+}
+
+static int __opal_lock_unlock_sum(struct opal_dev *dev)
+{
+ static const opal_step ulk_funcs_sum[] = {
+ opal_discovery0,
+ start_auth_opal_session,
+ lock_unlock_locking_range_sum,
+ end_opal_session,
+ NULL
+ };
+
+ dev->funcs = ulk_funcs_sum;
+ return next(dev);
+}
+
+static int __opal_lock_unlock(struct opal_dev *dev)
+{
+ static const opal_step _unlock_funcs[] = {
+ opal_discovery0,
+ start_auth_opal_session,
+ lock_unlock_locking_range,
+ end_opal_session,
+ NULL
+ };
+
+ dev->funcs = _unlock_funcs;
+ return next(dev);
+}
+
+static int opal_lock_unlock(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk)
+{
+ void *func_data[3] = { NULL };
+ int ret;
+
+ if (lk_unlk->session.who < OPAL_ADMIN1 ||
+ lk_unlk->session.who > OPAL_USER9)
+ return -EINVAL;
+
+ mutex_lock(&dev->dev_lock);
+ setup_opal_dev(dev, NULL);
+ dev->func_data = func_data;
+ dev->func_data[1] = &lk_unlk->session;
+ dev->func_data[2] = lk_unlk;
+
+ if (lk_unlk->session.sum)
+ ret = __opal_lock_unlock_sum(dev);
+ else
+ ret = __opal_lock_unlock(dev);
+
+ mutex_unlock(&dev->dev_lock);
+ return ret;
+}
+
+static int opal_take_ownership(struct opal_dev *dev, struct opal_key *opal)
+{
+ static const opal_step owner_funcs[] = {
+ opal_discovery0,
+ start_anybodyASP_opal_session,
+ get_msid_cpin_pin,
+ end_opal_session,
+ start_SIDASP_opal_session,
+ set_sid_cpin_pin,
+ end_opal_session,
+ NULL
+ };
+ void *data[6] = { NULL };
+ int ret;
+
+ if (!dev)
+ return -ENODEV;
+
+ mutex_lock(&dev->dev_lock);
+ setup_opal_dev(dev, owner_funcs);
+ dev->func_data = data;
+ dev->func_data[4] = opal;
+ dev->func_data[5] = opal;
+ ret = next(dev);
+ mutex_unlock(&dev->dev_lock);
+ return ret;
+}
+
+static int opal_activate_lsp(struct opal_dev *dev, struct opal_lr_act *opal_lr_act)
+{
+ void *data[4] = { NULL };
+ static const opal_step active_funcs[] = {
+ opal_discovery0,
+ start_SIDASP_opal_session, /* Open session as SID auth */
+ get_lsp_lifecycle,
+ activate_lsp,
+ end_opal_session,
+ NULL
+ };
+ int ret;
+
+ if (!opal_lr_act->num_lrs || opal_lr_act->num_lrs > OPAL_MAX_LRS)
+ return -EINVAL;
+
+ mutex_lock(&dev->dev_lock);
+ setup_opal_dev(dev, active_funcs);
+ dev->func_data = data;
+ dev->func_data[1] = &opal_lr_act->key;
+ dev->func_data[3] = opal_lr_act;
+ ret = next(dev);
+ mutex_unlock(&dev->dev_lock);
+ return ret;
+}
+
+static int opal_setup_locking_range(struct opal_dev *dev,
+ struct opal_user_lr_setup *opal_lrs)
+{
+ void *data[3] = { NULL };
+ static const opal_step lr_funcs[] = {
+ opal_discovery0,
+ start_auth_opal_session,
+ setup_locking_range,
+ end_opal_session,
+ NULL,
+ };
+ int ret;
+
+ mutex_lock(&dev->dev_lock);
+ setup_opal_dev(dev, lr_funcs);
+ dev->func_data = data;
+ dev->func_data[1] = &opal_lrs->session;
+ dev->func_data[2] = opal_lrs;
+ ret = next(dev);
+ mutex_unlock(&dev->dev_lock);
+ return ret;
+}
+
+static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw)
+{
+ static const opal_step pw_funcs[] = {
+ opal_discovery0,
+ start_auth_opal_session,
+ set_new_pw,
+ end_opal_session,
+ NULL
+ };
+ void *data[3] = { NULL };
+ int ret;
+
+ if (opal_pw->session.who < OPAL_ADMIN1 ||
+ opal_pw->session.who > OPAL_USER9 ||
+ opal_pw->new_user_pw.who < OPAL_ADMIN1 ||
+ opal_pw->new_user_pw.who > OPAL_USER9)
+ return -EINVAL;
+
+ mutex_lock(&dev->dev_lock);
+ setup_opal_dev(dev, pw_funcs);
+ dev->func_data = data;
+ dev->func_data[1] = (void *) &opal_pw->session;
+ dev->func_data[2] = (void *) &opal_pw->new_user_pw;
+
+ ret = next(dev);
+ mutex_unlock(&dev->dev_lock);
+ return ret;
+}
+
+static int opal_activate_user(struct opal_dev *dev,
+ struct opal_session_info *opal_session)
+{
+ static const opal_step act_funcs[] = {
+ opal_discovery0,
+ start_admin1LSP_opal_session,
+ internal_activate_user,
+ end_opal_session,
+ NULL
+ };
+ void *data[3] = { NULL };
+ int ret;
+
+ /* We can't activate Admin1 it's active as manufactured */
+ if (opal_session->who < OPAL_USER1 &&
+ opal_session->who > OPAL_USER9) {
+ pr_err("Who was not a valid user: %d\n", opal_session->who);
+ return -EINVAL;
+ }
+
+ mutex_lock(&dev->dev_lock);
+ setup_opal_dev(dev, act_funcs);
+ dev->func_data = data;
+ dev->func_data[1] = &opal_session->opal_key;
+ dev->func_data[2] = opal_session;
+ ret = next(dev);
+ mutex_unlock(&dev->dev_lock);
+ return ret;
+}
+
+bool opal_unlock_from_suspend(struct opal_dev *dev)
+{
+ struct opal_suspend_data *suspend;
+ void *func_data[3] = { NULL };
+ bool was_failure = false;
+ int ret = 0;
+
+ if (!dev)
+ return false;
+ if (!dev->supported)
+ return false;
+
+ mutex_lock(&dev->dev_lock);
+ setup_opal_dev(dev, NULL);
+ dev->func_data = func_data;
+
+ list_for_each_entry(suspend, &dev->unlk_lst, node) {
+ dev->state = 0;
+ dev->func_data[1] = &suspend->unlk.session;
+ dev->func_data[2] = &suspend->unlk;
+ dev->tsn = 0;
+ dev->hsn = 0;
+
+ if (suspend->unlk.session.sum)
+ ret = __opal_lock_unlock_sum(dev);
+ else
+ ret = __opal_lock_unlock(dev);
+ if (ret) {
+ pr_warn("Failed to unlock LR %hhu with sum %d\n",
+ suspend->unlk.session.opal_key.lr,
+ suspend->unlk.session.sum);
+ was_failure = true;
+ }
+ }
+ mutex_unlock(&dev->dev_lock);
+ return was_failure;
+}
+EXPORT_SYMBOL(opal_unlock_from_suspend);
+
+int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
+{
+ void *p;
+ int ret = -ENOTTY;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ if (!dev)
+ return -ENOTSUPP;
+ if (!dev->supported) {
+ pr_err("Not supported\n");
+ return -ENOTSUPP;
+ }
+
+ p = memdup_user(arg, _IOC_SIZE(cmd));
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ switch (cmd) {
+ case IOC_OPAL_SAVE:
+ ret = opal_save(dev, p);
+ break;
+ case IOC_OPAL_LOCK_UNLOCK:
+ ret = opal_lock_unlock(dev, p);
+ break;
+ case IOC_OPAL_TAKE_OWNERSHIP:
+ ret = opal_take_ownership(dev, p);
+ break;
+ case IOC_OPAL_ACTIVATE_LSP:
+ ret = opal_activate_lsp(dev, p);
+ break;
+ case IOC_OPAL_SET_PW:
+ ret = opal_set_new_pw(dev, p);
+ break;
+ case IOC_OPAL_ACTIVATE_USR:
+ ret = opal_activate_user(dev, p);
+ break;
+ case IOC_OPAL_REVERT_TPR:
+ ret = opal_reverttper(dev, p);
+ break;
+ case IOC_OPAL_LR_SETUP:
+ ret = opal_setup_locking_range(dev, p);
+ break;
+ case IOC_OPAL_ADD_USR_TO_LR:
+ ret = opal_add_user_to_lr(dev, p);
+ break;
+ case IOC_OPAL_ENABLE_DISABLE_MBR:
+ ret = opal_enable_disable_shadow_mbr(dev, p);
+ break;
+ case IOC_OPAL_ERASE_LR:
+ ret = opal_erase_locking_range(dev, p);
+ break;
+ case IOC_OPAL_SECURE_ERASE_LR:
+ ret = opal_secure_erase_locking_range(dev, p);
+ break;
+ default:
+ pr_warn("No such Opal Ioctl %u\n", cmd);
+ }
+
+ kfree(p);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sed_ioctl);