summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/autofs4/expire.c2
-rw-r--r--fs/bio.c10
-rw-r--r--fs/btrfs/Kconfig19
-rw-r--r--fs/btrfs/Makefile3
-rw-r--r--fs/btrfs/backref.c1131
-rw-r--r--fs/btrfs/backref.h5
-rw-r--r--fs/btrfs/btrfs_inode.h3
-rw-r--r--fs/btrfs/check-integrity.c3069
-rw-r--r--fs/btrfs/check-integrity.h36
-rw-r--r--fs/btrfs/ctree.c42
-rw-r--r--fs/btrfs/ctree.h239
-rw-r--r--fs/btrfs/delayed-inode.c45
-rw-r--r--fs/btrfs/delayed-ref.c153
-rw-r--r--fs/btrfs/delayed-ref.h104
-rw-r--r--fs/btrfs/disk-io.c126
-rw-r--r--fs/btrfs/disk-io.h6
-rw-r--r--fs/btrfs/export.c2
-rw-r--r--fs/btrfs/extent-tree.c514
-rw-r--r--fs/btrfs/extent_io.c8
-rw-r--r--fs/btrfs/extent_io.h2
-rw-r--r--fs/btrfs/file.c11
-rw-r--r--fs/btrfs/free-space-cache.c422
-rw-r--r--fs/btrfs/inode-map.c4
-rw-r--r--fs/btrfs/inode.c76
-rw-r--r--fs/btrfs/ioctl.c270
-rw-r--r--fs/btrfs/ioctl.h54
-rw-r--r--fs/btrfs/locking.c53
-rw-r--r--fs/btrfs/relocation.c20
-rw-r--r--fs/btrfs/scrub.c12
-rw-r--r--fs/btrfs/super.c190
-rw-r--r--fs/btrfs/transaction.c20
-rw-r--r--fs/btrfs/tree-log.c8
-rw-r--r--fs/btrfs/ulist.c220
-rw-r--r--fs/btrfs/ulist.h68
-rw-r--r--fs/btrfs/volumes.c993
-rw-r--r--fs/btrfs/volumes.h54
-rw-r--r--fs/btrfs/xattr.c2
-rw-r--r--fs/ceph/caps.c4
-rw-r--r--fs/ceph/dir.c4
-rw-r--r--fs/ceph/mds_client.c10
-rw-r--r--fs/ceph/mds_client.h7
-rw-r--r--fs/ceph/xattr.c4
-rw-r--r--fs/cifs/Kconfig7
-rw-r--r--fs/cifs/cifs_debug.c11
-rw-r--r--fs/cifs/cifs_spnego.c10
-rw-r--r--fs/cifs/cifs_unicode.c41
-rw-r--r--fs/cifs/cifs_unicode.h20
-rw-r--r--fs/cifs/cifsacl.c2
-rw-r--r--fs/cifs/cifsencrypt.c21
-rw-r--r--fs/cifs/cifsglob.h2
-rw-r--r--fs/cifs/cifssmb.c162
-rw-r--r--fs/cifs/connect.c318
-rw-r--r--fs/cifs/dir.c2
-rw-r--r--fs/cifs/readdir.c9
-rw-r--r--fs/cifs/sess.c45
-rw-r--r--fs/cifs/smbencrypt.c2
-rw-r--r--fs/compat.c56
-rw-r--r--fs/dcache.c8
-rw-r--r--fs/debugfs/file.c2
-rw-r--r--fs/ecryptfs/crypto.c122
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h11
-rw-r--r--fs/ecryptfs/inode.c50
-rw-r--r--fs/ecryptfs/keystore.c14
-rw-r--r--fs/ecryptfs/miscdev.c140
-rw-r--r--fs/ecryptfs/mmap.c12
-rw-r--r--fs/ecryptfs/read_write.c100
-rw-r--r--fs/ecryptfs/super.c14
-rw-r--r--fs/exec.c33
-rw-r--r--fs/ext2/ioctl.c22
-rw-r--r--fs/fs-writeback.c16
-rw-r--r--fs/inode.c11
-rw-r--r--fs/ioprio.c2
-rw-r--r--fs/jbd/checkpoint.c27
-rw-r--r--fs/jbd/recovery.c4
-rw-r--r--fs/jffs2/erase.c2
-rw-r--r--fs/logfs/dev_mtd.c6
-rw-r--r--fs/logfs/dir.c2
-rw-r--r--fs/logfs/file.c2
-rw-r--r--fs/logfs/gc.c2
-rw-r--r--fs/logfs/inode.c4
-rw-r--r--fs/logfs/journal.c1
-rw-r--r--fs/logfs/logfs.h5
-rw-r--r--fs/logfs/readwrite.c51
-rw-r--r--fs/logfs/segment.c51
-rw-r--r--fs/logfs/super.c3
-rw-r--r--fs/namei.c32
-rw-r--r--fs/nfs/nfs4proc.c130
-rw-r--r--fs/nfs/nfs4state.c2
-rw-r--r--fs/nfs/nfs4xdr.c5
-rw-r--r--fs/nilfs2/ioctl.c2
-rw-r--r--fs/ocfs2/namei.c2
-rw-r--r--fs/proc/base.c234
-rw-r--r--fs/proc/stat.c2
-rw-r--r--fs/proc/task_mmu.c3
-rw-r--r--fs/qnx4/inode.c62
-rw-r--r--fs/quota/dquot.c8
-rw-r--r--fs/quota/quota.c24
-rw-r--r--fs/select.c2
-rw-r--r--fs/super.c24
-rw-r--r--fs/sysfs/file.c6
-rw-r--r--fs/sysfs/inode.c5
-rw-r--r--fs/xfs/kmem.h6
-rw-r--r--fs/xfs/xfs_aops.c29
-rw-r--r--fs/xfs/xfs_attr.c4
-rw-r--r--fs/xfs/xfs_attr_leaf.c9
-rw-r--r--fs/xfs/xfs_bmap.c116
-rw-r--r--fs/xfs/xfs_dfrag.c43
-rw-r--r--fs/xfs/xfs_dquot.c127
-rw-r--r--fs/xfs/xfs_file.c184
-rw-r--r--fs/xfs/xfs_fs_subr.c2
-rw-r--r--fs/xfs/xfs_iget.c24
-rw-r--r--fs/xfs/xfs_inode.c193
-rw-r--r--fs/xfs/xfs_inode.h114
-rw-r--r--fs/xfs/xfs_inode_item.c8
-rw-r--r--fs/xfs/xfs_iomap.c46
-rw-r--r--fs/xfs/xfs_iops.c46
-rw-r--r--fs/xfs/xfs_log_recover.c8
-rw-r--r--fs/xfs/xfs_qm.c291
-rw-r--r--fs/xfs/xfs_qm.h14
-rw-r--r--fs/xfs/xfs_qm_stats.c4
-rw-r--r--fs/xfs/xfs_qm_syscalls.c12
-rw-r--r--fs/xfs/xfs_super.c8
-rw-r--r--fs/xfs/xfs_sync.c9
-rw-r--r--fs/xfs/xfs_trace.h34
-rw-r--r--fs/xfs/xfs_trans.c4
-rw-r--r--fs/xfs/xfs_trans_dquot.c10
-rw-r--r--fs/xfs/xfs_vnodeops.c47
127 files changed, 8585 insertions, 2800 deletions
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 450f529a4ea..1feb68ecef9 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -124,6 +124,7 @@ start:
/* Negative dentry - try next */
if (!simple_positive(q)) {
spin_unlock(&p->d_lock);
+ lock_set_subclass(&q->d_lock.dep_map, 0, _RET_IP_);
p = q;
goto again;
}
@@ -186,6 +187,7 @@ again:
/* Negative dentry - try next */
if (!simple_positive(ret)) {
spin_unlock(&p->d_lock);
+ lock_set_subclass(&ret->d_lock.dep_map, 0, _RET_IP_);
p = ret;
goto again;
}
diff --git a/fs/bio.c b/fs/bio.c
index b1fe82cf88c..b980ecde026 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -505,13 +505,9 @@ EXPORT_SYMBOL(bio_clone);
int bio_get_nr_vecs(struct block_device *bdev)
{
struct request_queue *q = bdev_get_queue(bdev);
- int nr_pages;
-
- nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (nr_pages > queue_max_segments(q))
- nr_pages = queue_max_segments(q);
-
- return nr_pages;
+ return min_t(unsigned,
+ queue_max_segments(q),
+ queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1);
}
EXPORT_SYMBOL(bio_get_nr_vecs);
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index ecb9fd3be14..d33f01c08b6 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -31,3 +31,22 @@ config BTRFS_FS_POSIX_ACL
Linux website <http://acl.bestbits.at/>.
If you don't know what Access Control Lists are, say N
+
+config BTRFS_FS_CHECK_INTEGRITY
+ bool "Btrfs with integrity check tool compiled in (DANGEROUS)"
+ depends on BTRFS_FS
+ help
+ Adds code that examines all block write requests (including
+ writes of the super block). The goal is to verify that the
+ state of the filesystem on disk is always consistent, i.e.,
+ after a power-loss or kernel panic event the filesystem is
+ in a consistent state.
+
+ If the integrity check tool is included and activated in
+ the mount options, plenty of kernel memory is used, and
+ plenty of additional CPU cycles are spent. Enabling this
+ functionality is not intended for normal use.
+
+ In most cases, unless you are a btrfs developer who needs
+ to verify the integrity of (super)-block write requests
+ during the run of a regression test, say N
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index c0ddfd29c5e..0c4fa2befae 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,6 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
export.o tree-log.o free-space-cache.o zlib.o lzo.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
- reada.o backref.o
+ reada.o backref.o ulist.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
+btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 22c64fff1bd..633c701a287 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -19,18 +19,789 @@
#include "ctree.h"
#include "disk-io.h"
#include "backref.h"
+#include "ulist.h"
+#include "transaction.h"
+#include "delayed-ref.h"
-struct __data_ref {
+/*
+ * this structure records all encountered refs on the way up to the root
+ */
+struct __prelim_ref {
struct list_head list;
- u64 inum;
- u64 root;
- u64 extent_data_item_offset;
+ u64 root_id;
+ struct btrfs_key key;
+ int level;
+ int count;
+ u64 parent;
+ u64 wanted_disk_byte;
};
-struct __shared_ref {
- struct list_head list;
+static int __add_prelim_ref(struct list_head *head, u64 root_id,
+ struct btrfs_key *key, int level, u64 parent,
+ u64 wanted_disk_byte, int count)
+{
+ struct __prelim_ref *ref;
+
+ /* in case we're adding delayed refs, we're holding the refs spinlock */
+ ref = kmalloc(sizeof(*ref), GFP_ATOMIC);
+ if (!ref)
+ return -ENOMEM;
+
+ ref->root_id = root_id;
+ if (key)
+ ref->key = *key;
+ else
+ memset(&ref->key, 0, sizeof(ref->key));
+
+ ref->level = level;
+ ref->count = count;
+ ref->parent = parent;
+ ref->wanted_disk_byte = wanted_disk_byte;
+ list_add_tail(&ref->list, head);
+
+ return 0;
+}
+
+static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
+ struct ulist *parents,
+ struct extent_buffer *eb, int level,
+ u64 wanted_objectid, u64 wanted_disk_byte)
+{
+ int ret;
+ int slot;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
u64 disk_byte;
-};
+
+add_parent:
+ ret = ulist_add(parents, eb->start, 0, GFP_NOFS);
+ if (ret < 0)
+ return ret;
+
+ if (level != 0)
+ return 0;
+
+ /*
+ * if the current leaf is full with EXTENT_DATA items, we must
+ * check the next one if that holds a reference as well.
+ * ref->count cannot be used to skip this check.
+ * repeat this until we don't find any additional EXTENT_DATA items.
+ */
+ while (1) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ return 0;
+
+ eb = path->nodes[0];
+ for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) {
+ btrfs_item_key_to_cpu(eb, &key, slot);
+ if (key.objectid != wanted_objectid ||
+ key.type != BTRFS_EXTENT_DATA_KEY)
+ return 0;
+ fi = btrfs_item_ptr(eb, slot,
+ struct btrfs_file_extent_item);
+ disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+ if (disk_byte == wanted_disk_byte)
+ goto add_parent;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * resolve an indirect backref in the form (root_id, key, level)
+ * to a logical address
+ */
+static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
+ struct __prelim_ref *ref,
+ struct ulist *parents)
+{
+ struct btrfs_path *path;
+ struct btrfs_root *root;
+ struct btrfs_key root_key;
+ struct btrfs_key key = {0};
+ struct extent_buffer *eb;
+ int ret = 0;
+ int root_level;
+ int level = ref->level;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ root_key.objectid = ref->root_id;
+ root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root_key.offset = (u64)-1;
+ root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+
+ rcu_read_lock();
+ root_level = btrfs_header_level(root->node);
+ rcu_read_unlock();
+
+ if (root_level + 1 == level)
+ goto out;
+
+ path->lowest_level = level;
+ ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0);
+ pr_debug("search slot in root %llu (level %d, ref count %d) returned "
+ "%d for key (%llu %u %llu)\n",
+ (unsigned long long)ref->root_id, level, ref->count, ret,
+ (unsigned long long)ref->key.objectid, ref->key.type,
+ (unsigned long long)ref->key.offset);
+ if (ret < 0)
+ goto out;
+
+ eb = path->nodes[level];
+ if (!eb) {
+ WARN_ON(1);
+ ret = 1;
+ goto out;
+ }
+
+ if (level == 0) {
+ if (ret == 1 && path->slots[0] >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret)
+ goto out;
+ eb = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
+ }
+
+ /* the last two parameters will only be used for level == 0 */
+ ret = add_all_parents(root, path, parents, eb, level, key.objectid,
+ ref->wanted_disk_byte);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * resolve all indirect backrefs from the list
+ */
+static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
+ struct list_head *head)
+{
+ int err;
+ int ret = 0;
+ struct __prelim_ref *ref;
+ struct __prelim_ref *ref_safe;
+ struct __prelim_ref *new_ref;
+ struct ulist *parents;
+ struct ulist_node *node;
+
+ parents = ulist_alloc(GFP_NOFS);
+ if (!parents)
+ return -ENOMEM;
+
+ /*
+ * _safe allows us to insert directly after the current item without
+ * iterating over the newly inserted items.
+ * we're also allowed to re-assign ref during iteration.
+ */
+ list_for_each_entry_safe(ref, ref_safe, head, list) {
+ if (ref->parent) /* already direct */
+ continue;
+ if (ref->count == 0)
+ continue;
+ err = __resolve_indirect_ref(fs_info, ref, parents);
+ if (err) {
+ if (ret == 0)
+ ret = err;
+ continue;
+ }
+
+ /* we put the first parent into the ref at hand */
+ node = ulist_next(parents, NULL);
+ ref->parent = node ? node->val : 0;
+
+ /* additional parents require new refs being added here */
+ while ((node = ulist_next(parents, node))) {
+ new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS);
+ if (!new_ref) {
+ ret = -ENOMEM;
+ break;
+ }
+ memcpy(new_ref, ref, sizeof(*ref));
+ new_ref->parent = node->val;
+ list_add(&new_ref->list, &ref->list);
+ }
+ ulist_reinit(parents);
+ }
+
+ ulist_free(parents);
+ return ret;
+}
+
+/*
+ * merge two lists of backrefs and adjust counts accordingly
+ *
+ * mode = 1: merge identical keys, if key is set
+ * mode = 2: merge identical parents
+ */
+static int __merge_refs(struct list_head *head, int mode)
+{
+ struct list_head *pos1;
+
+ list_for_each(pos1, head) {
+ struct list_head *n2;
+ struct list_head *pos2;
+ struct __prelim_ref *ref1;
+
+ ref1 = list_entry(pos1, struct __prelim_ref, list);
+
+ if (mode == 1 && ref1->key.type == 0)
+ continue;
+ for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
+ pos2 = n2, n2 = pos2->next) {
+ struct __prelim_ref *ref2;
+
+ ref2 = list_entry(pos2, struct __prelim_ref, list);
+
+ if (mode == 1) {
+ if (memcmp(&ref1->key, &ref2->key,
+ sizeof(ref1->key)) ||
+ ref1->level != ref2->level ||
+ ref1->root_id != ref2->root_id)
+ continue;
+ ref1->count += ref2->count;
+ } else {
+ if (ref1->parent != ref2->parent)
+ continue;
+ ref1->count += ref2->count;
+ }
+ list_del(&ref2->list);
+ kfree(ref2);
+ }
+
+ }
+ return 0;
+}
+
+/*
+ * add all currently queued delayed refs from this head whose seq nr is
+ * smaller or equal that seq to the list
+ */
+static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
+ struct btrfs_key *info_key,
+ struct list_head *prefs)
+{
+ struct btrfs_delayed_extent_op *extent_op = head->extent_op;
+ struct rb_node *n = &head->node.rb_node;
+ int sgn;
+ int ret = 0;
+
+ if (extent_op && extent_op->update_key)
+ btrfs_disk_key_to_cpu(info_key, &extent_op->key);
+
+ while ((n = rb_prev(n))) {
+ struct btrfs_delayed_ref_node *node;
+ node = rb_entry(n, struct btrfs_delayed_ref_node,
+ rb_node);
+ if (node->bytenr != head->node.bytenr)
+ break;
+ WARN_ON(node->is_head);
+
+ if (node->seq > seq)
+ continue;
+
+ switch (node->action) {
+ case BTRFS_ADD_DELAYED_EXTENT:
+ case BTRFS_UPDATE_DELAYED_HEAD:
+ WARN_ON(1);
+ continue;
+ case BTRFS_ADD_DELAYED_REF:
+ sgn = 1;
+ break;
+ case BTRFS_DROP_DELAYED_REF:
+ sgn = -1;
+ break;
+ default:
+ BUG_ON(1);
+ }
+ switch (node->type) {
+ case BTRFS_TREE_BLOCK_REF_KEY: {
+ struct btrfs_delayed_tree_ref *ref;
+
+ ref = btrfs_delayed_node_to_tree_ref(node);
+ ret = __add_prelim_ref(prefs, ref->root, info_key,
+ ref->level + 1, 0, node->bytenr,
+ node->ref_mod * sgn);
+ break;
+ }
+ case BTRFS_SHARED_BLOCK_REF_KEY: {
+ struct btrfs_delayed_tree_ref *ref;
+
+ ref = btrfs_delayed_node_to_tree_ref(node);
+ ret = __add_prelim_ref(prefs, ref->root, info_key,
+ ref->level + 1, ref->parent,
+ node->bytenr,
+ node->ref_mod * sgn);
+ break;
+ }
+ case BTRFS_EXTENT_DATA_REF_KEY: {
+ struct btrfs_delayed_data_ref *ref;
+ struct btrfs_key key;
+
+ ref = btrfs_delayed_node_to_data_ref(node);
+
+ key.objectid = ref->objectid;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = ref->offset;
+ ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0,
+ node->bytenr,
+ node->ref_mod * sgn);
+ break;
+ }
+ case BTRFS_SHARED_DATA_REF_KEY: {
+ struct btrfs_delayed_data_ref *ref;
+ struct btrfs_key key;
+
+ ref = btrfs_delayed_node_to_data_ref(node);
+
+ key.objectid = ref->objectid;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = ref->offset;
+ ret = __add_prelim_ref(prefs, ref->root, &key, 0,
+ ref->parent, node->bytenr,
+ node->ref_mod * sgn);
+ break;
+ }
+ default:
+ WARN_ON(1);
+ }
+ BUG_ON(ret);
+ }
+
+ return 0;
+}
+
+/*
+ * add all inline backrefs for bytenr to the list
+ */
+static int __add_inline_refs(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path, u64 bytenr,
+ struct btrfs_key *info_key, int *info_level,
+ struct list_head *prefs)
+{
+ int ret = 0;
+ int slot;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ unsigned long ptr;
+ unsigned long end;
+ struct btrfs_extent_item *ei;
+ u64 flags;
+ u64 item_size;
+
+ /*
+ * enumerate all inline refs
+ */
+ leaf = path->nodes[0];
+ slot = path->slots[0] - 1;
+
+ item_size = btrfs_item_size_nr(leaf, slot);
+ BUG_ON(item_size < sizeof(*ei));
+
+ ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+ flags = btrfs_extent_flags(leaf, ei);
+
+ ptr = (unsigned long)(ei + 1);
+ end = (unsigned long)ei + item_size;
+
+ if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ struct btrfs_tree_block_info *info;
+ struct btrfs_disk_key disk_key;
+
+ info = (struct btrfs_tree_block_info *)ptr;
+ *info_level = btrfs_tree_block_level(leaf, info);
+ btrfs_tree_block_key(leaf, info, &disk_key);
+ btrfs_disk_key_to_cpu(info_key, &disk_key);
+ ptr += sizeof(struct btrfs_tree_block_info);
+ BUG_ON(ptr > end);
+ } else {
+ BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
+ }
+
+ while (ptr < end) {
+ struct btrfs_extent_inline_ref *iref;
+ u64 offset;
+ int type;
+
+ iref = (struct btrfs_extent_inline_ref *)ptr;
+ type = btrfs_extent_inline_ref_type(leaf, iref);
+ offset = btrfs_extent_inline_ref_offset(leaf, iref);
+
+ switch (type) {
+ case BTRFS_SHARED_BLOCK_REF_KEY:
+ ret = __add_prelim_ref(prefs, 0, info_key,
+ *info_level + 1, offset,
+ bytenr, 1);
+ break;
+ case BTRFS_SHARED_DATA_REF_KEY: {
+ struct btrfs_shared_data_ref *sdref;
+ int count;
+
+ sdref = (struct btrfs_shared_data_ref *)(iref + 1);
+ count = btrfs_shared_data_ref_count(leaf, sdref);
+ ret = __add_prelim_ref(prefs, 0, NULL, 0, offset,
+ bytenr, count);
+ break;
+ }
+ case BTRFS_TREE_BLOCK_REF_KEY:
+ ret = __add_prelim_ref(prefs, offset, info_key,
+ *info_level + 1, 0, bytenr, 1);
+ break;
+ case BTRFS_EXTENT_DATA_REF_KEY: {
+ struct btrfs_extent_data_ref *dref;
+ int count;
+ u64 root;
+
+ dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+ count = btrfs_extent_data_ref_count(leaf, dref);
+ key.objectid = btrfs_extent_data_ref_objectid(leaf,
+ dref);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+ root = btrfs_extent_data_ref_root(leaf, dref);
+ ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr,
+ count);
+ break;
+ }
+ default:
+ WARN_ON(1);
+ }
+ BUG_ON(ret);
+ ptr += btrfs_extent_inline_ref_size(type);
+ }
+
+ return 0;
+}
+
+/*
+ * add all non-inline backrefs for bytenr to the list
+ */
+static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path, u64 bytenr,
+ struct btrfs_key *info_key, int info_level,
+ struct list_head *prefs)
+{
+ struct btrfs_root *extent_root = fs_info->extent_root;
+ int ret;
+ int slot;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+
+ while (1) {
+ ret = btrfs_next_item(extent_root, path);
+ if (ret < 0)
+ break;
+ if (ret) {
+ ret = 0;
+ break;
+ }
+
+ slot = path->slots[0];
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ if (key.objectid != bytenr)
+ break;
+ if (key.type < BTRFS_TREE_BLOCK_REF_KEY)
+ continue;
+ if (key.type > BTRFS_SHARED_DATA_REF_KEY)
+ break;
+
+ switch (key.type) {
+ case BTRFS_SHARED_BLOCK_REF_KEY:
+ ret = __add_prelim_ref(prefs, 0, info_key,
+ info_level + 1, key.offset,
+ bytenr, 1);
+ break;
+ case BTRFS_SHARED_DATA_REF_KEY: {
+ struct btrfs_shared_data_ref *sdref;
+ int count;
+
+ sdref = btrfs_item_ptr(leaf, slot,
+ struct btrfs_shared_data_ref);
+ count = btrfs_shared_data_ref_count(leaf, sdref);
+ ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset,
+ bytenr, count);
+ break;
+ }
+ case BTRFS_TREE_BLOCK_REF_KEY:
+ ret = __add_prelim_ref(prefs, key.offset, info_key,
+ info_level + 1, 0, bytenr, 1);
+ break;
+ case BTRFS_EXTENT_DATA_REF_KEY: {
+ struct btrfs_extent_data_ref *dref;
+ int count;
+ u64 root;
+
+ dref = btrfs_item_ptr(leaf, slot,
+ struct btrfs_extent_data_ref);
+ count = btrfs_extent_data_ref_count(leaf, dref);
+ key.objectid = btrfs_extent_data_ref_objectid(leaf,
+ dref);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+ root = btrfs_extent_data_ref_root(leaf, dref);
+ ret = __add_prelim_ref(prefs, root, &key, 0, 0,
+ bytenr, count);
+ break;
+ }
+ default:
+ WARN_ON(1);
+ }
+ BUG_ON(ret);
+ }
+
+ return ret;
+}
+
+/*
+ * this adds all existing backrefs (inline backrefs, backrefs and delayed
+ * refs) for the given bytenr to the refs list, merges duplicates and resolves
+ * indirect refs to their parent bytenr.
+ * When roots are found, they're added to the roots list
+ *
+ * FIXME some caching might speed things up
+ */
+static int find_parent_nodes(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 seq, struct ulist *refs, struct ulist *roots)
+{
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct btrfs_key info_key = { 0 };
+ struct btrfs_delayed_ref_root *delayed_refs = NULL;
+ struct btrfs_delayed_ref_head *head = NULL;
+ int info_level = 0;
+ int ret;
+ struct list_head prefs_delayed;
+ struct list_head prefs;
+ struct __prelim_ref *ref;
+
+ INIT_LIST_HEAD(&prefs);
+ INIT_LIST_HEAD(&prefs_delayed);
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /*
+ * grab both a lock on the path and a lock on the delayed ref head.
+ * We need both to get a consistent picture of how the refs look
+ * at a specified point in time
+ */
+again:
+ ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ BUG_ON(ret == 0);
+
+ /*
+ * look if there are updates for this ref queued and lock the head
+ */
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+ head = btrfs_find_delayed_ref_head(trans, bytenr);
+ if (head) {
+ if (!mutex_trylock(&head->mutex)) {
+ atomic_inc(&head->node.refs);
+ spin_unlock(&delayed_refs->lock);
+
+ btrfs_release_path(path);
+
+ /*
+ * Mutex was contended, block until it's
+ * released and try again
+ */
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref(&head->node);
+ goto again;
+ }
+ ret = __add_delayed_refs(head, seq, &info_key, &prefs_delayed);
+ if (ret)
+ goto out;
+ }
+ spin_unlock(&delayed_refs->lock);
+
+ if (path->slots[0]) {
+ struct extent_buffer *leaf;
+ int slot;
+
+ leaf = path->nodes[0];
+ slot = path->slots[0] - 1;
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid == bytenr &&
+ key.type == BTRFS_EXTENT_ITEM_KEY) {
+ ret = __add_inline_refs(fs_info, path, bytenr,
+ &info_key, &info_level, &prefs);
+ if (ret)
+ goto out;
+ ret = __add_keyed_refs(fs_info, path, bytenr, &info_key,
+ info_level, &prefs);
+ if (ret)
+ goto out;
+ }
+ }
+ btrfs_release_path(path);
+
+ /*
+ * when adding the delayed refs above, the info_key might not have
+ * been known yet. Go over the list and replace the missing keys
+ */
+ list_for_each_entry(ref, &prefs_delayed, list) {
+ if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0)
+ memcpy(&ref->key, &info_key, sizeof(ref->key));
+ }
+ list_splice_init(&prefs_delayed, &prefs);
+
+ ret = __merge_refs(&prefs, 1);
+ if (ret)
+ goto out;
+
+ ret = __resolve_indirect_refs(fs_info, &prefs);
+ if (ret)
+ goto out;
+
+ ret = __merge_refs(&prefs, 2);
+ if (ret)
+ goto out;
+
+ while (!list_empty(&prefs)) {
+ ref = list_first_entry(&prefs, struct __prelim_ref, list);
+ list_del(&ref->list);
+ if (ref->count < 0)
+ WARN_ON(1);
+ if (ref->count && ref->root_id && ref->parent == 0) {
+ /* no parent == root of tree */
+ ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
+ BUG_ON(ret < 0);
+ }
+ if (ref->count && ref->parent) {
+ ret = ulist_add(refs, ref->parent, 0, GFP_NOFS);
+ BUG_ON(ret < 0);
+ }
+ kfree(ref);
+ }
+
+out:
+ if (head)
+ mutex_unlock(&head->mutex);
+ btrfs_free_path(path);
+ while (!list_empty(&prefs)) {
+ ref = list_first_entry(&prefs, struct __prelim_ref, list);
+ list_del(&ref->list);
+ kfree(ref);
+ }
+ while (!list_empty(&prefs_delayed)) {
+ ref = list_first_entry(&prefs_delayed, struct __prelim_ref,
+ list);
+ list_del(&ref->list);
+ kfree(ref);
+ }
+
+ return ret;
+}
+
+/*
+ * Finds all leafs with a reference to the specified combination of bytenr and
+ * offset. key_list_head will point to a list of corresponding keys (caller must
+ * free each list element). The leafs will be stored in the leafs ulist, which
+ * must be freed with ulist_free.
+ *
+ * returns 0 on success, <0 on error
+ */
+static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 num_bytes, u64 seq, struct ulist **leafs)
+{
+ struct ulist *tmp;
+ int ret;
+
+ tmp = ulist_alloc(GFP_NOFS);
+ if (!tmp)
+ return -ENOMEM;
+ *leafs = ulist_alloc(GFP_NOFS);
+ if (!*leafs) {
+ ulist_free(tmp);
+ return -ENOMEM;
+ }
+
+ ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp);
+ ulist_free(tmp);
+
+ if (ret < 0 && ret != -ENOENT) {
+ ulist_free(*leafs);
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * walk all backrefs for a given extent to find all roots that reference this
+ * extent. Walking a backref means finding all extents that reference this
+ * extent and in turn walk the backrefs of those, too. Naturally this is a
+ * recursive process, but here it is implemented in an iterative fashion: We
+ * find all referencing extents for the extent in question and put them on a
+ * list. In turn, we find all referencing extents for those, further appending
+ * to the list. The way we iterate the list allows adding more elements after
+ * the current while iterating. The process stops when we reach the end of the
+ * list. Found roots are added to the roots list.
+ *
+ * returns 0 on success, < 0 on error.
+ */
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 num_bytes, u64 seq, struct ulist **roots)
+{
+ struct ulist *tmp;
+ struct ulist_node *node = NULL;
+ int ret;
+
+ tmp = ulist_alloc(GFP_NOFS);
+ if (!tmp)
+ return -ENOMEM;
+ *roots = ulist_alloc(GFP_NOFS);
+ if (!*roots) {
+ ulist_free(tmp);
+ return -ENOMEM;
+ }
+
+ while (1) {
+ ret = find_parent_nodes(trans, fs_info, bytenr, seq,
+ tmp, *roots);
+ if (ret < 0 && ret != -ENOENT) {
+ ulist_free(tmp);
+ ulist_free(*roots);
+ return ret;
+ }
+ node = ulist_next(tmp, node);
+ if (!node)
+ break;
+ bytenr = node->val;
+ }
+
+ ulist_free(tmp);
+ return 0;
+}
+
static int __inode_info(u64 inum, u64 ioff, u8 key_type,
struct btrfs_root *fs_root, struct btrfs_path *path,
@@ -181,8 +952,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
found_key->objectid > logical ||
- found_key->objectid + found_key->offset <= logical)
+ found_key->objectid + found_key->offset <= logical) {
+ pr_debug("logical %llu is not within any extent\n",
+ (unsigned long long)logical);
return -ENOENT;
+ }
eb = path->nodes[0];
item_size = btrfs_item_size_nr(eb, path->slots[0]);
@@ -191,6 +965,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
flags = btrfs_extent_flags(eb, ei);
+ pr_debug("logical %llu is at position %llu within the extent (%llu "
+ "EXTENT_ITEM %llu) flags %#llx size %u\n",
+ (unsigned long long)logical,
+ (unsigned long long)(logical - found_key->objectid),
+ (unsigned long long)found_key->objectid,
+ (unsigned long long)found_key->offset,
+ (unsigned long long)flags, item_size);
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
return BTRFS_EXTENT_FLAG_TREE_BLOCK;
if (flags & BTRFS_EXTENT_FLAG_DATA)
@@ -287,128 +1068,11 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
return 0;
}
-static int __data_list_add(struct list_head *head, u64 inum,
- u64 extent_data_item_offset, u64 root)
-{
- struct __data_ref *ref;
-
- ref = kmalloc(sizeof(*ref), GFP_NOFS);
- if (!ref)
- return -ENOMEM;
-
- ref->inum = inum;
- ref->extent_data_item_offset = extent_data_item_offset;
- ref->root = root;
- list_add_tail(&ref->list, head);
-
- return 0;
-}
-
-static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb,
- struct btrfs_extent_data_ref *dref)
-{
- return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref),
- btrfs_extent_data_ref_offset(eb, dref),
- btrfs_extent_data_ref_root(eb, dref));
-}
-
-static int __shared_list_add(struct list_head *head, u64 disk_byte)
-{
- struct __shared_ref *ref;
-
- ref = kmalloc(sizeof(*ref), GFP_NOFS);
- if (!ref)
- return -ENOMEM;
-
- ref->disk_byte = disk_byte;
- list_add_tail(&ref->list, head);
-
- return 0;
-}
-
-static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info,
- u64 logical, u64 inum,
- u64 extent_data_item_offset,
- u64 extent_offset,
- struct btrfs_path *path,
- struct list_head *data_refs,
- iterate_extent_inodes_t *iterate,
- void *ctx)
-{
- u64 ref_root;
- u32 item_size;
- struct btrfs_key key;
- struct extent_buffer *eb;
- struct btrfs_extent_item *ei;
- struct btrfs_extent_inline_ref *eiref;
- struct __data_ref *ref;
- int ret;
- int type;
- int last;
- unsigned long ptr = 0;
-
- WARN_ON(!list_empty(data_refs));
- ret = extent_from_logical(fs_info, logical, path, &key);
- if (ret & BTRFS_EXTENT_FLAG_DATA)
- ret = -EIO;
- if (ret < 0)
- goto out;
-
- eb = path->nodes[0];
- ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
- item_size = btrfs_item_size_nr(eb, path->slots[0]);
-
- ret = 0;
- ref_root = 0;
- /*
- * as done in iterate_extent_inodes, we first build a list of refs to
- * iterate, then free the path and then iterate them to avoid deadlocks.
- */
- do {
- last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
- &eiref, &type);
- if (last < 0) {
- ret = last;
- goto out;
- }
- if (type == BTRFS_TREE_BLOCK_REF_KEY ||
- type == BTRFS_SHARED_BLOCK_REF_KEY) {
- ref_root = btrfs_extent_inline_ref_offset(eb, eiref);
- ret = __data_list_add(data_refs, inum,
- extent_data_item_offset,
- ref_root);
- }
- } while (!ret && !last);
-
- btrfs_release_path(path);
-
- if (ref_root == 0) {
- printk(KERN_ERR "btrfs: failed to find tree block ref "
- "for shared data backref %llu\n", logical);
- WARN_ON(1);
- ret = -EIO;
- }
-
-out:
- while (!list_empty(data_refs)) {
- ref = list_first_entry(data_refs, struct __data_ref, list);
- list_del(&ref->list);
- if (!ret)
- ret = iterate(ref->inum, extent_offset +
- ref->extent_data_item_offset,
- ref->root, ctx);
- kfree(ref);
- }
-
- return ret;
-}
-
-static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
- u64 logical, u64 orig_extent_item_objectid,
- u64 extent_offset, struct btrfs_path *path,
- struct list_head *data_refs,
- iterate_extent_inodes_t *iterate,
- void *ctx)
+static int iterate_leaf_refs(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path, u64 logical,
+ u64 orig_extent_item_objectid,
+ u64 extent_item_pos, u64 root,
+ iterate_extent_inodes_t *iterate, void *ctx)
{
u64 disk_byte;
struct btrfs_key key;
@@ -416,8 +1080,10 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb;
int slot;
int nritems;
- int ret;
- int found = 0;
+ int ret = 0;
+ int extent_type;
+ u64 data_offset;
+ u64 data_len;
eb = read_tree_block(fs_info->tree_root, logical,
fs_info->tree_root->leafsize, 0);
@@ -435,149 +1101,99 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
if (key.type != BTRFS_EXTENT_DATA_KEY)
continue;
fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
- if (!fi) {
- free_extent_buffer(eb);
- return -EIO;
- }
+ extent_type = btrfs_file_extent_type(eb, fi);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+ continue;
+ /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
- if (disk_byte != orig_extent_item_objectid) {
- if (found)
- break;
- else
- continue;
- }
- ++found;
- ret = __iter_shared_inline_ref_inodes(fs_info, logical,
- key.objectid,
- key.offset,
- extent_offset, path,
- data_refs,
- iterate, ctx);
- if (ret)
- break;
- }
+ if (disk_byte != orig_extent_item_objectid)
+ continue;
- if (!found) {
- printk(KERN_ERR "btrfs: failed to follow shared data backref "
- "to parent %llu\n", logical);
- WARN_ON(1);
- ret = -EIO;
+ data_offset = btrfs_file_extent_offset(eb, fi);
+ data_len = btrfs_file_extent_num_bytes(eb, fi);
+
+ if (extent_item_pos < data_offset ||
+ extent_item_pos >= data_offset + data_len)
+ continue;
+
+ pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), "
+ "root %llu\n", orig_extent_item_objectid,
+ key.objectid, key.offset, root);
+ ret = iterate(key.objectid,
+ key.offset + (extent_item_pos - data_offset),
+ root, ctx);
+ if (ret) {
+ pr_debug("stopping iteration because ret=%d\n", ret);
+ break;
+ }
}
free_extent_buffer(eb);
+
return ret;
}
/*
* calls iterate() for every inode that references the extent identified by
- * the given parameters. will use the path given as a parameter and return it
- * released.
+ * the given parameters.
* when the iterator function returns a non-zero value, iteration stops.
+ * path is guaranteed to be in released state when iterate() is called.
*/
int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
- u64 extent_item_objectid,
- u64 extent_offset,
+ u64 extent_item_objectid, u64 extent_item_pos,
iterate_extent_inodes_t *iterate, void *ctx)
{
- unsigned long ptr = 0;
- int last;
int ret;
- int type;
- u64 logical;
- u32 item_size;
- struct btrfs_extent_inline_ref *eiref;
- struct btrfs_extent_data_ref *dref;
- struct extent_buffer *eb;
- struct btrfs_extent_item *ei;
- struct btrfs_key key;
struct list_head data_refs = LIST_HEAD_INIT(data_refs);
struct list_head shared_refs = LIST_HEAD_INIT(shared_refs);
- struct __data_ref *ref_d;
- struct __shared_ref *ref_s;
+ struct btrfs_trans_handle *trans;
+ struct ulist *refs;
+ struct ulist *roots;
+ struct ulist_node *ref_node = NULL;
+ struct ulist_node *root_node = NULL;
+ struct seq_list seq_elem;
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ trans = btrfs_join_transaction(fs_info->extent_root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ pr_debug("resolving all inodes for extent %llu\n",
+ extent_item_objectid);
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+ btrfs_get_delayed_seq(delayed_refs, &seq_elem);
+ spin_unlock(&delayed_refs->lock);
+
+ ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
+ extent_item_pos, seq_elem.seq,
+ &refs);
- eb = path->nodes[0];
- ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
- item_size = btrfs_item_size_nr(eb, path->slots[0]);
-
- /* first we iterate the inline refs, ... */
- do {
- last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
- &eiref, &type);
- if (last == -ENOENT) {
- ret = 0;
- break;
- }
- if (last < 0) {
- ret = last;
- break;
- }
-
- if (type == BTRFS_EXTENT_DATA_REF_KEY) {
- dref = (struct btrfs_extent_data_ref *)(&eiref->offset);
- ret = __data_list_add_eb(&data_refs, eb, dref);
- } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
- logical = btrfs_extent_inline_ref_offset(eb, eiref);
- ret = __shared_list_add(&shared_refs, logical);
- }
- } while (!ret && !last);
+ if (ret)
+ goto out;
- /* ... then we proceed to in-tree references and ... */
- while (!ret) {
- ++path->slots[0];
- if (path->slots[0] > btrfs_header_nritems(eb)) {
- ret = btrfs_next_leaf(fs_info->extent_root, path);
- if (ret) {
- if (ret == 1)
- ret = 0; /* we're done */
- break;
- }
- eb = path->nodes[0];
- }
- btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
- if (key.objectid != extent_item_objectid)
+ while (!ret && (ref_node = ulist_next(refs, ref_node))) {
+ ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1,
+ seq_elem.seq, &roots);
+ if (ret)
break;
- if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
- dref = btrfs_item_ptr(eb, path->slots[0],
- struct btrfs_extent_data_ref);
- ret = __data_list_add_eb(&data_refs, eb, dref);
- } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
- ret = __shared_list_add(&shared_refs, key.offset);
+ while (!ret && (root_node = ulist_next(roots, root_node))) {
+ pr_debug("root %llu references leaf %llu\n",
+ root_node->val, ref_node->val);
+ ret = iterate_leaf_refs(fs_info, path, ref_node->val,
+ extent_item_objectid,
+ extent_item_pos, root_node->val,
+ iterate, ctx);
}
}
- btrfs_release_path(path);
-
- /*
- * ... only at the very end we can process the refs we found. this is
- * because the iterator function we call is allowed to make tree lookups
- * and we have to avoid deadlocks. additionally, we need more tree
- * lookups ourselves for shared data refs.
- */
- while (!list_empty(&data_refs)) {
- ref_d = list_first_entry(&data_refs, struct __data_ref, list);
- list_del(&ref_d->list);
- if (!ret)
- ret = iterate(ref_d->inum, extent_offset +
- ref_d->extent_data_item_offset,
- ref_d->root, ctx);
- kfree(ref_d);
- }
-
- while (!list_empty(&shared_refs)) {
- ref_s = list_first_entry(&shared_refs, struct __shared_ref,
- list);
- list_del(&ref_s->list);
- if (!ret)
- ret = __iter_shared_inline_ref(fs_info,
- ref_s->disk_byte,
- extent_item_objectid,
- extent_offset, path,
- &data_refs,
- iterate, ctx);
- kfree(ref_s);
- }
-
+ ulist_free(refs);
+ ulist_free(roots);
+out:
+ btrfs_put_delayed_seq(delayed_refs, &seq_elem);
+ btrfs_end_transaction(trans, fs_info->extent_root);
return ret;
}
@@ -586,19 +1202,20 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
iterate_extent_inodes_t *iterate, void *ctx)
{
int ret;
- u64 offset;
+ u64 extent_item_pos;
struct btrfs_key found_key;
ret = extent_from_logical(fs_info, logical, path,
&found_key);
+ btrfs_release_path(path);
if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
ret = -EINVAL;
if (ret < 0)
return ret;
- offset = logical - found_key.objectid;
+ extent_item_pos = logical - found_key.objectid;
ret = iterate_extent_inodes(fs_info, path, found_key.objectid,
- offset, iterate, ctx);
+ extent_item_pos, iterate, ctx);
return ret;
}
@@ -643,6 +1260,10 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
name_len = btrfs_inode_ref_name_len(eb, iref);
/* path must be released before calling iterate()! */
+ pr_debug("following ref at offset %u for inode %llu in "
+ "tree %llu\n", cur,
+ (unsigned long long)found_key.objectid,
+ (unsigned long long)fs_root->objectid);
ret = iterate(parent, iref, eb, ctx);
if (ret) {
free_extent_buffer(eb);
@@ -683,10 +1304,14 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
return PTR_ERR(fspath);
if (fspath > fspath_min) {
+ pr_debug("path resolved: %s\n", fspath);
ipath->fspath->val[i] = (u64)(unsigned long)fspath;
++ipath->fspath->elem_cnt;
ipath->fspath->bytes_left = fspath - fspath_min;
} else {
+ pr_debug("missed path, not enough space. missing bytes: %lu, "
+ "constructed so far: %s\n",
+ (unsigned long)(fspath_min - fspath), fspath_min);
++ipath->fspath->elem_missed;
ipath->fspath->bytes_missing += fspath_min - fspath;
ipath->fspath->bytes_left = 0;
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 92618837cb8..d00dfa9ca93 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -20,6 +20,7 @@
#define __BTRFS_BACKREF__
#include "ioctl.h"
+#include "ulist.h"
struct inode_fs_paths {
struct btrfs_path *btrfs_path;
@@ -54,6 +55,10 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 num_bytes, u64 seq, struct ulist **roots);
+
struct btrfs_data_container *init_data_container(u32 total_bytes);
struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
struct btrfs_path *path);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 634608d2a6d..9b9b15fd520 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -51,6 +51,9 @@ struct btrfs_inode {
/* held while logging the inode in tree-log.c */
struct mutex log_mutex;
+ /* held while doing delalloc reservations */
+ struct mutex delalloc_mutex;
+
/* used to order data wrt metadata */
struct btrfs_ordered_inode_tree ordered_tree;
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
new file mode 100644
index 00000000000..b669a7d8e49
--- /dev/null
+++ b/fs/btrfs/check-integrity.c
@@ -0,0 +1,3069 @@
+/*
+ * Copyright (C) STRATO AG 2011. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+/*
+ * This module can be used to catch cases when the btrfs kernel
+ * code executes write requests to the disk that bring the file
+ * system in an inconsistent state. In such a state, a power-loss
+ * or kernel panic event would cause that the data on disk is
+ * lost or at least damaged.
+ *
+ * Code is added that examines all block write requests during
+ * runtime (including writes of the super block). Three rules
+ * are verified and an error is printed on violation of the
+ * rules:
+ * 1. It is not allowed to write a disk block which is
+ * currently referenced by the super block (either directly
+ * or indirectly).
+ * 2. When a super block is written, it is verified that all
+ * referenced (directly or indirectly) blocks fulfill the
+ * following requirements:
+ * 2a. All referenced blocks have either been present when
+ * the file system was mounted, (i.e., they have been
+ * referenced by the super block) or they have been
+ * written since then and the write completion callback
+ * was called and a FLUSH request to the device where
+ * these blocks are located was received and completed.
+ * 2b. All referenced blocks need to have a generation
+ * number which is equal to the parent's number.
+ *
+ * One issue that was found using this module was that the log
+ * tree on disk became temporarily corrupted because disk blocks
+ * that had been in use for the log tree had been freed and
+ * reused too early, while being referenced by the written super
+ * block.
+ *
+ * The search term in the kernel log that can be used to filter
+ * on the existence of detected integrity issues is
+ * "btrfs: attempt".
+ *
+ * The integrity check is enabled via mount options. These
+ * mount options are only supported if the integrity check
+ * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY.
+ *
+ * Example #1, apply integrity checks to all metadata:
+ * mount /dev/sdb1 /mnt -o check_int
+ *
+ * Example #2, apply integrity checks to all metadata and
+ * to data extents:
+ * mount /dev/sdb1 /mnt -o check_int_data
+ *
+ * Example #3, apply integrity checks to all metadata and dump
+ * the tree that the super block references to kernel messages
+ * each time after a super block was written:
+ * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263
+ *
+ * If the integrity check tool is included and activated in
+ * the mount options, plenty of kernel memory is used, and
+ * plenty of additional CPU cycles are spent. Enabling this
+ * functionality is not intended for normal use. In most
+ * cases, unless you are a btrfs developer who needs to verify
+ * the integrity of (super)-block write requests, do not
+ * enable the config option BTRFS_FS_CHECK_INTEGRITY to
+ * include and compile the integrity check tool.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/mutex.h>
+#include <linux/crc32c.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "extent_io.h"
+#include "disk-io.h"
+#include "volumes.h"
+#include "print-tree.h"
+#include "locking.h"
+#include "check-integrity.h"
+
+#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
+#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
+#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100
+#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051
+#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807
+#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530
+#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
+#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters,
+ * excluding " [...]" */
+#define BTRFSIC_BLOCK_SIZE PAGE_SIZE
+
+#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)
+
+/*
+ * The definition of the bitmask fields for the print_mask.
+ * They are specified with the mount option check_integrity_print_mask.
+ */
+#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE 0x00000001
+#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION 0x00000002
+#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE 0x00000004
+#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE 0x00000008
+#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH 0x00000010
+#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH 0x00000020
+#define BTRFSIC_PRINT_MASK_VERBOSE 0x00000040
+#define BTRFSIC_PRINT_MASK_VERY_VERBOSE 0x00000080
+#define BTRFSIC_PRINT_MASK_INITIAL_TREE 0x00000100
+#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES 0x00000200
+#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400
+#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800
+#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000
+
+struct btrfsic_dev_state;
+struct btrfsic_state;
+
+struct btrfsic_block {
+ u32 magic_num; /* only used for debug purposes */
+ unsigned int is_metadata:1; /* if it is meta-data, not data-data */
+ unsigned int is_superblock:1; /* if it is one of the superblocks */
+ unsigned int is_iodone:1; /* if is done by lower subsystem */
+ unsigned int iodone_w_error:1; /* error was indicated to endio */
+ unsigned int never_written:1; /* block was added because it was
+ * referenced, not because it was
+ * written */
+ unsigned int mirror_num:2; /* large enough to hold
+ * BTRFS_SUPER_MIRROR_MAX */
+ struct btrfsic_dev_state *dev_state;
+ u64 dev_bytenr; /* key, physical byte num on disk */
+ u64 logical_bytenr; /* logical byte num on disk */
+ u64 generation;
+ struct btrfs_disk_key disk_key; /* extra info to print in case of
+ * issues, will not always be correct */
+ struct list_head collision_resolving_node; /* list node */
+ struct list_head all_blocks_node; /* list node */
+
+ /* the following two lists contain block_link items */
+ struct list_head ref_to_list; /* list */
+ struct list_head ref_from_list; /* list */
+ struct btrfsic_block *next_in_same_bio;
+ void *orig_bio_bh_private;
+ union {
+ bio_end_io_t *bio;
+ bh_end_io_t *bh;
+ } orig_bio_bh_end_io;
+ int submit_bio_bh_rw;
+ u64 flush_gen; /* only valid if !never_written */
+};
+
+/*
+ * Elements of this type are allocated dynamically and required because
+ * each block object can refer to and can be ref from multiple blocks.
+ * The key to lookup them in the hashtable is the dev_bytenr of
+ * the block ref to plus the one from the block refered from.
+ * The fact that they are searchable via a hashtable and that a
+ * ref_cnt is maintained is not required for the btrfs integrity
+ * check algorithm itself, it is only used to make the output more
+ * beautiful in case that an error is detected (an error is defined
+ * as a write operation to a block while that block is still referenced).
+ */
+struct btrfsic_block_link {
+ u32 magic_num; /* only used for debug purposes */
+ u32 ref_cnt;
+ struct list_head node_ref_to; /* list node */
+ struct list_head node_ref_from; /* list node */
+ struct list_head collision_resolving_node; /* list node */
+ struct btrfsic_block *block_ref_to;
+ struct btrfsic_block *block_ref_from;
+ u64 parent_generation;
+};
+
+struct btrfsic_dev_state {
+ u32 magic_num; /* only used for debug purposes */
+ struct block_device *bdev;
+ struct btrfsic_state *state;
+ struct list_head collision_resolving_node; /* list node */
+ struct btrfsic_block dummy_block_for_bio_bh_flush;
+ u64 last_flush_gen;
+ char name[BDEVNAME_SIZE];
+};
+
+struct btrfsic_block_hashtable {
+ struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE];
+};
+
+struct btrfsic_block_link_hashtable {
+ struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE];
+};
+
+struct btrfsic_dev_state_hashtable {
+ struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE];
+};
+
+struct btrfsic_block_data_ctx {
+ u64 start; /* virtual bytenr */
+ u64 dev_bytenr; /* physical bytenr on device */
+ u32 len;
+ struct btrfsic_dev_state *dev;
+ char *data;
+ struct buffer_head *bh; /* do not use if set to NULL */
+};
+
+/* This structure is used to implement recursion without occupying
+ * any stack space, refer to btrfsic_process_metablock() */
+struct btrfsic_stack_frame {
+ u32 magic;
+ u32 nr;
+ int error;
+ int i;
+ int limit_nesting;
+ int num_copies;
+ int mirror_num;
+ struct btrfsic_block *block;
+ struct btrfsic_block_data_ctx *block_ctx;
+ struct btrfsic_block *next_block;
+ struct btrfsic_block_data_ctx next_block_ctx;
+ struct btrfs_header *hdr;
+ struct btrfsic_stack_frame *prev;
+};
+
+/* Some state per mounted filesystem */
+struct btrfsic_state {
+ u32 print_mask;
+ int include_extent_data;
+ int csum_size;
+ struct list_head all_blocks_list;
+ struct btrfsic_block_hashtable block_hashtable;
+ struct btrfsic_block_link_hashtable block_link_hashtable;
+ struct btrfs_root *root;
+ u64 max_superblock_generation;
+ struct btrfsic_block *latest_superblock;
+};
+
+static void btrfsic_block_init(struct btrfsic_block *b);
+static struct btrfsic_block *btrfsic_block_alloc(void);
+static void btrfsic_block_free(struct btrfsic_block *b);
+static void btrfsic_block_link_init(struct btrfsic_block_link *n);
+static struct btrfsic_block_link *btrfsic_block_link_alloc(void);
+static void btrfsic_block_link_free(struct btrfsic_block_link *n);
+static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds);
+static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void);
+static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds);
+static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h);
+static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
+ struct btrfsic_block_hashtable *h);
+static void btrfsic_block_hashtable_remove(struct btrfsic_block *b);
+static struct btrfsic_block *btrfsic_block_hashtable_lookup(
+ struct block_device *bdev,
+ u64 dev_bytenr,
+ struct btrfsic_block_hashtable *h);
+static void btrfsic_block_link_hashtable_init(
+ struct btrfsic_block_link_hashtable *h);
+static void btrfsic_block_link_hashtable_add(
+ struct btrfsic_block_link *l,
+ struct btrfsic_block_link_hashtable *h);
+static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l);
+static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
+ struct block_device *bdev_ref_to,
+ u64 dev_bytenr_ref_to,
+ struct block_device *bdev_ref_from,
+ u64 dev_bytenr_ref_from,
+ struct btrfsic_block_link_hashtable *h);
+static void btrfsic_dev_state_hashtable_init(
+ struct btrfsic_dev_state_hashtable *h);
+static void btrfsic_dev_state_hashtable_add(
+ struct btrfsic_dev_state *ds,
+ struct btrfsic_dev_state_hashtable *h);
+static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds);
+static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
+ struct block_device *bdev,
+ struct btrfsic_dev_state_hashtable *h);
+static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void);
+static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf);
+static int btrfsic_process_superblock(struct btrfsic_state *state,
+ struct btrfs_fs_devices *fs_devices);
+static int btrfsic_process_metablock(struct btrfsic_state *state,
+ struct btrfsic_block *block,
+ struct btrfsic_block_data_ctx *block_ctx,
+ struct btrfs_header *hdr,
+ int limit_nesting, int force_iodone_flag);
+static int btrfsic_create_link_to_next_block(
+ struct btrfsic_state *state,
+ struct btrfsic_block *block,
+ struct btrfsic_block_data_ctx
+ *block_ctx, u64 next_bytenr,
+ int limit_nesting,
+ struct btrfsic_block_data_ctx *next_block_ctx,
+ struct btrfsic_block **next_blockp,
+ int force_iodone_flag,
+ int *num_copiesp, int *mirror_nump,
+ struct btrfs_disk_key *disk_key,
+ u64 parent_generation);
+static int btrfsic_handle_extent_data(struct btrfsic_state *state,
+ struct btrfsic_block *block,
+ struct btrfsic_block_data_ctx *block_ctx,
+ u32 item_offset, int force_iodone_flag);
+static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
+ struct btrfsic_block_data_ctx *block_ctx_out,
+ int mirror_num);
+static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
+ u32 len, struct block_device *bdev,
+ struct btrfsic_block_data_ctx *block_ctx_out);
+static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
+static int btrfsic_read_block(struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *block_ctx);
+static void btrfsic_dump_database(struct btrfsic_state *state);
+static int btrfsic_test_for_metadata(struct btrfsic_state *state,
+ const u8 *data, unsigned int size);
+static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
+ u64 dev_bytenr, u8 *mapped_data,
+ unsigned int len, struct bio *bio,
+ int *bio_is_patched,
+ struct buffer_head *bh,
+ int submit_bio_bh_rw);
+static int btrfsic_process_written_superblock(
+ struct btrfsic_state *state,
+ struct btrfsic_block *const block,
+ struct btrfs_super_block *const super_hdr);
+static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status);
+static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate);
+static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
+ const struct btrfsic_block *block,
+ int recursion_level);
+static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
+ struct btrfsic_block *const block,
+ int recursion_level);
+static void btrfsic_print_add_link(const struct btrfsic_state *state,
+ const struct btrfsic_block_link *l);
+static void btrfsic_print_rem_link(const struct btrfsic_state *state,
+ const struct btrfsic_block_link *l);
+static char btrfsic_get_block_type(const struct btrfsic_state *state,
+ const struct btrfsic_block *block);
+static void btrfsic_dump_tree(const struct btrfsic_state *state);
+static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
+ const struct btrfsic_block *block,
+ int indent_level);
+static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
+ struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *next_block_ctx,
+ struct btrfsic_block *next_block,
+ struct btrfsic_block *from_block,
+ u64 parent_generation);
+static struct btrfsic_block *btrfsic_block_lookup_or_add(
+ struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *block_ctx,
+ const char *additional_string,
+ int is_metadata,
+ int is_iodone,
+ int never_written,
+ int mirror_num,
+ int *was_created);
+static int btrfsic_process_superblock_dev_mirror(
+ struct btrfsic_state *state,
+ struct btrfsic_dev_state *dev_state,
+ struct btrfs_device *device,
+ int superblock_mirror_num,
+ struct btrfsic_dev_state **selected_dev_state,
+ struct btrfs_super_block *selected_super);
+static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
+ struct block_device *bdev);
+static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
+ u64 bytenr,
+ struct btrfsic_dev_state *dev_state,
+ u64 dev_bytenr, char *data);
+
+static struct mutex btrfsic_mutex;
+static int btrfsic_is_initialized;
+static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable;
+
+
+static void btrfsic_block_init(struct btrfsic_block *b)
+{
+ b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER;
+ b->dev_state = NULL;
+ b->dev_bytenr = 0;
+ b->logical_bytenr = 0;
+ b->generation = BTRFSIC_GENERATION_UNKNOWN;
+ b->disk_key.objectid = 0;
+ b->disk_key.type = 0;
+ b->disk_key.offset = 0;
+ b->is_metadata = 0;
+ b->is_superblock = 0;
+ b->is_iodone = 0;
+ b->iodone_w_error = 0;
+ b->never_written = 0;
+ b->mirror_num = 0;
+ b->next_in_same_bio = NULL;
+ b->orig_bio_bh_private = NULL;
+ b->orig_bio_bh_end_io.bio = NULL;
+ INIT_LIST_HEAD(&b->collision_resolving_node);
+ INIT_LIST_HEAD(&b->all_blocks_node);
+ INIT_LIST_HEAD(&b->ref_to_list);
+ INIT_LIST_HEAD(&b->ref_from_list);
+ b->submit_bio_bh_rw = 0;
+ b->flush_gen = 0;
+}
+
+static struct btrfsic_block *btrfsic_block_alloc(void)
+{
+ struct btrfsic_block *b;
+
+ b = kzalloc(sizeof(*b), GFP_NOFS);
+ if (NULL != b)
+ btrfsic_block_init(b);
+
+ return b;
+}
+
+static void btrfsic_block_free(struct btrfsic_block *b)
+{
+ BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num));
+ kfree(b);
+}
+
+static void btrfsic_block_link_init(struct btrfsic_block_link *l)
+{
+ l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER;
+ l->ref_cnt = 1;
+ INIT_LIST_HEAD(&l->node_ref_to);
+ INIT_LIST_HEAD(&l->node_ref_from);
+ INIT_LIST_HEAD(&l->collision_resolving_node);
+ l->block_ref_to = NULL;
+ l->block_ref_from = NULL;
+}
+
+static struct btrfsic_block_link *btrfsic_block_link_alloc(void)
+{
+ struct btrfsic_block_link *l;
+
+ l = kzalloc(sizeof(*l), GFP_NOFS);
+ if (NULL != l)
+ btrfsic_block_link_init(l);
+
+ return l;
+}
+
+static void btrfsic_block_link_free(struct btrfsic_block_link *l)
+{
+ BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num));
+ kfree(l);
+}
+
+static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds)
+{
+ ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER;
+ ds->bdev = NULL;
+ ds->state = NULL;
+ ds->name[0] = '\0';
+ INIT_LIST_HEAD(&ds->collision_resolving_node);
+ ds->last_flush_gen = 0;
+ btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush);
+ ds->dummy_block_for_bio_bh_flush.is_iodone = 1;
+ ds->dummy_block_for_bio_bh_flush.dev_state = ds;
+}
+
+static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void)
+{
+ struct btrfsic_dev_state *ds;
+
+ ds = kzalloc(sizeof(*ds), GFP_NOFS);
+ if (NULL != ds)
+ btrfsic_dev_state_init(ds);
+
+ return ds;
+}
+
+static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds)
+{
+ BUG_ON(!(NULL == ds ||
+ BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num));
+ kfree(ds);
+}
+
+static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h)
+{
+ int i;
+
+ for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++)
+ INIT_LIST_HEAD(h->table + i);
+}
+
+static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
+ struct btrfsic_block_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)(b->dev_bytenr >> 16)) ^
+ ((unsigned int)((uintptr_t)b->dev_state->bdev))) &
+ (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
+
+ list_add(&b->collision_resolving_node, h->table + hashval);
+}
+
+static void btrfsic_block_hashtable_remove(struct btrfsic_block *b)
+{
+ list_del(&b->collision_resolving_node);
+}
+
+static struct btrfsic_block *btrfsic_block_hashtable_lookup(
+ struct block_device *bdev,
+ u64 dev_bytenr,
+ struct btrfsic_block_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)(dev_bytenr >> 16)) ^
+ ((unsigned int)((uintptr_t)bdev))) &
+ (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
+ struct list_head *elem;
+
+ list_for_each(elem, h->table + hashval) {
+ struct btrfsic_block *const b =
+ list_entry(elem, struct btrfsic_block,
+ collision_resolving_node);
+
+ if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr)
+ return b;
+ }
+
+ return NULL;
+}
+
+static void btrfsic_block_link_hashtable_init(
+ struct btrfsic_block_link_hashtable *h)
+{
+ int i;
+
+ for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++)
+ INIT_LIST_HEAD(h->table + i);
+}
+
+static void btrfsic_block_link_hashtable_add(
+ struct btrfsic_block_link *l,
+ struct btrfsic_block_link_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^
+ ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^
+ ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^
+ ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev)))
+ & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
+
+ BUG_ON(NULL == l->block_ref_to);
+ BUG_ON(NULL == l->block_ref_from);
+ list_add(&l->collision_resolving_node, h->table + hashval);
+}
+
+static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l)
+{
+ list_del(&l->collision_resolving_node);
+}
+
+static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
+ struct block_device *bdev_ref_to,
+ u64 dev_bytenr_ref_to,
+ struct block_device *bdev_ref_from,
+ u64 dev_bytenr_ref_from,
+ struct btrfsic_block_link_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)(dev_bytenr_ref_to >> 16)) ^
+ ((unsigned int)(dev_bytenr_ref_from >> 16)) ^
+ ((unsigned int)((uintptr_t)bdev_ref_to)) ^
+ ((unsigned int)((uintptr_t)bdev_ref_from))) &
+ (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
+ struct list_head *elem;
+
+ list_for_each(elem, h->table + hashval) {
+ struct btrfsic_block_link *const l =
+ list_entry(elem, struct btrfsic_block_link,
+ collision_resolving_node);
+
+ BUG_ON(NULL == l->block_ref_to);
+ BUG_ON(NULL == l->block_ref_from);
+ if (l->block_ref_to->dev_state->bdev == bdev_ref_to &&
+ l->block_ref_to->dev_bytenr == dev_bytenr_ref_to &&
+ l->block_ref_from->dev_state->bdev == bdev_ref_from &&
+ l->block_ref_from->dev_bytenr == dev_bytenr_ref_from)
+ return l;
+ }
+
+ return NULL;
+}
+
+static void btrfsic_dev_state_hashtable_init(
+ struct btrfsic_dev_state_hashtable *h)
+{
+ int i;
+
+ for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++)
+ INIT_LIST_HEAD(h->table + i);
+}
+
+static void btrfsic_dev_state_hashtable_add(
+ struct btrfsic_dev_state *ds,
+ struct btrfsic_dev_state_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)((uintptr_t)ds->bdev)) &
+ (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
+
+ list_add(&ds->collision_resolving_node, h->table + hashval);
+}
+
+static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds)
+{
+ list_del(&ds->collision_resolving_node);
+}
+
+static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
+ struct block_device *bdev,
+ struct btrfsic_dev_state_hashtable *h)
+{
+ const unsigned int hashval =
+ (((unsigned int)((uintptr_t)bdev)) &
+ (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
+ struct list_head *elem;
+
+ list_for_each(elem, h->table + hashval) {
+ struct btrfsic_dev_state *const ds =
+ list_entry(elem, struct btrfsic_dev_state,
+ collision_resolving_node);
+
+ if (ds->bdev == bdev)
+ return ds;
+ }
+
+ return NULL;
+}
+
+static int btrfsic_process_superblock(struct btrfsic_state *state,
+ struct btrfs_fs_devices *fs_devices)
+{
+ int ret;
+ struct btrfs_super_block *selected_super;
+ struct list_head *dev_head = &fs_devices->devices;
+ struct btrfs_device *device;
+ struct btrfsic_dev_state *selected_dev_state = NULL;
+ int pass;
+
+ BUG_ON(NULL == state);
+ selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS);
+ if (NULL == selected_super) {
+ printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+ return -1;
+ }
+
+ list_for_each_entry(device, dev_head, dev_list) {
+ int i;
+ struct btrfsic_dev_state *dev_state;
+
+ if (!device->bdev || !device->name)
+ continue;
+
+ dev_state = btrfsic_dev_state_lookup(device->bdev);
+ BUG_ON(NULL == dev_state);
+ for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+ ret = btrfsic_process_superblock_dev_mirror(
+ state, dev_state, device, i,
+ &selected_dev_state, selected_super);
+ if (0 != ret && 0 == i) {
+ kfree(selected_super);
+ return ret;
+ }
+ }
+ }
+
+ if (NULL == state->latest_superblock) {
+ printk(KERN_INFO "btrfsic: no superblock found!\n");
+ kfree(selected_super);
+ return -1;
+ }
+
+ state->csum_size = btrfs_super_csum_size(selected_super);
+
+ for (pass = 0; pass < 3; pass++) {
+ int num_copies;
+ int mirror_num;
+ u64 next_bytenr;
+
+ switch (pass) {
+ case 0:
+ next_bytenr = btrfs_super_root(selected_super);
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ printk(KERN_INFO "root@%llu\n",
+ (unsigned long long)next_bytenr);
+ break;
+ case 1:
+ next_bytenr = btrfs_super_chunk_root(selected_super);
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ printk(KERN_INFO "chunk@%llu\n",
+ (unsigned long long)next_bytenr);
+ break;
+ case 2:
+ next_bytenr = btrfs_super_log_root(selected_super);
+ if (0 == next_bytenr)
+ continue;
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ printk(KERN_INFO "log@%llu\n",
+ (unsigned long long)next_bytenr);
+ break;
+ }
+
+ num_copies =
+ btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ next_bytenr, PAGE_SIZE);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+ printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+ (unsigned long long)next_bytenr, num_copies);
+
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ struct btrfsic_block *next_block;
+ struct btrfsic_block_data_ctx tmp_next_block_ctx;
+ struct btrfsic_block_link *l;
+ struct btrfs_header *hdr;
+
+ ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+ &tmp_next_block_ctx,
+ mirror_num);
+ if (ret) {
+ printk(KERN_INFO "btrfsic:"
+ " btrfsic_map_block(root @%llu,"
+ " mirror %d) failed!\n",
+ (unsigned long long)next_bytenr,
+ mirror_num);
+ kfree(selected_super);
+ return -1;
+ }
+
+ next_block = btrfsic_block_hashtable_lookup(
+ tmp_next_block_ctx.dev->bdev,
+ tmp_next_block_ctx.dev_bytenr,
+ &state->block_hashtable);
+ BUG_ON(NULL == next_block);
+
+ l = btrfsic_block_link_hashtable_lookup(
+ tmp_next_block_ctx.dev->bdev,
+ tmp_next_block_ctx.dev_bytenr,
+ state->latest_superblock->dev_state->
+ bdev,
+ state->latest_superblock->dev_bytenr,
+ &state->block_link_hashtable);
+ BUG_ON(NULL == l);
+
+ ret = btrfsic_read_block(state, &tmp_next_block_ctx);
+ if (ret < (int)BTRFSIC_BLOCK_SIZE) {
+ printk(KERN_INFO
+ "btrfsic: read @logical %llu failed!\n",
+ (unsigned long long)
+ tmp_next_block_ctx.start);
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ kfree(selected_super);
+ return -1;
+ }
+
+ hdr = (struct btrfs_header *)tmp_next_block_ctx.data;
+ ret = btrfsic_process_metablock(state,
+ next_block,
+ &tmp_next_block_ctx,
+ hdr,
+ BTRFS_MAX_LEVEL + 3, 1);
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ }
+ }
+
+ kfree(selected_super);
+ return ret;
+}
+
+static int btrfsic_process_superblock_dev_mirror(
+ struct btrfsic_state *state,
+ struct btrfsic_dev_state *dev_state,
+ struct btrfs_device *device,
+ int superblock_mirror_num,
+ struct btrfsic_dev_state **selected_dev_state,
+ struct btrfs_super_block *selected_super)
+{
+ struct btrfs_super_block *super_tmp;
+ u64 dev_bytenr;
+ struct buffer_head *bh;
+ struct btrfsic_block *superblock_tmp;
+ int pass;
+ struct block_device *const superblock_bdev = device->bdev;
+
+ /* super block bytenr is always the unmapped device bytenr */
+ dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
+ bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096);
+ if (NULL == bh)
+ return -1;
+ super_tmp = (struct btrfs_super_block *)
+ (bh->b_data + (dev_bytenr & 4095));
+
+ if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
+ strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC,
+ sizeof(super_tmp->magic)) ||
+ memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) {
+ brelse(bh);
+ return 0;
+ }
+
+ superblock_tmp =
+ btrfsic_block_hashtable_lookup(superblock_bdev,
+ dev_bytenr,
+ &state->block_hashtable);
+ if (NULL == superblock_tmp) {
+ superblock_tmp = btrfsic_block_alloc();
+ if (NULL == superblock_tmp) {
+ printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+ brelse(bh);
+ return -1;
+ }
+ /* for superblock, only the dev_bytenr makes sense */
+ superblock_tmp->dev_bytenr = dev_bytenr;
+ superblock_tmp->dev_state = dev_state;
+ superblock_tmp->logical_bytenr = dev_bytenr;
+ superblock_tmp->generation = btrfs_super_generation(super_tmp);
+ superblock_tmp->is_metadata = 1;
+ superblock_tmp->is_superblock = 1;
+ superblock_tmp->is_iodone = 1;
+ superblock_tmp->never_written = 0;
+ superblock_tmp->mirror_num = 1 + superblock_mirror_num;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
+ printk(KERN_INFO "New initial S-block (bdev %p, %s)"
+ " @%llu (%s/%llu/%d)\n",
+ superblock_bdev, device->name,
+ (unsigned long long)dev_bytenr,
+ dev_state->name,
+ (unsigned long long)dev_bytenr,
+ superblock_mirror_num);
+ list_add(&superblock_tmp->all_blocks_node,
+ &state->all_blocks_list);
+ btrfsic_block_hashtable_add(superblock_tmp,
+ &state->block_hashtable);
+ }
+
+ /* select the one with the highest generation field */
+ if (btrfs_super_generation(super_tmp) >
+ state->max_superblock_generation ||
+ 0 == state->max_superblock_generation) {
+ memcpy(selected_super, super_tmp, sizeof(*selected_super));
+ *selected_dev_state = dev_state;
+ state->max_superblock_generation =
+ btrfs_super_generation(super_tmp);
+ state->latest_superblock = superblock_tmp;
+ }
+
+ for (pass = 0; pass < 3; pass++) {
+ u64 next_bytenr;
+ int num_copies;
+ int mirror_num;
+ const char *additional_string = NULL;
+ struct btrfs_disk_key tmp_disk_key;
+
+ tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
+ tmp_disk_key.offset = 0;
+ switch (pass) {
+ case 0:
+ tmp_disk_key.objectid =
+ cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID);
+ additional_string = "initial root ";
+ next_bytenr = btrfs_super_root(super_tmp);
+ break;
+ case 1:
+ tmp_disk_key.objectid =
+ cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID);
+ additional_string = "initial chunk ";
+ next_bytenr = btrfs_super_chunk_root(super_tmp);
+ break;
+ case 2:
+ tmp_disk_key.objectid =
+ cpu_to_le64(BTRFS_TREE_LOG_OBJECTID);
+ additional_string = "initial log ";
+ next_bytenr = btrfs_super_log_root(super_tmp);
+ if (0 == next_bytenr)
+ continue;
+ break;
+ }
+
+ num_copies =
+ btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ next_bytenr, PAGE_SIZE);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+ printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+ (unsigned long long)next_bytenr, num_copies);
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ struct btrfsic_block *next_block;
+ struct btrfsic_block_data_ctx tmp_next_block_ctx;
+ struct btrfsic_block_link *l;
+
+ if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+ &tmp_next_block_ctx,
+ mirror_num)) {
+ printk(KERN_INFO "btrfsic: btrfsic_map_block("
+ "bytenr @%llu, mirror %d) failed!\n",
+ (unsigned long long)next_bytenr,
+ mirror_num);
+ brelse(bh);
+ return -1;
+ }
+
+ next_block = btrfsic_block_lookup_or_add(
+ state, &tmp_next_block_ctx,
+ additional_string, 1, 1, 0,
+ mirror_num, NULL);
+ if (NULL == next_block) {
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ brelse(bh);
+ return -1;
+ }
+
+ next_block->disk_key = tmp_disk_key;
+ next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
+ l = btrfsic_block_link_lookup_or_add(
+ state, &tmp_next_block_ctx,
+ next_block, superblock_tmp,
+ BTRFSIC_GENERATION_UNKNOWN);
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ if (NULL == l) {
+ brelse(bh);
+ return -1;
+ }
+ }
+ }
+ if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES)
+ btrfsic_dump_tree_sub(state, superblock_tmp, 0);
+
+ brelse(bh);
+ return 0;
+}
+
+static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void)
+{
+ struct btrfsic_stack_frame *sf;
+
+ sf = kzalloc(sizeof(*sf), GFP_NOFS);
+ if (NULL == sf)
+ printk(KERN_INFO "btrfsic: alloc memory failed!\n");
+ else
+ sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER;
+ return sf;
+}
+
+static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf)
+{
+ BUG_ON(!(NULL == sf ||
+ BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic));
+ kfree(sf);
+}
+
+static int btrfsic_process_metablock(
+ struct btrfsic_state *state,
+ struct btrfsic_block *const first_block,
+ struct btrfsic_block_data_ctx *const first_block_ctx,
+ struct btrfs_header *const first_hdr,
+ int first_limit_nesting, int force_iodone_flag)
+{
+ struct btrfsic_stack_frame initial_stack_frame = { 0 };
+ struct btrfsic_stack_frame *sf;
+ struct btrfsic_stack_frame *next_stack;
+
+ sf = &initial_stack_frame;
+ sf->error = 0;
+ sf->i = -1;
+ sf->limit_nesting = first_limit_nesting;
+ sf->block = first_block;
+ sf->block_ctx = first_block_ctx;
+ sf->next_block = NULL;
+ sf->hdr = first_hdr;
+ sf->prev = NULL;
+
+continue_with_new_stack_frame:
+ sf->block->generation = le64_to_cpu(sf->hdr->generation);
+ if (0 == sf->hdr->level) {
+ struct btrfs_leaf *const leafhdr =
+ (struct btrfs_leaf *)sf->hdr;
+
+ if (-1 == sf->i) {
+ sf->nr = le32_to_cpu(leafhdr->header.nritems);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "leaf %llu items %d generation %llu"
+ " owner %llu\n",
+ (unsigned long long)
+ sf->block_ctx->start,
+ sf->nr,
+ (unsigned long long)
+ le64_to_cpu(leafhdr->header.generation),
+ (unsigned long long)
+ le64_to_cpu(leafhdr->header.owner));
+ }
+
+continue_with_current_leaf_stack_frame:
+ if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
+ sf->i++;
+ sf->num_copies = 0;
+ }
+
+ if (sf->i < sf->nr) {
+ struct btrfs_item *disk_item = leafhdr->items + sf->i;
+ struct btrfs_disk_key *disk_key = &disk_item->key;
+ u8 type;
+ const u32 item_offset = le32_to_cpu(disk_item->offset);
+
+ type = disk_key->type;
+
+ if (BTRFS_ROOT_ITEM_KEY == type) {
+ const struct btrfs_root_item *const root_item =
+ (struct btrfs_root_item *)
+ (sf->block_ctx->data +
+ offsetof(struct btrfs_leaf, items) +
+ item_offset);
+ const u64 next_bytenr =
+ le64_to_cpu(root_item->bytenr);
+
+ sf->error =
+ btrfsic_create_link_to_next_block(
+ state,
+ sf->block,
+ sf->block_ctx,
+ next_bytenr,
+ sf->limit_nesting,
+ &sf->next_block_ctx,
+ &sf->next_block,
+ force_iodone_flag,
+ &sf->num_copies,
+ &sf->mirror_num,
+ disk_key,
+ le64_to_cpu(root_item->
+ generation));
+ if (sf->error)
+ goto one_stack_frame_backwards;
+
+ if (NULL != sf->next_block) {
+ struct btrfs_header *const next_hdr =
+ (struct btrfs_header *)
+ sf->next_block_ctx.data;
+
+ next_stack =
+ btrfsic_stack_frame_alloc();
+ if (NULL == next_stack) {
+ btrfsic_release_block_ctx(
+ &sf->
+ next_block_ctx);
+ goto one_stack_frame_backwards;
+ }
+
+ next_stack->i = -1;
+ next_stack->block = sf->next_block;
+ next_stack->block_ctx =
+ &sf->next_block_ctx;
+ next_stack->next_block = NULL;
+ next_stack->hdr = next_hdr;
+ next_stack->limit_nesting =
+ sf->limit_nesting - 1;
+ next_stack->prev = sf;
+ sf = next_stack;
+ goto continue_with_new_stack_frame;
+ }
+ } else if (BTRFS_EXTENT_DATA_KEY == type &&
+ state->include_extent_data) {
+ sf->error = btrfsic_handle_extent_data(
+ state,
+ sf->block,
+ sf->block_ctx,
+ item_offset,
+ force_iodone_flag);
+ if (sf->error)
+ goto one_stack_frame_backwards;
+ }
+
+ goto continue_with_current_leaf_stack_frame;
+ }
+ } else {
+ struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr;
+
+ if (-1 == sf->i) {
+ sf->nr = le32_to_cpu(nodehdr->header.nritems);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO "node %llu level %d items %d"
+ " generation %llu owner %llu\n",
+ (unsigned long long)
+ sf->block_ctx->start,
+ nodehdr->header.level, sf->nr,
+ (unsigned long long)
+ le64_to_cpu(nodehdr->header.generation),
+ (unsigned long long)
+ le64_to_cpu(nodehdr->header.owner));
+ }
+
+continue_with_current_node_stack_frame:
+ if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
+ sf->i++;
+ sf->num_copies = 0;
+ }
+
+ if (sf->i < sf->nr) {
+ struct btrfs_key_ptr *disk_key_ptr =
+ nodehdr->ptrs + sf->i;
+ const u64 next_bytenr =
+ le64_to_cpu(disk_key_ptr->blockptr);
+
+ sf->error = btrfsic_create_link_to_next_block(
+ state,
+ sf->block,
+ sf->block_ctx,
+ next_bytenr,
+ sf->limit_nesting,
+ &sf->next_block_ctx,
+ &sf->next_block,
+ force_iodone_flag,
+ &sf->num_copies,
+ &sf->mirror_num,
+ &disk_key_ptr->key,
+ le64_to_cpu(disk_key_ptr->generation));
+ if (sf->error)
+ goto one_stack_frame_backwards;
+
+ if (NULL != sf->next_block) {
+ struct btrfs_header *const next_hdr =
+ (struct btrfs_header *)
+ sf->next_block_ctx.data;
+
+ next_stack = btrfsic_stack_frame_alloc();
+ if (NULL == next_stack)
+ goto one_stack_frame_backwards;
+
+ next_stack->i = -1;
+ next_stack->block = sf->next_block;
+ next_stack->block_ctx = &sf->next_block_ctx;
+ next_stack->next_block = NULL;
+ next_stack->hdr = next_hdr;
+ next_stack->limit_nesting =
+ sf->limit_nesting - 1;
+ next_stack->prev = sf;
+ sf = next_stack;
+ goto continue_with_new_stack_frame;
+ }
+
+ goto continue_with_current_node_stack_frame;
+ }
+ }
+
+one_stack_frame_backwards:
+ if (NULL != sf->prev) {
+ struct btrfsic_stack_frame *const prev = sf->prev;
+
+ /* the one for the initial block is freed in the caller */
+ btrfsic_release_block_ctx(sf->block_ctx);
+
+ if (sf->error) {
+ prev->error = sf->error;
+ btrfsic_stack_frame_free(sf);
+ sf = prev;
+ goto one_stack_frame_backwards;
+ }
+
+ btrfsic_stack_frame_free(sf);
+ sf = prev;
+ goto continue_with_new_stack_frame;
+ } else {
+ BUG_ON(&initial_stack_frame != sf);
+ }
+
+ return sf->error;
+}
+
+static int btrfsic_create_link_to_next_block(
+ struct btrfsic_state *state,
+ struct btrfsic_block *block,
+ struct btrfsic_block_data_ctx *block_ctx,
+ u64 next_bytenr,
+ int limit_nesting,
+ struct btrfsic_block_data_ctx *next_block_ctx,
+ struct btrfsic_block **next_blockp,
+ int force_iodone_flag,
+ int *num_copiesp, int *mirror_nump,
+ struct btrfs_disk_key *disk_key,
+ u64 parent_generation)
+{
+ struct btrfsic_block *next_block = NULL;
+ int ret;
+ struct btrfsic_block_link *l;
+ int did_alloc_block_link;
+ int block_was_created;
+
+ *next_blockp = NULL;
+ if (0 == *num_copiesp) {
+ *num_copiesp =
+ btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ next_bytenr, PAGE_SIZE);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+ printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+ (unsigned long long)next_bytenr, *num_copiesp);
+ *mirror_nump = 1;
+ }
+
+ if (*mirror_nump > *num_copiesp)
+ return 0;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "btrfsic_create_link_to_next_block(mirror_num=%d)\n",
+ *mirror_nump);
+ ret = btrfsic_map_block(state, next_bytenr,
+ BTRFSIC_BLOCK_SIZE,
+ next_block_ctx, *mirror_nump);
+ if (ret) {
+ printk(KERN_INFO
+ "btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
+ (unsigned long long)next_bytenr, *mirror_nump);
+ btrfsic_release_block_ctx(next_block_ctx);
+ *next_blockp = NULL;
+ return -1;
+ }
+
+ next_block = btrfsic_block_lookup_or_add(state,
+ next_block_ctx, "referenced ",
+ 1, force_iodone_flag,
+ !force_iodone_flag,
+ *mirror_nump,
+ &block_was_created);
+ if (NULL == next_block) {
+ btrfsic_release_block_ctx(next_block_ctx);
+ *next_blockp = NULL;
+ return -1;
+ }
+ if (block_was_created) {
+ l = NULL;
+ next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
+ } else {
+ if (next_block->logical_bytenr != next_bytenr &&
+ !(!next_block->is_metadata &&
+ 0 == next_block->logical_bytenr)) {
+ printk(KERN_INFO
+ "Referenced block @%llu (%s/%llu/%d)"
+ " found in hash table, %c,"
+ " bytenr mismatch (!= stored %llu).\n",
+ (unsigned long long)next_bytenr,
+ next_block_ctx->dev->name,
+ (unsigned long long)next_block_ctx->dev_bytenr,
+ *mirror_nump,
+ btrfsic_get_block_type(state, next_block),
+ (unsigned long long)next_block->logical_bytenr);
+ } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "Referenced block @%llu (%s/%llu/%d)"
+ " found in hash table, %c.\n",
+ (unsigned long long)next_bytenr,
+ next_block_ctx->dev->name,
+ (unsigned long long)next_block_ctx->dev_bytenr,
+ *mirror_nump,
+ btrfsic_get_block_type(state, next_block));
+ next_block->logical_bytenr = next_bytenr;
+
+ next_block->mirror_num = *mirror_nump;
+ l = btrfsic_block_link_hashtable_lookup(
+ next_block_ctx->dev->bdev,
+ next_block_ctx->dev_bytenr,
+ block_ctx->dev->bdev,
+ block_ctx->dev_bytenr,
+ &state->block_link_hashtable);
+ }
+
+ next_block->disk_key = *disk_key;
+ if (NULL == l) {
+ l = btrfsic_block_link_alloc();
+ if (NULL == l) {
+ printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+ btrfsic_release_block_ctx(next_block_ctx);
+ *next_blockp = NULL;
+ return -1;
+ }
+
+ did_alloc_block_link = 1;
+ l->block_ref_to = next_block;
+ l->block_ref_from = block;
+ l->ref_cnt = 1;
+ l->parent_generation = parent_generation;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_add_link(state, l);
+
+ list_add(&l->node_ref_to, &block->ref_to_list);
+ list_add(&l->node_ref_from, &next_block->ref_from_list);
+
+ btrfsic_block_link_hashtable_add(l,
+ &state->block_link_hashtable);
+ } else {
+ did_alloc_block_link = 0;
+ if (0 == limit_nesting) {
+ l->ref_cnt++;
+ l->parent_generation = parent_generation;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_add_link(state, l);
+ }
+ }
+
+ if (limit_nesting > 0 && did_alloc_block_link) {
+ ret = btrfsic_read_block(state, next_block_ctx);
+ if (ret < (int)BTRFSIC_BLOCK_SIZE) {
+ printk(KERN_INFO
+ "btrfsic: read block @logical %llu failed!\n",
+ (unsigned long long)next_bytenr);
+ btrfsic_release_block_ctx(next_block_ctx);
+ *next_blockp = NULL;
+ return -1;
+ }
+
+ *next_blockp = next_block;
+ } else {
+ *next_blockp = NULL;
+ }
+ (*mirror_nump)++;
+
+ return 0;
+}
+
+static int btrfsic_handle_extent_data(
+ struct btrfsic_state *state,
+ struct btrfsic_block *block,
+ struct btrfsic_block_data_ctx *block_ctx,
+ u32 item_offset, int force_iodone_flag)
+{
+ int ret;
+ struct btrfs_file_extent_item *file_extent_item =
+ (struct btrfs_file_extent_item *)(block_ctx->data +
+ offsetof(struct btrfs_leaf,
+ items) + item_offset);
+ u64 next_bytenr =
+ le64_to_cpu(file_extent_item->disk_bytenr) +
+ le64_to_cpu(file_extent_item->offset);
+ u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes);
+ u64 generation = le64_to_cpu(file_extent_item->generation);
+ struct btrfsic_block_link *l;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+ printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu,"
+ " offset = %llu, num_bytes = %llu\n",
+ file_extent_item->type,
+ (unsigned long long)
+ le64_to_cpu(file_extent_item->disk_bytenr),
+ (unsigned long long)
+ le64_to_cpu(file_extent_item->offset),
+ (unsigned long long)
+ le64_to_cpu(file_extent_item->num_bytes));
+ if (BTRFS_FILE_EXTENT_REG != file_extent_item->type ||
+ ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr))
+ return 0;
+ while (num_bytes > 0) {
+ u32 chunk_len;
+ int num_copies;
+ int mirror_num;
+
+ if (num_bytes > BTRFSIC_BLOCK_SIZE)
+ chunk_len = BTRFSIC_BLOCK_SIZE;
+ else
+ chunk_len = num_bytes;
+
+ num_copies =
+ btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ next_bytenr, PAGE_SIZE);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+ printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+ (unsigned long long)next_bytenr, num_copies);
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ struct btrfsic_block_data_ctx next_block_ctx;
+ struct btrfsic_block *next_block;
+ int block_was_created;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO "btrfsic_handle_extent_data("
+ "mirror_num=%d)\n", mirror_num);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+ printk(KERN_INFO
+ "\tdisk_bytenr = %llu, num_bytes %u\n",
+ (unsigned long long)next_bytenr,
+ chunk_len);
+ ret = btrfsic_map_block(state, next_bytenr,
+ chunk_len, &next_block_ctx,
+ mirror_num);
+ if (ret) {
+ printk(KERN_INFO
+ "btrfsic: btrfsic_map_block(@%llu,"
+ " mirror=%d) failed!\n",
+ (unsigned long long)next_bytenr,
+ mirror_num);
+ return -1;
+ }
+
+ next_block = btrfsic_block_lookup_or_add(
+ state,
+ &next_block_ctx,
+ "referenced ",
+ 0,
+ force_iodone_flag,
+ !force_iodone_flag,
+ mirror_num,
+ &block_was_created);
+ if (NULL == next_block) {
+ printk(KERN_INFO
+ "btrfsic: error, kmalloc failed!\n");
+ btrfsic_release_block_ctx(&next_block_ctx);
+ return -1;
+ }
+ if (!block_was_created) {
+ if (next_block->logical_bytenr != next_bytenr &&
+ !(!next_block->is_metadata &&
+ 0 == next_block->logical_bytenr)) {
+ printk(KERN_INFO
+ "Referenced block"
+ " @%llu (%s/%llu/%d)"
+ " found in hash table, D,"
+ " bytenr mismatch"
+ " (!= stored %llu).\n",
+ (unsigned long long)next_bytenr,
+ next_block_ctx.dev->name,
+ (unsigned long long)
+ next_block_ctx.dev_bytenr,
+ mirror_num,
+ (unsigned long long)
+ next_block->logical_bytenr);
+ }
+ next_block->logical_bytenr = next_bytenr;
+ next_block->mirror_num = mirror_num;
+ }
+
+ l = btrfsic_block_link_lookup_or_add(state,
+ &next_block_ctx,
+ next_block, block,
+ generation);
+ btrfsic_release_block_ctx(&next_block_ctx);
+ if (NULL == l)
+ return -1;
+ }
+
+ next_bytenr += chunk_len;
+ num_bytes -= chunk_len;
+ }
+
+ return 0;
+}
+
+static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
+ struct btrfsic_block_data_ctx *block_ctx_out,
+ int mirror_num)
+{
+ int ret;
+ u64 length;
+ struct btrfs_bio *multi = NULL;
+ struct btrfs_device *device;
+
+ length = len;
+ ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ,
+ bytenr, &length, &multi, mirror_num);
+
+ device = multi->stripes[0].dev;
+ block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev);
+ block_ctx_out->dev_bytenr = multi->stripes[0].physical;
+ block_ctx_out->start = bytenr;
+ block_ctx_out->len = len;
+ block_ctx_out->data = NULL;
+ block_ctx_out->bh = NULL;
+
+ if (0 == ret)
+ kfree(multi);
+ if (NULL == block_ctx_out->dev) {
+ ret = -ENXIO;
+ printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n");
+ }
+
+ return ret;
+}
+
+static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
+ u32 len, struct block_device *bdev,
+ struct btrfsic_block_data_ctx *block_ctx_out)
+{
+ block_ctx_out->dev = btrfsic_dev_state_lookup(bdev);
+ block_ctx_out->dev_bytenr = bytenr;
+ block_ctx_out->start = bytenr;
+ block_ctx_out->len = len;
+ block_ctx_out->data = NULL;
+ block_ctx_out->bh = NULL;
+ if (NULL != block_ctx_out->dev) {
+ return 0;
+ } else {
+ printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n");
+ return -ENXIO;
+ }
+}
+
+static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
+{
+ if (NULL != block_ctx->bh) {
+ brelse(block_ctx->bh);
+ block_ctx->bh = NULL;
+ }
+}
+
+static int btrfsic_read_block(struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *block_ctx)
+{
+ block_ctx->bh = NULL;
+ if (block_ctx->dev_bytenr & 4095) {
+ printk(KERN_INFO
+ "btrfsic: read_block() with unaligned bytenr %llu\n",
+ (unsigned long long)block_ctx->dev_bytenr);
+ return -1;
+ }
+ if (block_ctx->len > 4096) {
+ printk(KERN_INFO
+ "btrfsic: read_block() with too huge size %d\n",
+ block_ctx->len);
+ return -1;
+ }
+
+ block_ctx->bh = __bread(block_ctx->dev->bdev,
+ block_ctx->dev_bytenr >> 12, 4096);
+ if (NULL == block_ctx->bh)
+ return -1;
+ block_ctx->data = block_ctx->bh->b_data;
+
+ return block_ctx->len;
+}
+
+static void btrfsic_dump_database(struct btrfsic_state *state)
+{
+ struct list_head *elem_all;
+
+ BUG_ON(NULL == state);
+
+ printk(KERN_INFO "all_blocks_list:\n");
+ list_for_each(elem_all, &state->all_blocks_list) {
+ const struct btrfsic_block *const b_all =
+ list_entry(elem_all, struct btrfsic_block,
+ all_blocks_node);
+ struct list_head *elem_ref_to;
+ struct list_head *elem_ref_from;
+
+ printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n",
+ btrfsic_get_block_type(state, b_all),
+ (unsigned long long)b_all->logical_bytenr,
+ b_all->dev_state->name,
+ (unsigned long long)b_all->dev_bytenr,
+ b_all->mirror_num);
+
+ list_for_each(elem_ref_to, &b_all->ref_to_list) {
+ const struct btrfsic_block_link *const l =
+ list_entry(elem_ref_to,
+ struct btrfsic_block_link,
+ node_ref_to);
+
+ printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
+ " refers %u* to"
+ " %c @%llu (%s/%llu/%d)\n",
+ btrfsic_get_block_type(state, b_all),
+ (unsigned long long)b_all->logical_bytenr,
+ b_all->dev_state->name,
+ (unsigned long long)b_all->dev_bytenr,
+ b_all->mirror_num,
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_to),
+ (unsigned long long)
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ (unsigned long long)l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+ }
+
+ list_for_each(elem_ref_from, &b_all->ref_from_list) {
+ const struct btrfsic_block_link *const l =
+ list_entry(elem_ref_from,
+ struct btrfsic_block_link,
+ node_ref_from);
+
+ printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
+ " is ref %u* from"
+ " %c @%llu (%s/%llu/%d)\n",
+ btrfsic_get_block_type(state, b_all),
+ (unsigned long long)b_all->logical_bytenr,
+ b_all->dev_state->name,
+ (unsigned long long)b_all->dev_bytenr,
+ b_all->mirror_num,
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_from),
+ (unsigned long long)
+ l->block_ref_from->logical_bytenr,
+ l->block_ref_from->dev_state->name,
+ (unsigned long long)
+ l->block_ref_from->dev_bytenr,
+ l->block_ref_from->mirror_num);
+ }
+
+ printk(KERN_INFO "\n");
+ }
+}
+
+/*
+ * Test whether the disk block contains a tree block (leaf or node)
+ * (note that this test fails for the super block)
+ */
+static int btrfsic_test_for_metadata(struct btrfsic_state *state,
+ const u8 *data, unsigned int size)
+{
+ struct btrfs_header *h;
+ u8 csum[BTRFS_CSUM_SIZE];
+ u32 crc = ~(u32)0;
+ int fail = 0;
+ int crc_fail = 0;
+
+ h = (struct btrfs_header *)data;
+
+ if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE))
+ fail++;
+
+ crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE);
+ btrfs_csum_final(crc, csum);
+ if (memcmp(csum, h->csum, state->csum_size))
+ crc_fail++;
+
+ return fail || crc_fail;
+}
+
+static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
+ u64 dev_bytenr,
+ u8 *mapped_data, unsigned int len,
+ struct bio *bio,
+ int *bio_is_patched,
+ struct buffer_head *bh,
+ int submit_bio_bh_rw)
+{
+ int is_metadata;
+ struct btrfsic_block *block;
+ struct btrfsic_block_data_ctx block_ctx;
+ int ret;
+ struct btrfsic_state *state = dev_state->state;
+ struct block_device *bdev = dev_state->bdev;
+
+ WARN_ON(len > PAGE_SIZE);
+ is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len));
+ if (NULL != bio_is_patched)
+ *bio_is_patched = 0;
+
+ block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,
+ &state->block_hashtable);
+ if (NULL != block) {
+ u64 bytenr = 0;
+ struct list_head *elem_ref_to;
+ struct list_head *tmp_ref_to;
+
+ if (block->is_superblock) {
+ bytenr = le64_to_cpu(((struct btrfs_super_block *)
+ mapped_data)->bytenr);
+ is_metadata = 1;
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
+ printk(KERN_INFO
+ "[before new superblock is written]:\n");
+ btrfsic_dump_tree_sub(state, block, 0);
+ }
+ }
+ if (is_metadata) {
+ if (!block->is_superblock) {
+ bytenr = le64_to_cpu(((struct btrfs_header *)
+ mapped_data)->bytenr);
+ btrfsic_cmp_log_and_dev_bytenr(state, bytenr,
+ dev_state,
+ dev_bytenr,
+ mapped_data);
+ }
+ if (block->logical_bytenr != bytenr) {
+ printk(KERN_INFO
+ "Written block @%llu (%s/%llu/%d)"
+ " found in hash table, %c,"
+ " bytenr mismatch"
+ " (!= stored %llu).\n",
+ (unsigned long long)bytenr,
+ dev_state->name,
+ (unsigned long long)dev_bytenr,
+ block->mirror_num,
+ btrfsic_get_block_type(state, block),
+ (unsigned long long)
+ block->logical_bytenr);
+ block->logical_bytenr = bytenr;
+ } else if (state->print_mask &
+ BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "Written block @%llu (%s/%llu/%d)"
+ " found in hash table, %c.\n",
+ (unsigned long long)bytenr,
+ dev_state->name,
+ (unsigned long long)dev_bytenr,
+ block->mirror_num,
+ btrfsic_get_block_type(state, block));
+ } else {
+ bytenr = block->logical_bytenr;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "Written block @%llu (%s/%llu/%d)"
+ " found in hash table, %c.\n",
+ (unsigned long long)bytenr,
+ dev_state->name,
+ (unsigned long long)dev_bytenr,
+ block->mirror_num,
+ btrfsic_get_block_type(state, block));
+ }
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "ref_to_list: %cE, ref_from_list: %cE\n",
+ list_empty(&block->ref_to_list) ? ' ' : '!',
+ list_empty(&block->ref_from_list) ? ' ' : '!');
+ if (btrfsic_is_block_ref_by_superblock(state, block, 0)) {
+ printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
+ " @%llu (%s/%llu/%d), old(gen=%llu,"
+ " objectid=%llu, type=%d, offset=%llu),"
+ " new(gen=%llu),"
+ " which is referenced by most recent superblock"
+ " (superblockgen=%llu)!\n",
+ btrfsic_get_block_type(state, block),
+ (unsigned long long)bytenr,
+ dev_state->name,
+ (unsigned long long)dev_bytenr,
+ block->mirror_num,
+ (unsigned long long)block->generation,
+ (unsigned long long)
+ le64_to_cpu(block->disk_key.objectid),
+ block->disk_key.type,
+ (unsigned long long)
+ le64_to_cpu(block->disk_key.offset),
+ (unsigned long long)
+ le64_to_cpu(((struct btrfs_header *)
+ mapped_data)->generation),
+ (unsigned long long)
+ state->max_superblock_generation);
+ btrfsic_dump_tree(state);
+ }
+
+ if (!block->is_iodone && !block->never_written) {
+ printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
+ " @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu,"
+ " which is not yet iodone!\n",
+ btrfsic_get_block_type(state, block),
+ (unsigned long long)bytenr,
+ dev_state->name,
+ (unsigned long long)dev_bytenr,
+ block->mirror_num,
+ (unsigned long long)block->generation,
+ (unsigned long long)
+ le64_to_cpu(((struct btrfs_header *)
+ mapped_data)->generation));
+ /* it would not be safe to go on */
+ btrfsic_dump_tree(state);
+ return;
+ }
+
+ /*
+ * Clear all references of this block. Do not free
+ * the block itself even if is not referenced anymore
+ * because it still carries valueable information
+ * like whether it was ever written and IO completed.
+ */
+ list_for_each_safe(elem_ref_to, tmp_ref_to,
+ &block->ref_to_list) {
+ struct btrfsic_block_link *const l =
+ list_entry(elem_ref_to,
+ struct btrfsic_block_link,
+ node_ref_to);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_rem_link(state, l);
+ l->ref_cnt--;
+ if (0 == l->ref_cnt) {
+ list_del(&l->node_ref_to);
+ list_del(&l->node_ref_from);
+ btrfsic_block_link_hashtable_remove(l);
+ btrfsic_block_link_free(l);
+ }
+ }
+
+ if (block->is_superblock)
+ ret = btrfsic_map_superblock(state, bytenr, len,
+ bdev, &block_ctx);
+ else
+ ret = btrfsic_map_block(state, bytenr, len,
+ &block_ctx, 0);
+ if (ret) {
+ printk(KERN_INFO
+ "btrfsic: btrfsic_map_block(root @%llu)"
+ " failed!\n", (unsigned long long)bytenr);
+ return;
+ }
+ block_ctx.data = mapped_data;
+ /* the following is required in case of writes to mirrors,
+ * use the same that was used for the lookup */
+ block_ctx.dev = dev_state;
+ block_ctx.dev_bytenr = dev_bytenr;
+
+ if (is_metadata || state->include_extent_data) {
+ block->never_written = 0;
+ block->iodone_w_error = 0;
+ if (NULL != bio) {
+ block->is_iodone = 0;
+ BUG_ON(NULL == bio_is_patched);
+ if (!*bio_is_patched) {
+ block->orig_bio_bh_private =
+ bio->bi_private;
+ block->orig_bio_bh_end_io.bio =
+ bio->bi_end_io;
+ block->next_in_same_bio = NULL;
+ bio->bi_private = block;
+ bio->bi_end_io = btrfsic_bio_end_io;
+ *bio_is_patched = 1;
+ } else {
+ struct btrfsic_block *chained_block =
+ (struct btrfsic_block *)
+ bio->bi_private;
+
+ BUG_ON(NULL == chained_block);
+ block->orig_bio_bh_private =
+ chained_block->orig_bio_bh_private;
+ block->orig_bio_bh_end_io.bio =
+ chained_block->orig_bio_bh_end_io.
+ bio;
+ block->next_in_same_bio = chained_block;
+ bio->bi_private = block;
+ }
+ } else if (NULL != bh) {
+ block->is_iodone = 0;
+ block->orig_bio_bh_private = bh->b_private;
+ block->orig_bio_bh_end_io.bh = bh->b_end_io;
+ block->next_in_same_bio = NULL;
+ bh->b_private = block;
+ bh->b_end_io = btrfsic_bh_end_io;
+ } else {
+ block->is_iodone = 1;
+ block->orig_bio_bh_private = NULL;
+ block->orig_bio_bh_end_io.bio = NULL;
+ block->next_in_same_bio = NULL;
+ }
+ }
+
+ block->flush_gen = dev_state->last_flush_gen + 1;
+ block->submit_bio_bh_rw = submit_bio_bh_rw;
+ if (is_metadata) {
+ block->logical_bytenr = bytenr;
+ block->is_metadata = 1;
+ if (block->is_superblock) {
+ ret = btrfsic_process_written_superblock(
+ state,
+ block,
+ (struct btrfs_super_block *)
+ mapped_data);
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) {
+ printk(KERN_INFO
+ "[after new superblock is written]:\n");
+ btrfsic_dump_tree_sub(state, block, 0);
+ }
+ } else {
+ block->mirror_num = 0; /* unknown */
+ ret = btrfsic_process_metablock(
+ state,
+ block,
+ &block_ctx,
+ (struct btrfs_header *)
+ block_ctx.data,
+ 0, 0);
+ }
+ if (ret)
+ printk(KERN_INFO
+ "btrfsic: btrfsic_process_metablock"
+ "(root @%llu) failed!\n",
+ (unsigned long long)dev_bytenr);
+ } else {
+ block->is_metadata = 0;
+ block->mirror_num = 0; /* unknown */
+ block->generation = BTRFSIC_GENERATION_UNKNOWN;
+ if (!state->include_extent_data
+ && list_empty(&block->ref_from_list)) {
+ /*
+ * disk block is overwritten with extent
+ * data (not meta data) and we are configured
+ * to not include extent data: take the
+ * chance and free the block's memory
+ */
+ btrfsic_block_hashtable_remove(block);
+ list_del(&block->all_blocks_node);
+ btrfsic_block_free(block);
+ }
+ }
+ btrfsic_release_block_ctx(&block_ctx);
+ } else {
+ /* block has not been found in hash table */
+ u64 bytenr;
+
+ if (!is_metadata) {
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO "Written block (%s/%llu/?)"
+ " !found in hash table, D.\n",
+ dev_state->name,
+ (unsigned long long)dev_bytenr);
+ if (!state->include_extent_data)
+ return; /* ignore that written D block */
+
+ /* this is getting ugly for the
+ * include_extent_data case... */
+ bytenr = 0; /* unknown */
+ block_ctx.start = bytenr;
+ block_ctx.len = len;
+ block_ctx.bh = NULL;
+ } else {
+ bytenr = le64_to_cpu(((struct btrfs_header *)
+ mapped_data)->bytenr);
+ btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
+ dev_bytenr,
+ mapped_data);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "Written block @%llu (%s/%llu/?)"
+ " !found in hash table, M.\n",
+ (unsigned long long)bytenr,
+ dev_state->name,
+ (unsigned long long)dev_bytenr);
+
+ ret = btrfsic_map_block(state, bytenr, len, &block_ctx,
+ 0);
+ if (ret) {
+ printk(KERN_INFO
+ "btrfsic: btrfsic_map_block(root @%llu)"
+ " failed!\n",
+ (unsigned long long)dev_bytenr);
+ return;
+ }
+ }
+ block_ctx.data = mapped_data;
+ /* the following is required in case of writes to mirrors,
+ * use the same that was used for the lookup */
+ block_ctx.dev = dev_state;
+ block_ctx.dev_bytenr = dev_bytenr;
+
+ block = btrfsic_block_alloc();
+ if (NULL == block) {
+ printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+ btrfsic_release_block_ctx(&block_ctx);
+ return;
+ }
+ block->dev_state = dev_state;
+ block->dev_bytenr = dev_bytenr;
+ block->logical_bytenr = bytenr;
+ block->is_metadata = is_metadata;
+ block->never_written = 0;
+ block->iodone_w_error = 0;
+ block->mirror_num = 0; /* unknown */
+ block->flush_gen = dev_state->last_flush_gen + 1;
+ block->submit_bio_bh_rw = submit_bio_bh_rw;
+ if (NULL != bio) {
+ block->is_iodone = 0;
+ BUG_ON(NULL == bio_is_patched);
+ if (!*bio_is_patched) {
+ block->orig_bio_bh_private = bio->bi_private;
+ block->orig_bio_bh_end_io.bio = bio->bi_end_io;
+ block->next_in_same_bio = NULL;
+ bio->bi_private = block;
+ bio->bi_end_io = btrfsic_bio_end_io;
+ *bio_is_patched = 1;
+ } else {
+ struct btrfsic_block *chained_block =
+ (struct btrfsic_block *)
+ bio->bi_private;
+
+ BUG_ON(NULL == chained_block);
+ block->orig_bio_bh_private =
+ chained_block->orig_bio_bh_private;
+ block->orig_bio_bh_end_io.bio =
+ chained_block->orig_bio_bh_end_io.bio;
+ block->next_in_same_bio = chained_block;
+ bio->bi_private = block;
+ }
+ } else if (NULL != bh) {
+ block->is_iodone = 0;
+ block->orig_bio_bh_private = bh->b_private;
+ block->orig_bio_bh_end_io.bh = bh->b_end_io;
+ block->next_in_same_bio = NULL;
+ bh->b_private = block;
+ bh->b_end_io = btrfsic_bh_end_io;
+ } else {
+ block->is_iodone = 1;
+ block->orig_bio_bh_private = NULL;
+ block->orig_bio_bh_end_io.bio = NULL;
+ block->next_in_same_bio = NULL;
+ }
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "New written %c-block @%llu (%s/%llu/%d)\n",
+ is_metadata ? 'M' : 'D',
+ (unsigned long long)block->logical_bytenr,
+ block->dev_state->name,
+ (unsigned long long)block->dev_bytenr,
+ block->mirror_num);
+ list_add(&block->all_blocks_node, &state->all_blocks_list);
+ btrfsic_block_hashtable_add(block, &state->block_hashtable);
+
+ if (is_metadata) {
+ ret = btrfsic_process_metablock(state, block,
+ &block_ctx,
+ (struct btrfs_header *)
+ block_ctx.data, 0, 0);
+ if (ret)
+ printk(KERN_INFO
+ "btrfsic: process_metablock(root @%llu)"
+ " failed!\n",
+ (unsigned long long)dev_bytenr);
+ }
+ btrfsic_release_block_ctx(&block_ctx);
+ }
+}
+
+static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
+{
+ struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private;
+ int iodone_w_error;
+
+ /* mutex is not held! This is not save if IO is not yet completed
+ * on umount */
+ iodone_w_error = 0;
+ if (bio_error_status)
+ iodone_w_error = 1;
+
+ BUG_ON(NULL == block);
+ bp->bi_private = block->orig_bio_bh_private;
+ bp->bi_end_io = block->orig_bio_bh_end_io.bio;
+
+ do {
+ struct btrfsic_block *next_block;
+ struct btrfsic_dev_state *const dev_state = block->dev_state;
+
+ if ((dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+ printk(KERN_INFO
+ "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n",
+ bio_error_status,
+ btrfsic_get_block_type(dev_state->state, block),
+ (unsigned long long)block->logical_bytenr,
+ dev_state->name,
+ (unsigned long long)block->dev_bytenr,
+ block->mirror_num);
+ next_block = block->next_in_same_bio;
+ block->iodone_w_error = iodone_w_error;
+ if (block->submit_bio_bh_rw & REQ_FLUSH) {
+ dev_state->last_flush_gen++;
+ if ((dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+ printk(KERN_INFO
+ "bio_end_io() new %s flush_gen=%llu\n",
+ dev_state->name,
+ (unsigned long long)
+ dev_state->last_flush_gen);
+ }
+ if (block->submit_bio_bh_rw & REQ_FUA)
+ block->flush_gen = 0; /* FUA completed means block is
+ * on disk */
+ block->is_iodone = 1; /* for FLUSH, this releases the block */
+ block = next_block;
+ } while (NULL != block);
+
+ bp->bi_end_io(bp, bio_error_status);
+}
+
+static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate)
+{
+ struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private;
+ int iodone_w_error = !uptodate;
+ struct btrfsic_dev_state *dev_state;
+
+ BUG_ON(NULL == block);
+ dev_state = block->dev_state;
+ if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+ printk(KERN_INFO
+ "bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n",
+ iodone_w_error,
+ btrfsic_get_block_type(dev_state->state, block),
+ (unsigned long long)block->logical_bytenr,
+ block->dev_state->name,
+ (unsigned long long)block->dev_bytenr,
+ block->mirror_num);
+
+ block->iodone_w_error = iodone_w_error;
+ if (block->submit_bio_bh_rw & REQ_FLUSH) {
+ dev_state->last_flush_gen++;
+ if ((dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+ printk(KERN_INFO
+ "bh_end_io() new %s flush_gen=%llu\n",
+ dev_state->name,
+ (unsigned long long)dev_state->last_flush_gen);
+ }
+ if (block->submit_bio_bh_rw & REQ_FUA)
+ block->flush_gen = 0; /* FUA completed means block is on disk */
+
+ bh->b_private = block->orig_bio_bh_private;
+ bh->b_end_io = block->orig_bio_bh_end_io.bh;
+ block->is_iodone = 1; /* for FLUSH, this releases the block */
+ bh->b_end_io(bh, uptodate);
+}
+
+static int btrfsic_process_written_superblock(
+ struct btrfsic_state *state,
+ struct btrfsic_block *const superblock,
+ struct btrfs_super_block *const super_hdr)
+{
+ int pass;
+
+ superblock->generation = btrfs_super_generation(super_hdr);
+ if (!(superblock->generation > state->max_superblock_generation ||
+ 0 == state->max_superblock_generation)) {
+ if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
+ printk(KERN_INFO
+ "btrfsic: superblock @%llu (%s/%llu/%d)"
+ " with old gen %llu <= %llu\n",
+ (unsigned long long)superblock->logical_bytenr,
+ superblock->dev_state->name,
+ (unsigned long long)superblock->dev_bytenr,
+ superblock->mirror_num,
+ (unsigned long long)
+ btrfs_super_generation(super_hdr),
+ (unsigned long long)
+ state->max_superblock_generation);
+ } else {
+ if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
+ printk(KERN_INFO
+ "btrfsic: got new superblock @%llu (%s/%llu/%d)"
+ " with new gen %llu > %llu\n",
+ (unsigned long long)superblock->logical_bytenr,
+ superblock->dev_state->name,
+ (unsigned long long)superblock->dev_bytenr,
+ superblock->mirror_num,
+ (unsigned long long)
+ btrfs_super_generation(super_hdr),
+ (unsigned long long)
+ state->max_superblock_generation);
+
+ state->max_superblock_generation =
+ btrfs_super_generation(super_hdr);
+ state->latest_superblock = superblock;
+ }
+
+ for (pass = 0; pass < 3; pass++) {
+ int ret;
+ u64 next_bytenr;
+ struct btrfsic_block *next_block;
+ struct btrfsic_block_data_ctx tmp_next_block_ctx;
+ struct btrfsic_block_link *l;
+ int num_copies;
+ int mirror_num;
+ const char *additional_string = NULL;
+ struct btrfs_disk_key tmp_disk_key;
+
+ tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
+ tmp_disk_key.offset = 0;
+
+ switch (pass) {
+ case 0:
+ tmp_disk_key.objectid =
+ cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID);
+ additional_string = "root ";
+ next_bytenr = btrfs_super_root(super_hdr);
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ printk(KERN_INFO "root@%llu\n",
+ (unsigned long long)next_bytenr);
+ break;
+ case 1:
+ tmp_disk_key.objectid =
+ cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID);
+ additional_string = "chunk ";
+ next_bytenr = btrfs_super_chunk_root(super_hdr);
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ printk(KERN_INFO "chunk@%llu\n",
+ (unsigned long long)next_bytenr);
+ break;
+ case 2:
+ tmp_disk_key.objectid =
+ cpu_to_le64(BTRFS_TREE_LOG_OBJECTID);
+ additional_string = "log ";
+ next_bytenr = btrfs_super_log_root(super_hdr);
+ if (0 == next_bytenr)
+ continue;
+ if (state->print_mask &
+ BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+ printk(KERN_INFO "log@%llu\n",
+ (unsigned long long)next_bytenr);
+ break;
+ }
+
+ num_copies =
+ btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ next_bytenr, PAGE_SIZE);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+ printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+ (unsigned long long)next_bytenr, num_copies);
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ int was_created;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "btrfsic_process_written_superblock("
+ "mirror_num=%d)\n", mirror_num);
+ ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+ &tmp_next_block_ctx,
+ mirror_num);
+ if (ret) {
+ printk(KERN_INFO
+ "btrfsic: btrfsic_map_block(@%llu,"
+ " mirror=%d) failed!\n",
+ (unsigned long long)next_bytenr,
+ mirror_num);
+ return -1;
+ }
+
+ next_block = btrfsic_block_lookup_or_add(
+ state,
+ &tmp_next_block_ctx,
+ additional_string,
+ 1, 0, 1,
+ mirror_num,
+ &was_created);
+ if (NULL == next_block) {
+ printk(KERN_INFO
+ "btrfsic: error, kmalloc failed!\n");
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ return -1;
+ }
+
+ next_block->disk_key = tmp_disk_key;
+ if (was_created)
+ next_block->generation =
+ BTRFSIC_GENERATION_UNKNOWN;
+ l = btrfsic_block_link_lookup_or_add(
+ state,
+ &tmp_next_block_ctx,
+ next_block,
+ superblock,
+ BTRFSIC_GENERATION_UNKNOWN);
+ btrfsic_release_block_ctx(&tmp_next_block_ctx);
+ if (NULL == l)
+ return -1;
+ }
+ }
+
+ if (-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)) {
+ WARN_ON(1);
+ btrfsic_dump_tree(state);
+ }
+
+ return 0;
+}
+
+static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
+ struct btrfsic_block *const block,
+ int recursion_level)
+{
+ struct list_head *elem_ref_to;
+ int ret = 0;
+
+ if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
+ /*
+ * Note that this situation can happen and does not
+ * indicate an error in regular cases. It happens
+ * when disk blocks are freed and later reused.
+ * The check-integrity module is not aware of any
+ * block free operations, it just recognizes block
+ * write operations. Therefore it keeps the linkage
+ * information for a block until a block is
+ * rewritten. This can temporarily cause incorrect
+ * and even circular linkage informations. This
+ * causes no harm unless such blocks are referenced
+ * by the most recent super block.
+ */
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "btrfsic: abort cyclic linkage (case 1).\n");
+
+ return ret;
+ }
+
+ /*
+ * This algorithm is recursive because the amount of used stack
+ * space is very small and the max recursion depth is limited.
+ */
+ list_for_each(elem_ref_to, &block->ref_to_list) {
+ const struct btrfsic_block_link *const l =
+ list_entry(elem_ref_to, struct btrfsic_block_link,
+ node_ref_to);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "rl=%d, %c @%llu (%s/%llu/%d)"
+ " %u* refers to %c @%llu (%s/%llu/%d)\n",
+ recursion_level,
+ btrfsic_get_block_type(state, block),
+ (unsigned long long)block->logical_bytenr,
+ block->dev_state->name,
+ (unsigned long long)block->dev_bytenr,
+ block->mirror_num,
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_to),
+ (unsigned long long)
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ (unsigned long long)l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+ if (l->block_ref_to->never_written) {
+ printk(KERN_INFO "btrfs: attempt to write superblock"
+ " which references block %c @%llu (%s/%llu/%d)"
+ " which is never written!\n",
+ btrfsic_get_block_type(state, l->block_ref_to),
+ (unsigned long long)
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ (unsigned long long)l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+ ret = -1;
+ } else if (!l->block_ref_to->is_iodone) {
+ printk(KERN_INFO "btrfs: attempt to write superblock"
+ " which references block %c @%llu (%s/%llu/%d)"
+ " which is not yet iodone!\n",
+ btrfsic_get_block_type(state, l->block_ref_to),
+ (unsigned long long)
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ (unsigned long long)l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+ ret = -1;
+ } else if (l->parent_generation !=
+ l->block_ref_to->generation &&
+ BTRFSIC_GENERATION_UNKNOWN !=
+ l->parent_generation &&
+ BTRFSIC_GENERATION_UNKNOWN !=
+ l->block_ref_to->generation) {
+ printk(KERN_INFO "btrfs: attempt to write superblock"
+ " which references block %c @%llu (%s/%llu/%d)"
+ " with generation %llu !="
+ " parent generation %llu!\n",
+ btrfsic_get_block_type(state, l->block_ref_to),
+ (unsigned long long)
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ (unsigned long long)l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num,
+ (unsigned long long)l->block_ref_to->generation,
+ (unsigned long long)l->parent_generation);
+ ret = -1;
+ } else if (l->block_ref_to->flush_gen >
+ l->block_ref_to->dev_state->last_flush_gen) {
+ printk(KERN_INFO "btrfs: attempt to write superblock"
+ " which references block %c @%llu (%s/%llu/%d)"
+ " which is not flushed out of disk's write cache"
+ " (block flush_gen=%llu,"
+ " dev->flush_gen=%llu)!\n",
+ btrfsic_get_block_type(state, l->block_ref_to),
+ (unsigned long long)
+ l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ (unsigned long long)l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num,
+ (unsigned long long)block->flush_gen,
+ (unsigned long long)
+ l->block_ref_to->dev_state->last_flush_gen);
+ ret = -1;
+ } else if (-1 == btrfsic_check_all_ref_blocks(state,
+ l->block_ref_to,
+ recursion_level +
+ 1)) {
+ ret = -1;
+ }
+ }
+
+ return ret;
+}
+
+static int btrfsic_is_block_ref_by_superblock(
+ const struct btrfsic_state *state,
+ const struct btrfsic_block *block,
+ int recursion_level)
+{
+ struct list_head *elem_ref_from;
+
+ if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
+ /* refer to comment at "abort cyclic linkage (case 1)" */
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "btrfsic: abort cyclic linkage (case 2).\n");
+
+ return 0;
+ }
+
+ /*
+ * This algorithm is recursive because the amount of used stack space
+ * is very small and the max recursion depth is limited.
+ */
+ list_for_each(elem_ref_from, &block->ref_from_list) {
+ const struct btrfsic_block_link *const l =
+ list_entry(elem_ref_from, struct btrfsic_block_link,
+ node_ref_from);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "rl=%d, %c @%llu (%s/%llu/%d)"
+ " is ref %u* from %c @%llu (%s/%llu/%d)\n",
+ recursion_level,
+ btrfsic_get_block_type(state, block),
+ (unsigned long long)block->logical_bytenr,
+ block->dev_state->name,
+ (unsigned long long)block->dev_bytenr,
+ block->mirror_num,
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_from),
+ (unsigned long long)
+ l->block_ref_from->logical_bytenr,
+ l->block_ref_from->dev_state->name,
+ (unsigned long long)
+ l->block_ref_from->dev_bytenr,
+ l->block_ref_from->mirror_num);
+ if (l->block_ref_from->is_superblock &&
+ state->latest_superblock->dev_bytenr ==
+ l->block_ref_from->dev_bytenr &&
+ state->latest_superblock->dev_state->bdev ==
+ l->block_ref_from->dev_state->bdev)
+ return 1;
+ else if (btrfsic_is_block_ref_by_superblock(state,
+ l->block_ref_from,
+ recursion_level +
+ 1))
+ return 1;
+ }
+
+ return 0;
+}
+
+static void btrfsic_print_add_link(const struct btrfsic_state *state,
+ const struct btrfsic_block_link *l)
+{
+ printk(KERN_INFO
+ "Add %u* link from %c @%llu (%s/%llu/%d)"
+ " to %c @%llu (%s/%llu/%d).\n",
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_from),
+ (unsigned long long)l->block_ref_from->logical_bytenr,
+ l->block_ref_from->dev_state->name,
+ (unsigned long long)l->block_ref_from->dev_bytenr,
+ l->block_ref_from->mirror_num,
+ btrfsic_get_block_type(state, l->block_ref_to),
+ (unsigned long long)l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ (unsigned long long)l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+}
+
+static void btrfsic_print_rem_link(const struct btrfsic_state *state,
+ const struct btrfsic_block_link *l)
+{
+ printk(KERN_INFO
+ "Rem %u* link from %c @%llu (%s/%llu/%d)"
+ " to %c @%llu (%s/%llu/%d).\n",
+ l->ref_cnt,
+ btrfsic_get_block_type(state, l->block_ref_from),
+ (unsigned long long)l->block_ref_from->logical_bytenr,
+ l->block_ref_from->dev_state->name,
+ (unsigned long long)l->block_ref_from->dev_bytenr,
+ l->block_ref_from->mirror_num,
+ btrfsic_get_block_type(state, l->block_ref_to),
+ (unsigned long long)l->block_ref_to->logical_bytenr,
+ l->block_ref_to->dev_state->name,
+ (unsigned long long)l->block_ref_to->dev_bytenr,
+ l->block_ref_to->mirror_num);
+}
+
+static char btrfsic_get_block_type(const struct btrfsic_state *state,
+ const struct btrfsic_block *block)
+{
+ if (block->is_superblock &&
+ state->latest_superblock->dev_bytenr == block->dev_bytenr &&
+ state->latest_superblock->dev_state->bdev == block->dev_state->bdev)
+ return 'S';
+ else if (block->is_superblock)
+ return 's';
+ else if (block->is_metadata)
+ return 'M';
+ else
+ return 'D';
+}
+
+static void btrfsic_dump_tree(const struct btrfsic_state *state)
+{
+ btrfsic_dump_tree_sub(state, state->latest_superblock, 0);
+}
+
+static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
+ const struct btrfsic_block *block,
+ int indent_level)
+{
+ struct list_head *elem_ref_to;
+ int indent_add;
+ static char buf[80];
+ int cursor_position;
+
+ /*
+ * Should better fill an on-stack buffer with a complete line and
+ * dump it at once when it is time to print a newline character.
+ */
+
+ /*
+ * This algorithm is recursive because the amount of used stack space
+ * is very small and the max recursion depth is limited.
+ */
+ indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)",
+ btrfsic_get_block_type(state, block),
+ (unsigned long long)block->logical_bytenr,
+ block->dev_state->name,
+ (unsigned long long)block->dev_bytenr,
+ block->mirror_num);
+ if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
+ printk("[...]\n");
+ return;
+ }
+ printk(buf);
+ indent_level += indent_add;
+ if (list_empty(&block->ref_to_list)) {
+ printk("\n");
+ return;
+ }
+ if (block->mirror_num > 1 &&
+ !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) {
+ printk(" [...]\n");
+ return;
+ }
+
+ cursor_position = indent_level;
+ list_for_each(elem_ref_to, &block->ref_to_list) {
+ const struct btrfsic_block_link *const l =
+ list_entry(elem_ref_to, struct btrfsic_block_link,
+ node_ref_to);
+
+ while (cursor_position < indent_level) {
+ printk(" ");
+ cursor_position++;
+ }
+ if (l->ref_cnt > 1)
+ indent_add = sprintf(buf, " %d*--> ", l->ref_cnt);
+ else
+ indent_add = sprintf(buf, " --> ");
+ if (indent_level + indent_add >
+ BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
+ printk("[...]\n");
+ cursor_position = 0;
+ continue;
+ }
+
+ printk(buf);
+
+ btrfsic_dump_tree_sub(state, l->block_ref_to,
+ indent_level + indent_add);
+ cursor_position = 0;
+ }
+}
+
+static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
+ struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *next_block_ctx,
+ struct btrfsic_block *next_block,
+ struct btrfsic_block *from_block,
+ u64 parent_generation)
+{
+ struct btrfsic_block_link *l;
+
+ l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev,
+ next_block_ctx->dev_bytenr,
+ from_block->dev_state->bdev,
+ from_block->dev_bytenr,
+ &state->block_link_hashtable);
+ if (NULL == l) {
+ l = btrfsic_block_link_alloc();
+ if (NULL == l) {
+ printk(KERN_INFO
+ "btrfsic: error, kmalloc" " failed!\n");
+ return NULL;
+ }
+
+ l->block_ref_to = next_block;
+ l->block_ref_from = from_block;
+ l->ref_cnt = 1;
+ l->parent_generation = parent_generation;
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_add_link(state, l);
+
+ list_add(&l->node_ref_to, &from_block->ref_to_list);
+ list_add(&l->node_ref_from, &next_block->ref_from_list);
+
+ btrfsic_block_link_hashtable_add(l,
+ &state->block_link_hashtable);
+ } else {
+ l->ref_cnt++;
+ l->parent_generation = parent_generation;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_add_link(state, l);
+ }
+
+ return l;
+}
+
+static struct btrfsic_block *btrfsic_block_lookup_or_add(
+ struct btrfsic_state *state,
+ struct btrfsic_block_data_ctx *block_ctx,
+ const char *additional_string,
+ int is_metadata,
+ int is_iodone,
+ int never_written,
+ int mirror_num,
+ int *was_created)
+{
+ struct btrfsic_block *block;
+
+ block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev,
+ block_ctx->dev_bytenr,
+ &state->block_hashtable);
+ if (NULL == block) {
+ struct btrfsic_dev_state *dev_state;
+
+ block = btrfsic_block_alloc();
+ if (NULL == block) {
+ printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+ return NULL;
+ }
+ dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev);
+ if (NULL == dev_state) {
+ printk(KERN_INFO
+ "btrfsic: error, lookup dev_state failed!\n");
+ btrfsic_block_free(block);
+ return NULL;
+ }
+ block->dev_state = dev_state;
+ block->dev_bytenr = block_ctx->dev_bytenr;
+ block->logical_bytenr = block_ctx->start;
+ block->is_metadata = is_metadata;
+ block->is_iodone = is_iodone;
+ block->never_written = never_written;
+ block->mirror_num = mirror_num;
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ printk(KERN_INFO
+ "New %s%c-block @%llu (%s/%llu/%d)\n",
+ additional_string,
+ btrfsic_get_block_type(state, block),
+ (unsigned long long)block->logical_bytenr,
+ dev_state->name,
+ (unsigned long long)block->dev_bytenr,
+ mirror_num);
+ list_add(&block->all_blocks_node, &state->all_blocks_list);
+ btrfsic_block_hashtable_add(block, &state->block_hashtable);
+ if (NULL != was_created)
+ *was_created = 1;
+ } else {
+ if (NULL != was_created)
+ *was_created = 0;
+ }
+
+ return block;
+}
+
+static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
+ u64 bytenr,
+ struct btrfsic_dev_state *dev_state,
+ u64 dev_bytenr, char *data)
+{
+ int num_copies;
+ int mirror_num;
+ int ret;
+ struct btrfsic_block_data_ctx block_ctx;
+ int match = 0;
+
+ num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree,
+ bytenr, PAGE_SIZE);
+
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
+ &block_ctx, mirror_num);
+ if (ret) {
+ printk(KERN_INFO "btrfsic:"
+ " btrfsic_map_block(logical @%llu,"
+ " mirror %d) failed!\n",
+ (unsigned long long)bytenr, mirror_num);
+ continue;
+ }
+
+ if (dev_state->bdev == block_ctx.dev->bdev &&
+ dev_bytenr == block_ctx.dev_bytenr) {
+ match++;
+ btrfsic_release_block_ctx(&block_ctx);
+ break;
+ }
+ btrfsic_release_block_ctx(&block_ctx);
+ }
+
+ if (!match) {
+ printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio,"
+ " buffer->log_bytenr=%llu, submit_bio(bdev=%s,"
+ " phys_bytenr=%llu)!\n",
+ (unsigned long long)bytenr, dev_state->name,
+ (unsigned long long)dev_bytenr);
+ for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+ ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
+ &block_ctx, mirror_num);
+ if (ret)
+ continue;
+
+ printk(KERN_INFO "Read logical bytenr @%llu maps to"
+ " (%s/%llu/%d)\n",
+ (unsigned long long)bytenr,
+ block_ctx.dev->name,
+ (unsigned long long)block_ctx.dev_bytenr,
+ mirror_num);
+ }
+ WARN_ON(1);
+ }
+}
+
+static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
+ struct block_device *bdev)
+{
+ struct btrfsic_dev_state *ds;
+
+ ds = btrfsic_dev_state_hashtable_lookup(bdev,
+ &btrfsic_dev_state_hashtable);
+ return ds;
+}
+
+int btrfsic_submit_bh(int rw, struct buffer_head *bh)
+{
+ struct btrfsic_dev_state *dev_state;
+
+ if (!btrfsic_is_initialized)
+ return submit_bh(rw, bh);
+
+ mutex_lock(&btrfsic_mutex);
+ /* since btrfsic_submit_bh() might also be called before
+ * btrfsic_mount(), this might return NULL */
+ dev_state = btrfsic_dev_state_lookup(bh->b_bdev);
+
+ /* Only called to write the superblock (incl. FLUSH/FUA) */
+ if (NULL != dev_state &&
+ (rw & WRITE) && bh->b_size > 0) {
+ u64 dev_bytenr;
+
+ dev_bytenr = 4096 * bh->b_blocknr;
+ if (dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ printk(KERN_INFO
+ "submit_bh(rw=0x%x, blocknr=%lu (bytenr %llu),"
+ " size=%lu, data=%p, bdev=%p)\n",
+ rw, (unsigned long)bh->b_blocknr,
+ (unsigned long long)dev_bytenr,
+ (unsigned long)bh->b_size, bh->b_data,
+ bh->b_bdev);
+ btrfsic_process_written_block(dev_state, dev_bytenr,
+ bh->b_data, bh->b_size, NULL,
+ NULL, bh, rw);
+ } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
+ if (dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ printk(KERN_INFO
+ "submit_bh(rw=0x%x) FLUSH, bdev=%p)\n",
+ rw, bh->b_bdev);
+ if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
+ if ((dev_state->state->print_mask &
+ (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+ BTRFSIC_PRINT_MASK_VERBOSE)))
+ printk(KERN_INFO
+ "btrfsic_submit_bh(%s) with FLUSH"
+ " but dummy block already in use"
+ " (ignored)!\n",
+ dev_state->name);
+ } else {
+ struct btrfsic_block *const block =
+ &dev_state->dummy_block_for_bio_bh_flush;
+
+ block->is_iodone = 0;
+ block->never_written = 0;
+ block->iodone_w_error = 0;
+ block->flush_gen = dev_state->last_flush_gen + 1;
+ block->submit_bio_bh_rw = rw;
+ block->orig_bio_bh_private = bh->b_private;
+ block->orig_bio_bh_end_io.bh = bh->b_end_io;
+ block->next_in_same_bio = NULL;
+ bh->b_private = block;
+ bh->b_end_io = btrfsic_bh_end_io;
+ }
+ }
+ mutex_unlock(&btrfsic_mutex);
+ return submit_bh(rw, bh);
+}
+
+void btrfsic_submit_bio(int rw, struct bio *bio)
+{
+ struct btrfsic_dev_state *dev_state;
+
+ if (!btrfsic_is_initialized) {
+ submit_bio(rw, bio);
+ return;
+ }
+
+ mutex_lock(&btrfsic_mutex);
+ /* since btrfsic_submit_bio() is also called before
+ * btrfsic_mount(), this might return NULL */
+ dev_state = btrfsic_dev_state_lookup(bio->bi_bdev);
+ if (NULL != dev_state &&
+ (rw & WRITE) && NULL != bio->bi_io_vec) {
+ unsigned int i;
+ u64 dev_bytenr;
+ int bio_is_patched;
+
+ dev_bytenr = 512 * bio->bi_sector;
+ bio_is_patched = 0;
+ if (dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ printk(KERN_INFO
+ "submit_bio(rw=0x%x, bi_vcnt=%u,"
+ " bi_sector=%lu (bytenr %llu), bi_bdev=%p)\n",
+ rw, bio->bi_vcnt, (unsigned long)bio->bi_sector,
+ (unsigned long long)dev_bytenr,
+ bio->bi_bdev);
+
+ for (i = 0; i < bio->bi_vcnt; i++) {
+ u8 *mapped_data;
+
+ mapped_data = kmap(bio->bi_io_vec[i].bv_page);
+ if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+ BTRFSIC_PRINT_MASK_VERBOSE) ==
+ (dev_state->state->print_mask &
+ (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+ BTRFSIC_PRINT_MASK_VERBOSE)))
+ printk(KERN_INFO
+ "#%u: page=%p, mapped=%p, len=%u,"
+ " offset=%u\n",
+ i, bio->bi_io_vec[i].bv_page,
+ mapped_data,
+ bio->bi_io_vec[i].bv_len,
+ bio->bi_io_vec[i].bv_offset);
+ btrfsic_process_written_block(dev_state, dev_bytenr,
+ mapped_data,
+ bio->bi_io_vec[i].bv_len,
+ bio, &bio_is_patched,
+ NULL, rw);
+ kunmap(bio->bi_io_vec[i].bv_page);
+ dev_bytenr += bio->bi_io_vec[i].bv_len;
+ }
+ } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
+ if (dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ printk(KERN_INFO
+ "submit_bio(rw=0x%x) FLUSH, bdev=%p)\n",
+ rw, bio->bi_bdev);
+ if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
+ if ((dev_state->state->print_mask &
+ (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+ BTRFSIC_PRINT_MASK_VERBOSE)))
+ printk(KERN_INFO
+ "btrfsic_submit_bio(%s) with FLUSH"
+ " but dummy block already in use"
+ " (ignored)!\n",
+ dev_state->name);
+ } else {
+ struct btrfsic_block *const block =
+ &dev_state->dummy_block_for_bio_bh_flush;
+
+ block->is_iodone = 0;
+ block->never_written = 0;
+ block->iodone_w_error = 0;
+ block->flush_gen = dev_state->last_flush_gen + 1;
+ block->submit_bio_bh_rw = rw;
+ block->orig_bio_bh_private = bio->bi_private;
+ block->orig_bio_bh_end_io.bio = bio->bi_end_io;
+ block->next_in_same_bio = NULL;
+ bio->bi_private = block;
+ bio->bi_end_io = btrfsic_bio_end_io;
+ }
+ }
+ mutex_unlock(&btrfsic_mutex);
+
+ submit_bio(rw, bio);
+}
+
+int btrfsic_mount(struct btrfs_root *root,
+ struct btrfs_fs_devices *fs_devices,
+ int including_extent_data, u32 print_mask)
+{
+ int ret;
+ struct btrfsic_state *state;
+ struct list_head *dev_head = &fs_devices->devices;
+ struct btrfs_device *device;
+
+ state = kzalloc(sizeof(*state), GFP_NOFS);
+ if (NULL == state) {
+ printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n");
+ return -1;
+ }
+
+ if (!btrfsic_is_initialized) {
+ mutex_init(&btrfsic_mutex);
+ btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable);
+ btrfsic_is_initialized = 1;
+ }
+ mutex_lock(&btrfsic_mutex);
+ state->root = root;
+ state->print_mask = print_mask;
+ state->include_extent_data = including_extent_data;
+ state->csum_size = 0;
+ INIT_LIST_HEAD(&state->all_blocks_list);
+ btrfsic_block_hashtable_init(&state->block_hashtable);
+ btrfsic_block_link_hashtable_init(&state->block_link_hashtable);
+ state->max_superblock_generation = 0;
+ state->latest_superblock = NULL;
+
+ list_for_each_entry(device, dev_head, dev_list) {
+ struct btrfsic_dev_state *ds;
+ char *p;
+
+ if (!device->bdev || !device->name)
+ continue;
+
+ ds = btrfsic_dev_state_alloc();
+ if (NULL == ds) {
+ printk(KERN_INFO
+ "btrfs check-integrity: kmalloc() failed!\n");
+ mutex_unlock(&btrfsic_mutex);
+ return -1;
+ }
+ ds->bdev = device->bdev;
+ ds->state = state;
+ bdevname(ds->bdev, ds->name);
+ ds->name[BDEVNAME_SIZE - 1] = '\0';
+ for (p = ds->name; *p != '\0'; p++);
+ while (p > ds->name && *p != '/')
+ p--;
+ if (*p == '/')
+ p++;
+ strlcpy(ds->name, p, sizeof(ds->name));
+ btrfsic_dev_state_hashtable_add(ds,
+ &btrfsic_dev_state_hashtable);
+ }
+
+ ret = btrfsic_process_superblock(state, fs_devices);
+ if (0 != ret) {
+ mutex_unlock(&btrfsic_mutex);
+ btrfsic_unmount(root, fs_devices);
+ return ret;
+ }
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE)
+ btrfsic_dump_database(state);
+ if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE)
+ btrfsic_dump_tree(state);
+
+ mutex_unlock(&btrfsic_mutex);
+ return 0;
+}
+
+void btrfsic_unmount(struct btrfs_root *root,
+ struct btrfs_fs_devices *fs_devices)
+{
+ struct list_head *elem_all;
+ struct list_head *tmp_all;
+ struct btrfsic_state *state;
+ struct list_head *dev_head = &fs_devices->devices;
+ struct btrfs_device *device;
+
+ if (!btrfsic_is_initialized)
+ return;
+
+ mutex_lock(&btrfsic_mutex);
+
+ state = NULL;
+ list_for_each_entry(device, dev_head, dev_list) {
+ struct btrfsic_dev_state *ds;
+
+ if (!device->bdev || !device->name)
+ continue;
+
+ ds = btrfsic_dev_state_hashtable_lookup(
+ device->bdev,
+ &btrfsic_dev_state_hashtable);
+ if (NULL != ds) {
+ state = ds->state;
+ btrfsic_dev_state_hashtable_remove(ds);
+ btrfsic_dev_state_free(ds);
+ }
+ }
+
+ if (NULL == state) {
+ printk(KERN_INFO
+ "btrfsic: error, cannot find state information"
+ " on umount!\n");
+ mutex_unlock(&btrfsic_mutex);
+ return;
+ }
+
+ /*
+ * Don't care about keeping the lists' state up to date,
+ * just free all memory that was allocated dynamically.
+ * Free the blocks and the block_links.
+ */
+ list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) {
+ struct btrfsic_block *const b_all =
+ list_entry(elem_all, struct btrfsic_block,
+ all_blocks_node);
+ struct list_head *elem_ref_to;
+ struct list_head *tmp_ref_to;
+
+ list_for_each_safe(elem_ref_to, tmp_ref_to,
+ &b_all->ref_to_list) {
+ struct btrfsic_block_link *const l =
+ list_entry(elem_ref_to,
+ struct btrfsic_block_link,
+ node_ref_to);
+
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+ btrfsic_print_rem_link(state, l);
+
+ l->ref_cnt--;
+ if (0 == l->ref_cnt)
+ btrfsic_block_link_free(l);
+ }
+
+ if (b_all->is_iodone)
+ btrfsic_block_free(b_all);
+ else
+ printk(KERN_INFO "btrfs: attempt to free %c-block"
+ " @%llu (%s/%llu/%d) on umount which is"
+ " not yet iodone!\n",
+ btrfsic_get_block_type(state, b_all),
+ (unsigned long long)b_all->logical_bytenr,
+ b_all->dev_state->name,
+ (unsigned long long)b_all->dev_bytenr,
+ b_all->mirror_num);
+ }
+
+ mutex_unlock(&btrfsic_mutex);
+
+ kfree(state);
+}
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
new file mode 100644
index 00000000000..8b59175cc50
--- /dev/null
+++ b/fs/btrfs/check-integrity.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) STRATO AG 2011. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#if !defined(__BTRFS_CHECK_INTEGRITY__)
+#define __BTRFS_CHECK_INTEGRITY__
+
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+int btrfsic_submit_bh(int rw, struct buffer_head *bh);
+void btrfsic_submit_bio(int rw, struct bio *bio);
+#else
+#define btrfsic_submit_bh submit_bh
+#define btrfsic_submit_bio submit_bio
+#endif
+
+int btrfsic_mount(struct btrfs_root *root,
+ struct btrfs_fs_devices *fs_devices,
+ int including_extent_data, u32 print_mask);
+void btrfsic_unmount(struct btrfs_root *root,
+ struct btrfs_fs_devices *fs_devices);
+
+#endif
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index dede441bdee..0639a555e16 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -240,7 +240,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
new_root_objectid, &disk_key, level,
- buf->start, 0);
+ buf->start, 0, 1);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -261,9 +261,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
WARN_ON(btrfs_header_generation(buf) > trans->transid);
if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
- ret = btrfs_inc_ref(trans, root, cow, 1);
+ ret = btrfs_inc_ref(trans, root, cow, 1, 1);
else
- ret = btrfs_inc_ref(trans, root, cow, 0);
+ ret = btrfs_inc_ref(trans, root, cow, 0, 1);
if (ret)
return ret;
@@ -350,14 +350,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if ((owner == root->root_key.objectid ||
root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
- ret = btrfs_inc_ref(trans, root, buf, 1);
+ ret = btrfs_inc_ref(trans, root, buf, 1, 1);
BUG_ON(ret);
if (root->root_key.objectid ==
BTRFS_TREE_RELOC_OBJECTID) {
- ret = btrfs_dec_ref(trans, root, buf, 0);
+ ret = btrfs_dec_ref(trans, root, buf, 0, 1);
BUG_ON(ret);
- ret = btrfs_inc_ref(trans, root, cow, 1);
+ ret = btrfs_inc_ref(trans, root, cow, 1, 1);
BUG_ON(ret);
}
new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
@@ -365,9 +365,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (root->root_key.objectid ==
BTRFS_TREE_RELOC_OBJECTID)
- ret = btrfs_inc_ref(trans, root, cow, 1);
+ ret = btrfs_inc_ref(trans, root, cow, 1, 1);
else
- ret = btrfs_inc_ref(trans, root, cow, 0);
+ ret = btrfs_inc_ref(trans, root, cow, 0, 1);
BUG_ON(ret);
}
if (new_flags != 0) {
@@ -381,11 +381,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
if (root->root_key.objectid ==
BTRFS_TREE_RELOC_OBJECTID)
- ret = btrfs_inc_ref(trans, root, cow, 1);
+ ret = btrfs_inc_ref(trans, root, cow, 1, 1);
else
- ret = btrfs_inc_ref(trans, root, cow, 0);
+ ret = btrfs_inc_ref(trans, root, cow, 0, 1);
BUG_ON(ret);
- ret = btrfs_dec_ref(trans, root, buf, 1);
+ ret = btrfs_dec_ref(trans, root, buf, 1, 1);
BUG_ON(ret);
}
clean_tree_block(trans, root, buf);
@@ -446,7 +446,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
root->root_key.objectid, &disk_key,
- level, search_start, empty_size);
+ level, search_start, empty_size, 1);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -484,7 +484,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
rcu_assign_pointer(root->node, cow);
btrfs_free_tree_block(trans, root, buf, parent_start,
- last_ref);
+ last_ref, 1);
free_extent_buffer(buf);
add_root_to_dirty_list(root);
} else {
@@ -500,7 +500,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
trans->transid);
btrfs_mark_buffer_dirty(parent);
btrfs_free_tree_block(trans, root, buf, parent_start,
- last_ref);
+ last_ref, 1);
}
if (unlock_orig)
btrfs_tree_unlock(buf);
@@ -957,7 +957,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
free_extent_buffer(mid);
root_sub_used(root, mid->len);
- btrfs_free_tree_block(trans, root, mid, 0, 1);
+ btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
/* once for the root ptr */
free_extent_buffer(mid);
return 0;
@@ -1015,7 +1015,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (wret)
ret = wret;
root_sub_used(root, right->len);
- btrfs_free_tree_block(trans, root, right, 0, 1);
+ btrfs_free_tree_block(trans, root, right, 0, 1, 0);
free_extent_buffer(right);
right = NULL;
} else {
@@ -1055,7 +1055,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (wret)
ret = wret;
root_sub_used(root, mid->len);
- btrfs_free_tree_block(trans, root, mid, 0, 1);
+ btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
free_extent_buffer(mid);
mid = NULL;
} else {
@@ -2089,7 +2089,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
root->root_key.objectid, &lower_key,
- level, root->node->start, 0);
+ level, root->node->start, 0, 0);
if (IS_ERR(c))
return PTR_ERR(c);
@@ -2216,7 +2216,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
root->root_key.objectid,
- &disk_key, level, c->start, 0);
+ &disk_key, level, c->start, 0, 0);
if (IS_ERR(split))
return PTR_ERR(split);
@@ -2970,7 +2970,7 @@ again:
right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
root->root_key.objectid,
- &disk_key, 0, l->start, 0);
+ &disk_key, 0, l->start, 0, 0);
if (IS_ERR(right))
return PTR_ERR(right);
@@ -3781,7 +3781,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
root_sub_used(root, leaf->len);
- btrfs_free_tree_block(trans, root, leaf, 0, 1);
+ btrfs_free_tree_block(trans, root, leaf, 0, 1, 0);
return 0;
}
/*
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 67385033323..27ebe61d3cc 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -86,6 +86,9 @@ struct btrfs_ordered_sum;
/* holds checksums of all the data extents */
#define BTRFS_CSUM_TREE_OBJECTID 7ULL
+/* for storing balance parameters in the root tree */
+#define BTRFS_BALANCE_OBJECTID -4ULL
+
/* orhpan objectid for tracking unlinked/truncated files */
#define BTRFS_ORPHAN_OBJECTID -5ULL
@@ -692,6 +695,54 @@ struct btrfs_root_ref {
__le16 name_len;
} __attribute__ ((__packed__));
+struct btrfs_disk_balance_args {
+ /*
+ * profiles to operate on, single is denoted by
+ * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+ */
+ __le64 profiles;
+
+ /* usage filter */
+ __le64 usage;
+
+ /* devid filter */
+ __le64 devid;
+
+ /* devid subset filter [pstart..pend) */
+ __le64 pstart;
+ __le64 pend;
+
+ /* btrfs virtual address space subset filter [vstart..vend) */
+ __le64 vstart;
+ __le64 vend;
+
+ /*
+ * profile to convert to, single is denoted by
+ * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+ */
+ __le64 target;
+
+ /* BTRFS_BALANCE_ARGS_* */
+ __le64 flags;
+
+ __le64 unused[8];
+} __attribute__ ((__packed__));
+
+/*
+ * store balance parameters to disk so that balance can be properly
+ * resumed after crash or unmount
+ */
+struct btrfs_balance_item {
+ /* BTRFS_BALANCE_* */
+ __le64 flags;
+
+ struct btrfs_disk_balance_args data;
+ struct btrfs_disk_balance_args meta;
+ struct btrfs_disk_balance_args sys;
+
+ __le64 unused[4];
+} __attribute__ ((__packed__));
+
#define BTRFS_FILE_EXTENT_INLINE 0
#define BTRFS_FILE_EXTENT_REG 1
#define BTRFS_FILE_EXTENT_PREALLOC 2
@@ -751,14 +802,32 @@ struct btrfs_csum_item {
} __attribute__ ((__packed__));
/* different types of block groups (and chunks) */
-#define BTRFS_BLOCK_GROUP_DATA (1 << 0)
-#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1)
-#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
-#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3)
-#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4)
-#define BTRFS_BLOCK_GROUP_DUP (1 << 5)
-#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6)
-#define BTRFS_NR_RAID_TYPES 5
+#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
+#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1)
+#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2)
+#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3)
+#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
+#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
+#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
+#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE
+#define BTRFS_NR_RAID_TYPES 5
+
+#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \
+ BTRFS_BLOCK_GROUP_SYSTEM | \
+ BTRFS_BLOCK_GROUP_METADATA)
+
+#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
+ BTRFS_BLOCK_GROUP_RAID1 | \
+ BTRFS_BLOCK_GROUP_DUP | \
+ BTRFS_BLOCK_GROUP_RAID10)
+/*
+ * We need a bit for restriper to be able to tell when chunks of type
+ * SINGLE are available. This "extended" profile format is used in
+ * fs_info->avail_*_alloc_bits (in-memory) and balance item fields
+ * (on-disk). The corresponding on-disk bit in chunk.type is reserved
+ * to avoid remappings between two formats in future.
+ */
+#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48)
struct btrfs_block_group_item {
__le64 used;
@@ -916,6 +985,7 @@ struct btrfs_block_group_cache {
struct reloc_control;
struct btrfs_device;
struct btrfs_fs_devices;
+struct btrfs_balance_control;
struct btrfs_delayed_root;
struct btrfs_fs_info {
u8 fsid[BTRFS_FSID_SIZE];
@@ -971,7 +1041,7 @@ struct btrfs_fs_info {
* is required instead of the faster short fsync log commits
*/
u64 last_trans_log_full_commit;
- unsigned long mount_opt:20;
+ unsigned long mount_opt:21;
unsigned long compress_type:4;
u64 max_inline;
u64 alloc_start;
@@ -1132,12 +1202,23 @@ struct btrfs_fs_info {
spinlock_t ref_cache_lock;
u64 total_ref_cache_size;
+ /*
+ * these three are in extended format (availability of single
+ * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
+ * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits)
+ */
u64 avail_data_alloc_bits;
u64 avail_metadata_alloc_bits;
u64 avail_system_alloc_bits;
- u64 data_alloc_profile;
- u64 metadata_alloc_profile;
- u64 system_alloc_profile;
+
+ /* restriper state */
+ spinlock_t balance_lock;
+ struct mutex balance_mutex;
+ atomic_t balance_running;
+ atomic_t balance_pause_req;
+ atomic_t balance_cancel_req;
+ struct btrfs_balance_control *balance_ctl;
+ wait_queue_head_t balance_wait_q;
unsigned data_chunk_allocations;
unsigned metadata_ratio;
@@ -1155,6 +1236,10 @@ struct btrfs_fs_info {
int scrub_workers_refcnt;
struct btrfs_workers scrub_workers;
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ u32 check_integrity_print_mask;
+#endif
+
/* filesystem state */
u64 fs_state;
@@ -1383,6 +1468,8 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_DEV_ITEM_KEY 216
#define BTRFS_CHUNK_ITEM_KEY 228
+#define BTRFS_BALANCE_ITEM_KEY 248
+
/*
* string items are for debugging. They just store a short string of
* data in the FS
@@ -1413,6 +1500,9 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
#define BTRFS_MOUNT_RECOVERY (1 << 18)
+#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19)
+#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20)
+#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -2077,8 +2167,86 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
num_devices, 64);
-/* struct btrfs_super_block */
+/* struct btrfs_balance_item */
+BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
+static inline void btrfs_balance_data(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ read_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_set_balance_data(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ write_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_balance_meta(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+
+static inline void btrfs_set_balance_meta(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+
+static inline void btrfs_balance_sys(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
+ struct btrfs_balance_item *bi,
+ struct btrfs_disk_balance_args *ba)
+{
+ write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void
+btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
+ struct btrfs_disk_balance_args *disk)
+{
+ memset(cpu, 0, sizeof(*cpu));
+
+ cpu->profiles = le64_to_cpu(disk->profiles);
+ cpu->usage = le64_to_cpu(disk->usage);
+ cpu->devid = le64_to_cpu(disk->devid);
+ cpu->pstart = le64_to_cpu(disk->pstart);
+ cpu->pend = le64_to_cpu(disk->pend);
+ cpu->vstart = le64_to_cpu(disk->vstart);
+ cpu->vend = le64_to_cpu(disk->vend);
+ cpu->target = le64_to_cpu(disk->target);
+ cpu->flags = le64_to_cpu(disk->flags);
+}
+
+static inline void
+btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
+ struct btrfs_balance_args *cpu)
+{
+ memset(disk, 0, sizeof(*disk));
+
+ disk->profiles = cpu_to_le64(cpu->profiles);
+ disk->usage = cpu_to_le64(cpu->usage);
+ disk->devid = cpu_to_le64(cpu->devid);
+ disk->pstart = cpu_to_le64(cpu->pstart);
+ disk->pend = cpu_to_le64(cpu->pend);
+ disk->vstart = cpu_to_le64(cpu->vstart);
+ disk->vend = cpu_to_le64(cpu->vend);
+ disk->target = cpu_to_le64(cpu->target);
+ disk->flags = cpu_to_le64(cpu->flags);
+}
+
+/* struct btrfs_super_block */
BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
@@ -2196,7 +2364,7 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
return btrfs_item_size(eb, e) - offset;
}
-static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
+static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
{
return sb->s_fs_info;
}
@@ -2277,11 +2445,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u32 blocksize,
u64 parent, u64 root_objectid,
struct btrfs_disk_key *key, int level,
- u64 hint, u64 empty_size);
+ u64 hint, u64 empty_size, int for_cow);
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
- u64 parent, int last_ref);
+ u64 parent, int last_ref, int for_cow);
struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u32 blocksize,
@@ -2301,17 +2469,17 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
u64 search_end, struct btrfs_key *ins,
u64 data);
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref);
+ struct extent_buffer *buf, int full_backref, int for_cow);
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref);
+ struct extent_buffer *buf, int full_backref, int for_cow);
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 flags,
int is_data);
int btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 owner, u64 offset);
+ u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
+ u64 owner, u64 offset, int for_cow);
int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
@@ -2323,7 +2491,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 owner, u64 offset);
+ u64 root_objectid, u64 owner, u64 offset, int for_cow);
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
@@ -2482,10 +2650,18 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
}
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
+static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
+{
+ ++p->slots[0];
+ if (p->slots[0] >= btrfs_header_nritems(p->nodes[0]))
+ return btrfs_next_leaf(root, p);
+ return 0;
+}
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
void btrfs_drop_snapshot(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv, int update_ref);
+ struct btrfs_block_rsv *block_rsv, int update_ref,
+ int for_reloc);
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *node,
@@ -2500,6 +2676,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
}
static inline void free_fs_info(struct btrfs_fs_info *fs_info)
{
+ kfree(fs_info->balance_ctl);
kfree(fs_info->delayed_root);
kfree(fs_info->extent_root);
kfree(fs_info->tree_root);
@@ -2510,6 +2687,24 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
kfree(fs_info->super_for_commit);
kfree(fs_info);
}
+/**
+ * profile_is_valid - tests whether a given profile is valid and reduced
+ * @flags: profile to validate
+ * @extended: if true @flags is treated as an extended profile
+ */
+static inline int profile_is_valid(u64 flags, int extended)
+{
+ u64 mask = ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+ flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
+ if (extended)
+ mask &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+ if (flags & mask)
+ return 0;
+ /* true if zero or exactly one bit set */
+ return (flags & (~flags + 1)) == flags;
+}
/* root-item.c */
int btrfs_find_root_ref(struct btrfs_root *tree_root,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 9c1eccc2c50..fe4cd0f1cef 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -595,8 +595,12 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
num_bytes = btrfs_calc_trans_metadata_size(root, 1);
ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
- if (!ret)
+ if (!ret) {
+ trace_btrfs_space_reservation(root->fs_info, "delayed_item",
+ item->key.objectid,
+ num_bytes, 1);
item->bytes_reserved = num_bytes;
+ }
return ret;
}
@@ -610,6 +614,9 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
return;
rsv = &root->fs_info->delayed_block_rsv;
+ trace_btrfs_space_reservation(root->fs_info, "delayed_item",
+ item->key.objectid, item->bytes_reserved,
+ 0);
btrfs_block_rsv_release(root, rsv,
item->bytes_reserved);
}
@@ -624,7 +631,7 @@ static int btrfs_delayed_inode_reserve_metadata(
struct btrfs_block_rsv *dst_rsv;
u64 num_bytes;
int ret;
- int release = false;
+ bool release = false;
src_rsv = trans->block_rsv;
dst_rsv = &root->fs_info->delayed_block_rsv;
@@ -651,8 +658,13 @@ static int btrfs_delayed_inode_reserve_metadata(
*/
if (ret == -EAGAIN)
ret = -ENOSPC;
- if (!ret)
+ if (!ret) {
node->bytes_reserved = num_bytes;
+ trace_btrfs_space_reservation(root->fs_info,
+ "delayed_inode",
+ btrfs_ino(inode),
+ num_bytes, 1);
+ }
return ret;
} else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
spin_lock(&BTRFS_I(inode)->lock);
@@ -707,11 +719,17 @@ out:
* reservation here. I think it may be time for a documentation page on
* how block rsvs. work.
*/
- if (!ret)
+ if (!ret) {
+ trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
+ btrfs_ino(inode), num_bytes, 1);
node->bytes_reserved = num_bytes;
+ }
- if (release)
+ if (release) {
+ trace_btrfs_space_reservation(root->fs_info, "delalloc",
+ btrfs_ino(inode), num_bytes, 0);
btrfs_block_rsv_release(root, src_rsv, num_bytes);
+ }
return ret;
}
@@ -725,6 +743,8 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
return;
rsv = &root->fs_info->delayed_block_rsv;
+ trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
+ node->inode_id, node->bytes_reserved, 0);
btrfs_block_rsv_release(root, rsv,
node->bytes_reserved);
node->bytes_reserved = 0;
@@ -1372,13 +1392,6 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
goto release_node;
}
- ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
- /*
- * we have reserved enough space when we start a new transaction,
- * so reserving metadata failure is impossible
- */
- BUG_ON(ret);
-
delayed_item->key.objectid = btrfs_ino(dir);
btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY);
delayed_item->key.offset = index;
@@ -1391,6 +1404,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
dir_item->type = type;
memcpy((char *)(dir_item + 1), name, name_len);
+ ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
+ /*
+ * we have reserved enough space when we start a new transaction,
+ * so reserving metadata failure is impossible
+ */
+ BUG_ON(ret);
+
+
mutex_lock(&delayed_node->mutex);
ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
if (unlikely(ret)) {
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 125cf76fcd0..66e4f29505a 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -101,6 +101,11 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,
return -1;
if (ref1->type > ref2->type)
return 1;
+ /* merging of sequenced refs is not allowed */
+ if (ref1->seq < ref2->seq)
+ return -1;
+ if (ref1->seq > ref2->seq)
+ return 1;
if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
@@ -150,16 +155,22 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
/*
* find an head entry based on bytenr. This returns the delayed ref
- * head if it was able to find one, or NULL if nothing was in that spot
+ * head if it was able to find one, or NULL if nothing was in that spot.
+ * If return_bigger is given, the next bigger entry is returned if no exact
+ * match is found.
*/
static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
u64 bytenr,
- struct btrfs_delayed_ref_node **last)
+ struct btrfs_delayed_ref_node **last,
+ int return_bigger)
{
- struct rb_node *n = root->rb_node;
+ struct rb_node *n;
struct btrfs_delayed_ref_node *entry;
- int cmp;
+ int cmp = 0;
+again:
+ n = root->rb_node;
+ entry = NULL;
while (n) {
entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
WARN_ON(!entry->in_tree);
@@ -182,6 +193,19 @@ static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
else
return entry;
}
+ if (entry && return_bigger) {
+ if (cmp > 0) {
+ n = rb_next(&entry->rb_node);
+ if (!n)
+ n = rb_first(root);
+ entry = rb_entry(n, struct btrfs_delayed_ref_node,
+ rb_node);
+ bytenr = entry->bytenr;
+ return_bigger = 0;
+ goto again;
+ }
+ return entry;
+ }
return NULL;
}
@@ -209,6 +233,24 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
return 0;
}
+int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
+ u64 seq)
+{
+ struct seq_list *elem;
+
+ assert_spin_locked(&delayed_refs->lock);
+ if (list_empty(&delayed_refs->seq_head))
+ return 0;
+
+ elem = list_first_entry(&delayed_refs->seq_head, struct seq_list, list);
+ if (seq >= elem->seq) {
+ pr_debug("holding back delayed_ref %llu, lowest is %llu (%p)\n",
+ seq, elem->seq, delayed_refs);
+ return 1;
+ }
+ return 0;
+}
+
int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
struct list_head *cluster, u64 start)
{
@@ -223,20 +265,8 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
node = rb_first(&delayed_refs->root);
} else {
ref = NULL;
- find_ref_head(&delayed_refs->root, start, &ref);
+ find_ref_head(&delayed_refs->root, start + 1, &ref, 1);
if (ref) {
- struct btrfs_delayed_ref_node *tmp;
-
- node = rb_prev(&ref->rb_node);
- while (node) {
- tmp = rb_entry(node,
- struct btrfs_delayed_ref_node,
- rb_node);
- if (tmp->bytenr < start)
- break;
- ref = tmp;
- node = rb_prev(&ref->rb_node);
- }
node = &ref->rb_node;
} else
node = rb_first(&delayed_refs->root);
@@ -390,7 +420,8 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
* this does all the dirty work in terms of maintaining the correct
* overall modification count.
*/
-static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
+static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref,
u64 bytenr, u64 num_bytes,
int action, int is_data)
@@ -437,6 +468,7 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
ref->action = 0;
ref->is_head = 1;
ref->in_tree = 1;
+ ref->seq = 0;
head_ref = btrfs_delayed_node_to_head(ref);
head_ref->must_insert_reserved = must_insert_reserved;
@@ -468,14 +500,17 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
/*
* helper to insert a delayed tree ref into the rbtree.
*/
-static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref,
u64 bytenr, u64 num_bytes, u64 parent,
- u64 ref_root, int level, int action)
+ u64 ref_root, int level, int action,
+ int for_cow)
{
struct btrfs_delayed_ref_node *existing;
struct btrfs_delayed_tree_ref *full_ref;
struct btrfs_delayed_ref_root *delayed_refs;
+ u64 seq = 0;
if (action == BTRFS_ADD_DELAYED_EXTENT)
action = BTRFS_ADD_DELAYED_REF;
@@ -491,14 +526,17 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
ref->is_head = 0;
ref->in_tree = 1;
+ if (need_ref_seq(for_cow, ref_root))
+ seq = inc_delayed_seq(delayed_refs);
+ ref->seq = seq;
+
full_ref = btrfs_delayed_node_to_tree_ref(ref);
- if (parent) {
- full_ref->parent = parent;
+ full_ref->parent = parent;
+ full_ref->root = ref_root;
+ if (parent)
ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
- } else {
- full_ref->root = ref_root;
+ else
ref->type = BTRFS_TREE_BLOCK_REF_KEY;
- }
full_ref->level = level;
trace_btrfs_delayed_tree_ref(ref, full_ref, action);
@@ -522,15 +560,17 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
/*
* helper to insert a delayed data ref into the rbtree.
*/
-static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
+static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref,
u64 bytenr, u64 num_bytes, u64 parent,
u64 ref_root, u64 owner, u64 offset,
- int action)
+ int action, int for_cow)
{
struct btrfs_delayed_ref_node *existing;
struct btrfs_delayed_data_ref *full_ref;
struct btrfs_delayed_ref_root *delayed_refs;
+ u64 seq = 0;
if (action == BTRFS_ADD_DELAYED_EXTENT)
action = BTRFS_ADD_DELAYED_REF;
@@ -546,14 +586,18 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
ref->is_head = 0;
ref->in_tree = 1;
+ if (need_ref_seq(for_cow, ref_root))
+ seq = inc_delayed_seq(delayed_refs);
+ ref->seq = seq;
+
full_ref = btrfs_delayed_node_to_data_ref(ref);
- if (parent) {
- full_ref->parent = parent;
+ full_ref->parent = parent;
+ full_ref->root = ref_root;
+ if (parent)
ref->type = BTRFS_SHARED_DATA_REF_KEY;
- } else {
- full_ref->root = ref_root;
+ else
ref->type = BTRFS_EXTENT_DATA_REF_KEY;
- }
+
full_ref->objectid = owner;
full_ref->offset = offset;
@@ -580,10 +624,12 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
* to make sure the delayed ref is eventually processed before this
* transaction commits.
*/
-int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent,
u64 ref_root, int level, int action,
- struct btrfs_delayed_extent_op *extent_op)
+ struct btrfs_delayed_extent_op *extent_op,
+ int for_cow)
{
struct btrfs_delayed_tree_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
@@ -610,13 +656,17 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
* insert both the head node and the new ref without dropping
* the spin lock
*/
- ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
- action, 0);
+ ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
+ num_bytes, action, 0);
BUG_ON(ret);
- ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes,
- parent, ref_root, level, action);
+ ret = add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
+ num_bytes, parent, ref_root, level, action,
+ for_cow);
BUG_ON(ret);
+ if (!need_ref_seq(for_cow, ref_root) &&
+ waitqueue_active(&delayed_refs->seq_wait))
+ wake_up(&delayed_refs->seq_wait);
spin_unlock(&delayed_refs->lock);
return 0;
}
@@ -624,11 +674,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
/*
* add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
*/
-int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
u64 parent, u64 ref_root,
u64 owner, u64 offset, int action,
- struct btrfs_delayed_extent_op *extent_op)
+ struct btrfs_delayed_extent_op *extent_op,
+ int for_cow)
{
struct btrfs_delayed_data_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
@@ -655,18 +707,23 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
* insert both the head node and the new ref without dropping
* the spin lock
*/
- ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
- action, 1);
+ ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
+ num_bytes, action, 1);
BUG_ON(ret);
- ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes,
- parent, ref_root, owner, offset, action);
+ ret = add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
+ num_bytes, parent, ref_root, owner, offset,
+ action, for_cow);
BUG_ON(ret);
+ if (!need_ref_seq(for_cow, ref_root) &&
+ waitqueue_active(&delayed_refs->seq_wait))
+ wake_up(&delayed_refs->seq_wait);
spin_unlock(&delayed_refs->lock);
return 0;
}
-int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
struct btrfs_delayed_extent_op *extent_op)
{
@@ -683,11 +740,13 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
- ret = add_delayed_ref_head(trans, &head_ref->node, bytenr,
+ ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
extent_op->is_data);
BUG_ON(ret);
+ if (waitqueue_active(&delayed_refs->seq_wait))
+ wake_up(&delayed_refs->seq_wait);
spin_unlock(&delayed_refs->lock);
return 0;
}
@@ -704,7 +763,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
struct btrfs_delayed_ref_root *delayed_refs;
delayed_refs = &trans->transaction->delayed_refs;
- ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
+ ref = find_ref_head(&delayed_refs->root, bytenr, NULL, 0);
if (ref)
return btrfs_delayed_node_to_head(ref);
return NULL;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index e287e3b0eab..d8f244d9492 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -33,6 +33,9 @@ struct btrfs_delayed_ref_node {
/* the size of the extent */
u64 num_bytes;
+ /* seq number to keep track of insertion order */
+ u64 seq;
+
/* ref count on this data structure */
atomic_t refs;
@@ -98,19 +101,15 @@ struct btrfs_delayed_ref_head {
struct btrfs_delayed_tree_ref {
struct btrfs_delayed_ref_node node;
- union {
- u64 root;
- u64 parent;
- };
+ u64 root;
+ u64 parent;
int level;
};
struct btrfs_delayed_data_ref {
struct btrfs_delayed_ref_node node;
- union {
- u64 root;
- u64 parent;
- };
+ u64 root;
+ u64 parent;
u64 objectid;
u64 offset;
};
@@ -140,6 +139,26 @@ struct btrfs_delayed_ref_root {
int flushing;
u64 run_delayed_start;
+
+ /*
+ * seq number of delayed refs. We need to know if a backref was being
+ * added before the currently processed ref or afterwards.
+ */
+ u64 seq;
+
+ /*
+ * seq_list holds a list of all seq numbers that are currently being
+ * added to the list. While walking backrefs (btrfs_find_all_roots,
+ * qgroups), which might take some time, no newer ref must be processed,
+ * as it might influence the outcome of the walk.
+ */
+ struct list_head seq_head;
+
+ /*
+ * when the only refs we have in the list must not be processed, we want
+ * to wait for more refs to show up or for the end of backref walking.
+ */
+ wait_queue_head_t seq_wait;
};
static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
@@ -151,16 +170,21 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
}
}
-int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent,
u64 ref_root, int level, int action,
- struct btrfs_delayed_extent_op *extent_op);
-int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_extent_op *extent_op,
+ int for_cow);
+int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
u64 parent, u64 ref_root,
u64 owner, u64 offset, int action,
- struct btrfs_delayed_extent_op *extent_op);
-int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_extent_op *extent_op,
+ int for_cow);
+int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
struct btrfs_delayed_extent_op *extent_op);
@@ -170,6 +194,60 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head);
int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
struct list_head *cluster, u64 search_start);
+
+struct seq_list {
+ struct list_head list;
+ u64 seq;
+};
+
+static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs)
+{
+ assert_spin_locked(&delayed_refs->lock);
+ ++delayed_refs->seq;
+ return delayed_refs->seq;
+}
+
+static inline void
+btrfs_get_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
+ struct seq_list *elem)
+{
+ assert_spin_locked(&delayed_refs->lock);
+ elem->seq = delayed_refs->seq;
+ list_add_tail(&elem->list, &delayed_refs->seq_head);
+}
+
+static inline void
+btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
+ struct seq_list *elem)
+{
+ spin_lock(&delayed_refs->lock);
+ list_del(&elem->list);
+ wake_up(&delayed_refs->seq_wait);
+ spin_unlock(&delayed_refs->lock);
+}
+
+int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
+ u64 seq);
+
+/*
+ * delayed refs with a ref_seq > 0 must be held back during backref walking.
+ * this only applies to items in one of the fs-trees. for_cow items never need
+ * to be held back, so they won't get a ref_seq number.
+ */
+static inline int need_ref_seq(int for_cow, u64 rootid)
+{
+ if (for_cow)
+ return 0;
+
+ if (rootid == BTRFS_FS_TREE_OBJECTID)
+ return 1;
+
+ if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
+ return 1;
+
+ return 0;
+}
+
/*
* a node might live in a head or a regular ref, this lets you
* test for the proper type to use.
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d8525662ca7..811d9f918b1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -43,6 +43,7 @@
#include "tree-log.h"
#include "free-space-cache.h"
#include "inode-map.h"
+#include "check-integrity.h"
static struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
@@ -961,6 +962,13 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
tree = &BTRFS_I(page->mapping->host)->io_tree;
map = &BTRFS_I(page->mapping->host)->extent_tree;
+ /*
+ * We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing
+ * slab allocation from alloc_extent_state down the callchain where
+ * it'd hit a BUG_ON as those flags are not allowed.
+ */
+ gfp_flags &= ~GFP_SLAB_BUG_MASK;
+
ret = try_release_extent_state(map, tree, page, gfp_flags);
if (!ret)
return 0;
@@ -1143,7 +1151,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->orphan_item_inserted = 0;
root->orphan_cleanup_state = 0;
- root->fs_info = fs_info;
root->objectid = objectid;
root->last_trans = 0;
root->highest_objectid = 0;
@@ -1217,6 +1224,14 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
return 0;
}
+static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
+ if (root)
+ root->fs_info = fs_info;
+ return root;
+}
+
static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info)
{
@@ -1224,7 +1239,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root = fs_info->tree_root;
struct extent_buffer *leaf;
- root = kzalloc(sizeof(*root), GFP_NOFS);
+ root = btrfs_alloc_root(fs_info);
if (!root)
return ERR_PTR(-ENOMEM);
@@ -1244,7 +1259,8 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
root->ref_cows = 0;
leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
- BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0);
+ BTRFS_TREE_LOG_OBJECTID, NULL,
+ 0, 0, 0, 0);
if (IS_ERR(leaf)) {
kfree(root);
return ERR_CAST(leaf);
@@ -1318,7 +1334,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
u32 blocksize;
int ret = 0;
- root = kzalloc(sizeof(*root), GFP_NOFS);
+ root = btrfs_alloc_root(fs_info);
if (!root)
return ERR_PTR(-ENOMEM);
if (location->offset == (u64)-1) {
@@ -1874,9 +1890,9 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
}
-struct btrfs_root *open_ctree(struct super_block *sb,
- struct btrfs_fs_devices *fs_devices,
- char *options)
+int open_ctree(struct super_block *sb,
+ struct btrfs_fs_devices *fs_devices,
+ char *options)
{
u32 sectorsize;
u32 nodesize;
@@ -1888,8 +1904,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
struct btrfs_key location;
struct buffer_head *bh;
struct btrfs_super_block *disk_super;
- struct btrfs_root *tree_root = btrfs_sb(sb);
- struct btrfs_fs_info *fs_info = tree_root->fs_info;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_root *tree_root;
struct btrfs_root *extent_root;
struct btrfs_root *csum_root;
struct btrfs_root *chunk_root;
@@ -1900,16 +1916,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
int num_backups_tried = 0;
int backup_index = 0;
- extent_root = fs_info->extent_root =
- kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
- csum_root = fs_info->csum_root =
- kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
- chunk_root = fs_info->chunk_root =
- kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
- dev_root = fs_info->dev_root =
- kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+ tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
+ extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info);
+ csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
+ chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
+ dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info);
- if (!extent_root || !csum_root || !chunk_root || !dev_root) {
+ if (!tree_root || !extent_root || !csum_root ||
+ !chunk_root || !dev_root) {
err = -ENOMEM;
goto fail;
}
@@ -1998,6 +2012,17 @@ struct btrfs_root *open_ctree(struct super_block *sb,
init_waitqueue_head(&fs_info->scrub_pause_wait);
init_rwsem(&fs_info->scrub_super_lock);
fs_info->scrub_workers_refcnt = 0;
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ fs_info->check_integrity_print_mask = 0;
+#endif
+
+ spin_lock_init(&fs_info->balance_lock);
+ mutex_init(&fs_info->balance_mutex);
+ atomic_set(&fs_info->balance_running, 0);
+ atomic_set(&fs_info->balance_pause_req, 0);
+ atomic_set(&fs_info->balance_cancel_req, 0);
+ fs_info->balance_ctl = NULL;
+ init_waitqueue_head(&fs_info->balance_wait_q);
sb->s_blocksize = 4096;
sb->s_blocksize_bits = blksize_bits(4096);
@@ -2267,9 +2292,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
(unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
BTRFS_UUID_SIZE);
- mutex_lock(&fs_info->chunk_mutex);
ret = btrfs_read_chunk_tree(chunk_root);
- mutex_unlock(&fs_info->chunk_mutex);
if (ret) {
printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
sb->s_id);
@@ -2318,9 +2341,6 @@ retry_root_backup:
fs_info->generation = generation;
fs_info->last_trans_committed = generation;
- fs_info->data_alloc_profile = (u64)-1;
- fs_info->metadata_alloc_profile = (u64)-1;
- fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
ret = btrfs_init_space_info(fs_info);
if (ret) {
@@ -2353,6 +2373,19 @@ retry_root_backup:
btrfs_set_opt(fs_info->mount_opt, SSD);
}
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
+ ret = btrfsic_mount(tree_root, fs_devices,
+ btrfs_test_opt(tree_root,
+ CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
+ 1 : 0,
+ fs_info->check_integrity_print_mask);
+ if (ret)
+ printk(KERN_WARNING "btrfs: failed to initialize"
+ " integrity check module %s\n", sb->s_id);
+ }
+#endif
+
/* do not make disk changes in broken FS */
if (btrfs_super_log_root(disk_super) != 0 &&
!(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
@@ -2368,7 +2401,7 @@ retry_root_backup:
btrfs_level_size(tree_root,
btrfs_super_log_root_level(disk_super));
- log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+ log_tree_root = btrfs_alloc_root(fs_info);
if (!log_tree_root) {
err = -ENOMEM;
goto fail_trans_kthread;
@@ -2423,13 +2456,17 @@ retry_root_backup:
if (!err)
err = btrfs_orphan_cleanup(fs_info->tree_root);
up_read(&fs_info->cleanup_work_sem);
+
+ if (!err)
+ err = btrfs_recover_balance(fs_info->tree_root);
+
if (err) {
close_ctree(tree_root);
- return ERR_PTR(err);
+ return err;
}
}
- return tree_root;
+ return 0;
fail_trans_kthread:
kthread_stop(fs_info->transaction_kthread);
@@ -2475,8 +2512,7 @@ fail_srcu:
cleanup_srcu_struct(&fs_info->subvol_srcu);
fail:
btrfs_close_devices(fs_info->fs_devices);
- free_fs_info(fs_info);
- return ERR_PTR(err);
+ return err;
recovery_tree_root:
if (!btrfs_test_opt(tree_root, RECOVERY))
@@ -2631,7 +2667,7 @@ static int write_dev_supers(struct btrfs_device *device,
* we fua the first super. The others we allow
* to go down lazy.
*/
- ret = submit_bh(WRITE_FUA, bh);
+ ret = btrfsic_submit_bh(WRITE_FUA, bh);
if (ret)
errors++;
}
@@ -2708,7 +2744,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
device->flush_bio = bio;
bio_get(bio);
- submit_bio(WRITE_FLUSH, bio);
+ btrfsic_submit_bio(WRITE_FLUSH, bio);
return 0;
}
@@ -2972,6 +3008,9 @@ int close_ctree(struct btrfs_root *root)
fs_info->closing = 1;
smp_mb();
+ /* pause restriper - we want to resume on mount */
+ btrfs_pause_balance(root->fs_info);
+
btrfs_scrub_cancel(root);
/* wait for any defraggers to finish */
@@ -2979,7 +3018,7 @@ int close_ctree(struct btrfs_root *root)
(atomic_read(&fs_info->defrag_running) == 0));
/* clear out the rbtree of defraggable inodes */
- btrfs_run_defrag_inodes(root->fs_info);
+ btrfs_run_defrag_inodes(fs_info);
/*
* Here come 2 situations when btrfs is broken to flip readonly:
@@ -3008,8 +3047,8 @@ int close_ctree(struct btrfs_root *root)
btrfs_put_block_group_cache(fs_info);
- kthread_stop(root->fs_info->transaction_kthread);
- kthread_stop(root->fs_info->cleaner_kthread);
+ kthread_stop(fs_info->transaction_kthread);
+ kthread_stop(fs_info->cleaner_kthread);
fs_info->closing = 2;
smp_mb();
@@ -3027,14 +3066,14 @@ int close_ctree(struct btrfs_root *root)
free_extent_buffer(fs_info->extent_root->commit_root);
free_extent_buffer(fs_info->tree_root->node);
free_extent_buffer(fs_info->tree_root->commit_root);
- free_extent_buffer(root->fs_info->chunk_root->node);
- free_extent_buffer(root->fs_info->chunk_root->commit_root);
- free_extent_buffer(root->fs_info->dev_root->node);
- free_extent_buffer(root->fs_info->dev_root->commit_root);
- free_extent_buffer(root->fs_info->csum_root->node);
- free_extent_buffer(root->fs_info->csum_root->commit_root);
+ free_extent_buffer(fs_info->chunk_root->node);
+ free_extent_buffer(fs_info->chunk_root->commit_root);
+ free_extent_buffer(fs_info->dev_root->node);
+ free_extent_buffer(fs_info->dev_root->commit_root);
+ free_extent_buffer(fs_info->csum_root->node);
+ free_extent_buffer(fs_info->csum_root->commit_root);
- btrfs_free_block_groups(root->fs_info);
+ btrfs_free_block_groups(fs_info);
del_fs_roots(fs_info);
@@ -3054,14 +3093,17 @@ int close_ctree(struct btrfs_root *root)
btrfs_stop_workers(&fs_info->caching_workers);
btrfs_stop_workers(&fs_info->readahead_workers);
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ if (btrfs_test_opt(root, CHECK_INTEGRITY))
+ btrfsic_unmount(root, fs_info->fs_devices);
+#endif
+
btrfs_close_devices(fs_info->fs_devices);
btrfs_mapping_tree_free(&fs_info->mapping_tree);
bdi_destroy(&fs_info->bdi);
cleanup_srcu_struct(&fs_info->subvol_srcu);
- free_fs_info(fs_info);
-
return 0;
}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c99d0a8f13f..e4bc4741319 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -46,9 +46,9 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize);
int clean_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf);
-struct btrfs_root *open_ctree(struct super_block *sb,
- struct btrfs_fs_devices *fs_devices,
- char *options);
+int open_ctree(struct super_block *sb,
+ struct btrfs_fs_devices *fs_devices,
+ char *options);
int close_ctree(struct btrfs_root *root);
int write_ctree_super(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int max_mirrors);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 1b8dc33778f..5f77166fd01 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -67,7 +67,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
u64 root_objectid, u32 generation,
int check_generation)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root;
struct inode *inode;
struct btrfs_key key;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f5fbe576d2b..283af7a676a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -34,23 +34,24 @@
#include "locking.h"
#include "free-space-cache.h"
-/* control flags for do_chunk_alloc's force field
+/*
+ * control flags for do_chunk_alloc's force field
* CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
* if we really need one.
*
- * CHUNK_ALLOC_FORCE means it must try to allocate one
- *
* CHUNK_ALLOC_LIMITED means to only try and allocate one
* if we have very few chunks already allocated. This is
* used as part of the clustering code to help make sure
* we have a good pool of storage to cluster in, without
* filling the FS with empty chunks
*
+ * CHUNK_ALLOC_FORCE means it must try to allocate one
+ *
*/
enum {
CHUNK_ALLOC_NO_FORCE = 0,
- CHUNK_ALLOC_FORCE = 1,
- CHUNK_ALLOC_LIMITED = 2,
+ CHUNK_ALLOC_LIMITED = 1,
+ CHUNK_ALLOC_FORCE = 2,
};
/*
@@ -618,8 +619,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
struct list_head *head = &info->space_info;
struct btrfs_space_info *found;
- flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
- BTRFS_BLOCK_GROUP_METADATA;
+ flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
rcu_read_lock();
list_for_each_entry_rcu(found, head, list) {
@@ -1872,20 +1872,24 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 owner, u64 offset)
+ u64 root_objectid, u64 owner, u64 offset, int for_cow)
{
int ret;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
root_objectid == BTRFS_TREE_LOG_OBJECTID);
if (owner < BTRFS_FIRST_FREE_OBJECTID) {
- ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
+ ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
+ num_bytes,
parent, root_objectid, (int)owner,
- BTRFS_ADD_DELAYED_REF, NULL);
+ BTRFS_ADD_DELAYED_REF, NULL, for_cow);
} else {
- ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
+ ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
+ num_bytes,
parent, root_objectid, owner, offset,
- BTRFS_ADD_DELAYED_REF, NULL);
+ BTRFS_ADD_DELAYED_REF, NULL, for_cow);
}
return ret;
}
@@ -2233,6 +2237,28 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
}
/*
+ * locked_ref is the head node, so we have to go one
+ * node back for any delayed ref updates
+ */
+ ref = select_delayed_ref(locked_ref);
+
+ if (ref && ref->seq &&
+ btrfs_check_delayed_seq(delayed_refs, ref->seq)) {
+ /*
+ * there are still refs with lower seq numbers in the
+ * process of being added. Don't run this ref yet.
+ */
+ list_del_init(&locked_ref->cluster);
+ mutex_unlock(&locked_ref->mutex);
+ locked_ref = NULL;
+ delayed_refs->num_heads_ready++;
+ spin_unlock(&delayed_refs->lock);
+ cond_resched();
+ spin_lock(&delayed_refs->lock);
+ continue;
+ }
+
+ /*
* record the must insert reserved flag before we
* drop the spin lock.
*/
@@ -2242,11 +2268,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
extent_op = locked_ref->extent_op;
locked_ref->extent_op = NULL;
- /*
- * locked_ref is the head node, so we have to go one
- * node back for any delayed ref updates
- */
- ref = select_delayed_ref(locked_ref);
if (!ref) {
/* All delayed refs have been processed, Go ahead
* and send the head node to run_one_delayed_ref,
@@ -2267,9 +2288,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
BUG_ON(ret);
kfree(extent_op);
- cond_resched();
- spin_lock(&delayed_refs->lock);
- continue;
+ goto next;
}
list_del_init(&locked_ref->cluster);
@@ -2279,7 +2298,12 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
ref->in_tree = 0;
rb_erase(&ref->rb_node, &delayed_refs->root);
delayed_refs->num_entries--;
-
+ /*
+ * we modified num_entries, but as we're currently running
+ * delayed refs, skip
+ * wake_up(&delayed_refs->seq_wait);
+ * here.
+ */
spin_unlock(&delayed_refs->lock);
ret = run_one_delayed_ref(trans, root, ref, extent_op,
@@ -2289,13 +2313,34 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
btrfs_put_delayed_ref(ref);
kfree(extent_op);
count++;
-
+next:
+ do_chunk_alloc(trans, root->fs_info->extent_root,
+ 2 * 1024 * 1024,
+ btrfs_get_alloc_profile(root, 0),
+ CHUNK_ALLOC_NO_FORCE);
cond_resched();
spin_lock(&delayed_refs->lock);
}
return count;
}
+
+static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs,
+ unsigned long num_refs)
+{
+ struct list_head *first_seq = delayed_refs->seq_head.next;
+
+ spin_unlock(&delayed_refs->lock);
+ pr_debug("waiting for more refs (num %ld, first %p)\n",
+ num_refs, first_seq);
+ wait_event(delayed_refs->seq_wait,
+ num_refs != delayed_refs->num_entries ||
+ delayed_refs->seq_head.next != first_seq);
+ pr_debug("done waiting for more refs (num %ld, first %p)\n",
+ delayed_refs->num_entries, delayed_refs->seq_head.next);
+ spin_lock(&delayed_refs->lock);
+}
+
/*
* this starts processing the delayed reference count updates and
* extent insertions we have queued up so far. count can be
@@ -2311,15 +2356,23 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref;
struct list_head cluster;
int ret;
+ u64 delayed_start;
int run_all = count == (unsigned long)-1;
int run_most = 0;
+ unsigned long num_refs = 0;
+ int consider_waiting;
if (root == root->fs_info->extent_root)
root = root->fs_info->tree_root;
+ do_chunk_alloc(trans, root->fs_info->extent_root,
+ 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
+ CHUNK_ALLOC_NO_FORCE);
+
delayed_refs = &trans->transaction->delayed_refs;
INIT_LIST_HEAD(&cluster);
again:
+ consider_waiting = 0;
spin_lock(&delayed_refs->lock);
if (count == 0) {
count = delayed_refs->num_entries * 2;
@@ -2336,11 +2389,35 @@ again:
* of refs to process starting at the first one we are able to
* lock
*/
+ delayed_start = delayed_refs->run_delayed_start;
ret = btrfs_find_ref_cluster(trans, &cluster,
delayed_refs->run_delayed_start);
if (ret)
break;
+ if (delayed_start >= delayed_refs->run_delayed_start) {
+ if (consider_waiting == 0) {
+ /*
+ * btrfs_find_ref_cluster looped. let's do one
+ * more cycle. if we don't run any delayed ref
+ * during that cycle (because we can't because
+ * all of them are blocked) and if the number of
+ * refs doesn't change, we avoid busy waiting.
+ */
+ consider_waiting = 1;
+ num_refs = delayed_refs->num_entries;
+ } else {
+ wait_for_more_refs(delayed_refs, num_refs);
+ /*
+ * after waiting, things have changed. we
+ * dropped the lock and someone else might have
+ * run some refs, built new clusters and so on.
+ * therefore, we restart staleness detection.
+ */
+ consider_waiting = 0;
+ }
+ }
+
ret = run_clustered_refs(trans, root, &cluster);
BUG_ON(ret < 0);
@@ -2348,6 +2425,11 @@ again:
if (count == 0)
break;
+
+ if (ret || delayed_refs->run_delayed_start == 0) {
+ /* refs were run, let's reset staleness detection */
+ consider_waiting = 0;
+ }
}
if (run_all) {
@@ -2405,7 +2487,8 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
extent_op->update_key = 0;
extent_op->is_data = is_data ? 1 : 0;
- ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
+ ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
+ num_bytes, extent_op);
if (ret)
kfree(extent_op);
return ret;
@@ -2590,7 +2673,7 @@ out:
static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
- int full_backref, int inc)
+ int full_backref, int inc, int for_cow)
{
u64 bytenr;
u64 num_bytes;
@@ -2603,7 +2686,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
int level;
int ret = 0;
int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
- u64, u64, u64, u64, u64, u64);
+ u64, u64, u64, u64, u64, u64, int);
ref_root = btrfs_header_owner(buf);
nritems = btrfs_header_nritems(buf);
@@ -2640,14 +2723,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
key.offset -= btrfs_file_extent_offset(buf, fi);
ret = process_func(trans, root, bytenr, num_bytes,
parent, ref_root, key.objectid,
- key.offset);
+ key.offset, for_cow);
if (ret)
goto fail;
} else {
bytenr = btrfs_node_blockptr(buf, i);
num_bytes = btrfs_level_size(root, level - 1);
ret = process_func(trans, root, bytenr, num_bytes,
- parent, ref_root, level - 1, 0);
+ parent, ref_root, level - 1, 0,
+ for_cow);
if (ret)
goto fail;
}
@@ -2659,15 +2743,15 @@ fail:
}
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref)
+ struct extent_buffer *buf, int full_backref, int for_cow)
{
- return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
+ return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
}
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref)
+ struct extent_buffer *buf, int full_backref, int for_cow)
{
- return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
+ return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
}
static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -2993,9 +3077,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
INIT_LIST_HEAD(&found->block_groups[i]);
init_rwsem(&found->groups_sem);
spin_lock_init(&found->lock);
- found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
- BTRFS_BLOCK_GROUP_SYSTEM |
- BTRFS_BLOCK_GROUP_METADATA);
+ found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
found->total_bytes = total_bytes;
found->disk_total = total_bytes * factor;
found->bytes_used = bytes_used;
@@ -3016,20 +3098,27 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
- u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10 |
- BTRFS_BLOCK_GROUP_DUP);
- if (extra_flags) {
- if (flags & BTRFS_BLOCK_GROUP_DATA)
- fs_info->avail_data_alloc_bits |= extra_flags;
- if (flags & BTRFS_BLOCK_GROUP_METADATA)
- fs_info->avail_metadata_alloc_bits |= extra_flags;
- if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
- fs_info->avail_system_alloc_bits |= extra_flags;
- }
+ u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+ /* chunk -> extended profile */
+ if (extra_flags == 0)
+ extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ fs_info->avail_data_alloc_bits |= extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ fs_info->avail_metadata_alloc_bits |= extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ fs_info->avail_system_alloc_bits |= extra_flags;
}
+/*
+ * @flags: available profiles in extended format (see ctree.h)
+ *
+ * Returns reduced profile in chunk format. If profile changing is in
+ * progress (either running or paused) picks the target profile (if it's
+ * already available), otherwise falls back to plain reducing.
+ */
u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
{
/*
@@ -3040,6 +3129,34 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
u64 num_devices = root->fs_info->fs_devices->rw_devices +
root->fs_info->fs_devices->missing_devices;
+ /* pick restriper's target profile if it's available */
+ spin_lock(&root->fs_info->balance_lock);
+ if (root->fs_info->balance_ctl) {
+ struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+ u64 tgt = 0;
+
+ if ((flags & BTRFS_BLOCK_GROUP_DATA) &&
+ (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (flags & bctl->data.target)) {
+ tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
+ } else if ((flags & BTRFS_BLOCK_GROUP_SYSTEM) &&
+ (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (flags & bctl->sys.target)) {
+ tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
+ } else if ((flags & BTRFS_BLOCK_GROUP_METADATA) &&
+ (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (flags & bctl->meta.target)) {
+ tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
+ }
+
+ if (tgt) {
+ spin_unlock(&root->fs_info->balance_lock);
+ flags = tgt;
+ goto out;
+ }
+ }
+ spin_unlock(&root->fs_info->balance_lock);
+
if (num_devices == 1)
flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
if (num_devices < 4)
@@ -3059,22 +3176,25 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
((flags & BTRFS_BLOCK_GROUP_RAID1) |
(flags & BTRFS_BLOCK_GROUP_RAID10) |
- (flags & BTRFS_BLOCK_GROUP_DUP)))
+ (flags & BTRFS_BLOCK_GROUP_DUP))) {
flags &= ~BTRFS_BLOCK_GROUP_RAID0;
+ }
+
+out:
+ /* extended -> chunk profile */
+ flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
return flags;
}
static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
{
if (flags & BTRFS_BLOCK_GROUP_DATA)
- flags |= root->fs_info->avail_data_alloc_bits &
- root->fs_info->data_alloc_profile;
+ flags |= root->fs_info->avail_data_alloc_bits;
else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
- flags |= root->fs_info->avail_system_alloc_bits &
- root->fs_info->system_alloc_profile;
+ flags |= root->fs_info->avail_system_alloc_bits;
else if (flags & BTRFS_BLOCK_GROUP_METADATA)
- flags |= root->fs_info->avail_metadata_alloc_bits &
- root->fs_info->metadata_alloc_profile;
+ flags |= root->fs_info->avail_metadata_alloc_bits;
+
return btrfs_reduce_alloc_profile(root, flags);
}
@@ -3191,6 +3311,8 @@ commit_trans:
return -ENOSPC;
}
data_sinfo->bytes_may_use += bytes;
+ trace_btrfs_space_reservation(root->fs_info, "space_info",
+ (u64)data_sinfo, bytes, 1);
spin_unlock(&data_sinfo->lock);
return 0;
@@ -3210,6 +3332,8 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
data_sinfo = BTRFS_I(inode)->space_info;
spin_lock(&data_sinfo->lock);
data_sinfo->bytes_may_use -= bytes;
+ trace_btrfs_space_reservation(root->fs_info, "space_info",
+ (u64)data_sinfo, bytes, 0);
spin_unlock(&data_sinfo->lock);
}
@@ -3257,27 +3381,15 @@ static int should_alloc_chunk(struct btrfs_root *root,
if (num_bytes - num_allocated < thresh)
return 1;
}
-
- /*
- * we have two similar checks here, one based on percentage
- * and once based on a hard number of 256MB. The idea
- * is that if we have a good amount of free
- * room, don't allocate a chunk. A good mount is
- * less than 80% utilized of the chunks we have allocated,
- * or more than 256MB free
- */
- if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
- return 0;
-
- if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
- return 0;
-
thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
- /* 256MB or 5% of the FS */
- thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
+ /* 256MB or 2% of the FS */
+ thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2));
+ /* system chunks need a much small threshold */
+ if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ thresh = 32 * 1024 * 1024;
- if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
+ if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8))
return 0;
return 1;
}
@@ -3291,7 +3403,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
int wait_for_alloc = 0;
int ret = 0;
- flags = btrfs_reduce_alloc_profile(extent_root, flags);
+ BUG_ON(!profile_is_valid(flags, 0));
space_info = __find_space_info(extent_root->fs_info, flags);
if (!space_info) {
@@ -3303,7 +3415,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
again:
spin_lock(&space_info->lock);
- if (space_info->force_alloc)
+ if (force < space_info->force_alloc)
force = space_info->force_alloc;
if (space_info->full) {
spin_unlock(&space_info->lock);
@@ -3582,6 +3694,10 @@ again:
if (used <= space_info->total_bytes) {
if (used + orig_bytes <= space_info->total_bytes) {
space_info->bytes_may_use += orig_bytes;
+ trace_btrfs_space_reservation(root->fs_info,
+ "space_info",
+ (u64)space_info,
+ orig_bytes, 1);
ret = 0;
} else {
/*
@@ -3649,6 +3765,10 @@ again:
if (used + num_bytes < space_info->total_bytes + avail) {
space_info->bytes_may_use += orig_bytes;
+ trace_btrfs_space_reservation(root->fs_info,
+ "space_info",
+ (u64)space_info,
+ orig_bytes, 1);
ret = 0;
} else {
wait_ordered = true;
@@ -3755,7 +3875,8 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
spin_unlock(&block_rsv->lock);
}
-static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
+static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv,
struct btrfs_block_rsv *dest, u64 num_bytes)
{
struct btrfs_space_info *space_info = block_rsv->space_info;
@@ -3791,6 +3912,9 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
if (num_bytes) {
spin_lock(&space_info->lock);
space_info->bytes_may_use -= num_bytes;
+ trace_btrfs_space_reservation(fs_info, "space_info",
+ (u64)space_info,
+ num_bytes, 0);
space_info->reservation_progress++;
spin_unlock(&space_info->lock);
}
@@ -3947,7 +4071,8 @@ void btrfs_block_rsv_release(struct btrfs_root *root,
if (global_rsv->full || global_rsv == block_rsv ||
block_rsv->space_info != global_rsv->space_info)
global_rsv = NULL;
- block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
+ block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
+ num_bytes);
}
/*
@@ -4006,11 +4131,15 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
num_bytes = sinfo->total_bytes - num_bytes;
block_rsv->reserved += num_bytes;
sinfo->bytes_may_use += num_bytes;
+ trace_btrfs_space_reservation(fs_info, "space_info",
+ (u64)sinfo, num_bytes, 1);
}
if (block_rsv->reserved >= block_rsv->size) {
num_bytes = block_rsv->reserved - block_rsv->size;
sinfo->bytes_may_use -= num_bytes;
+ trace_btrfs_space_reservation(fs_info, "space_info",
+ (u64)sinfo, num_bytes, 0);
sinfo->reservation_progress++;
block_rsv->reserved = block_rsv->size;
block_rsv->full = 1;
@@ -4045,7 +4174,8 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
{
- block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
+ block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
+ (u64)-1);
WARN_ON(fs_info->delalloc_block_rsv.size > 0);
WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
WARN_ON(fs_info->trans_block_rsv.size > 0);
@@ -4062,6 +4192,8 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
if (!trans->bytes_reserved)
return;
+ trace_btrfs_space_reservation(root->fs_info, "transaction", (u64)trans,
+ trans->bytes_reserved, 0);
btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
trans->bytes_reserved = 0;
}
@@ -4079,6 +4211,8 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
* when we are truly done with the orphan item.
*/
u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+ trace_btrfs_space_reservation(root->fs_info, "orphan",
+ btrfs_ino(inode), num_bytes, 1);
return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
}
@@ -4086,6 +4220,8 @@ void btrfs_orphan_release_metadata(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+ trace_btrfs_space_reservation(root->fs_info, "orphan",
+ btrfs_ino(inode), num_bytes, 0);
btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
}
@@ -4213,12 +4349,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
/* Need to be holding the i_mutex here if we aren't free space cache */
if (btrfs_is_free_space_inode(root, inode))
flush = 0;
- else
- WARN_ON(!mutex_is_locked(&inode->i_mutex));
if (flush && btrfs_transaction_in_commit(root->fs_info))
schedule_timeout(1);
+ mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
num_bytes = ALIGN(num_bytes, root->sectorsize);
spin_lock(&BTRFS_I(inode)->lock);
@@ -4266,8 +4401,14 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
if (dropped)
to_free += btrfs_calc_trans_metadata_size(root, dropped);
- if (to_free)
+ if (to_free) {
btrfs_block_rsv_release(root, block_rsv, to_free);
+ trace_btrfs_space_reservation(root->fs_info,
+ "delalloc",
+ btrfs_ino(inode),
+ to_free, 0);
+ }
+ mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
return ret;
}
@@ -4278,7 +4419,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
}
BTRFS_I(inode)->reserved_extents += nr_extents;
spin_unlock(&BTRFS_I(inode)->lock);
+ mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+ if (to_reserve)
+ trace_btrfs_space_reservation(root->fs_info,"delalloc",
+ btrfs_ino(inode), to_reserve, 1);
block_rsv_add_bytes(block_rsv, to_reserve, 1);
return 0;
@@ -4308,6 +4453,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
if (dropped > 0)
to_free += btrfs_calc_trans_metadata_size(root, dropped);
+ trace_btrfs_space_reservation(root->fs_info, "delalloc",
+ btrfs_ino(inode), to_free, 0);
btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
to_free);
}
@@ -4562,7 +4709,10 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
cache->reserved += num_bytes;
space_info->bytes_reserved += num_bytes;
if (reserve == RESERVE_ALLOC) {
- BUG_ON(space_info->bytes_may_use < num_bytes);
+ trace_btrfs_space_reservation(cache->fs_info,
+ "space_info",
+ (u64)space_info,
+ num_bytes, 0);
space_info->bytes_may_use -= num_bytes;
}
}
@@ -4928,6 +5078,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
rb_erase(&head->node.rb_node, &delayed_refs->root);
delayed_refs->num_entries--;
+ if (waitqueue_active(&delayed_refs->seq_wait))
+ wake_up(&delayed_refs->seq_wait);
/*
* we don't take a ref on the node because we're removing it from the
@@ -4955,16 +5107,17 @@ out:
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
- u64 parent, int last_ref)
+ u64 parent, int last_ref, int for_cow)
{
struct btrfs_block_group_cache *cache = NULL;
int ret;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
- ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
- parent, root->root_key.objectid,
- btrfs_header_level(buf),
- BTRFS_DROP_DELAYED_REF, NULL);
+ ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
+ buf->start, buf->len,
+ parent, root->root_key.objectid,
+ btrfs_header_level(buf),
+ BTRFS_DROP_DELAYED_REF, NULL, for_cow);
BUG_ON(ret);
}
@@ -4999,12 +5152,12 @@ out:
btrfs_put_block_group(cache);
}
-int btrfs_free_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 owner, u64 offset)
+int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
+ u64 owner, u64 offset, int for_cow)
{
int ret;
+ struct btrfs_fs_info *fs_info = root->fs_info;
/*
* tree log blocks never actually go into the extent allocation
@@ -5016,14 +5169,17 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
btrfs_pin_extent(root, bytenr, num_bytes, 1);
ret = 0;
} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
- ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
+ ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
+ num_bytes,
parent, root_objectid, (int)owner,
- BTRFS_DROP_DELAYED_REF, NULL);
+ BTRFS_DROP_DELAYED_REF, NULL, for_cow);
BUG_ON(ret);
} else {
- ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
- parent, root_objectid, owner,
- offset, BTRFS_DROP_DELAYED_REF, NULL);
+ ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
+ num_bytes,
+ parent, root_objectid, owner,
+ offset, BTRFS_DROP_DELAYED_REF,
+ NULL, for_cow);
BUG_ON(ret);
}
return ret;
@@ -5146,6 +5302,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
ins->objectid = 0;
ins->offset = 0;
+ trace_find_free_extent(orig_root, num_bytes, empty_size, data);
+
space_info = __find_space_info(root->fs_info, data);
if (!space_info) {
printk(KERN_ERR "No space info for %llu\n", data);
@@ -5295,15 +5453,6 @@ alloc:
if (unlikely(block_group->ro))
goto loop;
- spin_lock(&block_group->free_space_ctl->tree_lock);
- if (cached &&
- block_group->free_space_ctl->free_space <
- num_bytes + empty_cluster + empty_size) {
- spin_unlock(&block_group->free_space_ctl->tree_lock);
- goto loop;
- }
- spin_unlock(&block_group->free_space_ctl->tree_lock);
-
/*
* Ok we want to try and use the cluster allocator, so
* lets look there
@@ -5331,6 +5480,8 @@ alloc:
if (offset) {
/* we have a block, we're done */
spin_unlock(&last_ptr->refill_lock);
+ trace_btrfs_reserve_extent_cluster(root,
+ block_group, search_start, num_bytes);
goto checks;
}
@@ -5349,8 +5500,15 @@ refill_cluster:
* plenty of times and not have found
* anything, so we are likely way too
* fragmented for the clustering stuff to find
- * anything. */
- if (loop >= LOOP_NO_EMPTY_SIZE) {
+ * anything.
+ *
+ * However, if the cluster is taken from the
+ * current block group, release the cluster
+ * first, so that we stand a better chance of
+ * succeeding in the unclustered
+ * allocation. */
+ if (loop >= LOOP_NO_EMPTY_SIZE &&
+ last_ptr->block_group != block_group) {
spin_unlock(&last_ptr->refill_lock);
goto unclustered_alloc;
}
@@ -5361,6 +5519,11 @@ refill_cluster:
*/
btrfs_return_cluster_to_free_space(NULL, last_ptr);
+ if (loop >= LOOP_NO_EMPTY_SIZE) {
+ spin_unlock(&last_ptr->refill_lock);
+ goto unclustered_alloc;
+ }
+
/* allocate a cluster in this block group */
ret = btrfs_find_space_cluster(trans, root,
block_group, last_ptr,
@@ -5377,6 +5540,9 @@ refill_cluster:
if (offset) {
/* we found one, proceed */
spin_unlock(&last_ptr->refill_lock);
+ trace_btrfs_reserve_extent_cluster(root,
+ block_group, search_start,
+ num_bytes);
goto checks;
}
} else if (!cached && loop > LOOP_CACHING_NOWAIT
@@ -5401,6 +5567,15 @@ refill_cluster:
}
unclustered_alloc:
+ spin_lock(&block_group->free_space_ctl->tree_lock);
+ if (cached &&
+ block_group->free_space_ctl->free_space <
+ num_bytes + empty_cluster + empty_size) {
+ spin_unlock(&block_group->free_space_ctl->tree_lock);
+ goto loop;
+ }
+ spin_unlock(&block_group->free_space_ctl->tree_lock);
+
offset = btrfs_find_space_for_alloc(block_group, search_start,
num_bytes, empty_size);
/*
@@ -5438,9 +5613,6 @@ checks:
goto loop;
}
- ins->objectid = search_start;
- ins->offset = num_bytes;
-
if (offset < search_start)
btrfs_add_free_space(used_block_group, offset,
search_start - offset);
@@ -5457,6 +5629,8 @@ checks:
ins->objectid = search_start;
ins->offset = num_bytes;
+ trace_btrfs_reserve_extent(orig_root, block_group,
+ search_start, num_bytes);
if (offset < search_start)
btrfs_add_free_space(used_block_group, offset,
search_start - offset);
@@ -5621,6 +5795,7 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
u64 search_end, struct btrfs_key *ins,
u64 data)
{
+ bool final_tried = false;
int ret;
u64 search_start = 0;
@@ -5640,22 +5815,25 @@ again:
search_start, search_end, hint_byte,
ins, data);
- if (ret == -ENOSPC && num_bytes > min_alloc_size) {
- num_bytes = num_bytes >> 1;
- num_bytes = num_bytes & ~(root->sectorsize - 1);
- num_bytes = max(num_bytes, min_alloc_size);
- do_chunk_alloc(trans, root->fs_info->extent_root,
- num_bytes, data, CHUNK_ALLOC_FORCE);
- goto again;
- }
- if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) {
- struct btrfs_space_info *sinfo;
-
- sinfo = __find_space_info(root->fs_info, data);
- printk(KERN_ERR "btrfs allocation failed flags %llu, "
- "wanted %llu\n", (unsigned long long)data,
- (unsigned long long)num_bytes);
- dump_space_info(sinfo, num_bytes, 1);
+ if (ret == -ENOSPC) {
+ if (!final_tried) {
+ num_bytes = num_bytes >> 1;
+ num_bytes = num_bytes & ~(root->sectorsize - 1);
+ num_bytes = max(num_bytes, min_alloc_size);
+ do_chunk_alloc(trans, root->fs_info->extent_root,
+ num_bytes, data, CHUNK_ALLOC_FORCE);
+ if (num_bytes == min_alloc_size)
+ final_tried = true;
+ goto again;
+ } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+ struct btrfs_space_info *sinfo;
+
+ sinfo = __find_space_info(root->fs_info, data);
+ printk(KERN_ERR "btrfs allocation failed flags %llu, "
+ "wanted %llu\n", (unsigned long long)data,
+ (unsigned long long)num_bytes);
+ dump_space_info(sinfo, num_bytes, 1);
+ }
}
trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
@@ -5842,9 +6020,10 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
- 0, root_objectid, owner, offset,
- BTRFS_ADD_DELAYED_EXTENT, NULL);
+ ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
+ ins->offset, 0,
+ root_objectid, owner, offset,
+ BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
return ret;
}
@@ -5997,10 +6176,11 @@ use_block_rsv(struct btrfs_trans_handle *trans,
return ERR_PTR(-ENOSPC);
}
-static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
+static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv, u32 blocksize)
{
block_rsv_add_bytes(block_rsv, blocksize, 0);
- block_rsv_release_bytes(block_rsv, NULL, 0);
+ block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
}
/*
@@ -6014,7 +6194,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u32 blocksize,
u64 parent, u64 root_objectid,
struct btrfs_disk_key *key, int level,
- u64 hint, u64 empty_size)
+ u64 hint, u64 empty_size, int for_cow)
{
struct btrfs_key ins;
struct btrfs_block_rsv *block_rsv;
@@ -6030,7 +6210,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
empty_size, hint, (u64)-1, &ins, 0);
if (ret) {
- unuse_block_rsv(block_rsv, blocksize);
+ unuse_block_rsv(root->fs_info, block_rsv, blocksize);
return ERR_PTR(ret);
}
@@ -6058,10 +6238,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
extent_op->update_flags = 1;
extent_op->is_data = 0;
- ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
+ ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
+ ins.objectid,
ins.offset, parent, root_objectid,
level, BTRFS_ADD_DELAYED_EXTENT,
- extent_op);
+ extent_op, for_cow);
BUG_ON(ret);
}
return buf;
@@ -6078,6 +6259,7 @@ struct walk_control {
int keep_locks;
int reada_slot;
int reada_count;
+ int for_reloc;
};
#define DROP_REFERENCE 1
@@ -6216,9 +6398,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
/* wc->stage == UPDATE_BACKREF */
if (!(wc->flags[level] & flag)) {
BUG_ON(!path->locks[level]);
- ret = btrfs_inc_ref(trans, root, eb, 1);
+ ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
BUG_ON(ret);
- ret = btrfs_dec_ref(trans, root, eb, 0);
+ ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
BUG_ON(ret);
ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
eb->len, flag, 0);
@@ -6362,7 +6544,7 @@ skip:
}
ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
- root->root_key.objectid, level - 1, 0);
+ root->root_key.objectid, level - 1, 0, 0);
BUG_ON(ret);
}
btrfs_tree_unlock(next);
@@ -6436,9 +6618,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (wc->refs[level] == 1) {
if (level == 0) {
if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
- ret = btrfs_dec_ref(trans, root, eb, 1);
+ ret = btrfs_dec_ref(trans, root, eb, 1,
+ wc->for_reloc);
else
- ret = btrfs_dec_ref(trans, root, eb, 0);
+ ret = btrfs_dec_ref(trans, root, eb, 0,
+ wc->for_reloc);
BUG_ON(ret);
}
/* make block locked assertion in clean_tree_block happy */
@@ -6465,7 +6649,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
btrfs_header_owner(path->nodes[level + 1]));
}
- btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
+ btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0);
out:
wc->refs[level] = 0;
wc->flags[level] = 0;
@@ -6549,7 +6733,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
* blocks are properly updated.
*/
void btrfs_drop_snapshot(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv, int update_ref)
+ struct btrfs_block_rsv *block_rsv, int update_ref,
+ int for_reloc)
{
struct btrfs_path *path;
struct btrfs_trans_handle *trans;
@@ -6637,6 +6822,7 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
wc->stage = DROP_REFERENCE;
wc->update_ref = update_ref;
wc->keep_locks = 0;
+ wc->for_reloc = for_reloc;
wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
while (1) {
@@ -6721,6 +6907,7 @@ out:
* drop subtree rooted at tree block 'node'.
*
* NOTE: this function will unlock and release tree block 'node'
+ * only used by relocation code
*/
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -6765,6 +6952,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
wc->stage = DROP_REFERENCE;
wc->update_ref = 0;
wc->keep_locks = 1;
+ wc->for_reloc = 1;
wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
while (1) {
@@ -6792,6 +6980,29 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
+ if (root->fs_info->balance_ctl) {
+ struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+ u64 tgt = 0;
+
+ /* pick restriper's target profile and return */
+ if (flags & BTRFS_BLOCK_GROUP_DATA &&
+ bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
+ } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
+ bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
+ } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
+ bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+ tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
+ }
+
+ if (tgt) {
+ /* extended -> chunk profile */
+ tgt &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+ return tgt;
+ }
+ }
+
/*
* we add in the count of missing devices because we want
* to make sure that any RAID levels on a degraded FS
@@ -7085,7 +7296,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
* space to fit our block group in.
*/
if (device->total_bytes > device->bytes_used + min_free) {
- ret = find_free_dev_extent(NULL, device, min_free,
+ ret = find_free_dev_extent(device, min_free,
&dev_offset, NULL);
if (!ret)
dev_nr++;
@@ -7447,6 +7658,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
&cache->space_info);
BUG_ON(ret);
+ update_global_block_rsv(root->fs_info);
spin_lock(&cache->space_info->lock);
cache->space_info->bytes_readonly += cache->bytes_super;
@@ -7466,6 +7678,22 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
return 0;
}
+static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+ u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+ /* chunk -> extended profile */
+ if (extra_flags == 0)
+ extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ fs_info->avail_data_alloc_bits &= ~extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ fs_info->avail_metadata_alloc_bits &= ~extra_flags;
+ if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ fs_info->avail_system_alloc_bits &= ~extra_flags;
+}
+
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 group_start)
{
@@ -7476,6 +7704,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_key key;
struct inode *inode;
int ret;
+ int index;
int factor;
root = root->fs_info->extent_root;
@@ -7491,6 +7720,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
free_excluded_extents(root, block_group);
memcpy(&key, &block_group->key, sizeof(key));
+ index = get_block_group_index(block_group);
if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10))
@@ -7565,6 +7795,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* are still on the list after taking the semaphore
*/
list_del_init(&block_group->list);
+ if (list_empty(&block_group->space_info->block_groups[index]))
+ clear_avail_alloc_bits(root->fs_info, block_group->flags);
up_write(&block_group->space_info->groups_sem);
if (block_group->cached == BTRFS_CACHE_STARTED)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 49f3c9dc09f..fcf77e1ded4 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -18,6 +18,7 @@
#include "ctree.h"
#include "btrfs_inode.h"
#include "volumes.h"
+#include "check-integrity.h"
static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
@@ -1895,7 +1896,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
}
bio->bi_bdev = dev->bdev;
bio_add_page(bio, page, length, start-page_offset(page));
- submit_bio(WRITE_SYNC, bio);
+ btrfsic_submit_bio(WRITE_SYNC, bio);
wait_for_completion(&compl);
if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
@@ -2393,7 +2394,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
mirror_num, bio_flags, start);
else
- submit_bio(rw, bio);
+ btrfsic_submit_bio(rw, bio);
if (bio_flagged(bio, BIO_EOPNOTSUPP))
ret = -EOPNOTSUPP;
@@ -3579,6 +3580,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
atomic_set(&eb->blocking_writers, 0);
atomic_set(&eb->spinning_readers, 0);
atomic_set(&eb->spinning_writers, 0);
+ eb->lock_nested = 0;
init_waitqueue_head(&eb->write_lock_wq);
init_waitqueue_head(&eb->read_lock_wq);
@@ -3907,6 +3909,8 @@ int extent_range_uptodate(struct extent_io_tree *tree,
while (start <= end) {
index = start >> PAGE_CACHE_SHIFT;
page = find_get_page(tree->mapping, index);
+ if (!page)
+ return 1;
uptodate = PageUptodate(page);
page_cache_release(page);
if (!uptodate) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7604c300132..bc6a042cb6f 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -129,6 +129,7 @@ struct extent_buffer {
struct list_head leak_list;
struct rcu_head rcu_head;
atomic_t refs;
+ pid_t lock_owner;
/* count of read lock holders on the extent buffer */
atomic_t write_locks;
@@ -137,6 +138,7 @@ struct extent_buffer {
atomic_t blocking_readers;
atomic_t spinning_readers;
atomic_t spinning_writers;
+ int lock_nested;
/* protects write locks */
rwlock_t lock;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 034d9850322..859ba2dd889 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -678,7 +678,7 @@ next_slot:
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
new_key.objectid,
- start - extent_offset);
+ start - extent_offset, 0);
BUG_ON(ret);
*hint_byte = disk_bytenr;
}
@@ -753,7 +753,7 @@ next_slot:
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
key.objectid, key.offset -
- extent_offset);
+ extent_offset, 0);
BUG_ON(ret);
inode_sub_bytes(inode,
extent_end - key.offset);
@@ -962,7 +962,7 @@ again:
ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
root->root_key.objectid,
- ino, orig_offset);
+ ino, orig_offset, 0);
BUG_ON(ret);
if (split == start) {
@@ -989,7 +989,7 @@ again:
del_nr++;
ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
0, root->root_key.objectid,
- ino, orig_offset);
+ ino, orig_offset, 0);
BUG_ON(ret);
}
other_start = 0;
@@ -1006,7 +1006,7 @@ again:
del_nr++;
ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
0, root->root_key.objectid,
- ino, orig_offset);
+ ino, orig_offset, 0);
BUG_ON(ret);
}
if (del_nr == 0) {
@@ -1274,7 +1274,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
dirty_pages);
if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
btrfs_btree_balance_dirty(root, 1);
- btrfs_throttle(root);
pos += copied;
num_written += copied;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 9a897bf7953..c2f20594c9f 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -319,9 +319,11 @@ static void io_ctl_drop_pages(struct io_ctl *io_ctl)
io_ctl_unmap_page(io_ctl);
for (i = 0; i < io_ctl->num_pages; i++) {
- ClearPageChecked(io_ctl->pages[i]);
- unlock_page(io_ctl->pages[i]);
- page_cache_release(io_ctl->pages[i]);
+ if (io_ctl->pages[i]) {
+ ClearPageChecked(io_ctl->pages[i]);
+ unlock_page(io_ctl->pages[i]);
+ page_cache_release(io_ctl->pages[i]);
+ }
}
}
@@ -635,7 +637,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
if (!num_entries)
return 0;
- io_ctl_init(&io_ctl, inode, root);
+ ret = io_ctl_init(&io_ctl, inode, root);
+ if (ret)
+ return ret;
+
ret = readahead_cache(inode);
if (ret)
goto out;
@@ -838,7 +843,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
struct io_ctl io_ctl;
struct list_head bitmap_list;
struct btrfs_key key;
- u64 start, end, len;
+ u64 start, extent_start, extent_end, len;
int entries = 0;
int bitmaps = 0;
int ret;
@@ -849,7 +854,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
if (!i_size_read(inode))
return -1;
- io_ctl_init(&io_ctl, inode, root);
+ ret = io_ctl_init(&io_ctl, inode, root);
+ if (ret)
+ return -1;
/* Get the cluster for this block_group if it exists */
if (block_group && !list_empty(&block_group->cluster_list))
@@ -857,25 +864,12 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
struct btrfs_free_cluster,
block_group_list);
- /*
- * We shouldn't have switched the pinned extents yet so this is the
- * right one
- */
- unpin = root->fs_info->pinned_extents;
-
/* Lock all pages first so we can lock the extent safely. */
io_ctl_prepare_pages(&io_ctl, inode, 0);
lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
0, &cached_state, GFP_NOFS);
- /*
- * When searching for pinned extents, we need to start at our start
- * offset.
- */
- if (block_group)
- start = block_group->key.objectid;
-
node = rb_first(&ctl->free_space_offset);
if (!node && cluster) {
node = rb_first(&cluster->root);
@@ -918,9 +912,20 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
* We want to add any pinned extents to our free space cache
* so we don't leak the space
*/
+
+ /*
+ * We shouldn't have switched the pinned extents yet so this is the
+ * right one
+ */
+ unpin = root->fs_info->pinned_extents;
+
+ if (block_group)
+ start = block_group->key.objectid;
+
while (block_group && (start < block_group->key.objectid +
block_group->key.offset)) {
- ret = find_first_extent_bit(unpin, start, &start, &end,
+ ret = find_first_extent_bit(unpin, start,
+ &extent_start, &extent_end,
EXTENT_DIRTY);
if (ret) {
ret = 0;
@@ -928,20 +933,21 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
}
/* This pinned extent is out of our range */
- if (start >= block_group->key.objectid +
+ if (extent_start >= block_group->key.objectid +
block_group->key.offset)
break;
- len = block_group->key.objectid +
- block_group->key.offset - start;
- len = min(len, end + 1 - start);
+ extent_start = max(extent_start, start);
+ extent_end = min(block_group->key.objectid +
+ block_group->key.offset, extent_end + 1);
+ len = extent_end - extent_start;
entries++;
- ret = io_ctl_add_entry(&io_ctl, start, len, NULL);
+ ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL);
if (ret)
goto out_nospc;
- start = end + 1;
+ start = extent_end;
}
/* Write out the bitmaps */
@@ -2236,7 +2242,7 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
if (entry->bitmap) {
ret = btrfs_alloc_from_bitmap(block_group,
cluster, entry, bytes,
- min_start);
+ cluster->window_start);
if (ret == 0) {
node = rb_next(&entry->offset_index);
if (!node)
@@ -2245,6 +2251,7 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
offset_index);
continue;
}
+ cluster->window_start += bytes;
} else {
ret = entry->offset;
@@ -2283,23 +2290,23 @@ out:
static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
struct btrfs_free_space *entry,
struct btrfs_free_cluster *cluster,
- u64 offset, u64 bytes, u64 min_bytes)
+ u64 offset, u64 bytes,
+ u64 cont1_bytes, u64 min_bytes)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
unsigned long next_zero;
unsigned long i;
- unsigned long search_bits;
- unsigned long total_bits;
+ unsigned long want_bits;
+ unsigned long min_bits;
unsigned long found_bits;
unsigned long start = 0;
unsigned long total_found = 0;
int ret;
- bool found = false;
i = offset_to_bit(entry->offset, block_group->sectorsize,
max_t(u64, offset, entry->offset));
- search_bits = bytes_to_bits(bytes, block_group->sectorsize);
- total_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
+ want_bits = bytes_to_bits(bytes, block_group->sectorsize);
+ min_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
again:
found_bits = 0;
@@ -2308,7 +2315,7 @@ again:
i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
next_zero = find_next_zero_bit(entry->bitmap,
BITS_PER_BITMAP, i);
- if (next_zero - i >= search_bits) {
+ if (next_zero - i >= min_bits) {
found_bits = next_zero - i;
break;
}
@@ -2318,10 +2325,9 @@ again:
if (!found_bits)
return -ENOSPC;
- if (!found) {
+ if (!total_found) {
start = i;
cluster->max_size = 0;
- found = true;
}
total_found += found_bits;
@@ -2329,13 +2335,8 @@ again:
if (cluster->max_size < found_bits * block_group->sectorsize)
cluster->max_size = found_bits * block_group->sectorsize;
- if (total_found < total_bits) {
- i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
- if (i - start > total_bits * 2) {
- total_found = 0;
- cluster->max_size = 0;
- found = false;
- }
+ if (total_found < want_bits || cluster->max_size < cont1_bytes) {
+ i = next_zero + 1;
goto again;
}
@@ -2346,28 +2347,31 @@ again:
&entry->offset_index, 1);
BUG_ON(ret);
+ trace_btrfs_setup_cluster(block_group, cluster,
+ total_found * block_group->sectorsize, 1);
return 0;
}
/*
* This searches the block group for just extents to fill the cluster with.
+ * Try to find a cluster with at least bytes total bytes, at least one
+ * extent of cont1_bytes, and other clusters of at least min_bytes.
*/
static noinline int
setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
struct btrfs_free_cluster *cluster,
struct list_head *bitmaps, u64 offset, u64 bytes,
- u64 min_bytes)
+ u64 cont1_bytes, u64 min_bytes)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *first = NULL;
struct btrfs_free_space *entry = NULL;
- struct btrfs_free_space *prev = NULL;
struct btrfs_free_space *last;
struct rb_node *node;
u64 window_start;
u64 window_free;
u64 max_extent;
- u64 max_gap = 128 * 1024;
+ u64 total_size = 0;
entry = tree_search_offset(ctl, offset, 0, 1);
if (!entry)
@@ -2377,8 +2381,8 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
* We don't want bitmaps, so just move along until we find a normal
* extent entry.
*/
- while (entry->bitmap) {
- if (list_empty(&entry->list))
+ while (entry->bitmap || entry->bytes < min_bytes) {
+ if (entry->bitmap && list_empty(&entry->list))
list_add_tail(&entry->list, bitmaps);
node = rb_next(&entry->offset_index);
if (!node)
@@ -2391,12 +2395,9 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
max_extent = entry->bytes;
first = entry;
last = entry;
- prev = entry;
- while (window_free <= min_bytes) {
- node = rb_next(&entry->offset_index);
- if (!node)
- return -ENOSPC;
+ for (node = rb_next(&entry->offset_index); node;
+ node = rb_next(&entry->offset_index)) {
entry = rb_entry(node, struct btrfs_free_space, offset_index);
if (entry->bitmap) {
@@ -2405,26 +2406,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
continue;
}
- /*
- * we haven't filled the empty size and the window is
- * very large. reset and try again
- */
- if (entry->offset - (prev->offset + prev->bytes) > max_gap ||
- entry->offset - window_start > (min_bytes * 2)) {
- first = entry;
- window_start = entry->offset;
- window_free = entry->bytes;
- last = entry;
+ if (entry->bytes < min_bytes)
+ continue;
+
+ last = entry;
+ window_free += entry->bytes;
+ if (entry->bytes > max_extent)
max_extent = entry->bytes;
- } else {
- last = entry;
- window_free += entry->bytes;
- if (entry->bytes > max_extent)
- max_extent = entry->bytes;
- }
- prev = entry;
}
+ if (window_free < bytes || max_extent < cont1_bytes)
+ return -ENOSPC;
+
cluster->window_start = first->offset;
node = &first->offset_index;
@@ -2438,17 +2431,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
entry = rb_entry(node, struct btrfs_free_space, offset_index);
node = rb_next(&entry->offset_index);
- if (entry->bitmap)
+ if (entry->bitmap || entry->bytes < min_bytes)
continue;
rb_erase(&entry->offset_index, &ctl->free_space_offset);
ret = tree_insert_offset(&cluster->root, entry->offset,
&entry->offset_index, 0);
+ total_size += entry->bytes;
BUG_ON(ret);
} while (node && entry != last);
cluster->max_size = max_extent;
-
+ trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
return 0;
}
@@ -2460,7 +2454,7 @@ static noinline int
setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
struct btrfs_free_cluster *cluster,
struct list_head *bitmaps, u64 offset, u64 bytes,
- u64 min_bytes)
+ u64 cont1_bytes, u64 min_bytes)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *entry;
@@ -2482,10 +2476,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
}
list_for_each_entry(entry, bitmaps, list) {
- if (entry->bytes < min_bytes)
+ if (entry->bytes < bytes)
continue;
ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
- bytes, min_bytes);
+ bytes, cont1_bytes, min_bytes);
if (!ret)
return 0;
}
@@ -2499,7 +2493,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
/*
* here we try to find a cluster of blocks in a block group. The goal
- * is to find at least bytes free and up to empty_size + bytes free.
+ * is to find at least bytes+empty_size.
* We might not find them all in one contiguous area.
*
* returns zero and sets up cluster if things worked out, otherwise
@@ -2515,23 +2509,24 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
struct btrfs_free_space *entry, *tmp;
LIST_HEAD(bitmaps);
u64 min_bytes;
+ u64 cont1_bytes;
int ret;
- /* for metadata, allow allocates with more holes */
+ /*
+ * Choose the minimum extent size we'll require for this
+ * cluster. For SSD_SPREAD, don't allow any fragmentation.
+ * For metadata, allow allocates with smaller extents. For
+ * data, keep it dense.
+ */
if (btrfs_test_opt(root, SSD_SPREAD)) {
- min_bytes = bytes + empty_size;
+ cont1_bytes = min_bytes = bytes + empty_size;
} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
- /*
- * we want to do larger allocations when we are
- * flushing out the delayed refs, it helps prevent
- * making more work as we go along.
- */
- if (trans->transaction->delayed_refs.flushing)
- min_bytes = max(bytes, (bytes + empty_size) >> 1);
- else
- min_bytes = max(bytes, (bytes + empty_size) >> 4);
- } else
- min_bytes = max(bytes, (bytes + empty_size) >> 2);
+ cont1_bytes = bytes;
+ min_bytes = block_group->sectorsize;
+ } else {
+ cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
+ min_bytes = block_group->sectorsize;
+ }
spin_lock(&ctl->tree_lock);
@@ -2539,7 +2534,7 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
* If we know we don't have enough space to make a cluster don't even
* bother doing all the work to try and find one.
*/
- if (ctl->free_space < min_bytes) {
+ if (ctl->free_space < bytes) {
spin_unlock(&ctl->tree_lock);
return -ENOSPC;
}
@@ -2552,11 +2547,17 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
goto out;
}
+ trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
+ min_bytes);
+
+ INIT_LIST_HEAD(&bitmaps);
ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
- bytes, min_bytes);
+ bytes + empty_size,
+ cont1_bytes, min_bytes);
if (ret)
ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
- offset, bytes, min_bytes);
+ offset, bytes + empty_size,
+ cont1_bytes, min_bytes);
/* Clear our temporary list */
list_for_each_entry_safe(entry, tmp, &bitmaps, list)
@@ -2567,6 +2568,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
list_add_tail(&cluster->block_group_list,
&block_group->cluster_list);
cluster->block_group = block_group;
+ } else {
+ trace_btrfs_failed_cluster_setup(block_group);
}
out:
spin_unlock(&cluster->lock);
@@ -2588,17 +2591,57 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
cluster->block_group = NULL;
}
-int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
- u64 *trimmed, u64 start, u64 end, u64 minlen)
+static int do_trimming(struct btrfs_block_group_cache *block_group,
+ u64 *total_trimmed, u64 start, u64 bytes,
+ u64 reserved_start, u64 reserved_bytes)
{
- struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
- struct btrfs_free_space *entry = NULL;
+ struct btrfs_space_info *space_info = block_group->space_info;
struct btrfs_fs_info *fs_info = block_group->fs_info;
- u64 bytes = 0;
- u64 actually_trimmed;
- int ret = 0;
+ int ret;
+ int update = 0;
+ u64 trimmed = 0;
- *trimmed = 0;
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+ if (!block_group->ro) {
+ block_group->reserved += reserved_bytes;
+ space_info->bytes_reserved += reserved_bytes;
+ update = 1;
+ }
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
+
+ ret = btrfs_error_discard_extent(fs_info->extent_root,
+ start, bytes, &trimmed);
+ if (!ret)
+ *total_trimmed += trimmed;
+
+ btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
+
+ if (update) {
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+ if (block_group->ro)
+ space_info->bytes_readonly += reserved_bytes;
+ block_group->reserved -= reserved_bytes;
+ space_info->bytes_reserved -= reserved_bytes;
+ spin_unlock(&space_info->lock);
+ spin_unlock(&block_group->lock);
+ }
+
+ return ret;
+}
+
+static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
+ u64 *total_trimmed, u64 start, u64 end, u64 minlen)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *entry;
+ struct rb_node *node;
+ int ret = 0;
+ u64 extent_start;
+ u64 extent_bytes;
+ u64 bytes;
while (start < end) {
spin_lock(&ctl->tree_lock);
@@ -2609,81 +2652,118 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
}
entry = tree_search_offset(ctl, start, 0, 1);
- if (!entry)
- entry = tree_search_offset(ctl,
- offset_to_bitmap(ctl, start),
- 1, 1);
-
- if (!entry || entry->offset >= end) {
+ if (!entry) {
spin_unlock(&ctl->tree_lock);
break;
}
- if (entry->bitmap) {
- ret = search_bitmap(ctl, entry, &start, &bytes);
- if (!ret) {
- if (start >= end) {
- spin_unlock(&ctl->tree_lock);
- break;
- }
- bytes = min(bytes, end - start);
- bitmap_clear_bits(ctl, entry, start, bytes);
- if (entry->bytes == 0)
- free_bitmap(ctl, entry);
- } else {
- start = entry->offset + BITS_PER_BITMAP *
- block_group->sectorsize;
+ /* skip bitmaps */
+ while (entry->bitmap) {
+ node = rb_next(&entry->offset_index);
+ if (!node) {
spin_unlock(&ctl->tree_lock);
- ret = 0;
- continue;
+ goto out;
}
- } else {
- start = entry->offset;
- bytes = min(entry->bytes, end - start);
- unlink_free_space(ctl, entry);
- kmem_cache_free(btrfs_free_space_cachep, entry);
+ entry = rb_entry(node, struct btrfs_free_space,
+ offset_index);
}
+ if (entry->offset >= end) {
+ spin_unlock(&ctl->tree_lock);
+ break;
+ }
+
+ extent_start = entry->offset;
+ extent_bytes = entry->bytes;
+ start = max(start, extent_start);
+ bytes = min(extent_start + extent_bytes, end) - start;
+ if (bytes < minlen) {
+ spin_unlock(&ctl->tree_lock);
+ goto next;
+ }
+
+ unlink_free_space(ctl, entry);
+ kmem_cache_free(btrfs_free_space_cachep, entry);
+
spin_unlock(&ctl->tree_lock);
- if (bytes >= minlen) {
- struct btrfs_space_info *space_info;
- int update = 0;
-
- space_info = block_group->space_info;
- spin_lock(&space_info->lock);
- spin_lock(&block_group->lock);
- if (!block_group->ro) {
- block_group->reserved += bytes;
- space_info->bytes_reserved += bytes;
- update = 1;
- }
- spin_unlock(&block_group->lock);
- spin_unlock(&space_info->lock);
-
- ret = btrfs_error_discard_extent(fs_info->extent_root,
- start,
- bytes,
- &actually_trimmed);
-
- btrfs_add_free_space(block_group, start, bytes);
- if (update) {
- spin_lock(&space_info->lock);
- spin_lock(&block_group->lock);
- if (block_group->ro)
- space_info->bytes_readonly += bytes;
- block_group->reserved -= bytes;
- space_info->bytes_reserved -= bytes;
- spin_unlock(&space_info->lock);
- spin_unlock(&block_group->lock);
- }
+ ret = do_trimming(block_group, total_trimmed, start, bytes,
+ extent_start, extent_bytes);
+ if (ret)
+ break;
+next:
+ start += bytes;
- if (ret)
- break;
- *trimmed += actually_trimmed;
+ if (fatal_signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+
+ cond_resched();
+ }
+out:
+ return ret;
+}
+
+static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
+ u64 *total_trimmed, u64 start, u64 end, u64 minlen)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *entry;
+ int ret = 0;
+ int ret2;
+ u64 bytes;
+ u64 offset = offset_to_bitmap(ctl, start);
+
+ while (offset < end) {
+ bool next_bitmap = false;
+
+ spin_lock(&ctl->tree_lock);
+
+ if (ctl->free_space < minlen) {
+ spin_unlock(&ctl->tree_lock);
+ break;
+ }
+
+ entry = tree_search_offset(ctl, offset, 1, 0);
+ if (!entry) {
+ spin_unlock(&ctl->tree_lock);
+ next_bitmap = true;
+ goto next;
+ }
+
+ bytes = minlen;
+ ret2 = search_bitmap(ctl, entry, &start, &bytes);
+ if (ret2 || start >= end) {
+ spin_unlock(&ctl->tree_lock);
+ next_bitmap = true;
+ goto next;
+ }
+
+ bytes = min(bytes, end - start);
+ if (bytes < minlen) {
+ spin_unlock(&ctl->tree_lock);
+ goto next;
+ }
+
+ bitmap_clear_bits(ctl, entry, start, bytes);
+ if (entry->bytes == 0)
+ free_bitmap(ctl, entry);
+
+ spin_unlock(&ctl->tree_lock);
+
+ ret = do_trimming(block_group, total_trimmed, start, bytes,
+ start, bytes);
+ if (ret)
+ break;
+next:
+ if (next_bitmap) {
+ offset += BITS_PER_BITMAP * ctl->unit;
+ } else {
+ start += bytes;
+ if (start >= offset + BITS_PER_BITMAP * ctl->unit)
+ offset += BITS_PER_BITMAP * ctl->unit;
}
- start += bytes;
- bytes = 0;
if (fatal_signal_pending(current)) {
ret = -ERESTARTSYS;
@@ -2696,6 +2776,22 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
return ret;
}
+int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
+ u64 *trimmed, u64 start, u64 end, u64 minlen)
+{
+ int ret;
+
+ *trimmed = 0;
+
+ ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
+ if (ret)
+ return ret;
+
+ ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
+
+ return ret;
+}
+
/*
* Find the left-most item in the cache tree, and then return the
* smallest inode number in the item.
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index f8962a957d6..213ffa86ce1 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -438,6 +438,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
trans->bytes_reserved);
if (ret)
goto out;
+ trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans,
+ trans->bytes_reserved, 1);
again:
inode = lookup_free_ino_inode(root, path);
if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
@@ -498,6 +500,8 @@ again:
out_put:
iput(inode);
out_release:
+ trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans,
+ trans->bytes_reserved, 0);
btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
out:
trans->block_rsv = rsv;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 81b235a61f8..32214fe0f7e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1951,12 +1951,28 @@ enum btrfs_orphan_cleanup_state {
void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
+ struct btrfs_block_rsv *block_rsv;
int ret;
if (!list_empty(&root->orphan_list) ||
root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
return;
+ spin_lock(&root->orphan_lock);
+ if (!list_empty(&root->orphan_list)) {
+ spin_unlock(&root->orphan_lock);
+ return;
+ }
+
+ if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
+ spin_unlock(&root->orphan_lock);
+ return;
+ }
+
+ block_rsv = root->orphan_block_rsv;
+ root->orphan_block_rsv = NULL;
+ spin_unlock(&root->orphan_lock);
+
if (root->orphan_item_inserted &&
btrfs_root_refs(&root->root_item) > 0) {
ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
@@ -1965,10 +1981,9 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
root->orphan_item_inserted = 0;
}
- if (root->orphan_block_rsv) {
- WARN_ON(root->orphan_block_rsv->size > 0);
- btrfs_free_block_rsv(root, root->orphan_block_rsv);
- root->orphan_block_rsv = NULL;
+ if (block_rsv) {
+ WARN_ON(block_rsv->size > 0);
+ btrfs_free_block_rsv(root, block_rsv);
}
}
@@ -2224,14 +2239,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
continue;
}
nr_truncate++;
- /*
- * Need to hold the imutex for reservation purposes, not
- * a huge deal here but I have a WARN_ON in
- * btrfs_delalloc_reserve_space to catch offenders.
- */
- mutex_lock(&inode->i_mutex);
ret = btrfs_truncate(inode);
- mutex_unlock(&inode->i_mutex);
} else {
nr_unlink++;
}
@@ -2845,7 +2853,7 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans,
BUG_ON(!root->fs_info->enospc_unlink);
root->fs_info->enospc_unlink = 0;
}
- btrfs_end_transaction_throttle(trans, root);
+ btrfs_end_transaction(trans, root);
}
static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -3009,7 +3017,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
int pending_del_nr = 0;
int pending_del_slot = 0;
int extent_type = -1;
- int encoding;
int ret;
int err = 0;
u64 ino = btrfs_ino(inode);
@@ -3059,7 +3066,6 @@ search_again:
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
found_type = btrfs_key_type(&found_key);
- encoding = 0;
if (found_key.objectid != ino)
break;
@@ -3072,10 +3078,6 @@ search_again:
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
extent_type = btrfs_file_extent_type(leaf, fi);
- encoding = btrfs_file_extent_compression(leaf, fi);
- encoding |= btrfs_file_extent_encryption(leaf, fi);
- encoding |= btrfs_file_extent_other_encoding(leaf, fi);
-
if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
item_end +=
btrfs_file_extent_num_bytes(leaf, fi);
@@ -3103,7 +3105,7 @@ search_again:
if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
u64 num_dec;
extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
- if (!del_item && !encoding) {
+ if (!del_item) {
u64 orig_num_bytes =
btrfs_file_extent_num_bytes(leaf, fi);
extent_num_bytes = new_size -
@@ -3179,7 +3181,7 @@ delete:
ret = btrfs_free_extent(trans, root, extent_start,
extent_num_bytes, 0,
btrfs_header_owner(leaf),
- ino, extent_offset);
+ ino, extent_offset, 0);
BUG_ON(ret);
}
@@ -3434,7 +3436,7 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
i_size_write(inode, newsize);
btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
ret = btrfs_update_inode(trans, root, inode);
- btrfs_end_transaction_throttle(trans, root);
+ btrfs_end_transaction(trans, root);
} else {
/*
@@ -4655,7 +4657,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
}
out_unlock:
nr = trans->blocks_used;
- btrfs_end_transaction_throttle(trans, root);
+ btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(root, nr);
if (drop_inode) {
inode_dec_link_count(inode);
@@ -4723,7 +4725,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
}
out_unlock:
nr = trans->blocks_used;
- btrfs_end_transaction_throttle(trans, root);
+ btrfs_end_transaction(trans, root);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -4782,7 +4784,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
}
nr = trans->blocks_used;
- btrfs_end_transaction_throttle(trans, root);
+ btrfs_end_transaction(trans, root);
fail:
if (drop_inode) {
inode_dec_link_count(inode);
@@ -4848,7 +4850,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
out_fail:
nr = trans->blocks_used;
- btrfs_end_transaction_throttle(trans, root);
+ btrfs_end_transaction(trans, root);
if (drop_on_err)
iput(inode);
btrfs_btree_balance_dirty(root, nr);
@@ -5121,7 +5123,7 @@ again:
}
flush_dcache_page(page);
} else if (create && PageUptodate(page)) {
- WARN_ON(1);
+ BUG();
if (!trans) {
kunmap(page);
free_extent_map(em);
@@ -6399,21 +6401,23 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
unsigned long zero_start;
loff_t size;
int ret;
+ int reserved = 0;
u64 page_start;
u64 page_end;
- /* Need this to keep space reservations serialized */
- mutex_lock(&inode->i_mutex);
ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
- mutex_unlock(&inode->i_mutex);
- if (!ret)
+ if (!ret) {
ret = btrfs_update_time(vma->vm_file);
+ reserved = 1;
+ }
if (ret) {
if (ret == -ENOMEM)
ret = VM_FAULT_OOM;
else /* -ENOSPC, -EIO, etc */
ret = VM_FAULT_SIGBUS;
- goto out;
+ if (reserved)
+ goto out;
+ goto out_noreserve;
}
ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
@@ -6494,8 +6498,9 @@ out_unlock:
if (!ret)
return VM_FAULT_LOCKED;
unlock_page(page);
- btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
out:
+ btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+out_noreserve:
return ret;
}
@@ -6668,7 +6673,7 @@ end_trans:
err = ret;
nr = trans->blocks_used;
- ret = btrfs_end_transaction_throttle(trans, root);
+ ret = btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(root, nr);
}
@@ -6749,6 +6754,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
extent_io_tree_init(&ei->io_tree, &inode->i_data);
extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
mutex_init(&ei->log_mutex);
+ mutex_init(&ei->delalloc_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
INIT_LIST_HEAD(&ei->i_orphan);
INIT_LIST_HEAD(&ei->delalloc_inodes);
@@ -7074,7 +7080,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
btrfs_end_log_trans(root);
}
out_fail:
- btrfs_end_transaction_throttle(trans, root);
+ btrfs_end_transaction(trans, root);
out_notrans:
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&root->fs_info->subvol_sem);
@@ -7246,7 +7252,7 @@ out_unlock:
if (!err)
d_instantiate(dentry, inode);
nr = trans->blocks_used;
- btrfs_end_transaction_throttle(trans, root);
+ btrfs_end_transaction(trans, root);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 5441ff1480f..03bb62a9ee2 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -176,6 +176,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
struct btrfs_trans_handle *trans;
unsigned int flags, oldflags;
int ret;
+ u64 ip_oldflags;
+ unsigned int i_oldflags;
if (btrfs_root_readonly(root))
return -EROFS;
@@ -192,6 +194,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
mutex_lock(&inode->i_mutex);
+ ip_oldflags = ip->flags;
+ i_oldflags = inode->i_flags;
+
flags = btrfs_mask_flags(inode->i_mode, flags);
oldflags = btrfs_flags_to_ioctl(ip->flags);
if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
@@ -249,19 +254,24 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
}
- trans = btrfs_join_transaction(root);
- BUG_ON(IS_ERR(trans));
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_drop;
+ }
btrfs_update_iflags(inode);
inode->i_ctime = CURRENT_TIME;
ret = btrfs_update_inode(trans, root, inode);
- BUG_ON(ret);
btrfs_end_transaction(trans, root);
+ out_drop:
+ if (ret) {
+ ip->flags = ip_oldflags;
+ inode->i_flags = i_oldflags;
+ }
mnt_drop_write_file(file);
-
- ret = 0;
out_unlock:
mutex_unlock(&inode->i_mutex);
return ret;
@@ -276,14 +286,13 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
{
- struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info;
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_fs_info *fs_info = btrfs_sb(fdentry(file)->d_sb);
struct btrfs_device *device;
struct request_queue *q;
struct fstrim_range range;
u64 minlen = ULLONG_MAX;
u64 num_devices = 0;
- u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
+ u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
int ret;
if (!capable(CAP_SYS_ADMIN))
@@ -312,7 +321,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
range.len = min(range.len, total_bytes - range.start);
range.minlen = max(range.minlen, minlen);
- ret = btrfs_trim_fs(root, &range);
+ ret = btrfs_trim_fs(fs_info->tree_root, &range);
if (ret < 0)
return ret;
@@ -358,7 +367,7 @@ static noinline int create_subvol(struct btrfs_root *root,
return PTR_ERR(trans);
leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
- 0, objectid, NULL, 0, 0, 0);
+ 0, objectid, NULL, 0, 0, 0, 0);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
goto fail;
@@ -858,10 +867,8 @@ static int cluster_pages_for_defrag(struct inode *inode,
return 0;
file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
- mutex_lock(&inode->i_mutex);
ret = btrfs_delalloc_reserve_space(inode,
num_pages << PAGE_CACHE_SHIFT);
- mutex_unlock(&inode->i_mutex);
if (ret)
return ret;
again:
@@ -1058,7 +1065,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
i = range->start >> PAGE_CACHE_SHIFT;
}
if (!max_to_defrag)
- max_to_defrag = last_index;
+ max_to_defrag = last_index + 1;
/*
* make writeback starts from i, so the defrag range can be
@@ -1203,13 +1210,21 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ mutex_lock(&root->fs_info->volume_mutex);
+ if (root->fs_info->balance_ctl) {
+ printk(KERN_INFO "btrfs: balance in progress\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
vol_args = memdup_user(arg, sizeof(*vol_args));
- if (IS_ERR(vol_args))
- return PTR_ERR(vol_args);
+ if (IS_ERR(vol_args)) {
+ ret = PTR_ERR(vol_args);
+ goto out;
+ }
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
- mutex_lock(&root->fs_info->volume_mutex);
sizestr = vol_args->name;
devstr = strchr(sizestr, ':');
if (devstr) {
@@ -1226,7 +1241,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
(unsigned long long)devid);
ret = -EINVAL;
- goto out_unlock;
+ goto out_free;
}
if (!strcmp(sizestr, "max"))
new_size = device->bdev->bd_inode->i_size;
@@ -1241,7 +1256,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
new_size = memparse(sizestr, NULL);
if (new_size == 0) {
ret = -EINVAL;
- goto out_unlock;
+ goto out_free;
}
}
@@ -1250,7 +1265,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
if (mod < 0) {
if (new_size > old_size) {
ret = -EINVAL;
- goto out_unlock;
+ goto out_free;
}
new_size = old_size - new_size;
} else if (mod > 0) {
@@ -1259,11 +1274,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
if (new_size < 256 * 1024 * 1024) {
ret = -EINVAL;
- goto out_unlock;
+ goto out_free;
}
if (new_size > device->bdev->bd_inode->i_size) {
ret = -EFBIG;
- goto out_unlock;
+ goto out_free;
}
do_div(new_size, root->sectorsize);
@@ -1276,7 +1291,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- goto out_unlock;
+ goto out_free;
}
ret = btrfs_grow_device(trans, device, new_size);
btrfs_commit_transaction(trans, root);
@@ -1284,9 +1299,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
ret = btrfs_shrink_device(device, new_size);
}
-out_unlock:
- mutex_unlock(&root->fs_info->volume_mutex);
+out_free:
kfree(vol_args);
+out:
+ mutex_unlock(&root->fs_info->volume_mutex);
return ret;
}
@@ -2052,14 +2068,25 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ mutex_lock(&root->fs_info->volume_mutex);
+ if (root->fs_info->balance_ctl) {
+ printk(KERN_INFO "btrfs: balance in progress\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
vol_args = memdup_user(arg, sizeof(*vol_args));
- if (IS_ERR(vol_args))
- return PTR_ERR(vol_args);
+ if (IS_ERR(vol_args)) {
+ ret = PTR_ERR(vol_args);
+ goto out;
+ }
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
ret = btrfs_init_new_device(root, vol_args->name);
kfree(vol_args);
+out:
+ mutex_unlock(&root->fs_info->volume_mutex);
return ret;
}
@@ -2074,14 +2101,25 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
if (root->fs_info->sb->s_flags & MS_RDONLY)
return -EROFS;
+ mutex_lock(&root->fs_info->volume_mutex);
+ if (root->fs_info->balance_ctl) {
+ printk(KERN_INFO "btrfs: balance in progress\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
vol_args = memdup_user(arg, sizeof(*vol_args));
- if (IS_ERR(vol_args))
- return PTR_ERR(vol_args);
+ if (IS_ERR(vol_args)) {
+ ret = PTR_ERR(vol_args);
+ goto out;
+ }
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
ret = btrfs_rm_device(root, vol_args->name);
kfree(vol_args);
+out:
+ mutex_unlock(&root->fs_info->volume_mutex);
return ret;
}
@@ -2427,7 +2465,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
disko, diskl, 0,
root->root_key.objectid,
btrfs_ino(inode),
- new_key.offset - datao);
+ new_key.offset - datao,
+ 0);
BUG_ON(ret);
}
} else if (type == BTRFS_FILE_EXTENT_INLINE) {
@@ -2977,7 +3016,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
{
int ret = 0;
int size;
- u64 extent_offset;
+ u64 extent_item_pos;
struct btrfs_ioctl_logical_ino_args *loi;
struct btrfs_data_container *inodes = NULL;
struct btrfs_path *path = NULL;
@@ -3008,15 +3047,17 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
}
ret = extent_from_logical(root->fs_info, loi->logical, path, &key);
+ btrfs_release_path(path);
if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
ret = -ENOENT;
if (ret < 0)
goto out;
- extent_offset = loi->logical - key.objectid;
+ extent_item_pos = loi->logical - key.objectid;
ret = iterate_extent_inodes(root->fs_info, path, key.objectid,
- extent_offset, build_ino_list, inodes);
+ extent_item_pos, build_ino_list,
+ inodes);
if (ret < 0)
goto out;
@@ -3034,6 +3075,163 @@ out:
return ret;
}
+void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
+ struct btrfs_ioctl_balance_args *bargs)
+{
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+
+ bargs->flags = bctl->flags;
+
+ if (atomic_read(&fs_info->balance_running))
+ bargs->state |= BTRFS_BALANCE_STATE_RUNNING;
+ if (atomic_read(&fs_info->balance_pause_req))
+ bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ;
+ if (atomic_read(&fs_info->balance_cancel_req))
+ bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ;
+
+ memcpy(&bargs->data, &bctl->data, sizeof(bargs->data));
+ memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
+ memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));
+
+ if (lock) {
+ spin_lock(&fs_info->balance_lock);
+ memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
+ spin_unlock(&fs_info->balance_lock);
+ } else {
+ memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
+ }
+}
+
+static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_ioctl_balance_args *bargs;
+ struct btrfs_balance_control *bctl;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ mutex_lock(&fs_info->volume_mutex);
+ mutex_lock(&fs_info->balance_mutex);
+
+ if (arg) {
+ bargs = memdup_user(arg, sizeof(*bargs));
+ if (IS_ERR(bargs)) {
+ ret = PTR_ERR(bargs);
+ goto out;
+ }
+
+ if (bargs->flags & BTRFS_BALANCE_RESUME) {
+ if (!fs_info->balance_ctl) {
+ ret = -ENOTCONN;
+ goto out_bargs;
+ }
+
+ bctl = fs_info->balance_ctl;
+ spin_lock(&fs_info->balance_lock);
+ bctl->flags |= BTRFS_BALANCE_RESUME;
+ spin_unlock(&fs_info->balance_lock);
+
+ goto do_balance;
+ }
+ } else {
+ bargs = NULL;
+ }
+
+ if (fs_info->balance_ctl) {
+ ret = -EINPROGRESS;
+ goto out_bargs;
+ }
+
+ bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+ if (!bctl) {
+ ret = -ENOMEM;
+ goto out_bargs;
+ }
+
+ bctl->fs_info = fs_info;
+ if (arg) {
+ memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
+ memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
+ memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
+
+ bctl->flags = bargs->flags;
+ } else {
+ /* balance everything - no filters */
+ bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
+ }
+
+do_balance:
+ ret = btrfs_balance(bctl, bargs);
+ /*
+ * bctl is freed in __cancel_balance or in free_fs_info if
+ * restriper was paused all the way until unmount
+ */
+ if (arg) {
+ if (copy_to_user(arg, bargs, sizeof(*bargs)))
+ ret = -EFAULT;
+ }
+
+out_bargs:
+ kfree(bargs);
+out:
+ mutex_unlock(&fs_info->balance_mutex);
+ mutex_unlock(&fs_info->volume_mutex);
+ return ret;
+}
+
+static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case BTRFS_BALANCE_CTL_PAUSE:
+ return btrfs_pause_balance(root->fs_info);
+ case BTRFS_BALANCE_CTL_CANCEL:
+ return btrfs_cancel_balance(root->fs_info);
+ }
+
+ return -EINVAL;
+}
+
+static long btrfs_ioctl_balance_progress(struct btrfs_root *root,
+ void __user *arg)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_ioctl_balance_args *bargs;
+ int ret = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mutex_lock(&fs_info->balance_mutex);
+ if (!fs_info->balance_ctl) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+
+ bargs = kzalloc(sizeof(*bargs), GFP_NOFS);
+ if (!bargs) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ update_ioctl_balance_args(fs_info, 1, bargs);
+
+ if (copy_to_user(arg, bargs, sizeof(*bargs)))
+ ret = -EFAULT;
+
+ kfree(bargs);
+out:
+ mutex_unlock(&fs_info->balance_mutex);
+ return ret;
+}
+
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
@@ -3078,7 +3276,7 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_DEV_INFO:
return btrfs_ioctl_dev_info(root, argp);
case BTRFS_IOC_BALANCE:
- return btrfs_balance(root->fs_info->dev_root);
+ return btrfs_ioctl_balance(root, NULL);
case BTRFS_IOC_CLONE:
return btrfs_ioctl_clone(file, arg, 0, 0, 0);
case BTRFS_IOC_CLONE_RANGE:
@@ -3110,6 +3308,12 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_scrub_cancel(root, argp);
case BTRFS_IOC_SCRUB_PROGRESS:
return btrfs_ioctl_scrub_progress(root, argp);
+ case BTRFS_IOC_BALANCE_V2:
+ return btrfs_ioctl_balance(root, argp);
+ case BTRFS_IOC_BALANCE_CTL:
+ return btrfs_ioctl_balance_ctl(root, arg);
+ case BTRFS_IOC_BALANCE_PROGRESS:
+ return btrfs_ioctl_balance_progress(root, argp);
}
return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 252ae9915de..4f69028a68c 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -109,6 +109,55 @@ struct btrfs_ioctl_fs_info_args {
__u64 reserved[124]; /* pad to 1k */
};
+/* balance control ioctl modes */
+#define BTRFS_BALANCE_CTL_PAUSE 1
+#define BTRFS_BALANCE_CTL_CANCEL 2
+
+/*
+ * this is packed, because it should be exactly the same as its disk
+ * byte order counterpart (struct btrfs_disk_balance_args)
+ */
+struct btrfs_balance_args {
+ __u64 profiles;
+ __u64 usage;
+ __u64 devid;
+ __u64 pstart;
+ __u64 pend;
+ __u64 vstart;
+ __u64 vend;
+
+ __u64 target;
+
+ __u64 flags;
+
+ __u64 unused[8];
+} __attribute__ ((__packed__));
+
+/* report balance progress to userspace */
+struct btrfs_balance_progress {
+ __u64 expected; /* estimated # of chunks that will be
+ * relocated to fulfill the request */
+ __u64 considered; /* # of chunks we have considered so far */
+ __u64 completed; /* # of chunks relocated so far */
+};
+
+#define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0)
+#define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1)
+#define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2)
+
+struct btrfs_ioctl_balance_args {
+ __u64 flags; /* in/out */
+ __u64 state; /* out */
+
+ struct btrfs_balance_args data; /* in/out */
+ struct btrfs_balance_args meta; /* in/out */
+ struct btrfs_balance_args sys; /* in/out */
+
+ struct btrfs_balance_progress stat; /* out */
+
+ __u64 unused[72]; /* pad to 1k */
+};
+
#define BTRFS_INO_LOOKUP_PATH_MAX 4080
struct btrfs_ioctl_ino_lookup_args {
__u64 treeid;
@@ -272,6 +321,11 @@ struct btrfs_ioctl_logical_ino_args {
struct btrfs_ioctl_dev_info_args)
#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
struct btrfs_ioctl_fs_info_args)
+#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \
+ struct btrfs_ioctl_balance_args)
+#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int)
+#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \
+ struct btrfs_ioctl_balance_args)
#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
struct btrfs_ioctl_ino_path_args)
#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index d77b67c4b27..5e178d8f716 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -33,6 +33,14 @@ void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
*/
void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
{
+ if (eb->lock_nested) {
+ read_lock(&eb->lock);
+ if (eb->lock_nested && current->pid == eb->lock_owner) {
+ read_unlock(&eb->lock);
+ return;
+ }
+ read_unlock(&eb->lock);
+ }
if (rw == BTRFS_WRITE_LOCK) {
if (atomic_read(&eb->blocking_writers) == 0) {
WARN_ON(atomic_read(&eb->spinning_writers) != 1);
@@ -57,6 +65,14 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
*/
void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
{
+ if (eb->lock_nested) {
+ read_lock(&eb->lock);
+ if (&eb->lock_nested && current->pid == eb->lock_owner) {
+ read_unlock(&eb->lock);
+ return;
+ }
+ read_unlock(&eb->lock);
+ }
if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
BUG_ON(atomic_read(&eb->blocking_writers) != 1);
write_lock(&eb->lock);
@@ -81,12 +97,25 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
void btrfs_tree_read_lock(struct extent_buffer *eb)
{
again:
+ read_lock(&eb->lock);
+ if (atomic_read(&eb->blocking_writers) &&
+ current->pid == eb->lock_owner) {
+ /*
+ * This extent is already write-locked by our thread. We allow
+ * an additional read lock to be added because it's for the same
+ * thread. btrfs_find_all_roots() depends on this as it may be
+ * called on a partly (write-)locked tree.
+ */
+ BUG_ON(eb->lock_nested);
+ eb->lock_nested = 1;
+ read_unlock(&eb->lock);
+ return;
+ }
+ read_unlock(&eb->lock);
wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
read_lock(&eb->lock);
if (atomic_read(&eb->blocking_writers)) {
read_unlock(&eb->lock);
- wait_event(eb->write_lock_wq,
- atomic_read(&eb->blocking_writers) == 0);
goto again;
}
atomic_inc(&eb->read_locks);
@@ -129,6 +158,7 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
}
atomic_inc(&eb->write_locks);
atomic_inc(&eb->spinning_writers);
+ eb->lock_owner = current->pid;
return 1;
}
@@ -137,6 +167,15 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
*/
void btrfs_tree_read_unlock(struct extent_buffer *eb)
{
+ if (eb->lock_nested) {
+ read_lock(&eb->lock);
+ if (eb->lock_nested && current->pid == eb->lock_owner) {
+ eb->lock_nested = 0;
+ read_unlock(&eb->lock);
+ return;
+ }
+ read_unlock(&eb->lock);
+ }
btrfs_assert_tree_read_locked(eb);
WARN_ON(atomic_read(&eb->spinning_readers) == 0);
atomic_dec(&eb->spinning_readers);
@@ -149,6 +188,15 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
*/
void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
{
+ if (eb->lock_nested) {
+ read_lock(&eb->lock);
+ if (eb->lock_nested && current->pid == eb->lock_owner) {
+ eb->lock_nested = 0;
+ read_unlock(&eb->lock);
+ return;
+ }
+ read_unlock(&eb->lock);
+ }
btrfs_assert_tree_read_locked(eb);
WARN_ON(atomic_read(&eb->blocking_readers) == 0);
if (atomic_dec_and_test(&eb->blocking_readers))
@@ -181,6 +229,7 @@ again:
WARN_ON(atomic_read(&eb->spinning_writers));
atomic_inc(&eb->spinning_writers);
atomic_inc(&eb->write_locks);
+ eb->lock_owner = current->pid;
return 0;
}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index cfb55434a46..8c1aae2c845 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1604,12 +1604,12 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
num_bytes, parent,
btrfs_header_owner(leaf),
- key.objectid, key.offset);
+ key.objectid, key.offset, 1);
BUG_ON(ret);
ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
parent, btrfs_header_owner(leaf),
- key.objectid, key.offset);
+ key.objectid, key.offset, 1);
BUG_ON(ret);
}
if (dirty)
@@ -1778,21 +1778,23 @@ again:
ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
path->nodes[level]->start,
- src->root_key.objectid, level - 1, 0);
+ src->root_key.objectid, level - 1, 0,
+ 1);
BUG_ON(ret);
ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
0, dest->root_key.objectid, level - 1,
- 0);
+ 0, 1);
BUG_ON(ret);
ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
path->nodes[level]->start,
- src->root_key.objectid, level - 1, 0);
+ src->root_key.objectid, level - 1, 0,
+ 1);
BUG_ON(ret);
ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
0, dest->root_key.objectid, level - 1,
- 0);
+ 0, 1);
BUG_ON(ret);
btrfs_unlock_up_safe(path, 0);
@@ -2244,7 +2246,7 @@ again:
} else {
list_del_init(&reloc_root->root_list);
}
- btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0);
+ btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
}
if (found) {
@@ -2558,7 +2560,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
node->eb->start, blocksize,
upper->eb->start,
btrfs_header_owner(upper->eb),
- node->level, 0);
+ node->level, 0, 1);
BUG_ON(ret);
ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
@@ -2947,9 +2949,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
while (index <= last_index) {
- mutex_lock(&inode->i_mutex);
ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
- mutex_unlock(&inode->i_mutex);
if (ret)
goto out;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ddf2c90d3fc..9770cc5bfb7 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -25,6 +25,7 @@
#include "transaction.h"
#include "backref.h"
#include "extent_io.h"
+#include "check-integrity.h"
/*
* This is only the first step towards a full-features scrub. It reads all
@@ -309,7 +310,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
u8 ref_level;
unsigned long ptr = 0;
const int bufsize = 4096;
- u64 extent_offset;
+ u64 extent_item_pos;
path = btrfs_alloc_path();
@@ -329,12 +330,13 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
if (ret < 0)
goto out;
- extent_offset = swarn.logical - found_key.objectid;
+ extent_item_pos = swarn.logical - found_key.objectid;
swarn.extent_item_size = found_key.offset;
eb = path->nodes[0];
ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
item_size = btrfs_item_size_nr(eb, path->slots[0]);
+ btrfs_release_path(path);
if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
do {
@@ -351,7 +353,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
} else {
swarn.path = path;
iterate_extent_inodes(fs_info, path, found_key.objectid,
- extent_offset,
+ extent_item_pos,
scrub_print_warning_inode, &swarn);
}
@@ -732,7 +734,7 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
bio_add_page(bio, page, PAGE_SIZE, 0);
bio->bi_end_io = scrub_fixup_end_io;
bio->bi_private = &complete;
- submit_bio(rw, bio);
+ btrfsic_submit_bio(rw, bio);
/* this will also unplug the queue */
wait_for_completion(&complete);
@@ -958,7 +960,7 @@ static int scrub_submit(struct scrub_dev *sdev)
sdev->curr = -1;
atomic_inc(&sdev->in_flight);
- submit_bio(READ, sbio->bio);
+ btrfsic_submit_bio(READ, sbio->bio);
return 0;
}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index ae488aa1966..3ce97b217cb 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -147,13 +147,13 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
static void btrfs_put_super(struct super_block *sb)
{
- struct btrfs_root *root = btrfs_sb(sb);
- int ret;
-
- ret = close_ctree(root);
- sb->s_fs_info = NULL;
-
- (void)ret; /* FIXME: need to fix VFS to return error? */
+ (void)close_ctree(btrfs_sb(sb)->tree_root);
+ /* FIXME: need to fix VFS to return error? */
+ /* AV: return it _where_? ->put_super() can be triggered by any number
+ * of async events, up to and including delivery of SIGKILL to the
+ * last process that kept it busy. Or segfault in the aforementioned
+ * process... Whom would you report that to?
+ */
}
enum {
@@ -163,8 +163,11 @@ enum {
Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
- Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
- Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err,
+ Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
+ Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
+ Opt_check_integrity, Opt_check_integrity_including_extent_data,
+ Opt_check_integrity_print_mask,
+ Opt_err,
};
static match_table_t tokens = {
@@ -199,6 +202,10 @@ static match_table_t tokens = {
{Opt_inode_cache, "inode_cache"},
{Opt_no_space_cache, "nospace_cache"},
{Opt_recovery, "recovery"},
+ {Opt_skip_balance, "skip_balance"},
+ {Opt_check_integrity, "check_int"},
+ {Opt_check_integrity_including_extent_data, "check_int_data"},
+ {Opt_check_integrity_print_mask, "check_int_print_mask=%d"},
{Opt_err, NULL},
};
@@ -397,6 +404,40 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
printk(KERN_INFO "btrfs: enabling auto recovery");
btrfs_set_opt(info->mount_opt, RECOVERY);
break;
+ case Opt_skip_balance:
+ btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
+ break;
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ case Opt_check_integrity_including_extent_data:
+ printk(KERN_INFO "btrfs: enabling check integrity"
+ " including extent data\n");
+ btrfs_set_opt(info->mount_opt,
+ CHECK_INTEGRITY_INCLUDING_EXTENT_DATA);
+ btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
+ break;
+ case Opt_check_integrity:
+ printk(KERN_INFO "btrfs: enabling check integrity\n");
+ btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
+ break;
+ case Opt_check_integrity_print_mask:
+ intarg = 0;
+ match_int(&args[0], &intarg);
+ if (intarg) {
+ info->check_integrity_print_mask = intarg;
+ printk(KERN_INFO "btrfs:"
+ " check_integrity_print_mask 0x%x\n",
+ info->check_integrity_print_mask);
+ }
+ break;
+#else
+ case Opt_check_integrity_including_extent_data:
+ case Opt_check_integrity:
+ case Opt_check_integrity_print_mask:
+ printk(KERN_ERR "btrfs: support for check_integrity*"
+ " not compiled in!\n");
+ ret = -EINVAL;
+ goto out;
+#endif
case Opt_err:
printk(KERN_INFO "btrfs: unrecognized mount option "
"'%s'\n", p);
@@ -500,7 +541,8 @@ out:
static struct dentry *get_default_root(struct super_block *sb,
u64 subvol_objectid)
{
- struct btrfs_root *root = sb->s_fs_info;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_root *root = fs_info->tree_root;
struct btrfs_root *new_root;
struct btrfs_dir_item *di;
struct btrfs_path *path;
@@ -530,7 +572,7 @@ static struct dentry *get_default_root(struct super_block *sb,
* will mount by default if we haven't been given a specific subvolume
* to mount.
*/
- dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
+ dir_id = btrfs_super_root_dir(fs_info->super_copy);
di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
if (IS_ERR(di)) {
btrfs_free_path(path);
@@ -544,7 +586,7 @@ static struct dentry *get_default_root(struct super_block *sb,
*/
btrfs_free_path(path);
dir_id = BTRFS_FIRST_FREE_OBJECTID;
- new_root = root->fs_info->fs_root;
+ new_root = fs_info->fs_root;
goto setup_root;
}
@@ -552,7 +594,7 @@ static struct dentry *get_default_root(struct super_block *sb,
btrfs_free_path(path);
find_root:
- new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
+ new_root = btrfs_read_fs_root_no_name(fs_info, &location);
if (IS_ERR(new_root))
return ERR_CAST(new_root);
@@ -588,7 +630,7 @@ static int btrfs_fill_super(struct super_block *sb,
{
struct inode *inode;
struct dentry *root_dentry;
- struct btrfs_root *tree_root;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_key key;
int err;
@@ -603,18 +645,16 @@ static int btrfs_fill_super(struct super_block *sb,
sb->s_flags |= MS_POSIXACL;
#endif
- tree_root = open_ctree(sb, fs_devices, (char *)data);
-
- if (IS_ERR(tree_root)) {
+ err = open_ctree(sb, fs_devices, (char *)data);
+ if (err) {
printk("btrfs: open_ctree failed\n");
- return PTR_ERR(tree_root);
+ return err;
}
- sb->s_fs_info = tree_root;
key.objectid = BTRFS_FIRST_FREE_OBJECTID;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL);
+ inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto fail_close;
@@ -631,23 +671,25 @@ static int btrfs_fill_super(struct super_block *sb,
save_mount_options(sb, data);
cleancache_init_fs(sb);
+ sb->s_flags |= MS_ACTIVE;
return 0;
fail_close:
- close_ctree(tree_root);
+ close_ctree(fs_info->tree_root);
return err;
}
int btrfs_sync_fs(struct super_block *sb, int wait)
{
struct btrfs_trans_handle *trans;
- struct btrfs_root *root = btrfs_sb(sb);
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_root *root = fs_info->tree_root;
int ret;
trace_btrfs_sync_fs(wait);
if (!wait) {
- filemap_flush(root->fs_info->btree_inode->i_mapping);
+ filemap_flush(fs_info->btree_inode->i_mapping);
return 0;
}
@@ -663,8 +705,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
{
- struct btrfs_root *root = btrfs_sb(dentry->d_sb);
- struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
+ struct btrfs_root *root = info->tree_root;
char *compress_type;
if (btrfs_test_opt(root, DEGRADED))
@@ -722,28 +764,25 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",autodefrag");
if (btrfs_test_opt(root, INODE_MAP_CACHE))
seq_puts(seq, ",inode_cache");
+ if (btrfs_test_opt(root, SKIP_BALANCE))
+ seq_puts(seq, ",skip_balance");
return 0;
}
static int btrfs_test_super(struct super_block *s, void *data)
{
- struct btrfs_root *test_root = data;
- struct btrfs_root *root = btrfs_sb(s);
+ struct btrfs_fs_info *p = data;
+ struct btrfs_fs_info *fs_info = btrfs_sb(s);
- /*
- * If this super block is going away, return false as it
- * can't match as an existing super block.
- */
- if (!atomic_read(&s->s_active))
- return 0;
- return root->fs_info->fs_devices == test_root->fs_info->fs_devices;
+ return fs_info->fs_devices == p->fs_devices;
}
static int btrfs_set_super(struct super_block *s, void *data)
{
- s->s_fs_info = data;
-
- return set_anon_super(s, data);
+ int err = set_anon_super(s, data);
+ if (!err)
+ s->s_fs_info = data;
+ return err;
}
/*
@@ -903,12 +942,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
if (!fs_info)
return ERR_PTR(-ENOMEM);
- fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
- if (!fs_info->tree_root) {
- error = -ENOMEM;
- goto error_fs_info;
- }
- fs_info->tree_root->fs_info = fs_info;
fs_info->fs_devices = fs_devices;
fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
@@ -928,43 +961,30 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
}
bdev = fs_devices->latest_bdev;
- s = sget(fs_type, btrfs_test_super, btrfs_set_super,
- fs_info->tree_root);
+ s = sget(fs_type, btrfs_test_super, btrfs_set_super, fs_info);
if (IS_ERR(s)) {
error = PTR_ERR(s);
goto error_close_devices;
}
if (s->s_root) {
- if ((flags ^ s->s_flags) & MS_RDONLY) {
- deactivate_locked_super(s);
- error = -EBUSY;
- goto error_close_devices;
- }
-
btrfs_close_devices(fs_devices);
free_fs_info(fs_info);
+ if ((flags ^ s->s_flags) & MS_RDONLY)
+ error = -EBUSY;
} else {
char b[BDEVNAME_SIZE];
s->s_flags = flags | MS_NOSEC;
strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
- btrfs_sb(s)->fs_info->bdev_holder = fs_type;
+ btrfs_sb(s)->bdev_holder = fs_type;
error = btrfs_fill_super(s, fs_devices, data,
flags & MS_SILENT ? 1 : 0);
- if (error) {
- deactivate_locked_super(s);
- return ERR_PTR(error);
- }
-
- s->s_flags |= MS_ACTIVE;
}
- root = get_default_root(s, subvol_objectid);
- if (IS_ERR(root)) {
+ root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error);
+ if (IS_ERR(root))
deactivate_locked_super(s);
- return root;
- }
return root;
@@ -977,7 +997,8 @@ error_fs_info:
static int btrfs_remount(struct super_block *sb, int *flags, char *data)
{
- struct btrfs_root *root = btrfs_sb(sb);
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_root *root = fs_info->tree_root;
int ret;
ret = btrfs_parse_options(root, data);
@@ -993,13 +1014,13 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
ret = btrfs_commit_super(root);
WARN_ON(ret);
} else {
- if (root->fs_info->fs_devices->rw_devices == 0)
+ if (fs_info->fs_devices->rw_devices == 0)
return -EACCES;
- if (btrfs_super_log_root(root->fs_info->super_copy) != 0)
+ if (btrfs_super_log_root(fs_info->super_copy) != 0)
return -EINVAL;
- ret = btrfs_cleanup_fs_roots(root->fs_info);
+ ret = btrfs_cleanup_fs_roots(fs_info);
WARN_ON(ret);
/* recover relocation */
@@ -1168,18 +1189,18 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
- struct btrfs_root *root = btrfs_sb(dentry->d_sb);
- struct btrfs_super_block *disk_super = root->fs_info->super_copy;
- struct list_head *head = &root->fs_info->space_info;
+ struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
+ struct list_head *head = &fs_info->space_info;
struct btrfs_space_info *found;
u64 total_used = 0;
u64 total_free_data = 0;
int bits = dentry->d_sb->s_blocksize_bits;
- __be32 *fsid = (__be32 *)root->fs_info->fsid;
+ __be32 *fsid = (__be32 *)fs_info->fsid;
int ret;
/* holding chunk_muext to avoid allocating new chunks */
- mutex_lock(&root->fs_info->chunk_mutex);
+ mutex_lock(&fs_info->chunk_mutex);
rcu_read_lock();
list_for_each_entry_rcu(found, head, list) {
if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
@@ -1198,14 +1219,14 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bsize = dentry->d_sb->s_blocksize;
buf->f_type = BTRFS_SUPER_MAGIC;
buf->f_bavail = total_free_data;
- ret = btrfs_calc_avail_data_space(root, &total_free_data);
+ ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
if (ret) {
- mutex_unlock(&root->fs_info->chunk_mutex);
+ mutex_unlock(&fs_info->chunk_mutex);
return ret;
}
buf->f_bavail += total_free_data;
buf->f_bavail = buf->f_bavail >> bits;
- mutex_unlock(&root->fs_info->chunk_mutex);
+ mutex_unlock(&fs_info->chunk_mutex);
/* We treat it as constant endianness (it doesn't matter _which_)
because we want the fsid to come out the same whether mounted
@@ -1219,11 +1240,18 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
+static void btrfs_kill_super(struct super_block *sb)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ kill_anon_super(sb);
+ free_fs_info(fs_info);
+}
+
static struct file_system_type btrfs_fs_type = {
.owner = THIS_MODULE,
.name = "btrfs",
.mount = btrfs_mount,
- .kill_sb = kill_anon_super,
+ .kill_sb = btrfs_kill_super,
.fs_flags = FS_REQUIRES_DEV,
};
@@ -1257,17 +1285,17 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
static int btrfs_freeze(struct super_block *sb)
{
- struct btrfs_root *root = btrfs_sb(sb);
- mutex_lock(&root->fs_info->transaction_kthread_mutex);
- mutex_lock(&root->fs_info->cleaner_mutex);
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ mutex_lock(&fs_info->transaction_kthread_mutex);
+ mutex_lock(&fs_info->cleaner_mutex);
return 0;
}
static int btrfs_unfreeze(struct super_block *sb)
{
- struct btrfs_root *root = btrfs_sb(sb);
- mutex_unlock(&root->fs_info->cleaner_mutex);
- mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ mutex_unlock(&fs_info->cleaner_mutex);
+ mutex_unlock(&fs_info->transaction_kthread_mutex);
return 0;
}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 81376d94cd3..287a6728b1a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -36,6 +36,8 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
WARN_ON(atomic_read(&transaction->use_count) == 0);
if (atomic_dec_and_test(&transaction->use_count)) {
BUG_ON(!list_empty(&transaction->list));
+ WARN_ON(transaction->delayed_refs.root.rb_node);
+ WARN_ON(!list_empty(&transaction->delayed_refs.seq_head));
memset(transaction, 0, sizeof(*transaction));
kmem_cache_free(btrfs_transaction_cachep, transaction);
}
@@ -108,8 +110,11 @@ loop:
cur_trans->delayed_refs.num_heads = 0;
cur_trans->delayed_refs.flushing = 0;
cur_trans->delayed_refs.run_delayed_start = 0;
+ cur_trans->delayed_refs.seq = 1;
+ init_waitqueue_head(&cur_trans->delayed_refs.seq_wait);
spin_lock_init(&cur_trans->commit_lock);
spin_lock_init(&cur_trans->delayed_refs.lock);
+ INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
@@ -321,6 +326,8 @@ again:
}
if (num_bytes) {
+ trace_btrfs_space_reservation(root->fs_info, "transaction",
+ (u64)h, num_bytes, 1);
h->block_rsv = &root->fs_info->trans_block_rsv;
h->bytes_reserved = num_bytes;
}
@@ -467,19 +474,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
- while (count < 4) {
+ while (count < 2) {
unsigned long cur = trans->delayed_ref_updates;
trans->delayed_ref_updates = 0;
if (cur &&
trans->transaction->delayed_refs.num_heads_ready > 64) {
trans->delayed_ref_updates = 0;
-
- /*
- * do a full flush if the transaction is trying
- * to close
- */
- if (trans->transaction->delayed_refs.flushing)
- cur = 0;
btrfs_run_delayed_refs(trans, root, cur);
} else {
break;
@@ -1393,9 +1393,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
if (btrfs_header_backref_rev(root->node) <
BTRFS_MIXED_BACKREF_REV)
- btrfs_drop_snapshot(root, NULL, 0);
+ btrfs_drop_snapshot(root, NULL, 0, 0);
else
- btrfs_drop_snapshot(root, NULL, 1);
+ btrfs_drop_snapshot(root, NULL, 1, 0);
}
return 0;
}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 3568374d419..966cc74f5d6 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -589,7 +589,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ret = btrfs_inc_extent_ref(trans, root,
ins.objectid, ins.offset,
0, root->root_key.objectid,
- key->objectid, offset);
+ key->objectid, offset, 0);
BUG_ON(ret);
} else {
/*
@@ -1957,7 +1957,8 @@ static int wait_log_commit(struct btrfs_trans_handle *trans,
finish_wait(&root->log_commit_wait[index], &wait);
mutex_lock(&root->log_mutex);
- } while (root->log_transid < transid + 2 &&
+ } while (root->fs_info->last_trans_log_full_commit !=
+ trans->transid && root->log_transid < transid + 2 &&
atomic_read(&root->log_commit[index]));
return 0;
}
@@ -1966,7 +1967,8 @@ static int wait_for_writer(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
DEFINE_WAIT(wait);
- while (atomic_read(&root->log_writers)) {
+ while (root->fs_info->last_trans_log_full_commit !=
+ trans->transid && atomic_read(&root->log_writers)) {
prepare_to_wait(&root->log_writer_wait,
&wait, TASK_UNINTERRUPTIBLE);
mutex_unlock(&root->log_mutex);
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
new file mode 100644
index 00000000000..12f5147bd2b
--- /dev/null
+++ b/fs/btrfs/ulist.c
@@ -0,0 +1,220 @@
+/*
+ * Copyright (C) 2011 STRATO AG
+ * written by Arne Jansen <sensille@gmx.net>
+ * Distributed under the GNU GPL license version 2.
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include "ulist.h"
+
+/*
+ * ulist is a generic data structure to hold a collection of unique u64
+ * values. The only operations it supports is adding to the list and
+ * enumerating it.
+ * It is possible to store an auxiliary value along with the key.
+ *
+ * The implementation is preliminary and can probably be sped up
+ * significantly. A first step would be to store the values in an rbtree
+ * as soon as ULIST_SIZE is exceeded.
+ *
+ * A sample usage for ulists is the enumeration of directed graphs without
+ * visiting a node twice. The pseudo-code could look like this:
+ *
+ * ulist = ulist_alloc();
+ * ulist_add(ulist, root);
+ * elem = NULL;
+ *
+ * while ((elem = ulist_next(ulist, elem)) {
+ * for (all child nodes n in elem)
+ * ulist_add(ulist, n);
+ * do something useful with the node;
+ * }
+ * ulist_free(ulist);
+ *
+ * This assumes the graph nodes are adressable by u64. This stems from the
+ * usage for tree enumeration in btrfs, where the logical addresses are
+ * 64 bit.
+ *
+ * It is also useful for tree enumeration which could be done elegantly
+ * recursively, but is not possible due to kernel stack limitations. The
+ * loop would be similar to the above.
+ */
+
+/**
+ * ulist_init - freshly initialize a ulist
+ * @ulist: the ulist to initialize
+ *
+ * Note: don't use this function to init an already used ulist, use
+ * ulist_reinit instead.
+ */
+void ulist_init(struct ulist *ulist)
+{
+ ulist->nnodes = 0;
+ ulist->nodes = ulist->int_nodes;
+ ulist->nodes_alloced = ULIST_SIZE;
+}
+EXPORT_SYMBOL(ulist_init);
+
+/**
+ * ulist_fini - free up additionally allocated memory for the ulist
+ * @ulist: the ulist from which to free the additional memory
+ *
+ * This is useful in cases where the base 'struct ulist' has been statically
+ * allocated.
+ */
+void ulist_fini(struct ulist *ulist)
+{
+ /*
+ * The first ULIST_SIZE elements are stored inline in struct ulist.
+ * Only if more elements are alocated they need to be freed.
+ */
+ if (ulist->nodes_alloced > ULIST_SIZE)
+ kfree(ulist->nodes);
+ ulist->nodes_alloced = 0; /* in case ulist_fini is called twice */
+}
+EXPORT_SYMBOL(ulist_fini);
+
+/**
+ * ulist_reinit - prepare a ulist for reuse
+ * @ulist: ulist to be reused
+ *
+ * Free up all additional memory allocated for the list elements and reinit
+ * the ulist.
+ */
+void ulist_reinit(struct ulist *ulist)
+{
+ ulist_fini(ulist);
+ ulist_init(ulist);
+}
+EXPORT_SYMBOL(ulist_reinit);
+
+/**
+ * ulist_alloc - dynamically allocate a ulist
+ * @gfp_mask: allocation flags to for base allocation
+ *
+ * The allocated ulist will be returned in an initialized state.
+ */
+struct ulist *ulist_alloc(unsigned long gfp_mask)
+{
+ struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask);
+
+ if (!ulist)
+ return NULL;
+
+ ulist_init(ulist);
+
+ return ulist;
+}
+EXPORT_SYMBOL(ulist_alloc);
+
+/**
+ * ulist_free - free dynamically allocated ulist
+ * @ulist: ulist to free
+ *
+ * It is not necessary to call ulist_fini before.
+ */
+void ulist_free(struct ulist *ulist)
+{
+ if (!ulist)
+ return;
+ ulist_fini(ulist);
+ kfree(ulist);
+}
+EXPORT_SYMBOL(ulist_free);
+
+/**
+ * ulist_add - add an element to the ulist
+ * @ulist: ulist to add the element to
+ * @val: value to add to ulist
+ * @aux: auxiliary value to store along with val
+ * @gfp_mask: flags to use for allocation
+ *
+ * Note: locking must be provided by the caller. In case of rwlocks write
+ * locking is needed
+ *
+ * Add an element to a ulist. The @val will only be added if it doesn't
+ * already exist. If it is added, the auxiliary value @aux is stored along with
+ * it. In case @val already exists in the ulist, @aux is ignored, even if
+ * it differs from the already stored value.
+ *
+ * ulist_add returns 0 if @val already exists in ulist and 1 if @val has been
+ * inserted.
+ * In case of allocation failure -ENOMEM is returned and the ulist stays
+ * unaltered.
+ */
+int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
+ unsigned long gfp_mask)
+{
+ int i;
+
+ for (i = 0; i < ulist->nnodes; ++i) {
+ if (ulist->nodes[i].val == val)
+ return 0;
+ }
+
+ if (ulist->nnodes >= ulist->nodes_alloced) {
+ u64 new_alloced = ulist->nodes_alloced + 128;
+ struct ulist_node *new_nodes;
+ void *old = NULL;
+
+ /*
+ * if nodes_alloced == ULIST_SIZE no memory has been allocated
+ * yet, so pass NULL to krealloc
+ */
+ if (ulist->nodes_alloced > ULIST_SIZE)
+ old = ulist->nodes;
+
+ new_nodes = krealloc(old, sizeof(*new_nodes) * new_alloced,
+ gfp_mask);
+ if (!new_nodes)
+ return -ENOMEM;
+
+ if (!old)
+ memcpy(new_nodes, ulist->int_nodes,
+ sizeof(ulist->int_nodes));
+
+ ulist->nodes = new_nodes;
+ ulist->nodes_alloced = new_alloced;
+ }
+ ulist->nodes[ulist->nnodes].val = val;
+ ulist->nodes[ulist->nnodes].aux = aux;
+ ++ulist->nnodes;
+
+ return 1;
+}
+EXPORT_SYMBOL(ulist_add);
+
+/**
+ * ulist_next - iterate ulist
+ * @ulist: ulist to iterate
+ * @prev: previously returned element or %NULL to start iteration
+ *
+ * Note: locking must be provided by the caller. In case of rwlocks only read
+ * locking is needed
+ *
+ * This function is used to iterate an ulist. The iteration is started with
+ * @prev = %NULL. It returns the next element from the ulist or %NULL when the
+ * end is reached. No guarantee is made with respect to the order in which
+ * the elements are returned. They might neither be returned in order of
+ * addition nor in ascending order.
+ * It is allowed to call ulist_add during an enumeration. Newly added items
+ * are guaranteed to show up in the running enumeration.
+ */
+struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev)
+{
+ int next;
+
+ if (ulist->nnodes == 0)
+ return NULL;
+
+ if (!prev)
+ return &ulist->nodes[0];
+
+ next = (prev - ulist->nodes) + 1;
+ if (next < 0 || next >= ulist->nnodes)
+ return NULL;
+
+ return &ulist->nodes[next];
+}
+EXPORT_SYMBOL(ulist_next);
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
new file mode 100644
index 00000000000..2e25dec58ec
--- /dev/null
+++ b/fs/btrfs/ulist.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2011 STRATO AG
+ * written by Arne Jansen <sensille@gmx.net>
+ * Distributed under the GNU GPL license version 2.
+ *
+ */
+
+#ifndef __ULIST__
+#define __ULIST__
+
+/*
+ * ulist is a generic data structure to hold a collection of unique u64
+ * values. The only operations it supports is adding to the list and
+ * enumerating it.
+ * It is possible to store an auxiliary value along with the key.
+ *
+ * The implementation is preliminary and can probably be sped up
+ * significantly. A first step would be to store the values in an rbtree
+ * as soon as ULIST_SIZE is exceeded.
+ */
+
+/*
+ * number of elements statically allocated inside struct ulist
+ */
+#define ULIST_SIZE 16
+
+/*
+ * element of the list
+ */
+struct ulist_node {
+ u64 val; /* value to store */
+ unsigned long aux; /* auxiliary value saved along with the val */
+};
+
+struct ulist {
+ /*
+ * number of elements stored in list
+ */
+ unsigned long nnodes;
+
+ /*
+ * number of nodes we already have room for
+ */
+ unsigned long nodes_alloced;
+
+ /*
+ * pointer to the array storing the elements. The first ULIST_SIZE
+ * elements are stored inline. In this case the it points to int_nodes.
+ * After exceeding ULIST_SIZE, dynamic memory is allocated.
+ */
+ struct ulist_node *nodes;
+
+ /*
+ * inline storage space for the first ULIST_SIZE entries
+ */
+ struct ulist_node int_nodes[ULIST_SIZE];
+};
+
+void ulist_init(struct ulist *ulist);
+void ulist_fini(struct ulist *ulist);
+void ulist_reinit(struct ulist *ulist);
+struct ulist *ulist_alloc(unsigned long gfp_mask);
+void ulist_free(struct ulist *ulist);
+int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
+ unsigned long gfp_mask);
+struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev);
+
+#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f4b839fd3c9..0b4e2af7954 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -23,6 +23,7 @@
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
+#include <linux/kthread.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
@@ -32,6 +33,7 @@
#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"
+#include "check-integrity.h"
static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -246,7 +248,7 @@ loop_lock:
sync_pending = 0;
}
- submit_bio(cur->bi_rw, cur);
+ btrfsic_submit_bio(cur->bi_rw, cur);
num_run++;
batch_run++;
if (need_resched())
@@ -706,8 +708,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
u64 devid;
u64 transid;
- mutex_lock(&uuid_mutex);
-
flags |= FMODE_EXCL;
bdev = blkdev_get_by_path(path, flags, holder);
@@ -716,6 +716,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
goto error;
}
+ mutex_lock(&uuid_mutex);
ret = set_blocksize(bdev, 4096);
if (ret)
goto error_close;
@@ -737,9 +738,9 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
brelse(bh);
error_close:
+ mutex_unlock(&uuid_mutex);
blkdev_put(bdev, flags);
error:
- mutex_unlock(&uuid_mutex);
return ret;
}
@@ -829,7 +830,6 @@ out:
/*
* find_free_dev_extent - find free space in the specified device
- * @trans: transaction handler
* @device: the device which we search the free space in
* @num_bytes: the size of the free space that we need
* @start: store the start of the free space.
@@ -848,8 +848,7 @@ out:
* But if we don't find suitable free space, it is used to store the size of
* the max free space.
*/
-int find_free_dev_extent(struct btrfs_trans_handle *trans,
- struct btrfs_device *device, u64 num_bytes,
+int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *len)
{
struct btrfs_key key;
@@ -893,7 +892,7 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
key.offset = search_start;
key.type = BTRFS_DEV_EXTENT_KEY;
- ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
if (ret > 0) {
@@ -1282,7 +1281,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
bool clear_super = false;
mutex_lock(&uuid_mutex);
- mutex_lock(&root->fs_info->volume_mutex);
all_avail = root->fs_info->avail_data_alloc_bits |
root->fs_info->avail_system_alloc_bits |
@@ -1452,7 +1450,6 @@ error_close:
if (bdev)
blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
out:
- mutex_unlock(&root->fs_info->volume_mutex);
mutex_unlock(&uuid_mutex);
return ret;
error_undo:
@@ -1469,8 +1466,7 @@ error_undo:
/*
* does all the dirty work required for changing file system's UUID.
*/
-static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+static int btrfs_prepare_sprout(struct btrfs_root *root)
{
struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
struct btrfs_fs_devices *old_devices;
@@ -1629,7 +1625,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
}
filemap_write_and_wait(bdev->bd_inode->i_mapping);
- mutex_lock(&root->fs_info->volume_mutex);
devices = &root->fs_info->fs_devices->devices;
/*
@@ -1695,7 +1690,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
if (seeding_dev) {
sb->s_flags &= ~MS_RDONLY;
- ret = btrfs_prepare_sprout(trans, root);
+ ret = btrfs_prepare_sprout(root);
BUG_ON(ret);
}
@@ -1757,8 +1752,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
ret = btrfs_relocate_sys_chunks(root);
BUG_ON(ret);
}
-out:
- mutex_unlock(&root->fs_info->volume_mutex);
+
return ret;
error:
blkdev_put(bdev, FMODE_EXCL);
@@ -1766,7 +1760,7 @@ error:
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
}
- goto out;
+ return ret;
}
static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
@@ -2077,6 +2071,362 @@ error:
return ret;
}
+static int insert_balance_item(struct btrfs_root *root,
+ struct btrfs_balance_control *bctl)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_balance_item *item;
+ struct btrfs_disk_balance_args disk_bargs;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ int ret, err;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ btrfs_free_path(path);
+ return PTR_ERR(trans);
+ }
+
+ key.objectid = BTRFS_BALANCE_OBJECTID;
+ key.type = BTRFS_BALANCE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ sizeof(*item));
+ if (ret)
+ goto out;
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
+
+ memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
+
+ btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
+ btrfs_set_balance_data(leaf, item, &disk_bargs);
+ btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
+ btrfs_set_balance_meta(leaf, item, &disk_bargs);
+ btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
+ btrfs_set_balance_sys(leaf, item, &disk_bargs);
+
+ btrfs_set_balance_flags(leaf, item, bctl->flags);
+
+ btrfs_mark_buffer_dirty(leaf);
+out:
+ btrfs_free_path(path);
+ err = btrfs_commit_transaction(trans, root);
+ if (err && !ret)
+ ret = err;
+ return ret;
+}
+
+static int del_balance_item(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret, err;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ btrfs_free_path(path);
+ return PTR_ERR(trans);
+ }
+
+ key.objectid = BTRFS_BALANCE_OBJECTID;
+ key.type = BTRFS_BALANCE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
+ if (ret > 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = btrfs_del_item(trans, root, path);
+out:
+ btrfs_free_path(path);
+ err = btrfs_commit_transaction(trans, root);
+ if (err && !ret)
+ ret = err;
+ return ret;
+}
+
+/*
+ * This is a heuristic used to reduce the number of chunks balanced on
+ * resume after balance was interrupted.
+ */
+static void update_balance_args(struct btrfs_balance_control *bctl)
+{
+ /*
+ * Turn on soft mode for chunk types that were being converted.
+ */
+ if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
+ bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
+ if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
+ bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
+ if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
+ bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
+
+ /*
+ * Turn on usage filter if is not already used. The idea is
+ * that chunks that we have already balanced should be
+ * reasonably full. Don't do it for chunks that are being
+ * converted - that will keep us from relocating unconverted
+ * (albeit full) chunks.
+ */
+ if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+ bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
+ bctl->data.usage = 90;
+ }
+ if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+ bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
+ bctl->sys.usage = 90;
+ }
+ if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+ bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
+ bctl->meta.usage = 90;
+ }
+}
+
+/*
+ * Should be called with both balance and volume mutexes held to
+ * serialize other volume operations (add_dev/rm_dev/resize) with
+ * restriper. Same goes for unset_balance_control.
+ */
+static void set_balance_control(struct btrfs_balance_control *bctl)
+{
+ struct btrfs_fs_info *fs_info = bctl->fs_info;
+
+ BUG_ON(fs_info->balance_ctl);
+
+ spin_lock(&fs_info->balance_lock);
+ fs_info->balance_ctl = bctl;
+ spin_unlock(&fs_info->balance_lock);
+}
+
+static void unset_balance_control(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+
+ BUG_ON(!fs_info->balance_ctl);
+
+ spin_lock(&fs_info->balance_lock);
+ fs_info->balance_ctl = NULL;
+ spin_unlock(&fs_info->balance_lock);
+
+ kfree(bctl);
+}
+
+/*
+ * Balance filters. Return 1 if chunk should be filtered out
+ * (should not be balanced).
+ */
+static int chunk_profiles_filter(u64 chunk_profile,
+ struct btrfs_balance_args *bargs)
+{
+ chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+ if (chunk_profile == 0)
+ chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+ if (bargs->profiles & chunk_profile)
+ return 0;
+
+ return 1;
+}
+
+static u64 div_factor_fine(u64 num, int factor)
+{
+ if (factor <= 0)
+ return 0;
+ if (factor >= 100)
+ return num;
+
+ num *= factor;
+ do_div(num, 100);
+ return num;
+}
+
+static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+ struct btrfs_balance_args *bargs)
+{
+ struct btrfs_block_group_cache *cache;
+ u64 chunk_used, user_thresh;
+ int ret = 1;
+
+ cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+ chunk_used = btrfs_block_group_used(&cache->item);
+
+ user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
+ if (chunk_used < user_thresh)
+ ret = 0;
+
+ btrfs_put_block_group(cache);
+ return ret;
+}
+
+static int chunk_devid_filter(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk,
+ struct btrfs_balance_args *bargs)
+{
+ struct btrfs_stripe *stripe;
+ int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+ int i;
+
+ for (i = 0; i < num_stripes; i++) {
+ stripe = btrfs_stripe_nr(chunk, i);
+ if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
+ return 0;
+ }
+
+ return 1;
+}
+
+/* [pstart, pend) */
+static int chunk_drange_filter(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk,
+ u64 chunk_offset,
+ struct btrfs_balance_args *bargs)
+{
+ struct btrfs_stripe *stripe;
+ int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+ u64 stripe_offset;
+ u64 stripe_length;
+ int factor;
+ int i;
+
+ if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
+ return 0;
+
+ if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
+ factor = 2;
+ else
+ factor = 1;
+ factor = num_stripes / factor;
+
+ for (i = 0; i < num_stripes; i++) {
+ stripe = btrfs_stripe_nr(chunk, i);
+ if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
+ continue;
+
+ stripe_offset = btrfs_stripe_offset(leaf, stripe);
+ stripe_length = btrfs_chunk_length(leaf, chunk);
+ do_div(stripe_length, factor);
+
+ if (stripe_offset < bargs->pend &&
+ stripe_offset + stripe_length > bargs->pstart)
+ return 0;
+ }
+
+ return 1;
+}
+
+/* [vstart, vend) */
+static int chunk_vrange_filter(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk,
+ u64 chunk_offset,
+ struct btrfs_balance_args *bargs)
+{
+ if (chunk_offset < bargs->vend &&
+ chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
+ /* at least part of the chunk is inside this vrange */
+ return 0;
+
+ return 1;
+}
+
+static int chunk_soft_convert_filter(u64 chunk_profile,
+ struct btrfs_balance_args *bargs)
+{
+ if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
+ return 0;
+
+ chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+ if (chunk_profile == 0)
+ chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+ if (bargs->target & chunk_profile)
+ return 1;
+
+ return 0;
+}
+
+static int should_balance_chunk(struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk, u64 chunk_offset)
+{
+ struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+ struct btrfs_balance_args *bargs = NULL;
+ u64 chunk_type = btrfs_chunk_type(leaf, chunk);
+
+ /* type filter */
+ if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
+ (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
+ return 0;
+ }
+
+ if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
+ bargs = &bctl->data;
+ else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
+ bargs = &bctl->sys;
+ else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
+ bargs = &bctl->meta;
+
+ /* profiles filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
+ chunk_profiles_filter(chunk_type, bargs)) {
+ return 0;
+ }
+
+ /* usage filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
+ return 0;
+ }
+
+ /* devid filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
+ chunk_devid_filter(leaf, chunk, bargs)) {
+ return 0;
+ }
+
+ /* drange filter, makes sense only with devid filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
+ chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
+ return 0;
+ }
+
+ /* vrange filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
+ chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
+ return 0;
+ }
+
+ /* soft profile changing mode */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
+ chunk_soft_convert_filter(chunk_type, bargs)) {
+ return 0;
+ }
+
+ return 1;
+}
+
static u64 div_factor(u64 num, int factor)
{
if (factor == 10)
@@ -2086,29 +2436,28 @@ static u64 div_factor(u64 num, int factor)
return num;
}
-int btrfs_balance(struct btrfs_root *dev_root)
+static int __btrfs_balance(struct btrfs_fs_info *fs_info)
{
- int ret;
- struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
+ struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+ struct btrfs_root *chunk_root = fs_info->chunk_root;
+ struct btrfs_root *dev_root = fs_info->dev_root;
+ struct list_head *devices;
struct btrfs_device *device;
u64 old_size;
u64 size_to_free;
+ struct btrfs_chunk *chunk;
struct btrfs_path *path;
struct btrfs_key key;
- struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
- struct btrfs_trans_handle *trans;
struct btrfs_key found_key;
-
- if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
- return -EROFS;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- mutex_lock(&dev_root->fs_info->volume_mutex);
- dev_root = dev_root->fs_info->dev_root;
+ struct btrfs_trans_handle *trans;
+ struct extent_buffer *leaf;
+ int slot;
+ int ret;
+ int enospc_errors = 0;
+ bool counting = true;
/* step one make some room on all the devices */
+ devices = &fs_info->fs_devices->devices;
list_for_each_entry(device, devices, dev_list) {
old_size = device->total_bytes;
size_to_free = div_factor(old_size, 1);
@@ -2137,11 +2486,23 @@ int btrfs_balance(struct btrfs_root *dev_root)
ret = -ENOMEM;
goto error;
}
+
+ /* zero out stat counters */
+ spin_lock(&fs_info->balance_lock);
+ memset(&bctl->stat, 0, sizeof(bctl->stat));
+ spin_unlock(&fs_info->balance_lock);
+again:
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
key.offset = (u64)-1;
key.type = BTRFS_CHUNK_ITEM_KEY;
while (1) {
+ if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
+ atomic_read(&fs_info->balance_cancel_req)) {
+ ret = -ECANCELED;
+ goto error;
+ }
+
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
if (ret < 0)
goto error;
@@ -2151,15 +2512,19 @@ int btrfs_balance(struct btrfs_root *dev_root)
* failed
*/
if (ret == 0)
- break;
+ BUG(); /* FIXME break ? */
ret = btrfs_previous_item(chunk_root, path, 0,
BTRFS_CHUNK_ITEM_KEY);
- if (ret)
+ if (ret) {
+ ret = 0;
break;
+ }
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, slot);
- btrfs_item_key_to_cpu(path->nodes[0], &found_key,
- path->slots[0]);
if (found_key.objectid != key.objectid)
break;
@@ -2167,22 +2532,375 @@ int btrfs_balance(struct btrfs_root *dev_root)
if (found_key.offset == 0)
break;
+ chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+
+ if (!counting) {
+ spin_lock(&fs_info->balance_lock);
+ bctl->stat.considered++;
+ spin_unlock(&fs_info->balance_lock);
+ }
+
+ ret = should_balance_chunk(chunk_root, leaf, chunk,
+ found_key.offset);
btrfs_release_path(path);
+ if (!ret)
+ goto loop;
+
+ if (counting) {
+ spin_lock(&fs_info->balance_lock);
+ bctl->stat.expected++;
+ spin_unlock(&fs_info->balance_lock);
+ goto loop;
+ }
+
ret = btrfs_relocate_chunk(chunk_root,
chunk_root->root_key.objectid,
found_key.objectid,
found_key.offset);
if (ret && ret != -ENOSPC)
goto error;
+ if (ret == -ENOSPC) {
+ enospc_errors++;
+ } else {
+ spin_lock(&fs_info->balance_lock);
+ bctl->stat.completed++;
+ spin_unlock(&fs_info->balance_lock);
+ }
+loop:
key.offset = found_key.offset - 1;
}
- ret = 0;
+
+ if (counting) {
+ btrfs_release_path(path);
+ counting = false;
+ goto again;
+ }
error:
btrfs_free_path(path);
- mutex_unlock(&dev_root->fs_info->volume_mutex);
+ if (enospc_errors) {
+ printk(KERN_INFO "btrfs: %d enospc errors during balance\n",
+ enospc_errors);
+ if (!ret)
+ ret = -ENOSPC;
+ }
+
return ret;
}
+static inline int balance_need_close(struct btrfs_fs_info *fs_info)
+{
+ /* cancel requested || normal exit path */
+ return atomic_read(&fs_info->balance_cancel_req) ||
+ (atomic_read(&fs_info->balance_pause_req) == 0 &&
+ atomic_read(&fs_info->balance_cancel_req) == 0);
+}
+
+static void __cancel_balance(struct btrfs_fs_info *fs_info)
+{
+ int ret;
+
+ unset_balance_control(fs_info);
+ ret = del_balance_item(fs_info->tree_root);
+ BUG_ON(ret);
+}
+
+void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
+ struct btrfs_ioctl_balance_args *bargs);
+
+/*
+ * Should be called with both balance and volume mutexes held
+ */
+int btrfs_balance(struct btrfs_balance_control *bctl,
+ struct btrfs_ioctl_balance_args *bargs)
+{
+ struct btrfs_fs_info *fs_info = bctl->fs_info;
+ u64 allowed;
+ int ret;
+
+ if (btrfs_fs_closing(fs_info) ||
+ atomic_read(&fs_info->balance_pause_req) ||
+ atomic_read(&fs_info->balance_cancel_req)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * In case of mixed groups both data and meta should be picked,
+ * and identical options should be given for both of them.
+ */
+ allowed = btrfs_super_incompat_flags(fs_info->super_copy);
+ if ((allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
+ (bctl->flags & (BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA))) {
+ if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
+ !(bctl->flags & BTRFS_BALANCE_METADATA) ||
+ memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
+ printk(KERN_ERR "btrfs: with mixed groups data and "
+ "metadata balance options must be the same\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ /*
+ * Profile changing sanity checks. Skip them if a simple
+ * balance is requested.
+ */
+ if (!((bctl->data.flags | bctl->sys.flags | bctl->meta.flags) &
+ BTRFS_BALANCE_ARGS_CONVERT))
+ goto do_balance;
+
+ allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+ if (fs_info->fs_devices->num_devices == 1)
+ allowed |= BTRFS_BLOCK_GROUP_DUP;
+ else if (fs_info->fs_devices->num_devices < 4)
+ allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
+ else
+ allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10);
+
+ if (!profile_is_valid(bctl->data.target, 1) ||
+ bctl->data.target & ~allowed) {
+ printk(KERN_ERR "btrfs: unable to start balance with target "
+ "data profile %llu\n",
+ (unsigned long long)bctl->data.target);
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!profile_is_valid(bctl->meta.target, 1) ||
+ bctl->meta.target & ~allowed) {
+ printk(KERN_ERR "btrfs: unable to start balance with target "
+ "metadata profile %llu\n",
+ (unsigned long long)bctl->meta.target);
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!profile_is_valid(bctl->sys.target, 1) ||
+ bctl->sys.target & ~allowed) {
+ printk(KERN_ERR "btrfs: unable to start balance with target "
+ "system profile %llu\n",
+ (unsigned long long)bctl->sys.target);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (bctl->data.target & BTRFS_BLOCK_GROUP_DUP) {
+ printk(KERN_ERR "btrfs: dup for data is not allowed\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* allow to reduce meta or sys integrity only if force set */
+ allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10;
+ if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (fs_info->avail_system_alloc_bits & allowed) &&
+ !(bctl->sys.target & allowed)) ||
+ ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (fs_info->avail_metadata_alloc_bits & allowed) &&
+ !(bctl->meta.target & allowed))) {
+ if (bctl->flags & BTRFS_BALANCE_FORCE) {
+ printk(KERN_INFO "btrfs: force reducing metadata "
+ "integrity\n");
+ } else {
+ printk(KERN_ERR "btrfs: balance will reduce metadata "
+ "integrity, use force if you want this\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+do_balance:
+ ret = insert_balance_item(fs_info->tree_root, bctl);
+ if (ret && ret != -EEXIST)
+ goto out;
+
+ if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
+ BUG_ON(ret == -EEXIST);
+ set_balance_control(bctl);
+ } else {
+ BUG_ON(ret != -EEXIST);
+ spin_lock(&fs_info->balance_lock);
+ update_balance_args(bctl);
+ spin_unlock(&fs_info->balance_lock);
+ }
+
+ atomic_inc(&fs_info->balance_running);
+ mutex_unlock(&fs_info->balance_mutex);
+
+ ret = __btrfs_balance(fs_info);
+
+ mutex_lock(&fs_info->balance_mutex);
+ atomic_dec(&fs_info->balance_running);
+
+ if (bargs) {
+ memset(bargs, 0, sizeof(*bargs));
+ update_ioctl_balance_args(fs_info, 0, bargs);
+ }
+
+ if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
+ balance_need_close(fs_info)) {
+ __cancel_balance(fs_info);
+ }
+
+ wake_up(&fs_info->balance_wait_q);
+
+ return ret;
+out:
+ if (bctl->flags & BTRFS_BALANCE_RESUME)
+ __cancel_balance(fs_info);
+ else
+ kfree(bctl);
+ return ret;
+}
+
+static int balance_kthread(void *data)
+{
+ struct btrfs_balance_control *bctl =
+ (struct btrfs_balance_control *)data;
+ struct btrfs_fs_info *fs_info = bctl->fs_info;
+ int ret = 0;
+
+ mutex_lock(&fs_info->volume_mutex);
+ mutex_lock(&fs_info->balance_mutex);
+
+ set_balance_control(bctl);
+
+ if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
+ printk(KERN_INFO "btrfs: force skipping balance\n");
+ } else {
+ printk(KERN_INFO "btrfs: continuing balance\n");
+ ret = btrfs_balance(bctl, NULL);
+ }
+
+ mutex_unlock(&fs_info->balance_mutex);
+ mutex_unlock(&fs_info->volume_mutex);
+ return ret;
+}
+
+int btrfs_recover_balance(struct btrfs_root *tree_root)
+{
+ struct task_struct *tsk;
+ struct btrfs_balance_control *bctl;
+ struct btrfs_balance_item *item;
+ struct btrfs_disk_balance_args disk_bargs;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+ if (!bctl) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ key.objectid = BTRFS_BALANCE_OBJECTID;
+ key.type = BTRFS_BALANCE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out_bctl;
+ if (ret > 0) { /* ret = -ENOENT; */
+ ret = 0;
+ goto out_bctl;
+ }
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
+
+ bctl->fs_info = tree_root->fs_info;
+ bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME;
+
+ btrfs_balance_data(leaf, item, &disk_bargs);
+ btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
+ btrfs_balance_meta(leaf, item, &disk_bargs);
+ btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
+ btrfs_balance_sys(leaf, item, &disk_bargs);
+ btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
+
+ tsk = kthread_run(balance_kthread, bctl, "btrfs-balance");
+ if (IS_ERR(tsk))
+ ret = PTR_ERR(tsk);
+ else
+ goto out;
+
+out_bctl:
+ kfree(bctl);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
+{
+ int ret = 0;
+
+ mutex_lock(&fs_info->balance_mutex);
+ if (!fs_info->balance_ctl) {
+ mutex_unlock(&fs_info->balance_mutex);
+ return -ENOTCONN;
+ }
+
+ if (atomic_read(&fs_info->balance_running)) {
+ atomic_inc(&fs_info->balance_pause_req);
+ mutex_unlock(&fs_info->balance_mutex);
+
+ wait_event(fs_info->balance_wait_q,
+ atomic_read(&fs_info->balance_running) == 0);
+
+ mutex_lock(&fs_info->balance_mutex);
+ /* we are good with balance_ctl ripped off from under us */
+ BUG_ON(atomic_read(&fs_info->balance_running));
+ atomic_dec(&fs_info->balance_pause_req);
+ } else {
+ ret = -ENOTCONN;
+ }
+
+ mutex_unlock(&fs_info->balance_mutex);
+ return ret;
+}
+
+int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
+{
+ mutex_lock(&fs_info->balance_mutex);
+ if (!fs_info->balance_ctl) {
+ mutex_unlock(&fs_info->balance_mutex);
+ return -ENOTCONN;
+ }
+
+ atomic_inc(&fs_info->balance_cancel_req);
+ /*
+ * if we are running just wait and return, balance item is
+ * deleted in btrfs_balance in this case
+ */
+ if (atomic_read(&fs_info->balance_running)) {
+ mutex_unlock(&fs_info->balance_mutex);
+ wait_event(fs_info->balance_wait_q,
+ atomic_read(&fs_info->balance_running) == 0);
+ mutex_lock(&fs_info->balance_mutex);
+ } else {
+ /* __cancel_balance needs volume_mutex */
+ mutex_unlock(&fs_info->balance_mutex);
+ mutex_lock(&fs_info->volume_mutex);
+ mutex_lock(&fs_info->balance_mutex);
+
+ if (fs_info->balance_ctl)
+ __cancel_balance(fs_info);
+
+ mutex_unlock(&fs_info->volume_mutex);
+ }
+
+ BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
+ atomic_dec(&fs_info->balance_cancel_req);
+ mutex_unlock(&fs_info->balance_mutex);
+ return 0;
+}
+
/*
* shrinking a device means finding all of the device extents past
* the new size, and then following the back refs to the chunks.
@@ -2323,8 +3041,7 @@ done:
return ret;
}
-static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
+static int btrfs_add_system_chunk(struct btrfs_root *root,
struct btrfs_key *key,
struct btrfs_chunk *chunk, int item_size)
{
@@ -2441,10 +3158,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
max_stripe_size = 1024 * 1024 * 1024;
max_chunk_size = 10 * max_stripe_size;
} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
- max_stripe_size = 256 * 1024 * 1024;
+ /* for larger filesystems, use larger metadata chunks */
+ if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
+ max_stripe_size = 1024 * 1024 * 1024;
+ else
+ max_stripe_size = 256 * 1024 * 1024;
max_chunk_size = max_stripe_size;
} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
- max_stripe_size = 8 * 1024 * 1024;
+ max_stripe_size = 32 * 1024 * 1024;
max_chunk_size = 2 * max_stripe_size;
} else {
printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
@@ -2496,7 +3217,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
if (total_avail == 0)
continue;
- ret = find_free_dev_extent(trans, device,
+ ret = find_free_dev_extent(device,
max_stripe_size * dev_stripes,
&dev_offset, &max_avail);
if (ret && ret != -ENOSPC)
@@ -2687,7 +3408,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
BUG_ON(ret);
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
- ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
+ ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
item_size);
BUG_ON(ret);
}
@@ -2752,8 +3473,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
return ret;
alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
- (fs_info->metadata_alloc_profile &
- fs_info->avail_metadata_alloc_bits);
+ fs_info->avail_metadata_alloc_bits;
alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
@@ -2763,8 +3483,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
sys_chunk_offset = chunk_offset + chunk_size;
alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
- (fs_info->system_alloc_profile &
- fs_info->avail_system_alloc_bits);
+ fs_info->avail_system_alloc_bits;
alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
@@ -2901,26 +3620,13 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
u64 stripe_nr;
u64 stripe_nr_orig;
u64 stripe_nr_end;
- int stripes_allocated = 8;
- int stripes_required = 1;
int stripe_index;
int i;
+ int ret = 0;
int num_stripes;
int max_errors = 0;
struct btrfs_bio *bbio = NULL;
- if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
- stripes_allocated = 1;
-again:
- if (bbio_ret) {
- bbio = kzalloc(btrfs_bio_size(stripes_allocated),
- GFP_NOFS);
- if (!bbio)
- return -ENOMEM;
-
- atomic_set(&bbio->error, 0);
- }
-
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, logical, *length);
read_unlock(&em_tree->lock);
@@ -2939,32 +3645,6 @@ again:
if (mirror_num > map->num_stripes)
mirror_num = 0;
- /* if our btrfs_bio struct is too small, back off and try again */
- if (rw & REQ_WRITE) {
- if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_DUP)) {
- stripes_required = map->num_stripes;
- max_errors = 1;
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
- stripes_required = map->sub_stripes;
- max_errors = 1;
- }
- }
- if (rw & REQ_DISCARD) {
- if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_DUP |
- BTRFS_BLOCK_GROUP_RAID10)) {
- stripes_required = map->num_stripes;
- }
- }
- if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
- stripes_allocated < stripes_required) {
- stripes_allocated = map->num_stripes;
- free_extent_map(em);
- kfree(bbio);
- goto again;
- }
stripe_nr = offset;
/*
* stripe_nr counts the total number of stripes we have to stride
@@ -2980,10 +3660,7 @@ again:
if (rw & REQ_DISCARD)
*length = min_t(u64, em->len - offset, *length);
- else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10 |
- BTRFS_BLOCK_GROUP_DUP)) {
+ else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
/* we limit the length of each bio to what fits in a stripe */
*length = min_t(u64, em->len - offset,
map->stripe_len - stripe_offset);
@@ -3059,81 +3736,55 @@ again:
}
BUG_ON(stripe_index >= map->num_stripes);
+ bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS);
+ if (!bbio) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ atomic_set(&bbio->error, 0);
+
if (rw & REQ_DISCARD) {
+ int factor = 0;
+ int sub_stripes = 0;
+ u64 stripes_per_dev = 0;
+ u32 remaining_stripes = 0;
+
+ if (map->type &
+ (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+ sub_stripes = 1;
+ else
+ sub_stripes = map->sub_stripes;
+
+ factor = map->num_stripes / sub_stripes;
+ stripes_per_dev = div_u64_rem(stripe_nr_end -
+ stripe_nr_orig,
+ factor,
+ &remaining_stripes);
+ }
+
for (i = 0; i < num_stripes; i++) {
bbio->stripes[i].physical =
map->stripes[stripe_index].physical +
stripe_offset + stripe_nr * map->stripe_len;
bbio->stripes[i].dev = map->stripes[stripe_index].dev;
- if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
- u64 stripes;
- u32 last_stripe = 0;
- int j;
-
- div_u64_rem(stripe_nr_end - 1,
- map->num_stripes,
- &last_stripe);
-
- for (j = 0; j < map->num_stripes; j++) {
- u32 test;
-
- div_u64_rem(stripe_nr_end - 1 - j,
- map->num_stripes, &test);
- if (test == stripe_index)
- break;
- }
- stripes = stripe_nr_end - 1 - j;
- do_div(stripes, map->num_stripes);
- bbio->stripes[i].length = map->stripe_len *
- (stripes - stripe_nr + 1);
-
- if (i == 0) {
- bbio->stripes[i].length -=
- stripe_offset;
- stripe_offset = 0;
- }
- if (stripe_index == last_stripe)
- bbio->stripes[i].length -=
- stripe_end_offset;
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
- u64 stripes;
- int j;
- int factor = map->num_stripes /
- map->sub_stripes;
- u32 last_stripe = 0;
-
- div_u64_rem(stripe_nr_end - 1,
- factor, &last_stripe);
- last_stripe *= map->sub_stripes;
-
- for (j = 0; j < factor; j++) {
- u32 test;
-
- div_u64_rem(stripe_nr_end - 1 - j,
- factor, &test);
-
- if (test ==
- stripe_index / map->sub_stripes)
- break;
- }
- stripes = stripe_nr_end - 1 - j;
- do_div(stripes, factor);
- bbio->stripes[i].length = map->stripe_len *
- (stripes - stripe_nr + 1);
-
- if (i < map->sub_stripes) {
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10)) {
+ bbio->stripes[i].length = stripes_per_dev *
+ map->stripe_len;
+ if (i / sub_stripes < remaining_stripes)
+ bbio->stripes[i].length +=
+ map->stripe_len;
+ if (i < sub_stripes)
bbio->stripes[i].length -=
stripe_offset;
- if (i == map->sub_stripes - 1)
- stripe_offset = 0;
- }
- if (stripe_index >= last_stripe &&
- stripe_index <= (last_stripe +
- map->sub_stripes - 1)) {
+ if ((i / sub_stripes + 1) %
+ sub_stripes == remaining_stripes)
bbio->stripes[i].length -=
stripe_end_offset;
- }
+ if (i == sub_stripes - 1)
+ stripe_offset = 0;
} else
bbio->stripes[i].length = *length;
@@ -3155,15 +3806,22 @@ again:
stripe_index++;
}
}
- if (bbio_ret) {
- *bbio_ret = bbio;
- bbio->num_stripes = num_stripes;
- bbio->max_errors = max_errors;
- bbio->mirror_num = mirror_num;
+
+ if (rw & REQ_WRITE) {
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_DUP)) {
+ max_errors = 1;
+ }
}
+
+ *bbio_ret = bbio;
+ bbio->num_stripes = num_stripes;
+ bbio->max_errors = max_errors;
+ bbio->mirror_num = mirror_num;
out:
free_extent_map(em);
- return 0;
+ return ret;
}
int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
@@ -3304,7 +3962,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
/* don't bother with additional async steps for reads, right now */
if (!(rw & REQ_WRITE)) {
bio_get(bio);
- submit_bio(rw, bio);
+ btrfsic_submit_bio(rw, bio);
bio_put(bio);
return 0;
}
@@ -3399,7 +4057,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
if (async_submit)
schedule_bio(root, dev, rw, bio);
else
- submit_bio(rw, bio);
+ btrfsic_submit_bio(rw, bio);
} else {
bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
bio->bi_sector = logical >> 9;
@@ -3568,7 +4226,7 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
struct btrfs_fs_devices *fs_devices;
int ret;
- mutex_lock(&uuid_mutex);
+ BUG_ON(!mutex_is_locked(&uuid_mutex));
fs_devices = root->fs_info->fs_devices->seed;
while (fs_devices) {
@@ -3606,7 +4264,6 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
fs_devices->seed = root->fs_info->fs_devices->seed;
root->fs_info->fs_devices->seed = fs_devices;
out:
- mutex_unlock(&uuid_mutex);
return ret;
}
@@ -3749,6 +4406,9 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
if (!path)
return -ENOMEM;
+ mutex_lock(&uuid_mutex);
+ lock_chunks(root);
+
/* first we search for all of the device items, and then we
* read in all of the chunk items. This way we can create chunk
* mappings that reference all of the devices that are afound
@@ -3799,6 +4459,9 @@ again:
}
ret = 0;
error:
+ unlock_chunks(root);
+ mutex_unlock(&uuid_mutex);
+
btrfs_free_path(path);
return ret;
}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 78f2d4d4f37..19ac95048b8 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -186,6 +186,51 @@ struct map_lookup {
#define map_lookup_size(n) (sizeof(struct map_lookup) + \
(sizeof(struct btrfs_bio_stripe) * (n)))
+/*
+ * Restriper's general type filter
+ */
+#define BTRFS_BALANCE_DATA (1ULL << 0)
+#define BTRFS_BALANCE_SYSTEM (1ULL << 1)
+#define BTRFS_BALANCE_METADATA (1ULL << 2)
+
+#define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \
+ BTRFS_BALANCE_SYSTEM | \
+ BTRFS_BALANCE_METADATA)
+
+#define BTRFS_BALANCE_FORCE (1ULL << 3)
+#define BTRFS_BALANCE_RESUME (1ULL << 4)
+
+/*
+ * Balance filters
+ */
+#define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0)
+#define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1)
+#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2)
+#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3)
+#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4)
+
+/*
+ * Profile changing flags. When SOFT is set we won't relocate chunk if
+ * it already has the target profile (even though it may be
+ * half-filled).
+ */
+#define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8)
+#define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9)
+
+struct btrfs_balance_args;
+struct btrfs_balance_progress;
+struct btrfs_balance_control {
+ struct btrfs_fs_info *fs_info;
+
+ struct btrfs_balance_args data;
+ struct btrfs_balance_args meta;
+ struct btrfs_balance_args sys;
+
+ u64 flags;
+
+ struct btrfs_balance_progress stat;
+};
+
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
u64 end, u64 *length);
@@ -228,9 +273,12 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
u8 *uuid, u8 *fsid);
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
int btrfs_init_new_device(struct btrfs_root *root, char *path);
-int btrfs_balance(struct btrfs_root *dev_root);
+int btrfs_balance(struct btrfs_balance_control *bctl,
+ struct btrfs_ioctl_balance_args *bargs);
+int btrfs_recover_balance(struct btrfs_root *tree_root);
+int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
+int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
-int find_free_dev_extent(struct btrfs_trans_handle *trans,
- struct btrfs_device *device, u64 num_bytes,
+int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *max_avail);
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 3848b04e310..e7a5659087e 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -200,7 +200,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
out:
- btrfs_end_transaction_throttle(trans, root);
+ btrfs_end_transaction(trans, root);
return ret;
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index b60fc8bfb3e..620daad201d 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -641,10 +641,10 @@ static int __cap_is_valid(struct ceph_cap *cap)
unsigned long ttl;
u32 gen;
- spin_lock(&cap->session->s_cap_lock);
+ spin_lock(&cap->session->s_gen_ttl_lock);
gen = cap->session->s_cap_gen;
ttl = cap->session->s_cap_ttl;
- spin_unlock(&cap->session->s_cap_lock);
+ spin_unlock(&cap->session->s_gen_ttl_lock);
if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
dout("__cap_is_valid %p cap %p issued %s "
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 618246bc219..3e8094be460 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -975,10 +975,10 @@ static int dentry_lease_is_valid(struct dentry *dentry)
di = ceph_dentry(dentry);
if (di->lease_session) {
s = di->lease_session;
- spin_lock(&s->s_cap_lock);
+ spin_lock(&s->s_gen_ttl_lock);
gen = s->s_cap_gen;
ttl = s->s_cap_ttl;
- spin_unlock(&s->s_cap_lock);
+ spin_unlock(&s->s_gen_ttl_lock);
if (di->lease_gen == gen &&
time_before(jiffies, dentry->d_time) &&
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 23ab6a3f182..866e8d7ca37 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -262,6 +262,7 @@ static int parse_reply_info(struct ceph_msg *msg,
/* trace */
ceph_decode_32_safe(&p, end, len, bad);
if (len > 0) {
+ ceph_decode_need(&p, end, len, bad);
err = parse_reply_info_trace(&p, p+len, info, features);
if (err < 0)
goto out_bad;
@@ -270,6 +271,7 @@ static int parse_reply_info(struct ceph_msg *msg,
/* extra */
ceph_decode_32_safe(&p, end, len, bad);
if (len > 0) {
+ ceph_decode_need(&p, end, len, bad);
err = parse_reply_info_extra(&p, p+len, info, features);
if (err < 0)
goto out_bad;
@@ -398,9 +400,11 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
s->s_con.peer_name.num = cpu_to_le64(mds);
- spin_lock_init(&s->s_cap_lock);
+ spin_lock_init(&s->s_gen_ttl_lock);
s->s_cap_gen = 0;
s->s_cap_ttl = 0;
+
+ spin_lock_init(&s->s_cap_lock);
s->s_renew_requested = 0;
s->s_renew_seq = 0;
INIT_LIST_HEAD(&s->s_caps);
@@ -2326,10 +2330,10 @@ static void handle_session(struct ceph_mds_session *session,
case CEPH_SESSION_STALE:
pr_info("mds%d caps went stale, renewing\n",
session->s_mds);
- spin_lock(&session->s_cap_lock);
+ spin_lock(&session->s_gen_ttl_lock);
session->s_cap_gen++;
session->s_cap_ttl = 0;
- spin_unlock(&session->s_cap_lock);
+ spin_unlock(&session->s_gen_ttl_lock);
send_renew_caps(mdsc, session);
break;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index a50ca0e3947..8c7c04ebb59 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -117,10 +117,13 @@ struct ceph_mds_session {
void *s_authorizer_buf, *s_authorizer_reply_buf;
size_t s_authorizer_buf_len, s_authorizer_reply_buf_len;
- /* protected by s_cap_lock */
- spinlock_t s_cap_lock;
+ /* protected by s_gen_ttl_lock */
+ spinlock_t s_gen_ttl_lock;
u32 s_cap_gen; /* inc each time we get mds stale msg */
unsigned long s_cap_ttl; /* when session caps expire */
+
+ /* protected by s_cap_lock */
+ spinlock_t s_cap_lock;
struct list_head s_caps; /* all caps issued by this session */
int s_nr_caps, s_trim_caps;
int s_num_cap_releases;
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 857214ae8c0..a76f697303d 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -111,8 +111,10 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
}
static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
+ { true, "ceph.file.layout", ceph_vxattrcb_layout},
+ /* The following extended attribute name is deprecated */
{ true, "ceph.layout", ceph_vxattrcb_layout},
- { NULL, NULL }
+ { true, NULL, NULL }
};
static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index f66cc162515..2b243af70aa 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -139,8 +139,7 @@ config CIFS_DFS_UPCALL
points. If unsure, say N.
config CIFS_FSCACHE
- bool "Provide CIFS client caching support (EXPERIMENTAL)"
- depends on EXPERIMENTAL
+ bool "Provide CIFS client caching support"
depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
help
Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data
@@ -148,8 +147,8 @@ config CIFS_FSCACHE
manager. If unsure, say N.
config CIFS_ACL
- bool "Provide CIFS ACL support (EXPERIMENTAL)"
- depends on EXPERIMENTAL && CIFS_XATTR && KEYS
+ bool "Provide CIFS ACL support"
+ depends on CIFS_XATTR && KEYS
help
Allows to fetch CIFS/NTFS ACL from the server. The DACL blob
is handed over to the application/caller.
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 84e8c072470..24b3dfc0528 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -676,14 +676,23 @@ static ssize_t cifs_multiuser_mount_proc_write(struct file *file,
{
char c;
int rc;
+ static bool warned;
rc = get_user(c, buffer);
if (rc)
return rc;
if (c == '0' || c == 'n' || c == 'N')
multiuser_mount = 0;
- else if (c == '1' || c == 'y' || c == 'Y')
+ else if (c == '1' || c == 'y' || c == 'Y') {
multiuser_mount = 1;
+ if (!warned) {
+ warned = true;
+ printk(KERN_WARNING "CIFS VFS: The legacy multiuser "
+ "mount code is scheduled to be deprecated in "
+ "3.5. Please switch to using the multiuser "
+ "mount option.");
+ }
+ }
return count;
}
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 2272fd5fe5b..e622863b292 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -113,9 +113,11 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo)
MAX_MECH_STR_LEN +
UID_KEY_LEN + (sizeof(uid_t) * 2) +
CREDUID_KEY_LEN + (sizeof(uid_t) * 2) +
- USER_KEY_LEN + strlen(sesInfo->user_name) +
PID_KEY_LEN + (sizeof(pid_t) * 2) + 1;
+ if (sesInfo->user_name)
+ desc_len += USER_KEY_LEN + strlen(sesInfo->user_name);
+
spnego_key = ERR_PTR(-ENOMEM);
description = kzalloc(desc_len, GFP_KERNEL);
if (description == NULL)
@@ -152,8 +154,10 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo)
dp = description + strlen(description);
sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid);
- dp = description + strlen(description);
- sprintf(dp, ";user=%s", sesInfo->user_name);
+ if (sesInfo->user_name) {
+ dp = description + strlen(description);
+ sprintf(dp, ";user=%s", sesInfo->user_name);
+ }
dp = description + strlen(description);
sprintf(dp, ";pid=0x%x", current->pid);
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 1b2e180b018..fbb9da95184 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -27,17 +27,17 @@
#include "cifs_debug.h"
/*
- * cifs_ucs2_bytes - how long will a string be after conversion?
- * @ucs - pointer to input string
+ * cifs_utf16_bytes - how long will a string be after conversion?
+ * @utf16 - pointer to input string
* @maxbytes - don't go past this many bytes of input string
* @codepage - destination codepage
*
- * Walk a ucs2le string and return the number of bytes that the string will
+ * Walk a utf16le string and return the number of bytes that the string will
* be after being converted to the given charset, not including any null
* termination required. Don't walk past maxbytes in the source buffer.
*/
int
-cifs_ucs2_bytes(const __le16 *from, int maxbytes,
+cifs_utf16_bytes(const __le16 *from, int maxbytes,
const struct nls_table *codepage)
{
int i;
@@ -122,7 +122,7 @@ cp_convert:
}
/*
- * cifs_from_ucs2 - convert utf16le string to local charset
+ * cifs_from_utf16 - convert utf16le string to local charset
* @to - destination buffer
* @from - source buffer
* @tolen - destination buffer size (in bytes)
@@ -130,7 +130,7 @@ cp_convert:
* @codepage - codepage to which characters should be converted
* @mapchar - should characters be remapped according to the mapchars option?
*
- * Convert a little-endian ucs2le string (as sent by the server) to a string
+ * Convert a little-endian utf16le string (as sent by the server) to a string
* in the provided codepage. The tolen and fromlen parameters are to ensure
* that the code doesn't walk off of the end of the buffer (which is always
* a danger if the alignment of the source buffer is off). The destination
@@ -139,12 +139,12 @@ cp_convert:
* null terminator).
*
* Note that some windows versions actually send multiword UTF-16 characters
- * instead of straight UCS-2. The linux nls routines however aren't able to
+ * instead of straight UTF16-2. The linux nls routines however aren't able to
* deal with those characters properly. In the event that we get some of
* those characters, they won't be translated properly.
*/
int
-cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
+cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
const struct nls_table *codepage, bool mapchar)
{
int i, charlen, safelen;
@@ -190,13 +190,13 @@ cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
}
/*
- * NAME: cifs_strtoUCS()
+ * NAME: cifs_strtoUTF16()
*
* FUNCTION: Convert character string to unicode string
*
*/
int
-cifs_strtoUCS(__le16 *to, const char *from, int len,
+cifs_strtoUTF16(__le16 *to, const char *from, int len,
const struct nls_table *codepage)
{
int charlen;
@@ -206,7 +206,7 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
charlen = codepage->char2uni(from, len, &wchar_to);
if (charlen < 1) {
- cERROR(1, "strtoUCS: char2uni of 0x%x returned %d",
+ cERROR(1, "strtoUTF16: char2uni of 0x%x returned %d",
*from, charlen);
/* A question mark */
wchar_to = 0x003f;
@@ -220,7 +220,8 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
}
/*
- * cifs_strndup_from_ucs - copy a string from wire format to the local codepage
+ * cifs_strndup_from_utf16 - copy a string from wire format to the local
+ * codepage
* @src - source string
* @maxlen - don't walk past this many bytes in the source string
* @is_unicode - is this a unicode string?
@@ -231,19 +232,19 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
* error.
*/
char *
-cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode,
- const struct nls_table *codepage)
+cifs_strndup_from_utf16(const char *src, const int maxlen,
+ const bool is_unicode, const struct nls_table *codepage)
{
int len;
char *dst;
if (is_unicode) {
- len = cifs_ucs2_bytes((__le16 *) src, maxlen, codepage);
+ len = cifs_utf16_bytes((__le16 *) src, maxlen, codepage);
len += nls_nullsize(codepage);
dst = kmalloc(len, GFP_KERNEL);
if (!dst)
return NULL;
- cifs_from_ucs2(dst, (__le16 *) src, len, maxlen, codepage,
+ cifs_from_utf16(dst, (__le16 *) src, len, maxlen, codepage,
false);
} else {
len = strnlen(src, maxlen);
@@ -264,7 +265,7 @@ cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode,
* names are little endian 16 bit Unicode on the wire
*/
int
-cifsConvertToUCS(__le16 *target, const char *source, int srclen,
+cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
const struct nls_table *cp, int mapChars)
{
int i, j, charlen;
@@ -273,7 +274,7 @@ cifsConvertToUCS(__le16 *target, const char *source, int srclen,
wchar_t tmp;
if (!mapChars)
- return cifs_strtoUCS(target, source, PATH_MAX, cp);
+ return cifs_strtoUTF16(target, source, PATH_MAX, cp);
for (i = 0, j = 0; i < srclen; j++) {
src_char = source[i];
@@ -281,7 +282,7 @@ cifsConvertToUCS(__le16 *target, const char *source, int srclen,
switch (src_char) {
case 0:
put_unaligned(0, &target[j]);
- goto ctoUCS_out;
+ goto ctoUTF16_out;
case ':':
dst_char = cpu_to_le16(UNI_COLON);
break;
@@ -326,7 +327,7 @@ cifsConvertToUCS(__le16 *target, const char *source, int srclen,
put_unaligned(dst_char, &target[j]);
}
-ctoUCS_out:
+ctoUTF16_out:
return i;
}
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 6d02fd56056..a513a546700 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -74,16 +74,16 @@ extern const struct UniCaseRange CifsUniLowerRange[];
#endif /* UNIUPR_NOLOWER */
#ifdef __KERNEL__
-int cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
- const struct nls_table *codepage, bool mapchar);
-int cifs_ucs2_bytes(const __le16 *from, int maxbytes,
- const struct nls_table *codepage);
-int cifs_strtoUCS(__le16 *, const char *, int, const struct nls_table *);
-char *cifs_strndup_from_ucs(const char *src, const int maxlen,
- const bool is_unicode,
- const struct nls_table *codepage);
-extern int cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
- const struct nls_table *cp, int mapChars);
+int cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
+ const struct nls_table *codepage, bool mapchar);
+int cifs_utf16_bytes(const __le16 *from, int maxbytes,
+ const struct nls_table *codepage);
+int cifs_strtoUTF16(__le16 *, const char *, int, const struct nls_table *);
+char *cifs_strndup_from_utf16(const char *src, const int maxlen,
+ const bool is_unicode,
+ const struct nls_table *codepage);
+extern int cifsConvertToUTF16(__le16 *target, const char *source, int maxlen,
+ const struct nls_table *cp, int mapChars);
#endif
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 72ddf23ef6f..c1b25448738 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -909,6 +909,8 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
umode_t group_mask = S_IRWXG;
umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO;
+ if (num_aces > ULONG_MAX / sizeof(struct cifs_ace *))
+ return;
ppace = kmalloc(num_aces * sizeof(struct cifs_ace *),
GFP_KERNEL);
if (!ppace) {
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 5d9b9acc5fc..63c460e503b 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -327,7 +327,7 @@ build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp)
attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME);
attrptr->length = cpu_to_le16(2 * dlen);
blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
- cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
+ cifs_strtoUTF16((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
return 0;
}
@@ -376,7 +376,7 @@ find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp)
kmalloc(attrsize + 1, GFP_KERNEL);
if (!ses->domainName)
return -ENOMEM;
- cifs_from_ucs2(ses->domainName,
+ cifs_from_utf16(ses->domainName,
(__le16 *)blobptr, attrsize, attrsize,
nls_cp, false);
break;
@@ -420,15 +420,20 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
}
/* convert ses->user_name to unicode and uppercase */
- len = strlen(ses->user_name);
+ len = ses->user_name ? strlen(ses->user_name) : 0;
user = kmalloc(2 + (len * 2), GFP_KERNEL);
if (user == NULL) {
cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n");
rc = -ENOMEM;
return rc;
}
- len = cifs_strtoUCS((__le16 *)user, ses->user_name, len, nls_cp);
- UniStrupr(user);
+
+ if (len) {
+ len = cifs_strtoUTF16((__le16 *)user, ses->user_name, len, nls_cp);
+ UniStrupr(user);
+ } else {
+ memset(user, '\0', 2);
+ }
rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
(char *)user, 2 * len);
@@ -448,8 +453,8 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
rc = -ENOMEM;
return rc;
}
- len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len,
- nls_cp);
+ len = cifs_strtoUTF16((__le16 *)domain, ses->domainName, len,
+ nls_cp);
rc =
crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
(char *)domain, 2 * len);
@@ -468,7 +473,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
rc = -ENOMEM;
return rc;
}
- len = cifs_strtoUCS((__le16 *)server, ses->serverName, len,
+ len = cifs_strtoUTF16((__le16 *)server, ses->serverName, len,
nls_cp);
rc =
crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index ba53c1c6c6c..76e7d8b6da1 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -879,6 +879,8 @@ require use of the stronger protocol */
#define CIFSSEC_MASK 0xB70B7 /* current flags supported if weak */
#endif /* UPCALL */
#else /* do not allow weak pw hash */
+#define CIFSSEC_MUST_LANMAN 0
+#define CIFSSEC_MUST_PLNTXT 0
#ifdef CONFIG_CIFS_UPCALL
#define CIFSSEC_MASK 0x8F08F /* flags supported if no weak allowed */
#else
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 6600aa2d2ef..8b7794c3159 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -821,8 +821,8 @@ PsxDelete:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB add path length overrun check */
@@ -893,8 +893,8 @@ DelFileRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->fileName, fileName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->fileName, fileName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve check for buffer overruns BB */
@@ -938,8 +938,8 @@ RmDirRetry:
return rc;
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
- name_len = cifsConvertToUCS((__le16 *) pSMB->DirName, dirName,
- PATH_MAX, nls_codepage, remap);
+ name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, dirName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve check for buffer overruns BB */
@@ -981,8 +981,8 @@ MkDirRetry:
return rc;
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
- name_len = cifsConvertToUCS((__le16 *) pSMB->DirName, name,
- PATH_MAX, nls_codepage, remap);
+ name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, name,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve check for buffer overruns BB */
@@ -1030,8 +1030,8 @@ PsxCreat:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, name,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, name,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -1197,8 +1197,8 @@ OldOpenRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
count = 1; /* account for one byte pad to word boundary */
name_len =
- cifsConvertToUCS((__le16 *) (pSMB->fileName + 1),
- fileName, PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) (pSMB->fileName + 1),
+ fileName, PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve check for buffer overruns BB */
@@ -1304,8 +1304,8 @@ openRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
count = 1; /* account for one byte pad to word boundary */
name_len =
- cifsConvertToUCS((__le16 *) (pSMB->fileName + 1),
- fileName, PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) (pSMB->fileName + 1),
+ fileName, PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
pSMB->NameLength = cpu_to_le16(name_len);
@@ -2649,16 +2649,16 @@ renameRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->OldFileName, fromName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->OldFileName, fromName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
pSMB->OldFileName[name_len] = 0x04; /* pad */
/* protocol requires ASCII signature byte on Unicode string */
pSMB->OldFileName[name_len + 1] = 0x00;
name_len2 =
- cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2],
- toName, PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2],
+ toName, PATH_MAX, nls_codepage, remap);
name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ;
name_len2 *= 2; /* convert to bytes */
} else { /* BB improve the check for buffer overruns BB */
@@ -2738,10 +2738,12 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifs_tcon *pTcon,
/* unicode only call */
if (target_name == NULL) {
sprintf(dummy_string, "cifs%x", pSMB->hdr.Mid);
- len_of_str = cifsConvertToUCS((__le16 *)rename_info->target_name,
+ len_of_str =
+ cifsConvertToUTF16((__le16 *)rename_info->target_name,
dummy_string, 24, nls_codepage, remap);
} else {
- len_of_str = cifsConvertToUCS((__le16 *)rename_info->target_name,
+ len_of_str =
+ cifsConvertToUTF16((__le16 *)rename_info->target_name,
target_name, PATH_MAX, nls_codepage,
remap);
}
@@ -2795,17 +2797,17 @@ copyRetry:
pSMB->Flags = cpu_to_le16(flags & COPY_TREE);
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
- name_len = cifsConvertToUCS((__le16 *) pSMB->OldFileName,
- fromName, PATH_MAX, nls_codepage,
- remap);
+ name_len = cifsConvertToUTF16((__le16 *) pSMB->OldFileName,
+ fromName, PATH_MAX, nls_codepage,
+ remap);
name_len++; /* trailing null */
name_len *= 2;
pSMB->OldFileName[name_len] = 0x04; /* pad */
/* protocol requires ASCII signature byte on Unicode string */
pSMB->OldFileName[name_len + 1] = 0x00;
name_len2 =
- cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2],
- toName, PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2],
+ toName, PATH_MAX, nls_codepage, remap);
name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ;
name_len2 *= 2; /* convert to bytes */
} else { /* BB improve the check for buffer overruns BB */
@@ -2861,9 +2863,9 @@ createSymLinkRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifs_strtoUCS((__le16 *) pSMB->FileName, fromName, PATH_MAX
- /* find define for this maxpathcomponent */
- , nls_codepage);
+ cifs_strtoUTF16((__le16 *) pSMB->FileName, fromName,
+ /* find define for this maxpathcomponent */
+ PATH_MAX, nls_codepage);
name_len++; /* trailing null */
name_len *= 2;
@@ -2885,9 +2887,9 @@ createSymLinkRetry:
data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len_target =
- cifs_strtoUCS((__le16 *) data_offset, toName, PATH_MAX
- /* find define for this maxpathcomponent */
- , nls_codepage);
+ cifs_strtoUTF16((__le16 *) data_offset, toName, PATH_MAX
+ /* find define for this maxpathcomponent */
+ , nls_codepage);
name_len_target++; /* trailing null */
name_len_target *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -2949,8 +2951,8 @@ createHardLinkRetry:
return rc;
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
- name_len = cifsConvertToUCS((__le16 *) pSMB->FileName, toName,
- PATH_MAX, nls_codepage, remap);
+ name_len = cifsConvertToUTF16((__le16 *) pSMB->FileName, toName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
@@ -2972,8 +2974,8 @@ createHardLinkRetry:
data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len_target =
- cifsConvertToUCS((__le16 *) data_offset, fromName, PATH_MAX,
- nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) data_offset, fromName,
+ PATH_MAX, nls_codepage, remap);
name_len_target++; /* trailing null */
name_len_target *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -3042,8 +3044,8 @@ winCreateHardLinkRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->OldFileName, fromName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->OldFileName, fromName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
@@ -3051,8 +3053,8 @@ winCreateHardLinkRetry:
pSMB->OldFileName[name_len] = 0x04;
pSMB->OldFileName[name_len + 1] = 0x00; /* pad */
name_len2 =
- cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2],
- toName, PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2],
+ toName, PATH_MAX, nls_codepage, remap);
name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ;
name_len2 *= 2; /* convert to bytes */
} else { /* BB improve the check for buffer overruns BB */
@@ -3108,8 +3110,8 @@ querySymLinkRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifs_strtoUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage);
+ cifs_strtoUTF16((__le16 *) pSMB->FileName, searchName,
+ PATH_MAX, nls_codepage);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -3166,8 +3168,8 @@ querySymLinkRetry:
is_unicode = false;
/* BB FIXME investigate remapping reserved chars here */
- *symlinkinfo = cifs_strndup_from_ucs(data_start, count,
- is_unicode, nls_codepage);
+ *symlinkinfo = cifs_strndup_from_utf16(data_start,
+ count, is_unicode, nls_codepage);
if (!*symlinkinfo)
rc = -ENOMEM;
}
@@ -3450,8 +3452,9 @@ queryAclRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName,
+ searchName, PATH_MAX, nls_codepage,
+ remap);
name_len++; /* trailing null */
name_len *= 2;
pSMB->FileName[name_len] = 0;
@@ -3537,8 +3540,8 @@ setAclRetry:
return rc;
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -3948,8 +3951,9 @@ QInfRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName,
+ searchName, PATH_MAX, nls_codepage,
+ remap);
name_len++; /* trailing null */
name_len *= 2;
} else {
@@ -4086,8 +4090,8 @@ QPathInfoRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -4255,8 +4259,8 @@ UnixQPathInfoRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -4344,8 +4348,8 @@ findFirstRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName,
+ PATH_MAX, nls_codepage, remap);
/* We can not add the asterik earlier in case
it got remapped to 0xF03A as if it were part of the
directory name instead of a wildcard */
@@ -4656,8 +4660,9 @@ GetInodeNumberRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName,
+ searchName, PATH_MAX, nls_codepage,
+ remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -4794,9 +4799,9 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
rc = -ENOMEM;
goto parse_DFS_referrals_exit;
}
- cifsConvertToUCS((__le16 *) tmp, searchName,
- PATH_MAX, nls_codepage, remap);
- node->path_consumed = cifs_ucs2_bytes(tmp,
+ cifsConvertToUTF16((__le16 *) tmp, searchName,
+ PATH_MAX, nls_codepage, remap);
+ node->path_consumed = cifs_utf16_bytes(tmp,
le16_to_cpu(pSMBr->PathConsumed),
nls_codepage);
kfree(tmp);
@@ -4809,8 +4814,8 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
/* copy DfsPath */
temp = (char *)ref + le16_to_cpu(ref->DfsPathOffset);
max_len = data_end - temp;
- node->path_name = cifs_strndup_from_ucs(temp, max_len,
- is_unicode, nls_codepage);
+ node->path_name = cifs_strndup_from_utf16(temp, max_len,
+ is_unicode, nls_codepage);
if (!node->path_name) {
rc = -ENOMEM;
goto parse_DFS_referrals_exit;
@@ -4819,8 +4824,8 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
/* copy link target UNC */
temp = (char *)ref + le16_to_cpu(ref->NetworkAddressOffset);
max_len = data_end - temp;
- node->node_name = cifs_strndup_from_ucs(temp, max_len,
- is_unicode, nls_codepage);
+ node->node_name = cifs_strndup_from_utf16(temp, max_len,
+ is_unicode, nls_codepage);
if (!node->node_name)
rc = -ENOMEM;
}
@@ -4873,8 +4878,9 @@ getDFSRetry:
if (ses->capabilities & CAP_UNICODE) {
pSMB->hdr.Flags2 |= SMBFLG2_UNICODE;
name_len =
- cifsConvertToUCS((__le16 *) pSMB->RequestFileName,
- searchName, PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->RequestFileName,
+ searchName, PATH_MAX, nls_codepage,
+ remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -5506,8 +5512,8 @@ SetEOFRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -5796,8 +5802,8 @@ SetTimesRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -5877,8 +5883,8 @@ SetAttrLgcyRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- ConvertToUCS((__le16 *) pSMB->fileName, fileName,
- PATH_MAX, nls_codepage);
+ ConvertToUTF16((__le16 *) pSMB->fileName, fileName,
+ PATH_MAX, nls_codepage);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -6030,8 +6036,8 @@ setPermsRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -6123,8 +6129,8 @@ QAllEAsRetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
list_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, searchName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName,
+ PATH_MAX, nls_codepage, remap);
list_len++; /* trailing null */
list_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
@@ -6301,8 +6307,8 @@ SetEARetry:
if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
name_len =
- cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
- PATH_MAX, nls_codepage, remap);
+ cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+ PATH_MAX, nls_codepage, remap);
name_len++; /* trailing null */
name_len *= 2;
} else { /* BB improve the check for buffer overruns BB */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 4666780f315..602f77c304c 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -38,6 +38,7 @@
#include <asm/processor.h>
#include <linux/inet.h>
#include <linux/module.h>
+#include <keys/user-type.h>
#include <net/ipv6.h>
#include "cifspdu.h"
#include "cifsglob.h"
@@ -225,74 +226,90 @@ static int check2ndT2(struct smb_hdr *pSMB)
static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
{
- struct smb_t2_rsp *pSMB2 = (struct smb_t2_rsp *)psecond;
+ struct smb_t2_rsp *pSMBs = (struct smb_t2_rsp *)psecond;
struct smb_t2_rsp *pSMBt = (struct smb_t2_rsp *)pTargetSMB;
- char *data_area_of_target;
- char *data_area_of_buf2;
+ char *data_area_of_tgt;
+ char *data_area_of_src;
int remaining;
- unsigned int byte_count, total_in_buf;
- __u16 total_data_size, total_in_buf2;
+ unsigned int byte_count, total_in_tgt;
+ __u16 tgt_total_cnt, src_total_cnt, total_in_src;
- total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
+ src_total_cnt = get_unaligned_le16(&pSMBs->t2_rsp.TotalDataCount);
+ tgt_total_cnt = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
- if (total_data_size !=
- get_unaligned_le16(&pSMB2->t2_rsp.TotalDataCount))
- cFYI(1, "total data size of primary and secondary t2 differ");
+ if (tgt_total_cnt != src_total_cnt)
+ cFYI(1, "total data count of primary and secondary t2 differ "
+ "source=%hu target=%hu", src_total_cnt, tgt_total_cnt);
- total_in_buf = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
+ total_in_tgt = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
- remaining = total_data_size - total_in_buf;
+ remaining = tgt_total_cnt - total_in_tgt;
- if (remaining < 0)
+ if (remaining < 0) {
+ cFYI(1, "Server sent too much data. tgt_total_cnt=%hu "
+ "total_in_tgt=%hu", tgt_total_cnt, total_in_tgt);
return -EPROTO;
+ }
- if (remaining == 0) /* nothing to do, ignore */
+ if (remaining == 0) {
+ /* nothing to do, ignore */
+ cFYI(1, "no more data remains");
return 0;
+ }
- total_in_buf2 = get_unaligned_le16(&pSMB2->t2_rsp.DataCount);
- if (remaining < total_in_buf2) {
+ total_in_src = get_unaligned_le16(&pSMBs->t2_rsp.DataCount);
+ if (remaining < total_in_src)
cFYI(1, "transact2 2nd response contains too much data");
- }
/* find end of first SMB data area */
- data_area_of_target = (char *)&pSMBt->hdr.Protocol +
+ data_area_of_tgt = (char *)&pSMBt->hdr.Protocol +
get_unaligned_le16(&pSMBt->t2_rsp.DataOffset);
- /* validate target area */
- data_area_of_buf2 = (char *)&pSMB2->hdr.Protocol +
- get_unaligned_le16(&pSMB2->t2_rsp.DataOffset);
+ /* validate target area */
+ data_area_of_src = (char *)&pSMBs->hdr.Protocol +
+ get_unaligned_le16(&pSMBs->t2_rsp.DataOffset);
- data_area_of_target += total_in_buf;
+ data_area_of_tgt += total_in_tgt;
- /* copy second buffer into end of first buffer */
- total_in_buf += total_in_buf2;
+ total_in_tgt += total_in_src;
/* is the result too big for the field? */
- if (total_in_buf > USHRT_MAX)
+ if (total_in_tgt > USHRT_MAX) {
+ cFYI(1, "coalesced DataCount too large (%u)", total_in_tgt);
return -EPROTO;
- put_unaligned_le16(total_in_buf, &pSMBt->t2_rsp.DataCount);
+ }
+ put_unaligned_le16(total_in_tgt, &pSMBt->t2_rsp.DataCount);
/* fix up the BCC */
byte_count = get_bcc(pTargetSMB);
- byte_count += total_in_buf2;
+ byte_count += total_in_src;
/* is the result too big for the field? */
- if (byte_count > USHRT_MAX)
+ if (byte_count > USHRT_MAX) {
+ cFYI(1, "coalesced BCC too large (%u)", byte_count);
return -EPROTO;
+ }
put_bcc(byte_count, pTargetSMB);
byte_count = be32_to_cpu(pTargetSMB->smb_buf_length);
- byte_count += total_in_buf2;
+ byte_count += total_in_src;
/* don't allow buffer to overflow */
- if (byte_count > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4)
+ if (byte_count > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
+ cFYI(1, "coalesced BCC exceeds buffer size (%u)", byte_count);
return -ENOBUFS;
+ }
pTargetSMB->smb_buf_length = cpu_to_be32(byte_count);
- memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2);
+ /* copy second buffer into end of first buffer */
+ memcpy(data_area_of_tgt, data_area_of_src, total_in_src);
- if (remaining == total_in_buf2) {
- cFYI(1, "found the last secondary response");
- return 0; /* we are done */
- } else /* more responses to go */
+ if (remaining != total_in_src) {
+ /* more responses to go */
+ cFYI(1, "waiting for more secondary responses");
return 1;
+ }
+
+ /* we are done */
+ cFYI(1, "found the last secondary response");
+ return 0;
}
static void
@@ -756,10 +773,11 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
cifs_dump_mem("Bad SMB: ", buf,
min_t(unsigned int, server->total_read, 48));
- if (mid)
- handle_mid(mid, server, smb_buffer, length);
+ if (!mid)
+ return length;
- return length;
+ handle_mid(mid, server, smb_buffer, length);
+ return 0;
}
static int
@@ -1578,11 +1596,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
}
}
- if (vol->multiuser && !(vol->secFlg & CIFSSEC_MAY_KRB5)) {
- cERROR(1, "Multiuser mounts currently require krb5 "
- "authentication!");
+#ifndef CONFIG_KEYS
+ /* Muliuser mounts require CONFIG_KEYS support */
+ if (vol->multiuser) {
+ cERROR(1, "Multiuser mounts require kernels with "
+ "CONFIG_KEYS enabled.");
goto cifs_parse_mount_err;
}
+#endif
if (vol->UNCip == NULL)
vol->UNCip = &vol->UNC[2];
@@ -1981,10 +2002,16 @@ static int match_session(struct cifs_ses *ses, struct smb_vol *vol)
return 0;
break;
default:
+ /* NULL username means anonymous session */
+ if (ses->user_name == NULL) {
+ if (!vol->nullauth)
+ return 0;
+ break;
+ }
+
/* anything else takes username/password */
- if (ses->user_name == NULL)
- return 0;
- if (strncmp(ses->user_name, vol->username,
+ if (strncmp(ses->user_name,
+ vol->username ? vol->username : "",
MAX_USERNAME_SIZE))
return 0;
if (strlen(vol->username) != 0 &&
@@ -2039,6 +2066,132 @@ cifs_put_smb_ses(struct cifs_ses *ses)
cifs_put_tcp_session(server);
}
+#ifdef CONFIG_KEYS
+
+/* strlen("cifs:a:") + INET6_ADDRSTRLEN + 1 */
+#define CIFSCREDS_DESC_SIZE (7 + INET6_ADDRSTRLEN + 1)
+
+/* Populate username and pw fields from keyring if possible */
+static int
+cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
+{
+ int rc = 0;
+ char *desc, *delim, *payload;
+ ssize_t len;
+ struct key *key;
+ struct TCP_Server_Info *server = ses->server;
+ struct sockaddr_in *sa;
+ struct sockaddr_in6 *sa6;
+ struct user_key_payload *upayload;
+
+ desc = kmalloc(CIFSCREDS_DESC_SIZE, GFP_KERNEL);
+ if (!desc)
+ return -ENOMEM;
+
+ /* try to find an address key first */
+ switch (server->dstaddr.ss_family) {
+ case AF_INET:
+ sa = (struct sockaddr_in *)&server->dstaddr;
+ sprintf(desc, "cifs:a:%pI4", &sa->sin_addr.s_addr);
+ break;
+ case AF_INET6:
+ sa6 = (struct sockaddr_in6 *)&server->dstaddr;
+ sprintf(desc, "cifs:a:%pI6c", &sa6->sin6_addr.s6_addr);
+ break;
+ default:
+ cFYI(1, "Bad ss_family (%hu)", server->dstaddr.ss_family);
+ rc = -EINVAL;
+ goto out_err;
+ }
+
+ cFYI(1, "%s: desc=%s", __func__, desc);
+ key = request_key(&key_type_logon, desc, "");
+ if (IS_ERR(key)) {
+ if (!ses->domainName) {
+ cFYI(1, "domainName is NULL");
+ rc = PTR_ERR(key);
+ goto out_err;
+ }
+
+ /* didn't work, try to find a domain key */
+ sprintf(desc, "cifs:d:%s", ses->domainName);
+ cFYI(1, "%s: desc=%s", __func__, desc);
+ key = request_key(&key_type_logon, desc, "");
+ if (IS_ERR(key)) {
+ rc = PTR_ERR(key);
+ goto out_err;
+ }
+ }
+
+ down_read(&key->sem);
+ upayload = key->payload.data;
+ if (IS_ERR_OR_NULL(upayload)) {
+ rc = upayload ? PTR_ERR(upayload) : -EINVAL;
+ goto out_key_put;
+ }
+
+ /* find first : in payload */
+ payload = (char *)upayload->data;
+ delim = strnchr(payload, upayload->datalen, ':');
+ cFYI(1, "payload=%s", payload);
+ if (!delim) {
+ cFYI(1, "Unable to find ':' in payload (datalen=%d)",
+ upayload->datalen);
+ rc = -EINVAL;
+ goto out_key_put;
+ }
+
+ len = delim - payload;
+ if (len > MAX_USERNAME_SIZE || len <= 0) {
+ cFYI(1, "Bad value from username search (len=%zd)", len);
+ rc = -EINVAL;
+ goto out_key_put;
+ }
+
+ vol->username = kstrndup(payload, len, GFP_KERNEL);
+ if (!vol->username) {
+ cFYI(1, "Unable to allocate %zd bytes for username", len);
+ rc = -ENOMEM;
+ goto out_key_put;
+ }
+ cFYI(1, "%s: username=%s", __func__, vol->username);
+
+ len = key->datalen - (len + 1);
+ if (len > MAX_PASSWORD_SIZE || len <= 0) {
+ cFYI(1, "Bad len for password search (len=%zd)", len);
+ rc = -EINVAL;
+ kfree(vol->username);
+ vol->username = NULL;
+ goto out_key_put;
+ }
+
+ ++delim;
+ vol->password = kstrndup(delim, len, GFP_KERNEL);
+ if (!vol->password) {
+ cFYI(1, "Unable to allocate %zd bytes for password", len);
+ rc = -ENOMEM;
+ kfree(vol->username);
+ vol->username = NULL;
+ goto out_key_put;
+ }
+
+out_key_put:
+ up_read(&key->sem);
+ key_put(key);
+out_err:
+ kfree(desc);
+ cFYI(1, "%s: returning %d", __func__, rc);
+ return rc;
+}
+#else /* ! CONFIG_KEYS */
+static inline int
+cifs_set_cifscreds(struct smb_vol *vol __attribute__((unused)),
+ struct cifs_ses *ses __attribute__((unused)))
+{
+ return -ENOSYS;
+}
+#endif /* CONFIG_KEYS */
+
static bool warned_on_ntlm; /* globals init to false automatically */
static struct cifs_ses *
@@ -2914,18 +3067,33 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
#define CIFS_DEFAULT_IOSIZE (1024 * 1024)
/*
- * Windows only supports a max of 60k reads. Default to that when posix
- * extensions aren't in force.
+ * Windows only supports a max of 60kb reads and 65535 byte writes. Default to
+ * those values when posix extensions aren't in force. In actuality here, we
+ * use 65536 to allow for a write that is a multiple of 4k. Most servers seem
+ * to be ok with the extra byte even though Windows doesn't send writes that
+ * are that large.
+ *
+ * Citation:
+ *
+ * http://blogs.msdn.com/b/openspecification/archive/2009/04/10/smb-maximum-transmit-buffer-size-and-performance-tuning.aspx
*/
#define CIFS_DEFAULT_NON_POSIX_RSIZE (60 * 1024)
+#define CIFS_DEFAULT_NON_POSIX_WSIZE (65536)
static unsigned int
cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
{
__u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
struct TCP_Server_Info *server = tcon->ses->server;
- unsigned int wsize = pvolume_info->wsize ? pvolume_info->wsize :
- CIFS_DEFAULT_IOSIZE;
+ unsigned int wsize;
+
+ /* start with specified wsize, or default */
+ if (pvolume_info->wsize)
+ wsize = pvolume_info->wsize;
+ else if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_WRITE_CAP))
+ wsize = CIFS_DEFAULT_IOSIZE;
+ else
+ wsize = CIFS_DEFAULT_NON_POSIX_WSIZE;
/* can server support 24-bit write sizes? (via UNIX extensions) */
if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP))
@@ -3136,10 +3304,9 @@ cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data,
return -EINVAL;
if (volume_info->nullauth) {
- cFYI(1, "null user");
- volume_info->username = kzalloc(1, GFP_KERNEL);
- if (volume_info->username == NULL)
- return -ENOMEM;
+ cFYI(1, "Anonymous login");
+ kfree(volume_info->username);
+ volume_info->username = NULL;
} else if (volume_info->username) {
/* BB fixme parse for domain name here */
cFYI(1, "Username: %s", volume_info->username);
@@ -3478,7 +3645,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses,
if (ses->capabilities & CAP_UNICODE) {
smb_buffer->Flags2 |= SMBFLG2_UNICODE;
length =
- cifs_strtoUCS((__le16 *) bcc_ptr, tree,
+ cifs_strtoUTF16((__le16 *) bcc_ptr, tree,
6 /* max utf8 char length in bytes */ *
(/* server len*/ + 256 /* share len */), nls_codepage);
bcc_ptr += 2 * length; /* convert num 16 bit words to bytes */
@@ -3533,7 +3700,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses,
/* mostly informational -- no need to fail on error here */
kfree(tcon->nativeFileSystem);
- tcon->nativeFileSystem = cifs_strndup_from_ucs(bcc_ptr,
+ tcon->nativeFileSystem = cifs_strndup_from_utf16(bcc_ptr,
bytes_left, is_unicode,
nls_codepage);
@@ -3657,25 +3824,43 @@ int cifs_setup_session(unsigned int xid, struct cifs_ses *ses,
return rc;
}
+static int
+cifs_set_vol_auth(struct smb_vol *vol, struct cifs_ses *ses)
+{
+ switch (ses->server->secType) {
+ case Kerberos:
+ vol->secFlg = CIFSSEC_MUST_KRB5;
+ return 0;
+ case NTLMv2:
+ vol->secFlg = CIFSSEC_MUST_NTLMV2;
+ break;
+ case NTLM:
+ vol->secFlg = CIFSSEC_MUST_NTLM;
+ break;
+ case RawNTLMSSP:
+ vol->secFlg = CIFSSEC_MUST_NTLMSSP;
+ break;
+ case LANMAN:
+ vol->secFlg = CIFSSEC_MUST_LANMAN;
+ break;
+ }
+
+ return cifs_set_cifscreds(vol, ses);
+}
+
static struct cifs_tcon *
cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
{
+ int rc;
struct cifs_tcon *master_tcon = cifs_sb_master_tcon(cifs_sb);
struct cifs_ses *ses;
struct cifs_tcon *tcon = NULL;
struct smb_vol *vol_info;
- char username[28]; /* big enough for "krb50x" + hex of ULONG_MAX 6+16 */
- /* We used to have this as MAX_USERNAME which is */
- /* way too big now (256 instead of 32) */
vol_info = kzalloc(sizeof(*vol_info), GFP_KERNEL);
- if (vol_info == NULL) {
- tcon = ERR_PTR(-ENOMEM);
- goto out;
- }
+ if (vol_info == NULL)
+ return ERR_PTR(-ENOMEM);
- snprintf(username, sizeof(username), "krb50x%x", fsuid);
- vol_info->username = username;
vol_info->local_nls = cifs_sb->local_nls;
vol_info->linux_uid = fsuid;
vol_info->cred_uid = fsuid;
@@ -3685,8 +3870,11 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
vol_info->local_lease = master_tcon->local_lease;
vol_info->no_linux_ext = !master_tcon->unix_ext;
- /* FIXME: allow for other secFlg settings */
- vol_info->secFlg = CIFSSEC_MUST_KRB5;
+ rc = cifs_set_vol_auth(vol_info, master_tcon->ses);
+ if (rc) {
+ tcon = ERR_PTR(rc);
+ goto out;
+ }
/* get a reference for the same TCP session */
spin_lock(&cifs_tcp_ses_lock);
@@ -3709,6 +3897,8 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
if (ses->capabilities & CAP_UNIX)
reset_cifs_unix_caps(0, tcon, NULL, vol_info);
out:
+ kfree(vol_info->username);
+ kfree(vol_info->password);
kfree(vol_info);
return tcon;
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index df8fecb5b99..63a196b97d5 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -492,7 +492,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
{
int xid;
int rc = 0; /* to get around spurious gcc warning, set to zero here */
- __u32 oplock = 0;
+ __u32 oplock = enable_oplocks ? REQ_OPLOCK : 0;
__u16 fileHandle = 0;
bool posix_open = false;
struct cifs_sb_info *cifs_sb;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index a090bbe6ee2..e2bbc683e01 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -647,10 +647,11 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
name.name = scratch_buf;
name.len =
- cifs_from_ucs2((char *)name.name, (__le16 *)de.name,
- UNICODE_NAME_MAX,
- min(de.namelen, (size_t)max_len), nlt,
- cifs_sb->mnt_cifs_flags &
+ cifs_from_utf16((char *)name.name, (__le16 *)de.name,
+ UNICODE_NAME_MAX,
+ min_t(size_t, de.namelen,
+ (size_t)max_len), nlt,
+ cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
name.len -= nls_nullsize(nlt);
} else {
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 4ec3ee9d72c..551d0c2b973 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -167,16 +167,16 @@ unicode_oslm_strings(char **pbcc_area, const struct nls_table *nls_cp)
int bytes_ret = 0;
/* Copy OS version */
- bytes_ret = cifs_strtoUCS((__le16 *)bcc_ptr, "Linux version ", 32,
- nls_cp);
+ bytes_ret = cifs_strtoUTF16((__le16 *)bcc_ptr, "Linux version ", 32,
+ nls_cp);
bcc_ptr += 2 * bytes_ret;
- bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, init_utsname()->release,
- 32, nls_cp);
+ bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, init_utsname()->release,
+ 32, nls_cp);
bcc_ptr += 2 * bytes_ret;
bcc_ptr += 2; /* trailing null */
- bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS,
- 32, nls_cp);
+ bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS,
+ 32, nls_cp);
bcc_ptr += 2 * bytes_ret;
bcc_ptr += 2; /* trailing null */
@@ -197,8 +197,8 @@ static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses,
*(bcc_ptr+1) = 0;
bytes_ret = 0;
} else
- bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->domainName,
- 256, nls_cp);
+ bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->domainName,
+ 256, nls_cp);
bcc_ptr += 2 * bytes_ret;
bcc_ptr += 2; /* account for null terminator */
@@ -226,8 +226,8 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
*bcc_ptr = 0;
*(bcc_ptr+1) = 0;
} else {
- bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->user_name,
- MAX_USERNAME_SIZE, nls_cp);
+ bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->user_name,
+ MAX_USERNAME_SIZE, nls_cp);
}
bcc_ptr += 2 * bytes_ret;
bcc_ptr += 2; /* account for null termination */
@@ -246,16 +246,15 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
/* copy user */
/* BB what about null user mounts - check that we do this BB */
/* copy user */
- if (ses->user_name != NULL)
+ if (ses->user_name != NULL) {
strncpy(bcc_ptr, ses->user_name, MAX_USERNAME_SIZE);
+ bcc_ptr += strnlen(ses->user_name, MAX_USERNAME_SIZE);
+ }
/* else null user mount */
-
- bcc_ptr += strnlen(ses->user_name, MAX_USERNAME_SIZE);
*bcc_ptr = 0;
bcc_ptr++; /* account for null termination */
/* copy domain */
-
if (ses->domainName != NULL) {
strncpy(bcc_ptr, ses->domainName, 256);
bcc_ptr += strnlen(ses->domainName, 256);
@@ -287,7 +286,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
cFYI(1, "bleft %d", bleft);
kfree(ses->serverOS);
- ses->serverOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
+ ses->serverOS = cifs_strndup_from_utf16(data, bleft, true, nls_cp);
cFYI(1, "serverOS=%s", ses->serverOS);
len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
data += len;
@@ -296,7 +295,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
return;
kfree(ses->serverNOS);
- ses->serverNOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
+ ses->serverNOS = cifs_strndup_from_utf16(data, bleft, true, nls_cp);
cFYI(1, "serverNOS=%s", ses->serverNOS);
len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
data += len;
@@ -305,7 +304,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
return;
kfree(ses->serverDomain);
- ses->serverDomain = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
+ ses->serverDomain = cifs_strndup_from_utf16(data, bleft, true, nls_cp);
cFYI(1, "serverDomain=%s", ses->serverDomain);
return;
@@ -395,6 +394,10 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags);
tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset);
tilen = le16_to_cpu(pblob->TargetInfoArray.Length);
+ if (tioffset > blob_len || tioffset + tilen > blob_len) {
+ cERROR(1, "tioffset + tilen too high %u + %u", tioffset, tilen);
+ return -EINVAL;
+ }
if (tilen) {
ses->auth_key.response = kmalloc(tilen, GFP_KERNEL);
if (!ses->auth_key.response) {
@@ -502,8 +505,8 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
tmp += 2;
} else {
int len;
- len = cifs_strtoUCS((__le16 *)tmp, ses->domainName,
- MAX_USERNAME_SIZE, nls_cp);
+ len = cifs_strtoUTF16((__le16 *)tmp, ses->domainName,
+ MAX_USERNAME_SIZE, nls_cp);
len *= 2; /* unicode is 2 bytes each */
sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
sec_blob->DomainName.Length = cpu_to_le16(len);
@@ -518,8 +521,8 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
tmp += 2;
} else {
int len;
- len = cifs_strtoUCS((__le16 *)tmp, ses->user_name,
- MAX_USERNAME_SIZE, nls_cp);
+ len = cifs_strtoUTF16((__le16 *)tmp, ses->user_name,
+ MAX_USERNAME_SIZE, nls_cp);
len *= 2; /* unicode is 2 bytes each */
sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
sec_blob->UserName.Length = cpu_to_le16(len);
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 80d85088193..d5cd9aa7eac 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -213,7 +213,7 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16,
/* Password cannot be longer than 128 characters */
if (passwd) /* Password must be converted to NT unicode */
- len = cifs_strtoUCS(wpwd, passwd, 128, codepage);
+ len = cifs_strtoUTF16(wpwd, passwd, 128, codepage);
else {
len = 0;
*wpwd = 0; /* Ensure string is null terminated */
diff --git a/fs/compat.c b/fs/compat.c
index fa9d721ecfe..07880bae28a 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -131,41 +131,35 @@ asmlinkage long compat_sys_utimes(const char __user *filename, struct compat_tim
static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
{
- compat_ino_t ino = stat->ino;
- typeof(ubuf->st_uid) uid = 0;
- typeof(ubuf->st_gid) gid = 0;
- int err;
+ struct compat_stat tmp;
- SET_UID(uid, stat->uid);
- SET_GID(gid, stat->gid);
+ if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev))
+ return -EOVERFLOW;
- if ((u64) stat->size > MAX_NON_LFS ||
- !old_valid_dev(stat->dev) ||
- !old_valid_dev(stat->rdev))
+ memset(&tmp, 0, sizeof(tmp));
+ tmp.st_dev = old_encode_dev(stat->dev);
+ tmp.st_ino = stat->ino;
+ if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
return -EOVERFLOW;
- if (sizeof(ino) < sizeof(stat->ino) && ino != stat->ino)
+ tmp.st_mode = stat->mode;
+ tmp.st_nlink = stat->nlink;
+ if (tmp.st_nlink != stat->nlink)
return -EOVERFLOW;
-
- if (clear_user(ubuf, sizeof(*ubuf)))
- return -EFAULT;
-
- err = __put_user(old_encode_dev(stat->dev), &ubuf->st_dev);
- err |= __put_user(ino, &ubuf->st_ino);
- err |= __put_user(stat->mode, &ubuf->st_mode);
- err |= __put_user(stat->nlink, &ubuf->st_nlink);
- err |= __put_user(uid, &ubuf->st_uid);
- err |= __put_user(gid, &ubuf->st_gid);
- err |= __put_user(old_encode_dev(stat->rdev), &ubuf->st_rdev);
- err |= __put_user(stat->size, &ubuf->st_size);
- err |= __put_user(stat->atime.tv_sec, &ubuf->st_atime);
- err |= __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec);
- err |= __put_user(stat->mtime.tv_sec, &ubuf->st_mtime);
- err |= __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec);
- err |= __put_user(stat->ctime.tv_sec, &ubuf->st_ctime);
- err |= __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec);
- err |= __put_user(stat->blksize, &ubuf->st_blksize);
- err |= __put_user(stat->blocks, &ubuf->st_blocks);
- return err;
+ SET_UID(tmp.st_uid, stat->uid);
+ SET_GID(tmp.st_gid, stat->gid);
+ tmp.st_rdev = old_encode_dev(stat->rdev);
+ if ((u64) stat->size > MAX_NON_LFS)
+ return -EOVERFLOW;
+ tmp.st_size = stat->size;
+ tmp.st_atime = stat->atime.tv_sec;
+ tmp.st_atime_nsec = stat->atime.tv_nsec;
+ tmp.st_mtime = stat->mtime.tv_sec;
+ tmp.st_mtime_nsec = stat->mtime.tv_nsec;
+ tmp.st_ctime = stat->ctime.tv_sec;
+ tmp.st_ctime_nsec = stat->ctime.tv_nsec;
+ tmp.st_blocks = stat->blocks;
+ tmp.st_blksize = stat->blksize;
+ return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}
asmlinkage long compat_sys_newstat(const char __user * filename,
diff --git a/fs/dcache.c b/fs/dcache.c
index 16a53cc2cc0..fe19ac13f75 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2968,7 +2968,7 @@ __setup("dhash_entries=", set_dhash_entries);
static void __init dcache_init_early(void)
{
- int loop;
+ unsigned int loop;
/* If hashes are distributed across NUMA nodes, defer
* hash allocation until vmalloc space is available.
@@ -2986,13 +2986,13 @@ static void __init dcache_init_early(void)
&d_hash_mask,
0);
- for (loop = 0; loop < (1 << d_hash_shift); loop++)
+ for (loop = 0; loop < (1U << d_hash_shift); loop++)
INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
}
static void __init dcache_init(void)
{
- int loop;
+ unsigned int loop;
/*
* A constructor could be added for stable state like the lists,
@@ -3016,7 +3016,7 @@ static void __init dcache_init(void)
&d_hash_mask,
0);
- for (loop = 0; loop < (1 << d_hash_shift); loop++)
+ for (loop = 0; loop < (1U << d_hash_shift); loop++)
INIT_HLIST_BL_HEAD(dentry_hashtable + loop);
}
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index f65d4455c5e..ef023eef046 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -540,7 +540,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_blob);
* debugfs_print_regs32 - use seq_print to describe a set of registers
* @s: the seq_file structure being used to generate output
* @regs: an array if struct debugfs_reg32 structures
- * @mregs: the length of the above array
+ * @nregs: the length of the above array
* @base: the base address to be used in reading the registers
* @prefix: a string to be prefixed to every output line
*
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 2a834255c75..ea993128155 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -417,17 +417,6 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
(unsigned long long)(extent_base + extent_offset), rc);
goto out;
}
- if (unlikely(ecryptfs_verbosity > 0)) {
- ecryptfs_printk(KERN_DEBUG, "Encrypting extent "
- "with iv:\n");
- ecryptfs_dump_hex(extent_iv, crypt_stat->iv_bytes);
- ecryptfs_printk(KERN_DEBUG, "First 8 bytes before "
- "encryption:\n");
- ecryptfs_dump_hex((char *)
- (page_address(page)
- + (extent_offset * crypt_stat->extent_size)),
- 8);
- }
rc = ecryptfs_encrypt_page_offset(crypt_stat, enc_extent_page, 0,
page, (extent_offset
* crypt_stat->extent_size),
@@ -440,14 +429,6 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
goto out;
}
rc = 0;
- if (unlikely(ecryptfs_verbosity > 0)) {
- ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16llx]; "
- "rc = [%d]\n",
- (unsigned long long)(extent_base + extent_offset), rc);
- ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
- "encryption:\n");
- ecryptfs_dump_hex((char *)(page_address(enc_extent_page)), 8);
- }
out:
return rc;
}
@@ -543,17 +524,6 @@ static int ecryptfs_decrypt_extent(struct page *page,
(unsigned long long)(extent_base + extent_offset), rc);
goto out;
}
- if (unlikely(ecryptfs_verbosity > 0)) {
- ecryptfs_printk(KERN_DEBUG, "Decrypting extent "
- "with iv:\n");
- ecryptfs_dump_hex(extent_iv, crypt_stat->iv_bytes);
- ecryptfs_printk(KERN_DEBUG, "First 8 bytes before "
- "decryption:\n");
- ecryptfs_dump_hex((char *)
- (page_address(enc_extent_page)
- + (extent_offset * crypt_stat->extent_size)),
- 8);
- }
rc = ecryptfs_decrypt_page_offset(crypt_stat, page,
(extent_offset
* crypt_stat->extent_size),
@@ -567,16 +537,6 @@ static int ecryptfs_decrypt_extent(struct page *page,
goto out;
}
rc = 0;
- if (unlikely(ecryptfs_verbosity > 0)) {
- ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16llx]; "
- "rc = [%d]\n",
- (unsigned long long)(extent_base + extent_offset), rc);
- ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
- "decryption:\n");
- ecryptfs_dump_hex((char *)(page_address(page)
- + (extent_offset
- * crypt_stat->extent_size)), 8);
- }
out:
return rc;
}
@@ -1590,8 +1550,8 @@ int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry,
*/
int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
{
- int rc = 0;
- char *page_virt = NULL;
+ int rc;
+ char *page_virt;
struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode;
struct ecryptfs_crypt_stat *crypt_stat =
&ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
@@ -1616,11 +1576,13 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
ecryptfs_dentry,
ECRYPTFS_VALIDATE_HEADER_SIZE);
if (rc) {
+ /* metadata is not in the file header, so try xattrs */
memset(page_virt, 0, PAGE_CACHE_SIZE);
rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode);
if (rc) {
printk(KERN_DEBUG "Valid eCryptfs headers not found in "
- "file header region or xattr region\n");
+ "file header region or xattr region, inode %lu\n",
+ ecryptfs_inode->i_ino);
rc = -EINVAL;
goto out;
}
@@ -1629,7 +1591,8 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
ECRYPTFS_DONT_VALIDATE_HEADER_SIZE);
if (rc) {
printk(KERN_DEBUG "Valid eCryptfs headers not found in "
- "file xattr region either\n");
+ "file xattr region either, inode %lu\n",
+ ecryptfs_inode->i_ino);
rc = -EINVAL;
}
if (crypt_stat->mount_crypt_stat->flags
@@ -1640,7 +1603,8 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
"crypto metadata only in the extended attribute "
"region, but eCryptfs was mounted without "
"xattr support enabled. eCryptfs will not treat "
- "this like an encrypted file.\n");
+ "this like an encrypted file, inode %lu\n",
+ ecryptfs_inode->i_ino);
rc = -EINVAL;
}
}
@@ -2026,6 +1990,17 @@ out:
return;
}
+static size_t ecryptfs_max_decoded_size(size_t encoded_size)
+{
+ /* Not exact; conservatively long. Every block of 4
+ * encoded characters decodes into a block of 3
+ * decoded characters. This segment of code provides
+ * the caller with the maximum amount of allocated
+ * space that @dst will need to point to in a
+ * subsequent call. */
+ return ((encoded_size + 1) * 3) / 4;
+}
+
/**
* ecryptfs_decode_from_filename
* @dst: If NULL, this function only sets @dst_size and returns. If
@@ -2044,13 +2019,7 @@ ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size,
size_t dst_byte_offset = 0;
if (dst == NULL) {
- /* Not exact; conservatively long. Every block of 4
- * encoded characters decodes into a block of 3
- * decoded characters. This segment of code provides
- * the caller with the maximum amount of allocated
- * space that @dst will need to point to in a
- * subsequent call. */
- (*dst_size) = (((src_size + 1) * 3) / 4);
+ (*dst_size) = ecryptfs_max_decoded_size(src_size);
goto out;
}
while (src_byte_offset < src_size) {
@@ -2275,3 +2244,52 @@ out_free:
out:
return rc;
}
+
+#define ENC_NAME_MAX_BLOCKLEN_8_OR_16 143
+
+int ecryptfs_set_f_namelen(long *namelen, long lower_namelen,
+ struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
+{
+ struct blkcipher_desc desc;
+ struct mutex *tfm_mutex;
+ size_t cipher_blocksize;
+ int rc;
+
+ if (!(mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)) {
+ (*namelen) = lower_namelen;
+ return 0;
+ }
+
+ rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex,
+ mount_crypt_stat->global_default_fn_cipher_name);
+ if (unlikely(rc)) {
+ (*namelen) = 0;
+ return rc;
+ }
+
+ mutex_lock(tfm_mutex);
+ cipher_blocksize = crypto_blkcipher_blocksize(desc.tfm);
+ mutex_unlock(tfm_mutex);
+
+ /* Return an exact amount for the common cases */
+ if (lower_namelen == NAME_MAX
+ && (cipher_blocksize == 8 || cipher_blocksize == 16)) {
+ (*namelen) = ENC_NAME_MAX_BLOCKLEN_8_OR_16;
+ return 0;
+ }
+
+ /* Return a safe estimate for the uncommon cases */
+ (*namelen) = lower_namelen;
+ (*namelen) -= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE;
+ /* Since this is the max decoded size, subtract 1 "decoded block" len */
+ (*namelen) = ecryptfs_max_decoded_size(*namelen) - 3;
+ (*namelen) -= ECRYPTFS_TAG_70_MAX_METADATA_SIZE;
+ (*namelen) -= ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES;
+ /* Worst case is that the filename is padded nearly a full block size */
+ (*namelen) -= cipher_blocksize - 1;
+
+ if ((*namelen) < 0)
+ (*namelen) = 0;
+
+ return 0;
+}
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index a9f29b12fbf..867b64c5d84 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -151,12 +151,21 @@ ecryptfs_get_key_payload_data(struct key *key)
* dentry name */
#define ECRYPTFS_TAG_73_PACKET_TYPE 0x49 /* FEK-encrypted filename as
* metadata */
+#define ECRYPTFS_MIN_PKT_LEN_SIZE 1 /* Min size to specify packet length */
+#define ECRYPTFS_MAX_PKT_LEN_SIZE 2 /* Pass at least this many bytes to
+ * ecryptfs_parse_packet_length() and
+ * ecryptfs_write_packet_length()
+ */
/* Constraint: ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES >=
* ECRYPTFS_MAX_IV_BYTES */
#define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16
#define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */
#define MD5_DIGEST_SIZE 16
#define ECRYPTFS_TAG_70_DIGEST_SIZE MD5_DIGEST_SIZE
+#define ECRYPTFS_TAG_70_MIN_METADATA_SIZE (1 + ECRYPTFS_MIN_PKT_LEN_SIZE \
+ + ECRYPTFS_SIG_SIZE + 1 + 1)
+#define ECRYPTFS_TAG_70_MAX_METADATA_SIZE (1 + ECRYPTFS_MAX_PKT_LEN_SIZE \
+ + ECRYPTFS_SIG_SIZE + 1 + 1)
#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FEK_ENCRYPTED."
#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE 23
#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FNEK_ENCRYPTED."
@@ -696,6 +705,8 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
size_t *packet_size,
struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
char *data, size_t max_packet_size);
+int ecryptfs_set_f_namelen(long *namelen, long lower_namelen,
+ struct ecryptfs_mount_crypt_stat *mount_crypt_stat);
int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
loff_t offset);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 19a8ca4ab1d..ab35b113003 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -822,18 +822,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
size_t num_zeros = (PAGE_CACHE_SIZE
- (ia->ia_size & ~PAGE_CACHE_MASK));
-
- /*
- * XXX(truncate) this should really happen at the begginning
- * of ->setattr. But the code is too messy to that as part
- * of a larger patch. ecryptfs is also totally missing out
- * on the inode_change_ok check at the beginning of
- * ->setattr while would include this.
- */
- rc = inode_newsize_ok(inode, ia->ia_size);
- if (rc)
- goto out;
-
if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
truncate_setsize(inode, ia->ia_size);
lower_ia->ia_size = ia->ia_size;
@@ -883,6 +871,28 @@ out:
return rc;
}
+static int ecryptfs_inode_newsize_ok(struct inode *inode, loff_t offset)
+{
+ struct ecryptfs_crypt_stat *crypt_stat;
+ loff_t lower_oldsize, lower_newsize;
+
+ crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
+ lower_oldsize = upper_size_to_lower_size(crypt_stat,
+ i_size_read(inode));
+ lower_newsize = upper_size_to_lower_size(crypt_stat, offset);
+ if (lower_newsize > lower_oldsize) {
+ /*
+ * The eCryptfs inode and the new *lower* size are mixed here
+ * because we may not have the lower i_mutex held and/or it may
+ * not be appropriate to call inode_newsize_ok() with inodes
+ * from other filesystems.
+ */
+ return inode_newsize_ok(inode, lower_newsize);
+ }
+
+ return 0;
+}
+
/**
* ecryptfs_truncate
* @dentry: The ecryptfs layer dentry
@@ -899,6 +909,10 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
struct iattr lower_ia = { .ia_valid = 0 };
int rc;
+ rc = ecryptfs_inode_newsize_ok(dentry->d_inode, new_length);
+ if (rc)
+ return rc;
+
rc = truncate_upper(dentry, &ia, &lower_ia);
if (!rc && lower_ia.ia_valid & ATTR_SIZE) {
struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
@@ -978,6 +992,16 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
}
}
mutex_unlock(&crypt_stat->cs_mutex);
+
+ rc = inode_change_ok(inode, ia);
+ if (rc)
+ goto out;
+ if (ia->ia_valid & ATTR_SIZE) {
+ rc = ecryptfs_inode_newsize_ok(inode, ia->ia_size);
+ if (rc)
+ goto out;
+ }
+
if (S_ISREG(inode->i_mode)) {
rc = filemap_write_and_wait(inode->i_mapping);
if (rc)
@@ -1061,6 +1085,8 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
}
rc = vfs_setxattr(lower_dentry, name, value, size, flags);
+ if (!rc)
+ fsstack_copy_attr_all(dentry->d_inode, lower_dentry->d_inode);
out:
return rc;
}
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index ac1ad48c237..2333203a120 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -109,7 +109,7 @@ int ecryptfs_parse_packet_length(unsigned char *data, size_t *size,
(*size) += ((unsigned char)(data[1]) + 192);
(*length_size) = 2;
} else if (data[0] == 255) {
- /* Five-byte length; we're not supposed to see this */
+ /* If support is added, adjust ECRYPTFS_MAX_PKT_LEN_SIZE */
ecryptfs_printk(KERN_ERR, "Five-byte packet length not "
"supported\n");
rc = -EINVAL;
@@ -126,7 +126,7 @@ out:
/**
* ecryptfs_write_packet_length
* @dest: The byte array target into which to write the length. Must
- * have at least 5 bytes allocated.
+ * have at least ECRYPTFS_MAX_PKT_LEN_SIZE bytes allocated.
* @size: The length to write.
* @packet_size_length: The number of bytes used to encode the packet
* length is written to this address.
@@ -146,6 +146,7 @@ int ecryptfs_write_packet_length(char *dest, size_t size,
dest[1] = ((size - 192) % 256);
(*packet_size_length) = 2;
} else {
+ /* If support is added, adjust ECRYPTFS_MAX_PKT_LEN_SIZE */
rc = -EINVAL;
ecryptfs_printk(KERN_WARNING,
"Unsupported packet size: [%zd]\n", size);
@@ -678,10 +679,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
* Octets N3-N4: Block-aligned encrypted filename
* - Consists of a minimum number of random characters, a \0
* separator, and then the filename */
- s->max_packet_size = (1 /* Tag 70 identifier */
- + 3 /* Max Tag 70 packet size */
- + ECRYPTFS_SIG_SIZE /* FNEK sig */
- + 1 /* Cipher identifier */
+ s->max_packet_size = (ECRYPTFS_TAG_70_MAX_METADATA_SIZE
+ s->block_aligned_filename_size);
if (dest == NULL) {
(*packet_size) = s->max_packet_size;
@@ -933,10 +931,10 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
goto out;
}
s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
- if (max_packet_size < (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)) {
+ if (max_packet_size < ECRYPTFS_TAG_70_MIN_METADATA_SIZE) {
printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be "
"at least [%d]\n", __func__, max_packet_size,
- (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1));
+ ECRYPTFS_TAG_70_MIN_METADATA_SIZE);
rc = -EINVAL;
goto out;
}
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 940a82e63dc..349209dc6a9 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -218,6 +218,29 @@ out_unlock:
return rc;
}
+/*
+ * miscdevfs packet format:
+ * Octet 0: Type
+ * Octets 1-4: network byte order msg_ctx->counter
+ * Octets 5-N0: Size of struct ecryptfs_message to follow
+ * Octets N0-N1: struct ecryptfs_message (including data)
+ *
+ * Octets 5-N1 not written if the packet type does not include a message
+ */
+#define PKT_TYPE_SIZE 1
+#define PKT_CTR_SIZE 4
+#define MIN_NON_MSG_PKT_SIZE (PKT_TYPE_SIZE + PKT_CTR_SIZE)
+#define MIN_MSG_PKT_SIZE (PKT_TYPE_SIZE + PKT_CTR_SIZE \
+ + ECRYPTFS_MIN_PKT_LEN_SIZE)
+/* 4 + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES comes from tag 65 packet format */
+#define MAX_MSG_PKT_SIZE (PKT_TYPE_SIZE + PKT_CTR_SIZE \
+ + ECRYPTFS_MAX_PKT_LEN_SIZE \
+ + sizeof(struct ecryptfs_message) \
+ + 4 + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES)
+#define PKT_TYPE_OFFSET 0
+#define PKT_CTR_OFFSET PKT_TYPE_SIZE
+#define PKT_LEN_OFFSET (PKT_TYPE_SIZE + PKT_CTR_SIZE)
+
/**
* ecryptfs_miscdev_read - format and send message from queue
* @file: fs/ecryptfs/euid miscdevfs handle (ignored)
@@ -237,7 +260,7 @@ ecryptfs_miscdev_read(struct file *file, char __user *buf, size_t count,
struct ecryptfs_daemon *daemon;
struct ecryptfs_msg_ctx *msg_ctx;
size_t packet_length_size;
- char packet_length[3];
+ char packet_length[ECRYPTFS_MAX_PKT_LEN_SIZE];
size_t i;
size_t total_length;
uid_t euid = current_euid();
@@ -305,15 +328,8 @@ check_list:
packet_length_size = 0;
msg_ctx->msg_size = 0;
}
- /* miscdevfs packet format:
- * Octet 0: Type
- * Octets 1-4: network byte order msg_ctx->counter
- * Octets 5-N0: Size of struct ecryptfs_message to follow
- * Octets N0-N1: struct ecryptfs_message (including data)
- *
- * Octets 5-N1 not written if the packet type does not
- * include a message */
- total_length = (1 + 4 + packet_length_size + msg_ctx->msg_size);
+ total_length = (PKT_TYPE_SIZE + PKT_CTR_SIZE + packet_length_size
+ + msg_ctx->msg_size);
if (count < total_length) {
rc = 0;
printk(KERN_WARNING "%s: Only given user buffer of "
@@ -324,9 +340,10 @@ check_list:
rc = -EFAULT;
if (put_user(msg_ctx->type, buf))
goto out_unlock_msg_ctx;
- if (put_user(cpu_to_be32(msg_ctx->counter), (__be32 __user *)(buf + 1)))
+ if (put_user(cpu_to_be32(msg_ctx->counter),
+ (__be32 __user *)(&buf[PKT_CTR_OFFSET])))
goto out_unlock_msg_ctx;
- i = 5;
+ i = PKT_TYPE_SIZE + PKT_CTR_SIZE;
if (msg_ctx->msg) {
if (copy_to_user(&buf[i], packet_length, packet_length_size))
goto out_unlock_msg_ctx;
@@ -391,12 +408,6 @@ out:
* @count: Amount of data in @buf
* @ppos: Pointer to offset in file (ignored)
*
- * miscdevfs packet format:
- * Octet 0: Type
- * Octets 1-4: network byte order msg_ctx->counter (0's for non-response)
- * Octets 5-N0: Size of struct ecryptfs_message to follow
- * Octets N0-N1: struct ecryptfs_message (including data)
- *
* Returns the number of bytes read from @buf
*/
static ssize_t
@@ -405,60 +416,78 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
{
__be32 counter_nbo;
u32 seq;
- size_t packet_size, packet_size_length, i;
- ssize_t sz = 0;
+ size_t packet_size, packet_size_length;
char *data;
uid_t euid = current_euid();
- int rc;
+ unsigned char packet_size_peek[ECRYPTFS_MAX_PKT_LEN_SIZE];
+ ssize_t rc;
- if (count == 0)
- goto out;
+ if (count == 0) {
+ return 0;
+ } else if (count == MIN_NON_MSG_PKT_SIZE) {
+ /* Likely a harmless MSG_HELO or MSG_QUIT - no packet length */
+ goto memdup;
+ } else if (count < MIN_MSG_PKT_SIZE || count > MAX_MSG_PKT_SIZE) {
+ printk(KERN_WARNING "%s: Acceptable packet size range is "
+ "[%d-%lu], but amount of data written is [%zu].",
+ __func__, MIN_MSG_PKT_SIZE, MAX_MSG_PKT_SIZE, count);
+ return -EINVAL;
+ }
+
+ if (copy_from_user(packet_size_peek, &buf[PKT_LEN_OFFSET],
+ sizeof(packet_size_peek))) {
+ printk(KERN_WARNING "%s: Error while inspecting packet size\n",
+ __func__);
+ return -EFAULT;
+ }
+ rc = ecryptfs_parse_packet_length(packet_size_peek, &packet_size,
+ &packet_size_length);
+ if (rc) {
+ printk(KERN_WARNING "%s: Error parsing packet length; "
+ "rc = [%zd]\n", __func__, rc);
+ return rc;
+ }
+
+ if ((PKT_TYPE_SIZE + PKT_CTR_SIZE + packet_size_length + packet_size)
+ != count) {
+ printk(KERN_WARNING "%s: Invalid packet size [%zu]\n", __func__,
+ packet_size);
+ return -EINVAL;
+ }
+
+memdup:
data = memdup_user(buf, count);
if (IS_ERR(data)) {
printk(KERN_ERR "%s: memdup_user returned error [%ld]\n",
__func__, PTR_ERR(data));
- goto out;
+ return PTR_ERR(data);
}
- sz = count;
- i = 0;
- switch (data[i++]) {
+ switch (data[PKT_TYPE_OFFSET]) {
case ECRYPTFS_MSG_RESPONSE:
- if (count < (1 + 4 + 1 + sizeof(struct ecryptfs_message))) {
+ if (count < (MIN_MSG_PKT_SIZE
+ + sizeof(struct ecryptfs_message))) {
printk(KERN_WARNING "%s: Minimum acceptable packet "
"size is [%zd], but amount of data written is "
"only [%zd]. Discarding response packet.\n",
__func__,
- (1 + 4 + 1 + sizeof(struct ecryptfs_message)),
- count);
+ (MIN_MSG_PKT_SIZE
+ + sizeof(struct ecryptfs_message)), count);
+ rc = -EINVAL;
goto out_free;
}
- memcpy(&counter_nbo, &data[i], 4);
+ memcpy(&counter_nbo, &data[PKT_CTR_OFFSET], PKT_CTR_SIZE);
seq = be32_to_cpu(counter_nbo);
- i += 4;
- rc = ecryptfs_parse_packet_length(&data[i], &packet_size,
- &packet_size_length);
+ rc = ecryptfs_miscdev_response(
+ &data[PKT_LEN_OFFSET + packet_size_length],
+ packet_size, euid, current_user_ns(),
+ task_pid(current), seq);
if (rc) {
- printk(KERN_WARNING "%s: Error parsing packet length; "
- "rc = [%d]\n", __func__, rc);
- goto out_free;
- }
- i += packet_size_length;
- if ((1 + 4 + packet_size_length + packet_size) != count) {
- printk(KERN_WARNING "%s: (1 + packet_size_length([%zd])"
- " + packet_size([%zd]))([%zd]) != "
- "count([%zd]). Invalid packet format.\n",
- __func__, packet_size_length, packet_size,
- (1 + packet_size_length + packet_size), count);
- goto out_free;
- }
- rc = ecryptfs_miscdev_response(&data[i], packet_size,
- euid, current_user_ns(),
- task_pid(current), seq);
- if (rc)
printk(KERN_WARNING "%s: Failed to deliver miscdev "
- "response to requesting operation; rc = [%d]\n",
+ "response to requesting operation; rc = [%zd]\n",
__func__, rc);
+ goto out_free;
+ }
break;
case ECRYPTFS_MSG_HELO:
case ECRYPTFS_MSG_QUIT:
@@ -467,12 +496,13 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
ecryptfs_printk(KERN_WARNING, "Dropping miscdev "
"message of unrecognized type [%d]\n",
data[0]);
- break;
+ rc = -EINVAL;
+ goto out_free;
}
+ rc = count;
out_free:
kfree(data);
-out:
- return sz;
+ return rc;
}
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 6a44148c5fb..a46b3a8fee1 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -57,6 +57,10 @@ struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index)
* @page: Page that is locked before this call is made
*
* Returns zero on success; non-zero otherwise
+ *
+ * This is where we encrypt the data and pass the encrypted data to
+ * the lower filesystem. In OpenPGP-compatible mode, we operate on
+ * entire underlying packets.
*/
static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc)
{
@@ -146,7 +150,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
/* This is a header extent */
char *page_virt;
- page_virt = kmap_atomic(page, KM_USER0);
+ page_virt = kmap_atomic(page);
memset(page_virt, 0, PAGE_CACHE_SIZE);
/* TODO: Support more than one header extent */
if (view_extent_num == 0) {
@@ -159,7 +163,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
crypt_stat,
&written);
}
- kunmap_atomic(page_virt, KM_USER0);
+ kunmap_atomic(page_virt);
flush_dcache_page(page);
if (rc) {
printk(KERN_ERR "%s: Error reading xattr "
@@ -481,10 +485,6 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode)
* @copied: The amount of data copied
* @page: The eCryptfs page
* @fsdata: The fsdata (unused)
- *
- * This is where we encrypt the data and pass the encrypted data to
- * the lower filesystem. In OpenPGP-compatible mode, we operate on
- * entire underlying packets.
*/
static int ecryptfs_write_end(struct file *file,
struct address_space *mapping,
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 3745f7c2b9c..b2a34a192f4 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -130,13 +130,18 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
pgoff_t ecryptfs_page_idx = (pos >> PAGE_CACHE_SHIFT);
size_t start_offset_in_page = (pos & ~PAGE_CACHE_MASK);
size_t num_bytes = (PAGE_CACHE_SIZE - start_offset_in_page);
- size_t total_remaining_bytes = ((offset + size) - pos);
+ loff_t total_remaining_bytes = ((offset + size) - pos);
+
+ if (fatal_signal_pending(current)) {
+ rc = -EINTR;
+ break;
+ }
if (num_bytes > total_remaining_bytes)
num_bytes = total_remaining_bytes;
if (pos < offset) {
/* remaining zeros to write, up to destination offset */
- size_t total_remaining_zeros = (offset - pos);
+ loff_t total_remaining_zeros = (offset - pos);
if (num_bytes > total_remaining_zeros)
num_bytes = total_remaining_zeros;
@@ -151,7 +156,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
ecryptfs_page_idx, rc);
goto out;
}
- ecryptfs_page_virt = kmap_atomic(ecryptfs_page, KM_USER0);
+ ecryptfs_page_virt = kmap_atomic(ecryptfs_page);
/*
* pos: where we're now writing, offset: where the request was
@@ -174,7 +179,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
(data + data_offset), num_bytes);
data_offset += num_bytes;
}
- kunmap_atomic(ecryptfs_page_virt, KM_USER0);
+ kunmap_atomic(ecryptfs_page_virt);
flush_dcache_page(ecryptfs_page);
SetPageUptodate(ecryptfs_page);
unlock_page(ecryptfs_page);
@@ -193,15 +198,19 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
}
pos += num_bytes;
}
- if ((offset + size) > ecryptfs_file_size) {
- i_size_write(ecryptfs_inode, (offset + size));
+ if (pos > ecryptfs_file_size) {
+ i_size_write(ecryptfs_inode, pos);
if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) {
- rc = ecryptfs_write_inode_size_to_metadata(
+ int rc2;
+
+ rc2 = ecryptfs_write_inode_size_to_metadata(
ecryptfs_inode);
- if (rc) {
+ if (rc2) {
printk(KERN_ERR "Problem with "
"ecryptfs_write_inode_size_to_metadata; "
- "rc = [%d]\n", rc);
+ "rc = [%d]\n", rc2);
+ if (!rc)
+ rc = rc2;
goto out;
}
}
@@ -273,76 +282,3 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
flush_dcache_page(page_for_ecryptfs);
return rc;
}
-
-#if 0
-/**
- * ecryptfs_read
- * @data: The virtual address into which to write the data read (and
- * possibly decrypted) from the lower file
- * @offset: The offset in the decrypted view of the file from which to
- * read into @data
- * @size: The number of bytes to read into @data
- * @ecryptfs_file: The eCryptfs file from which to read
- *
- * Read an arbitrary amount of data from an arbitrary location in the
- * eCryptfs page cache. This is done on an extent-by-extent basis;
- * individual extents are decrypted and read from the lower page
- * cache (via VFS reads). This function takes care of all the
- * address translation to locations in the lower filesystem.
- *
- * Returns zero on success; non-zero otherwise
- */
-int ecryptfs_read(char *data, loff_t offset, size_t size,
- struct file *ecryptfs_file)
-{
- struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode;
- struct page *ecryptfs_page;
- char *ecryptfs_page_virt;
- loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
- loff_t data_offset = 0;
- loff_t pos;
- int rc = 0;
-
- if ((offset + size) > ecryptfs_file_size) {
- rc = -EINVAL;
- printk(KERN_ERR "%s: Attempt to read data past the end of the "
- "file; offset = [%lld]; size = [%td]; "
- "ecryptfs_file_size = [%lld]\n",
- __func__, offset, size, ecryptfs_file_size);
- goto out;
- }
- pos = offset;
- while (pos < (offset + size)) {
- pgoff_t ecryptfs_page_idx = (pos >> PAGE_CACHE_SHIFT);
- size_t start_offset_in_page = (pos & ~PAGE_CACHE_MASK);
- size_t num_bytes = (PAGE_CACHE_SIZE - start_offset_in_page);
- size_t total_remaining_bytes = ((offset + size) - pos);
-
- if (num_bytes > total_remaining_bytes)
- num_bytes = total_remaining_bytes;
- ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode,
- ecryptfs_page_idx);
- if (IS_ERR(ecryptfs_page)) {
- rc = PTR_ERR(ecryptfs_page);
- printk(KERN_ERR "%s: Error getting page at "
- "index [%ld] from eCryptfs inode "
- "mapping; rc = [%d]\n", __func__,
- ecryptfs_page_idx, rc);
- goto out;
- }
- ecryptfs_page_virt = kmap_atomic(ecryptfs_page, KM_USER0);
- memcpy((data + data_offset),
- ((char *)ecryptfs_page_virt + start_offset_in_page),
- num_bytes);
- kunmap_atomic(ecryptfs_page_virt, KM_USER0);
- flush_dcache_page(ecryptfs_page);
- SetPageUptodate(ecryptfs_page);
- unlock_page(ecryptfs_page);
- page_cache_release(ecryptfs_page);
- pos += num_bytes;
- data_offset += num_bytes;
- }
-out:
- return rc;
-}
-#endif /* 0 */
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 9df7fd6e0c3..cf152823bbf 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -30,6 +30,8 @@
#include <linux/seq_file.h>
#include <linux/file.h>
#include <linux/crypto.h>
+#include <linux/statfs.h>
+#include <linux/magic.h>
#include "ecryptfs_kernel.h"
struct kmem_cache *ecryptfs_inode_info_cache;
@@ -102,10 +104,20 @@ static void ecryptfs_destroy_inode(struct inode *inode)
static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+ int rc;
if (!lower_dentry->d_sb->s_op->statfs)
return -ENOSYS;
- return lower_dentry->d_sb->s_op->statfs(lower_dentry, buf);
+
+ rc = lower_dentry->d_sb->s_op->statfs(lower_dentry, buf);
+ if (rc)
+ return rc;
+
+ buf->f_type = ECRYPTFS_SUPER_MAGIC;
+ rc = ecryptfs_set_f_namelen(&buf->f_namelen, buf->f_namelen,
+ &ecryptfs_superblock_to_private(dentry->d_sb)->mount_crypt_stat);
+
+ return rc;
}
/**
diff --git a/fs/exec.c b/fs/exec.c
index aeb135c7ff5..92ce83a11e9 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1071,6 +1071,21 @@ void set_task_comm(struct task_struct *tsk, char *buf)
perf_event_comm(tsk);
}
+static void filename_to_taskname(char *tcomm, const char *fn, unsigned int len)
+{
+ int i, ch;
+
+ /* Copies the binary name from after last slash */
+ for (i = 0; (ch = *(fn++)) != '\0';) {
+ if (ch == '/')
+ i = 0; /* overwrite what we wrote */
+ else
+ if (i < len - 1)
+ tcomm[i++] = ch;
+ }
+ tcomm[i] = '\0';
+}
+
int flush_old_exec(struct linux_binprm * bprm)
{
int retval;
@@ -1085,6 +1100,7 @@ int flush_old_exec(struct linux_binprm * bprm)
set_mm_exe_file(bprm->mm, bprm->file);
+ filename_to_taskname(bprm->tcomm, bprm->filename, sizeof(bprm->tcomm));
/*
* Release all of the old mmap stuff
*/
@@ -1116,10 +1132,6 @@ EXPORT_SYMBOL(would_dump);
void setup_new_exec(struct linux_binprm * bprm)
{
- int i, ch;
- const char *name;
- char tcomm[sizeof(current->comm)];
-
arch_pick_mmap_layout(current->mm);
/* This is the point of no return */
@@ -1130,18 +1142,7 @@ void setup_new_exec(struct linux_binprm * bprm)
else
set_dumpable(current->mm, suid_dumpable);
- name = bprm->filename;
-
- /* Copies the binary name from after last slash */
- for (i=0; (ch = *(name++)) != '\0';) {
- if (ch == '/')
- i = 0; /* overwrite what we wrote */
- else
- if (i < (sizeof(tcomm) - 1))
- tcomm[i++] = ch;
- }
- tcomm[i] = '\0';
- set_task_comm(current, tcomm);
+ set_task_comm(current, bprm->tcomm);
/* Set the new mm task size. We have to do that late because it may
* depend on TIF_32BIT which is only updated in flush_thread() on
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 1089f760c84..2de655f5d62 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -77,10 +77,11 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
flags = flags & EXT2_FL_USER_MODIFIABLE;
flags |= oldflags & ~EXT2_FL_USER_MODIFIABLE;
ei->i_flags = flags;
- mutex_unlock(&inode->i_mutex);
ext2_set_inode_flags(inode);
inode->i_ctime = CURRENT_TIME_SEC;
+ mutex_unlock(&inode->i_mutex);
+
mark_inode_dirty(inode);
setflags_out:
mnt_drop_write_file(filp);
@@ -88,20 +89,29 @@ setflags_out:
}
case EXT2_IOC_GETVERSION:
return put_user(inode->i_generation, (int __user *) arg);
- case EXT2_IOC_SETVERSION:
+ case EXT2_IOC_SETVERSION: {
+ __u32 generation;
+
if (!inode_owner_or_capable(inode))
return -EPERM;
ret = mnt_want_write_file(filp);
if (ret)
return ret;
- if (get_user(inode->i_generation, (int __user *) arg)) {
+ if (get_user(generation, (int __user *) arg)) {
ret = -EFAULT;
- } else {
- inode->i_ctime = CURRENT_TIME_SEC;
- mark_inode_dirty(inode);
+ goto setversion_out;
}
+
+ mutex_lock(&inode->i_mutex);
+ inode->i_ctime = CURRENT_TIME_SEC;
+ inode->i_generation = generation;
+ mutex_unlock(&inode->i_mutex);
+
+ mark_inode_dirty(inode);
+setversion_out:
mnt_drop_write_file(filp);
return ret;
+ }
case EXT2_IOC_GETRSVSZ:
if (test_opt(inode->i_sb, RESERVATION)
&& S_ISREG(inode->i_mode)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f855916657b..5b4a9362d5a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -53,14 +53,6 @@ struct wb_writeback_work {
};
/*
- * Include the creation of the trace points after defining the
- * wb_writeback_work structure so that the definition remains local to this
- * file.
- */
-#define CREATE_TRACE_POINTS
-#include <trace/events/writeback.h>
-
-/*
* We don't actually have pdflush, but this one is exported though /proc...
*/
int nr_pdflush_threads;
@@ -92,6 +84,14 @@ static inline struct inode *wb_inode(struct list_head *head)
return list_entry(head, struct inode, i_wb_list);
}
+/*
+ * Include the creation of the trace points after defining the
+ * wb_writeback_work structure and inline functions so that the definition
+ * remains local to this file.
+ */
+#define CREATE_TRACE_POINTS
+#include <trace/events/writeback.h>
+
/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
{
diff --git a/fs/inode.c b/fs/inode.c
index 4fa4f0916af..d3ebdbe723d 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -322,9 +322,6 @@ EXPORT_SYMBOL(clear_nlink);
void set_nlink(struct inode *inode, unsigned int nlink)
{
if (!nlink) {
- printk_ratelimited(KERN_INFO
- "set_nlink() clearing i_nlink on %s inode %li\n",
- inode->i_sb->s_type->name, inode->i_ino);
clear_nlink(inode);
} else {
/* Yes, some filesystems do change nlink from zero to one */
@@ -1654,7 +1651,7 @@ __setup("ihash_entries=", set_ihash_entries);
*/
void __init inode_init_early(void)
{
- int loop;
+ unsigned int loop;
/* If hashes are distributed across NUMA nodes, defer
* hash allocation until vmalloc space is available.
@@ -1672,13 +1669,13 @@ void __init inode_init_early(void)
&i_hash_mask,
0);
- for (loop = 0; loop < (1 << i_hash_shift); loop++)
+ for (loop = 0; loop < (1U << i_hash_shift); loop++)
INIT_HLIST_HEAD(&inode_hashtable[loop]);
}
void __init inode_init(void)
{
- int loop;
+ unsigned int loop;
/* inode slab cache */
inode_cachep = kmem_cache_create("inode_cache",
@@ -1702,7 +1699,7 @@ void __init inode_init(void)
&i_hash_mask,
0);
- for (loop = 0; loop < (1 << i_hash_shift); loop++)
+ for (loop = 0; loop < (1U << i_hash_shift); loop++)
INIT_HLIST_HEAD(&inode_hashtable[loop]);
}
diff --git a/fs/ioprio.c b/fs/ioprio.c
index f84b380d65e..0f1b9515213 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -51,7 +51,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio)
ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
if (ioc) {
ioc_ioprio_changed(ioc, ioprio);
- put_io_context(ioc, NULL);
+ put_io_context(ioc);
}
return err;
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 5d1a00a5041..05f0754f2b4 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -453,8 +453,6 @@ out:
*
* Return <0 on error, 0 on success, 1 if there was nothing to clean up.
*
- * Called with the journal lock held.
- *
* This is the only part of the journaling code which really needs to be
* aware of transaction aborts. Checkpointing involves writing to the
* main filesystem area rather than to the journal, so it can proceed
@@ -472,13 +470,14 @@ int cleanup_journal_tail(journal_t *journal)
if (is_journal_aborted(journal))
return 1;
- /* OK, work out the oldest transaction remaining in the log, and
+ /*
+ * OK, work out the oldest transaction remaining in the log, and
* the log block it starts at.
*
* If the log is now empty, we need to work out which is the
* next transaction ID we will write, and where it will
- * start. */
-
+ * start.
+ */
spin_lock(&journal->j_state_lock);
spin_lock(&journal->j_list_lock);
transaction = journal->j_checkpoint_transactions;
@@ -504,7 +503,25 @@ int cleanup_journal_tail(journal_t *journal)
spin_unlock(&journal->j_state_lock);
return 1;
}
+ spin_unlock(&journal->j_state_lock);
+
+ /*
+ * We need to make sure that any blocks that were recently written out
+ * --- perhaps by log_do_checkpoint() --- are flushed out before we
+ * drop the transactions from the journal. It's unlikely this will be
+ * necessary, especially with an appropriately sized journal, but we
+ * need this to guarantee correctness. Fortunately
+ * cleanup_journal_tail() doesn't get called all that often.
+ */
+ if (journal->j_flags & JFS_BARRIER)
+ blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
+ spin_lock(&journal->j_state_lock);
+ if (!tid_gt(first_tid, journal->j_tail_sequence)) {
+ spin_unlock(&journal->j_state_lock);
+ /* Someone else cleaned up journal so return 0 */
+ return 0;
+ }
/* OK, update the superblock to recover the freed space.
* Physical blocks come first: have we wrapped beyond the end of
* the log? */
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 5b43e96788e..008bf062fd2 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -20,6 +20,7 @@
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
+#include <linux/blkdev.h>
#endif
/*
@@ -263,6 +264,9 @@ int journal_recover(journal_t *journal)
err2 = sync_blockdev(journal->j_fs_dev);
if (!err)
err = err2;
+ /* Flush disk caches to get replayed data on the permanent storage */
+ if (journal->j_flags & JFS_BARRIER)
+ blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
return err;
}
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index a01cdad6aad..eafb8d37a6f 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -335,7 +335,7 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl
void *ebuf;
uint32_t ofs;
size_t retlen;
- int ret = -EIO;
+ int ret;
unsigned long *wordebuf;
ret = mtd_point(c->mtd, jeb->offset, c->sector_size, &retlen,
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index e97404d611e..9c501449450 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -152,9 +152,6 @@ static struct page *logfs_mtd_find_first_sb(struct super_block *sb, u64 *ofs)
filler_t *filler = logfs_mtd_readpage;
struct mtd_info *mtd = super->s_mtd;
- if (!mtd_can_have_bb(mtd))
- return NULL;
-
*ofs = 0;
while (mtd_block_isbad(mtd, *ofs)) {
*ofs += mtd->erasesize;
@@ -172,9 +169,6 @@ static struct page *logfs_mtd_find_last_sb(struct super_block *sb, u64 *ofs)
filler_t *filler = logfs_mtd_readpage;
struct mtd_info *mtd = super->s_mtd;
- if (!mtd_can_have_bb(mtd))
- return NULL;
-
*ofs = mtd->size - mtd->erasesize;
while (mtd_block_isbad(mtd, *ofs)) {
*ofs -= mtd->erasesize;
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 501043e8966..3de7a32cadb 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -71,7 +71,7 @@ static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd,
static int write_inode(struct inode *inode)
{
- return __logfs_write_inode(inode, WF_LOCK);
+ return __logfs_write_inode(inode, NULL, WF_LOCK);
}
static s64 dir_seek_data(struct inode *inode, s64 pos)
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index b548c87a86f..3886cded283 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -230,7 +230,9 @@ int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
return ret;
mutex_lock(&inode->i_mutex);
+ logfs_get_wblocks(sb, NULL, WF_LOCK);
logfs_write_anchor(sb);
+ logfs_put_wblocks(sb, NULL, WF_LOCK);
mutex_unlock(&inode->i_mutex);
return 0;
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
index caa4419285d..d4efb061bdc 100644
--- a/fs/logfs/gc.c
+++ b/fs/logfs/gc.c
@@ -367,7 +367,7 @@ static struct gc_candidate *get_candidate(struct super_block *sb)
int i, max_dist;
struct gc_candidate *cand = NULL, *this;
- max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS);
+ max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS - 1);
for (i = max_dist; i >= 0; i--) {
this = first_in_list(&super->s_low_list[i]);
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 388df1aa35e..a422f42238b 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -286,7 +286,7 @@ static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)
if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN)
return 0;
- ret = __logfs_write_inode(inode, flags);
+ ret = __logfs_write_inode(inode, NULL, flags);
LOGFS_BUG_ON(ret, inode->i_sb);
return ret;
}
@@ -363,7 +363,9 @@ static void logfs_init_once(void *_li)
static int logfs_sync_fs(struct super_block *sb, int wait)
{
+ logfs_get_wblocks(sb, NULL, WF_LOCK);
logfs_write_anchor(sb);
+ logfs_put_wblocks(sb, NULL, WF_LOCK);
return 0;
}
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index 9da29706f91..1e1c369df22 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -612,7 +612,6 @@ static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type,
if (len == 0)
return logfs_write_header(super, header, 0, type);
- BUG_ON(len > sb->s_blocksize);
compr_len = logfs_compress(buf, data, len, sb->s_blocksize);
if (compr_len < 0 || type == JE_ANCHOR) {
memcpy(data, buf, len);
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 926373866a5..5f093760946 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -528,7 +528,7 @@ void logfs_destroy_inode_cache(void);
void logfs_set_blocks(struct inode *inode, u64 no);
/* these logically belong into inode.c but actually reside in readwrite.c */
int logfs_read_inode(struct inode *inode);
-int __logfs_write_inode(struct inode *inode, long flags);
+int __logfs_write_inode(struct inode *inode, struct page *, long flags);
void logfs_evict_inode(struct inode *inode);
/* journal.c */
@@ -577,6 +577,8 @@ void initialize_block_counters(struct page *page, struct logfs_block *block,
__be64 *array, int page_is_empty);
int logfs_exist_block(struct inode *inode, u64 bix);
int get_page_reserve(struct inode *inode, struct page *page);
+void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock);
+void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock);
extern struct logfs_block_ops indirect_block_ops;
/* segment.c */
@@ -594,6 +596,7 @@ int logfs_init_mapping(struct super_block *sb);
void logfs_sync_area(struct logfs_area *area);
void logfs_sync_segments(struct super_block *sb);
void freeseg(struct super_block *sb, u32 segno);
+void free_areas(struct super_block *sb);
/* area handling */
int logfs_init_areas(struct super_block *sb);
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 2ac4217b790..4153e65b014 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -244,8 +244,7 @@ static void preunlock_page(struct super_block *sb, struct page *page, int lock)
* is waiting for s_write_mutex. We annotate this fact by setting PG_pre_locked
* in addition to PG_locked.
*/
-static void logfs_get_wblocks(struct super_block *sb, struct page *page,
- int lock)
+void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock)
{
struct logfs_super *super = logfs_super(sb);
@@ -260,8 +259,7 @@ static void logfs_get_wblocks(struct super_block *sb, struct page *page,
}
}
-static void logfs_put_wblocks(struct super_block *sb, struct page *page,
- int lock)
+void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock)
{
struct logfs_super *super = logfs_super(sb);
@@ -424,7 +422,7 @@ static void inode_write_block(struct logfs_block *block)
if (inode->i_ino == LOGFS_INO_MASTER)
logfs_write_anchor(inode->i_sb);
else {
- ret = __logfs_write_inode(inode, 0);
+ ret = __logfs_write_inode(inode, NULL, 0);
/* see indirect_write_block comment */
BUG_ON(ret);
}
@@ -560,8 +558,13 @@ static void inode_free_block(struct super_block *sb, struct logfs_block *block)
static void indirect_free_block(struct super_block *sb,
struct logfs_block *block)
{
- ClearPagePrivate(block->page);
- block->page->private = 0;
+ struct page *page = block->page;
+
+ if (PagePrivate(page)) {
+ ClearPagePrivate(page);
+ page_cache_release(page);
+ set_page_private(page, 0);
+ }
__free_block(sb, block);
}
@@ -650,8 +653,11 @@ static void alloc_data_block(struct inode *inode, struct page *page)
logfs_unpack_index(page->index, &bix, &level);
block = __alloc_block(inode->i_sb, inode->i_ino, bix, level);
block->page = page;
+
SetPagePrivate(page);
- page->private = (unsigned long)block;
+ page_cache_get(page);
+ set_page_private(page, (unsigned long) block);
+
block->ops = &indirect_block_ops;
}
@@ -1570,11 +1576,15 @@ int logfs_write_buf(struct inode *inode, struct page *page, long flags)
static int __logfs_delete(struct inode *inode, struct page *page)
{
long flags = WF_DELETE;
+ int err;
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
if (page->index < I0_BLOCKS)
return logfs_write_direct(inode, page, flags);
+ err = grow_inode(inode, page->index, 0);
+ if (err)
+ return err;
return logfs_write_rec(inode, page, page->index, 0, flags);
}
@@ -1623,7 +1633,7 @@ int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
if (inode->i_ino == LOGFS_INO_MASTER)
logfs_write_anchor(inode->i_sb);
else {
- err = __logfs_write_inode(inode, flags);
+ err = __logfs_write_inode(inode, page, flags);
}
}
}
@@ -1873,7 +1883,7 @@ int logfs_truncate(struct inode *inode, u64 target)
logfs_get_wblocks(sb, NULL, 1);
err = __logfs_truncate(inode, size);
if (!err)
- err = __logfs_write_inode(inode, 0);
+ err = __logfs_write_inode(inode, NULL, 0);
logfs_put_wblocks(sb, NULL, 1);
}
@@ -1901,8 +1911,11 @@ static void move_page_to_inode(struct inode *inode, struct page *page)
li->li_block = block;
block->page = NULL;
- page->private = 0;
- ClearPagePrivate(page);
+ if (PagePrivate(page)) {
+ ClearPagePrivate(page);
+ page_cache_release(page);
+ set_page_private(page, 0);
+ }
}
static void move_inode_to_page(struct page *page, struct inode *inode)
@@ -1918,8 +1931,12 @@ static void move_inode_to_page(struct page *page, struct inode *inode)
BUG_ON(PagePrivate(page));
block->ops = &indirect_block_ops;
block->page = page;
- page->private = (unsigned long)block;
- SetPagePrivate(page);
+
+ if (!PagePrivate(page)) {
+ SetPagePrivate(page);
+ page_cache_get(page);
+ set_page_private(page, (unsigned long) block);
+ }
block->inode = NULL;
li->li_block = NULL;
@@ -2106,14 +2123,14 @@ void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec)
ec_level);
}
-int __logfs_write_inode(struct inode *inode, long flags)
+int __logfs_write_inode(struct inode *inode, struct page *page, long flags)
{
struct super_block *sb = inode->i_sb;
int ret;
- logfs_get_wblocks(sb, NULL, flags & WF_LOCK);
+ logfs_get_wblocks(sb, page, flags & WF_LOCK);
ret = do_write_inode(inode);
- logfs_put_wblocks(sb, NULL, flags & WF_LOCK);
+ logfs_put_wblocks(sb, page, flags & WF_LOCK);
return ret;
}
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 9d518735325..ab798ed1cc8 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -86,7 +86,11 @@ int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
BUG_ON(!page); /* FIXME: reserve a pool */
SetPageUptodate(page);
memcpy(page_address(page) + offset, buf, copylen);
- SetPagePrivate(page);
+
+ if (!PagePrivate(page)) {
+ SetPagePrivate(page);
+ page_cache_get(page);
+ }
page_cache_release(page);
buf += copylen;
@@ -110,7 +114,10 @@ static void pad_partial_page(struct logfs_area *area)
page = get_mapping_page(sb, index, 0);
BUG_ON(!page); /* FIXME: reserve a pool */
memset(page_address(page) + offset, 0xff, len);
- SetPagePrivate(page);
+ if (!PagePrivate(page)) {
+ SetPagePrivate(page);
+ page_cache_get(page);
+ }
page_cache_release(page);
}
}
@@ -130,7 +137,10 @@ static void pad_full_pages(struct logfs_area *area)
BUG_ON(!page); /* FIXME: reserve a pool */
SetPageUptodate(page);
memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
- SetPagePrivate(page);
+ if (!PagePrivate(page)) {
+ SetPagePrivate(page);
+ page_cache_get(page);
+ }
page_cache_release(page);
index++;
no_indizes--;
@@ -485,8 +495,12 @@ static void move_btree_to_page(struct inode *inode, struct page *page,
mempool_free(item, super->s_alias_pool);
}
block->page = page;
- SetPagePrivate(page);
- page->private = (unsigned long)block;
+
+ if (!PagePrivate(page)) {
+ SetPagePrivate(page);
+ page_cache_get(page);
+ set_page_private(page, (unsigned long) block);
+ }
block->ops = &indirect_block_ops;
initialize_block_counters(page, block, data, 0);
}
@@ -536,8 +550,12 @@ void move_page_to_btree(struct page *page)
list_add(&item->list, &block->item_list);
}
block->page = NULL;
- ClearPagePrivate(page);
- page->private = 0;
+
+ if (PagePrivate(page)) {
+ ClearPagePrivate(page);
+ page_cache_release(page);
+ set_page_private(page, 0);
+ }
block->ops = &btree_block_ops;
err = alias_tree_insert(block->sb, block->ino, block->bix, block->level,
block);
@@ -702,7 +720,10 @@ void freeseg(struct super_block *sb, u32 segno)
page = find_get_page(mapping, ofs >> PAGE_SHIFT);
if (!page)
continue;
- ClearPagePrivate(page);
+ if (PagePrivate(page)) {
+ ClearPagePrivate(page);
+ page_cache_release(page);
+ }
page_cache_release(page);
}
}
@@ -841,6 +862,16 @@ static void free_area(struct logfs_area *area)
kfree(area);
}
+void free_areas(struct super_block *sb)
+{
+ struct logfs_super *super = logfs_super(sb);
+ int i;
+
+ for_each_area(i)
+ free_area(super->s_area[i]);
+ free_area(super->s_journal_area);
+}
+
static struct logfs_area *alloc_area(struct super_block *sb)
{
struct logfs_area *area;
@@ -923,10 +954,6 @@ err:
void logfs_cleanup_areas(struct super_block *sb)
{
struct logfs_super *super = logfs_super(sb);
- int i;
btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias);
- for_each_area(i)
- free_area(super->s_area[i]);
- free_area(super->s_journal_area);
}
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index e795c234ea3..c9ee7f5d1ca 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -486,14 +486,15 @@ static void logfs_kill_sb(struct super_block *sb)
/* Alias entries slow down mount, so evict as many as possible */
sync_filesystem(sb);
logfs_write_anchor(sb);
+ free_areas(sb);
/*
* From this point on alias entries are simply dropped - and any
* writes to the object store are considered bugs.
*/
- super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN;
log_super("LogFS: Now in shutdown\n");
generic_shutdown_super(sb);
+ super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN;
BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes);
diff --git a/fs/namei.c b/fs/namei.c
index c283a1ec008..a780ea515c4 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -140,21 +140,19 @@ static int do_getname(const char __user *filename, char *page)
static char *getname_flags(const char __user *filename, int flags, int *empty)
{
- char *tmp, *result;
-
- result = ERR_PTR(-ENOMEM);
- tmp = __getname();
- if (tmp) {
- int retval = do_getname(filename, tmp);
-
- result = tmp;
- if (retval < 0) {
- if (retval == -ENOENT && empty)
- *empty = 1;
- if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
- __putname(tmp);
- result = ERR_PTR(retval);
- }
+ char *result = __getname();
+ int retval;
+
+ if (!result)
+ return ERR_PTR(-ENOMEM);
+
+ retval = do_getname(filename, result);
+ if (retval < 0) {
+ if (retval == -ENOENT && empty)
+ *empty = 1;
+ if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
+ __putname(result);
+ return ERR_PTR(retval);
}
}
audit_getname(result);
@@ -1097,8 +1095,10 @@ static struct dentry *d_inode_lookup(struct dentry *parent, struct dentry *dentr
struct dentry *old;
/* Don't create child dentry for a dead directory. */
- if (unlikely(IS_DEADDIR(inode)))
+ if (unlikely(IS_DEADDIR(inode))) {
+ dput(dentry);
return ERR_PTR(-ENOENT);
+ }
old = inode->i_op->lookup(inode, dentry, nd);
if (unlikely(old)) {
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f0c849c98fe..ec9f6ef6c5d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3575,8 +3575,8 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
}
if (npages > 1) {
/* for decoding across pages */
- args.acl_scratch = alloc_page(GFP_KERNEL);
- if (!args.acl_scratch)
+ res.acl_scratch = alloc_page(GFP_KERNEL);
+ if (!res.acl_scratch)
goto out_free;
}
args.acl_len = npages * PAGE_SIZE;
@@ -3612,8 +3612,8 @@ out_free:
for (i = 0; i < npages; i++)
if (pages[i])
__free_page(pages[i]);
- if (args.acl_scratch)
- __free_page(args.acl_scratch);
+ if (res.acl_scratch)
+ __free_page(res.acl_scratch);
return ret;
}
@@ -4883,8 +4883,10 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
clp->cl_rpcclient->cl_auth->au_flavor);
res.server_scope = kzalloc(sizeof(struct server_scope), GFP_KERNEL);
- if (unlikely(!res.server_scope))
- return -ENOMEM;
+ if (unlikely(!res.server_scope)) {
+ status = -ENOMEM;
+ goto out;
+ }
status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
if (!status)
@@ -4901,12 +4903,13 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
clp->server_scope = NULL;
}
- if (!clp->server_scope)
+ if (!clp->server_scope) {
clp->server_scope = res.server_scope;
- else
- kfree(res.server_scope);
+ goto out;
+ }
}
-
+ kfree(res.server_scope);
+out:
dprintk("<-- %s status= %d\n", __func__, status);
return status;
}
@@ -5008,37 +5011,53 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo)
return status;
}
+static struct nfs4_slot *nfs4_alloc_slots(u32 max_slots, gfp_t gfp_flags)
+{
+ return kcalloc(max_slots, sizeof(struct nfs4_slot), gfp_flags);
+}
+
+static void nfs4_add_and_init_slots(struct nfs4_slot_table *tbl,
+ struct nfs4_slot *new,
+ u32 max_slots,
+ u32 ivalue)
+{
+ struct nfs4_slot *old = NULL;
+ u32 i;
+
+ spin_lock(&tbl->slot_tbl_lock);
+ if (new) {
+ old = tbl->slots;
+ tbl->slots = new;
+ tbl->max_slots = max_slots;
+ }
+ tbl->highest_used_slotid = -1; /* no slot is currently used */
+ for (i = 0; i < tbl->max_slots; i++)
+ tbl->slots[i].seq_nr = ivalue;
+ spin_unlock(&tbl->slot_tbl_lock);
+ kfree(old);
+}
+
/*
- * Reset a slot table
+ * (re)Initialise a slot table
*/
-static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs,
- int ivalue)
+static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs,
+ u32 ivalue)
{
struct nfs4_slot *new = NULL;
- int i;
- int ret = 0;
+ int ret = -ENOMEM;
dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__,
max_reqs, tbl->max_slots);
/* Does the newly negotiated max_reqs match the existing slot table? */
if (max_reqs != tbl->max_slots) {
- ret = -ENOMEM;
- new = kmalloc(max_reqs * sizeof(struct nfs4_slot),
- GFP_NOFS);
+ new = nfs4_alloc_slots(max_reqs, GFP_NOFS);
if (!new)
goto out;
- ret = 0;
- kfree(tbl->slots);
}
- spin_lock(&tbl->slot_tbl_lock);
- if (new) {
- tbl->slots = new;
- tbl->max_slots = max_reqs;
- }
- for (i = 0; i < tbl->max_slots; ++i)
- tbl->slots[i].seq_nr = ivalue;
- spin_unlock(&tbl->slot_tbl_lock);
+ ret = 0;
+
+ nfs4_add_and_init_slots(tbl, new, max_reqs, ivalue);
dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
tbl, tbl->slots, tbl->max_slots);
out:
@@ -5061,36 +5080,6 @@ static void nfs4_destroy_slot_tables(struct nfs4_session *session)
}
/*
- * Initialize slot table
- */
-static int nfs4_init_slot_table(struct nfs4_slot_table *tbl,
- int max_slots, int ivalue)
-{
- struct nfs4_slot *slot;
- int ret = -ENOMEM;
-
- BUG_ON(max_slots > NFS4_MAX_SLOT_TABLE);
-
- dprintk("--> %s: max_reqs=%u\n", __func__, max_slots);
-
- slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_NOFS);
- if (!slot)
- goto out;
- ret = 0;
-
- spin_lock(&tbl->slot_tbl_lock);
- tbl->max_slots = max_slots;
- tbl->slots = slot;
- tbl->highest_used_slotid = -1; /* no slot is currently used */
- spin_unlock(&tbl->slot_tbl_lock);
- dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__,
- tbl, tbl->slots, tbl->max_slots);
-out:
- dprintk("<-- %s: return %d\n", __func__, ret);
- return ret;
-}
-
-/*
* Initialize or reset the forechannel and backchannel tables
*/
static int nfs4_setup_session_slot_tables(struct nfs4_session *ses)
@@ -5101,25 +5090,16 @@ static int nfs4_setup_session_slot_tables(struct nfs4_session *ses)
dprintk("--> %s\n", __func__);
/* Fore channel */
tbl = &ses->fc_slot_table;
- if (tbl->slots == NULL) {
- status = nfs4_init_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
- if (status) /* -ENOMEM */
- return status;
- } else {
- status = nfs4_reset_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
- if (status)
- return status;
- }
+ status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
+ if (status) /* -ENOMEM */
+ return status;
/* Back channel */
tbl = &ses->bc_slot_table;
- if (tbl->slots == NULL) {
- status = nfs4_init_slot_table(tbl, ses->bc_attrs.max_reqs, 0);
- if (status)
- /* Fore and back channel share a connection so get
- * both slot tables or neither */
- nfs4_destroy_slot_tables(ses);
- } else
- status = nfs4_reset_slot_table(tbl, ses->bc_attrs.max_reqs, 0);
+ status = nfs4_realloc_slot_table(tbl, ses->bc_attrs.max_reqs, 0);
+ if (status && tbl->slots == NULL)
+ /* Fore and back channel share a connection so get
+ * both slot tables or neither */
+ nfs4_destroy_slot_tables(ses);
return status;
}
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index a53f33b4ac3..45392032e7b 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1132,6 +1132,8 @@ void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4
{
struct nfs_client *clp = server->nfs_client;
+ if (test_and_clear_bit(NFS_DELEGATED_STATE, &state->flags))
+ nfs_async_inode_return_delegation(state->inode, &state->stateid);
nfs4_state_mark_reclaim_nograce(clp, state);
nfs4_schedule_state_manager(clp);
}
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 95e92e43840..33bd8d0f745 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2522,7 +2522,6 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
args->acl_pages, args->acl_pgbase, args->acl_len);
- xdr_set_scratch_buffer(xdr, page_address(args->acl_scratch), PAGE_SIZE);
encode_nops(&hdr);
}
@@ -6032,6 +6031,10 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
struct compound_hdr hdr;
int status;
+ if (res->acl_scratch != NULL) {
+ void *p = page_address(res->acl_scratch);
+ xdr_set_scratch_buffer(xdr, p, PAGE_SIZE);
+ }
status = decode_compound_hdr(xdr, &hdr);
if (status)
goto out;
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 886649627c3..2a70fce70c6 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -603,6 +603,8 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
nsegs = argv[4].v_nmembs;
if (argv[4].v_size != argsz[4])
goto out;
+ if (nsegs > UINT_MAX / sizeof(__u64))
+ goto out;
/*
* argv[4] points to segment numbers this ioctl cleans. We
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index be244692550..a9856e3eaaf 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1053,7 +1053,7 @@ static int ocfs2_rename(struct inode *old_dir,
handle_t *handle = NULL;
struct buffer_head *old_dir_bh = NULL;
struct buffer_head *new_dir_bh = NULL;
- nlink_t old_dir_nlink = old_dir->i_nlink;
+ u32 old_dir_nlink = old_dir->i_nlink;
struct ocfs2_dinode *old_di;
struct ocfs2_dir_lookup_result old_inode_dot_dot_res = { NULL, };
struct ocfs2_dir_lookup_result target_lookup_res = { NULL, };
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 5485a5388ec..d4548dd49b0 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -198,82 +198,9 @@ static int proc_root_link(struct dentry *dentry, struct path *path)
return result;
}
-static struct mm_struct *__check_mem_permission(struct task_struct *task)
-{
- struct mm_struct *mm;
-
- mm = get_task_mm(task);
- if (!mm)
- return ERR_PTR(-EINVAL);
-
- /*
- * A task can always look at itself, in case it chooses
- * to use system calls instead of load instructions.
- */
- if (task == current)
- return mm;
-
- /*
- * If current is actively ptrace'ing, and would also be
- * permitted to freshly attach with ptrace now, permit it.
- */
- if (task_is_stopped_or_traced(task)) {
- int match;
- rcu_read_lock();
- match = (ptrace_parent(task) == current);
- rcu_read_unlock();
- if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH))
- return mm;
- }
-
- /*
- * No one else is allowed.
- */
- mmput(mm);
- return ERR_PTR(-EPERM);
-}
-
-/*
- * If current may access user memory in @task return a reference to the
- * corresponding mm, otherwise ERR_PTR.
- */
-static struct mm_struct *check_mem_permission(struct task_struct *task)
-{
- struct mm_struct *mm;
- int err;
-
- /*
- * Avoid racing if task exec's as we might get a new mm but validate
- * against old credentials.
- */
- err = mutex_lock_killable(&task->signal->cred_guard_mutex);
- if (err)
- return ERR_PTR(err);
-
- mm = __check_mem_permission(task);
- mutex_unlock(&task->signal->cred_guard_mutex);
-
- return mm;
-}
-
struct mm_struct *mm_for_maps(struct task_struct *task)
{
- struct mm_struct *mm;
- int err;
-
- err = mutex_lock_killable(&task->signal->cred_guard_mutex);
- if (err)
- return ERR_PTR(err);
-
- mm = get_task_mm(task);
- if (mm && mm != current->mm &&
- !ptrace_may_access(task, PTRACE_MODE_READ)) {
- mmput(mm);
- mm = ERR_PTR(-EACCES);
- }
- mutex_unlock(&task->signal->cred_guard_mutex);
-
- return mm;
+ return mm_access(task, PTRACE_MODE_READ);
}
static int proc_pid_cmdline(struct task_struct *task, char * buffer)
@@ -752,133 +679,96 @@ static const struct file_operations proc_single_file_operations = {
static int mem_open(struct inode* inode, struct file* file)
{
- file->private_data = (void*)((long)current->self_exec_id);
- /* OK to pass negative loff_t, we can catch out-of-range */
- file->f_mode |= FMODE_UNSIGNED_OFFSET;
- return 0;
-}
-
-static ssize_t mem_read(struct file * file, char __user * buf,
- size_t count, loff_t *ppos)
-{
struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
- char *page;
- unsigned long src = *ppos;
- int ret = -ESRCH;
struct mm_struct *mm;
if (!task)
- goto out_no_task;
+ return -ESRCH;
- ret = -ENOMEM;
- page = (char *)__get_free_page(GFP_TEMPORARY);
- if (!page)
- goto out;
+ mm = mm_access(task, PTRACE_MODE_ATTACH);
+ put_task_struct(task);
- mm = check_mem_permission(task);
- ret = PTR_ERR(mm);
if (IS_ERR(mm))
- goto out_free;
-
- ret = -EIO;
-
- if (file->private_data != (void*)((long)current->self_exec_id))
- goto out_put;
-
- ret = 0;
-
- while (count > 0) {
- int this_len, retval;
-
- this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
- retval = access_remote_vm(mm, src, page, this_len, 0);
- if (!retval) {
- if (!ret)
- ret = -EIO;
- break;
- }
+ return PTR_ERR(mm);
- if (copy_to_user(buf, page, retval)) {
- ret = -EFAULT;
- break;
- }
-
- ret += retval;
- src += retval;
- buf += retval;
- count -= retval;
+ if (mm) {
+ /* ensure this mm_struct can't be freed */
+ atomic_inc(&mm->mm_count);
+ /* but do not pin its memory */
+ mmput(mm);
}
- *ppos = src;
-out_put:
- mmput(mm);
-out_free:
- free_page((unsigned long) page);
-out:
- put_task_struct(task);
-out_no_task:
- return ret;
+ /* OK to pass negative loff_t, we can catch out-of-range */
+ file->f_mode |= FMODE_UNSIGNED_OFFSET;
+ file->private_data = mm;
+
+ return 0;
}
-static ssize_t mem_write(struct file * file, const char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t mem_rw(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos, int write)
{
- int copied;
+ struct mm_struct *mm = file->private_data;
+ unsigned long addr = *ppos;
+ ssize_t copied;
char *page;
- struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
- unsigned long dst = *ppos;
- struct mm_struct *mm;
- copied = -ESRCH;
- if (!task)
- goto out_no_task;
+ if (!mm)
+ return 0;
- copied = -ENOMEM;
page = (char *)__get_free_page(GFP_TEMPORARY);
if (!page)
- goto out_task;
-
- mm = check_mem_permission(task);
- copied = PTR_ERR(mm);
- if (IS_ERR(mm))
- goto out_free;
-
- copied = -EIO;
- if (file->private_data != (void *)((long)current->self_exec_id))
- goto out_mm;
+ return -ENOMEM;
copied = 0;
+ if (!atomic_inc_not_zero(&mm->mm_users))
+ goto free;
+
while (count > 0) {
- int this_len, retval;
+ int this_len = min_t(int, count, PAGE_SIZE);
- this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
- if (copy_from_user(page, buf, this_len)) {
+ if (write && copy_from_user(page, buf, this_len)) {
copied = -EFAULT;
break;
}
- retval = access_remote_vm(mm, dst, page, this_len, 1);
- if (!retval) {
+
+ this_len = access_remote_vm(mm, addr, page, this_len, write);
+ if (!this_len) {
if (!copied)
copied = -EIO;
break;
}
- copied += retval;
- buf += retval;
- dst += retval;
- count -= retval;
+
+ if (!write && copy_to_user(buf, page, this_len)) {
+ copied = -EFAULT;
+ break;
+ }
+
+ buf += this_len;
+ addr += this_len;
+ copied += this_len;
+ count -= this_len;
}
- *ppos = dst;
+ *ppos = addr;
-out_mm:
mmput(mm);
-out_free:
+free:
free_page((unsigned long) page);
-out_task:
- put_task_struct(task);
-out_no_task:
return copied;
}
+static ssize_t mem_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return mem_rw(file, buf, count, ppos, 0);
+}
+
+static ssize_t mem_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return mem_rw(file, (char __user*)buf, count, ppos, 1);
+}
+
loff_t mem_lseek(struct file *file, loff_t offset, int orig)
{
switch (orig) {
@@ -895,11 +785,20 @@ loff_t mem_lseek(struct file *file, loff_t offset, int orig)
return file->f_pos;
}
+static int mem_release(struct inode *inode, struct file *file)
+{
+ struct mm_struct *mm = file->private_data;
+ if (mm)
+ mmdrop(mm);
+ return 0;
+}
+
static const struct file_operations proc_mem_operations = {
.llseek = mem_lseek,
.read = mem_read,
.write = mem_write,
.open = mem_open,
+ .release = mem_release,
};
static ssize_t environ_read(struct file *file, char __user *buf,
@@ -1199,9 +1098,6 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
ssize_t length;
uid_t loginuid;
- if (!capable(CAP_AUDIT_CONTROL))
- return -EPERM;
-
rcu_read_lock();
if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
rcu_read_unlock();
@@ -1230,7 +1126,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
goto out_free_page;
}
- length = audit_set_loginuid(current, loginuid);
+ length = audit_set_loginuid(loginuid);
if (likely(length == 0))
length = count;
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index d76ca6ae2b1..121f77cfef7 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -77,6 +77,8 @@ static int show_stat(struct seq_file *p, void *v)
steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
+ sum += kstat_cpu_irqs_sum(i);
+ sum += arch_irq_stat_cpu(i);
for (j = 0; j < NR_SOFTIRQS; j++) {
unsigned int softirq_stat = kstat_softirqs_cpu(j, i);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index e418c5abdb0..7dcd2a25049 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -518,6 +518,9 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
if (!page)
continue;
+ if (PageReserved(page))
+ continue;
+
/* Clear accessed and referenced bits. */
ptep_test_and_clear_young(vma, addr, pte);
ClearPageReferenced(page);
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 2bfd987f485..6b009548d2e 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -179,47 +179,33 @@ static const char *qnx4_checkroot(struct super_block *sb)
struct qnx4_inode_entry *rootdir;
int rd, rl;
int i, j;
- int found = 0;
- if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/') {
+ if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/')
return "no qnx4 filesystem (no root dir).";
- } else {
- QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id));
- rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1;
- rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size);
- for (j = 0; j < rl; j++) {
- bh = sb_bread(sb, rd + j); /* root dir, first block */
- if (bh == NULL) {
- return "unable to read root entry.";
- }
- for (i = 0; i < QNX4_INODES_PER_BLOCK; i++) {
- rootdir = (struct qnx4_inode_entry *) (bh->b_data + i * QNX4_DIR_ENTRY_SIZE);
- if (rootdir->di_fname != NULL) {
- QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname));
- if (!strcmp(rootdir->di_fname,
- QNX4_BMNAME)) {
- found = 1;
- qnx4_sb(sb)->BitMap = kmemdup(rootdir,
- sizeof(struct qnx4_inode_entry),
- GFP_KERNEL);
- if (!qnx4_sb(sb)->BitMap) {
- brelse (bh);
- return "not enough memory for bitmap inode";
- }/* keep bitmap inode known */
- break;
- }
- }
- }
+ QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id));
+ rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1;
+ rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size);
+ for (j = 0; j < rl; j++) {
+ bh = sb_bread(sb, rd + j); /* root dir, first block */
+ if (bh == NULL)
+ return "unable to read root entry.";
+ rootdir = (struct qnx4_inode_entry *) bh->b_data;
+ for (i = 0; i < QNX4_INODES_PER_BLOCK; i++, rootdir++) {
+ QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname));
+ if (strcmp(rootdir->di_fname, QNX4_BMNAME) != 0)
+ continue;
+ qnx4_sb(sb)->BitMap = kmemdup(rootdir,
+ sizeof(struct qnx4_inode_entry),
+ GFP_KERNEL);
brelse(bh);
- if (found != 0) {
- break;
- }
- }
- if (found == 0) {
- return "bitmap file not found.";
+ if (!qnx4_sb(sb)->BitMap)
+ return "not enough memory for bitmap inode";
+ /* keep bitmap inode known */
+ return NULL;
}
+ brelse(bh);
}
- return NULL;
+ return "bitmap file not found.";
}
static int qnx4_fill_super(struct super_block *s, void *data, int silent)
@@ -270,7 +256,7 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
if (IS_ERR(root)) {
printk(KERN_ERR "qnx4: get inode failed\n");
ret = PTR_ERR(root);
- goto out;
+ goto outb;
}
ret = -ENOMEM;
@@ -283,6 +269,8 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
outi:
iput(root);
+ outb:
+ kfree(qs->BitMap);
out:
brelse(bh);
outnobh:
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 5ec59b20cf7..46741970371 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2125,6 +2125,8 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
mutex_unlock(&dqopt->dqio_mutex);
goto out_file_init;
}
+ if (dqopt->flags & DQUOT_QUOTA_SYS_FILE)
+ dqopt->info[type].dqi_flags |= DQF_SYS_FILE;
mutex_unlock(&dqopt->dqio_mutex);
spin_lock(&dq_state_lock);
dqopt->flags |= dquot_state_flag(flags, type);
@@ -2464,7 +2466,7 @@ int dquot_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
spin_lock(&dq_data_lock);
ii->dqi_bgrace = mi->dqi_bgrace;
ii->dqi_igrace = mi->dqi_igrace;
- ii->dqi_flags = mi->dqi_flags & DQF_MASK;
+ ii->dqi_flags = mi->dqi_flags & DQF_GETINFO_MASK;
ii->dqi_valid = IIF_ALL;
spin_unlock(&dq_data_lock);
mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
@@ -2490,8 +2492,8 @@ int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
if (ii->dqi_valid & IIF_IGRACE)
mi->dqi_igrace = ii->dqi_igrace;
if (ii->dqi_valid & IIF_FLAGS)
- mi->dqi_flags = (mi->dqi_flags & ~DQF_MASK) |
- (ii->dqi_flags & DQF_MASK);
+ mi->dqi_flags = (mi->dqi_flags & ~DQF_SETINFO_MASK) |
+ (ii->dqi_flags & DQF_SETINFO_MASK);
spin_unlock(&dq_data_lock);
mark_info_dirty(sb, type);
/* Force write to disk */
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 7898cd688a0..fc2c4388d12 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -292,11 +292,26 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
}
}
+/* Return 1 if 'cmd' will block on frozen filesystem */
+static int quotactl_cmd_write(int cmd)
+{
+ switch (cmd) {
+ case Q_GETFMT:
+ case Q_GETINFO:
+ case Q_SYNC:
+ case Q_XGETQSTAT:
+ case Q_XGETQUOTA:
+ case Q_XQUOTASYNC:
+ return 0;
+ }
+ return 1;
+}
+
/*
* look up a superblock on which quota ops will be performed
* - use the name of a block device to find the superblock thereon
*/
-static struct super_block *quotactl_block(const char __user *special)
+static struct super_block *quotactl_block(const char __user *special, int cmd)
{
#ifdef CONFIG_BLOCK
struct block_device *bdev;
@@ -309,7 +324,10 @@ static struct super_block *quotactl_block(const char __user *special)
putname(tmp);
if (IS_ERR(bdev))
return ERR_CAST(bdev);
- sb = get_super(bdev);
+ if (quotactl_cmd_write(cmd))
+ sb = get_super_thawed(bdev);
+ else
+ sb = get_super(bdev);
bdput(bdev);
if (!sb)
return ERR_PTR(-ENODEV);
@@ -361,7 +379,7 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
pathp = &path;
}
- sb = quotactl_block(special);
+ sb = quotactl_block(special, cmds);
if (IS_ERR(sb)) {
ret = PTR_ERR(sb);
goto out;
diff --git a/fs/select.c b/fs/select.c
index d33418fdc85..e782258d0de 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -912,7 +912,7 @@ static long do_restart_poll(struct restart_block *restart_block)
}
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
- long, timeout_msecs)
+ int, timeout_msecs)
{
struct timespec end_time, *to = NULL;
int ret;
diff --git a/fs/super.c b/fs/super.c
index de41e1e46f0..6277ec6cb60 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -634,6 +634,28 @@ rescan:
EXPORT_SYMBOL(get_super);
/**
+ * get_super_thawed - get thawed superblock of a device
+ * @bdev: device to get the superblock for
+ *
+ * Scans the superblock list and finds the superblock of the file system
+ * mounted on the device. The superblock is returned once it is thawed
+ * (or immediately if it was not frozen). %NULL is returned if no match
+ * is found.
+ */
+struct super_block *get_super_thawed(struct block_device *bdev)
+{
+ while (1) {
+ struct super_block *s = get_super(bdev);
+ if (!s || s->s_frozen == SB_UNFROZEN)
+ return s;
+ up_read(&s->s_umount);
+ vfs_check_frozen(s, SB_FREEZE_WRITE);
+ put_super(s);
+ }
+}
+EXPORT_SYMBOL(get_super_thawed);
+
+/**
* get_active_super - get an active reference to the superblock of a device
* @bdev: device to get the superblock for
*
@@ -1186,6 +1208,8 @@ int freeze_super(struct super_block *sb)
printk(KERN_ERR
"VFS:Filesystem freeze failed\n");
sb->s_frozen = SB_UNFROZEN;
+ smp_wmb();
+ wake_up(&sb->s_wait_unfrozen);
deactivate_locked_super(sb);
return ret;
}
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 62f4fb37789..00012e31829 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -493,6 +493,12 @@ int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr,
const void *ns = NULL;
int err;
+ if (!dir_sd) {
+ WARN(1, KERN_ERR "sysfs: kobject %s without dirent\n",
+ kobject_name(kobj));
+ return -ENOENT;
+ }
+
err = 0;
if (!sysfs_ns_type(dir_sd))
goto out;
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 4a802b4a905..85eb81683a2 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -318,8 +318,11 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha
struct sysfs_addrm_cxt acxt;
struct sysfs_dirent *sd;
- if (!dir_sd)
+ if (!dir_sd) {
+ WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n",
+ name);
return -ENOENT;
+ }
sysfs_addrm_start(&acxt, dir_sd);
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 292eff19803..ab7c53fe346 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -110,10 +110,4 @@ kmem_zone_destroy(kmem_zone_t *zone)
extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast);
extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast);
-static inline int
-kmem_shake_allow(gfp_t gfp_mask)
-{
- return ((gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS));
-}
-
#endif /* __XFS_SUPPORT_KMEM_H__ */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 574d4ee9b62..74b9baf36ac 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -111,8 +111,7 @@ xfs_ioend_new_eof(
xfs_fsize_t bsize;
bsize = ioend->io_offset + ioend->io_size;
- isize = MAX(ip->i_size, ip->i_new_size);
- isize = MIN(isize, bsize);
+ isize = MIN(i_size_read(VFS_I(ip)), bsize);
return isize > ip->i_d.di_size ? isize : 0;
}
@@ -126,11 +125,7 @@ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
}
/*
- * Update on-disk file size now that data has been written to disk. The
- * current in-memory file size is i_size. If a write is beyond eof i_new_size
- * will be the intended file size until i_size is updated. If this write does
- * not extend all the way to the valid file size then restrict this update to
- * the end of the write.
+ * Update on-disk file size now that data has been written to disk.
*
* This function does not block as blocking on the inode lock in IO completion
* can lead to IO completion order dependency deadlocks.. If it can't get the
@@ -1279,6 +1274,15 @@ xfs_end_io_direct_write(
struct xfs_ioend *ioend = iocb->private;
/*
+ * While the generic direct I/O code updates the inode size, it does
+ * so only after the end_io handler is called, which means our
+ * end_io handler thinks the on-disk size is outside the in-core
+ * size. To prevent this just update it a little bit earlier here.
+ */
+ if (offset + size > i_size_read(ioend->io_inode))
+ i_size_write(ioend->io_inode, offset + size);
+
+ /*
* blockdev_direct_IO can return an error even after the I/O
* completion handler was called. Thus we need to protect
* against double-freeing.
@@ -1340,12 +1344,11 @@ xfs_vm_write_failed(
if (to > inode->i_size) {
/*
- * punch out the delalloc blocks we have already allocated. We
- * don't call xfs_setattr() to do this as we may be in the
- * middle of a multi-iovec write and so the vfs inode->i_size
- * will not match the xfs ip->i_size and so it will zero too
- * much. Hence we jus truncate the page cache to zero what is
- * necessary and punch the delalloc blocks directly.
+ * Punch out the delalloc blocks we have already allocated.
+ *
+ * Don't bother with xfs_setattr given that nothing can have
+ * made it to disk yet as the page is still locked at this
+ * point.
*/
struct xfs_inode *ip = XFS_I(inode);
xfs_fileoff_t start_fsb;
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 1e5d97f86ea..08b9ac644c3 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -827,10 +827,6 @@ xfs_attr_inactive(xfs_inode_t *dp)
if (error)
goto out;
- /*
- * Commit the last in the sequence of transactions.
- */
- xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index c1b55e59655..d25eafd4d28 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -271,10 +271,6 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
dp = args->dp;
mp = dp->i_mount;
dp->i_d.di_forkoff = forkoff;
- dp->i_df.if_ext_max =
- XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
- dp->i_afp->if_ext_max =
- XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
ifp = dp->i_afp;
ASSERT(ifp->if_flags & XFS_IFINLINE);
@@ -326,7 +322,6 @@ xfs_attr_fork_reset(
ASSERT(ip->i_d.di_anextents == 0);
ASSERT(ip->i_afp == NULL);
- ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
@@ -389,10 +384,6 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
(args->op_flags & XFS_DA_OP_ADDNAME) ||
!(mp->m_flags & XFS_MOUNT_ATTR2) ||
dp->i_d.di_format == XFS_DINODE_FMT_BTREE);
- dp->i_afp->if_ext_max =
- XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
- dp->i_df.if_ext_max =
- XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
xfs_trans_log_inode(args->trans, dp,
XFS_ILOG_CORE | XFS_ILOG_ADATA);
}
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index d0ab7883705..188ef2fbd62 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -249,7 +249,27 @@ xfs_bmbt_lookup_ge(
}
/*
-* Update the record referred to by cur to the value given
+ * Check if the inode needs to be converted to btree format.
+ */
+static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
+{
+ return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_NEXTENTS(ip, whichfork) >
+ XFS_IFORK_MAXEXT(ip, whichfork);
+}
+
+/*
+ * Check if the inode should be converted to extent format.
+ */
+static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
+{
+ return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
+ XFS_IFORK_NEXTENTS(ip, whichfork) <=
+ XFS_IFORK_MAXEXT(ip, whichfork);
+}
+
+/*
+ * Update the record referred to by cur to the value given
* by [off, bno, len, state].
* This either works (return 0) or gets an EFSCORRUPTED error.
*/
@@ -683,8 +703,8 @@ xfs_bmap_add_extent_delay_real(
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
- if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
+
+ if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
bma->firstblock, bma->flist,
&bma->cur, 1, &tmp_rval, XFS_DATA_FORK);
@@ -767,8 +787,8 @@ xfs_bmap_add_extent_delay_real(
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
- if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
+
+ if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
bma->firstblock, bma->flist, &bma->cur, 1,
&tmp_rval, XFS_DATA_FORK);
@@ -836,8 +856,8 @@ xfs_bmap_add_extent_delay_real(
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
- if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
+
+ if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
bma->firstblock, bma->flist, &bma->cur,
1, &tmp_rval, XFS_DATA_FORK);
@@ -884,8 +904,7 @@ xfs_bmap_add_extent_delay_real(
}
/* convert to a btree if necessary */
- if (XFS_IFORK_FORMAT(bma->ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(bma->ip, XFS_DATA_FORK) > ifp->if_ext_max) {
+ if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
int tmp_logflags; /* partial log flag return val */
ASSERT(bma->cur == NULL);
@@ -1421,8 +1440,7 @@ xfs_bmap_add_extent_unwritten_real(
}
/* convert to a btree if necessary */
- if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) {
+ if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) {
int tmp_logflags; /* partial log flag return val */
ASSERT(cur == NULL);
@@ -1812,8 +1830,7 @@ xfs_bmap_add_extent_hole_real(
}
/* convert to a btree if necessary */
- if (XFS_IFORK_FORMAT(bma->ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(bma->ip, whichfork) > ifp->if_ext_max) {
+ if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
int tmp_logflags; /* partial log flag return val */
ASSERT(bma->cur == NULL);
@@ -3037,8 +3054,7 @@ xfs_bmap_extents_to_btree(
ifp = XFS_IFORK_PTR(ip, whichfork);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
/*
* Make space in the inode incore.
*/
@@ -3184,13 +3200,8 @@ xfs_bmap_forkoff_reset(
ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
uint dfl_forkoff = xfs_default_attroffset(ip) >> 3;
- if (dfl_forkoff > ip->i_d.di_forkoff) {
+ if (dfl_forkoff > ip->i_d.di_forkoff)
ip->i_d.di_forkoff = dfl_forkoff;
- ip->i_df.if_ext_max =
- XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
- ip->i_afp->if_ext_max =
- XFS_IFORK_ASIZE(ip) / sizeof(xfs_bmbt_rec_t);
- }
}
}
@@ -3430,8 +3441,6 @@ xfs_bmap_add_attrfork(
int error; /* error return value */
ASSERT(XFS_IFORK_Q(ip) == 0);
- ASSERT(ip->i_df.if_ext_max ==
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
mp = ip->i_mount;
ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
@@ -3486,12 +3495,9 @@ xfs_bmap_add_attrfork(
error = XFS_ERROR(EINVAL);
goto error1;
}
- ip->i_df.if_ext_max =
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+
ASSERT(ip->i_afp == NULL);
ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
- ip->i_afp->if_ext_max =
- XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
ip->i_afp->if_flags = XFS_IFEXTENTS;
logflags = 0;
xfs_bmap_init(&flist, &firstblock);
@@ -3535,20 +3541,17 @@ xfs_bmap_add_attrfork(
} else
spin_unlock(&mp->m_sb_lock);
}
- if ((error = xfs_bmap_finish(&tp, &flist, &committed)))
+
+ error = xfs_bmap_finish(&tp, &flist, &committed);
+ if (error)
goto error2;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
- ASSERT(ip->i_df.if_ext_max ==
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
- return error;
+ return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
error2:
xfs_bmap_cancel(&flist);
error1:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
error0:
xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
- ASSERT(ip->i_df.if_ext_max ==
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
return error;
}
@@ -3994,11 +3997,8 @@ xfs_bmap_one_block(
xfs_bmbt_irec_t s; /* internal version of extent */
#ifndef DEBUG
- if (whichfork == XFS_DATA_FORK) {
- return S_ISREG(ip->i_d.di_mode) ?
- (ip->i_size == ip->i_mount->m_sb.sb_blocksize) :
- (ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize);
- }
+ if (whichfork == XFS_DATA_FORK)
+ return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize;
#endif /* !DEBUG */
if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
return 0;
@@ -4010,7 +4010,7 @@ xfs_bmap_one_block(
xfs_bmbt_get_all(ep, &s);
rval = s.br_startoff == 0 && s.br_blockcount == 1;
if (rval && whichfork == XFS_DATA_FORK)
- ASSERT(ip->i_size == ip->i_mount->m_sb.sb_blocksize);
+ ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize);
return rval;
}
@@ -4379,8 +4379,6 @@ xfs_bmapi_read(
XFS_STATS_INC(xs_blk_mapr);
ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
error = xfs_iread_extents(NULL, ip, whichfork);
@@ -4871,8 +4869,6 @@ xfs_bmapi_write(
return XFS_ERROR(EIO);
ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
XFS_STATS_INC(xs_blk_mapw);
@@ -4981,8 +4977,7 @@ xfs_bmapi_write(
/*
* Transform from btree to extents, give it cur.
*/
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
- XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
+ if (xfs_bmap_wants_extents(ip, whichfork)) {
int tmp_logflags = 0;
ASSERT(bma.cur);
@@ -4992,10 +4987,10 @@ xfs_bmapi_write(
if (error)
goto error0;
}
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
- XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max);
+ XFS_IFORK_NEXTENTS(ip, whichfork) >
+ XFS_IFORK_MAXEXT(ip, whichfork));
error = 0;
error0:
/*
@@ -5095,8 +5090,7 @@ xfs_bunmapi(
ASSERT(len > 0);
ASSERT(nexts >= 0);
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
if (!(ifp->if_flags & XFS_IFEXTENTS) &&
(error = xfs_iread_extents(tp, ip, whichfork)))
return error;
@@ -5322,7 +5316,8 @@ xfs_bunmapi(
*/
if (!wasdel && xfs_trans_get_block_res(tp) == 0 &&
XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(ip, whichfork) >= ifp->if_ext_max &&
+ XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */
+ XFS_IFORK_MAXEXT(ip, whichfork) &&
del.br_startoff > got.br_startoff &&
del.br_startoff + del.br_blockcount <
got.br_startoff + got.br_blockcount) {
@@ -5353,13 +5348,11 @@ nodelete:
}
}
*done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
/*
* Convert to a btree if necessary.
*/
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) {
+ if (xfs_bmap_needs_btree(ip, whichfork)) {
ASSERT(cur == NULL);
error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist,
&cur, 0, &tmp_logflags, whichfork);
@@ -5370,8 +5363,7 @@ nodelete:
/*
* transform from btree to extents, give it cur
*/
- else if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
- XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
+ else if (xfs_bmap_wants_extents(ip, whichfork)) {
ASSERT(cur != NULL);
error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
whichfork);
@@ -5382,8 +5374,6 @@ nodelete:
/*
* transform from extents to local?
*/
- ASSERT(ifp->if_ext_max ==
- XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
error = 0;
error0:
/*
@@ -5434,7 +5424,7 @@ xfs_getbmapx_fix_eof_hole(
if (startblock == HOLESTARTBLOCK) {
mp = ip->i_mount;
out->bmv_block = -1;
- fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, ip->i_size));
+ fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
fixlen -= out->bmv_offset;
if (prealloced && out->bmv_offset + out->bmv_length == end) {
/* Came to hole at EOF. Trim it. */
@@ -5522,7 +5512,7 @@ xfs_getbmap(
fixlen = XFS_MAXIOFFSET(mp);
} else {
prealloced = 0;
- fixlen = ip->i_size;
+ fixlen = XFS_ISIZE(ip);
}
}
@@ -5551,7 +5541,7 @@ xfs_getbmap(
xfs_ilock(ip, XFS_IOLOCK_SHARED);
if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
- if (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size) {
+ if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF);
if (error)
goto out_unlock_iolock;
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 654dc6f05ba..dd974a55c77 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -163,12 +163,14 @@ xfs_swap_extents_check_format(
/* Check temp in extent form to max in target */
if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max)
+ XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
+ XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
return EINVAL;
/* Check target in extent form to max in temp */
if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max)
+ XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
+ XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
return EINVAL;
/*
@@ -180,18 +182,25 @@ xfs_swap_extents_check_format(
* (a common defrag case) which will occur when the temp inode is in
* extent format...
*/
- if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
- ((XFS_IFORK_BOFF(ip) &&
- tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) ||
- XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max))
- return EINVAL;
+ if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+ if (XFS_IFORK_BOFF(ip) &&
+ tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
+ return EINVAL;
+ if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
+ XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
+ return EINVAL;
+ }
/* Reciprocal target->temp btree format checks */
- if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
- ((XFS_IFORK_BOFF(tip) &&
- ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) ||
- XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= tip->i_df.if_ext_max))
- return EINVAL;
+ if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+ if (XFS_IFORK_BOFF(tip) &&
+ ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
+ return EINVAL;
+
+ if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
+ XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
+ return EINVAL;
+ }
return 0;
}
@@ -349,16 +358,6 @@ xfs_swap_extents(
*tifp = *tempifp; /* struct copy */
/*
- * Fix the in-memory data fork values that are dependent on the fork
- * offset in the inode. We can't assume they remain the same as attr2
- * has dynamic fork offsets.
- */
- ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) /
- (uint)sizeof(xfs_bmbt_rec_t);
- tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) /
- (uint)sizeof(xfs_bmbt_rec_t);
-
- /*
* Fix the on-disk inode values
*/
tmp = (__uint64_t)ip->i_d.di_nblocks;
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index b4ff40b5f91..53db20ee3e7 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -63,82 +63,6 @@ int xfs_dqerror_mod = 33;
static struct lock_class_key xfs_dquot_other_class;
/*
- * Allocate and initialize a dquot. We don't always allocate fresh memory;
- * we try to reclaim a free dquot if the number of incore dquots are above
- * a threshold.
- * The only field inside the core that gets initialized at this point
- * is the d_id field. The idea is to fill in the entire q_core
- * when we read in the on disk dquot.
- */
-STATIC xfs_dquot_t *
-xfs_qm_dqinit(
- xfs_mount_t *mp,
- xfs_dqid_t id,
- uint type)
-{
- xfs_dquot_t *dqp;
- boolean_t brandnewdquot;
-
- brandnewdquot = xfs_qm_dqalloc_incore(&dqp);
- dqp->dq_flags = type;
- dqp->q_core.d_id = cpu_to_be32(id);
- dqp->q_mount = mp;
-
- /*
- * No need to re-initialize these if this is a reclaimed dquot.
- */
- if (brandnewdquot) {
- INIT_LIST_HEAD(&dqp->q_freelist);
- mutex_init(&dqp->q_qlock);
- init_waitqueue_head(&dqp->q_pinwait);
-
- /*
- * Because we want to use a counting completion, complete
- * the flush completion once to allow a single access to
- * the flush completion without blocking.
- */
- init_completion(&dqp->q_flush);
- complete(&dqp->q_flush);
-
- trace_xfs_dqinit(dqp);
- } else {
- /*
- * Only the q_core portion was zeroed in dqreclaim_one().
- * So, we need to reset others.
- */
- dqp->q_nrefs = 0;
- dqp->q_blkno = 0;
- INIT_LIST_HEAD(&dqp->q_mplist);
- INIT_LIST_HEAD(&dqp->q_hashlist);
- dqp->q_bufoffset = 0;
- dqp->q_fileoffset = 0;
- dqp->q_transp = NULL;
- dqp->q_gdquot = NULL;
- dqp->q_res_bcount = 0;
- dqp->q_res_icount = 0;
- dqp->q_res_rtbcount = 0;
- atomic_set(&dqp->q_pincount, 0);
- dqp->q_hash = NULL;
- ASSERT(list_empty(&dqp->q_freelist));
-
- trace_xfs_dqreuse(dqp);
- }
-
- /*
- * In either case we need to make sure group quotas have a different
- * lock class than user quotas, to make sure lockdep knows we can
- * locks of one of each at the same time.
- */
- if (!(type & XFS_DQ_USER))
- lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class);
-
- /*
- * log item gets initialized later
- */
- return (dqp);
-}
-
-/*
* This is called to free all the memory associated with a dquot
*/
void
@@ -215,10 +139,10 @@ xfs_qm_adjust_dqtimers(
if (!d->d_btimer) {
if ((d->d_blk_softlimit &&
- (be64_to_cpu(d->d_bcount) >=
+ (be64_to_cpu(d->d_bcount) >
be64_to_cpu(d->d_blk_softlimit))) ||
(d->d_blk_hardlimit &&
- (be64_to_cpu(d->d_bcount) >=
+ (be64_to_cpu(d->d_bcount) >
be64_to_cpu(d->d_blk_hardlimit)))) {
d->d_btimer = cpu_to_be32(get_seconds() +
mp->m_quotainfo->qi_btimelimit);
@@ -227,10 +151,10 @@ xfs_qm_adjust_dqtimers(
}
} else {
if ((!d->d_blk_softlimit ||
- (be64_to_cpu(d->d_bcount) <
+ (be64_to_cpu(d->d_bcount) <=
be64_to_cpu(d->d_blk_softlimit))) &&
(!d->d_blk_hardlimit ||
- (be64_to_cpu(d->d_bcount) <
+ (be64_to_cpu(d->d_bcount) <=
be64_to_cpu(d->d_blk_hardlimit)))) {
d->d_btimer = 0;
}
@@ -238,10 +162,10 @@ xfs_qm_adjust_dqtimers(
if (!d->d_itimer) {
if ((d->d_ino_softlimit &&
- (be64_to_cpu(d->d_icount) >=
+ (be64_to_cpu(d->d_icount) >
be64_to_cpu(d->d_ino_softlimit))) ||
(d->d_ino_hardlimit &&
- (be64_to_cpu(d->d_icount) >=
+ (be64_to_cpu(d->d_icount) >
be64_to_cpu(d->d_ino_hardlimit)))) {
d->d_itimer = cpu_to_be32(get_seconds() +
mp->m_quotainfo->qi_itimelimit);
@@ -250,10 +174,10 @@ xfs_qm_adjust_dqtimers(
}
} else {
if ((!d->d_ino_softlimit ||
- (be64_to_cpu(d->d_icount) <
+ (be64_to_cpu(d->d_icount) <=
be64_to_cpu(d->d_ino_softlimit))) &&
(!d->d_ino_hardlimit ||
- (be64_to_cpu(d->d_icount) <
+ (be64_to_cpu(d->d_icount) <=
be64_to_cpu(d->d_ino_hardlimit)))) {
d->d_itimer = 0;
}
@@ -261,10 +185,10 @@ xfs_qm_adjust_dqtimers(
if (!d->d_rtbtimer) {
if ((d->d_rtb_softlimit &&
- (be64_to_cpu(d->d_rtbcount) >=
+ (be64_to_cpu(d->d_rtbcount) >
be64_to_cpu(d->d_rtb_softlimit))) ||
(d->d_rtb_hardlimit &&
- (be64_to_cpu(d->d_rtbcount) >=
+ (be64_to_cpu(d->d_rtbcount) >
be64_to_cpu(d->d_rtb_hardlimit)))) {
d->d_rtbtimer = cpu_to_be32(get_seconds() +
mp->m_quotainfo->qi_rtbtimelimit);
@@ -273,10 +197,10 @@ xfs_qm_adjust_dqtimers(
}
} else {
if ((!d->d_rtb_softlimit ||
- (be64_to_cpu(d->d_rtbcount) <
+ (be64_to_cpu(d->d_rtbcount) <=
be64_to_cpu(d->d_rtb_softlimit))) &&
(!d->d_rtb_hardlimit ||
- (be64_to_cpu(d->d_rtbcount) <
+ (be64_to_cpu(d->d_rtbcount) <=
be64_to_cpu(d->d_rtb_hardlimit)))) {
d->d_rtbtimer = 0;
}
@@ -567,7 +491,32 @@ xfs_qm_dqread(
int error;
int cancelflags = 0;
- dqp = xfs_qm_dqinit(mp, id, type);
+
+ dqp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP);
+
+ dqp->dq_flags = type;
+ dqp->q_core.d_id = cpu_to_be32(id);
+ dqp->q_mount = mp;
+ INIT_LIST_HEAD(&dqp->q_freelist);
+ mutex_init(&dqp->q_qlock);
+ init_waitqueue_head(&dqp->q_pinwait);
+
+ /*
+ * Because we want to use a counting completion, complete
+ * the flush completion once to allow a single access to
+ * the flush completion without blocking.
+ */
+ init_completion(&dqp->q_flush);
+ complete(&dqp->q_flush);
+
+ /*
+ * Make sure group quotas have a different lock class than user
+ * quotas.
+ */
+ if (!(type & XFS_DQ_USER))
+ lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class);
+
+ atomic_inc(&xfs_Gqm->qm_totaldquots);
trace_xfs_dqread(dqp);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index f675f3d9d7b..7e5bc872f2b 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -327,7 +327,7 @@ xfs_file_aio_read(
mp->m_rtdev_targp : mp->m_ddev_targp;
if ((iocb->ki_pos & target->bt_smask) ||
(size & target->bt_smask)) {
- if (iocb->ki_pos == ip->i_size)
+ if (iocb->ki_pos == i_size_read(inode))
return 0;
return -XFS_ERROR(EINVAL);
}
@@ -412,51 +412,6 @@ xfs_file_splice_read(
return ret;
}
-STATIC void
-xfs_aio_write_isize_update(
- struct inode *inode,
- loff_t *ppos,
- ssize_t bytes_written)
-{
- struct xfs_inode *ip = XFS_I(inode);
- xfs_fsize_t isize = i_size_read(inode);
-
- if (bytes_written > 0)
- XFS_STATS_ADD(xs_write_bytes, bytes_written);
-
- if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
- *ppos > isize))
- *ppos = isize;
-
- if (*ppos > ip->i_size) {
- xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
- if (*ppos > ip->i_size)
- ip->i_size = *ppos;
- xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
- }
-}
-
-/*
- * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
- * part of the I/O may have been written to disk before the error occurred. In
- * this case the on-disk file size may have been adjusted beyond the in-memory
- * file size and now needs to be truncated back.
- */
-STATIC void
-xfs_aio_write_newsize_update(
- struct xfs_inode *ip,
- xfs_fsize_t new_size)
-{
- if (new_size == ip->i_new_size) {
- xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
- if (new_size == ip->i_new_size)
- ip->i_new_size = 0;
- if (ip->i_d.di_size > ip->i_size)
- ip->i_d.di_size = ip->i_size;
- xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
- }
-}
-
/*
* xfs_file_splice_write() does not use xfs_rw_ilock() because
* generic_file_splice_write() takes the i_mutex itself. This, in theory,
@@ -475,7 +430,6 @@ xfs_file_splice_write(
{
struct inode *inode = outfilp->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
- xfs_fsize_t new_size;
int ioflags = 0;
ssize_t ret;
@@ -489,19 +443,12 @@ xfs_file_splice_write(
xfs_ilock(ip, XFS_IOLOCK_EXCL);
- new_size = *ppos + count;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- if (new_size > ip->i_size)
- ip->i_new_size = new_size;
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
+ if (ret > 0)
+ XFS_STATS_ADD(xs_write_bytes, ret);
- xfs_aio_write_isize_update(inode, ppos, ret);
- xfs_aio_write_newsize_update(ip, new_size);
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
return ret;
}
@@ -689,28 +636,26 @@ out_lock:
/*
* Common pre-write limit and setup checks.
*
- * Returns with iolock held according to @iolock.
+ * Called with the iolocked held either shared and exclusive according to
+ * @iolock, and returns with it held. Might upgrade the iolock to exclusive
+ * if called for a direct write beyond i_size.
*/
STATIC ssize_t
xfs_file_aio_write_checks(
struct file *file,
loff_t *pos,
size_t *count,
- xfs_fsize_t *new_sizep,
int *iolock)
{
struct inode *inode = file->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
- xfs_fsize_t new_size;
int error = 0;
xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
- *new_sizep = 0;
restart:
error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
if (error) {
- xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
- *iolock = 0;
+ xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -720,36 +665,21 @@ restart:
/*
* If the offset is beyond the size of the file, we need to zero any
* blocks that fall between the existing EOF and the start of this
- * write. There is no need to issue zeroing if another in-flght IO ends
- * at or before this one If zeronig is needed and we are currently
- * holding the iolock shared, we need to update it to exclusive which
- * involves dropping all locks and relocking to maintain correct locking
- * order. If we do this, restart the function to ensure all checks and
- * values are still valid.
+ * write. If zeroing is needed and we are currently holding the
+ * iolock shared, we need to update it to exclusive which involves
+ * dropping all locks and relocking to maintain correct locking order.
+ * If we do this, restart the function to ensure all checks and values
+ * are still valid.
*/
- if ((ip->i_new_size && *pos > ip->i_new_size) ||
- (!ip->i_new_size && *pos > ip->i_size)) {
+ if (*pos > i_size_read(inode)) {
if (*iolock == XFS_IOLOCK_SHARED) {
xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
*iolock = XFS_IOLOCK_EXCL;
xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
goto restart;
}
- error = -xfs_zero_eof(ip, *pos, ip->i_size);
+ error = -xfs_zero_eof(ip, *pos, i_size_read(inode));
}
-
- /*
- * If this IO extends beyond EOF, we may need to update ip->i_new_size.
- * We have already zeroed space beyond EOF (if necessary). Only update
- * ip->i_new_size if this IO ends beyond any other in-flight writes.
- */
- new_size = *pos + *count;
- if (new_size > ip->i_size) {
- if (new_size > ip->i_new_size)
- ip->i_new_size = new_size;
- *new_sizep = new_size;
- }
-
xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
if (error)
return error;
@@ -794,9 +724,7 @@ xfs_file_dio_aio_write(
const struct iovec *iovp,
unsigned long nr_segs,
loff_t pos,
- size_t ocount,
- xfs_fsize_t *new_size,
- int *iolock)
+ size_t ocount)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -806,10 +734,10 @@ xfs_file_dio_aio_write(
ssize_t ret = 0;
size_t count = ocount;
int unaligned_io = 0;
+ int iolock;
struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp;
- *iolock = 0;
if ((pos & target->bt_smask) || (count & target->bt_smask))
return -XFS_ERROR(EINVAL);
@@ -824,31 +752,31 @@ xfs_file_dio_aio_write(
* EOF zeroing cases and fill out the new inode size as appropriate.
*/
if (unaligned_io || mapping->nrpages)
- *iolock = XFS_IOLOCK_EXCL;
+ iolock = XFS_IOLOCK_EXCL;
else
- *iolock = XFS_IOLOCK_SHARED;
- xfs_rw_ilock(ip, *iolock);
+ iolock = XFS_IOLOCK_SHARED;
+ xfs_rw_ilock(ip, iolock);
/*
* Recheck if there are cached pages that need invalidate after we got
* the iolock to protect against other threads adding new pages while
* we were waiting for the iolock.
*/
- if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) {
- xfs_rw_iunlock(ip, *iolock);
- *iolock = XFS_IOLOCK_EXCL;
- xfs_rw_ilock(ip, *iolock);
+ if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) {
+ xfs_rw_iunlock(ip, iolock);
+ iolock = XFS_IOLOCK_EXCL;
+ xfs_rw_ilock(ip, iolock);
}
- ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock);
+ ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
if (ret)
- return ret;
+ goto out;
if (mapping->nrpages) {
ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
FI_REMAPF_LOCKED);
if (ret)
- return ret;
+ goto out;
}
/*
@@ -857,15 +785,18 @@ xfs_file_dio_aio_write(
*/
if (unaligned_io)
inode_dio_wait(inode);
- else if (*iolock == XFS_IOLOCK_EXCL) {
+ else if (iolock == XFS_IOLOCK_EXCL) {
xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
- *iolock = XFS_IOLOCK_SHARED;
+ iolock = XFS_IOLOCK_SHARED;
}
trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
ret = generic_file_direct_write(iocb, iovp,
&nr_segs, pos, &iocb->ki_pos, count, ocount);
+out:
+ xfs_rw_iunlock(ip, iolock);
+
/* No fallback to buffered IO on errors for XFS. */
ASSERT(ret < 0 || ret == count);
return ret;
@@ -877,9 +808,7 @@ xfs_file_buffered_aio_write(
const struct iovec *iovp,
unsigned long nr_segs,
loff_t pos,
- size_t ocount,
- xfs_fsize_t *new_size,
- int *iolock)
+ size_t ocount)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -887,14 +816,14 @@ xfs_file_buffered_aio_write(
struct xfs_inode *ip = XFS_I(inode);
ssize_t ret;
int enospc = 0;
+ int iolock = XFS_IOLOCK_EXCL;
size_t count = ocount;
- *iolock = XFS_IOLOCK_EXCL;
- xfs_rw_ilock(ip, *iolock);
+ xfs_rw_ilock(ip, iolock);
- ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock);
+ ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
if (ret)
- return ret;
+ goto out;
/* We can write back this queue in page reclaim */
current->backing_dev_info = mapping->backing_dev_info;
@@ -908,13 +837,15 @@ write_retry:
* page locks and retry *once*
*/
if (ret == -ENOSPC && !enospc) {
- ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
- if (ret)
- return ret;
enospc = 1;
- goto write_retry;
+ ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
+ if (!ret)
+ goto write_retry;
}
+
current->backing_dev_info = NULL;
+out:
+ xfs_rw_iunlock(ip, iolock);
return ret;
}
@@ -930,9 +861,7 @@ xfs_file_aio_write(
struct inode *inode = mapping->host;
struct xfs_inode *ip = XFS_I(inode);
ssize_t ret;
- int iolock;
size_t ocount = 0;
- xfs_fsize_t new_size = 0;
XFS_STATS_INC(xs_write_calls);
@@ -951,33 +880,22 @@ xfs_file_aio_write(
return -EIO;
if (unlikely(file->f_flags & O_DIRECT))
- ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
- ocount, &new_size, &iolock);
+ ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount);
else
ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
- ocount, &new_size, &iolock);
-
- xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
+ ocount);
- if (ret <= 0)
- goto out_unlock;
+ if (ret > 0) {
+ ssize_t err;
- /* Handle various SYNC-type writes */
- if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
- loff_t end = pos + ret - 1;
- int error;
+ XFS_STATS_ADD(xs_write_bytes, ret);
- xfs_rw_iunlock(ip, iolock);
- error = xfs_file_fsync(file, pos, end,
- (file->f_flags & __O_SYNC) ? 0 : 1);
- xfs_rw_ilock(ip, iolock);
- if (error)
- ret = error;
+ /* Handle various SYNC-type writes */
+ err = generic_write_sync(file, pos, ret);
+ if (err < 0)
+ ret = err;
}
-out_unlock:
- xfs_aio_write_newsize_update(ip, new_size);
- xfs_rw_iunlock(ip, iolock);
return ret;
}
diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c
index ed88ed16811..652b875a9d4 100644
--- a/fs/xfs/xfs_fs_subr.c
+++ b/fs/xfs/xfs_fs_subr.c
@@ -90,7 +90,7 @@ xfs_wait_on_pages(
if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
return -filemap_fdatawait_range(mapping, first,
- last == -1 ? ip->i_size - 1 : last);
+ last == -1 ? XFS_ISIZE(ip) - 1 : last);
}
return 0;
}
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 3960a066d7f..8c3e46394d4 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -77,7 +77,7 @@ xfs_inode_alloc(
ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!spin_is_locked(&ip->i_flags_lock));
- ASSERT(completion_done(&ip->i_flush));
+ ASSERT(!xfs_isiflocked(ip));
ASSERT(ip->i_ino == 0);
mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
@@ -94,8 +94,6 @@ xfs_inode_alloc(
ip->i_update_core = 0;
ip->i_delayed_blks = 0;
memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
- ip->i_size = 0;
- ip->i_new_size = 0;
return ip;
}
@@ -150,7 +148,7 @@ xfs_inode_free(
/* asserts to verify all state is correct here */
ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!spin_is_locked(&ip->i_flags_lock));
- ASSERT(completion_done(&ip->i_flush));
+ ASSERT(!xfs_isiflocked(ip));
/*
* Because we use RCU freeing we need to ensure the inode always
@@ -450,8 +448,6 @@ again:
*ipp = ip;
- ASSERT(ip->i_df.if_ext_max ==
- XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
/*
* If we have a real type for an on-disk inode, we can set ops(&unlock)
* now. If it's a new inode being created, xfs_ialloc will handle it.
@@ -715,3 +711,19 @@ xfs_isilocked(
return 0;
}
#endif
+
+void
+__xfs_iflock(
+ struct xfs_inode *ip)
+{
+ wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
+ DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
+
+ do {
+ prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ if (xfs_isiflocked(ip))
+ io_schedule();
+ } while (!xfs_iflock_nowait(ip));
+
+ finish_wait(wq, &wait.wait);
+}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 9dda7cc3284..b21022499c2 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -299,11 +299,8 @@ xfs_iformat(
{
xfs_attr_shortform_t *atp;
int size;
- int error;
+ int error = 0;
xfs_fsize_t di_size;
- ip->i_df.if_ext_max =
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
- error = 0;
if (unlikely(be32_to_cpu(dip->di_nextents) +
be16_to_cpu(dip->di_anextents) >
@@ -350,7 +347,6 @@ xfs_iformat(
return XFS_ERROR(EFSCORRUPTED);
}
ip->i_d.di_size = 0;
- ip->i_size = 0;
ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
break;
@@ -409,10 +405,10 @@ xfs_iformat(
}
if (!XFS_DFORK_Q(dip))
return 0;
+
ASSERT(ip->i_afp == NULL);
ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
- ip->i_afp->if_ext_max =
- XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+
switch (dip->di_aformat) {
case XFS_DINODE_FMT_LOCAL:
atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
@@ -604,10 +600,11 @@ xfs_iformat_btree(
* or the number of extents is greater than the number of
* blocks.
*/
- if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max
- || XFS_BMDR_SPACE_CALC(nrecs) >
- XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)
- || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
+ if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
+ XFS_IFORK_MAXEXT(ip, whichfork) ||
+ XFS_BMDR_SPACE_CALC(nrecs) >
+ XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) ||
+ XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).",
(unsigned long long) ip->i_ino);
XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
@@ -835,12 +832,6 @@ xfs_iread(
* with the uninitialized part of it.
*/
ip->i_d.di_mode = 0;
- /*
- * Initialize the per-fork minima and maxima for a new
- * inode here. xfs_iformat will do it for old inodes.
- */
- ip->i_df.if_ext_max =
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
}
/*
@@ -861,7 +852,6 @@ xfs_iread(
}
ip->i_delayed_blks = 0;
- ip->i_size = ip->i_d.di_size;
/*
* Mark the buffer containing the inode as something to keep
@@ -1051,7 +1041,6 @@ xfs_ialloc(
}
ip->i_d.di_size = 0;
- ip->i_size = 0;
ip->i_d.di_nextents = 0;
ASSERT(ip->i_d.di_nblocks == 0);
@@ -1166,52 +1155,6 @@ xfs_ialloc(
}
/*
- * Check to make sure that there are no blocks allocated to the
- * file beyond the size of the file. We don't check this for
- * files with fixed size extents or real time extents, but we
- * at least do it for regular files.
- */
-#ifdef DEBUG
-STATIC void
-xfs_isize_check(
- struct xfs_inode *ip,
- xfs_fsize_t isize)
-{
- struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t map_first;
- int nimaps;
- xfs_bmbt_irec_t imaps[2];
- int error;
-
- if (!S_ISREG(ip->i_d.di_mode))
- return;
-
- if (XFS_IS_REALTIME_INODE(ip))
- return;
-
- if (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
- return;
-
- nimaps = 2;
- map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
- /*
- * The filesystem could be shutting down, so bmapi may return
- * an error.
- */
- error = xfs_bmapi_read(ip, map_first,
- (XFS_B_TO_FSB(mp,
- (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - map_first),
- imaps, &nimaps, XFS_BMAPI_ENTIRE);
- if (error)
- return;
- ASSERT(nimaps == 1);
- ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
-}
-#else /* DEBUG */
-#define xfs_isize_check(ip, isize)
-#endif /* DEBUG */
-
-/*
* Free up the underlying blocks past new_size. The new size must be smaller
* than the current size. This routine can be used both for the attribute and
* data fork, and does not modify the inode size, which is left to the caller.
@@ -1252,12 +1195,14 @@ xfs_itruncate_extents(
int done = 0;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
- ASSERT(new_size <= ip->i_size);
+ ASSERT(new_size <= XFS_ISIZE(ip));
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
ASSERT(ip->i_itemp != NULL);
ASSERT(ip->i_itemp->ili_lock_flags == 0);
ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
+ trace_xfs_itruncate_extents_start(ip, new_size);
+
/*
* Since it is possible for space to become allocated beyond
* the end of the file (in a crash where the space is allocated
@@ -1325,6 +1270,14 @@ xfs_itruncate_extents(
goto out;
}
+ /*
+ * Always re-log the inode so that our permanent transaction can keep
+ * on rolling it forward in the log.
+ */
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ trace_xfs_itruncate_extents_end(ip, new_size);
+
out:
*tpp = tp;
return error;
@@ -1338,74 +1291,6 @@ out_bmap_cancel:
goto out;
}
-int
-xfs_itruncate_data(
- struct xfs_trans **tpp,
- struct xfs_inode *ip,
- xfs_fsize_t new_size)
-{
- int error;
-
- trace_xfs_itruncate_data_start(ip, new_size);
-
- /*
- * The first thing we do is set the size to new_size permanently on
- * disk. This way we don't have to worry about anyone ever being able
- * to look at the data being freed even in the face of a crash.
- * What we're getting around here is the case where we free a block, it
- * is allocated to another file, it is written to, and then we crash.
- * If the new data gets written to the file but the log buffers
- * containing the free and reallocation don't, then we'd end up with
- * garbage in the blocks being freed. As long as we make the new_size
- * permanent before actually freeing any blocks it doesn't matter if
- * they get written to.
- */
- if (ip->i_d.di_nextents > 0) {
- /*
- * If we are not changing the file size then do not update
- * the on-disk file size - we may be called from
- * xfs_inactive_free_eofblocks(). If we update the on-disk
- * file size and then the system crashes before the contents
- * of the file are flushed to disk then the files may be
- * full of holes (ie NULL files bug).
- */
- if (ip->i_size != new_size) {
- ip->i_d.di_size = new_size;
- ip->i_size = new_size;
- xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
- }
- }
-
- error = xfs_itruncate_extents(tpp, ip, XFS_DATA_FORK, new_size);
- if (error)
- return error;
-
- /*
- * If we are not changing the file size then do not update the on-disk
- * file size - we may be called from xfs_inactive_free_eofblocks().
- * If we update the on-disk file size and then the system crashes
- * before the contents of the file are flushed to disk then the files
- * may be full of holes (ie NULL files bug).
- */
- xfs_isize_check(ip, new_size);
- if (ip->i_size != new_size) {
- ip->i_d.di_size = new_size;
- ip->i_size = new_size;
- }
-
- ASSERT(new_size != 0 || ip->i_delayed_blks == 0);
- ASSERT(new_size != 0 || ip->i_d.di_nextents == 0);
-
- /*
- * Always re-log the inode so that our permanent transaction can keep
- * on rolling it forward in the log.
- */
- xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
-
- trace_xfs_itruncate_data_end(ip, new_size);
- return 0;
-}
-
/*
* This is called when the inode's link count goes to 0.
* We place the on-disk inode on a list in the AGI. It
@@ -1824,8 +1709,7 @@ xfs_ifree(
ASSERT(ip->i_d.di_nlink == 0);
ASSERT(ip->i_d.di_nextents == 0);
ASSERT(ip->i_d.di_anextents == 0);
- ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) ||
- (!S_ISREG(ip->i_d.di_mode)));
+ ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode));
ASSERT(ip->i_d.di_nblocks == 0);
/*
@@ -1844,8 +1728,6 @@ xfs_ifree(
ip->i_d.di_flags = 0;
ip->i_d.di_dmevmask = 0;
ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */
- ip->i_df.if_ext_max =
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
/*
@@ -2151,7 +2033,7 @@ xfs_idestroy_fork(
* once someone is waiting for it to be unpinned.
*/
static void
-xfs_iunpin_nowait(
+xfs_iunpin(
struct xfs_inode *ip)
{
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
@@ -2163,14 +2045,29 @@ xfs_iunpin_nowait(
}
+static void
+__xfs_iunpin_wait(
+ struct xfs_inode *ip)
+{
+ wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
+ DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
+
+ xfs_iunpin(ip);
+
+ do {
+ prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ if (xfs_ipincount(ip))
+ io_schedule();
+ } while (xfs_ipincount(ip));
+ finish_wait(wq, &wait.wait);
+}
+
void
xfs_iunpin_wait(
struct xfs_inode *ip)
{
- if (xfs_ipincount(ip)) {
- xfs_iunpin_nowait(ip);
- wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0));
- }
+ if (xfs_ipincount(ip))
+ __xfs_iunpin_wait(ip);
}
/*
@@ -2510,9 +2407,9 @@ xfs_iflush(
XFS_STATS_INC(xs_iflush_count);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
- ASSERT(!completion_done(&ip->i_flush));
+ ASSERT(xfs_isiflocked(ip));
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
- ip->i_d.di_nextents > ip->i_df.if_ext_max);
+ ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
iip = ip->i_itemp;
mp = ip->i_mount;
@@ -2529,7 +2426,7 @@ xfs_iflush(
* out for us if they occur after the log force completes.
*/
if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) {
- xfs_iunpin_nowait(ip);
+ xfs_iunpin(ip);
xfs_ifunlock(ip);
return EAGAIN;
}
@@ -2626,9 +2523,9 @@ xfs_iflush_int(
#endif
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
- ASSERT(!completion_done(&ip->i_flush));
+ ASSERT(xfs_isiflocked(ip));
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
- ip->i_d.di_nextents > ip->i_df.if_ext_max);
+ ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
iip = ip->i_itemp;
mp = ip->i_mount;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index f0e6b151ba3..2f27b745408 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -66,7 +66,6 @@ typedef struct xfs_ifork {
struct xfs_btree_block *if_broot; /* file's incore btree root */
short if_broot_bytes; /* bytes allocated for root */
unsigned char if_flags; /* per-fork flags */
- unsigned char if_ext_max; /* max # of extent records */
union {
xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
xfs_ext_irec_t *if_ext_irec; /* irec map file exts */
@@ -206,12 +205,12 @@ typedef struct xfs_icdinode {
((w) == XFS_DATA_FORK ? \
((ip)->i_d.di_nextents = (n)) : \
((ip)->i_d.di_anextents = (n)))
-
+#define XFS_IFORK_MAXEXT(ip, w) \
+ (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
#ifdef __KERNEL__
-struct bhv_desc;
struct xfs_buf;
struct xfs_bmap_free;
struct xfs_bmbt_irec;
@@ -220,12 +219,6 @@ struct xfs_mount;
struct xfs_trans;
struct xfs_dquot;
-typedef struct dm_attrs_s {
- __uint32_t da_dmevmask; /* DMIG event mask */
- __uint16_t da_dmstate; /* DMIG state info */
- __uint16_t da_pad; /* DMIG extra padding */
-} dm_attrs_t;
-
typedef struct xfs_inode {
/* Inode linking and identification information. */
struct xfs_mount *i_mount; /* fs mount struct ptr */
@@ -244,27 +237,19 @@ typedef struct xfs_inode {
struct xfs_inode_log_item *i_itemp; /* logging information */
mrlock_t i_lock; /* inode lock */
mrlock_t i_iolock; /* inode IO lock */
- struct completion i_flush; /* inode flush completion q */
atomic_t i_pincount; /* inode pin count */
- wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */
spinlock_t i_flags_lock; /* inode i_flags lock */
/* Miscellaneous state. */
- unsigned short i_flags; /* see defined flags below */
+ unsigned long i_flags; /* see defined flags below */
unsigned char i_update_core; /* timestamps/size is dirty */
unsigned int i_delayed_blks; /* count of delay alloc blks */
xfs_icdinode_t i_d; /* most of ondisk inode */
- xfs_fsize_t i_size; /* in-memory size */
- xfs_fsize_t i_new_size; /* size when write completes */
-
/* VFS inode */
struct inode i_vnode; /* embedded VFS inode */
} xfs_inode_t;
-#define XFS_ISIZE(ip) S_ISREG((ip)->i_d.di_mode) ? \
- (ip)->i_size : (ip)->i_d.di_size;
-
/* Convert from vfs inode to xfs inode */
static inline struct xfs_inode *XFS_I(struct inode *inode)
{
@@ -278,6 +263,18 @@ static inline struct inode *VFS_I(struct xfs_inode *ip)
}
/*
+ * For regular files we only update the on-disk filesize when actually
+ * writing data back to disk. Until then only the copy in the VFS inode
+ * is uptodate.
+ */
+static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip)
+{
+ if (S_ISREG(ip->i_d.di_mode))
+ return i_size_read(VFS_I(ip));
+ return ip->i_d.di_size;
+}
+
+/*
* i_flags helper functions
*/
static inline void
@@ -331,6 +328,19 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
return ret;
}
+static inline int
+xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags)
+{
+ int ret;
+
+ spin_lock(&ip->i_flags_lock);
+ ret = ip->i_flags & flags;
+ if (!ret)
+ ip->i_flags |= flags;
+ spin_unlock(&ip->i_flags_lock);
+ return ret;
+}
+
/*
* Project quota id helpers (previously projid was 16bit only
* and using two 16bit values to hold new 32bit projid was chosen
@@ -351,35 +361,19 @@ xfs_set_projid(struct xfs_inode *ip,
}
/*
- * Manage the i_flush queue embedded in the inode. This completion
- * queue synchronizes processes attempting to flush the in-core
- * inode back to disk.
- */
-static inline void xfs_iflock(xfs_inode_t *ip)
-{
- wait_for_completion(&ip->i_flush);
-}
-
-static inline int xfs_iflock_nowait(xfs_inode_t *ip)
-{
- return try_wait_for_completion(&ip->i_flush);
-}
-
-static inline void xfs_ifunlock(xfs_inode_t *ip)
-{
- complete(&ip->i_flush);
-}
-
-/*
* In-core inode flags.
*/
-#define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */
-#define XFS_ISTALE 0x0002 /* inode has been staled */
-#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */
-#define XFS_INEW 0x0008 /* inode has just been allocated */
-#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */
-#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */
-#define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */
+#define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */
+#define XFS_ISTALE (1 << 1) /* inode has been staled */
+#define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */
+#define XFS_INEW (1 << 3) /* inode has just been allocated */
+#define XFS_IFILESTREAM (1 << 4) /* inode is in a filestream dir. */
+#define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */
+#define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */
+#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */
+#define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT)
+#define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */
+#define XFS_IPINNED (1 << __XFS_IPINNED_BIT)
/*
* Per-lifetime flags need to be reset when re-using a reclaimable inode during
@@ -392,6 +386,34 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
XFS_IFILESTREAM);
/*
+ * Synchronize processes attempting to flush the in-core inode back to disk.
+ */
+
+extern void __xfs_iflock(struct xfs_inode *ip);
+
+static inline int xfs_iflock_nowait(struct xfs_inode *ip)
+{
+ return !xfs_iflags_test_and_set(ip, XFS_IFLOCK);
+}
+
+static inline void xfs_iflock(struct xfs_inode *ip)
+{
+ if (!xfs_iflock_nowait(ip))
+ __xfs_iflock(ip);
+}
+
+static inline void xfs_ifunlock(struct xfs_inode *ip)
+{
+ xfs_iflags_clear(ip, XFS_IFLOCK);
+ wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT);
+}
+
+static inline int xfs_isiflocked(struct xfs_inode *ip)
+{
+ return xfs_iflags_test(ip, XFS_IFLOCK);
+}
+
+/*
* Flags for inode locking.
* Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield)
* 1<<16 - 1<<32-1 -- lockdep annotation (integers)
@@ -491,8 +513,6 @@ int xfs_ifree(struct xfs_trans *, xfs_inode_t *,
struct xfs_bmap_free *);
int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
int, xfs_fsize_t);
-int xfs_itruncate_data(struct xfs_trans **, struct xfs_inode *,
- xfs_fsize_t);
int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
void xfs_iext_realloc(xfs_inode_t *, int, int);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index cfd6c7f8cc3..91d71dcd485 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -79,8 +79,6 @@ xfs_inode_item_size(
break;
case XFS_DINODE_FMT_BTREE:
- ASSERT(ip->i_df.if_ext_max ==
- XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
iip->ili_format.ilf_fields &=
~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
XFS_ILOG_DEV | XFS_ILOG_UUID);
@@ -557,7 +555,7 @@ xfs_inode_item_unpin(
trace_xfs_inode_unpin(ip, _RET_IP_);
ASSERT(atomic_read(&ip->i_pincount) > 0);
if (atomic_dec_and_test(&ip->i_pincount))
- wake_up(&ip->i_ipin_wait);
+ wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
}
/*
@@ -719,7 +717,7 @@ xfs_inode_item_pushbuf(
* If a flush is not in progress anymore, chances are that the
* inode was taken off the AIL. So, just get out.
*/
- if (completion_done(&ip->i_flush) ||
+ if (!xfs_isiflocked(ip) ||
!(lip->li_flags & XFS_LI_IN_AIL)) {
xfs_iunlock(ip, XFS_ILOCK_SHARED);
return true;
@@ -752,7 +750,7 @@ xfs_inode_item_push(
struct xfs_inode *ip = iip->ili_inode;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
- ASSERT(!completion_done(&ip->i_flush));
+ ASSERT(xfs_isiflocked(ip));
/*
* Since we were able to lock the inode's flush lock and
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 9afa282aa93..246c7d57c6f 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -57,26 +57,26 @@ xfs_iomap_eof_align_last_fsb(
xfs_fileoff_t *last_fsb)
{
xfs_fileoff_t new_last_fsb = 0;
- xfs_extlen_t align;
+ xfs_extlen_t align = 0;
int eof, error;
- if (XFS_IS_REALTIME_INODE(ip))
- ;
- /*
- * If mounted with the "-o swalloc" option, roundup the allocation
- * request to a stripe width boundary if the file size is >=
- * stripe width and we are allocating past the allocation eof.
- */
- else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) &&
- (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_swidth)))
- new_last_fsb = roundup_64(*last_fsb, mp->m_swidth);
- /*
- * Roundup the allocation request to a stripe unit (m_dalign) boundary
- * if the file size is >= stripe unit size, and we are allocating past
- * the allocation eof.
- */
- else if (mp->m_dalign && (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_dalign)))
- new_last_fsb = roundup_64(*last_fsb, mp->m_dalign);
+ if (!XFS_IS_REALTIME_INODE(ip)) {
+ /*
+ * Round up the allocation request to a stripe unit
+ * (m_dalign) boundary if the file size is >= stripe unit
+ * size, and we are allocating past the allocation eof.
+ *
+ * If mounted with the "-o swalloc" option the alignment is
+ * increased from the strip unit size to the stripe width.
+ */
+ if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
+ align = mp->m_swidth;
+ else if (mp->m_dalign)
+ align = mp->m_dalign;
+
+ if (align && XFS_ISIZE(ip) >= XFS_FSB_TO_B(mp, align))
+ new_last_fsb = roundup_64(*last_fsb, align);
+ }
/*
* Always round up the allocation request to an extent boundary
@@ -154,7 +154,7 @@ xfs_iomap_write_direct(
offset_fsb = XFS_B_TO_FSBT(mp, offset);
last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
- if ((offset + count) > ip->i_size) {
+ if ((offset + count) > XFS_ISIZE(ip)) {
error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
if (error)
goto error_out;
@@ -211,7 +211,7 @@ xfs_iomap_write_direct(
xfs_trans_ijoin(tp, ip, 0);
bmapi_flag = 0;
- if (offset < ip->i_size || extsz)
+ if (offset < XFS_ISIZE(ip) || extsz)
bmapi_flag |= XFS_BMAPI_PREALLOC;
/*
@@ -286,7 +286,7 @@ xfs_iomap_eof_want_preallocate(
int found_delalloc = 0;
*prealloc = 0;
- if ((offset + count) <= ip->i_size)
+ if (offset + count <= XFS_ISIZE(ip))
return 0;
/*
@@ -340,7 +340,7 @@ xfs_iomap_prealloc_size(
* if we pass in alloc_blocks = 0. Hence the "+ 1" to
* ensure we always pass in a non-zero value.
*/
- alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1;
+ alloc_blocks = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)) + 1;
alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
rounddown_pow_of_two(alloc_blocks));
@@ -564,7 +564,7 @@ xfs_iomap_write_allocate(
* back....
*/
nimaps = 1;
- end_fsb = XFS_B_TO_FSB(mp, ip->i_size);
+ end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
error = xfs_bmap_last_offset(NULL, ip, &last_block,
XFS_DATA_FORK);
if (error)
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index f9babd17922..ab302539e5b 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -750,6 +750,7 @@ xfs_setattr_size(
struct xfs_mount *mp = ip->i_mount;
struct inode *inode = VFS_I(ip);
int mask = iattr->ia_valid;
+ xfs_off_t oldsize, newsize;
struct xfs_trans *tp;
int error;
uint lock_flags;
@@ -777,11 +778,13 @@ xfs_setattr_size(
lock_flags |= XFS_IOLOCK_EXCL;
xfs_ilock(ip, lock_flags);
+ oldsize = inode->i_size;
+ newsize = iattr->ia_size;
+
/*
* Short circuit the truncate case for zero length files.
*/
- if (iattr->ia_size == 0 &&
- ip->i_size == 0 && ip->i_d.di_nextents == 0) {
+ if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) {
if (!(mask & (ATTR_CTIME|ATTR_MTIME)))
goto out_unlock;
@@ -807,14 +810,14 @@ xfs_setattr_size(
* the inode to the transaction, because the inode cannot be unlocked
* once it is a part of the transaction.
*/
- if (iattr->ia_size > ip->i_size) {
+ if (newsize > oldsize) {
/*
* Do the first part of growing a file: zero any data in the
* last block that is beyond the old EOF. We need to do this
* before the inode is joined to the transaction to modify
* i_size.
*/
- error = xfs_zero_eof(ip, iattr->ia_size, ip->i_size);
+ error = xfs_zero_eof(ip, newsize, oldsize);
if (error)
goto out_unlock;
}
@@ -833,8 +836,8 @@ xfs_setattr_size(
* here and prevents waiting for other data not within the range we
* care about here.
*/
- if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) {
- error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 0,
+ if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
+ error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0,
FI_NONE);
if (error)
goto out_unlock;
@@ -845,8 +848,7 @@ xfs_setattr_size(
*/
inode_dio_wait(inode);
- error = -block_truncate_page(inode->i_mapping, iattr->ia_size,
- xfs_get_blocks);
+ error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
if (error)
goto out_unlock;
@@ -857,7 +859,7 @@ xfs_setattr_size(
if (error)
goto out_trans_cancel;
- truncate_setsize(inode, iattr->ia_size);
+ truncate_setsize(inode, newsize);
commit_flags = XFS_TRANS_RELEASE_LOG_RES;
lock_flags |= XFS_ILOCK_EXCL;
@@ -876,19 +878,29 @@ xfs_setattr_size(
* these flags set. For all other operations the VFS set these flags
* explicitly if it wants a timestamp update.
*/
- if (iattr->ia_size != ip->i_size &&
- (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
+ if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
iattr->ia_ctime = iattr->ia_mtime =
current_fs_time(inode->i_sb);
mask |= ATTR_CTIME | ATTR_MTIME;
}
- if (iattr->ia_size > ip->i_size) {
- ip->i_d.di_size = iattr->ia_size;
- ip->i_size = iattr->ia_size;
- } else if (iattr->ia_size <= ip->i_size ||
- (iattr->ia_size == 0 && ip->i_d.di_nextents)) {
- error = xfs_itruncate_data(&tp, ip, iattr->ia_size);
+ /*
+ * The first thing we do is set the size to new_size permanently on
+ * disk. This way we don't have to worry about anyone ever being able
+ * to look at the data being freed even in the face of a crash.
+ * What we're getting around here is the case where we free a block, it
+ * is allocated to another file, it is written to, and then we crash.
+ * If the new data gets written to the file but the log buffers
+ * containing the free and reallocation don't, then we'd end up with
+ * garbage in the blocks being freed. As long as we make the new size
+ * permanent before actually freeing any blocks it doesn't matter if
+ * they get written to.
+ */
+ ip->i_d.di_size = newsize;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ if (newsize <= oldsize) {
+ error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize);
if (error)
goto out_trans_abort;
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 541a508adea..0ed9ee77937 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1489,7 +1489,7 @@ xlog_recover_add_to_cont_trans(
old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
old_len = item->ri_buf[item->ri_cnt-1].i_len;
- ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0u);
+ ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP);
memcpy(&ptr[old_len], dp, len); /* d, s, l */
item->ri_buf[item->ri_cnt-1].i_len += len;
item->ri_buf[item->ri_cnt-1].i_addr = ptr;
@@ -1981,7 +1981,7 @@ xfs_qm_dqcheck(
if (!errs && ddq->d_id) {
if (ddq->d_blk_softlimit &&
- be64_to_cpu(ddq->d_bcount) >=
+ be64_to_cpu(ddq->d_bcount) >
be64_to_cpu(ddq->d_blk_softlimit)) {
if (!ddq->d_btimer) {
if (flags & XFS_QMOPT_DOWARN)
@@ -1992,7 +1992,7 @@ xfs_qm_dqcheck(
}
}
if (ddq->d_ino_softlimit &&
- be64_to_cpu(ddq->d_icount) >=
+ be64_to_cpu(ddq->d_icount) >
be64_to_cpu(ddq->d_ino_softlimit)) {
if (!ddq->d_itimer) {
if (flags & XFS_QMOPT_DOWARN)
@@ -2003,7 +2003,7 @@ xfs_qm_dqcheck(
}
}
if (ddq->d_rtb_softlimit &&
- be64_to_cpu(ddq->d_rtbcount) >=
+ be64_to_cpu(ddq->d_rtbcount) >
be64_to_cpu(ddq->d_rtb_softlimit)) {
if (!ddq->d_rtbtimer) {
if (flags & XFS_QMOPT_DOWARN)
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 671f37eae1c..c436def733b 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -50,7 +50,6 @@
*/
struct mutex xfs_Gqm_lock;
struct xfs_qm *xfs_Gqm;
-uint ndquot;
kmem_zone_t *qm_dqzone;
kmem_zone_t *qm_dqtrxzone;
@@ -93,7 +92,6 @@ xfs_Gqm_init(void)
goto out_free_udqhash;
hsize /= sizeof(xfs_dqhash_t);
- ndquot = hsize << 8;
xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP);
xqm->qm_dqhashmask = hsize - 1;
@@ -137,7 +135,6 @@ xfs_Gqm_init(void)
xqm->qm_dqtrxzone = qm_dqtrxzone;
atomic_set(&xqm->qm_totaldquots, 0);
- xqm->qm_dqfree_ratio = XFS_QM_DQFREE_RATIO;
xqm->qm_nrefs = 0;
return xqm;
@@ -1600,216 +1597,150 @@ xfs_qm_init_quotainos(
return 0;
}
+STATIC void
+xfs_qm_dqfree_one(
+ struct xfs_dquot *dqp)
+{
+ struct xfs_mount *mp = dqp->q_mount;
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ mutex_lock(&dqp->q_hash->qh_lock);
+ list_del_init(&dqp->q_hashlist);
+ dqp->q_hash->qh_version++;
+ mutex_unlock(&dqp->q_hash->qh_lock);
-/*
- * Pop the least recently used dquot off the freelist and recycle it.
- */
-STATIC struct xfs_dquot *
-xfs_qm_dqreclaim_one(void)
+ mutex_lock(&qi->qi_dqlist_lock);
+ list_del_init(&dqp->q_mplist);
+ qi->qi_dquots--;
+ qi->qi_dqreclaims++;
+ mutex_unlock(&qi->qi_dqlist_lock);
+
+ xfs_qm_dqdestroy(dqp);
+}
+
+STATIC void
+xfs_qm_dqreclaim_one(
+ struct xfs_dquot *dqp,
+ struct list_head *dispose_list)
{
- struct xfs_dquot *dqp;
- int restarts = 0;
+ struct xfs_mount *mp = dqp->q_mount;
+ int error;
- mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
-restart:
- list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) {
- struct xfs_mount *mp = dqp->q_mount;
+ if (!xfs_dqlock_nowait(dqp))
+ goto out_busy;
- if (!xfs_dqlock_nowait(dqp))
- continue;
+ /*
+ * This dquot has acquired a reference in the meantime remove it from
+ * the freelist and try again.
+ */
+ if (dqp->q_nrefs) {
+ xfs_dqunlock(dqp);
- /*
- * This dquot has already been grabbed by dqlookup.
- * Remove it from the freelist and try again.
- */
- if (dqp->q_nrefs) {
- trace_xfs_dqreclaim_want(dqp);
- XQM_STATS_INC(xqmstats.xs_qm_dqwants);
-
- list_del_init(&dqp->q_freelist);
- xfs_Gqm->qm_dqfrlist_cnt--;
- restarts++;
- goto dqunlock;
- }
+ trace_xfs_dqreclaim_want(dqp);
+ XQM_STATS_INC(xqmstats.xs_qm_dqwants);
- ASSERT(dqp->q_hash);
- ASSERT(!list_empty(&dqp->q_mplist));
+ list_del_init(&dqp->q_freelist);
+ xfs_Gqm->qm_dqfrlist_cnt--;
+ return;
+ }
- /*
- * Try to grab the flush lock. If this dquot is in the process
- * of getting flushed to disk, we don't want to reclaim it.
- */
- if (!xfs_dqflock_nowait(dqp))
- goto dqunlock;
+ ASSERT(dqp->q_hash);
+ ASSERT(!list_empty(&dqp->q_mplist));
- /*
- * We have the flush lock so we know that this is not in the
- * process of being flushed. So, if this is dirty, flush it
- * DELWRI so that we don't get a freelist infested with
- * dirty dquots.
- */
- if (XFS_DQ_IS_DIRTY(dqp)) {
- int error;
+ /*
+ * Try to grab the flush lock. If this dquot is in the process of
+ * getting flushed to disk, we don't want to reclaim it.
+ */
+ if (!xfs_dqflock_nowait(dqp))
+ goto out_busy;
- trace_xfs_dqreclaim_dirty(dqp);
+ /*
+ * We have the flush lock so we know that this is not in the
+ * process of being flushed. So, if this is dirty, flush it
+ * DELWRI so that we don't get a freelist infested with
+ * dirty dquots.
+ */
+ if (XFS_DQ_IS_DIRTY(dqp)) {
+ trace_xfs_dqreclaim_dirty(dqp);
- /*
- * We flush it delayed write, so don't bother
- * releasing the freelist lock.
- */
- error = xfs_qm_dqflush(dqp, SYNC_TRYLOCK);
- if (error) {
- xfs_warn(mp, "%s: dquot %p flush failed",
- __func__, dqp);
- }
- goto dqunlock;
+ /*
+ * We flush it delayed write, so don't bother releasing the
+ * freelist lock.
+ */
+ error = xfs_qm_dqflush(dqp, 0);
+ if (error) {
+ xfs_warn(mp, "%s: dquot %p flush failed",
+ __func__, dqp);
}
- xfs_dqfunlock(dqp);
/*
- * Prevent lookup now that we are going to reclaim the dquot.
- * Once XFS_DQ_FREEING is set lookup won't touch the dquot,
- * thus we can drop the lock now.
+ * Give the dquot another try on the freelist, as the
+ * flushing will take some time.
*/
- dqp->dq_flags |= XFS_DQ_FREEING;
- xfs_dqunlock(dqp);
-
- mutex_lock(&dqp->q_hash->qh_lock);
- list_del_init(&dqp->q_hashlist);
- dqp->q_hash->qh_version++;
- mutex_unlock(&dqp->q_hash->qh_lock);
-
- mutex_lock(&mp->m_quotainfo->qi_dqlist_lock);
- list_del_init(&dqp->q_mplist);
- mp->m_quotainfo->qi_dquots--;
- mp->m_quotainfo->qi_dqreclaims++;
- mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
+ goto out_busy;
+ }
+ xfs_dqfunlock(dqp);
- ASSERT(dqp->q_nrefs == 0);
- list_del_init(&dqp->q_freelist);
- xfs_Gqm->qm_dqfrlist_cnt--;
+ /*
+ * Prevent lookups now that we are past the point of no return.
+ */
+ dqp->dq_flags |= XFS_DQ_FREEING;
+ xfs_dqunlock(dqp);
- mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
- return dqp;
-dqunlock:
- xfs_dqunlock(dqp);
- if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
- break;
- goto restart;
- }
+ ASSERT(dqp->q_nrefs == 0);
+ list_move_tail(&dqp->q_freelist, dispose_list);
+ xfs_Gqm->qm_dqfrlist_cnt--;
- mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
- return NULL;
-}
+ trace_xfs_dqreclaim_done(dqp);
+ XQM_STATS_INC(xqmstats.xs_qm_dqreclaims);
+ return;
-/*
- * Traverse the freelist of dquots and attempt to reclaim a maximum of
- * 'howmany' dquots. This operation races with dqlookup(), and attempts to
- * favor the lookup function ...
- */
-STATIC int
-xfs_qm_shake_freelist(
- int howmany)
-{
- int nreclaimed = 0;
- xfs_dquot_t *dqp;
+out_busy:
+ xfs_dqunlock(dqp);
- if (howmany <= 0)
- return 0;
+ /*
+ * Move the dquot to the tail of the list so that we don't spin on it.
+ */
+ list_move_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist);
- while (nreclaimed < howmany) {
- dqp = xfs_qm_dqreclaim_one();
- if (!dqp)
- return nreclaimed;
- xfs_qm_dqdestroy(dqp);
- nreclaimed++;
- }
- return nreclaimed;
+ trace_xfs_dqreclaim_busy(dqp);
+ XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses);
}
-/*
- * The kmem_shake interface is invoked when memory is running low.
- */
-/* ARGSUSED */
STATIC int
xfs_qm_shake(
- struct shrinker *shrink,
- struct shrink_control *sc)
+ struct shrinker *shrink,
+ struct shrink_control *sc)
{
- int ndqused, nfree, n;
- gfp_t gfp_mask = sc->gfp_mask;
-
- if (!kmem_shake_allow(gfp_mask))
- return 0;
- if (!xfs_Gqm)
- return 0;
-
- nfree = xfs_Gqm->qm_dqfrlist_cnt; /* free dquots */
- /* incore dquots in all f/s's */
- ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree;
-
- ASSERT(ndqused >= 0);
+ int nr_to_scan = sc->nr_to_scan;
+ LIST_HEAD (dispose_list);
+ struct xfs_dquot *dqp;
- if (nfree <= ndqused && nfree < ndquot)
+ if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
return 0;
+ if (!nr_to_scan)
+ goto out;
- ndqused *= xfs_Gqm->qm_dqfree_ratio; /* target # of free dquots */
- n = nfree - ndqused - ndquot; /* # over target */
-
- return xfs_qm_shake_freelist(MAX(nfree, n));
-}
-
-
-/*------------------------------------------------------------------*/
-
-/*
- * Return a new incore dquot. Depending on the number of
- * dquots in the system, we either allocate a new one on the kernel heap,
- * or reclaim a free one.
- * Return value is B_TRUE if we allocated a new dquot, B_FALSE if we managed
- * to reclaim an existing one from the freelist.
- */
-boolean_t
-xfs_qm_dqalloc_incore(
- xfs_dquot_t **O_dqpp)
-{
- xfs_dquot_t *dqp;
-
- /*
- * Check against high water mark to see if we want to pop
- * a nincompoop dquot off the freelist.
- */
- if (atomic_read(&xfs_Gqm->qm_totaldquots) >= ndquot) {
- /*
- * Try to recycle a dquot from the freelist.
- */
- if ((dqp = xfs_qm_dqreclaim_one())) {
- XQM_STATS_INC(xqmstats.xs_qm_dqreclaims);
- /*
- * Just zero the core here. The rest will get
- * reinitialized by caller. XXX we shouldn't even
- * do this zero ...
- */
- memset(&dqp->q_core, 0, sizeof(dqp->q_core));
- *O_dqpp = dqp;
- return B_FALSE;
- }
- XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses);
+ mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
+ while (!list_empty(&xfs_Gqm->qm_dqfrlist)) {
+ if (nr_to_scan-- <= 0)
+ break;
+ dqp = list_first_entry(&xfs_Gqm->qm_dqfrlist, struct xfs_dquot,
+ q_freelist);
+ xfs_qm_dqreclaim_one(dqp, &dispose_list);
}
+ mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
- /*
- * Allocate a brand new dquot on the kernel heap and return it
- * to the caller to initialize.
- */
- ASSERT(xfs_Gqm->qm_dqzone != NULL);
- *O_dqpp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP);
- atomic_inc(&xfs_Gqm->qm_totaldquots);
-
- return B_TRUE;
+ while (!list_empty(&dispose_list)) {
+ dqp = list_first_entry(&dispose_list, struct xfs_dquot,
+ q_freelist);
+ list_del_init(&dqp->q_freelist);
+ xfs_qm_dqfree_one(dqp);
+ }
+out:
+ return (xfs_Gqm->qm_dqfrlist_cnt / 100) * sysctl_vfs_cache_pressure;
}
-
/*
* Start a transaction and write the incore superblock changes to
* disk. flags parameter indicates which fields have changed.
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 9b4f3adefbc..9a9b997e1a0 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -26,24 +26,12 @@
struct xfs_qm;
struct xfs_inode;
-extern uint ndquot;
extern struct mutex xfs_Gqm_lock;
extern struct xfs_qm *xfs_Gqm;
extern kmem_zone_t *qm_dqzone;
extern kmem_zone_t *qm_dqtrxzone;
/*
- * Ditto, for xfs_qm_dqreclaim_one.
- */
-#define XFS_QM_RECLAIM_MAX_RESTARTS 4
-
-/*
- * Ideal ratio of free to in use dquots. Quota manager makes an attempt
- * to keep this balance.
- */
-#define XFS_QM_DQFREE_RATIO 2
-
-/*
* Dquot hashtable constants/threshold values.
*/
#define XFS_QM_HASHSIZE_LOW (PAGE_SIZE / sizeof(xfs_dqhash_t))
@@ -74,7 +62,6 @@ typedef struct xfs_qm {
int qm_dqfrlist_cnt;
atomic_t qm_totaldquots; /* total incore dquots */
uint qm_nrefs; /* file systems with quota on */
- int qm_dqfree_ratio;/* ratio of free to inuse dquots */
kmem_zone_t *qm_dqzone; /* dquot mem-alloc zone */
kmem_zone_t *qm_dqtrxzone; /* t_dqinfo of transactions */
} xfs_qm_t;
@@ -143,7 +130,6 @@ extern int xfs_qm_quotacheck(xfs_mount_t *);
extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
/* dquot stuff */
-extern boolean_t xfs_qm_dqalloc_incore(xfs_dquot_t **);
extern int xfs_qm_dqpurge_all(xfs_mount_t *, uint);
extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
diff --git a/fs/xfs/xfs_qm_stats.c b/fs/xfs/xfs_qm_stats.c
index 8671a0b3264..5729ba57087 100644
--- a/fs/xfs/xfs_qm_stats.c
+++ b/fs/xfs/xfs_qm_stats.c
@@ -42,9 +42,9 @@ static int xqm_proc_show(struct seq_file *m, void *v)
{
/* maximum; incore; ratio free to inuse; freelist */
seq_printf(m, "%d\t%d\t%d\t%u\n",
- ndquot,
+ 0,
xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
- xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
+ 0,
xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0);
return 0;
}
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 5cc3dde1bc9..711a86e39ff 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -31,6 +31,7 @@
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
+#include "xfs_inode_item.h"
#include "xfs_itable.h"
#include "xfs_bmap.h"
#include "xfs_rtalloc.h"
@@ -263,13 +264,18 @@ xfs_qm_scall_trunc_qfile(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- error = xfs_itruncate_data(&tp, ip, 0);
+ ip->i_d.di_size = 0;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
if (error) {
xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
XFS_TRANS_ABORT);
goto out_unlock;
}
+ ASSERT(ip->i_d.di_nextents == 0);
+
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
@@ -807,11 +813,11 @@ xfs_qm_export_dquot(
(XFS_IS_OQUOTA_ENFORCED(mp) &&
(dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) &&
dst->d_id != 0) {
- if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) &&
+ if (((int) dst->d_bcount > (int) dst->d_blk_softlimit) &&
(dst->d_blk_softlimit > 0)) {
ASSERT(dst->d_btimer != 0);
}
- if (((int) dst->d_icount >= (int) dst->d_ino_softlimit) &&
+ if (((int) dst->d_icount > (int) dst->d_ino_softlimit) &&
(dst->d_ino_softlimit > 0)) {
ASSERT(dst->d_itimer != 0);
}
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 281961c1d81..ee5b695c99a 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -828,14 +828,6 @@ xfs_fs_inode_init_once(
/* xfs inode */
atomic_set(&ip->i_pincount, 0);
spin_lock_init(&ip->i_flags_lock);
- init_waitqueue_head(&ip->i_ipin_wait);
- /*
- * Because we want to use a counting completion, complete
- * the flush completion once to allow a single access to
- * the flush completion without blocking.
- */
- init_completion(&ip->i_flush);
- complete(&ip->i_flush);
mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
"xfsino", ip->i_ino);
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 72c01a1c16e..40b75eecd2b 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -707,14 +707,13 @@ xfs_reclaim_inode_grab(
return 1;
/*
- * do some unlocked checks first to avoid unnecessary lock traffic.
- * The first is a flush lock check, the second is a already in reclaim
- * check. Only do these checks if we are not going to block on locks.
+ * If we are asked for non-blocking operation, do unlocked checks to
+ * see if the inode already is being flushed or in reclaim to avoid
+ * lock traffic.
*/
if ((flags & SYNC_TRYLOCK) &&
- (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) {
+ __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
return 1;
- }
/*
* The radix tree lock here protects a thread in xfs_iget from racing
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index a9d5b1e06ef..bb134a81993 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -733,11 +733,10 @@ DEFINE_EVENT(xfs_dquot_class, name, \
DEFINE_DQUOT_EVENT(xfs_dqadjust);
DEFINE_DQUOT_EVENT(xfs_dqreclaim_want);
DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty);
-DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink);
+DEFINE_DQUOT_EVENT(xfs_dqreclaim_busy);
+DEFINE_DQUOT_EVENT(xfs_dqreclaim_done);
DEFINE_DQUOT_EVENT(xfs_dqattach_found);
DEFINE_DQUOT_EVENT(xfs_dqattach_get);
-DEFINE_DQUOT_EVENT(xfs_dqinit);
-DEFINE_DQUOT_EVENT(xfs_dqreuse);
DEFINE_DQUOT_EVENT(xfs_dqalloc);
DEFINE_DQUOT_EVENT(xfs_dqtobp_read);
DEFINE_DQUOT_EVENT(xfs_dqread);
@@ -891,7 +890,6 @@ DECLARE_EVENT_CLASS(xfs_file_class,
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(xfs_fsize_t, size)
- __field(xfs_fsize_t, new_size)
__field(loff_t, offset)
__field(size_t, count)
__field(int, flags)
@@ -900,17 +898,15 @@ DECLARE_EVENT_CLASS(xfs_file_class,
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
__entry->size = ip->i_d.di_size;
- __entry->new_size = ip->i_new_size;
__entry->offset = offset;
__entry->count = count;
__entry->flags = flags;
),
- TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
+ TP_printk("dev %d:%d ino 0x%llx size 0x%llx "
"offset 0x%llx count 0x%zx ioflags %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->size,
- __entry->new_size,
__entry->offset,
__entry->count,
__print_flags(__entry->flags, "|", XFS_IO_FLAGS))
@@ -978,7 +974,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(loff_t, size)
- __field(loff_t, new_size)
__field(loff_t, offset)
__field(size_t, count)
__field(int, type)
@@ -990,7 +985,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
__entry->size = ip->i_d.di_size;
- __entry->new_size = ip->i_new_size;
__entry->offset = offset;
__entry->count = count;
__entry->type = type;
@@ -998,13 +992,11 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
__entry->startblock = irec ? irec->br_startblock : 0;
__entry->blockcount = irec ? irec->br_blockcount : 0;
),
- TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
- "offset 0x%llx count %zd type %s "
- "startoff 0x%llx startblock %lld blockcount 0x%llx",
+ TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd "
+ "type %s startoff 0x%llx startblock %lld blockcount 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->size,
- __entry->new_size,
__entry->offset,
__entry->count,
__print_symbolic(__entry->type, XFS_IO_TYPES),
@@ -1031,26 +1023,23 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class,
__field(xfs_ino_t, ino)
__field(loff_t, isize)
__field(loff_t, disize)
- __field(loff_t, new_size)
__field(loff_t, offset)
__field(size_t, count)
),
TP_fast_assign(
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
- __entry->isize = ip->i_size;
+ __entry->isize = VFS_I(ip)->i_size;
__entry->disize = ip->i_d.di_size;
- __entry->new_size = ip->i_new_size;
__entry->offset = offset;
__entry->count = count;
),
- TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx new_size 0x%llx "
+ TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx "
"offset 0x%llx count %zd",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->isize,
__entry->disize,
- __entry->new_size,
__entry->offset,
__entry->count)
);
@@ -1090,8 +1079,8 @@ DECLARE_EVENT_CLASS(xfs_itrunc_class,
DEFINE_EVENT(xfs_itrunc_class, name, \
TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \
TP_ARGS(ip, new_size))
-DEFINE_ITRUNC_EVENT(xfs_itruncate_data_start);
-DEFINE_ITRUNC_EVENT(xfs_itruncate_data_end);
+DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_start);
+DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_end);
TRACE_EVENT(xfs_pagecache_inval,
TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish),
@@ -1568,7 +1557,6 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
__field(xfs_ino_t, ino)
__field(int, format)
__field(int, nex)
- __field(int, max_nex)
__field(int, broot_size)
__field(int, fork_off)
),
@@ -1578,18 +1566,16 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
__entry->ino = ip->i_ino;
__entry->format = ip->i_d.di_format;
__entry->nex = ip->i_d.di_nextents;
- __entry->max_nex = ip->i_df.if_ext_max;
__entry->broot_size = ip->i_df.if_broot_bytes;
__entry->fork_off = XFS_IFORK_BOFF(ip);
),
TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
- "Max in-fork extents %d, broot size %d, fork offset %d",
+ "broot size %d, fork offset %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__print_symbolic(__entry->which, XFS_SWAPEXT_INODES),
__print_symbolic(__entry->format, XFS_INODE_FORMAT_STR),
__entry->nex,
- __entry->max_nex,
__entry->broot_size,
__entry->fork_off)
)
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 329b06aba1c..7adcdf15ae0 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1151,8 +1151,8 @@ xfs_trans_add_item(
{
struct xfs_log_item_desc *lidp;
- ASSERT(lip->li_mountp = tp->t_mountp);
- ASSERT(lip->li_ailp = tp->t_mountp->m_ail);
+ ASSERT(lip->li_mountp == tp->t_mountp);
+ ASSERT(lip->li_ailp == tp->t_mountp->m_ail);
lidp = kmem_zone_zalloc(xfs_log_item_desc_zone, KM_SLEEP | KM_NOFS);
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 4d00ee67792..c4ba366d24e 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -649,12 +649,12 @@ xfs_trans_dqresv(
* nblks.
*/
if (hardlimit > 0ULL &&
- hardlimit <= nblks + *resbcountp) {
+ hardlimit < nblks + *resbcountp) {
xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN);
goto error_return;
}
if (softlimit > 0ULL &&
- softlimit <= nblks + *resbcountp) {
+ softlimit < nblks + *resbcountp) {
if ((timer != 0 && get_seconds() > timer) ||
(warns != 0 && warns >= warnlimit)) {
xfs_quota_warn(mp, dqp,
@@ -677,11 +677,13 @@ xfs_trans_dqresv(
if (!softlimit)
softlimit = q->qi_isoftlimit;
- if (hardlimit > 0ULL && count >= hardlimit) {
+ if (hardlimit > 0ULL &&
+ hardlimit < ninos + count) {
xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN);
goto error_return;
}
- if (softlimit > 0ULL && count >= softlimit) {
+ if (softlimit > 0ULL &&
+ softlimit < ninos + count) {
if ((timer != 0 && get_seconds() > timer) ||
(warns != 0 && warns >= warnlimit)) {
xfs_quota_warn(mp, dqp,
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index f2fea868d4d..ebdb88840a4 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -131,7 +131,8 @@ xfs_readlink(
__func__, (unsigned long long) ip->i_ino,
(long long) pathlen);
ASSERT(0);
- return XFS_ERROR(EFSCORRUPTED);
+ error = XFS_ERROR(EFSCORRUPTED);
+ goto out;
}
@@ -175,7 +176,7 @@ xfs_free_eofblocks(
* Figure out if there are any blocks beyond the end
* of the file. If not, then there is nothing to do.
*/
- end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
+ end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
if (last_fsb <= end_fsb)
return 0;
@@ -226,7 +227,14 @@ xfs_free_eofblocks(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- error = xfs_itruncate_data(&tp, ip, ip->i_size);
+ /*
+ * Do not update the on-disk file size. If we update the
+ * on-disk file size and then the system crashes before the
+ * contents of the file are flushed to disk then the files
+ * may be full of holes (ie NULL files bug).
+ */
+ error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
+ XFS_ISIZE(ip));
if (error) {
/*
* If we get an error at this point we simply don't
@@ -540,8 +548,8 @@ xfs_release(
return 0;
if ((S_ISREG(ip->i_d.di_mode) &&
- ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
- ip->i_delayed_blks > 0)) &&
+ (VFS_I(ip)->i_size > 0 ||
+ (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
(ip->i_df.if_flags & XFS_IFEXTENTS)) &&
(!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
@@ -618,7 +626,7 @@ xfs_inactive(
* only one with a reference to the inode.
*/
truncate = ((ip->i_d.di_nlink == 0) &&
- ((ip->i_d.di_size != 0) || (ip->i_size != 0) ||
+ ((ip->i_d.di_size != 0) || XFS_ISIZE(ip) != 0 ||
(ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
S_ISREG(ip->i_d.di_mode));
@@ -632,12 +640,12 @@ xfs_inactive(
if (ip->i_d.di_nlink != 0) {
if ((S_ISREG(ip->i_d.di_mode) &&
- ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
- ip->i_delayed_blks > 0)) &&
- (ip->i_df.if_flags & XFS_IFEXTENTS) &&
- (!(ip->i_d.di_flags &
+ (VFS_I(ip)->i_size > 0 ||
+ (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
+ (ip->i_df.if_flags & XFS_IFEXTENTS) &&
+ (!(ip->i_d.di_flags &
(XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
- (ip->i_delayed_blks != 0)))) {
+ ip->i_delayed_blks != 0))) {
error = xfs_free_eofblocks(mp, ip, 0);
if (error)
return VN_INACTIVE_CACHE;
@@ -670,13 +678,18 @@ xfs_inactive(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- error = xfs_itruncate_data(&tp, ip, 0);
+ ip->i_d.di_size = 0;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
if (error) {
xfs_trans_cancel(tp,
XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
return VN_INACTIVE_CACHE;
}
+
+ ASSERT(ip->i_d.di_nextents == 0);
} else if (S_ISLNK(ip->i_d.di_mode)) {
/*
@@ -1961,11 +1974,11 @@ xfs_zero_remaining_bytes(
* since nothing can read beyond eof. The space will
* be zeroed when the file is extended anyway.
*/
- if (startoff >= ip->i_size)
+ if (startoff >= XFS_ISIZE(ip))
return 0;
- if (endoff > ip->i_size)
- endoff = ip->i_size;
+ if (endoff > XFS_ISIZE(ip))
+ endoff = XFS_ISIZE(ip);
bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp,
@@ -2260,7 +2273,7 @@ xfs_change_file_space(
bf->l_start += offset;
break;
case 2: /*SEEK_END*/
- bf->l_start += ip->i_size;
+ bf->l_start += XFS_ISIZE(ip);
break;
default:
return XFS_ERROR(EINVAL);
@@ -2277,7 +2290,7 @@ xfs_change_file_space(
bf->l_whence = 0;
startoffset = bf->l_start;
- fsize = ip->i_size;
+ fsize = XFS_ISIZE(ip);
/*
* XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve