From ca01d6dd2d7a2652000307520777538740efc286 Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Tue, 28 Dec 2010 14:25:21 -0800
Subject: pstore: new filesystem interface to platform persistent storage

Some platforms have a small amount of non-volatile storage that
can be used to store information useful to diagnose the cause of
a system crash.  This is the generic part of a file system interface
that presents information from the crash as a series of files in
/dev/pstore.  Once the information has been seen, the underlying
storage is freed by deleting the files.

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 fs/Kconfig           |   1 +
 fs/Makefile          |   1 +
 fs/pstore/Kconfig    |  13 +++
 fs/pstore/Makefile   |   7 ++
 fs/pstore/inode.c    | 280 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/pstore/internal.h |   7 ++
 fs/pstore/platform.c | 202 +++++++++++++++++++++++++++++++++++++
 7 files changed, 511 insertions(+)
 create mode 100644 fs/pstore/Kconfig
 create mode 100644 fs/pstore/Makefile
 create mode 100644 fs/pstore/inode.c
 create mode 100644 fs/pstore/internal.h
 create mode 100644 fs/pstore/platform.c

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 771f457402d..2bbe47fec1e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -188,6 +188,7 @@ source "fs/omfs/Kconfig"
 source "fs/hpfs/Kconfig"
 source "fs/qnx4/Kconfig"
 source "fs/romfs/Kconfig"
+source "fs/pstore/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 source "fs/exofs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index a7f7cef0c0c..db71a5b21a4 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -121,3 +121,4 @@ obj-$(CONFIG_BTRFS_FS)		+= btrfs/
 obj-$(CONFIG_GFS2_FS)           += gfs2/
 obj-$(CONFIG_EXOFS_FS)          += exofs/
 obj-$(CONFIG_CEPH_FS)		+= ceph/
+obj-$(CONFIG_PSTORE)		+= pstore/
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
new file mode 100644
index 00000000000..867d0ac026c
--- /dev/null
+++ b/fs/pstore/Kconfig
@@ -0,0 +1,13 @@
+config PSTORE
+	bool "Persistent store support"
+	default n
+	help
+	   This option enables generic access to platform level
+	   persistent storage via "pstore" filesystem that can
+	   be mounted as /dev/pstore.  Only useful if you have
+	   a platform level driver that registers with pstore to
+	   provide the data, so you probably should just go say "Y"
+	   (or "M") to a platform specific persistent store driver
+	   (e.g. ACPI_APEI on X86) which will select this for you.
+	   If you don't have a platform persistent store driver,
+	   say N.
diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile
new file mode 100644
index 00000000000..760f4bce7d1
--- /dev/null
+++ b/fs/pstore/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the linux pstorefs routines.
+#
+
+obj-y += pstore.o
+
+pstore-objs += inode.o platform.o
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
new file mode 100644
index 00000000000..0e806aafe85
--- /dev/null
+++ b/fs/pstore/inode.c
@@ -0,0 +1,280 @@
+/*
+ * Persistent Storage - ramfs parts.
+ *
+ * Copyright (C) 2010 Intel Corporation <tony.luck@intel.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/fsnotify.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/mount.h>
+#include <linux/ramfs.h>
+#include <linux/sched.h>
+#include <linux/magic.h>
+#include <linux/pstore.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "internal.h"
+
+#define	PSTORE_NAMELEN	64
+
+struct pstore_private {
+	u64	id;
+	int	(*erase)(u64);
+};
+
+#define pstore_get_inode ramfs_get_inode
+
+/*
+ * Unlinking a file makes the platform driver erase the record from
+ * persistent store. i_private may still be NULL during pstore_mkfile().
+ */
+static int pstore_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct pstore_private *p = dentry->d_inode->i_private;
+
+	if (p)
+		p->erase(p->id);
+	kfree(p);
+	return simple_unlink(dir, dentry);
+}
+
+static const struct inode_operations pstore_dir_inode_operations = {
+	.lookup		= simple_lookup,
+	.unlink		= pstore_unlink,
+};
+
+static const struct super_operations pstore_ops = {
+	.statfs		= simple_statfs,
+	.drop_inode	= generic_delete_inode,
+	.show_options	= generic_show_options,
+};
+
+static struct super_block *pstore_sb;
+static struct vfsmount *pstore_mnt;
+
+int pstore_is_mounted(void)
+{
+	return pstore_mnt != NULL;
+}
+
+/*
+ * Set up a file structure as if we had opened this file and write our
+ * data to it. Returns non-zero only if all "size" bytes were written.
+ */
+static int pstore_writefile(struct inode *inode, struct dentry *dentry,
+	char *data, size_t size)
+{
+	struct file f;
+	ssize_t n;
+	mm_segment_t old_fs = get_fs();
+
+	memset(&f, 0, sizeof f);
+	f.f_mapping = inode->i_mapping;
+	f.f_path.dentry = dentry;
+	f.f_path.mnt = pstore_mnt;
+	f.f_op = inode->i_fop;
+	/* "data" is a kernel buffer, so lift the user-address check */
+	set_fs(KERNEL_DS);
+	n = do_sync_write(&f, data, size, &f.f_pos);
+	set_fs(old_fs);
+
+	fsnotify_modify(&f);
+
+	return n == size;
+}
+
+/*
+ * Make a regular file in the root directory of our file system.
+ * Load it up with "size" bytes of data from "data".
+ * Set the mtime & ctime to the date that this record was originally stored.
+ * Returns 0 on success, -ENOMEM or -ENOSPC on failure.
+ */
+int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
+			      char *data, size_t size,
+			      struct timespec time, int (*erase)(u64))
+{
+	struct dentry		*root = pstore_sb->s_root;
+	struct dentry		*dentry;
+	struct inode		*inode;
+	int			rc;
+	char			name[PSTORE_NAMELEN];
+	struct pstore_private	*private;
+
+	rc = -ENOMEM;
+	inode = pstore_get_inode(pstore_sb, root->d_inode, S_IFREG | 0444, 0);
+	if (!inode)
+		goto fail;
+	inode->i_uid = inode->i_gid = 0;
+	private = kmalloc(sizeof *private, GFP_KERNEL);
+	if (!private)
+		goto fail_alloc;
+	private->id = id;
+	private->erase = erase;
+
+	switch (type) {
+	case PSTORE_TYPE_DMESG:
+		snprintf(name, sizeof name, "dmesg-%s-%llu", psname, id);
+		break;
+	case PSTORE_TYPE_MCE:
+		snprintf(name, sizeof name, "mce-%s-%llu", psname, id);
+		break;
+	case PSTORE_TYPE_UNKNOWN:
+		snprintf(name, sizeof name, "unknown-%s-%llu", psname, id);
+		break;
+	default:
+		snprintf(name, sizeof name, "type%d-%s-%llu", type, psname, id);
+		break;
+	}
+
+	mutex_lock(&root->d_inode->i_mutex);
+
+	/* d_alloc_name() reports failure with NULL, not an ERR_PTR() */
+	dentry = d_alloc_name(root, name);
+	if (!dentry)
+		goto fail_lockedalloc;
+	d_add(dentry, inode);
+
+	mutex_unlock(&root->d_inode->i_mutex);
+
+	rc = -ENOSPC;
+	if (!pstore_writefile(inode, dentry, data, size))
+		goto fail_write;
+
+	inode->i_private = private;
+	if (time.tv_sec)
+		inode->i_mtime = inode->i_ctime = time;
+
+	return 0;
+
+fail_write:
+	kfree(private);
+	inode->i_nlink--;
+	mutex_lock(&root->d_inode->i_mutex);
+	d_delete(dentry);
+	dput(dentry);
+	mutex_unlock(&root->d_inode->i_mutex);
+	goto fail;
+
+fail_lockedalloc:
+	mutex_unlock(&root->d_inode->i_mutex);
+	kfree(private);
+fail_alloc:
+	iput(inode);
+
+fail:
+	return rc;
+}
+
+int pstore_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct inode *inode = NULL;
+	struct dentry *root;
+	int err;
+
+	save_mount_options(sb, data);
+
+	pstore_sb = sb;
+
+	sb->s_maxbytes		= MAX_LFS_FILESIZE;
+	sb->s_blocksize		= PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits	= PAGE_CACHE_SHIFT;
+	sb->s_magic		= PSTOREFS_MAGIC;
+	sb->s_op		= &pstore_ops;
+	sb->s_time_gran		= 1;
+
+	inode = pstore_get_inode(sb, NULL, S_IFDIR | 0755, 0);
+	if (!inode) {
+		err = -ENOMEM;
+		goto fail;
+	}
+	/* override ramfs "dir" options so we catch unlink(2) */
+	inode->i_op = &pstore_dir_inode_operations;
+
+	root = d_alloc_root(inode);
+	sb->s_root = root;
+	if (!root) {
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	pstore_get_records();
+
+	return 0;
+fail:
+	iput(inode);
+	return err;
+}
+
+/* Mount pstore; remember the vfsmount so pstore_is_mounted() can tell. */
+static int pstore_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	struct dentry *root;
+
+	root = mount_nodev(fs_type, flags, data, pstore_fill_super);
+	if (IS_ERR(root))
+		return PTR_ERR(root);	/* propagate the real error */
+
+	mnt->mnt_root = root;
+	mnt->mnt_sb = root->d_sb;
+	pstore_mnt = mnt;
+	return 0;
+}
+
+static void pstore_kill_sb(struct super_block *sb)
+{
+	kill_litter_super(sb);
+	pstore_sb = NULL;
+	pstore_mnt = NULL;
+}
+
+static struct file_system_type pstore_fs_type = {
+	.name		= "pstore",
+	.get_sb		= pstore_get_sb,
+	.kill_sb	= pstore_kill_sb,
+};
+
+static int __init init_pstore_fs(void)
+{
+	int ret = 0;
+	struct kobject *pstorefs_kobj;
+
+	pstorefs_kobj = kobject_create_and_add("pstore", fs_kobj);
+	if (!pstorefs_kobj)
+		return -ENOMEM;
+
+	sysfs_create_file(pstorefs_kobj, &pstore_kmsg_bytes_attr.attr);
+
+	ret = register_filesystem(&pstore_fs_type);
+
+	if (ret) {
+		sysfs_remove_file(pstorefs_kobj, &pstore_kmsg_bytes_attr.attr);
+		kobject_put(pstorefs_kobj);
+	}
+
+	return ret;
+}
+module_init(init_pstore_fs)
+
+MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>");
+MODULE_LICENSE("GPL");
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
new file mode 100644
index 00000000000..76c26d2fab2
--- /dev/null
+++ b/fs/pstore/internal.h
@@ -0,0 +1,7 @@
+extern void	pstore_get_records(void);
+extern int	pstore_mkfile(enum pstore_type_id, char *psname, u64 id,
+			      char *data, size_t size,
+			      struct timespec time, int (*erase)(u64));
+extern int	pstore_is_mounted(void);
+
+extern struct kobj_attribute pstore_kmsg_bytes_attr;
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
new file mode 100644
index 00000000000..705fdf8abf6
--- /dev/null
+++ b/fs/pstore/platform.c
@@ -0,0 +1,202 @@
+/*
+ * Persistent Storage - platform driver interface parts.
+ *
+ * Copyright (C) 2010 Intel Corporation <tony.luck@intel.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/atomic.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/kmsg_dump.h>
+#include <linux/module.h>
+#include <linux/pstore.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "internal.h"
+
+/*
+ * pstore_lock just protects "psinfo" during
+ * calls to pstore_register()
+ */
+static DEFINE_SPINLOCK(pstore_lock);
+static struct pstore_info *psinfo;
+
+/* How much of the console log to snapshot. /sys/fs/pstore/kmsg_bytes */
+static unsigned long kmsg_bytes = 10240;
+
+static ssize_t b_show(struct kobject *kobj,
+		      struct kobj_attribute *attr, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%lu\n", kmsg_bytes);
+}
+
+static ssize_t b_store(struct kobject *kobj, struct kobj_attribute *attr,
+		       const char *buf, size_t count)
+{
+	return (sscanf(buf, "%lu", &kmsg_bytes) > 0) ? count : -EINVAL;
+}
+
+struct kobj_attribute pstore_kmsg_bytes_attr =
+	__ATTR(kmsg_bytes, S_IRUGO | S_IWUSR, b_show, b_store);
+
+/* Tag each group of saved records with a sequence number */
+static int	oopscount;
+
+/*
+ * callback from kmsg_dump. (s2,l2) has the most recently
+ * written bytes, older bytes are in (s1,l1). Save as much
+ * as we can from the end of the buffer.
+ */
+static void pstore_dump(struct kmsg_dumper *dumper,
+	    enum kmsg_dump_reason reason,
+	    const char *s1, unsigned long l1,
+	    const char *s2, unsigned long l2)
+{
+	unsigned long	s1_start, s2_start;
+	unsigned long	l1_cpy, l2_cpy;
+	unsigned long	size, total = 0;
+	char		*dst;
+	u64		id;
+	int		hsize, part = 1;
+
+	mutex_lock(&psinfo->buf_mutex);
+	oopscount++;
+	while (total < kmsg_bytes) {
+		dst = psinfo->buf;
+		hsize = sprintf(dst, "Oops#%d Part%d\n", oopscount, part++);
+		size = psinfo->bufsize - hsize;
+		dst += hsize;
+
+		l2_cpy = min(l2, size);
+		l1_cpy = min(l1, size - l2_cpy);
+
+		if (l1_cpy + l2_cpy == 0)
+			break;
+
+		s2_start = l2 - l2_cpy;
+		s1_start = l1 - l1_cpy;
+
+		memcpy(dst, s1 + s1_start, l1_cpy);
+		memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy);
+
+		id = psinfo->write(PSTORE_TYPE_DMESG, hsize + l1_cpy + l2_cpy);
+		if (pstore_is_mounted())
+			pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id,
+				      psinfo->buf, hsize + l1_cpy + l2_cpy,
+				      CURRENT_TIME, psinfo->erase);
+		l1 -= l1_cpy;
+		l2 -= l2_cpy;
+		total += l1_cpy + l2_cpy;
+	}
+	mutex_unlock(&psinfo->buf_mutex);
+}
+
+static struct kmsg_dumper pstore_dumper = {
+	.dump = pstore_dump,
+};
+
+/*
+ * platform specific persistent storage driver registers with
+ * us here. If pstore is already mounted, call the platform
+ * read function right away to populate the file system. If not
+ * then the pstore mount code will call us later to fill out
+ * the file system.
+ *
+ * Register with kmsg_dump to save last part of console log on panic.
+ */
+int pstore_register(struct pstore_info *psi)
+{
+	struct module *owner = psi->owner;
+
+	spin_lock(&pstore_lock);
+	if (psinfo) {
+		spin_unlock(&pstore_lock);
+		return -EBUSY;
+	}
+	psinfo = psi;
+	spin_unlock(&pstore_lock);
+
+	if (owner && !try_module_get(owner)) {
+		psinfo = NULL;
+		return -EINVAL;
+	}
+
+	if (pstore_is_mounted())
+		pstore_get_records();
+
+	kmsg_dump_register(&pstore_dumper);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pstore_register);
+
+/*
+ * Read all the records from the persistent store and create a file for
+ * each one. Uses the "psi" snapshot throughout as psinfo can be reset.
+ */
+void pstore_get_records(void)
+{
+	struct pstore_info *psi = psinfo;
+	size_t			size;
+	u64			id;
+	enum pstore_type_id	type;
+	struct timespec		time;
+	int			failed = 0;
+
+	if (!psi)
+		return;
+
+	mutex_lock(&psi->buf_mutex);
+	while ((size = psi->read(&id, &type, &time)) > 0) {
+		if (pstore_mkfile(type, psi->name, id, psi->buf, size,
+				  time, psi->erase))
+			failed++;
+	}
+	mutex_unlock(&psi->buf_mutex);
+
+	if (failed)
+		printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n",
+		       failed, psi->name);
+}
+
+/*
+ * Call platform driver to write a record to the persistent store and,
+ * if we are mounted, show it as a file. Returns 0, -ENODEV or -EFBIG.
+ */
+int pstore_write(enum pstore_type_id type, char *buf, size_t size)
+{
+	u64	id;
+
+	if (!psinfo)
+		return -ENODEV;
+
+	if (size > psinfo->bufsize)
+		return -EFBIG;
+
+	mutex_lock(&psinfo->buf_mutex);
+	memcpy(psinfo->buf, buf, size);
+	id = psinfo->write(type, size);
+	if (pstore_is_mounted())
+		pstore_mkfile(type, psinfo->name, id, psinfo->buf,
+			      size, CURRENT_TIME, psinfo->erase);
+	mutex_unlock(&psinfo->buf_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pstore_write);
-- 
cgit v1.2.3


From 168f2e14319aba3125946649604e858cbae85be6 Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Thu, 6 Jan 2011 16:58:58 -0800
Subject: pstore: fix build warning for unused return value from
 sysfs_create_file

fs/pstore/inode.c: In function 'init_pstore_fs':
fs/pstore/inode.c:266: warning: ignoring return value of 'sysfs_create_file', declared with attribute warn_unused_result

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 fs/pstore/inode.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 0e806aafe85..549d245d0b4 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -256,23 +256,28 @@ static struct file_system_type pstore_fs_type = {
 
 static int __init init_pstore_fs(void)
 {
-	int ret = 0;
+	int rc = 0;
 	struct kobject *pstorefs_kobj;
 
 	pstorefs_kobj = kobject_create_and_add("pstore", fs_kobj);
-	if (!pstorefs_kobj)
-		return -ENOMEM;
+	if (!pstorefs_kobj) {
+		rc = -ENOMEM;
+		goto done;
+	}
 
-	sysfs_create_file(pstorefs_kobj, &pstore_kmsg_bytes_attr.attr);
+	rc = sysfs_create_file(pstorefs_kobj, &pstore_kmsg_bytes_attr.attr);
+	if (rc)
+		goto done1;
 
-	ret = register_filesystem(&pstore_fs_type);
+	rc = register_filesystem(&pstore_fs_type);
+	if (rc == 0)
+		goto done;
 
-	if (ret) {
-		sysfs_remove_file(pstorefs_kobj, &pstore_kmsg_bytes_attr.attr);
-		kobject_put(pstorefs_kobj);
-	}
-
-	return ret;
+	sysfs_remove_file(pstorefs_kobj, &pstore_kmsg_bytes_attr.attr);
+done1:
+	kobject_put(pstorefs_kobj);
+done:
+	return rc;
 }
 module_init(init_pstore_fs)
 
-- 
cgit v1.2.3


From 1c1266bb916e6a6b362d3be95f2cc7f3c41277a6 Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Wed, 12 Jan 2011 16:53:27 -0800
Subject: ceph: fix getattr on directory when using norbytes

The norbytes mount option was broken, and when doing getattr
on a directory it returned the rbytes instead of the number of
entities. This commit fixes it.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e791fa34b23..50001de66c6 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -701,10 +701,6 @@ static int fill_inode(struct inode *inode,
 			ci->i_ceph_flags |= CEPH_I_COMPLETE;
 			ci->i_max_offset = 2;
 		}
-
-		/* it may be better to set st_size in getattr instead? */
-		if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
-			inode->i_size = ci->i_rbytes;
 		break;
 	default:
 		pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
@@ -1805,7 +1801,11 @@ int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		else
 			stat->dev = 0;
 		if (S_ISDIR(inode->i_mode)) {
-			stat->size = ci->i_rbytes;
+			if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
+						RBYTES))
+				stat->size = ci->i_rbytes;
+			else
+				stat->size = ci->i_files + ci->i_subdirs;
 			stat->blocks = 0;
 			stat->blksize = 65536;
 		}
-- 
cgit v1.2.3


From 17db143fc091238c43ab9f373974ca2224a4c3f8 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 13 Jan 2011 15:27:29 -0800
Subject: ceph: fix xattr rbtree search

Fix xattr name comparison in rbtree search for strings that share a prefix.
The *name argument is null terminated, but the xattr name is not, so we
need to use strncmp, but that means adjusting for the case where name is
a prefix of xattr->name.

The corresponding case in __set_xattr() already handles this properly
(although in that case *name is also not null terminated).

Reported-by: Sergiy Kibrik <sakib@meta.ua>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/xattr.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 6e12a6ba5f7..8c9eba6ef9d 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -219,6 +219,7 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
 	struct rb_node **p;
 	struct rb_node *parent = NULL;
 	struct ceph_inode_xattr *xattr = NULL;
+	int name_len = strlen(name);
 	int c;
 
 	p = &ci->i_xattrs.index.rb_node;
@@ -226,6 +227,8 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
 		parent = *p;
 		xattr = rb_entry(parent, struct ceph_inode_xattr, node);
 		c = strncmp(name, xattr->name, xattr->name_len);
+		if (c == 0 && name_len > xattr->name_len)
+			c = 1;
 		if (c < 0)
 			p = &(*p)->rb_left;
 		else if (c > 0)
-- 
cgit v1.2.3


From d8cdda3efb9331bedbcca2343591eab2316f4cae Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Mon, 17 Jan 2011 20:27:45 +0200
Subject: UBIFS: re-arrange variables in ubifs_info

This is a cosmetic patch which re-arranges variables in 'struct ubifs_info'
so that all boolean-like variables which are only changed during mounting or
re-mounting to R/W mode are placed together. Then they are turned into
bit-fields, which makes the structure a little bit smaller.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/ubifs.h | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 381d6b207a5..d1efa37d80a 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1166,22 +1166,22 @@ struct ubifs_debug_info;
  * @rp_uid: reserved pool user ID
  * @rp_gid: reserved pool group ID
  *
- * @empty: if the UBI device is empty
+ * @empty: %1 if the UBI device is empty
+ * @need_recovery: %1 if the file-system needs recovery
+ * @replaying: %1 during journal replay
+ * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode
+ * @always_chk_crc: always check CRCs (while mounting and remounting to R/W
+ *                  mode)
  * @replay_tree: temporary tree used during journal replay
  * @replay_list: temporary list used during journal replay
  * @replay_buds: list of buds to replay
  * @cs_sqnum: sequence number of first node in the log (commit start node)
  * @replay_sqnum: sequence number of node currently being replayed
- * @need_recovery: file-system needs recovery
- * @replaying: set to %1 during journal replay
  * @unclean_leb_list: LEBs to recover when re-mounting R/O mounted FS to R/W
  *                    mode
  * @rcvrd_mst_node: recovered master node to write when re-mounting R/O mounted
  *                  FS to R/W mode
  * @size_tree: inode size information for recovery
- * @remounting_rw: set while re-mounting from R/O mode to R/W mode
- * @always_chk_crc: always check CRCs (while mounting and remounting to R/W
- *                  mode)
  * @mount_opts: UBIFS-specific mount options
  *
  * @dbg: debugging-related information
@@ -1402,19 +1402,19 @@ struct ubifs_info {
 	gid_t rp_gid;
 
 	/* The below fields are used only during mounting and re-mounting */
-	int empty;
+	unsigned int empty:1;
+	unsigned int need_recovery:1;
+	unsigned int replaying:1;
+	unsigned int remounting_rw:1;
+	unsigned int always_chk_crc:1;
 	struct rb_root replay_tree;
 	struct list_head replay_list;
 	struct list_head replay_buds;
 	unsigned long long cs_sqnum;
 	unsigned long long replay_sqnum;
-	int need_recovery;
-	int replaying;
 	struct list_head unclean_leb_list;
 	struct ubifs_mst_node *rcvrd_mst_node;
 	struct rb_root size_tree;
-	int remounting_rw;
-	int always_chk_crc;
 	struct ubifs_mount_opts mount_opts;
 
 #ifdef CONFIG_UBIFS_FS_DEBUG
-- 
cgit v1.2.3


From 18d1d7fbcc260e67d249bf90b454d8cf34288453 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Mon, 17 Jan 2011 22:27:56 +0200
Subject: UBIFS: introduce mounting flag

This is a preparational patch which removes the 'c->always_chk_crc' which was
set during mounting and remounting to R/W mode and introduces 'c->mounting'
flag which is set when mounting. Now the 'c->always_chk_crc' flag is the
same as 'c->remounting_rw || c->mounting'.

This patch is a preparation for the next one which will need to know when we
are mounting and remounting to R/W mode, which is exactly what
'c->always_chk_crc' effectively is, but its name does not suit the
next patch. The other possibility would be to just re-name it, but then
we'd end up with less logical flags coverage.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/io.c    | 12 ++++++++----
 fs/ubifs/super.c | 11 ++---------
 fs/ubifs/tnc.c   | 10 +++++++---
 fs/ubifs/ubifs.h |  5 ++---
 4 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index d82173182ee..d1fe56203a1 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -88,8 +88,12 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
  * This function may skip data nodes CRC checking if @c->no_chk_data_crc is
  * true, which is controlled by corresponding UBIFS mount option. However, if
  * @must_chk_crc is true, then @c->no_chk_data_crc is ignored and CRC is
- * checked. Similarly, if @c->always_chk_crc is true, @c->no_chk_data_crc is
- * ignored and CRC is checked.
+ * checked. Similarly, if @c->mounting or @c->remounting_rw is true (we are
+ * mounting or re-mounting to R/W mode), @c->no_chk_data_crc is ignored and CRC
+ * is checked. This is because during mounting or re-mounting from R/O mode to
+ * R/W mode we may read journal nodes (when replaying the journal or doing the
+ * recovery) and the journal nodes may potentially be corrupted, so checking is
+ * required.
  *
  * This function returns zero in case of success and %-EUCLEAN in case of bad
  * CRC or magic.
@@ -131,8 +135,8 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
 		   node_len > c->ranges[type].max_len)
 		goto out_len;
 
-	if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc &&
-	     c->no_chk_data_crc)
+	if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->mounting &&
+	    !c->remounting_rw && c->no_chk_data_crc)
 		return 0;
 
 	crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 91fac54c70e..703a62109cf 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1194,11 +1194,7 @@ static int mount_ubifs(struct ubifs_info *c)
 	if (c->bulk_read == 1)
 		bu_init(c);
 
-	/*
-	 * We have to check all CRCs, even for data nodes, when we mount the FS
-	 * (specifically, when we are replaying).
-	 */
-	c->always_chk_crc = 1;
+	c->mounting = 1;
 
 	err = ubifs_read_superblock(c);
 	if (err)
@@ -1374,7 +1370,7 @@ static int mount_ubifs(struct ubifs_info *c)
 	if (err)
 		goto out_infos;
 
-	c->always_chk_crc = 0;
+	c->mounting = 0;
 
 	ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"",
 		  c->vi.ubi_num, c->vi.vol_id, c->vi.name);
@@ -1535,7 +1531,6 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 	mutex_lock(&c->umount_mutex);
 	dbg_save_space_info(c);
 	c->remounting_rw = 1;
-	c->always_chk_crc = 1;
 
 	err = check_free_space(c);
 	if (err)
@@ -1642,7 +1637,6 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 	dbg_gen("re-mounted read-write");
 	c->ro_mount = 0;
 	c->remounting_rw = 0;
-	c->always_chk_crc = 0;
 	err = dbg_check_space_info(c);
 	mutex_unlock(&c->umount_mutex);
 	return err;
@@ -1659,7 +1653,6 @@ out:
 	c->ileb_buf = NULL;
 	ubifs_lpt_free(c, 1);
 	c->remounting_rw = 0;
-	c->always_chk_crc = 0;
 	mutex_unlock(&c->umount_mutex);
 	return err;
 }
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index ad9cf013362..de485979ca3 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -447,8 +447,11 @@ static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr,
  *
  * Note, this function does not check CRC of data nodes if @c->no_chk_data_crc
  * is true (it is controlled by corresponding mount option). However, if
- * @c->always_chk_crc is true, @c->no_chk_data_crc is ignored and CRC is always
- * checked.
+ * @c->mounting or @c->remounting_rw is true (we are mounting or re-mounting to
+ * R/W mode), @c->no_chk_data_crc is ignored and CRC is checked. This is
+ * because during mounting or re-mounting from R/O mode to R/W mode we may read
+ * journal nodes (when replaying the journal or doing the recovery) and the
+ * journal nodes may potentially be corrupted, so checking is required.
  */
 static int try_read_node(const struct ubifs_info *c, void *buf, int type,
 			 int len, int lnum, int offs)
@@ -476,7 +479,8 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type,
 	if (node_len != len)
 		return 0;
 
-	if (type == UBIFS_DATA_NODE && !c->always_chk_crc && c->no_chk_data_crc)
+	if (type == UBIFS_DATA_NODE && c->no_chk_data_crc && !c->mounting &&
+	    !c->remounting_rw)
 		return 1;
 
 	crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index d1efa37d80a..d1823541f98 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1169,9 +1169,8 @@ struct ubifs_debug_info;
  * @empty: %1 if the UBI device is empty
  * @need_recovery: %1 if the file-system needs recovery
  * @replaying: %1 during journal replay
+ * @mounting: %1 while mounting
  * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode
- * @always_chk_crc: always check CRCs (while mounting and remounting to R/W
- *                  mode)
  * @replay_tree: temporary tree used during journal replay
  * @replay_list: temporary list used during journal replay
  * @replay_buds: list of buds to replay
@@ -1405,8 +1404,8 @@ struct ubifs_info {
 	unsigned int empty:1;
 	unsigned int need_recovery:1;
 	unsigned int replaying:1;
+	unsigned int mounting:1;
 	unsigned int remounting_rw:1;
-	unsigned int always_chk_crc:1;
 	struct rb_root replay_tree;
 	struct list_head replay_list;
 	struct list_head replay_buds;
-- 
cgit v1.2.3


From f0940cee222790e6e995a23f25c4ffb23f939a24 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Tue, 11 Jan 2011 21:15:03 +0900
Subject: dio: fix typos in comments

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Cc: Jiri Kosina <trivial@kernel.org>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 fs/direct-io.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 85882f6ba5f..8201c2558d8 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -641,11 +641,11 @@ static int dio_send_cur_page(struct dio *dio)
 		/*
 		 * See whether this new request is contiguous with the old.
 		 *
-		 * Btrfs cannot handl having logically non-contiguous requests
-		 * submitted.  For exmple if you have
+		 * Btrfs cannot handle having logically non-contiguous requests
+		 * submitted.  For example if you have
 		 *
 		 * Logical:  [0-4095][HOLE][8192-12287]
-		 * Phyiscal: [0-4095]      [4096-8181]
+		 * Physical: [0-4095]      [4096-8191]
 		 *
 		 * We cannot submit those pages together as one BIO.  So if our
 		 * current logical offset in the file does not equal what would
-- 
cgit v1.2.3


From 42b16b3fbb5ee4555f5dee6220f3ccaa6e1ebe47 Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jj@chaosbits.net>
Date: Mon, 17 Jan 2011 00:09:38 +0100
Subject: =?UTF-8?q?Kill=20off=20warning:=20=E2=80=98inline=E2=80=99=20is?=
 =?UTF-8?q?=20not=20at=20beginning=20of=20declaration?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix a bunch of
	warning: ‘inline’ is not at beginning of declaration
messages when building a 'make allyesconfig' kernel with -Wextra.

These warnings are trivial to kill, yet rather annoying when building with
-Wextra.
The more we can cut down on pointless crap like this the better (IMHO).

A previous patch to do this for a 'allnoconfig' build has already been
merged. This just takes the cleanup a little further.

Signed-off-by: Jesper Juhl <jj@chaosbits.net>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 fs/ocfs2/dir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index d417b3f9b0c..f97b6f1c61d 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -354,7 +354,7 @@ static inline int ocfs2_match(int len,
 /*
  * Returns 0 if not found, -1 on failure, and 1 on success
  */
-static int inline ocfs2_search_dirblock(struct buffer_head *bh,
+static inline int ocfs2_search_dirblock(struct buffer_head *bh,
 					struct inode *dir,
 					const char *name, int namelen,
 					unsigned long offset,
-- 
cgit v1.2.3


From 50aac4fec503960380ab594a93a6fbfdf3f8915f Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 18 Jan 2011 07:59:40 -0800
Subject: ceph: fix cap_wanted_delay_{min,max} mount option initialization

These were initialized to 0 instead of the default, fallout from the RBD
refactor in 3d14c5d2b6e15c21d8e5467dc62d33127c23a644.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/super.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index bf6f0f34082..9c5085465a6 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -290,6 +290,8 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
 
         fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
         fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
+	fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
+	fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
         fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
         fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
         fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
-- 
cgit v1.2.3


From 24be0c481067560b11441e794e27f166a3568863 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 18 Jan 2011 08:48:06 -0800
Subject: ceph: fix erroneous cap flush to non-auth mds

The int flushing is global and not cleared on each iteration of the loop,
which can cause a second flush of caps to any MDSs with ids greater than
the auth.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 60d27bc9eb8..f654c7e933a 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1658,6 +1658,8 @@ ack:
 
 		if (cap == ci->i_auth_cap && ci->i_dirty_caps)
 			flushing = __mark_caps_flushing(inode, session);
+		else
+			flushing = 0;
 
 		mds = cap->mds;  /* remember mds, so we don't repeat */
 		sent++;
-- 
cgit v1.2.3


From 088b3f5e9ee2649f5cfc2f08d8ce654e3eeba310 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 18 Jan 2011 08:56:01 -0800
Subject: ceph: fix flushing of caps vs cap import

If we are mid-flush and a cap is migrated to another node, we need to
resend the cap flush message to the new MDS, and do so with the original
flush_seq to avoid leaking across a sync boundary.  Previously we didn't
redo the flush (we only flushed newly dirty data), which would cause a
later sync to hang forever.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 38 ++++++++++++++++++++++++++++++++++----
 1 file changed, 34 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index f654c7e933a..7def3f5903d 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1560,9 +1560,10 @@ retry_locked:
 		/* NOTE: no side-effects allowed, until we take s_mutex */
 
 		revoking = cap->implemented & ~cap->issued;
-		if (revoking)
-			dout(" mds%d revoking %s\n", cap->mds,
-			     ceph_cap_string(revoking));
+		dout(" mds%d cap %p issued %s implemented %s revoking %s\n",
+		     cap->mds, cap, ceph_cap_string(cap->issued),
+		     ceph_cap_string(cap->implemented),
+		     ceph_cap_string(revoking));
 
 		if (cap == ci->i_auth_cap &&
 		    (cap->issued & CEPH_CAP_FILE_WR)) {
@@ -1942,6 +1943,35 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
 	}
 }
 
+static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
+				     struct ceph_mds_session *session,
+				     struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_cap *cap;
+	int delayed = 0;
+
+	spin_lock(&inode->i_lock);
+	cap = ci->i_auth_cap;
+	dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
+	     ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
+	__ceph_flush_snaps(ci, &session, 1);
+	if (ci->i_flushing_caps) {
+		delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+				     __ceph_caps_used(ci),
+				     __ceph_caps_wanted(ci),
+				     cap->issued | cap->implemented,
+				     ci->i_flushing_caps, NULL);
+		if (delayed) {
+			spin_lock(&inode->i_lock);
+			__cap_delay_requeue(mdsc, ci);
+			spin_unlock(&inode->i_lock);
+		}
+	} else {
+		spin_unlock(&inode->i_lock);
+	}
+}
+
 
 /*
  * Take references to capabilities we hold, so that we don't release
@@ -2689,7 +2719,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
 	ceph_add_cap(inode, session, cap_id, -1,
 		     issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
 		     NULL /* no caps context */);
-	try_flush_caps(inode, session, NULL);
+	kick_flushing_inode_caps(mdsc, session, inode);
 	up_read(&mdsc->snap_rwsem);
 
 	/* make sure we re-request max_size, if necessary */
-- 
cgit v1.2.3


From 7e57b81c7688c762bc9e775bc83f9fc17946f527 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 18 Jan 2011 09:00:01 -0800
Subject: ceph: avoid immediate cap check after import

The NODELAY flag avoids the heuristics that delay cap (issued/wanted)
release.  There's no reason for that after we import a cap, and it kills
whatever benefit we get from those delays.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 7def3f5903d..6b61ded701e 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2817,8 +2817,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	case CEPH_CAP_OP_IMPORT:
 		handle_cap_import(mdsc, inode, h, session,
 				  snaptrace, snaptrace_len);
-		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
-				session);
+		ceph_check_caps(ceph_inode(inode), 0, session);
 		goto done_unlocked;
 	}
 
-- 
cgit v1.2.3


From 0da2a4ac33c291728d8be5bdb865467dcb078d13 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Wed, 19 Jan 2011 14:18:50 -0500
Subject: NFS: fix handling of malloc failure during nfs_flush_multi()

Cleanup of the allocated list entries should not call
put_nfs_open_context() on each entry, as the context will
always be NULL, causing an oops.

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/write.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 10d648ea128..c8278f4046c 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -932,7 +932,7 @@ out_bad:
 	while (!list_empty(&list)) {
 		data = list_entry(list.next, struct nfs_write_data, pages);
 		list_del(&data->pages);
-		nfs_writedata_release(data);
+		nfs_writedata_free(data);
 	}
 	nfs_redirty_request(req);
 	return -ENOMEM;
-- 
cgit v1.2.3


From bc015cb84129eb1451913cfebece270bf7a39e0f Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 19 Jan 2011 09:30:01 +0000
Subject: GFS2: Use RCU for glock hash table

This has a number of advantages:

 - Reduces contention on the hash table lock
 - Makes the code smaller and simpler
 - Should speed up glock dumps when under load
 - Removes ref count changing in examine_bucket
 - No longer need hash chain lock in glock_put() in common case

There are some further changes which this enables and which
we may do in the future. One is to look at using SLAB_RCU,
and another is to look at using a per-cpu counter for the
per-sb glock counter, since that is touched twice in the
lifetime of each glock (but only used at umount time).

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 fs/gfs2/glock.c      | 390 +++++++++++++++++++--------------------------------
 fs/gfs2/glock.h      |  39 +++---
 fs/gfs2/glops.c      |  23 ++-
 fs/gfs2/incore.h     |   5 +-
 fs/gfs2/lock_dlm.c   |  14 +-
 fs/gfs2/lops.c       |   3 +-
 fs/gfs2/main.c       |   6 +-
 fs/gfs2/ops_fstype.c |   7 +-
 8 files changed, 190 insertions(+), 297 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 08a8beb152e..c75d4998519 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -26,6 +26,9 @@
 #include <linux/freezer.h>
 #include <linux/workqueue.h>
 #include <linux/jiffies.h>
+#include <linux/rcupdate.h>
+#include <linux/rculist_bl.h>
+#include <linux/bit_spinlock.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -41,10 +44,6 @@
 #define CREATE_TRACE_POINTS
 #include "trace_gfs2.h"
 
-struct gfs2_gl_hash_bucket {
-        struct hlist_head hb_list;
-};
-
 struct gfs2_glock_iter {
 	int hash;			/* hash bucket index         */
 	struct gfs2_sbd *sdp;		/* incore superblock         */
@@ -54,7 +53,6 @@ struct gfs2_glock_iter {
 
 typedef void (*glock_examiner) (struct gfs2_glock * gl);
 
-static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
 static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
 #define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0)
 static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
@@ -70,57 +68,9 @@ static DEFINE_SPINLOCK(lru_lock);
 #define GFS2_GL_HASH_SIZE       (1 << GFS2_GL_HASH_SHIFT)
 #define GFS2_GL_HASH_MASK       (GFS2_GL_HASH_SIZE - 1)
 
-static struct gfs2_gl_hash_bucket gl_hash_table[GFS2_GL_HASH_SIZE];
+static struct hlist_bl_head gl_hash_table[GFS2_GL_HASH_SIZE];
 static struct dentry *gfs2_root;
 
-/*
- * Despite what you might think, the numbers below are not arbitrary :-)
- * They are taken from the ipv4 routing hash code, which is well tested
- * and thus should be nearly optimal. Later on we might tweek the numbers
- * but for now this should be fine.
- *
- * The reason for putting the locks in a separate array from the list heads
- * is that we can have fewer locks than list heads and save memory. We use
- * the same hash function for both, but with a different hash mask.
- */
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
-	defined(CONFIG_PROVE_LOCKING)
-
-#ifdef CONFIG_LOCKDEP
-# define GL_HASH_LOCK_SZ        256
-#else
-# if NR_CPUS >= 32
-#  define GL_HASH_LOCK_SZ       4096
-# elif NR_CPUS >= 16
-#  define GL_HASH_LOCK_SZ       2048
-# elif NR_CPUS >= 8
-#  define GL_HASH_LOCK_SZ       1024
-# elif NR_CPUS >= 4
-#  define GL_HASH_LOCK_SZ       512
-# else
-#  define GL_HASH_LOCK_SZ       256
-# endif
-#endif
-
-/* We never want more locks than chains */
-#if GFS2_GL_HASH_SIZE < GL_HASH_LOCK_SZ
-# undef GL_HASH_LOCK_SZ
-# define GL_HASH_LOCK_SZ GFS2_GL_HASH_SIZE
-#endif
-
-static rwlock_t gl_hash_locks[GL_HASH_LOCK_SZ];
-
-static inline rwlock_t *gl_lock_addr(unsigned int x)
-{
-	return &gl_hash_locks[x & (GL_HASH_LOCK_SZ-1)];
-}
-#else /* not SMP, so no spinlocks required */
-static inline rwlock_t *gl_lock_addr(unsigned int x)
-{
-	return NULL;
-}
-#endif
-
 /**
  * gl_hash() - Turn glock number into hash bucket number
  * @lock: The glock number
@@ -141,25 +91,30 @@ static unsigned int gl_hash(const struct gfs2_sbd *sdp,
 	return h;
 }
 
-/**
- * glock_free() - Perform a few checks and then release struct gfs2_glock
- * @gl: The glock to release
- *
- * Also calls lock module to release its internal structure for this glock.
- *
- */
+static inline void spin_lock_bucket(unsigned int hash)
+{
+	struct hlist_bl_head *bl = &gl_hash_table[hash];
+	bit_spin_lock(0, (unsigned long *)bl);
+}
+
+static inline void spin_unlock_bucket(unsigned int hash)
+{
+	struct hlist_bl_head *bl = &gl_hash_table[hash];
+	__bit_spin_unlock(0, (unsigned long *)bl);
+}
 
-static void glock_free(struct gfs2_glock *gl)
+void gfs2_glock_free(struct rcu_head *rcu)
 {
+	struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu);
 	struct gfs2_sbd *sdp = gl->gl_sbd;
-	struct address_space *mapping = gfs2_glock2aspace(gl);
-	struct kmem_cache *cachep = gfs2_glock_cachep;
 
-	GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
-	trace_gfs2_glock_put(gl);
-	if (mapping)
-		cachep = gfs2_glock_aspace_cachep;
-	sdp->sd_lockstruct.ls_ops->lm_put_lock(cachep, gl);
+	if (gl->gl_ops->go_flags & GLOF_ASPACE)
+		kmem_cache_free(gfs2_glock_aspace_cachep, gl);
+	else
+		kmem_cache_free(gfs2_glock_cachep, gl);
+
+	if (atomic_dec_and_test(&sdp->sd_glock_disposal))
+		wake_up(&sdp->sd_glock_wait);
 }
 
 /**
@@ -185,34 +140,49 @@ static int demote_ok(const struct gfs2_glock *gl)
 {
 	const struct gfs2_glock_operations *glops = gl->gl_ops;
 
+	/* assert_spin_locked(&gl->gl_spin); */
+
 	if (gl->gl_state == LM_ST_UNLOCKED)
 		return 0;
-	if (!list_empty(&gl->gl_holders))
+	if (test_bit(GLF_LFLUSH, &gl->gl_flags))
+		return 0;
+	if ((gl->gl_name.ln_type != LM_TYPE_INODE) &&
+	    !list_empty(&gl->gl_holders))
 		return 0;
 	if (glops->go_demote_ok)
 		return glops->go_demote_ok(gl);
 	return 1;
 }
 
+
 /**
- * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
+ * __gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
  * @gl: the glock
  *
+ * If the glock is demotable, then we add it (or move it) to the end
+ * of the glock LRU list.
  */
 
-static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
+static void __gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
 {
-	int may_reclaim;
-	may_reclaim = (demote_ok(gl) &&
-		       (atomic_read(&gl->gl_ref) == 1 ||
-			(gl->gl_name.ln_type == LM_TYPE_INODE &&
-			 atomic_read(&gl->gl_ref) <= 2)));
-	spin_lock(&lru_lock);
-	if (list_empty(&gl->gl_lru) && may_reclaim) {
+	if (demote_ok(gl)) {
+		spin_lock(&lru_lock);
+
+		if (!list_empty(&gl->gl_lru))
+			list_del_init(&gl->gl_lru);
+		else
+			atomic_inc(&lru_count);
+
 		list_add_tail(&gl->gl_lru, &lru_list);
-		atomic_inc(&lru_count);
+		spin_unlock(&lru_lock);
 	}
-	spin_unlock(&lru_lock);
+}
+
+void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
+{
+	spin_lock(&gl->gl_spin);
+	__gfs2_glock_schedule_for_reclaim(gl);
+	spin_unlock(&gl->gl_spin);
 }
 
 /**
@@ -227,7 +197,6 @@ void gfs2_glock_put_nolock(struct gfs2_glock *gl)
 {
 	if (atomic_dec_and_test(&gl->gl_ref))
 		GLOCK_BUG_ON(gl, 1);
-	gfs2_glock_schedule_for_reclaim(gl);
 }
 
 /**
@@ -236,30 +205,26 @@ void gfs2_glock_put_nolock(struct gfs2_glock *gl)
  *
  */
 
-int gfs2_glock_put(struct gfs2_glock *gl)
+void gfs2_glock_put(struct gfs2_glock *gl)
 {
-	int rv = 0;
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct address_space *mapping = gfs2_glock2aspace(gl);
 
-	write_lock(gl_lock_addr(gl->gl_hash));
-	if (atomic_dec_and_lock(&gl->gl_ref, &lru_lock)) {
-		hlist_del(&gl->gl_list);
+	if (atomic_dec_and_test(&gl->gl_ref)) {
+		spin_lock_bucket(gl->gl_hash);
+		hlist_bl_del_rcu(&gl->gl_list);
+		spin_unlock_bucket(gl->gl_hash);
+		spin_lock(&lru_lock);
 		if (!list_empty(&gl->gl_lru)) {
 			list_del_init(&gl->gl_lru);
 			atomic_dec(&lru_count);
 		}
 		spin_unlock(&lru_lock);
-		write_unlock(gl_lock_addr(gl->gl_hash));
 		GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
-		glock_free(gl);
-		rv = 1;
-		goto out;
+		GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
+		trace_gfs2_glock_put(gl);
+		sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
 	}
-	spin_lock(&gl->gl_spin);
-	gfs2_glock_schedule_for_reclaim(gl);
-	spin_unlock(&gl->gl_spin);
-	write_unlock(gl_lock_addr(gl->gl_hash));
-out:
-	return rv;
 }
 
 /**
@@ -275,17 +240,15 @@ static struct gfs2_glock *search_bucket(unsigned int hash,
 					const struct lm_lockname *name)
 {
 	struct gfs2_glock *gl;
-	struct hlist_node *h;
+	struct hlist_bl_node *h;
 
-	hlist_for_each_entry(gl, h, &gl_hash_table[hash].hb_list, gl_list) {
+	hlist_bl_for_each_entry_rcu(gl, h, &gl_hash_table[hash], gl_list) {
 		if (!lm_name_equal(&gl->gl_name, name))
 			continue;
 		if (gl->gl_sbd != sdp)
 			continue;
-
-		atomic_inc(&gl->gl_ref);
-
-		return gl;
+		if (atomic_inc_not_zero(&gl->gl_ref))
+			return gl;
 	}
 
 	return NULL;
@@ -743,10 +706,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	struct gfs2_glock *gl, *tmp;
 	unsigned int hash = gl_hash(sdp, &name);
 	struct address_space *mapping;
+	struct kmem_cache *cachep;
 
-	read_lock(gl_lock_addr(hash));
+	rcu_read_lock();
 	gl = search_bucket(hash, sdp, &name);
-	read_unlock(gl_lock_addr(hash));
+	rcu_read_unlock();
 
 	*glp = gl;
 	if (gl)
@@ -755,9 +719,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 		return -ENOENT;
 
 	if (glops->go_flags & GLOF_ASPACE)
-		gl = kmem_cache_alloc(gfs2_glock_aspace_cachep, GFP_KERNEL);
+		cachep = gfs2_glock_aspace_cachep;
 	else
-		gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
+		cachep = gfs2_glock_cachep;
+	gl = kmem_cache_alloc(cachep, GFP_KERNEL);
 	if (!gl)
 		return -ENOMEM;
 
@@ -790,15 +755,15 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 		mapping->writeback_index = 0;
 	}
 
-	write_lock(gl_lock_addr(hash));
+	spin_lock_bucket(hash);
 	tmp = search_bucket(hash, sdp, &name);
 	if (tmp) {
-		write_unlock(gl_lock_addr(hash));
-		glock_free(gl);
+		spin_unlock_bucket(hash);
+		kmem_cache_free(cachep, gl);
 		gl = tmp;
 	} else {
-		hlist_add_head(&gl->gl_list, &gl_hash_table[hash].hb_list);
-		write_unlock(gl_lock_addr(hash));
+		hlist_bl_add_head_rcu(&gl->gl_list, &gl_hash_table[hash]);
+		spin_unlock_bucket(hash);
 	}
 
 	*glp = gl;
@@ -1113,6 +1078,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
 		    !test_bit(GLF_DEMOTE, &gl->gl_flags))
 			fast_path = 1;
 	}
+	__gfs2_glock_schedule_for_reclaim(gl);
 	trace_gfs2_glock_queue(gh, 0);
 	spin_unlock(&gl->gl_spin);
 	if (likely(fast_path))
@@ -1440,42 +1406,30 @@ static struct shrinker glock_shrinker = {
  * @sdp: the filesystem
  * @bucket: the bucket
  *
- * Returns: 1 if the bucket has entries
  */
 
-static int examine_bucket(glock_examiner examiner, struct gfs2_sbd *sdp,
+static void examine_bucket(glock_examiner examiner, const struct gfs2_sbd *sdp,
 			  unsigned int hash)
 {
-	struct gfs2_glock *gl, *prev = NULL;
-	int has_entries = 0;
-	struct hlist_head *head = &gl_hash_table[hash].hb_list;
+	struct gfs2_glock *gl;
+	struct hlist_bl_head *head = &gl_hash_table[hash];
+	struct hlist_bl_node *pos;
 
-	read_lock(gl_lock_addr(hash));
-	/* Can't use hlist_for_each_entry - don't want prefetch here */
-	if (hlist_empty(head))
-		goto out;
-	gl = list_entry(head->first, struct gfs2_glock, gl_list);
-	while(1) {
-		if (!sdp || gl->gl_sbd == sdp) {
-			gfs2_glock_hold(gl);
-			read_unlock(gl_lock_addr(hash));
-			if (prev)
-				gfs2_glock_put(prev);
-			prev = gl;
+	rcu_read_lock();
+	hlist_bl_for_each_entry_rcu(gl, pos, head, gl_list) {
+		if ((gl->gl_sbd == sdp) && atomic_read(&gl->gl_ref))
 			examiner(gl);
-			has_entries = 1;
-			read_lock(gl_lock_addr(hash));
-		}
-		if (gl->gl_list.next == NULL)
-			break;
-		gl = list_entry(gl->gl_list.next, struct gfs2_glock, gl_list);
 	}
-out:
-	read_unlock(gl_lock_addr(hash));
-	if (prev)
-		gfs2_glock_put(prev);
+	rcu_read_unlock();
 	cond_resched();
-	return has_entries;
+}
+
+static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
+{
+	unsigned x;
+
+	for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
+		examine_bucket(examiner, sdp, x);
 }
 
 
@@ -1529,10 +1483,21 @@ static void clear_glock(struct gfs2_glock *gl)
 
 void gfs2_glock_thaw(struct gfs2_sbd *sdp)
 {
-	unsigned x;
+	glock_hash_walk(thaw_glock, sdp);
+}
 
-	for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
-		examine_bucket(thaw_glock, sdp, x);
+static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
+{
+	int ret;
+	spin_lock(&gl->gl_spin);
+	ret = __dump_glock(seq, gl);
+	spin_unlock(&gl->gl_spin);
+	return ret;
+}
+
+static void dump_glock_func(struct gfs2_glock *gl)
+{
+	dump_glock(NULL, gl);
 }
 
 /**
@@ -1545,13 +1510,10 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp)
 
 void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
 {
-	unsigned int x;
-
-	for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
-		examine_bucket(clear_glock, sdp, x);
+	glock_hash_walk(clear_glock, sdp);
 	flush_workqueue(glock_workqueue);
 	wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0);
-	gfs2_dump_lockstate(sdp);
+	glock_hash_walk(dump_glock_func, sdp);
 }
 
 void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
@@ -1717,66 +1679,15 @@ out:
 	return error;
 }
 
-static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
-{
-	int ret;
-	spin_lock(&gl->gl_spin);
-	ret = __dump_glock(seq, gl);
-	spin_unlock(&gl->gl_spin);
-	return ret;
-}
-
-/**
- * gfs2_dump_lockstate - print out the current lockstate
- * @sdp: the filesystem
- * @ub: the buffer to copy the information into
- *
- * If @ub is NULL, dump the lockstate to the console.
- *
- */
-
-static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
-{
-	struct gfs2_glock *gl;
-	struct hlist_node *h;
-	unsigned int x;
-	int error = 0;
-
-	for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
-
-		read_lock(gl_lock_addr(x));
-
-		hlist_for_each_entry(gl, h, &gl_hash_table[x].hb_list, gl_list) {
-			if (gl->gl_sbd != sdp)
-				continue;
-
-			error = dump_glock(NULL, gl);
-			if (error)
-				break;
-		}
-
-		read_unlock(gl_lock_addr(x));
-
-		if (error)
-			break;
-	}
-
 
-	return error;
-}
 
 
 int __init gfs2_glock_init(void)
 {
 	unsigned i;
 	for(i = 0; i < GFS2_GL_HASH_SIZE; i++) {
-		INIT_HLIST_HEAD(&gl_hash_table[i].hb_list);
-	}
-#ifdef GL_HASH_LOCK_SZ
-	for(i = 0; i < GL_HASH_LOCK_SZ; i++) {
-		rwlock_init(&gl_hash_locks[i]);
+		INIT_HLIST_BL_HEAD(&gl_hash_table[i]);
 	}
-#endif
 
 	glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
 					  WQ_HIGHPRI | WQ_FREEZEABLE, 0);
@@ -1802,62 +1713,54 @@ void gfs2_glock_exit(void)
 	destroy_workqueue(gfs2_delete_workqueue);
 }
 
+static inline struct gfs2_glock *glock_hash_chain(unsigned hash)
+{
+	return hlist_bl_entry(hlist_bl_first_rcu(&gl_hash_table[hash]),
+			      struct gfs2_glock, gl_list);
+}
+
+static inline struct gfs2_glock *glock_hash_next(struct gfs2_glock *gl)
+{
+	return hlist_bl_entry(rcu_dereference_raw(gl->gl_list.next),
+			      struct gfs2_glock, gl_list);
+}
+
 static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
 {
 	struct gfs2_glock *gl;
 
-restart:
-	read_lock(gl_lock_addr(gi->hash));
-	gl = gi->gl;
-	if (gl) {
-		gi->gl = hlist_entry(gl->gl_list.next,
-				     struct gfs2_glock, gl_list);
-	} else {
-		gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first,
-				     struct gfs2_glock, gl_list);
-	}
-	if (gi->gl)
-		gfs2_glock_hold(gi->gl);
-	read_unlock(gl_lock_addr(gi->hash));
-	if (gl)
-		gfs2_glock_put(gl);
-	while (gi->gl == NULL) {
-		gi->hash++;
-		if (gi->hash >= GFS2_GL_HASH_SIZE)
-			return 1;
-		read_lock(gl_lock_addr(gi->hash));
-		gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first,
-				     struct gfs2_glock, gl_list);
-		if (gi->gl)
-			gfs2_glock_hold(gi->gl);
-		read_unlock(gl_lock_addr(gi->hash));
-	}
-
-	if (gi->sdp != gi->gl->gl_sbd)
-		goto restart;
+	do {
+		gl = gi->gl;
+		if (gl) {
+			gi->gl = glock_hash_next(gl);
+		} else {
+			gi->gl = glock_hash_chain(gi->hash);
+		}
+		while (gi->gl == NULL) {
+			gi->hash++;
+			if (gi->hash >= GFS2_GL_HASH_SIZE) {
+				rcu_read_unlock();
+				return 1;
+			}
+			gi->gl = glock_hash_chain(gi->hash);
+		}
+	/* Skip entries for other sb and dead entries */
+	} while (gi->sdp != gi->gl->gl_sbd || atomic_read(&gi->gl->gl_ref) == 0);
 
 	return 0;
 }
 
-static void gfs2_glock_iter_free(struct gfs2_glock_iter *gi)
-{
-	if (gi->gl)
-		gfs2_glock_put(gi->gl);
-	gi->gl = NULL;
-}
-
 static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos)
 {
 	struct gfs2_glock_iter *gi = seq->private;
 	loff_t n = *pos;
 
 	gi->hash = 0;
+	rcu_read_lock();
 
 	do {
-		if (gfs2_glock_iter_next(gi)) {
-			gfs2_glock_iter_free(gi);
+		if (gfs2_glock_iter_next(gi))
 			return NULL;
-		}
 	} while (n--);
 
 	return gi->gl;
@@ -1870,10 +1773,8 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
 
 	(*pos)++;
 
-	if (gfs2_glock_iter_next(gi)) {
-		gfs2_glock_iter_free(gi);
+	if (gfs2_glock_iter_next(gi))
 		return NULL;
-	}
 
 	return gi->gl;
 }
@@ -1881,7 +1782,10 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
 static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
 {
 	struct gfs2_glock_iter *gi = seq->private;
-	gfs2_glock_iter_free(gi);
+
+	if (gi->gl)
+		rcu_read_unlock();
+	gi->gl = NULL;
 }
 
 static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 691851ceb61..afa8bfea564 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -118,7 +118,7 @@ struct lm_lockops {
 	int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname);
  	void (*lm_unmount) (struct gfs2_sbd *sdp);
 	void (*lm_withdraw) (struct gfs2_sbd *sdp);
-	void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl);
+	void (*lm_put_lock) (struct gfs2_glock *gl);
 	int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state,
 			unsigned int flags);
 	void (*lm_cancel) (struct gfs2_glock *gl);
@@ -174,7 +174,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp,
 		   int create, struct gfs2_glock **glp);
 void gfs2_glock_hold(struct gfs2_glock *gl);
 void gfs2_glock_put_nolock(struct gfs2_glock *gl);
-int gfs2_glock_put(struct gfs2_glock *gl);
+void gfs2_glock_put(struct gfs2_glock *gl);
 void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
 		      struct gfs2_holder *gh);
 void gfs2_holder_reinit(unsigned int state, unsigned flags,
@@ -223,25 +223,22 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
 	return error;
 }
 
-/*  Lock Value Block functions  */
-
-int gfs2_lvb_hold(struct gfs2_glock *gl);
-void gfs2_lvb_unhold(struct gfs2_glock *gl);
-
-void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state);
-void gfs2_glock_complete(struct gfs2_glock *gl, int ret);
-void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
-void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
-void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
-void gfs2_glock_thaw(struct gfs2_sbd *sdp);
-
-int __init gfs2_glock_init(void);
-void gfs2_glock_exit(void);
-
-int gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
-void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
-int gfs2_register_debugfs(void);
-void gfs2_unregister_debugfs(void);
+extern void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state);
+extern void gfs2_glock_complete(struct gfs2_glock *gl, int ret);
+extern void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
+extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
+extern void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
+extern void gfs2_glock_thaw(struct gfs2_sbd *sdp);
+extern void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
+extern void gfs2_glock_free(struct rcu_head *rcu);
+
+extern int __init gfs2_glock_init(void);
+extern void gfs2_glock_exit(void);
+
+extern int gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
+extern void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
+extern int gfs2_register_debugfs(void);
+extern void gfs2_unregister_debugfs(void);
 
 extern const struct lm_lockops gfs2_dlm_ops;
 
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 263561bf1a5..ac5fac948f8 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -206,8 +206,17 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
 static int inode_go_demote_ok(const struct gfs2_glock *gl)
 {
 	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct gfs2_holder *gh;
+
 	if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object)
 		return 0;
+
+	if (!list_empty(&gl->gl_holders)) {
+		gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
+		if (gh->gh_list.next != &gl->gl_holders)
+			return 0;
+	}
+
 	return 1;
 }
 
@@ -271,19 +280,6 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
 	return 0;
 }
 
-/**
- * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
- * @gl: the glock
- *
- * Returns: 1 if it's ok
- */
-
-static int rgrp_go_demote_ok(const struct gfs2_glock *gl)
-{
-	const struct address_space *mapping = (const struct address_space *)(gl + 1);
-	return !mapping->nrpages;
-}
-
 /**
  * rgrp_go_lock - operation done after an rgrp lock is locked by
  *    a first holder on this node.
@@ -410,7 +406,6 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
 const struct gfs2_glock_operations gfs2_rgrp_glops = {
 	.go_xmote_th = rgrp_go_sync,
 	.go_inval = rgrp_go_inval,
-	.go_demote_ok = rgrp_go_demote_ok,
 	.go_lock = rgrp_go_lock,
 	.go_unlock = rgrp_go_unlock,
 	.go_dump = gfs2_rgrp_dump,
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index a79790c0627..720c1e66b34 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -15,6 +15,8 @@
 #include <linux/workqueue.h>
 #include <linux/dlm.h>
 #include <linux/buffer_head.h>
+#include <linux/rcupdate.h>
+#include <linux/rculist_bl.h>
 
 #define DIO_WAIT	0x00000010
 #define DIO_METADATA	0x00000020
@@ -201,7 +203,7 @@ enum {
 };
 
 struct gfs2_glock {
-	struct hlist_node gl_list;
+	struct hlist_bl_node gl_list;
 	unsigned long gl_flags;		/* GLF_... */
 	struct lm_lockname gl_name;
 	atomic_t gl_ref;
@@ -234,6 +236,7 @@ struct gfs2_glock {
 	atomic_t gl_ail_count;
 	struct delayed_work gl_work;
 	struct work_struct gl_delete;
+	struct rcu_head gl_rcu;
 };
 
 #define GFS2_MIN_LVB_SIZE 32	/* Min size of LVB that gfs2 supports */
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 6e493aee28f..c80485cb6f2 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -22,7 +22,6 @@ static void gdlm_ast(void *arg)
 {
 	struct gfs2_glock *gl = arg;
 	unsigned ret = gl->gl_state;
-	struct gfs2_sbd *sdp = gl->gl_sbd;
 
 	BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED);
 
@@ -31,12 +30,7 @@ static void gdlm_ast(void *arg)
 
 	switch (gl->gl_lksb.sb_status) {
 	case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */
-		if (gl->gl_ops->go_flags & GLOF_ASPACE)
-			kmem_cache_free(gfs2_glock_aspace_cachep, gl);
-		else
-			kmem_cache_free(gfs2_glock_cachep, gl);
-		if (atomic_dec_and_test(&sdp->sd_glock_disposal))
-			wake_up(&sdp->sd_glock_wait);
+		call_rcu(&gl->gl_rcu, gfs2_glock_free);
 		return;
 	case -DLM_ECANCEL: /* Cancel while getting lock */
 		ret |= LM_OUT_CANCELED;
@@ -164,16 +158,14 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
 			GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
 }
 
-static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
+static void gdlm_put_lock(struct gfs2_glock *gl)
 {
 	struct gfs2_sbd *sdp = gl->gl_sbd;
 	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
 	int error;
 
 	if (gl->gl_lksb.sb_lkid == 0) {
-		kmem_cache_free(cachep, gl);
-		if (atomic_dec_and_test(&sdp->sd_glock_disposal))
-			wake_up(&sdp->sd_glock_wait);
+		call_rcu(&gl->gl_rcu, gfs2_glock_free);
 		return;
 	}
 
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index bf33f822058..11a73efa826 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -91,7 +91,8 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
 	}
 	bd->bd_ail = ai;
 	list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
-	clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
+	if (test_and_clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags))
+		gfs2_glock_schedule_for_reclaim(bd->bd_gl);
 	trace_gfs2_pin(bd, 0);
 	gfs2_log_unlock(sdp);
 	unlock_buffer(bh);
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index ebef7ab6e17..d850004f208 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -14,6 +14,8 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/gfs2_ondisk.h>
+#include <linux/rcupdate.h>
+#include <linux/rculist_bl.h>
 #include <asm/atomic.h>
 
 #include "gfs2.h"
@@ -45,7 +47,7 @@ static void gfs2_init_glock_once(void *foo)
 {
 	struct gfs2_glock *gl = foo;
 
-	INIT_HLIST_NODE(&gl->gl_list);
+	INIT_HLIST_BL_NODE(&gl->gl_list);
 	spin_lock_init(&gl->gl_spin);
 	INIT_LIST_HEAD(&gl->gl_holders);
 	INIT_LIST_HEAD(&gl->gl_lru);
@@ -198,6 +200,8 @@ static void __exit exit_gfs2_fs(void)
 	unregister_filesystem(&gfs2meta_fs_type);
 	destroy_workqueue(gfs_recovery_wq);
 
+	rcu_barrier();
+
 	kmem_cache_destroy(gfs2_quotad_cachep);
 	kmem_cache_destroy(gfs2_rgrpd_cachep);
 	kmem_cache_destroy(gfs2_bufdata_cachep);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 777927ce6f7..a39c103ba49 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -928,12 +928,9 @@ static const match_table_t nolock_tokens = {
 	{ Opt_err, NULL },
 };
 
-static void nolock_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
+static void nolock_put_lock(struct gfs2_glock *gl)
 {
-	struct gfs2_sbd *sdp = gl->gl_sbd;
-	kmem_cache_free(cachep, gl);
-	if (atomic_dec_and_test(&sdp->sd_glock_disposal))
-		wake_up(&sdp->sd_glock_wait);
+	call_rcu(&gl->gl_rcu, gfs2_glock_free);
 }
 
 static const struct lm_lockops nolock_ops = {
-- 
cgit v1.2.3


From 75d5cfbe4b78cc26af7b042e23f61700b50bc294 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 19 Jan 2011 09:42:40 +0000
Subject: GFS2: Post-VFS scale update for RCU path walk

We can allow a few more cases to use RCU path walking than
originally allowed. It should be possible to also enable
RCU path walking when the glock is already cached. That's
a bit more complicated though, so left for a future patch.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: Nick Piggin <npiggin@gmail.com>
---
 fs/gfs2/acl.c       |  7 +++++--
 fs/gfs2/ops_inode.c | 10 +++++-----
 2 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 7118f1a780a..cbc07155b1a 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -80,8 +80,11 @@ int gfs2_check_acl(struct inode *inode, int mask, unsigned int flags)
 	struct posix_acl *acl;
 	int error;
 
-	if (flags & IPERM_FLAG_RCU)
-		return -ECHILD;
+	if (flags & IPERM_FLAG_RCU) {
+		if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
+			return -ECHILD;
+		return -EAGAIN;
+	}
 
 	acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS);
 	if (IS_ERR(acl))
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index d8b26ac2e20..09e436a5072 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1026,9 +1026,9 @@ static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
 
 /**
  * gfs2_permission -
- * @inode:
- * @mask:
- * @nd: passed from Linux VFS, ignored by us
+ * @inode: The inode
+ * @mask: The mask to be tested
+ * @flags: Indicates whether this is an RCU path walk or not
  *
  * This may be called from the VFS directly, or from within GFS2 with the
  * inode locked, so we look to see if the glock is already locked and only
@@ -1044,11 +1044,11 @@ int gfs2_permission(struct inode *inode, int mask, unsigned int flags)
 	int error;
 	int unlock = 0;
 
-	if (flags & IPERM_FLAG_RCU)
-		return -ECHILD;
 
 	ip = GFS2_I(inode);
 	if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
+		if (flags & IPERM_FLAG_RCU)
+			return -ECHILD;
 		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
 		if (error)
 			return error;
-- 
cgit v1.2.3


From b8d6568a122ab7bd47b151ff9f9a40cebea579c0 Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Fri, 21 Jan 2011 23:21:31 +0800
Subject: ext4: Fix comment typo "especiially".

Change "especiially" to "especially".

Cc: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Tao Ma <boyu.mt@taobao.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 fs/ext4/extents.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c4068f6abf0..6b90b6825d3 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -131,7 +131,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 		 * fragmenting the file system's free space.  Maybe we
 		 * should have some hueristics or some way to allow
 		 * userspace to pass a hint to file system,
-		 * especiially if the latter case turns out to be
+		 * especially if the latter case turns out to be
 		 * common.
 		 */
 		ex = path[depth].p_ext;
-- 
cgit v1.2.3


From 0ca7a5b9ac5d301845dd6382ff25a699b6263a81 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Fri, 21 Jan 2011 16:40:31 +0900
Subject: nilfs2: fix crash after one superblock became unavailable

Fixes the following kernel oops in nilfs_setup_super() which could
arise if one of two super-blocks is unavailable.

> BUG: unable to handle kernel NULL pointer dereference at   (null)
> Pid: 3529, comm: mount.nilfs2 Not tainted 2.6.37 #1 /
> EIP: 0060:[<c03196bc>] EFLAGS: 00010202 CPU: 3
> EIP is at memcpy+0xc/0x1b
> Call Trace:
>  [<f953720e>] ? nilfs_setup_super+0x6c/0xa5 [nilfs2]
>  [<f95369e9>] ? nilfs_get_root_dentry+0x81/0xcb [nilfs2]
>  [<f9537a08>] ? nilfs_mount+0x4f9/0x62c [nilfs2]
>  [<c02745cf>] ? kstrdup+0x36/0x3f
>  [<f953750f>] ? nilfs_mount+0x0/0x62c [nilfs2]
>  [<c0293940>] ? vfs_kern_mount+0x4d/0x12c
>  [<c02a5100>] ? get_fs_type+0x76/0x8f
>  [<c0293a68>] ? do_kern_mount+0x33/0xbf
>  [<c02a784a>] ? do_mount+0x2ed/0x714
>  [<c02a6171>] ? copy_mount_options+0x28/0xfc
>  [<c02a7ce3>] ? sys_mount+0x72/0xaf
>  [<c0473085>] ? syscall_call+0x7/0xb

Reported-by: Wakko Warner <wakko@animx.eu.org>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Tested-by: Wakko Warner <wakko@animx.eu.org>
Cc: stable <stable@kernel.org> [2.6.37, 2.6.36]
LKML-Reference: <20110121024918.GA29598@animx.eu.org>
---
 fs/nilfs2/super.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 0994f6a76c0..58fd707174e 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -704,7 +704,8 @@ skip_mount_setup:
 	sbp[0]->s_state =
 		cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS);
 	/* synchronize sbp[1] with sbp[0] */
-	memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
+	if (sbp[1])
+		memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
 	return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL);
 }
 
-- 
cgit v1.2.3


From ff5fdb61493d95332945630fcae249f896098652 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Sat, 22 Jan 2011 20:16:06 -0800
Subject: fs: fix new dcache.c kernel-doc warnings

Fix new fs/dcache.c kernel-doc warnings:

  Warning(fs/dcache.c:184): No description found for parameter 'dentry'
  Warning(fs/dcache.c:296): No description found for parameter 'parent'
  Warning(fs/dcache.c:1985): No description found for parameter 'dparent'
  Warning(fs/dcache.c:1985): Excess function parameter 'parent' description in 'd_validate'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc:	Alexander Viro <viro@zeniv.linux.org.uk>
Cc:	Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dcache.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 9f493ee4dcb..2a6bd9a4ae9 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -176,6 +176,7 @@ static void d_free(struct dentry *dentry)
 
 /**
  * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups
+ * @dentry: the target dentry
  * After this call, in-progress rcu-walk path lookup will fail. This
  * should be called after unhashing, and after changing d_inode (if
  * the dentry has not already been unhashed).
@@ -281,6 +282,7 @@ static void dentry_lru_move_tail(struct dentry *dentry)
 /**
  * d_kill - kill dentry and return parent
  * @dentry: dentry to kill
+ * @parent: parent dentry
  *
  * The dentry must already be unhashed and removed from the LRU.
  *
@@ -1973,7 +1975,7 @@ out:
 /**
  * d_validate - verify dentry provided from insecure source (deprecated)
  * @dentry: The dentry alleged to be valid child of @dparent
- * @parent: The parent dentry (known to be valid)
+ * @dparent: The parent dentry (known to be valid)
  *
  * An insecure source has sent us a dentry, here we verify it and dget() it.
  * This is used by ncpfs in its readdir implementation.
-- 
cgit v1.2.3


From 821404434f3324bf23f545050ff64055a149766e Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 24 Dec 2010 14:48:35 +0000
Subject: CacheFiles: Add calls to path-based security hooks

Add calls to path-based security hooks into CacheFiles as, unlike inode-based
security, these aren't implicit in the vfs_mkdir() and similar calls.

Reported-by: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/cachefiles/namei.c | 52 +++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 44 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 42c7fafc8bf..a0358c2189c 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -275,6 +275,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
 				  bool preemptive)
 {
 	struct dentry *grave, *trap;
+	struct path path, path_to_graveyard;
 	char nbuffer[8 + 8 + 1];
 	int ret;
 
@@ -287,10 +288,18 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
 	/* non-directories can just be unlinked */
 	if (!S_ISDIR(rep->d_inode->i_mode)) {
 		_debug("unlink stale object");
-		ret = vfs_unlink(dir->d_inode, rep);
 
-		if (preemptive)
-			cachefiles_mark_object_buried(cache, rep);
+		path.mnt = cache->mnt;
+		path.dentry = dir;
+		ret = security_path_unlink(&path, rep);
+		if (ret < 0) {
+			cachefiles_io_error(cache, "Unlink security error");
+		} else {
+			ret = vfs_unlink(dir->d_inode, rep);
+
+			if (preemptive)
+				cachefiles_mark_object_buried(cache, rep);
+		}
 
 		mutex_unlock(&dir->d_inode->i_mutex);
 
@@ -379,12 +388,23 @@ try_again:
 	}
 
 	/* attempt the rename */
-	ret = vfs_rename(dir->d_inode, rep, cache->graveyard->d_inode, grave);
-	if (ret != 0 && ret != -ENOMEM)
-		cachefiles_io_error(cache, "Rename failed with error %d", ret);
+	path.mnt = cache->mnt;
+	path.dentry = dir;
+	path_to_graveyard.mnt = cache->mnt;
+	path_to_graveyard.dentry = cache->graveyard;
+	ret = security_path_rename(&path, rep, &path_to_graveyard, grave);
+	if (ret < 0) {
+		cachefiles_io_error(cache, "Rename security error %d", ret);
+	} else {
+		ret = vfs_rename(dir->d_inode, rep,
+				 cache->graveyard->d_inode, grave);
+		if (ret != 0 && ret != -ENOMEM)
+			cachefiles_io_error(cache,
+					    "Rename failed with error %d", ret);
 
-	if (preemptive)
-		cachefiles_mark_object_buried(cache, rep);
+		if (preemptive)
+			cachefiles_mark_object_buried(cache, rep);
+	}
 
 	unlock_rename(cache->graveyard, dir);
 	dput(grave);
@@ -448,6 +468,7 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent,
 {
 	struct cachefiles_cache *cache;
 	struct dentry *dir, *next = NULL;
+	struct path path;
 	unsigned long start;
 	const char *name;
 	int ret, nlen;
@@ -458,6 +479,7 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent,
 
 	cache = container_of(parent->fscache.cache,
 			     struct cachefiles_cache, cache);
+	path.mnt = cache->mnt;
 
 	ASSERT(parent->dentry);
 	ASSERT(parent->dentry->d_inode);
@@ -511,6 +533,10 @@ lookup_again:
 			if (ret < 0)
 				goto create_error;
 
+			path.dentry = dir;
+			ret = security_path_mkdir(&path, next, 0);
+			if (ret < 0)
+				goto create_error;
 			start = jiffies;
 			ret = vfs_mkdir(dir->d_inode, next, 0);
 			cachefiles_hist(cachefiles_mkdir_histogram, start);
@@ -536,6 +562,10 @@ lookup_again:
 			if (ret < 0)
 				goto create_error;
 
+			path.dentry = dir;
+			ret = security_path_mknod(&path, next, S_IFREG, 0);
+			if (ret < 0)
+				goto create_error;
 			start = jiffies;
 			ret = vfs_create(dir->d_inode, next, S_IFREG, NULL);
 			cachefiles_hist(cachefiles_create_histogram, start);
@@ -692,6 +722,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
 {
 	struct dentry *subdir;
 	unsigned long start;
+	struct path path;
 	int ret;
 
 	_enter(",,%s", dirname);
@@ -719,6 +750,11 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
 
 		_debug("attempt mkdir");
 
+		path.mnt = cache->mnt;
+		path.dentry = dir;
+		ret = security_path_mkdir(&path, subdir, 0700);
+		if (ret < 0)
+			goto mkdir_error;
 		ret = vfs_mkdir(dir->d_inode, subdir, 0700);
 		if (ret < 0)
 			goto mkdir_error;
-- 
cgit v1.2.3


From 3f391c79b0686ce183668c6e2b7d02f3e716766c Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jj@chaosbits.net>
Date: Sat, 22 Jan 2011 21:07:16 +0100
Subject: CIFS: Remove pointless variable assignment in cifs_dfs_do_automount()

In fs/cifs/cifs_dfs_ref.c::cifs_dfs_do_automount() we have this code:

	...
	mnt = ERR_PTR(-EINVAL);
	if (IS_ERR(tlink)) {
		mnt = ERR_CAST(tlink);
		goto free_full_path;
	}
	ses = tlink_tcon(tlink)->ses;

	rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls,
		&num_referrals, &referrals,
		cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);

	cifs_put_tlink(tlink);

	mnt = ERR_PTR(-ENOENT);
	...

The assignment of 'mnt = ERR_PTR(-EINVAL);' is completely pointless. If we
take the 'if (IS_ERR(tlink))' branch we'll set 'mnt' again and we'll also
do so if we do not take the branch. There is no way we'll ever use 'mnt'
with the assigned 'ERR_PTR(-EINVAL)' value, so we may as well just remove
the pointless assignment.

Signed-off-by: Jesper Juhl <jj@chaosbits.net>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_dfs_ref.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 7ed36536e75..f1c68629f27 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -297,7 +297,6 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
 
 	cifs_sb = CIFS_SB(mntpt->d_inode->i_sb);
 	tlink = cifs_sb_tlink(cifs_sb);
-	mnt = ERR_PTR(-EINVAL);
 	if (IS_ERR(tlink)) {
 		mnt = ERR_CAST(tlink);
 		goto free_full_path;
-- 
cgit v1.2.3


From f1d0c998653f1eeec60ee6420e550135b62dbab4 Mon Sep 17 00:00:00 2001
From: Rob Landley <rlandley@parallels.com>
Date: Sat, 22 Jan 2011 15:44:05 -0600
Subject: Make CIFS mount work in a container.

Teach cifs about network namespaces, so mounting uses addresses/routing
visible from the container rather than from init context.

A container is a chroot on steroids that changes more than just the root
filesystem the new processes see.  One thing containers can isolate is
"network namespaces", meaning each container can have its own set of
ethernet interfaces, each with its own IP address and routing to the
outside world.  And if you open a socket in _userspace_ from processes
within such a container, this works fine.

But sockets opened from within the kernel still use a single global
networking context in a lot of places, meaning the new socket's address
and routing are correct for PID 1 on the host, but are _not_ what
userspace processes in the container get to use.

So when you mount a network filesystem from within a container, the
mount code in the CIFS driver uses the host's networking context and not
the container's networking context, so it gets the wrong address, uses
the wrong routing, and may even try to go out an interface that the
container can't even access...  Bad stuff.

This patch copies the mount process's network context into the CIFS
structure that stores the rest of the server information for that mount
point, and changes the socket open code to use the saved network context
instead of the global network context.  I.E. "when you attempt to use
these addresses, do so relative to THIS set of network interfaces and
routing rules, not the old global context from back before we supported
containers".

The big long HOWTO sets up a test environment on the assumption you've
never used containers before.  It basically says:

1) configure and build a new kernel that has container support
2) build a new root filesystem that includes the userspace container
control package (LXC)
3) package/run them under KVM (so you don't have to mess up your host
system in order to play with containers).
4) set up some containers under the KVM system
5) set up contradictory routing in the KVM system and the container so
that the host and the container see different things for the same address
6) try to mount a CIFS share from both contexts so you can both force it
to work and force it to fail.

For a long drawn out test reproduction sequence, see:

  http://landley.livejournal.com/47024.html
  http://landley.livejournal.com/47205.html
  http://landley.livejournal.com/47476.html

Signed-off-by: Rob Landley <rlandley@parallels.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsglob.h | 33 +++++++++++++++++++++++++++++++++
 fs/cifs/connect.c  | 12 ++++++++++--
 2 files changed, 43 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 5bfb75346cb..edd5b29b53c 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -166,6 +166,9 @@ struct TCP_Server_Info {
 	struct socket *ssocket;
 	struct sockaddr_storage dstaddr;
 	struct sockaddr_storage srcaddr; /* locally bind to this IP */
+#ifdef CONFIG_NET_NS
+	struct net *net;
+#endif
 	wait_queue_head_t response_q;
 	wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/
 	struct list_head pending_mid_q;
@@ -216,6 +219,36 @@ struct TCP_Server_Info {
 #endif
 };
 
+/*
+ * Macros to allow the TCP_Server_Info->net field and related code to drop out
+ * when CONFIG_NET_NS isn't set.
+ */
+
+#ifdef CONFIG_NET_NS
+
+static inline struct net *cifs_net_ns(struct TCP_Server_Info *srv)
+{
+	return srv->net;
+}
+
+static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net)
+{
+	srv->net = net;
+}
+
+#else
+
+static inline struct net *cifs_net_ns(struct TCP_Server_Info *srv)
+{
+	return &init_net;
+}
+
+static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net)
+{
+}
+
+#endif
+
 /*
  * Session structure.  One of these for each uid session with a particular host
  */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 18d3c7724d6..0cc3b81c2e8 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1568,6 +1568,9 @@ cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
 
 	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
+		if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns))
+			continue;
+
 		if (!match_address(server, addr,
 				   (struct sockaddr *)&vol->srcaddr))
 			continue;
@@ -1598,6 +1601,8 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
 		return;
 	}
 
+	put_net(cifs_net_ns(server));
+
 	list_del_init(&server->tcp_ses_list);
 	spin_unlock(&cifs_tcp_ses_lock);
 
@@ -1672,6 +1677,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 		goto out_err;
 	}
 
+	cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns));
 	tcp_ses->hostname = extract_hostname(volume_info->UNC);
 	if (IS_ERR(tcp_ses->hostname)) {
 		rc = PTR_ERR(tcp_ses->hostname);
@@ -1752,6 +1758,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 out_err_crypto_release:
 	cifs_crypto_shash_release(tcp_ses);
 
+	put_net(cifs_net_ns(tcp_ses));
+
 out_err:
 	if (tcp_ses) {
 		if (!IS_ERR(tcp_ses->hostname))
@@ -2263,8 +2271,8 @@ generic_ip_connect(struct TCP_Server_Info *server)
 	}
 
 	if (socket == NULL) {
-		rc = sock_create_kern(sfamily, SOCK_STREAM,
-				      IPPROTO_TCP, &socket);
+		rc = __sock_create(cifs_net_ns(server), sfamily, SOCK_STREAM,
+				   IPPROTO_TCP, &socket, 1);
 		if (rc < 0) {
 			cERROR(1, "Error %d creating socket", rc);
 			server->ssocket = NULL;
-- 
cgit v1.2.3


From 944fdef52ca9fc0fe077578f51201ef397e30abe Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Sun, 16 Jan 2011 19:22:02 +0200
Subject: UBIFS: do not start the commit if there is nothing to commit

This patch fixes suboptimal UBIFS 'sync_fs()' implementation which causes
flash I/O even if the file-system is synchronized. E.g., a 'printk()'
in the MTD erasure function (e.g., 'nand_erase_nand()') can show that
for every 'sync' shell command UBIFS erases at least one eraseblock.

So '$ while true; do sync; done' will cause a huge amount of flash I/O.

The reason for this is that UBIFS commits in 'sync_fs()', and starts the
commit even if there is nothing to commit, e.g., it anyway changes the
log. This patch adds a check in the 'do_commit()' UBIFS function which
prevents the commit if there is nothing to commit.

Reported-by: Hans J. Koch <hjk@linutronix.de>
Tested-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/commit.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 57 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 02429d81ca3..b148fbc80f8 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -48,6 +48,56 @@
 #include <linux/slab.h>
 #include "ubifs.h"
 
+/*
+ * nothing_to_commit - check if there is nothing to commit.
+ * @c: UBIFS file-system description object
+ *
+ * This is a helper function which checks if there is anything to commit. It is
+ * used as an optimization to avoid starting the commit if it is not really
+ * necessary. Indeed, the commit operation always assumes flash I/O (e.g.,
+ * writing the commit start node to the log), and it is better to avoid doing
+ * this unnecessarily. E.g., 'ubifs_sync_fs()' runs the commit, but if there is
+ * nothing to commit, it is more optimal to avoid any flash I/O.
+ *
+ * This function has to be called with @c->commit_sem locked for writing -
+ * this function does not take LPT/TNC locks because the @c->commit_sem
+ * guarantees that we have exclusive access to the TNC and LPT data structures.
+ *
+ * This function returns %1 if there is nothing to commit and %0 otherwise.
+ */
+static int nothing_to_commit(struct ubifs_info *c)
+{
+	/*
+	 * During mounting or remounting from R/O mode to R/W mode we may
+	 * commit for various recovery-related reasons.
+	 */
+	if (c->mounting || c->remounting_rw)
+		return 0;
+
+	/*
+	 * If the root TNC node is dirty, we definitely have something to
+	 * commit.
+	 */
+	if (c->zroot.znode && test_bit(DIRTY_ZNODE, &c->zroot.znode->flags))
+		return 0;
+
+	/*
+	 * Even though the TNC is clean, the LPT tree may have dirty nodes. For
+	 * example, this may happen if the budgeting subsystem invoked GC to
+	 * make some free space, and the GC found an LEB with only dirty and
+	 * free space. In this case GC would just change the lprops of this
+	 * LEB (by turning all space into free space) and unmap it.
+	 */
+	if (c->nroot && test_bit(DIRTY_CNODE, &c->nroot->flags))
+		return 0;
+
+	ubifs_assert(atomic_long_read(&c->dirty_zn_cnt) == 0);
+	ubifs_assert(c->dirty_pn_cnt == 0);
+	ubifs_assert(c->dirty_nn_cnt == 0);
+
+	return 1;
+}
+
 /**
  * do_commit - commit the journal.
  * @c: UBIFS file-system description object
@@ -70,6 +120,12 @@ static int do_commit(struct ubifs_info *c)
 		goto out_up;
 	}
 
+	if (nothing_to_commit(c)) {
+		up_write(&c->commit_sem);
+		err = 0;
+		goto out_cancel;
+	}
+
 	/* Sync all write buffers (necessary for recovery) */
 	for (i = 0; i < c->jhead_cnt; i++) {
 		err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
@@ -162,12 +218,12 @@ static int do_commit(struct ubifs_info *c)
 	if (err)
 		goto out;
 
+out_cancel:
 	spin_lock(&c->cs_lock);
 	c->cmt_state = COMMIT_RESTING;
 	wake_up(&c->cmt_wq);
 	dbg_cmt("commit end");
 	spin_unlock(&c->cs_lock);
-
 	return 0;
 
 out_up:
-- 
cgit v1.2.3


From ada609ee2ac2e03bd8abb07f9b3e92cd2e650f19 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 25 Jan 2011 14:35:54 +0100
Subject: workqueue: use WQ_MEM_RECLAIM instead of WQ_RESCUER

WQ_RESCUER is now an internal flag and should only be used in the
workqueue implementation proper.  Use WQ_MEM_RECLAIM instead.

This doesn't introduce any functional difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: dm-devel@redhat.com
Cc: Neil Brown <neilb@suse.de>
---
 fs/nfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index d8512423ba7..0855acdfe70 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1505,7 +1505,7 @@ static int nfsiod_start(void)
 {
 	struct workqueue_struct *wq;
 	dprintk("RPC:       creating workqueue nfsiod\n");
-	wq = alloc_workqueue("nfsiod", WQ_RESCUER, 0);
+	wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM, 0);
 	if (wq == NULL)
 		return -ENOMEM;
 	nfsiod_workqueue = wq;
-- 
cgit v1.2.3


From d66bbd441c08fe00ed2add1cf70cb243ebc2b27e Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 21 Jan 2011 21:16:46 -0800
Subject: ceph: avoid picking MDS that is not active

Ignore replication or auth frag data if it indicates an MDS that is not
active.  This can happen if the MDS shuts down and the client has stale
data about the namespace distribution across the MDS cluster.  If that's
the case, fall back to directing the request based on the auth cap (which
should always be accurate).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 509339ceef7..a6949cc7c69 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -693,9 +693,11 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
 				dout("choose_mds %p %llx.%llx "
 				     "frag %u mds%d (%d/%d)\n",
 				     inode, ceph_vinop(inode),
-				     frag.frag, frag.mds,
+				     frag.frag, mds,
 				     (int)r, frag.ndist);
-				return mds;
+				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
+				    CEPH_MDS_STATE_ACTIVE)
+					return mds;
 			}
 
 			/* since this file/dir wasn't known to be
@@ -708,7 +710,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
 				dout("choose_mds %p %llx.%llx "
 				     "frag %u mds%d (auth)\n",
 				     inode, ceph_vinop(inode), frag.frag, mds);
-				return mds;
+				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
+				    CEPH_MDS_STATE_ACTIVE)
+					return mds;
 			}
 		}
 	}
-- 
cgit v1.2.3


From 93c100c0b423266c0ee28497e90fdf27c05e6b8e Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 25 Jan 2011 19:28:43 +0000
Subject: [CIFS] Replace cifs md5 hashing functions with kernel crypto APIs

Replace remaining use of md5 hash functions local to cifs module
with kernel crypto APIs.
Remove header and source file containing those local functions.

Signed-off-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/Makefile      |   2 +-
 fs/cifs/cifsencrypt.c |   1 -
 fs/cifs/link.c        |  59 ++++++--
 fs/cifs/md5.c         | 366 --------------------------------------------------
 fs/cifs/md5.h         |  38 ------
 fs/cifs/smbencrypt.c  |   1 -
 6 files changed, 51 insertions(+), 416 deletions(-)
 delete mode 100644 fs/cifs/md5.c
 delete mode 100644 fs/cifs/md5.h

(limited to 'fs')

diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 43b19dd3919..e1322296cb6 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -5,7 +5,7 @@ obj-$(CONFIG_CIFS) += cifs.o
 
 cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
 	  link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \
-	  md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
+	  md4.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
 	  readdir.o ioctl.o sess.o export.o
 
 cifs-$(CONFIG_CIFS_ACL) += cifsacl.o
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 66f3d50d067..35bf329c90e 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -24,7 +24,6 @@
 #include "cifspdu.h"
 #include "cifsglob.h"
 #include "cifs_debug.h"
-#include "md5.h"
 #include "cifs_unicode.h"
 #include "cifsproto.h"
 #include "ntlmssp.h"
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 306769de2fb..d3444ea6ac7 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -28,7 +28,6 @@
 #include "cifsproto.h"
 #include "cifs_debug.h"
 #include "cifs_fs_sb.h"
-#include "md5.h"
 
 #define CIFS_MF_SYMLINK_LEN_OFFSET (4+1)
 #define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1))
@@ -46,6 +45,45 @@
 	md5_hash[8],  md5_hash[9],  md5_hash[10], md5_hash[11],\
 	md5_hash[12], md5_hash[13], md5_hash[14], md5_hash[15]
 
+static int
+symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
+{
+	int rc;
+	unsigned int size;
+	struct crypto_shash *md5;
+	struct sdesc *sdescmd5;
+
+	md5 = crypto_alloc_shash("md5", 0, 0);
+	if (!md5 || IS_ERR(md5)) {
+		rc = PTR_ERR(md5);
+		cERROR(1, "%s: Crypto md5 allocation error %d\n", __func__, rc);
+		return rc;
+	}
+	size = sizeof(struct shash_desc) + crypto_shash_descsize(md5);
+	sdescmd5 = kmalloc(size, GFP_KERNEL);
+	if (!sdescmd5) {
+		rc = -ENOMEM;
+		cERROR(1, "%s: Memory allocation failure\n", __func__);
+		goto symlink_hash_err;
+	}
+	sdescmd5->shash.tfm = md5;
+	sdescmd5->shash.flags = 0x0;
+
+	rc = crypto_shash_init(&sdescmd5->shash);
+	if (rc) {
+		cERROR(1, "%s: Could not init md5 shash\n", __func__);
+		goto symlink_hash_err;
+	}
+	crypto_shash_update(&sdescmd5->shash, link_str, link_len);
+	rc = crypto_shash_final(&sdescmd5->shash, md5_hash);
+
+symlink_hash_err:
+	crypto_free_shash(md5);
+	kfree(sdescmd5);
+
+	return rc;
+}
+
 static int
 CIFSParseMFSymlink(const u8 *buf,
 		   unsigned int buf_len,
@@ -56,7 +94,6 @@ CIFSParseMFSymlink(const u8 *buf,
 	unsigned int link_len;
 	const char *md5_str1;
 	const char *link_str;
-	struct MD5Context md5_ctx;
 	u8 md5_hash[16];
 	char md5_str2[34];
 
@@ -70,9 +107,11 @@ CIFSParseMFSymlink(const u8 *buf,
 	if (rc != 1)
 		return -EINVAL;
 
-	cifs_MD5_init(&md5_ctx);
-	cifs_MD5_update(&md5_ctx, (const u8 *)link_str, link_len);
-	cifs_MD5_final(md5_hash, &md5_ctx);
+	rc = symlink_hash(link_len, link_str, md5_hash);
+	if (rc) {
+		cFYI(1, "%s: MD5 hash failure: %d\n", __func__, rc);
+		return rc;
+	}
 
 	snprintf(md5_str2, sizeof(md5_str2),
 		 CIFS_MF_SYMLINK_MD5_FORMAT,
@@ -94,9 +133,9 @@ CIFSParseMFSymlink(const u8 *buf,
 static int
 CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
 {
+	int rc;
 	unsigned int link_len;
 	unsigned int ofs;
-	struct MD5Context md5_ctx;
 	u8 md5_hash[16];
 
 	if (buf_len != CIFS_MF_SYMLINK_FILE_SIZE)
@@ -107,9 +146,11 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
 	if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN)
 		return -ENAMETOOLONG;
 
-	cifs_MD5_init(&md5_ctx);
-	cifs_MD5_update(&md5_ctx, (const u8 *)link_str, link_len);
-	cifs_MD5_final(md5_hash, &md5_ctx);
+	rc = symlink_hash(link_len, link_str, md5_hash);
+	if (rc) {
+		cFYI(1, "%s: MD5 hash failure: %d\n", __func__, rc);
+		return rc;
+	}
 
 	snprintf(buf, buf_len,
 		 CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT,
diff --git a/fs/cifs/md5.c b/fs/cifs/md5.c
deleted file mode 100644
index 98b66a54c31..00000000000
--- a/fs/cifs/md5.c
+++ /dev/null
@@ -1,366 +0,0 @@
-/*
- * This code implements the MD5 message-digest algorithm.
- * The algorithm is due to Ron Rivest.  This code was
- * written by Colin Plumb in 1993, no copyright is claimed.
- * This code is in the public domain; do with it what you wish.
- *
- * Equivalent code is available from RSA Data Security, Inc.
- * This code has been tested against that, and is equivalent,
- * except that you don't need to include two pages of legalese
- * with every copy.
- *
- * To compute the message digest of a chunk of bytes, declare an
- * MD5Context structure, pass it to cifs_MD5_init, call cifs_MD5_update as
- * needed on buffers full of bytes, and then call cifs_MD5_final, which
- * will fill a supplied 16-byte array with the digest.
- */
-
-/* This code slightly modified to fit into Samba by
-   abartlet@samba.org Jun 2001
-   and to fit the cifs vfs by
-   Steve French sfrench@us.ibm.com */
-
-#include <linux/string.h>
-#include "md5.h"
-
-static void MD5Transform(__u32 buf[4], __u32 const in[16]);
-
-/*
- * Note: this code is harmless on little-endian machines.
- */
-static void
-byteReverse(unsigned char *buf, unsigned longs)
-{
-	__u32 t;
-	do {
-		t = (__u32) ((unsigned) buf[3] << 8 | buf[2]) << 16 |
-		    ((unsigned) buf[1] << 8 | buf[0]);
-		*(__u32 *) buf = t;
-		buf += 4;
-	} while (--longs);
-}
-
-/*
- * Start MD5 accumulation.  Set bit count to 0 and buffer to mysterious
- * initialization constants.
- */
-void
-cifs_MD5_init(struct MD5Context *ctx)
-{
-	ctx->buf[0] = 0x67452301;
-	ctx->buf[1] = 0xefcdab89;
-	ctx->buf[2] = 0x98badcfe;
-	ctx->buf[3] = 0x10325476;
-
-	ctx->bits[0] = 0;
-	ctx->bits[1] = 0;
-}
-
-/*
- * Update context to reflect the concatenation of another buffer full
- * of bytes.
- */
-void
-cifs_MD5_update(struct MD5Context *ctx, unsigned char const *buf, unsigned len)
-{
-	register __u32 t;
-
-	/* Update bitcount */
-
-	t = ctx->bits[0];
-	if ((ctx->bits[0] = t + ((__u32) len << 3)) < t)
-		ctx->bits[1]++;	/* Carry from low to high */
-	ctx->bits[1] += len >> 29;
-
-	t = (t >> 3) & 0x3f;	/* Bytes already in shsInfo->data */
-
-	/* Handle any leading odd-sized chunks */
-
-	if (t) {
-		unsigned char *p = (unsigned char *) ctx->in + t;
-
-		t = 64 - t;
-		if (len < t) {
-			memmove(p, buf, len);
-			return;
-		}
-		memmove(p, buf, t);
-		byteReverse(ctx->in, 16);
-		MD5Transform(ctx->buf, (__u32 *) ctx->in);
-		buf += t;
-		len -= t;
-	}
-	/* Process data in 64-byte chunks */
-
-	while (len >= 64) {
-		memmove(ctx->in, buf, 64);
-		byteReverse(ctx->in, 16);
-		MD5Transform(ctx->buf, (__u32 *) ctx->in);
-		buf += 64;
-		len -= 64;
-	}
-
-	/* Handle any remaining bytes of data. */
-
-	memmove(ctx->in, buf, len);
-}
-
-/*
- * Final wrapup - pad to 64-byte boundary with the bit pattern
- * 1 0* (64-bit count of bits processed, MSB-first)
- */
-void
-cifs_MD5_final(unsigned char digest[16], struct MD5Context *ctx)
-{
-	unsigned int count;
-	unsigned char *p;
-
-	/* Compute number of bytes mod 64 */
-	count = (ctx->bits[0] >> 3) & 0x3F;
-
-	/* Set the first char of padding to 0x80.  This is safe since there is
-	   always at least one byte free */
-	p = ctx->in + count;
-	*p++ = 0x80;
-
-	/* Bytes of padding needed to make 64 bytes */
-	count = 64 - 1 - count;
-
-	/* Pad out to 56 mod 64 */
-	if (count < 8) {
-		/* Two lots of padding:  Pad the first block to 64 bytes */
-		memset(p, 0, count);
-		byteReverse(ctx->in, 16);
-		MD5Transform(ctx->buf, (__u32 *) ctx->in);
-
-		/* Now fill the next block with 56 bytes */
-		memset(ctx->in, 0, 56);
-	} else {
-		/* Pad block to 56 bytes */
-		memset(p, 0, count - 8);
-	}
-	byteReverse(ctx->in, 14);
-
-	/* Append length in bits and transform */
-	((__u32 *) ctx->in)[14] = ctx->bits[0];
-	((__u32 *) ctx->in)[15] = ctx->bits[1];
-
-	MD5Transform(ctx->buf, (__u32 *) ctx->in);
-	byteReverse((unsigned char *) ctx->buf, 4);
-	memmove(digest, ctx->buf, 16);
-	memset(ctx, 0, sizeof(*ctx));	/* In case it's sensitive */
-}
-
-/* The four core functions - F1 is optimized somewhat */
-
-/* #define F1(x, y, z) (x & y | ~x & z) */
-#define F1(x, y, z) (z ^ (x & (y ^ z)))
-#define F2(x, y, z) F1(z, x, y)
-#define F3(x, y, z) (x ^ y ^ z)
-#define F4(x, y, z) (y ^ (x | ~z))
-
-/* This is the central step in the MD5 algorithm. */
-#define MD5STEP(f, w, x, y, z, data, s) \
-	(w += f(x, y, z) + data,  w = w<<s | w>>(32-s),  w += x)
-
-/*
- * The core of the MD5 algorithm, this alters an existing MD5 hash to
- * reflect the addition of 16 longwords of new data.  cifs_MD5_update blocks
- * the data and converts bytes into longwords for this routine.
- */
-static void
-MD5Transform(__u32 buf[4], __u32 const in[16])
-{
-	register __u32 a, b, c, d;
-
-	a = buf[0];
-	b = buf[1];
-	c = buf[2];
-	d = buf[3];
-
-	MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
-	MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
-	MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
-	MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
-	MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
-	MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
-	MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
-	MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
-	MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
-	MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
-	MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
-	MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
-	MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
-	MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
-	MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
-	MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
-
-	MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
-	MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
-	MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
-	MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
-	MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
-	MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
-	MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
-	MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
-	MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
-	MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
-	MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
-	MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
-	MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
-	MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
-	MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
-	MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
-
-	MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
-	MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
-	MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
-	MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
-	MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
-	MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
-	MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
-	MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
-	MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
-	MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
-	MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
-	MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
-	MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
-	MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
-	MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
-	MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
-
-	MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
-	MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
-	MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
-	MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
-	MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
-	MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
-	MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
-	MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
-	MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
-	MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
-	MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
-	MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
-	MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
-	MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
-	MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
-	MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
-
-	buf[0] += a;
-	buf[1] += b;
-	buf[2] += c;
-	buf[3] += d;
-}
-
-#if 0   /* currently unused */
-/***********************************************************************
- the rfc 2104 version of hmac_md5 initialisation.
-***********************************************************************/
-static void
-hmac_md5_init_rfc2104(unsigned char *key, int key_len,
-		      struct HMACMD5Context *ctx)
-{
-	int i;
-
-	/* if key is longer than 64 bytes reset it to key=MD5(key) */
-	if (key_len > 64) {
-		unsigned char tk[16];
-		struct MD5Context tctx;
-
-		cifs_MD5_init(&tctx);
-		cifs_MD5_update(&tctx, key, key_len);
-		cifs_MD5_final(tk, &tctx);
-
-		key = tk;
-		key_len = 16;
-	}
-
-	/* start out by storing key in pads */
-	memset(ctx->k_ipad, 0, sizeof(ctx->k_ipad));
-	memset(ctx->k_opad, 0, sizeof(ctx->k_opad));
-	memcpy(ctx->k_ipad, key, key_len);
-	memcpy(ctx->k_opad, key, key_len);
-
-	/* XOR key with ipad and opad values */
-	for (i = 0; i < 64; i++) {
-		ctx->k_ipad[i] ^= 0x36;
-		ctx->k_opad[i] ^= 0x5c;
-	}
-
-	cifs_MD5_init(&ctx->ctx);
-	cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64);
-}
-#endif
-
-/***********************************************************************
- the microsoft version of hmac_md5 initialisation.
-***********************************************************************/
-void
-hmac_md5_init_limK_to_64(const unsigned char *key, int key_len,
-			 struct HMACMD5Context *ctx)
-{
-	int i;
-
-	/* if key is longer than 64 bytes truncate it */
-	if (key_len > 64)
-		key_len = 64;
-
-	/* start out by storing key in pads */
-	memset(ctx->k_ipad, 0, sizeof(ctx->k_ipad));
-	memset(ctx->k_opad, 0, sizeof(ctx->k_opad));
-	memcpy(ctx->k_ipad, key, key_len);
-	memcpy(ctx->k_opad, key, key_len);
-
-	/* XOR key with ipad and opad values */
-	for (i = 0; i < 64; i++) {
-		ctx->k_ipad[i] ^= 0x36;
-		ctx->k_opad[i] ^= 0x5c;
-	}
-
-	cifs_MD5_init(&ctx->ctx);
-	cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64);
-}
-
-/***********************************************************************
- update hmac_md5 "inner" buffer
-***********************************************************************/
-void
-hmac_md5_update(const unsigned char *text, int text_len,
-		struct HMACMD5Context *ctx)
-{
-	cifs_MD5_update(&ctx->ctx, text, text_len);	/* then text of datagram */
-}
-
-/***********************************************************************
- finish off hmac_md5 "inner" buffer and generate outer one.
-***********************************************************************/
-void
-hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx)
-{
-	struct MD5Context ctx_o;
-
-	cifs_MD5_final(digest, &ctx->ctx);
-
-	cifs_MD5_init(&ctx_o);
-	cifs_MD5_update(&ctx_o, ctx->k_opad, 64);
-	cifs_MD5_update(&ctx_o, digest, 16);
-	cifs_MD5_final(digest, &ctx_o);
-}
-
-/***********************************************************
- single function to calculate an HMAC MD5 digest from data.
- use the microsoft hmacmd5 init method because the key is 16 bytes.
-************************************************************/
-#if 0 /* currently unused */
-static void
-hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
-	 unsigned char *digest)
-{
-	struct HMACMD5Context ctx;
-	hmac_md5_init_limK_to_64(key, 16, &ctx);
-	if (data_len != 0)
-		hmac_md5_update(data, data_len, &ctx);
-
-	hmac_md5_final(digest, &ctx);
-}
-#endif
diff --git a/fs/cifs/md5.h b/fs/cifs/md5.h
deleted file mode 100644
index 6fba8cb402f..00000000000
--- a/fs/cifs/md5.h
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef MD5_H
-#define MD5_H
-#ifndef HEADER_MD5_H
-/* Try to avoid clashes with OpenSSL */
-#define HEADER_MD5_H
-#endif
-
-struct MD5Context {
-	__u32 buf[4];
-	__u32 bits[2];
-	unsigned char in[64];
-};
-#endif				/* !MD5_H */
-
-#ifndef _HMAC_MD5_H
-struct HMACMD5Context {
-	struct MD5Context ctx;
-	unsigned char k_ipad[65];
-	unsigned char k_opad[65];
-};
-#endif				/* _HMAC_MD5_H */
-
-void cifs_MD5_init(struct MD5Context *context);
-void cifs_MD5_update(struct MD5Context *context, unsigned char const *buf,
-			unsigned len);
-void cifs_MD5_final(unsigned char digest[16], struct MD5Context *context);
-
-/* The following definitions come from lib/hmacmd5.c  */
-
-/* void hmac_md5_init_rfc2104(unsigned char *key, int key_len,
-			struct HMACMD5Context *ctx);*/
-void hmac_md5_init_limK_to_64(const unsigned char *key, int key_len,
-			struct HMACMD5Context *ctx);
-void hmac_md5_update(const unsigned char *text, int text_len,
-			struct HMACMD5Context *ctx);
-void hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx);
-/* void hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
-			unsigned char *digest);*/
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 192ea51af20..30135005e4f 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -32,7 +32,6 @@
 #include "cifs_unicode.h"
 #include "cifspdu.h"
 #include "cifsglob.h"
-#include "md5.h"
 #include "cifs_debug.h"
 #include "cifsencrypt.h"
 
-- 
cgit v1.2.3


From 72432ffcf555decbbae47f1be338e1d2f210aa69 Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <piastryyy@gmail.com>
Date: Mon, 24 Jan 2011 14:16:35 -0500
Subject: CIFS: Implement cifs_strict_writev (try #4)

If we don't hold an Exclusive oplock, we write the data to the server.
Also set the invalid_mapping flag on the inode if we wrote something
to the server. Add cifs_iovec_write to let the client write iovec
buffers through CIFSSMBWrite2.

Signed-off-by: Pavel Shilovsky <piastryyy@gmail.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c    |  15 ++--
 fs/cifs/cifsfs.h    |   4 +-
 fs/cifs/cifsproto.h |   2 +
 fs/cifs/file.c      | 202 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 217 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index a8323f1dc1c..f2970136d17 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -600,10 +600,17 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 {
 	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
 	ssize_t written;
+	int rc;
 
 	written = generic_file_aio_write(iocb, iov, nr_segs, pos);
-	if (!CIFS_I(inode)->clientCanCacheAll)
-		filemap_fdatawrite(inode->i_mapping);
+
+	if (CIFS_I(inode)->clientCanCacheAll)
+		return written;
+
+	rc = filemap_fdatawrite(inode->i_mapping);
+	if (rc)
+		cFYI(1, "cifs_file_aio_write: %d rc on %p inode", rc, inode);
+
 	return written;
 }
 
@@ -737,7 +744,7 @@ const struct file_operations cifs_file_strict_ops = {
 	.read = do_sync_read,
 	.write = do_sync_write,
 	.aio_read = cifs_strict_readv,
-	.aio_write = cifs_file_aio_write,
+	.aio_write = cifs_strict_writev,
 	.open = cifs_open,
 	.release = cifs_close,
 	.lock = cifs_lock,
@@ -793,7 +800,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
 	.read = do_sync_read,
 	.write = do_sync_write,
 	.aio_read = cifs_strict_readv,
-	.aio_write = cifs_file_aio_write,
+	.aio_write = cifs_strict_writev,
 	.open = cifs_open,
 	.release = cifs_close,
 	.fsync = cifs_strict_fsync,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index f23206d4653..14789a97304 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -85,7 +85,9 @@ extern ssize_t cifs_user_read(struct file *file, char __user *read_data,
 extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
 				 unsigned long nr_segs, loff_t pos);
 extern ssize_t cifs_user_write(struct file *file, const char __user *write_data,
-			 size_t write_size, loff_t *poffset);
+			       size_t write_size, loff_t *poffset);
+extern ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
+				  unsigned long nr_segs, loff_t pos);
 extern int cifs_lock(struct file *, int, struct file_lock *);
 extern int cifs_fsync(struct file *, int);
 extern int cifs_strict_fsync(struct file *, int);
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 982895fa761..35c989f4924 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -85,6 +85,8 @@ extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length);
 extern bool is_valid_oplock_break(struct smb_hdr *smb,
 				  struct TCP_Server_Info *);
 extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
+extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
+			    unsigned int bytes_written);
 extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
 extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
 extern unsigned int smbCalcSize(struct smb_hdr *ptr);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index d7d65a70678..0de17c1db60 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -848,7 +848,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 }
 
 /* update the file size (if needed) after a write */
-static void
+void
 cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
 		      unsigned int bytes_written)
 {
@@ -1619,6 +1619,206 @@ int cifs_flush(struct file *file, fl_owner_t id)
 	return rc;
 }
 
+/*
+ * Allocate num_pages pages and store them in pages[0..num_pages-1].
+ * Returns 0 on success. If an allocation fails, every page allocated so
+ * far is released again and -ENOMEM is returned.
+ */
+static int
+cifs_write_allocate_pages(struct page **pages, unsigned long num_pages)
+{
+	unsigned long i;
+
+	for (i = 0; i < num_pages; i++) {
+		/*
+		 * GFP_KERNEL lets the allocator sleep and reclaim memory
+		 */
+		pages[i] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+		if (pages[i])
+			continue;
+
+		/* undo the allocations that succeeded before this one */
+		while (i--)
+			put_page(pages[i]);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Work out how many pages the next write chunk occupies. The chunk is
+ * min(len, wsize) bytes long; that length is stored in *cur_len when
+ * cur_len is not NULL.
+ */
+static inline
+size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len)
+{
+	const size_t chunk = min_t(const size_t, len, wsize);
+	const size_t partial = chunk % PAGE_CACHE_SIZE ? 1 : 0;
+
+	if (cur_len)
+		*cur_len = chunk;
+
+	return chunk / PAGE_CACHE_SIZE + partial;
+}
+
+/*
+ * Write the iovec straight to the server in chunks of at most wsize bytes.
+ * Returns the bytes written, or a negative errno if nothing was written.
+ */
+static ssize_t
+cifs_iovec_write(struct file *file, const struct iovec *iov,
+		 unsigned long nr_segs, loff_t *poffset)
+{
+	ssize_t total_written = 0;	/* signed: may hold an errno */
+	unsigned long num_pages, npages;
+	size_t copied, len, cur_len, i;
+	struct kvec *to_send;
+	struct page **pages;
+	struct iov_iter it;
+	struct inode *inode;
+	struct cifsFileInfo *open_file;
+	struct cifsTconInfo *pTcon;
+	struct cifs_sb_info *cifs_sb;
+	int xid, rc;
+
+	len = iov_length(iov, nr_segs);
+	if (!len)
+		return 0;
+
+	rc = generic_write_checks(file, poffset, &len, 0);
+	if (rc)
+		return rc;
+
+	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+	num_pages = get_numpages(cifs_sb->wsize, len, &cur_len);
+
+	/* one kvec more than pages: the data is placed from to_send[1] on */
+	pages = kmalloc(sizeof(struct page *)*num_pages, GFP_KERNEL);
+	to_send = kmalloc(sizeof(struct kvec)*(num_pages + 1), GFP_KERNEL);
+	rc = (pages && to_send) ?
+		cifs_write_allocate_pages(pages, num_pages) : -ENOMEM;
+	if (rc) {
+		kfree(pages);	/* kfree(NULL) is a no-op */
+		kfree(to_send);
+		return rc;
+	}
+
+	xid = GetXid();
+	open_file = file->private_data;
+	pTcon = tlink_tcon(open_file->tlink);
+	inode = file->f_path.dentry->d_inode;
+
+	iov_iter_init(&it, iov, nr_segs, len, 0);
+	npages = num_pages;
+
+	do {
+		size_t save_len = cur_len;
+		unsigned int written = 0;	/* reset for every chunk */
+		for (i = 0; i < npages; i++) {
+			copied = min_t(const size_t, cur_len, PAGE_CACHE_SIZE);
+			copied = iov_iter_copy_from_user(pages[i], &it, 0,
+							 copied);
+			cur_len -= copied;
+			iov_iter_advance(&it, copied);
+			to_send[i+1].iov_base = kmap(pages[i]);
+			to_send[i+1].iov_len = copied;
+		}
+
+		cur_len = save_len - cur_len;
+
+		do {
+			if (open_file->invalidHandle) {
+				rc = cifs_reopen_file(open_file, false);
+				if (rc != 0)
+					break;
+			}
+			rc = CIFSSMBWrite2(xid, pTcon, open_file->netfid,
+					   cur_len, *poffset, &written,
+					   to_send, npages, 0);
+		} while (rc == -EAGAIN);
+
+		for (i = 0; i < npages; i++)
+			kunmap(pages[i]);
+
+		if (written) {
+			len -= written;
+			total_written += written;
+			cifs_update_eof(CIFS_I(inode), *poffset, written);
+			*poffset += written;
+		} else if (rc < 0) {
+			if (!total_written)
+				total_written = rc;
+			break;
+		}
+
+		/* get length and number of kvecs of the next write */
+		npages = get_numpages(cifs_sb->wsize, len, &cur_len);
+	} while (len > 0);
+
+	if (total_written > 0) {
+		spin_lock(&inode->i_lock);
+		if (*poffset > inode->i_size)
+			i_size_write(inode, *poffset);
+		spin_unlock(&inode->i_lock);
+		/* only a real byte count reaches the stats, never an errno */
+		cifs_stats_bytes_written(pTcon, total_written);
+	}
+
+	mark_inode_dirty_sync(inode);
+
+	for (i = 0; i < num_pages; i++)
+		put_page(pages[i]);
+	kfree(to_send);
+	kfree(pages);
+	FreeXid(xid);
+	return total_written;
+}
+
+/* Write the iovec through cifs_iovec_write and advance ki_pos on success. */
+static ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
+				unsigned long nr_segs, loff_t pos)
+{
+	struct file *filp = iocb->ki_filp;
+	ssize_t nbytes;
+
+	/*
+	 * BB - when signing is disabled the extra memory-to-memory copy
+	 * could be dropped and the write request built directly from the
+	 * iovec buffers.
+	 */
+	nbytes = cifs_iovec_write(filp, iov, nr_segs, &pos);
+	if (nbytes <= 0)
+		return nbytes;
+
+	/* something reached the server: flag the inode's mapping invalid */
+	CIFS_I(filp->f_path.dentry->d_inode)->invalid_mapping = true;
+	iocb->ki_pos = pos;
+
+	return nbytes;
+}
+
+/* .aio_write for the strict cache file operations. */
+ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
+			   unsigned long nr_segs, loff_t pos)
+{
+	struct cifsInodeInfo *cinode =
+		CIFS_I(iocb->ki_filp->f_path.dentry->d_inode);
+
+	/*
+	 * In strict cache mode we need to write the data to the server
+	 * exactly from pos to pos+len-1 rather than flush all affected
+	 * pages, because flushing may cause an error with mandatory locks
+	 * on these pages but not on the region from pos to pos+len-1.
+	 */
+	if (!cinode->clientCanCacheAll)
+		return cifs_user_writev(iocb, iov, nr_segs, pos);
+
+	/* clientCanCacheAll is set: use the generic write path */
+	return generic_file_aio_write(iocb, iov, nr_segs, pos);
+}
+
 static ssize_t
 cifs_iovec_read(struct file *file, const struct iovec *iov,
 		 unsigned long nr_segs, loff_t *poffset)
-- 
cgit v1.2.3


From d39454ffe4a3c85428483b8a8a8e5e797b6363d5 Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <piastryyy@gmail.com>
Date: Mon, 24 Jan 2011 14:16:35 -0500
Subject: CIFS: Add strictcache mount option

Use for switching on strict cache mode. In this mode the
client reads from the cache as long as it holds an Oplock Level II;
otherwise it reads from the server. As for writes, the client stores
the data in the cache if it holds an Exclusive Oplock; otherwise it
writes directly to the server.

Signed-off-by: Pavel Shilovsky <piastryyy@gmail.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/README    | 5 +++++
 fs/cifs/connect.c | 5 +++++
 2 files changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/README b/fs/cifs/README
index 46af99ab361..fe168359082 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -452,6 +452,11 @@ A partial list of the supported mount options follows:
 		if oplock (caching token) is granted and held. Note that
 		direct allows write operations larger than page size
 		to be sent to the server.
+  strictcache   Use for switching on strict cache mode. In this mode the
+		client reads from the cache as long as it holds an Oplock
+		Level II; otherwise it reads from the server. All written data
+		is stored in the cache, but if the client doesn't hold an
+		Exclusive Oplock, it writes the data to the server.
   acl   	Allow setfacl and getfacl to manage posix ACLs if server
 		supports them.  (default)
   noacl 	Do not allow setfacl and getfacl calls on this mount
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 0cc3b81c2e8..47034af67b0 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -87,6 +87,7 @@ struct smb_vol {
 	bool no_xattr:1;   /* set if xattr (EA) support should be disabled*/
 	bool server_ino:1; /* use inode numbers from server ie UniqueId */
 	bool direct_io:1;
+	bool strict_io:1; /* strict cache behavior */
 	bool remap:1;      /* set to remap seven reserved chars in filenames */
 	bool posix_paths:1; /* unset to not ask for posix pathnames. */
 	bool no_linux_ext:1;
@@ -1344,6 +1345,8 @@ cifs_parse_mount_options(char *options, const char *devname,
 			vol->direct_io = 1;
 		} else if (strnicmp(data, "forcedirectio", 13) == 0) {
 			vol->direct_io = 1;
+		} else if (strnicmp(data, "strictcache", 11) == 0) {
+			vol->strict_io = 1;
 		} else if (strnicmp(data, "noac", 4) == 0) {
 			printk(KERN_WARNING "CIFS: Mount option noac not "
 				"supported. Instead set "
@@ -2584,6 +2587,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 	if (pvolume_info->multiuser)
 		cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_MULTIUSER |
 					    CIFS_MOUNT_NO_PERM);
+	if (pvolume_info->strict_io)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_STRICT_IO;
 	if (pvolume_info->direct_io) {
 		cFYI(1, "mounting share using direct i/o");
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
-- 
cgit v1.2.3


From ad3d2eedf0ed3611f5f86b9e4d0d15cc76c63465 Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jj@chaosbits.net>
Date: Mon, 17 Jan 2011 18:41:50 +0000
Subject: NFS4: Avoid potential NULL pointer dereference in
 decode_and_add_ds().

On Mon, 17 Jan 2011, Mi Jinlong wrote:

>
>
> Jesper Juhl:
> > strrchr() can return NULL if nothing is found. If this happens we'll
> > dereference a NULL pointer in
> > fs/nfs/nfs4filelayoutdev.c::decode_and_add_ds().
> >
> > I tried to find some other code that guarantees that this can never
> > happen but I was unsuccessful. So, unless someone else can point to some
> > code that ensures this can never be a problem, I believe this patch is
> > needed.
> >
> > While I was changing this code I also noticed that all the dprintk()
> > statements, except one, start with "%s:". The one missing the ":" I added
> > it to.
>
>   Maybe another one also should be changed at decode_and_add_ds() at line 243:
>
>    243  printk("%s Decoded address and port %s\n", __func__, buf);
>
Missed that one. Thanks.

Signed-off-by: Jesper Juhl <jj@chaosbits.net>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4filelayoutdev.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 51fe64ace55..f5c9b125e8c 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -214,7 +214,7 @@ decode_and_add_ds(__be32 **pp, struct inode *inode)
 
 	/* ipv6 length plus port is legal */
 	if (rlen > INET6_ADDRSTRLEN + 8) {
-		dprintk("%s Invalid address, length %d\n", __func__,
+		dprintk("%s: Invalid address, length %d\n", __func__,
 			rlen);
 		goto out_err;
 	}
@@ -225,6 +225,11 @@ decode_and_add_ds(__be32 **pp, struct inode *inode)
 	/* replace the port dots with dashes for the in4_pton() delimiter*/
 	for (i = 0; i < 2; i++) {
 		char *res = strrchr(buf, '.');
+		if (!res) {
+			dprintk("%s: Failed finding expected dots in port\n",
+				__func__);
+			goto out_free;
+		}
 		*res = '-';
 	}
 
@@ -240,7 +245,7 @@ decode_and_add_ds(__be32 **pp, struct inode *inode)
 	port = htons((tmp[0] << 8) | (tmp[1]));
 
 	ds = nfs4_pnfs_ds_add(inode, ip_addr, port);
-	dprintk("%s Decoded address and port %s\n", __func__, buf);
+	dprintk("%s: Decoded address and port %s\n", __func__, buf);
 out_free:
 	kfree(buf);
 out_err:
-- 
cgit v1.2.3


From 839f7ad6932d95f4d5ae7267b95c574714ff3d5b Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 21 Jan 2011 15:54:57 +0000
Subject: NFS: Fix "kernel BUG at fs/aio.c:554!"

Nick Piggin reports:

> I'm getting use after frees in aio code in NFS
>
> [ 2703.396766] Call Trace:
> [ 2703.396858]  [<ffffffff8100b057>] ? native_sched_clock+0x27/0x80
> [ 2703.396959]  [<ffffffff8108509e>] ? put_lock_stats+0xe/0x40
> [ 2703.397058]  [<ffffffff81088348>] ? lock_release_holdtime+0xa8/0x140
> [ 2703.397159]  [<ffffffff8108a2a5>] lock_acquire+0x95/0x1b0
> [ 2703.397260]  [<ffffffff811627db>] ? aio_put_req+0x2b/0x60
> [ 2703.397361]  [<ffffffff81039701>] ? get_parent_ip+0x11/0x50
> [ 2703.397464]  [<ffffffff81612a31>] _raw_spin_lock_irq+0x41/0x80
> [ 2703.397564]  [<ffffffff811627db>] ? aio_put_req+0x2b/0x60
> [ 2703.397662]  [<ffffffff811627db>] aio_put_req+0x2b/0x60
> [ 2703.397761]  [<ffffffff811647fe>] do_io_submit+0x2be/0x7c0
> [ 2703.397895]  [<ffffffff81164d0b>] sys_io_submit+0xb/0x10
> [ 2703.397995]  [<ffffffff8100307b>] system_call_fastpath+0x16/0x1b
>
> Adding some tracing, it is due to nfs completing the request then
> returning something other than -EIOCBQUEUED, so aio.c
> also completes the request.

To address this, prevent the NFS direct I/O engine from completing
async iocbs when the forward path returns an error without starting
any I/O.

This fix appears to survive ^C during both "xfstest no. 208" and "fsx
-Z."

It's likely this bug has existed for a very long while, as we are seeing
very similar symptoms in OEL 5.  Copying stable.

Cc: Stable <stable@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c | 34 ++++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e6ace0d93c7..9943a75bb6d 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -407,15 +407,18 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 		pos += vec->iov_len;
 	}
 
+	/*
+	 * If no bytes were started, return the error, and let the
+	 * generic layer handle the completion.
+	 */
+	if (requested_bytes == 0) {
+		nfs_direct_req_release(dreq);
+		return result < 0 ? result : -EIO;
+	}
+
 	if (put_dreq(dreq))
 		nfs_direct_complete(dreq);
-
-	if (requested_bytes != 0)
-		return 0;
-
-	if (result < 0)
-		return result;
-	return -EIO;
+	return 0;
 }
 
 static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
@@ -841,15 +844,18 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 		pos += vec->iov_len;
 	}
 
+	/*
+	 * If no bytes were started, return the error, and let the
+	 * generic layer handle the completion.
+	 */
+	if (requested_bytes == 0) {
+		nfs_direct_req_release(dreq);
+		return result < 0 ? result : -EIO;
+	}
+
 	if (put_dreq(dreq))
 		nfs_direct_write_complete(dreq, dreq->inode);
-
-	if (requested_bytes != 0)
-		return 0;
-
-	if (result < 0)
-		return result;
-	return -EIO;
+	return 0;
 }
 
 static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
-- 
cgit v1.2.3


From ee5dc7732bd557bae6d10873a0aac606d2c551fb Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 21 Jan 2011 03:05:18 +0000
Subject: NFS: Fix "kernel BUG at fs/nfs/nfs3xdr.c:1338!"

Milan Broz <mbroz@redhat.com> reports:

> on today Linus' tree I get OOps if using nfs.
>
> server (2.6.36) exports dir:
> /dir   172.16.1.0/24(rw,async,all_squash,no_subtree_check,anonuid=500,anongid=500)
>
> on client it is mounted  in fstab
> server:/dir  /mnt/tst  nfs  rw,soft 0 0
>
> and these commands OOpses it (simplified from a configure script):
>
> cd /dir
> touch x
> install x y
>
> [  105.327701] ------------[ cut here ]------------
> [  105.327979] kernel BUG at fs/nfs/nfs3xdr.c:1338!
> [  105.328075] invalid opcode: 0000 [#1] PREEMPT SMP
> [  105.328223] last sysfs file: /sys/devices/virtual/bdi/0:16/uevent
> [  105.328349] Modules linked in: usbcore dm_mod
> [  105.328553]
> [  105.328678] Pid: 3710, comm: install Not tainted 2.6.37+ #423 440BX Desktop Reference Platform/VMware Virtual Platform
> [  105.328853] EIP: 0060:[<c116c06c>] EFLAGS: 00010282 CPU: 0
> [  105.329152] EIP is at nfs3_xdr_enc_setacl3args+0x61/0x98
> [  105.329249] EAX: ffffffea EBX: ce941d98 ECX: 00000000 EDX: 00000004
> [  105.329340] ESI: ce941cd0 EDI: 000000a4 EBP: ce941cc0 ESP: ce941cb4
> [  105.329431]  DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068
> [  105.329525] Process install (pid: 3710, ti=ce940000 task=ced36f20 task.ti=ce940000)
> [  105.336600] Stack:
> [  105.336693]  ce941cd0 ce9dc000 00000000 ce941cf8 c12ecd02 c12f43e0 c116c00b cf754158
> [  105.336982]  ce9dc004 cf754284 ce9dc004 cf7ffee8 ceff9978 ce9dc000 cf7ffee8 ce9dc000
> [  105.337182]  ce9dc000 ce941d14 c12e698d cf75412c ce941d98 cf7ffee8 cf7fff20 00000000
> [  105.337405] Call Trace:
> [  105.337695]  [<c12ecd02>] rpcauth_wrap_req+0x75/0x7f
> [  105.337806]  [<c12f43e0>] ? xdr_encode_opaque+0x12/0x15
> [  105.337898]  [<c116c00b>] ? nfs3_xdr_enc_setacl3args+0x0/0x98
> [  105.337988]  [<c12e698d>] call_transmit+0x17e/0x1e8
> [  105.338072]  [<c12ec307>] __rpc_execute+0x6d/0x1a6
> [  105.338155]  [<c12ec474>] rpc_execute+0x34/0x37
> [  105.338235]  [<c12e738d>] rpc_run_task+0xb5/0xbd
> [  105.338316]  [<c12e7474>] rpc_call_sync+0x3d/0x58
> [  105.338402]  [<c116d0c6>] nfs3_proc_setacls+0x18e/0x24f
> [  105.338493]  [<c10b3f76>] ? __kmalloc+0x148/0x1c4
> [  105.338579]  [<c10ecd01>] ? posix_acl_alloc+0x12/0x22
> [  105.338665]  [<c116d5c8>] nfs3_proc_setacl+0xa0/0xca
> [  105.338748]  [<c116d69c>] nfs3_setxattr+0x62/0x88
> [  105.338834]  [<c1317042>] ? sub_preempt_count+0x7c/0x89
> [  105.338926]  [<c116d63a>] ? nfs3_setxattr+0x0/0x88
> [  105.339026]  [<c10cfa79>] __vfs_setxattr_noperm+0x26/0x95
> [  105.339114]  [<c10cfb43>] vfs_setxattr+0x5b/0x76
> [  105.339211]  [<c10cfbfb>] setxattr+0x9d/0xc3
> [  105.339298]  [<c10a2ea8>] ? handle_pte_fault+0x258/0x5cb
> [  105.339428]  [<c1091ff6>] ? __free_pages+0x1a/0x23
> [  105.339517]  [<c10498ea>] ? up_read+0x16/0x2c
> [  105.339599]  [<c10b8365>] ? fget+0x0/0xa3
> [  105.339677]  [<c10b8365>] ? fget+0x0/0xa3
> [  105.339760]  [<c1025d23>] ? get_parent_ip+0xb/0x31
> [  105.339843]  [<c1317042>] ? sub_preempt_count+0x7c/0x89
> [  105.339931]  [<c10cfc72>] sys_fsetxattr+0x51/0x79
> [  105.340014]  [<c1002853>] sysenter_do_call+0x12/0x32
> [  105.340133] Code: 2e 76 18 00 58 31 d2 8b 7f 28 f6 43 04 01 74 03 8b 53 08 6a 00 8b 46 04 6a 01 8b 0b 52 89 fa e8 85 10 f8 ff 83 c4 0c 85 c0 79 04 <0f> 0b eb fe 31 c9 f6 43 04 04 74 03 8b 4b 0c 68 00 10 00 00 8d
> [  105.350321] EIP: [<c116c06c>] nfs3_xdr_enc_setacl3args+0x61/0x98 SS:ESP 0068:ce941cb4
> [  105.364385] ---[ end trace 01fcfe7f0f7f6e4a ]---

nfs3_xdr_enc_setacl3args() is not properly setting up the target
buffer before nfsacl_encode() attempts to encode the ACL.

Introduced by commit d9c407b1 "NFS: Introduce new-style XDR encoding
functions for NFSv3."

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs3xdr.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 01c5e8b1941..183c6b123d0 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1328,10 +1328,13 @@ static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req,
 
 	encode_nfs_fh3(xdr, NFS_FH(args->inode));
 	encode_uint32(xdr, args->mask);
+
+	base = req->rq_slen;
 	if (args->npages != 0)
 		xdr_write_pages(xdr, args->pages, 0, args->len);
+	else
+		xdr_reserve_space(xdr, NFS_ACL_INLINE_BUFSIZE);
 
-	base = req->rq_slen;
 	error = nfsacl_encode(xdr->buf, base, args->inode,
 			    (args->mask & NFS_ACL) ?
 			    args->acl_access : NULL, 1, 0);
-- 
cgit v1.2.3


From 731f3f482ad3b2c58a1af2d0a9a634a82803706a Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 21 Jan 2011 03:05:28 +0000
Subject: NFS: nfsacl_{encode,decode} should return signed integer

Clean up.

The nfsacl_encode() and nfsacl_decode() functions return negative
errno values, and each call site verifies that the returned value
is not negative.  Change the synopsis of both of these functions
to reflect this usage.

Document the synopsis and return values.

Reported-by: Trond Myklebust <trond.myklebust@netapp.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs_common/nfsacl.c | 32 ++++++++++++++++++++++++++------
 1 file changed, 26 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index fc1c52571c0..a3e78bd1867 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -72,9 +72,20 @@ xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem)
 	return 0;
 }
 
-unsigned int
-nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
-	      struct posix_acl *acl, int encode_entries, int typeflag)
+/**
+ * nfsacl_encode - Encode an NFSv3 ACL
+ *
+ * @buf: destination xdr_buf to contain XDR encoded ACL
+ * @base: byte offset in xdr_buf where XDR'd ACL begins
+ * @inode: inode of file whose ACL this is
+ * @acl: posix_acl to encode
+ * @encode_entries: whether to encode ACEs as well
+ * @typeflag: ACL type: NFS_ACL_DEFAULT or zero
+ *
+ * Returns size of encoded ACL in bytes or a negative errno value.
+ */
+int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
+		  struct posix_acl *acl, int encode_entries, int typeflag)
 {
 	int entries = (acl && acl->a_count) ? max_t(int, acl->a_count, 4) : 0;
 	struct nfsacl_encode_desc nfsacl_desc = {
@@ -224,9 +235,18 @@ posix_acl_from_nfsacl(struct posix_acl *acl)
 	return 0;
 }
 
-unsigned int
-nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
-	      struct posix_acl **pacl)
+/**
+ * nfsacl_decode - Decode an NFSv3 ACL
+ *
+ * @buf: xdr_buf containing XDR'd ACL data to decode
+ * @base: byte offset in xdr_buf where XDR'd ACL begins
+ * @aclcnt: count of ACEs in decoded posix_acl
+ * @pacl: buffer in which to place decoded posix_acl
+ *
+ * Returns the length of the decoded ACL in bytes, or a negative errno value.
+ */
+int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
+		  struct posix_acl **pacl)
 {
 	struct nfsacl_decode_desc nfsacl_desc = {
 		.desc = {
-- 
cgit v1.2.3


From f61f6da0d53842e849bab7f69e1431bd3de1136d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 21 Jan 2011 03:05:38 +0000
Subject: NFS: Prevent memory allocation failure in nfsacl_encode()

nfsacl_encode() allocates memory in certain cases.  This of course
is not guaranteed to work.

Since commit 9f06c719 "SUNRPC: New xdr_streams XDR encoder API", the
kernel's XDR encoders can't return a result indicating possibly a
failure, so a memory allocation failure in nfsacl_encode() has become
fatal (ie, the XDR code Oopses) in some cases.

However, the allocated memory is a tiny fixed amount, on the order
of 40-50 bytes.  We can easily use a stack-allocated buffer for
this, with only a wee bit of nose-holding.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs3acl.c       |  4 ++--
 fs/nfs_common/nfsacl.c | 22 +++++++++++++++-------
 fs/posix_acl.c         | 17 +++++++++++++----
 3 files changed, 30 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 9f88c5f4c7e..27434277165 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -311,8 +311,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
 	if (!nfs_server_capable(inode, NFS_CAP_ACLS))
 		goto out;
 
-	/* We are doing this here, because XDR marshalling can only
-	   return -ENOMEM. */
+	/* We are doing this here because XDR marshalling does not
+	 * return any results, it BUGs. */
 	status = -ENOSPC;
 	if (acl != NULL && acl->a_count > NFS_ACL_MAX_ENTRIES)
 		goto out;
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index a3e78bd1867..84c27d69d42 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -42,6 +42,11 @@ struct nfsacl_encode_desc {
 	gid_t gid;
 };
 
+struct nfsacl_simple_acl {
+	struct posix_acl acl;
+	struct posix_acl_entry ace[4];
+};
+
 static int
 xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem)
 {
@@ -99,17 +104,22 @@ int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
 		.uid = inode->i_uid,
 		.gid = inode->i_gid,
 	};
+	struct nfsacl_simple_acl aclbuf;
 	int err;
-	struct posix_acl *acl2 = NULL;
 
 	if (entries > NFS_ACL_MAX_ENTRIES ||
 	    xdr_encode_word(buf, base, entries))
 		return -EINVAL;
 	if (encode_entries && acl && acl->a_count == 3) {
-		/* Fake up an ACL_MASK entry. */
-		acl2 = posix_acl_alloc(4, GFP_KERNEL);
-		if (!acl2)
-			return -ENOMEM;
+		struct posix_acl *acl2 = &aclbuf.acl;
+
+		/* Avoid the use of posix_acl_alloc().  nfsacl_encode() is
+		 * invoked in contexts where a memory allocation failure is
+		 * fatal.  Fortunately this fake ACL is small enough to
+		 * construct on the stack. */
+		memset(acl2, 0, sizeof(acl2));
+		posix_acl_init(acl2, 4);
+
 		/* Insert entries in canonical order: other orders seem
 		 to confuse Solaris VxFS. */
 		acl2->a_entries[0] = acl->a_entries[0];  /* ACL_USER_OBJ */
@@ -120,8 +130,6 @@ int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
 		nfsacl_desc.acl = acl2;
 	}
 	err = xdr_encode_array2(buf, base + 4, &nfsacl_desc.desc);
-	if (acl2)
-		posix_acl_release(acl2);
 	if (!err)
 		err = 8 + nfsacl_desc.desc.elem_size *
 			  nfsacl_desc.desc.array_len;
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 39df95a0ec2..b1cf6bf4b41 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -22,6 +22,7 @@
 
 #include <linux/errno.h>
 
+EXPORT_SYMBOL(posix_acl_init);
 EXPORT_SYMBOL(posix_acl_alloc);
 EXPORT_SYMBOL(posix_acl_clone);
 EXPORT_SYMBOL(posix_acl_valid);
@@ -31,6 +32,16 @@ EXPORT_SYMBOL(posix_acl_create_masq);
 EXPORT_SYMBOL(posix_acl_chmod_masq);
 EXPORT_SYMBOL(posix_acl_permission);
 
+/*
+ * Init a fresh posix_acl
+ */
+void
+posix_acl_init(struct posix_acl *acl, int count)
+{
+	atomic_set(&acl->a_refcount, 1);
+	acl->a_count = count;
+}
+
 /*
  * Allocate a new ACL with the specified number of entries.
  */
@@ -40,10 +51,8 @@ posix_acl_alloc(int count, gfp_t flags)
 	const size_t size = sizeof(struct posix_acl) +
 	                    count * sizeof(struct posix_acl_entry);
 	struct posix_acl *acl = kmalloc(size, flags);
-	if (acl) {
-		atomic_set(&acl->a_refcount, 1);
-		acl->a_count = count;
-	}
+	if (acl)
+		posix_acl_init(acl, count);
 	return acl;
 }
 
-- 
cgit v1.2.3


From 80c30e8de4f81851b1f712bcc596e11d53bc76f1 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 24 Jan 2011 20:50:26 +0000
Subject: NLM: Fix "kernel BUG at fs/lockd/host.c:417!" or ".../host.c:283!"

Nick Bowler <nbowler@elliptictech.com> reports:

> We were just having some NFS server troubles, and my client machine
> running 2.6.38-rc1+ (specifically, commit 2b1caf6ed7b888c95) crashed
> hard (syslog output appended to this mail).
>
> I'm not sure what the exact timeline was or how to reproduce this,
> but the server was rebooted during all this.  Since I've never seen
> this happen before, it is possibly a regression from previous kernel
> releases.  However, I recently updated my nfs-utils (on the client) to
> version 1.2.3, so that might be related as well.

  [ BUG output redacted ]

When done searching, the for_each_host loop in next_host_state() falls
through and returns the final host on the host chain without bumping
its reference count.

Since the host's ref count is only one at that point, releasing the
host in nlm_host_rebooted() attempts to destroy the host prematurely,
and therefore hits a BUG().

Likely, the original intent of the for_each_host behavior in
next_host_state() was to handle the case when the host chain is empty.
Searching the chain and finding no suitable host to return needs to be
handled as well.

Defensively restructure next_host_state() always to return NULL when
the loop falls through.

Introduced by commit b10e30f6 "lockd: reorganize nlm_host_rebooted".

Cc: J. Bruce Fields <bfields@fieldses.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/host.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 5f1bcb2f06f..b7c99bfb3da 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -520,7 +520,7 @@ static struct nlm_host *next_host_state(struct hlist_head *cache,
 					struct nsm_handle *nsm,
 					const struct nlm_reboot *info)
 {
-	struct nlm_host *host = NULL;
+	struct nlm_host *host;
 	struct hlist_head *chain;
 	struct hlist_node *pos;
 
@@ -532,12 +532,13 @@ static struct nlm_host *next_host_state(struct hlist_head *cache,
 			host->h_state++;
 
 			nlm_get_host(host);
-			goto out;
+			mutex_unlock(&nlm_host_mutex);
+			return host;
 		}
 	}
-out:
+
 	mutex_unlock(&nlm_host_mutex);
-	return host;
+	return NULL;
 }
 
 /**
-- 
cgit v1.2.3


From 778be232a207e79088ba70d832ac25dfea6fbf1a Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Tue, 25 Jan 2011 15:38:01 +0000
Subject: NFS do not find client in NFSv4 pg_authenticate

The information required to find the nfs_client corresponding to the incoming
back channel request is contained in the NFS layer. Perform minimal checking
in the RPC layer pg_authenticate method, and push more detailed checking into
the NFS layer where the nfs_client can be found.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/callback.c      | 109 +++++++++++++------------------------------------
 fs/nfs/callback.h      |   4 +-
 fs/nfs/callback_proc.c |  10 +----
 fs/nfs/callback_xdr.c  |   5 +--
 fs/nfs/client.c        |  15 +++----
 fs/nfs/internal.h      |   3 +-
 fs/nfs/nfs4state.c     |   6 ---
 7 files changed, 41 insertions(+), 111 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 199016528fc..e3d29426905 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -134,33 +134,6 @@ out_err:
 }
 
 #if defined(CONFIG_NFS_V4_1)
-/*
- *  * CB_SEQUENCE operations will fail until the callback sessionid is set.
- *   */
-int nfs4_set_callback_sessionid(struct nfs_client *clp)
-{
-	struct svc_serv *serv = clp->cl_rpcclient->cl_xprt->bc_serv;
-	struct nfs4_sessionid *bc_sid;
-
-	if (!serv->sv_bc_xprt)
-		return -EINVAL;
-
-	/* on success freed in xprt_free */
-	bc_sid = kmalloc(sizeof(struct nfs4_sessionid), GFP_KERNEL);
-	if (!bc_sid)
-		return -ENOMEM;
-	memcpy(bc_sid->data, &clp->cl_session->sess_id.data,
-		NFS4_MAX_SESSIONID_LEN);
-	spin_lock_bh(&serv->sv_cb_lock);
-	serv->sv_bc_xprt->xpt_bc_sid = bc_sid;
-	spin_unlock_bh(&serv->sv_cb_lock);
-	dprintk("%s set xpt_bc_sid=%u:%u:%u:%u for sv_bc_xprt %p\n", __func__,
-		((u32 *)bc_sid->data)[0], ((u32 *)bc_sid->data)[1],
-		((u32 *)bc_sid->data)[2], ((u32 *)bc_sid->data)[3],
-		serv->sv_bc_xprt);
-	return 0;
-}
-
 /*
  * The callback service for NFSv4.1 callbacks
  */
@@ -266,10 +239,6 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
 		struct nfs_callback_data *cb_info)
 {
 }
-int nfs4_set_callback_sessionid(struct nfs_client *clp)
-{
-	return 0;
-}
 #endif /* CONFIG_NFS_V4_1 */
 
 /*
@@ -359,78 +328,58 @@ void nfs_callback_down(int minorversion)
 	mutex_unlock(&nfs_callback_mutex);
 }
 
-static int check_gss_callback_principal(struct nfs_client *clp,
-					struct svc_rqst *rqstp)
+/* Boolean check of RPC_AUTH_GSS principal */
+int
+check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
 {
 	struct rpc_clnt *r = clp->cl_rpcclient;
 	char *p = svc_gss_principal(rqstp);
 
+	if (rqstp->rq_authop->flavour != RPC_AUTH_GSS)
+		return 1;
+
 	/* No RPC_AUTH_GSS on NFSv4.1 back channel yet */
 	if (clp->cl_minorversion != 0)
-		return SVC_DROP;
+		return 0;
 	/*
 	 * It might just be a normal user principal, in which case
 	 * userspace won't bother to tell us the name at all.
 	 */
 	if (p == NULL)
-		return SVC_DENIED;
+		return 0;
 
 	/* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */
 
 	if (memcmp(p, "nfs@", 4) != 0)
-		return SVC_DENIED;
+		return 0;
 	p += 4;
 	if (strcmp(p, r->cl_server) != 0)
-		return SVC_DENIED;
-	return SVC_OK;
+		return 0;
+	return 1;
 }
 
-/* pg_authenticate method helper */
-static struct nfs_client *nfs_cb_find_client(struct svc_rqst *rqstp)
-{
-	struct nfs4_sessionid *sessionid = bc_xprt_sid(rqstp);
-	int is_cb_compound = rqstp->rq_proc == CB_COMPOUND ? 1 : 0;
-
-	dprintk("--> %s rq_proc %d\n", __func__, rqstp->rq_proc);
-	if (svc_is_backchannel(rqstp))
-		/* Sessionid (usually) set after CB_NULL ping */
-		return nfs4_find_client_sessionid(svc_addr(rqstp), sessionid,
-						  is_cb_compound);
-	else
-		/* No callback identifier in pg_authenticate */
-		return nfs4_find_client_no_ident(svc_addr(rqstp));
-}
-
-/* pg_authenticate method for nfsv4 callback threads. */
+/*
+ * pg_authenticate method for nfsv4 callback threads.
+ *
+ * The authflavor has been negotiated, so an incorrect flavor is a server
+ * bug. Drop packets with incorrect authflavor.
+ *
+ * All other checking done after NFS decoding where the nfs_client can be
+ * found in nfs4_callback_compound
+ */
 static int nfs_callback_authenticate(struct svc_rqst *rqstp)
 {
-	struct nfs_client *clp;
-	RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
-	int ret = SVC_OK;
-
-	/* Don't talk to strangers */
-	clp = nfs_cb_find_client(rqstp);
-	if (clp == NULL)
-		return SVC_DROP;
-
-	dprintk("%s: %s NFSv4 callback!\n", __func__,
-			svc_print_addr(rqstp, buf, sizeof(buf)));
-
 	switch (rqstp->rq_authop->flavour) {
-		case RPC_AUTH_NULL:
-			if (rqstp->rq_proc != CB_NULL)
-				ret = SVC_DENIED;
-			break;
-		case RPC_AUTH_UNIX:
-			break;
-		case RPC_AUTH_GSS:
-			ret = check_gss_callback_principal(clp, rqstp);
-			break;
-		default:
-			ret = SVC_DENIED;
+	case RPC_AUTH_NULL:
+		if (rqstp->rq_proc != CB_NULL)
+			return SVC_DROP;
+		break;
+	case RPC_AUTH_GSS:
+		/* No RPC_AUTH_GSS support yet in NFSv4.1 */
+		 if (svc_is_backchannel(rqstp))
+			return SVC_DROP;
 	}
-	nfs_put_client(clp);
-	return ret;
+	return SVC_OK;
 }
 
 /*
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index d3b44f9bd74..46d93ce7311 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -7,6 +7,7 @@
  */
 #ifndef __LINUX_FS_NFS_CALLBACK_H
 #define __LINUX_FS_NFS_CALLBACK_H
+#include <linux/sunrpc/svc.h>
 
 #define NFS4_CALLBACK 0x40000000
 #define NFS4_CALLBACK_XDRSIZE 2048
@@ -37,7 +38,6 @@ enum nfs4_callback_opnum {
 struct cb_process_state {
 	__be32			drc_status;
 	struct nfs_client	*clp;
-	struct nfs4_sessionid	*svc_sid; /* v4.1 callback service sessionid */
 };
 
 struct cb_compound_hdr_arg {
@@ -168,7 +168,7 @@ extern unsigned nfs4_callback_layoutrecall(
 extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
 extern void nfs4_cb_take_slot(struct nfs_client *clp);
 #endif /* CONFIG_NFS_V4_1 */
-
+extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *);
 extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
 				    struct cb_getattrres *res,
 				    struct cb_process_state *cps);
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 4bb91cb2620..829f406e91d 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -373,17 +373,11 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
 {
 	struct nfs_client *clp;
 	int i;
-	__be32 status;
+	__be32 status = htonl(NFS4ERR_BADSESSION);
 
 	cps->clp = NULL;
 
-	status = htonl(NFS4ERR_BADSESSION);
-	/* Incoming session must match the callback session */
-	if (memcmp(&args->csa_sessionid, cps->svc_sid, NFS4_MAX_SESSIONID_LEN))
-		goto out;
-
-	clp = nfs4_find_client_sessionid(args->csa_addr,
-					 &args->csa_sessionid, 1);
+	clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid);
 	if (clp == NULL)
 		goto out;
 
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 23112c263f8..14e0f9371d1 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -794,10 +794,9 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
 
 	if (hdr_arg.minorversion == 0) {
 		cps.clp = nfs4_find_client_ident(hdr_arg.cb_ident);
-		if (!cps.clp)
+		if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp))
 			return rpc_drop_reply;
-	} else
-		cps.svc_sid = bc_xprt_sid(rqstp);
+	}
 
 	hdr_res.taglen = hdr_arg.taglen;
 	hdr_res.tag = hdr_arg.tag;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 192f2f86026..bd3ca32879e 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1206,16 +1206,11 @@ nfs4_find_client_ident(int cb_ident)
  * For CB_COMPOUND calls, find a client by IP address, protocol version,
  * minorversion, and sessionID
  *
- * CREATE_SESSION triggers a CB_NULL ping from servers. The callback service
- * sessionid can only be set after the CREATE_SESSION return, so a CB_NULL
- * can arrive before the callback sessionid is set. For CB_NULL calls,
- * find a client by IP address protocol version, and minorversion.
- *
  * Returns NULL if no such client
  */
 struct nfs_client *
 nfs4_find_client_sessionid(const struct sockaddr *addr,
-			   struct nfs4_sessionid *sid, int is_cb_compound)
+			   struct nfs4_sessionid *sid)
 {
 	struct nfs_client *clp;
 
@@ -1227,9 +1222,9 @@ nfs4_find_client_sessionid(const struct sockaddr *addr,
 		if (!nfs4_has_session(clp))
 			continue;
 
-		/* Match sessionid unless cb_null call*/
-		if (is_cb_compound && (memcmp(clp->cl_session->sess_id.data,
-		    sid->data, NFS4_MAX_SESSIONID_LEN) != 0))
+		/* Match sessionid*/
+		if (memcmp(clp->cl_session->sess_id.data,
+		    sid->data, NFS4_MAX_SESSIONID_LEN) != 0)
 			continue;
 
 		atomic_inc(&clp->cl_count);
@@ -1244,7 +1239,7 @@ nfs4_find_client_sessionid(const struct sockaddr *addr,
 
 struct nfs_client *
 nfs4_find_client_sessionid(const struct sockaddr *addr,
-			   struct nfs4_sessionid *sid, int is_cb_compound)
+			   struct nfs4_sessionid *sid)
 {
 	return NULL;
 }
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 4644f04b4b4..cf9fdbdabc6 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -133,8 +133,7 @@ extern void nfs_put_client(struct nfs_client *);
 extern struct nfs_client *nfs4_find_client_no_ident(const struct sockaddr *);
 extern struct nfs_client *nfs4_find_client_ident(int);
 extern struct nfs_client *
-nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *,
-			   int);
+nfs4_find_client_sessionid(const struct sockaddr *, struct nfs4_sessionid *);
 extern struct nfs_server *nfs_create_server(
 					const struct nfs_parsed_mount_data *,
 					struct nfs_fh *);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 2336d532cf6..e6742b57a04 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -232,12 +232,6 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
 	status = nfs4_proc_create_session(clp);
 	if (status != 0)
 		goto out;
-	status = nfs4_set_callback_sessionid(clp);
-	if (status != 0) {
-		printk(KERN_WARNING "Sessionid not set. No callback service\n");
-		nfs_callback_down(1);
-		status = 0;
-	}
 	nfs41_setup_state_renewal(clp);
 	nfs_mark_client_ready(clp, NFS_CS_READY);
 out:
-- 
cgit v1.2.3


From 2c4cdf8f6d3cfb48036400952329555099c8c92c Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Tue, 25 Jan 2011 15:38:02 +0000
Subject: NFS fix cb_sequence error processing

Always assign the cb_process_state nfs_client pointer so a processing error
in cb_sequence after the nfs_client is found and referenced returns
a non-NULL cb_process_state nfs_client and the matching nfs_put_client in
nfs4_callback_compound dereferences the client.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/callback_proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 829f406e91d..89587573fe5 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -408,9 +408,9 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
 	res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
 	res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
 	nfs4_cb_take_slot(clp);
-	cps->clp = clp; /* put in nfs4_callback_compound */
 
 out:
+	cps->clp = clp; /* put in nfs4_callback_compound */
 	for (i = 0; i < args->csa_nrclists; i++)
 		kfree(args->csa_rclists[i].rcl_refcalls);
 	kfree(args->csa_rclists);
-- 
cgit v1.2.3


From b2a2897dc4a59684321de425652061c62a0569d0 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Tue, 25 Jan 2011 15:38:03 +0000
Subject: NFS improve pnfs_put_deviceid_cache debug print

What we really want to know is the ref count.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/pnfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index bc408976973..1b1bc1a0fb0 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -951,7 +951,7 @@ pnfs_put_deviceid_cache(struct nfs_client *clp)
 {
 	struct pnfs_deviceid_cache *local = clp->cl_devid_cache;
 
-	dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache);
+	dprintk("--> %s ({%d})\n", __func__, atomic_read(&local->dc_ref));
 	if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) {
 		int i;
 		/* Verify cache is empty */
-- 
cgit v1.2.3


From 27dc1cd3ad9300f81e1219e5fc305d91d85353f8 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 25 Jan 2011 15:28:21 -0500
Subject: NFS: nfs_wcc_update_inode() should set nfsi->attr_gencount

If the call to nfs_wcc_update_inode() results in an attribute update, we
need to ensure that the inode's attr_gencount gets bumped too, otherwise
we are not protected against races with other GETATTR calls.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index d8512423ba7..1cc600e77bb 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -881,9 +881,10 @@ out:
 	return ret;
 }
 
-static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
+static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
+	unsigned long ret = 0;
 
 	if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE)
 			&& (fattr->valid & NFS_ATTR_FATTR_CHANGE)
@@ -891,25 +892,32 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		nfsi->change_attr = fattr->change_attr;
 		if (S_ISDIR(inode->i_mode))
 			nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+		ret |= NFS_INO_INVALID_ATTR;
 	}
 	/* If we have atomic WCC data, we may update some attributes */
 	if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME)
 			&& (fattr->valid & NFS_ATTR_FATTR_CTIME)
-			&& timespec_equal(&inode->i_ctime, &fattr->pre_ctime))
-			memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
+			&& timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) {
+		memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
+		ret |= NFS_INO_INVALID_ATTR;
+	}
 
 	if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME)
 			&& (fattr->valid & NFS_ATTR_FATTR_MTIME)
 			&& timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
-			memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
-			if (S_ISDIR(inode->i_mode))
-				nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+		memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
+		if (S_ISDIR(inode->i_mode))
+			nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+		ret |= NFS_INO_INVALID_ATTR;
 	}
 	if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
 			&& (fattr->valid & NFS_ATTR_FATTR_SIZE)
 			&& i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size)
-			&& nfsi->npages == 0)
-			i_size_write(inode, nfs_size_to_loff_t(fattr->size));
+			&& nfsi->npages == 0) {
+		i_size_write(inode, nfs_size_to_loff_t(fattr->size));
+		ret |= NFS_INO_INVALID_ATTR;
+	}
+	return ret;
 }
 
 /**
@@ -1223,7 +1231,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 			| NFS_INO_REVAL_PAGECACHE);
 
 	/* Do atomic weak cache consistency updates */
-	nfs_wcc_update_inode(inode, fattr);
+	invalid |= nfs_wcc_update_inode(inode, fattr);
 
 	/* More cache consistency checks */
 	if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
-- 
cgit v1.2.3


From 3689456b4bd36027022b3215eb2acba51cd0e6b5 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Tue, 25 Jan 2011 15:07:34 -0800
Subject: squashfs: fix use of uninitialised variable in zlib & xz
 decompressors

Fix potential use of uninitialised variable caused by recent
decompressor code optimisations.

In zlib_uncompress (zlib_wrapper.c) we have

	int zlib_err, zlib_init = 0;
	...
	do {
		...
			if (avail == 0) {
				offset = 0;
				put_bh(bh[k++]);
				continue;
			}
		...
		zlib_err = zlib_inflate(stream, Z_SYNC_FLUSH);
		...
	} while (zlib_err == Z_OK);

If continue is executed (avail == 0) then the while condition will be
evaluated testing zlib_err, which is uninitialised first time around the
loop.

Fix this by getting rid of the 'if (avail == 0)' condition test, this
edge condition should not be being handled in the decompressor code, and
instead handle it generically in the caller code.

Similarly for xz_wrapper.c.

Incidentally, on most architectures (bar Mips and Parisc), no
uninitialised variable warning is generated by gcc, this is because the
while condition test on continue is optimised out and not performed
(when executing continue zlib_err has not been changed since entering
the loop, and logically if the while condition was true previously, then
it's still true).

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
Reported-by: Jesper Juhl <jj@chaosbits.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/squashfs/block.c        | 8 ++++++++
 fs/squashfs/xz_wrapper.c   | 6 ------
 fs/squashfs/zlib_wrapper.c | 6 ------
 3 files changed, 8 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 2fb2882f0fa..8ab48bc2fa7 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -63,6 +63,14 @@ static struct buffer_head *get_block_length(struct super_block *sb,
 		*length = (unsigned char) bh->b_data[*offset] |
 			(unsigned char) bh->b_data[*offset + 1] << 8;
 		*offset += 2;
+
+		if (*offset == msblk->devblksize) {
+			put_bh(bh);
+			bh = sb_bread(sb, ++(*cur_index));
+			if (bh == NULL)
+				return NULL;
+			*offset = 0;
+		}
 	}
 
 	return bh;
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
index 856756ca5ee..c4eb4001825 100644
--- a/fs/squashfs/xz_wrapper.c
+++ b/fs/squashfs/xz_wrapper.c
@@ -95,12 +95,6 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer,
 			if (!buffer_uptodate(bh[k]))
 				goto release_mutex;
 
-			if (avail == 0) {
-				offset = 0;
-				put_bh(bh[k++]);
-				continue;
-			}
-
 			stream->buf.in = bh[k]->b_data + offset;
 			stream->buf.in_size = avail;
 			stream->buf.in_pos = 0;
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 818a5e063fa..4661ae2b1ce 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -82,12 +82,6 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
 			if (!buffer_uptodate(bh[k]))
 				goto release_mutex;
 
-			if (avail == 0) {
-				offset = 0;
-				put_bh(bh[k++]);
-				continue;
-			}
-
 			stream->next_in = bh[k]->b_data + offset;
 			stream->avail_in = avail;
 			offset = 0;
-- 
cgit v1.2.3


From ac751efa6a0d70f2c9daef5c7e3a92270f5c2dff Mon Sep 17 00:00:00 2001
From: Torben Hohn <torbenh@gmx.de>
Date: Tue, 25 Jan 2011 15:07:35 -0800
Subject: console: rename acquire/release_console_sem() to
 console_lock/unlock()

The -rt patches change the console_semaphore to console_mutex.  As a
result, a quite large chunk of the patches changes all
acquire/release_console_sem() to acquire/release_console_mutex()

This commit makes things use more neutral function names which don't make
implications about the underlying lock.

The only real change is the return value of console_trylock which is
inverted from try_acquire_console_sem()

This patch also paves the way to switching console_sem from a semaphore to
a mutex.

[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: make console_trylock return 1 on success, per Geert]
Signed-off-by: Torben Hohn <torbenh@gmx.de>
Cc: Thomas Gleixner <tglx@tglx.de>
Cc: Greg KH <gregkh@suse.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/consoles.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
index eafc22ab1fd..b701eaa482b 100644
--- a/fs/proc/consoles.c
+++ b/fs/proc/consoles.c
@@ -67,7 +67,7 @@ static void *c_start(struct seq_file *m, loff_t *pos)
 	struct console *con;
 	loff_t off = 0;
 
-	acquire_console_sem();
+	console_lock();
 	for_each_console(con)
 		if (off++ == *pos)
 			break;
@@ -84,7 +84,7 @@ static void *c_next(struct seq_file *m, void *v, loff_t *pos)
 
 static void c_stop(struct seq_file *m, void *v)
 {
-	release_console_sem();
+	console_unlock();
 }
 
 static const struct seq_operations consoles_op = {
-- 
cgit v1.2.3


From c7a360b05b5430ac1d75dc7d53c586ada60a05cb Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Tue, 25 Jan 2011 19:15:32 -0500
Subject: NFS construct consistent co_ownerid for v4.1

As stated in section 2.4 of RFC 5661, subsequent instances of the client need
to present the same co_ownerid. Concatenate the client's IP dot address,
host name, and the rpc_auth pseudoflavor to form the co_ownerid.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 30 ++++++++++--------------------
 1 file changed, 10 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 9d992b0346e..78936a8f40a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -50,6 +50,7 @@
 #include <linux/module.h>
 #include <linux/sunrpc/bc_xprt.h>
 #include <linux/xattr.h>
+#include <linux/utsname.h>
 
 #include "nfs4_fs.h"
 #include "delegation.h"
@@ -4572,27 +4573,16 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 	*p = htonl((u32)clp->cl_boot_time.tv_nsec);
 	args.verifier = &verifier;
 
-	while (1) {
-		args.id_len = scnprintf(args.id, sizeof(args.id),
-					"%s/%s %u",
-					clp->cl_ipaddr,
-					rpc_peeraddr2str(clp->cl_rpcclient,
-							 RPC_DISPLAY_ADDR),
-					clp->cl_id_uniquifier);
-
-		status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
-
-		if (status != -NFS4ERR_CLID_INUSE)
-			break;
-
-		if (signalled())
-			break;
-
-		if (++clp->cl_id_uniquifier == 0)
-			break;
-	}
+	args.id_len = scnprintf(args.id, sizeof(args.id),
+				"%s/%s.%s/%u",
+				clp->cl_ipaddr,
+				init_utsname()->nodename,
+				init_utsname()->domainname,
+				clp->cl_rpcclient->cl_auth->au_flavor);
 
-	status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags);
+	status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
+	if (!status)
+		status = nfs4_check_cl_exchange_flags(clp->cl_exchange_flags);
 	dprintk("<-- %s status= %d\n", __func__, status);
 	return status;
 }
-- 
cgit v1.2.3


From d37adaa1596246929f7ab49843fd124595506175 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 26 Jan 2011 17:42:27 +0100
Subject: fs/aio: aio_wq isn't used in memory reclaim path

aio_wq isn't used during memory reclaim.  Convert to alloc_workqueue()
without WQ_MEM_RECLAIM.  It's possible to use system_wq but given that
the number of work items is determined from userland and the work item
may block, enforcing strict concurrency limit would be a good idea.

Also, move fput_work to system_wq so that aio_wq is used solely to
throttle the max concurrency of aio work items and fput_work doesn't
interact with other work items.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Jeff Moyer <jmoyer@redhat.com>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: linux-aio@kvack.org
---
 fs/aio.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index fc557a3be0a..8007bd67588 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -85,7 +85,7 @@ static int __init aio_setup(void)
 	kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 
-	aio_wq = create_workqueue("aio");
+	aio_wq = alloc_workqueue("aio", 0, 1);	/* used to limit concurrency */
 	abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry));
 	BUG_ON(!aio_wq || !abe_pool);
 
@@ -569,7 +569,7 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
 		spin_lock(&fput_lock);
 		list_add(&req->ki_list, &fput_head);
 		spin_unlock(&fput_lock);
-		queue_work(aio_wq, &fput_work);
+		schedule_work(&fput_work);
 	} else {
 		req->ki_filp = NULL;
 		really_put_req(ctx, req);
-- 
cgit v1.2.3


From 8eb2d829ffea3677c21bd038f19e5d8ca6b43e36 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 9 Nov 2010 14:48:01 +0800
Subject: btrfs: Fix threshold calculation for block groups smaller than 1GB

If a block group is smaller than 1GB, the extent entry threshold
calculation will always set the threshold to 0.

So as free space gets fragmented, btrfs will switch to use bitmap
to manage free space, but then will never switch back to extents
due to this bug.

Reviewed-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/free-space-cache.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 60d68426695..42f4015988e 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1016,14 +1016,18 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
 	u64 max_bytes;
 	u64 bitmap_bytes;
 	u64 extent_bytes;
+	u64 size = block_group->key.offset;
 
 	/*
 	 * The goal is to keep the total amount of memory used per 1gb of space
 	 * at or below 32k, so we need to adjust how much memory we allow to be
 	 * used by extent based free space tracking
 	 */
-	max_bytes = MAX_CACHE_BYTES_PER_GIG *
-		(div64_u64(block_group->key.offset, 1024 * 1024 * 1024));
+	if (size < 1024 * 1024 * 1024)
+		max_bytes = MAX_CACHE_BYTES_PER_GIG;
+	else
+		max_bytes = MAX_CACHE_BYTES_PER_GIG *
+			div64_u64(size, 1024 * 1024 * 1024);
 
 	/*
 	 * we want to account for 1 more bitmap than what we have so we can make
-- 
cgit v1.2.3


From edf6e2d1ddbac7f326b34a27adbca71ece53ccce Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 9 Nov 2010 14:50:07 +0800
Subject: btrfs: Add helper function free_bitmap()

Remove some duplicated code.

This prepares for the next patch.

Reviewed-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/free-space-cache.c | 37 ++++++++++++++++---------------------
 1 file changed, 16 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 42f4015988e..850104f0517 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1175,6 +1175,16 @@ static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
 	recalculate_thresholds(block_group);
 }
 
+static void free_bitmap(struct btrfs_block_group_cache *block_group,
+			struct btrfs_free_space *bitmap_info)
+{
+	unlink_free_space(block_group, bitmap_info);
+	kfree(bitmap_info->bitmap);
+	kfree(bitmap_info);
+	block_group->total_bitmaps--;
+	recalculate_thresholds(block_group);
+}
+
 static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group,
 			      struct btrfs_free_space *bitmap_info,
 			      u64 *offset, u64 *bytes)
@@ -1215,13 +1225,8 @@ again:
 
 	if (*bytes) {
 		struct rb_node *next = rb_next(&bitmap_info->offset_index);
-		if (!bitmap_info->bytes) {
-			unlink_free_space(block_group, bitmap_info);
-			kfree(bitmap_info->bitmap);
-			kfree(bitmap_info);
-			block_group->total_bitmaps--;
-			recalculate_thresholds(block_group);
-		}
+		if (!bitmap_info->bytes)
+			free_bitmap(block_group, bitmap_info);
 
 		/*
 		 * no entry after this bitmap, but we still have bytes to
@@ -1254,13 +1259,8 @@ again:
 			return -EAGAIN;
 
 		goto again;
-	} else if (!bitmap_info->bytes) {
-		unlink_free_space(block_group, bitmap_info);
-		kfree(bitmap_info->bitmap);
-		kfree(bitmap_info);
-		block_group->total_bitmaps--;
-		recalculate_thresholds(block_group);
-	}
+	} else if (!bitmap_info->bytes)
+		free_bitmap(block_group, bitmap_info);
 
 	return 0;
 }
@@ -1689,13 +1689,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
 	ret = offset;
 	if (entry->bitmap) {
 		bitmap_clear_bits(block_group, entry, offset, bytes);
-		if (!entry->bytes) {
-			unlink_free_space(block_group, entry);
-			kfree(entry->bitmap);
-			kfree(entry);
-			block_group->total_bitmaps--;
-			recalculate_thresholds(block_group);
-		}
+		if (!entry->bytes)
+			free_bitmap(block_group, entry);
 	} else {
 		unlink_free_space(block_group, entry);
 		entry->offset += bytes;
-- 
cgit v1.2.3


From 70b7da304f9f9bbf1566085155895e32e775a745 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 9 Nov 2010 14:51:45 +0800
Subject: btrfs: Free fully occupied bitmap in cluster

If there's no more free space in a bitmap, we should free it.

Reviewed-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/free-space-cache.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 850104f0517..cb0137e4047 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1788,6 +1788,8 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
 
 	ret = search_start;
 	bitmap_clear_bits(block_group, entry, ret, bytes);
+	if (entry->bytes == 0)
+		free_bitmap(block_group, entry);
 out:
 	spin_unlock(&cluster->lock);
 	spin_unlock(&block_group->tree_lock);
-- 
cgit v1.2.3


From 5e71b5d5ec07e4b3fb4c78c4e4b108ff667f123f Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 9 Nov 2010 14:55:34 +0800
Subject: btrfs: Update stats when allocating from a cluster

When allocating extent entry from a cluster, we should update
the free_space and free_extents fields of the block group.

Reviewed-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/free-space-cache.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index cb0137e4047..2974c4744d5 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1843,15 +1843,26 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
 		entry->offset += bytes;
 		entry->bytes -= bytes;
 
-		if (entry->bytes == 0) {
+		if (entry->bytes == 0)
 			rb_erase(&entry->offset_index, &cluster->root);
-			kfree(entry);
-		}
 		break;
 	}
 out:
 	spin_unlock(&cluster->lock);
 
+	if (!ret)
+		return 0;
+
+	spin_lock(&block_group->tree_lock);
+
+	block_group->free_space -= bytes;
+	if (entry->bytes == 0) {
+		block_group->free_extents--;
+		kfree(entry);
+	}
+
+	spin_unlock(&block_group->tree_lock);
+
 	return ret;
 }
 
-- 
cgit v1.2.3


From 120d66eec0dcb966fbd03f743598b2ff2513436b Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 9 Nov 2010 14:56:50 +0800
Subject: btrfs: Add a helper try_merge_free_space()

When adding a new extent, we'll first see if we can merge
this extent with the left and/or right extent. Extract this as
a helper try_merge_free_space().

As a side effect, we fix a small bug that if the new extent
has a non-bitmap left entry but is unmergeable, we'll directly
link the extent without trying to drop it into bitmap.

This also prepares for the next patch.

Reviewed-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/free-space-cache.c | 75 ++++++++++++++++++++++++++-------------------
 1 file changed, 43 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 2974c4744d5..cf67dc3b7bf 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1363,22 +1363,14 @@ out:
 	return ret;
 }
 
-int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
-			 u64 offset, u64 bytes)
+bool try_merge_free_space(struct btrfs_block_group_cache *block_group,
+			  struct btrfs_free_space *info)
 {
-	struct btrfs_free_space *right_info = NULL;
-	struct btrfs_free_space *left_info = NULL;
-	struct btrfs_free_space *info = NULL;
-	int ret = 0;
-
-	info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
-	if (!info)
-		return -ENOMEM;
-
-	info->offset = offset;
-	info->bytes = bytes;
-
-	spin_lock(&block_group->tree_lock);
+	struct btrfs_free_space *left_info;
+	struct btrfs_free_space *right_info;
+	bool merged = false;
+	u64 offset = info->offset;
+	u64 bytes = info->bytes;
 
 	/*
 	 * first we want to see if there is free space adjacent to the range we
@@ -1392,27 +1384,11 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
 	else
 		left_info = tree_search_offset(block_group, offset - 1, 0, 0);
 
-	/*
-	 * If there was no extent directly to the left or right of this new
-	 * extent then we know we're going to have to allocate a new extent, so
-	 * before we do that see if we need to drop this into a bitmap
-	 */
-	if ((!left_info || left_info->bitmap) &&
-	    (!right_info || right_info->bitmap)) {
-		ret = insert_into_bitmap(block_group, info);
-
-		if (ret < 0) {
-			goto out;
-		} else if (ret) {
-			ret = 0;
-			goto out;
-		}
-	}
-
 	if (right_info && !right_info->bitmap) {
 		unlink_free_space(block_group, right_info);
 		info->bytes += right_info->bytes;
 		kfree(right_info);
+		merged = true;
 	}
 
 	if (left_info && !left_info->bitmap &&
@@ -1421,8 +1397,43 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
 		info->offset = left_info->offset;
 		info->bytes += left_info->bytes;
 		kfree(left_info);
+		merged = true;
 	}
 
+	return merged;
+}
+
+int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+			 u64 offset, u64 bytes)
+{
+	struct btrfs_free_space *info;
+	int ret = 0;
+
+	info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
+	if (!info)
+		return -ENOMEM;
+
+	info->offset = offset;
+	info->bytes = bytes;
+
+	spin_lock(&block_group->tree_lock);
+
+	if (try_merge_free_space(block_group, info))
+		goto link;
+
+	/*
+	 * There was no extent directly to the left or right of this new
+	 * extent then we know we're going to have to allocate a new extent, so
+	 * before we do that see if we need to drop this into a bitmap
+	 */
+	ret = insert_into_bitmap(block_group, info);
+	if (ret < 0) {
+		goto out;
+	} else if (ret) {
+		ret = 0;
+		goto out;
+	}
+link:
 	ret = link_free_space(block_group, info);
 	if (ret)
 		kfree(info);
-- 
cgit v1.2.3


From f333adb5d64bc1c4d6099072fc341c3c8f84e0cf Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 9 Nov 2010 14:57:39 +0800
Subject: btrfs: Check mergeable free space when removing a cluster

After returning extents from a cluster to the block group, some
extents in the block group may be mergeable.

Reviewed-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/free-space-cache.c | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index cf67dc3b7bf..a5501edc3c9 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -987,11 +987,18 @@ tree_search_offset(struct btrfs_block_group_cache *block_group,
 	return entry;
 }
 
-static void unlink_free_space(struct btrfs_block_group_cache *block_group,
-			      struct btrfs_free_space *info)
+static inline void
+__unlink_free_space(struct btrfs_block_group_cache *block_group,
+		    struct btrfs_free_space *info)
 {
 	rb_erase(&info->offset_index, &block_group->free_space_offset);
 	block_group->free_extents--;
+}
+
+static void unlink_free_space(struct btrfs_block_group_cache *block_group,
+			      struct btrfs_free_space *info)
+{
+	__unlink_free_space(block_group, info);
 	block_group->free_space -= info->bytes;
 }
 
@@ -1364,7 +1371,7 @@ out:
 }
 
 bool try_merge_free_space(struct btrfs_block_group_cache *block_group,
-			  struct btrfs_free_space *info)
+			  struct btrfs_free_space *info, bool update_stat)
 {
 	struct btrfs_free_space *left_info;
 	struct btrfs_free_space *right_info;
@@ -1385,7 +1392,10 @@ bool try_merge_free_space(struct btrfs_block_group_cache *block_group,
 		left_info = tree_search_offset(block_group, offset - 1, 0, 0);
 
 	if (right_info && !right_info->bitmap) {
-		unlink_free_space(block_group, right_info);
+		if (update_stat)
+			unlink_free_space(block_group, right_info);
+		else
+			__unlink_free_space(block_group, right_info);
 		info->bytes += right_info->bytes;
 		kfree(right_info);
 		merged = true;
@@ -1393,7 +1403,10 @@ bool try_merge_free_space(struct btrfs_block_group_cache *block_group,
 
 	if (left_info && !left_info->bitmap &&
 	    left_info->offset + left_info->bytes == offset) {
-		unlink_free_space(block_group, left_info);
+		if (update_stat)
+			unlink_free_space(block_group, left_info);
+		else
+			__unlink_free_space(block_group, left_info);
 		info->offset = left_info->offset;
 		info->bytes += left_info->bytes;
 		kfree(left_info);
@@ -1418,7 +1431,7 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
 
 	spin_lock(&block_group->tree_lock);
 
-	if (try_merge_free_space(block_group, info))
+	if (try_merge_free_space(block_group, info, true))
 		goto link;
 
 	/*
@@ -1636,6 +1649,7 @@ __btrfs_return_cluster_to_free_space(
 		node = rb_next(&entry->offset_index);
 		rb_erase(&entry->offset_index, &cluster->root);
 		BUG_ON(entry->bitmap);
+		try_merge_free_space(block_group, entry, false);
 		tree_insert_offset(&block_group->free_space_offset,
 				   entry->offset, &entry->offset_index, 0);
 	}
-- 
cgit v1.2.3


From 83a4d54840c88a4a45c49670f044b8c7ddeaa8c7 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 27 Dec 2010 16:19:53 +0800
Subject: Btrfs: Fix memory leak at umount

fs_info, which is allocated in open_ctree(), should be freed
in close_ctree().

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/disk-io.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a5d2249e6da..089871e5cd5 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2513,6 +2513,8 @@ int close_ctree(struct btrfs_root *root)
 	kfree(fs_info->chunk_root);
 	kfree(fs_info->dev_root);
 	kfree(fs_info->csum_root);
+	kfree(fs_info);
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From bdc924bb4cdac92b945945c3149ab8191c92d75d Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Mon, 27 Dec 2010 16:33:15 +0800
Subject: Btrfs: Fix memory leak on finding existing super

We missed a memory deallocation in commit 450ba0ea.

If an existing super block is found at mount and there is no
error condition then the pre-allocated tree_root and fs_info
are not used and are not freed.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/super.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 61bd79abb80..f50253c2279 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -654,6 +654,8 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 		}
 
 		btrfs_close_devices(fs_devices);
+		kfree(fs_info);
+		kfree(tree_root);
 	} else {
 		char b[BDEVNAME_SIZE];
 
-- 
cgit v1.2.3


From 3f3d0bc0df041236fad4ffa82188a6e4ef9af75e Mon Sep 17 00:00:00 2001
From: Tero Roponen <tero.roponen@gmail.com>
Date: Mon, 27 Dec 2010 16:43:13 +0800
Subject: Btrfs: Free correct pointer after using strsep

We must save and free the original kstrdup()'ed pointer
because strsep() modifies its first argument.

Signed-off-by: Tero Roponen <tero.roponen@gmail.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/super.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f50253c2279..78ee681465a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -277,7 +277,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
 		struct btrfs_fs_devices **fs_devices)
 {
 	substring_t args[MAX_OPT_ARGS];
-	char *opts, *p;
+	char *opts, *orig, *p;
 	int error = 0;
 	int intarg;
 
@@ -291,6 +291,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
 	opts = kstrdup(options, GFP_KERNEL);
 	if (!opts)
 		return -ENOMEM;
+	orig = opts;
 
 	while ((p = strsep(&opts, ",")) != NULL) {
 		int token;
@@ -326,7 +327,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
 	}
 
  out_free_opts:
-	kfree(opts);
+	kfree(orig);
  out:
 	/*
 	 * If no subvolume name is specified we use the default one.  Allocate
-- 
cgit v1.2.3


From d0f69686c2ae775529aadc7a8acc6f13ad41de66 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Tue, 25 Jan 2011 15:46:17 +0800
Subject: Btrfs: Don't return acl info when mounting with noacl option

Steps to reproduce:

  # mkfs.btrfs /dev/sda2
  # mount /dev/sda2 /mnt
  # touch /mnt/file0
  # setfacl -m 'u:root:x,g::x,o::x' /mnt/file0
  # umount /mnt
  # mount /dev/sda2 -o noacl /mnt
  # getfacl /mnt/file0
  ...
  user::rw-
  user:root:--x
  group::--x
  mask::--x
  other::--x

The output should be:

  user::rw-
  group::--x
  other::--x

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/acl.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 2222d161c7b..3c52fc8afe2 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -37,6 +37,9 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 	char *value = NULL;
 	struct posix_acl *acl;
 
+	if (!IS_POSIXACL(inode))
+		return NULL;
+
 	acl = get_cached_acl(inode, type);
 	if (acl != ACL_NOT_CACHED)
 		return acl;
@@ -82,6 +85,9 @@ static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name,
 	struct posix_acl *acl;
 	int ret = 0;
 
+	if (!IS_POSIXACL(dentry->d_inode))
+		return -EOPNOTSUPP;
+
 	acl = btrfs_get_acl(dentry->d_inode, type);
 
 	if (IS_ERR(acl))
-- 
cgit v1.2.3


From b897abec032deb7cc3ce67392a1f544ac965ddea Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Wed, 26 Jan 2011 16:19:22 +0800
Subject: Btrfs: Fix memory leak in writepage fixup work

fixup, which is allocated when starting page write to fix up the
extent without ORDERED bit set, should be freed after this work
is done.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/inode.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5f9194438f7..3a6edc4c564 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1544,6 +1544,7 @@ out:
 out_page:
 	unlock_page(page);
 	page_cache_release(page);
+	kfree(fixup);
 }
 
 /*
-- 
cgit v1.2.3


From 4d728ec7aefdca5419d2ebfb28c147e81a4b59f4 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 26 Jan 2011 14:10:43 +0800
Subject: Btrfs: Fix file clone when source offset is not 0

Suppose:
- the source extent is: [0, 100]
- the src offset is 10
- the clone length is 90
- the dest offset is 0

This statement:

	new_key.offset = key.offset + destoff - off

will produce such an extent for the dest file:

	[ino, BTRFS_EXTENT_DATA_KEY, -10]

, which is obviously wrong.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/ioctl.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f87552a1d7e..1b61dab6406 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1788,7 +1788,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 
 			memcpy(&new_key, &key, sizeof(new_key));
 			new_key.objectid = inode->i_ino;
-			new_key.offset = key.offset + destoff - off;
+			if (off <= key.offset)
+				new_key.offset = key.offset + destoff - off;
+			else
+				new_key.offset = destoff;
 
 			trans = btrfs_start_transaction(root, 1);
 			if (IS_ERR(trans)) {
-- 
cgit v1.2.3


From 7db37c5e6575b229a5051be1d3ef15257ae0ba5d Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 27 Jan 2011 12:02:00 +1100
Subject: xfs: fix log ticket leak on forced shutdown.

The kmemleak detector shows this after test 139:

unreferenced object 0xffff880079b88bb0 (size 264):
  comm "xfs_io", pid 4904, jiffies 4294909382 (age 276.824s)
  hex dump (first 32 bytes):
    00 00 00 00 ad 4e ad de ff ff ff ff 00 00 00 00  .....N..........
    ff ff ff ff ff ff ff ff 48 7b c9 82 ff ff ff ff  ........H{......
  backtrace:
    [<ffffffff81afb04d>] kmemleak_alloc+0x2d/0x60
    [<ffffffff8115c6cf>] kmem_cache_alloc+0x13f/0x2b0
    [<ffffffff814aaa97>] kmem_zone_alloc+0x77/0xf0
    [<ffffffff814aab2e>] kmem_zone_zalloc+0x1e/0x50
    [<ffffffff8148f394>] xlog_ticket_alloc+0x34/0x170
    [<ffffffff81494444>] xlog_cil_push+0xa4/0x3f0
    [<ffffffff81494eca>] xlog_cil_force_lsn+0x15a/0x160
    [<ffffffff814933a5>] _xfs_log_force_lsn+0x75/0x2d0
    [<ffffffff814a264d>] _xfs_trans_commit+0x2bd/0x2f0
    [<ffffffff8148bfdd>] xfs_iomap_write_allocate+0x1ad/0x350
    [<ffffffff814ac17f>] xfs_map_blocks+0x21f/0x370
    [<ffffffff814ad1b7>] xfs_vm_writepage+0x1c7/0x550
    [<ffffffff8112200a>] __writepage+0x1a/0x50
    [<ffffffff81122df2>] write_cache_pages+0x1c2/0x4c0
    [<ffffffff81123117>] generic_writepages+0x27/0x30
    [<ffffffff814aba5d>] xfs_vm_writepages+0x5d/0x80

By inspection, the leak occurs when xlog_write() returns an error
and we jump to the abort path without dropping the reference on the
active ticket.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_log_cil.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 9dc8125d04e..c7eac5acbfe 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -543,7 +543,7 @@ xlog_cil_push(
 
 	error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
 	if (error)
-		goto out_abort;
+		goto out_abort_free_ticket;
 
 	/*
 	 * now that we've written the checkpoint into the log, strictly
@@ -569,8 +569,9 @@ restart:
 	}
 	spin_unlock(&cil->xc_cil_lock);
 
+	/* xfs_log_done always frees the ticket on error. */
 	commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
-	if (error || commit_lsn == -1)
+	if (commit_lsn == -1)
 		goto out_abort;
 
 	/* attach all the transactions w/ busy extents to iclog */
@@ -600,6 +601,8 @@ out_free_ticket:
 	kmem_free(new_ctx);
 	return 0;
 
+out_abort_free_ticket:
+	xfs_log_ticket_put(tic);
 out_abort:
 	xlog_cil_committed(ctx, XFS_LI_ABORTED);
 	return XFS_ERROR(EIO);
-- 
cgit v1.2.3


From ee2c9258501f83d3ed0fd09ce5df1cec53312cf0 Mon Sep 17 00:00:00 2001
From: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Date: Thu, 27 Jan 2011 09:58:04 -0600
Subject: cifs: More crypto cleanup (try #2)

Replaced md4 hashing function local to cifs module with kernel crypto APIs.
As a result, md4 hashing function and its supporting functions in
file md4.c are not needed anymore.

Cleaned up function declarations, removed forward function declarations,
and removed a header file that is being deleted from being included.

Verified that sec=ntlm/i, sec=ntlmv2/i, and sec=ntlmssp/i work correctly.

Signed-off-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/Makefile      |   2 +-
 fs/cifs/cifsencrypt.c |  32 +++++---
 fs/cifs/cifsencrypt.h |  33 --------
 fs/cifs/cifsproto.h   |   9 ++-
 fs/cifs/connect.c     |   6 +-
 fs/cifs/link.c        |   5 +-
 fs/cifs/md4.c         | 205 --------------------------------------------------
 fs/cifs/smbdes.c      |   1 -
 fs/cifs/smbencrypt.c  |  90 +++++++++++++++-------
 9 files changed, 97 insertions(+), 286 deletions(-)
 delete mode 100644 fs/cifs/cifsencrypt.h
 delete mode 100644 fs/cifs/md4.c

(limited to 'fs')

diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index e1322296cb6..d87558448e3 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -5,7 +5,7 @@ obj-$(CONFIG_CIFS) += cifs.o
 
 cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
 	  link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \
-	  md4.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
+	  cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
 	  readdir.o ioctl.o sess.o export.o
 
 cifs-$(CONFIG_CIFS_ACL) += cifsacl.o
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 35bf329c90e..0db5f1de022 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -36,11 +36,6 @@
 /* Note that the smb header signature field on input contains the
 	sequence number before this function is called */
 
-extern void mdfour(unsigned char *out, unsigned char *in, int n);
-extern void E_md4hash(const unsigned char *passwd, unsigned char *p16);
-extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
-		       unsigned char *p24);
-
 static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
 				struct TCP_Server_Info *server, char *signature)
 {
@@ -233,6 +228,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
 /* first calculate 24 bytes ntlm response and then 16 byte session key */
 int setup_ntlm_response(struct cifsSesInfo *ses)
 {
+	int rc = 0;
 	unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE;
 	char temp_key[CIFS_SESS_KEY_SIZE];
 
@@ -246,13 +242,26 @@ int setup_ntlm_response(struct cifsSesInfo *ses)
 	}
 	ses->auth_key.len = temp_len;
 
-	SMBNTencrypt(ses->password, ses->server->cryptkey,
+	rc = SMBNTencrypt(ses->password, ses->server->cryptkey,
 			ses->auth_key.response + CIFS_SESS_KEY_SIZE);
+	if (rc) {
+		cFYI(1, "%s Can't generate NTLM response, error: %d",
+			__func__, rc);
+		return rc;
+	}
+
+	rc = E_md4hash(ses->password, temp_key);
+	if (rc) {
+		cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
+		return rc;
+	}
 
-	E_md4hash(ses->password, temp_key);
-	mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE);
+	rc = mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE);
+	if (rc)
+		cFYI(1, "%s Can't generate NTLM session key, error: %d",
+			__func__, rc);
 
-	return 0;
+	return rc;
 }
 
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
@@ -699,14 +708,13 @@ cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
 	unsigned int size;
 
 	server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
-	if (!server->secmech.hmacmd5 ||
-			IS_ERR(server->secmech.hmacmd5)) {
+	if (IS_ERR(server->secmech.hmacmd5)) {
 		cERROR(1, "could not allocate crypto hmacmd5\n");
 		return PTR_ERR(server->secmech.hmacmd5);
 	}
 
 	server->secmech.md5 = crypto_alloc_shash("md5", 0, 0);
-	if (!server->secmech.md5 || IS_ERR(server->secmech.md5)) {
+	if (IS_ERR(server->secmech.md5)) {
 		cERROR(1, "could not allocate crypto md5\n");
 		rc = PTR_ERR(server->secmech.md5);
 		goto crypto_allocate_md5_fail;
diff --git a/fs/cifs/cifsencrypt.h b/fs/cifs/cifsencrypt.h
deleted file mode 100644
index 15d2ec00647..00000000000
--- a/fs/cifs/cifsencrypt.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- *   fs/cifs/cifsencrypt.h
- *
- *   Copyright (c) International Business Machines  Corp., 2005
- *   Author(s): Steve French (sfrench@us.ibm.com)
- *
- *   Externs for misc. small encryption routines
- *   so we do not have to put them in cifsproto.h
- *
- *   This library is free software; you can redistribute it and/or modify
- *   it under the terms of the GNU Lesser General Public License as published
- *   by the Free Software Foundation; either version 2.1 of the License, or
- *   (at your option) any later version.
- *
- *   This library is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU Lesser General Public License for more details.
- *
- *   You should have received a copy of the GNU Lesser General Public License
- *   along with this library; if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-/* md4.c */
-extern void mdfour(unsigned char *out, unsigned char *in, int n);
-/* smbdes.c */
-extern void E_P16(unsigned char *p14, unsigned char *p16);
-extern void E_P24(unsigned char *p21, const unsigned char *c8,
-		  unsigned char *p24);
-
-
-
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 35c989f4924..8096f27ad9a 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -375,7 +375,7 @@ extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
 extern int cifs_verify_signature(struct smb_hdr *,
 				 struct TCP_Server_Info *server,
 				__u32 expected_sequence_number);
-extern void SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *);
+extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *);
 extern int setup_ntlm_response(struct cifsSesInfo *);
 extern int setup_ntlmv2_rsp(struct cifsSesInfo *, const struct nls_table *);
 extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
@@ -425,4 +425,11 @@ extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr);
 extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr,
 		const unsigned char *path,
 		struct cifs_sb_info *cifs_sb, int xid);
+extern int mdfour(unsigned char *, unsigned char *, int);
+extern int E_md4hash(const unsigned char *passwd, unsigned char *p16);
+extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
+			unsigned char *p24);
+extern void E_P16(unsigned char *p14, unsigned char *p16);
+extern void E_P24(unsigned char *p21, const unsigned char *c8,
+			unsigned char *p24);
 #endif			/* _CIFSPROTO_H */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 47034af67b0..47d8ff62368 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -55,9 +55,6 @@
 /* SMB echo "timeout" -- FIXME: tunable? */
 #define SMB_ECHO_INTERVAL (60 * HZ)
 
-extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
-			 unsigned char *p24);
-
 extern mempool_t *cifs_req_poolp;
 
 struct smb_vol {
@@ -2990,7 +2987,8 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 					 bcc_ptr);
 		else
 #endif /* CIFS_WEAK_PW_HASH */
-		SMBNTencrypt(tcon->password, ses->server->cryptkey, bcc_ptr);
+		rc = SMBNTencrypt(tcon->password, ses->server->cryptkey,
+					bcc_ptr);
 
 		bcc_ptr += CIFS_AUTH_RESP_SIZE;
 		if (ses->capabilities & CAP_UNICODE) {
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index d3444ea6ac7..02cd60aefbf 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -54,10 +54,9 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
 	struct sdesc *sdescmd5;
 
 	md5 = crypto_alloc_shash("md5", 0, 0);
-	if (!md5 || IS_ERR(md5)) {
-		rc = PTR_ERR(md5);
+	if (IS_ERR(md5)) {
 		cERROR(1, "%s: Crypto md5 allocation error %d\n", __func__, rc);
-		return rc;
+		return PTR_ERR(md5);
 	}
 	size = sizeof(struct shash_desc) + crypto_shash_descsize(md5);
 	sdescmd5 = kmalloc(size, GFP_KERNEL);
diff --git a/fs/cifs/md4.c b/fs/cifs/md4.c
deleted file mode 100644
index a725c2609d6..00000000000
--- a/fs/cifs/md4.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/*
-   Unix SMB/Netbios implementation.
-   Version 1.9.
-   a implementation of MD4 designed for use in the SMB authentication protocol
-   Copyright (C) Andrew Tridgell 1997-1998.
-   Modified by Steve French (sfrench@us.ibm.com) 2002-2003
-
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 2 of the License, or
-   (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-*/
-#include <linux/module.h>
-#include <linux/fs.h>
-#include "cifsencrypt.h"
-
-/* NOTE: This code makes no attempt to be fast! */
-
-static __u32
-F(__u32 X, __u32 Y, __u32 Z)
-{
-	return (X & Y) | ((~X) & Z);
-}
-
-static __u32
-G(__u32 X, __u32 Y, __u32 Z)
-{
-	return (X & Y) | (X & Z) | (Y & Z);
-}
-
-static __u32
-H(__u32 X, __u32 Y, __u32 Z)
-{
-	return X ^ Y ^ Z;
-}
-
-static __u32
-lshift(__u32 x, int s)
-{
-	x &= 0xFFFFFFFF;
-	return ((x << s) & 0xFFFFFFFF) | (x >> (32 - s));
-}
-
-#define ROUND1(a,b,c,d,k,s) (*a) = lshift((*a) + F(*b,*c,*d) + X[k], s)
-#define ROUND2(a,b,c,d,k,s) (*a) = lshift((*a) + G(*b,*c,*d) + X[k] + (__u32)0x5A827999,s)
-#define ROUND3(a,b,c,d,k,s) (*a) = lshift((*a) + H(*b,*c,*d) + X[k] + (__u32)0x6ED9EBA1,s)
-
-/* this applies md4 to 64 byte chunks */
-static void
-mdfour64(__u32 *M, __u32 *A, __u32 *B, __u32 *C, __u32 *D)
-{
-	int j;
-	__u32 AA, BB, CC, DD;
-	__u32 X[16];
-
-
-	for (j = 0; j < 16; j++)
-		X[j] = M[j];
-
-	AA = *A;
-	BB = *B;
-	CC = *C;
-	DD = *D;
-
-	ROUND1(A, B, C, D, 0, 3);
-	ROUND1(D, A, B, C, 1, 7);
-	ROUND1(C, D, A, B, 2, 11);
-	ROUND1(B, C, D, A, 3, 19);
-	ROUND1(A, B, C, D, 4, 3);
-	ROUND1(D, A, B, C, 5, 7);
-	ROUND1(C, D, A, B, 6, 11);
-	ROUND1(B, C, D, A, 7, 19);
-	ROUND1(A, B, C, D, 8, 3);
-	ROUND1(D, A, B, C, 9, 7);
-	ROUND1(C, D, A, B, 10, 11);
-	ROUND1(B, C, D, A, 11, 19);
-	ROUND1(A, B, C, D, 12, 3);
-	ROUND1(D, A, B, C, 13, 7);
-	ROUND1(C, D, A, B, 14, 11);
-	ROUND1(B, C, D, A, 15, 19);
-
-	ROUND2(A, B, C, D, 0, 3);
-	ROUND2(D, A, B, C, 4, 5);
-	ROUND2(C, D, A, B, 8, 9);
-	ROUND2(B, C, D, A, 12, 13);
-	ROUND2(A, B, C, D, 1, 3);
-	ROUND2(D, A, B, C, 5, 5);
-	ROUND2(C, D, A, B, 9, 9);
-	ROUND2(B, C, D, A, 13, 13);
-	ROUND2(A, B, C, D, 2, 3);
-	ROUND2(D, A, B, C, 6, 5);
-	ROUND2(C, D, A, B, 10, 9);
-	ROUND2(B, C, D, A, 14, 13);
-	ROUND2(A, B, C, D, 3, 3);
-	ROUND2(D, A, B, C, 7, 5);
-	ROUND2(C, D, A, B, 11, 9);
-	ROUND2(B, C, D, A, 15, 13);
-
-	ROUND3(A, B, C, D, 0, 3);
-	ROUND3(D, A, B, C, 8, 9);
-	ROUND3(C, D, A, B, 4, 11);
-	ROUND3(B, C, D, A, 12, 15);
-	ROUND3(A, B, C, D, 2, 3);
-	ROUND3(D, A, B, C, 10, 9);
-	ROUND3(C, D, A, B, 6, 11);
-	ROUND3(B, C, D, A, 14, 15);
-	ROUND3(A, B, C, D, 1, 3);
-	ROUND3(D, A, B, C, 9, 9);
-	ROUND3(C, D, A, B, 5, 11);
-	ROUND3(B, C, D, A, 13, 15);
-	ROUND3(A, B, C, D, 3, 3);
-	ROUND3(D, A, B, C, 11, 9);
-	ROUND3(C, D, A, B, 7, 11);
-	ROUND3(B, C, D, A, 15, 15);
-
-	*A += AA;
-	*B += BB;
-	*C += CC;
-	*D += DD;
-
-	*A &= 0xFFFFFFFF;
-	*B &= 0xFFFFFFFF;
-	*C &= 0xFFFFFFFF;
-	*D &= 0xFFFFFFFF;
-
-	for (j = 0; j < 16; j++)
-		X[j] = 0;
-}
-
-static void
-copy64(__u32 *M, unsigned char *in)
-{
-	int i;
-
-	for (i = 0; i < 16; i++)
-		M[i] = (in[i * 4 + 3] << 24) | (in[i * 4 + 2] << 16) |
-		    (in[i * 4 + 1] << 8) | (in[i * 4 + 0] << 0);
-}
-
-static void
-copy4(unsigned char *out, __u32 x)
-{
-	out[0] = x & 0xFF;
-	out[1] = (x >> 8) & 0xFF;
-	out[2] = (x >> 16) & 0xFF;
-	out[3] = (x >> 24) & 0xFF;
-}
-
-/* produce a md4 message digest from data of length n bytes */
-void
-mdfour(unsigned char *out, unsigned char *in, int n)
-{
-	unsigned char buf[128];
-	__u32 M[16];
-	__u32 b = n * 8;
-	int i;
-	__u32 A = 0x67452301;
-	__u32 B = 0xefcdab89;
-	__u32 C = 0x98badcfe;
-	__u32 D = 0x10325476;
-
-	while (n > 64) {
-		copy64(M, in);
-		mdfour64(M, &A, &B, &C, &D);
-		in += 64;
-		n -= 64;
-	}
-
-	for (i = 0; i < 128; i++)
-		buf[i] = 0;
-	memcpy(buf, in, n);
-	buf[n] = 0x80;
-
-	if (n <= 55) {
-		copy4(buf + 56, b);
-		copy64(M, buf);
-		mdfour64(M, &A, &B, &C, &D);
-	} else {
-		copy4(buf + 120, b);
-		copy64(M, buf);
-		mdfour64(M, &A, &B, &C, &D);
-		copy64(M, buf + 64);
-		mdfour64(M, &A, &B, &C, &D);
-	}
-
-	for (i = 0; i < 128; i++)
-		buf[i] = 0;
-	copy64(M, buf);
-
-	copy4(out, A);
-	copy4(out + 4, B);
-	copy4(out + 8, C);
-	copy4(out + 12, D);
-
-	A = B = C = D = 0;
-}
diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c
index b6b6dcb500b..04721485925 100644
--- a/fs/cifs/smbdes.c
+++ b/fs/cifs/smbdes.c
@@ -45,7 +45,6 @@
    up with a different answer to the one above)
 */
 #include <linux/slab.h>
-#include "cifsencrypt.h"
 #define uchar unsigned char
 
 static uchar perm1[56] = { 57, 49, 41, 33, 25, 17, 9,
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 30135005e4f..b5450e9f40c 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -33,7 +33,7 @@
 #include "cifspdu.h"
 #include "cifsglob.h"
 #include "cifs_debug.h"
-#include "cifsencrypt.h"
+#include "cifsproto.h"
 
 #ifndef false
 #define false 0
@@ -47,14 +47,57 @@
 #define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8)
 #define SSVAL(buf,pos,val) SSVALX((buf),(pos),((__u16)(val)))
 
-/*The following definitions come from  libsmb/smbencrypt.c  */
+/* produce a md4 message digest from data of length n bytes */
+int
+mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len)
+{
+	int rc;
+	unsigned int size;
+	struct crypto_shash *md4;
+	struct sdesc *sdescmd4;
+
+	md4 = crypto_alloc_shash("md4", 0, 0);
+	if (IS_ERR(md4)) {
+		cERROR(1, "%s: Crypto md4 allocation error %d\n", __func__, rc);
+		return PTR_ERR(md4);
+	}
+	size = sizeof(struct shash_desc) + crypto_shash_descsize(md4);
+	sdescmd4 = kmalloc(size, GFP_KERNEL);
+	if (!sdescmd4) {
+		rc = -ENOMEM;
+		cERROR(1, "%s: Memory allocation failure\n", __func__);
+		goto mdfour_err;
+	}
+	sdescmd4->shash.tfm = md4;
+	sdescmd4->shash.flags = 0x0;
+
+	rc = crypto_shash_init(&sdescmd4->shash);
+	if (rc) {
+		cERROR(1, "%s: Could not init md4 shash\n", __func__);
+		goto mdfour_err;
+	}
+	crypto_shash_update(&sdescmd4->shash, link_str, link_len);
+	rc = crypto_shash_final(&sdescmd4->shash, md4_hash);
 
-void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
-		unsigned char *p24);
-void E_md4hash(const unsigned char *passwd, unsigned char *p16);
-static void SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8,
-		   unsigned char p24[24]);
-void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24);
+mdfour_err:
+	crypto_free_shash(md4);
+	kfree(sdescmd4);
+
+	return rc;
+}
+
+/* Does the des encryption from the NT or LM MD4 hash. */
+static void
+SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8,
+	      unsigned char p24[24])
+{
+	unsigned char p21[21];
+
+	memset(p21, '\0', 21);
+
+	memcpy(p21, passwd, 16);
+	E_P24(p21, c8, p24);
+}
 
 /*
    This implements the X/Open SMB password encryption
@@ -117,9 +160,10 @@ _my_mbstowcs(__u16 *dst, const unsigned char *src, int len)
  * Creates the MD4 Hash of the users password in NT UNICODE.
  */
 
-void
+int
 E_md4hash(const unsigned char *passwd, unsigned char *p16)
 {
+	int rc;
 	int len;
 	__u16 wpwd[129];
 
@@ -138,8 +182,10 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16)
 	/* Calculate length in bytes */
 	len = _my_wcslen(wpwd) * sizeof(__u16);
 
-	mdfour(p16, (unsigned char *) wpwd, len);
+	rc = mdfour(p16, (unsigned char *) wpwd, len);
 	memset(wpwd, 0, 129 * 2);
+
+	return rc;
 }
 
 #if 0 /* currently unused */
@@ -211,19 +257,6 @@ ntv2_owf_gen(const unsigned char owf[16], const char *user_n,
 }
 #endif
 
-/* Does the des encryption from the NT or LM MD4 hash. */
-static void
-SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8,
-	      unsigned char p24[24])
-{
-	unsigned char p21[21];
-
-	memset(p21, '\0', 21);
-
-	memcpy(p21, passwd, 16);
-	E_P24(p21, c8, p24);
-}
-
 /* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */
 #if 0 /* currently unused */
 static void
@@ -241,16 +274,21 @@ NTLMSSPOWFencrypt(unsigned char passwd[8],
 #endif
 
 /* Does the NT MD4 hash then des encryption. */
-
-void
+int
 SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24)
 {
+	int rc;
 	unsigned char p21[21];
 
 	memset(p21, '\0', 21);
 
-	E_md4hash(passwd, p21);
+	rc = E_md4hash(passwd, p21);
+	if (rc) {
+		cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
+		return rc;
+	}
 	SMBOWFencrypt(p21, c8, p24);
+	return rc;
 }
 
 
-- 
cgit v1.2.3


From e34a314c5e49fe6b763568f6576b19f1299c33c2 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 27 Jan 2011 12:13:35 +1100
Subject: xfs: fix efi item leak on forced shutdown

After test 139, kmemleak shows:

unreferenced object 0xffff880078b405d8 (size 400):
  comm "xfs_io", pid 4904, jiffies 4294909383 (age 1186.728s)
  hex dump (first 32 bytes):
    60 c1 17 79 00 88 ff ff 60 c1 17 79 00 88 ff ff  `..y....`..y....
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  backtrace:
    [<ffffffff81afb04d>] kmemleak_alloc+0x2d/0x60
    [<ffffffff8115c6cf>] kmem_cache_alloc+0x13f/0x2b0
    [<ffffffff814aaa97>] kmem_zone_alloc+0x77/0xf0
    [<ffffffff814aab2e>] kmem_zone_zalloc+0x1e/0x50
    [<ffffffff8147cd6b>] xfs_efi_init+0x4b/0xb0
    [<ffffffff814a4ee8>] xfs_trans_get_efi+0x58/0x90
    [<ffffffff81455fab>] xfs_bmap_finish+0x8b/0x1d0
    [<ffffffff814851b4>] xfs_itruncate_finish+0x2c4/0x5d0
    [<ffffffff814a970f>] xfs_setattr+0x8df/0xa70
    [<ffffffff814b5c7b>] xfs_vn_setattr+0x1b/0x20
    [<ffffffff8117dc00>] notify_change+0x170/0x2e0
    [<ffffffff81163bf6>] do_truncate+0x66/0xa0
    [<ffffffff81163d0b>] sys_ftruncate+0xdb/0xe0
    [<ffffffff8103a002>] system_call_fastpath+0x16/0x1b
    [<ffffffffffffffff>] 0xffffffffffffffff

The cause of the leak is that the "remove" parameter of IOP_UNPIN()
is never set when a CIL push is aborted. This means that the EFI
item is never freed if it was in the push being cancelled. The
problem is specific to delayed logging, but has uncovered a couple
of problems with the handling of IOP_UNPIN(remove).

Firstly, we cannot safely call xfs_trans_del_item() from IOP_UNPIN()
in the CIL commit failure path or the iclog write failure path
because for delayed logging we have no transaction context. Hence we
must only call xfs_trans_del_item() if the log item being unpinned
has an active log item descriptor.

Secondly, xfs_trans_uncommit() does not handle log item descriptor
freeing during the traversal of log items on a transaction. It can
reference a freed log item descriptor when unpinning an EFI item.
Hence it needs to use a safe list traversal method to allow items to
be removed from the transaction during IOP_UNPIN().

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_buf_item.c     | 12 +++++++-----
 fs/xfs/xfs_extfree_item.c |  3 ++-
 fs/xfs/xfs_trans.c        | 36 +++++++++++++++++++++++++++++-------
 3 files changed, 38 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 98c6f73b675..6f8c21ce0d6 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -427,13 +427,15 @@ xfs_buf_item_unpin(
 
 		if (remove) {
 			/*
-			 * We have to remove the log item from the transaction
-			 * as we are about to release our reference to the
-			 * buffer.  If we don't, the unlock that occurs later
-			 * in xfs_trans_uncommit() will ry to reference the
+			 * If we are in a transaction context, we have to
+			 * remove the log item from the transaction as we are
+			 * about to release our reference to the buffer.  If we
+			 * don't, the unlock that occurs later in
+			 * xfs_trans_uncommit() will try to reference the
 			 * buffer which we no longer have a hold on.
 			 */
-			xfs_trans_del_item(lip);
+			if (lip->li_desc)
+				xfs_trans_del_item(lip);
 
 			/*
 			 * Since the transaction no longer refers to the buffer,
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 75f2ef60e57..d22e6262343 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -138,7 +138,8 @@ xfs_efi_item_unpin(
 
 	if (remove) {
 		ASSERT(!(lip->li_flags & XFS_LI_IN_AIL));
-		xfs_trans_del_item(lip);
+		if (lip->li_desc)
+			xfs_trans_del_item(lip);
 		xfs_efi_item_free(efip);
 		return;
 	}
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 33dbc4e0ad6..29f5e542489 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1446,6 +1446,14 @@ xfs_log_item_batch_insert(
  * Bulk operation version of xfs_trans_committed that takes a log vector of
  * items to insert into the AIL. This uses bulk AIL insertion techniques to
  * minimise lock traffic.
+ *
+ * If we are called with the aborted flag set, it is because a log write during
+ * a CIL checkpoint commit has failed. In this case, all the items in the
+ * checkpoint have already gone through IOP_COMMITTED and IOP_UNLOCK, which
+ * means that checkpoint commit abort handling is treated exactly the same
+ * as an iclog write error even though we haven't started any IO yet. Hence in
+ * this case all we need to do is IOP_COMMITTED processing, followed by an
+ * IOP_UNPIN(aborted) call.
  */
 void
 xfs_trans_committed_bulk(
@@ -1472,6 +1480,16 @@ xfs_trans_committed_bulk(
 		if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
 			continue;
 
+		/*
+		 * if we are aborting the operation, no point in inserting the
+		 * object into the AIL as we are in a shutdown situation.
+		 */
+		if (aborted) {
+			ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount));
+			IOP_UNPIN(lip, 1);
+			continue;
+		}
+
 		if (item_lsn != commit_lsn) {
 
 			/*
@@ -1503,20 +1521,24 @@ xfs_trans_committed_bulk(
 }
 
 /*
- * Called from the trans_commit code when we notice that
- * the filesystem is in the middle of a forced shutdown.
+ * Called from the trans_commit code when we notice that the filesystem is in
+ * the middle of a forced shutdown.
+ *
+ * When we are called here, we have already pinned all the items in the
+ * transaction. However, neither IOP_COMMITTING nor IOP_UNLOCK has been called
+ * so we can simply walk the items in the transaction, unpin them with an abort
+ * flag and then free the items. Note that unpinning the items can result in
+ * them being freed immediately, so we need to use a safe list traversal method
+ * here.
  */
 STATIC void
 xfs_trans_uncommit(
 	struct xfs_trans	*tp,
 	uint			flags)
 {
-	struct xfs_log_item_desc *lidp;
+	struct xfs_log_item_desc *lidp, *n;
 
-	list_for_each_entry(lidp, &tp->t_items, lid_trans) {
-		/*
-		 * Unpin all but those that aren't dirty.
-		 */
+	list_for_each_entry_safe(lidp, n, &tp->t_items, lid_trans) {
 		if (lidp->lid_flags & XFS_LID_DIRTY)
 			IOP_UNPIN(lidp->lid_item, 1);
 	}
-- 
cgit v1.2.3


From b8fc82630ae289bb4e661567808afc59e3298dce Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 27 Jan 2011 12:14:12 +1100
Subject: xfs: speculative delayed allocation uses rounddown_power_of_2 badly

rounddown_power_of_2() returns an undefined result when passed a
value of zero. The speculative delayed allocation code is doing this
when the inode is zero length. Hence occasionally the preallocation
is much, much larger than is necessary (e.g. 8GB for a 270 _byte_
file). Ensure we never pass a zero value to this function so
the result of preallocation is always the desired size.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_iomap.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 55582bd6665..8a0f044750c 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -337,7 +337,12 @@ xfs_iomap_prealloc_size(
 		int shift = 0;
 		int64_t freesp;
 
-		alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size);
+		/*
+		 * rounddown_pow_of_two() returns an undefined result
+		 * if we pass in alloc_blocks = 0. Hence the "+ 1" to
+		 * ensure we always pass in a non-zero value.
+		 */
+		alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1;
 		alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
 					rounddown_pow_of_two(alloc_blocks));
 
-- 
cgit v1.2.3


From 14b064ceaa6f51a7426cc45b4b43685b94380658 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 27 Jan 2011 12:16:28 +1100
Subject: xfs: limit extent length for allocation to AG size

Delayed allocation extents can be larger than AGs, so when trying to
convert a large range we may scan every AG inside
xfs_bmap_btalloc_nullfb() trying to find an AG with a free extent larger
than an AG. We should stop when we find the first AG with a maximum
possible allocation size. This causes excessive CPU usage when there
are lots of AGs.

The same problem occurs when doing preallocation of a range larger
than an AG.

Fix the problem by limiting real allocation lengths to the maximum
that an AG can support. This means if we have empty AGs, we'll stop
the search at the first of them. If there are no empty AGs, we'll
still scan them all, but that is a different problem....

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_alloc.h | 16 ++++++++++++++++
 fs/xfs/xfs_bmap.c  | 18 ++++++++++--------
 2 files changed, 26 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 0ab56b32c7e..d0b3bc72005 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -74,6 +74,22 @@ typedef unsigned int xfs_alloctype_t;
  */
 #define XFS_ALLOC_SET_ASIDE(mp)  (4 + ((mp)->m_sb.sb_agcount * 4))
 
+/*
+ * When deciding how much space to allocate out of an AG, we limit the
+ * allocation maximum size to the size of the AG. However, we cannot use all
+ * the blocks in the AG - some are permanently used by metadata. These
+ * blocks are generally:
+ *	- the AG superblock, AGF, AGI and AGFL
+ *	- the AGF (bno and cnt) and AGI btree root blocks
+ *	- 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits
+ *
+ * The AG headers are sector sized, so the amount of space they take up is
+ * dependent on filesystem geometry. The others are all single blocks.
+ */
+#define XFS_ALLOC_AG_MAX_USABLE(mp)	\
+	((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7)
+
+
 /*
  * Argument structure for xfs_alloc routines.
  * This is turned into a structure to avoid having 20 arguments passed
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 4111cd3966c..f3a3768189b 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2430,7 +2430,7 @@ xfs_bmap_btalloc_nullfb(
 		startag = ag = 0;
 
 	pag = xfs_perag_get(mp, ag);
-	while (*blen < ap->alen) {
+	while (*blen < args->maxlen) {
 		if (!pag->pagf_init) {
 			error = xfs_alloc_pagf_init(mp, args->tp, ag,
 						    XFS_ALLOC_FLAG_TRYLOCK);
@@ -2452,7 +2452,7 @@ xfs_bmap_btalloc_nullfb(
 			notinit = 1;
 
 		if (xfs_inode_is_filestream(ap->ip)) {
-			if (*blen >= ap->alen)
+			if (*blen >= args->maxlen)
 				break;
 
 			if (ap->userdata) {
@@ -2498,14 +2498,14 @@ xfs_bmap_btalloc_nullfb(
 	 * If the best seen length is less than the request
 	 * length, use the best as the minimum.
 	 */
-	else if (*blen < ap->alen)
+	else if (*blen < args->maxlen)
 		args->minlen = *blen;
 	/*
-	 * Otherwise we've seen an extent as big as alen,
+	 * Otherwise we've seen an extent as big as maxlen,
 	 * use that as the minimum.
 	 */
 	else
-		args->minlen = ap->alen;
+		args->minlen = args->maxlen;
 
 	/*
 	 * set the failure fallback case to look in the selected
@@ -2573,7 +2573,9 @@ xfs_bmap_btalloc(
 	args.tp = ap->tp;
 	args.mp = mp;
 	args.fsbno = ap->rval;
-	args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks);
+
+	/* Trim the allocation back to the maximum an AG can fit. */
+	args.maxlen = MIN(ap->alen, XFS_ALLOC_AG_MAX_USABLE(mp));
 	args.firstblock = ap->firstblock;
 	blen = 0;
 	if (nullfb) {
@@ -2621,7 +2623,7 @@ xfs_bmap_btalloc(
 			/*
 			 * Adjust for alignment
 			 */
-			if (blen > args.alignment && blen <= ap->alen)
+			if (blen > args.alignment && blen <= args.maxlen)
 				args.minlen = blen - args.alignment;
 			args.minalignslop = 0;
 		} else {
@@ -2640,7 +2642,7 @@ xfs_bmap_btalloc(
 			 * of minlen+alignment+slop doesn't go up
 			 * between the calls.
 			 */
-			if (blen > mp->m_dalign && blen <= ap->alen)
+			if (blen > mp->m_dalign && blen <= args.maxlen)
 				nextminlen = blen - mp->m_dalign;
 			else
 				nextminlen = args.minlen;
-- 
cgit v1.2.3


From 4ce159890c00e2cc705e955a939bf1dca7b07ab8 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 27 Jan 2011 12:17:58 +1100
Subject: xfs: prevent extsize alignment from exceeding maximum extent size

When doing delayed allocation, if the allocation size is for a
maximally sized extent, extent size alignment can push it over this
limit. This results in an assert failure in xfs_bmbt_set_allf() as
the extent length is too large to find in the extent record.

Fix this by ensuring that we allow for space that extent size
alignment requires (up to 2 * (extsize -1) blocks as we have to
handle both head and tail alignment) when limiting the maximum size
of the extent.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_bmap.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index f3a3768189b..3e9c278a8f7 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -4487,6 +4487,16 @@ xfs_bmapi(
 				/* Figure out the extent size, adjust alen */
 				extsz = xfs_get_extsz_hint(ip);
 				if (extsz) {
+					/*
+					 * make sure we don't exceed a single
+					 * extent length when we align the
+					 * extent by reducing length we are
+					 * going to allocate by the maximum
+					 * amount extent size alignment may
+					 * require.
+					 */
+					alen = XFS_FILBLKS_MIN(len,
+						   MAXEXTLEN - (2 * extsz - 1));
 					error = xfs_bmap_extsize_align(mp,
 							&got, &prev, extsz,
 							rt, eof,
-- 
cgit v1.2.3


From 5315837daee7ed76c31ef643915f7d76ef8c1aa3 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 27 Jan 2011 12:18:18 +1100
Subject: xfs: limit extsize to size of AGs and/or MAXEXTLEN
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The extent size hint can be set to larger than an AG. This means
that the alignment process can push the range to be allocated
outside the bounds of the AG, resulting in assert failures or
corrupted bmbt records. Similarly, if the extsize is larger than the
maximum extent size supported, the alignment process will produce
extents that are too large to fit into the bmbt records, resulting
in a different type of assert/corruption failure.

Fix this by limiting extsize at the time it is set firstly to be
less than MAXEXTLEN, then to be a maximum of half the size of the
AGs in the filesystem for non-realtime inodes. Realtime inodes do
not allocate out of AGs, so don't have to be restricted by the size
of AGs.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_ioctl.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index b06ede1d0be..f5e2a19e0f8 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -985,10 +985,22 @@ xfs_ioctl_setattr(
 
 		/*
 		 * Extent size must be a multiple of the appropriate block
-		 * size, if set at all.
+		 * size, if set at all. It must also be smaller than the
+		 * maximum extent size supported by the filesystem.
+		 *
+		 * Also, for non-realtime files, limit the extent size hint to
+		 * half the size of the AGs in the filesystem so alignment
+		 * doesn't result in extents larger than an AG.
 		 */
 		if (fa->fsx_extsize != 0) {
-			xfs_extlen_t	size;
+			xfs_extlen_t    size;
+			xfs_fsblock_t   extsize_fsb;
+
+			extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
+			if (extsize_fsb > MAXEXTLEN) {
+				code = XFS_ERROR(EINVAL);
+				goto error_return;
+			}
 
 			if (XFS_IS_REALTIME_INODE(ip) ||
 			    ((mask & FSX_XFLAGS) &&
@@ -997,6 +1009,10 @@ xfs_ioctl_setattr(
 				       mp->m_sb.sb_blocklog;
 			} else {
 				size = mp->m_sb.sb_blocksize;
+				if (extsize_fsb > mp->m_sb.sb_agblocks / 2) {
+					code = XFS_ERROR(EINVAL);
+					goto error_return;
+				}
 			}
 
 			if (fa->fsx_extsize % size) {
-- 
cgit v1.2.3


From c6f990d1ff8e4e53b12f4175eb7d7ea710c3ca73 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 27 Jan 2011 13:23:28 +1100
Subject: xfs: handle CIL transaction commit failures correctly

Failure to commit a transaction into the CIL is not handled
correctly. This currently can only happen when racing with a
shutdown and requires an explicit shutdown check, so it is rare and can
be avoided. Remove the shutdown check and make the CIL commit a void
function to indicate it will always succeed, thereby removing the
incorrectly handled failure case.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_log.h     | 2 +-
 fs/xfs/xfs_log_cil.c | 8 +-------
 fs/xfs/xfs_trans.c   | 5 +----
 3 files changed, 3 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 916eb7db14d..3bd3291ef8d 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -191,7 +191,7 @@ void	  xfs_log_ticket_put(struct xlog_ticket *ticket);
 
 xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
 
-int	xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
+void	xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
 				struct xfs_log_vec *log_vector,
 				xfs_lsn_t *commit_lsn, int flags);
 bool	xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index c7eac5acbfe..9ca59be0897 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -625,7 +625,7 @@ out_abort:
  * background commit, returns without it held once background commits are
  * allowed again.
  */
-int
+void
 xfs_log_commit_cil(
 	struct xfs_mount	*mp,
 	struct xfs_trans	*tp,
@@ -640,11 +640,6 @@ xfs_log_commit_cil(
 	if (flags & XFS_TRANS_RELEASE_LOG_RES)
 		log_flags = XFS_LOG_REL_PERM_RESERV;
 
-	if (XLOG_FORCED_SHUTDOWN(log)) {
-		xlog_cil_free_logvec(log_vector);
-		return XFS_ERROR(EIO);
-	}
-
 	/*
 	 * do all the hard work of formatting items (including memory
 	 * allocation) outside the CIL context lock. This prevents stalling CIL
@@ -704,7 +699,6 @@ xfs_log_commit_cil(
 	 */
 	if (push)
 		xlog_cil_push(log, 0);
-	return 0;
 }
 
 /*
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 29f5e542489..76922793f64 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1755,7 +1755,6 @@ xfs_trans_commit_cil(
 	int			flags)
 {
 	struct xfs_log_vec	*log_vector;
-	int			error;
 
 	/*
 	 * Get each log item to allocate a vector structure for
@@ -1766,9 +1765,7 @@ xfs_trans_commit_cil(
 	if (!log_vector)
 		return ENOMEM;
 
-	error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
-	if (error)
-		return error;
+	xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
 
 	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 	xfs_trans_free(tp);
-- 
cgit v1.2.3


From 0fbca4d1c3932c27c4794bf5c2b5fc961cf5a54f Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 28 Jan 2011 11:20:46 +1100
Subject: xfs: fix dquot shaker deadlock

Commit 368e136 ("xfs: remove duplicate code from dquot reclaim") fails
to unlock the dquot freelist when the number of loop restarts is
exceeded in xfs_qm_dqreclaim_one(). This causes hangs in memory
reclaim.

Rework the loop control logic into an unwind stack that all the
different cases jump into. This means there is only one set of code
that processes the loop exit criteria, and simplifies the unlocking
of all the items from different points in the loop. It also fixes a
double increment of the restart counter from the qi_dqlist_lock
case.

Reported-by: Malcolm Scott <lkml@malc.org.uk>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/quota/xfs_qm.c | 46 +++++++++++++++++++++-------------------------
 1 file changed, 21 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index f8e854b4fde..206a2815ced 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -1863,12 +1863,14 @@ xfs_qm_dqreclaim_one(void)
 	xfs_dquot_t	*dqpout;
 	xfs_dquot_t	*dqp;
 	int		restarts;
+	int		startagain;
 
 	restarts = 0;
 	dqpout = NULL;
 
 	/* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
-startagain:
+again:
+	startagain = 0;
 	mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
 
 	list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) {
@@ -1885,13 +1887,10 @@ startagain:
 			ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
 
 			trace_xfs_dqreclaim_want(dqp);
-
-			xfs_dqunlock(dqp);
-			mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
-			if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
-				return NULL;
 			XQM_STATS_INC(xqmstats.xs_qm_dqwants);
-			goto startagain;
+			restarts++;
+			startagain = 1;
+			goto dqunlock;
 		}
 
 		/*
@@ -1906,23 +1905,20 @@ startagain:
 			ASSERT(list_empty(&dqp->q_mplist));
 			list_del_init(&dqp->q_freelist);
 			xfs_Gqm->qm_dqfrlist_cnt--;
-			xfs_dqunlock(dqp);
 			dqpout = dqp;
 			XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
-			break;
+			goto dqunlock;
 		}
 
 		ASSERT(dqp->q_hash);
 		ASSERT(!list_empty(&dqp->q_mplist));
 
 		/*
-		 * Try to grab the flush lock. If this dquot is in the process of
-		 * getting flushed to disk, we don't want to reclaim it.
+		 * Try to grab the flush lock. If this dquot is in the process
+		 * of getting flushed to disk, we don't want to reclaim it.
 		 */
-		if (!xfs_dqflock_nowait(dqp)) {
-			xfs_dqunlock(dqp);
-			continue;
-		}
+		if (!xfs_dqflock_nowait(dqp))
+			goto dqunlock;
 
 		/*
 		 * We have the flush lock so we know that this is not in the
@@ -1944,8 +1940,7 @@ startagain:
 				xfs_fs_cmn_err(CE_WARN, mp,
 			"xfs_qm_dqreclaim: dquot %p flush failed", dqp);
 			}
-			xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
-			continue;
+			goto dqunlock;
 		}
 
 		/*
@@ -1967,13 +1962,8 @@ startagain:
 		 */
 		if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) {
 			restarts++;
-			mutex_unlock(&dqp->q_hash->qh_lock);
-			xfs_dqfunlock(dqp);
-			xfs_dqunlock(dqp);
-			mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
-			if (restarts++ >= XFS_QM_RECLAIM_MAX_RESTARTS)
-				return NULL;
-			goto startagain;
+			startagain = 1;
+			goto qhunlock;
 		}
 
 		ASSERT(dqp->q_nrefs == 0);
@@ -1986,14 +1976,20 @@ startagain:
 		xfs_Gqm->qm_dqfrlist_cnt--;
 		dqpout = dqp;
 		mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
+qhunlock:
 		mutex_unlock(&dqp->q_hash->qh_lock);
 dqfunlock:
 		xfs_dqfunlock(dqp);
+dqunlock:
 		xfs_dqunlock(dqp);
 		if (dqpout)
 			break;
 		if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
-			return NULL;
+			break;
+		if (startagain) {
+			mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
+			goto again;
+		}
 	}
 	mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
 	return dqpout;
-- 
cgit v1.2.3


From 24446fc66fdebbdd8baca0f44fd2a47ad77ba580 Mon Sep 17 00:00:00 2001
From: "bpm@sgi.com" <bpm@sgi.com>
Date: Wed, 19 Jan 2011 17:41:58 +0000
Subject: xfs: xfs_bmap_add_extent_delay_real should init br_startblock

When filling in the middle of a previous delayed allocation in
xfs_bmap_add_extent_delay_real, set br_startblock of the new delay
extent to the right to nullstartblock instead of 0 before inserting
the extent into the ifork (xfs_iext_insert), rather than setting
br_startblock afterward.

Adding the extent into the ifork with br_startblock=0 can lead to
the extent being copied into the btree by xfs_bmap_extent_to_btree
if we happen to convert from extents format to btree format before
updating br_startblock with the correct value.  The unexpected
addition of this delay extent to the btree can cause subsequent
XFS_WANT_CORRUPTED_GOTO filesystem shutdown in several
xfs_bmap_add_extent_delay_real cases where we are converting a delay
extent to real and unexpectedly find an extent already inserted.
For example:

911         case BMAP_LEFT_FILLING:
912                 /*
913                  * Filling in the first part of a previous delayed allocation.
914                  * The left neighbor is not contiguous.
915                  */
916                 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
917                 xfs_bmbt_set_startoff(ep, new_endoff);
918                 temp = PREV.br_blockcount - new->br_blockcount;
919                 xfs_bmbt_set_blockcount(ep, temp);
920                 xfs_iext_insert(ip, idx, 1, new, state);
921                 ip->i_df.if_lastex = idx;
922                 ip->i_d.di_nextents++;
923                 if (cur == NULL)
924                         rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
925                 else {
926                         rval = XFS_ILOG_CORE;
927                         if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
928                                         new->br_startblock, new->br_blockcount,
929                                         &i)))
930                                 goto done;
931                         XFS_WANT_CORRUPTED_GOTO(i == 0, done);

With the bogus extent in the btree we shut down the filesystem at
line 931.  The conversion from extents to btree format happens when the
number of extents in the inode increases above ip->i_df.if_ext_max.
xfs_bmap_extent_to_btree copies extents from the ifork into the
btree, ignoring all delalloc extents which are denoted by
br_startblock having some value of nullstartblock.

SGI-PV: 1013221

Signed-off-by: Ben Myers <bpm@sgi.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_bmap.c | 33 +++++++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 3e9c278a8f7..dc3afd7739f 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -1038,17 +1038,34 @@ xfs_bmap_add_extent_delay_real(
 		 * Filling in the middle part of a previous delayed allocation.
 		 * Contiguity is impossible here.
 		 * This case is avoided almost all the time.
+		 *
+		 * We start with a delayed allocation:
+		 *
+		 * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+
+		 *  PREV @ idx
+		 *
+	         * and we are allocating:
+		 *                     +rrrrrrrrrrrrrrrrr+
+		 *			      new
+		 *
+		 * and we set it up for insertion as:
+		 * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+
+		 *                            new
+		 *  PREV @ idx          LEFT              RIGHT
+		 *                      inserted at idx + 1
 		 */
 		temp = new->br_startoff - PREV.br_startoff;
-		trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
-		xfs_bmbt_set_blockcount(ep, temp);
-		r[0] = *new;
-		r[1].br_state = PREV.br_state;
-		r[1].br_startblock = 0;
-		r[1].br_startoff = new_endoff;
 		temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
-		r[1].br_blockcount = temp2;
-		xfs_iext_insert(ip, idx + 1, 2, &r[0], state);
+		trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
+		xfs_bmbt_set_blockcount(ep, temp);	/* truncate PREV */
+		LEFT = *new;
+		RIGHT.br_state = PREV.br_state;
+		RIGHT.br_startblock = nullstartblock(
+				(int)xfs_bmap_worst_indlen(ip, temp2));
+		RIGHT.br_startoff = new_endoff;
+		RIGHT.br_blockcount = temp2;
+		/* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
+		xfs_iext_insert(ip, idx + 1, 2, &LEFT, state);
 		ip->i_df.if_lastex = idx + 1;
 		ip->i_d.di_nextents++;
 		if (cur == NULL)
-- 
cgit v1.2.3


From e00b8a24041f37e56b4b8415ce4eba1cbc238065 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 27 Jan 2011 14:55:39 -0500
Subject: NFS: Fix an NFS client lockdep issue

There is no reason to be freeing the delegation cred in the rcu callback,
and doing so is resulting in a lockdep complaint that rpc_credcache_lock
is being called from both softirq and non-softirq contexts.

Reported-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@kernel.org
---
 fs/nfs/delegation.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 364e4328f39..bbbc6bf5cb2 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -23,8 +23,6 @@
 
 static void nfs_do_free_delegation(struct nfs_delegation *delegation)
 {
-	if (delegation->cred)
-		put_rpccred(delegation->cred);
 	kfree(delegation);
 }
 
@@ -37,6 +35,10 @@ static void nfs_free_delegation_callback(struct rcu_head *head)
 
 static void nfs_free_delegation(struct nfs_delegation *delegation)
 {
+	if (delegation->cred) {
+		put_rpccred(delegation->cred);
+		delegation->cred = NULL;
+	}
 	call_rcu(&delegation->rcu, nfs_free_delegation_callback);
 }
 
-- 
cgit v1.2.3


From c08e76d0cd4beb759a73c1835d98f5fccc126ed1 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 28 Jan 2011 12:40:55 -0500
Subject: NFS: Micro-optimize nfs4_decode_dirent()

Make the decoding of NFSv4 directory entries slightly more efficient
by:

  1.  Avoiding unnecessary byte swapping when checking XDR booleans,
      and

  2.  Not bumping "p" when its value will be immediately replaced by
      xdr_inline_decode()

This commit makes nfs4_decode_dirent() consistent with similar logic
in the other two decode_dirent() functions.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 2ab8e5cb8f5..009aef9e12b 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -6086,11 +6086,11 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
 	__be32 *p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	if (!ntohl(*p++)) {
+	if (*p == xdr_zero) {
 		p = xdr_inline_decode(xdr, 4);
 		if (unlikely(!p))
 			goto out_overflow;
-		if (!ntohl(*p++))
+		if (*p == xdr_zero)
 			return -EAGAIN;
 		entry->eof = 1;
 		return -EBADCOOKIE;
@@ -6101,7 +6101,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
 		goto out_overflow;
 	entry->prev_cookie = entry->cookie;
 	p = xdr_decode_hyper(p, &entry->cookie);
-	entry->len = ntohl(*p++);
+	entry->len = be32_to_cpup(p);
 
 	p = xdr_inline_decode(xdr, entry->len);
 	if (unlikely(!p))
-- 
cgit v1.2.3


From d1205f87bbb8040c1408bbd9e0a720310b2b0b9b Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 28 Jan 2011 12:41:05 -0500
Subject: NFS: NFSv4 readdir loses entries

On recent 2.6.38-rc kernels, connectathon basic test 6 fails on
NFSv4 mounts of OpenSolaris with something like:

> ./test6: readdir
> 	./test6: (/mnt/klimt/matisse.test) didn't read expected 'file.12' dir entry, pass 0
> 	./test6: (/mnt/klimt/matisse.test) didn't read expected 'file.82' dir entry, pass 0
> 	./test6: (/mnt/klimt/matisse.test) didn't read expected 'file.164' dir entry, pass 0
> 	./test6: (/mnt/klimt/matisse.test) Test failed with 3 errors
> basic tests failed
> Tests failed, leaving /mnt/klimt mounted
> [cel@matisse cthon04]$

I narrowed the problem down to nfs4_decode_dirent() reporting that the
decode buffer had overflowed while decoding the entries for those
missing files.

verify_attr_len() assumes both of its pointer arguments reside on the
same page.  When these arguments point to locations on two different
pages, verify_attr_len() can report false errors.  This can happen now
that a large NFSv4 readdir result can span pages.

We have reasonably good checking in nfs4_decode_dirent() anyway, so
it should be safe to simply remove the extra checking.

At a guess, this was introduced by commit 6650239a, "NFS: Don't use
vm_map_ram() in readdir".

Cc: stable@kernel.org [2.6.37]
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 009aef9e12b..4e2c168b6ee 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -6132,9 +6132,6 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
 	if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE)
 		entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
 
-	if (verify_attr_len(xdr, p, len) < 0)
-		goto out_overflow;
-
 	return 0;
 
 out_overflow:
-- 
cgit v1.2.3


From 6b82ce8d824bd46053e46a895876cde39d9026e4 Mon Sep 17 00:00:00 2001
From: liubo <liubo2009@cn.fujitsu.com>
Date: Wed, 26 Jan 2011 06:21:39 +0000
Subject: btrfs: fix uncheck memory allocation in btrfs_submit_compressed_read

btrfs_submit_compressed_read() lacks memory allocation checks and the
corresponding error path.

After this fix, when a memory allocation fails, the errno is propagated
back up the call chain to userland, telling users that the operation
cannot proceed.

Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/compression.c | 25 +++++++++++++++++++++++--
 fs/btrfs/extent_io.c   |  4 ++--
 2 files changed, 25 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index f745287fbf2..3a932f183da 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -562,7 +562,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	u64 em_len;
 	u64 em_start;
 	struct extent_map *em;
-	int ret;
+	int ret = -ENOMEM;
 	u32 *sums;
 
 	tree = &BTRFS_I(inode)->io_tree;
@@ -577,6 +577,9 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 	compressed_len = em->block_len;
 	cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
+	if (!cb)
+		goto out;
+
 	atomic_set(&cb->pending_bios, 0);
 	cb->errors = 0;
 	cb->inode = inode;
@@ -597,13 +600,18 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 	nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
 				 PAGE_CACHE_SIZE;
-	cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages,
+	cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages,
 				       GFP_NOFS);
+	if (!cb->compressed_pages)
+		goto fail1;
+
 	bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 
 	for (page_index = 0; page_index < nr_pages; page_index++) {
 		cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
 							      __GFP_HIGHMEM);
+		if (!cb->compressed_pages[page_index])
+			goto fail2;
 	}
 	cb->nr_pages = nr_pages;
 
@@ -614,6 +622,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	cb->len = uncompressed_len;
 
 	comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
+	if (!comp_bio)
+		goto fail2;
 	comp_bio->bi_private = cb;
 	comp_bio->bi_end_io = end_compressed_bio_read;
 	atomic_inc(&cb->pending_bios);
@@ -681,6 +691,17 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 	bio_put(comp_bio);
 	return 0;
+
+fail2:
+	for (page_index = 0; page_index < nr_pages; page_index++)
+		free_page((unsigned long)cb->compressed_pages[page_index]);
+
+	kfree(cb->compressed_pages);
+fail1:
+	kfree(cb);
+out:
+	free_extent_map(em);
+	return ret;
 }
 
 static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES];
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 8b8d3d99ae6..6411ed6ca44 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1865,7 +1865,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
 	bio_get(bio);
 
 	if (tree->ops && tree->ops->submit_bio_hook)
-		tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
+		ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
 					   mirror_num, bio_flags, start);
 	else
 		submit_bio(rw, bio);
@@ -2126,7 +2126,7 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
 	ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
 				      &bio_flags);
 	if (bio)
-		submit_one_bio(READ, bio, 0, bio_flags);
+		ret = submit_one_bio(READ, bio, 0, bio_flags);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 2a29edc6b60a5248ccab588e7ba7dad38cef0235 Mon Sep 17 00:00:00 2001
From: liubo <liubo2009@cn.fujitsu.com>
Date: Wed, 26 Jan 2011 06:22:08 +0000
Subject: btrfs: fix several uncheck memory allocations

To make btrfs more stable, add several missing necessary memory allocation
checks, and when no memory, return proper errno.

We've checked that some of those -ENOMEM errors will be returned to
userspace, and some will be caught by BUG_ON() in the upper callers,
and none will be ignored silently.

Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/export.c    |  2 ++
 fs/btrfs/file-item.c |  2 ++
 fs/btrfs/file.c      |  4 ++++
 fs/btrfs/tree-log.c  | 25 +++++++++++++++++++++++++
 4 files changed, 33 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 6f044447359..3220ad1aafc 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -176,6 +176,8 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
 	int ret;
 
 	path = btrfs_alloc_path();
+	if (!path)
+		return ERR_PTR(-ENOMEM);
 
 	if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
 		key.objectid = root->root_key.objectid;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a562a250ae7..d0bc72657cd 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -536,6 +536,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
 	root = root->fs_info->csum_root;
 
 	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
 
 	while (1) {
 		key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f903433f5bd..65b2424a411 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -945,6 +945,10 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 		     PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
 		     (sizeof(struct page *)));
 	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
+	if (!pages) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	/* generic_write_checks can change our pos */
 	start_pos = pos;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 054744ac571..c25a41d8611 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -338,6 +338,12 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
 		}
 		dst_copy = kmalloc(item_size, GFP_NOFS);
 		src_copy = kmalloc(item_size, GFP_NOFS);
+		if (!dst_copy || !src_copy) {
+			btrfs_release_path(root, path);
+			kfree(dst_copy);
+			kfree(src_copy);
+			return -ENOMEM;
+		}
 
 		read_extent_buffer(eb, src_copy, src_ptr, item_size);
 
@@ -665,6 +671,9 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
 	btrfs_dir_item_key_to_cpu(leaf, di, &location);
 	name_len = btrfs_dir_name_len(leaf, di);
 	name = kmalloc(name_len, GFP_NOFS);
+	if (!name)
+		return -ENOMEM;
+
 	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
 	btrfs_release_path(root, path);
 
@@ -744,6 +753,9 @@ static noinline int backref_in_log(struct btrfs_root *log,
 	int match = 0;
 
 	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
 	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
 	if (ret != 0)
 		goto out;
@@ -967,6 +979,8 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
 	key.offset = (u64)-1;
 
 	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
 
 	while (1) {
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -1178,6 +1192,9 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
 
 	name_len = btrfs_dir_name_len(eb, di);
 	name = kmalloc(name_len, GFP_NOFS);
+	if (!name)
+		return -ENOMEM;
+
 	log_type = btrfs_dir_type(eb, di);
 	read_extent_buffer(eb, name, (unsigned long)(di + 1),
 		   name_len);
@@ -1692,6 +1709,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 		root_owner = btrfs_header_owner(parent);
 
 		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
+		if (!next)
+			return -ENOMEM;
 
 		if (*level == 1) {
 			wc->process_func(root, next, wc, ptr_gen);
@@ -2194,6 +2213,9 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
 
 	log = root->log_root;
 	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
 	di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
 				   name, name_len, -1);
 	if (IS_ERR(di)) {
@@ -2594,6 +2616,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 
 	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
 			   nr * sizeof(u32), GFP_NOFS);
+	if (!ins_data)
+		return -ENOMEM;
+
 	ins_sizes = (u32 *)ins_data;
 	ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
 
-- 
cgit v1.2.3


From 333e8105445d4f51101fc3d23199a919d66730b3 Mon Sep 17 00:00:00 2001
From: liubo <liubo2009@cn.fujitsu.com>
Date: Wed, 26 Jan 2011 06:22:33 +0000
Subject: btrfs: fix missing break in switch phrase

There is a missing break in switch, fix it.

Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/print-tree.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 0d126be22b6..fb2605d998e 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -260,6 +260,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 #else
 			BUG();
 #endif
+			break;
 		case BTRFS_BLOCK_GROUP_ITEM_KEY:
 			bi = btrfs_item_ptr(l, i,
 					    struct btrfs_block_group_item);
-- 
cgit v1.2.3


From 34d19bada00f4825588b338a8ee193820f9ceeb0 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Mon, 24 Jan 2011 19:55:19 +0000
Subject: fs/btrfs/inode.c: Add missing IS_ERR test

After the conditional that precedes the following code, inode may be an
ERR_PTR value.  This can eg result from a memory allocation failure via the
call to btrfs_iget, and thus does not imply that root is different than
sub_root.  Thus, an IS_ERR check is added to ensure that there is no
dereference of inode in this case.

The semantic match that finds this problem is as follows:
(http://coccinelle.lip6.fr/)

// <smpl>
@r@
identifier f;
@@
f(...) { ... return ERR_PTR(...); }

@@
identifier r.f, fld;
expression x;
statement S1,S2;
@@
 x = f(...)
 ... when != IS_ERR(x)
(
 if (IS_ERR(x) ||...) S1 else S2
|
*x->fld
)
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2c9a2f7d563..2b7d251d6ad 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4137,7 +4137,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 	}
 	srcu_read_unlock(&root->fs_info->subvol_srcu, index);
 
-	if (root != sub_root) {
+	if (!IS_ERR(inode) && root != sub_root) {
 		down_read(&root->fs_info->cleanup_work_sem);
 		if (!(inode->i_sb->s_flags & MS_RDONLY))
 			btrfs_orphan_cleanup(sub_root);
-- 
cgit v1.2.3


From 3612b49598c303cfb22a4b609427f829828e2427 Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Tue, 25 Jan 2011 02:51:38 +0000
Subject: btrfs: fix return value check of btrfs_join_transaction()

Add error checks for the return value of
btrfs_join_transaction()/btrfs_join_transaction_nolock(), and correct the
existing checks in several places that tested for NULL instead of IS_ERR().

To make Btrfs more stable, I think that we should reduce the use of
BUG_ON().  However, that will take a long time, so I propose this patch
as a short-term solution.

With this patch:
 - The places that still need to be corrected to make Btrfs more stable
   are made explicit.
 - The kernel no longer panics because of a NULL pointer dereference etc.
   (even though the number of BUG_ON() calls increases temporarily).
 - An error code is returned in the places where it is easy to return one.

As a long-term plan:
 - BUG_ON() is reduced by using the forced-readonly framework, etc.

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c     |  5 +++++
 fs/btrfs/extent-tree.c |  2 +-
 fs/btrfs/inode.c       | 24 ++++++++++++++++--------
 fs/btrfs/ioctl.c       |  2 +-
 fs/btrfs/relocation.c  | 26 +++++++++++++++++++++++---
 fs/btrfs/transaction.c |  5 +++++
 6 files changed, 51 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2887b8be6fd..b36eeef1919 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1550,6 +1550,7 @@ static int transaction_kthread(void *arg)
 		spin_unlock(&root->fs_info->new_trans_lock);
 
 		trans = btrfs_join_transaction(root, 1);
+		BUG_ON(IS_ERR(trans));
 		if (transid == trans->transid) {
 			ret = btrfs_commit_transaction(trans, root);
 			BUG_ON(ret);
@@ -2464,10 +2465,14 @@ int btrfs_commit_super(struct btrfs_root *root)
 	up_write(&root->fs_info->cleanup_work_sem);
 
 	trans = btrfs_join_transaction(root, 1);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 	ret = btrfs_commit_transaction(trans, root);
 	BUG_ON(ret);
 	/* run commit again to drop the original snapshot */
 	trans = btrfs_join_transaction(root, 1);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 	btrfs_commit_transaction(trans, root);
 	ret = btrfs_write_and_wait_transaction(NULL, root);
 	BUG_ON(ret);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index bcf303204f7..98ee139885c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7478,7 +7478,7 @@ int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
 		BUG_ON(reloc_root->commit_root != NULL);
 		while (1) {
 			trans = btrfs_join_transaction(root, 1);
-			BUG_ON(!trans);
+			BUG_ON(IS_ERR(trans));
 
 			mutex_lock(&root->fs_info->drop_mutex);
 			ret = btrfs_drop_snapshot(trans, reloc_root);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2b7d251d6ad..40fee137dd1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -416,7 +416,7 @@ again:
 	}
 	if (start == 0) {
 		trans = btrfs_join_transaction(root, 1);
-		BUG_ON(!trans);
+		BUG_ON(IS_ERR(trans));
 		btrfs_set_trans_block_group(trans, inode);
 		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
@@ -612,6 +612,7 @@ retry:
 			    GFP_NOFS);
 
 		trans = btrfs_join_transaction(root, 1);
+		BUG_ON(IS_ERR(trans));
 		ret = btrfs_reserve_extent(trans, root,
 					   async_extent->compressed_size,
 					   async_extent->compressed_size,
@@ -771,7 +772,7 @@ static noinline int cow_file_range(struct inode *inode,
 
 	BUG_ON(root == root->fs_info->tree_root);
 	trans = btrfs_join_transaction(root, 1);
-	BUG_ON(!trans);
+	BUG_ON(IS_ERR(trans));
 	btrfs_set_trans_block_group(trans, inode);
 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
@@ -1049,7 +1050,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
 	} else {
 		trans = btrfs_join_transaction(root, 1);
 	}
-	BUG_ON(!trans);
+	BUG_ON(IS_ERR(trans));
 
 	cow_start = (u64)-1;
 	cur_offset = start;
@@ -1704,7 +1705,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 				trans = btrfs_join_transaction_nolock(root, 1);
 			else
 				trans = btrfs_join_transaction(root, 1);
-			BUG_ON(!trans);
+			BUG_ON(IS_ERR(trans));
 			btrfs_set_trans_block_group(trans, inode);
 			trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 			ret = btrfs_update_inode(trans, root, inode);
@@ -1721,6 +1722,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 		trans = btrfs_join_transaction_nolock(root, 1);
 	else
 		trans = btrfs_join_transaction(root, 1);
+	BUG_ON(IS_ERR(trans));
 	btrfs_set_trans_block_group(trans, inode);
 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
@@ -2382,6 +2384,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 
 	if (root->orphan_block_rsv || root->orphan_item_inserted) {
 		trans = btrfs_join_transaction(root, 1);
+		BUG_ON(IS_ERR(trans));
 		btrfs_end_transaction(trans, root);
 	}
 
@@ -4350,6 +4353,8 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 			trans = btrfs_join_transaction_nolock(root, 1);
 		else
 			trans = btrfs_join_transaction(root, 1);
+		if (IS_ERR(trans))
+			return PTR_ERR(trans);
 		btrfs_set_trans_block_group(trans, inode);
 		if (nolock)
 			ret = btrfs_end_transaction_nolock(trans, root);
@@ -4375,6 +4380,7 @@ void btrfs_dirty_inode(struct inode *inode)
 		return;
 
 	trans = btrfs_join_transaction(root, 1);
+	BUG_ON(IS_ERR(trans));
 	btrfs_set_trans_block_group(trans, inode);
 
 	ret = btrfs_update_inode(trans, root, inode);
@@ -5179,6 +5185,8 @@ again:
 				em = NULL;
 				btrfs_release_path(root, path);
 				trans = btrfs_join_transaction(root, 1);
+				if (IS_ERR(trans))
+					return ERR_CAST(trans);
 				goto again;
 			}
 			map = kmap(page);
@@ -5283,8 +5291,8 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
 	btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
 
 	trans = btrfs_join_transaction(root, 0);
-	if (!trans)
-		return ERR_PTR(-ENOMEM);
+	if (IS_ERR(trans))
+		return ERR_CAST(trans);
 
 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
@@ -5508,7 +5516,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 		 * while we look for nocow cross refs
 		 */
 		trans = btrfs_join_transaction(root, 0);
-		if (!trans)
+		if (IS_ERR(trans))
 			goto must_cow;
 
 		if (can_nocow_odirect(trans, inode, start, len) == 1) {
@@ -5643,7 +5651,7 @@ again:
 	BUG_ON(!ordered);
 
 	trans = btrfs_join_transaction(root, 1);
-	if (!trans) {
+	if (IS_ERR(trans)) {
 		err = -ENOMEM;
 		goto out;
 	}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index edd82becbb9..04b4fb9144a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -203,7 +203,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 
 
 	trans = btrfs_join_transaction(root, 1);
-	BUG_ON(!trans);
+	BUG_ON(IS_ERR(trans));
 
 	ret = btrfs_update_inode(trans, root, inode);
 	BUG_ON(ret);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 045c9c2b2d7..ea996543024 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2147,6 +2147,12 @@ again:
 	}
 
 	trans = btrfs_join_transaction(rc->extent_root, 1);
+	if (IS_ERR(trans)) {
+		if (!err)
+			btrfs_block_rsv_release(rc->extent_root,
+						rc->block_rsv, num_bytes);
+		return PTR_ERR(trans);
+	}
 
 	if (!err) {
 		if (num_bytes != rc->merging_rsv_size) {
@@ -3222,6 +3228,7 @@ truncate:
 	trans = btrfs_join_transaction(root, 0);
 	if (IS_ERR(trans)) {
 		btrfs_free_path(path);
+		ret = PTR_ERR(trans);
 		goto out;
 	}
 
@@ -3628,6 +3635,7 @@ int prepare_to_relocate(struct reloc_control *rc)
 	set_reloc_control(rc);
 
 	trans = btrfs_join_transaction(rc->extent_root, 1);
+	BUG_ON(IS_ERR(trans));
 	btrfs_commit_transaction(trans, rc->extent_root);
 	return 0;
 }
@@ -3804,7 +3812,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 
 	/* get rid of pinned extents */
 	trans = btrfs_join_transaction(rc->extent_root, 1);
-	btrfs_commit_transaction(trans, rc->extent_root);
+	if (IS_ERR(trans))
+		err = PTR_ERR(trans);
+	else
+		btrfs_commit_transaction(trans, rc->extent_root);
 out_free:
 	btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
 	btrfs_free_path(path);
@@ -4125,6 +4136,11 @@ int btrfs_recover_relocation(struct btrfs_root *root)
 	set_reloc_control(rc);
 
 	trans = btrfs_join_transaction(rc->extent_root, 1);
+	if (IS_ERR(trans)) {
+		unset_reloc_control(rc);
+		err = PTR_ERR(trans);
+		goto out_free;
+	}
 
 	rc->merge_reloc_tree = 1;
 
@@ -4154,9 +4170,13 @@ int btrfs_recover_relocation(struct btrfs_root *root)
 	unset_reloc_control(rc);
 
 	trans = btrfs_join_transaction(rc->extent_root, 1);
-	btrfs_commit_transaction(trans, rc->extent_root);
-out:
+	if (IS_ERR(trans))
+		err = PTR_ERR(trans);
+	else
+		btrfs_commit_transaction(trans, rc->extent_root);
+out_free:
 	kfree(rc);
+out:
 	while (!list_empty(&reloc_roots)) {
 		reloc_root = list_entry(reloc_roots.next,
 					struct btrfs_root, root_list);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index bae5c7b8bbe..3d73c8d93bb 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1161,6 +1161,11 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
 	INIT_DELAYED_WORK(&ac->work, do_async_commit);
 	ac->root = root;
 	ac->newtrans = btrfs_join_transaction(root, 0);
+	if (IS_ERR(ac->newtrans)) {
+		int err = PTR_ERR(ac->newtrans);
+		kfree(ac);
+		return err;
+	}
 
 	/* take transaction reference */
 	mutex_lock(&root->fs_info->trans_mutex);
-- 
cgit v1.2.3


From abd30bb0af9d4671506502278e8631bed9e3c35c Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Mon, 24 Jan 2011 00:57:10 +0000
Subject: btrfs: check return value of btrfs_start_ioctl_transaction() properly

btrfs_start_ioctl_transaction() returns ERR_PTR(), not NULL.
So, it is necessary to use IS_ERR() to check the return value.

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 04b4fb9144a..12dabe28cf5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2085,7 +2085,7 @@ static long btrfs_ioctl_trans_start(struct file *file)
 
 	ret = -ENOMEM;
 	trans = btrfs_start_ioctl_transaction(root, 0);
-	if (!trans)
+	if (IS_ERR(trans))
 		goto out_drop;
 
 	file->private_data = trans;
-- 
cgit v1.2.3


From dedefd7215d3ec451291ca393e5c8e4c1882c8c6 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 24 Jan 2011 21:43:18 +0000
Subject: Btrfs: fix check_path_shared so it returns the right value

When running xfstests 224 I kept getting ENOSPC when trying to remove the files,
and this is because we were returning ret from check_path_shared while it was
uninitialized, which isn't right.  Fix this to return 0 properly, and now
xfstests 224 doesn't freak out when it tries to clean itself up.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 40fee137dd1..5621818921f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2718,9 +2718,10 @@ static int check_path_shared(struct btrfs_root *root,
 	struct extent_buffer *eb;
 	int level;
 	u64 refs = 1;
-	int uninitialized_var(ret);
 
 	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
+		int ret;
+
 		if (!path->nodes[level])
 			break;
 		eb = path->nodes[level];
@@ -2731,7 +2732,7 @@ static int check_path_shared(struct btrfs_root *root,
 		if (refs > 1)
 			return 1;
 	}
-	return ret; /* XXX callers? */
+	return 0;
 }
 
 /*
-- 
cgit v1.2.3


From e9e22899de661af94cb9995885fd04e4c738838b Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 24 Jan 2011 21:43:19 +0000
Subject: Btrfs: do not release more reserved bytes to the global_block_rsv
 than we need

When we do btrfs_block_rsv_release, if global_block_rsv is not full we will
release all the extra bytes to global_block_rsv, even if it's only a little
short of the amount of space that we need to reserve.  This causes us to starve
ourselves of reservable space during the transaction which will force us to
shrink delalloc bytes and commit the transaction more often than we should.  So
instead just add the amount of bytes we need to add to the global reserve so
reserved == size, and then add the rest back into the space_info for general
use.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 98ee139885c..7af618dcf2c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3589,8 +3589,20 @@ void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
 
 	if (num_bytes > 0) {
 		if (dest) {
-			block_rsv_add_bytes(dest, num_bytes, 0);
-		} else {
+			spin_lock(&dest->lock);
+			if (!dest->full) {
+				u64 bytes_to_add;
+
+				bytes_to_add = dest->size - dest->reserved;
+				bytes_to_add = min(num_bytes, bytes_to_add);
+				dest->reserved += bytes_to_add;
+				if (dest->reserved >= dest->size)
+					dest->full = 1;
+				num_bytes -= bytes_to_add;
+			}
+			spin_unlock(&dest->lock);
+		}
+		if (num_bytes) {
 			spin_lock(&space_info->lock);
 			space_info->bytes_reserved -= num_bytes;
 			spin_unlock(&space_info->lock);
-- 
cgit v1.2.3


From 68a82277b8619e6d0f2738b1d9b160b627e81e92 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 24 Jan 2011 21:43:20 +0000
Subject: Btrfs: use the global block reserve if we cannot reserve space

We call use_block_rsv right before we make an allocation in order to make sure
we have enough space.  Now normally people have called btrfs_start_transaction()
with the appropriate amount of space that we need, so we just use some of that
pre-reserved space and move along happily.  The problem is where people use
btrfs_join_transaction(), which doesn't actually reserve any space.  So we try
and reserve space here, but we cannot flush delalloc, so this forces us to
return -ENOSPC when in reality we have plenty of space.  The most common symptom
is seeing a bunch of "couldn't dirty inode" messages in syslog.  With
xfstests 224 we end up falling back to start_transaction and then doing all the
flush delalloc stuff which causes it to hang for a very long time.

So instead steal from the global reserve, which is what this is meant for
anyway.  With this patch and the other 2 I have sent xfstests 224 now passes
successfully.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7af618dcf2c..ff6bbfd75cf 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5646,6 +5646,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 	      struct btrfs_root *root, u32 blocksize)
 {
 	struct btrfs_block_rsv *block_rsv;
+	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
 	int ret;
 
 	block_rsv = get_block_rsv(trans, root);
@@ -5653,14 +5654,39 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 	if (block_rsv->size == 0) {
 		ret = reserve_metadata_bytes(trans, root, block_rsv,
 					     blocksize, 0);
-		if (ret)
+		/*
+		 * If we couldn't reserve metadata bytes try and use some from
+		 * the global reserve.
+		 */
+		if (ret && block_rsv != global_rsv) {
+			ret = block_rsv_use_bytes(global_rsv, blocksize);
+			if (!ret)
+				return global_rsv;
+			return ERR_PTR(ret);
+		} else if (ret) {
 			return ERR_PTR(ret);
+		}
 		return block_rsv;
 	}
 
 	ret = block_rsv_use_bytes(block_rsv, blocksize);
 	if (!ret)
 		return block_rsv;
+	if (ret) {
+		WARN_ON(1);
+		ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize,
+					     0);
+		if (!ret) {
+			spin_lock(&block_rsv->lock);
+			block_rsv->size += blocksize;
+			spin_unlock(&block_rsv->lock);
+			return block_rsv;
+		} else if (ret && block_rsv != global_rsv) {
+			ret = block_rsv_use_bytes(global_rsv, blocksize);
+			if (!ret)
+				return global_rsv;
+		}
+	}
 
 	return ERR_PTR(-ENOSPC);
 }
-- 
cgit v1.2.3


From ad0397a7a97f55fd7f70998ec208c5d8b90310ff Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 28 Jan 2011 18:44:44 +0000
Subject: Btrfs: do error checking in btrfs_del_csums

Got a report of a box panicking because we got a NULL eb in read_extent_buffer.
His fs was borked and btrfs_search_path returned EIO, but we don't check for
errors so the box panicked.  Yes I know this will just make something higher up
the stack panic, but that's a problem for future Josef.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file-item.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index d0bc72657cd..4f19a3e1bf3 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -550,7 +550,10 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
 			if (path->slots[0] == 0)
 				goto out;
 			path->slots[0]--;
+		} else if (ret < 0) {
+			goto out;
 		}
+
 		leaf = path->nodes[0];
 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 
-- 
cgit v1.2.3


From 7adf5dfbb3af65a00e20b3ead224c3a1b40e4ec4 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Tue, 25 Jan 2011 22:11:54 +0000
Subject: Btrfs: handle no memory properly in prepare_pages

Instead of doing a BUG_ON(1) in prepare_pages if grab_cache_page() fails, just
loop through the pages we've already grabbed and unlock and release them, then
return -ENOMEM like we should.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 65b2424a411..9e097fbfc78 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -792,8 +792,12 @@ again:
 	for (i = 0; i < num_pages; i++) {
 		pages[i] = grab_cache_page(inode->i_mapping, index + i);
 		if (!pages[i]) {
-			err = -ENOMEM;
-			BUG_ON(1);
+			int c;
+			for (c = i - 1; c >= 0; c--) {
+				unlock_page(pages[c]);
+				page_cache_release(pages[c]);
+			}
+			return -ENOMEM;
 		}
 		wait_on_page_writeback(pages[i]);
 	}
-- 
cgit v1.2.3


From af5eb745efe97d91d2cbe793029838b3311c15da Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <anton@tuxera.com>
Date: Fri, 28 Jan 2011 20:45:28 +0000
Subject: NTFS: Fix invalid pointer dereference in ntfs_mft_record_alloc().

In ntfs_mft_record_alloc() when mapping the new extent mft record with
map_extent_mft_record() we overwrite @m with the return value and on
error, we then try to use the old @m but that is no longer there as @m
now contains an error code instead so we crash when dereferencing the
error code as if it were a pointer.

The simple fix is to use a temporary variable to store the return value
thus preserving the original @m for later use.  This is a backport from
the commercial Tuxera-NTFS driver and is well tested...

Thanks go to Julia Lawall for pointing this out (whilst I had fixed it
in the commercial driver I had failed to fix it in the Linux kernel).

Signed-off-by: Anton Altaparmakov <anton@tuxera.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ntfs/mft.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index b572b672718..326e7475a22 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -1,7 +1,7 @@
 /**
  * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2006 Anton Altaparmakov
+ * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
  * Copyright (c) 2002 Richard Russon
  *
  * This program/include file is free software; you can redistribute it and/or
@@ -2576,6 +2576,8 @@ mft_rec_already_initialized:
 	flush_dcache_page(page);
 	SetPageUptodate(page);
 	if (base_ni) {
+		MFT_RECORD *m_tmp;
+
 		/*
 		 * Setup the base mft record in the extent mft record.  This
 		 * completes initialization of the allocated extent mft record
@@ -2588,11 +2590,11 @@ mft_rec_already_initialized:
 		 * attach it to the base inode @base_ni and map, pin, and lock
 		 * its, i.e. the allocated, mft record.
 		 */
-		m = map_extent_mft_record(base_ni, bit, &ni);
-		if (IS_ERR(m)) {
+		m_tmp = map_extent_mft_record(base_ni, bit, &ni);
+		if (IS_ERR(m_tmp)) {
 			ntfs_error(vol->sb, "Failed to map allocated extent "
 					"mft record 0x%llx.", (long long)bit);
-			err = PTR_ERR(m);
+			err = PTR_ERR(m_tmp);
 			/* Set the mft record itself not in use. */
 			m->flags &= cpu_to_le16(
 					~le16_to_cpu(MFT_RECORD_IN_USE));
@@ -2603,6 +2605,7 @@ mft_rec_already_initialized:
 			ntfs_unmap_page(page);
 			goto undo_mftbmp_alloc;
 		}
+		BUG_ON(m != m_tmp);
 		/*
 		 * Make sure the allocated mft record is written out to disk.
 		 * No need to set the inode dirty because the caller is going
-- 
cgit v1.2.3


From ffeb414a59291d5891f09727beb793c109f19f08 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Sat, 29 Jan 2011 07:03:02 -0500
Subject: cifs: fix two compiler warnings about uninitialized vars
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fs/cifs/link.c: In function ‘symlink_hash’:
fs/cifs/link.c:58:3: warning: ‘rc’ may be used uninitialized in this
function [-Wuninitialized]

fs/cifs/smbencrypt.c: In function ‘mdfour’:
fs/cifs/smbencrypt.c:61:3: warning: ‘rc’ may be used uninitialized in this
function [-Wuninitialized]

Reviewed-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/link.c       | 3 ++-
 fs/cifs/smbencrypt.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 02cd60aefbf..e8804d37340 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -55,8 +55,9 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
 
 	md5 = crypto_alloc_shash("md5", 0, 0);
 	if (IS_ERR(md5)) {
+		rc = PTR_ERR(md5);
 		cERROR(1, "%s: Crypto md5 allocation error %d\n", __func__, rc);
-		return PTR_ERR(md5);
+		return rc;
 	}
 	size = sizeof(struct shash_desc) + crypto_shash_descsize(md5);
 	sdescmd5 = kmalloc(size, GFP_KERNEL);
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index b5450e9f40c..b5041c84998 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -58,8 +58,9 @@ mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len)
 
 	md4 = crypto_alloc_shash("md4", 0, 0);
 	if (IS_ERR(md4)) {
+		rc = PTR_ERR(md4);
 		cERROR(1, "%s: Crypto md4 allocation error %d\n", __func__, rc);
-		return PTR_ERR(md4);
+		return rc;
 	}
 	size = sizeof(struct shash_desc) + crypto_shash_descsize(md4);
 	sdescmd4 = kmalloc(size, GFP_KERNEL);
-- 
cgit v1.2.3


From 1be912dde772b77aaaa21770eeabb0a7a5e297a6 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 28 Jan 2011 07:08:28 -0500
Subject: cifs: handle cancelled requests better

Currently, when a request is cancelled via signal, we delete the mid
immediately. If the request was already transmitted however, the client
is still likely to receive a response. When it does, it won't recognize
it however and will pop a printk.

It's also a little dangerous to just delete the mid entry like this. We
may end up reusing that mid. If we do then we could potentially get the
response from the first request confused with the later one.

Prevent the reuse of mids by marking them as cancelled and keeping them
on the pending_mid_q list. If the reply comes in, we'll delete it from
the list then. If it never comes, then we'll delete it at reconnect
or when cifsd comes down.

Reviewed-by: Pavel Shilovsky <piastryyy@gmail.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/transport.c | 43 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 36 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index c1ccca1a933..9b2d0373a8a 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -579,8 +579,17 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 		goto out;
 
 	rc = wait_for_response(ses->server, midQ);
-	if (rc != 0)
-		goto out;
+	if (rc != 0) {
+		spin_lock(&GlobalMid_Lock);
+		if (midQ->midState == MID_REQUEST_SUBMITTED) {
+			midQ->callback = DeleteMidQEntry;
+			spin_unlock(&GlobalMid_Lock);
+			atomic_dec(&ses->server->inFlight);
+			wake_up(&ses->server->request_q);
+			return rc;
+		}
+		spin_unlock(&GlobalMid_Lock);
+	}
 
 	rc = sync_mid_result(midQ, ses->server);
 	if (rc != 0) {
@@ -724,8 +733,18 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 		goto out;
 
 	rc = wait_for_response(ses->server, midQ);
-	if (rc != 0)
-		goto out;
+	if (rc != 0) {
+		spin_lock(&GlobalMid_Lock);
+		if (midQ->midState == MID_REQUEST_SUBMITTED) {
+			/* no longer considered to be "in-flight" */
+			midQ->callback = DeleteMidQEntry;
+			spin_unlock(&GlobalMid_Lock);
+			atomic_dec(&ses->server->inFlight);
+			wake_up(&ses->server->request_q);
+			return rc;
+		}
+		spin_unlock(&GlobalMid_Lock);
+	}
 
 	rc = sync_mid_result(midQ, ses->server);
 	if (rc != 0) {
@@ -922,10 +941,20 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 			}
 		}
 
-		if (wait_for_response(ses->server, midQ) == 0) {
-			/* We got the response - restart system call. */
-			rstart = 1;
+		rc = wait_for_response(ses->server, midQ);
+		if (rc) {
+			spin_lock(&GlobalMid_Lock);
+			if (midQ->midState == MID_REQUEST_SUBMITTED) {
+				/* no longer considered to be "in-flight" */
+				midQ->callback = DeleteMidQEntry;
+				spin_unlock(&GlobalMid_Lock);
+				return rc;
+			}
+			spin_unlock(&GlobalMid_Lock);
 		}
+
+		/* We got the response - restart system call. */
+		rstart = 1;
 	}
 
 	rc = sync_mid_result(midQ, ses->server);
-- 
cgit v1.2.3


From 2db7c5815555d8daabf7d4ab1253ce690852c140 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 28 Jan 2011 07:08:28 -0500
Subject: cifs: send an NT_CANCEL request when a process is signalled

Use the new send_nt_cancel function to send an NT_CANCEL when the
process is delivered a fatal signal. This is a "best effort" enterprise
however, so don't bother to check the return code. There's nothing we
can reasonably do if it fails anyway.

Reviewed-by: Pavel Shilovsky <piastryyy@gmail.com>
Reviewed-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/transport.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 9b2d0373a8a..bdaa4aa58b0 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -570,20 +570,25 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 #endif
 
 	mutex_unlock(&ses->server->srv_mutex);
-	cifs_small_buf_release(in_buf);
 
-	if (rc < 0)
+	if (rc < 0) {
+		cifs_small_buf_release(in_buf);
 		goto out;
+	}
 
-	if (long_op == CIFS_ASYNC_OP)
+	if (long_op == CIFS_ASYNC_OP) {
+		cifs_small_buf_release(in_buf);
 		goto out;
+	}
 
 	rc = wait_for_response(ses->server, midQ);
 	if (rc != 0) {
+		send_nt_cancel(ses->server, in_buf, midQ);
 		spin_lock(&GlobalMid_Lock);
 		if (midQ->midState == MID_REQUEST_SUBMITTED) {
 			midQ->callback = DeleteMidQEntry;
 			spin_unlock(&GlobalMid_Lock);
+			cifs_small_buf_release(in_buf);
 			atomic_dec(&ses->server->inFlight);
 			wake_up(&ses->server->request_q);
 			return rc;
@@ -591,6 +596,8 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 		spin_unlock(&GlobalMid_Lock);
 	}
 
+	cifs_small_buf_release(in_buf);
+
 	rc = sync_mid_result(midQ, ses->server);
 	if (rc != 0) {
 		atomic_dec(&ses->server->inFlight);
@@ -734,6 +741,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 
 	rc = wait_for_response(ses->server, midQ);
 	if (rc != 0) {
+		send_nt_cancel(ses->server, in_buf, midQ);
 		spin_lock(&GlobalMid_Lock);
 		if (midQ->midState == MID_REQUEST_SUBMITTED) {
 			/* no longer considered to be "in-flight" */
@@ -943,6 +951,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 
 		rc = wait_for_response(ses->server, midQ);
 		if (rc) {
+			send_nt_cancel(ses->server, in_buf, midQ);
 			spin_lock(&GlobalMid_Lock);
 			if (midQ->midState == MID_REQUEST_SUBMITTED) {
 				/* no longer considered to be "in-flight" */
-- 
cgit v1.2.3


From 68abaffa6bbd3cadfaa4b7216d10bcd32406090b Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 28 Jan 2011 15:05:42 -0500
Subject: cifs: simplify SMB header check routine

...just cleanup. There should be no behavior change.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Pavel Shilovsky <piastryyy@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/misc.c | 46 ++++++++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index a09e077ba92..72e99ece78c 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -381,29 +381,31 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
 }
 
 static int
-checkSMBhdr(struct smb_hdr *smb, __u16 mid)
+check_smb_hdr(struct smb_hdr *smb, __u16 mid)
 {
-	/* Make sure that this really is an SMB, that it is a response,
-	   and that the message ids match */
-	if ((*(__le32 *) smb->Protocol == cpu_to_le32(0x424d53ff)) &&
-		(mid == smb->Mid)) {
-		if (smb->Flags & SMBFLG_RESPONSE)
-			return 0;
-		else {
-		/* only one valid case where server sends us request */
-			if (smb->Command == SMB_COM_LOCKING_ANDX)
-				return 0;
-			else
-				cERROR(1, "Received Request not response");
-		}
-	} else { /* bad signature or mid */
-		if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff))
-			cERROR(1, "Bad protocol string signature header %x",
-				*(unsigned int *) smb->Protocol);
-		if (mid != smb->Mid)
-			cERROR(1, "Mids do not match");
+	/* does it have the right SMB "signature" ? */
+	if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) {
+		cERROR(1, "Bad protocol string signature header 0x%x",
+			*(unsigned int *)smb->Protocol);
+		return 1;
 	}
-	cERROR(1, "bad smb detected. The Mid=%d", smb->Mid);
+
+	/* Make sure that message ids match */
+	if (mid != smb->Mid) {
+		cERROR(1, "Mids do not match. received=%u expected=%u",
+			smb->Mid, mid);
+		return 1;
+	}
+
+	/* if it's a response then accept */
+	if (smb->Flags & SMBFLG_RESPONSE)
+		return 0;
+
+	/* only one valid case where server sends us request */
+	if (smb->Command == SMB_COM_LOCKING_ANDX)
+		return 0;
+
+	cERROR(1, "Server sent request, not response. mid=%u", smb->Mid);
 	return 1;
 }
 
@@ -448,7 +450,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
 		return 1;
 	}
 
-	if (checkSMBhdr(smb, mid))
+	if (check_smb_hdr(smb, mid))
 		return 1;
 	clc_len = smbCalcSize_LE(smb);
 
-- 
cgit v1.2.3


From d804d41d163c0975d2890c82d7135ada7a2f23a4 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 28 Jan 2011 15:05:43 -0500
Subject: cifs: don't pop a printk when sending on a socket is interrupted

If we kill the process while it's sending on a socket then the
kernel_sendmsg will return -EINTR. This is normal. No need to spam the
ring buffer with this info.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/transport.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index bdaa4aa58b0..b8c5e2eb43d 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -236,9 +236,9 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 		server->tcpStatus = CifsNeedReconnect;
 	}
 
-	if (rc < 0) {
+	if (rc < 0 && rc != -EINTR)
 		cERROR(1, "Error %d sending data on socket to server", rc);
-	} else
+	else
 		rc = 0;
 
 	/* Don't want to modify the buffer as a
-- 
cgit v1.2.3


From 92a4e0f0169498867ecb19c2244510dd4beba149 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Sat, 29 Jan 2011 07:02:28 -0500
Subject: cifs: force a reconnect if there are too many MIDs in flight

Currently, we allow the pending_mid_q to grow without bound with
SIGKILL'ed processes. This could eventually be a DoS'able problem. An
unprivileged user could start a process that does a long-running call and then
SIGKILL it.

If he can also intercept the NT_CANCEL calls or the replies from the
server, then the pending_mid_q could grow very large, possibly even to
2^16 entries which might leave GetNextMid in an infinite loop. Fix this
by imposing a hard limit of 32k calls per server. If we cross that
limit, set the tcpStatus to CifsNeedReconnect to force cifsd to
eventually reconnect the socket and clean out the pending_mid_q.

While we're at it, clean up the function a bit and eliminate an
unnecessary NULL pointer check.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/misc.c | 37 ++++++++++++++++++++++++-------------
 1 file changed, 24 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 72e99ece78c..24f0a9d97ad 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -236,10 +236,7 @@ __u16 GetNextMid(struct TCP_Server_Info *server)
 {
 	__u16 mid = 0;
 	__u16 last_mid;
-	int   collision;
-
-	if (server == NULL)
-		return mid;
+	bool collision;
 
 	spin_lock(&GlobalMid_Lock);
 	last_mid = server->CurrentMid; /* we do not want to loop forever */
@@ -252,24 +249,38 @@ __u16 GetNextMid(struct TCP_Server_Info *server)
 	(and it would also have to have been a request that
 	 did not time out) */
 	while (server->CurrentMid != last_mid) {
-		struct list_head *tmp;
 		struct mid_q_entry *mid_entry;
+		unsigned int num_mids;
 
-		collision = 0;
+		collision = false;
 		if (server->CurrentMid == 0)
 			server->CurrentMid++;
 
-		list_for_each(tmp, &server->pending_mid_q) {
-			mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
-
-			if ((mid_entry->mid == server->CurrentMid) &&
-			    (mid_entry->midState == MID_REQUEST_SUBMITTED)) {
+		num_mids = 0;
+		list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) {
+			++num_mids;
+			if (mid_entry->mid == server->CurrentMid &&
+			    mid_entry->midState == MID_REQUEST_SUBMITTED) {
 				/* This mid is in use, try a different one */
-				collision = 1;
+				collision = true;
 				break;
 			}
 		}
-		if (collision == 0) {
+
+		/*
+		 * if we have more than 32k mids in the list, then something
+		 * is very wrong. Possibly a local user is trying to DoS the
+		 * box by issuing long-running calls and SIGKILL'ing them. If
+		 * we get to 2^16 mids then we're in big trouble as this
+		 * function could loop forever.
+		 *
+		 * Go ahead and assign out the mid in this situation, but force
+		 * an eventual reconnect to clean out the pending_mid_q.
+		 */
+		if (num_mids > 32768)
+			server->tcpStatus = CifsNeedReconnect;
+
+		if (!collision) {
 			mid = server->CurrentMid;
 			break;
 		}
-- 
cgit v1.2.3


From edae38a6431276c50d4b51543c36de258722358e Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 31 Jan 2011 09:38:12 +0000
Subject: GFS2: Fix glock queue trace point

Somehow this tracepoint landed up in the wrong place. This moves it
to where it should be.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index c75d4998519..ddc3e1e3faa 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -972,13 +972,13 @@ fail:
 			insert_pt = &gh2->gh_list;
 	}
 	set_bit(GLF_QUEUED, &gl->gl_flags);
+	trace_gfs2_glock_queue(gh, 1);
 	if (likely(insert_pt == NULL)) {
 		list_add_tail(&gh->gh_list, &gl->gl_holders);
 		if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
 			goto do_cancel;
 		return;
 	}
-	trace_gfs2_glock_queue(gh, 1);
 	list_add_tail(&gh->gh_list, insert_pt);
 do_cancel:
 	gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
-- 
cgit v1.2.3


From f855f6cbeb4f94cd4e4a225c2246ee8012c384a2 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 31 Jan 2011 08:41:36 -0500
Subject: cifs: make CIFS depend on CRYPTO_MD4

Recently CIFS was changed to use the kernel crypto API for MD4 hashes,
but the Kconfig dependencies were not changed to reflect this.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reported-and-Tested-by: Suresh Jayaraman <sjayaraman@suse.de>
Reviewed-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index ee45648b0d1..7cb0f7f847e 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -3,6 +3,7 @@ config CIFS
 	depends on INET
 	select NLS
 	select CRYPTO
+	select CRYPTO_MD4
 	select CRYPTO_MD5
 	select CRYPTO_HMAC
 	select CRYPTO_ARC4
-- 
cgit v1.2.3


From 31c2659d78c8be970833bc1e633593d291553ed3 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 31 Jan 2011 07:24:46 -0500
Subject: cifs: clean up some compiler warnings

New compiler warnings that I noticed when building a patchset based
on recent Fedora kernel:

fs/cifs/cifssmb.c: In function 'CIFSSMBSetFileSize':
fs/cifs/cifssmb.c:4813:8: warning: variable 'data_offset' set but not used
[-Wunused-but-set-variable]

fs/cifs/file.c: In function 'cifs_open':
fs/cifs/file.c:349:24: warning: variable 'pCifsInode' set but not used
[-Wunused-but-set-variable]
fs/cifs/file.c: In function 'cifs_partialpagewrite':
fs/cifs/file.c:1149:23: warning: variable 'cifs_sb' set but not used
[-Wunused-but-set-variable]
fs/cifs/file.c: In function 'cifs_iovec_write':
fs/cifs/file.c:1740:9: warning: passing argument 6 of 'CIFSSMBWrite2' from
incompatible pointer type [enabled by default]
fs/cifs/cifsproto.h:337:12: note: expected 'unsigned int *' but argument is
of type 'size_t *'

fs/cifs/readdir.c: In function 'cifs_readdir':
fs/cifs/readdir.c:767:23: warning: variable 'cifs_sb' set but not used
[-Wunused-but-set-variable]

fs/cifs/cifs_dfs_ref.c: In function 'cifs_dfs_d_automount':
fs/cifs/cifs_dfs_ref.c:342:2: warning: 'rc' may be used uninitialized in
this function [-Wuninitialized]
fs/cifs/cifs_dfs_ref.c:278:6: note: 'rc' was declared here

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Pavel Shilovsky <piastry@etersoft.ru>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_dfs_ref.c | 9 ++++-----
 fs/cifs/cifssmb.c      | 3 ---
 fs/cifs/file.c         | 8 ++------
 fs/cifs/readdir.c      | 3 ---
 4 files changed, 6 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index f1c68629f27..0a265ad9e42 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -282,8 +282,6 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
 	cFYI(1, "in %s", __func__);
 	BUG_ON(IS_ROOT(mntpt));
 
-	xid = GetXid();
-
 	/*
 	 * The MSDFS spec states that paths in DFS referral requests and
 	 * responses must be prefixed by a single '\' character instead of
@@ -293,7 +291,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
 	mnt = ERR_PTR(-ENOMEM);
 	full_path = build_path_from_dentry(mntpt);
 	if (full_path == NULL)
-		goto free_xid;
+		goto cdda_exit;
 
 	cifs_sb = CIFS_SB(mntpt->d_inode->i_sb);
 	tlink = cifs_sb_tlink(cifs_sb);
@@ -303,9 +301,11 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
 	}
 	ses = tlink_tcon(tlink)->ses;
 
+	xid = GetXid();
 	rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls,
 		&num_referrals, &referrals,
 		cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	FreeXid(xid);
 
 	cifs_put_tlink(tlink);
 
@@ -338,8 +338,7 @@ success:
 	free_dfs_info_array(referrals, num_referrals);
 free_full_path:
 	kfree(full_path);
-free_xid:
-	FreeXid(xid);
+cdda_exit:
 	cFYI(1, "leaving %s" , __func__);
 	return mnt;
 }
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 3106f5e5c63..46c66ed01af 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -4914,7 +4914,6 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
 		   __u16 fid, __u32 pid_of_opener, bool SetAllocation)
 {
 	struct smb_com_transaction2_sfi_req *pSMB  = NULL;
-	char *data_offset;
 	struct file_end_of_file_info *parm_data;
 	int rc = 0;
 	__u16 params, param_offset, offset, byte_count, count;
@@ -4938,8 +4937,6 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
 	param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
 	offset = param_offset + params;
 
-	data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
-
 	count = sizeof(struct file_end_of_file_info);
 	pSMB->MaxParameterCount = cpu_to_le16(2);
 	/* BB find exact max SMB PDU from sess structure BB */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 0de17c1db60..74c0a282d01 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -346,7 +346,6 @@ int cifs_open(struct inode *inode, struct file *file)
 	struct cifsTconInfo *tcon;
 	struct tcon_link *tlink;
 	struct cifsFileInfo *pCifsFile = NULL;
-	struct cifsInodeInfo *pCifsInode;
 	char *full_path = NULL;
 	bool posix_open_ok = false;
 	__u16 netfid;
@@ -361,8 +360,6 @@ int cifs_open(struct inode *inode, struct file *file)
 	}
 	tcon = tlink_tcon(tlink);
 
-	pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
-
 	full_path = build_path_from_dentry(file->f_path.dentry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
@@ -1146,7 +1143,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
 	char *write_data;
 	int rc = -EFAULT;
 	int bytes_written = 0;
-	struct cifs_sb_info *cifs_sb;
 	struct inode *inode;
 	struct cifsFileInfo *open_file;
 
@@ -1154,7 +1150,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
 		return -EFAULT;
 
 	inode = page->mapping->host;
-	cifs_sb = CIFS_SB(inode->i_sb);
 
 	offset += (loff_t)from;
 	write_data = kmap(page);
@@ -1667,7 +1662,8 @@ static ssize_t
 cifs_iovec_write(struct file *file, const struct iovec *iov,
 		 unsigned long nr_segs, loff_t *poffset)
 {
-	size_t total_written = 0, written = 0;
+	size_t total_written = 0;
+	unsigned int written = 0;
 	unsigned long num_pages, npages;
 	size_t copied, len, cur_len, i;
 	struct kvec *to_send;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 7f25cc3d225..f8e4cd2a791 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -764,7 +764,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 {
 	int rc = 0;
 	int xid, i;
-	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *pTcon;
 	struct cifsFileInfo *cifsFile = NULL;
 	char *current_entry;
@@ -775,8 +774,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 
 	xid = GetXid();
 
-	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-
 	/*
 	 * Ensure FindFirst doesn't fail before doing filldir() for '.' and
 	 * '..'. Otherwise we won't be able to notify VFS in case of failure.
-- 
cgit v1.2.3


From 7a8587e7c8e4e32ba778bfbbb822a0a7e8d5f3e3 Mon Sep 17 00:00:00 2001
From: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Date: Sat, 29 Jan 2011 13:54:58 -0600
Subject: cifs: No need to check crypto blockcipher allocation

Missed one change as per earlier suggestion.

Signed-off-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsencrypt.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 0db5f1de022..a51585f9852 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -657,9 +657,10 @@ calc_seckey(struct cifsSesInfo *ses)
 	get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE);
 
 	tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
-	if (!tfm_arc4 || IS_ERR(tfm_arc4)) {
+	if (IS_ERR(tfm_arc4)) {
+		rc = PTR_ERR(tfm_arc4);
 		cERROR(1, "could not allocate crypto API arc4\n");
-		return PTR_ERR(tfm_arc4);
+		return rc;
 	}
 
 	desc.tfm = tfm_arc4;
-- 
cgit v1.2.3


From b1953bcec95c189b1eea690a08e89646d7750bda Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 21 Jan 2011 21:10:01 +0000
Subject: Btrfs: make shrink_delalloc a little friendlier

Xfstests 224 will just sit there and spin for ever until eventually we give up
flushing delalloc and exit.  On my box this took several hours.  I could not
interrupt this process either, even though we use INTERRUPTIBLE.  So do 2 things

1) Keep us from looping over and over again without reclaiming anything
2) If we get interrupted exit the loop

I tested this and the test now exits in a reasonable amount of time, and can be
interrupted with ctrl+c.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ff6bbfd75cf..f96641a93fc 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3345,8 +3345,10 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 	u64 reserved;
 	u64 max_reclaim;
 	u64 reclaimed = 0;
+	long time_left;
 	int pause = 1;
 	int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
+	int loops = 0;
 
 	block_rsv = &root->fs_info->delalloc_block_rsv;
 	space_info = block_rsv->space_info;
@@ -3359,7 +3361,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 
 	max_reclaim = min(reserved, to_reclaim);
 
-	while (1) {
+	while (loops < 1024) {
 		/* have the flusher threads jump in and do some IO */
 		smp_mb();
 		nr_pages = min_t(unsigned long, nr_pages,
@@ -3367,8 +3369,12 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 		writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
 
 		spin_lock(&space_info->lock);
-		if (reserved > space_info->bytes_reserved)
+		if (reserved > space_info->bytes_reserved) {
+			loops = 0;
 			reclaimed += reserved - space_info->bytes_reserved;
+		} else {
+			loops++;
+		}
 		reserved = space_info->bytes_reserved;
 		spin_unlock(&space_info->lock);
 
@@ -3379,7 +3385,12 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 			return -EAGAIN;
 
 		__set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout(pause);
+		time_left = schedule_timeout(pause);
+
+		/* We were interrupted, exit */
+		if (time_left)
+			break;
+
 		pause <<= 1;
 		if (pause > HZ / 10)
 			pause = HZ / 10;
-- 
cgit v1.2.3


From b31eabd86eb68d3c217e6821078249bc045e698a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 31 Jan 2011 16:48:24 -0500
Subject: Btrfs: catch errors from btrfs_sync_log

btrfs_sync_log returns -EAGAIN when we need full transaction commits
instead of small log commits, but sometimes we were dropping the return
value.

In practice, we check for this a few different ways, but this is still a
bug that can leave off full log commits when we really need them.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/tree-log.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c25a41d8611..42dfc307704 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2051,6 +2051,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		wait_log_commit(trans, log_root_tree,
 				log_root_tree->log_transid);
 		mutex_unlock(&log_root_tree->log_mutex);
+		ret = 0;
 		goto out;
 	}
 	atomic_set(&log_root_tree->log_commit[index2], 1);
@@ -2115,7 +2116,7 @@ out:
 	smp_mb();
 	if (waitqueue_active(&root->log_commit_wait[index1]))
 		wake_up(&root->log_commit_wait[index1]);
-	return 0;
+	return ret;
 }
 
 static void free_log_tree(struct btrfs_trans_handle *trans,
-- 
cgit v1.2.3


From cab6958da0094e36a098751f844409fc9ee26251 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 31 Jan 2011 21:56:35 +0000
Subject: [CIFS] Update cifs minor version

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 14789a97304..4a3330235d5 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -127,5 +127,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
 
-#define CIFS_VERSION   "1.69"
+#define CIFS_VERSION   "1.70"
 #endif				/* _CIFSFS_H */
-- 
cgit v1.2.3


From 6284644e8de1f4005166c918c3d2aa4c510ab9f6 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 31 Jan 2011 09:14:17 -0500
Subject: cifs: fix length checks in checkSMB

The cERROR message in checkSMB when the calculated length doesn't match
the RFC1001 length is incorrect in many cases. It always says that the
RFC1001 length is bigger than the SMB, even when it's actually the
reverse.

Fix the error message to say the reverse of what it does now when the
SMB length goes beyond the end of the received data. Also, clarify the
error message when the RFC length is too big. Finally, clarify the
comments to show that the 512 byte limit on extra data at the end of
the packet is arbitrary.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/misc.c | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 24f0a9d97ad..2a930a752a7 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -478,25 +478,26 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
 			if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF))
 				return 0; /* bcc wrapped */
 		}
-		cFYI(1, "Calculated size %d vs length %d mismatch for mid %d",
+		cFYI(1, "Calculated size %u vs length %u mismatch for mid=%u",
 				clc_len, 4 + len, smb->Mid);
-		/* Windows XP can return a few bytes too much, presumably
-		an illegal pad, at the end of byte range lock responses
-		so we allow for that three byte pad, as long as actual
-		received length is as long or longer than calculated length */
-		/* We have now had to extend this more, since there is a
-		case in which it needs to be bigger still to handle a
-		malformed response to transact2 findfirst from WinXP when
-		access denied is returned and thus bcc and wct are zero
-		but server says length is 0x21 bytes too long as if the server
-		forget to reset the smb rfc1001 length when it reset the
-		wct and bcc to minimum size and drop the t2 parms and data */
-		if ((4+len > clc_len) && (len <= clc_len + 512))
-			return 0;
-		else {
-			cERROR(1, "RFC1001 size %d bigger than SMB for Mid=%d",
+
+		if (4 + len < clc_len) {
+			cERROR(1, "RFC1001 size %u smaller than SMB for mid=%u",
 					len, smb->Mid);
 			return 1;
+		} else if (len > clc_len + 512) {
+			/*
+			 * Some servers (Windows XP in particular) send more
+			 * data than the lengths in the SMB packet would
+			 * indicate on certain calls (byte range locks and
+			 * trans2 find first calls in particular). While the
+			 * client can handle such a frame by ignoring the
+			 * trailing data, we choose limit the amount of extra
+			 * data to 512 bytes.
+			 */
+			cERROR(1, "RFC1001 size %u more than 512 bytes larger "
+				  "than SMB for mid=%u", len, smb->Mid);
+			return 1;
 		}
 	}
 	return 0;
-- 
cgit v1.2.3


From c87fb6fdcaf7560940b31a0c78c3e6370e3433cf Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 31 Jan 2011 19:54:59 -0500
Subject: Btrfs: avoid uninit variable warnings in ordered-data.c

This one isn't really an uninit variable, but for pretty
obscure reasons.  Let's make it clearly correct.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ordered-data.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 2b61e1ddcd9..083a5547737 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -141,7 +141,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
 					  u64 file_offset)
 {
 	struct rb_root *root = &tree->tree;
-	struct rb_node *prev;
+	struct rb_node *prev = NULL;
 	struct rb_node *ret;
 	struct btrfs_ordered_extent *entry;
 
-- 
cgit v1.2.3


From fd89d5f2030ac83324330bfd0bc73abf1beadaa6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 1 Feb 2011 11:42:42 +0100
Subject: ext4: convert to alloc_workqueue()

Convert create_workqueue() to alloc_workqueue().  This is an identity
conversion.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: linux-ext4@vger.kernel.org
---
 fs/ext4/super.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 48ce561fafa..0fcf6720af0 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3507,7 +3507,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	percpu_counter_set(&sbi->s_dirtyblocks_counter, 0);
 
 no_journal:
-	EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
+	/*
+	 * The maximum number of concurrent works can be high and
+	 * concurrency isn't really necessary.  Limit it to 1.
+	 */
+	EXT4_SB(sb)->dio_unwritten_wq =
+		alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM, 1);
 	if (!EXT4_SB(sb)->dio_unwritten_wq) {
 		printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
 		goto failed_mount_wq;
-- 
cgit v1.2.3


From 316873c958eee302952edcadb8dc72d6d3d19d3c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 1 Feb 2011 11:42:42 +0100
Subject: ocfs2: use system_wq instead of ocfs2_quota_wq

ocfs2_quota_wq is not depended upon during memory reclaim and, with
cmwq, there's no reason to use a dedicated workqueue.  Drop
ocfs2_quota_wq and use system_wq instead.  dqi_sync_work is already
sync canceled on quota disable and no further synchronization is
necessary.

This change makes ocfs2_quota_setup/shutdown() noops, so both functions
are removed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/quota.h        |  3 ---
 fs/ocfs2/quota_global.c | 27 ++++-----------------------
 fs/ocfs2/super.c        |  7 -------
 3 files changed, 4 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 196fcb52d95..d5ab56cbe5c 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -114,7 +114,4 @@ int ocfs2_local_write_dquot(struct dquot *dquot);
 extern const struct dquot_operations ocfs2_quota_operations;
 extern struct quota_format_type ocfs2_quota_format;
 
-int ocfs2_quota_setup(void);
-void ocfs2_quota_shutdown(void);
-
 #endif /* _OCFS2_QUOTA_H */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 4607923eb24..a73f6416648 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -63,8 +63,6 @@
  *        write to gf
  */
 
-static struct workqueue_struct *ocfs2_quota_wq = NULL;
-
 static void qsync_work_fn(struct work_struct *work);
 
 static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp)
@@ -400,8 +398,8 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
 						OCFS2_QBLK_RESERVED_SPACE;
 	oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi);
 	INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn);
-	queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
-			   msecs_to_jiffies(oinfo->dqi_syncms));
+	schedule_delayed_work(&oinfo->dqi_sync_work,
+			      msecs_to_jiffies(oinfo->dqi_syncms));
 
 out_err:
 	mlog_exit(status);
@@ -635,8 +633,8 @@ static void qsync_work_fn(struct work_struct *work)
 	struct super_block *sb = oinfo->dqi_gqinode->i_sb;
 
 	dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
-	queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
-			   msecs_to_jiffies(oinfo->dqi_syncms));
+	schedule_delayed_work(&oinfo->dqi_sync_work,
+			      msecs_to_jiffies(oinfo->dqi_syncms));
 }
 
 /*
@@ -923,20 +921,3 @@ const struct dquot_operations ocfs2_quota_operations = {
 	.alloc_dquot	= ocfs2_alloc_dquot,
 	.destroy_dquot	= ocfs2_destroy_dquot,
 };
-
-int ocfs2_quota_setup(void)
-{
-	ocfs2_quota_wq = create_workqueue("o2quot");
-	if (!ocfs2_quota_wq)
-		return -ENOMEM;
-	return 0;
-}
-
-void ocfs2_quota_shutdown(void)
-{
-	if (ocfs2_quota_wq) {
-		flush_workqueue(ocfs2_quota_wq);
-		destroy_workqueue(ocfs2_quota_wq);
-		ocfs2_quota_wq = NULL;
-	}
-}
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 38f986d2447..84a70113b43 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1645,16 +1645,11 @@ static int __init ocfs2_init(void)
 		mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
 	}
 
-	status = ocfs2_quota_setup();
-	if (status)
-		goto leave;
-
 	ocfs2_set_locking_protocol();
 
 	status = register_quota_format(&ocfs2_quota_format);
 leave:
 	if (status < 0) {
-		ocfs2_quota_shutdown();
 		ocfs2_free_mem_caches();
 		exit_ocfs2_uptodate_cache();
 	}
@@ -1671,8 +1666,6 @@ static void __exit ocfs2_exit(void)
 {
 	mlog_entry_void();
 
-	ocfs2_quota_shutdown();
-
 	if (ocfs2_wq) {
 		flush_workqueue(ocfs2_wq);
 		destroy_workqueue(ocfs2_wq);
-- 
cgit v1.2.3


From 28aadf51693f56c41326ebbc795318a49011b12d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 1 Feb 2011 11:42:42 +0100
Subject: reiserfs: make commit_wq use the default concurrency level

The maximum number of concurrent work items queued on commit_wq is
bound by the number of active journals.  Convert to alloc_workqueue()
and use the default concurrency level so that they can be processed in
parallel.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: reiserfs-devel@vger.kernel.org
---
 fs/reiserfs/journal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 3eea859e699..c77514bd577 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2876,7 +2876,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
 	reiserfs_mounted_fs_count++;
 	if (reiserfs_mounted_fs_count <= 1) {
 		reiserfs_write_unlock(sb);
-		commit_wq = create_workqueue("reiserfs");
+		commit_wq = alloc_workqueue("reiserfs", WQ_MEM_RECLAIM, 0);
 		reiserfs_write_lock(sb);
 	}
 
-- 
cgit v1.2.3


From 83e759043abe9d0291f58f2427ba12bbb0a6e4f1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 1 Feb 2011 11:42:43 +0100
Subject: xfs: convert to alloc_workqueue()

Convert from create[_singlethread]_workqueue() to alloc_workqueue().

* xfsdatad_workqueue and xfsconvertd_workqueue are identity converted.
  Using higher concurrency limit might be useful but given the
  complexity of workqueue usage in xfs, proceeding cautiously seems
  better.

* xfs_mru_reap_wq is converted to non-ordered workqueue with max
  concurrency of 1 as the work items don't require any specific
  ordering and already have proper synchronization.  It seems it was
  singlethreaded to save worker threads, which is no longer a concern.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Alex Elder <aelder@sgi.com>
Cc: xfs-masters@oss.sgi.com
Cc: Christoph Hellwig <hch@infradead.org>
---
 fs/xfs/linux-2.6/xfs_buf.c | 5 +++--
 fs/xfs/xfs_mru_cache.c     | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index ac1c7e8378d..f83a4c830a6 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -2022,11 +2022,12 @@ xfs_buf_init(void)
 	if (!xfslogd_workqueue)
 		goto out_free_buf_zone;
 
-	xfsdatad_workqueue = create_workqueue("xfsdatad");
+	xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1);
 	if (!xfsdatad_workqueue)
 		goto out_destroy_xfslogd_workqueue;
 
-	xfsconvertd_workqueue = create_workqueue("xfsconvertd");
+	xfsconvertd_workqueue = alloc_workqueue("xfsconvertd",
+						WQ_MEM_RECLAIM, 1);
 	if (!xfsconvertd_workqueue)
 		goto out_destroy_xfsdatad_workqueue;
 
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index edfa178bafb..4aff5639573 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -309,7 +309,7 @@ xfs_mru_cache_init(void)
 	if (!xfs_mru_elem_zone)
 		goto out;
 
-	xfs_mru_reap_wq = create_singlethread_workqueue("xfs_mru_cache");
+	xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", WQ_MEM_RECLAIM, 1);
 	if (!xfs_mru_reap_wq)
 		goto out_destroy_mru_elem_zone;
 
-- 
cgit v1.2.3


From 5df67083488ccbad925f583b698ab38f8629a016 Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Tue, 1 Feb 2011 09:17:35 +0000
Subject: btrfs: checking NULL or not in some functions

These functions did not check for the NULL that is returned when a
memory allocation fails.  Add the missing NULL checks.

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 2 ++
 fs/btrfs/extent_io.c   | 2 ++
 fs/btrfs/tree-log.c    | 6 ++++++
 3 files changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f96641a93fc..9de4ff03882 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6496,6 +6496,8 @@ static noinline int relocate_inode_pages(struct inode *inode, u64 start,
 	int ret = 0;
 
 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
+	if (!ra)
+		return -ENOMEM;
 
 	mutex_lock(&inode->i_mutex);
 	first_index = start >> PAGE_CACHE_SHIFT;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 6411ed6ca44..8862dda46ff 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1920,6 +1920,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 		nr = bio_get_nr_vecs(bdev);
 
 	bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
+	if (!bio)
+		return -ENOMEM;
 
 	bio_add_page(bio, page, page_size, offset);
 	bio->bi_end_io = end_io_func;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 42dfc307704..6d66e5caff9 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2751,7 +2751,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	log = root->log_root;
 
 	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
 	dst_path = btrfs_alloc_path();
+	if (!dst_path) {
+		btrfs_free_path(path);
+		return -ENOMEM;
+	}
 
 	min_key.objectid = inode->i_ino;
 	min_key.type = BTRFS_INODE_ITEM_KEY;
-- 
cgit v1.2.3


From 98d5dc13e7e74b77ca3b4c3cbded9f48d2dbbbb7 Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Thu, 20 Jan 2011 06:19:37 +0000
Subject: btrfs: fix return value check of btrfs_start_transaction()

Add the missing error checks on the return value of
btrfs_start_transaction(), and correct the existing checks in several
places that tested for NULL instead of using IS_ERR().

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c |  7 +++++--
 fs/btrfs/inode.c       |  1 +
 fs/btrfs/ioctl.c       | 10 ++++++++--
 fs/btrfs/relocation.c  |  3 +++
 fs/btrfs/super.c       |  2 ++
 fs/btrfs/tree-log.c    |  1 +
 fs/btrfs/volumes.c     | 19 +++++++++++++++++--
 7 files changed, 37 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9de4ff03882..f07ba21cbf0 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6271,6 +6271,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
 	BUG_ON(!wc);
 
 	trans = btrfs_start_transaction(tree_root, 0);
+	BUG_ON(IS_ERR(trans));
+
 	if (block_rsv)
 		trans->block_rsv = block_rsv;
 
@@ -6368,6 +6370,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
 
 			btrfs_end_transaction_throttle(trans, tree_root);
 			trans = btrfs_start_transaction(tree_root, 0);
+			BUG_ON(IS_ERR(trans));
 			if (block_rsv)
 				trans->block_rsv = block_rsv;
 		}
@@ -7587,7 +7590,7 @@ int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
 
 	if (found) {
 		trans = btrfs_start_transaction(root, 1);
-		BUG_ON(!trans);
+		BUG_ON(IS_ERR(trans));
 		ret = btrfs_commit_transaction(trans, root);
 		BUG_ON(ret);
 	}
@@ -7831,7 +7834,7 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root,
 
 
 	trans = btrfs_start_transaction(extent_root, 1);
-	BUG_ON(!trans);
+	BUG_ON(IS_ERR(trans));
 
 	if (extent_key->objectid == 0) {
 		ret = del_extent_zero(trans, extent_root, path, extent_key);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5621818921f..36bc3f49ebf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2357,6 +2357,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 		 */
 		if (is_bad_inode(inode)) {
 			trans = btrfs_start_transaction(root, 0);
+			BUG_ON(IS_ERR(trans));
 			btrfs_orphan_del(trans, inode);
 			btrfs_end_transaction(trans, root);
 			iput(inode);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 12dabe28cf5..02d224e8c83 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -907,6 +907,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
 
 	if (new_size > old_size) {
 		trans = btrfs_start_transaction(root, 0);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			goto out_unlock;
+		}
 		ret = btrfs_grow_device(trans, device, new_size);
 		btrfs_commit_transaction(trans, root);
 	} else {
@@ -2141,9 +2145,9 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
 	path->leave_spinning = 1;
 
 	trans = btrfs_start_transaction(root, 1);
-	if (!trans) {
+	if (IS_ERR(trans)) {
 		btrfs_free_path(path);
-		return -ENOMEM;
+		return PTR_ERR(trans);
 	}
 
 	dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
@@ -2337,6 +2341,8 @@ static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp
 	u64 transid;
 
 	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 	transid = trans->transid;
 	btrfs_commit_transaction_async(trans, root, 0);
 
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index ea996543024..1f5556acb53 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2028,6 +2028,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 
 	while (1) {
 		trans = btrfs_start_transaction(root, 0);
+		BUG_ON(IS_ERR(trans));
 		trans->block_rsv = rc->block_rsv;
 
 		ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
@@ -3665,6 +3666,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 
 	while (1) {
 		trans = btrfs_start_transaction(rc->extent_root, 0);
+		BUG_ON(IS_ERR(trans));
 
 		if (update_backref_cache(trans, &rc->backref_cache)) {
 			btrfs_end_transaction(trans, rc->extent_root);
@@ -4033,6 +4035,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
 	int ret;
 
 	trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
+	BUG_ON(IS_ERR(trans));
 
 	memset(&root->root_item.drop_progress, 0,
 		sizeof(root->root_item.drop_progress));
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f4e45fdded3..0209b5fc772 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -623,6 +623,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 	btrfs_wait_ordered_extents(root, 0, 0);
 
 	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 	ret = btrfs_commit_transaction(trans, root);
 	return ret;
 }
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 6d66e5caff9..a4bbb854dfd 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3112,6 +3112,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
 	BUG_ON(!path);
 
 	trans = btrfs_start_transaction(fs_info->tree_root, 0);
+	BUG_ON(IS_ERR(trans));
 
 	wc.trans = trans;
 	wc.pin = 1;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f2d2f4ccc73..7cad59353b0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1212,6 +1212,10 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
 		return -ENOMEM;
 
 	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		btrfs_free_path(path);
+		return PTR_ERR(trans);
+	}
 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
 	key.type = BTRFS_DEV_ITEM_KEY;
 	key.offset = device->devid;
@@ -1604,6 +1608,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	}
 
 	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		kfree(device);
+		ret = PTR_ERR(trans);
+		goto error;
+	}
+
 	lock_chunks(root);
 
 	device->barriers = 1;
@@ -1872,7 +1882,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
 		return ret;
 
 	trans = btrfs_start_transaction(root, 0);
-	BUG_ON(!trans);
+	BUG_ON(IS_ERR(trans));
 
 	lock_chunks(root);
 
@@ -2046,7 +2056,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
 		BUG_ON(ret);
 
 		trans = btrfs_start_transaction(dev_root, 0);
-		BUG_ON(!trans);
+		BUG_ON(IS_ERR(trans));
 
 		ret = btrfs_grow_device(trans, device, old_size);
 		BUG_ON(ret);
@@ -2212,6 +2222,11 @@ again:
 
 	/* Shrinking succeeded, else we would be at "done". */
 	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto done;
+	}
+
 	lock_chunks(root);
 
 	device->disk_total_bytes = new_size;
-- 
cgit v1.2.3


From 2a7dba391e5628ad665ce84ef9a6648da541ebab Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 1 Feb 2011 11:05:39 -0500
Subject: fs/vfs/security: pass last path component to LSM on inode creation

SELinux would like to implement a new labeling behavior of newly created
inodes.  We currently label new inodes based on the parent and the creating
process.  This new behavior would also take into account the name of the
new object when deciding the new label.  This is not the (supposed) full path,
just the last component of the path.

This is very useful because creating /etc/shadow is different than creating
/etc/passwd but the kernel hooks are unable to differentiate these
operations.  We currently require that userspace realize it is doing some
difficult operation like that and then userspace jumps through SELinux hoops
to get things set up correctly.  This patch does not implement new
behavior, that is obviously contained in a separate SELinux patch, but it
does pass the needed name down to the correct LSM hook.  If no such name
exists it is fine to pass NULL.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/btrfs/inode.c             | 13 +++++++------
 fs/btrfs/xattr.c             |  6 ++++--
 fs/btrfs/xattr.h             |  3 ++-
 fs/ext2/ext2.h               |  2 +-
 fs/ext2/ialloc.c             |  5 +++--
 fs/ext2/namei.c              |  8 ++++----
 fs/ext2/xattr.h              |  6 ++++--
 fs/ext2/xattr_security.c     |  5 +++--
 fs/ext3/ialloc.c             |  5 +++--
 fs/ext3/namei.c              |  8 ++++----
 fs/ext3/xattr.h              |  4 ++--
 fs/ext3/xattr_security.c     |  5 +++--
 fs/ext4/ialloc.c             |  2 +-
 fs/ext4/xattr.h              |  4 ++--
 fs/ext4/xattr_security.c     |  5 +++--
 fs/gfs2/inode.c              |  7 ++++---
 fs/jffs2/dir.c               |  9 ++++-----
 fs/jffs2/nodelist.h          |  2 +-
 fs/jffs2/security.c          |  5 +++--
 fs/jffs2/write.c             | 18 ++++++++++--------
 fs/jffs2/xattr.h             |  5 +++--
 fs/jfs/jfs_xattr.h           |  5 +++--
 fs/jfs/namei.c               |  8 ++++----
 fs/jfs/xattr.c               |  6 ++++--
 fs/ocfs2/namei.c             |  4 ++--
 fs/ocfs2/refcounttree.c      |  3 ++-
 fs/ocfs2/xattr.c             | 10 ++++++----
 fs/ocfs2/xattr.h             |  4 +++-
 fs/reiserfs/namei.c          |  9 +++++----
 fs/reiserfs/xattr_security.c |  3 ++-
 fs/xfs/linux-2.6/xfs_iops.c  |  9 +++++----
 31 files changed, 107 insertions(+), 81 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a0ff46a4789..49c04bec6a9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -90,13 +90,14 @@ static noinline int cow_file_range(struct inode *inode,
 				   unsigned long *nr_written, int unlock);
 
 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
-				     struct inode *inode,  struct inode *dir)
+				     struct inode *inode,  struct inode *dir,
+				     const struct qstr *qstr)
 {
 	int err;
 
 	err = btrfs_init_acl(trans, inode, dir);
 	if (!err)
-		err = btrfs_xattr_security_init(trans, inode, dir);
+		err = btrfs_xattr_security_init(trans, inode, dir, qstr);
 	return err;
 }
 
@@ -4675,7 +4676,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(inode))
 		goto out_unlock;
 
-	err = btrfs_init_inode_security(trans, inode, dir);
+	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
 	if (err) {
 		drop_inode = 1;
 		goto out_unlock;
@@ -4736,7 +4737,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(inode))
 		goto out_unlock;
 
-	err = btrfs_init_inode_security(trans, inode, dir);
+	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
 	if (err) {
 		drop_inode = 1;
 		goto out_unlock;
@@ -4864,7 +4865,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 	drop_on_err = 1;
 
-	err = btrfs_init_inode_security(trans, inode, dir);
+	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
 	if (err)
 		goto out_fail;
 
@@ -6946,7 +6947,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(inode))
 		goto out_unlock;
 
-	err = btrfs_init_inode_security(trans, inode, dir);
+	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
 	if (err) {
 		drop_inode = 1;
 		goto out_unlock;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 698fdd2c739..3338a7e61d2 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -352,7 +352,8 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
 }
 
 int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
-			      struct inode *inode, struct inode *dir)
+			      struct inode *inode, struct inode *dir,
+			      const struct qstr *qstr)
 {
 	int err;
 	size_t len;
@@ -360,7 +361,8 @@ int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
 	char *suffix;
 	char *name;
 
-	err = security_inode_init_security(inode, dir, &suffix, &value, &len);
+	err = security_inode_init_security(inode, dir, qstr, &suffix, &value,
+					   &len);
 	if (err) {
 		if (err == -EOPNOTSUPP)
 			return 0;
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 7a43fd640bb..b3cc8039134 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -37,6 +37,7 @@ extern int btrfs_setxattr(struct dentry *dentry, const char *name,
 extern int btrfs_removexattr(struct dentry *dentry, const char *name);
 
 extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
-				     struct inode *inode, struct inode *dir);
+				     struct inode *inode, struct inode *dir,
+				     const struct qstr *qstr);
 
 #endif /* __XATTR__ */
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 6346a2acf32..1b48c337087 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -110,7 +110,7 @@ extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **);
 extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int);
 
 /* ialloc.c */
-extern struct inode * ext2_new_inode (struct inode *, int);
+extern struct inode * ext2_new_inode (struct inode *, int, const struct qstr *);
 extern void ext2_free_inode (struct inode *);
 extern unsigned long ext2_count_free_inodes (struct super_block *);
 extern void ext2_check_inodes_bitmap (struct super_block *);
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index ad70479aabf..ee9ed31948e 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -429,7 +429,8 @@ found:
 	return group;
 }
 
-struct inode *ext2_new_inode(struct inode *dir, int mode)
+struct inode *ext2_new_inode(struct inode *dir, int mode,
+			     const struct qstr *qstr)
 {
 	struct super_block *sb;
 	struct buffer_head *bitmap_bh = NULL;
@@ -585,7 +586,7 @@ got:
 	if (err)
 		goto fail_free_drop;
 
-	err = ext2_init_security(inode,dir);
+	err = ext2_init_security(inode, dir, qstr);
 	if (err)
 		goto fail_free_drop;
 
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index f8aecd2e329..368d7049ac8 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -104,7 +104,7 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, st
 
 	dquot_initialize(dir);
 
-	inode = ext2_new_inode(dir, mode);
+	inode = ext2_new_inode(dir, mode, &dentry->d_name);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
 
@@ -133,7 +133,7 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_
 
 	dquot_initialize(dir);
 
-	inode = ext2_new_inode (dir, mode);
+	inode = ext2_new_inode (dir, mode, &dentry->d_name);
 	err = PTR_ERR(inode);
 	if (!IS_ERR(inode)) {
 		init_special_inode(inode, inode->i_mode, rdev);
@@ -159,7 +159,7 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry,
 
 	dquot_initialize(dir);
 
-	inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO);
+	inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO, &dentry->d_name);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out;
@@ -230,7 +230,7 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
 
 	inode_inc_link_count(dir);
 
-	inode = ext2_new_inode (dir, S_IFDIR | mode);
+	inode = ext2_new_inode(dir, S_IFDIR | mode, &dentry->d_name);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_dir;
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index a1a1c218461..5e41cccff76 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -116,9 +116,11 @@ exit_ext2_xattr(void)
 # endif  /* CONFIG_EXT2_FS_XATTR */
 
 #ifdef CONFIG_EXT2_FS_SECURITY
-extern int ext2_init_security(struct inode *inode, struct inode *dir);
+extern int ext2_init_security(struct inode *inode, struct inode *dir,
+			      const struct qstr *qstr);
 #else
-static inline int ext2_init_security(struct inode *inode, struct inode *dir)
+static inline int ext2_init_security(struct inode *inode, struct inode *dir,
+				     const struct qstr *qstr)
 {
 	return 0;
 }
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index 3004e15d5da..5d979b4347b 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -47,14 +47,15 @@ ext2_xattr_security_set(struct dentry *dentry, const char *name,
 }
 
 int
-ext2_init_security(struct inode *inode, struct inode *dir)
+ext2_init_security(struct inode *inode, struct inode *dir,
+		   const struct qstr *qstr)
 {
 	int err;
 	size_t len;
 	void *value;
 	char *name;
 
-	err = security_inode_init_security(inode, dir, &name, &value, &len);
+	err = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
 	if (err) {
 		if (err == -EOPNOTSUPP)
 			return 0;
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 9724aef2246..bfc2dc43681 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -404,7 +404,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent)
  * For other inodes, search forward from the parent directory's block
  * group to find a free inode.
  */
-struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode)
+struct inode *ext3_new_inode(handle_t *handle, struct inode * dir,
+			     const struct qstr *qstr, int mode)
 {
 	struct super_block *sb;
 	struct buffer_head *bitmap_bh = NULL;
@@ -589,7 +590,7 @@ got:
 	if (err)
 		goto fail_free_drop;
 
-	err = ext3_init_security(handle,inode, dir);
+	err = ext3_init_security(handle, inode, dir, qstr);
 	if (err)
 		goto fail_free_drop;
 
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index bce9dce639b..a900033efcc 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1707,7 +1707,7 @@ retry:
 	if (IS_DIRSYNC(dir))
 		handle->h_sync = 1;
 
-	inode = ext3_new_inode (handle, dir, mode);
+	inode = ext3_new_inode (handle, dir, &dentry->d_name, mode);
 	err = PTR_ERR(inode);
 	if (!IS_ERR(inode)) {
 		inode->i_op = &ext3_file_inode_operations;
@@ -1743,7 +1743,7 @@ retry:
 	if (IS_DIRSYNC(dir))
 		handle->h_sync = 1;
 
-	inode = ext3_new_inode (handle, dir, mode);
+	inode = ext3_new_inode (handle, dir, &dentry->d_name, mode);
 	err = PTR_ERR(inode);
 	if (!IS_ERR(inode)) {
 		init_special_inode(inode, inode->i_mode, rdev);
@@ -1781,7 +1781,7 @@ retry:
 	if (IS_DIRSYNC(dir))
 		handle->h_sync = 1;
 
-	inode = ext3_new_inode (handle, dir, S_IFDIR | mode);
+	inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFDIR | mode);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_stop;
@@ -2195,7 +2195,7 @@ retry:
 	if (IS_DIRSYNC(dir))
 		handle->h_sync = 1;
 
-	inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
+	inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFLNK|S_IRWXUGO);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_stop;
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
index 377fe720116..2be4f69bfa6 100644
--- a/fs/ext3/xattr.h
+++ b/fs/ext3/xattr.h
@@ -128,10 +128,10 @@ exit_ext3_xattr(void)
 
 #ifdef CONFIG_EXT3_FS_SECURITY
 extern int ext3_init_security(handle_t *handle, struct inode *inode,
-				struct inode *dir);
+			      struct inode *dir, const struct qstr *qstr);
 #else
 static inline int ext3_init_security(handle_t *handle, struct inode *inode,
-				struct inode *dir)
+				     struct inode *dir, const struct qstr *qstr)
 {
 	return 0;
 }
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 03a99bfc59f..b8d9f83aa5c 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -49,14 +49,15 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name,
 }
 
 int
-ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
+ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
+		   const struct qstr *qstr)
 {
 	int err;
 	size_t len;
 	void *value;
 	char *name;
 
-	err = security_inode_init_security(inode, dir, &name, &value, &len);
+	err = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
 	if (err) {
 		if (err == -EOPNOTSUPP)
 			return 0;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 1ce240a23eb..49b6cfd1fc4 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1042,7 +1042,7 @@ got:
 	if (err)
 		goto fail_free_drop;
 
-	err = ext4_init_security(handle, inode, dir);
+	err = ext4_init_security(handle, inode, dir, qstr);
 	if (err)
 		goto fail_free_drop;
 
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 1ef16520b95..25b7387ff18 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -145,10 +145,10 @@ ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 
 #ifdef CONFIG_EXT4_FS_SECURITY
 extern int ext4_init_security(handle_t *handle, struct inode *inode,
-				struct inode *dir);
+			      struct inode *dir, const struct qstr *qstr);
 #else
 static inline int ext4_init_security(handle_t *handle, struct inode *inode,
-				struct inode *dir)
+				     struct inode *dir, const struct qstr *qstr)
 {
 	return 0;
 }
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 9b21268e121..007c3bfbf09 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -49,14 +49,15 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name,
 }
 
 int
-ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
+ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
+		   const struct qstr *qstr)
 {
 	int err;
 	size_t len;
 	void *value;
 	char *name;
 
-	err = security_inode_init_security(inode, dir, &name, &value, &len);
+	err = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
 	if (err) {
 		if (err == -EOPNOTSUPP)
 			return 0;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 2232b3c780b..de35ca7d798 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -791,14 +791,15 @@ fail:
 	return error;
 }
 
-static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip)
+static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
+			      const struct qstr *qstr)
 {
 	int err;
 	size_t len;
 	void *value;
 	char *name;
 
-	err = security_inode_init_security(&ip->i_inode, &dip->i_inode,
+	err = security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr,
 					   &name, &value, &len);
 
 	if (err) {
@@ -882,7 +883,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
 	if (error)
 		goto fail_gunlock2;
 
-	error = gfs2_security_init(dip, GFS2_I(inode));
+	error = gfs2_security_init(dip, GFS2_I(inode), name);
 	if (error)
 		goto fail_gunlock2;
 
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 92978658ed1..82faddd1f32 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -215,8 +215,7 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode,
 	   no chance of AB-BA deadlock involving its f->sem). */
 	mutex_unlock(&f->sem);
 
-	ret = jffs2_do_create(c, dir_f, f, ri,
-			      dentry->d_name.name, dentry->d_name.len);
+	ret = jffs2_do_create(c, dir_f, f, ri, &dentry->d_name);
 	if (ret)
 		goto fail;
 
@@ -386,7 +385,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 
 	jffs2_complete_reservation(c);
 
-	ret = jffs2_init_security(inode, dir_i);
+	ret = jffs2_init_security(inode, dir_i, &dentry->d_name);
 	if (ret)
 		goto fail;
 
@@ -530,7 +529,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
 
 	jffs2_complete_reservation(c);
 
-	ret = jffs2_init_security(inode, dir_i);
+	ret = jffs2_init_security(inode, dir_i, &dentry->d_name);
 	if (ret)
 		goto fail;
 
@@ -703,7 +702,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
 
 	jffs2_complete_reservation(c);
 
-	ret = jffs2_init_security(inode, dir_i);
+	ret = jffs2_init_security(inode, dir_i, &dentry->d_name);
 	if (ret)
 		goto fail;
 
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 5a53d9bdb2b..e4619b00f7c 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -401,7 +401,7 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
 			    struct jffs2_raw_inode *ri, unsigned char *buf,
 			    uint32_t offset, uint32_t writelen, uint32_t *retlen);
 int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, struct jffs2_inode_info *f,
-		    struct jffs2_raw_inode *ri, const char *name, int namelen);
+		    struct jffs2_raw_inode *ri, const struct qstr *qstr);
 int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, const char *name,
 		    int namelen, struct jffs2_inode_info *dead_f, uint32_t time);
 int jffs2_do_link(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint32_t ino,
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index 239f51216a6..cfeb7164b08 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -23,14 +23,15 @@
 #include "nodelist.h"
 
 /* ---- Initial Security Label Attachment -------------- */
-int jffs2_init_security(struct inode *inode, struct inode *dir)
+int jffs2_init_security(struct inode *inode, struct inode *dir,
+			const struct qstr *qstr)
 {
 	int rc;
 	size_t len;
 	void *value;
 	char *name;
 
-	rc = security_inode_init_security(inode, dir, &name, &value, &len);
+	rc = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
 	if (rc) {
 		if (rc == -EOPNOTSUPP)
 			return 0;
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index c819eb0e982..30d175b6d29 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -424,7 +424,9 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
 	return ret;
 }
 
-int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, struct jffs2_inode_info *f, struct jffs2_raw_inode *ri, const char *name, int namelen)
+int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f,
+		    struct jffs2_inode_info *f, struct jffs2_raw_inode *ri,
+		    const struct qstr *qstr)
 {
 	struct jffs2_raw_dirent *rd;
 	struct jffs2_full_dnode *fn;
@@ -466,15 +468,15 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str
 	mutex_unlock(&f->sem);
 	jffs2_complete_reservation(c);
 
-	ret = jffs2_init_security(&f->vfs_inode, &dir_f->vfs_inode);
+	ret = jffs2_init_security(&f->vfs_inode, &dir_f->vfs_inode, qstr);
 	if (ret)
 		return ret;
 	ret = jffs2_init_acl_post(&f->vfs_inode);
 	if (ret)
 		return ret;
 
-	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
-				ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
+	ret = jffs2_reserve_space(c, sizeof(*rd)+qstr->len, &alloclen,
+				ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(qstr->len));
 
 	if (ret) {
 		/* Eep. */
@@ -493,19 +495,19 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str
 
 	rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
 	rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT);
-	rd->totlen = cpu_to_je32(sizeof(*rd) + namelen);
+	rd->totlen = cpu_to_je32(sizeof(*rd) + qstr->len);
 	rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4));
 
 	rd->pino = cpu_to_je32(dir_f->inocache->ino);
 	rd->version = cpu_to_je32(++dir_f->highest_version);
 	rd->ino = ri->ino;
 	rd->mctime = ri->ctime;
-	rd->nsize = namelen;
+	rd->nsize = qstr->len;
 	rd->type = DT_REG;
 	rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
-	rd->name_crc = cpu_to_je32(crc32(0, name, namelen));
+	rd->name_crc = cpu_to_je32(crc32(0, qstr->name, qstr->len));
 
-	fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, ALLOC_NORMAL);
+	fd = jffs2_write_dirent(c, dir_f, rd, qstr->name, qstr->len, ALLOC_NORMAL);
 
 	jffs2_free_raw_dirent(rd);
 
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index cf4f5759b42..7be4beb306f 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -121,10 +121,11 @@ extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
 #endif /* CONFIG_JFFS2_FS_XATTR */
 
 #ifdef CONFIG_JFFS2_FS_SECURITY
-extern int jffs2_init_security(struct inode *inode, struct inode *dir);
+extern int jffs2_init_security(struct inode *inode, struct inode *dir,
+			       const struct qstr *qstr);
 extern const struct xattr_handler jffs2_security_xattr_handler;
 #else
-#define jffs2_init_security(inode,dir)	(0)
+#define jffs2_init_security(inode,dir,qstr)	(0)
 #endif /* CONFIG_JFFS2_FS_SECURITY */
 
 #endif /* _JFFS2_FS_XATTR_H_ */
diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h
index 88b6cc535bf..e9e100fd7c0 100644
--- a/fs/jfs/jfs_xattr.h
+++ b/fs/jfs/jfs_xattr.h
@@ -62,10 +62,11 @@ extern ssize_t jfs_listxattr(struct dentry *, char *, size_t);
 extern int jfs_removexattr(struct dentry *, const char *);
 
 #ifdef CONFIG_JFS_SECURITY
-extern int jfs_init_security(tid_t, struct inode *, struct inode *);
+extern int jfs_init_security(tid_t, struct inode *, struct inode *,
+			     const struct qstr *);
 #else
 static inline int jfs_init_security(tid_t tid, struct inode *inode,
-				    struct inode *dir)
+				    struct inode *dir, const struct qstr *qstr)
 {
 	return 0;
 }
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 4414e3a4226..030b9174e41 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -115,7 +115,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
 	if (rc)
 		goto out3;
 
-	rc = jfs_init_security(tid, ip, dip);
+	rc = jfs_init_security(tid, ip, dip, &dentry->d_name);
 	if (rc) {
 		txAbort(tid, 0);
 		goto out3;
@@ -253,7 +253,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
 	if (rc)
 		goto out3;
 
-	rc = jfs_init_security(tid, ip, dip);
+	rc = jfs_init_security(tid, ip, dip, &dentry->d_name);
 	if (rc) {
 		txAbort(tid, 0);
 		goto out3;
@@ -932,7 +932,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
 	mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT);
 	mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
 
-	rc = jfs_init_security(tid, ip, dip);
+	rc = jfs_init_security(tid, ip, dip, &dentry->d_name);
 	if (rc)
 		goto out3;
 
@@ -1395,7 +1395,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
 	if (rc)
 		goto out3;
 
-	rc = jfs_init_security(tid, ip, dir);
+	rc = jfs_init_security(tid, ip, dir, &dentry->d_name);
 	if (rc) {
 		txAbort(tid, 0);
 		goto out3;
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 2d7f165d0f1..3fa4c32272d 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -1091,7 +1091,8 @@ int jfs_removexattr(struct dentry *dentry, const char *name)
 }
 
 #ifdef CONFIG_JFS_SECURITY
-int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir)
+int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir,
+		      const struct qstr *qstr)
 {
 	int rc;
 	size_t len;
@@ -1099,7 +1100,8 @@ int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir)
 	char *suffix;
 	char *name;
 
-	rc = security_inode_init_security(inode, dir, &suffix, &value, &len);
+	rc = security_inode_init_security(inode, dir, qstr, &suffix, &value,
+					  &len);
 	if (rc) {
 		if (rc == -EOPNOTSUPP)
 			return 0;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index d14cad6e2e4..bd8d6461a68 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -294,7 +294,7 @@ static int ocfs2_mknod(struct inode *dir,
 	}
 
 	/* get security xattr */
-	status = ocfs2_init_security_get(inode, dir, &si);
+	status = ocfs2_init_security_get(inode, dir, &dentry->d_name, &si);
 	if (status) {
 		if (status == -EOPNOTSUPP)
 			si.enable = 0;
@@ -1665,7 +1665,7 @@ static int ocfs2_symlink(struct inode *dir,
 	}
 
 	/* get security xattr */
-	status = ocfs2_init_security_get(inode, dir, &si);
+	status = ocfs2_init_security_get(inode, dir, &dentry->d_name, &si);
 	if (status) {
 		if (status == -EOPNOTSUPP)
 			si.enable = 0;
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index b5f9160e93e..cd3f5b4832e 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4325,7 +4325,8 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
 
 	/* If the security isn't preserved, we need to re-initialize them. */
 	if (!preserve) {
-		error = ocfs2_init_security_and_acl(dir, new_orphan_inode);
+		error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
+						    &new_dentry->d_name);
 		if (error)
 			mlog_errno(error);
 	}
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 67cd4391464..6bb602486c6 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7185,7 +7185,8 @@ out:
  * must not hold any lock expect i_mutex.
  */
 int ocfs2_init_security_and_acl(struct inode *dir,
-				struct inode *inode)
+				struct inode *inode,
+				const struct qstr *qstr)
 {
 	int ret = 0;
 	struct buffer_head *dir_bh = NULL;
@@ -7193,7 +7194,7 @@ int ocfs2_init_security_and_acl(struct inode *dir,
 		.enable = 1,
 	};
 
-	ret = ocfs2_init_security_get(inode, dir, &si);
+	ret = ocfs2_init_security_get(inode, dir, qstr, &si);
 	if (!ret) {
 		ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY,
 				      si.name, si.value, si.value_len,
@@ -7261,13 +7262,14 @@ static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name,
 
 int ocfs2_init_security_get(struct inode *inode,
 			    struct inode *dir,
+			    const struct qstr *qstr,
 			    struct ocfs2_security_xattr_info *si)
 {
 	/* check whether ocfs2 support feature xattr */
 	if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb)))
 		return -EOPNOTSUPP;
-	return security_inode_init_security(inode, dir, &si->name, &si->value,
-					    &si->value_len);
+	return security_inode_init_security(inode, dir, qstr, &si->name,
+					    &si->value, &si->value_len);
 }
 
 int ocfs2_init_security_set(handle_t *handle,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index aa64bb37a65..d63cfb72316 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -57,6 +57,7 @@ int ocfs2_has_inline_xattr_value_outside(struct inode *inode,
 					 struct ocfs2_dinode *di);
 int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
 int ocfs2_init_security_get(struct inode *, struct inode *,
+			    const struct qstr *,
 			    struct ocfs2_security_xattr_info *);
 int ocfs2_init_security_set(handle_t *, struct inode *,
 			    struct buffer_head *,
@@ -94,5 +95,6 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
 			 struct buffer_head *new_bh,
 			 bool preserve_security);
 int ocfs2_init_security_and_acl(struct inode *dir,
-				struct inode *inode);
+				struct inode *inode,
+				const struct qstr *qstr);
 #endif /* OCFS2_XATTR_H */
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index ba5f51ec345..d5b22ed0677 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -593,7 +593,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	new_inode_init(inode, dir, mode);
 
 	jbegin_count += reiserfs_cache_default_acl(dir);
-	retval = reiserfs_security_init(dir, inode, &security);
+	retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
 	if (retval < 0) {
 		drop_new_inode(inode);
 		return retval;
@@ -667,7 +667,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
 	new_inode_init(inode, dir, mode);
 
 	jbegin_count += reiserfs_cache_default_acl(dir);
-	retval = reiserfs_security_init(dir, inode, &security);
+	retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
 	if (retval < 0) {
 		drop_new_inode(inode);
 		return retval;
@@ -747,7 +747,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	new_inode_init(inode, dir, mode);
 
 	jbegin_count += reiserfs_cache_default_acl(dir);
-	retval = reiserfs_security_init(dir, inode, &security);
+	retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
 	if (retval < 0) {
 		drop_new_inode(inode);
 		return retval;
@@ -1032,7 +1032,8 @@ static int reiserfs_symlink(struct inode *parent_dir,
 	}
 	new_inode_init(inode, parent_dir, mode);
 
-	retval = reiserfs_security_init(parent_dir, inode, &security);
+	retval = reiserfs_security_init(parent_dir, inode, &dentry->d_name,
+					&security);
 	if (retval < 0) {
 		drop_new_inode(inode);
 		return retval;
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 237c6928d3c..ef66c18a933 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -54,6 +54,7 @@ static size_t security_list(struct dentry *dentry, char *list, size_t list_len,
  * of blocks needed for the transaction. If successful, reiserfs_security
  * must be released using reiserfs_security_free when the caller is done. */
 int reiserfs_security_init(struct inode *dir, struct inode *inode,
+			   const struct qstr *qstr,
 			   struct reiserfs_security_handle *sec)
 {
 	int blocks = 0;
@@ -65,7 +66,7 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
 	if (IS_PRIVATE(dir))
 		return 0;
 
-	error = security_inode_init_security(inode, dir, &sec->name,
+	error = security_inode_init_security(inode, dir, qstr, &sec->name,
 					     &sec->value, &sec->length);
 	if (error) {
 		if (error == -EOPNOTSUPP)
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 94d5fd6a297..d9298cf6026 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -103,7 +103,8 @@ xfs_mark_inode_dirty(
 STATIC int
 xfs_init_security(
 	struct inode	*inode,
-	struct inode	*dir)
+	struct inode	*dir,
+	const struct qstr *qstr)
 {
 	struct xfs_inode *ip = XFS_I(inode);
 	size_t		length;
@@ -111,7 +112,7 @@ xfs_init_security(
 	unsigned char	*name;
 	int		error;
 
-	error = security_inode_init_security(inode, dir, (char **)&name,
+	error = security_inode_init_security(inode, dir, qstr, (char **)&name,
 					     &value, &length);
 	if (error) {
 		if (error == -EOPNOTSUPP)
@@ -195,7 +196,7 @@ xfs_vn_mknod(
 
 	inode = VFS_I(ip);
 
-	error = xfs_init_security(inode, dir);
+	error = xfs_init_security(inode, dir, &dentry->d_name);
 	if (unlikely(error))
 		goto out_cleanup_inode;
 
@@ -368,7 +369,7 @@ xfs_vn_symlink(
 
 	inode = VFS_I(cip);
 
-	error = xfs_init_security(inode, dir);
+	error = xfs_init_security(inode, dir, &dentry->d_name);
 	if (unlikely(error))
 		goto out_cleanup_inode;
 
-- 
cgit v1.2.3


From 8e6c96935fcc1ed3dbebc96fddfef3f2f2395afc Mon Sep 17 00:00:00 2001
From: Lucian Adrian Grijincu <lucian.grijincu@gmail.com>
Date: Tue, 1 Feb 2011 18:42:22 +0200
Subject: security/selinux: fix /proc/sys/ labeling

This fixes an old (2007) selinux regression: filesystem labeling for
/proc/sys returned
     -r--r--r-- unknown                          /proc/sys/fs/file-nr
instead of
     -r--r--r-- system_u:object_r:sysctl_fs_t:s0 /proc/sys/fs/file-nr

Events that lead to breaking of /proc/sys/ selinux labeling:

1) sysctl was reimplemented to route all calls through /proc/sys/

    commit 77b14db502cb85a031fe8fde6c85d52f3e0acb63
    [PATCH] sysctl: reimplement the sysctl proc support

2) proc_dir_entry was removed from ctl_table:

    commit 3fbfa98112fc3962c416452a0baf2214381030e6
    [PATCH] sysctl: remove the proc_dir_entry member for the sysctl tables

3) selinux still walked the proc_dir_entry tree to apply
   labeling. Because ctl_tables don't have a proc_dir_entry, we did
   not label /proc/sys/ inodes any more. To achieve this the /proc/sys/
   inodes were marked private and private inodes were ignored by
   selinux.

    commit bbaca6c2e7ef0f663bc31be4dad7cf530f6c4962
    [PATCH] selinux: enhance selinux to always ignore private inodes

    commit 86a71dbd3e81e8870d0f0e56b87875f57e58222b
    [PATCH] sysctl: hide the sysctl proc inodes from selinux

Access control checks have been done by means of a special sysctl hook
that was called for read/write accesses to any /proc/sys/ entry.

We don't have to do this because, instead of walking the
proc_dir_entry tree we can walk the dentry tree (as done in this
patch). With this patch:
* we don't mark /proc/sys/ inodes as private
* we don't need the sysctl security hook
* we walk the dentry tree to find the path to the inode.

We have to strip the PID in /proc/PID/ entries that have a
proc_dir_entry because selinux does not know how to label paths like
'/1/net/rpc/nfsd.fh' (and defaults to 'proc_t' labeling). Selinux does
know of '/net/rpc/nfsd.fh' (and applies the 'sysctl_rpc_t' label).

PID stripping from the path was done implicitly in the previous code
because the proc_dir_entry tree had the root in '/net' in the example
from above. The dentry tree has the root in '/1'.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Lucian Adrian Grijincu <lucian.grijincu@gmail.com>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/proc/proc_sysctl.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 09a1f92a34e..fb707e018a8 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -32,7 +32,6 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
 	ei->sysctl_entry = table;
 
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-	inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */
 	inode->i_mode = table->mode;
 	if (!table->child) {
 		inode->i_mode |= S_IFREG;
-- 
cgit v1.2.3


From 9587fcff42f5bece3c0a44066b079235ee73cbb3 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 1 Feb 2011 08:40:43 -0500
Subject: cifs: fix length vs. total_read confusion in cifs_demultiplex_thread

length at this point is the length returned by the last kernel_recvmsg
call. total_read is the length of all of the data read so far. length
is more or less meaningless at this point, so use total_read for
everything.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Pavel Shilovsky <piastry@etersoft.ru>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 47d8ff62368..945b2202275 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -578,12 +578,12 @@ incomplete_rcv:
 		else if (reconnect == 1)
 			continue;
 
-		length += 4; /* account for rfc1002 hdr */
+		total_read += 4; /* account for rfc1002 hdr */
 
-
-		dump_smb(smb_buffer, length);
-		if (checkSMB(smb_buffer, smb_buffer->Mid, total_read+4)) {
-			cifs_dump_mem("Bad SMB: ", smb_buffer, 48);
+		dump_smb(smb_buffer, total_read);
+		if (checkSMB(smb_buffer, smb_buffer->Mid, total_read)) {
+			cifs_dump_mem("Bad SMB: ", smb_buffer,
+					total_read < 48 ? total_read : 48);
 			continue;
 		}
 
-- 
cgit v1.2.3


From b9c93bb7deadc7cdd00415edc6b38d67a26c1c7a Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 2 Feb 2011 14:48:10 +0000
Subject: GFS2: Improve cluster mmap scalability

The mmap system call grabs a glock when an update to atime may be
required. It does this in order to ensure that the flags on the
inode are uptodate, but since it will only mark atime for a future
update, an exclusive lock is not required here (one will be taken
later when the actual update is performed).

Also, the lock can be skipped when the mount is marked noatime in
addition to the original check which only looked at the noatime
flag for the inode itself.

This should increase the scalability of the mmap call when multiple
nodes are all mmaping the same file.

Reported-by: Scooter Morris <scooter@cgl.ucsf.edu>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/file.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 7cfdcb91336..216ad2774a6 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -448,15 +448,20 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
 
-	if (!(file->f_flags & O_NOATIME)) {
+	if (!(file->f_flags & O_NOATIME) &&
+	    !IS_NOATIME(&ip->i_inode)) {
 		struct gfs2_holder i_gh;
 		int error;
 
-		gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+		gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
 		error = gfs2_glock_nq(&i_gh);
-		file_accessed(file);
-		if (error == 0)
-			gfs2_glock_dq_uninit(&i_gh);
+		if (error == 0) {
+			file_accessed(file);
+			gfs2_glock_dq(&i_gh);
+		}
+		gfs2_holder_uninit(&i_gh);
+		if (error)
+			return error;
 	}
 	vma->vm_ops = &gfs2_vm_ops;
 	vma->vm_flags |= VM_CAN_NONLINEAR;
-- 
cgit v1.2.3


From 0781b909b5586f4db720b5d1838b78f9d8e42f14 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 1 Feb 2011 15:52:35 -0800
Subject: epoll: epoll_wait() should not use timespec_add_ns()

commit 95aac7b1cd224f ("epoll: make epoll_wait() use the hrtimer range
feature") added a performance regression because it uses timespec_add_ns()
with potential very large 'ns' values.

[akpm@linux-foundation.org: s/epoll_set_mstimeout/ep_set_mstimeout/, per Davide]
Reported-by: Simon Kirby <sim@hostway.ca>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Shawn Bohrer <shawn.bohrer@gmail.com>
Acked-by: Davide Libenzi <davidel@xmailserver.org>
Cc: <stable@kernel.org>		[2.6.37.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index cc8a9b7d606..267d0ada454 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1114,6 +1114,17 @@ static int ep_send_events(struct eventpoll *ep,
 	return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
 }
 
+static inline struct timespec ep_set_mstimeout(long ms)
+{
+	struct timespec now, ts = {
+		.tv_sec = ms / MSEC_PER_SEC,
+		.tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
+	};
+
+	ktime_get_ts(&now);
+	return timespec_add_safe(now, ts);
+}
+
 static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		   int maxevents, long timeout)
 {
@@ -1121,12 +1132,11 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 	unsigned long flags;
 	long slack;
 	wait_queue_t wait;
-	struct timespec end_time;
 	ktime_t expires, *to = NULL;
 
 	if (timeout > 0) {
-		ktime_get_ts(&end_time);
-		timespec_add_ns(&end_time, (u64)timeout * NSEC_PER_MSEC);
+		struct timespec end_time = ep_set_mstimeout(timeout);
+
 		slack = select_estimate_accuracy(&end_time);
 		to = &expires;
 		*to = timespec_to_ktime(end_time);
-- 
cgit v1.2.3


From 3cd90ea42f2c15f928b70ed66f6d8ed0a8e7aadd Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Tue, 1 Feb 2011 15:52:46 -0800
Subject: vfs: sparse: add __FMODE_EXEC

FMODE_EXEC is a constant of type fmode_t but was used with normal integer
constants.  This results in the following warnings from sparse.  Fix it
using the new macro __FMODE_EXEC.

 fs/exec.c:116:58: warning: restricted fmode_t degrades to integer
 fs/exec.c:689:58: warning: restricted fmode_t degrades to integer
 fs/fcntl.c:777:9: warning: restricted fmode_t degrades to integer

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c  | 4 ++--
 fs/fcntl.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index c62efcb959c..52a447d9b6a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -120,7 +120,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
 		goto out;
 
 	file = do_filp_open(AT_FDCWD, tmp,
-				O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0,
+				O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0,
 				MAY_READ | MAY_EXEC | MAY_OPEN);
 	putname(tmp);
 	error = PTR_ERR(file);
@@ -723,7 +723,7 @@ struct file *open_exec(const char *name)
 	int err;
 
 	file = do_filp_open(AT_FDCWD, name,
-				O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0,
+				O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0,
 				MAY_EXEC | MAY_OPEN);
 	if (IS_ERR(file))
 		goto out;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index ecc8b3954ed..cb1026181bd 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -815,7 +815,7 @@ static int __init fcntl_init(void)
 		__O_SYNC	| O_DSYNC	| FASYNC	|
 		O_DIRECT	| O_LARGEFILE	| O_DIRECTORY	|
 		O_NOFOLLOW	| O_NOATIME	| O_CLOEXEC	|
-		FMODE_EXEC
+		__FMODE_EXEC
 		));
 
 	fasync_cache = kmem_cache_create("fasync_cache",
-- 
cgit v1.2.3


From d54cdc8ca7aabc69e145a62155855db42b04ed0b Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Tue, 1 Feb 2011 15:52:47 -0800
Subject: fs: make block fiemap mapping length at least blocksize long

Some filesystems don't deal well with being asked to map less than
blocksize blocks (GFS2 for example).  Since we are always mapping at least
blocksize sections anyway, just make sure len is at least as big as a
blocksize so we don't trip up any filesystems.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ioctl.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/ioctl.c b/fs/ioctl.c
index a59635e295f..1eebeb72b20 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -273,6 +273,13 @@ int __generic_block_fiemap(struct inode *inode,
 		len = isize;
 	}
 
+	/*
+	 * Some filesystems can't deal with being asked to map less than
+	 * blocksize, so make sure our len is at least block length.
+	 */
+	if (logical_to_blk(inode, len) == 0)
+		len = blk_to_logical(inode, 1);
+
 	start_blk = logical_to_blk(inode, start);
 	last_blk = logical_to_blk(inode, start + len - 1);
 
-- 
cgit v1.2.3


From 0b0abeaf3d30cec03ac6497fe978b8f7edecc5ae Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Wed, 2 Feb 2011 21:02:12 +0200
Subject: Revert "exofs: Set i_mapping->backing_dev_info anyway"

This reverts commit 115e19c53501edc11f730191f7f047736815ae3d.

Apparently setting inode->bdi to one's own sb->s_bdi stops VFS from
sending *read-aheads*.  This problem was bisected to this commit.  A
revert fixes it.  I'll investigate further why this is happening for the
next kernel, but for now a revert.

I'm sending to stable@kernel.org as well, since it exists also in
2.6.37.  2.6.36 is good and does not have this patch.

CC: Stable Tree <stable@kernel.org>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exofs/inode.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 42685424817..a7555238c41 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -1030,7 +1030,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 		memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
 	}
 
-	inode->i_mapping->backing_dev_info = sb->s_bdi;
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_op = &exofs_file_inode_operations;
 		inode->i_fop = &exofs_file_operations;
@@ -1131,7 +1130,6 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
 
 	sbi = sb->s_fs_info;
 
-	inode->i_mapping->backing_dev_info = sb->s_bdi;
 	sb->s_dirt = 1;
 	inode_init_owner(inode, dir, mode);
 	inode->i_ino = sbi->s_nextid++;
-- 
cgit v1.2.3


From 8f1f745331c1b560f53c0d60e55a4f4f43f7cce5 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Thu, 3 Feb 2011 14:33:15 -0500
Subject: ext4: fix panic on module unload when stopping lazyinit thread

https://bugzilla.kernel.org/show_bug.cgi?id=27652

If the lazyinit thread is running, the teardown function
ext4_destroy_lazyinit_thread() has problems:

        ext4_clear_request_list();
        while (ext4_li_info->li_task) {
                wake_up(&ext4_li_info->li_wait_daemon);
                wait_event(ext4_li_info->li_wait_task,
                           ext4_li_info->li_task == NULL);
        }

Clearing the request list will cause the thread to exit and free
ext4_li_info, so then we're waiting on something which is getting
freed.

Fix this up by making the thread respond to kthread_stop, and exit,
without the need to wait for that exit in some other homegrown way.

Cc: stable@kernel.org
Reported-and-Tested-by: Tao Ma <boyu.mt@taobao.com>
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 48ce561fafa..3d8cf2cab37 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -77,6 +77,7 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
 		       const char *dev_name, void *data);
 static void ext4_destroy_lazyinit_thread(void);
 static void ext4_unregister_li_request(struct super_block *sb);
+static void ext4_clear_request_list(void);
 
 #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
 static struct file_system_type ext3_fs_type = {
@@ -2716,6 +2717,8 @@ static void ext4_unregister_li_request(struct super_block *sb)
 	mutex_unlock(&ext4_li_info->li_list_mtx);
 }
 
+static struct task_struct *ext4_lazyinit_task;
+
 /*
  * This is the function where ext4lazyinit thread lives. It walks
  * through the request list searching for next scheduled filesystem.
@@ -2784,6 +2787,10 @@ cont_thread:
 		if (time_before(jiffies, next_wakeup))
 			schedule();
 		finish_wait(&eli->li_wait_daemon, &wait);
+		if (kthread_should_stop()) {
+			ext4_clear_request_list();
+			goto exit_thread;
+		}
 	}
 
 exit_thread:
@@ -2808,6 +2815,7 @@ exit_thread:
 	wake_up(&eli->li_wait_task);
 
 	kfree(ext4_li_info);
+	ext4_lazyinit_task = NULL;
 	ext4_li_info = NULL;
 	mutex_unlock(&ext4_li_mtx);
 
@@ -2830,11 +2838,10 @@ static void ext4_clear_request_list(void)
 
 static int ext4_run_lazyinit_thread(void)
 {
-	struct task_struct *t;
-
-	t = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit");
-	if (IS_ERR(t)) {
-		int err = PTR_ERR(t);
+	ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
+					 ext4_li_info, "ext4lazyinit");
+	if (IS_ERR(ext4_lazyinit_task)) {
+		int err = PTR_ERR(ext4_lazyinit_task);
 		ext4_clear_request_list();
 		del_timer_sync(&ext4_li_info->li_timer);
 		kfree(ext4_li_info);
@@ -2985,16 +2992,10 @@ static void ext4_destroy_lazyinit_thread(void)
 	 * If thread exited earlier
 	 * there's nothing to be done.
 	 */
-	if (!ext4_li_info)
+	if (!ext4_li_info || !ext4_lazyinit_task)
 		return;
 
-	ext4_clear_request_list();
-
-	while (ext4_li_info->li_task) {
-		wake_up(&ext4_li_info->li_wait_daemon);
-		wait_event(ext4_li_info->li_wait_task,
-			   ext4_li_info->li_task == NULL);
-	}
+	kthread_stop(ext4_lazyinit_task);
 }
 
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
-- 
cgit v1.2.3


From 8f021222c1e2756ea4c9dde93b23e1d2a0a4ec37 Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Thu, 3 Feb 2011 14:33:33 -0500
Subject: ext4: unregister features interface on module unload

Ext4 features interface was not properly unregistered which led to
problems while unloading/reloading ext4 module. This commit fixes that by
adding proper kobject unregistration code into ext4_exit_fs() as well as
fail-path of ext4_init_fs()

Reported-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/super.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3d8cf2cab37..4898cb1ff60 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4769,7 +4769,7 @@ static struct file_system_type ext4_fs_type = {
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 
-int __init ext4_init_feat_adverts(void)
+static int __init ext4_init_feat_adverts(void)
 {
 	struct ext4_features *ef;
 	int ret = -ENOMEM;
@@ -4793,6 +4793,13 @@ out:
 	return ret;
 }
 
+static void ext4_exit_feat_adverts(void)
+{
+	kobject_put(&ext4_feat->f_kobj);
+	wait_for_completion(&ext4_feat->f_kobj_unregister);
+	kfree(ext4_feat);
+}
+
 static int __init ext4_init_fs(void)
 {
 	int err;
@@ -4839,7 +4846,7 @@ out1:
 out2:
 	ext4_exit_mballoc();
 out3:
-	kfree(ext4_feat);
+	ext4_exit_feat_adverts();
 	remove_proc_entry("fs/ext4", NULL);
 	kset_unregister(ext4_kset);
 out4:
@@ -4858,6 +4865,7 @@ static void __exit ext4_exit_fs(void)
 	destroy_inodecache();
 	ext4_exit_xattr();
 	ext4_exit_mballoc();
+	ext4_exit_feat_adverts();
 	remove_proc_entry("fs/ext4", NULL);
 	kset_unregister(ext4_kset);
 	ext4_exit_system_zone();
-- 
cgit v1.2.3


From dd68314ccf3fb918c1fb6471817edbc60ece4b52 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 3 Feb 2011 14:33:49 -0500
Subject: ext4: fix up ext4 error handling

Make sure the correct cleanup happens if we die while trying to
load the ext4 file system.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4898cb1ff60..86b05486dc6 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4810,13 +4810,17 @@ static int __init ext4_init_fs(void)
 		return err;
 	err = ext4_init_system_zone();
 	if (err)
-		goto out5;
+		goto out7;
 	ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
 	if (!ext4_kset)
-		goto out4;
+		goto out6;
 	ext4_proc_root = proc_mkdir("fs/ext4", NULL);
+	if (!ext4_proc_root)
+		goto out5;
 
 	err = ext4_init_feat_adverts();
+	if (err)
+		goto out4;
 
 	err = ext4_init_mballoc();
 	if (err)
@@ -4847,11 +4851,13 @@ out2:
 	ext4_exit_mballoc();
 out3:
 	ext4_exit_feat_adverts();
+out4:
 	remove_proc_entry("fs/ext4", NULL);
+out5:
 	kset_unregister(ext4_kset);
-out4:
+out6:
 	ext4_exit_system_zone();
-out5:
+out7:
 	ext4_exit_pageio();
 	return err;
 }
-- 
cgit v1.2.3


From c5b8d0bce052949e173b5b32f96bd59bceaa2ab0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Wed, 2 Feb 2011 09:32:39 -0700
Subject: hfsplus: fix failed mount handling

Currently the error handling in hfsplus_fill_super is a mess, and can
lead to accessing fields in the superblock that haven't even been set
up yet.  Fix this by making sure we do not set up sb->s_root until we
have the mount fully set up, and before that do proper step by step
unwinding instead of using hfsplus_put_super as a big hammer.

Reported-by: Dan Williams <dcbw@redhat.com>
Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/super.c | 106 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 61 insertions(+), 45 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 9a3b4795f43..b49b55584c8 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -338,20 +338,22 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	struct inode *root, *inode;
 	struct qstr str;
 	struct nls_table *nls = NULL;
-	int err = -EINVAL;
+	int err;
 
+	err = -EINVAL;
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
-		return -ENOMEM;
+		goto out;
 
 	sb->s_fs_info = sbi;
 	mutex_init(&sbi->alloc_mutex);
 	mutex_init(&sbi->vh_mutex);
 	hfsplus_fill_defaults(sbi);
+
+	err = -EINVAL;
 	if (!hfsplus_parse_options(data, sbi)) {
 		printk(KERN_ERR "hfs: unable to parse mount options\n");
-		err = -EINVAL;
-		goto cleanup;
+		goto out_unload_nls;
 	}
 
 	/* temporarily use utf8 to correctly find the hidden dir below */
@@ -359,16 +361,14 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->nls = load_nls("utf8");
 	if (!sbi->nls) {
 		printk(KERN_ERR "hfs: unable to load nls for utf8\n");
-		err = -EINVAL;
-		goto cleanup;
+		goto out_unload_nls;
 	}
 
 	/* Grab the volume header */
 	if (hfsplus_read_wrapper(sb)) {
 		if (!silent)
 			printk(KERN_WARNING "hfs: unable to find HFS+ superblock\n");
-		err = -EINVAL;
-		goto cleanup;
+		goto out_unload_nls;
 	}
 	vhdr = sbi->s_vhdr;
 
@@ -377,7 +377,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	if (be16_to_cpu(vhdr->version) < HFSPLUS_MIN_VERSION ||
 	    be16_to_cpu(vhdr->version) > HFSPLUS_CURRENT_VERSION) {
 		printk(KERN_ERR "hfs: wrong filesystem version\n");
-		goto cleanup;
+		goto out_free_vhdr;
 	}
 	sbi->total_blocks = be32_to_cpu(vhdr->total_blocks);
 	sbi->free_blocks = be32_to_cpu(vhdr->free_blocks);
@@ -421,19 +421,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
 	if (!sbi->ext_tree) {
 		printk(KERN_ERR "hfs: failed to load extents file\n");
-		goto cleanup;
+		goto out_free_vhdr;
 	}
 	sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID);
 	if (!sbi->cat_tree) {
 		printk(KERN_ERR "hfs: failed to load catalog file\n");
-		goto cleanup;
+		goto out_close_ext_tree;
 	}
 
 	inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID);
 	if (IS_ERR(inode)) {
 		printk(KERN_ERR "hfs: failed to load allocation file\n");
 		err = PTR_ERR(inode);
-		goto cleanup;
+		goto out_close_cat_tree;
 	}
 	sbi->alloc_file = inode;
 
@@ -442,14 +442,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	if (IS_ERR(root)) {
 		printk(KERN_ERR "hfs: failed to load root directory\n");
 		err = PTR_ERR(root);
-		goto cleanup;
-	}
-	sb->s_d_op = &hfsplus_dentry_operations;
-	sb->s_root = d_alloc_root(root);
-	if (!sb->s_root) {
-		iput(root);
-		err = -ENOMEM;
-		goto cleanup;
+		goto out_put_alloc_file;
 	}
 
 	str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
@@ -459,46 +452,69 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	if (!hfs_brec_read(&fd, &entry, sizeof(entry))) {
 		hfs_find_exit(&fd);
 		if (entry.type != cpu_to_be16(HFSPLUS_FOLDER))
-			goto cleanup;
+			goto out_put_root;
 		inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id));
 		if (IS_ERR(inode)) {
 			err = PTR_ERR(inode);
-			goto cleanup;
+			goto out_put_root;
 		}
 		sbi->hidden_dir = inode;
 	} else
 		hfs_find_exit(&fd);
 
-	if (sb->s_flags & MS_RDONLY)
-		goto out;
+	if (!(sb->s_flags & MS_RDONLY)) {
+		/*
+		 * H+LX == hfsplusutils, H+Lx == this driver, H+lx is unused
+		 * all three are registered with Apple for our use
+		 */
+		vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION);
+		vhdr->modify_date = hfsp_now2mt();
+		be32_add_cpu(&vhdr->write_count, 1);
+		vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT);
+		vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT);
+		hfsplus_sync_fs(sb, 1);
 
-	/* H+LX == hfsplusutils, H+Lx == this driver, H+lx is unused
-	 * all three are registered with Apple for our use
-	 */
-	vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION);
-	vhdr->modify_date = hfsp_now2mt();
-	be32_add_cpu(&vhdr->write_count, 1);
-	vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT);
-	vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT);
-	hfsplus_sync_fs(sb, 1);
-
-	if (!sbi->hidden_dir) {
-		mutex_lock(&sbi->vh_mutex);
-		sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
-		hfsplus_create_cat(sbi->hidden_dir->i_ino, sb->s_root->d_inode,
-				   &str, sbi->hidden_dir);
-		mutex_unlock(&sbi->vh_mutex);
-
-		hfsplus_mark_inode_dirty(sbi->hidden_dir, HFSPLUS_I_CAT_DIRTY);
+		if (!sbi->hidden_dir) {
+			mutex_lock(&sbi->vh_mutex);
+			sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
+			hfsplus_create_cat(sbi->hidden_dir->i_ino, root, &str,
+					   sbi->hidden_dir);
+			mutex_unlock(&sbi->vh_mutex);
+
+			hfsplus_mark_inode_dirty(sbi->hidden_dir,
+						 HFSPLUS_I_CAT_DIRTY);
+		}
 	}
-out:
+
+	sb->s_d_op = &hfsplus_dentry_operations;
+	sb->s_root = d_alloc_root(root);
+	if (!sb->s_root) {
+		err = -ENOMEM;
+		goto out_put_hidden_dir;
+	}
+
 	unload_nls(sbi->nls);
 	sbi->nls = nls;
 	return 0;
 
-cleanup:
-	hfsplus_put_super(sb);
+out_put_hidden_dir:
+	iput(sbi->hidden_dir);
+out_put_root:
+	iput(sbi->alloc_file);
+out_put_alloc_file:
+	iput(sbi->alloc_file);
+out_close_cat_tree:
+	hfs_btree_close(sbi->cat_tree);
+out_close_ext_tree:
+	hfs_btree_close(sbi->ext_tree);
+out_free_vhdr:
+	kfree(sbi->s_vhdr);
+	kfree(sbi->s_backup_vhdr);
+out_unload_nls:
+	unload_nls(sbi->nls);
 	unload_nls(nls);
+	kfree(sbi);
+out:
 	return err;
 }
 
-- 
cgit v1.2.3


From 14dd01f88319a37b06ca909738044e39ec5bfdee Mon Sep 17 00:00:00 2001
From: Chuck Ebbert <cebbert@redhat.com>
Date: Tue, 1 Feb 2011 16:41:55 -0500
Subject: hfsplus: do not leak buffer on error

Signed-Off-By: Chuck Ebbert <cebbert@redhat.com>
Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/part_tbl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c
index d66ad113b1c..40ad88c12c6 100644
--- a/fs/hfsplus/part_tbl.c
+++ b/fs/hfsplus/part_tbl.c
@@ -134,7 +134,7 @@ int hfs_part_find(struct super_block *sb,
 	res = hfsplus_submit_bio(sb->s_bdev, *part_start + HFS_PMAP_BLK,
 				 data, READ);
 	if (res)
-		return res;
+		goto out;
 
 	switch (be16_to_cpu(*((__be16 *)data))) {
 	case HFS_OLD_PMAP_MAGIC:
@@ -147,7 +147,7 @@ int hfs_part_find(struct super_block *sb,
 		res = -ENOENT;
 		break;
 	}
-
+out:
 	kfree(data);
 	return res;
 }
-- 
cgit v1.2.3


From a1dbcef0172555464b5329f8ba47d43c98132dfa Mon Sep 17 00:00:00 2001
From: Chuck Ebbert <cebbert@redhat.com>
Date: Wed, 2 Feb 2011 10:55:06 -0500
Subject: hfsplus: fix two memory leaks in wrapper.c

Signed-Off-By: Chuck Ebbert <cebbert@redhat.com>
Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/wrapper.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 196231794f6..3031d81f5f0 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -167,7 +167,7 @@ reread:
 		break;
 	case cpu_to_be16(HFSP_WRAP_MAGIC):
 		if (!hfsplus_read_mdb(sbi->s_vhdr, &wd))
-			goto out;
+			goto out_free_backup_vhdr;
 		wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT;
 		part_start += wd.ablk_start + wd.embed_start * wd.ablk_size;
 		part_size = wd.embed_count * wd.ablk_size;
@@ -179,7 +179,7 @@ reread:
 		 * (should do this only for cdrom/loop though)
 		 */
 		if (hfs_part_find(sb, &part_start, &part_size))
-			goto out;
+			goto out_free_backup_vhdr;
 		goto reread;
 	}
 
-- 
cgit v1.2.3


From 1065348d472f97b4b8eb53b60ec67e99148cbbca Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@tuxera.com>
Date: Wed, 2 Feb 2011 09:40:33 -0700
Subject: hfsplus: fix up a comparism in hfsplus_file_extend

Revert an incorrect hunk from commit b2837fcf4994e699a4def002e26f274d95b387c1,

	"hfsplus: %L-to-%ll, macro correction, and remove unneeded braces"

revert a pointless change of comparison operation argument order, which turned
out to not even be equivalent.

Reported-by: Joe Perches <joe@perches.com>
Signed-off-by: Christoph Hellwig <hch@tuxera.com>
---
 fs/hfsplus/extents.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 52a0bcaa7b6..b1991a2a08e 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -397,8 +397,8 @@ int hfsplus_file_extend(struct inode *inode)
 	u32 start, len, goal;
 	int res;
 
-	if (sbi->total_blocks - sbi->free_blocks + 8 >
-			sbi->alloc_file->i_size * 8) {
+	if (sbi->alloc_file->i_size * 8 <
+	    sbi->total_blocks - sbi->free_blocks + 8) {
 		/* extend alloc file */
 		printk(KERN_ERR "hfs: extend alloc file! "
 				"(%llu,%u,%u)\n",
-- 
cgit v1.2.3


From 1f7da214e26a8ee4fbb66af50e27147d5d115c5a Mon Sep 17 00:00:00 2001
From: Amerigo Wang <amwang@redhat.com>
Date: Tue, 18 Jan 2011 13:09:21 -0800
Subject: debugfs: remove module_exit()

debugfs can't be a module, so module_exit() is meaningless for it.

Signed-off-by: WANG Cong <amwang@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/debugfs/inode.c | 15 ---------------
 1 file changed, 15 deletions(-)

(limited to 'fs')

diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 37a8ca7c122..d38c88fb63a 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -13,9 +13,6 @@
  *
  */
 
-/* uncomment to get debug messages from the debug filesystem, ah the irony. */
-/* #define DEBUG */
-
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/mount.h>
@@ -540,17 +537,5 @@ static int __init debugfs_init(void)
 
 	return retval;
 }
-
-static void __exit debugfs_exit(void)
-{
-	debugfs_registered = false;
-
-	simple_release_fs(&debugfs_mount, &debugfs_mount_count);
-	unregister_filesystem(&debug_fs_type);
-	kobject_put(debug_kobj);
-}
-
 core_initcall(debugfs_init);
-module_exit(debugfs_exit);
-MODULE_LICENSE("GPL");
 
-- 
cgit v1.2.3


From 76429c148b939f5a6863c0a024eb8960ae91469a Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <piastry@etersoft.ru>
Date: Mon, 31 Jan 2011 16:03:08 +0300
Subject: CIFS: Fix variable types in cifs_iovec_read/write (try #2)

Variable 'i' should be unsigned long as it's used in a loop with num_pages,
and total_read/total_written should be ssize_t to match the return type.

Signed-off-by: Pavel Shilovsky <piastry@etersoft.ru>
Reviewed-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/file.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 74c0a282d01..e964b1cd5dd 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1662,10 +1662,10 @@ static ssize_t
 cifs_iovec_write(struct file *file, const struct iovec *iov,
 		 unsigned long nr_segs, loff_t *poffset)
 {
-	size_t total_written = 0;
-	unsigned int written = 0;
-	unsigned long num_pages, npages;
-	size_t copied, len, cur_len, i;
+	unsigned int written;
+	unsigned long num_pages, npages, i;
+	size_t copied, len, cur_len;
+	ssize_t total_written = 0;
 	struct kvec *to_send;
 	struct page **pages;
 	struct iov_iter it;
@@ -1821,7 +1821,8 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
 {
 	int rc;
 	int xid;
-	unsigned int total_read, bytes_read = 0;
+	ssize_t total_read;
+	unsigned int bytes_read = 0;
 	size_t len, cur_len;
 	int iov_offset = 0;
 	struct cifs_sb_info *cifs_sb;
-- 
cgit v1.2.3


From 78d2978874e4e10e97dfd4fd79db45bdc0748550 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Date: Fri, 4 Feb 2011 18:13:24 +0000
Subject: CRED: Fix kernel panic upon security_file_alloc() failure.

In get_empty_filp() since 2.6.29, file_free(f) is called with f->f_cred == NULL
when security_file_alloc() returned an error.  As a result, kernel will panic()
due to put_cred(NULL) call within RCU callback.

Fix this bug by assigning f->f_cred before calling security_file_alloc().

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/file_table.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/file_table.c b/fs/file_table.c
index c3e89adf53c..eb36b6b17e2 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -125,13 +125,13 @@ struct file *get_empty_filp(void)
 		goto fail;
 
 	percpu_counter_inc(&nr_files);
+	f->f_cred = get_cred(cred);
 	if (security_file_alloc(f))
 		goto fail_sec;
 
 	INIT_LIST_HEAD(&f->f_u.fu_list);
 	atomic_long_set(&f->f_count, 1);
 	rwlock_init(&f->f_owner.lock);
-	f->f_cred = get_cred(cred);
 	spin_lock_init(&f->f_lock);
 	eventpoll_init_file(f);
 	/* f->f_version: 0 */
-- 
cgit v1.2.3


From 64474bdd07f673cc48509ea0375274422c8f73bf Mon Sep 17 00:00:00 2001
From: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Date: Thu, 3 Feb 2011 14:31:18 -0600
Subject: cifs: Possible slab memory corruption while updating extended stats
 (repost)

Updating extended statistics here can cause slab memory corruption
if a callback function frees slab memory (mid_entry).

Signed-off-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 945b2202275..1f32a2893b5 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -633,11 +633,11 @@ incomplete_rcv:
 				mid_entry->largeBuf = isLargeBuf;
 multi_t2_fnd:
 				mid_entry->midState = MID_RESPONSE_RECEIVED;
-				list_del_init(&mid_entry->qhead);
-				mid_entry->callback(mid_entry);
 #ifdef CONFIG_CIFS_STATS2
 				mid_entry->when_received = jiffies;
 #endif
+				list_del_init(&mid_entry->qhead);
+				mid_entry->callback(mid_entry);
 				break;
 			}
 			mid_entry = NULL;
-- 
cgit v1.2.3


From e3f0dadb2b44746f6223ce4560406d19e02fb1cc Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 4 Feb 2011 07:21:26 -0500
Subject: cifs: enable signing flag in SMB header when server has it on

cifs_sign_smb only generates a signature if the correct Flags2 bit is
set. Make sure that it gets set correctly if we're sending an async
call.

This patch fixes:

    https://bugzilla.kernel.org/show_bug.cgi?id=28142

Reported-and-Tested-by: JG <jg@cms.ac>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/transport.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index b8c5e2eb43d..fbc5aace54b 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -359,6 +359,10 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
 	if (rc)
 		return rc;
 
+	/* enable signing if server requires it */
+	if (server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+		in_buf->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
+
 	mutex_lock(&server->srv_mutex);
 	mid = AllocMidQEntry(in_buf, server);
 	if (mid == NULL) {
-- 
cgit v1.2.3


From 247ec9b418ba50c9022280035330059364d54540 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 4 Feb 2011 17:09:50 -0500
Subject: cifs: don't send an echo request unless NegProt has been done

When the socket to the server is disconnected, the client more or less
immediately calls cifs_reconnect to reconnect the socket. The NegProt
and SessSetup however are not done until an actual call needs to be
made.

With the addition of the SMB echo code, it's possible that the server
will initiate a disconnect on an idle socket. The client will then
reconnect the socket but no NegotiateProtocol request is done. The
SMBEcho workqueue job will then eventually pop, and an SMBEcho will be
sent on the socket. The server will then reject it since no NegProt was
done.

The ideal fix would be to either have the socket not be reconnected
until we plan to use it, or to immediately do a NegProt when the
reconnect occurs. The code is not structured for this however. For now
we must just settle for not sending any echoes until the NegProt is
done.

Reported-by: JG <jg@cms.ac>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 1f32a2893b5..257b6d895e2 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -337,8 +337,12 @@ cifs_echo_request(struct work_struct *work)
 	struct TCP_Server_Info *server = container_of(work,
 					struct TCP_Server_Info, echo.work);
 
-	/* no need to ping if we got a response recently */
-	if (time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ))
+	/*
+	 * We cannot send an echo until the NEGOTIATE_PROTOCOL request is done.
+	 * Also, no need to ping if we got a response recently
+	 */
+	if (server->tcpStatus != CifsGood ||
+	    time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ))
 		goto requeue_echo;
 
 	rc = CIFSSMBEcho(server);
-- 
cgit v1.2.3


From e8e1ba96b207deba1339b09983f8b29f92cb1497 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 4 Feb 2011 20:45:58 -0800
Subject: ceph: queue cap_snaps once per realm

We were forming a dirty list, and then queueing cap_snaps for each realm
_and_ its children, regardless of whether the children were already in the
dirty list.  This meant we did it twice for some realms.  Which in turn
meant we corrupted mdsc->snap_flush_list when the cap_snap was re-added to
the list it was already on, and could trigger an infinite loop.

We were also using recursion to reach all the children, a no-no when
stack is limited.

Instead, (re)queue any children on the dirty list, avoiding processing
anything twice and avoiding any recursion.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/snap.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 39c243acd06..f40b9139e43 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -584,10 +584,14 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
 	if (lastinode)
 		iput(lastinode);
 
-	dout("queue_realm_cap_snaps %p %llx children\n", realm, realm->ino);
-	list_for_each_entry(child, &realm->children, child_item)
-		queue_realm_cap_snaps(child);
+	list_for_each_entry(child, &realm->children, child_item) {
+		dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n",
+		     realm, realm->ino, child, child->ino);
+		list_del_init(&child->dirty_item);
+		list_add(&child->dirty_item, &realm->dirty_item);
+	}
 
+	list_del_init(&realm->dirty_item);
 	dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
 }
 
@@ -683,7 +687,9 @@ more:
 	 * queue cap snaps _after_ we've built the new snap contexts,
 	 * so that i_head_snapc can be set appropriately.
 	 */
-	list_for_each_entry(realm, &dirty_realms, dirty_item) {
+	while (!list_empty(&dirty_realms)) {
+		realm = list_first_entry(&dirty_realms, struct ceph_snap_realm,
+					 dirty_item);
 		queue_realm_cap_snaps(realm);
 	}
 
-- 
cgit v1.2.3


From 8132b65bc6ce6d9a4baafdfc28c7cd9c258ed6e4 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <kernel@fomichev.me>
Date: Sun, 6 Feb 2011 02:05:28 +0300
Subject: cifs: add check for kmalloc in parse_dacl

Exit from parse_dacl if no memory returned from the call to kmalloc.

Signed-off-by: Stanislav Fomichev <kernel@fomichev.me>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsacl.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 1e7636b145a..beeebf19423 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -372,6 +372,10 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
 
 		ppace = kmalloc(num_aces * sizeof(struct cifs_ace *),
 				GFP_KERNEL);
+		if (!ppace) {
+			cERROR(1, "DACL memory allocation error");
+			return;
+		}
 
 		for (i = 0; i < num_aces; ++i) {
 			ppace[i] = (struct cifs_ace *) (acl_base + acl_size);
-- 
cgit v1.2.3


From 13dbc08987f25d9dba488a34b44b43e3844b027c Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 3 Feb 2011 02:39:52 +0000
Subject: Btrfs: make sure search_bitmap finds something in remove_from_bitmap

When we're cleaning up the tree log we need to be able to remove free space from
the block group.  The problem is if that free space spans bitmaps we would not
find the space since we're looking for too many bytes.  So make sure the amount
of bytes we search for is limited to either the number of bytes we want, or the
number of bytes left in the bitmap.  This was tested by a user who was hitting
the BUG() after search_bitmap.  With this patch he can now mount his fs.
Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/free-space-cache.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index a5501edc3c9..a0390657451 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1216,6 +1216,7 @@ again:
 	 */
 	search_start = *offset;
 	search_bytes = *bytes;
+	search_bytes = min(search_bytes, end - search_start + 1);
 	ret = search_bitmap(block_group, bitmap_info, &search_start,
 			    &search_bytes);
 	BUG_ON(ret < 0 || search_start != *offset);
-- 
cgit v1.2.3


From 3c14874acc71180553fb5aba528e3cf57c5b958b Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Wed, 2 Feb 2011 15:53:47 +0000
Subject: Btrfs: exclude super blocks when we read in block groups

This has been resulting in a BUG_ON(ret) after btrfs_reserve_extent in
btrfs_cow_file_range.  The reason is we don't actually calculate the bytes_super
for a block group until we go to cache it, which means that the space_info can
hand out reservations for space that it doesn't actually have, and we can run
out of data space.  This is also a problem if you are using space caching since
we don't ever calculate bytes_super for the block groups.  So instead, every time
we read a block group, call exclude_super_stripes, which calculates the
bytes_super for the block group so it can be left out of the space_info.  Then
whenever caching completes we just call free_excluded_extents so that the super
excluded extents are freed up.  Also if we are unmounting and we hit any block
groups that haven't been cached we still need to call free_excluded_extents to
make sure things are cleaned up properly.  Thanks,

Reported-by: Arne Jansen <sensille@gmx.net>
Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f07ba21cbf0..565e22d77b1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -320,11 +320,6 @@ static int caching_kthread(void *data)
 	if (!path)
 		return -ENOMEM;
 
-	exclude_super_stripes(extent_root, block_group);
-	spin_lock(&block_group->space_info->lock);
-	block_group->space_info->bytes_readonly += block_group->bytes_super;
-	spin_unlock(&block_group->space_info->lock);
-
 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
 
 	/*
@@ -467,8 +462,10 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 			cache->cached = BTRFS_CACHE_NO;
 		}
 		spin_unlock(&cache->lock);
-		if (ret == 1)
+		if (ret == 1) {
+			free_excluded_extents(fs_info->extent_root, cache);
 			return 0;
+		}
 	}
 
 	if (load_cache_only)
@@ -4036,6 +4033,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 
 	num_bytes = ALIGN(num_bytes, root->sectorsize);
 	atomic_dec(&BTRFS_I(inode)->outstanding_extents);
+	WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0);
 
 	spin_lock(&BTRFS_I(inode)->accounting_lock);
 	nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
@@ -8325,6 +8323,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 		if (block_group->cached == BTRFS_CACHE_STARTED)
 			wait_block_group_cache_done(block_group);
 
+		/*
+		 * We haven't cached this block group, which means we could
+		 * possibly have excluded extents on this block group.
+		 */
+		if (block_group->cached == BTRFS_CACHE_NO)
+			free_excluded_extents(info->extent_root, block_group);
+
 		btrfs_remove_free_space_cache(block_group);
 		btrfs_put_block_group(block_group);
 
@@ -8439,6 +8444,13 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		cache->flags = btrfs_block_group_flags(&cache->item);
 		cache->sectorsize = root->sectorsize;
 
+		/*
+		 * We need to exclude the super stripes now so that the space
+		 * info has super bytes accounted for, otherwise we'll think
+		 * we have more space than we actually do.
+		 */
+		exclude_super_stripes(root, cache);
+
 		/*
 		 * check for two cases, either we are full, and therefore
 		 * don't need to bother with the caching work since we won't
@@ -8447,12 +8459,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		 * time, particularly in the full case.
 		 */
 		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
-			exclude_super_stripes(root, cache);
 			cache->last_byte_to_unpin = (u64)-1;
 			cache->cached = BTRFS_CACHE_FINISHED;
 			free_excluded_extents(root, cache);
 		} else if (btrfs_block_group_used(&cache->item) == 0) {
-			exclude_super_stripes(root, cache);
 			cache->last_byte_to_unpin = (u64)-1;
 			cache->cached = BTRFS_CACHE_FINISHED;
 			add_new_free_space(cache, root->fs_info,
-- 
cgit v1.2.3


From 554233a6e0e8557e8e81e54cc70628d101291122 Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Thu, 3 Feb 2011 03:16:25 +0000
Subject: btrfs: cleanup error handling in btrfs_unlink_inode()

When btrfs_alloc_path() fails, btrfs_free_path() need not be called.
Therefore, branch to the 'out' label instead of 'err' in that case.

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 36bc3f49ebf..c9bc0afdbfc 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2646,7 +2646,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	if (!path) {
 		ret = -ENOMEM;
-		goto err;
+		goto out;
 	}
 
 	path->leave_spinning = 1;
-- 
cgit v1.2.3


From 8e4eef7a60eeca0fe7503e5cbd3b24ff4941c732 Mon Sep 17 00:00:00 2001
From: Alexey Charkov <alchark@gmail.com>
Date: Wed, 2 Feb 2011 21:15:35 +0000
Subject: btrfs: Drop __exit attribute on btrfs_exit_compress

As this function is called in some error paths while not
removing the module, the __exit attribute prevents the kernel
image from linking when btrfs is compiled in statically.

Signed-off-by: Alexey Charkov <alchark@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/compression.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 3a932f183da..4d2110eafe2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -921,7 +921,7 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
 	return ret;
 }
 
-void __exit btrfs_exit_compress(void)
+void btrfs_exit_compress(void)
 {
 	free_workspaces();
 }
-- 
cgit v1.2.3


From 822ed64c5b5d15474c6abb1834726643e2cff558 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Sun, 6 Feb 2011 14:49:26 +0200
Subject: UBIFS: remove double semicolon

Just a tiny clean-up - remove ;;

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/scan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index 3e1ee57dbea..36216b46f77 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -328,7 +328,7 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
 		if (!quiet)
 			ubifs_err("empty space starts at non-aligned offset %d",
 				  offs);
-		goto corrupted;;
+		goto corrupted;
 	}
 
 	ubifs_end_scan(c, sleb, lnum, offs);
-- 
cgit v1.2.3


From be7b42a5cb4c5050bcab4f57022007155c119d45 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Sun, 6 Feb 2011 16:41:06 +0200
Subject: UBIFS: describe UBIFS recovery logic some more

This patch adds more comments about the UBIFS recovery logic, which should
explain the famous UBIFS "corrupt empty space" errors.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/recovery.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'fs')

diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 77e9b874b6c..6ecbc91ef9a 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -28,6 +28,23 @@
  * UBIFS always cleans away all remnants of an unclean un-mount, so that
  * errors do not accumulate. However UBIFS defers recovery if it is mounted
  * read-only, and the flash is not modified in that case.
+ *
+ * The general UBIFS approach to the recovery is that it recovers from
+ * corruptions which could be caused by power cuts, but it refuses to recover
+ * from corruption caused by other reasons. And UBIFS tries to distinguish
+ * between these 2 reasons of corruptions and silently recover in the former
+ * case and loudly complain in the latter case.
+ *
+ * UBIFS writes only to erased LEBs, so it writes only to the flash space
+ * containing only 0xFFs. UBIFS also always writes strictly from the beginning
+ * of the LEB to the end. And UBIFS assumes that the underlying flash media
+ * writes in @c->min_io_unit bytes at a time.
+ *
+ * Hence, if UBIFS finds a corrupted node at offset X, it expects only the min.
+ * I/O unit corresponding to offset X to contain corrupted data, all the
+ * following min. I/O units have to contain empty space (all 0xFFs). If this is
+ * not true, the corruption cannot be the result of a power cut, and UBIFS
+ * refuses to mount.
  */
 
 #include <linux/crc32.h>
@@ -671,6 +688,10 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
 		} else {
 			int corruption = first_non_ff(buf, len);
 
+			/*
+			 * See header comment for this file for more
+			 * explanations about the reasons we have this check.
+			 */
 			ubifs_err("corrupt empty space LEB %d:%d, corruption "
 				  "starts at %d", lnum, offs, corruption);
 			/* Make sure we dump interesting non-0xFF data */
-- 
cgit v1.2.3


From 8c559d30b4e59cf6994215ada1fe744928f494bf Mon Sep 17 00:00:00 2001
From: Vasiliy Kulikov <segoon@openwall.com>
Date: Fri, 4 Feb 2011 15:24:19 +0300
Subject: UBIFS: restrict world-writable debugfs files

Don't allow everybody to dump sensitive information about filesystems.

Signed-off-by: Vasiliy Kulikov <segoon@openwall.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/debug.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 0bee4dbffc3..bcb1acb7926 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2813,19 +2813,19 @@ int dbg_debugfs_init_fs(struct ubifs_info *c)
 	}
 
 	fname = "dump_lprops";
-	dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
+	dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops);
 	if (IS_ERR(dent))
 		goto out_remove;
 	d->dfs_dump_lprops = dent;
 
 	fname = "dump_budg";
-	dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
+	dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops);
 	if (IS_ERR(dent))
 		goto out_remove;
 	d->dfs_dump_budg = dent;
 
 	fname = "dump_tnc";
-	dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
+	dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops);
 	if (IS_ERR(dent))
 		goto out_remove;
 	d->dfs_dump_tnc = dent;
-- 
cgit v1.2.3


From d402539b8fc3fa21f16eb5e654be742670399e8a Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 7 Feb 2011 08:54:35 -0500
Subject: cifs: remove checks for ses->status == CifsExiting

ses->status is never set to CifsExiting, so these checks are
always false.

Tested-by: JG <jg@cms.ac>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifssmb.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 46c66ed01af..904aa47e351 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -136,9 +136,6 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
 		}
 	}
 
-	if (ses->status == CifsExiting)
-		return -EIO;
-
 	/*
 	 * Give demultiplex thread up to 10 seconds to reconnect, should be
 	 * greater than cifs socket timeout which is 7 seconds
@@ -156,7 +153,7 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
 		 * retrying until process is killed or server comes
 		 * back on-line
 		 */
-		if (!tcon->retry || ses->status == CifsExiting) {
+		if (!tcon->retry) {
 			cFYI(1, "gave up waiting on reconnect in smb_init");
 			return -EHOSTDOWN;
 		}
-- 
cgit v1.2.3


From d50bdd5aa55127635fd8a5c74bd2abb256bd34e3 Mon Sep 17 00:00:00 2001
From: Curt Wohlgemuth <curtw@google.com>
Date: Mon, 7 Feb 2011 12:46:14 -0500
Subject: ext4: Fix data corruption with multi-block writepages support

This fixes a corruption problem with the multi-block
writepages submittal change for ext4, from commit
bd2d0210cf22f2bd0cef72eb97cf94fc7d31d8cc ("ext4: use bio
layer instead of buffer layer in mpage_da_submit_io").

(Note that this corruption is not present in 2.6.37 on
ext4, because the corruption was detected after the
feature was merged in 2.6.37-rc1, and so it was turned
off by adding a non-default mount option,
mblk_io_submit.  With this commit, which hopefully
fixes the last of the bugs with this feature, we'll be
able to turn on this performance feature by default in
2.6.38, and remove the mblk_io_submit option.)

The ext4 code path to bundle multiple pages for
writeback in ext4_bio_write_page() had a bug: we should
be clearing buffer head dirty flags *before* we submit
the bio, not in the completion routine.

The patch below was tested on 2.6.37 under KVM with the
postgresql script which was submitted by Jon Nelson as
documented in commit 1449032be1.

Without the patch, I'd hit the corruption problem about
50-70% of the time.  With the patch, I executed the
script > 100 times with no corruption seen.

I also fixed a bug to make sure ext4_end_bio() doesn't
dereference the bio after the bio_put() call.

Reported-by: Jon Nelson <jnelson@jamponi.net>
Reported-by: Matthias Bayer <jackdachef@gmail.com>
Signed-off-by: Curt Wohlgemuth <curtw@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/page-io.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 7270dcfca92..4e9b0a242f4 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -190,6 +190,7 @@ static void ext4_end_bio(struct bio *bio, int error)
 	struct inode *inode;
 	unsigned long flags;
 	int i;
+	sector_t bi_sector = bio->bi_sector;
 
 	BUG_ON(!io_end);
 	bio->bi_private = NULL;
@@ -207,9 +208,7 @@ static void ext4_end_bio(struct bio *bio, int error)
 		if (error)
 			SetPageError(page);
 		BUG_ON(!head);
-		if (head->b_size == PAGE_CACHE_SIZE)
-			clear_buffer_dirty(head);
-		else {
+		if (head->b_size != PAGE_CACHE_SIZE) {
 			loff_t offset;
 			loff_t io_end_offset = io_end->offset + io_end->size;
 
@@ -221,7 +220,6 @@ static void ext4_end_bio(struct bio *bio, int error)
 					if (error)
 						buffer_io_error(bh);
 
-					clear_buffer_dirty(bh);
 				}
 				if (buffer_delay(bh))
 					partial_write = 1;
@@ -257,7 +255,7 @@ static void ext4_end_bio(struct bio *bio, int error)
 			     (unsigned long long) io_end->offset,
 			     (long) io_end->size,
 			     (unsigned long long)
-			     bio->bi_sector >> (inode->i_blkbits - 9));
+			     bi_sector >> (inode->i_blkbits - 9));
 	}
 
 	/* Add the io_end to per-inode completed io list*/
@@ -380,6 +378,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 
 	blocksize = 1 << inode->i_blkbits;
 
+	BUG_ON(!PageLocked(page));
 	BUG_ON(PageWriteback(page));
 	set_page_writeback(page);
 	ClearPageError(page);
@@ -397,12 +396,14 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 	for (bh = head = page_buffers(page), block_start = 0;
 	     bh != head || !block_start;
 	     block_start = block_end, bh = bh->b_this_page) {
+
 		block_end = block_start + blocksize;
 		if (block_start >= len) {
 			clear_buffer_dirty(bh);
 			set_buffer_uptodate(bh);
 			continue;
 		}
+		clear_buffer_dirty(bh);
 		ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
 		if (ret) {
 			/*
-- 
cgit v1.2.3


From 3a90983dbdcb2f4f48c0d771d8e5b4d88f27fae6 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@linux.intel.com>
Date: Tue, 18 Jan 2011 13:34:40 +0800
Subject: Btrfs: Fix page count calculation

Take the offset of the start position into account when calculating the page count.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9e097fbfc78..b0ff34b9660 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -991,8 +991,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 		size_t write_bytes = min(iov_iter_count(&i),
 					 nrptrs * (size_t)PAGE_CACHE_SIZE -
 					 offset);
-		size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
-					PAGE_CACHE_SHIFT;
+		size_t num_pages = (write_bytes + offset +
+				    PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
 		WARN_ON(num_pages > nrptrs);
 		memset(pages, 0, sizeof(struct page *) * nrptrs);
@@ -1022,8 +1022,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 
 		copied = btrfs_copy_from_user(pos, num_pages,
 					   write_bytes, pages, &i);
-		dirty_pages = (copied + PAGE_CACHE_SIZE - 1) >>
-					PAGE_CACHE_SHIFT;
+		dirty_pages = (copied + offset + PAGE_CACHE_SIZE - 1) >>
+				PAGE_CACHE_SHIFT;
 
 		if (num_pages > dirty_pages) {
 			if (copied > 0)
-- 
cgit v1.2.3


From 04e99455ea5bb17ea7c2e7bb0970168efb736242 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 25 Jan 2011 09:06:19 +0000
Subject: xfs: only lock the rt bitmap inode once per allocation

Currently both xfs_rtpick_extent and xfs_rtallocate_extent call
xfs_trans_iget to grab and lock the rt bitmap inode, which results in a
deadlock since the removal of the lock recursion counters in commit

	"xfs: simplify inode to transaction joining"

Fix this by acquiring and locking the inode in xfs_bmap_rtalloc before
calling into xfs_rtpick_extent and xfs_rtallocate_extent.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_bmap.c    | 11 +++++++++++
 fs/xfs/xfs_rtalloc.c | 34 +++++++++++++---------------------
 2 files changed, 24 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index dc3afd7739f..2f89af25996 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2333,6 +2333,7 @@ xfs_bmap_rtalloc(
 	xfs_extlen_t	prod = 0;	/* product factor for allocators */
 	xfs_extlen_t	ralen = 0;	/* realtime allocation length */
 	xfs_extlen_t	align;		/* minimum allocation alignment */
+	xfs_inode_t	*ip;		/* bitmap incore inode */
 	xfs_rtblock_t	rtb;
 
 	mp = ap->ip->i_mount;
@@ -2365,6 +2366,16 @@ xfs_bmap_rtalloc(
 	 */
 	if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
 		ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
+
+	/*
+	 * Lock out other modifications to the RT bitmap inode.
+	 */
+	error = xfs_trans_iget(mp, ap->tp, mp->m_sb.sb_rbmino, 0,
+			       XFS_ILOCK_EXCL, &ip);
+	if (error)
+		return error;
+	ASSERT(ip == mp->m_rbmip);
+
 	/*
 	 * If it's an allocation to an empty file at offset 0,
 	 * pick an extent that will space things out in the rt area.
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 12a19138531..037fab14024 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -2075,15 +2075,15 @@ xfs_rtallocate_extent(
 	xfs_extlen_t	prod,		/* extent product factor */
 	xfs_rtblock_t	*rtblock)	/* out: start block allocated */
 {
+	xfs_mount_t	*mp = tp->t_mountp;
 	int		error;		/* error value */
-	xfs_inode_t	*ip;		/* inode for bitmap file */
-	xfs_mount_t	*mp;		/* file system mount structure */
 	xfs_rtblock_t	r;		/* result allocated block */
 	xfs_fsblock_t	sb;		/* summary file block number */
 	xfs_buf_t	*sumbp;		/* summary file block buffer */
 
+	ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
 	ASSERT(minlen > 0 && minlen <= maxlen);
-	mp = tp->t_mountp;
+
 	/*
 	 * If prod is set then figure out what to do to minlen and maxlen.
 	 */
@@ -2099,12 +2099,7 @@ xfs_rtallocate_extent(
 			return 0;
 		}
 	}
-	/*
-	 * Lock out other callers by grabbing the bitmap inode lock.
-	 */
-	if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
-					XFS_ILOCK_EXCL, &ip)))
-		return error;
+
 	sumbp = NULL;
 	/*
 	 * Allocate by size, or near another block, or exactly at some block.
@@ -2123,11 +2118,12 @@ xfs_rtallocate_extent(
 				len, &sumbp, &sb, prod, &r);
 		break;
 	default:
+		error = EIO;
 		ASSERT(0);
 	}
-	if (error) {
+	if (error)
 		return error;
-	}
+
 	/*
 	 * If it worked, update the superblock.
 	 */
@@ -2306,20 +2302,16 @@ xfs_rtpick_extent(
 	xfs_rtblock_t	*pick)		/* result rt extent */
 {
 	xfs_rtblock_t	b;		/* result block */
-	int		error;		/* error return value */
-	xfs_inode_t	*ip;		/* bitmap incore inode */
 	int		log2;		/* log of sequence number */
 	__uint64_t	resid;		/* residual after log removed */
 	__uint64_t	seq;		/* sequence number of file creation */
 	__uint64_t	*seqp;		/* pointer to seqno in inode */
 
-	if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
-					XFS_ILOCK_EXCL, &ip)))
-		return error;
-	ASSERT(ip == mp->m_rbmip);
-	seqp = (__uint64_t *)&ip->i_d.di_atime;
-	if (!(ip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) {
-		ip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
+	ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
+
+	seqp = (__uint64_t *)&mp->m_rbmip->i_d.di_atime;
+	if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) {
+		mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
 		*seqp = 0;
 	}
 	seq = *seqp;
@@ -2335,7 +2327,7 @@ xfs_rtpick_extent(
 			b = mp->m_sb.sb_rextents - len;
 	}
 	*seqp = seq + 1;
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
 	*pick = b;
 	return 0;
 }
-- 
cgit v1.2.3


From 0d8b30ad19bf13197cbcd786e2cd5a2ecef72e68 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 25 Jan 2011 09:06:20 +0000
Subject: xfs: fix xfs_get_extsz_hint for a zero extent size hint

We can easily set the extsize flag without setting an extent size
hint, or one that evaluates to zero.  Historically the di_extsize
field was only used when it was non-zero, but the commit

	"Cleanup inode extent size hint extraction"

broke this.  Restore the old behaviour, thus fixing xfsqa 090 with
a debug kernel.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_rw.c | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index 56861d5daae..ccd3adf640e 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -173,17 +173,9 @@ xfs_extlen_t
 xfs_get_extsz_hint(
 	struct xfs_inode	*ip)
 {
-	xfs_extlen_t		extsz;
-
-	if (unlikely(XFS_IS_REALTIME_INODE(ip))) {
-		extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
-				? ip->i_d.di_extsize
-				: ip->i_mount->m_sb.sb_rextsize;
-		ASSERT(extsz);
-	} else {
-		extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
-				? ip->i_d.di_extsize : 0;
-	}
-
-	return extsz;
+	if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
+		return ip->i_d.di_extsize;
+	if (XFS_IS_REALTIME_INODE(ip))
+		return ip->i_mount->m_sb.sb_rextsize;
+	return 0;
 }
-- 
cgit v1.2.3


From 9681153b460006923bb1e9d39b05b80ec09d6b4e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 25 Jan 2011 09:06:21 +0000
Subject: xfs: add lockdep annotations for the rt inodes

The rt bitmap and summary inodes do not participate in the normal inode
locking protocol.  Instead the rt bitmap inode can be locked in any
transaction involving rt allocations, and both of the rt inodes can
be locked at the same time.  Add specific lockdep subclasses for the rt
inodes to prevent lockdep from blowing up.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_bmap.c    |  2 +-
 fs/xfs/xfs_inode.h   | 23 +++++++++++++++--------
 fs/xfs/xfs_rtalloc.c | 16 ++++++++++------
 3 files changed, 26 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 2f89af25996..d8d09066528 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2371,7 +2371,7 @@ xfs_bmap_rtalloc(
 	 * Lock out other modifications to the RT bitmap inode.
 	 */
 	error = xfs_trans_iget(mp, ap->tp, mp->m_sb.sb_rbmino, 0,
-			       XFS_ILOCK_EXCL, &ip);
+			       XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP, &ip);
 	if (error)
 		return error;
 	ASSERT(ip == mp->m_rbmip);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 5c95fa8ec11..f753200cef8 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -409,28 +409,35 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
 /*
  * Flags for lockdep annotations.
  *
- * XFS_I[O]LOCK_PARENT - for operations that require locking two inodes
- * (ie directory operations that require locking a directory inode and
- * an entry inode).  The first inode gets locked with this flag so it
- * gets a lockdep subclass of 1 and the second lock will have a lockdep
- * subclass of 0.
+ * XFS_LOCK_PARENT - for directory operations that require locking a
+ * parent directory inode and a child entry inode.  The parent gets locked
+ * with this flag so it gets a lockdep subclass of 1 and the child entry
+ * lock will have a lockdep subclass of 0.
+ *
+ * XFS_LOCK_RTBITMAP/XFS_LOCK_RTSUM - the realtime device bitmap and summary
+ * inodes do not participate in the normal lock order, and thus have their
+ * own subclasses.
  *
  * XFS_LOCK_INUMORDER - for locking several inodes at the some time
  * with xfs_lock_inodes().  This flag is used as the starting subclass
  * and each subsequent lock acquired will increment the subclass by one.
- * So the first lock acquired will have a lockdep subclass of 2, the
- * second lock will have a lockdep subclass of 3, and so on. It is
+ * So the first lock acquired will have a lockdep subclass of 4, the
+ * second lock will have a lockdep subclass of 5, and so on. It is
  * the responsibility of the class builder to shift this to the correct
  * portion of the lock_mode lockdep mask.
  */
 #define XFS_LOCK_PARENT		1
-#define XFS_LOCK_INUMORDER	2
+#define XFS_LOCK_RTBITMAP	2
+#define XFS_LOCK_RTSUM		3
+#define XFS_LOCK_INUMORDER	4
 
 #define XFS_IOLOCK_SHIFT	16
 #define	XFS_IOLOCK_PARENT	(XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT)
 
 #define XFS_ILOCK_SHIFT		24
 #define	XFS_ILOCK_PARENT	(XFS_LOCK_PARENT << XFS_ILOCK_SHIFT)
+#define	XFS_ILOCK_RTBITMAP	(XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT)
+#define	XFS_ILOCK_RTSUM		(XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT)
 
 #define XFS_IOLOCK_DEP_MASK	0x00ff0000
 #define XFS_ILOCK_DEP_MASK	0xff000000
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 037fab14024..f592ac97818 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1972,8 +1972,10 @@ xfs_growfs_rt(
 		/*
 		 * Lock out other callers by grabbing the bitmap inode lock.
 		 */
-		if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
-						XFS_ILOCK_EXCL, &ip)))
+		error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
+				       XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP,
+				       &ip);
+		if (error)
 			goto error_cancel;
 		ASSERT(ip == mp->m_rbmip);
 		/*
@@ -1986,8 +1988,9 @@ xfs_growfs_rt(
 		/*
 		 * Get the summary inode into the transaction.
 		 */
-		if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0,
-						XFS_ILOCK_EXCL, &ip)))
+		error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0,
+				       XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM, &ip);
+		if (error)
 			goto error_cancel;
 		ASSERT(ip == mp->m_rsumip);
 		/*
@@ -2160,8 +2163,9 @@ xfs_rtfree_extent(
 	/*
 	 * Synchronize by locking the bitmap inode.
 	 */
-	if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
-					XFS_ILOCK_EXCL, &ip)))
+	error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
+			       XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP, &ip);
+	if (error)
 		return error;
 #if defined(__KERNEL__) && defined(DEBUG)
 	/*
-- 
cgit v1.2.3


From e79a46a0302a6bf8f879da43c00373b6ed1081ea Mon Sep 17 00:00:00 2001
From: Abhijith Das <adas@redhat.com>
Date: Mon, 7 Feb 2011 11:22:41 -0500
Subject: GFS2: panics on quotacheck update

Handle block allocation for forceful unstuffing of quota dinode during quota
update using quotactl(). Also fix block reservation for special cases when
quotas cross over block boundaries and update 2 blocks instead of 1.

Signed-off-by: Abhi Das <adas@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/quota.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index a689901963d..6ec964c31dc 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1587,6 +1587,8 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
 
 	offset = qd2offset(qd);
 	alloc_required = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota));
+	if (gfs2_is_stuffed(ip))
+		alloc_required = 1;
 	if (alloc_required) {
 		al = gfs2_alloc_get(ip);
 		if (al == NULL)
@@ -1600,7 +1602,9 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
 		blocks += gfs2_rg_blocks(al);
 	}
 
-	error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0);
+	/* Some quotas span block boundaries and can update two blocks,
+	   adding an extra block to the transaction to handle such quotas */
+	error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 2, 0);
 	if (error)
 		goto out_release;
 
-- 
cgit v1.2.3


From 10ac27970274e9094aee84a6452a25bf1b7e59e1 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Tue, 8 Feb 2011 17:21:11 +0200
Subject: UBIFS: fix LEB number in printk

This is a minor patch which fixes the LEB number we print when
corrupted empty space is found.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/recovery.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 6ecbc91ef9a..e2714f8f05f 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -695,7 +695,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
 			ubifs_err("corrupt empty space LEB %d:%d, corruption "
 				  "starts at %d", lnum, offs, corruption);
 			/* Make sure we dump interesting non-0xFF data */
-			offs = corruption;
+			offs += corruption;
 			buf += corruption;
 			goto corrupted;
 		}
-- 
cgit v1.2.3


From 7e90d705fc9f8c5e3a1549281dce0654d049243b Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 8 Feb 2011 23:52:32 +0000
Subject: [CIFS] Do not send SMBEcho requests on new sockets until SMBNegotiate

In order to determine whether an SMBEcho request can be sent
we need to know that the socket is established (server tcpStatus == CifsGood)
AND that an SMB NegotiateProtocol has been sent (server maxBuf != 0).
Without the second check we can send an Echo request during reconnection
before the server can accept it.

CC: JG <jg@cms.ac>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsglob.h | 2 ++
 fs/cifs/connect.c  | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index edd5b29b53c..1ab33eb71d9 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -188,6 +188,8 @@ struct TCP_Server_Info {
 	/* multiplexed reads or writes */
 	unsigned int maxBuf;	/* maxBuf specifies the maximum */
 	/* message size the server can send or receive for non-raw SMBs */
+	/* maxBuf is returned by SMB NegotiateProtocol so maxBuf is only 0 */
+	/* when socket is setup (and during reconnect) before NegProt sent */
 	unsigned int max_rw;	/* maxRw specifies the maximum */
 	/* message size the server can send or receive for */
 	/* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 257b6d895e2..10011e99b34 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -341,7 +341,7 @@ cifs_echo_request(struct work_struct *work)
 	 * We cannot send an echo until the NEGOTIATE_PROTOCOL request is done.
 	 * Also, no need to ping if we got a response recently
 	 */
-	if (server->tcpStatus != CifsGood ||
+	if ((server->tcpStatus != CifsGood) || (server->maxBuf == 0) ||
 	    time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ))
 		goto requeue_echo;
 
-- 
cgit v1.2.3


From 195291e68c2ad59a046fc56d32bf59635b100e5c Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 9 Feb 2011 12:01:42 -0500
Subject: cifs: clean up checks in cifs_echo_request

Follow-on patch to 7e90d705 which is already in Steve's tree...

The check for tcpStatus == CifsGood is not meaningful since it doesn't
indicate whether the NEGOTIATE request has been done. Also, clarify
why we're checking for maxBuf == 0.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 10011e99b34..161f24ca4f6 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -338,10 +338,11 @@ cifs_echo_request(struct work_struct *work)
 					struct TCP_Server_Info, echo.work);
 
 	/*
-	 * We cannot send an echo until the NEGOTIATE_PROTOCOL request is done.
-	 * Also, no need to ping if we got a response recently
+	 * We cannot send an echo until the NEGOTIATE_PROTOCOL request is
+	 * done, which is indicated by maxBuf != 0. Also, no need to ping if
+	 * we got a response recently
 	 */
-	if ((server->tcpStatus != CifsGood) || (server->maxBuf == 0) ||
+	if (server->maxBuf == 0 ||
 	    time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ))
 		goto requeue_echo;
 
-- 
cgit v1.2.3


From 890275b5eb79e9933d12290473eab9ac38da0051 Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.vnet.ibm.com>
Date: Tue, 2 Nov 2010 10:13:07 -0400
Subject: IMA: maintain i_readcount in the VFS layer

ima_counts_get() updated the readcount and invalidated the PCR,
as necessary. Only update the i_readcount in the VFS layer.
Move the PCR invalidation checks to ima_file_check(), where it
belongs.

Maintaining the i_readcount in the VFS layer, will allow other
subsystems to use i_readcount.

Signed-off-by: Mimi Zohar <zohar@us.ibm.com>
Acked-by: Eric Paris <eparis@redhat.com>
---
 fs/file_table.c | 5 ++++-
 fs/open.c       | 3 ++-
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/file_table.c b/fs/file_table.c
index c3dee381f1b..0c724deb46f 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -190,7 +190,8 @@ struct file *alloc_file(struct path *path, fmode_t mode,
 		file_take_write(file);
 		WARN_ON(mnt_clone_write(path->mnt));
 	}
-	ima_counts_get(file);
+	if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
+		i_readcount_inc(path->dentry->d_inode);
 	return file;
 }
 EXPORT_SYMBOL(alloc_file);
@@ -251,6 +252,8 @@ static void __fput(struct file *file)
 	fops_put(file->f_op);
 	put_pid(file->f_owner.pid);
 	file_sb_list_del(file);
+	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
+		i_readcount_dec(inode);
 	if (file->f_mode & FMODE_WRITE)
 		drop_file_write_access(file);
 	file->f_path.dentry = NULL;
diff --git a/fs/open.c b/fs/open.c
index 4197b9ed023..0d485c50bb9 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -688,7 +688,8 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
 		if (error)
 			goto cleanup_all;
 	}
-	ima_counts_get(f);
+	if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
+		i_readcount_inc(inode);
 
 	f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
 
-- 
cgit v1.2.3


From 71823baff1978be892e7a36eddf6170e1cc6650d Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 10 Feb 2011 08:03:50 -0500
Subject: cifs: don't always drop malformed replies on the floor (try #3)

Slight revision to this patch...use min_t() instead of conditional
assignment. Also, remove the FIXME comment and replace it with the
explanation that Steve gave earlier.

After receiving a packet, we currently check the header. If it's no
good, then we toss it out and continue the loop, leaving the caller
waiting on that response.

In cases where the packet has length inconsistencies, but the MID is
valid, this leads to unneeded delays. That's especially problematic now
that the client waits indefinitely for responses.

Instead, don't immediately discard the packet if checkSMB fails. Try to
find a matching mid_q_entry, mark it as having a malformed response and
issue the callback.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsglob.h  |  2 +-
 fs/cifs/connect.c   | 30 ++++++++++++++++++++++++------
 fs/cifs/transport.c |  3 +++
 3 files changed, 28 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 1ab33eb71d9..17afb0fbcae 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -654,7 +654,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
 #define   MID_REQUEST_SUBMITTED 2
 #define   MID_RESPONSE_RECEIVED 4
 #define   MID_RETRY_NEEDED      8 /* session closed while this request out */
-#define   MID_NO_RESP_NEEDED 0x10
+#define   MID_RESPONSE_MALFORMED 0x10
 
 /* Types of response buffer returned from SendReceive2 */
 #define   CIFS_NO_BUFFER        0    /* Response buffer not returned */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 161f24ca4f6..8d6c17ab593 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -586,11 +586,20 @@ incomplete_rcv:
 		total_read += 4; /* account for rfc1002 hdr */
 
 		dump_smb(smb_buffer, total_read);
-		if (checkSMB(smb_buffer, smb_buffer->Mid, total_read)) {
+
+		/*
+		 * We know that we received enough to get to the MID as we
+		 * checked the pdu_length earlier. Now check to see
+		 * if the rest of the header is OK. We borrow the length
+		 * var for the rest of the loop to avoid a new stack var.
+		 *
+		 * 48 bytes is enough to display the header and a little bit
+		 * into the payload for debugging purposes.
+		 */
+		length = checkSMB(smb_buffer, smb_buffer->Mid, total_read);
+		if (length != 0)
 			cifs_dump_mem("Bad SMB: ", smb_buffer,
-					total_read < 48 ? total_read : 48);
-			continue;
-		}
+					min_t(unsigned int, total_read, 48));
 
 		mid_entry = NULL;
 		server->lstrp = jiffies;
@@ -602,7 +611,8 @@ incomplete_rcv:
 			if ((mid_entry->mid == smb_buffer->Mid) &&
 			    (mid_entry->midState == MID_REQUEST_SUBMITTED) &&
 			    (mid_entry->command == smb_buffer->Command)) {
-				if (check2ndT2(smb_buffer,server->maxBuf) > 0) {
+				if (length == 0 &&
+				   check2ndT2(smb_buffer, server->maxBuf) > 0) {
 					/* We have a multipart transact2 resp */
 					isMultiRsp = true;
 					if (mid_entry->resp_buf) {
@@ -637,7 +647,12 @@ incomplete_rcv:
 				mid_entry->resp_buf = smb_buffer;
 				mid_entry->largeBuf = isLargeBuf;
 multi_t2_fnd:
-				mid_entry->midState = MID_RESPONSE_RECEIVED;
+				if (length == 0)
+					mid_entry->midState =
+							MID_RESPONSE_RECEIVED;
+				else
+					mid_entry->midState =
+							MID_RESPONSE_MALFORMED;
 #ifdef CONFIG_CIFS_STATS2
 				mid_entry->when_received = jiffies;
 #endif
@@ -658,6 +673,9 @@ multi_t2_fnd:
 				else
 					smallbuf = NULL;
 			}
+		} else if (length != 0) {
+			/* response sanity checks failed */
+			continue;
 		} else if (!is_valid_oplock_break(smb_buffer, server) &&
 			   !isMultiRsp) {
 			cERROR(1, "No task to wake, unknown frame received! "
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index fbc5aace54b..46d8756f2b2 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -457,6 +457,9 @@ sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
 	case MID_RETRY_NEEDED:
 		rc = -EAGAIN;
 		break;
+	case MID_RESPONSE_MALFORMED:
+		rc = -EIO;
+		break;
 	default:
 		cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__,
 			mid->mid, mid->midState);
-- 
cgit v1.2.3


From 6b155c8fd4d239f7d883d455bbad1be47724bbfc Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Fri, 11 Feb 2011 16:44:31 -0600
Subject: dlm: use single thread workqueues

The recent commit to use cmwq for send and recv threads
dcce240ead802d42b1e45ad2fcb2ed4a399cb255 introduced problems,
apparently due to multiple workqueue threads.  Single threads
make the problems go away, so return to that until we fully
understand the concurrency issues with multiple threads.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 9c64ae9e4c1..2d8c87b951c 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1468,15 +1468,13 @@ static void work_stop(void)
 
 static int work_start(void)
 {
-	recv_workqueue = alloc_workqueue("dlm_recv", WQ_MEM_RECLAIM |
-					 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
+	recv_workqueue = create_singlethread_workqueue("dlm_recv");
 	if (!recv_workqueue) {
 		log_print("can't start dlm_recv");
 		return -ENOMEM;
 	}
 
-	send_workqueue = alloc_workqueue("dlm_send", WQ_MEM_RECLAIM |
-					 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
+	send_workqueue = create_singlethread_workqueue("dlm_send");
 	if (!send_workqueue) {
 		log_print("can't start dlm_send");
 		destroy_workqueue(recv_workqueue);
-- 
cgit v1.2.3


From 2dab597441667d6c04451a7dcf215241ad4c74f6 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 11 Feb 2011 15:53:38 -0800
Subject: Fix possible filp_cachep memory corruption

In commit 31e6b01f4183 ("fs: rcu-walk for path lookup") we started doing
path lookup using RCU, which then falls back to a careful non-RCU lookup
in case of problems (LOOKUP_REVAL).  So do_filp_open() has this "re-do
the lookup carefully" looping case.

However, that means that we must not release the open-intent file data
if we are going to loop around and use it once more!

Fix this by moving the release of the open-intent data to the function
that allocates it (do_filp_open() itself) rather than the helper
functions that can get called multiple times (finish_open() and
do_last()).  This makes the logic for the lifetime of that field much
more obvious, and avoids the possible double free.

Reported-by: J. R. Okajima <hooanon05@yahoo.co.jp>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namei.c | 20 ++++++++++----------
 fs/open.c  |  2 ++
 2 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 7d77f24d32a..ec4b2d0190a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -561,10 +561,14 @@ static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd)
  */
 void release_open_intent(struct nameidata *nd)
 {
-	if (nd->intent.open.file->f_path.dentry == NULL)
-		put_filp(nd->intent.open.file);
-	else
-		fput(nd->intent.open.file);
+	struct file *file = nd->intent.open.file;
+
+	if (file && !IS_ERR(file)) {
+		if (file->f_path.dentry == NULL)
+			put_filp(file);
+		else
+			fput(file);
+	}
 }
 
 /*
@@ -2265,8 +2269,6 @@ static struct file *finish_open(struct nameidata *nd,
 	return filp;
 
 exit:
-	if (!IS_ERR(nd->intent.open.file))
-		release_open_intent(nd);
 	path_put(&nd->path);
 	return ERR_PTR(error);
 }
@@ -2389,8 +2391,6 @@ exit_mutex_unlock:
 exit_dput:
 	path_put_conditional(path, nd);
 exit:
-	if (!IS_ERR(nd->intent.open.file))
-		release_open_intent(nd);
 	path_put(&nd->path);
 	return ERR_PTR(error);
 }
@@ -2477,6 +2477,7 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	}
 	audit_inode(pathname, nd.path.dentry);
 	filp = finish_open(&nd, open_flag, acc_mode);
+	release_open_intent(&nd);
 	return filp;
 
 creat:
@@ -2553,6 +2554,7 @@ out:
 		path_put(&nd.root);
 	if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL))
 		goto reval;
+	release_open_intent(&nd);
 	return filp;
 
 exit_dput:
@@ -2560,8 +2562,6 @@ exit_dput:
 out_path:
 	path_put(&nd.path);
 out_filp:
-	if (!IS_ERR(nd.intent.open.file))
-		release_open_intent(&nd);
 	filp = ERR_PTR(error);
 	goto out;
 }
diff --git a/fs/open.c b/fs/open.c
index e52389e1f05..5a2c6ebc22b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -790,6 +790,8 @@ struct file *nameidata_to_filp(struct nameidata *nd)
 
 	/* Pick up the filp from the open intent */
 	filp = nd->intent.open.file;
+	nd->intent.open.file = NULL;
+
 	/* Has the filesystem initialised the file for us? */
 	if (filp->f_path.dentry == NULL) {
 		path_get(&nd->path);
-- 
cgit v1.2.3


From d863b50ab01333659314c2034890cb76d9fdc3c7 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Thu, 10 Feb 2011 15:01:20 -0800
Subject: vfs: call rcu_barrier after ->kill_sb()

In commit fa0d7e3de6d6 ("fs: icache RCU free inodes"), we use rcu free
inode instead of freeing the inode directly.  It causes a crash when we
rmmod immediately after we umount the volume[1].

So we need to call rcu_barrier after we kill_sb so that the inode is
freed before we do rmmod.  The idea is inspired by Aneesh Kumar.
rcu_barrier will wait for all callbacks to end before proceeding.  The
original patch was done by Tao Ma, but synchronize_rcu() is not enough
here.

1. http://marc.info/?l=linux-fsdevel&m=129680863330185&w=2

Tested-by: Tao Ma <boyu.mt@taobao.com>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/super.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/super.c b/fs/super.c
index 74e149efed8..7e9dd4cc2c0 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -177,6 +177,11 @@ void deactivate_locked_super(struct super_block *s)
 	struct file_system_type *fs = s->s_type;
 	if (atomic_dec_and_test(&s->s_active)) {
 		fs->kill_sb(s);
+		/*
+		 * We need to call rcu_barrier so all the delayed rcu free
+		 * inodes are flushed before we release the fs module.
+		 */
+		rcu_barrier();
 		put_filesystem(fs);
 		put_super(s);
 	} else {
-- 
cgit v1.2.3


From 2892c15ddda6a76dc10b7499e56c0f3b892e5a69 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Sat, 12 Feb 2011 08:12:18 -0500
Subject: ext4: make grpinfo slab cache names static

In 2.6.37 I was running into oopses with repeated module
loads & unloads.  I tracked this down to:

fb1813f4 ext4: use dedicated slab caches for group_info structures

(this was in addition to the features advert unload problem)

The kstrdup & subsequent kfree of the cache name was causing
a double free.  In slub, at least, if I read it right it allocates
& frees the name itself, slab seems to do something different...
so in slub I think we were leaking -our- cachep->name, and double
freeing the one allocated by slub.

After getting lost in slab/slub/slob a bit, I just looked at other
sized-caches that get allocated.  jbd2, biovec, sgpool all do it
more or less the way jbd2 does.  Below patch follows the jbd2
method of dynamically allocating a cache at mount time from
a list of static names.

(This might also possibly fix a race creating the caches with
parallel mounts running).

[Folded in a fix from Dan Carpenter which fixed an off-by-one error in
the original patch]

Cc: stable@kernel.org
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 100 ++++++++++++++++++++++++++++++++----------------------
 1 file changed, 60 insertions(+), 40 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 851f49b2f9d..d1fe09aea73 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -342,10 +342,15 @@ static struct kmem_cache *ext4_free_ext_cachep;
 /* We create slab caches for groupinfo data structures based on the
  * superblock block size.  There will be one per mounted filesystem for
  * each unique s_blocksize_bits */
-#define NR_GRPINFO_CACHES	\
-	(EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE + 1)
+#define NR_GRPINFO_CACHES 8
 static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
 
+static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
+	"ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
+	"ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
+	"ext4_groupinfo_64k", "ext4_groupinfo_128k"
+};
+
 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 					ext4_group_t group);
 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
@@ -2414,6 +2419,55 @@ err_freesgi:
 	return -ENOMEM;
 }
 
+static void ext4_groupinfo_destroy_slabs(void)
+{
+	int i;
+
+	for (i = 0; i < NR_GRPINFO_CACHES; i++) {
+		if (ext4_groupinfo_caches[i])
+			kmem_cache_destroy(ext4_groupinfo_caches[i]);
+		ext4_groupinfo_caches[i] = NULL;
+	}
+}
+
+static int ext4_groupinfo_create_slab(size_t size)
+{
+	static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
+	int slab_size;
+	int blocksize_bits = order_base_2(size);
+	int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+	struct kmem_cache *cachep;
+
+	if (cache_index >= NR_GRPINFO_CACHES)
+		return -EINVAL;
+
+	if (unlikely(cache_index < 0))
+		cache_index = 0;
+
+	mutex_lock(&ext4_grpinfo_slab_create_mutex);
+	if (ext4_groupinfo_caches[cache_index]) {
+		mutex_unlock(&ext4_grpinfo_slab_create_mutex);
+		return 0;	/* Already created */
+	}
+
+	slab_size = offsetof(struct ext4_group_info,
+				bb_counters[blocksize_bits + 2]);
+
+	cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
+					slab_size, 0, SLAB_RECLAIM_ACCOUNT,
+					NULL);
+
+	mutex_unlock(&ext4_grpinfo_slab_create_mutex);
+	if (!cachep) {
+		printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n");
+		return -ENOMEM;
+	}
+
+	ext4_groupinfo_caches[cache_index] = cachep;
+
+	return 0;
+}
+
 int ext4_mb_init(struct super_block *sb, int needs_recovery)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -2421,9 +2475,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 	unsigned offset;
 	unsigned max;
 	int ret;
-	int cache_index;
-	struct kmem_cache *cachep;
-	char *namep = NULL;
 
 	i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
 
@@ -2440,30 +2491,9 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 		goto out;
 	}
 
-	cache_index = sb->s_blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
-	cachep = ext4_groupinfo_caches[cache_index];
-	if (!cachep) {
-		char name[32];
-		int len = offsetof(struct ext4_group_info,
-					bb_counters[sb->s_blocksize_bits + 2]);
-
-		sprintf(name, "ext4_groupinfo_%d", sb->s_blocksize_bits);
-		namep = kstrdup(name, GFP_KERNEL);
-		if (!namep) {
-			ret = -ENOMEM;
-			goto out;
-		}
-
-		/* Need to free the kmem_cache_name() when we
-		 * destroy the slab */
-		cachep = kmem_cache_create(namep, len, 0,
-					     SLAB_RECLAIM_ACCOUNT, NULL);
-		if (!cachep) {
-			ret = -ENOMEM;
-			goto out;
-		}
-		ext4_groupinfo_caches[cache_index] = cachep;
-	}
+	ret = ext4_groupinfo_create_slab(sb->s_blocksize);
+	if (ret < 0)
+		goto out;
 
 	/* order 0 is regular bitmap */
 	sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
@@ -2520,7 +2550,6 @@ out:
 	if (ret) {
 		kfree(sbi->s_mb_offsets);
 		kfree(sbi->s_mb_maxs);
-		kfree(namep);
 	}
 	return ret;
 }
@@ -2734,7 +2763,6 @@ int __init ext4_init_mballoc(void)
 
 void ext4_exit_mballoc(void)
 {
-	int i;
 	/*
 	 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
 	 * before destroying the slab cache.
@@ -2743,15 +2771,7 @@ void ext4_exit_mballoc(void)
 	kmem_cache_destroy(ext4_pspace_cachep);
 	kmem_cache_destroy(ext4_ac_cachep);
 	kmem_cache_destroy(ext4_free_ext_cachep);
-
-	for (i = 0; i < NR_GRPINFO_CACHES; i++) {
-		struct kmem_cache *cachep = ext4_groupinfo_caches[i];
-		if (cachep) {
-			char *name = (char *)kmem_cache_name(cachep);
-			kmem_cache_destroy(cachep);
-			kfree(name);
-		}
-	}
+	ext4_groupinfo_destroy_slabs();
 	ext4_remove_debugfs_entry();
 }
 
-- 
cgit v1.2.3


From e9e3bcecf44c04b9e6b505fd8e2eb9cea58fb94d Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Sat, 12 Feb 2011 08:17:34 -0500
Subject: ext4: serialize unaligned asynchronous DIO

ext4 has a data corruption case when doing non-block-aligned
asynchronous direct IO into a sparse file, as demonstrated
by xfstest 240.

The root cause is that while ext4 preallocates space in the
hole, mappings of that space still look "new" and
dio_zero_block() will zero out the unwritten portions.  When
more than one AIO thread is going, they both find this "new"
block and race to zero out their portion; this is uncoordinated
and causes data corruption.

Dave Chinner fixed this for xfs by simply serializing all
unaligned asynchronous direct IO.  I've done the same here.
The difference is that we only wait on conversions, not all IO.
This is a very big hammer, and I'm not very pleased with
stuffing this into ext4_file_write().  But since ext4 is
DIO_LOCKING, we need to serialize it at this high level.

I tried to move this into ext4_ext_direct_IO, but by then
we have the i_mutex already, and we will wait on the
work queue to do conversions - which must also take the
i_mutex.  So that won't work.

This was originally exposed by qemu-kvm installing to
a raw disk image with a normal sector-63 alignment.  I've
tested a backport of this patch with qemu, and it does
avoid the corruption.  It is also quite a lot slower
(14 min for package installs, vs. 8 min for well-aligned)
but I'll take slow correctness over fast corruption any day.

Mingming suggested that we can track outstanding
conversions, and wait on those so that non-sparse
files won't be affected, and I've implemented that here;
unaligned AIO to nonsparse files won't take a perf hit.

[tytso@mit.edu: Keep the mutex as a hashed array instead
 of bloating the ext4 inode]

[tytso@mit.edu: Fix up namespace issues so that global
 variables are protected with an "ext4_" prefix.]

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    | 10 ++++++++++
 fs/ext4/extents.c | 10 ++++++----
 fs/ext4/file.c    | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/ext4/page-io.c | 25 ++++++++++++-----------
 fs/ext4/super.c   | 13 +++++++++++-
 5 files changed, 100 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0c8d97b56f3..3aa0b72b3b9 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -848,6 +848,7 @@ struct ext4_inode_info {
 	atomic_t i_ioend_count;	/* Number of outstanding io_end structs */
 	/* current io_end structure for async DIO write*/
 	ext4_io_end_t *cur_aio_dio;
+	atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */
 
 	spinlock_t i_block_reservation_lock;
 
@@ -2119,6 +2120,15 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
 
 #define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
 
+/* For ioend & aio unwritten conversion wait queues */
+#define EXT4_WQ_HASH_SZ		37
+#define ext4_ioend_wq(v)   (&ext4__ioend_wq[((unsigned long)(v)) %\
+					    EXT4_WQ_HASH_SZ])
+#define ext4_aio_mutex(v)  (&ext4__aio_mutex[((unsigned long)(v)) %\
+					     EXT4_WQ_HASH_SZ])
+extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
+extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
+
 #endif	/* __KERNEL__ */
 
 #endif	/* _EXT4_H */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 63a75810b7c..ccce8a7e94e 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3174,9 +3174,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 		 * that this IO needs to convertion to written when IO is
 		 * completed
 		 */
-		if (io)
+		if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
 			io->flag = EXT4_IO_END_UNWRITTEN;
-		else
+			atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+		} else
 			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
 		if (ext4_should_dioread_nolock(inode))
 			map->m_flags |= EXT4_MAP_UNINIT;
@@ -3463,9 +3464,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		 * that we need to perform convertion when IO is done.
 		 */
 		if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
-			if (io)
+			if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
 				io->flag = EXT4_IO_END_UNWRITTEN;
-			else
+				atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+			} else
 				ext4_set_inode_state(inode,
 						     EXT4_STATE_DIO_UNWRITTEN);
 		}
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 2e8322c8aa8..7b80d543b89 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -55,11 +55,47 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+static void ext4_aiodio_wait(struct inode *inode)
+{
+	wait_queue_head_t *wq = ext4_ioend_wq(inode);
+
+	wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0));
+}
+
+/*
+ * This tests whether the IO in question is block-aligned or not.
+ * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
+ * are converted to written only after the IO is complete.  Until they are
+ * mapped, these blocks appear as holes, so dio_zero_block() will assume that
+ * it needs to zero out portions of the start and/or end block.  If 2 AIO
+ * threads are at work on the same unwritten block, they must be synchronized
+ * or one thread will zero the other's data, causing corruption.
+ */
+static int
+ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
+		   unsigned long nr_segs, loff_t pos)
+{
+	struct super_block *sb = inode->i_sb;
+	int blockmask = sb->s_blocksize - 1;
+	size_t count = iov_length(iov, nr_segs);
+	loff_t final_size = pos + count;
+
+	if (pos >= inode->i_size)
+		return 0;
+
+	if ((pos & blockmask) || (final_size & blockmask))
+		return 1;
+
+	return 0;
+}
+
 static ssize_t
 ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
 		unsigned long nr_segs, loff_t pos)
 {
 	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+	int unaligned_aio = 0;
+	int ret;
 
 	/*
 	 * If we have encountered a bitmap-format file, the size limit
@@ -78,9 +114,31 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
 			nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
 					      sbi->s_bitmap_maxbytes - pos);
 		}
+	} else if (unlikely((iocb->ki_filp->f_flags & O_DIRECT) &&
+		   !is_sync_kiocb(iocb))) {
+		unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
 	}
 
-	return generic_file_aio_write(iocb, iov, nr_segs, pos);
+	/* Unaligned direct AIO must be serialized; see comment above */
+	if (unaligned_aio) {
+		static unsigned long unaligned_warn_time;
+
+		/* Warn about this once per day */
+		if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
+			ext4_msg(inode->i_sb, KERN_WARNING,
+				 "Unaligned AIO/DIO on inode %ld by %s; "
+				 "performance will be poor.",
+				 inode->i_ino, current->comm);
+		mutex_lock(ext4_aio_mutex(inode));
+		ext4_aiodio_wait(inode);
+	}
+
+	ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
+
+	if (unaligned_aio)
+		mutex_unlock(ext4_aio_mutex(inode));
+
+	return ret;
 }
 
 static const struct vm_operations_struct ext4_file_vm_ops = {
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 4e9b0a242f4..955cc309142 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -32,14 +32,8 @@
 
 static struct kmem_cache *io_page_cachep, *io_end_cachep;
 
-#define WQ_HASH_SZ		37
-#define to_ioend_wq(v)	(&ioend_wq[((unsigned long)v) % WQ_HASH_SZ])
-static wait_queue_head_t ioend_wq[WQ_HASH_SZ];
-
 int __init ext4_init_pageio(void)
 {
-	int i;
-
 	io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
 	if (io_page_cachep == NULL)
 		return -ENOMEM;
@@ -48,9 +42,6 @@ int __init ext4_init_pageio(void)
 		kmem_cache_destroy(io_page_cachep);
 		return -ENOMEM;
 	}
-	for (i = 0; i < WQ_HASH_SZ; i++)
-		init_waitqueue_head(&ioend_wq[i]);
-
 	return 0;
 }
 
@@ -62,7 +53,7 @@ void ext4_exit_pageio(void)
 
 void ext4_ioend_wait(struct inode *inode)
 {
-	wait_queue_head_t *wq = to_ioend_wq(inode);
+	wait_queue_head_t *wq = ext4_ioend_wq(inode);
 
 	wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
 }
@@ -87,7 +78,7 @@ void ext4_free_io_end(ext4_io_end_t *io)
 	for (i = 0; i < io->num_io_pages; i++)
 		put_io_page(io->pages[i]);
 	io->num_io_pages = 0;
-	wq = to_ioend_wq(io->inode);
+	wq = ext4_ioend_wq(io->inode);
 	if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
 	    waitqueue_active(wq))
 		wake_up_all(wq);
@@ -102,6 +93,7 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
 	struct inode *inode = io->inode;
 	loff_t offset = io->offset;
 	ssize_t size = io->size;
+	wait_queue_head_t *wq;
 	int ret = 0;
 
 	ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
@@ -126,7 +118,16 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
 	if (io->iocb)
 		aio_complete(io->iocb, io->result, 0);
 	/* clear the DIO AIO unwritten flag */
-	io->flag &= ~EXT4_IO_END_UNWRITTEN;
+	if (io->flag & EXT4_IO_END_UNWRITTEN) {
+		io->flag &= ~EXT4_IO_END_UNWRITTEN;
+		/* Wake up anyone waiting on unwritten extent conversion */
+		wq = ext4_ioend_wq(io->inode);
+		if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten) &&
+		    waitqueue_active(wq)) {
+			wake_up_all(wq);
+		}
+	}
+
 	return ret;
 }
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 86b05486dc6..f6a318f836b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -833,6 +833,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	ei->i_sync_tid = 0;
 	ei->i_datasync_tid = 0;
 	atomic_set(&ei->i_ioend_count, 0);
+	atomic_set(&ei->i_aiodio_unwritten, 0);
 
 	return &ei->vfs_inode;
 }
@@ -4800,11 +4801,21 @@ static void ext4_exit_feat_adverts(void)
 	kfree(ext4_feat);
 }
 
+/* Shared across all ext4 file systems */
+wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
+struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
+
 static int __init ext4_init_fs(void)
 {
-	int err;
+	int i, err;
 
 	ext4_check_flag_values();
+
+	for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
+		mutex_init(&ext4__aio_mutex[i]);
+		init_waitqueue_head(&ext4__ioend_wq[i]);
+	}
+
 	err = ext4_init_pageio();
 	if (err)
 		return err;
-- 
cgit v1.2.3


From e44718318004a5618d1dfe2d080e2862532d8e5f Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 12 Feb 2011 08:18:24 -0500
Subject: jbd2: call __jbd2_log_start_commit with j_state_lock write locked

On an SMP ARM system running ext4, I've received a report that the
first J_ASSERT in jbd2_journal_commit_transaction has been triggering:

	J_ASSERT(journal->j_running_transaction != NULL);

While investigating possible causes for this problem, I noticed that
__jbd2_log_start_commit() is getting called with j_state_lock only
read-locked, in spite of the fact that it might modify
j_commit_request.  Fix this by grabbing the necessary information so
we can test to see if we need to start a new transaction before
dropping the read lock, and then calling jbd2_log_start_commit() which
will grab the write lock.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/journal.c     |  9 +++++++--
 fs/jbd2/transaction.c | 21 ++++++++++++++-------
 2 files changed, 21 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 9e4686900f1..97e73469b2c 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -473,7 +473,8 @@ int __jbd2_log_space_left(journal_t *journal)
 }
 
 /*
- * Called under j_state_lock.  Returns true if a transaction commit was started.
+ * Called with j_state_lock locked for writing.
+ * Returns true if a transaction commit was started.
  */
 int __jbd2_log_start_commit(journal_t *journal, tid_t target)
 {
@@ -520,11 +521,13 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
 {
 	transaction_t *transaction = NULL;
 	tid_t tid;
+	int need_to_start = 0;
 
 	read_lock(&journal->j_state_lock);
 	if (journal->j_running_transaction && !current->journal_info) {
 		transaction = journal->j_running_transaction;
-		__jbd2_log_start_commit(journal, transaction->t_tid);
+		if (!tid_geq(journal->j_commit_request, transaction->t_tid))
+			need_to_start = 1;
 	} else if (journal->j_committing_transaction)
 		transaction = journal->j_committing_transaction;
 
@@ -535,6 +538,8 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
 
 	tid = transaction->t_tid;
 	read_unlock(&journal->j_state_lock);
+	if (need_to_start)
+		jbd2_log_start_commit(journal, tid);
 	jbd2_log_wait_commit(journal, tid);
 	return 1;
 }
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index faad2bd787c..1d1191050f9 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -117,10 +117,10 @@ static inline void update_t_max_wait(transaction_t *transaction)
 static int start_this_handle(journal_t *journal, handle_t *handle,
 			     int gfp_mask)
 {
-	transaction_t *transaction;
-	int needed;
-	int nblocks = handle->h_buffer_credits;
-	transaction_t *new_transaction = NULL;
+	transaction_t	*transaction, *new_transaction = NULL;
+	tid_t		tid;
+	int		needed, need_to_start;
+	int		nblocks = handle->h_buffer_credits;
 
 	if (nblocks > journal->j_max_transaction_buffers) {
 		printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
@@ -222,8 +222,11 @@ repeat:
 		atomic_sub(nblocks, &transaction->t_outstanding_credits);
 		prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
 				TASK_UNINTERRUPTIBLE);
-		__jbd2_log_start_commit(journal, transaction->t_tid);
+		tid = transaction->t_tid;
+		need_to_start = !tid_geq(journal->j_commit_request, tid);
 		read_unlock(&journal->j_state_lock);
+		if (need_to_start)
+			jbd2_log_start_commit(journal, tid);
 		schedule();
 		finish_wait(&journal->j_wait_transaction_locked, &wait);
 		goto repeat;
@@ -442,7 +445,8 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
 {
 	transaction_t *transaction = handle->h_transaction;
 	journal_t *journal = transaction->t_journal;
-	int ret;
+	tid_t		tid;
+	int		need_to_start, ret;
 
 	/* If we've had an abort of any type, don't even think about
 	 * actually doing the restart! */
@@ -465,8 +469,11 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
 	spin_unlock(&transaction->t_handle_lock);
 
 	jbd_debug(2, "restarting handle %p\n", handle);
-	__jbd2_log_start_commit(journal, transaction->t_tid);
+	tid = transaction->t_tid;
+	need_to_start = !tid_geq(journal->j_commit_request, tid);
 	read_unlock(&journal->j_state_lock);
+	if (need_to_start)
+		jbd2_log_start_commit(journal, tid);
 
 	lock_map_release(&handle->h_lockdep_map);
 	handle->h_buffer_credits = nblocks;
-- 
cgit v1.2.3


From 541ce98c10111dae7604543dda6c6f7e7a6015d8 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Fri, 14 Jan 2011 20:00:02 -0500
Subject: nfsd: don't leak dentry count on mnt_want_write failure

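The exit cleanup isn't quite right here: when mnt_want_write() fails we
jump straight to out_nfserr and skip the dput() of rdentry, leaking a
dentry reference.  Reorder the exit labels so that each failure path
undoes exactly what has been acquired so far.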

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/vfs.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 641117f2188..fda3be23777 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1812,22 +1812,22 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 
 	host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
 	if (host_err)
-		goto out_nfserr;
+		goto out_put;
 
 	host_err = nfsd_break_lease(rdentry->d_inode);
 	if (host_err)
-		goto out_put;
+		goto out_drop_write;
 	if (type != S_IFDIR)
 		host_err = vfs_unlink(dirp, rdentry);
 	else
 		host_err = vfs_rmdir(dirp, rdentry);
-out_put:
-	dput(rdentry);
-
 	if (!host_err)
 		host_err = commit_metadata(fhp);
-
+out_drop_write:
 	mnt_drop_write(fhp->fh_export->ex_path.mnt);
+out_put:
+	dput(rdentry);
+
 out_nfserr:
 	err = nfserrno(host_err);
 out:
-- 
cgit v1.2.3


From 0af3f814ccf0a13d3e01e8115b96f1824379fc72 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Thu, 13 Jan 2011 11:25:31 +0200
Subject: NFSD: use nfserr for status after decode_cb_op_status

Bugs introduced in 85a56480191ca9f08fc775c129b9eb5c8c1f2c05
"NFSD: Update XDR decoders in NFSv4 callback client"

Cc: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4callback.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 3be975e1891..cde36cb0f34 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -484,7 +484,7 @@ static int decode_cb_sequence4res(struct xdr_stream *xdr,
 out:
 	return status;
 out_default:
-	return nfs_cb_stat_to_errno(status);
+	return nfs_cb_stat_to_errno(nfserr);
 }
 
 /*
@@ -564,11 +564,9 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp,
 	if (unlikely(status))
 		goto out;
 	if (unlikely(nfserr != NFS4_OK))
-		goto out_default;
+		status = nfs_cb_stat_to_errno(nfserr);
 out:
 	return status;
-out_default:
-	return nfs_cb_stat_to_errno(status);
 }
 
 /*
-- 
cgit v1.2.3


From 3aa6e0aa8ab3e64bbfba092c64d42fd1d006b124 Mon Sep 17 00:00:00 2001
From: Konstantin Khorenko <khorenko@parallels.com>
Date: Tue, 1 Feb 2011 17:16:29 +0300
Subject: NFSD: memory corruption due to writing beyond the stat array

If nfsd fails to find a file exported via NFS in the readahead cache, it
should increment the corresponding nfsdstats counter (ra_depth[10]), but due
to a bug it may instead write to ra_depth[11], corrupting the following field.

In a kernel with NFSDv4 compiled in the corruption takes the form of an
increment of a counter of the number of NFSv4 operation 0's received; since
there is no operation 0, this is harmless.

In a kernel with NFSDv4 disabled it corrupts whatever happens to be in the
memory beyond nfsdstats.

Signed-off-by: Konstantin Khorenko <khorenko@openvz.org>
Cc: stable@kernel.org
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/vfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index fda3be23777..30c73f8a579 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -808,7 +808,7 @@ nfsd_get_raparms(dev_t dev, ino_t ino)
 		if (ra->p_count == 0)
 			frap = rap;
 	}
-	depth = nfsdstats.ra_size*11/10;
+	depth = nfsdstats.ra_size;
 	if (!frap) {	
 		spin_unlock(&rab->pb_lock);
 		return NULL;
-- 
cgit v1.2.3


From 6b57d9c86d0ab11c091b6db2edff8b5553fd445b Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 31 Jan 2011 11:54:04 -0500
Subject: nfsd4: split up nfsd_break_deleg_cb

We'll be adding some more code here soon.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 34 ++++++++++++++++------------------
 1 file changed, 16 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index d98d0213285..ceb66170fda 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2329,23 +2329,8 @@ nfs4_file_downgrade(struct nfs4_file *fp, unsigned int share_access)
 		nfs4_file_put_access(fp, O_RDONLY);
 }
 
-/*
- * Spawn a thread to perform a recall on the delegation represented
- * by the lease (file_lock)
- *
- * Called from break_lease() with lock_flocks() held.
- * Note: we assume break_lease will only call this *once* for any given
- * lease.
- */
-static
-void nfsd_break_deleg_cb(struct file_lock *fl)
+static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
 {
-	struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
-
-	dprintk("NFSD nfsd_break_deleg_cb: dp %p fl %p\n",dp,fl);
-	if (!dp)
-		return;
-
 	/* We're assuming the state code never drops its reference
 	 * without first removing the lease.  Since we're in this lease
 	 * callback (and since the lease code is serialized by the kernel
@@ -2360,15 +2345,28 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
 	/* only place dl_time is set. protected by lock_flocks*/
 	dp->dl_time = get_seconds();
 
+	nfsd4_cb_recall(dp);
+}
+
+/*
+ * Called from break_lease() with lock_flocks() held.
+ * Note: we assume break_lease will only call this *once* for any given
+ * lease.
+ */
+static void nfsd_break_deleg_cb(struct file_lock *fl)
+{
+	struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
+
+	BUG_ON(!dp);
 	/*
 	 * We don't want the locks code to timeout the lease for us;
 	 * we'll remove it ourself if the delegation isn't returned
-	 * in time.
+	 * in time:
 	 */
 	fl->fl_break_time = 0;
 
+	nfsd_break_one_deleg(dp);
 	dp->dl_file->fi_had_conflict = true;
-	nfsd4_cb_recall(dp);
 }
 
 static
-- 
cgit v1.2.3


From 22d38c4c10e8344aa406897d99a35d585d2cb77d Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 31 Jan 2011 11:55:12 -0500
Subject: nfsd4: add helper function for lease setup

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index ceb66170fda..65978a9aa87 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2639,6 +2639,26 @@ static bool nfsd4_cb_channel_good(struct nfs4_client *clp)
 	return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN;
 }
 
+static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int flag)
+{
+	struct file_lock *fl;
+
+	fl = locks_alloc_lock();
+	if (!fl)
+		return NULL;
+	locks_init_lock(fl);
+	fl->fl_lmops = &nfsd_lease_mng_ops;
+	fl->fl_flags = FL_LEASE;
+	fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
+	fl->fl_end = OFFSET_MAX;
+	fl->fl_owner = (fl_owner_t)dp;
+	fl->fl_file = dp->dl_vfs_file;
+	BUG_ON(!fl->fl_file);
+	fl->fl_pid = current->tgid;
+	dp->dl_flock = fl;
+	return fl;
+}
+
 /*
  * Attempt to hand out a delegation.
  */
@@ -2684,20 +2704,9 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 		goto out;
 	}
 	status = -ENOMEM;
-	fl = locks_alloc_lock();
+	fl = nfs4_alloc_init_lease(dp, flag);
 	if (!fl)
 		goto out;
-	locks_init_lock(fl);
-	fl->fl_lmops = &nfsd_lease_mng_ops;
-	fl->fl_flags = FL_LEASE;
-	fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
-	fl->fl_end = OFFSET_MAX;
-	fl->fl_owner =  (fl_owner_t)dp;
-	fl->fl_file = find_readable_file(stp->st_file);
-	BUG_ON(!fl->fl_file);
-	fl->fl_pid = current->tgid;
-	dp->dl_flock = fl;
-
 	/* vfs_setlease checks to see if delegation should be handed out.
 	 * the lock_manager callback fl_change is used
 	 */
-- 
cgit v1.2.3


From dd239cc05f0ad9f582dd83d88a4fb5edcc57a026 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 31 Jan 2011 17:14:55 -0500
Subject: nfsd4: fix leak on allocation error

Also share some common exit code.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 65978a9aa87..099d6fa64f7 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2699,14 +2699,12 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 	}
 
 	dp = alloc_init_deleg(sop->so_client, stp, fh, flag);
-	if (dp == NULL) {
-		flag = NFS4_OPEN_DELEGATE_NONE;
-		goto out;
-	}
+	if (dp == NULL)
+		goto out_no_deleg;
 	status = -ENOMEM;
 	fl = nfs4_alloc_init_lease(dp, flag);
 	if (!fl)
-		goto out;
+		goto out_free;
 	/* vfs_setlease checks to see if delegation should be handed out.
 	 * the lock_manager callback fl_change is used
 	 */
@@ -2714,9 +2712,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 		dprintk("NFSD: setlease failed [%d], no delegation\n", status);
 		dp->dl_flock = NULL;
 		locks_free_lock(fl);
-		unhash_delegation(dp);
-		flag = NFS4_OPEN_DELEGATE_NONE;
-		goto out;
+		goto out_free;
 	}
 
 	memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid));
@@ -2729,6 +2725,12 @@ out:
 			&& open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE)
 		dprintk("NFSD: WARNING: refusing delegation reclaim\n");
 	open->op_delegate_type = flag;
+	return;
+out_free:
+	unhash_delegation(dp);
+out_no_deleg:
+	flag = NFS4_OPEN_DELEGATE_NONE;
+	goto out;
 }
 
 /*
-- 
cgit v1.2.3


From edab9782b5a16abb8d139d261e81e13ef0be35a9 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 31 Jan 2011 17:58:10 -0500
Subject: nfsd4: split lease setting into separate function

Splitting some code into a separate function which we'll be adding some
more to.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 099d6fa64f7..dbb2141cf88 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2659,6 +2659,23 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int f
 	return fl;
 }
 
+static int nfs4_setlease(struct nfs4_delegation *dp, int flag)
+{
+	struct file_lock *fl;
+	int status;
+
+	fl = nfs4_alloc_init_lease(dp, flag);
+	if (!fl)
+		return -ENOMEM;
+	status = vfs_setlease(dp->dl_vfs_file, fl->fl_type, &fl);
+	if (status) {
+		dp->dl_flock = NULL;
+		locks_free_lock(fl);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
 /*
  * Attempt to hand out a delegation.
  */
@@ -2668,7 +2685,6 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 	struct nfs4_delegation *dp;
 	struct nfs4_stateowner *sop = stp->st_stateowner;
 	int cb_up;
-	struct file_lock *fl;
 	int status, flag = 0;
 
 	cb_up = nfsd4_cb_channel_good(sop->so_client);
@@ -2701,19 +2717,9 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 	dp = alloc_init_deleg(sop->so_client, stp, fh, flag);
 	if (dp == NULL)
 		goto out_no_deleg;
-	status = -ENOMEM;
-	fl = nfs4_alloc_init_lease(dp, flag);
-	if (!fl)
-		goto out_free;
-	/* vfs_setlease checks to see if delegation should be handed out.
-	 * the lock_manager callback fl_change is used
-	 */
-	if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) {
-		dprintk("NFSD: setlease failed [%d], no delegation\n", status);
-		dp->dl_flock = NULL;
-		locks_free_lock(fl);
+	status = nfs4_setlease(dp, flag);
+	if (status)
 		goto out_free;
-	}
 
 	memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid));
 
-- 
cgit v1.2.3


From 65bc58f5187e2ff4011ef1bd3082e83cd1b036f1 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 7 Feb 2011 15:44:12 -0500
Subject: nfsd4: remove unused deleg dprintk's.

These aren't all that useful, and get in the way of the next steps.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index dbb2141cf88..d978192838a 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -958,8 +958,6 @@ expire_client(struct nfs4_client *clp)
 	spin_lock(&recall_lock);
 	while (!list_empty(&clp->cl_delegations)) {
 		dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt);
-		dprintk("NFSD: expire client. dp %p, fp %p\n", dp,
-				dp->dl_flock);
 		list_del_init(&dp->dl_perclnt);
 		list_move(&dp->dl_recall_lru, &reaplist);
 	}
@@ -2931,8 +2929,6 @@ nfs4_laundromat(void)
 				test_val = u;
 			break;
 		}
-		dprintk("NFSD: purging unused delegation dp %p, fp %p\n",
-			            dp, dp->dl_flock);
 		list_move(&dp->dl_recall_lru, &reaplist);
 	}
 	spin_unlock(&recall_lock);
-- 
cgit v1.2.3


From 5d926e8c2f46dc09f4ddde86644a5f1d0726a470 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 7 Feb 2011 16:53:46 -0500
Subject: nfsd4: modify fi_delegations under recall_lock

Modify fi_delegations only under the recall_lock, allowing us to use
that list on lease breaks.

Also some trivial cleanup to simplify later changes.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index d978192838a..8b6cd3cf483 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -277,9 +277,9 @@ nfs4_close_delegation(struct nfs4_delegation *dp)
 static void
 unhash_delegation(struct nfs4_delegation *dp)
 {
-	list_del_init(&dp->dl_perfile);
 	list_del_init(&dp->dl_perclnt);
 	spin_lock(&recall_lock);
+	list_del_init(&dp->dl_perfile);
 	list_del_init(&dp->dl_recall_lru);
 	spin_unlock(&recall_lock);
 	nfs4_close_delegation(dp);
@@ -2336,9 +2336,7 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
 	 * it's safe to take a reference: */
 	atomic_inc(&dp->dl_count);
 
-	spin_lock(&recall_lock);
 	list_add_tail(&dp->dl_recall_lru, &del_recall_lru);
-	spin_unlock(&recall_lock);
 
 	/* only place dl_time is set. protected by lock_flocks*/
 	dp->dl_time = get_seconds();
@@ -2363,8 +2361,10 @@ static void nfsd_break_deleg_cb(struct file_lock *fl)
 	 */
 	fl->fl_break_time = 0;
 
-	nfsd_break_one_deleg(dp);
+	spin_lock(&recall_lock);
 	dp->dl_file->fi_had_conflict = true;
+	nfsd_break_one_deleg(dp);
+	spin_unlock(&recall_lock);
 }
 
 static
-- 
cgit v1.2.3


From acfdf5c383b38f7f4dddae41b97c97f1ae058f49 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 31 Jan 2011 19:20:39 -0500
Subject: nfsd4: acquire only one lease per file

Instead of acquiring one lease each time another client opens a file,
nfsd can acquire just one lease to represent all of them, and reference
count it to determine when to release it.

This fixes a regression introduced by
c45821d263a8a5109d69a9e8942b8d65bcd5f31a "locks: eliminate fl_mylease
callback": after that patch, only the struct file * is used to determine
who owns a given lease.  But since we recently converted the server to
share a single struct file per open, if we acquire multiple leases on
the same file from nfsd, it then becomes impossible on unlocking a lease
to determine which of those leases (all of which share the same struct
file *) we meant to remove.

Thanks to Takashi Iwai <tiwai@suse.de> for catching a bug in a previous
version of this patch.

Tested-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 95 +++++++++++++++++++++++++++++++----------------------
 fs/nfsd/state.h     |  5 +--
 2 files changed, 58 insertions(+), 42 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 8b6cd3cf483..54b60bfceb8 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -230,9 +230,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 	dp->dl_client = clp;
 	get_nfs4_file(fp);
 	dp->dl_file = fp;
-	dp->dl_vfs_file = find_readable_file(fp);
-	get_file(dp->dl_vfs_file);
-	dp->dl_flock = NULL;
 	dp->dl_type = type;
 	dp->dl_stateid.si_boot = boot_time;
 	dp->dl_stateid.si_stateownerid = current_delegid++;
@@ -241,8 +238,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 	fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
 	dp->dl_time = 0;
 	atomic_set(&dp->dl_count, 1);
-	list_add(&dp->dl_perfile, &fp->fi_delegations);
-	list_add(&dp->dl_perclnt, &clp->cl_delegations);
 	INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc);
 	return dp;
 }
@@ -253,24 +248,18 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
 	if (atomic_dec_and_test(&dp->dl_count)) {
 		dprintk("NFSD: freeing dp %p\n",dp);
 		put_nfs4_file(dp->dl_file);
-		fput(dp->dl_vfs_file);
 		kmem_cache_free(deleg_slab, dp);
 		num_delegations--;
 	}
 }
 
-/* Remove the associated file_lock first, then remove the delegation.
- * lease_modify() is called to remove the FS_LEASE file_lock from
- * the i_flock list, eventually calling nfsd's lock_manager
- * fl_release_callback.
- */
-static void
-nfs4_close_delegation(struct nfs4_delegation *dp)
+static void nfs4_put_deleg_lease(struct nfs4_file *fp)
 {
-	dprintk("NFSD: close_delegation dp %p\n",dp);
-	/* XXX: do we even need this check?: */
-	if (dp->dl_flock)
-		vfs_setlease(dp->dl_vfs_file, F_UNLCK, &dp->dl_flock);
+	if (atomic_dec_and_test(&fp->fi_delegees)) {
+		vfs_setlease(fp->fi_deleg_file, F_UNLCK, &fp->fi_lease);
+		fp->fi_lease = NULL;
+		fp->fi_deleg_file = NULL;
+	}
 }
 
 /* Called under the state lock. */
@@ -282,7 +271,7 @@ unhash_delegation(struct nfs4_delegation *dp)
 	list_del_init(&dp->dl_perfile);
 	list_del_init(&dp->dl_recall_lru);
 	spin_unlock(&recall_lock);
-	nfs4_close_delegation(dp);
+	nfs4_put_deleg_lease(dp->dl_file);
 	nfs4_put_delegation(dp);
 }
 
@@ -2076,6 +2065,7 @@ alloc_init_file(struct inode *ino)
 		fp->fi_inode = igrab(ino);
 		fp->fi_id = current_fileid++;
 		fp->fi_had_conflict = false;
+		fp->fi_lease = NULL;
 		memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
 		memset(fp->fi_access, 0, sizeof(fp->fi_access));
 		spin_lock(&recall_lock);
@@ -2344,26 +2334,26 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
 	nfsd4_cb_recall(dp);
 }
 
-/*
- * Called from break_lease() with lock_flocks() held.
- * Note: we assume break_lease will only call this *once* for any given
- * lease.
- */
+/* Called from break_lease() with lock_flocks() held. */
 static void nfsd_break_deleg_cb(struct file_lock *fl)
 {
-	struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
+	struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner;
+	struct nfs4_delegation *dp;
 
-	BUG_ON(!dp);
+	BUG_ON(!fp);
+	/* We assume break_lease is only called once per lease: */
+	BUG_ON(fp->fi_had_conflict);
 	/*
 	 * We don't want the locks code to timeout the lease for us;
-	 * we'll remove it ourself if the delegation isn't returned
+	 * we'll remove it ourself if a delegation isn't returned
 	 * in time:
 	 */
 	fl->fl_break_time = 0;
 
 	spin_lock(&recall_lock);
-	dp->dl_file->fi_had_conflict = true;
-	nfsd_break_one_deleg(dp);
+	fp->fi_had_conflict = true;
+	list_for_each_entry(dp, &fp->fi_delegations, dl_perfile)
+		nfsd_break_one_deleg(dp);
 	spin_unlock(&recall_lock);
 }
 
@@ -2455,13 +2445,15 @@ nfs4_check_delegmode(struct nfs4_delegation *dp, int flags)
 static struct nfs4_delegation *
 find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
 {
-	struct nfs4_delegation *dp;
+	struct nfs4_delegation *dp = NULL;
 
+	spin_lock(&recall_lock);
 	list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) {
 		if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid)
-			return dp;
+			break;
 	}
-	return NULL;
+	spin_unlock(&recall_lock);
+	return dp;
 }
 
 int share_access_to_flags(u32 share_access)
@@ -2649,28 +2641,51 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int f
 	fl->fl_flags = FL_LEASE;
 	fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
 	fl->fl_end = OFFSET_MAX;
-	fl->fl_owner = (fl_owner_t)dp;
-	fl->fl_file = dp->dl_vfs_file;
-	BUG_ON(!fl->fl_file);
+	fl->fl_owner = (fl_owner_t)(dp->dl_file);
 	fl->fl_pid = current->tgid;
-	dp->dl_flock = fl;
 	return fl;
 }
 
 static int nfs4_setlease(struct nfs4_delegation *dp, int flag)
 {
+	struct nfs4_file *fp = dp->dl_file;
 	struct file_lock *fl;
 	int status;
 
 	fl = nfs4_alloc_init_lease(dp, flag);
 	if (!fl)
 		return -ENOMEM;
-	status = vfs_setlease(dp->dl_vfs_file, fl->fl_type, &fl);
+	fl->fl_file = find_readable_file(fp);
+	list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations);
+	status = vfs_setlease(fl->fl_file, fl->fl_type, &fl);
 	if (status) {
-		dp->dl_flock = NULL;
+		list_del_init(&dp->dl_perclnt);
 		locks_free_lock(fl);
 		return -ENOMEM;
 	}
+	fp->fi_lease = fl;
+	fp->fi_deleg_file = fl->fl_file;
+	get_file(fp->fi_deleg_file);
+	atomic_set(&fp->fi_delegees, 1);
+	list_add(&dp->dl_perfile, &fp->fi_delegations);
+	return 0;
+}
+
+static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag)
+{
+	struct nfs4_file *fp = dp->dl_file;
+
+	if (!fp->fi_lease)
+		return nfs4_setlease(dp, flag);
+	spin_lock(&recall_lock);
+	if (fp->fi_had_conflict) {
+		spin_unlock(&recall_lock);
+		return -EAGAIN;
+	}
+	atomic_inc(&fp->fi_delegees);
+	list_add(&dp->dl_perfile, &fp->fi_delegations);
+	spin_unlock(&recall_lock);
+	list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations);
 	return 0;
 }
 
@@ -2715,7 +2730,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 	dp = alloc_init_deleg(sop->so_client, stp, fh, flag);
 	if (dp == NULL)
 		goto out_no_deleg;
-	status = nfs4_setlease(dp, flag);
+	status = nfs4_set_delegation(dp, flag);
 	if (status)
 		goto out_free;
 
@@ -2731,7 +2746,7 @@ out:
 	open->op_delegate_type = flag;
 	return;
 out_free:
-	unhash_delegation(dp);
+	nfs4_put_delegation(dp);
 out_no_deleg:
 	flag = NFS4_OPEN_DELEGATE_NONE;
 	goto out;
@@ -3139,7 +3154,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
 			goto out;
 		renew_client(dp->dl_client);
 		if (filpp) {
-			*filpp = find_readable_file(dp->dl_file);
+			*filpp = dp->dl_file->fi_deleg_file;
 			BUG_ON(!*filpp);
 		}
 	} else { /* open or lock stateid */
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 3074656ba7b..2d31224b07b 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -83,8 +83,6 @@ struct nfs4_delegation {
 	atomic_t		dl_count;       /* ref count */
 	struct nfs4_client	*dl_client;
 	struct nfs4_file	*dl_file;
-	struct file		*dl_vfs_file;
-	struct file_lock	*dl_flock;
 	u32			dl_type;
 	time_t			dl_time;
 /* For recall: */
@@ -379,6 +377,9 @@ struct nfs4_file {
 	 */
 	atomic_t		fi_readers;
 	atomic_t		fi_writers;
+	struct file		*fi_deleg_file;
+	struct file_lock	*fi_lease;
+	atomic_t		fi_delegees;
 	struct inode		*fi_inode;
 	u32                     fi_id;      /* used with stateowner->so_id 
 					     * for stateid_hashtbl hash */
-- 
cgit v1.2.3


From 83f6b0c18204f68961f58b9f69e5dba0d36056a2 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Sun, 6 Feb 2011 16:46:30 -0500
Subject: nfsd: break lease on unlink due to rename

4795bb37effb7b8fe77e2d2034545d062d3788a8 "nfsd: break lease on unlink,
link, and rename", only broke the lease on the file that was being
renamed, and didn't handle the case where the target path refers to an
already-existing file that will be unlinked by a rename--in that case
the target file should have any leases broken as well.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/vfs.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 30c73f8a579..da1d9701f8e 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1742,6 +1742,13 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 		goto out_dput_new;
 
 	host_err = nfsd_break_lease(odentry->d_inode);
+	if (host_err)
+		goto out_drop_write;
+	if (ndentry->d_inode) {
+		host_err = nfsd_break_lease(ndentry->d_inode);
+		if (host_err)
+			goto out_drop_write;
+	}
 	if (host_err)
 		goto out_drop_write;
 	host_err = vfs_rename(fdir, odentry, tdir, ndentry);
-- 
cgit v1.2.3


From eb14ab8ed24a0405fd056068b28c33a1cd846024 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 10 Feb 2011 12:35:00 -0500
Subject: Btrfs: fix page->private races

There is a race where btrfs_releasepage can drop the
page->private contents just as alloc_extent_buffer is setting
up pages for metadata.  Because of how the Btrfs page flags work,
this results in us skipping the crc on the page during IO.

This patch solves the race by waiting until after the extent buffer
is inserted into the radix tree before it sets page private.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c   |  8 ++++++--
 fs/btrfs/extent_io.c | 38 +++++++++++++++++++++++++++++++++++---
 2 files changed, 41 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b36eeef1919..3e1ea3e0477 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -359,10 +359,14 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
 
-	if (page->private == EXTENT_PAGE_PRIVATE)
+	if (page->private == EXTENT_PAGE_PRIVATE) {
+		WARN_ON(1);
 		goto out;
-	if (!page->private)
+	}
+	if (!page->private) {
+		WARN_ON(1);
 		goto out;
+	}
 	len = page->private >> 2;
 	WARN_ON(len == 0);
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 8862dda46ff..0418bf2c975 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1946,6 +1946,7 @@ void set_page_extent_mapped(struct page *page)
 
 static void set_page_extent_head(struct page *page, unsigned long len)
 {
+	WARN_ON(!PagePrivate(page));
 	set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
 }
 
@@ -3195,7 +3196,13 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 		}
 		if (!PageUptodate(p))
 			uptodate = 0;
-		unlock_page(p);
+
+		/*
+		 * see below about how we avoid a nasty race with release page
+		 * and why we unlock later
+		 */
+		if (i != 0)
+			unlock_page(p);
 	}
 	if (uptodate)
 		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
@@ -3219,9 +3226,26 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 	atomic_inc(&eb->refs);
 	spin_unlock(&tree->buffer_lock);
 	radix_tree_preload_end();
+
+	/*
+	 * there is a race where release page may have
+	 * tried to find this extent buffer in the radix
+	 * but failed.  It will tell the VM it is safe to
+	 * reclaim the, and it will clear the page private bit.
+	 * We must make sure to set the page private bit properly
+	 * after the extent buffer is in the radix tree so
+	 * it doesn't get lost
+	 */
+	set_page_extent_mapped(eb->first_page);
+	set_page_extent_head(eb->first_page, eb->len);
+	if (!page0)
+		unlock_page(eb->first_page);
 	return eb;
 
 free_eb:
+	if (eb->first_page && !page0)
+		unlock_page(eb->first_page);
+
 	if (!atomic_dec_and_test(&eb->refs))
 		return exists;
 	btrfs_release_extent_buffer(eb);
@@ -3272,10 +3296,11 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
 			continue;
 
 		lock_page(page);
+		WARN_ON(!PagePrivate(page));
+
+		set_page_extent_mapped(page);
 		if (i == 0)
 			set_page_extent_head(page, eb->len);
-		else
-			set_page_private(page, EXTENT_PAGE_PRIVATE);
 
 		clear_page_dirty_for_io(page);
 		spin_lock_irq(&page->mapping->tree_lock);
@@ -3465,6 +3490,13 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 
 	for (i = start_i; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
+
+		WARN_ON(!PagePrivate(page));
+
+		set_page_extent_mapped(page);
+		if (i == 0)
+			set_page_extent_head(page, eb->len);
+
 		if (inc_all_pages)
 			page_cache_get(page);
 		if (!PageUptodate(page)) {
-- 
cgit v1.2.3


From e3f24cc521cb7ba60ac137abd1939e4e03435e80 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 14 Feb 2011 12:52:08 -0500
Subject: Btrfs: don't release pages when we can't clear the uptodate bits

Btrfs tracks uptodate state in an rbtree as well as in the
page bits.  This is supposed to enable us to use block sizes other than
the page size, but there are a few parts still missing before that
completely works.

But, our readpage routine trusts this additional range based tracking
of uptodateness, much in the same way the buffer head up to date bits
are trusted for the other filesystems.

The problem is that sometimes we need to allocate memory in order to
split records in the rbtree, even when we are just clearing bits.  This
can be difficult when our clearing function is called with GFP_ATOMIC, which
can happen in the releasepage path.

So, what happens today looks like this:

releasepage called with GFP_ATOMIC
btrfs_releasepage calls clear_extent_bit
clear_extent_bit fails to allocate ram, leaving the up to date bit set
btrfs_releasepage returns success

The end result is the page being gone, but btrfs thinking the range is
up to date.   Later on if someone tries to read that same page, the
btrfs readpage code will return immediately thinking the page is already
up to date.

This commit fixes things to fail the releasepage when we can't clear the
extent state bits.  It covers both data pages and metadata tree blocks.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0418bf2c975..e7aeba24270 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2822,9 +2822,17 @@ int try_release_extent_state(struct extent_map_tree *map,
 		 * at this point we can safely clear everything except the
 		 * locked bit and the nodatasum bit
 		 */
-		clear_extent_bit(tree, start, end,
+		ret = clear_extent_bit(tree, start, end,
 				 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
 				 0, 0, NULL, mask);
+
+		/* if clear_extent_bit failed for enomem reasons,
+		 * we can't allow the release to continue.
+		 */
+		if (ret < 0)
+			ret = 0;
+		else
+			ret = 1;
 	}
 	return ret;
 }
-- 
cgit v1.2.3


From 6848ad6461e551849ba3c32d945d4f45e96453a6 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@linux.intel.com>
Date: Mon, 14 Feb 2011 16:00:03 -0500
Subject: Btrfs: Fix balance panic

Mark the cloned backref_node as checked in clone_backref_node()

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/relocation.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 1f5556acb53..0825e4ed944 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1157,6 +1157,7 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
 	new_node->bytenr = dest->node->start;
 	new_node->level = node->level;
 	new_node->lowest = node->lowest;
+	new_node->checked = 1;
 	new_node->root = dest;
 
 	if (!node->lowest) {
-- 
cgit v1.2.3


From 51788b1bdd0d68345bab0af4301e7fa429277228 Mon Sep 17 00:00:00 2001
From: Dan Rosenberg <drosenberg@vsecurity.com>
Date: Mon, 14 Feb 2011 16:04:23 -0500
Subject: btrfs: prevent heap corruption in btrfs_ioctl_space_info()

Commit bf5fc093c5b625e4259203f1cee7ca73488a5620 refactored
btrfs_ioctl_space_info() and introduced several security issues.

space_args.space_slots is an unsigned 64-bit type controlled by a
possibly unprivileged caller.  The comparison as a signed int type
allows providing values that are treated as negative and cause the
subsequent allocation size calculation to wrap, or be truncated to 0.
By providing a size that's truncated to 0, kmalloc() will return
ZERO_SIZE_PTR.  It's also possible to provide a value smaller than the
slot count.  The subsequent loop ignores the allocation size when
copying data in, resulting in a heap overflow or write to ZERO_SIZE_PTR.

The fix changes the slot count type and comparison typecast to u64,
which prevents truncation or signedness errors, and also ensures that we
don't copy more data than we've allocated in the subsequent loop.  Note
that zero-size allocations are no longer possible since there is already
an explicit check for space_args.space_slots being 0 and truncation of
this value is no longer an issue.

Signed-off-by: Dan Rosenberg <drosenberg@vsecurity.com>
Signed-off-by: Josef Bacik <josef@redhat.com>
Reviewed-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 02d224e8c83..be2d4f6aaa5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2208,7 +2208,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
 	int num_types = 4;
 	int alloc_size;
 	int ret = 0;
-	int slot_count = 0;
+	u64 slot_count = 0;
 	int i, c;
 
 	if (copy_from_user(&space_args,
@@ -2247,7 +2247,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
 		goto out;
 	}
 
-	slot_count = min_t(int, space_args.space_slots, slot_count);
+	slot_count = min_t(u64, space_args.space_slots, slot_count);
 
 	alloc_size = sizeof(*dest) * slot_count;
 
@@ -2267,6 +2267,9 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
 	for (i = 0; i < num_types; i++) {
 		struct btrfs_space_info *tmp;
 
+		if (!slot_count)
+			break;
+
 		info = NULL;
 		rcu_read_lock();
 		list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
@@ -2288,7 +2291,10 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
 				memcpy(dest, &space, sizeof(space));
 				dest++;
 				space_args.total_spaces++;
+				slot_count--;
 			}
+			if (!slot_count)
+				break;
 		}
 		up_read(&info->groups_sem);
 	}
-- 
cgit v1.2.3


From 67100f255dba284bcbb5ce795355dad1cff35658 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Sun, 6 Feb 2011 19:58:21 +0000
Subject: Btrfs - Fix memory leak in btrfs_init_new_device()

Memory allocated by calling kstrdup() should be freed.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 7cad59353b0..dadaaa8005c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1603,12 +1603,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 
 	ret = find_next_devid(root, &device->devid);
 	if (ret) {
+		kfree(device->name);
 		kfree(device);
 		goto error;
 	}
 
 	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
+		kfree(device->name);
 		kfree(device);
 		ret = PTR_ERR(trans);
 		goto error;
-- 
cgit v1.2.3


From c26a920373a983b52223eed5a13b97404d8b4158 Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Mon, 14 Feb 2011 00:45:29 +0000
Subject: Btrfs: check return value of alloc_extent_map()

Add checks on the return value of alloc_extent_map() in several places.
In addition, alloc_extent_map() only ever returns a valid address or NULL,
so checking with IS_ERR() is unnecessary; remove the IS_ERR() checks.

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 2 +-
 fs/btrfs/extent_map.c  | 4 ++--
 fs/btrfs/file.c        | 1 +
 fs/btrfs/inode.c       | 3 +++
 4 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 565e22d77b1..a7aaa10c530 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6584,7 +6584,7 @@ static noinline int relocate_data_extent(struct inode *reloc_inode,
 	u64 end = start + extent_key->offset - 1;
 
 	em = alloc_extent_map(GFP_NOFS);
-	BUG_ON(!em || IS_ERR(em));
+	BUG_ON(!em);
 
 	em->start = start;
 	em->len = extent_key->offset;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index b0e1fce1253..2b6c12e983b 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -51,8 +51,8 @@ struct extent_map *alloc_extent_map(gfp_t mask)
 {
 	struct extent_map *em;
 	em = kmem_cache_alloc(extent_map_cache, mask);
-	if (!em || IS_ERR(em))
-		return em;
+	if (!em)
+		return NULL;
 	em->in_tree = 0;
 	em->flags = 0;
 	em->compress_type = BTRFS_COMPRESS_NONE;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b0ff34b9660..65338a1d14a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -185,6 +185,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			split = alloc_extent_map(GFP_NOFS);
 		if (!split2)
 			split2 = alloc_extent_map(GFP_NOFS);
+		BUG_ON(!split || !split2);
 
 		write_lock(&em_tree->lock);
 		em = lookup_extent_mapping(em_tree, start, len);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c9bc0afdbfc..8d392ed73d5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -644,6 +644,7 @@ retry:
 					async_extent->ram_size - 1, 0);
 
 		em = alloc_extent_map(GFP_NOFS);
+		BUG_ON(!em);
 		em->start = async_extent->start;
 		em->len = async_extent->ram_size;
 		em->orig_start = em->start;
@@ -820,6 +821,7 @@ static noinline int cow_file_range(struct inode *inode,
 		BUG_ON(ret);
 
 		em = alloc_extent_map(GFP_NOFS);
+		BUG_ON(!em);
 		em->start = start;
 		em->orig_start = em->start;
 		ram_size = ins.offset;
@@ -1169,6 +1171,7 @@ out_check:
 			struct extent_map_tree *em_tree;
 			em_tree = &BTRFS_I(inode)->extent_tree;
 			em = alloc_extent_map(GFP_NOFS);
+			BUG_ON(!em);
 			em->start = cur_offset;
 			em->orig_start = em->start;
 			em->len = num_bytes;
-- 
cgit v1.2.3


From 844a391799c25d9ba85cbce33e4697db06083ec6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 15 Feb 2011 00:38:26 -0500
Subject: nothing in do_follow_link() is going to see RCU

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index ec4b2d0190a..9ce6d272f4f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -668,9 +668,6 @@ force_reval_path(struct path *path, struct nameidata *nd)
 		return 0;
 
 	if (!status) {
-		/* Don't d_invalidate in rcu-walk mode */
-		if (nameidata_drop_rcu(nd))
-			return -ECHILD;
 		d_invalidate(dentry);
 		status = -ESTALE;
 	}
@@ -777,6 +774,8 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
 	int error;
 	struct dentry *dentry = link->dentry;
 
+	BUG_ON(nd->flags & LOOKUP_RCU);
+
 	touch_atime(link->mnt, dentry);
 	nd_set_link(nd, NULL);
 
@@ -811,6 +810,11 @@ static inline int do_follow_link(struct path *path, struct nameidata *nd)
 {
 	void *cookie;
 	int err = -ELOOP;
+
+	/* We drop rcu-walk here */
+	if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
+		return -ECHILD;
+
 	if (current->link_count >= MAX_NESTED_LINKS)
 		goto loop;
 	if (current->total_link_count >= 40)
@@ -1419,9 +1423,6 @@ exec_again:
 			goto out_dput;
 
 		if (inode->i_op->follow_link) {
-			/* We commonly drop rcu-walk here */
-			if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
-				return -ECHILD;
 			BUG_ON(inode != next.dentry->d_inode);
 			err = do_follow_link(&next, nd);
 			if (err)
@@ -1467,8 +1468,6 @@ last_component:
 			break;
 		if (inode && unlikely(inode->i_op->follow_link) &&
 		    (lookup_flags & LOOKUP_FOLLOW)) {
-			if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
-				return -ECHILD;
 			BUG_ON(inode != next.dentry->d_inode);
 			err = do_follow_link(&next, nd);
 			if (err)
-- 
cgit v1.2.3


From 24643087e748bf192f1182766716e522dc1c972f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 15 Feb 2011 01:26:22 -0500
Subject: in do_lookup() split RCU and non-RCU cases of need_revalidate

and use unlikely() instead of gotos, for fsck sake...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 9ce6d272f4f..7609bacc704 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1259,9 +1259,15 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
 			return -ECHILD;
 
 		nd->seq = seq;
-		if (dentry->d_flags & DCACHE_OP_REVALIDATE)
-			goto need_revalidate;
-done2:
+		if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
+			dentry = do_revalidate(dentry, nd);
+			if (!dentry)
+				goto need_lookup;
+			if (IS_ERR(dentry))
+				goto fail;
+			if (!(nd->flags & LOOKUP_RCU))
+				goto done;
+		}
 		path->mnt = mnt;
 		path->dentry = dentry;
 		if (likely(__follow_mount_rcu(nd, path, inode, false)))
@@ -1274,8 +1280,13 @@ done2:
 	if (!dentry)
 		goto need_lookup;
 found:
-	if (dentry->d_flags & DCACHE_OP_REVALIDATE)
-		goto need_revalidate;
+	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
+		dentry = do_revalidate(dentry, nd);
+		if (!dentry)
+			goto need_lookup;
+		if (IS_ERR(dentry))
+			goto fail;
+	}
 done:
 	path->mnt = mnt;
 	path->dentry = dentry;
@@ -1317,16 +1328,6 @@ need_lookup:
 	mutex_unlock(&dir->i_mutex);
 	goto found;
 
-need_revalidate:
-	dentry = do_revalidate(dentry, nd);
-	if (!dentry)
-		goto need_lookup;
-	if (IS_ERR(dentry))
-		goto fail;
-	if (nd->flags & LOOKUP_RCU)
-		goto done2;
-	goto done;
-
 fail:
 	return PTR_ERR(dentry);
 }
-- 
cgit v1.2.3


From f5e1c1c1afc1d979e2ac6a24cc99ba7143639f4d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 15 Feb 2011 01:32:55 -0500
Subject: split do_revalidate() into RCU and non-RCU cases

fixing oopsen in lookup_one_len()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 47 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 30 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 7609bacc704..a98f7f14178 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -592,12 +592,10 @@ static int d_revalidate(struct dentry *dentry, struct nameidata *nd)
 	return status;
 }
 
-static inline struct dentry *
+static struct dentry *
 do_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-	int status;
-
-	status = d_revalidate(dentry, nd);
+	int status = d_revalidate(dentry, nd);
 	if (unlikely(status <= 0)) {
 		/*
 		 * The dentry failed validation.
@@ -606,24 +604,39 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
 		 * to return a fail status.
 		 */
 		if (status < 0) {
-			/* If we're in rcu-walk, we don't have a ref */
-			if (!(nd->flags & LOOKUP_RCU))
-				dput(dentry);
+			dput(dentry);
 			dentry = ERR_PTR(status);
-
-		} else {
-			/* Don't d_invalidate in rcu-walk mode */
-			if (nameidata_dentry_drop_rcu_maybe(nd, dentry))
-				return ERR_PTR(-ECHILD);
-			if (!d_invalidate(dentry)) {
-				dput(dentry);
-				dentry = NULL;
-			}
+		} else if (!d_invalidate(dentry)) {
+			dput(dentry);
+			dentry = NULL;
 		}
 	}
 	return dentry;
 }
 
+static inline struct dentry *
+do_revalidate_rcu(struct dentry *dentry, struct nameidata *nd)
+{
+	int status = dentry->d_op->d_revalidate(dentry, nd);
+	if (likely(status > 0))
+		return dentry;
+	if (status == -ECHILD) {
+		if (nameidata_dentry_drop_rcu(nd, dentry))
+			return ERR_PTR(-ECHILD);
+		return do_revalidate(dentry, nd);
+	}
+	if (status < 0)
+		return ERR_PTR(status);
+	/* Don't d_invalidate in rcu-walk mode */
+	if (nameidata_dentry_drop_rcu(nd, dentry))
+		return ERR_PTR(-ECHILD);
+	if (!d_invalidate(dentry)) {
+		dput(dentry);
+		dentry = NULL;
+	}
+	return dentry;
+}
+
 static inline int need_reval_dot(struct dentry *dentry)
 {
 	if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
@@ -1260,7 +1273,7 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
 
 		nd->seq = seq;
 		if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
-			dentry = do_revalidate(dentry, nd);
+			dentry = do_revalidate_rcu(dentry, nd);
 			if (!dentry)
 				goto need_lookup;
 			if (IS_ERR(dentry))
-- 
cgit v1.2.3


From f60aef7ec625236a6366722bb1be7b37596bf0ae Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 15 Feb 2011 01:35:28 -0500
Subject: drop out of RCU in return_reval

... thus killing the need to handle drop-from-RCU in d_revalidate()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 25 ++++++-------------------
 1 file changed, 6 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index a98f7f14178..10635d32917 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -571,25 +571,9 @@ void release_open_intent(struct nameidata *nd)
 	}
 }
 
-/*
- * Call d_revalidate and handle filesystems that request rcu-walk
- * to be dropped. This may be called and return in rcu-walk mode,
- * regardless of success or error. If -ECHILD is returned, the caller
- * must return -ECHILD back up the path walk stack so path walk may
- * be restarted in ref-walk mode.
- */
-static int d_revalidate(struct dentry *dentry, struct nameidata *nd)
+static inline int d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-	int status;
-
-	status = dentry->d_op->d_revalidate(dentry, nd);
-	if (status == -ECHILD) {
-		if (nameidata_dentry_drop_rcu(nd, dentry))
-			return status;
-		status = dentry->d_op->d_revalidate(dentry, nd);
-	}
-
-	return status;
+	return dentry->d_op->d_revalidate(dentry, nd);
 }
 
 static struct dentry *
@@ -617,7 +601,7 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
 static inline struct dentry *
 do_revalidate_rcu(struct dentry *dentry, struct nameidata *nd)
 {
-	int status = dentry->d_op->d_revalidate(dentry, nd);
+	int status = d_revalidate(dentry, nd);
 	if (likely(status > 0))
 		return dentry;
 	if (status == -ECHILD) {
@@ -1517,12 +1501,15 @@ return_reval:
 		 * We may need to check the cached dentry for staleness.
 		 */
 		if (need_reval_dot(nd->path.dentry)) {
+			if (nameidata_drop_rcu_last_maybe(nd))
+				return -ECHILD;
 			/* Note: we do not d_invalidate() */
 			err = d_revalidate(nd->path.dentry, nd);
 			if (!err)
 				err = -ESTALE;
 			if (err < 0)
 				break;
+			return 0;
 		}
 return_base:
 		if (nameidata_drop_rcu_last_maybe(nd))
-- 
cgit v1.2.3


From 4e924a4f53a0e1ea060bd50695a12a238b250322 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 15 Feb 2011 01:42:59 -0500
Subject: get rid of nameidata_dentry_drop_rcu() calling nameidata_drop_rcu()

can't happen anymore and didn't work right anyway

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 10635d32917..9e701e28a32 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -455,14 +455,6 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
 	struct fs_struct *fs = current->fs;
 	struct dentry *parent = nd->path.dentry;
 
-	/*
-	 * It can be possible to revalidate the dentry that we started
-	 * the path walk with. force_reval_path may also revalidate the
-	 * dentry already committed to the nameidata.
-	 */
-	if (unlikely(parent == dentry))
-		return nameidata_drop_rcu(nd);
-
 	BUG_ON(!(nd->flags & LOOKUP_RCU));
 	if (nd->root.mnt) {
 		spin_lock(&fs->lock);
-- 
cgit v1.2.3


From 8272f4c9c5a46049a66dad5924850de5d1ef2269 Mon Sep 17 00:00:00 2001
From: Paul Bolle <pebolle@tiscali.nl>
Date: Tue, 15 Feb 2011 00:05:34 +0100
Subject: fuse/cuse: fix comment typo initilaization

Signed-off-by: Paul Bolle <pebolle@tiscali.nl>
Reviewed-by: Jesper Juhl <jj@chaosbits.net>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 fs/fuse/cuse.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 3e87cce5837..7c39b885f96 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -458,7 +458,7 @@ static void cuse_fc_release(struct fuse_conn *fc)
  * @file: file struct being opened
  *
  * Userland CUSE server can create a CUSE device by opening /dev/cuse
- * and replying to the initilaization request kernel sends.  This
+ * and replying to the initialization request kernel sends.  This
  * function is responsible for handling CUSE device initialization.
  * Because the fd opened by this function is used during
  * initialization, this function only creates cuse_conn and sends
-- 
cgit v1.2.3


From fd018fe8234e84e05ab05d4176525817c8779cd6 Mon Sep 17 00:00:00 2001
From: Paul Bolle <pebolle@tiscali.nl>
Date: Tue, 15 Feb 2011 00:05:43 +0100
Subject: ext4: fix comment typo uninitized

Signed-off-by: Paul Bolle <pebolle@tiscali.nl>
Reviewed-by: Jesper Juhl <jj@chaosbits.net>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 fs/ext4/extents.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 686240e89df..7516fb9c0bd 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2844,7 +2844,7 @@ fix_extent_len:
  * ext4_get_blocks_dio_write() when DIO to write
  * to an uninitialized extent.
  *
- * Writing to an uninitized extent may result in splitting the uninitialized
+ * Writing to an uninitialized extent may result in splitting the uninitialized
  * extent into multiple /initialized uninitialized extents (up to three)
  * There are three possibilities:
  *   a> There is no split required: Entire extent should be uninitialized
-- 
cgit v1.2.3


From 261cd298a8c363d7985e3482946edb4bfedacf98 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 15 Feb 2011 09:43:32 +0100
Subject: s390: remove task_show_regs

task_show_regs used to be a debugging aid in the early bringup days
of Linux on s390. /proc/<pid>/status is a world-readable file, so it
is not a good idea to show the registers of a process. The only
correct fix is to remove task_show_regs.

Reported-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/array.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index df2b703b9d0..7c99c1cf7e5 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -353,9 +353,6 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 	task_cap(m, task);
 	task_cpus_allowed(m, task);
 	cpuset_task_status_allowed(m, task);
-#if defined(CONFIG_S390)
-	task_show_regs(m, task);
-#endif
 	task_context_switch_counts(m, task);
 	return 0;
 }
-- 
cgit v1.2.3


From 58a69cb47ec6991bf006a3e5d202e8571b0327a4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 09:25:31 +0100
Subject: workqueue, freezer: unify spelling of 'freeze' + 'able' to
 'freezable'

There are two spellings in use for 'freeze' + 'able' - 'freezable' and
'freezeable'.  The former is the more prominent one.  The latter is
mostly used by workqueue and in a few other odd places.  Unify the
spelling to 'freezable'.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Alan Stern <stern@rowland.harvard.edu>
Acked-by: "Rafael J. Wysocki" <rjw@sisk.pl>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Acked-by: Dmitry Torokhov <dtor@mail.ru>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Alex Dubov <oakad@yahoo.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c | 4 ++--
 fs/gfs2/main.c  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 08a8beb152e..7cd9a5a68d5 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1779,11 +1779,11 @@ int __init gfs2_glock_init(void)
 #endif
 
 	glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
-					  WQ_HIGHPRI | WQ_FREEZEABLE, 0);
+					  WQ_HIGHPRI | WQ_FREEZABLE, 0);
 	if (IS_ERR(glock_workqueue))
 		return PTR_ERR(glock_workqueue);
 	gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
-						WQ_MEM_RECLAIM | WQ_FREEZEABLE,
+						WQ_MEM_RECLAIM | WQ_FREEZABLE,
 						0);
 	if (IS_ERR(gfs2_delete_workqueue)) {
 		destroy_workqueue(glock_workqueue);
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index ebef7ab6e17..85ba027d1c4 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -144,7 +144,7 @@ static int __init init_gfs2_fs(void)
 
 	error = -ENOMEM;
 	gfs_recovery_wq = alloc_workqueue("gfs_recovery",
-					  WQ_MEM_RECLAIM | WQ_FREEZEABLE, 0);
+					  WQ_MEM_RECLAIM | WQ_FREEZABLE, 0);
 	if (!gfs_recovery_wq)
 		goto fail_wq;
 
-- 
cgit v1.2.3


From 3abb17e82f08628b59e20d8cbcb55e2204180f69 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 16 Feb 2011 08:56:55 -0800
Subject: vfs: fix BUG_ON() in fs/namei.c:1461

When Al moved the nameidata_dentry_drop_rcu_maybe() call into the
do_follow_link function in commit 844a391799c2 ("nothing in
do_follow_link() is going to see RCU"), he mistakenly left the

	BUG_ON(inode != path->dentry->d_inode);

behind.  Which would otherwise be ok, but that BUG_ON() really needs to
be _after_ dropping RCU, since the dentry isn't necessarily stable
otherwise.

So complete the code movement in that commit, and move the BUG_ON() into
do_follow_link() too.  This means that we need to pass in 'inode' as an
argument (just for this one use), but that's a small thing.  And
eventually we may be confident enough in our path lookup that we can
just remove the BUG_ON() and the unnecessary inode argument.

Reported-and-tested-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namei.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 9e701e28a32..0087cf9c2c6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -795,7 +795,7 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
  * Without that kind of total limit, nasty chains of consecutive
  * symlinks can cause almost arbitrarily long lookups. 
  */
-static inline int do_follow_link(struct path *path, struct nameidata *nd)
+static inline int do_follow_link(struct inode *inode, struct path *path, struct nameidata *nd)
 {
 	void *cookie;
 	int err = -ELOOP;
@@ -803,6 +803,7 @@ static inline int do_follow_link(struct path *path, struct nameidata *nd)
 	/* We drop rcu-walk here */
 	if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
 		return -ECHILD;
+	BUG_ON(inode != path->dentry->d_inode);
 
 	if (current->link_count >= MAX_NESTED_LINKS)
 		goto loop;
@@ -1413,8 +1414,7 @@ exec_again:
 			goto out_dput;
 
 		if (inode->i_op->follow_link) {
-			BUG_ON(inode != next.dentry->d_inode);
-			err = do_follow_link(&next, nd);
+			err = do_follow_link(inode, &next, nd);
 			if (err)
 				goto return_err;
 			nd->inode = nd->path.dentry->d_inode;
@@ -1458,8 +1458,7 @@ last_component:
 			break;
 		if (inode && unlikely(inode->i_op->follow_link) &&
 		    (lookup_flags & LOOKUP_FOLLOW)) {
-			BUG_ON(inode != next.dentry->d_inode);
-			err = do_follow_link(&next, nd);
+			err = do_follow_link(inode, &next, nd);
 			if (err)
 				goto return_err;
 			nd->inode = nd->path.dentry->d_inode;
-- 
cgit v1.2.3


From 91435650c233b93e0da389db74f4b2c11c5ad2d4 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 16 Feb 2011 13:10:41 -0500
Subject: Btrfs: put ENOSPC debugging under a mount option

ENOSPC in btrfs is getting to the point where the extra debugging isn't
required.  I've put it under mount -o enospc_debug just in case someone
is having difficult problems.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       | 1 +
 fs/btrfs/extent-tree.c | 2 +-
 fs/btrfs/super.c       | 7 ++++++-
 3 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 72195378bef..6297701bc19 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1254,6 +1254,7 @@ struct btrfs_root {
 #define BTRFS_MOUNT_SPACE_CACHE		(1 << 12)
 #define BTRFS_MOUNT_CLEAR_CACHE		(1 << 13)
 #define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
+#define BTRFS_MOUNT_ENOSPC_DEBUG	 (1 << 15)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a7aaa10c530..d375fc04a06 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5377,7 +5377,7 @@ again:
 			       num_bytes, data, 1);
 		goto again;
 	}
-	if (ret == -ENOSPC) {
+	if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) {
 		struct btrfs_space_info *sinfo;
 
 		sinfo = __find_space_info(root->fs_info, data);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0209b5fc772..db0a827252b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -155,7 +155,8 @@ enum {
 	Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
 	Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
 	Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
-	Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_err,
+	Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
+	Opt_enospc_debug, Opt_err,
 };
 
 static match_table_t tokens = {
@@ -184,6 +185,7 @@ static match_table_t tokens = {
 	{Opt_space_cache, "space_cache"},
 	{Opt_clear_cache, "clear_cache"},
 	{Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
+	{Opt_enospc_debug, "enospc_debug"},
 	{Opt_err, NULL},
 };
 
@@ -358,6 +360,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 		case Opt_user_subvol_rm_allowed:
 			btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
 			break;
+		case Opt_enospc_debug:
+			btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
+			break;
 		case Opt_err:
 			printk(KERN_INFO "btrfs: unrecognized mount option "
 			       "'%s'\n", p);
-- 
cgit v1.2.3


From c87f08ca44e83b2c8d28f63f9c33f3a270a04bbe Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 16 Feb 2011 13:57:04 -0500
Subject: Btrfs: allow balance to explicitly allocate chunks as it relocates

Btrfs device shrinking and balancing ends up reallocating all the blocks
in order to allow COW to move them to new destinations.  It is somewhat
awkward in terms of ENOSPC because most of the enospc code is built
around the idea that some operation on a reference counted tree triggers
allocations in the non-reference counted trees.

This commit changes the balancing code to deal with enospc by trying to
allocate a new chunk.  If that allocation succeeds, we go ahead and
retry whatever failed due to enospc.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  2 ++
 fs/btrfs/extent-tree.c |  7 +++++++
 fs/btrfs/relocation.c  | 13 ++++++++++++-
 3 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6297701bc19..28188a786da 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2219,6 +2219,8 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root,
 				   u64 start, u64 end);
 int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
 			       u64 num_bytes);
+int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, u64 type);
 
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d375fc04a06..100e409e905 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -8066,6 +8066,13 @@ out:
 	return ret;
 }
 
+int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, u64 type)
+{
+	u64 alloc_flags = get_alloc_profile(root, type);
+	return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
+}
+
 /*
  * helper to account the unused space of all the readonly block group in the
  * list. takes mirrors into account.
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 0825e4ed944..31ade5802ae 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3654,6 +3654,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 	u32 item_size;
 	int ret;
 	int err = 0;
+	int progress = 0;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -3666,9 +3667,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 	}
 
 	while (1) {
+		progress++;
 		trans = btrfs_start_transaction(rc->extent_root, 0);
 		BUG_ON(IS_ERR(trans));
-
+restart:
 		if (update_backref_cache(trans, &rc->backref_cache)) {
 			btrfs_end_transaction(trans, rc->extent_root);
 			continue;
@@ -3781,6 +3783,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 			}
 		}
 	}
+	if (trans && progress && err == -ENOSPC) {
+		ret = btrfs_force_chunk_alloc(trans, rc->extent_root,
+					      rc->block_group->flags);
+		if (ret == 0) {
+			err = 0;
+			progress = 0;
+			goto restart;
+		}
+	}
 
 	btrfs_release_path(rc->extent_root, path);
 	clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
-- 
cgit v1.2.3


From b4dc2b8c694ead005b828f5fb7fa1134db5b6275 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 16 Feb 2011 06:06:34 +0000
Subject: Btrfs: Fix BTRFS_IOC_SUBVOL_SETFLAGS ioctl

- Check user-specified flags correctly
- Check the inode ownership
- Search root item in root tree but not fs tree

Reported-by: Dan Rosenberg <drosenberg@vsecurity.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index be2d4f6aaa5..5fdb2abc4fa 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1071,12 +1071,15 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
 	if (copy_from_user(&flags, arg, sizeof(flags)))
 		return -EFAULT;
 
-	if (flags & ~BTRFS_SUBVOL_CREATE_ASYNC)
+	if (flags & BTRFS_SUBVOL_CREATE_ASYNC)
 		return -EINVAL;
 
 	if (flags & ~BTRFS_SUBVOL_RDONLY)
 		return -EOPNOTSUPP;
 
+	if (!is_owner_or_cap(inode))
+		return -EACCES;
+
 	down_write(&root->fs_info->subvol_sem);
 
 	/* nothing to do */
@@ -1097,7 +1100,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
 		goto out_reset;
 	}
 
-	ret = btrfs_update_root(trans, root,
+	ret = btrfs_update_root(trans, root->fs_info->tree_root,
 				&root->root_key, &root->root_item);
 
 	btrfs_commit_transaction(trans, root);
-- 
cgit v1.2.3


From ca9b688c1c9a21635cfc8af8b68565b154185196 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 16 Feb 2011 06:06:41 +0000
Subject: Btrfs: Avoid accessing unmapped kernel address

When decompressing a chunk of data, we'll copy the data out to
a working buffer if the data is stored in more than one page,
otherwise we'll use the mapped page directly to avoid memory
copy.

In the latter case, we'll end up accessing the kernel address
after we've unmapped the page in a corner case.

Reported-by: Juan Francisco Cantero Hurtado <iam@juanfra.info>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/lzo.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index cc9b450399d..a178f5ebea7 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -280,6 +280,7 @@ static int lzo_decompress_biovec(struct list_head *ws,
 	unsigned long tot_out;
 	unsigned long tot_len;
 	char *buf;
+	bool may_late_unmap, need_unmap;
 
 	data_in = kmap(pages_in[0]);
 	tot_len = read_compress_length(data_in);
@@ -300,11 +301,13 @@ static int lzo_decompress_biovec(struct list_head *ws,
 
 		tot_in += in_len;
 		working_bytes = in_len;
+		may_late_unmap = need_unmap = false;
 
 		/* fast path: avoid using the working buffer */
 		if (in_page_bytes_left >= in_len) {
 			buf = data_in + in_offset;
 			bytes = in_len;
+			may_late_unmap = true;
 			goto cont;
 		}
 
@@ -329,14 +332,17 @@ cont:
 				if (working_bytes == 0 && tot_in >= tot_len)
 					break;
 
-				kunmap(pages_in[page_in_index]);
-				page_in_index++;
-				if (page_in_index >= total_pages_in) {
+				if (page_in_index + 1 >= total_pages_in) {
 					ret = -1;
-					data_in = NULL;
 					goto done;
 				}
-				data_in = kmap(pages_in[page_in_index]);
+
+				if (may_late_unmap)
+					need_unmap = true;
+				else
+					kunmap(pages_in[page_in_index]);
+
+				data_in = kmap(pages_in[++page_in_index]);
 
 				in_page_bytes_left = PAGE_CACHE_SIZE;
 				in_offset = 0;
@@ -346,6 +352,8 @@ cont:
 		out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE);
 		ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
 					    &out_len);
+		if (need_unmap)
+			kunmap(pages_in[page_in_index - 1]);
 		if (ret != LZO_E_OK) {
 			printk(KERN_WARNING "btrfs decompress failed\n");
 			ret = -1;
@@ -363,8 +371,7 @@ cont:
 			break;
 	}
 done:
-	if (data_in)
-		kunmap(pages_in[page_in_index]);
+	kunmap(pages_in[page_in_index]);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 9b3517e9136824346227b7b04f8f7ea1f3a726cc Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Tue, 15 Feb 2011 18:14:25 +0000
Subject: Btrfs: make btrfs_rm_device() fail gracefully

If the shrinking done as part of online device removal fails, add that
device back to the allocation list and increment the rw_devices counter.
This fixes two bugs:

1) we could have a perfectly good device out of alloc list for no good
reason;

2) in the btrfs consisting of two devices, failure in btrfs_rm_device()
could lead to a situation where it was impossible to remove any of the
devices because of the "unable to remove the only writeable device"
error.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dadaaa8005c..f31c33119bb 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1337,11 +1337,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
 	ret = btrfs_shrink_device(device, 0);
 	if (ret)
-		goto error_brelse;
+		goto error_undo;
 
 	ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
 	if (ret)
-		goto error_brelse;
+		goto error_undo;
 
 	device->in_fs_metadata = 0;
 
@@ -1415,6 +1415,13 @@ out:
 	mutex_unlock(&root->fs_info->volume_mutex);
 	mutex_unlock(&uuid_mutex);
 	return ret;
+error_undo:
+	if (device->writeable) {
+		list_add(&device->dev_alloc_list,
+			 &root->fs_info->fs_devices->alloc_list);
+		root->fs_info->fs_devices->rw_devices++;
+	}
+	goto error_brelse;
 }
 
 /*
-- 
cgit v1.2.3


From fb01aa85b8b29c1a4e1f4a28ea54175de6bf7559 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Tue, 15 Feb 2011 18:12:57 +0000
Subject: Btrfs: set FMODE_EXCL in btrfs_device->mode

This fixes a bug introduced in d4d77629, where the device added online
(and therefore initialized via btrfs_init_new_device()) would be left
with the positive bdev->bd_holders after unmount.  Since d4d77629 we no
longer OR FMODE_EXCL explicitly on blkdev_put(), set it in
btrfs_device->mode.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f31c33119bb..94334d95228 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1639,7 +1639,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	device->dev_root = root->fs_info->dev_root;
 	device->bdev = bdev;
 	device->in_fs_metadata = 1;
-	device->mode = 0;
+	device->mode = FMODE_EXCL;
 	set_blocksize(device->bdev, 4096);
 
 	if (seeding_dev) {
-- 
cgit v1.2.3


From 47c85291d3dd1a51501555000b90f8e281a0458e Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 16 Feb 2011 13:08:35 +1100
Subject: nfsd: correctly handle return value from nfsd_map_name_to_*

These functions return an nfs status, not a host_err.  So don't
try to convert it before returning.

This is a regression introduced by
3c726023402a2f3b28f49b9d90ebf9e71151157d; I fixed up two of the callers,
but missed these two.

Cc: stable@kernel.org
Reported-by: Herbert Poetzl <herbert@13thfloor.at>
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4xdr.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 956629b9cdc..1275b865507 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -317,8 +317,8 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
 		READ_BUF(dummy32);
 		len += (XDR_QUADLEN(dummy32) << 2);
 		READMEM(buf, dummy32);
-		if ((host_err = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid)))
-			goto out_nfserr;
+		if ((status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid)))
+			return status;
 		iattr->ia_valid |= ATTR_UID;
 	}
 	if (bmval[1] & FATTR4_WORD1_OWNER_GROUP) {
@@ -328,8 +328,8 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
 		READ_BUF(dummy32);
 		len += (XDR_QUADLEN(dummy32) << 2);
 		READMEM(buf, dummy32);
-		if ((host_err = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid)))
-			goto out_nfserr;
+		if ((status = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid)))
+			return status;
 		iattr->ia_valid |= ATTR_GID;
 	}
 	if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) {
-- 
cgit v1.2.3


From e51900f7d38cbcfb481d84567fd92540e7e1d23a Mon Sep 17 00:00:00 2001
From: Chuck Ebbert <cebbert@redhat.com>
Date: Wed, 16 Feb 2011 18:11:53 -0500
Subject: block: revert block_dev read-only check

This reverts commit 75f1dc0d076d ("block: check bdev_read_only() from
blkdev_get()").  That commit added stricter checking to make sure
devices that were being used read-only were actually opened in that
mode.

It turns out that the change breaks a bunch of kernel code that opens
block devices.  Affected systems include dm, md, and the loop device.
Because strict checking for read-only opens of block devices was not
done before this, the code that opens the devices was opening them
read-write even if they were being used read-only.  Auditing all that
code will take time, and new userspace packages for dm, mdadm, etc.
will also be required.

Signed-off-by: Chuck Ebbert <cebbert@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/block_dev.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 333a7bb4cb9..4fb8a343153 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1215,12 +1215,6 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
 
 	res = __blkdev_get(bdev, mode, 0);
 
-	/* __blkdev_get() may alter read only status, check it afterwards */
-	if (!res && (mode & FMODE_WRITE) && bdev_read_only(bdev)) {
-		__blkdev_put(bdev, mode, 0);
-		res = -EACCES;
-	}
-
 	if (whole) {
 		/* finish claiming */
 		mutex_lock(&bdev->bd_mutex);
@@ -1298,6 +1292,11 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
 	if (err)
 		return ERR_PTR(err);
 
+	if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) {
+		blkdev_put(bdev, mode);
+		return ERR_PTR(-EACCES);
+	}
+
 	return bdev;
 }
 EXPORT_SYMBOL(blkdev_get_by_path);
-- 
cgit v1.2.3


From 9616125611ee47693186533d76e403856a36b3c8 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 16 Feb 2011 09:34:16 -0500
Subject: cifs: fix handling of scopeid in cifs_convert_address
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The code finds the '%' sign in an ipv6 address and copies that to a
buffer allocated on the stack. It then ignores that buffer, and passes
'pct' to simple_strtoul(), which doesn't work right because we're
comparing 'endp' against a completely different string.

Fix it by passing the correct pointer. While we're at it, this is a
good candidate for conversion to strict_strtoul as well.

Cc: stable@kernel.org
Cc: David Howells <dhowells@redhat.com>
Reported-by: Björn JACKE <bj@sernet.de>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/netmisc.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 8d9189f6447..79f641eeda3 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -170,7 +170,7 @@ cifs_convert_address(struct sockaddr *dst, const char *src, int len)
 {
 	int rc, alen, slen;
 	const char *pct;
-	char *endp, scope_id[13];
+	char scope_id[13];
 	struct sockaddr_in *s4 = (struct sockaddr_in *) dst;
 	struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst;
 
@@ -197,9 +197,9 @@ cifs_convert_address(struct sockaddr *dst, const char *src, int len)
 		memcpy(scope_id, pct + 1, slen);
 		scope_id[slen] = '\0';
 
-		s6->sin6_scope_id = (u32) simple_strtoul(pct, &endp, 0);
-		if (endp != scope_id + slen)
-			return 0;
+		rc = strict_strtoul(scope_id, 0,
+					(unsigned long *)&s6->sin6_scope_id);
+		rc = (rc == 0) ? 1 : 0;
 	}
 
 	return rc;
-- 
cgit v1.2.3


From b0a4bb830e099a31bec79452268639a7d8c2c1e4 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Sat, 22 Jan 2011 15:31:32 +0900
Subject: fs: update comments to point correct document

dcache-locking.txt does not exist any more, and the path was not
correct anyway. Fix it.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 fs/dcache.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 2a6bd9a4ae9..79802bd790e 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1781,7 +1781,7 @@ struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name,
 	 * false-negative result. d_lookup() protects against concurrent
 	 * renames using rename_lock seqlock.
 	 *
-	 * See Documentation/vfs/dcache-locking.txt for more details.
+	 * See Documentation/filesystems/path-lookup.txt for more details.
 	 */
 	hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) {
 		struct inode *i;
@@ -1901,7 +1901,7 @@ struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
 	 * false-negative result. d_lookup() protects against concurrent
 	 * renames using rename_lock seqlock.
 	 *
-	 * See Documentation/vfs/dcache-locking.txt for more details.
+	 * See Documentation/filesystems/path-lookup.txt for more details.
 	 */
 	rcu_read_lock();
 	
-- 
cgit v1.2.3


From bf6a41db7726e6c09b9c6ac993457b7260473406 Mon Sep 17 00:00:00 2001
From: Daniel Baluta <daniel.baluta@gmail.com>
Date: Sun, 30 Jan 2011 23:42:29 +0200
Subject: fs/eventpoll.c: fix spelling

eventpoll.c has wonderful comments but some annoying typos
sneaked in:
	* toepoll_ctl -> to epoll_ctl
	* rapresent -> represents
	* sructure -> structure
	* machanism -> mechanism
	* trasfering -> transferring

Signed-off-by: Daniel Baluta <daniel.baluta@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 fs/eventpoll.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 267d0ada454..7513066cca2 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -62,7 +62,7 @@
  * This mutex is acquired by ep_free() during the epoll file
  * cleanup path and it is also acquired by eventpoll_release_file()
  * if a file has been pushed inside an epoll set and it is then
- * close()d without a previous call toepoll_ctl(EPOLL_CTL_DEL).
+ * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
  * It is possible to drop the "ep->mtx" and to use the global
  * mutex "epmutex" (together with "ep->lock") to have it working,
  * but having "ep->mtx" will make the interface more scalable.
@@ -145,11 +145,11 @@ struct epitem {
 
 /*
  * This structure is stored inside the "private_data" member of the file
- * structure and rapresent the main data sructure for the eventpoll
+ * structure and represents the main data structure for the eventpoll
  * interface.
  */
 struct eventpoll {
-	/* Protect the this structure access */
+	/* Protect the access to this structure */
 	spinlock_t lock;
 
 	/*
@@ -783,7 +783,7 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
 
 /*
  * This is the callback that is passed to the wait queue wakeup
- * machanism. It is called by the stored file descriptors when they
+ * mechanism. It is called by the stored file descriptors when they
  * have events to report.
  */
 static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
@@ -814,9 +814,9 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
 		goto out_unlock;
 
 	/*
-	 * If we are trasfering events to userspace, we can hold no locks
+	 * If we are transferring events to userspace, we can hold no locks
 	 * (because we're accessing user memory, and because of linux f_op->poll()
-	 * semantics). All the events that happens during that period of time are
+	 * semantics). All the events that happen during that period of time are
 	 * chained in ep->ovflist and requeued later on.
 	 */
 	if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
-- 
cgit v1.2.3


From fa7ea87a057958a8b7926c1a60a3ca6d696328ed Mon Sep 17 00:00:00 2001
From: Timo Warns <warns@pre-sense.de>
Date: Thu, 17 Feb 2011 22:27:40 +0100
Subject: fs/partitions: Validate map_count in Mac partition tables

Validate number of blocks in map and remove redundant variable.

Signed-off-by: Timo Warns <warns@pre-sense.de>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/partitions/mac.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c
index 68d6a216ee7..11f688bd76c 100644
--- a/fs/partitions/mac.c
+++ b/fs/partitions/mac.c
@@ -29,10 +29,9 @@ static inline void mac_fix_string(char *stg, int len)
 
 int mac_partition(struct parsed_partitions *state)
 {
-	int slot = 1;
 	Sector sect;
 	unsigned char *data;
-	int blk, blocks_in_map;
+	int slot, blocks_in_map;
 	unsigned secsize;
 #ifdef CONFIG_PPC_PMAC
 	int found_root = 0;
@@ -59,10 +58,14 @@ int mac_partition(struct parsed_partitions *state)
 		put_dev_sector(sect);
 		return 0;		/* not a MacOS disk */
 	}
-	strlcat(state->pp_buf, " [mac]", PAGE_SIZE);
 	blocks_in_map = be32_to_cpu(part->map_count);
-	for (blk = 1; blk <= blocks_in_map; ++blk) {
-		int pos = blk * secsize;
+	if (blocks_in_map < 0 || blocks_in_map >= DISK_MAX_PARTS) {
+		put_dev_sector(sect);
+		return 0;
+	}
+	strlcat(state->pp_buf, " [mac]", PAGE_SIZE);
+	for (slot = 1; slot <= blocks_in_map; ++slot) {
+		int pos = slot * secsize;
 		put_dev_sector(sect);
 		data = read_part_sector(state, pos/512, &sect);
 		if (!data)
@@ -113,13 +116,11 @@ int mac_partition(struct parsed_partitions *state)
 			}
 
 			if (goodness > found_root_goodness) {
-				found_root = blk;
+				found_root = slot;
 				found_root_goodness = goodness;
 			}
 		}
 #endif /* CONFIG_PPC_PMAC */
-
-		++slot;
 	}
 #ifdef CONFIG_PPC_PMAC
 	if (found_root_goodness)
-- 
cgit v1.2.3


From 8787c7a3e0e3f1aa21856d6b6cd6880cc93497e9 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Thu, 17 Feb 2011 18:51:24 -0600
Subject: eCryptfs: Revert "dont call lookup_one_len to avoid NULL nameidata"

This reverts commit 21edad32205e97dc7ccb81a85234c77e760364c8 and commit
93c3fe40c279f002906ad14584c30671097d4394, which fixed a regression introduced by
the former.

Al Viro pointed out bypassed dcache lookups in
ecryptfs_new_lower_dentry(), misuse of vfs_path_lookup() in
ecryptfs_lookup_one_lower() and a dislike of passing nameidata to the
lower filesystem.

Reported-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/inode.c | 106 ++++++----------------------------------------------
 1 file changed, 12 insertions(+), 94 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index bd33f87a190..fc44823fea3 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -348,75 +348,6 @@ out:
 	return rc;
 }
 
-/**
- * ecryptfs_new_lower_dentry
- * @name: The name of the new dentry.
- * @lower_dir_dentry: Parent directory of the new dentry.
- * @nd: nameidata from last lookup.
- *
- * Create a new dentry or get it from lower parent dir.
- */
-static struct dentry *
-ecryptfs_new_lower_dentry(struct qstr *name, struct dentry *lower_dir_dentry,
-			  struct nameidata *nd)
-{
-	struct dentry *new_dentry;
-	struct dentry *tmp;
-	struct inode *lower_dir_inode;
-
-	lower_dir_inode = lower_dir_dentry->d_inode;
-
-	tmp = d_alloc(lower_dir_dentry, name);
-	if (!tmp)
-		return ERR_PTR(-ENOMEM);
-
-	mutex_lock(&lower_dir_inode->i_mutex);
-	new_dentry = lower_dir_inode->i_op->lookup(lower_dir_inode, tmp, nd);
-	mutex_unlock(&lower_dir_inode->i_mutex);
-
-	if (!new_dentry)
-		new_dentry = tmp;
-	else
-		dput(tmp);
-
-	return new_dentry;
-}
-
-
-/**
- * ecryptfs_lookup_one_lower
- * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
- * @lower_dir_dentry: lower parent directory
- * @name: lower file name
- *
- * Get the lower dentry from vfs. If lower dentry does not exist yet,
- * create it.
- */
-static struct dentry *
-ecryptfs_lookup_one_lower(struct dentry *ecryptfs_dentry,
-			  struct dentry *lower_dir_dentry, struct qstr *name)
-{
-	struct nameidata nd;
-	struct vfsmount *lower_mnt;
-	int err;
-
-	lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
-				    ecryptfs_dentry->d_parent));
-	err = vfs_path_lookup(lower_dir_dentry, lower_mnt, name->name , 0, &nd);
-	mntput(lower_mnt);
-
-	if (!err) {
-		/* we dont need the mount */
-		mntput(nd.path.mnt);
-		return nd.path.dentry;
-	}
-	if (err != -ENOENT)
-		return ERR_PTR(err);
-
-	/* create a new lower dentry */
-	return ecryptfs_new_lower_dentry(name, lower_dir_dentry, &nd);
-}
-
 /**
  * ecryptfs_lookup
  * @ecryptfs_dir_inode: The eCryptfs directory inode
@@ -434,7 +365,6 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 	size_t encrypted_and_encoded_name_size;
 	struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
 	struct dentry *lower_dir_dentry, *lower_dentry;
-	struct qstr lower_name;
 	int rc = 0;
 
 	if ((ecryptfs_dentry->d_name.len == 1
@@ -444,20 +374,14 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 		goto out_d_drop;
 	}
 	lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
-	lower_name.name = ecryptfs_dentry->d_name.name;
-	lower_name.len = ecryptfs_dentry->d_name.len;
-	lower_name.hash = ecryptfs_dentry->d_name.hash;
-	if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
-		rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
-				lower_dir_dentry->d_inode, &lower_name);
-		if (rc < 0)
-			goto out_d_drop;
-	}
-	lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
-						 lower_dir_dentry, &lower_name);
+	mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
+	lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
+				      lower_dir_dentry,
+				      ecryptfs_dentry->d_name.len);
+	mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
 	if (IS_ERR(lower_dentry)) {
 		rc = PTR_ERR(lower_dentry);
-		ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned "
+		ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
 				"[%d] on lower_dentry = [%s]\n", __func__, rc,
 				encrypted_and_encoded_name);
 		goto out_d_drop;
@@ -479,20 +403,14 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 		       "filename; rc = [%d]\n", __func__, rc);
 		goto out_d_drop;
 	}
-	lower_name.name = encrypted_and_encoded_name;
-	lower_name.len = encrypted_and_encoded_name_size;
-	lower_name.hash = full_name_hash(lower_name.name, lower_name.len);
-	if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
-		rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
-				lower_dir_dentry->d_inode, &lower_name);
-		if (rc < 0)
-			goto out_d_drop;
-	}
-	lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
-						 lower_dir_dentry, &lower_name);
+	mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
+	lower_dentry = lookup_one_len(encrypted_and_encoded_name,
+				      lower_dir_dentry,
+				      encrypted_and_encoded_name_size);
+	mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
 	if (IS_ERR(lower_dentry)) {
 		rc = PTR_ERR(lower_dentry);
-		ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned "
+		ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
 				"[%d] on lower_dentry = [%s]\n", __func__, rc,
 				encrypted_and_encoded_name);
 		goto out_d_drop;
-- 
cgit v1.2.3


From 25d41d8455ec1ee7433e146ee94436dc4195f420 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 7 Feb 2011 15:00:27 +0100
Subject: debugfs: Fix filesystem reference counting on debugfs_remove()
 failure

When __debugfs_remove() fails (because simple_rmdir() fails e.g. when a
directory is not empty), we must not decrement use count of the filesystem
as nothing was in fact deleted.

This fixes use after free caused by debugfs in some cases.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/debugfs/inode.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index d38c88fb63a..e7a7a2f0732 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -307,7 +307,7 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
 }
 EXPORT_SYMBOL_GPL(debugfs_create_symlink);
 
-static void __debugfs_remove(struct dentry *dentry, struct dentry *parent)
+static int __debugfs_remove(struct dentry *dentry, struct dentry *parent)
 {
 	int ret = 0;
 
@@ -330,6 +330,7 @@ static void __debugfs_remove(struct dentry *dentry, struct dentry *parent)
 			dput(dentry);
 		}
 	}
+	return ret;
 }
 
 /**
@@ -348,7 +349,8 @@ static void __debugfs_remove(struct dentry *dentry, struct dentry *parent)
 void debugfs_remove(struct dentry *dentry)
 {
 	struct dentry *parent;
-	
+	int ret;
+
 	if (!dentry)
 		return;
 
@@ -357,9 +359,10 @@ void debugfs_remove(struct dentry *dentry)
 		return;
 
 	mutex_lock(&parent->d_inode->i_mutex);
-	__debugfs_remove(dentry, parent);
+	ret = __debugfs_remove(dentry, parent);
 	mutex_unlock(&parent->d_inode->i_mutex);
-	simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+	if (!ret)
+		simple_release_fs(&debugfs_mount, &debugfs_mount_count);
 }
 EXPORT_SYMBOL_GPL(debugfs_remove);
 
-- 
cgit v1.2.3


From 97d79b403ef03f729883246208ef5d8a2ebc4d68 Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Tue, 18 Jan 2011 13:37:28 -0800
Subject: ceph: keep reference to parent inode on ceph_dentry

When creating a new dentry we now hold a reference to the parent
inode in the ceph_dentry.  This is required due to the new RCU
changes from 949854d0, which set dentry->d_parent to NULL in d_kill before
calling the ->release() callback.  If/when that behavior is changed, we can
revert this hack.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c   | 5 ++++-
 fs/ceph/super.h | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 562f9884a4d..6bfaa6a4ec4 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -60,6 +60,7 @@ int ceph_init_dentry(struct dentry *dentry)
 	}
 	di->dentry = dentry;
 	di->lease_session = NULL;
+	di->parent_inode = igrab(dentry->d_parent->d_inode);
 	dentry->d_fsdata = di;
 	dentry->d_time = jiffies;
 	ceph_dentry_lru_add(dentry);
@@ -1025,7 +1026,7 @@ static void ceph_dentry_release(struct dentry *dentry)
 	u64 snapid = CEPH_NOSNAP;
 
 	if (!IS_ROOT(dentry)) {
-		parent_inode = dentry->d_parent->d_inode;
+		parent_inode = di->parent_inode;
 		if (parent_inode)
 			snapid = ceph_snap(parent_inode);
 	}
@@ -1050,6 +1051,8 @@ static void ceph_dentry_release(struct dentry *dentry)
 		kmem_cache_free(ceph_dentry_cachep, di);
 		dentry->d_fsdata = NULL;
 	}
+	if (parent_inode)
+		iput(parent_inode);
 }
 
 static int ceph_snapdir_d_revalidate(struct dentry *dentry,
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 6e082669511..c01aa646b40 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -207,6 +207,7 @@ struct ceph_dentry_info {
 	struct dentry *dentry;
 	u64 time;
 	u64 offset;
+	struct inode *parent_inode;
 };
 
 struct ceph_inode_xattrs_info {
-- 
cgit v1.2.3


From 705773a6656bba66f2a80a44ddaacf9620df8a59 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 3 Feb 2011 14:16:19 +0100
Subject: ocfs2: Fix estimate of necessary credits for mkdir

In the rare case that INLINE_DATA, INDEX_DIR, QUOTA, XATTR features are
disabled and both the allocation of the directory inode and the allocation
of the first directory block need to relink allocation group, there need
not be enough credits reserved in a transaction. Fix the estimate.

CC: Mark Fasheh <mfasheh@suse.de>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <jlbec@evilplan.org>
---
 fs/ocfs2/journal.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 43e56b97f9c..6180da1e37e 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -405,9 +405,9 @@ static inline int ocfs2_remove_extent_credits(struct super_block *sb)
 	       ocfs2_quota_trans_credits(sb);
 }
 
-/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
- * bitmap block for the new bit) dx_root update for free list */
-#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2 + 1)
+/* data block for new dir/symlink, allocation of directory block, dx_root
+ * update for free list */
+#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + OCFS2_SUBALLOC_ALLOC + 1)
 
 static inline int ocfs2_add_dir_index_credits(struct super_block *sb)
 {
-- 
cgit v1.2.3


From acf3bb007e5636ef4c17505affb0974175108553 Mon Sep 17 00:00:00 2001
From: Tristan Ye <tristan.ye@oracle.com>
Date: Fri, 21 Jan 2011 18:20:18 +0800
Subject: Ocfs2/refcounttree: Fix a bug for refcounttree to writeback clusters
 in a right number.

The current refcounttree code does not actually write the new pages out in
write-back mode, because it always passes a cluster count of zero to
'ocfs2_cow_sync_writeback'. This patch passes the proper count instead.

Signed-off-by: Tristan Ye <tristan.ye@oracle.com>
Cc: stable@kernel.org
Signed-off-by: Joel Becker <jlbec@evilplan.org>
---
 fs/ocfs2/refcounttree.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index b5f9160e93e..19ebc5aad39 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -3228,7 +3228,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
 					u32 num_clusters, unsigned int e_flags)
 {
 	int ret, delete, index, credits =  0;
-	u32 new_bit, new_len;
+	u32 new_bit, new_len, orig_num_clusters;
 	unsigned int set_len;
 	struct ocfs2_super *osb = OCFS2_SB(sb);
 	handle_t *handle;
@@ -3261,6 +3261,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
 		goto out;
 	}
 
+	orig_num_clusters = num_clusters;
+
 	while (num_clusters) {
 		ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh,
 					     p_cluster, num_clusters,
@@ -3348,7 +3350,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
 	 * in write-back mode.
 	 */
 	if (context->get_clusters == ocfs2_di_get_clusters) {
-		ret = ocfs2_cow_sync_writeback(sb, context, cpos, num_clusters);
+		ret = ocfs2_cow_sync_writeback(sb, context, cpos,
+					       orig_num_clusters);
 		if (ret)
 			mlog_errno(ret);
 	}
-- 
cgit v1.2.3


From 52c303c56c3638944b5f733e3961dc58eb8c7270 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Mon, 31 Jan 2011 11:31:04 -0800
Subject: ocfs2: Check heartbeat mode for kernel stacks only

Commit 2c442719e90a44a6982c033d69df4aae4b167cfa added some checks for proper
heartbeat mode when the o2cb stack is running.  Unfortunately, it didn't
take into account that a userspace stack could be running. Fix this by only
doing the check if o2cb is in use. This patch allows userspace stacks to
mount the fs again.

Cc: stable@kernel.org
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Joel Becker <jlbec@evilplan.org>
---
 fs/ocfs2/super.c | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 38f986d2447..36c423fb063 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1316,7 +1316,7 @@ static int ocfs2_parse_options(struct super_block *sb,
 			       struct mount_options *mopt,
 			       int is_remount)
 {
-	int status;
+	int status, user_stack = 0;
 	char *p;
 	u32 tmp;
 
@@ -1459,6 +1459,15 @@ static int ocfs2_parse_options(struct super_block *sb,
 			memcpy(mopt->cluster_stack, args[0].from,
 			       OCFS2_STACK_LABEL_LEN);
 			mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
+			/*
+			 * Open code the memcmp here as we don't have
+			 * an osb to pass to
+			 * ocfs2_userspace_stack().
+			 */
+			if (memcmp(mopt->cluster_stack,
+				   OCFS2_CLASSIC_CLUSTER_STACK,
+				   OCFS2_STACK_LABEL_LEN))
+				user_stack = 1;
 			break;
 		case Opt_inode64:
 			mopt->mount_opt |= OCFS2_MOUNT_INODE64;
@@ -1514,13 +1523,16 @@ static int ocfs2_parse_options(struct super_block *sb,
 		}
 	}
 
-	/* Ensure only one heartbeat mode */
-	tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
-				 OCFS2_MOUNT_HB_NONE);
-	if (hweight32(tmp) != 1) {
-		mlog(ML_ERROR, "Invalid heartbeat mount options\n");
-		status = 0;
-		goto bail;
+	if (user_stack == 0) {
+		/* Ensure only one heartbeat mode */
+		tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL |
+					 OCFS2_MOUNT_HB_GLOBAL |
+					 OCFS2_MOUNT_HB_NONE);
+		if (hweight32(tmp) != 1) {
+			mlog(ML_ERROR, "Invalid heartbeat mount options\n");
+			status = 0;
+			goto bail;
+		}
 	}
 
 	status = 1;
-- 
cgit v1.2.3


From 70b8902199003b098fde86d1db02e7465115a02c Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Thu, 17 Feb 2011 17:35:20 -0600
Subject: eCryptfs: Handle NULL nameidata pointers

Allow for NULL nameidata pointers in eCryptfs create, lookup, and
d_revalidate functions.

Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/dentry.c          | 22 +++++++++++++---------
 fs/ecryptfs/ecryptfs_kernel.h |  3 +--
 fs/ecryptfs/inode.c           | 30 +++++++++++++++---------------
 3 files changed, 29 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 6fc4f319b55..534c1d46e69 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -46,24 +46,28 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
 	struct dentry *lower_dentry;
 	struct vfsmount *lower_mnt;
-	struct dentry *dentry_save;
-	struct vfsmount *vfsmount_save;
+	struct dentry *dentry_save = NULL;
+	struct vfsmount *vfsmount_save = NULL;
 	int rc = 1;
 
-	if (nd->flags & LOOKUP_RCU)
+	if (nd && nd->flags & LOOKUP_RCU)
 		return -ECHILD;
 
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
 	if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate)
 		goto out;
-	dentry_save = nd->path.dentry;
-	vfsmount_save = nd->path.mnt;
-	nd->path.dentry = lower_dentry;
-	nd->path.mnt = lower_mnt;
+	if (nd) {
+		dentry_save = nd->path.dentry;
+		vfsmount_save = nd->path.mnt;
+		nd->path.dentry = lower_dentry;
+		nd->path.mnt = lower_mnt;
+	}
 	rc = lower_dentry->d_op->d_revalidate(lower_dentry, nd);
-	nd->path.dentry = dentry_save;
-	nd->path.mnt = vfsmount_save;
+	if (nd) {
+		nd->path.dentry = dentry_save;
+		nd->path.mnt = vfsmount_save;
+	}
 	if (dentry->d_inode) {
 		struct inode *lower_inode =
 			ecryptfs_inode_to_lower(dentry->d_inode);
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index dbc84ed9633..e00753496e3 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -632,8 +632,7 @@ int ecryptfs_interpose(struct dentry *hidden_dentry,
 		       u32 flags);
 int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
 					struct dentry *lower_dentry,
-					struct inode *ecryptfs_dir_inode,
-					struct nameidata *ecryptfs_nd);
+					struct inode *ecryptfs_dir_inode);
 int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
 					 size_t *decrypted_name_size,
 					 struct dentry *ecryptfs_dentry,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index fc44823fea3..eb0d267ee71 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -74,16 +74,20 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode,
 	unsigned int flags_save;
 	int rc;
 
-	dentry_save = nd->path.dentry;
-	vfsmount_save = nd->path.mnt;
-	flags_save = nd->flags;
-	nd->path.dentry = lower_dentry;
-	nd->path.mnt = lower_mnt;
-	nd->flags &= ~LOOKUP_OPEN;
+	if (nd) {
+		dentry_save = nd->path.dentry;
+		vfsmount_save = nd->path.mnt;
+		flags_save = nd->flags;
+		nd->path.dentry = lower_dentry;
+		nd->path.mnt = lower_mnt;
+		nd->flags &= ~LOOKUP_OPEN;
+	}
 	rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd);
-	nd->path.dentry = dentry_save;
-	nd->path.mnt = vfsmount_save;
-	nd->flags = flags_save;
+	if (nd) {
+		nd->path.dentry = dentry_save;
+		nd->path.mnt = vfsmount_save;
+		nd->flags = flags_save;
+	}
 	return rc;
 }
 
@@ -241,8 +245,7 @@ out:
  */
 int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
 					struct dentry *lower_dentry,
-					struct inode *ecryptfs_dir_inode,
-					struct nameidata *ecryptfs_nd)
+					struct inode *ecryptfs_dir_inode)
 {
 	struct dentry *lower_dir_dentry;
 	struct vfsmount *lower_mnt;
@@ -290,8 +293,6 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
 		goto out;
 	if (special_file(lower_inode->i_mode))
 		goto out;
-	if (!ecryptfs_nd)
-		goto out;
 	/* Released in this function */
 	page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER);
 	if (!page_virt) {
@@ -417,8 +418,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 	}
 lookup_and_interpose:
 	rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry,
-						 ecryptfs_dir_inode,
-						 ecryptfs_nd);
+						 ecryptfs_dir_inode);
 	goto out;
 out_d_drop:
 	d_drop(ecryptfs_dentry);
-- 
cgit v1.2.3


From 323ef68faf1bbd9b1e66aea268fd09d358d7e8ab Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@canonical.com>
Date: Wed, 16 Feb 2011 04:49:59 +0000
Subject: ecryptfs: read on a directory should return EISDIR if not supported

read() calls against a file descriptor connected to a directory are
incorrectly returning EINVAL rather than EISDIR:

  [EISDIR]
    [XSI] [Option Start] The fildes argument refers to a directory and the
    implementation does not allow the directory to be read using read()
    or pread(). The readdir() function should be used instead. [Option End]

This occurs because we do not have a .read operation defined for
ecryptfs directories.  Connect this up to generic_read_dir().

BugLink: http://bugs.launchpad.net/bugs/719691
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/file.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 81e10e6a944..7d1050e254f 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -317,6 +317,7 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 
 const struct file_operations ecryptfs_dir_fops = {
 	.readdir = ecryptfs_readdir,
+	.read = generic_read_dir,
 	.unlocked_ioctl = ecryptfs_unlocked_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl = ecryptfs_compat_ioctl,
-- 
cgit v1.2.3


From 55f9cf6bbaa682958a7dd2755f883b768270c3ce Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Tue, 11 Jan 2011 12:43:42 -0600
Subject: eCryptfs: Copy up lower inode attrs in getattr

The lower filesystem may do some type of inode revalidation during a
getattr call. eCryptfs should take advantage of that by copying the
lower inode attributes to the eCryptfs inode after a call to
vfs_getattr() on the lower inode.

I originally wrote this fix while working on eCryptfs on nfsv3 support,
but discovered it also fixed an eCryptfs on ext4 nanosecond timestamp
bug that was reported.

https://bugs.launchpad.net/bugs/613873

Cc: <stable@kernel.org>
Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/inode.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index eb0d267ee71..b592938a84b 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -1010,6 +1010,8 @@ int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	rc = vfs_getattr(ecryptfs_dentry_to_lower_mnt(dentry),
 			 ecryptfs_dentry_to_lower(dentry), &lower_stat);
 	if (!rc) {
+		fsstack_copy_attr_all(dentry->d_inode,
+				      ecryptfs_inode_to_lower(dentry->d_inode));
 		generic_fillattr(dentry->d_inode, stat);
 		stat->blocks = lower_stat.blocks;
 	}
-- 
cgit v1.2.3


From 5e640927a597a7c3e72b61e8bce74c22e906de65 Mon Sep 17 00:00:00 2001
From: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Date: Thu, 17 Feb 2011 14:38:31 -0600
Subject: cifs: Fix regression in LANMAN (LM) auth code

LANMAN response length was changed to 16 bytes instead of 24 bytes.
Revert it back to 24 bytes.

Signed-off-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
CC: stable@kernel.org
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/sess.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 1adc9625a34..16765703131 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -656,13 +656,13 @@ ssetup_ntlmssp_authenticate:
 
 	if (type == LANMAN) {
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
-		char lnm_session_key[CIFS_SESS_KEY_SIZE];
+		char lnm_session_key[CIFS_AUTH_RESP_SIZE];
 
 		pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;
 
 		/* no capabilities flags in old lanman negotiation */
 
-		pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE);
+		pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
 
 		/* Calculate hash with password and copy into bcc_ptr.
 		 * Encryption Key (stored as in cryptkey) gets used if the
@@ -675,8 +675,8 @@ ssetup_ntlmssp_authenticate:
 					true : false, lnm_session_key);
 
 		ses->flags |= CIFS_SES_LANMAN;
-		memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_SESS_KEY_SIZE);
-		bcc_ptr += CIFS_SESS_KEY_SIZE;
+		memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE);
+		bcc_ptr += CIFS_AUTH_RESP_SIZE;
 
 		/* can not sign if LANMAN negotiated so no need
 		to calculate signing key? but what if server
-- 
cgit v1.2.3


From eed9e8307e01d6d8d6170afcb2f00e1a471b87d4 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 21 Feb 2011 22:31:47 +0000
Subject: [CIFS] update cifs version

Update version to 1.71 so we can more easily spot modules with the last two fixes

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 4a3330235d5..a9371b6578c 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -127,5 +127,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
 
-#define CIFS_VERSION   "1.70"
+#define CIFS_VERSION   "1.71"
 #endif				/* _CIFSFS_H */
-- 
cgit v1.2.3


From 361821854b71fc3a53c9e17701538247bddbd4ba Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Sun, 20 Feb 2011 20:08:35 -0800
Subject: Docbook: add fs/eventfd.c and fix typos in it

Add fs/eventfd.c to filesystems docbook.
Make typo corrections in fs/eventfd.c.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventfd.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/eventfd.c b/fs/eventfd.c
index e0194b3e14d..d9a59177391 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -99,7 +99,7 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_get);
  * @ctx: [in] Pointer to eventfd context.
  *
  * The eventfd context reference must have been previously acquired either
- * with eventfd_ctx_get() or eventfd_ctx_fdget()).
+ * with eventfd_ctx_get() or eventfd_ctx_fdget().
  */
 void eventfd_ctx_put(struct eventfd_ctx *ctx)
 {
@@ -146,9 +146,9 @@ static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
  * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue.
  * @ctx: [in] Pointer to eventfd context.
  * @wait: [in] Wait queue to be removed.
- * @cnt: [out] Pointer to the 64bit conter value.
+ * @cnt: [out] Pointer to the 64-bit counter value.
  *
- * Returns zero if successful, or the following error codes:
+ * Returns %0 if successful, or the following error codes:
  *
  * -EAGAIN      : The operation would have blocked.
  *
@@ -175,11 +175,11 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
  * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero.
  * @ctx: [in] Pointer to eventfd context.
  * @no_wait: [in] Different from zero if the operation should not block.
- * @cnt: [out] Pointer to the 64bit conter value.
+ * @cnt: [out] Pointer to the 64-bit counter value.
  *
- * Returns zero if successful, or the following error codes:
+ * Returns %0 if successful, or the following error codes:
  *
- * -EAGAIN      : The operation would have blocked but @no_wait was nonzero.
+ * -EAGAIN      : The operation would have blocked but @no_wait was non-zero.
  * -ERESTARTSYS : A signal interrupted the wait operation.
  *
  * If @no_wait is zero, the function might sleep until the eventfd internal
-- 
cgit v1.2.3


From c4d0c3b097f7584772316ee4d64a09fe0e4ddfca Mon Sep 17 00:00:00 2001
From: Dan Rosenberg <drosenberg@vsecurity.com>
Date: Mon, 14 Feb 2011 13:45:28 +0000
Subject: xfs: prevent leaking uninitialized stack memory in FSGEOMETRY_V1

The FSGEOMETRY_V1 ioctl (and its compat equivalent) calls out to
xfs_fs_geometry() with a version number of 3.  This code path does not
fill in the logsunit member of the passed xfs_fsop_geom_t, leading to
the leaking of four bytes of uninitialized stack data to potentially
unprivileged callers.

v2 switches to memset() to avoid future issues if structure members
change, on suggestion of Dave Chinner.

Signed-off-by: Dan Rosenberg <drosenberg@vsecurity.com>
Reviewed-by: Eugene Teo <eugeneteo@kernel.org>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_fsops.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index cec89dd5d7d..85668efb3e3 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -53,6 +53,9 @@ xfs_fs_geometry(
 	xfs_fsop_geom_t		*geo,
 	int			new_version)
 {
+
+	memset(geo, 0, sizeof(*geo));
+
 	geo->blocksize = mp->m_sb.sb_blocksize;
 	geo->rtextsize = mp->m_sb.sb_rextsize;
 	geo->agblocks = mp->m_sb.sb_agblocks;
-- 
cgit v1.2.3


From 5d15765594eeb5d82c5630b3428ea0ac4f7d3c31 Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Tue, 15 Feb 2011 17:07:36 +0000
Subject: xfs: check if device support discard in xfs_ioc_trim()

Right now we are relying on the fact that when we attempt to
actually do the discard, blkdev_issue_discard() returns -EOPNOTSUPP
and the user is informed that the device does not support discard.

However, in the case where we do not hit any suitable free
extent to trim in the FITRIM code, it will finish without any error.
This is very confusing, because it seems that FITRIM was successful
even though the device does not actually support discard.

Solution: Check for discard support before attempting to search for
free extents.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_discard.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
index 05201ae719e..d61611c8801 100644
--- a/fs/xfs/linux-2.6/xfs_discard.c
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -152,6 +152,8 @@ xfs_ioc_trim(
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -XFS_ERROR(EPERM);
+	if (!blk_queue_discard(q))
+		return -XFS_ERROR(EOPNOTSUPP);
 	if (copy_from_user(&range, urange, sizeof(range)))
 		return -XFS_ERROR(EFAULT);
 
-- 
cgit v1.2.3


From 3a3675b7f23f83ca8c67c9c2b6edf707fd28d1ba Mon Sep 17 00:00:00 2001
From: Dan Rosenberg <drosenberg@vsecurity.com>
Date: Mon, 14 Feb 2011 13:45:28 +0000
Subject: xfs: prevent leaking uninitialized stack memory in FSGEOMETRY_V1

The FSGEOMETRY_V1 ioctl (and its compat equivalent) calls out to
xfs_fs_geometry() with a version number of 3.  This code path does not
fill in the logsunit member of the passed xfs_fsop_geom_t, leading to
the leaking of four bytes of uninitialized stack data to potentially
unprivileged callers.

v2 switches to memset() to avoid future issues if structure members
change, on suggestion of Dave Chinner.

Signed-off-by: Dan Rosenberg <drosenberg@vsecurity.com>
Reviewed-by: Eugene Teo <eugeneteo@kernel.org>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_fsops.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index cec89dd5d7d..85668efb3e3 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -53,6 +53,9 @@ xfs_fs_geometry(
 	xfs_fsop_geom_t		*geo,
 	int			new_version)
 {
+
+	memset(geo, 0, sizeof(*geo));
+
 	geo->blocksize = mp->m_sb.sb_blocksize;
 	geo->rtextsize = mp->m_sb.sb_rextsize;
 	geo->agblocks = mp->m_sb.sb_agblocks;
-- 
cgit v1.2.3


From be715140b5c3baf8ab6708060cfab80bef279d18 Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Tue, 15 Feb 2011 17:07:36 +0000
Subject: xfs: check if device support discard in xfs_ioc_trim()

Right now we are relying on the fact that when we attempt to
actually do the discard, blkdev_issue_discard() returns -EOPNOTSUPP
and the user is informed that the device does not support discard.

However, in the case where we do not hit any suitable free
extent to trim in the FITRIM code, it will finish without any error.
This is very confusing, because it seems that FITRIM was successful
even though the device does not actually support discard.

Solution: Check for discard support before attempting to search for
free extents.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_discard.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
index 05201ae719e..d61611c8801 100644
--- a/fs/xfs/linux-2.6/xfs_discard.c
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -152,6 +152,8 @@ xfs_ioc_trim(
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -XFS_ERROR(EPERM);
+	if (!blk_queue_discard(q))
+		return -XFS_ERROR(EOPNOTSUPP);
 	if (copy_from_user(&range, urange, sizeof(range)))
 		return -XFS_ERROR(EFAULT);
 
-- 
cgit v1.2.3


From 2c9c8f36c34e1defcaa7e4c298651998b47f5282 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Tue, 22 Feb 2011 14:43:22 -0800
Subject: NFSD: fix decode_cb_sequence4resok

Fix bug introduced in patch
85a56480 NFSD: Update XDR decoders in NFSv4 callback client

Although decode_cb_sequence4resok ignores highest slotid and target highest slotid,
it must account for their space in the xdr stream when calling xdr_inline_decode

Cc: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4callback.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index cde36cb0f34..02eb4edf0ec 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -432,7 +432,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
 	 * If the server returns different values for sessionID, slotID or
 	 * sequence number, the server is looney tunes.
 	 */
-	p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4);
+	p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4 + 4 + 4);
 	if (unlikely(p == NULL))
 		goto out_overflow;
 	memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
-- 
cgit v1.2.3


From 1050c71e2925ab0cb025e4c89e08b15529a1ee36 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sun, 13 Feb 2011 13:25:31 +0000
Subject: xfs: stop using xfs_trans_iget in the RT allocator

During mount we establish references to the RT inodes, which we keep for
the lifetime of the filesystem.  Instead of using xfs_trans_iget to grab
additional references when adding RT inodes to transactions use the
combination of xfs_ilock and xfs_trans_ijoin_ref, which achieves the same
end result with less overhead.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_bmap.c    |  8 ++------
 fs/xfs/xfs_rtalloc.c | 54 ++++++++++++++++++++--------------------------------
 2 files changed, 23 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index d8d09066528..e7b441db053 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2333,7 +2333,6 @@ xfs_bmap_rtalloc(
 	xfs_extlen_t	prod = 0;	/* product factor for allocators */
 	xfs_extlen_t	ralen = 0;	/* realtime allocation length */
 	xfs_extlen_t	align;		/* minimum allocation alignment */
-	xfs_inode_t	*ip;		/* bitmap incore inode */
 	xfs_rtblock_t	rtb;
 
 	mp = ap->ip->i_mount;
@@ -2370,11 +2369,8 @@ xfs_bmap_rtalloc(
 	/*
 	 * Lock out other modifications to the RT bitmap inode.
 	 */
-	error = xfs_trans_iget(mp, ap->tp, mp->m_sb.sb_rbmino, 0,
-			       XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP, &ip);
-	if (error)
-		return error;
-	ASSERT(ip == mp->m_rbmip);
+	xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
 
 	/*
 	 * If it's an allocation to an empty file at offset 0,
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index f592ac97818..fbff89344ba 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -76,7 +76,7 @@ xfs_growfs_rt_alloc(
 	xfs_mount_t	*mp,		/* file system mount point */
 	xfs_extlen_t	oblocks,	/* old count of blocks */
 	xfs_extlen_t	nblocks,	/* new count of blocks */
-	xfs_ino_t	ino)		/* inode number (bitmap/summary) */
+	xfs_inode_t	*ip)		/* inode (bitmap/summary) */
 {
 	xfs_fileoff_t	bno;		/* block number in file */
 	xfs_buf_t	*bp;		/* temporary buffer for zeroing */
@@ -86,7 +86,6 @@ xfs_growfs_rt_alloc(
 	xfs_fsblock_t	firstblock;	/* first block allocated in xaction */
 	xfs_bmap_free_t	flist;		/* list of freed blocks */
 	xfs_fsblock_t	fsbno;		/* filesystem block for bno */
-	xfs_inode_t	*ip;		/* pointer to incore inode */
 	xfs_bmbt_irec_t	map;		/* block map output */
 	int		nmap;		/* number of block maps */
 	int		resblks;	/* space reservation */
@@ -112,9 +111,9 @@ xfs_growfs_rt_alloc(
 		/*
 		 * Lock the inode.
 		 */
-		if ((error = xfs_trans_iget(mp, tp, ino, 0,
-						XFS_ILOCK_EXCL, &ip)))
-			goto error_cancel;
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
+
 		xfs_bmap_init(&flist, &firstblock);
 		/*
 		 * Allocate blocks to the bitmap file.
@@ -155,9 +154,8 @@ xfs_growfs_rt_alloc(
 			/*
 			 * Lock the bitmap inode.
 			 */
-			if ((error = xfs_trans_iget(mp, tp, ino, 0,
-							XFS_ILOCK_EXCL, &ip)))
-				goto error_cancel;
+			xfs_ilock(ip, XFS_ILOCK_EXCL);
+			xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
 			/*
 			 * Get a buffer for the block.
 			 */
@@ -1854,7 +1852,6 @@ xfs_growfs_rt(
 	xfs_rtblock_t	bmbno;		/* bitmap block number */
 	xfs_buf_t	*bp;		/* temporary buffer */
 	int		error;		/* error return value */
-	xfs_inode_t	*ip;		/* bitmap inode, used as lock */
 	xfs_mount_t	*nmp;		/* new (fake) mount structure */
 	xfs_drfsbno_t	nrblocks;	/* new number of realtime blocks */
 	xfs_extlen_t	nrbmblocks;	/* new number of rt bitmap blocks */
@@ -1918,11 +1915,11 @@ xfs_growfs_rt(
 	/*
 	 * Allocate space to the bitmap and summary files, as necessary.
 	 */
-	if ((error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks,
-			mp->m_sb.sb_rbmino)))
+	error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, mp->m_rbmip);
+	if (error)
 		return error;
-	if ((error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks,
-			mp->m_sb.sb_rsumino)))
+	error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, mp->m_rsumip);
+	if (error)
 		return error;
 	/*
 	 * Allocate a new (fake) mount/sb.
@@ -1972,12 +1969,8 @@ xfs_growfs_rt(
 		/*
 		 * Lock out other callers by grabbing the bitmap inode lock.
 		 */
-		error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
-				       XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP,
-				       &ip);
-		if (error)
-			goto error_cancel;
-		ASSERT(ip == mp->m_rbmip);
+		xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+		xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
 		/*
 		 * Update the bitmap inode's size.
 		 */
@@ -1988,11 +1981,8 @@ xfs_growfs_rt(
 		/*
 		 * Get the summary inode into the transaction.
 		 */
-		error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0,
-				       XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM, &ip);
-		if (error)
-			goto error_cancel;
-		ASSERT(ip == mp->m_rsumip);
+		xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL);
+		xfs_trans_ijoin_ref(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
 		/*
 		 * Update the summary inode's size.
 		 */
@@ -2154,7 +2144,6 @@ xfs_rtfree_extent(
 	xfs_extlen_t	len)		/* length of extent freed */
 {
 	int		error;		/* error value */
-	xfs_inode_t	*ip;		/* bitmap file inode */
 	xfs_mount_t	*mp;		/* file system mount structure */
 	xfs_fsblock_t	sb;		/* summary file block number */
 	xfs_buf_t	*sumbp;		/* summary file block buffer */
@@ -2163,10 +2152,9 @@ xfs_rtfree_extent(
 	/*
 	 * Synchronize by locking the bitmap inode.
 	 */
-	error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
-			       XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP, &ip);
-	if (error)
-		return error;
+	xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+
 #if defined(__KERNEL__) && defined(DEBUG)
 	/*
 	 * Check to see that this whole range is currently allocated.
@@ -2199,10 +2187,10 @@ xfs_rtfree_extent(
 	 */
 	if (tp->t_frextents_delta + mp->m_sb.sb_frextents ==
 	    mp->m_sb.sb_rextents) {
-		if (!(ip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
-			ip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
-		*(__uint64_t *)&ip->i_d.di_atime = 0;
-		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+		if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
+			mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
+		*(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0;
+		xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From ec3ba85f4083d10e32fe58b46db02d78ef71f6b8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sun, 13 Feb 2011 13:26:42 +0000
Subject: xfs: more sensible inode refcounting for ialloc

Currently we return inodes from xfs_ialloc with just a single reference held.
But we need two references, as one is dropped during transaction commit and
the second needs to be transferred to the VFS.  Change xfs_ialloc to use
xfs_iget plus xfs_trans_ijoin_ref to grab two references to the inode,
and remove the now superfluous IHOLD calls from all callers.  This also
greatly simplifies the error handling in xfs_create and allows us to remove
xfs_trans_iget as no other callers are left.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/quota/xfs_qm.c    |  7 ------
 fs/xfs/xfs_inode.c       |  5 ++--
 fs/xfs/xfs_trans.h       |  2 --
 fs/xfs/xfs_trans_inode.c | 22 -----------------
 fs/xfs/xfs_vnodeops.c    | 61 +++++++++++++-----------------------------------
 5 files changed, 19 insertions(+), 78 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 206a2815ced..f517963aec0 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -1229,13 +1229,6 @@ xfs_qm_qino_alloc(
 		return error;
 	}
 
-	/*
-	 * Keep an extra reference to this quota inode. This inode is
-	 * locked exclusively and joined to the transaction already.
-	 */
-	ASSERT(xfs_isilocked(*ip, XFS_ILOCK_EXCL));
-	IHOLD(*ip);
-
 	/*
 	 * Make the changes in the superblock, and log those too.
 	 * sbfields arg may contain fields other than *QUOTINO;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index be7cf625421..c39278b6c87 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1016,8 +1016,8 @@ xfs_ialloc(
 	 * This is because we're setting fields here we need
 	 * to prevent others from looking at until we're done.
 	 */
-	error = xfs_trans_iget(tp->t_mountp, tp, ino,
-				XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip);
+	error = xfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE,
+			 XFS_ILOCK_EXCL, &ip);
 	if (error)
 		return error;
 	ASSERT(ip != NULL);
@@ -1166,6 +1166,7 @@ xfs_ialloc(
 	/*
 	 * Log the new values stuffed into the inode.
 	 */
+	xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
 	xfs_trans_log_inode(tp, ip, flags);
 
 	/* now that we have an i_mode we can setup inode ops and unlock */
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index c2042b736b8..06a9759b635 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -469,8 +469,6 @@ void		xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
 void		xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
 void		xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
 void		xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
-int		xfs_trans_iget(struct xfs_mount *, xfs_trans_t *,
-			       xfs_ino_t , uint, uint, struct xfs_inode **);
 void		xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
 void		xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint);
 void		xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *);
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index ccb34532768..16084d8ea23 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -43,28 +43,6 @@ xfs_trans_inode_broot_debug(
 #define	xfs_trans_inode_broot_debug(ip)
 #endif
 
-/*
- * Get an inode and join it to the transaction.
- */
-int
-xfs_trans_iget(
-	xfs_mount_t	*mp,
-	xfs_trans_t	*tp,
-	xfs_ino_t	ino,
-	uint		flags,
-	uint		lock_flags,
-	xfs_inode_t	**ipp)
-{
-	int			error;
-
-	error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp);
-	if (!error && tp) {
-		xfs_trans_ijoin(tp, *ipp);
-		(*ipp)->i_itemp->ili_lock_flags = lock_flags;
-	}
-	return error;
-}
-
 /*
  * Add a locked inode to the transaction.
  *
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index d8e6f8cd6f0..258d4f98eb9 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1310,7 +1310,7 @@ xfs_create(
 	error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
 			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
 	if (error)
-		goto std_return;
+		return error;
 
 	if (is_dir) {
 		rdev = 0;
@@ -1389,12 +1389,6 @@ xfs_create(
 		goto out_trans_abort;
 	}
 
-	/*
-	 * At this point, we've gotten a newly allocated inode.
-	 * It is locked (and joined to the transaction).
-	 */
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
 	/*
 	 * Now we join the directory inode to the transaction.  We do not do it
 	 * earlier because xfs_dir_ialloc might commit the previous transaction
@@ -1440,22 +1434,13 @@ xfs_create(
 	 */
 	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
 
-	/*
-	 * xfs_trans_commit normally decrements the vnode ref count
-	 * when it unlocks the inode. Since we want to return the
-	 * vnode to the caller, we bump the vnode ref count now.
-	 */
-	IHOLD(ip);
-
 	error = xfs_bmap_finish(&tp, &free_list, &committed);
 	if (error)
-		goto out_abort_rele;
+		goto out_bmap_cancel;
 
 	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-	if (error) {
-		IRELE(ip);
-		goto out_dqrele;
-	}
+	if (error)
+		goto out_release_inode;
 
 	xfs_qm_dqrele(udqp);
 	xfs_qm_dqrele(gdqp);
@@ -1469,27 +1454,21 @@ xfs_create(
 	cancel_flags |= XFS_TRANS_ABORT;
  out_trans_cancel:
 	xfs_trans_cancel(tp, cancel_flags);
- out_dqrele:
+ out_release_inode:
+	/*
+	 * Wait until after the current transaction is aborted to
+	 * release the inode.  This prevents recursive transactions
+	 * and deadlocks from xfs_inactive.
+	 */
+	if (ip)
+		IRELE(ip);
+
 	xfs_qm_dqrele(udqp);
 	xfs_qm_dqrele(gdqp);
 
 	if (unlock_dp_on_error)
 		xfs_iunlock(dp, XFS_ILOCK_EXCL);
- std_return:
 	return error;
-
- out_abort_rele:
-	/*
-	 * Wait until after the current transaction is aborted to
-	 * release the inode.  This prevents recursive transactions
-	 * and deadlocks from xfs_inactive.
-	 */
-	xfs_bmap_cancel(&free_list);
-	cancel_flags |= XFS_TRANS_ABORT;
-	xfs_trans_cancel(tp, cancel_flags);
-	IRELE(ip);
-	unlock_dp_on_error = B_FALSE;
-	goto out_dqrele;
 }
 
 #ifdef DEBUG
@@ -2114,9 +2093,8 @@ xfs_symlink(
 				  XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
 				  &first_block, resblks, mval, &nmaps,
 				  &free_list);
-		if (error) {
-			goto error1;
-		}
+		if (error)
+			goto error2;
 
 		if (resblks)
 			resblks -= fs_blocks;
@@ -2148,7 +2126,7 @@ xfs_symlink(
 	error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
 					&first_block, &free_list, resblks);
 	if (error)
-		goto error1;
+		goto error2;
 	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 
@@ -2161,13 +2139,6 @@ xfs_symlink(
 		xfs_trans_set_sync(tp);
 	}
 
-	/*
-	 * xfs_trans_commit normally decrements the vnode ref count
-	 * when it unlocks the inode. Since we want to return the
-	 * vnode to the caller, we bump the vnode ref count now.
-	 */
-	IHOLD(ip);
-
 	error = xfs_bmap_finish(&tp, &free_list, &committed);
 	if (error) {
 		goto error2;
-- 
cgit v1.2.3


From 20ad9ea9becd34a3c16252ca9d815f2c74f8f30f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sun, 13 Feb 2011 12:06:34 +0000
Subject: xfs: enable delaylog by default

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_super.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 9731898083a..7ec1fb8c131 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -189,6 +189,7 @@ xfs_parseargs(
 	mp->m_flags |= XFS_MOUNT_BARRIER;
 	mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
 	mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
+	mp->m_flags |= XFS_MOUNT_DELAYLOG;
 
 	/*
 	 * These can be overridden by the mount option parsing.
-- 
cgit v1.2.3


From 7e49b6f2480cb9a9e7322a91592e56a5c85361f5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 22 Oct 2010 00:30:26 +0200
Subject: udf: Convert UDF to new truncate calling sequence

Use new truncation sequence in UDF and fix up error handling in the
code.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/file.c     |   7 +-
 fs/udf/inode.c    | 239 +++++++++++++++++++++++++++++++++++++-----------------
 fs/udf/truncate.c | 146 ++++++++++++++-------------------
 fs/udf/udfdecl.h  |  12 ++-
 4 files changed, 234 insertions(+), 170 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/file.c b/fs/udf/file.c
index 89c78486cbb..f391a2adc69 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -123,8 +123,8 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		if (inode->i_sb->s_blocksize <
 				(udf_file_entry_alloc_offset(inode) +
 						pos + count)) {
-			udf_expand_file_adinicb(inode, pos + count, &err);
-			if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
+			err = udf_expand_file_adinicb(inode);
+			if (err) {
 				udf_debug("udf_expand_adinicb: err=%d\n", err);
 				up_write(&iinfo->i_data_sem);
 				return err;
@@ -237,7 +237,7 @@ static int udf_setattr(struct dentry *dentry, struct iattr *attr)
 
 	if ((attr->ia_valid & ATTR_SIZE) &&
 	    attr->ia_size != i_size_read(inode)) {
-		error = vmtruncate(inode, attr->ia_size);
+		error = udf_setsize(inode, attr->ia_size);
 		if (error)
 			return error;
 	}
@@ -249,5 +249,4 @@ static int udf_setattr(struct dentry *dentry, struct iattr *attr)
 
 const struct inode_operations udf_file_inode_operations = {
 	.setattr		= udf_setattr,
-	.truncate		= udf_truncate,
 };
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index c6a2e782b97..ccc81432141 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -73,14 +73,12 @@ void udf_evict_inode(struct inode *inode)
 	struct udf_inode_info *iinfo = UDF_I(inode);
 	int want_delete = 0;
 
-	truncate_inode_pages(&inode->i_data, 0);
-
 	if (!inode->i_nlink && !is_bad_inode(inode)) {
 		want_delete = 1;
-		inode->i_size = 0;
-		udf_truncate(inode);
+		udf_setsize(inode, 0);
 		udf_update_inode(inode, IS_SYNC(inode));
-	}
+	} else
+		truncate_inode_pages(&inode->i_data, 0);
 	invalidate_inode_buffers(inode);
 	end_writeback(inode);
 	if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
@@ -117,9 +115,18 @@ static int udf_write_begin(struct file *file, struct address_space *mapping,
 
 	ret = block_write_begin(mapping, pos, len, flags, pagep, udf_get_block);
 	if (unlikely(ret)) {
-		loff_t isize = mapping->host->i_size;
-		if (pos + len > isize)
-			vmtruncate(mapping->host, isize);
+		struct inode *inode = mapping->host;
+		struct udf_inode_info *iinfo = UDF_I(inode);
+		loff_t isize = inode->i_size;
+
+		if (pos + len > isize) {
+			truncate_pagecache(inode, pos + len, isize);
+			if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
+				down_write(&iinfo->i_data_sem);
+				udf_truncate_extents(inode);
+				up_write(&iinfo->i_data_sem);
+			}
+		}
 	}
 
 	return ret;
@@ -139,30 +146,31 @@ const struct address_space_operations udf_aops = {
 	.bmap		= udf_bmap,
 };
 
-void udf_expand_file_adinicb(struct inode *inode, int newsize, int *err)
+int udf_expand_file_adinicb(struct inode *inode)
 {
 	struct page *page;
 	char *kaddr;
 	struct udf_inode_info *iinfo = UDF_I(inode);
+	int err;
 	struct writeback_control udf_wbc = {
 		.sync_mode = WB_SYNC_NONE,
 		.nr_to_write = 1,
 	};
 
-	/* from now on we have normal address_space methods */
-	inode->i_data.a_ops = &udf_aops;
-
 	if (!iinfo->i_lenAlloc) {
 		if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
 			iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
 		else
 			iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
+		/* from now on we have normal address_space methods */
+		inode->i_data.a_ops = &udf_aops;
 		mark_inode_dirty(inode);
-		return;
+		return 0;
 	}
 
-	page = grab_cache_page(inode->i_mapping, 0);
-	BUG_ON(!PageLocked(page));
+	page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
+	if (!page)
+		return -ENOMEM;
 
 	if (!PageUptodate(page)) {
 		kaddr = kmap(page);
@@ -181,11 +189,24 @@ void udf_expand_file_adinicb(struct inode *inode, int newsize, int *err)
 		iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
 	else
 		iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
-
-	inode->i_data.a_ops->writepage(page, &udf_wbc);
+	/* from now on we have normal address_space methods */
+	inode->i_data.a_ops = &udf_aops;
+	err = inode->i_data.a_ops->writepage(page, &udf_wbc);
+	if (err) {
+		/* Restore everything back so that we don't lose data... */
+		lock_page(page);
+		kaddr = kmap(page);
+		memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr,
+		       inode->i_size);
+		kunmap(page);
+		unlock_page(page);
+		iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
+		inode->i_data.a_ops = &udf_adinicb_aops;
+	}
 	page_cache_release(page);
-
 	mark_inode_dirty(inode);
+
+	return err;
 }
 
 struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
@@ -348,8 +369,10 @@ static struct buffer_head *udf_getblk(struct inode *inode, long block,
 }
 
 /* Extend the file by 'blocks' blocks, return the number of extents added */
-int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
-		    struct kernel_long_ad *last_ext, sector_t blocks)
+static int udf_do_extend_file(struct inode *inode,
+			      struct extent_position *last_pos,
+			      struct kernel_long_ad *last_ext,
+			      sector_t blocks)
 {
 	sector_t add;
 	int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
@@ -357,6 +380,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
 	struct kernel_lb_addr prealloc_loc = {};
 	int prealloc_len = 0;
 	struct udf_inode_info *iinfo;
+	int err;
 
 	/* The previous extent is fake and we should not extend by anything
 	 * - there's nothing to do... */
@@ -422,26 +446,29 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
 	/* Create enough extents to cover the whole hole */
 	while (blocks > add) {
 		blocks -= add;
-		if (udf_add_aext(inode, last_pos, &last_ext->extLocation,
-				 last_ext->extLength, 1) == -1)
-			return -1;
+		err = udf_add_aext(inode, last_pos, &last_ext->extLocation,
+				   last_ext->extLength, 1);
+		if (err)
+			return err;
 		count++;
 	}
 	if (blocks) {
 		last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
 			(blocks << sb->s_blocksize_bits);
-		if (udf_add_aext(inode, last_pos, &last_ext->extLocation,
-				 last_ext->extLength, 1) == -1)
-			return -1;
+		err = udf_add_aext(inode, last_pos, &last_ext->extLocation,
+				   last_ext->extLength, 1);
+		if (err)
+			return err;
 		count++;
 	}
 
 out:
 	/* Do we have some preallocated blocks saved? */
 	if (prealloc_len) {
-		if (udf_add_aext(inode, last_pos, &prealloc_loc,
-				 prealloc_len, 1) == -1)
-			return -1;
+		err = udf_add_aext(inode, last_pos, &prealloc_loc,
+				   prealloc_len, 1);
+		if (err)
+			return err;
 		last_ext->extLocation = prealloc_loc;
 		last_ext->extLength = prealloc_len;
 		count++;
@@ -453,11 +480,68 @@ out:
 	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
 		last_pos->offset -= sizeof(struct long_ad);
 	else
-		return -1;
+		return -EIO;
 
 	return count;
 }
 
+static int udf_extend_file(struct inode *inode, loff_t newsize)
+{
+
+	struct extent_position epos;
+	struct kernel_lb_addr eloc;
+	uint32_t elen;
+	int8_t etype;
+	struct super_block *sb = inode->i_sb;
+	sector_t first_block = newsize >> sb->s_blocksize_bits, offset;
+	int adsize;
+	struct udf_inode_info *iinfo = UDF_I(inode);
+	struct kernel_long_ad extent;
+	int err;
+
+	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
+		adsize = sizeof(struct short_ad);
+	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
+		adsize = sizeof(struct long_ad);
+	else
+		BUG();
+
+	etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset);
+
+	/* File has extent covering the new size (could happen when extending
+	 * inside a block)? */
+	if (etype != -1)
+		return 0;
+	if (newsize & (sb->s_blocksize - 1))
+		offset++;
+	/* Extended file just to the boundary of the last file block? */
+	if (offset == 0)
+		return 0;
+
+	/* Truncate is extending the file by 'offset' blocks */
+	if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) ||
+	    (epos.bh && epos.offset == sizeof(struct allocExtDesc))) {
+		/* File has no extents at all or has empty last
+		 * indirect extent! Create a fake extent... */
+		extent.extLocation.logicalBlockNum = 0;
+		extent.extLocation.partitionReferenceNum = 0;
+		extent.extLength = EXT_NOT_RECORDED_NOT_ALLOCATED;
+	} else {
+		epos.offset -= adsize;
+		etype = udf_next_aext(inode, &epos, &extent.extLocation,
+				      &extent.extLength, 0);
+		extent.extLength |= etype << 30;
+	}
+	err = udf_do_extend_file(inode, &epos, &extent, offset);
+	if (err < 0)
+		goto out;
+	err = 0;
+	iinfo->i_lenExtents = newsize;
+out:
+	brelse(epos.bh);
+	return err;
+}
+
 static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 					int *err, sector_t *phys, int *new)
 {
@@ -540,7 +624,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 			elen = EXT_RECORDED_ALLOCATED |
 				((elen + inode->i_sb->s_blocksize - 1) &
 				 ~(inode->i_sb->s_blocksize - 1));
-			etype = udf_write_aext(inode, &cur_epos, &eloc, elen, 1);
+			udf_write_aext(inode, &cur_epos, &eloc, elen, 1);
 		}
 		brelse(prev_epos.bh);
 		brelse(cur_epos.bh);
@@ -564,19 +648,17 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
 			memset(&laarr[0].extLocation, 0x00,
 				sizeof(struct kernel_lb_addr));
 			laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED;
-			/* Will udf_extend_file() create real extent from
+			/* Will udf_do_extend_file() create real extent from
 			   a fake one? */
 			startnum = (offset > 0);
 		}
 		/* Create extents for the hole between EOF and offset */
-		ret = udf_extend_file(inode, &prev_epos, laarr, offset);
-		if (ret == -1) {
+		ret = udf_do_extend_file(inode, &prev_epos, laarr, offset);
+		if (ret < 0) {
 			brelse(prev_epos.bh);
 			brelse(cur_epos.bh);
 			brelse(next_epos.bh);
-			/* We don't really know the error here so we just make
-			 * something up */
-			*err = -ENOSPC;
+			*err = ret;
 			return NULL;
 		}
 		c = 0;
@@ -1005,52 +1087,66 @@ struct buffer_head *udf_bread(struct inode *inode, int block,
 	return NULL;
 }
 
-void udf_truncate(struct inode *inode)
+int udf_setsize(struct inode *inode, loff_t newsize)
 {
-	int offset;
 	int err;
 	struct udf_inode_info *iinfo;
+	int bsize = 1 << inode->i_blkbits;
 
 	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
 	      S_ISLNK(inode->i_mode)))
-		return;
+		return -EINVAL;
 	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
-		return;
+		return -EPERM;
 
 	iinfo = UDF_I(inode);
-	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
+	if (newsize > inode->i_size) {
 		down_write(&iinfo->i_data_sem);
-		if (inode->i_sb->s_blocksize <
-				(udf_file_entry_alloc_offset(inode) +
-				 inode->i_size)) {
-			udf_expand_file_adinicb(inode, inode->i_size, &err);
-			if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
-				inode->i_size = iinfo->i_lenAlloc;
-				up_write(&iinfo->i_data_sem);
-				return;
+		if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
+			if (bsize <
+			    (udf_file_entry_alloc_offset(inode) + newsize)) {
+				err = udf_expand_file_adinicb(inode);
+				if (err) {
+					up_write(&iinfo->i_data_sem);
+					return err;
+				}
 			} else
-				udf_truncate_extents(inode);
-		} else {
-			offset = inode->i_size & (inode->i_sb->s_blocksize - 1);
-			memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr + offset,
-				0x00, inode->i_sb->s_blocksize -
-				offset - udf_file_entry_alloc_offset(inode));
-			iinfo->i_lenAlloc = inode->i_size;
+				iinfo->i_lenAlloc = newsize;
+		}
+		err = udf_extend_file(inode, newsize);
+		if (err) {
+			up_write(&iinfo->i_data_sem);
+			return err;
 		}
+		truncate_setsize(inode, newsize);
 		up_write(&iinfo->i_data_sem);
 	} else {
-		block_truncate_page(inode->i_mapping, inode->i_size,
-				    udf_get_block);
+		if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
+			down_write(&iinfo->i_data_sem);
+			memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr + newsize,
+			       0x00, bsize - newsize -
+			       udf_file_entry_alloc_offset(inode));
+			iinfo->i_lenAlloc = newsize;
+			truncate_setsize(inode, newsize);
+			up_write(&iinfo->i_data_sem);
+			goto update_time;
+		}
+		err = block_truncate_page(inode->i_mapping, newsize,
+					  udf_get_block);
+		if (err)
+			return err;
 		down_write(&iinfo->i_data_sem);
+		truncate_setsize(inode, newsize);
 		udf_truncate_extents(inode);
 		up_write(&iinfo->i_data_sem);
 	}
-
+update_time:
 	inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
 	if (IS_SYNC(inode))
 		udf_sync_inode(inode);
 	else
 		mark_inode_dirty(inode);
+	return 0;
 }
 
 static void __udf_read_inode(struct inode *inode)
@@ -1637,14 +1733,13 @@ struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino)
 	return NULL;
 }
 
-int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
-		    struct kernel_lb_addr *eloc, uint32_t elen, int inc)
+int udf_add_aext(struct inode *inode, struct extent_position *epos,
+		 struct kernel_lb_addr *eloc, uint32_t elen, int inc)
 {
 	int adsize;
 	struct short_ad *sad = NULL;
 	struct long_ad *lad = NULL;
 	struct allocExtDesc *aed;
-	int8_t etype;
 	uint8_t *ptr;
 	struct udf_inode_info *iinfo = UDF_I(inode);
 
@@ -1660,7 +1755,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 	else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
 		adsize = sizeof(struct long_ad);
 	else
-		return -1;
+		return -EIO;
 
 	if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) {
 		unsigned char *sptr, *dptr;
@@ -1672,12 +1767,12 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 						obloc.partitionReferenceNum,
 						obloc.logicalBlockNum, &err);
 		if (!epos->block.logicalBlockNum)
-			return -1;
+			return -ENOSPC;
 		nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb,
 								 &epos->block,
 								 0));
 		if (!nbh)
-			return -1;
+			return -EIO;
 		lock_buffer(nbh);
 		memset(nbh->b_data, 0x00, inode->i_sb->s_blocksize);
 		set_buffer_uptodate(nbh);
@@ -1746,7 +1841,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 		epos->bh = nbh;
 	}
 
-	etype = udf_write_aext(inode, epos, eloc, elen, inc);
+	udf_write_aext(inode, epos, eloc, elen, inc);
 
 	if (!epos->bh) {
 		iinfo->i_lenAlloc += adsize;
@@ -1764,11 +1859,11 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
 		mark_buffer_dirty_inode(epos->bh, inode);
 	}
 
-	return etype;
+	return 0;
 }
 
-int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
-		      struct kernel_lb_addr *eloc, uint32_t elen, int inc)
+void udf_write_aext(struct inode *inode, struct extent_position *epos,
+		    struct kernel_lb_addr *eloc, uint32_t elen, int inc)
 {
 	int adsize;
 	uint8_t *ptr;
@@ -1798,7 +1893,7 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
 		adsize = sizeof(struct long_ad);
 		break;
 	default:
-		return -1;
+		return;
 	}
 
 	if (epos->bh) {
@@ -1817,8 +1912,6 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
 
 	if (inc)
 		epos->offset += adsize;
-
-	return (elen >> 30);
 }
 
 int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index 225527cdc88..8424308db4b 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -197,6 +197,11 @@ static void udf_update_alloc_ext_desc(struct inode *inode,
 	mark_buffer_dirty_inode(epos->bh, inode);
 }
 
+/*
+ * Truncate extents of inode to inode->i_size. This function can be used only
+ * for making file shorter. For making file longer, udf_extend_file() has to
+ * be used.
+ */
 void udf_truncate_extents(struct inode *inode)
 {
 	struct extent_position epos;
@@ -219,96 +224,65 @@ void udf_truncate_extents(struct inode *inode)
 	etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset);
 	byte_offset = (offset << sb->s_blocksize_bits) +
 		(inode->i_size & (sb->s_blocksize - 1));
-	if (etype != -1) {
-		epos.offset -= adsize;
-		extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset);
-		epos.offset += adsize;
-		if (byte_offset)
-			lenalloc = epos.offset;
-		else
-			lenalloc = epos.offset - adsize;
-
-		if (!epos.bh)
-			lenalloc -= udf_file_entry_alloc_offset(inode);
-		else
-			lenalloc -= sizeof(struct allocExtDesc);
-
-		while ((etype = udf_current_aext(inode, &epos, &eloc,
-						 &elen, 0)) != -1) {
-			if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
-				udf_write_aext(inode, &epos, &neloc, nelen, 0);
-				if (indirect_ext_len) {
-					/* We managed to free all extents in the
-					 * indirect extent - free it too */
-					BUG_ON(!epos.bh);
-					udf_free_blocks(sb, inode, &epos.block,
-							0, indirect_ext_len);
-				} else if (!epos.bh) {
-					iinfo->i_lenAlloc = lenalloc;
-					mark_inode_dirty(inode);
-				} else
-					udf_update_alloc_ext_desc(inode,
-							&epos, lenalloc);
-				brelse(epos.bh);
-				epos.offset = sizeof(struct allocExtDesc);
-				epos.block = eloc;
-				epos.bh = udf_tread(sb,
-						udf_get_lb_pblock(sb, &eloc, 0));
-				if (elen)
-					indirect_ext_len =
-						(elen + sb->s_blocksize - 1) >>
-						sb->s_blocksize_bits;
-				else
-					indirect_ext_len = 1;
-			} else {
-				extent_trunc(inode, &epos, &eloc, etype,
-					     elen, 0);
-				epos.offset += adsize;
-			}
-		}
+	if (etype == -1) {
+		/* We should extend the file? */
+		WARN_ON(byte_offset);
+		return;
+	}
+	epos.offset -= adsize;
+	extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset);
+	epos.offset += adsize;
+	if (byte_offset)
+		lenalloc = epos.offset;
+	else
+		lenalloc = epos.offset - adsize;
 
-		if (indirect_ext_len) {
-			BUG_ON(!epos.bh);
-			udf_free_blocks(sb, inode, &epos.block, 0,
-					indirect_ext_len);
-		} else if (!epos.bh) {
-			iinfo->i_lenAlloc = lenalloc;
-			mark_inode_dirty(inode);
-		} else
-			udf_update_alloc_ext_desc(inode, &epos, lenalloc);
-	} else if (inode->i_size) {
-		if (byte_offset) {
-			struct kernel_long_ad extent;
+	if (!epos.bh)
+		lenalloc -= udf_file_entry_alloc_offset(inode);
+	else
+		lenalloc -= sizeof(struct allocExtDesc);
 
-			/*
-			 *  OK, there is not extent covering inode->i_size and
-			 *  no extent above inode->i_size => truncate is
-			 *  extending the file by 'offset' blocks.
-			 */
-			if ((!epos.bh &&
-			     epos.offset ==
-					udf_file_entry_alloc_offset(inode)) ||
-			    (epos.bh && epos.offset ==
-						sizeof(struct allocExtDesc))) {
-				/* File has no extents at all or has empty last
-				 * indirect extent! Create a fake extent... */
-				extent.extLocation.logicalBlockNum = 0;
-				extent.extLocation.partitionReferenceNum = 0;
-				extent.extLength =
-					EXT_NOT_RECORDED_NOT_ALLOCATED;
-			} else {
-				epos.offset -= adsize;
-				etype = udf_next_aext(inode, &epos,
-						      &extent.extLocation,
-						      &extent.extLength, 0);
-				extent.extLength |= etype << 30;
-			}
-			udf_extend_file(inode, &epos, &extent,
-					offset +
-					((inode->i_size &
-						(sb->s_blocksize - 1)) != 0));
+	while ((etype = udf_current_aext(inode, &epos, &eloc,
+					 &elen, 0)) != -1) {
+		if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
+			udf_write_aext(inode, &epos, &neloc, nelen, 0);
+			if (indirect_ext_len) {
+				/* We managed to free all extents in the
+				 * indirect extent - free it too */
+				BUG_ON(!epos.bh);
+				udf_free_blocks(sb, inode, &epos.block,
+						0, indirect_ext_len);
+			} else if (!epos.bh) {
+				iinfo->i_lenAlloc = lenalloc;
+				mark_inode_dirty(inode);
+			} else
+				udf_update_alloc_ext_desc(inode,
+						&epos, lenalloc);
+			brelse(epos.bh);
+			epos.offset = sizeof(struct allocExtDesc);
+			epos.block = eloc;
+			epos.bh = udf_tread(sb,
+					udf_get_lb_pblock(sb, &eloc, 0));
+			if (elen)
+				indirect_ext_len =
+					(elen + sb->s_blocksize - 1) >>
+					sb->s_blocksize_bits;
+			else
+				indirect_ext_len = 1;
+		} else {
+			extent_trunc(inode, &epos, &eloc, etype, elen, 0);
+			epos.offset += adsize;
 		}
 	}
+
+	if (indirect_ext_len) {
+		BUG_ON(!epos.bh);
+		udf_free_blocks(sb, inode, &epos.block, 0, indirect_ext_len);
+	} else if (!epos.bh) {
+		iinfo->i_lenAlloc = lenalloc;
+		mark_inode_dirty(inode);
+	} else
+		udf_update_alloc_ext_desc(inode, &epos, lenalloc);
 	iinfo->i_lenExtents = inode->i_size;
 
 	brelse(epos.bh);
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index eba48209f9f..dbd52d4b5ee 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -136,22 +136,20 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
 extern long udf_ioctl(struct file *, unsigned int, unsigned long);
 /* inode.c */
 extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
-extern void udf_expand_file_adinicb(struct inode *, int, int *);
+extern int udf_expand_file_adinicb(struct inode *);
 extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
 extern struct buffer_head *udf_bread(struct inode *, int, int, int *);
-extern void udf_truncate(struct inode *);
+extern int udf_setsize(struct inode *, loff_t);
 extern void udf_read_inode(struct inode *);
 extern void udf_evict_inode(struct inode *);
 extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
 extern long udf_block_map(struct inode *, sector_t);
-extern int udf_extend_file(struct inode *, struct extent_position *,
-			   struct kernel_long_ad *, sector_t);
 extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *,
 			 struct kernel_lb_addr *, uint32_t *, sector_t *);
-extern int8_t udf_add_aext(struct inode *, struct extent_position *,
+extern int udf_add_aext(struct inode *, struct extent_position *,
+			struct kernel_lb_addr *, uint32_t, int);
+extern void udf_write_aext(struct inode *, struct extent_position *,
 			   struct kernel_lb_addr *, uint32_t, int);
-extern int8_t udf_write_aext(struct inode *, struct extent_position *,
-			     struct kernel_lb_addr *, uint32_t, int);
 extern int8_t udf_delete_aext(struct inode *, struct extent_position,
 			      struct kernel_lb_addr, uint32_t);
 extern int8_t udf_next_aext(struct inode *, struct extent_position *,
-- 
cgit v1.2.3


From 6f644e5f97cc8dcb8dc7133562159cc20d27c38f Mon Sep 17 00:00:00 2001
From: Dirk Behme <dirk.behme@de.bosch.com>
Date: Tue, 22 Feb 2011 14:04:19 -0500
Subject: UDF: Fix compiler warning

Fix compiler warning

fs/udf/balloc.c: In function 'udf_bitmap_new_block':
fs/udf/balloc.c:273: warning: passing argument 1 of '_find_next_bit_le' from incompatible pointer type
fs/udf/balloc.c:285: warning: passing argument 1 of '_find_next_bit_le' from incompatible pointer type
fs/udf/balloc.c:311: warning: passing argument 1 of '_find_next_bit_le' from incompatible pointer type
fs/udf/balloc.c:325: warning: passing argument 1 of '_find_next_bit_le' from incompatible pointer type

The main fix is to add a cast to the ext2_find_next_bit() call in the
udf_find_next_one_bit() macro.

As all other usage locations of udf_find_next_one_bit()
directly use bh->b_data (which is a char *), the useless
(char *) cast in line 311 can be removed, too.

Signed-off-by: Dirk Behme <dirk.behme@de.bosch.com>
Signed-off-by: George G. Davis <gdavis@mvista.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/balloc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 306ee39ef2c..8994dd04166 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -31,7 +31,7 @@
 #define udf_set_bit(nr, addr) ext2_set_bit(nr, addr)
 #define udf_test_bit(nr, addr) ext2_test_bit(nr, addr)
 #define udf_find_next_one_bit(addr, size, offset) \
-		ext2_find_next_bit(addr, size, offset)
+		ext2_find_next_bit((unsigned long *)(addr), size, offset)
 
 static int read_block_bitmap(struct super_block *sb,
 			     struct udf_bitmap *bitmap, unsigned int block,
@@ -297,7 +297,7 @@ repeat:
 				break;
 			}
 		} else {
-			bit = udf_find_next_one_bit((char *)bh->b_data,
+			bit = udf_find_next_one_bit(bh->b_data,
 						    sb->s_blocksize << 3,
 						    group_start << 3);
 			if (bit < sb->s_blocksize << 3)
-- 
cgit v1.2.3


From ec29ed5b407d618a8128f5942aade9e1758aa14b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 23 Feb 2011 16:23:20 -0500
Subject: Btrfs: fix fiemap bugs with delalloc

The Btrfs fiemap code wasn't properly returning delalloc extents,
so applications that trust fiemap to decide if there are holes in the
file see holes instead of delalloc.

This reworks the btrfs fiemap code, adding a get_extent helper that
searches for delalloc ranges and also adding a helper for extent_fiemap
that skips past holes in the file.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 138 ++++++++++++++++++++++++++++++++++++---------------
 fs/btrfs/extent_io.h |   2 +-
 fs/btrfs/inode.c     | 126 +++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 224 insertions(+), 42 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e7aeba24270..ff45b80d90f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1433,12 +1433,13 @@ int extent_clear_unlock_delalloc(struct inode *inode,
  */
 u64 count_range_bits(struct extent_io_tree *tree,
 		     u64 *start, u64 search_end, u64 max_bytes,
-		     unsigned long bits)
+		     unsigned long bits, int contig)
 {
 	struct rb_node *node;
 	struct extent_state *state;
 	u64 cur_start = *start;
 	u64 total_bytes = 0;
+	u64 last = 0;
 	int found = 0;
 
 	if (search_end <= cur_start) {
@@ -1463,7 +1464,9 @@ u64 count_range_bits(struct extent_io_tree *tree,
 		state = rb_entry(node, struct extent_state, rb_node);
 		if (state->start > search_end)
 			break;
-		if (state->end >= cur_start && (state->state & bits)) {
+		if (contig && found && state->start > last + 1)
+			break;
+		if (state->end >= cur_start && (state->state & bits) == bits) {
 			total_bytes += min(search_end, state->end) + 1 -
 				       max(cur_start, state->start);
 			if (total_bytes >= max_bytes)
@@ -1472,6 +1475,9 @@ u64 count_range_bits(struct extent_io_tree *tree,
 				*start = state->start;
 				found = 1;
 			}
+			last = state->end;
+		} else if (contig && found) {
+			break;
 		}
 		node = rb_next(node);
 		if (!node)
@@ -2912,6 +2918,46 @@ out:
 	return sector;
 }
 
+/*
+ * helper function for fiemap, which doesn't want to see any holes.
+ * This maps until we find something past 'last'
+ */
+static struct extent_map *get_extent_skip_holes(struct inode *inode,
+						u64 offset,
+						u64 last,
+						get_extent_t *get_extent)
+{
+	u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
+	struct extent_map *em;
+	u64 len;
+
+	if (offset >= last)
+		return NULL;
+
+	while(1) {
+		len = last - offset;
+		if (len == 0)
+			break;
+		len = (len + sectorsize - 1) & ~(sectorsize - 1);
+		em = get_extent(inode, NULL, 0, offset, len, 0);
+		if (!em || IS_ERR(em))
+			return em;
+
+		/* if this isn't a hole return it */
+		if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
+		    em->block_start != EXTENT_MAP_HOLE) {
+			return em;
+		}
+
+		/* this is a hole, advance to the next extent */
+		offset = extent_map_end(em);
+		free_extent_map(em);
+		if (offset >= last)
+			break;
+	}
+	return NULL;
+}
+
 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		__u64 start, __u64 len, get_extent_t *get_extent)
 {
@@ -2921,16 +2967,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	u32 flags = 0;
 	u32 found_type;
 	u64 last;
+	u64 last_for_get_extent = 0;
 	u64 disko = 0;
+	u64 isize = i_size_read(inode);
 	struct btrfs_key found_key;
 	struct extent_map *em = NULL;
 	struct extent_state *cached_state = NULL;
 	struct btrfs_path *path;
 	struct btrfs_file_extent_item *item;
 	int end = 0;
-	u64 em_start = 0, em_len = 0;
+	u64 em_start = 0;
+	u64 em_len = 0;
+	u64 em_end = 0;
 	unsigned long emflags;
-	int hole = 0;
 
 	if (len == 0)
 		return -EINVAL;
@@ -2940,6 +2989,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		return -ENOMEM;
 	path->leave_spinning = 1;
 
+	/*
+	 * lookup the last file extent.  We're not using i_size here
+	 * because there might be preallocation past i_size
+	 */
 	ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
 				       path, inode->i_ino, -1, 0);
 	if (ret < 0) {
@@ -2953,18 +3006,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
 	found_type = btrfs_key_type(&found_key);
 
-	/* No extents, just return */
+	/* No extents, but there might be delalloc bits */
 	if (found_key.objectid != inode->i_ino ||
 	    found_type != BTRFS_EXTENT_DATA_KEY) {
-		btrfs_free_path(path);
-		return 0;
+		/* have to trust i_size as the end */
+		last = (u64)-1;
+		last_for_get_extent = isize;
+	} else {
+		/*
+		 * remember the start of the last extent.  There are a
+		 * bunch of different factors that go into the length of the
+		 * extent, so its much less complex to remember where it started
+		 */
+		last = found_key.offset;
+		last_for_get_extent = last + 1;
 	}
-	last = found_key.offset;
 	btrfs_free_path(path);
 
+	/*
+	 * we might have some extents allocated but more delalloc past those
+	 * extents.  so, we trust isize unless the start of the last extent is
+	 * beyond isize
+	 */
+	if (last < isize) {
+		last = (u64)-1;
+		last_for_get_extent = isize;
+	}
+
 	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
 			 &cached_state, GFP_NOFS);
-	em = get_extent(inode, NULL, 0, off, max - off, 0);
+
+	em = get_extent_skip_holes(inode, off, last_for_get_extent,
+				   get_extent);
 	if (!em)
 		goto out;
 	if (IS_ERR(em)) {
@@ -2973,19 +3046,14 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	}
 
 	while (!end) {
-		hole = 0;
-		off = em->start + em->len;
+		off = extent_map_end(em);
 		if (off >= max)
 			end = 1;
 
-		if (em->block_start == EXTENT_MAP_HOLE) {
-			hole = 1;
-			goto next;
-		}
-
 		em_start = em->start;
 		em_len = em->len;
-
+		em_end = extent_map_end(em);
+		emflags = em->flags;
 		disko = 0;
 		flags = 0;
 
@@ -3004,37 +3072,29 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
 			flags |= FIEMAP_EXTENT_ENCODED;
 
-next:
-		emflags = em->flags;
 		free_extent_map(em);
 		em = NULL;
-		if (!end) {
-			em = get_extent(inode, NULL, 0, off, max - off, 0);
-			if (!em)
-				goto out;
-			if (IS_ERR(em)) {
-				ret = PTR_ERR(em);
-				goto out;
-			}
-			emflags = em->flags;
-		}
-
-		if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
+		if ((em_start >= last) || em_len == (u64)-1 ||
+		   (last == (u64)-1 && isize <= em_end)) {
 			flags |= FIEMAP_EXTENT_LAST;
 			end = 1;
 		}
 
-		if (em_start == last) {
+		/* now scan forward to see if this is really the last extent. */
+		em = get_extent_skip_holes(inode, off, last_for_get_extent,
+					   get_extent);
+		if (IS_ERR(em)) {
+			ret = PTR_ERR(em);
+			goto out;
+		}
+		if (!em) {
 			flags |= FIEMAP_EXTENT_LAST;
 			end = 1;
 		}
-
-		if (!hole) {
-			ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
-						em_len, flags);
-			if (ret)
-				goto out_free;
-		}
+		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
+					      em_len, flags);
+		if (ret)
+			goto out_free;
 	}
 out_free:
 	free_extent_map(em);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7083cfafd06..9318dfefd59 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -191,7 +191,7 @@ void extent_io_exit(void);
 
 u64 count_range_bits(struct extent_io_tree *tree,
 		     u64 *start, u64 search_end,
-		     u64 max_bytes, unsigned long bits);
+		     u64 max_bytes, unsigned long bits, int contig);
 
 void free_extent_state(struct extent_state *state);
 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8d392ed73d5..44b926646e3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1913,7 +1913,7 @@ static int btrfs_clean_io_failures(struct inode *inode, u64 start)
 
 	private = 0;
 	if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
-			     (u64)-1, 1, EXTENT_DIRTY)) {
+			     (u64)-1, 1, EXTENT_DIRTY, 0)) {
 		ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
 					start, &private_failure);
 		if (ret == 0) {
@@ -5282,6 +5282,128 @@ out:
 	return em;
 }
 
+struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
+					   size_t pg_offset, u64 start, u64 len,
+					   int create)
+{
+	struct extent_map *em;
+	struct extent_map *hole_em = NULL;
+	u64 range_start = start;
+	u64 end;
+	u64 found;
+	u64 found_end;
+	int err = 0;
+
+	em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
+	if (IS_ERR(em))
+		return em;
+	if (em) {
+		/*
+		 * if our em maps to a hole, there might
+		 * actually be delalloc bytes behind it
+		 */
+		if (em->block_start != EXTENT_MAP_HOLE)
+			return em;
+		else
+			hole_em = em;
+	}
+
+	/* check to see if we've wrapped (len == -1 or similar) */
+	end = start + len;
+	if (end < start)
+		end = (u64)-1;
+	else
+		end -= 1;
+
+	em = NULL;
+
+	/* ok, we didn't find anything, lets look for delalloc */
+	found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
+				 end, len, EXTENT_DELALLOC, 1);
+	found_end = range_start + found;
+	if (found_end < range_start)
+		found_end = (u64)-1;
+
+	/*
+	 * we didn't find anything useful, return
+	 * the original results from get_extent()
+	 */
+	if (range_start > end || found_end <= start) {
+		em = hole_em;
+		hole_em = NULL;
+		goto out;
+	}
+
+	/* adjust the range_start to make sure it doesn't
+	 * go backwards from the start they passed in
+	 */
+	range_start = max(start,range_start);
+	found = found_end - range_start;
+
+	if (found > 0) {
+		u64 hole_start = start;
+		u64 hole_len = len;
+
+		em = alloc_extent_map(GFP_NOFS);
+		if (!em) {
+			err = -ENOMEM;
+			goto out;
+		}
+		/*
+		 * when btrfs_get_extent can't find anything it
+		 * returns one huge hole
+		 *
+		 * make sure what it found really fits our range, and
+		 * adjust to make sure it is based on the start from
+		 * the caller
+		 */
+		if (hole_em) {
+			u64 calc_end = extent_map_end(hole_em);
+
+			if (calc_end <= start || (hole_em->start > end)) {
+				free_extent_map(hole_em);
+				hole_em = NULL;
+			} else {
+				hole_start = max(hole_em->start, start);
+				hole_len = calc_end - hole_start;
+			}
+		}
+		em->bdev = NULL;
+		if (hole_em && range_start > hole_start) {
+			/* our hole starts before our delalloc, so we
+			 * have to return just the parts of the hole
+			 * that go until  the delalloc starts
+			 */
+			em->len = min(hole_len,
+				      range_start - hole_start);
+			em->start = hole_start;
+			em->orig_start = hole_start;
+			/*
+			 * don't adjust block start at all,
+			 * it is fixed at EXTENT_MAP_HOLE
+			 */
+			em->block_start = hole_em->block_start;
+			em->block_len = hole_len;
+		} else {
+			em->start = range_start;
+			em->len = found;
+			em->orig_start = range_start;
+			em->block_start = EXTENT_MAP_DELALLOC;
+			em->block_len = found;
+		}
+	} else if (hole_em) {
+		return hole_em;
+	}
+out:
+
+	free_extent_map(hole_em);
+	if (err) {
+		free_extent_map(em);
+		return ERR_PTR(err);
+	}
+	return em;
+}
+
 static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
 						  u64 start, u64 len)
 {
@@ -6104,7 +6226,7 @@ out:
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		__u64 start, __u64 len)
 {
-	return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent);
+	return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
 }
 
 int btrfs_readpage(struct file *file, struct page *page)
-- 
cgit v1.2.3


From 2aa15890f3c191326678f1bd68af61ec6b8753ec Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Wed, 23 Feb 2011 13:49:47 +0100
Subject: mm: prevent concurrent unmap_mapping_range() on the same inode

Michael Leun reported that running parallel opens on a fuse filesystem
can trigger a "kernel BUG at mm/truncate.c:475"

Gurudas Pai reported the same bug on NFS.

The reason is, unmap_mapping_range() is not prepared for more than
one concurrent invocation per inode.  For example:

  thread1: going through a big range, stops in the middle of a vma and
     stores the restart address in vm_truncate_count.

  thread2: comes in with a small (e.g. single page) unmap request on
     the same vma, somewhere before restart_address, finds that the
     vma was already unmapped up to the restart address and happily
     returns without doing anything.

Another scenario would be two big unmap requests, both having to
restart the unmapping and each one setting vm_truncate_count to its
own value.  This could go on forever without any of them being able to
finish.

Truncate and hole punching already serialize with i_mutex.  Other
callers of unmap_mapping_range() do not, and it's difficult to get
i_mutex protection for all callers.  In particular ->d_revalidate(),
which calls invalidate_inode_pages2_range() in fuse, may be called
with or without i_mutex.

This patch adds a new mutex to 'struct address_space' to prevent
running multiple concurrent unmap_mapping_range() on the same mapping.

[ We'll hopefully get rid of all this with the upcoming mm
  preemptibility series by Peter Zijlstra, the "mm: Remove i_mmap_mutex
  lockbreak" patch in particular.  But that is for 2.6.39 ]

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Reported-by: Michael Leun <lkml20101129@newton.leun.net>
Reported-by: Gurudas Pai <gurudas.pai@oracle.com>
Tested-by: Gurudas Pai <gurudas.pai@oracle.com>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/gfs2/main.c     |  9 +--------
 fs/inode.c         | 22 +++++++++++++++-------
 fs/nilfs2/btnode.c |  5 -----
 fs/nilfs2/btnode.h |  1 -
 fs/nilfs2/mdt.c    |  4 ++--
 fs/nilfs2/page.c   | 13 -------------
 fs/nilfs2/page.h   |  1 -
 fs/nilfs2/super.c  |  2 +-
 8 files changed, 19 insertions(+), 38 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 85ba027d1c4..72c31a315d9 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -59,14 +59,7 @@ static void gfs2_init_gl_aspace_once(void *foo)
 	struct address_space *mapping = (struct address_space *)(gl + 1);
 
 	gfs2_init_glock_once(gl);
-	memset(mapping, 0, sizeof(*mapping));
-	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
-	spin_lock_init(&mapping->tree_lock);
-	spin_lock_init(&mapping->i_mmap_lock);
-	INIT_LIST_HEAD(&mapping->private_list);
-	spin_lock_init(&mapping->private_lock);
-	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
-	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
+	address_space_init_once(mapping);
 }
 
 /**
diff --git a/fs/inode.c b/fs/inode.c
index da85e56378f..9c2b795ccc9 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -295,6 +295,20 @@ static void destroy_inode(struct inode *inode)
 		call_rcu(&inode->i_rcu, i_callback);
 }
 
+void address_space_init_once(struct address_space *mapping)
+{
+	memset(mapping, 0, sizeof(*mapping));
+	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
+	spin_lock_init(&mapping->tree_lock);
+	spin_lock_init(&mapping->i_mmap_lock);
+	INIT_LIST_HEAD(&mapping->private_list);
+	spin_lock_init(&mapping->private_lock);
+	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
+	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
+	mutex_init(&mapping->unmap_mutex);
+}
+EXPORT_SYMBOL(address_space_init_once);
+
 /*
  * These are initializations that only need to be done
  * once, because the fields are idempotent across use
@@ -308,13 +322,7 @@ void inode_init_once(struct inode *inode)
 	INIT_LIST_HEAD(&inode->i_devices);
 	INIT_LIST_HEAD(&inode->i_wb_list);
 	INIT_LIST_HEAD(&inode->i_lru);
-	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
-	spin_lock_init(&inode->i_data.tree_lock);
-	spin_lock_init(&inode->i_data.i_mmap_lock);
-	INIT_LIST_HEAD(&inode->i_data.private_list);
-	spin_lock_init(&inode->i_data.private_lock);
-	INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
-	INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
+	address_space_init_once(&inode->i_data);
 	i_size_ordered_init(inode);
 #ifdef CONFIG_FSNOTIFY
 	INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 388e9e8f528..85f7baa15f5 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -35,11 +35,6 @@
 #include "btnode.h"
 
 
-void nilfs_btnode_cache_init_once(struct address_space *btnc)
-{
-	nilfs_mapping_init_once(btnc);
-}
-
 static const struct address_space_operations def_btnode_aops = {
 	.sync_page		= block_sync_page,
 };
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 79037494f1e..1b8ebd888c2 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -37,7 +37,6 @@ struct nilfs_btnode_chkey_ctxt {
 	struct buffer_head *newbh;
 };
 
-void nilfs_btnode_cache_init_once(struct address_space *);
 void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
 void nilfs_btnode_cache_clear(struct address_space *);
 struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 6a0e2a189f6..a0babd2bff6 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -454,9 +454,9 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode,
 	struct backing_dev_info *bdi = inode->i_sb->s_bdi;
 
 	INIT_LIST_HEAD(&shadow->frozen_buffers);
-	nilfs_mapping_init_once(&shadow->frozen_data);
+	address_space_init_once(&shadow->frozen_data);
 	nilfs_mapping_init(&shadow->frozen_data, bdi, &shadow_map_aops);
-	nilfs_mapping_init_once(&shadow->frozen_btnodes);
+	address_space_init_once(&shadow->frozen_btnodes);
 	nilfs_mapping_init(&shadow->frozen_btnodes, bdi, &shadow_map_aops);
 	mi->mi_shadow = shadow;
 	return 0;
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 0c432416cfe..a585b35fd6b 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -492,19 +492,6 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
 	return nc;
 }
 
-void nilfs_mapping_init_once(struct address_space *mapping)
-{
-	memset(mapping, 0, sizeof(*mapping));
-	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
-	spin_lock_init(&mapping->tree_lock);
-	INIT_LIST_HEAD(&mapping->private_list);
-	spin_lock_init(&mapping->private_lock);
-
-	spin_lock_init(&mapping->i_mmap_lock);
-	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
-	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
-}
-
 void nilfs_mapping_init(struct address_space *mapping,
 			struct backing_dev_info *bdi,
 			const struct address_space_operations *aops)
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index 622df27cd89..2a00953ebd5 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -61,7 +61,6 @@ void nilfs_free_private_page(struct page *);
 int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
 void nilfs_copy_back_pages(struct address_space *, struct address_space *);
 void nilfs_clear_dirty_pages(struct address_space *);
-void nilfs_mapping_init_once(struct address_space *mapping);
 void nilfs_mapping_init(struct address_space *mapping,
 			struct backing_dev_info *bdi,
 			const struct address_space_operations *aops);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 58fd707174e..1673b3d9984 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1279,7 +1279,7 @@ static void nilfs_inode_init_once(void *obj)
 #ifdef CONFIG_NILFS_XATTR
 	init_rwsem(&ii->xattr_sem);
 #endif
-	nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
+	address_space_init_once(&ii->i_btnode_cache);
 	ii->i_bmap = &ii->i_bmap_data;
 	inode_init_once(&ii->vfs_inode);
 }
-- 
cgit v1.2.3


From 93b270f76e7ef3b81001576860c2701931cdc78b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 24 Feb 2011 17:25:47 +1100
Subject: Fix over-zealous flush_disk when changing device size.

There are two cases when we call flush_disk.
In one, the device has disappeared (check_disk_change) so any
data we hold becomes irrelevant.
In the other, the device has changed size (check_disk_size_change)
so data we hold may be irrelevant.

In both cases it makes sense to discard any 'clean' buffers,
so they will be read back from the device if needed.

In the former case it makes sense to discard 'dirty' buffers
as there will never be anywhere safe to write the data.  In the
second case it *does*not* make sense to discard dirty buffers
as that will lead to file system corruption when you simply enlarge
the containing devices.

flush_disk calls __invalidate_device.
__invalidate_device calls both invalidate_inodes and invalidate_bdev.

invalidate_inodes *does* discard I_DIRTY inodes and this does lead
to fs corruption.

invalidate_bdev *does*not* discard dirty pages, but I don't really care
about that at present.

So this patch adds a flag to __invalidate_device (calling it
__invalidate_device2) to indicate whether dirty buffers should be
killed, and this is passed to invalidate_inodes which can choose to
skip dirty inodes.

flush_disk then passes true from check_disk_change and false from
check_disk_size_change.

dm avoids tripping over this problem by calling i_size_write directly
rather than using check_disk_size_change.

md does use check_disk_size_change and so is affected.

This regression was introduced by commit 608aeef17a which causes
check_disk_size_change to call flush_disk, so it is suitable for any
kernel since 2.6.27.

Cc: stable@kernel.org
Acked-by: Jeff Moyer <jmoyer@redhat.com>
Cc: Andrew Patterson <andrew.patterson@hp.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 fs/block_dev.c | 12 ++++++------
 fs/inode.c     |  9 ++++++++-
 fs/internal.h  |  2 +-
 3 files changed, 15 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 333a7bb4cb9..5e23152d04a 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -927,9 +927,9 @@ EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
  * when a disk has been changed -- either by a media change or online
  * resize.
  */
-static void flush_disk(struct block_device *bdev)
+static void flush_disk(struct block_device *bdev, bool kill_dirty)
 {
-	if (__invalidate_device(bdev)) {
+	if (__invalidate_device(bdev, kill_dirty)) {
 		char name[BDEVNAME_SIZE] = "";
 
 		if (bdev->bd_disk)
@@ -966,7 +966,7 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
 		       "%s: detected capacity change from %lld to %lld\n",
 		       name, bdev_size, disk_size);
 		i_size_write(bdev->bd_inode, disk_size);
-		flush_disk(bdev);
+		flush_disk(bdev, false);
 	}
 }
 EXPORT_SYMBOL(check_disk_size_change);
@@ -1019,7 +1019,7 @@ int check_disk_change(struct block_device *bdev)
 	if (!(events & DISK_EVENT_MEDIA_CHANGE))
 		return 0;
 
-	flush_disk(bdev);
+	flush_disk(bdev, true);
 	if (bdops->revalidate_disk)
 		bdops->revalidate_disk(bdev->bd_disk);
 	return 1;
@@ -1601,7 +1601,7 @@ fail:
 }
 EXPORT_SYMBOL(lookup_bdev);
 
-int __invalidate_device(struct block_device *bdev)
+int __invalidate_device(struct block_device *bdev, bool kill_dirty)
 {
 	struct super_block *sb = get_super(bdev);
 	int res = 0;
@@ -1614,7 +1614,7 @@ int __invalidate_device(struct block_device *bdev)
 		 * hold).
 		 */
 		shrink_dcache_sb(sb);
-		res = invalidate_inodes(sb);
+		res = invalidate_inodes(sb, kill_dirty);
 		drop_super(sb);
 	}
 	invalidate_bdev(bdev);
diff --git a/fs/inode.c b/fs/inode.c
index da85e56378f..c50d7feb87b 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -540,11 +540,14 @@ void evict_inodes(struct super_block *sb)
 /**
  * invalidate_inodes	- attempt to free all inodes on a superblock
  * @sb:		superblock to operate on
+ * @kill_dirty: flag to guide handling of dirty inodes
  *
  * Attempts to free all inodes for a given superblock.  If there were any
  * busy inodes return a non-zero value, else zero.
+ * If @kill_dirty is set, discard dirty inodes too, otherwise treat
+ * them as busy.
  */
-int invalidate_inodes(struct super_block *sb)
+int invalidate_inodes(struct super_block *sb, bool kill_dirty)
 {
 	int busy = 0;
 	struct inode *inode, *next;
@@ -556,6 +559,10 @@ int invalidate_inodes(struct super_block *sb)
 	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
 		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
 			continue;
+		if (inode->i_state & I_DIRTY && !kill_dirty) {
+			busy = 1;
+			continue;
+		}
 		if (atomic_read(&inode->i_count)) {
 			busy = 1;
 			continue;
diff --git a/fs/internal.h b/fs/internal.h
index 0663568b124..9b976b57d7f 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -112,4 +112,4 @@ extern void release_open_intent(struct nameidata *);
  */
 extern int get_nr_dirty_inodes(void);
 extern void evict_inodes(struct super_block *);
-extern int invalidate_inodes(struct super_block *);
+extern int invalidate_inodes(struct super_block *, bool);
-- 
cgit v1.2.3


From bf9faa2aa30e2ebf30287536712ed2717bb47002 Mon Sep 17 00:00:00 2001
From: "J. R. Okajima" <hooanon05@yahoo.co.jp>
Date: Wed, 23 Feb 2011 16:59:49 +0900
Subject: Unlock vfsmount_lock in do_umount

By the commit
	b3e19d9 2011-01-07 fs: scale mntget/mntput
vfsmount_lock was introduced around testing mnt_count.
Fix the mis-typed 'unlock'

Signed-off-by: J. R. Okajima <hooanon05@yahoo.co.jp>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 7b0b9537169..d1edf26025d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1244,7 +1244,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
 		 */
 		br_write_lock(vfsmount_lock);
 		if (mnt_get_count(mnt) != 2) {
-			br_write_lock(vfsmount_lock);
+			br_write_unlock(vfsmount_lock);
 			return -EBUSY;
 		}
 		br_write_unlock(vfsmount_lock);
-- 
cgit v1.2.3


From 7a39de1510a3fd07a77530440292735d305fe510 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@gnu.org>
Date: Thu, 20 Jan 2011 15:32:05 -0300
Subject: quota: return -ENOMEM when memory allocation fails

Signed-off-by: Davidlohr Bueso <dave@gnu.org>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/quota/quota_v2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index 65444d29406..f1ab3604db5 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -112,7 +112,7 @@ static int v2_read_file_info(struct super_block *sb, int type)
 	if (!info->dqi_priv) {
 		printk(KERN_WARNING
 		       "Not enough memory for quota information structure.\n");
-		return -1;
+		return -ENOMEM;
 	}
 	qinfo = info->dqi_priv;
 	if (version == 0) {
-- 
cgit v1.2.3


From 4b44dd300d5bfd500f170bae13f95f589de0b28f Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Fri, 21 Jan 2011 10:52:56 +0800
Subject: ext3: Adjust trim start with first_data_block.

As we reached consensus in the e-mail thread[1], the trim start should
have first_data_block added to it. So this patch does that and removes
the check for start < first_data_block.

[1] http://www.spinics.net/lists/linux-ext4/msg22737.html

Cc: Jan Kara <jack@suse.cz>
Cc: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: Tao Ma <boyu.mt@taobao.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/balloc.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 045995c8ce5..5b8344fab79 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -2090,7 +2090,8 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
 	ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count);
 	int ret = 0;
 
-	start = range->start >> sb->s_blocksize_bits;
+	start = (range->start >> sb->s_blocksize_bits) +
+		le32_to_cpu(es->s_first_data_block);
 	len = range->len >> sb->s_blocksize_bits;
 	minlen = range->minlen >> sb->s_blocksize_bits;
 	trimmed = 0;
@@ -2099,10 +2100,6 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
 		return -EINVAL;
 	if (start >= max_blks)
 		goto out;
-	if (start < le32_to_cpu(es->s_first_data_block)) {
-		len -= le32_to_cpu(es->s_first_data_block) - start;
-		start = le32_to_cpu(es->s_first_data_block);
-	}
 	if (start + len > max_blks)
 		len = max_blks - start;
 
-- 
cgit v1.2.3


From bbac751dc85cbf0953a221171a746d69a0b1a85f Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Fri, 21 Jan 2011 22:09:20 +0800
Subject: ext3: speed up group trim with the right free block count.

When we trim some free blocks in a group of ext3, we should
calculate the free blocks properly and check whether there are
enough free blocks left for us to trim. The current solution only
subtracts a free extent from the remaining count if it is large
enough to be trimmed, which is wrong.

Let us see a small example:
a group has 1.5M free, made up of extents of 300k, 300k, 300k, 300k
and 300k, and minblocks is 1M. With the current solution, we have to
iterate over the whole group since these 300k extents are never
subtracted from 1.5M. But actually we should exit after we find the
first 2 free extents, since the remaining 3 chunks only sum up to 900K
once we subtract the first 600K, even though they can't be trimmed.

Cc: Jan Kara <jack@suse.cz>
Cc: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: Tao Ma <boyu.mt@taobao.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/balloc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 5b8344fab79..db1906b4e39 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -1991,6 +1991,7 @@ ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group,
 		spin_unlock(sb_bgl_lock(sbi, group));
 		percpu_counter_sub(&sbi->s_freeblocks_counter, next - start);
 
+		free_blocks -= next - start;
 		/* Do not issue a TRIM on extents smaller than minblocks */
 		if ((next - start) < minblocks)
 			goto free_extent;
@@ -2040,7 +2041,7 @@ free_extent:
 		cond_resched();
 
 		/* No more suitable extents */
-		if ((free_blocks - count) < minblocks)
+		if (free_blocks < minblocks)
 			break;
 	}
 
-- 
cgit v1.2.3


From 4c16c36ad62fff8485215bd803d778eb2bd0b8bd Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Wed, 23 Feb 2011 16:11:33 -0500
Subject: GFS2: deallocation performance patch

This patch is a performance improvement to GFS2's dealloc code.
Rather than update the quota file and statfs file for every
single block that's stripped off in unlink function do_strip,
this patch keeps track and updates them once for every layer
that's stripped.  This is done entirely inside the existing
transaction, so there should be no risk of corruption.
The other functions that deallocate blocks will be unaffected
because they are using wrapper functions that do the same
thing that they do today.

I tested this code on my roth cluster by creating 200
files in a directory, each of which is 100MB, then on
four nodes, I simultaneously deleted the files, thus competing
for GFS2 resources (but different files).  The commands
I used were:

[root@roth-01]# time for i in `seq 1 4 200` ; do rm /mnt/gfs2/bigdir/gfs2.$i; done
[root@roth-02]# time for i in `seq 2 4 200` ; do rm /mnt/gfs2/bigdir/gfs2.$i; done
[root@roth-03]# time for i in `seq 3 4 200` ; do rm /mnt/gfs2/bigdir/gfs2.$i; done
[root@roth-05]# time for i in `seq 4 4 200` ; do rm /mnt/gfs2/bigdir/gfs2.$i; done

The performance increase was significant:

             roth-01     roth-02     roth-03     roth-05
             ---------   ---------   ---------   ---------
old: real    0m34.027s   0m25.021s   0m23.906s   0m35.646s
new: real    0m22.379s   0m24.362s   0m24.133s   0m18.562s

Total time spent deleting:
old: 118.6s
new:  89.4s

For this particular case, this showed a 25% performance increase for
GFS2 unlinks.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c | 20 +++++++++++++++-----
 fs/gfs2/rgrp.c | 34 +++++++++++++++++++++++++++++++---
 fs/gfs2/rgrp.h |  2 ++
 3 files changed, 48 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 3c4039d5eef..ef3dc4b9fae 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -21,6 +21,7 @@
 #include "meta_io.h"
 #include "quota.h"
 #include "rgrp.h"
+#include "super.h"
 #include "trans.h"
 #include "dir.h"
 #include "util.h"
@@ -757,7 +758,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_rgrp_list rlist;
 	u64 bn, bstart;
-	u32 blen;
+	u32 blen, btotal;
 	__be64 *p;
 	unsigned int rg_blocks = 0;
 	int metadata;
@@ -839,6 +840,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 
 	bstart = 0;
 	blen = 0;
+	btotal = 0;
 
 	for (p = top; p < bottom; p++) {
 		if (!*p)
@@ -851,9 +853,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 		else {
 			if (bstart) {
 				if (metadata)
-					gfs2_free_meta(ip, bstart, blen);
+					__gfs2_free_meta(ip, bstart, blen);
 				else
-					gfs2_free_data(ip, bstart, blen);
+					__gfs2_free_data(ip, bstart, blen);
+
+				btotal += blen;
 			}
 
 			bstart = bn;
@@ -865,11 +869,17 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 	}
 	if (bstart) {
 		if (metadata)
-			gfs2_free_meta(ip, bstart, blen);
+			__gfs2_free_meta(ip, bstart, blen);
 		else
-			gfs2_free_data(ip, bstart, blen);
+			__gfs2_free_data(ip, bstart, blen);
+
+		btotal += blen;
 	}
 
+	gfs2_statfs_change(sdp, 0, +btotal, 0);
+	gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
+			  ip->i_inode.i_gid);
+
 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
 
 	gfs2_dinode_out(ip, dibh->b_data);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 7293ea27020..cf930cd9664 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1602,7 +1602,7 @@ rgrp_error:
  *
  */
 
-void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
+void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_rgrpd *rgd;
@@ -1617,7 +1617,21 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
 
 	gfs2_trans_add_rg(rgd);
+}
 
+/**
+ * gfs2_free_data - free a contiguous run of data block(s)
+ * @ip: the inode these blocks are being freed from
+ * @bstart: first block of a run of contiguous blocks
+ * @blen: the length of the block run
+ *
+ */
+
+void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+
+	__gfs2_free_data(ip, bstart, blen);
 	gfs2_statfs_change(sdp, 0, +blen, 0);
 	gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
 }
@@ -1630,7 +1644,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
  *
  */
 
-void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
+void __gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_rgrpd *rgd;
@@ -1645,10 +1659,24 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
 
 	gfs2_trans_add_rg(rgd);
+	gfs2_meta_wipe(ip, bstart, blen);
+}
 
+/**
+ * gfs2_free_meta - free a contiguous run of data block(s)
+ * @ip: the inode these blocks are being freed from
+ * @bstart: first block of a run of contiguous blocks
+ * @blen: the length of the block run
+ *
+ */
+
+void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+
+	__gfs2_free_meta(ip, bstart, blen);
 	gfs2_statfs_change(sdp, 0, +blen, 0);
 	gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
-	gfs2_meta_wipe(ip, bstart, blen);
 }
 
 void gfs2_unlink_di(struct inode *inode)
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 50c2bb04369..a80e3034ac4 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -52,7 +52,9 @@ extern int gfs2_ri_update(struct gfs2_inode *ip);
 extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
 extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
 
+extern void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
 extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
+extern void __gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
 extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
 extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
 extern void gfs2_unlink_di(struct inode *inode);
-- 
cgit v1.2.3


From e7407d1619713f4b1fdff3a485e1bd8e77bd480d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 24 Feb 2011 09:56:32 +0100
Subject: block: bd_link_disk_holder() should hold on to holder_dir

The new implementation of bd_link_disk_holder() added by 49731baa41d
(block: restore multiple bd_link_disk_holder() support) didn't get an
extra reference for the holder_dir kobject of the slave bdev; however,
bdev kills holder_dir on removal, not release, so if the slave bdev is
removed while there are holder links, the holder_dir will be destroyed
while there still are holder links, which leads to oops later when
bd_unlink_disk_holder() tries to remove those links.

Make bd_link_disk_holder() grab an extra reference for the slave's
holder_dir and put it in bd_unlink_disk_holder().

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: "Hawrylewicz Czarnowski, Przemyslaw" <przemyslaw.hawrylewicz.czarnowski@intel.com>
Tested-by: "Hawrylewicz Czarnowski, Przemyslaw" <przemyslaw.hawrylewicz.czarnowski@intel.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/block_dev.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 4fb8a343153..94d41db6200 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -873,6 +873,11 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
 	ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
 	if (ret)
 		goto out_del;
+	/*
+	 * bdev could be deleted beneath us which would implicitly destroy
+	 * the holder directory.  Hold on to it.
+	 */
+	kobject_get(bdev->bd_part->holder_dir);
 
 	list_add(&holder->list, &bdev->bd_holder_disks);
 	goto out_unlock;
@@ -909,6 +914,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
 		del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
 		del_symlink(bdev->bd_part->holder_dir,
 			    &disk_to_dev(disk)->kobj);
+		kobject_put(bdev->bd_part->holder_dir);
 		list_del_init(&holder->list);
 		kfree(holder);
 	}
-- 
cgit v1.2.3


From 5a18ec176c934ca1bc9dc61580a5e0e90a9b5733 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Fri, 25 Feb 2011 14:44:58 +0100
Subject: fuse: fix hang of single threaded fuseblk filesystem

Single threaded NTFS-3G could get stuck if a delayed RELEASE reply
triggered a DESTROY request via path_put().

Fix this by

 a) making RELEASE requests synchronous, whenever possible, on fuseblk
 filesystems

 b) if not possible (triggered by an asynchronous read/write) then do
 the path_put() in a separate thread with schedule_work().

Reported-by: Oliver Neukum <oneukum@suse.de>
Cc: stable@kernel.org
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/file.c   | 52 +++++++++++++++++++++++++++++++++++++++++++++-------
 fs/fuse/fuse_i.h |  6 +++++-
 2 files changed, 50 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 95da1bc1c82..9e0832dbb1e 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -86,18 +86,52 @@ struct fuse_file *fuse_file_get(struct fuse_file *ff)
 	return ff;
 }
 
+static void fuse_release_async(struct work_struct *work)
+{
+	struct fuse_req *req;
+	struct fuse_conn *fc;
+	struct path path;
+
+	req = container_of(work, struct fuse_req, misc.release.work);
+	path = req->misc.release.path;
+	fc = get_fuse_conn(path.dentry->d_inode);
+
+	fuse_put_request(fc, req);
+	path_put(&path);
+}
+
 static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
 {
-	path_put(&req->misc.release.path);
+	if (fc->destroy_req) {
+		/*
+		 * If this is a fuseblk mount, then it's possible that
+		 * releasing the path will result in releasing the
+		 * super block and sending the DESTROY request.  If
+		 * the server is single threaded, this would hang.
+		 * For this reason do the path_put() in a separate
+		 * thread.
+		 */
+		atomic_inc(&req->count);
+		INIT_WORK(&req->misc.release.work, fuse_release_async);
+		schedule_work(&req->misc.release.work);
+	} else {
+		path_put(&req->misc.release.path);
+	}
 }
 
-static void fuse_file_put(struct fuse_file *ff)
+static void fuse_file_put(struct fuse_file *ff, bool sync)
 {
 	if (atomic_dec_and_test(&ff->count)) {
 		struct fuse_req *req = ff->reserved_req;
 
-		req->end = fuse_release_end;
-		fuse_request_send_background(ff->fc, req);
+		if (sync) {
+			fuse_request_send(ff->fc, req);
+			path_put(&req->misc.release.path);
+			fuse_put_request(ff->fc, req);
+		} else {
+			req->end = fuse_release_end;
+			fuse_request_send_background(ff->fc, req);
+		}
 		kfree(ff);
 	}
 }
@@ -219,8 +253,12 @@ void fuse_release_common(struct file *file, int opcode)
 	 * Normally this will send the RELEASE request, however if
 	 * some asynchronous READ or WRITE requests are outstanding,
 	 * the sending will be delayed.
+	 *
+	 * Make the release synchronous if this is a fuseblk mount,
+	 * synchronous RELEASE is allowed (and desirable) in this case
+	 * because the server can be trusted not to screw up.
 	 */
-	fuse_file_put(ff);
+	fuse_file_put(ff, ff->fc->destroy_req != NULL);
 }
 
 static int fuse_open(struct inode *inode, struct file *file)
@@ -558,7 +596,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
 		page_cache_release(page);
 	}
 	if (req->ff)
-		fuse_file_put(req->ff);
+		fuse_file_put(req->ff, false);
 }
 
 static void fuse_send_readpages(struct fuse_req *req, struct file *file)
@@ -1137,7 +1175,7 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
 static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
 {
 	__free_page(req->pages[0]);
-	fuse_file_put(req->ff);
+	fuse_file_put(req->ff, false);
 }
 
 static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index ae5744a2f9e..d4286947bc2 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -21,6 +21,7 @@
 #include <linux/rwsem.h>
 #include <linux/rbtree.h>
 #include <linux/poll.h>
+#include <linux/workqueue.h>
 
 /** Max number of pages that can be used in a single read request */
 #define FUSE_MAX_PAGES_PER_REQ 32
@@ -262,7 +263,10 @@ struct fuse_req {
 	/** Data for asynchronous requests */
 	union {
 		struct {
-			struct fuse_release_in in;
+			union {
+				struct fuse_release_in in;
+				struct work_struct work;
+			};
 			struct path path;
 		} release;
 		struct fuse_init_in init_in;
-- 
cgit v1.2.3


From 8d56addd70c7c0626502569e22cc8fce49ae39f5 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Fri, 25 Feb 2011 14:44:58 +0100
Subject: fuse: fix truncate after open

Commit e1181ee6 "vfs: pass struct file to do_truncate on O_TRUNC
opens" broke the behavior of open(O_TRUNC|O_RDONLY) in fuse.  Fuse
assumed that when called from open, a truncate() will be done, not an
ftruncate().

Fix by restoring the old behavior, based on the ATTR_OPEN flag.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/dir.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index bfed8447ed8..83543b5ff94 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1283,8 +1283,11 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
 	if (err)
 		return err;
 
-	if ((attr->ia_valid & ATTR_OPEN) && fc->atomic_o_trunc)
-		return 0;
+	if (attr->ia_valid & ATTR_OPEN) {
+		if (fc->atomic_o_trunc)
+			return 0;
+		file = NULL;
+	}
 
 	if (attr->ia_valid & ATTR_SIZE)
 		is_truncate = true;
-- 
cgit v1.2.3


From f129ccc9231c95513a1227ca9da876beeb03e577 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@au1.ibm.com>
Date: Fri, 25 Feb 2011 15:33:02 +0000
Subject: afs: Fix oops in afs_unlink_writeback

I'm seeing the following oops when testing afs:

  Unable to handle kernel paging request for data at address 0x00000008
  ...
  NIP [c0000000003393b0] .afs_unlink_writeback+0x38/0xc0
  LR [c00000000033987c] .afs_put_writeback+0x98/0xec
  Call Trace:
  [c00000000345f600] [c00000000033987c] .afs_put_writeback+0x98/0xec
  [c00000000345f690] [c00000000033ae80] .afs_write_begin+0x6a4/0x75c
  [c00000000345f790] [c00000000012b77c] .generic_file_buffered_write+0x148/0x320
  [c00000000345f8d0] [c00000000012e1b8] .__generic_file_aio_write+0x37c/0x3e4
  [c00000000345f9d0] [c00000000012e2a8] .generic_file_aio_write+0x88/0xfc
  [c00000000345fa90] [c0000000003390a8] .afs_file_write+0x10c/0x178
  [c00000000345fb40] [c000000000188788] .do_sync_write+0xc4/0x128
  [c00000000345fcc0] [c000000000189658] .vfs_write+0xe8/0x1d8
  [c00000000345fd70] [c000000000189884] .SyS_write+0x68/0xb0
  [c00000000345fe30] [c000000000008564] syscall_exit+0x0/0x40

afs_write_begin hits an error and calls afs_unlink_writeback. In there
we do list_del_init on an uninitialised list.

The patch below initialises ->link when creating the afs_writeback struct.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/afs/write.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/afs/write.c b/fs/afs/write.c
index 15690bb1d3b..789b3afb342 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -140,6 +140,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
 	candidate->first = candidate->last = index;
 	candidate->offset_first = from;
 	candidate->to_last = to;
+	INIT_LIST_HEAD(&candidate->link);
 	candidate->usage = 1;
 	candidate->state = AFS_WBACK_PENDING;
 	init_waitqueue_head(&candidate->waitq);
-- 
cgit v1.2.3


From 22bacca48a1755f79b7e0f192ddb9fbb7fc6e64e Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Fri, 25 Feb 2011 14:44:12 -0800
Subject: epoll: prevent creating circular epoll structures

In several places, an epoll fd can call another file's ->f_op->poll()
method with ep->mtx held.  This is in general unsafe, because that other
file could itself be an epoll fd that contains the original epoll fd.

The code defends against this possibility in its own ->poll() method using
ep_call_nested, but there are several other unsafe calls to ->poll
elsewhere that can be made to deadlock.  For example, the following simple
program causes the call in ep_insert to recursively call the original fd's
->poll, leading to deadlock:

 #include <unistd.h>
 #include <sys/epoll.h>

 int main(void) {
     int e1, e2, p[2];
     struct epoll_event evt = {
         .events = EPOLLIN
     };

     e1 = epoll_create(1);
     e2 = epoll_create(2);
     pipe(p);

     epoll_ctl(e2, EPOLL_CTL_ADD, e1, &evt);
     epoll_ctl(e1, EPOLL_CTL_ADD, p[0], &evt);
     write(p[1], p, sizeof p);
     epoll_ctl(e1, EPOLL_CTL_ADD, e2, &evt);

     return 0;
 }

On insertion, check whether the inserted file is itself a struct epoll,
and if so, do a recursive walk to detect whether inserting this file would
create a loop of epoll structures, which could lead to deadlock.

[nelhage@ksplice.com: Use epmutex to serialize concurrent inserts]
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Nelson Elhage <nelhage@ksplice.com>
Reported-by: Nelson Elhage <nelhage@ksplice.com>
Tested-by: Nelson Elhage <nelhage@ksplice.com>
Cc: <stable@kernel.org>		[2.6.34+, possibly earlier]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 95 insertions(+)

(limited to 'fs')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 267d0ada454..4a09af9e9a6 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -63,6 +63,13 @@
  * cleanup path and it is also acquired by eventpoll_release_file()
  * if a file has been pushed inside an epoll set and it is then
  * close()d without a previous call toepoll_ctl(EPOLL_CTL_DEL).
+ * It is also acquired when inserting an epoll fd onto another epoll
+ * fd. We do this so that we walk the epoll tree and ensure that this
+ * insertion does not create a cycle of epoll file descriptors, which
+ * could lead to deadlock. We need a global mutex to prevent two
+ * simultaneous inserts (A into B and B into A) from racing and
+ * constructing a cycle without either insert observing that it is
+ * going to.
  * It is possible to drop the "ep->mtx" and to use the global
  * mutex "epmutex" (together with "ep->lock") to have it working,
  * but having "ep->mtx" will make the interface more scalable.
@@ -224,6 +231,9 @@ static long max_user_watches __read_mostly;
  */
 static DEFINE_MUTEX(epmutex);
 
+/* Used to check for epoll file descriptor inclusion loops */
+static struct nested_calls poll_loop_ncalls;
+
 /* Used for safe wake up implementation */
 static struct nested_calls poll_safewake_ncalls;
 
@@ -1198,6 +1208,62 @@ retry:
 	return res;
 }
 
+/**
+ * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
+ *                      API, to verify that adding an epoll file inside another
+ *                      epoll structure, does not violate the constraints, in
+ *                      terms of closed loops, or too deep chains (which can
+ *                      result in excessive stack usage).
+ *
+ * @priv: Pointer to the epoll file to be currently checked.
+ * @cookie: Original cookie for this call. This is the top-of-the-chain epoll
+ *          data structure pointer.
+ * @call_nests: Current dept of the @ep_call_nested() call stack.
+ *
+ * Returns: Returns zero if adding the epoll @file inside current epoll
+ *          structure @ep does not violate the constraints, or -1 otherwise.
+ */
+static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
+{
+	int error = 0;
+	struct file *file = priv;
+	struct eventpoll *ep = file->private_data;
+	struct rb_node *rbp;
+	struct epitem *epi;
+
+	mutex_lock(&ep->mtx);
+	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+		epi = rb_entry(rbp, struct epitem, rbn);
+		if (unlikely(is_file_epoll(epi->ffd.file))) {
+			error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+					       ep_loop_check_proc, epi->ffd.file,
+					       epi->ffd.file->private_data, current);
+			if (error != 0)
+				break;
+		}
+	}
+	mutex_unlock(&ep->mtx);
+
+	return error;
+}
+
+/**
+ * ep_loop_check - Performs a check to verify that adding an epoll file (@file)
+ *                 another epoll file (represented by @ep) does not create
+ *                 closed loops or too deep chains.
+ *
+ * @ep: Pointer to the epoll private data structure.
+ * @file: Pointer to the epoll file to be checked.
+ *
+ * Returns: Returns zero if adding the epoll @file inside current epoll
+ *          structure @ep does not violate the constraints, or -1 otherwise.
+ */
+static int ep_loop_check(struct eventpoll *ep, struct file *file)
+{
+	return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+			      ep_loop_check_proc, file, ep, current);
+}
+
 /*
  * Open an eventpoll file descriptor.
  */
@@ -1246,6 +1312,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 		struct epoll_event __user *, event)
 {
 	int error;
+	int did_lock_epmutex = 0;
 	struct file *file, *tfile;
 	struct eventpoll *ep;
 	struct epitem *epi;
@@ -1287,6 +1354,25 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	 */
 	ep = file->private_data;
 
+	/*
+	 * When we insert an epoll file descriptor, inside another epoll file
+	 * descriptor, there is the change of creating closed loops, which are
+	 * better be handled here, than in more critical paths.
+	 *
+	 * We hold epmutex across the loop check and the insert in this case, in
+	 * order to prevent two separate inserts from racing and each doing the
+	 * insert "at the same time" such that ep_loop_check passes on both
+	 * before either one does the insert, thereby creating a cycle.
+	 */
+	if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) {
+		mutex_lock(&epmutex);
+		did_lock_epmutex = 1;
+		error = -ELOOP;
+		if (ep_loop_check(ep, tfile) != 0)
+			goto error_tgt_fput;
+	}
+
+
 	mutex_lock(&ep->mtx);
 
 	/*
@@ -1322,6 +1408,9 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	mutex_unlock(&ep->mtx);
 
 error_tgt_fput:
+	if (unlikely(did_lock_epmutex))
+		mutex_unlock(&epmutex);
+
 	fput(tfile);
 error_fput:
 	fput(file);
@@ -1441,6 +1530,12 @@ static int __init eventpoll_init(void)
 		EP_ITEM_COST;
 	BUG_ON(max_user_watches < 0);
 
+	/*
+	 * Initialize the structure used to perform epoll file descriptor
+	 * inclusion loops checks.
+	 */
+	ep_nested_calls_init(&poll_loop_ncalls);
+
 	/* Initialize the structure used to perform safe poll wait head wake ups */
 	ep_nested_calls_init(&poll_safewake_ncalls);
 
-- 
cgit v1.2.3


From 294f6cf48666825d23c9372ef37631232746e40d Mon Sep 17 00:00:00 2001
From: Timo Warns <Warns@pre-sense.de>
Date: Fri, 25 Feb 2011 14:44:21 -0800
Subject: ldm: corrupted partition table can cause kernel oops

The kernel automatically evaluates partition tables of storage devices.
The code for evaluating LDM partitions (in fs/partitions/ldm.c) contains
a bug that causes a kernel oops on certain corrupted LDM partitions.  A
kernel subsystem seems to crash, because, after the oops, the kernel no
longer recognizes newly connected storage devices.

The patch changes ldm_parse_vmdb() to validate the value of vblk_size and
reject a VMDB whose VBLK size is zero.

Signed-off-by: Timo Warns <warns@pre-sense.de>
Cc: Eugene Teo <eugeneteo@kernel.sg>
Acked-by: Richard Russon <ldm@flatcap.org>
Cc: Harvey Harrison <harvey.harrison@gmail.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/partitions/ldm.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 789c625c7aa..b10e3540d5b 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -251,6 +251,11 @@ static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm)
 	}
 
 	vm->vblk_size     = get_unaligned_be32(data + 0x08);
+	if (vm->vblk_size == 0) {
+		ldm_error ("Illegal VBLK size");
+		return false;
+	}
+
 	vm->vblk_offset   = get_unaligned_be32(data + 0x0C);
 	vm->last_vblk_seq = get_unaligned_be32(data + 0x04);
 
-- 
cgit v1.2.3


From 3bd9a5d734c7cc7533b27abf451416c7f50095a7 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@gmail.com>
Date: Fri, 25 Feb 2011 14:44:26 -0800
Subject: aio: fix rcu ioctx lookup

aio-dio-invalidate-failure GPFs in aio_put_req from io_submit.

lookup_ioctx doesn't implement the rcu lookup pattern properly.
rcu_read_lock does not prevent refcount going to zero, so we might take
a refcount on a zero count ioctx.

Fix the bug by atomically testing for zero refcount before incrementing.

[jack@suse.cz: added comment into the code]
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/aio.c | 35 ++++++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index fc557a3be0a..b4dd668fbcc 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -239,15 +239,23 @@ static void __put_ioctx(struct kioctx *ctx)
 	call_rcu(&ctx->rcu_head, ctx_rcu_free);
 }
 
-#define get_ioctx(kioctx) do {						\
-	BUG_ON(atomic_read(&(kioctx)->users) <= 0);			\
-	atomic_inc(&(kioctx)->users);					\
-} while (0)
-#define put_ioctx(kioctx) do {						\
-	BUG_ON(atomic_read(&(kioctx)->users) <= 0);			\
-	if (unlikely(atomic_dec_and_test(&(kioctx)->users))) 		\
-		__put_ioctx(kioctx);					\
-} while (0)
+static inline void get_ioctx(struct kioctx *kioctx)
+{
+	BUG_ON(atomic_read(&kioctx->users) <= 0);
+	atomic_inc(&kioctx->users);
+}
+
+static inline int try_get_ioctx(struct kioctx *kioctx)
+{
+	return atomic_inc_not_zero(&kioctx->users);
+}
+
+static inline void put_ioctx(struct kioctx *kioctx)
+{
+	BUG_ON(atomic_read(&kioctx->users) <= 0);
+	if (unlikely(atomic_dec_and_test(&kioctx->users)))
+		__put_ioctx(kioctx);
+}
 
 /* ioctx_alloc
  *	Allocates and initializes an ioctx.  Returns an ERR_PTR if it failed.
@@ -601,8 +609,13 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 	rcu_read_lock();
 
 	hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) {
-		if (ctx->user_id == ctx_id && !ctx->dead) {
-			get_ioctx(ctx);
+		/*
+		 * RCU protects us against accessing freed memory but
+		 * we have to be careful not to get a reference when the
+		 * reference count already dropped to 0 (ctx->dead test
+		 * is unreliable because of races).
+		 */
+		if (ctx->user_id == ctx_id && !ctx->dead && try_get_ioctx(ctx)){
 			ret = ctx;
 			break;
 		}
-- 
cgit v1.2.3


From 7137c6bd455234bcb7560fd829e6ee49cae5fed6 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 25 Feb 2011 14:44:27 -0800
Subject: aio: fix race between io_destroy() and io_submit()

A race can occur when io_submit() races with io_destroy():

 CPU1						CPU2
io_submit()
  do_io_submit()
    ...
    ctx = lookup_ioctx(ctx_id);
						io_destroy()
    Now do_io_submit() holds the last reference to ctx.
    ...
    queue new AIO
    put_ioctx(ctx) - frees ctx with active AIOs

We solve this issue by checking whether ctx is being destroyed in AIO
submission path after adding new AIO to ctx.  Then we are guaranteed that
either io_destroy() waits for new AIO or we see that ctx is being
destroyed and bail out.

Cc: Nick Piggin <npiggin@kernel.dk>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/aio.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index b4dd668fbcc..26869cde395 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1642,6 +1642,23 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 		goto out_put_req;
 
 	spin_lock_irq(&ctx->ctx_lock);
+	/*
+	 * We could have raced with io_destroy() and are currently holding a
+	 * reference to ctx which should be destroyed. We cannot submit IO
+	 * since ctx gets freed as soon as io_submit() puts its reference.  The
+	 * check here is reliable: io_destroy() sets ctx->dead before waiting
+	 * for outstanding IO and the barrier between these two is realized by
+	 * unlock of mm->ioctx_lock and lock of ctx->ctx_lock.  Analogously we
+	 * increment ctx->reqs_active before checking for ctx->dead and the
+	 * barrier is realized by unlock and lock of ctx->ctx_lock. Thus if we
+	 * don't see ctx->dead set here, io_destroy() waits for our IO to
+	 * finish.
+	 */
+	if (ctx->dead) {
+		spin_unlock_irq(&ctx->ctx_lock);
+		ret = -EINVAL;
+		goto out_put_req;
+	}
 	aio_run_iocb(req);
 	if (!list_empty(&ctx->run_list)) {
 		/* drain the run list */
-- 
cgit v1.2.3


From b7fc0ff09d24b372dc04b0c02b80659c0a66fdfe Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Mon, 28 Feb 2011 01:45:42 +0000
Subject: Squashfs: extend decompressor framework to handle compression options

Extend decompressor framework to handle compression options stored in
the filesystem.  These options can be used by the relevant decompressor
at initialisation time to over-ride defaults.

The presence of compression options in the filesystem is indicated by
the COMP_OPT filesystem flag.  If present the data is read from the
filesystem and passed to the decompressor init function.  The decompressor
init function signature has been extended to take this data.

Also update the init function signature in the zlib, lzo and xz
decompressor wrappers.

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/decompressor.c | 34 ++++++++++++++++++++++++++++++++++
 fs/squashfs/decompressor.h |  7 +------
 fs/squashfs/lzo_wrapper.c  |  4 ++--
 fs/squashfs/squashfs.h     |  1 +
 fs/squashfs/squashfs_fs.h  |  4 ++++
 fs/squashfs/super.c        | 11 +++++++----
 fs/squashfs/xz_wrapper.c   |  5 +++--
 fs/squashfs/zlib_wrapper.c |  4 ++--
 8 files changed, 54 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index a5940e54c4d..e921bd21373 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -23,6 +23,7 @@
 
 #include <linux/types.h>
 #include <linux/mutex.h>
+#include <linux/slab.h>
 #include <linux/buffer_head.h>
 
 #include "squashfs_fs.h"
@@ -74,3 +75,36 @@ const struct squashfs_decompressor *squashfs_lookup_decompressor(int id)
 
 	return decompressor[i];
 }
+
+
+void *squashfs_decompressor_init(struct super_block *sb, unsigned short flags)
+{
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	void *strm, *buffer = NULL;
+	int length = 0;
+
+	/*
+	 * Read decompressor specific options from file system if present
+	 */
+	if (SQUASHFS_COMP_OPTS(flags)) {
+		buffer = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
+		if (buffer == NULL)
+			return ERR_PTR(-ENOMEM);
+
+		length = squashfs_read_data(sb, &buffer,
+			sizeof(struct squashfs_super_block), 0, NULL,
+			PAGE_CACHE_SIZE, 1);
+
+		if (length < 0) {
+			strm = ERR_PTR(length);
+			goto finished;
+		}
+	}
+
+	strm = msblk->decompressor->init(msblk, buffer, length);
+
+finished:
+	kfree(buffer);
+
+	return strm;
+}
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
index 3b305a70f7a..099745ad569 100644
--- a/fs/squashfs/decompressor.h
+++ b/fs/squashfs/decompressor.h
@@ -24,7 +24,7 @@
  */
 
 struct squashfs_decompressor {
-	void	*(*init)(struct squashfs_sb_info *);
+	void	*(*init)(struct squashfs_sb_info *, void *, int);
 	void	(*free)(void *);
 	int	(*decompress)(struct squashfs_sb_info *, void **,
 		struct buffer_head **, int, int, int, int, int);
@@ -33,11 +33,6 @@ struct squashfs_decompressor {
 	int	supported;
 };
 
-static inline void *squashfs_decompressor_init(struct squashfs_sb_info *msblk)
-{
-	return msblk->decompressor->init(msblk);
-}
-
 static inline void squashfs_decompressor_free(struct squashfs_sb_info *msblk,
 	void *s)
 {
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
index 7da759e34c5..00f4dfc5f08 100644
--- a/fs/squashfs/lzo_wrapper.c
+++ b/fs/squashfs/lzo_wrapper.c
@@ -37,7 +37,7 @@ struct squashfs_lzo {
 	void	*output;
 };
 
-static void *lzo_init(struct squashfs_sb_info *msblk)
+static void *lzo_init(struct squashfs_sb_info *msblk, void *buff, int len)
 {
 	int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE);
 
@@ -58,7 +58,7 @@ failed2:
 failed:
 	ERROR("Failed to allocate lzo workspace\n");
 	kfree(stream);
-	return NULL;
+	return ERR_PTR(-ENOMEM);
 }
 
 
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index ba729d80887..1f2e608b878 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -48,6 +48,7 @@ extern int squashfs_read_table(struct super_block *, void *, u64, int);
 
 /* decompressor.c */
 extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int);
+extern void *squashfs_decompressor_init(struct super_block *, unsigned short);
 
 /* export.c */
 extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64,
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 39533feffd6..4582c568ef4 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -57,6 +57,7 @@
 #define SQUASHFS_ALWAYS_FRAG		5
 #define SQUASHFS_DUPLICATE		6
 #define SQUASHFS_EXPORT			7
+#define SQUASHFS_COMP_OPT		10
 
 #define SQUASHFS_BIT(flag, bit)		((flag >> bit) & 1)
 
@@ -81,6 +82,9 @@
 #define SQUASHFS_EXPORTABLE(flags)		SQUASHFS_BIT(flags, \
 						SQUASHFS_EXPORT)
 
+#define SQUASHFS_COMP_OPTS(flags)		SQUASHFS_BIT(flags, \
+						SQUASHFS_COMP_OPT)
+
 /* Max number of types and file types */
 #define SQUASHFS_DIR_TYPE		1
 #define SQUASHFS_REG_TYPE		2
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 20700b9f2b4..95467db71a8 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -199,10 +199,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 
 	err = -ENOMEM;
 
-	msblk->stream = squashfs_decompressor_init(msblk);
-	if (msblk->stream == NULL)
-		goto failed_mount;
-
 	msblk->block_cache = squashfs_cache_init("metadata",
 			SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE);
 	if (msblk->block_cache == NULL)
@@ -215,6 +211,13 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
+	msblk->stream = squashfs_decompressor_init(sb, flags);
+	if (IS_ERR(msblk->stream)) {
+		err = PTR_ERR(msblk->stream);
+		msblk->stream = NULL;
+		goto failed_mount;
+	}
+
 	/* Allocate and read id index table */
 	msblk->id_table = squashfs_read_id_index_table(sb,
 		le64_to_cpu(sblk->id_table_start), le16_to_cpu(sblk->no_ids));
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
index c4eb4001825..397adea72eb 100644
--- a/fs/squashfs/xz_wrapper.c
+++ b/fs/squashfs/xz_wrapper.c
@@ -38,7 +38,8 @@ struct squashfs_xz {
 	struct xz_buf buf;
 };
 
-static void *squashfs_xz_init(struct squashfs_sb_info *msblk)
+static void *squashfs_xz_init(struct squashfs_sb_info *msblk, void *buff,
+	int len)
 {
 	int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE);
 
@@ -55,7 +56,7 @@ static void *squashfs_xz_init(struct squashfs_sb_info *msblk)
 failed:
 	ERROR("Failed to allocate xz workspace\n");
 	kfree(stream);
-	return NULL;
+	return ERR_PTR(-ENOMEM);
 }
 
 
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 4661ae2b1ce..195b0d035e9 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -32,7 +32,7 @@
 #include "squashfs.h"
 #include "decompressor.h"
 
-static void *zlib_init(struct squashfs_sb_info *dummy)
+static void *zlib_init(struct squashfs_sb_info *dummy, void *buff, int len)
 {
 	z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL);
 	if (stream == NULL)
@@ -47,7 +47,7 @@ static void *zlib_init(struct squashfs_sb_info *dummy)
 failed:
 	ERROR("Failed to allocate zlib workspace\n");
 	kfree(stream);
-	return NULL;
+	return ERR_PTR(-ENOMEM);
 }
 
 
-- 
cgit v1.2.3


From ff750311d30acc9564ef577050794953eee59f01 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Mon, 28 Feb 2011 15:31:46 +0000
Subject: Squashfs: add compression options support to xz decompressor

Pass the dictionary size used to compress datablocks.  Using a
dictionary size less than the block size saves memory overhead, in many
cases without adversely affecting compression ratio.

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/xz_wrapper.c | 49 ++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 41 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
index 397adea72eb..06d0d11b482 100644
--- a/fs/squashfs/xz_wrapper.c
+++ b/fs/squashfs/xz_wrapper.c
@@ -26,6 +26,7 @@
 #include <linux/buffer_head.h>
 #include <linux/slab.h>
 #include <linux/xz.h>
+#include <linux/bitops.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
@@ -38,25 +39,57 @@ struct squashfs_xz {
 	struct xz_buf buf;
 };
 
+struct comp_opts {
+	__le32 dictionary_size;
+	__le32 flags;
+};
+
 static void *squashfs_xz_init(struct squashfs_sb_info *msblk, void *buff,
 	int len)
 {
-	int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE);
+	struct comp_opts *comp_opts = buff;
+	struct squashfs_xz *stream;
+	int dict_size = msblk->block_size;
+	int err, n;
+
+	if (comp_opts) {
+		/* check compressor options are the expected length */
+		if (len < sizeof(*comp_opts)) {
+			err = -EIO;
+			goto failed;
+		}
 
-	struct squashfs_xz *stream = kmalloc(sizeof(*stream), GFP_KERNEL);
-	if (stream == NULL)
+		dict_size = le32_to_cpu(comp_opts->dictionary_size);
+
+		/* the dictionary size should be 2^n or 2^n+2^(n+1) */
+		n = ffs(dict_size) - 1;
+		if (dict_size != (1 << n) && dict_size != (1 << n) +
+						(1 << (n + 1))) {
+			err = -EIO;
+			goto failed;
+		}
+	}
+
+	dict_size = max_t(int, dict_size, SQUASHFS_METADATA_SIZE);
+
+	stream = kmalloc(sizeof(*stream), GFP_KERNEL);
+	if (stream == NULL) {
+		err = -ENOMEM;
 		goto failed;
+	}
 
-	stream->state = xz_dec_init(XZ_PREALLOC, block_size);
-	if (stream->state == NULL)
+	stream->state = xz_dec_init(XZ_PREALLOC, dict_size);
+	if (stream->state == NULL) {
+		kfree(stream);
+		err = -ENOMEM;
 		goto failed;
+	}
 
 	return stream;
 
 failed:
-	ERROR("Failed to allocate xz workspace\n");
-	kfree(stream);
-	return ERR_PTR(-ENOMEM);
+	ERROR("Failed to initialise xz decompressor\n");
+	return ERR_PTR(err);
 }
 
 
-- 
cgit v1.2.3


From 681ffe2e4316801529fdafe379743d1d41fe2086 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Mon, 28 Feb 2011 16:21:34 +0000
Subject: Squashfs: Update Kconfig help text to include xz compression

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/Kconfig | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index aa68a8a3151..efc309fa303 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -5,12 +5,12 @@ config SQUASHFS
 	help
 	  Saying Y here includes support for SquashFS 4.0 (a Compressed
 	  Read-Only File System).  Squashfs is a highly compressed read-only
-	  filesystem for Linux.  It uses zlib/lzo compression to compress both
-	  files, inodes and directories.  Inodes in the system are very small
-	  and all blocks are packed to minimise data overhead. Block sizes
-	  greater than 4K are supported up to a maximum of 1 Mbytes (default
-	  block size 128K).  SquashFS 4.0 supports 64 bit filesystems and files
-	  (larger than 4GB), full uid/gid information, hard links and
+	  filesystem for Linux.  It uses zlib, lzo or xz compression to
+	  compress both files, inodes and directories.  Inodes in the system
+	  are very small and all blocks are packed to minimise data overhead.
+	  Block sizes greater than 4K are supported up to a maximum of 1 Mbytes
+	  (default block size 128K).  SquashFS 4.0 supports 64 bit filesystems
+	  and files (larger than 4GB), full uid/gid information, hard links and
 	  timestamps.
 
 	  Squashfs is intended for general read-only filesystem use, for
-- 
cgit v1.2.3


From 3ad126641c05f93d2fa153bb8ff762fb4cdbb885 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Mon, 28 Feb 2011 16:42:20 +0000
Subject: Squashfs: xz_wrapper doesn't need to include squashfs_fs_i.h anymore

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/xz_wrapper.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
index 06d0d11b482..aa47a286d1f 100644
--- a/fs/squashfs/xz_wrapper.c
+++ b/fs/squashfs/xz_wrapper.c
@@ -30,7 +30,6 @@
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
 #include "squashfs.h"
 #include "decompressor.h"
 
-- 
cgit v1.2.3


From 003a3194d36dc22c29cacda4d0c6fede2753c9d0 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Mon, 28 Feb 2011 18:43:48 +0000
Subject: Squashfs: wrap squashfs_mount() definition

Squashfs_get_sb() to squashfs_mount() conversion (commit 152a0836)
results in a line over 80 characters.

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/super.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 95467db71a8..5c8184c061a 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -373,8 +373,8 @@ static void squashfs_put_super(struct super_block *sb)
 }
 
 
-static struct dentry *squashfs_mount(struct file_system_type *fs_type, int flags,
-				const char *dev_name, void *data)
+static struct dentry *squashfs_mount(struct file_system_type *fs_type,
+				int flags, const char *dev_name, void *data)
 {
 	return mount_bdev(fs_type, flags, dev_name, data, squashfs_fill_super);
 }
-- 
cgit v1.2.3


From 03885ac3c79ab7f3f4a8e502486be2ea6c85c3e3 Mon Sep 17 00:00:00 2001
From: Josh Hunt <johunt@akamai.com>
Date: Thu, 24 Feb 2011 11:48:22 +0100
Subject: ext2: Fix link count corruption under heavy link+rename load

vfs_rename_other() does not lock renamed inode with i_mutex. Thus changing
i_nlink in a non-atomic manner (which happens in ext2_rename()) can corrupt
it as reported and analyzed by Josh.

In fact, there is no good reason to mess with i_nlink of the moved file.
We did it presumably to simulate linking into the new directory and unlinking
from an old one. But the practical effect of this is disputable because fsck
can possibly treat file as being properly linked into both directories without
writing any error which is confusing. So we just stop increment-decrement
games with i_nlink which also fixes the corruption.

CC: stable@kernel.org
CC: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Josh Hunt <johunt@akamai.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext2/namei.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 2e1d8341d82..adb91855ccd 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -344,7 +344,6 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 		new_de = ext2_find_entry (new_dir, &new_dentry->d_name, &new_page);
 		if (!new_de)
 			goto out_dir;
-		inode_inc_link_count(old_inode);
 		ext2_set_link(new_dir, new_de, new_page, old_inode, 1);
 		new_inode->i_ctime = CURRENT_TIME_SEC;
 		if (dir_de)
@@ -356,12 +355,9 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 			if (new_dir->i_nlink >= EXT2_LINK_MAX)
 				goto out_dir;
 		}
-		inode_inc_link_count(old_inode);
 		err = ext2_add_link(new_dentry, old_inode);
-		if (err) {
-			inode_dec_link_count(old_inode);
+		if (err)
 			goto out_dir;
-		}
 		if (dir_de)
 			inode_inc_link_count(new_dir);
 	}
@@ -369,12 +365,11 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 	/*
 	 * Like most other Unix systems, set the ctime for inodes on a
  	 * rename.
-	 * inode_dec_link_count() will mark the inode dirty.
 	 */
 	old_inode->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(old_inode);
 
 	ext2_delete_entry (old_de, old_page);
-	inode_dec_link_count(old_inode);
 
 	if (dir_de) {
 		if (old_dir != new_dir)
-- 
cgit v1.2.3


From ce654b37f87980d95f339080e4c3bdb2370bdf22 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Sat, 26 Feb 2011 22:40:19 +0200
Subject: ext3: skip orphan cleanup on rocompat fs

Orphan cleanup is currently executed even if the file system has some
number of unknown ROCOMPAT features, which deletes inodes and frees
blocks, which could be very bad for some RO_COMPAT features.

This patch skips the orphan cleanup if it contains readonly compatible
features not known by this ext3 implementation, which would prevent
the fs from being mounted (or remounted) readwrite.

Signed-off-by: Amir Goldstein <amir73il@users.sf.net>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/super.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 85c8cc8f247..0d62f29f213 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1464,6 +1464,13 @@ static void ext3_orphan_cleanup (struct super_block * sb,
 		return;
 	}
 
+	/* Check if feature set allows readwrite operations */
+	if (EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) {
+		ext3_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
+			 "unknown ROCOMPAT features");
+		return;
+	}
+
 	if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) {
 		if (es->s_last_orphan)
 			jbd_debug(1, "Errors on filesystem, "
-- 
cgit v1.2.3


From 3c26bdb42320f9f39d30c6f115476bbb66b74dc5 Mon Sep 17 00:00:00 2001
From: "Justin P. Mattock" <justinmattock@gmail.com>
Date: Sat, 26 Feb 2011 20:34:05 -0800
Subject: jbd: Remove one to many n's in a word.

The patch below removes one too many "n's" in a word.

Signed-off-by: Justin P. Mattock <justinmattock@gmail.com>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: linux-ext4@vger.kernel.org
Acked-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/jbd/journal.c  | 2 +-
 fs/jbd2/journal.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index da1b5e4ffce..eb11601f2e0 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -839,7 +839,7 @@ journal_t * journal_init_inode (struct inode *inode)
 	err = journal_bmap(journal, 0, &blocknr);
 	/* If that failed, give up */
 	if (err) {
-		printk(KERN_ERR "%s: Cannnot locate journal superblock\n",
+		printk(KERN_ERR "%s: Cannot locate journal superblock\n",
 		       __func__);
 		goto out_err;
 	}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 97e73469b2c..90407b8fece 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -991,7 +991,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
 	err = jbd2_journal_bmap(journal, 0, &blocknr);
 	/* If that failed, give up */
 	if (err) {
-		printk(KERN_ERR "%s: Cannnot locate journal superblock\n",
+		printk(KERN_ERR "%s: Cannot locate journal superblock\n",
 		       __func__);
 		goto out_err;
 	}
-- 
cgit v1.2.3


From e6eb5ce1b202ac9cdcfda5be559c9b9d8ec7542c Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Sat, 26 Feb 2011 10:54:00 -0800
Subject: fs/block_dev.c: fix new kernel-doc warning

Fix new kernel-doc warning in fs/block_dev.c:

Warning(fs/block_dev.c:937): No description found for parameter 'kill_dirty'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/block_dev.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index f05bf16cd97..88928701959 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -928,6 +928,7 @@ EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
  * flush_disk - invalidates all buffer-cache entries on a disk
  *
  * @bdev:      struct block device to be flushed
+ * @kill_dirty: flag to guide handling of dirty inodes
  *
  * Invalidates all buffer-cache entries on a disk. It should be called
  * when a disk has been changed -- either by a media change or online
-- 
cgit v1.2.3


From ae0e47f02aaedbfdd5e4bec73f79b714d758223d Mon Sep 17 00:00:00 2001
From: "Justin P. Mattock" <justinmattock@gmail.com>
Date: Tue, 1 Mar 2011 15:06:02 +0100
Subject: Remove one to many n's in a word

Signed-off-by: Justin P. Mattock <justinmattock@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 fs/btrfs/disk-io.c                 | 2 +-
 fs/notify/fanotify/fanotify_user.c | 2 +-
 fs/notify/inotify/inotify_user.c   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fdce8799b98..7b658d2107b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2489,7 +2489,7 @@ int close_ctree(struct btrfs_root *root)
 	 * ERROR state on disk.
 	 *
 	 * 2. when btrfs flips readonly just in btrfs_commit_super,
-	 * and in such case, btrfs cannnot write sb via btrfs_commit_super,
+	 * and in such case, btrfs cannot write sb via btrfs_commit_super,
 	 * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
 	 * btrfs will cleanup all FS resources first and write sb then.
 	 */
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 8b61220cffc..6b1305dc26c 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -876,7 +876,7 @@ SYSCALL_ALIAS(sys_fanotify_mark, SyS_fanotify_mark);
 #endif
 
 /*
- * fanotify_user_setup - Our initialization function.  Note that we cannnot return
+ * fanotify_user_setup - Our initialization function.  Note that we cannot return
  * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
  * must result in panic().
  */
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 4cd5d5d78f9..bd46e7c8a0e 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -841,7 +841,7 @@ out:
 }
 
 /*
- * inotify_user_setup - Our initialization function.  Note that we cannnot return
+ * inotify_user_setup - Our initialization function.  Note that we cannot return
  * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
  * must result in panic().
  */
-- 
cgit v1.2.3


From 4688a066ecf60086ea82f68edb3b036b567d2c08 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Sat, 22 Jan 2011 20:05:05 +0100
Subject: adfs: remove the big kernel lock

According to Russell King, adfs was written to not require the big
kernel lock, and all inode updates are done under adfs_dir_lock.

All other metadata in adfs is read-only and does not require locking.
The use of the BKL is the result of various pushdowns from the VFS
operations.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Russell King <rmk@arm.linux.org.uk>
Cc: Stuart Swales <stuart.swales.croftnuisk@gmail.com>
---
 fs/adfs/Kconfig |  1 -
 fs/adfs/dir.c   |  6 ------
 fs/adfs/inode.c |  6 ------
 fs/adfs/super.c | 13 +------------
 4 files changed, 1 insertion(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig
index 1dd5f34b3cf..e55182a7460 100644
--- a/fs/adfs/Kconfig
+++ b/fs/adfs/Kconfig
@@ -1,7 +1,6 @@
 config ADFS_FS
 	tristate "ADFS file system support (EXPERIMENTAL)"
 	depends on BLOCK && EXPERIMENTAL
-	depends on BKL # need to fix
 	help
 	  The Acorn Disc Filing System is the standard file system of the
 	  RiscOS operating system which runs on Acorn's ARM-based Risc PC
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 3b4a764ed78..3d83075aaa2 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -9,7 +9,6 @@
  *
  *  Common directory handling for ADFS
  */
-#include <linux/smp_lock.h>
 #include "adfs.h"
 
 /*
@@ -27,8 +26,6 @@ adfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	struct adfs_dir dir;
 	int ret = 0;
 
-	lock_kernel();	
-
 	if (filp->f_pos >> 32)
 		goto out;
 
@@ -70,7 +67,6 @@ free_out:
 	ops->free(&dir);
 
 out:
-	unlock_kernel();
 	return ret;
 }
 
@@ -276,7 +272,6 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 	struct object_info obj;
 	int error;
 
-	lock_kernel();
 	error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj);
 	if (error == 0) {
 		error = -EACCES;
@@ -288,7 +283,6 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 		if (inode)
 			error = 0;
 	}
-	unlock_kernel();
 	d_add(dentry, inode);
 	return ERR_PTR(error);
 }
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 65794b8fe79..09fe40198d1 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -7,7 +7,6 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
 #include "adfs.h"
@@ -316,8 +315,6 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
 	unsigned int ia_valid = attr->ia_valid;
 	int error;
 	
-	lock_kernel();
-
 	error = inode_change_ok(inode, attr);
 
 	/*
@@ -359,7 +356,6 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
 	if (ia_valid & (ATTR_SIZE | ATTR_MTIME | ATTR_MODE))
 		mark_inode_dirty(inode);
 out:
-	unlock_kernel();
 	return error;
 }
 
@@ -374,7 +370,6 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	struct object_info obj;
 	int ret;
 
-	lock_kernel();
 	obj.file_id	= inode->i_ino;
 	obj.name_len	= 0;
 	obj.parent_id	= ADFS_I(inode)->parent_id;
@@ -384,6 +379,5 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	obj.size	= inode->i_size;
 
 	ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL);
-	unlock_kernel();
 	return ret;
 }
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 2d7954049fb..06d7388b477 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -14,7 +14,6 @@
 #include <linux/mount.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
 #include <linux/statfs.h>
 #include "adfs.h"
 #include "dir_f.h"
@@ -120,15 +119,11 @@ static void adfs_put_super(struct super_block *sb)
 	int i;
 	struct adfs_sb_info *asb = ADFS_SB(sb);
 
-	lock_kernel();
-
 	for (i = 0; i < asb->s_map_size; i++)
 		brelse(asb->s_map[i].dm_bh);
 	kfree(asb->s_map);
 	kfree(asb);
 	sb->s_fs_info = NULL;
-
-	unlock_kernel();
 }
 
 static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
@@ -359,15 +354,11 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
 	struct adfs_sb_info *asb;
 	struct inode *root;
 
-	lock_kernel();
-
 	sb->s_flags |= MS_NODIRATIME;
 
 	asb = kzalloc(sizeof(*asb), GFP_KERNEL);
-	if (!asb) {
-		unlock_kernel();
+	if (!asb)
 		return -ENOMEM;
-	}
 	sb->s_fs_info = asb;
 
 	/* set default options */
@@ -485,7 +476,6 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
 		adfs_error(sb, "get root inode failed\n");
 		goto error;
 	}
-	unlock_kernel();
 	return 0;
 
 error_free_bh:
@@ -493,7 +483,6 @@ error_free_bh:
 error:
 	sb->s_fs_info = NULL;
 	kfree(asb);
-	unlock_kernel();
 	return -EINVAL;
 }
 
-- 
cgit v1.2.3


From 72746ac643928f6c3113b5aa783d8ea1b13949d2 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Mon, 28 Feb 2011 13:41:11 +0900
Subject: nilfs2: fix regression that i-flag is not set on changeless
 checkpoints

According to the report from Jiro SEKIBA titled "regression in
2.6.37?"  (Message-Id: <8739n8vs1f.wl%jir@sekiba.com>), on 2.6.37 and
later kernels, lscp command no longer displays "i" flag on checkpoints
that snapshot operations or garbage collection created.

This is a regression of nilfs2 checkpointing function, and it's
critical since it broke behavior of a part of nilfs2 applications.
For instance, snapshot manager of TimeBrowse gets to create
meaningless snapshots continuously; snapshot creation triggers another
checkpoint, but applications cannot distinguish whether the new
checkpoint contains meaningful changes or not without the i-flag.

This patch fixes the regression and brings that application behavior
back to normal.

Reported-by: Jiro SEKIBA <jir@unicus.jp>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Tested-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Tested-by: Jiro SEKIBA <jir@unicus.jp>
Cc: stable <stable@kernel.org>  [2.6.37]
---
 fs/nilfs2/segment.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 55ebae5c7f3..2de9f636792 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -430,7 +430,8 @@ static void nilfs_segctor_begin_finfo(struct nilfs_sc_info *sci,
 	nilfs_segctor_map_segsum_entry(
 		sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo));
 
-	if (inode->i_sb && !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags))
+	if (NILFS_I(inode)->i_root &&
+	    !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags))
 		set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
 	/* skip finfo */
 }
-- 
cgit v1.2.3


From eeb2036b8a148629b762ae6d85cff0be8106f081 Mon Sep 17 00:00:00 2001
From: Alex Elder <aelder@sgi.com>
Date: Tue, 1 Mar 2011 17:50:00 +0000
Subject: xfs: zero proper structure size for geometry calls

Commit 493f3358cb289ccf716c5a14fa5bb52ab75943e5 added this call to
xfs_fs_geometry() in order to avoid passing kernel stack data back
to user space:

+       memset(geo, 0, sizeof(*geo));

Unfortunately, one of the callers of that function passes the
address of a smaller data type, cast to fit the type that
xfs_fs_geometry() requires.  As a result, this can happen:

Kernel panic - not syncing: stack-protector: Kernel stack is corrupted
in: f87aca93

Pid: 262, comm: xfs_fsr Not tainted 2.6.38-rc6-493f3358cb2+ #1
Call Trace:

[<c12991ac>] ? panic+0x50/0x150
[<c102ed71>] ? __stack_chk_fail+0x10/0x18
[<f87aca93>] ? xfs_ioc_fsgeometry_v1+0x56/0x5d [xfs]

Fix this by fixing that one caller to pass the right type and then
copy out the subset it is interested in.

Note: This patch is an alternative to one originally proposed by
Eric Sandeen.

Reported-by: Jeffrey Hundstad <jeffrey.hundstad@mnsu.edu>
Signed-off-by: Alex Elder <aelder@sgi.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Tested-by: Jeffrey Hundstad <jeffrey.hundstad@mnsu.edu>
---
 fs/xfs/linux-2.6/xfs_ioctl.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index f5e2a19e0f8..0ca0e3c024d 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -695,14 +695,19 @@ xfs_ioc_fsgeometry_v1(
 	xfs_mount_t		*mp,
 	void			__user *arg)
 {
-	xfs_fsop_geom_v1_t	fsgeo;
+	xfs_fsop_geom_t         fsgeo;
 	int			error;
 
-	error = xfs_fs_geometry(mp, (xfs_fsop_geom_t *)&fsgeo, 3);
+	error = xfs_fs_geometry(mp, &fsgeo, 3);
 	if (error)
 		return -error;
 
-	if (copy_to_user(arg, &fsgeo, sizeof(fsgeo)))
+	/*
+	 * Caller should have passed an argument of type
+	 * xfs_fsop_geom_v1_t.  This is a proper subset of the
+	 * xfs_fsop_geom_t that xfs_fs_geometry() fills in.
+	 */
+	if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t)))
 		return -XFS_ERROR(EFAULT);
 	return 0;
 }
-- 
cgit v1.2.3


From 10e38391c0e242e53e30094f6c00553418ab2f2e Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 2 Mar 2011 14:20:59 +1100
Subject: xfs: introduce new logging API.

Most of the logging infrastructure in XFS is unnecessary and
designed around the infrastructure supplied by Irix rather than
Linux. To rationalise the logging interfaces, start by introducing
simple printk wrappers similar to the dev_printk() infrastructure.
Later patches will convert code to use this new interface.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/Makefile                |   1 +
 fs/xfs/linux-2.6/xfs_linux.h   |   1 +
 fs/xfs/linux-2.6/xfs_message.c | 119 +++++++++++++++++++++++++++++++++++++++++
 fs/xfs/linux-2.6/xfs_message.h |  34 ++++++++++++
 4 files changed, 155 insertions(+)
 create mode 100644 fs/xfs/linux-2.6/xfs_message.c
 create mode 100644 fs/xfs/linux-2.6/xfs_message.h

(limited to 'fs')

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index faca4499709..077784ed6a7 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -105,6 +105,7 @@ xfs-y				+= $(addprefix $(XFS_LINUX)/, \
 				   xfs_globals.o \
 				   xfs_ioctl.o \
 				   xfs_iops.o \
+				   xfs_message.o \
 				   xfs_super.o \
 				   xfs_sync.o \
 				   xfs_xattr.o)
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 09649499774..1189bfcbcd3 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -86,6 +86,7 @@
 #include <xfs_aops.h>
 #include <xfs_super.h>
 #include <xfs_buf.h>
+#include <xfs_message.h>
 
 /*
  * Feature macros (disable/enable)
diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/linux-2.6/xfs_message.c
new file mode 100644
index 00000000000..6f3368eec25
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_message.c
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2011 Red Hat, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+
+/*
+ * XFS logging functions
+ */
+static int
+__xfs_printk(
+	const char		*level,
+	const struct xfs_mount	*mp,
+	struct va_format	*vaf)
+{
+	if (mp && mp->m_fsname)
+		return printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf);
+	return printk("%sXFS: %pV\n", level, vaf);
+}
+
+int xfs_printk(
+	const char		*level,
+	const struct xfs_mount	*mp,
+	const char		*fmt, ...)
+{
+	struct va_format	vaf;
+	va_list			args;
+	int			 r;
+
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	r = __xfs_printk(level, mp, &vaf);
+	va_end(args);
+
+	return r;
+}
+
+#define define_xfs_printk_level(func, kern_level)		\
+int func(const struct xfs_mount *mp, const char *fmt, ...)	\
+{								\
+	struct va_format	vaf;				\
+	va_list			args;				\
+	int			r;				\
+								\
+	va_start(args, fmt);					\
+								\
+	vaf.fmt = fmt;						\
+	vaf.va = &args;						\
+								\
+	r = __xfs_printk(kern_level, mp, &vaf);			\
+	va_end(args);						\
+								\
+	return r;						\
+}								\
+
+define_xfs_printk_level(xfs_emerg, KERN_EMERG);
+define_xfs_printk_level(xfs_alert, KERN_ALERT);
+define_xfs_printk_level(xfs_crit, KERN_CRIT);
+define_xfs_printk_level(xfs_err, KERN_ERR);
+define_xfs_printk_level(xfs_warn, KERN_WARNING);
+define_xfs_printk_level(xfs_notice, KERN_NOTICE);
+define_xfs_printk_level(xfs_info, KERN_INFO);
+#ifdef DEBUG
+define_xfs_printk_level(xfs_debug, KERN_DEBUG);
+#endif
+
+int
+xfs_alert_tag(
+	const struct xfs_mount	*mp,
+	int			panic_tag,
+	const char		*fmt, ...)
+{
+	struct va_format	vaf;
+	va_list			args;
+	int			panic = 0;
+	int			 r;
+
+	if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
+		xfs_printk(KERN_ALERT, mp,
+			"XFS: Transforming an alert into a BUG.");
+		panic = 1;
+	}
+
+	va_start(args, fmt);
+
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	r = __xfs_printk(KERN_ALERT, mp, &vaf);
+	va_end(args);
+
+	BUG_ON(panic);
+
+	return r;
+}
diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/linux-2.6/xfs_message.h
new file mode 100644
index 00000000000..8d2df017530
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_message.h
@@ -0,0 +1,34 @@
+#ifndef __XFS_MESSAGE_H
+#define __XFS_MESSAGE_H 1
+
+struct xfs_mount;
+
+extern int xfs_printk(const char *level, const struct xfs_mount *mp,
+                      const char *fmt, ...)
+        __attribute__ ((format (printf, 3, 4)));
+extern int xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...)
+        __attribute__ ((format (printf, 2, 3)));
+extern int xfs_alert(const struct xfs_mount *mp, const char *fmt, ...)
+        __attribute__ ((format (printf, 2, 3)));
+extern int xfs_alert_tag(const struct xfs_mount *mp, int tag,
+			 const char *fmt, ...)
+        __attribute__ ((format (printf, 3, 4)));
+extern int xfs_crit(const struct xfs_mount *mp, const char *fmt, ...)
+        __attribute__ ((format (printf, 2, 3)));
+extern int xfs_err(const struct xfs_mount *mp, const char *fmt, ...)
+        __attribute__ ((format (printf, 2, 3)));
+extern int xfs_warn(const struct xfs_mount *mp, const char *fmt, ...)
+        __attribute__ ((format (printf, 2, 3)));
+extern int xfs_notice(const struct xfs_mount *mp, const char *fmt, ...)
+        __attribute__ ((format (printf, 2, 3)));
+extern int xfs_info(const struct xfs_mount *mp, const char *fmt, ...)
+        __attribute__ ((format (printf, 2, 3)));
+
+#ifdef DEBUG
+extern int xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
+        __attribute__ ((format (printf, 2, 3)));
+#else
+#define xfs_debug(mp, fmt, ...)	(0)
+#endif
+
+#endif	/* __XFS_MESSAGE_H */
-- 
cgit v1.2.3


From af24ee9ea8d532e16883251a6684dfa1be8eec29 Mon Sep 17 00:00:00 2001
From: Alex Elder <aelder@sgi.com>
Date: Tue, 1 Mar 2011 17:50:00 +0000
Subject: xfs: zero proper structure size for geometry calls

Commit 493f3358cb289ccf716c5a14fa5bb52ab75943e5 added this call to
xfs_fs_geometry() in order to avoid passing kernel stack data back
to user space:

+       memset(geo, 0, sizeof(*geo));

Unfortunately, one of the callers of that function passes the
address of a smaller data type, cast to fit the type that
xfs_fs_geometry() requires.  As a result, this can happen:

Kernel panic - not syncing: stack-protector: Kernel stack is corrupted
in: f87aca93

Pid: 262, comm: xfs_fsr Not tainted 2.6.38-rc6-493f3358cb2+ #1
Call Trace:

[<c12991ac>] ? panic+0x50/0x150
[<c102ed71>] ? __stack_chk_fail+0x10/0x18
[<f87aca93>] ? xfs_ioc_fsgeometry_v1+0x56/0x5d [xfs]

Fix this by fixing that one caller to pass the right type and then
copy out the subset it is interested in.

Note: This patch is an alternative to one originally proposed by
Eric Sandeen.

Reported-by: Jeffrey Hundstad <jeffrey.hundstad@mnsu.edu>
Signed-off-by: Alex Elder <aelder@sgi.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Tested-by: Jeffrey Hundstad <jeffrey.hundstad@mnsu.edu>
---
 fs/xfs/linux-2.6/xfs_ioctl.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index f5e2a19e0f8..0ca0e3c024d 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -695,14 +695,19 @@ xfs_ioc_fsgeometry_v1(
 	xfs_mount_t		*mp,
 	void			__user *arg)
 {
-	xfs_fsop_geom_v1_t	fsgeo;
+	xfs_fsop_geom_t         fsgeo;
 	int			error;
 
-	error = xfs_fs_geometry(mp, (xfs_fsop_geom_t *)&fsgeo, 3);
+	error = xfs_fs_geometry(mp, &fsgeo, 3);
 	if (error)
 		return -error;
 
-	if (copy_to_user(arg, &fsgeo, sizeof(fsgeo)))
+	/*
+	 * Caller should have passed an argument of type
+	 * xfs_fsop_geom_v1_t.  This is a proper subset of the
+	 * xfs_fsop_geom_t that xfs_fs_geometry() fills in.
+	 */
+	if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t)))
 		return -XFS_ERROR(EFAULT);
 	return 0;
 }
-- 
cgit v1.2.3


From e8a80c6f769dd4622d8b211b398452158ee60c0b Mon Sep 17 00:00:00 2001
From: Josh Hunt <johunt@akamai.com>
Date: Thu, 24 Feb 2011 11:48:22 +0100
Subject: ext2: Fix link count corruption under heavy link+rename load

vfs_rename_other() does not lock renamed inode with i_mutex. Thus changing
i_nlink in a non-atomic manner (which happens in ext2_rename()) can corrupt
it as reported and analyzed by Josh.

In fact, there is no good reason to mess with i_nlink of the moved file.
We did it presumably to simulate linking into the new directory and unlinking
from an old one. But the practical effect of this is disputable because fsck
can possibly treat file as being properly linked into both directories without
writing any error which is confusing. So we just stop increment-decrement
games with i_nlink which also fixes the corruption.

CC: stable@kernel.org
CC: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Josh Hunt <johunt@akamai.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext2/namei.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 2e1d8341d82..adb91855ccd 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -344,7 +344,6 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 		new_de = ext2_find_entry (new_dir, &new_dentry->d_name, &new_page);
 		if (!new_de)
 			goto out_dir;
-		inode_inc_link_count(old_inode);
 		ext2_set_link(new_dir, new_de, new_page, old_inode, 1);
 		new_inode->i_ctime = CURRENT_TIME_SEC;
 		if (dir_de)
@@ -356,12 +355,9 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 			if (new_dir->i_nlink >= EXT2_LINK_MAX)
 				goto out_dir;
 		}
-		inode_inc_link_count(old_inode);
 		err = ext2_add_link(new_dentry, old_inode);
-		if (err) {
-			inode_dec_link_count(old_inode);
+		if (err)
 			goto out_dir;
-		}
 		if (dir_de)
 			inode_inc_link_count(new_dir);
 	}
@@ -369,12 +365,11 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 	/*
 	 * Like most other Unix systems, set the ctime for inodes on a
  	 * rename.
-	 * inode_dec_link_count() will mark the inode dirty.
 	 */
 	old_inode->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(old_inode);
 
 	ext2_delete_entry (old_de, old_page);
-	inode_dec_link_count(old_inode);
 
 	if (dir_de) {
 		if (old_dir != new_dir)
-- 
cgit v1.2.3


From 8aaccf7fa2a2f148db1edbe7b09e3119c3f910cf Mon Sep 17 00:00:00 2001
From: Paul Bolle <pebolle@tiscali.nl>
Date: Mon, 14 Feb 2011 22:34:22 +0100
Subject: of/flattree: Drop an uninteresting message to pr_debug level

This message looks like an error (which it isn't) when booting with a
flattened device tree.  Remove the message from normal kernel builds.

Signed-off-by: Paul Bolle <pebolle@tiscali.nl>
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 fs/proc/proc_devtree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index d9396a4fc7f..927cbd115e5 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -233,7 +233,7 @@ void __init proc_device_tree_init(void)
 		return;
 	root = of_find_node_by_path("/");
 	if (root == NULL) {
-		printk(KERN_ERR "/proc/device-tree: can't find root\n");
+		pr_debug("/proc/device-tree: can't find root\n");
 		return;
 	}
 	proc_device_tree_add_node(root, proc_device_tree);
-- 
cgit v1.2.3


From 9a311b96c3065f362e3348cb5d7af1a57ca6bff9 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Sat, 22 Jan 2011 20:26:12 +0100
Subject: hpfs: remove the BKL

This removes the BKL in hpfs in a rather awful
way, by making the code only work on uniprocessor
systems without kernel preemption, as suggested
by Andi Kleen.

The HPFS code probably has close to zero remaining
users on current kernels, all archeological uses of
the file system can probably be done with the significant
restrictions.

The hpfs_lock/hpfs_unlock functions are left in the
code, since Mikulas has indicated that he is still
interested in fixing it in a better way.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Andi Kleen <ak@linux.intel.com>
Cc: Mikulas Patocka <mikulas@artax.karlin.mff.cuni.cz>
Cc: linux-fsdevel@vger.kernel.org
---
 fs/hpfs/Kconfig   |  2 +-
 fs/hpfs/dir.c     | 23 +++++++++++------------
 fs/hpfs/file.c    |  9 ++++-----
 fs/hpfs/hpfs_fn.h | 22 ++++++++++++++++++++++
 fs/hpfs/inode.c   |  9 ++++-----
 fs/hpfs/namei.c   | 49 ++++++++++++++++++++++++-------------------------
 fs/hpfs/super.c   | 23 +++++++++--------------
 7 files changed, 75 insertions(+), 62 deletions(-)

(limited to 'fs')

diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig
index 63b6f563231..0c39dc3ef7d 100644
--- a/fs/hpfs/Kconfig
+++ b/fs/hpfs/Kconfig
@@ -1,7 +1,7 @@
 config HPFS_FS
 	tristate "OS/2 HPFS file system support"
 	depends on BLOCK
-	depends on BKL # nontrivial to fix
+	depends on BROKEN || !PREEMPT
 	help
 	  OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
 	  is the file system used for organizing files on OS/2 hard disk
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index d32f63a569f..b3d7c0ddb60 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -6,16 +6,15 @@
  *  directory VFS functions
  */
 
-#include <linux/smp_lock.h>
 #include <linux/slab.h>
 #include "hpfs_fn.h"
 
 static int hpfs_dir_release(struct inode *inode, struct file *filp)
 {
-	lock_kernel();
+	hpfs_lock(inode->i_sb);
 	hpfs_del_pos(inode, &filp->f_pos);
 	/*hpfs_write_if_changed(inode);*/
-	unlock_kernel();
+	hpfs_unlock(inode->i_sb);
 	return 0;
 }
 
@@ -30,7 +29,7 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
 	struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
 	struct super_block *s = i->i_sb;
 
-	lock_kernel();
+	hpfs_lock(s);
 
 	/*printk("dir lseek\n");*/
 	if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok;
@@ -43,12 +42,12 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
 	}
 	mutex_unlock(&i->i_mutex);
 ok:
-	unlock_kernel();
+	hpfs_unlock(s);
 	return filp->f_pos = new_off;
 fail:
 	mutex_unlock(&i->i_mutex);
 	/*printk("illegal lseek: %016llx\n", new_off);*/
-	unlock_kernel();
+	hpfs_unlock(s);
 	return -ESPIPE;
 }
 
@@ -64,7 +63,7 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	int c1, c2 = 0;
 	int ret = 0;
 
-	lock_kernel();
+	hpfs_lock(inode->i_sb);
 
 	if (hpfs_sb(inode->i_sb)->sb_chk) {
 		if (hpfs_chk_sectors(inode->i_sb, inode->i_ino, 1, "dir_fnode")) {
@@ -167,7 +166,7 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		hpfs_brelse4(&qbh);
 	}
 out:
-	unlock_kernel();
+	hpfs_unlock(inode->i_sb);
 	return ret;
 }
 
@@ -197,10 +196,10 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
 	struct inode *result = NULL;
 	struct hpfs_inode_info *hpfs_result;
 
-	lock_kernel();
+	hpfs_lock(dir->i_sb);
 	if ((err = hpfs_chk_name(name, &len))) {
 		if (err == -ENAMETOOLONG) {
-			unlock_kernel();
+			hpfs_unlock(dir->i_sb);
 			return ERR_PTR(-ENAMETOOLONG);
 		}
 		goto end_add;
@@ -298,7 +297,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
 
 	end:
 	end_add:
-	unlock_kernel();
+	hpfs_unlock(dir->i_sb);
 	d_add(dentry, result);
 	return NULL;
 
@@ -311,7 +310,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
 	
 	/*bail:*/
 
-	unlock_kernel();
+	hpfs_unlock(dir->i_sb);
 	return ERR_PTR(-ENOENT);
 }
 
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index c0340887c7e..2dbae20450f 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -6,16 +6,15 @@
  *  file VFS functions
  */
 
-#include <linux/smp_lock.h>
 #include "hpfs_fn.h"
 
 #define BLOCKS(size) (((size) + 511) >> 9)
 
 static int hpfs_file_release(struct inode *inode, struct file *file)
 {
-	lock_kernel();
+	hpfs_lock(inode->i_sb);
 	hpfs_write_if_changed(inode);
-	unlock_kernel();
+	hpfs_unlock(inode->i_sb);
 	return 0;
 }
 
@@ -49,14 +48,14 @@ static secno hpfs_bmap(struct inode *inode, unsigned file_secno)
 static void hpfs_truncate(struct inode *i)
 {
 	if (IS_IMMUTABLE(i)) return /*-EPERM*/;
-	lock_kernel();
+	hpfs_lock(i->i_sb);
 	hpfs_i(i)->i_n_secs = 0;
 	i->i_blocks = 1 + ((i->i_size + 511) >> 9);
 	hpfs_i(i)->mmu_private = i->i_size;
 	hpfs_truncate_btree(i->i_sb, i->i_ino, 1, ((i->i_size + 511) >> 9));
 	hpfs_write_inode(i);
 	hpfs_i(i)->i_n_secs = 0;
-	unlock_kernel();
+	hpfs_unlock(i->i_sb);
 }
 
 static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 1c43dbea55e..c15adbca07f 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -342,3 +342,25 @@ static inline time32_t gmt_to_local(struct super_block *s, time_t t)
 	extern struct timezone sys_tz;
 	return t - sys_tz.tz_minuteswest * 60 - hpfs_sb(s)->sb_timeshift;
 }
+
+/*
+ * Locking:
+ *
+ * hpfs_lock() is a leftover from the big kernel lock.
+ * Right now, these functions are empty and only left
+ * for documentation purposes. The file system no longer
+ * works on SMP systems, so the lock is not needed
+ * any more.
+ *
+ * If someone is interested in making it work again, this
+ * would be the place to start by adding a per-superblock
+ * mutex and fixing all the bugs and performance issues
+ * caused by that.
+ */
+static inline void hpfs_lock(struct super_block *s)
+{
+}
+
+static inline void hpfs_unlock(struct super_block *s)
+{
+}
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 1ae35baa539..87f1f787e76 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -6,7 +6,6 @@
  *  inode VFS functions
  */
 
-#include <linux/smp_lock.h>
 #include <linux/slab.h>
 #include "hpfs_fn.h"
 
@@ -267,7 +266,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
 	struct inode *inode = dentry->d_inode;
 	int error = -EINVAL;
 
-	lock_kernel();
+	hpfs_lock(inode->i_sb);
 	if (inode->i_ino == hpfs_sb(inode->i_sb)->sb_root)
 		goto out_unlock;
 	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size)
@@ -290,7 +289,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
 	hpfs_write_inode(inode);
 
  out_unlock:
-	unlock_kernel();
+	hpfs_unlock(inode->i_sb);
 	return error;
 }
 
@@ -307,8 +306,8 @@ void hpfs_evict_inode(struct inode *inode)
 	truncate_inode_pages(&inode->i_data, 0);
 	end_writeback(inode);
 	if (!inode->i_nlink) {
-		lock_kernel();
+		hpfs_lock(inode->i_sb);
 		hpfs_remove_fnode(inode->i_sb, inode->i_ino);
-		unlock_kernel();
+		hpfs_unlock(inode->i_sb);
 	}
 }
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index f4ad9e31ddc..d5f8c8a1902 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -6,7 +6,6 @@
  *  adding & removing files & directories
  */
 #include <linux/sched.h>
-#include <linux/smp_lock.h>
 #include "hpfs_fn.h"
 
 static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
@@ -25,7 +24,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	struct hpfs_dirent dee;
 	int err;
 	if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
-	lock_kernel();
+	hpfs_lock(dir->i_sb);
 	err = -ENOSPC;
 	fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
 	if (!fnode)
@@ -103,7 +102,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	}
 	d_instantiate(dentry, result);
 	mutex_unlock(&hpfs_i(dir)->i_mutex);
-	unlock_kernel();
+	hpfs_unlock(dir->i_sb);
 	return 0;
 bail3:
 	mutex_unlock(&hpfs_i(dir)->i_mutex);
@@ -115,7 +114,7 @@ bail1:
 	brelse(bh);
 	hpfs_free_sectors(dir->i_sb, fno, 1);
 bail:
-	unlock_kernel();
+	hpfs_unlock(dir->i_sb);
 	return err;
 }
 
@@ -132,7 +131,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
 	int err;
 	if ((err = hpfs_chk_name(name, &len)))
 		return err==-ENOENT ? -EINVAL : err;
-	lock_kernel();
+	hpfs_lock(dir->i_sb);
 	err = -ENOSPC;
 	fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
 	if (!fnode)
@@ -195,7 +194,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
 	}
 	d_instantiate(dentry, result);
 	mutex_unlock(&hpfs_i(dir)->i_mutex);
-	unlock_kernel();
+	hpfs_unlock(dir->i_sb);
 	return 0;
 
 bail2:
@@ -205,7 +204,7 @@ bail1:
 	brelse(bh);
 	hpfs_free_sectors(dir->i_sb, fno, 1);
 bail:
-	unlock_kernel();
+	hpfs_unlock(dir->i_sb);
 	return err;
 }
 
@@ -224,7 +223,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t
 	if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM;
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
-	lock_kernel();
+	hpfs_lock(dir->i_sb);
 	err = -ENOSPC;
 	fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
 	if (!fnode)
@@ -274,7 +273,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t
 	d_instantiate(dentry, result);
 	mutex_unlock(&hpfs_i(dir)->i_mutex);
 	brelse(bh);
-	unlock_kernel();
+	hpfs_unlock(dir->i_sb);
 	return 0;
 bail2:
 	mutex_unlock(&hpfs_i(dir)->i_mutex);
@@ -283,7 +282,7 @@ bail1:
 	brelse(bh);
 	hpfs_free_sectors(dir->i_sb, fno, 1);
 bail:
-	unlock_kernel();
+	hpfs_unlock(dir->i_sb);
 	return err;
 }
 
@@ -299,9 +298,9 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
 	struct inode *result;
 	int err;
 	if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
-	lock_kernel();
+	hpfs_lock(dir->i_sb);
 	if (hpfs_sb(dir->i_sb)->sb_eas < 2) {
-		unlock_kernel();
+		hpfs_unlock(dir->i_sb);
 		return -EPERM;
 	}
 	err = -ENOSPC;
@@ -354,7 +353,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
 	hpfs_write_inode_nolock(result);
 	d_instantiate(dentry, result);
 	mutex_unlock(&hpfs_i(dir)->i_mutex);
-	unlock_kernel();
+	hpfs_unlock(dir->i_sb);
 	return 0;
 bail2:
 	mutex_unlock(&hpfs_i(dir)->i_mutex);
@@ -363,7 +362,7 @@ bail1:
 	brelse(bh);
 	hpfs_free_sectors(dir->i_sb, fno, 1);
 bail:
-	unlock_kernel();
+	hpfs_unlock(dir->i_sb);
 	return err;
 }
 
@@ -380,7 +379,7 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry)
 	int rep = 0;
 	int err;
 
-	lock_kernel();
+	hpfs_lock(dir->i_sb);
 	hpfs_adjust_length(name, &len);
 again:
 	mutex_lock(&hpfs_i(inode)->i_parent_mutex);
@@ -416,7 +415,7 @@ again:
 		dentry_unhash(dentry);
 		if (!d_unhashed(dentry)) {
 			dput(dentry);
-			unlock_kernel();
+			hpfs_unlock(dir->i_sb);
 			return -ENOSPC;
 		}
 		if (generic_permission(inode, MAY_WRITE, 0, NULL) ||
@@ -435,7 +434,7 @@ again:
 			if (!err)
 				goto again;
 		}
-		unlock_kernel();
+		hpfs_unlock(dir->i_sb);
 		return -ENOSPC;
 	default:
 		drop_nlink(inode);
@@ -448,7 +447,7 @@ out1:
 out:
 	mutex_unlock(&hpfs_i(dir)->i_mutex);
 	mutex_unlock(&hpfs_i(inode)->i_parent_mutex);
-	unlock_kernel();
+	hpfs_unlock(dir->i_sb);
 	return err;
 }
 
@@ -466,7 +465,7 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
 	int r;
 
 	hpfs_adjust_length(name, &len);
-	lock_kernel();
+	hpfs_lock(dir->i_sb);
 	mutex_lock(&hpfs_i(inode)->i_parent_mutex);
 	mutex_lock(&hpfs_i(dir)->i_mutex);
 	err = -ENOENT;
@@ -508,7 +507,7 @@ out1:
 out:
 	mutex_unlock(&hpfs_i(dir)->i_mutex);
 	mutex_unlock(&hpfs_i(inode)->i_parent_mutex);
-	unlock_kernel();
+	hpfs_unlock(dir->i_sb);
 	return err;
 }
 
@@ -521,21 +520,21 @@ static int hpfs_symlink_readpage(struct file *file, struct page *page)
 	int err;
 
 	err = -EIO;
-	lock_kernel();
+	hpfs_lock(i->i_sb);
 	if (!(fnode = hpfs_map_fnode(i->i_sb, i->i_ino, &bh)))
 		goto fail;
 	err = hpfs_read_ea(i->i_sb, fnode, "SYMLINK", link, PAGE_SIZE);
 	brelse(bh);
 	if (err)
 		goto fail;
-	unlock_kernel();
+	hpfs_unlock(i->i_sb);
 	SetPageUptodate(page);
 	kunmap(page);
 	unlock_page(page);
 	return 0;
 
 fail:
-	unlock_kernel();
+	hpfs_unlock(i->i_sb);
 	SetPageError(page);
 	kunmap(page);
 	unlock_page(page);
@@ -567,7 +566,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	err = 0;
 	hpfs_adjust_length(old_name, &old_len);
 
-	lock_kernel();
+	hpfs_lock(i->i_sb);
 	/* order doesn't matter, due to VFS exclusion */
 	mutex_lock(&hpfs_i(i)->i_parent_mutex);
 	if (new_inode)
@@ -659,7 +658,7 @@ end1:
 	mutex_unlock(&hpfs_i(i)->i_parent_mutex);
 	if (new_inode)
 		mutex_unlock(&hpfs_i(new_inode)->i_parent_mutex);
-	unlock_kernel();
+	hpfs_unlock(i->i_sb);
 	return err;
 }
 
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index b30426b1fc9..c89b4080858 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -13,7 +13,6 @@
 #include <linux/statfs.h>
 #include <linux/magic.h>
 #include <linux/sched.h>
-#include <linux/smp_lock.h>
 #include <linux/bitmap.h>
 #include <linux/slab.h>
 
@@ -103,15 +102,11 @@ static void hpfs_put_super(struct super_block *s)
 {
 	struct hpfs_sb_info *sbi = hpfs_sb(s);
 
-	lock_kernel();
-
 	kfree(sbi->sb_cp_table);
 	kfree(sbi->sb_bmp_dir);
 	unmark_dirty(s);
 	s->s_fs_info = NULL;
 	kfree(sbi);
-
-	unlock_kernel();
 }
 
 unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
@@ -143,7 +138,7 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	struct super_block *s = dentry->d_sb;
 	struct hpfs_sb_info *sbi = hpfs_sb(s);
 	u64 id = huge_encode_dev(s->s_bdev->bd_dev);
-	lock_kernel();
+	hpfs_lock(s);
 
 	/*if (sbi->sb_n_free == -1) {*/
 		sbi->sb_n_free = count_bitmaps(s);
@@ -160,7 +155,7 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_fsid.val[1] = (u32)(id >> 32);
 	buf->f_namelen = 254;
 
-	unlock_kernel();
+	hpfs_unlock(s);
 
 	return 0;
 }
@@ -406,7 +401,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
 	
 	*flags |= MS_NOATIME;
 	
-	lock_kernel();
+	hpfs_lock(s);
 	lock_super(s);
 	uid = sbi->sb_uid; gid = sbi->sb_gid;
 	umask = 0777 & ~sbi->sb_mode;
@@ -441,12 +436,12 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
 	replace_mount_options(s, new_opts);
 
 	unlock_super(s);
-	unlock_kernel();
+	hpfs_unlock(s);
 	return 0;
 
 out_err:
 	unlock_super(s);
-	unlock_kernel();
+	hpfs_unlock(s);
 	kfree(new_opts);
 	return -EINVAL;
 }
@@ -484,13 +479,15 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
 
 	int o;
 
-	lock_kernel();
+	if (num_possible_cpus() > 1) {
+		printk(KERN_ERR "HPFS is not SMP safe\n");
+		return -EINVAL;
+	}
 
 	save_mount_options(s, options);
 
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi) {
-		unlock_kernel();
 		return -ENOMEM;
 	}
 	s->s_fs_info = sbi;
@@ -677,7 +674,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
 			root->i_blocks = 5;
 		hpfs_brelse4(&qbh);
 	}
-	unlock_kernel();
 	return 0;
 
 bail4:	brelse(bh2);
@@ -689,7 +685,6 @@ bail0:
 	kfree(sbi->sb_cp_table);
 	s->s_fs_info = NULL;
 	kfree(sbi);
-	unlock_kernel();
 	return -EINVAL;
 }
 
-- 
cgit v1.2.3


From 788257d6101d986ac8f2741aaa35974af47f574c Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 24 Jan 2011 10:14:12 +0100
Subject: ufs: remove the BKL

This introduces a new per-superblock mutex in UFS to replace
the big kernel lock. I have been careful to avoid nested
calls to lock_ufs and to get the lock order right with
respect to other mutexes, in particular lock_super.

I did not make any attempt to prove that the big kernel
lock is not needed in a particular place in the code,
which is very possible.

The mutex has a significant performance impact, so it is only
used on SMP or PREEMPT configurations.

As Nick Piggin noticed, any allocation inside of the lock
may end up deadlocking when we get to ufs_getfrag_block
in the reclaim task, so we now use GFP_NOFS.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Tested-by: Nick Bowler <nbowler@elliptictech.com>
Cc: Evgeniy Dushistov <dushistov@mail.ru>
Cc: Nick Piggin <npiggin@gmail.com>
---
 fs/ufs/Kconfig    |  1 -
 fs/ufs/inode.c    | 78 +++++++++++++++----------------------------------------
 fs/ufs/namei.c    | 35 ++++++++++++-------------
 fs/ufs/super.c    | 64 ++++++++++++++++++++++++++-------------------
 fs/ufs/truncate.c |  5 ++--
 fs/ufs/ufs.h      |  6 ++++-
 fs/ufs/util.c     |  2 +-
 7 files changed, 83 insertions(+), 108 deletions(-)

(limited to 'fs')

diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig
index 30c8f223253..e4f10a40768 100644
--- a/fs/ufs/Kconfig
+++ b/fs/ufs/Kconfig
@@ -1,7 +1,6 @@
 config UFS_FS
 	tristate "UFS file system support (read only)"
 	depends on BLOCK
-	depends on BKL # probably fixable
 	help
 	  BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD,
 	  OpenBSD and NeXTstep) use a file system called UFS. Some System V
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 2b251f2093a..03c255f12df 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -34,7 +34,6 @@
 #include <linux/stat.h>
 #include <linux/string.h>
 #include <linux/mm.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
 
@@ -43,7 +42,7 @@
 #include "swab.h"
 #include "util.h"
 
-static u64 ufs_frag_map(struct inode *inode, sector_t frag);
+static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock);
 
 static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4])
 {
@@ -82,7 +81,7 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off
  * the begining of the filesystem.
  */
 
-static u64 ufs_frag_map(struct inode *inode, sector_t frag)
+static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock)
 {
 	struct ufs_inode_info *ufsi = UFS_I(inode);
 	struct super_block *sb = inode->i_sb;
@@ -107,7 +106,8 @@ static u64 ufs_frag_map(struct inode *inode, sector_t frag)
 
 	p = offsets;
 
-	lock_kernel();
+	if (needs_lock)
+		lock_ufs(sb);
 	if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
 		goto ufs2;
 
@@ -152,7 +152,8 @@ ufs2:
 	ret = temp + (u64) (frag & uspi->s_fpbmask);
 
 out:
-	unlock_kernel();
+	if (needs_lock)
+		unlock_ufs(sb);
 	return ret;
 }
 
@@ -415,14 +416,16 @@ out:
 int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create)
 {
 	struct super_block * sb = inode->i_sb;
-	struct ufs_sb_private_info * uspi = UFS_SB(sb)->s_uspi;
+	struct ufs_sb_info * sbi = UFS_SB(sb);
+	struct ufs_sb_private_info * uspi = sbi->s_uspi;
 	struct buffer_head * bh;
 	int ret, err, new;
 	unsigned long ptr,phys;
 	u64 phys64 = 0;
+	bool needs_lock = (sbi->mutex_owner != current);
 	
 	if (!create) {
-		phys64 = ufs_frag_map(inode, fragment);
+		phys64 = ufs_frag_map(inode, fragment, needs_lock);
 		UFSD("phys64 = %llu\n", (unsigned long long)phys64);
 		if (phys64)
 			map_bh(bh_result, sb, phys64);
@@ -436,7 +439,8 @@ int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head
 	ret = 0;
 	bh = NULL;
 
-	lock_kernel();
+	if (needs_lock)
+		lock_ufs(sb);
 
 	UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment);
 	if (fragment >
@@ -498,7 +502,9 @@ out:
 		set_buffer_new(bh_result);
 	map_bh(bh_result, sb, phys);
 abort:
-	unlock_kernel();
+	if (needs_lock)
+		unlock_ufs(sb);
+
 	return err;
 
 abort_too_big:
@@ -506,48 +512,6 @@ abort_too_big:
 	goto abort;
 }
 
-static struct buffer_head *ufs_getfrag(struct inode *inode,
-				       unsigned int fragment,
-				       int create, int *err)
-{
-	struct buffer_head dummy;
-	int error;
-
-	dummy.b_state = 0;
-	dummy.b_blocknr = -1000;
-	error = ufs_getfrag_block(inode, fragment, &dummy, create);
-	*err = error;
-	if (!error && buffer_mapped(&dummy)) {
-		struct buffer_head *bh;
-		bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
-		if (buffer_new(&dummy)) {
-			memset(bh->b_data, 0, inode->i_sb->s_blocksize);
-			set_buffer_uptodate(bh);
-			mark_buffer_dirty(bh);
-		}
-		return bh;
-	}
-	return NULL;
-}
-
-struct buffer_head * ufs_bread (struct inode * inode, unsigned fragment,
-	int create, int * err)
-{
-	struct buffer_head * bh;
-
-	UFSD("ENTER, ino %lu, fragment %u\n", inode->i_ino, fragment);
-	bh = ufs_getfrag (inode, fragment, create, err);
-	if (!bh || buffer_uptodate(bh)) 		
-		return bh;
-	ll_rw_block (READ, 1, &bh);
-	wait_on_buffer (bh);
-	if (buffer_uptodate(bh))
-		return bh;
-	brelse (bh);
-	*err = -EIO;
-	return NULL;
-}
-
 static int ufs_writepage(struct page *page, struct writeback_control *wbc)
 {
 	return block_write_full_page(page,ufs_getfrag_block,wbc);
@@ -900,9 +864,9 @@ static int ufs_update_inode(struct inode * inode, int do_sync)
 int ufs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	int ret;
-	lock_kernel();
+	lock_ufs(inode->i_sb);
 	ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
-	unlock_kernel();
+	unlock_ufs(inode->i_sb);
 	return ret;
 }
 
@@ -922,22 +886,22 @@ void ufs_evict_inode(struct inode * inode)
 	if (want_delete) {
 		loff_t old_i_size;
 		/*UFS_I(inode)->i_dtime = CURRENT_TIME;*/
-		lock_kernel();
+		lock_ufs(inode->i_sb);
 		mark_inode_dirty(inode);
 		ufs_update_inode(inode, IS_SYNC(inode));
 		old_i_size = inode->i_size;
 		inode->i_size = 0;
 		if (inode->i_blocks && ufs_truncate(inode, old_i_size))
 			ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n");
-		unlock_kernel();
+		unlock_ufs(inode->i_sb);
 	}
 
 	invalidate_inode_buffers(inode);
 	end_writeback(inode);
 
 	if (want_delete) {
-		lock_kernel();
+		lock_ufs(inode->i_sb);
 		ufs_free_inode (inode);
-		unlock_kernel();
+		unlock_ufs(inode->i_sb);
 	}
 }
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 12f39b9e443..205030a707f 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -29,7 +29,6 @@
 
 #include <linux/time.h>
 #include <linux/fs.h>
-#include <linux/smp_lock.h>
 
 #include "ufs_fs.h"
 #include "ufs.h"
@@ -55,16 +54,16 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, stru
 	if (dentry->d_name.len > UFS_MAXNAMLEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	lock_kernel();
+	lock_ufs(dir->i_sb);
 	ino = ufs_inode_by_name(dir, &dentry->d_name);
 	if (ino) {
 		inode = ufs_iget(dir->i_sb, ino);
 		if (IS_ERR(inode)) {
-			unlock_kernel();
+			unlock_ufs(dir->i_sb);
 			return ERR_CAST(inode);
 		}
 	}
-	unlock_kernel();
+	unlock_ufs(dir->i_sb);
 	d_add(dentry, inode);
 	return NULL;
 }
@@ -93,9 +92,9 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
 		inode->i_fop = &ufs_file_operations;
 		inode->i_mapping->a_ops = &ufs_aops;
 		mark_inode_dirty(inode);
-		lock_kernel();
+		lock_ufs(dir->i_sb);
 		err = ufs_add_nondir(dentry, inode);
-		unlock_kernel();
+		unlock_ufs(dir->i_sb);
 	}
 	UFSD("END: err=%d\n", err);
 	return err;
@@ -115,9 +114,9 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t
 		init_special_inode(inode, mode, rdev);
 		ufs_set_inode_dev(inode->i_sb, UFS_I(inode), rdev);
 		mark_inode_dirty(inode);
-		lock_kernel();
+		lock_ufs(dir->i_sb);
 		err = ufs_add_nondir(dentry, inode);
-		unlock_kernel();
+		unlock_ufs(dir->i_sb);
 	}
 	return err;
 }
@@ -133,7 +132,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
 	if (l > sb->s_blocksize)
 		goto out_notlocked;
 
-	lock_kernel();
+	lock_ufs(dir->i_sb);
 	inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
@@ -156,7 +155,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
 
 	err = ufs_add_nondir(dentry, inode);
 out:
-	unlock_kernel();
+	unlock_ufs(dir->i_sb);
 out_notlocked:
 	return err;
 
@@ -172,9 +171,9 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
 	struct inode *inode = old_dentry->d_inode;
 	int error;
 
-	lock_kernel();
+	lock_ufs(dir->i_sb);
 	if (inode->i_nlink >= UFS_LINK_MAX) {
-		unlock_kernel();
+		unlock_ufs(dir->i_sb);
 		return -EMLINK;
 	}
 
@@ -183,7 +182,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
 	ihold(inode);
 
 	error = ufs_add_nondir(dentry, inode);
-	unlock_kernel();
+	unlock_ufs(dir->i_sb);
 	return error;
 }
 
@@ -195,7 +194,7 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
 	if (dir->i_nlink >= UFS_LINK_MAX)
 		goto out;
 
-	lock_kernel();
+	lock_ufs(dir->i_sb);
 	inode_inc_link_count(dir);
 
 	inode = ufs_new_inode(dir, S_IFDIR|mode);
@@ -216,7 +215,7 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
 	err = ufs_add_link(dentry, inode);
 	if (err)
 		goto out_fail;
-	unlock_kernel();
+	unlock_ufs(dir->i_sb);
 
 	d_instantiate(dentry, inode);
 out:
@@ -228,7 +227,7 @@ out_fail:
 	iput (inode);
 out_dir:
 	inode_dec_link_count(dir);
-	unlock_kernel();
+	unlock_ufs(dir->i_sb);
 	goto out;
 }
 
@@ -259,7 +258,7 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
 	struct inode * inode = dentry->d_inode;
 	int err= -ENOTEMPTY;
 
-	lock_kernel();
+	lock_ufs(dir->i_sb);
 	if (ufs_empty_dir (inode)) {
 		err = ufs_unlink(dir, dentry);
 		if (!err) {
@@ -268,7 +267,7 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
 			inode_dec_link_count(dir);
 		}
 	}
-	unlock_kernel();
+	unlock_ufs(dir->i_sb);
 	return err;
 }
 
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 2c61ac5d4e4..7693d629340 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -84,7 +84,6 @@
 #include <linux/blkdev.h>
 #include <linux/init.h>
 #include <linux/parser.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
 #include <linux/log2.h>
@@ -96,6 +95,26 @@
 #include "swab.h"
 #include "util.h"
 
+void lock_ufs(struct super_block *sb)
+{
+#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT)
+	struct ufs_sb_info *sbi = UFS_SB(sb);
+
+	mutex_lock(&sbi->mutex);
+	sbi->mutex_owner = current;
+#endif
+}
+
+void unlock_ufs(struct super_block *sb)
+{
+#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT)
+	struct ufs_sb_info *sbi = UFS_SB(sb);
+
+	sbi->mutex_owner = NULL;
+	mutex_unlock(&sbi->mutex);
+#endif
+}
+
 static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
 {
 	struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
@@ -313,7 +332,6 @@ void ufs_panic (struct super_block * sb, const char * function,
 	struct ufs_super_block_first * usb1;
 	va_list args;
 	
-	lock_kernel();
 	uspi = UFS_SB(sb)->s_uspi;
 	usb1 = ubh_get_usb_first(uspi);
 	
@@ -521,7 +539,7 @@ static int ufs_read_cylinder_structures(struct super_block *sb)
 	 */
 	size = uspi->s_cssize;
 	blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
-	base = space = kmalloc(size, GFP_KERNEL);
+	base = space = kmalloc(size, GFP_NOFS);
 	if (!base)
 		goto failed; 
 	sbi->s_csp = (struct ufs_csum *)space;
@@ -546,7 +564,7 @@ static int ufs_read_cylinder_structures(struct super_block *sb)
 	 * Read cylinder group (we read only first fragment from block
 	 * at this time) and prepare internal data structures for cg caching.
 	 */
-	if (!(sbi->s_ucg = kmalloc (sizeof(struct buffer_head *) * uspi->s_ncg, GFP_KERNEL)))
+	if (!(sbi->s_ucg = kmalloc (sizeof(struct buffer_head *) * uspi->s_ncg, GFP_NOFS)))
 		goto failed;
 	for (i = 0; i < uspi->s_ncg; i++) 
 		sbi->s_ucg[i] = NULL;
@@ -564,7 +582,7 @@ static int ufs_read_cylinder_structures(struct super_block *sb)
 		ufs_print_cylinder_stuff(sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data);
 	}
 	for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) {
-		if (!(sbi->s_ucpi[i] = kmalloc (sizeof(struct ufs_cg_private_info), GFP_KERNEL)))
+		if (!(sbi->s_ucpi[i] = kmalloc (sizeof(struct ufs_cg_private_info), GFP_NOFS)))
 			goto failed;
 		sbi->s_cgno[i] = UFS_CGNO_EMPTY;
 	}
@@ -646,8 +664,6 @@ static void ufs_put_super_internal(struct super_block *sb)
 	
 	UFSD("ENTER\n");
 
-	lock_kernel();
-
 	ufs_put_cstotal(sb);
 	size = uspi->s_cssize;
 	blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
@@ -676,8 +692,6 @@ static void ufs_put_super_internal(struct super_block *sb)
 	kfree (sbi->s_ucg);
 	kfree (base);
 
-	unlock_kernel();
-
 	UFSD("EXIT\n");
 }
 
@@ -696,8 +710,6 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 	unsigned maxsymlen;
 	int ret = -EINVAL;
 
-	lock_kernel();
-
 	uspi = NULL;
 	ubh = NULL;
 	flags = 0;
@@ -718,6 +730,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed;
 	}
 #endif
+	mutex_init(&sbi->mutex);
 	/*
 	 * Set default mount options
 	 * Parse mount options
@@ -1165,7 +1178,6 @@ magic_found:
 			goto failed;
 
 	UFSD("EXIT\n");
-	unlock_kernel();
 	return 0;
 
 dalloc_failed:
@@ -1177,12 +1189,10 @@ failed:
 	kfree(sbi);
 	sb->s_fs_info = NULL;
 	UFSD("EXIT (FAILED)\n");
-	unlock_kernel();
 	return ret;
 
 failed_nomem:
 	UFSD("EXIT (NOMEM)\n");
-	unlock_kernel();
 	return -ENOMEM;
 }
 
@@ -1193,8 +1203,8 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
 	struct ufs_super_block_third * usb3;
 	unsigned flags;
 
+	lock_ufs(sb);
 	lock_super(sb);
-	lock_kernel();
 
 	UFSD("ENTER\n");
 
@@ -1213,8 +1223,8 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
 	sb->s_dirt = 0;
 
 	UFSD("EXIT\n");
-	unlock_kernel();
 	unlock_super(sb);
+	unlock_ufs(sb);
 
 	return 0;
 }
@@ -1256,7 +1266,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	unsigned new_mount_opt, ufstype;
 	unsigned flags;
 
-	lock_kernel();
+	lock_ufs(sb);
 	lock_super(sb);
 	uspi = UFS_SB(sb)->s_uspi;
 	flags = UFS_SB(sb)->s_flags;
@@ -1272,7 +1282,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	ufs_set_opt (new_mount_opt, ONERROR_LOCK);
 	if (!ufs_parse_options (data, &new_mount_opt)) {
 		unlock_super(sb);
-		unlock_kernel();
+		unlock_ufs(sb);
 		return -EINVAL;
 	}
 	if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) {
@@ -1280,14 +1290,14 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	} else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
 		printk("ufstype can't be changed during remount\n");
 		unlock_super(sb);
-		unlock_kernel();
+		unlock_ufs(sb);
 		return -EINVAL;
 	}
 
 	if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
 		UFS_SB(sb)->s_mount_opt = new_mount_opt;
 		unlock_super(sb);
-		unlock_kernel();
+		unlock_ufs(sb);
 		return 0;
 	}
 	
@@ -1313,7 +1323,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 		printk("ufs was compiled with read-only support, "
 		"can't be mounted as read-write\n");
 		unlock_super(sb);
-		unlock_kernel();
+		unlock_ufs(sb);
 		return -EINVAL;
 #else
 		if (ufstype != UFS_MOUNT_UFSTYPE_SUN && 
@@ -1323,13 +1333,13 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 		    ufstype != UFS_MOUNT_UFSTYPE_UFS2) {
 			printk("this ufstype is read-only supported\n");
 			unlock_super(sb);
-			unlock_kernel();
+			unlock_ufs(sb);
 			return -EINVAL;
 		}
 		if (!ufs_read_cylinder_structures(sb)) {
 			printk("failed during remounting\n");
 			unlock_super(sb);
-			unlock_kernel();
+			unlock_ufs(sb);
 			return -EPERM;
 		}
 		sb->s_flags &= ~MS_RDONLY;
@@ -1337,7 +1347,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	}
 	UFS_SB(sb)->s_mount_opt = new_mount_opt;
 	unlock_super(sb);
-	unlock_kernel();
+	unlock_ufs(sb);
 	return 0;
 }
 
@@ -1371,7 +1381,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	struct ufs_super_block_third *usb3;
 	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
-	lock_kernel();
+	lock_ufs(sb);
 
 	usb1 = ubh_get_usb_first(uspi);
 	usb2 = ubh_get_usb_second(uspi);
@@ -1395,7 +1405,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_fsid.val[0] = (u32)id;
 	buf->f_fsid.val[1] = (u32)(id >> 32);
 
-	unlock_kernel();
+	unlock_ufs(sb);
 
 	return 0;
 }
@@ -1405,7 +1415,7 @@ static struct kmem_cache * ufs_inode_cachep;
 static struct inode *ufs_alloc_inode(struct super_block *sb)
 {
 	struct ufs_inode_info *ei;
-	ei = (struct ufs_inode_info *)kmem_cache_alloc(ufs_inode_cachep, GFP_KERNEL);
+	ei = (struct ufs_inode_info *)kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
 	ei->vfs_inode.i_version = 1;
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index a58f9155fc9..e56a4f56721 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -40,7 +40,6 @@
 #include <linux/time.h>
 #include <linux/stat.h>
 #include <linux/string.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/blkdev.h>
 #include <linux/sched.h>
@@ -467,7 +466,6 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size)
 
 	block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block);
 
-	lock_kernel();
 	while (1) {
 		retry = ufs_trunc_direct(inode);
 		retry |= ufs_trunc_indirect(inode, UFS_IND_BLOCK,
@@ -487,7 +485,6 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size)
 
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
 	ufsi->i_lastfrag = DIRECT_FRAGMENT;
-	unlock_kernel();
 	mark_inode_dirty(inode);
 out:
 	UFSD("EXIT: err %d\n", err);
@@ -510,7 +507,9 @@ int ufs_setattr(struct dentry *dentry, struct iattr *attr)
 		/* XXX(truncate): truncate_setsize should be called last */
 		truncate_setsize(inode, attr->ia_size);
 
+		lock_ufs(inode->i_sb);
 		error = ufs_truncate(inode, old_i_size);
+		unlock_ufs(inode->i_sb);
 		if (error)
 			return error;
 	}
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index c08782e1b48..5be2755dd71 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -18,6 +18,8 @@ struct ufs_sb_info {
 	unsigned s_cgno[UFS_MAX_GROUP_LOADED];
 	unsigned short s_cg_loaded;
 	unsigned s_mount_opt;
+	struct mutex mutex;
+	struct task_struct *mutex_owner;
 };
 
 struct ufs_inode_info {
@@ -109,7 +111,6 @@ extern struct inode *ufs_iget(struct super_block *, unsigned long);
 extern int ufs_write_inode (struct inode *, struct writeback_control *);
 extern int ufs_sync_inode (struct inode *);
 extern void ufs_evict_inode (struct inode *);
-extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *);
 extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create);
 
 /* namei.c */
@@ -154,4 +155,7 @@ static inline u32 ufs_dtogd(struct ufs_sb_private_info * uspi, u64 b)
 	return do_div(b, uspi->s_fpg);
 }
 
+extern void lock_ufs(struct super_block *sb);
+extern void unlock_ufs(struct super_block *sb);
+
 #endif /* _UFS_UFS_H */
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index d2c36d53fe6..95425b59ce0 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -27,7 +27,7 @@ struct ufs_buffer_head * _ubh_bread_ (struct ufs_sb_private_info * uspi,
 	if (count > UFS_MAXFRAG)
 		return NULL;
 	ubh = (struct ufs_buffer_head *)
-		kmalloc (sizeof (struct ufs_buffer_head), GFP_KERNEL);
+		kmalloc (sizeof (struct ufs_buffer_head), GFP_NOFS);
 	if (!ubh)
 		return NULL;
 	ubh->fragment = fragment;
-- 
cgit v1.2.3


From 4787d45fa76b97e224a8299086bb5fb496275796 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 2 Mar 2011 09:38:45 -0500
Subject: sysv: i_nlink races in rename()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/sysv/namei.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index b427b1208c2..e474fbcf8bd 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -245,7 +245,6 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
 		new_de = sysv_find_entry(new_dentry, &new_page);
 		if (!new_de)
 			goto out_dir;
-		inode_inc_link_count(old_inode);
 		sysv_set_link(new_de, new_page, old_inode);
 		new_inode->i_ctime = CURRENT_TIME_SEC;
 		if (dir_de)
@@ -257,18 +256,15 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
 			if (new_dir->i_nlink >= SYSV_SB(new_dir->i_sb)->s_link_max)
 				goto out_dir;
 		}
-		inode_inc_link_count(old_inode);
 		err = sysv_add_link(new_dentry, old_inode);
-		if (err) {
-			inode_dec_link_count(old_inode);
+		if (err)
 			goto out_dir;
-		}
 		if (dir_de)
 			inode_inc_link_count(new_dir);
 	}
 
 	sysv_delete_entry(old_de, old_page);
-	inode_dec_link_count(old_inode);
+	mark_inode_dirty(old_inode);
 
 	if (dir_de) {
 		sysv_set_link(dir_de, dir_page, new_dir);
-- 
cgit v1.2.3


From 37750cdda36721fa7fa816f5f58258d2c013b248 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 2 Mar 2011 09:40:21 -0500
Subject: ufs: i_nlink races in rename()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ufs/namei.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 12f39b9e443..d6f681535eb 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -306,7 +306,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page);
 		if (!new_de)
 			goto out_dir;
-		inode_inc_link_count(old_inode);
 		ufs_set_link(new_dir, new_de, new_page, old_inode);
 		new_inode->i_ctime = CURRENT_TIME_SEC;
 		if (dir_de)
@@ -318,12 +317,9 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			if (new_dir->i_nlink >= UFS_LINK_MAX)
 				goto out_dir;
 		}
-		inode_inc_link_count(old_inode);
 		err = ufs_add_link(new_dentry, old_inode);
-		if (err) {
-			inode_dec_link_count(old_inode);
+		if (err)
 			goto out_dir;
-		}
 		if (dir_de)
 			inode_inc_link_count(new_dir);
 	}
@@ -331,12 +327,11 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	/*
 	 * Like most other Unix systems, set the ctime for inodes on a
  	 * rename.
-	 * inode_dec_link_count() will mark the inode dirty.
 	 */
 	old_inode->i_ctime = CURRENT_TIME_SEC;
 
 	ufs_delete_entry(old_dir, old_de, old_page);
-	inode_dec_link_count(old_inode);
+	mark_inode_dirty(old_inode);
 
 	if (dir_de) {
 		ufs_set_link(old_inode, dir_de, dir_page, new_dir);
-- 
cgit v1.2.3


From 6f88049caf56022c773272e03ddfa8cf79867059 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 2 Mar 2011 09:41:38 -0500
Subject: minix: i_nlink races in rename()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/minix/namei.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index ce7337ddfdb..6e6777f1b4b 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -213,7 +213,6 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
 		new_de = minix_find_entry(new_dentry, &new_page);
 		if (!new_de)
 			goto out_dir;
-		inode_inc_link_count(old_inode);
 		minix_set_link(new_de, new_page, old_inode);
 		new_inode->i_ctime = CURRENT_TIME_SEC;
 		if (dir_de)
@@ -225,18 +224,15 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
 			if (new_dir->i_nlink >= info->s_link_max)
 				goto out_dir;
 		}
-		inode_inc_link_count(old_inode);
 		err = minix_add_link(new_dentry, old_inode);
-		if (err) {
-			inode_dec_link_count(old_inode);
+		if (err)
 			goto out_dir;
-		}
 		if (dir_de)
 			inode_inc_link_count(new_dir);
 	}
 
 	minix_delete_entry(old_de, old_page);
-	inode_dec_link_count(old_inode);
+	mark_inode_dirty(old_inode);
 
 	if (dir_de) {
 		minix_set_link(dir_de, dir_page, new_dir);
-- 
cgit v1.2.3


From 30eb43d31478f0fca28423623f3ec6af13f845fa Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 2 Mar 2011 12:01:13 -0500
Subject: nilfs2: i_nlink races in rename()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nilfs2/namei.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 98034271cd0..161791d2645 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -397,7 +397,6 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page);
 		if (!new_de)
 			goto out_dir;
-		inc_nlink(old_inode);
 		nilfs_set_link(new_dir, new_de, new_page, old_inode);
 		nilfs_mark_inode_dirty(new_dir);
 		new_inode->i_ctime = CURRENT_TIME;
@@ -411,13 +410,9 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			if (new_dir->i_nlink >= NILFS_LINK_MAX)
 				goto out_dir;
 		}
-		inc_nlink(old_inode);
 		err = nilfs_add_link(new_dentry, old_inode);
-		if (err) {
-			drop_nlink(old_inode);
-			nilfs_mark_inode_dirty(old_inode);
+		if (err)
 			goto out_dir;
-		}
 		if (dir_de) {
 			inc_nlink(new_dir);
 			nilfs_mark_inode_dirty(new_dir);
@@ -431,7 +426,6 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	old_inode->i_ctime = CURRENT_TIME;
 
 	nilfs_delete_entry(old_de, old_page);
-	drop_nlink(old_inode);
 
 	if (dir_de) {
 		nilfs_set_link(old_inode, dir_de, dir_page, new_dir);
-- 
cgit v1.2.3


From babfe56046885749b6a90a3c4409219a1f16cf48 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 2 Mar 2011 16:42:38 -0500
Subject: exofs: i_nlink races in rename()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/exofs/namei.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index 264e95d0283..4d70db110cf 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -272,7 +272,6 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		new_de = exofs_find_entry(new_dir, new_dentry, &new_page);
 		if (!new_de)
 			goto out_dir;
-		inode_inc_link_count(old_inode);
 		err = exofs_set_link(new_dir, new_de, new_page, old_inode);
 		new_inode->i_ctime = CURRENT_TIME;
 		if (dir_de)
@@ -286,12 +285,9 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			if (new_dir->i_nlink >= EXOFS_LINK_MAX)
 				goto out_dir;
 		}
-		inode_inc_link_count(old_inode);
 		err = exofs_add_link(new_dentry, old_inode);
-		if (err) {
-			inode_dec_link_count(old_inode);
+		if (err)
 			goto out_dir;
-		}
 		if (dir_de)
 			inode_inc_link_count(new_dir);
 	}
@@ -299,7 +295,7 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	old_inode->i_ctime = CURRENT_TIME;
 
 	exofs_delete_entry(old_de, old_page);
-	inode_dec_link_count(old_inode);
+	mark_inode_dirty(old_inode);
 
 	if (dir_de) {
 		err = exofs_set_link(old_inode, dir_de, dir_page, new_dir);
-- 
cgit v1.2.3


From 99890a3be1ee67346300f1e0a873006588760f2a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 2 Mar 2011 09:35:13 -0500
Subject: fix reiserfs mkdir() breakage

If a directory has so many subdirectories that its link count is set
to 1 (i.e. "can't tell accurately") and reiserfs_new_inode() fails,
we shouldn't decrement the parent's link count in the cleanup path;
that's what DEC_DIR_INODE_NLINK() is for.  As it is, we end up
with the parent suddenly getting zero i_nlink, with very unpleasant
effects.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/reiserfs/namei.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index ba5f51ec345..68fdf45cc6c 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -771,7 +771,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 					EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE,
 					dentry, inode, &security);
 	if (retval) {
-		dir->i_nlink--;
+		DEC_DIR_INODE_NLINK(dir)
 		goto out_failed;
 	}
 
-- 
cgit v1.2.3


From 810c1b2e48d32a8605928c3609262d94853c3a76 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 2 Mar 2011 10:15:26 -0500
Subject: udf: fix i_nlink limit

(256 << sizeof(x)) - 1 is not the maximal possible value of x...
In reality, the maximal allowed value for UDF FileLinkCount is
65535.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/udf/namei.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 2be0f9eb86d..b7c338d5e9d 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -32,6 +32,8 @@
 #include <linux/crc-itu-t.h>
 #include <linux/exportfs.h>
 
+enum { UDF_MAX_LINKS = 0xffff };
+
 static inline int udf_match(int len1, const unsigned char *name1, int len2,
 			    const unsigned char *name2)
 {
@@ -650,7 +652,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	struct udf_inode_info *iinfo;
 
 	err = -EMLINK;
-	if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1)
+	if (dir->i_nlink >= UDF_MAX_LINKS)
 		goto out;
 
 	err = -EIO;
@@ -1034,9 +1036,8 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
 	struct fileIdentDesc cfi, *fi;
 	int err;
 
-	if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) {
+	if (inode->i_nlink >= UDF_MAX_LINKS)
 		return -EMLINK;
-	}
 
 	fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
 	if (!fi) {
@@ -1131,9 +1132,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
 			goto end_rename;
 
 		retval = -EMLINK;
-		if (!new_inode &&
-			new_dir->i_nlink >=
-				(256 << sizeof(new_dir->i_nlink)) - 1)
+		if (!new_inode && new_dir->i_nlink >= UDF_MAX_LINKS)
 			goto end_rename;
 	}
 	if (!nfi) {
-- 
cgit v1.2.3


From 69102e9b4b61f56a26717659ec2e572a6b18458d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 2 Mar 2011 23:46:51 -0500
Subject: hfs: fix rename() over non-empty directory

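hfs_rename() removed an existing target with hfs_unlink(), which has no
emptiness check, so a non-empty directory could be renamed over.  Merge
hfs_unlink() and hfs_rmdir() into hfs_remove(), which does the check for
directories, while we are at it.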

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/hfs/dir.c | 50 +++++++++++++-------------------------------------
 1 file changed, 13 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index afa66aaa223..b4d70b13be9 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -238,46 +238,22 @@ static int hfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 }
 
 /*
- * hfs_unlink()
+ * hfs_remove()
  *
- * This is the unlink() entry in the inode_operations structure for
- * regular HFS directories.  The purpose is to delete an existing
- * file, given the inode for the parent directory and the name
- * (and its length) of the existing file.
- */
-static int hfs_unlink(struct inode *dir, struct dentry *dentry)
-{
-	struct inode *inode;
-	int res;
-
-	inode = dentry->d_inode;
-	res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name);
-	if (res)
-		return res;
-
-	drop_nlink(inode);
-	hfs_delete_inode(inode);
-	inode->i_ctime = CURRENT_TIME_SEC;
-	mark_inode_dirty(inode);
-
-	return res;
-}
-
-/*
- * hfs_rmdir()
+ * This serves as both unlink() and rmdir() in the inode_operations
+ * structure for regular HFS directories.  The purpose is to delete
+ * an existing child, given the inode for the parent directory and
+ * the name (and its length) of that child.
  *
- * This is the rmdir() entry in the inode_operations structure for
- * regular HFS directories.  The purpose is to delete an existing
- * directory, given the inode for the parent directory and the name
- * (and its length) of the existing directory.
+ * HFS does not have hardlinks, so both rmdir and unlink set the
+ * link count to 0.  The only difference is the emptiness check.
  */
-static int hfs_rmdir(struct inode *dir, struct dentry *dentry)
+static int hfs_remove(struct inode *dir, struct dentry *dentry)
 {
-	struct inode *inode;
+	struct inode *inode = dentry->d_inode;
 	int res;
 
-	inode = dentry->d_inode;
-	if (inode->i_size != 2)
+	if (S_ISDIR(inode->i_mode) && inode->i_size != 2)
 		return -ENOTEMPTY;
 	res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name);
 	if (res)
@@ -307,7 +283,7 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	/* Unlink destination if it already exists */
 	if (new_dentry->d_inode) {
-		res = hfs_unlink(new_dir, new_dentry);
+		res = hfs_remove(new_dir, new_dentry);
 		if (res)
 			return res;
 	}
@@ -332,9 +308,9 @@ const struct file_operations hfs_dir_operations = {
 const struct inode_operations hfs_dir_inode_operations = {
 	.create		= hfs_create,
 	.lookup		= hfs_lookup,
-	.unlink		= hfs_unlink,
+	.unlink		= hfs_remove,
 	.mkdir		= hfs_mkdir,
-	.rmdir		= hfs_rmdir,
+	.rmdir		= hfs_remove,
 	.rename		= hfs_rename,
 	.setattr	= hfs_inode_setattr,
 };
-- 
cgit v1.2.3


From 9bde178d052418af0b8e0f12932cf02ab4764c9d Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 28 Feb 2011 09:47:37 -0800
Subject: Revert "ceph: keep reference to parent inode on ceph_dentry"

This reverts commit 97d79b403ef03f729883246208ef5d8a2ebc4d68.

This fails to account for d_parent changes due to rename or disconnected
dentries due to submounts or NFS reexports.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c   | 5 +----
 fs/ceph/super.h | 1 -
 2 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index f0aef787a10..0bc68de8edd 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -60,7 +60,6 @@ int ceph_init_dentry(struct dentry *dentry)
 	}
 	di->dentry = dentry;
 	di->lease_session = NULL;
-	di->parent_inode = igrab(dentry->d_parent->d_inode);
 	dentry->d_fsdata = di;
 	dentry->d_time = jiffies;
 	ceph_dentry_lru_add(dentry);
@@ -1034,7 +1033,7 @@ static void ceph_dentry_release(struct dentry *dentry)
 	u64 snapid = CEPH_NOSNAP;
 
 	if (!IS_ROOT(dentry)) {
-		parent_inode = di->parent_inode;
+		parent_inode = dentry->d_parent->d_inode;
 		if (parent_inode)
 			snapid = ceph_snap(parent_inode);
 	}
@@ -1059,8 +1058,6 @@ static void ceph_dentry_release(struct dentry *dentry)
 		kmem_cache_free(ceph_dentry_cachep, di);
 		dentry->d_fsdata = NULL;
 	}
-	if (parent_inode)
-		iput(parent_inode);
 }
 
 static int ceph_snapdir_d_revalidate(struct dentry *dentry,
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 88fcaa21b80..20b907d76ae 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -207,7 +207,6 @@ struct ceph_dentry_info {
 	struct dentry *dentry;
 	u64 time;
 	u64 offset;
-	struct inode *parent_inode;
 };
 
 struct ceph_inode_xattrs_info {
-- 
cgit v1.2.3


From b545cc1505eb49247071ce9f4092665de788ca00 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 28 Feb 2011 12:46:46 -0800
Subject: ceph: do not set I_COMPLETE

Do not set the I_COMPLETE flag on directories until we resolve races with
dcache pruning.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c   | 2 +-
 fs/ceph/inode.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 0bc68de8edd..9b4f9d9947b 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -409,7 +409,7 @@ more:
 	spin_lock(&inode->i_lock);
 	if (ci->i_release_count == fi->dir_release_count) {
 		dout(" marking %p complete\n", inode);
-		ci->i_ceph_flags |= CEPH_I_COMPLETE;
+		/* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
 		ci->i_max_offset = filp->f_pos;
 	}
 	spin_unlock(&inode->i_lock);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 5625463aa47..193bfa5e9cb 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -707,7 +707,7 @@ static int fill_inode(struct inode *inode,
 		    (issued & CEPH_CAP_FILE_EXCL) == 0 &&
 		    (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
 			dout(" marking %p complete (empty)\n", inode);
-			ci->i_ceph_flags |= CEPH_I_COMPLETE;
+			/* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
 			ci->i_max_offset = 2;
 		}
 		break;
-- 
cgit v1.2.3


From 16a8b70a5a757db513f036bbcc73309f6c507d81 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 28 Feb 2011 12:49:15 -0800
Subject: ceph: do not clear I_COMPLETE from d_release

First, this was racy anyway: d_release isn't called until well after the
dentry is unhashed.  Second, this runs afoul of the recent dcache change
that clears d_parent prior to calling d_release (949854d0), causing a NULL
pointer dereference.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c | 22 +---------------------
 1 file changed, 1 insertion(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 9b4f9d9947b..196fd4c62db 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1029,28 +1029,8 @@ out_touch:
 static void ceph_dentry_release(struct dentry *dentry)
 {
 	struct ceph_dentry_info *di = ceph_dentry(dentry);
-	struct inode *parent_inode = NULL;
-	u64 snapid = CEPH_NOSNAP;
 
-	if (!IS_ROOT(dentry)) {
-		parent_inode = dentry->d_parent->d_inode;
-		if (parent_inode)
-			snapid = ceph_snap(parent_inode);
-	}
-	dout("dentry_release %p parent %p\n", dentry, parent_inode);
-	if (parent_inode && snapid != CEPH_SNAPDIR) {
-		struct ceph_inode_info *ci = ceph_inode(parent_inode);
-
-		spin_lock(&parent_inode->i_lock);
-		if (ci->i_shared_gen == di->lease_shared_gen ||
-		    snapid <= CEPH_MAXSNAP) {
-			dout(" clearing %p complete (d_release)\n",
-			     parent_inode);
-			ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
-			ci->i_release_count++;
-		}
-		spin_unlock(&parent_inode->i_lock);
-	}
+	dout("dentry_release %p\n", dentry);
 	if (di) {
 		ceph_dentry_lru_del(dentry);
 		if (di->lease_session)
-- 
cgit v1.2.3


From ff36fe2c845cab2102e4826c1ffa0a6ebf487c65 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 3 Mar 2011 16:09:14 -0500
Subject: LSM: Pass -o remount options to the LSM

The VFS mount code passes the mount options to the LSM.  The LSM will remove
options it understands from the data and the VFS will then pass the remaining
options onto the underlying filesystem.  This is how options like the
SELinux context= work.  The problem comes in that -o remount never calls
into LSM code.  So if you include an LSM specific option it will get passed
to the filesystem and will cause the remount to fail.  An example of where
this is a problem is the 'seclabel' option.  The SELinux LSM hook will
print this word in /proc/mounts if the filesystem is being labeled using
xattrs.  If you pass this word on mount it will be silently stripped and
ignored.  But if you pass this word on remount the LSM never gets called
and it will be passed to the FS.  The FS doesn't know what seclabel means
and thus should fail the mount.  For example an ext3 fs mounted over loop

# mount -o loop /tmp/fs /mnt/tmp
# cat /proc/mounts | grep /mnt/tmp
/dev/loop0 /mnt/tmp ext3 rw,seclabel,relatime,errors=continue,barrier=0,data=ordered 0 0
# mount -o remount /mnt/tmp
mount: /mnt/tmp not mounted already, or bad option
# dmesg
EXT3-fs (loop0): error: unrecognized mount option "seclabel" or missing value

This patch passes the remount mount options to a new LSM hook.

Signed-off-by: Eric Paris <eparis@redhat.com>
Reviewed-by: James Morris <jmorris@namei.org>
---
 fs/namespace.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 3ddfd9046c4..1b3f2ac59c5 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1800,6 +1800,10 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
 	if (path->dentry != path->mnt->mnt_root)
 		return -EINVAL;
 
+	err = security_sb_remount(sb, data);
+	if (err)
+		return err;
+
 	down_write(&sb->s_umount);
 	if (flags & MS_BIND)
 		err = change_mount_flags(path->mnt, flags);
-- 
cgit v1.2.3


From 425fa41072b7dce3d88f392b335e561a770aa6c3 Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Thu, 3 Mar 2011 22:58:37 +0800
Subject: ext3: Fix an overflow in ext3_trim_fs.

In a bs=4096 volume, if we call FITRIM with the following parameter as
fstrim_range(start = 102400, len = 134144000, minlen = 10240), with the
following code:
if (len >= EXT3_BLOCKS_PER_GROUP(sb))
        len -= (EXT3_BLOCKS_PER_GROUP(sb) - first_block);
else
        last_block = first_block + len;

So if len < EXT3_BLOCKS_PER_GROUP while first_block + len >
EXT3_BLOCKS_PER_GROUP, last_block will be set to an overflow value
which exceeds EXT3_BLOCKS_PER_GROUP.

This patch fixes it and adjusts len and last_block accordingly.

Cc: Lukas Czerner <lczerner@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Tao Ma <boyu.mt@taobao.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/balloc.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index db1906b4e39..153242187fc 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -2127,10 +2127,15 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
 		if (free_blocks < minlen)
 			continue;
 
-		if (len >= EXT3_BLOCKS_PER_GROUP(sb))
-			len -= (EXT3_BLOCKS_PER_GROUP(sb) - first_block);
-		else
+		/*
+		 * For all the groups except the last one, last block will
+		 * always be EXT3_BLOCKS_PER_GROUP(sb), so we only need to
+		 * change it for the last group in which case first_block +
+		 * len < EXT3_BLOCKS_PER_GROUP(sb).
+		 */
+		if (first_block + len < EXT3_BLOCKS_PER_GROUP(sb))
 			last_block = first_block + len;
+		len -= last_block - first_block;
 
 		ret = ext3_trim_all_free(sb, group, first_block,
 					last_block, minlen);
-- 
cgit v1.2.3


From 1858efd471624ecb37e6b5462cab8076f47d1cee Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 Mar 2011 13:14:21 -0500
Subject: minimal fix for do_filp_open() race

failure exits on the no-O_CREAT side of do_filp_open() merge with
those of O_CREAT one; unfortunately, if do_path_lookup() returns
-ESTALE, we'll get to out_filp:, notice that we are about to return
-ESTALE without having tried to create the sucker with LOOKUP_REVAL
and jump right into the O_CREAT side of code.  And proceed to try
and create a file.  Usually that'll fail with -ESTALE again, but
we can race and get that attempt of pathname resolution to succeed.

open() without O_CREAT really shouldn't end up creating files, races
or not.  The real fix is to rearchitect the whole do_filp_open(),
but for now splitting the failure exits will do.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 0087cf9c2c6..a5e844fe4b2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2455,22 +2455,29 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	/* !O_CREAT, simple open */
 	error = do_path_lookup(dfd, pathname, flags, &nd);
 	if (unlikely(error))
-		goto out_filp;
+		goto out_filp2;
 	error = -ELOOP;
 	if (!(nd.flags & LOOKUP_FOLLOW)) {
 		if (nd.inode->i_op->follow_link)
-			goto out_path;
+			goto out_path2;
 	}
 	error = -ENOTDIR;
 	if (nd.flags & LOOKUP_DIRECTORY) {
 		if (!nd.inode->i_op->lookup)
-			goto out_path;
+			goto out_path2;
 	}
 	audit_inode(pathname, nd.path.dentry);
 	filp = finish_open(&nd, open_flag, acc_mode);
+out2:
 	release_open_intent(&nd);
 	return filp;
 
+out_path2:
+	path_put(&nd.path);
+out_filp2:
+	filp = ERR_PTR(error);
+	goto out2;
+
 creat:
 	/* OK, have to create the file. Find the parent. */
 	error = path_init_rcu(dfd, pathname,
-- 
cgit v1.2.3


From 455cec0abff563574cca432ced49f734117ca113 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 3 Mar 2011 13:44:35 -0800
Subject: ceph: no .snap inside of snapped namespace

Otherwise you can do things like

# mkdir .snap/foo
# cd .snap/foo/.snap
# ls
<badness>

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 196fd4c62db..099a58615b9 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -496,6 +496,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 
 	/* .snap dir? */
 	if (err == -ENOENT &&
+	    ceph_snap(parent) == CEPH_NOSNAP &&
 	    strcmp(dentry->d_name.name,
 		   fsc->mount_options->snapdir_name) == 0) {
 		struct inode *inode = ceph_get_snapdir(parent);
-- 
cgit v1.2.3


From e9e3d724e2145f5039b423c290ce2b2c3d8f94bc Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Fri, 4 Mar 2011 19:26:03 -0500
Subject: nfs4: Ensure that ACL pages sent over NFS were not allocated from the
 slab (v3)

The "bad_page()" page allocator sanity check was reported recently (call
chain as follows):

  bad_page+0x69/0x91
  free_hot_cold_page+0x81/0x144
  skb_release_data+0x5f/0x98
  __kfree_skb+0x11/0x1a
  tcp_ack+0x6a3/0x1868
  tcp_rcv_established+0x7a6/0x8b9
  tcp_v4_do_rcv+0x2a/0x2fa
  tcp_v4_rcv+0x9a2/0x9f6
  do_timer+0x2df/0x52c
  ip_local_deliver+0x19d/0x263
  ip_rcv+0x539/0x57c
  netif_receive_skb+0x470/0x49f
  :virtio_net:virtnet_poll+0x46b/0x5c5
  net_rx_action+0xac/0x1b3
  __do_softirq+0x89/0x133
  call_softirq+0x1c/0x28
  do_softirq+0x2c/0x7d
  do_IRQ+0xec/0xf5
  default_idle+0x0/0x50
  ret_from_intr+0x0/0xa
  default_idle+0x29/0x50
  cpu_idle+0x95/0xb8
  start_kernel+0x220/0x225
  _sinittext+0x22f/0x236

It occurs because an skb with a fraglist was freed from the tcp
retransmit queue when it was acked, but a page on that fraglist had
PG_Slab set (indicating it was allocated from the Slab allocator), which
means the free path above can't safely free it via put_page.

We tracked this back to an nfsv4 setacl operation, in which the nfs code
attempted to convert the passed in buffer to an array of pages in
__nfs4_proc_set_acl, which gets used by the skb->frags list in
xs_sendpages.  __nfs4_proc_set_acl just converts each page in the buffer
to a page struct via virt_to_page, but the vfs allocates the buffer via
kmalloc, meaning the PG_slab bit is set.  We can't create a buffer with
kmalloc and free it later in the tcp ack path with put_page, so we need
to either:

1) ensure that when we create the list of pages, no page struct has
   PG_Slab set

 or

2) not use a page list to send this data

Given that these buffers can be multiple pages and arbitrarily sized, I
think (1) is the right way to go.  I've written the below patch to
allocate a page from the buddy allocator directly and copy the data over
to it.  This ensures that we have a put_page free-able page for every
entry that winds up on an skb frag list, so it can be safely freed when
the frame is acked.  We do a put page on each entry after the
rpc_call_sync call so as to drop our own reference count to the page,
leaving only the ref count taken by tcp_sendpages.  This way the data
will be properly freed when the ack comes in.

Successfully tested by myself to solve the above oops.

Note, as this is the result of a setacl operation that exceeded a page
of data, I think this amounts to a local DOS triggerable by an
unprivileged user, so I'm CCing security on this as well.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: Trond Myklebust <Trond.Myklebust@netapp.com>
CC: security@kernel.org
CC: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfs/nfs4proc.c | 44 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 42 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 78936a8f40a..1ff76acc7e9 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -51,6 +51,7 @@
 #include <linux/sunrpc/bc_xprt.h>
 #include <linux/xattr.h>
 #include <linux/utsname.h>
+#include <linux/mm.h>
 
 #include "nfs4_fs.h"
 #include "delegation.h"
@@ -3252,6 +3253,35 @@ static void buf_to_pages(const void *buf, size_t buflen,
 	}
 }
 
+static int buf_to_pages_noslab(const void *buf, size_t buflen,
+		struct page **pages, unsigned int *pgbase)
+{
+	struct page *newpage, **spages;
+	int rc = 0;
+	size_t len;
+	spages = pages;
+
+	do {
+		len = min(PAGE_CACHE_SIZE, buflen);
+		newpage = alloc_page(GFP_KERNEL);
+
+		if (newpage == NULL)
+			goto unwind;
+		memcpy(page_address(newpage), buf, len);
+                buf += len;
+                buflen -= len;
+		*pages++ = newpage;
+		rc++;
+	} while (buflen != 0);
+
+	return rc;
+
+unwind:
+	for(; rc > 0; rc--)
+		__free_page(spages[rc-1]);
+	return -ENOMEM;
+}
+
 struct nfs4_cached_acl {
 	int cached;
 	size_t len;
@@ -3420,13 +3450,23 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
 		.rpc_argp	= &arg,
 		.rpc_resp	= &res,
 	};
-	int ret;
+	int ret, i;
 
 	if (!nfs4_server_supports_acls(server))
 		return -EOPNOTSUPP;
+	i = buf_to_pages_noslab(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
+	if (i < 0)
+		return i;
 	nfs_inode_return_delegation(inode);
-	buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
 	ret = nfs4_call_sync(server, &msg, &arg, &res, 1);
+
+	/*
+	 * Free each page after tx, so the only ref left is
+	 * held by the network stack
+	 */
+	for (; i > 0; i--)
+		put_page(pages[i-1]);
+
 	/*
 	 * Acl update can result in inode attribute update.
 	 * so mark the attribute cache invalid.
-- 
cgit v1.2.3


From ae7eb8979ccfa5e9e888101b9c940f20bd0f4115 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@linux.intel.com>
Date: Sun, 27 Feb 2011 13:58:00 +0000
Subject: fs/locks.c: Remove stale FIXME left over from BKL conversion

The comment is no longer true as (now that the BKL conversion is
finished) a spinlock _is_ now used to protect file_lock_list,
blocked_list and inode->i_flock.

Signed-off-by: Matt Fleming <matt.fleming@linux.intel.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 fs/locks.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/locks.c b/fs/locks.c
index 0f3998291f7..822c3d1843a 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -145,7 +145,6 @@ static DEFINE_SPINLOCK(file_lock_lock);
 
 /*
  * Protects the two list heads above, plus the inode->i_flock list
- * FIXME: should use a spinlock, once lockd and ceph are ready.
  */
 void lock_flocks(void)
 {
-- 
cgit v1.2.3


From 013e4f4a285d8c7d952d8d7be9f10783a85b5d3c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 Mar 2011 01:14:55 -0500
Subject: omfs: rename() needs to mark old_inode dirty after ctime update

we *do* mark it dirty before, but it doesn't guarantee that we
don't get preempted just before assignment to ->i_ctime, with
inode getting written out before we get CPU back...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Bob Copeland <me@bobcopeland.com>
---
 fs/omfs/dir.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index 393f3f659da..9990fc85695 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -423,6 +423,7 @@ static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		goto out;
 
 	old_inode->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(old_inode);
 out:
 	return err;
 }
-- 
cgit v1.2.3


From cdb26496dba00d5c4375261be6518b3e94260444 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 Mar 2011 01:18:19 -0500
Subject: omfs: stop playing silly buggers with omfs_unlink() in ->rename()

Since omfs directories are hashes of inodes and name is part of
inode, we have to remove inode from old directory before we can
put it into new one / under new name.  So instead of
	bump i_nlink
	call omfs_unlink, which does
		omfs_delete_entry()
		decrement i_nlink and mark parent dirty in case of success
	decrement i_nlink if omfs_unlink failed and hadn't done it itself
let's just call omfs_delete_entry() and dirty the parent ourselves...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Bob Copeland <me@bobcopeland.com>
---
 fs/omfs/dir.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index 9990fc85695..a4c2d31b785 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -412,12 +412,11 @@ static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	/* since omfs locates files by name, we need to unlink _before_
 	 * adding the new link or we won't find the old one */
 	inode_inc_link_count(old_inode);
-	err = omfs_unlink(old_dir, old_dentry);
-	if (err) {
-		inode_dec_link_count(old_inode);
+	err = omfs_delete_entry(old_dentry);
+	if (err)
 		goto out;
-	}
 
+	mark_inode_dirty(old_dir);
 	err = omfs_add_link(new_dentry, old_inode);
 	if (err)
 		goto out;
-- 
cgit v1.2.3


From d932805b3dc8c6d80d8948f7d7d0d8336d53b2ed Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 Mar 2011 01:31:03 -0500
Subject: omfs: merge unlink() and rmdir(), close leak in rename()

In case of directory-overwriting rename(), omfs forgot to mark the
victim doomed, so omfs_evict_inode() didn't free it.

We could fix that by calling omfs_rmdir() for directory victims
instead of doing omfs_unlink(), but it's easier to merge omfs_unlink()
and omfs_rmdir() instead.  Note that we have no hardlinks here.

It also makes the checks in omfs_rename() go away - they fold into
what omfs_remove() does when it runs into a directory.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Bob Copeland <me@bobcopeland.com>
---
 fs/omfs/dir.c | 53 +++++++++++++----------------------------------------
 1 file changed, 13 insertions(+), 40 deletions(-)

(limited to 'fs')

diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index a4c2d31b785..fd91f629ceb 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -235,33 +235,22 @@ static int omfs_dir_is_empty(struct inode *inode)
 	return *ptr != ~0;
 }
 
-static int omfs_unlink(struct inode *dir, struct dentry *dentry)
+static int omfs_remove(struct inode *dir, struct dentry *dentry)
 {
-	int ret;
 	struct inode *inode = dentry->d_inode;
+	int ret;
+
+	if (S_ISDIR(inode->i_mode) && !omfs_dir_is_empty(inode))
+		return -ENOTEMPTY;
 
 	ret = omfs_delete_entry(dentry);
 	if (ret)
-		goto end_unlink;
-
-	inode_dec_link_count(inode);
+		return ret;
+	
+	clear_nlink(inode);
+	mark_inode_dirty(inode);
 	mark_inode_dirty(dir);
-
-end_unlink:
-	return ret;
-}
-
-static int omfs_rmdir(struct inode *dir, struct dentry *dentry)
-{
-	int err = -ENOTEMPTY;
-	struct inode *inode = dentry->d_inode;
-
-	if (omfs_dir_is_empty(inode)) {
-		err = omfs_unlink(dir, dentry);
-		if (!err)
-			inode_dec_link_count(inode);
-	}
-	return err;
+	return 0;
 }
 
 static int omfs_add_node(struct inode *dir, struct dentry *dentry, int mode)
@@ -385,33 +374,17 @@ static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 {
 	struct inode *new_inode = new_dentry->d_inode;
 	struct inode *old_inode = old_dentry->d_inode;
-	struct buffer_head *bh;
-	int is_dir;
 	int err;
 
-	is_dir = S_ISDIR(old_inode->i_mode);
-
 	if (new_inode) {
 		/* overwriting existing file/dir */
-		err = -ENOTEMPTY;
-		if (is_dir && !omfs_dir_is_empty(new_inode))
-			goto out;
-
-		err = -ENOENT;
-		bh = omfs_find_entry(new_dir, new_dentry->d_name.name,
-			new_dentry->d_name.len);
-		if (IS_ERR(bh))
-			goto out;
-		brelse(bh);
-
-		err = omfs_unlink(new_dir, new_dentry);
+		err = omfs_remove(new_dir, new_dentry);
 		if (err)
 			goto out;
 	}
 
 	/* since omfs locates files by name, we need to unlink _before_
 	 * adding the new link or we won't find the old one */
-	inode_inc_link_count(old_inode);
 	err = omfs_delete_entry(old_dentry);
 	if (err)
 		goto out;
@@ -488,8 +461,8 @@ const struct inode_operations omfs_dir_inops = {
 	.mkdir = omfs_mkdir,
 	.rename = omfs_rename,
 	.create = omfs_create,
-	.unlink = omfs_unlink,
-	.rmdir = omfs_rmdir,
+	.unlink = omfs_remove,
+	.rmdir = omfs_remove,
 };
 
 const struct file_operations omfs_dir_operations = {
-- 
cgit v1.2.3


From 31be83aeaee22fa165862ad449c7131ceaf1cf91 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 Mar 2011 01:43:36 -0500
Subject: omfs: make readdir stop when filldir says so

filldir returning an error does *not* mean "skip this entry, try the
next one"...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Bob Copeland <me@bobcopeland.com>
---
 fs/omfs/dir.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index fd91f629ceb..de4ff29f1e0 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -361,9 +361,10 @@ static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir,
 
 		res = filldir(dirent, oi->i_name, strnlen(oi->i_name,
 			OMFS_NAMELEN), filp->f_pos, self, d_type);
-		if (res == 0)
-			filp->f_pos++;
 		brelse(bh);
+		if (res < 0)
+			break;
+		filp->f_pos++;
 	}
 out:
 	return res;
-- 
cgit v1.2.3


From 4f10700a2e4bb2ff3d3a80f08412e21109e6d4b5 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 7 Mar 2011 10:00:35 +1100
Subject: xfs: Convert linux-2.6/ files to new logging interface

Convert the files in fs/xfs/linux-2.6/ to use the new xfs_<level>
logging format that replaces the old Irix inherited cmn_err()
interfaces. While there, also convert naked printk calls to use the
relevant xfs logging function to standardise output format.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/linux-2.6/kmem.c       |   9 +--
 fs/xfs/linux-2.6/xfs_aops.c   |   6 +-
 fs/xfs/linux-2.6/xfs_buf.c    |  17 +++---
 fs/xfs/linux-2.6/xfs_super.c  | 127 +++++++++++++++++++-----------------------
 fs/xfs/linux-2.6/xfs_sync.c   |   5 +-
 fs/xfs/linux-2.6/xfs_sysctl.c |   2 +-
 6 files changed, 76 insertions(+), 90 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index 666c9db48eb..a907de565db 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -23,6 +23,7 @@
 #include <linux/backing-dev.h>
 #include "time.h"
 #include "kmem.h"
+#include "xfs_message.h"
 
 /*
  * Greedy allocation.  May fail and may return vmalloced memory.
@@ -56,8 +57,8 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
 		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
 			return ptr;
 		if (!(++retries % 100))
-			printk(KERN_ERR "XFS: possible memory allocation "
-					"deadlock in %s (mode:0x%x)\n",
+			xfs_err(NULL,
+		"possible memory allocation deadlock in %s (mode:0x%x)",
 					__func__, lflags);
 		congestion_wait(BLK_RW_ASYNC, HZ/50);
 	} while (1);
@@ -112,8 +113,8 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
 		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
 			return ptr;
 		if (!(++retries % 100))
-			printk(KERN_ERR "XFS: possible memory allocation "
-					"deadlock in %s (mode:0x%x)\n",
+			xfs_err(NULL,
+		"possible memory allocation deadlock in %s (mode:0x%x)",
 					__func__, lflags);
 		congestion_wait(BLK_RW_ASYNC, HZ/50);
 	} while (1);
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index ec7bbb5645b..8c5c8727745 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -854,7 +854,7 @@ xfs_aops_discard_page(
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		goto out_invalidate;
 
-	xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
+	xfs_alert(ip->i_mount,
 		"page discard on page %p, inode 0x%llx, offset %llu.",
 			page, ip->i_ino, offset);
 
@@ -872,7 +872,7 @@ xfs_aops_discard_page(
 		if (error) {
 			/* something screwed, just bail */
 			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-				xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
+				xfs_alert(ip->i_mount,
 			"page discard unable to remove delalloc mapping.");
 			}
 			break;
@@ -1411,7 +1411,7 @@ xfs_vm_write_failed(
 		if (error) {
 			/* something screwed, just bail */
 			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-				xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
+				xfs_alert(ip->i_mount,
 			"xfs_vm_write_failed: unable to clean up ino %lld",
 						ip->i_ino);
 			}
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index ac1c7e8378d..3cc671c8a67 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -401,9 +401,8 @@ _xfs_buf_lookup_pages(
 			 * handle buffer allocation failures we can't do much.
 			 */
 			if (!(++retries % 100))
-				printk(KERN_ERR
-					"XFS: possible memory allocation "
-					"deadlock in %s (mode:0x%x)\n",
+				xfs_err(NULL,
+		"possible memory allocation deadlock in %s (mode:0x%x)",
 					__func__, gfp_mask);
 
 			XFS_STATS_INC(xb_page_retries);
@@ -615,8 +614,8 @@ xfs_buf_get(
 	if (!(bp->b_flags & XBF_MAPPED)) {
 		error = _xfs_buf_map_pages(bp, flags);
 		if (unlikely(error)) {
-			printk(KERN_WARNING "%s: failed to map pages\n",
-					__func__);
+			xfs_warn(target->bt_mount,
+				"%s: failed to map pages\n", __func__);
 			goto no_buffer;
 		}
 	}
@@ -850,8 +849,8 @@ xfs_buf_get_uncached(
 
 	error = _xfs_buf_map_pages(bp, XBF_MAPPED);
 	if (unlikely(error)) {
-		printk(KERN_WARNING "%s: failed to map pages\n",
-				__func__);
+		xfs_warn(target->bt_mount,
+			"%s: failed to map pages\n", __func__);
 		goto fail_free_mem;
 	}
 
@@ -1617,8 +1616,8 @@ xfs_setsize_buftarg_flags(
 	btp->bt_smask = sectorsize - 1;
 
 	if (set_blocksize(btp->bt_bdev, sectorsize)) {
-		printk(KERN_WARNING
-			"XFS: Cannot set_blocksize to %u on device %s\n",
+		xfs_warn(btp->bt_mount,
+			"Cannot set_blocksize to %u on device %s\n",
 			sectorsize, XFS_BUFTARG_NAME(btp));
 		return EINVAL;
 	}
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 7ec1fb8c131..818c4cf2de8 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -172,6 +172,15 @@ xfs_parseargs(
 	int			iosize = 0;
 	__uint8_t		iosizelog = 0;
 
+	/*
+	 * set up the mount name first so all the errors will refer to the
+	 * correct device.
+	 */
+	mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
+	if (!mp->m_fsname)
+		return ENOMEM;
+	mp->m_fsname_len = strlen(mp->m_fsname) + 1;
+
 	/*
 	 * Copy binary VFS mount flags we are interested in.
 	 */
@@ -208,24 +217,21 @@ xfs_parseargs(
 
 		if (!strcmp(this_char, MNTOPT_LOGBUFS)) {
 			if (!value || !*value) {
-				cmn_err(CE_WARN,
-					"XFS: %s option requires an argument",
+				xfs_warn(mp, "%s option requires an argument",
 					this_char);
 				return EINVAL;
 			}
 			mp->m_logbufs = simple_strtoul(value, &eov, 10);
 		} else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
 			if (!value || !*value) {
-				cmn_err(CE_WARN,
-					"XFS: %s option requires an argument",
+				xfs_warn(mp, "%s option requires an argument",
 					this_char);
 				return EINVAL;
 			}
 			mp->m_logbsize = suffix_strtoul(value, &eov, 10);
 		} else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
 			if (!value || !*value) {
-				cmn_err(CE_WARN,
-					"XFS: %s option requires an argument",
+				xfs_warn(mp, "%s option requires an argument",
 					this_char);
 				return EINVAL;
 			}
@@ -233,14 +239,12 @@ xfs_parseargs(
 			if (!mp->m_logname)
 				return ENOMEM;
 		} else if (!strcmp(this_char, MNTOPT_MTPT)) {
-			cmn_err(CE_WARN,
-				"XFS: %s option not allowed on this system",
+			xfs_warn(mp, "%s option not allowed on this system",
 				this_char);
 			return EINVAL;
 		} else if (!strcmp(this_char, MNTOPT_RTDEV)) {
 			if (!value || !*value) {
-				cmn_err(CE_WARN,
-					"XFS: %s option requires an argument",
+				xfs_warn(mp, "%s option requires an argument",
 					this_char);
 				return EINVAL;
 			}
@@ -249,8 +253,7 @@ xfs_parseargs(
 				return ENOMEM;
 		} else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
 			if (!value || !*value) {
-				cmn_err(CE_WARN,
-					"XFS: %s option requires an argument",
+				xfs_warn(mp, "%s option requires an argument",
 					this_char);
 				return EINVAL;
 			}
@@ -258,8 +261,7 @@ xfs_parseargs(
 			iosizelog = ffs(iosize) - 1;
 		} else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
 			if (!value || !*value) {
-				cmn_err(CE_WARN,
-					"XFS: %s option requires an argument",
+				xfs_warn(mp, "%s option requires an argument",
 					this_char);
 				return EINVAL;
 			}
@@ -281,16 +283,14 @@ xfs_parseargs(
 			mp->m_flags |= XFS_MOUNT_SWALLOC;
 		} else if (!strcmp(this_char, MNTOPT_SUNIT)) {
 			if (!value || !*value) {
-				cmn_err(CE_WARN,
-					"XFS: %s option requires an argument",
+				xfs_warn(mp, "%s option requires an argument",
 					this_char);
 				return EINVAL;
 			}
 			dsunit = simple_strtoul(value, &eov, 10);
 		} else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
 			if (!value || !*value) {
-				cmn_err(CE_WARN,
-					"XFS: %s option requires an argument",
+				xfs_warn(mp, "%s option requires an argument",
 					this_char);
 				return EINVAL;
 			}
@@ -298,8 +298,7 @@ xfs_parseargs(
 		} else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
 			mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
 #if !XFS_BIG_INUMS
-			cmn_err(CE_WARN,
-				"XFS: %s option not allowed on this system",
+			xfs_warn(mp, "%s option not allowed on this system",
 				this_char);
 			return EINVAL;
 #endif
@@ -357,20 +356,19 @@ xfs_parseargs(
 		} else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
 			mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
 		} else if (!strcmp(this_char, "ihashsize")) {
-			cmn_err(CE_WARN,
-	"XFS: ihashsize no longer used, option is deprecated.");
+			xfs_warn(mp,
+	"ihashsize no longer used, option is deprecated.");
 		} else if (!strcmp(this_char, "osyncisdsync")) {
-			cmn_err(CE_WARN,
-	"XFS: osyncisdsync has no effect, option is deprecated.");
+			xfs_warn(mp,
+	"osyncisdsync has no effect, option is deprecated.");
 		} else if (!strcmp(this_char, "osyncisosync")) {
-			cmn_err(CE_WARN,
-	"XFS: osyncisosync has no effect, option is deprecated.");
+			xfs_warn(mp,
+	"osyncisosync has no effect, option is deprecated.");
 		} else if (!strcmp(this_char, "irixsgid")) {
-			cmn_err(CE_WARN,
-	"XFS: irixsgid is now a sysctl(2) variable, option is deprecated.");
+			xfs_warn(mp,
+	"irixsgid is now a sysctl(2) variable, option is deprecated.");
 		} else {
-			cmn_err(CE_WARN,
-				"XFS: unknown mount option [%s].", this_char);
+			xfs_warn(mp, "unknown mount option [%s].", this_char);
 			return EINVAL;
 		}
 	}
@@ -380,40 +378,37 @@ xfs_parseargs(
 	 */
 	if ((mp->m_flags & XFS_MOUNT_NORECOVERY) &&
 	    !(mp->m_flags & XFS_MOUNT_RDONLY)) {
-		cmn_err(CE_WARN, "XFS: no-recovery mounts must be read-only.");
+		xfs_warn(mp, "no-recovery mounts must be read-only.");
 		return EINVAL;
 	}
 
 	if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) {
-		cmn_err(CE_WARN,
-	"XFS: sunit and swidth options incompatible with the noalign option");
+		xfs_warn(mp,
+	"sunit and swidth options incompatible with the noalign option");
 		return EINVAL;
 	}
 
 #ifndef CONFIG_XFS_QUOTA
 	if (XFS_IS_QUOTA_RUNNING(mp)) {
-		cmn_err(CE_WARN,
-			"XFS: quota support not available in this kernel.");
+		xfs_warn(mp, "quota support not available in this kernel.");
 		return EINVAL;
 	}
 #endif
 
 	if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
 	    (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) {
-		cmn_err(CE_WARN,
-			"XFS: cannot mount with both project and group quota");
+		xfs_warn(mp, "cannot mount with both project and group quota");
 		return EINVAL;
 	}
 
 	if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
-		cmn_err(CE_WARN,
-			"XFS: sunit and swidth must be specified together");
+		xfs_warn(mp, "sunit and swidth must be specified together");
 		return EINVAL;
 	}
 
 	if (dsunit && (dswidth % dsunit != 0)) {
-		cmn_err(CE_WARN,
-	"XFS: stripe width (%d) must be a multiple of the stripe unit (%d)",
+		xfs_warn(mp,
+	"stripe width (%d) must be a multiple of the stripe unit (%d)",
 			dswidth, dsunit);
 		return EINVAL;
 	}
@@ -439,8 +434,7 @@ done:
 	    mp->m_logbufs != 0 &&
 	    (mp->m_logbufs < XLOG_MIN_ICLOGS ||
 	     mp->m_logbufs > XLOG_MAX_ICLOGS)) {
-		cmn_err(CE_WARN,
-			"XFS: invalid logbufs value: %d [not %d-%d]",
+		xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]",
 			mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
 		return XFS_ERROR(EINVAL);
 	}
@@ -449,22 +443,16 @@ done:
 	    (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE ||
 	     mp->m_logbsize > XLOG_MAX_RECORD_BSIZE ||
 	     !is_power_of_2(mp->m_logbsize))) {
-		cmn_err(CE_WARN,
-	"XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
+		xfs_warn(mp,
+			"invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
 			mp->m_logbsize);
 		return XFS_ERROR(EINVAL);
 	}
 
-	mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
-	if (!mp->m_fsname)
-		return ENOMEM;
-	mp->m_fsname_len = strlen(mp->m_fsname) + 1;
-
 	if (iosizelog) {
 		if (iosizelog > XFS_MAX_IO_LOG ||
 		    iosizelog < XFS_MIN_IO_LOG) {
-			cmn_err(CE_WARN,
-		"XFS: invalid log iosize: %d [not %d-%d]",
+			xfs_warn(mp, "invalid log iosize: %d [not %d-%d]",
 				iosizelog, XFS_MIN_IO_LOG,
 				XFS_MAX_IO_LOG);
 			return XFS_ERROR(EINVAL);
@@ -611,7 +599,7 @@ xfs_blkdev_get(
 				    mp);
 	if (IS_ERR(*bdevp)) {
 		error = PTR_ERR(*bdevp);
-		printk("XFS: Invalid device [%s], error=%d\n", name, error);
+		xfs_warn(mp, "Invalid device [%s], error=%d\n", name, error);
 	}
 
 	return -error;
@@ -665,23 +653,23 @@ xfs_mountfs_check_barriers(xfs_mount_t *mp)
 	int error;
 
 	if (mp->m_logdev_targp != mp->m_ddev_targp) {
-		xfs_fs_cmn_err(CE_NOTE, mp,
+		xfs_notice(mp,
 		  "Disabling barriers, not supported with external log device");
 		mp->m_flags &= ~XFS_MOUNT_BARRIER;
 		return;
 	}
 
 	if (xfs_readonly_buftarg(mp->m_ddev_targp)) {
-		xfs_fs_cmn_err(CE_NOTE, mp,
-		  "Disabling barriers, underlying device is readonly");
+		xfs_notice(mp,
+			"Disabling barriers, underlying device is readonly");
 		mp->m_flags &= ~XFS_MOUNT_BARRIER;
 		return;
 	}
 
 	error = xfs_barrier_test(mp);
 	if (error) {
-		xfs_fs_cmn_err(CE_NOTE, mp,
-		  "Disabling barriers, trial barrier write failed");
+		xfs_notice(mp,
+			"Disabling barriers, trial barrier write failed");
 		mp->m_flags &= ~XFS_MOUNT_BARRIER;
 		return;
 	}
@@ -744,8 +732,8 @@ xfs_open_devices(
 			goto out_close_logdev;
 
 		if (rtdev == ddev || rtdev == logdev) {
-			cmn_err(CE_WARN,
-	"XFS: Cannot mount filesystem with identical rtdev and ddev/logdev.");
+			xfs_warn(mp,
+	"Cannot mount filesystem with identical rtdev and ddev/logdev.");
 			error = EINVAL;
 			goto out_close_rtdev;
 		}
@@ -1346,8 +1334,8 @@ xfs_fs_remount(
 			 * options that we can't actually change.
 			 */
 #if 0
-			printk(KERN_INFO
-	"XFS: mount option \"%s\" not supported for remount\n", p);
+			xfs_info(mp,
+		"mount option \"%s\" not supported for remount\n", p);
 			return -EINVAL;
 #else
 			break;
@@ -1368,8 +1356,7 @@ xfs_fs_remount(
 		if (mp->m_update_flags) {
 			error = xfs_mount_log_sb(mp, mp->m_update_flags);
 			if (error) {
-				cmn_err(CE_WARN,
-					"XFS: failed to write sb changes");
+				xfs_warn(mp, "failed to write sb changes");
 				return error;
 			}
 			mp->m_update_flags = 0;
@@ -1453,15 +1440,15 @@ xfs_finish_flags(
 			mp->m_logbsize = mp->m_sb.sb_logsunit;
 		} else if (mp->m_logbsize > 0 &&
 			   mp->m_logbsize < mp->m_sb.sb_logsunit) {
-			cmn_err(CE_WARN,
-	"XFS: logbuf size must be greater than or equal to log stripe size");
+			xfs_warn(mp,
+		"logbuf size must be greater than or equal to log stripe size");
 			return XFS_ERROR(EINVAL);
 		}
 	} else {
 		/* Fail a mount if the logbuf is larger than 32K */
 		if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) {
-			cmn_err(CE_WARN,
-	"XFS: logbuf size for version 1 logs must be 16K or 32K");
+			xfs_warn(mp,
+		"logbuf size for version 1 logs must be 16K or 32K");
 			return XFS_ERROR(EINVAL);
 		}
 	}
@@ -1478,8 +1465,8 @@ xfs_finish_flags(
 	 * prohibit r/w mounts of read-only filesystems
 	 */
 	if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) {
-		cmn_err(CE_WARN,
-	"XFS: cannot mount a read-only filesystem as read-write");
+		xfs_warn(mp,
+			"cannot mount a read-only filesystem as read-write");
 		return XFS_ERROR(EROFS);
 	}
 
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index e22f0057d21..6c10f1d2e3d 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -425,8 +425,7 @@ xfs_quiesce_attr(
 	/* Push the superblock and write an unmount record */
 	error = xfs_log_sbcount(mp, 1);
 	if (error)
-		xfs_fs_cmn_err(CE_WARN, mp,
-				"xfs_attr_quiesce: failed to log sb changes. "
+		xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
 				"Frozen image may not be consistent.");
 	xfs_log_unmount_write(mp);
 	xfs_unmountfs_writesb(mp);
@@ -806,7 +805,7 @@ xfs_reclaim_inode(
 	 * pass on the error.
 	 */
 	if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
+		xfs_warn(ip->i_mount,
 			"inode 0x%llx background reclaim flush failed with %d",
 			(long long)ip->i_ino, error);
 	}
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index ee3cee097e7..ee2d2adaa43 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -37,7 +37,7 @@ xfs_stats_clear_proc_handler(
 	ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
 
 	if (!ret && write && *valp) {
-		printk("XFS Clearing xfsstats\n");
+		xfs_notice(NULL, "Clearing xfsstats");
 		for_each_possible_cpu(c) {
 			preempt_disable();
 			/* save vn_active, it's a universal truth! */
-- 
cgit v1.2.3


From a0fa2b679ecd15b4bdbb46cd2420b6affde91cf9 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 7 Mar 2011 10:01:35 +1100
Subject: xfs: Convert xlog_warn to new logging interface

Convert the xfs log operations to use the new error logging
interfaces. This removes the xlog_{warn,panic} wrappers and makes
almost all errors emit the device they belong to instead of just
referring to "XFS".

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/quota/xfs_dquot.c |  10 ++-
 fs/xfs/quota/xfs_qm.c    |   2 +-
 fs/xfs/xfs_log.c         | 124 +++++++++++++-------------
 fs/xfs/xfs_log_priv.h    |   4 -
 fs/xfs/xfs_log_recover.c | 223 +++++++++++++++++++++++------------------------
 fs/xfs/xfs_quota.h       |   3 +-
 6 files changed, 177 insertions(+), 189 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index d22aa310310..773adc80d6b 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -544,9 +544,10 @@ xfs_qm_dqtobp(
 	/*
 	 * A simple sanity check in case we got a corrupted dquot...
 	 */
-	if (xfs_qm_dqcheck(ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES,
+	error = xfs_qm_dqcheck(mp, ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES,
 			   flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN),
-			   "dqtobp")) {
+			   "dqtobp");
+	if (error) {
 		if (!(flags & XFS_QMOPT_DQREPAIR)) {
 			xfs_trans_brelse(tp, bp);
 			return XFS_ERROR(EIO);
@@ -1207,8 +1208,9 @@ xfs_qm_dqflush(
 	/*
 	 * A simple sanity check in case we got a corrupted dquot..
 	 */
-	if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id), 0,
-			   XFS_QMOPT_DOWARN, "dqflush (incore copy)")) {
+	error = xfs_qm_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0,
+			   XFS_QMOPT_DOWARN, "dqflush (incore copy)");
+	if (error) {
 		xfs_buf_relse(bp);
 		xfs_dqfunlock(dqp);
 		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index f517963aec0..dd5cc5738a2 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -1292,7 +1292,7 @@ xfs_qm_reset_dqcounts(
 		 * output any warnings because it's perfectly possible to
 		 * find uninitialised dquot blks. See comment in xfs_qm_dqcheck.
 		 */
-		(void) xfs_qm_dqcheck(ddq, id+j, type, XFS_QMOPT_DQREPAIR,
+		(void) xfs_qm_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR,
 				      "xfs_quotacheck");
 		ddq->d_bcount = 0;
 		ddq->d_icount = 0;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index ae6fef1ff56..25efa9b8a60 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -374,11 +374,10 @@ xfs_log_mount(
 	int		error;
 
 	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
-		cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname);
+		xfs_notice(mp, "Mounting Filesystem");
 	else {
-		cmn_err(CE_NOTE,
-			"Mounting filesystem \"%s\" in no-recovery mode.  Filesystem will be inconsistent.",
-			mp->m_fsname);
+		xfs_notice(mp,
+"Mounting filesystem in no-recovery mode.  Filesystem will be inconsistent.");
 		ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
 	}
 
@@ -393,7 +392,7 @@ xfs_log_mount(
 	 */
 	error = xfs_trans_ail_init(mp);
 	if (error) {
-		cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error);
+		xfs_warn(mp, "AIL initialisation failed: error %d", error);
 		goto out_free_log;
 	}
 	mp->m_log->l_ailp = mp->m_ail;
@@ -413,7 +412,8 @@ xfs_log_mount(
 		if (readonly)
 			mp->m_flags |= XFS_MOUNT_RDONLY;
 		if (error) {
-			cmn_err(CE_WARN, "XFS: log mount/recovery failed: error %d", error);
+			xfs_warn(mp, "log mount/recovery failed: error %d",
+				error);
 			goto out_destroy_ail;
 		}
 	}
@@ -542,10 +542,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 			 */
 		}
 
-		if (error) {
-			xfs_fs_cmn_err(CE_ALERT, mp,
-				"xfs_log_unmount: unmount record failed");
-		}
+		if (error)
+			xfs_alert(mp, "%s: unmount record failed", __func__);
 
 
 		spin_lock(&log->l_icloglock);
@@ -852,7 +850,7 @@ xlog_space_left(
 		 * In this case we just want to return the size of the
 		 * log as the amount of space left.
 		 */
-		xfs_fs_cmn_err(CE_ALERT, log->l_mp,
+		xfs_alert(log->l_mp,
 			"xlog_space_left: head behind tail\n"
 			"  tail_cycle = %d, tail_bytes = %d\n"
 			"  GH   cycle = %d, GH   bytes = %d",
@@ -1001,7 +999,7 @@ xlog_alloc_log(xfs_mount_t	*mp,
 
 	log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL);
 	if (!log) {
-		xlog_warn("XFS: Log allocation failed: No memory!");
+		xfs_warn(mp, "Log allocation failed: No memory!");
 		goto out;
 	}
 
@@ -1029,24 +1027,24 @@ xlog_alloc_log(xfs_mount_t	*mp,
 	if (xfs_sb_version_hassector(&mp->m_sb)) {
 	        log2_size = mp->m_sb.sb_logsectlog;
 		if (log2_size < BBSHIFT) {
-			xlog_warn("XFS: Log sector size too small "
-				"(0x%x < 0x%x)", log2_size, BBSHIFT);
+			xfs_warn(mp, "Log sector size too small (0x%x < 0x%x)",
+				log2_size, BBSHIFT);
 			goto out_free_log;
 		}
 
 	        log2_size -= BBSHIFT;
 		if (log2_size > mp->m_sectbb_log) {
-			xlog_warn("XFS: Log sector size too large "
-				"(0x%x > 0x%x)", log2_size, mp->m_sectbb_log);
+			xfs_warn(mp, "Log sector size too large (0x%x > 0x%x)",
+				log2_size, mp->m_sectbb_log);
 			goto out_free_log;
 		}
 
 		/* for larger sector sizes, must have v2 or external log */
 		if (log2_size && log->l_logBBstart > 0 &&
 			    !xfs_sb_version_haslogv2(&mp->m_sb)) {
-
-			xlog_warn("XFS: log sector size (0x%x) invalid "
-				  "for configuration.", log2_size);
+			xfs_warn(mp,
+		"log sector size (0x%x) invalid for configuration.",
+				log2_size);
 			goto out_free_log;
 		}
 	}
@@ -1563,38 +1561,36 @@ xlog_print_tic_res(
 	    "SWAPEXT"
 	};
 
-	xfs_fs_cmn_err(CE_WARN, mp,
-			"xfs_log_write: reservation summary:\n"
-			"  trans type  = %s (%u)\n"
-			"  unit res    = %d bytes\n"
-			"  current res = %d bytes\n"
-			"  total reg   = %u bytes (o/flow = %u bytes)\n"
-			"  ophdrs      = %u (ophdr space = %u bytes)\n"
-			"  ophdr + reg = %u bytes\n"
-			"  num regions = %u\n",
-			((ticket->t_trans_type <= 0 ||
-			  ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
-			  "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]),
-			ticket->t_trans_type,
-			ticket->t_unit_res,
-			ticket->t_curr_res,
-			ticket->t_res_arr_sum, ticket->t_res_o_flow,
-			ticket->t_res_num_ophdrs, ophdr_spc,
-			ticket->t_res_arr_sum + 
-			ticket->t_res_o_flow + ophdr_spc,
-			ticket->t_res_num);
+	xfs_warn(mp,
+		"xfs_log_write: reservation summary:\n"
+		"  trans type  = %s (%u)\n"
+		"  unit res    = %d bytes\n"
+		"  current res = %d bytes\n"
+		"  total reg   = %u bytes (o/flow = %u bytes)\n"
+		"  ophdrs      = %u (ophdr space = %u bytes)\n"
+		"  ophdr + reg = %u bytes\n"
+		"  num regions = %u\n",
+		((ticket->t_trans_type <= 0 ||
+		  ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
+		  "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]),
+		ticket->t_trans_type,
+		ticket->t_unit_res,
+		ticket->t_curr_res,
+		ticket->t_res_arr_sum, ticket->t_res_o_flow,
+		ticket->t_res_num_ophdrs, ophdr_spc,
+		ticket->t_res_arr_sum +
+		ticket->t_res_o_flow + ophdr_spc,
+		ticket->t_res_num);
 
 	for (i = 0; i < ticket->t_res_num; i++) {
-		uint r_type = ticket->t_res_arr[i].r_type; 
-		cmn_err(CE_WARN,
-			    "region[%u]: %s - %u bytes\n",
-			    i, 
+		uint r_type = ticket->t_res_arr[i].r_type;
+		xfs_warn(mp, "region[%u]: %s - %u bytes\n", i,
 			    ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ?
 			    "bad-rtype" : res_type_str[r_type-1]),
 			    ticket->t_res_arr[i].r_len);
 	}
 
-	xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
+	xfs_alert_tag(mp, XFS_PTAG_LOGRES,
 		"xfs_log_write: reservation ran out. Need to up reservation");
 	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 }
@@ -1682,7 +1678,7 @@ xlog_write_setup_ophdr(
 	case XFS_LOG:
 		break;
 	default:
-		xfs_fs_cmn_err(CE_WARN, log->l_mp,
+		xfs_warn(log->l_mp,
 			"Bad XFS transaction clientid 0x%x in ticket 0x%p",
 			ophdr->oh_clientid, ticket);
 		return NULL;
@@ -2264,7 +2260,7 @@ xlog_state_do_callback(
 		if (repeats > 5000) {
 			flushcnt += repeats;
 			repeats = 0;
-			xfs_fs_cmn_err(CE_WARN, log->l_mp,
+			xfs_warn(log->l_mp,
 				"%s: possible infinite loop (%d iterations)",
 				__func__, flushcnt);
 		}
@@ -3052,10 +3048,8 @@ xfs_log_force(
 	int	error;
 
 	error = _xfs_log_force(mp, flags, NULL);
-	if (error) {
-		xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: "
-			"error %d returned.", error);
-	}
+	if (error)
+		xfs_warn(mp, "%s: error %d returned.", __func__, error);
 }
 
 /*
@@ -3204,10 +3198,8 @@ xfs_log_force_lsn(
 	int	error;
 
 	error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
-	if (error) {
-		xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: "
-			"error %d returned.", error);
-	}
+	if (error)
+		xfs_warn(mp, "%s: error %d returned.", __func__, error);
 }
 
 /*
@@ -3412,7 +3404,7 @@ xlog_verify_dest_ptr(
 	}
 
 	if (!good_ptr)
-		xlog_panic("xlog_verify_dest_ptr: invalid ptr");
+		xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
 }
 
 STATIC void
@@ -3448,16 +3440,16 @@ xlog_verify_tail_lsn(xlog_t	    *log,
 	blocks =
 	    log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn));
 	if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize))
-	    xlog_panic("xlog_verify_tail_lsn: ran out of log space");
+		xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
     } else {
 	ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle);
 
 	if (BLOCK_LSN(tail_lsn) == log->l_prev_block)
-	    xlog_panic("xlog_verify_tail_lsn: tail wrapped");
+		xfs_emerg(log->l_mp, "%s: tail wrapped", __func__);
 
 	blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block;
 	if (blocks < BTOBB(iclog->ic_offset) + 1)
-	    xlog_panic("xlog_verify_tail_lsn: ran out of log space");
+		xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
     }
 }	/* xlog_verify_tail_lsn */
 
@@ -3497,22 +3489,23 @@ xlog_verify_iclog(xlog_t	 *log,
 	icptr = log->l_iclog;
 	for (i=0; i < log->l_iclog_bufs; i++) {
 		if (icptr == NULL)
-			xlog_panic("xlog_verify_iclog: invalid ptr");
+			xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
 		icptr = icptr->ic_next;
 	}
 	if (icptr != log->l_iclog)
-		xlog_panic("xlog_verify_iclog: corrupt iclog ring");
+		xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__);
 	spin_unlock(&log->l_icloglock);
 
 	/* check log magic numbers */
 	if (be32_to_cpu(iclog->ic_header.h_magicno) != XLOG_HEADER_MAGIC_NUM)
-		xlog_panic("xlog_verify_iclog: invalid magic num");
+		xfs_emerg(log->l_mp, "%s: invalid magic num", __func__);
 
 	ptr = (xfs_caddr_t) &iclog->ic_header;
 	for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count;
 	     ptr += BBSIZE) {
 		if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM)
-			xlog_panic("xlog_verify_iclog: unexpected magic num");
+			xfs_emerg(log->l_mp, "%s: unexpected magic num",
+				__func__);
 	}
 
 	/* check fields */
@@ -3542,9 +3535,10 @@ xlog_verify_iclog(xlog_t	 *log,
 			}
 		}
 		if (clientid != XFS_TRANSACTION && clientid != XFS_LOG)
-			cmn_err(CE_WARN, "xlog_verify_iclog: "
-				"invalid clientid %d op 0x%p offset 0x%lx",
-				clientid, ophead, (unsigned long)field_offset);
+			xfs_warn(log->l_mp,
+				"%s: invalid clientid %d op 0x%p offset 0x%lx",
+				__func__, clientid, ophead,
+				(unsigned long)field_offset);
 
 		/* check length */
 		field_offset = (__psint_t)
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index d5f8be8f4bf..15dbf1f9c2b 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -87,10 +87,6 @@ static inline uint xlog_get_client_id(__be32 i)
 	return be32_to_cpu(i) >> 24;
 }
 
-#define xlog_panic(args...)	cmn_err(CE_PANIC, ## args)
-#define xlog_exit(args...)	cmn_err(CE_PANIC, ## args)
-#define xlog_warn(args...)	cmn_err(CE_WARN, ## args)
-
 /*
  * In core log state
  */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index aa0ebb77690..0c4a5618e7a 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -92,7 +92,7 @@ xlog_get_bp(
 	int		nbblks)
 {
 	if (!xlog_buf_bbcount_valid(log, nbblks)) {
-		xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
+		xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
 			nbblks);
 		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
 		return NULL;
@@ -160,7 +160,7 @@ xlog_bread_noalign(
 	int		error;
 
 	if (!xlog_buf_bbcount_valid(log, nbblks)) {
-		xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
+		xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
 			nbblks);
 		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
 		return EFSCORRUPTED;
@@ -219,7 +219,7 @@ xlog_bwrite(
 	int		error;
 
 	if (!xlog_buf_bbcount_valid(log, nbblks)) {
-		xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
+		xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
 			nbblks);
 		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
 		return EFSCORRUPTED;
@@ -254,9 +254,9 @@ xlog_header_check_dump(
 	xfs_mount_t		*mp,
 	xlog_rec_header_t	*head)
 {
-	cmn_err(CE_DEBUG, "%s:  SB : uuid = %pU, fmt = %d\n",
+	xfs_debug(mp, "%s:  SB : uuid = %pU, fmt = %d\n",
 		__func__, &mp->m_sb.sb_uuid, XLOG_FMT);
-	cmn_err(CE_DEBUG, "    log : uuid = %pU, fmt = %d\n",
+	xfs_debug(mp, "    log : uuid = %pU, fmt = %d\n",
 		&head->h_fs_uuid, be32_to_cpu(head->h_fmt));
 }
 #else
@@ -279,15 +279,15 @@ xlog_header_check_recover(
 	 * a dirty log created in IRIX.
 	 */
 	if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) {
-		xlog_warn(
-	"XFS: dirty log written in incompatible format - can't recover");
+		xfs_warn(mp,
+	"dirty log written in incompatible format - can't recover");
 		xlog_header_check_dump(mp, head);
 		XFS_ERROR_REPORT("xlog_header_check_recover(1)",
 				 XFS_ERRLEVEL_HIGH, mp);
 		return XFS_ERROR(EFSCORRUPTED);
 	} else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
-		xlog_warn(
-	"XFS: dirty log entry has mismatched uuid - can't recover");
+		xfs_warn(mp,
+	"dirty log entry has mismatched uuid - can't recover");
 		xlog_header_check_dump(mp, head);
 		XFS_ERROR_REPORT("xlog_header_check_recover(2)",
 				 XFS_ERRLEVEL_HIGH, mp);
@@ -312,9 +312,9 @@ xlog_header_check_mount(
 		 * h_fs_uuid is nil, we assume this log was last mounted
 		 * by IRIX and continue.
 		 */
-		xlog_warn("XFS: nil uuid in log - IRIX style log");
+		xfs_warn(mp, "nil uuid in log - IRIX style log");
 	} else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
-		xlog_warn("XFS: log has mismatched uuid - can't recover");
+		xfs_warn(mp, "log has mismatched uuid - can't recover");
 		xlog_header_check_dump(mp, head);
 		XFS_ERROR_REPORT("xlog_header_check_mount",
 				 XFS_ERRLEVEL_HIGH, mp);
@@ -490,8 +490,8 @@ xlog_find_verify_log_record(
 	for (i = (*last_blk) - 1; i >= 0; i--) {
 		if (i < start_blk) {
 			/* valid log record not found */
-			xlog_warn(
-		"XFS: Log inconsistent (didn't find previous header)");
+			xfs_warn(log->l_mp,
+		"Log inconsistent (didn't find previous header)");
 			ASSERT(0);
 			error = XFS_ERROR(EIO);
 			goto out;
@@ -591,12 +591,12 @@ xlog_find_head(
 			 * mkfs etc write a dummy unmount record to a fresh
 			 * log so we can store the uuid in there
 			 */
-			xlog_warn("XFS: totally zeroed log");
+			xfs_warn(log->l_mp, "totally zeroed log");
 		}
 
 		return 0;
 	} else if (error) {
-		xlog_warn("XFS: empty log check failed");
+		xfs_warn(log->l_mp, "empty log check failed");
 		return error;
 	}
 
@@ -819,7 +819,7 @@ validate_head:
 	xlog_put_bp(bp);
 
 	if (error)
-	    xlog_warn("XFS: failed to find log head");
+		xfs_warn(log->l_mp, "failed to find log head");
 	return error;
 }
 
@@ -912,7 +912,7 @@ xlog_find_tail(
 		}
 	}
 	if (!found) {
-		xlog_warn("XFS: xlog_find_tail: couldn't find sync record");
+		xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
 		ASSERT(0);
 		return XFS_ERROR(EIO);
 	}
@@ -1028,7 +1028,7 @@ done:
 	xlog_put_bp(bp);
 
 	if (error)
-		xlog_warn("XFS: failed to locate log tail");
+		xfs_warn(log->l_mp, "failed to locate log tail");
 	return error;
 }
 
@@ -1092,7 +1092,8 @@ xlog_find_zeroed(
 		 * the first block must be 1. If it's not, maybe we're
 		 * not looking at a log... Bail out.
 		 */
-		xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)");
+		xfs_warn(log->l_mp,
+			"Log inconsistent or not a log (last==0, first!=1)");
 		return XFS_ERROR(EINVAL);
 	}
 
@@ -1506,8 +1507,8 @@ xlog_recover_add_to_trans(
 	if (list_empty(&trans->r_itemq)) {
 		/* we need to catch log corruptions here */
 		if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
-			xlog_warn("XFS: xlog_recover_add_to_trans: "
-				  "bad header magic number");
+			xfs_warn(log->l_mp, "%s: bad header magic number",
+				__func__);
 			ASSERT(0);
 			return XFS_ERROR(EIO);
 		}
@@ -1534,8 +1535,8 @@ xlog_recover_add_to_trans(
 	if (item->ri_total == 0) {		/* first region to be added */
 		if (in_f->ilf_size == 0 ||
 		    in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
-			xlog_warn(
-	"XFS: bad number of regions (%d) in inode log format",
+			xfs_warn(log->l_mp,
+		"bad number of regions (%d) in inode log format",
 				  in_f->ilf_size);
 			ASSERT(0);
 			return XFS_ERROR(EIO);
@@ -1592,8 +1593,9 @@ xlog_recover_reorder_trans(
 			list_move_tail(&item->ri_list, &trans->r_itemq);
 			break;
 		default:
-			xlog_warn(
-	"XFS: xlog_recover_reorder_trans: unrecognized type of log operation");
+			xfs_warn(log->l_mp,
+				"%s: unrecognized type of log operation",
+				__func__);
 			ASSERT(0);
 			return XFS_ERROR(EIO);
 		}
@@ -1803,8 +1805,9 @@ xlog_recover_do_inode_buffer(
 		logged_nextp = item->ri_buf[item_index].i_addr +
 				next_unlinked_offset - reg_buf_offset;
 		if (unlikely(*logged_nextp == 0)) {
-			xfs_fs_cmn_err(CE_ALERT, mp,
-				"bad inode buffer log record (ptr = 0x%p, bp = 0x%p).  XFS trying to replay bad (0) inode di_next_unlinked field",
+			xfs_alert(mp,
+		"Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). "
+		"Trying to replay bad (0) inode di_next_unlinked field.",
 				item, bp);
 			XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
 					 XFS_ERRLEVEL_LOW, mp);
@@ -1863,17 +1866,17 @@ xlog_recover_do_reg_buffer(
 		if (buf_f->blf_flags &
 		   (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
 			if (item->ri_buf[i].i_addr == NULL) {
-				cmn_err(CE_ALERT,
+				xfs_alert(mp,
 					"XFS: NULL dquot in %s.", __func__);
 				goto next;
 			}
 			if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
-				cmn_err(CE_ALERT,
+				xfs_alert(mp,
 					"XFS: dquot too small (%d) in %s.",
 					item->ri_buf[i].i_len, __func__);
 				goto next;
 			}
-			error = xfs_qm_dqcheck(item->ri_buf[i].i_addr,
+			error = xfs_qm_dqcheck(mp, item->ri_buf[i].i_addr,
 					       -1, 0, XFS_QMOPT_DOWARN,
 					       "dquot_buf_recover");
 			if (error)
@@ -1898,6 +1901,7 @@ xlog_recover_do_reg_buffer(
  */
 int
 xfs_qm_dqcheck(
+	struct xfs_mount *mp,
 	xfs_disk_dquot_t *ddq,
 	xfs_dqid_t	 id,
 	uint		 type,	  /* used only when IO_dorepair is true */
@@ -1924,14 +1928,14 @@ xfs_qm_dqcheck(
 	 */
 	if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) {
 		if (flags & XFS_QMOPT_DOWARN)
-			cmn_err(CE_ALERT,
+			xfs_alert(mp,
 			"%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
 			str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
 		errs++;
 	}
 	if (ddq->d_version != XFS_DQUOT_VERSION) {
 		if (flags & XFS_QMOPT_DOWARN)
-			cmn_err(CE_ALERT,
+			xfs_alert(mp,
 			"%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
 			str, id, ddq->d_version, XFS_DQUOT_VERSION);
 		errs++;
@@ -1941,7 +1945,7 @@ xfs_qm_dqcheck(
 	    ddq->d_flags != XFS_DQ_PROJ &&
 	    ddq->d_flags != XFS_DQ_GROUP) {
 		if (flags & XFS_QMOPT_DOWARN)
-			cmn_err(CE_ALERT,
+			xfs_alert(mp,
 			"%s : XFS dquot ID 0x%x, unknown flags 0x%x",
 			str, id, ddq->d_flags);
 		errs++;
@@ -1949,7 +1953,7 @@ xfs_qm_dqcheck(
 
 	if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
 		if (flags & XFS_QMOPT_DOWARN)
-			cmn_err(CE_ALERT,
+			xfs_alert(mp,
 			"%s : ondisk-dquot 0x%p, ID mismatch: "
 			"0x%x expected, found id 0x%x",
 			str, ddq, id, be32_to_cpu(ddq->d_id));
@@ -1962,9 +1966,8 @@ xfs_qm_dqcheck(
 				be64_to_cpu(ddq->d_blk_softlimit)) {
 			if (!ddq->d_btimer) {
 				if (flags & XFS_QMOPT_DOWARN)
-					cmn_err(CE_ALERT,
-					"%s : Dquot ID 0x%x (0x%p) "
-					"BLK TIMER NOT STARTED",
+					xfs_alert(mp,
+			"%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED",
 					str, (int)be32_to_cpu(ddq->d_id), ddq);
 				errs++;
 			}
@@ -1974,9 +1977,8 @@ xfs_qm_dqcheck(
 				be64_to_cpu(ddq->d_ino_softlimit)) {
 			if (!ddq->d_itimer) {
 				if (flags & XFS_QMOPT_DOWARN)
-					cmn_err(CE_ALERT,
-					"%s : Dquot ID 0x%x (0x%p) "
-					"INODE TIMER NOT STARTED",
+					xfs_alert(mp,
+			"%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED",
 					str, (int)be32_to_cpu(ddq->d_id), ddq);
 				errs++;
 			}
@@ -1986,9 +1988,8 @@ xfs_qm_dqcheck(
 				be64_to_cpu(ddq->d_rtb_softlimit)) {
 			if (!ddq->d_rtbtimer) {
 				if (flags & XFS_QMOPT_DOWARN)
-					cmn_err(CE_ALERT,
-					"%s : Dquot ID 0x%x (0x%p) "
-					"RTBLK TIMER NOT STARTED",
+					xfs_alert(mp,
+			"%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED",
 					str, (int)be32_to_cpu(ddq->d_id), ddq);
 				errs++;
 			}
@@ -1999,7 +2000,7 @@ xfs_qm_dqcheck(
 		return errs;
 
 	if (flags & XFS_QMOPT_DOWARN)
-		cmn_err(CE_NOTE, "Re-initializing dquot ID 0x%x", id);
+		xfs_notice(mp, "Re-initializing dquot ID 0x%x", id);
 
 	/*
 	 * Typically, a repair is only requested by quotacheck.
@@ -2218,9 +2219,9 @@ xlog_recover_inode_pass2(
 	 */
 	if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) {
 		xfs_buf_relse(bp);
-		xfs_fs_cmn_err(CE_ALERT, mp,
-			"xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
-			dip, bp, in_f->ilf_ino);
+		xfs_alert(mp,
+	"%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld",
+			__func__, dip, bp, in_f->ilf_ino);
 		XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
 				 XFS_ERRLEVEL_LOW, mp);
 		error = EFSCORRUPTED;
@@ -2229,9 +2230,9 @@ xlog_recover_inode_pass2(
 	dicp = item->ri_buf[1].i_addr;
 	if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
 		xfs_buf_relse(bp);
-		xfs_fs_cmn_err(CE_ALERT, mp,
-			"xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld",
-			item, in_f->ilf_ino);
+		xfs_alert(mp,
+			"%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
+			__func__, item, in_f->ilf_ino);
 		XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
 				 XFS_ERRLEVEL_LOW, mp);
 		error = EFSCORRUPTED;
@@ -2263,9 +2264,10 @@ xlog_recover_inode_pass2(
 			XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
 					 XFS_ERRLEVEL_LOW, mp, dicp);
 			xfs_buf_relse(bp);
-			xfs_fs_cmn_err(CE_ALERT, mp,
-				"xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
-				item, dip, bp, in_f->ilf_ino);
+			xfs_alert(mp,
+		"%s: Bad regular inode log record, rec ptr 0x%p, "
+		"ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
+				__func__, item, dip, bp, in_f->ilf_ino);
 			error = EFSCORRUPTED;
 			goto error;
 		}
@@ -2276,9 +2278,10 @@ xlog_recover_inode_pass2(
 			XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
 					     XFS_ERRLEVEL_LOW, mp, dicp);
 			xfs_buf_relse(bp);
-			xfs_fs_cmn_err(CE_ALERT, mp,
-				"xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
-				item, dip, bp, in_f->ilf_ino);
+			xfs_alert(mp,
+		"%s: Bad dir inode log record, rec ptr 0x%p, "
+		"ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
+				__func__, item, dip, bp, in_f->ilf_ino);
 			error = EFSCORRUPTED;
 			goto error;
 		}
@@ -2287,9 +2290,10 @@ xlog_recover_inode_pass2(
 		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
 				     XFS_ERRLEVEL_LOW, mp, dicp);
 		xfs_buf_relse(bp);
-		xfs_fs_cmn_err(CE_ALERT, mp,
-			"xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
-			item, dip, bp, in_f->ilf_ino,
+		xfs_alert(mp,
+	"%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
+	"dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
+			__func__, item, dip, bp, in_f->ilf_ino,
 			dicp->di_nextents + dicp->di_anextents,
 			dicp->di_nblocks);
 		error = EFSCORRUPTED;
@@ -2299,8 +2303,9 @@ xlog_recover_inode_pass2(
 		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
 				     XFS_ERRLEVEL_LOW, mp, dicp);
 		xfs_buf_relse(bp);
-		xfs_fs_cmn_err(CE_ALERT, mp,
-			"xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x",
+		xfs_alert(mp,
+	"%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
+	"dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
 			item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
 		error = EFSCORRUPTED;
 		goto error;
@@ -2309,9 +2314,9 @@ xlog_recover_inode_pass2(
 		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
 				     XFS_ERRLEVEL_LOW, mp, dicp);
 		xfs_buf_relse(bp);
-		xfs_fs_cmn_err(CE_ALERT, mp,
-			"xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p",
-			item->ri_buf[1].i_len, item);
+		xfs_alert(mp,
+			"%s: Bad inode log record length %d, rec ptr 0x%p",
+			__func__, item->ri_buf[1].i_len, item);
 		error = EFSCORRUPTED;
 		goto error;
 	}
@@ -2398,7 +2403,7 @@ xlog_recover_inode_pass2(
 			break;
 
 		default:
-			xlog_warn("XFS: xlog_recover_inode_pass2: Invalid flag");
+			xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
 			ASSERT(0);
 			xfs_buf_relse(bp);
 			error = EIO;
@@ -2467,13 +2472,11 @@ xlog_recover_dquot_pass2(
 
 	recddq = item->ri_buf[1].i_addr;
 	if (recddq == NULL) {
-		cmn_err(CE_ALERT,
-			"XFS: NULL dquot in %s.", __func__);
+		xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
 		return XFS_ERROR(EIO);
 	}
 	if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
-		cmn_err(CE_ALERT,
-			"XFS: dquot too small (%d) in %s.",
+		xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
 			item->ri_buf[1].i_len, __func__);
 		return XFS_ERROR(EIO);
 	}
@@ -2498,12 +2501,10 @@ xlog_recover_dquot_pass2(
 	 */
 	dq_f = item->ri_buf[0].i_addr;
 	ASSERT(dq_f);
-	if ((error = xfs_qm_dqcheck(recddq,
-			   dq_f->qlf_id,
-			   0, XFS_QMOPT_DOWARN,
-			   "xlog_recover_dquot_pass2 (log copy)"))) {
+	error = xfs_qm_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
+			   "xlog_recover_dquot_pass2 (log copy)");
+	if (error)
 		return XFS_ERROR(EIO);
-	}
 	ASSERT(dq_f->qlf_len == 1);
 
 	error = xfs_read_buf(mp, mp->m_ddev_targp,
@@ -2523,8 +2524,9 @@ xlog_recover_dquot_pass2(
 	 * was among a chunk of dquots created earlier, and we did some
 	 * minimal initialization then.
 	 */
-	if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
-			   "xlog_recover_dquot_pass2")) {
+	error = xfs_qm_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
+			   "xlog_recover_dquot_pass2");
+	if (error) {
 		xfs_buf_relse(bp);
 		return XFS_ERROR(EIO);
 	}
@@ -2676,9 +2678,8 @@ xlog_recover_commit_pass1(
 		/* nothing to do in pass 1 */
 		return 0;
 	default:
-		xlog_warn(
-	"XFS: invalid item type (%d) xlog_recover_commit_pass1",
-			ITEM_TYPE(item));
+		xfs_warn(log->l_mp, "%s: invalid item type (%d)",
+			__func__, ITEM_TYPE(item));
 		ASSERT(0);
 		return XFS_ERROR(EIO);
 	}
@@ -2707,9 +2708,8 @@ xlog_recover_commit_pass2(
 		/* nothing to do in pass2 */
 		return 0;
 	default:
-		xlog_warn(
-	"XFS: invalid item type (%d) xlog_recover_commit_pass2",
-			ITEM_TYPE(item));
+		xfs_warn(log->l_mp, "%s: invalid item type (%d)",
+			__func__, ITEM_TYPE(item));
 		ASSERT(0);
 		return XFS_ERROR(EIO);
 	}
@@ -2751,10 +2751,11 @@ xlog_recover_commit_trans(
 
 STATIC int
 xlog_recover_unmount_trans(
+	struct log		*log,
 	xlog_recover_t		*trans)
 {
 	/* Do nothing now */
-	xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR");
+	xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
 	return 0;
 }
 
@@ -2797,8 +2798,8 @@ xlog_recover_process_data(
 		dp += sizeof(xlog_op_header_t);
 		if (ohead->oh_clientid != XFS_TRANSACTION &&
 		    ohead->oh_clientid != XFS_LOG) {
-			xlog_warn(
-		"XFS: xlog_recover_process_data: bad clientid");
+			xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
+					__func__, ohead->oh_clientid);
 			ASSERT(0);
 			return (XFS_ERROR(EIO));
 		}
@@ -2811,8 +2812,8 @@ xlog_recover_process_data(
 					be64_to_cpu(rhead->h_lsn));
 		} else {
 			if (dp + be32_to_cpu(ohead->oh_len) > lp) {
-				xlog_warn(
-			"XFS: xlog_recover_process_data: bad length");
+				xfs_warn(log->l_mp, "%s: bad length 0x%x",
+					__func__, be32_to_cpu(ohead->oh_len));
 				WARN_ON(1);
 				return (XFS_ERROR(EIO));
 			}
@@ -2825,7 +2826,7 @@ xlog_recover_process_data(
 								trans, pass);
 				break;
 			case XLOG_UNMOUNT_TRANS:
-				error = xlog_recover_unmount_trans(trans);
+				error = xlog_recover_unmount_trans(log, trans);
 				break;
 			case XLOG_WAS_CONT_TRANS:
 				error = xlog_recover_add_to_cont_trans(log,
@@ -2833,8 +2834,8 @@ xlog_recover_process_data(
 						be32_to_cpu(ohead->oh_len));
 				break;
 			case XLOG_START_TRANS:
-				xlog_warn(
-			"XFS: xlog_recover_process_data: bad transaction");
+				xfs_warn(log->l_mp, "%s: bad transaction",
+					__func__);
 				ASSERT(0);
 				error = XFS_ERROR(EIO);
 				break;
@@ -2844,8 +2845,8 @@ xlog_recover_process_data(
 						dp, be32_to_cpu(ohead->oh_len));
 				break;
 			default:
-				xlog_warn(
-			"XFS: xlog_recover_process_data: bad flag");
+				xfs_warn(log->l_mp, "%s: bad flag 0x%x",
+					__func__, flags);
 				ASSERT(0);
 				error = XFS_ERROR(EIO);
 				break;
@@ -3030,8 +3031,7 @@ xlog_recover_clear_agi_bucket(
 out_abort:
 	xfs_trans_cancel(tp, XFS_TRANS_ABORT);
 out_error:
-	xfs_fs_cmn_err(CE_WARN, mp, "xlog_recover_clear_agi_bucket: "
-			"failed to clear agi %d. Continuing.", agno);
+	xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
 	return;
 }
 
@@ -3282,7 +3282,7 @@ xlog_valid_rec_header(
 	if (unlikely(
 	    (!rhead->h_version ||
 	    (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
-		xlog_warn("XFS: %s: unrecognised log version (%d).",
+		xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
 			__func__, be32_to_cpu(rhead->h_version));
 		return XFS_ERROR(EIO);
 	}
@@ -3740,10 +3740,9 @@ xlog_recover(
 			return error;
 		}
 
-		cmn_err(CE_NOTE,
-			"Starting XFS recovery on filesystem: %s (logdev: %s)",
-			log->l_mp->m_fsname, log->l_mp->m_logname ?
-			log->l_mp->m_logname : "internal");
+		xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
+				log->l_mp->m_logname ? log->l_mp->m_logname
+						     : "internal");
 
 		error = xlog_do_recover(log, head_blk, tail_blk);
 		log->l_flags |= XLOG_RECOVERY_NEEDED;
@@ -3776,9 +3775,7 @@ xlog_recover_finish(
 		int	error;
 		error = xlog_recover_process_efis(log);
 		if (error) {
-			cmn_err(CE_ALERT,
-				"Failed to recover EFIs on filesystem: %s",
-				log->l_mp->m_fsname);
+			xfs_alert(log->l_mp, "Failed to recover EFIs");
 			return error;
 		}
 		/*
@@ -3793,15 +3790,12 @@ xlog_recover_finish(
 
 		xlog_recover_check_summary(log);
 
-		cmn_err(CE_NOTE,
-			"Ending XFS recovery on filesystem: %s (logdev: %s)",
-			log->l_mp->m_fsname, log->l_mp->m_logname ?
-			log->l_mp->m_logname : "internal");
+		xfs_notice(log->l_mp, "Ending recovery (logdev: %s)",
+				log->l_mp->m_logname ? log->l_mp->m_logname
+						     : "internal");
 		log->l_flags &= ~XLOG_RECOVERY_NEEDED;
 	} else {
-		cmn_err(CE_DEBUG,
-			"Ending clean XFS mount for filesystem: %s\n",
-			log->l_mp->m_fsname);
+		xfs_info(log->l_mp, "Ending clean mount");
 	}
 	return 0;
 }
@@ -3834,10 +3828,8 @@ xlog_recover_check_summary(
 	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
 		error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
 		if (error) {
-			xfs_fs_cmn_err(CE_ALERT, mp,
-					"xlog_recover_check_summary(agf)"
-					"agf read failed agno %d error %d",
-							agno, error);
+			xfs_alert(mp, "%s agf read failed agno %d error %d",
+						__func__, agno, error);
 		} else {
 			agfp = XFS_BUF_TO_AGF(agfbp);
 			freeblks += be32_to_cpu(agfp->agf_freeblks) +
@@ -3846,7 +3838,10 @@ xlog_recover_check_summary(
 		}
 
 		error = xfs_read_agi(mp, NULL, agno, &agibp);
-		if (!error) {
+		if (error) {
+			xfs_alert(mp, "%s agi read failed agno %d error %d",
+						__func__, agno, error);
+		} else {
 			struct xfs_agi	*agi = XFS_BUF_TO_AGI(agibp);
 
 			itotal += be32_to_cpu(agi->agi_count);
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 9bb6eda4cd2..a595f29567f 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -382,7 +382,8 @@ static inline int xfs_qm_sync(struct xfs_mount *mp, int flags)
 	xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \
 				f | XFS_QMOPT_RES_REGBLKS)
 
-extern int xfs_qm_dqcheck(xfs_disk_dquot_t *, xfs_dqid_t, uint, uint, char *);
+extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *,
+				xfs_dqid_t, uint, uint, char *);
 extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
 
 #endif	/* __KERNEL__ */
-- 
cgit v1.2.3


From 6a19d9393a5402e69fc52f5da8a828b8499a8265 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 7 Mar 2011 10:02:35 +1100
Subject: xfs: convert xfs_cmn_err to xfs_alert_tag

Continue the conversion of the old cmn_err interface by converting
all the conditional panic tag errors to xfs_alert_tag() and then
removing xfs_cmn_err().

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/support/debug.c | 28 ----------------------------
 fs/xfs/support/debug.h |  2 --
 fs/xfs/xfs_bmap.c      |  2 +-
 fs/xfs/xfs_error.c     |  5 ++---
 fs/xfs/xfs_error.h     |  6 ++----
 fs/xfs/xfs_inode.c     | 40 ++++++++++++++++++++--------------------
 fs/xfs/xfs_iomap.c     |  2 +-
 fs/xfs/xfs_rw.c        | 23 +++++++++--------------
 fs/xfs/xfs_trans_ail.c |  2 +-
 9 files changed, 36 insertions(+), 74 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index 0df88897ef8..feaca739d5c 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -64,34 +64,6 @@ xfs_fs_cmn_err(
 	BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
 }
 
-/* All callers to xfs_cmn_err use CE_ALERT, so don't bother testing lvl */
-void
-xfs_cmn_err(
-	int			panic_tag,
-	const char		*lvl,
-	struct xfs_mount	*mp,
-	const char		*fmt,
-	...)
-{
-	struct va_format	vaf;
-	va_list			args;
-	int			do_panic = 0;
-
-	if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
-		printk(KERN_ALERT "XFS: Transforming an alert into a BUG.");
-		do_panic = 1;
-	}
-
-	va_start(args, fmt);
-	vaf.fmt = fmt;
-	vaf.va = &args;
-
-	printk(KERN_ALERT "Filesystem %s: %pV", mp->m_fsname, &vaf);
-	va_end(args);
-
-	BUG_ON(do_panic);
-}
-
 void
 assfail(char *expr, char *file, int line)
 {
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
index 05699f67d47..eaeaa17ef4b 100644
--- a/fs/xfs/support/debug.h
+++ b/fs/xfs/support/debug.h
@@ -33,8 +33,6 @@ void cmn_err(const char *lvl, const char *fmt, ...)
 		__attribute__ ((format (printf, 2, 3)));
 void xfs_fs_cmn_err( const char *lvl, struct xfs_mount *mp,
 		const char *fmt, ...) __attribute__ ((format (printf, 3, 4)));
-void xfs_cmn_err( int panic_tag, const char *lvl, struct xfs_mount *mp,
-		const char *fmt, ...) __attribute__ ((format (printf, 4, 5)));
 
 extern void assfail(char *expr, char *f, int l);
 
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index e7b441db053..451b4484795 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -3526,7 +3526,7 @@ xfs_bmap_search_extents(
 
 	if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) &&
 		     !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) {
-		xfs_cmn_err(XFS_PTAG_FSBLOCK_ZERO, CE_ALERT, ip->i_mount,
+		xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
 				"Access to block zero in inode %llu "
 				"start_block: %llx start_off: %llx "
 				"blkcnt: %llx extent-state: %x lastx: %x\n",
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 4c7db74a05f..34f0e2d264e 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -162,9 +162,8 @@ xfs_error_report(
 	inst_t			*ra)
 {
 	if (level <= xfs_error_level) {
-		xfs_cmn_err(XFS_PTAG_ERROR_REPORT,
-			    CE_ALERT, mp,
-		"XFS internal error %s at line %d of file %s.  Caller 0x%p\n",
+		xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT,
+		"Internal error %s at line %d of file %s.  Caller 0x%p\n",
 			    tag, linenum, filename, ra);
 
 		xfs_stack_trace();
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 10dce5475f0..e1ba2d2565b 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -145,10 +145,8 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
 #endif /* DEBUG */
 
 /*
- * XFS panic tags -- allow a call to xfs_cmn_err() be turned into
- *			a panic by setting xfs_panic_mask in a
- *			sysctl.  update xfs_max[XFS_PARAM] if
- *			more are added.
+ * XFS panic tags -- allow a call to xfs_alert_tag() be turned into
+ *			a panic by setting xfs_panic_mask in a sysctl.
  */
 #define		XFS_NO_PTAG			0
 #define		XFS_PTAG_IFLUSH			0x00000001
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c39278b6c87..bf9ca579365 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2940,16 +2940,16 @@ xfs_iflush_int(
 
 	if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC,
 			       mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
-		xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
-		    "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p",
-			ip->i_ino, be16_to_cpu(dip->di_magic), dip);
+		xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+			"%s: Bad inode %Lu magic number 0x%x, ptr 0x%p",
+			__func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
 		goto corrupt_out;
 	}
 	if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
 				mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
-		xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
-			"xfs_iflush: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
-			ip->i_ino, ip, ip->i_d.di_magic);
+		xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+			"%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
+			__func__, ip->i_ino, ip, ip->i_d.di_magic);
 		goto corrupt_out;
 	}
 	if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
@@ -2957,9 +2957,9 @@ xfs_iflush_int(
 		    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
 		    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
 		    mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
-			xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
-				"xfs_iflush: Bad regular inode %Lu, ptr 0x%p",
-				ip->i_ino, ip);
+			xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+				"%s: Bad regular inode %Lu, ptr 0x%p",
+				__func__, ip->i_ino, ip);
 			goto corrupt_out;
 		}
 	} else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
@@ -2968,28 +2968,28 @@ xfs_iflush_int(
 		    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
 		    (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
 		    mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) {
-			xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
-				"xfs_iflush: Bad directory inode %Lu, ptr 0x%p",
-				ip->i_ino, ip);
+			xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+				"%s: Bad directory inode %Lu, ptr 0x%p",
+				__func__, ip->i_ino, ip);
 			goto corrupt_out;
 		}
 	}
 	if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
 				ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5,
 				XFS_RANDOM_IFLUSH_5)) {
-		xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
-			"xfs_iflush: detected corrupt incore inode %Lu, total extents = %d, nblocks = %Ld, ptr 0x%p",
-			ip->i_ino,
+		xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+			"%s: detected corrupt incore inode %Lu, "
+			"total extents = %d, nblocks = %Ld, ptr 0x%p",
+			__func__, ip->i_ino,
 			ip->i_d.di_nextents + ip->i_d.di_anextents,
-			ip->i_d.di_nblocks,
-			ip);
+			ip->i_d.di_nblocks, ip);
 		goto corrupt_out;
 	}
 	if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
 				mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) {
-		xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
-			"xfs_iflush: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
-			ip->i_ino, ip->i_d.di_forkoff, ip);
+		xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
+			"%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
+			__func__, ip->i_ino, ip->i_d.di_forkoff, ip);
 		goto corrupt_out;
 	}
 	/*
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 8a0f044750c..812646fe1b3 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -105,7 +105,7 @@ xfs_cmn_err_fsblock_zero(
 	xfs_inode_t	*ip,
 	xfs_bmbt_irec_t	*imap)
 {
-	xfs_cmn_err(XFS_PTAG_FSBLOCK_ZERO, CE_ALERT, ip->i_mount,
+	xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
 			"Access to block zero in inode %llu "
 			"start_block: %llx start_off: %llx "
 			"blkcnt: %llx extent-state: %x\n",
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index ccd3adf640e..ae6f4961341 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -69,25 +69,20 @@ xfs_do_force_shutdown(
 		return;
 
 	if (flags & SHUTDOWN_CORRUPT_INCORE) {
-		xfs_cmn_err(XFS_PTAG_SHUTDOWN_CORRUPT, CE_ALERT, mp,
-    "Corruption of in-memory data detected.  Shutting down filesystem: %s",
-			mp->m_fsname);
-		if (XFS_ERRLEVEL_HIGH <= xfs_error_level) {
+		xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
+    "Corruption of in-memory data detected.  Shutting down filesystem");
+		if (XFS_ERRLEVEL_HIGH <= xfs_error_level)
 			xfs_stack_trace();
-		}
 	} else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
 		if (logerror) {
-			xfs_cmn_err(XFS_PTAG_SHUTDOWN_LOGERROR, CE_ALERT, mp,
-		"Log I/O Error Detected.  Shutting down filesystem: %s",
-				mp->m_fsname);
+			xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
+		"Log I/O Error Detected.  Shutting down filesystem");
 		} else if (flags & SHUTDOWN_DEVICE_REQ) {
-			xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp,
-		"All device paths lost.  Shutting down filesystem: %s",
-				mp->m_fsname);
+			xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
+		"All device paths lost.  Shutting down filesystem");
 		} else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
-			xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp,
-		"I/O Error Detected.  Shutting down filesystem: %s",
-				mp->m_fsname);
+			xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
+		"I/O Error Detected. Shutting down filesystem");
 		}
 	}
 	if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index c5bbbc45db9..12aff9584e2 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -563,7 +563,7 @@ xfs_trans_ail_delete_bulk(
 
 			spin_unlock(&ailp->xa_lock);
 			if (!XFS_FORCED_SHUTDOWN(mp)) {
-				xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
+				xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
 		"%s: attempting to delete a log item that is not in the AIL",
 						__func__);
 				xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-- 
cgit v1.2.3


From 65333b4c3d46909872796321d15f179cb0e32028 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 7 Mar 2011 10:03:35 +1100
Subject: xfs: kill xfs_fs_repair_cmn_err() macro
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In certain cases of inode corruption, the xfs_fs_repair_cmn_err()
macro is used to output an extra message in the corruption report.
That extra message is "unmount and run xfs_repair", which really
applies to any corruption report. In each case where this macro is
called (except one), a following call to xfs_corruption_error() is
made to optionally dump more information about the error.

Hence, move the output of "run xfs_repair" to xfs_corruption_error()
so that it is output on all corruption reports.  Also, convert the
callers of the repair macro that don't call xfs_corruption_error()
to call it, hence providing consistent error reporting for all cases
where xfs_fs_repair_cmn_err() used to be called.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_bmap.c  |  7 +++----
 fs/xfs/xfs_error.c |  1 +
 fs/xfs/xfs_error.h |  3 ---
 fs/xfs/xfs_inode.c | 37 +++++++++++++++----------------------
 4 files changed, 19 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 451b4484795..ded532dc069 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -4200,12 +4200,11 @@ xfs_bmap_read_extents(
 		num_recs = xfs_btree_get_numrecs(block);
 		if (unlikely(i + num_recs > room)) {
 			ASSERT(i + num_recs <= room);
-			xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
+			xfs_warn(ip->i_mount,
 				"corrupt dinode %Lu, (btree extents).",
 				(unsigned long long) ip->i_ino);
-			XFS_ERROR_REPORT("xfs_bmap_read_extents(1)",
-					 XFS_ERRLEVEL_LOW,
-					ip->i_mount);
+			XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)",
+				XFS_ERRLEVEL_LOW, ip->i_mount, block);
 			goto error0;
 		}
 		XFS_WANT_CORRUPTED_GOTO(
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 34f0e2d264e..03028906f00 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -183,4 +183,5 @@ xfs_corruption_error(
 	if (level <= xfs_error_level)
 		xfs_hex_dump(p, 16);
 	xfs_error_report(tag, level, mp, filename, linenum, ra);
+	xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair");
 }
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index e1ba2d2565b..4c8b5007000 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -162,9 +162,6 @@ struct xfs_mount;
 
 extern void xfs_hex_dump(void *p, int length);
 
-#define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \
-	xfs_fs_cmn_err(level, mp, fmt "  Unmount and run xfs_repair.", ## args)
-
 #define xfs_fs_mount_cmn_err(f, fmt, args...) \
 	do { \
 		if (!(f & XFS_MFSI_QUIET)) 	\
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index bf9ca579365..55169bbfc82 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -317,7 +317,7 @@ xfs_iformat(
 	if (unlikely(be32_to_cpu(dip->di_nextents) +
 		     be16_to_cpu(dip->di_anextents) >
 		     be64_to_cpu(dip->di_nblocks))) {
-		xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
+		xfs_warn(ip->i_mount,
 			"corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
 			(unsigned long long)ip->i_ino,
 			(int)(be32_to_cpu(dip->di_nextents) +
@@ -330,8 +330,7 @@ xfs_iformat(
 	}
 
 	if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
-		xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
-			"corrupt dinode %Lu, forkoff = 0x%x.",
+		xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
 			(unsigned long long)ip->i_ino,
 			dip->di_forkoff);
 		XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
@@ -341,7 +340,7 @@ xfs_iformat(
 
 	if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
 		     !ip->i_mount->m_rtdev_targp)) {
-		xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
+		xfs_warn(ip->i_mount,
 			"corrupt dinode %Lu, has realtime flag set.",
 			ip->i_ino);
 		XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
@@ -373,9 +372,8 @@ xfs_iformat(
 			 * no local regular files yet
 			 */
 			if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) {
-				xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
-					"corrupt inode %Lu "
-					"(local format for regular file).",
+				xfs_warn(ip->i_mount,
+			"corrupt inode %Lu (local format for regular file).",
 					(unsigned long long) ip->i_ino);
 				XFS_CORRUPTION_ERROR("xfs_iformat(4)",
 						     XFS_ERRLEVEL_LOW,
@@ -385,9 +383,8 @@ xfs_iformat(
 
 			di_size = be64_to_cpu(dip->di_size);
 			if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
-				xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
-					"corrupt inode %Lu "
-					"(bad size %Ld for local inode).",
+				xfs_warn(ip->i_mount,
+			"corrupt inode %Lu (bad size %Ld for local inode).",
 					(unsigned long long) ip->i_ino,
 					(long long) di_size);
 				XFS_CORRUPTION_ERROR("xfs_iformat(5)",
@@ -431,9 +428,8 @@ xfs_iformat(
 		size = be16_to_cpu(atp->hdr.totsize);
 
 		if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
-			xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
-				"corrupt inode %Lu "
-				"(bad attr fork size %Ld).",
+			xfs_warn(ip->i_mount,
+				"corrupt inode %Lu (bad attr fork size %Ld).",
 				(unsigned long long) ip->i_ino,
 				(long long) size);
 			XFS_CORRUPTION_ERROR("xfs_iformat(8)",
@@ -488,9 +484,8 @@ xfs_iformat_local(
 	 * kmem_alloc() or memcpy() below.
 	 */
 	if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
-		xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
-			"corrupt inode %Lu "
-			"(bad size %d for local fork, size = %d).",
+		xfs_warn(ip->i_mount,
+	"corrupt inode %Lu (bad size %d for local fork, size = %d).",
 			(unsigned long long) ip->i_ino, size,
 			XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
 		XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
@@ -547,8 +542,7 @@ xfs_iformat_extents(
 	 * kmem_alloc() or memcpy() below.
 	 */
 	if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
-		xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
-			"corrupt inode %Lu ((a)extents = %d).",
+		xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
 			(unsigned long long) ip->i_ino, nex);
 		XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
 				     ip->i_mount, dip);
@@ -623,11 +617,10 @@ xfs_iformat_btree(
 	    || XFS_BMDR_SPACE_CALC(nrecs) >
 			XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)
 	    || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
-		xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
-			"corrupt inode %Lu (btree).",
+		xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).",
 			(unsigned long long) ip->i_ino);
-		XFS_ERROR_REPORT("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
-				 ip->i_mount);
+		XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
+				 ip->i_mount, dip);
 		return XFS_ERROR(EFSCORRUPTED);
 	}
 
-- 
cgit v1.2.3


From af34e09da42801c97f39f768c715f5511d914b52 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 7 Mar 2011 10:04:35 +1100
Subject: xfs: kill xfs_fs_mount_cmn_err() macro

The xfs_fs_mount_cmn_err() macro hides a simple check as to whether the
mount path should output an error or not. Remove the macro and open
code the check.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_error.h |  6 -----
 fs/xfs/xfs_mount.c | 72 ++++++++++++++++++++++++++++++++----------------------
 2 files changed, 43 insertions(+), 35 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 4c8b5007000..e8360514c25 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -162,10 +162,4 @@ struct xfs_mount;
 
 extern void xfs_hex_dump(void *p, int length);
 
-#define xfs_fs_mount_cmn_err(f, fmt, args...) \
-	do { \
-		if (!(f & XFS_MFSI_QUIET)) 	\
-			cmn_err(CE_WARN, "XFS: " fmt, ## args); \
-	} while (0)
-
 #endif	/* __XFS_ERROR_H__ */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d447aef84bc..1b43ad3d6dd 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -311,6 +311,8 @@ xfs_mount_validate_sb(
 	xfs_sb_t	*sbp,
 	int		flags)
 {
+	int		loud = !(flags & XFS_MFSI_QUIET);
+
 	/*
 	 * If the log device and data device have the
 	 * same device number, the log is internal.
@@ -319,28 +321,32 @@ xfs_mount_validate_sb(
 	 * a volume filesystem in a non-volume manner.
 	 */
 	if (sbp->sb_magicnum != XFS_SB_MAGIC) {
-		xfs_fs_mount_cmn_err(flags, "bad magic number");
+		if (loud)
+			xfs_warn(mp, "bad magic number");
 		return XFS_ERROR(EWRONGFS);
 	}
 
 	if (!xfs_sb_good_version(sbp)) {
-		xfs_fs_mount_cmn_err(flags, "bad version");
+		if (loud)
+			xfs_warn(mp, "bad version");
 		return XFS_ERROR(EWRONGFS);
 	}
 
 	if (unlikely(
 	    sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
-		xfs_fs_mount_cmn_err(flags,
-			"filesystem is marked as having an external log; "
-			"specify logdev on the\nmount command line.");
+		if (loud)
+			xfs_warn(mp,
+		"filesystem is marked as having an external log; "
+		"specify logdev on the mount command line.");
 		return XFS_ERROR(EINVAL);
 	}
 
 	if (unlikely(
 	    sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
-		xfs_fs_mount_cmn_err(flags,
-			"filesystem is marked as having an internal log; "
-			"do not specify logdev on\nthe mount command line.");
+		if (loud)
+			xfs_warn(mp,
+		"filesystem is marked as having an internal log; "
+		"do not specify logdev on the mount command line.");
 		return XFS_ERROR(EINVAL);
 	}
 
@@ -369,7 +375,8 @@ xfs_mount_validate_sb(
 	    (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE)	||
 	    (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)	||
 	    (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) {
-		xfs_fs_mount_cmn_err(flags, "SB sanity check 1 failed");
+		if (loud)
+			xfs_warn(mp, "SB sanity check 1 failed");
 		return XFS_ERROR(EFSCORRUPTED);
 	}
 
@@ -382,7 +389,8 @@ xfs_mount_validate_sb(
 	     (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks ||
 	    sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) *
 			      sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) {
-		xfs_fs_mount_cmn_err(flags, "SB sanity check 2 failed");
+		if (loud)
+			xfs_warn(mp, "SB sanity check 2 failed");
 		return XFS_ERROR(EFSCORRUPTED);
 	}
 
@@ -390,12 +398,12 @@ xfs_mount_validate_sb(
 	 * Until this is fixed only page-sized or smaller data blocks work.
 	 */
 	if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
-		xfs_fs_mount_cmn_err(flags,
-			"file system with blocksize %d bytes",
-			sbp->sb_blocksize);
-		xfs_fs_mount_cmn_err(flags,
-			"only pagesize (%ld) or less will currently work.",
-			PAGE_SIZE);
+		if (loud) {
+			xfs_warn(mp,
+		"File system with blocksize %d bytes. "
+		"Only pagesize (%ld) or less will currently work.",
+				sbp->sb_blocksize, PAGE_SIZE);
+		}
 		return XFS_ERROR(ENOSYS);
 	}
 
@@ -409,21 +417,23 @@ xfs_mount_validate_sb(
 	case 2048:
 		break;
 	default:
-		xfs_fs_mount_cmn_err(flags,
-			"inode size of %d bytes not supported",
-			sbp->sb_inodesize);
+		if (loud)
+			xfs_warn(mp, "inode size of %d bytes not supported",
+				sbp->sb_inodesize);
 		return XFS_ERROR(ENOSYS);
 	}
 
 	if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
 	    xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
-		xfs_fs_mount_cmn_err(flags,
-			"file system too large to be mounted on this system.");
+		if (loud)
+			xfs_warn(mp,
+		"file system too large to be mounted on this system.");
 		return XFS_ERROR(EFBIG);
 	}
 
 	if (unlikely(sbp->sb_inprogress)) {
-		xfs_fs_mount_cmn_err(flags, "file system busy");
+		if (loud)
+			xfs_warn(mp, "file system busy");
 		return XFS_ERROR(EFSCORRUPTED);
 	}
 
@@ -431,8 +441,9 @@ xfs_mount_validate_sb(
 	 * Version 1 directory format has never worked on Linux.
 	 */
 	if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
-		xfs_fs_mount_cmn_err(flags,
-			"file system using version 1 directory format");
+		if (loud)
+			xfs_warn(mp,
+				"file system using version 1 directory format");
 		return XFS_ERROR(ENOSYS);
 	}
 
@@ -673,6 +684,7 @@ xfs_readsb(xfs_mount_t *mp, int flags)
 	unsigned int	sector_size;
 	xfs_buf_t	*bp;
 	int		error;
+	int		loud = !(flags & XFS_MFSI_QUIET);
 
 	ASSERT(mp->m_sb_bp == NULL);
 	ASSERT(mp->m_ddev_targp != NULL);
@@ -688,7 +700,8 @@ reread:
 	bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
 					XFS_SB_DADDR, sector_size, 0);
 	if (!bp) {
-		xfs_fs_mount_cmn_err(flags, "SB buffer read failed");
+		if (loud)
+			xfs_warn(mp, "SB buffer read failed");
 		return EIO;
 	}
 
@@ -699,7 +712,8 @@ reread:
 	xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
 	error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
 	if (error) {
-		xfs_fs_mount_cmn_err(flags, "SB validate failed");
+		if (loud)
+			xfs_warn(mp, "SB validate failed");
 		goto release_buf;
 	}
 
@@ -707,9 +721,9 @@ reread:
 	 * We must be able to do sector-sized and sector-aligned IO.
 	 */
 	if (sector_size > mp->m_sb.sb_sectsize) {
-		xfs_fs_mount_cmn_err(flags,
-			"device supports only %u byte sectors (not %u)",
-			sector_size, mp->m_sb.sb_sectsize);
+		if (loud)
+			xfs_warn(mp, "device supports %u byte sectors (not %u)",
+				sector_size, mp->m_sb.sb_sectsize);
 		error = ENOSYS;
 		goto release_buf;
 	}
-- 
cgit v1.2.3


From 5348778699bba92bf28b79863e09e7181d8cf95c Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 7 Mar 2011 10:05:35 +1100
Subject: xfs: convert xfs_fs_cmn_err to new error logging API

Continue to clean up the error logging code by converting all the
callers of xfs_fs_cmn_err() to the new API. Once done, remove the
unused old API function.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/quota/xfs_dquot.c      |  4 ++--
 fs/xfs/quota/xfs_dquot_item.c |  5 ++---
 fs/xfs/quota/xfs_qm.c         | 13 ++++++-------
 fs/xfs/support/debug.c        | 20 --------------------
 fs/xfs/support/debug.h        |  2 --
 fs/xfs/xfs_bmap.c             |  2 +-
 fs/xfs/xfs_dfrag.c            |  4 ++--
 fs/xfs/xfs_dir2.c             |  2 +-
 fs/xfs/xfs_fsops.c            |  6 +++---
 fs/xfs/xfs_ialloc.c           | 39 +++++++++++++++++----------------------
 fs/xfs/xfs_inode.c            | 17 +++++++----------
 fs/xfs/xfs_mount.c            | 16 ++++++++--------
 fs/xfs/xfs_vnodeops.c         |  8 ++++----
 13 files changed, 53 insertions(+), 85 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 773adc80d6b..e1ff7e50767 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -1393,8 +1393,8 @@ xfs_qm_dqpurge(
 		 */
 		error = xfs_qm_dqflush(dqp, SYNC_WAIT);
 		if (error)
-			xfs_fs_cmn_err(CE_WARN, mp,
-				"xfs_qm_dqpurge: dquot %p flush failed", dqp);
+			xfs_warn(mp, "%s: dquot %p flush failed",
+				__func__, dqp);
 		xfs_dqflock(dqp);
 	}
 	ASSERT(atomic_read(&dqp->q_pincount) == 0);
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 2a1f3dc10a0..9e0e2fa3f2c 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -136,9 +136,8 @@ xfs_qm_dquot_logitem_push(
 	 */
 	error = xfs_qm_dqflush(dqp, 0);
 	if (error)
-		xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
-			"xfs_qm_dquot_logitem_push: push error %d on dqp %p",
-			error, dqp);
+		xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p",
+			__func__, error, dqp);
 	xfs_dqunlock(dqp);
 }
 
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index dd5cc5738a2..e34dce1ce54 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -402,14 +402,13 @@ xfs_qm_mount_quotas(
 			 * off, but the on disk superblock doesn't know that !
 			 */
 			ASSERT(!(XFS_IS_QUOTA_RUNNING(mp)));
-			xfs_fs_cmn_err(CE_ALERT, mp,
-				"XFS mount_quotas: Superblock update failed!");
+			xfs_alert(mp, "%s: Superblock update failed!",
+				__func__);
 		}
 	}
 
 	if (error) {
-		xfs_fs_cmn_err(CE_WARN, mp,
-			"Failed to initialize disk quotas.");
+		xfs_warn(mp, "Failed to initialize disk quotas.");
 		return;
 	}
 
@@ -1257,7 +1256,7 @@ xfs_qm_qino_alloc(
 	xfs_mod_sb(tp, sbfields);
 
 	if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) {
-		xfs_fs_cmn_err(CE_ALERT, mp, "XFS qino_alloc failed!");
+		xfs_alert(mp, "%s failed (error %d)!", __func__, error);
 		return error;
 	}
 	return 0;
@@ -1930,8 +1929,8 @@ again:
 			 */
 			error = xfs_qm_dqflush(dqp, 0);
 			if (error) {
-				xfs_fs_cmn_err(CE_WARN, mp,
-			"xfs_qm_dqreclaim: dquot %p flush failed", dqp);
+				xfs_warn(mp, "%s: dquot %p flush failed",
+					__func__, dqp);
 			}
 			goto dqunlock;
 		}
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index feaca739d5c..a1c7141af48 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -44,26 +44,6 @@ cmn_err(
 	BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
 }
 
-void
-xfs_fs_cmn_err(
-	const char		*lvl,
-	struct xfs_mount	*mp,
-	const char		*fmt,
-	...)
-{
-	struct va_format	vaf;
-	va_list			args;
-
-	va_start(args, fmt);
-	vaf.fmt = fmt;
-	vaf.va = &args;
-
-	printk("%sFilesystem %s: %pV", lvl, mp->m_fsname, &vaf);
-	va_end(args);
-
-	BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
-}
-
 void
 assfail(char *expr, char *file, int line)
 {
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
index eaeaa17ef4b..4a082b9a116 100644
--- a/fs/xfs/support/debug.h
+++ b/fs/xfs/support/debug.h
@@ -31,8 +31,6 @@ struct xfs_mount;
 
 void cmn_err(const char *lvl, const char *fmt, ...)
 		__attribute__ ((format (printf, 2, 3)));
-void xfs_fs_cmn_err( const char *lvl, struct xfs_mount *mp,
-		const char *fmt, ...) __attribute__ ((format (printf, 3, 4)));
 
 extern void assfail(char *expr, char *f, int l);
 
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index ded532dc069..92612f6b4b3 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -6150,7 +6150,7 @@ xfs_bmap_punch_delalloc_range(
 		if (error) {
 			/* something screwed, just bail */
 			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-				xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
+				xfs_alert(ip->i_mount,
 			"Failed delalloc mapping lookup ino %lld fsb %lld.",
 						ip->i_ino, start_fsb);
 			}
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index e60490bc00a..be628677c28 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -270,9 +270,9 @@ xfs_swap_extents(
 	/* check inode formats now that data is flushed */
 	error = xfs_swap_extents_check_format(ip, tip);
 	if (error) {
-		xfs_fs_cmn_err(CE_NOTE, mp,
+		xfs_notice(mp,
 		    "%s: inode 0x%llx format is incompatible for exchanging.",
-				__FILE__, ip->i_ino);
+				__func__, ip->i_ino);
 		goto out_unlock;
 	}
 
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index a1321bc7f19..dba7a71cedf 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -159,7 +159,7 @@ xfs_dir_ino_validate(
 		XFS_AGINO_TO_INO(mp, agno, agino) == ino;
 	if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE,
 			XFS_RANDOM_DIR_INO_VALIDATE))) {
-		xfs_fs_cmn_err(CE_WARN, mp, "Invalid inode number 0x%Lx",
+		xfs_warn(mp, "Invalid inode number 0x%Lx",
 				(unsigned long long) ino);
 		XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
 		return XFS_ERROR(EFSCORRUPTED);
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 85668efb3e3..9153d2c77ca 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -385,8 +385,8 @@ xfs_growfs_data_private(
 				  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
 				  XFS_FSS_TO_BB(mp, 1), 0, &bp);
 		if (error) {
-			xfs_fs_cmn_err(CE_WARN, mp,
-			"error %d reading secondary superblock for ag %d",
+			xfs_warn(mp,
+		"error %d reading secondary superblock for ag %d",
 				error, agno);
 			break;
 		}
@@ -399,7 +399,7 @@ xfs_growfs_data_private(
 		if (!(error = xfs_bwrite(mp, bp))) {
 			continue;
 		} else {
-			xfs_fs_cmn_err(CE_WARN, mp,
+			xfs_warn(mp,
 		"write error %d updating secondary superblock for ag %d",
 				error, agno);
 			break; /* no point in continuing */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 0626a32c344..fc3a2cb2c07 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -1218,10 +1218,9 @@ xfs_imap_lookup(
 
 	error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
 	if (error) {
-		xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
-				"xfs_ialloc_read_agi() returned "
-				"error %d, agno %d",
-				error, agno);
+		xfs_alert(mp,
+			"%s: xfs_ialloc_read_agi() returned error %d, agno %d",
+			__func__, error, agno);
 		return error;
 	}
 
@@ -1299,24 +1298,21 @@ xfs_imap(
 		if (flags & XFS_IGET_UNTRUSTED)
 			return XFS_ERROR(EINVAL);
 		if (agno >= mp->m_sb.sb_agcount) {
-			xfs_fs_cmn_err(CE_ALERT, mp,
-					"xfs_imap: agno (%d) >= "
-					"mp->m_sb.sb_agcount (%d)",
-					agno,  mp->m_sb.sb_agcount);
+			xfs_alert(mp,
+				"%s: agno (%d) >= mp->m_sb.sb_agcount (%d)",
+				__func__, agno, mp->m_sb.sb_agcount);
 		}
 		if (agbno >= mp->m_sb.sb_agblocks) {
-			xfs_fs_cmn_err(CE_ALERT, mp,
-					"xfs_imap: agbno (0x%llx) >= "
-					"mp->m_sb.sb_agblocks (0x%lx)",
-					(unsigned long long) agbno,
-					(unsigned long) mp->m_sb.sb_agblocks);
+			xfs_alert(mp,
+		"%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
+				__func__, (unsigned long long)agbno,
+				(unsigned long)mp->m_sb.sb_agblocks);
 		}
 		if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
-			xfs_fs_cmn_err(CE_ALERT, mp,
-					"xfs_imap: ino (0x%llx) != "
-					"XFS_AGINO_TO_INO(mp, agno, agino) "
-					"(0x%llx)",
-					ino, XFS_AGINO_TO_INO(mp, agno, agino));
+			xfs_alert(mp,
+		"%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
+				__func__, ino,
+				XFS_AGINO_TO_INO(mp, agno, agino));
 		}
 		xfs_stack_trace();
 #endif /* DEBUG */
@@ -1388,10 +1384,9 @@ out_map:
 	 */
 	if ((imap->im_blkno + imap->im_len) >
 	    XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
-		xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
-			"(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > "
-			" XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)",
-			(unsigned long long) imap->im_blkno,
+		xfs_alert(mp,
+	"%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
+			__func__, (unsigned long long) imap->im_blkno,
 			(unsigned long long) imap->im_len,
 			XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
 		return XFS_ERROR(EINVAL);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 55169bbfc82..d820ada49b1 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -110,8 +110,8 @@ xfs_inobp_check(
 		dip = (xfs_dinode_t *)xfs_buf_offset(bp,
 					i * mp->m_sb.sb_inodesize);
 		if (!dip->di_next_unlinked)  {
-			xfs_fs_cmn_err(CE_ALERT, mp,
-				"Detected a bogus zero next_unlinked field in incore inode buffer 0x%p.  About to pop an ASSERT.",
+			xfs_alert(mp,
+	"Detected bogus zero next_unlinked field in incore inode buffer 0x%p.",
 				bp);
 			ASSERT(dip->di_next_unlinked);
 		}
@@ -806,11 +806,9 @@ xfs_iread(
 	 */
 	if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) {
 #ifdef DEBUG
-		xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
-				"dip->di_magic (0x%x) != "
-				"XFS_DINODE_MAGIC (0x%x)",
-				be16_to_cpu(dip->di_magic),
-				XFS_DINODE_MAGIC);
+		xfs_alert(mp,
+			"%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)",
+			__func__, be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC);
 #endif /* DEBUG */
 		error = XFS_ERROR(EINVAL);
 		goto out_brelse;
@@ -828,9 +826,8 @@ xfs_iread(
 		error = xfs_iformat(ip, dip);
 		if (error)  {
 #ifdef DEBUG
-			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
-					"xfs_iformat() returned error %d",
-					error);
+			xfs_alert(mp, "%s: xfs_iformat() returned error %d",
+				__func__, error);
 #endif /* DEBUG */
 			goto out_brelse;
 		}
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 1b43ad3d6dd..e39b082eb04 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -881,8 +881,9 @@ xfs_update_alignment(xfs_mount_t *mp)
 				if (mp->m_flags & XFS_MOUNT_RETERR) {
 					return XFS_ERROR(EINVAL);
 				}
-				xfs_fs_cmn_err(CE_WARN, mp,
-"stripe alignment turned off: sunit(%d)/swidth(%d) incompatible with agsize(%d)",
+				xfs_warn(mp,
+		"stripe alignment turned off: sunit(%d)/swidth(%d) "
+		"incompatible with agsize(%d)",
 					mp->m_dalign, mp->m_swidth,
 					sbp->sb_agblocks);
 
@@ -892,9 +893,9 @@ xfs_update_alignment(xfs_mount_t *mp)
 				mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
 			} else {
 				if (mp->m_flags & XFS_MOUNT_RETERR) {
-					xfs_fs_cmn_err(CE_WARN, mp,
-"stripe alignment turned off: sunit(%d) less than bsize(%d)",
-                                        	mp->m_dalign,
+					xfs_warn(mp,
+		"stripe alignment turned off: sunit(%d) less than bsize(%d)",
+						mp->m_dalign,
 						mp->m_blockmask +1);
 					return XFS_ERROR(EINVAL);
 				}
@@ -1100,7 +1101,7 @@ xfs_mount_reset_sbqflags(
 		return 0;
 
 #ifdef QUOTADEBUG
-	xfs_fs_cmn_err(CE_NOTE, mp, "Writing superblock quota changes");
+	xfs_notice(mp, "Writing superblock quota changes");
 #endif
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
@@ -1108,8 +1109,7 @@ xfs_mount_reset_sbqflags(
 				      XFS_DEFAULT_LOG_COUNT);
 	if (error) {
 		xfs_trans_cancel(tp, 0);
-		xfs_fs_cmn_err(CE_ALERT, mp,
-			"xfs_mount_reset_sbqflags: Superblock update failed!");
+		xfs_alert(mp, "%s: Superblock update failed!", __func__);
 		return error;
 	}
 
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 258d4f98eb9..e919ae1e9c6 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1208,12 +1208,12 @@ xfs_inactive(
 		 */
 		error = xfs_bmap_finish(&tp,  &free_list, &committed);
 		if (error)
-			xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: "
-				"xfs_bmap_finish() returned error %d", error);
+			xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
+				__func__, error);
 		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 		if (error)
-			xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: "
-				"xfs_trans_commit() returned error %d", error);
+			xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
+				__func__, error);
 	}
 
 	/*
-- 
cgit v1.2.3


From 6d4a8ecb344bddbbb8c71deb4dcea0be6955cfc3 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 7 Mar 2011 10:06:35 +1100
Subject: xfs: rename xfs_cmn_err_fsblock_zero()

The "cmn_err" part of the function name is no longer relevant. Rename
the function to xfs_alert_fsblock_zero() to match the new logging
API.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_iomap.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 812646fe1b3..091d82b94c4 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -101,7 +101,7 @@ xfs_iomap_eof_align_last_fsb(
 }
 
 STATIC int
-xfs_cmn_err_fsblock_zero(
+xfs_alert_fsblock_zero(
 	xfs_inode_t	*ip,
 	xfs_bmbt_irec_t	*imap)
 {
@@ -246,7 +246,7 @@ xfs_iomap_write_direct(
 	}
 
 	if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) {
-		error = xfs_cmn_err_fsblock_zero(ip, imap);
+		error = xfs_alert_fsblock_zero(ip, imap);
 		goto error_out;
 	}
 
@@ -464,7 +464,7 @@ retry:
 	}
 
 	if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
-		return xfs_cmn_err_fsblock_zero(ip, &imap[0]);
+		return xfs_alert_fsblock_zero(ip, &imap[0]);
 
 	*ret_imap = imap[0];
 	return 0;
@@ -614,7 +614,7 @@ xfs_iomap_write_allocate(
 		 * covers at least part of the callers request
 		 */
 		if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
-			return xfs_cmn_err_fsblock_zero(ip, imap);
+			return xfs_alert_fsblock_zero(ip, imap);
 
 		if ((offset_fsb >= imap->br_startoff) &&
 		    (offset_fsb < (imap->br_startoff +
@@ -724,7 +724,7 @@ xfs_iomap_write_unwritten(
 			return XFS_ERROR(error);
 
 		if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip)))
-			return xfs_cmn_err_fsblock_zero(ip, &imap);
+			return xfs_alert_fsblock_zero(ip, &imap);
 
 		if ((numblks_fsb = imap.br_blockcount) == 0) {
 			/*
-- 
cgit v1.2.3


From 8221112b4377a3b69f2016b5cc3c550d51dd3139 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 7 Mar 2011 10:07:35 +1100
Subject: xfs: convert the quota debug prints to new API

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/quota/xfs_qm_syscalls.c | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index bdebc183223..603ab867259 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -41,12 +41,6 @@
 #include "xfs_qm.h"
 #include "xfs_trace.h"
 
-#ifdef DEBUG
-# define qdprintk(s, args...)	cmn_err(CE_DEBUG, s, ## args)
-#else
-# define qdprintk(s, args...)	do { } while (0)
-#endif
-
 STATIC int	xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
 STATIC int	xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
 					uint);
@@ -294,7 +288,8 @@ xfs_qm_scall_trunc_qfiles(
 	int		error = 0, error2 = 0;
 
 	if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
-		qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags);
+		xfs_debug(mp, "%s: flags=%x m_qflags=%x\n",
+			__func__, flags, mp->m_qflags);
 		return XFS_ERROR(EINVAL);
 	}
 
@@ -331,7 +326,8 @@ xfs_qm_scall_quotaon(
 	sbflags = 0;
 
 	if (flags == 0) {
-		qdprintk("quotaon: zero flags, m_qflags=%x\n", mp->m_qflags);
+		xfs_debug(mp, "%s: zero flags, m_qflags=%x\n",
+			__func__, mp->m_qflags);
 		return XFS_ERROR(EINVAL);
 	}
 
@@ -352,8 +348,9 @@ xfs_qm_scall_quotaon(
 	    (flags & XFS_GQUOTA_ACCT) == 0 &&
 	    (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
 	    (flags & XFS_OQUOTA_ENFD))) {
-		qdprintk("Can't enforce without acct, flags=%x sbflags=%x\n",
-			flags, mp->m_sb.sb_qflags);
+		xfs_debug(mp,
+			"%s: Can't enforce without acct, flags=%x sbflags=%x\n",
+			__func__, flags, mp->m_sb.sb_qflags);
 		return XFS_ERROR(EINVAL);
 	}
 	/*
@@ -541,7 +538,7 @@ xfs_qm_scall_setqlim(
 			q->qi_bsoftlimit = soft;
 		}
 	} else {
-		qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft);
+		xfs_debug(mp, "blkhard %Ld < blksoft %Ld\n", hard, soft);
 	}
 	hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ?
 		(xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) :
@@ -557,7 +554,7 @@ xfs_qm_scall_setqlim(
 			q->qi_rtbsoftlimit = soft;
 		}
 	} else {
-		qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft);
+		xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld\n", hard, soft);
 	}
 
 	hard = (newlim->d_fieldmask & FS_DQ_IHARD) ?
@@ -574,7 +571,7 @@ xfs_qm_scall_setqlim(
 			q->qi_isoftlimit = soft;
 		}
 	} else {
-		qdprintk("ihard %Ld < isoft %Ld\n", hard, soft);
+		xfs_debug(mp, "ihard %Ld < isoft %Ld\n", hard, soft);
 	}
 
 	/*
@@ -1137,8 +1134,8 @@ xfs_qm_internalqcheck_adjust(
 
 	if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) {
 		*res = BULKSTAT_RV_NOTHING;
-		qdprintk("internalqcheck: ino=%llu, uqino=%llu, gqino=%llu\n",
-			(unsigned long long) ino,
+		xfs_debug(mp, "%s: ino=%llu, uqino=%llu, gqino=%llu\n",
+			__func__, (unsigned long long) ino,
 			(unsigned long long) mp->m_sb.sb_uquotino,
 			(unsigned long long) mp->m_sb.sb_gquotino);
 		return XFS_ERROR(EINVAL);
-- 
cgit v1.2.3


From 0b932cccbdc09a72aa370456a59b40ecd6b10baf Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 7 Mar 2011 10:08:35 +1100
Subject: xfs: Convert remaining cmn_err() callers to new API

Once converted, kill the remainder of the cmn_err() interface.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/quota/xfs_dquot.c       | 34 +++++++++++++-----------
 fs/xfs/quota/xfs_qm.c          | 27 +++++++++----------
 fs/xfs/quota/xfs_qm_bhv.c      |  3 +--
 fs/xfs/quota/xfs_qm_syscalls.c | 58 +++++++++++++++++++++-------------------
 fs/xfs/quota/xfs_trans_dquot.c |  5 ++--
 fs/xfs/support/debug.c         | 26 ------------------
 fs/xfs/support/debug.h         | 12 ---------
 fs/xfs/xfs_bmap.c              |  6 ++---
 fs/xfs/xfs_buf_item.c          | 15 ++++++-----
 fs/xfs/xfs_da_btree.c          |  9 +++----
 fs/xfs/xfs_dir2_node.c         | 25 +++++++-----------
 fs/xfs/xfs_error.c             | 16 +++++------
 fs/xfs/xfs_ialloc.c            | 43 ++++++++++++------------------
 fs/xfs/xfs_inode.c             | 30 +++++++++------------
 fs/xfs/xfs_mount.c             | 60 +++++++++++++++++-------------------------
 fs/xfs/xfs_rtalloc.c           |  8 +++---
 fs/xfs/xfs_rtalloc.h           |  2 +-
 fs/xfs/xfs_rw.c                | 17 ++++++------
 fs/xfs/xfs_trans_buf.c         |  6 ++---
 fs/xfs/xfs_vnodeops.c          |  5 ++--
 20 files changed, 169 insertions(+), 238 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index e1ff7e50767..7e241647850 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -828,7 +828,7 @@ xfs_qm_dqget(
 	if (xfs_do_dqerror) {
 		if ((xfs_dqerror_target == mp->m_ddev_targp) &&
 		    (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) {
-			cmn_err(CE_DEBUG, "Returning error in dqget");
+			xfs_debug(mp, "Returning error in dqget");
 			return (EIO);
 		}
 	}
@@ -1427,36 +1427,38 @@ xfs_qm_dqpurge(
 void
 xfs_qm_dqprint(xfs_dquot_t *dqp)
 {
-	cmn_err(CE_DEBUG, "-----------KERNEL DQUOT----------------");
-	cmn_err(CE_DEBUG, "---- dquotID =  %d",
+	struct xfs_mount	*mp = dqp->q_mount;
+
+	xfs_debug(mp, "-----------KERNEL DQUOT----------------");
+	xfs_debug(mp, "---- dquotID =  %d",
 		(int)be32_to_cpu(dqp->q_core.d_id));
-	cmn_err(CE_DEBUG, "---- type    =  %s", DQFLAGTO_TYPESTR(dqp));
-	cmn_err(CE_DEBUG, "---- fs      =  0x%p", dqp->q_mount);
-	cmn_err(CE_DEBUG, "---- blkno   =  0x%x", (int) dqp->q_blkno);
-	cmn_err(CE_DEBUG, "---- boffset =  0x%x", (int) dqp->q_bufoffset);
-	cmn_err(CE_DEBUG, "---- blkhlimit =  %Lu (0x%x)",
+	xfs_debug(mp, "---- type    =  %s", DQFLAGTO_TYPESTR(dqp));
+	xfs_debug(mp, "---- fs      =  0x%p", dqp->q_mount);
+	xfs_debug(mp, "---- blkno   =  0x%x", (int) dqp->q_blkno);
+	xfs_debug(mp, "---- boffset =  0x%x", (int) dqp->q_bufoffset);
+	xfs_debug(mp, "---- blkhlimit =  %Lu (0x%x)",
 		be64_to_cpu(dqp->q_core.d_blk_hardlimit),
 		(int)be64_to_cpu(dqp->q_core.d_blk_hardlimit));
-	cmn_err(CE_DEBUG, "---- blkslimit =  %Lu (0x%x)",
+	xfs_debug(mp, "---- blkslimit =  %Lu (0x%x)",
 		be64_to_cpu(dqp->q_core.d_blk_softlimit),
 		(int)be64_to_cpu(dqp->q_core.d_blk_softlimit));
-	cmn_err(CE_DEBUG, "---- inohlimit =  %Lu (0x%x)",
+	xfs_debug(mp, "---- inohlimit =  %Lu (0x%x)",
 		be64_to_cpu(dqp->q_core.d_ino_hardlimit),
 		(int)be64_to_cpu(dqp->q_core.d_ino_hardlimit));
-	cmn_err(CE_DEBUG, "---- inoslimit =  %Lu (0x%x)",
+	xfs_debug(mp, "---- inoslimit =  %Lu (0x%x)",
 		be64_to_cpu(dqp->q_core.d_ino_softlimit),
 		(int)be64_to_cpu(dqp->q_core.d_ino_softlimit));
-	cmn_err(CE_DEBUG, "---- bcount  =  %Lu (0x%x)",
+	xfs_debug(mp, "---- bcount  =  %Lu (0x%x)",
 		be64_to_cpu(dqp->q_core.d_bcount),
 		(int)be64_to_cpu(dqp->q_core.d_bcount));
-	cmn_err(CE_DEBUG, "---- icount  =  %Lu (0x%x)",
+	xfs_debug(mp, "---- icount  =  %Lu (0x%x)",
 		be64_to_cpu(dqp->q_core.d_icount),
 		(int)be64_to_cpu(dqp->q_core.d_icount));
-	cmn_err(CE_DEBUG, "---- btimer  =  %d",
+	xfs_debug(mp, "---- btimer  =  %d",
 		(int)be32_to_cpu(dqp->q_core.d_btimer));
-	cmn_err(CE_DEBUG, "---- itimer  =  %d",
+	xfs_debug(mp, "---- itimer  =  %d",
 		(int)be32_to_cpu(dqp->q_core.d_itimer));
-	cmn_err(CE_DEBUG, "---------------------------");
+	xfs_debug(mp, "---------------------------");
 }
 #endif
 
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index e34dce1ce54..254ee062bd7 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -80,7 +80,7 @@ xfs_qm_dquot_list_print(
 	int		i = 0;
 
 	list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist_lock, qi_mplist) {
-		cmn_err(CE_DEBUG, "   %d. \"%d (%s)\"   "
+		xfs_debug(mp, "   %d. \"%d (%s)\"   "
 				  "bcnt = %lld, icnt = %lld, refs = %d",
 			i++, be32_to_cpu(dqp->q_core.d_id),
 			DQFLAGTO_TYPESTR(dqp),
@@ -205,7 +205,7 @@ xfs_qm_destroy(
 	list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) {
 		xfs_dqlock(dqp);
 #ifdef QUOTADEBUG
-		cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp);
+		xfs_debug(dqp->q_mount, "FREELIST destroy 0x%p", dqp);
 #endif
 		list_del_init(&dqp->q_freelist);
 		xfs_Gqm->qm_dqfrlist_cnt--;
@@ -341,9 +341,7 @@ xfs_qm_mount_quotas(
 	 * quotas immediately.
 	 */
 	if (mp->m_sb.sb_rextents) {
-		cmn_err(CE_NOTE,
-			"Cannot turn on quotas for realtime filesystem %s",
-			mp->m_fsname);
+		xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
 		mp->m_qflags = 0;
 		goto write_changes;
 	}
@@ -1668,7 +1666,7 @@ xfs_qm_quotacheck(
 	 */
 	ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist));
 
-	cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname);
+	xfs_notice(mp, "Quotacheck needed: Please wait.");
 
 	/*
 	 * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset
@@ -1746,9 +1744,9 @@ xfs_qm_quotacheck(
 
  error_return:
 	if (error) {
-		cmn_err(CE_WARN, "XFS quotacheck %s: Unsuccessful (Error %d): "
-			"Disabling quotas.",
-			mp->m_fsname, error);
+		xfs_warn(mp,
+	"Quotacheck: Unsuccessful (Error %d): Disabling quotas.",
+			error);
 		/*
 		 * We must turn off quotas.
 		 */
@@ -1756,12 +1754,11 @@ xfs_qm_quotacheck(
 		ASSERT(xfs_Gqm != NULL);
 		xfs_qm_destroy_quotainfo(mp);
 		if (xfs_mount_reset_sbqflags(mp)) {
-			cmn_err(CE_WARN, "XFS quotacheck %s: "
-				"Failed to reset quota flags.", mp->m_fsname);
+			xfs_warn(mp,
+				"Quotacheck: Failed to reset quota flags.");
 		}
-	} else {
-		cmn_err(CE_NOTE, "XFS quotacheck %s: Done.", mp->m_fsname);
-	}
+	} else
+		xfs_notice(mp, "Quotacheck: Done.");
 	return (error);
 }
 
@@ -2107,7 +2104,7 @@ xfs_qm_write_sb_changes(
 	int		error;
 
 #ifdef QUOTADEBUG
-	cmn_err(CE_NOTE, "Writing superblock quota changes :%s", mp->m_fsname);
+	xfs_notice(mp, "Writing superblock quota changes");
 #endif
 	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
 	if ((error = xfs_trans_reserve(tp, 0,
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index 45b5cb1788a..774d7ec6df8 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -119,8 +119,7 @@ xfs_qm_newmount(
 	     (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) ||
 	    (!gquotaondisk &&  XFS_IS_OQUOTA_ON(mp)))  &&
 	    xfs_dev_is_read_only(mp, "changing quota state")) {
-		cmn_err(CE_WARN,
-			"XFS: please mount with%s%s%s%s.",
+		xfs_warn(mp, "please mount with%s%s%s%s.",
 			(!quotaondisk ? "out quota" : ""),
 			(uquotaondisk ? " usrquota" : ""),
 			(pquotaondisk ? " prjquota" : ""),
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 603ab867259..c82f06778a2 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -936,10 +936,11 @@ struct mutex  qcheck_lock;
 #define DQTEST_LIST_PRINT(l, NXT, title) \
 { \
 	  xfs_dqtest_t	*dqp; int i = 0;\
-	  cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \
+	  xfs_debug(NULL, "%s (#%d)", title, (int) (l)->qh_nelems); \
 	  for (dqp = (xfs_dqtest_t *)(l)->qh_next; dqp != NULL; \
 	       dqp = (xfs_dqtest_t *)dqp->NXT) { \
-		cmn_err(CE_DEBUG, "  %d. \"%d (%s)\"  bcnt = %d, icnt = %d", \
+		xfs_debug(dqp->q_mount,		\
+			"  %d. \"%d (%s)\"  bcnt = %d, icnt = %d", \
 			 ++i, dqp->d_id, DQFLAGTO_TYPESTR(dqp),	     \
 			 dqp->d_bcount, dqp->d_icount); } \
 }
@@ -963,16 +964,17 @@ xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp)
 }
 STATIC void
 xfs_qm_dqtest_print(
-	xfs_dqtest_t	*d)
+	struct xfs_mount	*mp,
+	struct dqtest		*d)
 {
-	cmn_err(CE_DEBUG, "-----------DQTEST DQUOT----------------");
-	cmn_err(CE_DEBUG, "---- dquot ID = %d", d->d_id);
-	cmn_err(CE_DEBUG, "---- fs       = 0x%p", d->q_mount);
-	cmn_err(CE_DEBUG, "---- bcount   = %Lu (0x%x)",
+	xfs_debug(mp, "-----------DQTEST DQUOT----------------");
+	xfs_debug(mp, "---- dquot ID = %d", d->d_id);
+	xfs_debug(mp, "---- fs       = 0x%p", d->q_mount);
+	xfs_debug(mp, "---- bcount   = %Lu (0x%x)",
 		d->d_bcount, (int)d->d_bcount);
-	cmn_err(CE_DEBUG, "---- icount   = %Lu (0x%x)",
+	xfs_debug(mp, "---- icount   = %Lu (0x%x)",
 		d->d_icount, (int)d->d_icount);
-	cmn_err(CE_DEBUG, "---------------------------");
+	xfs_debug(mp, "---------------------------");
 }
 
 STATIC void
@@ -986,12 +988,14 @@ xfs_qm_dqtest_failed(
 {
 	qmtest_nfails++;
 	if (error)
-		cmn_err(CE_DEBUG, "quotacheck failed id=%d, err=%d\nreason: %s",
-		       d->d_id, error, reason);
+		xfs_debug(dqp->q_mount,
+			"quotacheck failed id=%d, err=%d\nreason: %s",
+			d->d_id, error, reason);
 	else
-		cmn_err(CE_DEBUG, "quotacheck failed id=%d (%s) [%d != %d]",
-		       d->d_id, reason, (int)a, (int)b);
-	xfs_qm_dqtest_print(d);
+		xfs_debug(dqp->q_mount,
+			"quotacheck failed id=%d (%s) [%d != %d]",
+			d->d_id, reason, (int)a, (int)b);
+	xfs_qm_dqtest_print(dqp->q_mount, d);
 	if (dqp)
 		xfs_qm_dqprint(dqp);
 }
@@ -1018,9 +1022,9 @@ xfs_dqtest_cmp2(
 	    be64_to_cpu(dqp->q_core.d_bcount) >=
 	    be64_to_cpu(dqp->q_core.d_blk_softlimit)) {
 		if (!dqp->q_core.d_btimer && dqp->q_core.d_id) {
-			cmn_err(CE_DEBUG,
-				"%d [%s] [0x%p] BLK TIMER NOT STARTED",
-				d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount);
+			xfs_debug(dqp->q_mount,
+				"%d [%s] BLK TIMER NOT STARTED",
+				d->d_id, DQFLAGTO_TYPESTR(d));
 			err++;
 		}
 	}
@@ -1028,16 +1032,16 @@ xfs_dqtest_cmp2(
 	    be64_to_cpu(dqp->q_core.d_icount) >=
 	    be64_to_cpu(dqp->q_core.d_ino_softlimit)) {
 		if (!dqp->q_core.d_itimer && dqp->q_core.d_id) {
-			cmn_err(CE_DEBUG,
-				"%d [%s] [0x%p] INO TIMER NOT STARTED",
-				d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount);
+			xfs_debug(dqp->q_mount,
+				"%d [%s] INO TIMER NOT STARTED",
+				d->d_id, DQFLAGTO_TYPESTR(d));
 			err++;
 		}
 	}
 #ifdef QUOTADEBUG
 	if (!err) {
-		cmn_err(CE_DEBUG, "%d [%s] [0x%p] qchecked",
-			d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount);
+		xfs_debug(dqp->q_mount, "%d [%s] qchecked",
+			d->d_id, DQFLAGTO_TYPESTR(d));
 	}
 #endif
 	return (err);
@@ -1220,12 +1224,12 @@ xfs_qm_internalqcheck(
 				 xfs_qm_internalqcheck_adjust,
 				 0, NULL, &done);
 		if (error) {
-			cmn_err(CE_DEBUG, "Bulkstat returned error 0x%x", error);
+			xfs_debug(mp, "Bulkstat returned error 0x%x", error);
 			break;
 		}
 	} while (!done);
 
-	cmn_err(CE_DEBUG, "Checking results against system dquots");
+	xfs_debug(mp, "Checking results against system dquots");
 	for (i = 0; i < qmtest_hashmask; i++) {
 		xfs_dqtest_t	*d, *n;
 		xfs_dqhash_t	*h;
@@ -1243,10 +1247,10 @@ xfs_qm_internalqcheck(
 	}
 
 	if (qmtest_nfails) {
-		cmn_err(CE_DEBUG, "******** quotacheck failed  ********");
-		cmn_err(CE_DEBUG, "failures = %d", qmtest_nfails);
+		xfs_debug(mp, "******** quotacheck failed  ********");
+		xfs_debug(mp, "failures = %d", qmtest_nfails);
 	} else {
-		cmn_err(CE_DEBUG, "******** quotacheck successful! ********");
+		xfs_debug(mp, "******** quotacheck successful! ********");
 	}
 	kmem_free(qmtest_udqtab);
 	kmem_free(qmtest_gdqtab);
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 7de91d1b75c..2a364873133 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -643,8 +643,9 @@ xfs_trans_dqresv(
 	     (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) &&
 	      (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) {
 #ifdef QUOTADEBUG
-		cmn_err(CE_DEBUG, "BLK Res: nblks=%ld + resbcount=%Ld"
-			  " > hardlimit=%Ld?", nblks, *resbcountp, hardlimit);
+		xfs_debug(mp,
+			"BLK Res: nblks=%ld + resbcount=%Ld > hardlimit=%Ld?",
+			nblks, *resbcountp, hardlimit);
 #endif
 		if (nblks > 0) {
 			/*
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index a1c7141af48..79fae3b33bd 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -18,32 +18,6 @@
 #include <xfs.h>
 #include "debug.h"
 
-/* xfs_mount.h drags a lot of crap in, sorry.. */
-#include "xfs_sb.h"
-#include "xfs_inum.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_error.h"
-
-void
-cmn_err(
-	const char	*lvl,
-	const char	*fmt,
-	...)
-{
-	struct va_format vaf;
-	va_list		args;
-
-	va_start(args, fmt);
-	vaf.fmt = fmt;
-	vaf.va = &args;
-
-	printk("%s%pV", lvl, &vaf);
-	va_end(args);
-
-	BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
-}
-
 void
 assfail(char *expr, char *file, int line)
 {
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
index 4a082b9a116..db36be48a43 100644
--- a/fs/xfs/support/debug.h
+++ b/fs/xfs/support/debug.h
@@ -20,18 +20,6 @@
 
 #include <stdarg.h>
 
-struct xfs_mount;
-
-#define CE_DEBUG        KERN_DEBUG
-#define CE_CONT         KERN_INFO
-#define CE_NOTE         KERN_NOTICE
-#define CE_WARN         KERN_WARNING
-#define CE_ALERT        KERN_ALERT
-#define CE_PANIC        KERN_EMERG
-
-void cmn_err(const char *lvl, const char *fmt, ...)
-		__attribute__ ((format (printf, 2, 3)));
-
 extern void assfail(char *expr, char *f, int l);
 
 #define ASSERT_ALWAYS(expr)	\
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 92612f6b4b3..fa00788de2f 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5778,7 +5778,7 @@ xfs_check_block(
 			else
 				thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
 			if (*thispa == *pp) {
-				cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld",
+				xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld",
 					__func__, j, i,
 					(unsigned long long)be64_to_cpu(*thispa));
 				panic("%s: ptrs are equal in node\n",
@@ -5943,11 +5943,11 @@ xfs_bmap_check_leaf_extents(
 	return;
 
 error0:
-	cmn_err(CE_WARN, "%s: at error0", __func__);
+	xfs_warn(mp, "%s: at error0", __func__);
 	if (bp_release)
 		xfs_trans_brelse(NULL, bp);
 error_norelse:
-	cmn_err(CE_WARN, "%s: BAD after btree leaves for %d extents",
+	xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
 		__func__, i);
 	panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
 	return;
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 6f8c21ce0d6..e5413d96f1a 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -130,10 +130,12 @@ xfs_buf_item_log_check(
 	orig = bip->bli_orig;
 	buffer = XFS_BUF_PTR(bp);
 	for (x = 0; x < XFS_BUF_COUNT(bp); x++) {
-		if (orig[x] != buffer[x] && !btst(bip->bli_logged, x))
-			cmn_err(CE_PANIC,
-	"xfs_buf_item_log_check bip %x buffer %x orig %x index %d",
-				bip, bp, orig, x);
+		if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) {
+			xfs_emerg(bp->b_mount,
+				"%s: bip %x buffer %x orig %x index %d",
+				__func__, bip, bp, orig, x);
+			ASSERT(0);
+		}
 	}
 }
 #else
@@ -983,10 +985,9 @@ xfs_buf_iodone_callbacks(
 	if (XFS_BUF_TARGET(bp) != lasttarg ||
 	    time_after(jiffies, (lasttime + 5*HZ))) {
 		lasttime = jiffies;
-		cmn_err(CE_ALERT, "Device %s, XFS metadata write error"
-				" block 0x%llx in %s",
+		xfs_alert(mp, "Device %s: metadata write error block 0x%llx",
 			XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
-		      (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname);
+		      (__uint64_t)XFS_BUF_ADDR(bp));
 	}
 	lasttarg = XFS_BUF_TARGET(bp);
 
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 1c00bedb317..6102ac6d1df 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1995,13 +1995,12 @@ xfs_da_do_buf(
 		error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED);
 		if (unlikely(error == EFSCORRUPTED)) {
 			if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
-				cmn_err(CE_ALERT, "xfs_da_do_buf: bno %lld\n",
-					(long long)bno);
-				cmn_err(CE_ALERT, "dir: inode %lld\n",
+				xfs_alert(mp, "%s: bno %lld dir: inode %lld",
+					__func__, (long long)bno,
 					(long long)dp->i_ino);
 				for (i = 0; i < nmap; i++) {
-					cmn_err(CE_ALERT,
-						"[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d\n",
+					xfs_alert(mp,
+"[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d",
 						i,
 						(long long)mapp[i].br_startoff,
 						(long long)mapp[i].br_startblock,
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index f9a0864b696..a0aab7d3294 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -899,10 +899,9 @@ xfs_dir2_leafn_rebalance(
 	if(blk2->index < 0) {
 		state->inleaf = 1;
 		blk2->index = 0;
-		cmn_err(CE_ALERT,
-			"xfs_dir2_leafn_rebalance: picked the wrong leaf? reverting original leaf: "
-			"blk1->index %d\n",
-			blk1->index);
+		xfs_alert(args->dp->i_mount,
+	"%s: picked the wrong leaf? reverting original leaf: blk1->index %d\n",
+			__func__, blk1->index);
 	}
 }
 
@@ -1641,26 +1640,22 @@ xfs_dir2_node_addname_int(
 			}
 
 			if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) {
-				cmn_err(CE_ALERT,
-					"xfs_dir2_node_addname_int: dir ino "
-					"%llu needed freesp block %lld for\n"
-					"  data block %lld, got %lld\n"
-					"  ifbno %llu lastfbno %d\n",
-					(unsigned long long)dp->i_ino,
+				xfs_alert(mp,
+			"%s: dir ino " "%llu needed freesp block %lld for\n"
+			"  data block %lld, got %lld ifbno %llu lastfbno %d",
+					__func__, (unsigned long long)dp->i_ino,
 					(long long)xfs_dir2_db_to_fdb(mp, dbno),
 					(long long)dbno, (long long)fbno,
 					(unsigned long long)ifbno, lastfbno);
 				if (fblk) {
-					cmn_err(CE_ALERT,
-						" fblk 0x%p blkno %llu "
-						"index %d magic 0x%x\n",
+					xfs_alert(mp,
+				" fblk 0x%p blkno %llu index %d magic 0x%x",
 						fblk,
 						(unsigned long long)fblk->blkno,
 						fblk->index,
 						fblk->magic);
 				} else {
-					cmn_err(CE_ALERT,
-						" ... fblk is NULL\n");
+					xfs_alert(mp, " ... fblk is NULL");
 				}
 				XFS_ERROR_REPORT("xfs_dir2_node_addname_int",
 						 XFS_ERRLEVEL_LOW, mp);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 03028906f00..39f06336b99 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -48,7 +48,7 @@ xfs_error_trap(int e)
 			break;
 		if (e != xfs_etrap[i])
 			continue;
-		cmn_err(CE_NOTE, "xfs_error_trap: error %d", e);
+		xfs_notice(NULL, "%s: error %d", __func__, e);
 		BUG();
 		break;
 	}
@@ -74,7 +74,7 @@ xfs_error_test(int error_tag, int *fsidp, char *expression,
 
 	for (i = 0; i < XFS_NUM_INJECT_ERROR; i++)  {
 		if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) {
-			cmn_err(CE_WARN,
+			xfs_warn(NULL,
 	"Injecting error (%s) at file %s, line %d, on filesystem \"%s\"",
 				expression, file, line, xfs_etest_fsname[i]);
 			return 1;
@@ -95,14 +95,14 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp)
 
 	for (i = 0; i < XFS_NUM_INJECT_ERROR; i++)  {
 		if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) {
-			cmn_err(CE_WARN, "XFS error tag #%d on", error_tag);
+			xfs_warn(mp, "error tag #%d on", error_tag);
 			return 0;
 		}
 	}
 
 	for (i = 0; i < XFS_NUM_INJECT_ERROR; i++)  {
 		if (xfs_etest[i] == 0) {
-			cmn_err(CE_WARN, "Turned on XFS error tag #%d",
+			xfs_warn(mp, "Turned on XFS error tag #%d",
 				error_tag);
 			xfs_etest[i] = error_tag;
 			xfs_etest_fsid[i] = fsid;
@@ -114,7 +114,7 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp)
 		}
 	}
 
-	cmn_err(CE_WARN, "error tag overflow, too many turned on");
+	xfs_warn(mp, "error tag overflow, too many turned on");
 
 	return 1;
 }
@@ -133,7 +133,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
 		if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) &&
 		     xfs_etest[i] != 0) {
 			cleared = 1;
-			cmn_err(CE_WARN, "Clearing XFS error tag #%d",
+			xfs_warn(mp, "Clearing XFS error tag #%d",
 				xfs_etest[i]);
 			xfs_etest[i] = 0;
 			xfs_etest_fsid[i] = 0LL;
@@ -144,9 +144,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
 	}
 
 	if (loud || cleared)
-		cmn_err(CE_WARN,
-			"Cleared all XFS error tags for filesystem \"%s\"",
-			mp->m_fsname);
+		xfs_warn(mp, "Cleared all XFS error tags for filesystem");
 
 	return 0;
 }
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index fc3a2cb2c07..84ebeec1664 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -1055,28 +1055,23 @@ xfs_difree(
 	 */
 	agno = XFS_INO_TO_AGNO(mp, inode);
 	if (agno >= mp->m_sb.sb_agcount)  {
-		cmn_err(CE_WARN,
-			"xfs_difree: agno >= mp->m_sb.sb_agcount (%d >= %d) on %s.  Returning EINVAL.",
-			agno, mp->m_sb.sb_agcount, mp->m_fsname);
+		xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
+			__func__, agno, mp->m_sb.sb_agcount);
 		ASSERT(0);
 		return XFS_ERROR(EINVAL);
 	}
 	agino = XFS_INO_TO_AGINO(mp, inode);
 	if (inode != XFS_AGINO_TO_INO(mp, agno, agino))  {
-		cmn_err(CE_WARN,
-			"xfs_difree: inode != XFS_AGINO_TO_INO() "
-			"(%llu != %llu) on %s.  Returning EINVAL.",
-			(unsigned long long)inode,
-			(unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino),
-			mp->m_fsname);
+		xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
+			__func__, (unsigned long long)inode,
+			(unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
 		ASSERT(0);
 		return XFS_ERROR(EINVAL);
 	}
 	agbno = XFS_AGINO_TO_AGBNO(mp, agino);
 	if (agbno >= mp->m_sb.sb_agblocks)  {
-		cmn_err(CE_WARN,
-			"xfs_difree: agbno >= mp->m_sb.sb_agblocks (%d >= %d) on %s.  Returning EINVAL.",
-			agbno, mp->m_sb.sb_agblocks, mp->m_fsname);
+		xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
+			__func__, agbno, mp->m_sb.sb_agblocks);
 		ASSERT(0);
 		return XFS_ERROR(EINVAL);
 	}
@@ -1085,9 +1080,8 @@ xfs_difree(
 	 */
 	error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
 	if (error) {
-		cmn_err(CE_WARN,
-			"xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s.  Returning error.",
-			error, mp->m_fsname);
+		xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
+			__func__, error);
 		return error;
 	}
 	agi = XFS_BUF_TO_AGI(agbp);
@@ -1106,17 +1100,15 @@ xfs_difree(
 	 * Look for the entry describing this inode.
 	 */
 	if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
-		cmn_err(CE_WARN,
-			"xfs_difree: xfs_inobt_lookup returned()  an error %d on %s.  Returning error.",
-			error, mp->m_fsname);
+		xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.",
+			__func__, error);
 		goto error0;
 	}
 	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 	error = xfs_inobt_get_rec(cur, &rec, &i);
 	if (error) {
-		cmn_err(CE_WARN,
-			"xfs_difree: xfs_inobt_get_rec()  returned an error %d on %s.  Returning error.",
-			error, mp->m_fsname);
+		xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
+			__func__, error);
 		goto error0;
 	}
 	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
@@ -1157,8 +1149,8 @@ xfs_difree(
 		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
 
 		if ((error = xfs_btree_delete(cur, &i))) {
-			cmn_err(CE_WARN, "xfs_difree: xfs_btree_delete returned an error %d on %s.\n",
-				error, mp->m_fsname);
+			xfs_warn(mp, "%s: xfs_btree_delete returned error %d.",
+				__func__, error);
 			goto error0;
 		}
 
@@ -1170,9 +1162,8 @@ xfs_difree(
 
 		error = xfs_inobt_update(cur, &rec);
 		if (error) {
-			cmn_err(CE_WARN,
-	"xfs_difree: xfs_inobt_update returned an error %d on %s.",
-				error, mp->m_fsname);
+			xfs_warn(mp, "%s: xfs_inobt_update returned error %d.",
+				__func__, error);
 			goto error0;
 		}
 
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index d820ada49b1..da871f53223 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -142,10 +142,9 @@ xfs_imap_to_bp(
 				   (int)imap->im_len, buf_flags, &bp);
 	if (error) {
 		if (error != EAGAIN) {
-			cmn_err(CE_WARN,
-				"xfs_imap_to_bp: xfs_trans_read_buf()returned "
-				"an error %d on %s.  Returning error.",
-				error, mp->m_fsname);
+			xfs_warn(mp,
+				"%s: xfs_trans_read_buf() returned error %d.",
+				__func__, error);
 		} else {
 			ASSERT(buf_flags & XBF_TRYLOCK);
 		}
@@ -180,12 +179,11 @@ xfs_imap_to_bp(
 			XFS_CORRUPTION_ERROR("xfs_imap_to_bp",
 						XFS_ERRLEVEL_HIGH, mp, dip);
 #ifdef DEBUG
-			cmn_err(CE_PANIC,
-					"Device %s - bad inode magic/vsn "
-					"daddr %lld #%d (magic=%x)",
-				XFS_BUFTARG_NAME(mp->m_ddev_targp),
+			xfs_emerg(mp,
+				"bad inode magic/vsn daddr %lld #%d (magic=%x)",
 				(unsigned long long)imap->im_blkno, i,
 				be16_to_cpu(dip->di_magic));
+			ASSERT(0);
 #endif
 			xfs_trans_brelse(tp, bp);
 			return XFS_ERROR(EFSCORRUPTED);
@@ -1811,9 +1809,8 @@ xfs_iunlink_remove(
 		 */
 		error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
 		if (error) {
-			cmn_err(CE_WARN,
-				"xfs_iunlink_remove: xfs_itobp()  returned an error %d on %s.  Returning error.",
-				error, mp->m_fsname);
+			xfs_warn(mp, "%s: xfs_itobp() returned error %d.",
+				__func__, error);
 			return error;
 		}
 		next_agino = be32_to_cpu(dip->di_next_unlinked);
@@ -1858,9 +1855,9 @@ xfs_iunlink_remove(
 			error = xfs_inotobp(mp, tp, next_ino, &last_dip,
 					    &last_ibp, &last_offset, 0);
 			if (error) {
-				cmn_err(CE_WARN,
-			"xfs_iunlink_remove: xfs_inotobp()  returned an error %d on %s.  Returning error.",
-					error, mp->m_fsname);
+				xfs_warn(mp,
+					"%s: xfs_inotobp() returned error %d.",
+					__func__, error);
 				return error;
 			}
 			next_agino = be32_to_cpu(last_dip->di_next_unlinked);
@@ -1873,9 +1870,8 @@ xfs_iunlink_remove(
 		 */
 		error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
 		if (error) {
-			cmn_err(CE_WARN,
-				"xfs_iunlink_remove: xfs_itobp()  returned an error %d on %s.  Returning error.",
-				error, mp->m_fsname);
+			xfs_warn(mp, "%s: xfs_itobp(2) returned error %d.",
+				__func__, error);
 			return error;
 		}
 		next_agino = be32_to_cpu(dip->di_next_unlinked);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index e39b082eb04..bb3f9a7b24e 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -133,9 +133,7 @@ xfs_uuid_mount(
 		return 0;
 
 	if (uuid_is_nil(uuid)) {
-		cmn_err(CE_WARN,
-			"XFS: Filesystem %s has nil UUID - can't mount",
-			mp->m_fsname);
+		xfs_warn(mp, "Filesystem has nil UUID - can't mount");
 		return XFS_ERROR(EINVAL);
 	}
 
@@ -163,8 +161,7 @@ xfs_uuid_mount(
 
  out_duplicate:
 	mutex_unlock(&xfs_uuid_table_mutex);
-	cmn_err(CE_WARN, "XFS: Filesystem %s has duplicate UUID - can't mount",
-			 mp->m_fsname);
+	xfs_warn(mp, "Filesystem has duplicate UUID - can't mount");
 	return XFS_ERROR(EINVAL);
 }
 
@@ -867,8 +864,7 @@ xfs_update_alignment(xfs_mount_t *mp)
 		if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
 		    (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
 			if (mp->m_flags & XFS_MOUNT_RETERR) {
-				cmn_err(CE_WARN,
-					"XFS: alignment check 1 failed");
+				xfs_warn(mp, "alignment check 1 failed");
 				return XFS_ERROR(EINVAL);
 			}
 			mp->m_dalign = mp->m_swidth = 0;
@@ -1041,14 +1037,14 @@ xfs_check_sizes(xfs_mount_t *mp)
 
 	d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
 	if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
-		cmn_err(CE_WARN, "XFS: filesystem size mismatch detected");
+		xfs_warn(mp, "filesystem size mismatch detected");
 		return XFS_ERROR(EFBIG);
 	}
 	bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
 					d - XFS_FSS_TO_BB(mp, 1),
 					BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
 	if (!bp) {
-		cmn_err(CE_WARN, "XFS: last sector read failed");
+		xfs_warn(mp, "last sector read failed");
 		return EIO;
 	}
 	xfs_buf_relse(bp);
@@ -1056,14 +1052,14 @@ xfs_check_sizes(xfs_mount_t *mp)
 	if (mp->m_logdev_targp != mp->m_ddev_targp) {
 		d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
 		if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
-			cmn_err(CE_WARN, "XFS: log size mismatch detected");
+			xfs_warn(mp, "log size mismatch detected");
 			return XFS_ERROR(EFBIG);
 		}
 		bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp,
 					d - XFS_FSB_TO_BB(mp, 1),
 					XFS_FSB_TO_B(mp, 1), 0);
 		if (!bp) {
-			cmn_err(CE_WARN, "XFS: log device read failed");
+			xfs_warn(mp, "log device read failed");
 			return EIO;
 		}
 		xfs_buf_relse(bp);
@@ -1175,8 +1171,7 @@ xfs_mountfs(
 	 * transaction subsystem is online.
 	 */
 	if (xfs_sb_has_mismatched_features2(sbp)) {
-		cmn_err(CE_WARN,
-			"XFS: correcting sb_features alignment problem");
+		xfs_warn(mp, "correcting sb_features alignment problem");
 		sbp->sb_features2 |= sbp->sb_bad_features2;
 		sbp->sb_bad_features2 = sbp->sb_features2;
 		mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2;
@@ -1255,7 +1250,7 @@ xfs_mountfs(
 	 */
 	error = xfs_rtmount_init(mp);
 	if (error) {
-		cmn_err(CE_WARN, "XFS: RT mount failed");
+		xfs_warn(mp, "RT mount failed");
 		goto out_remove_uuid;
 	}
 
@@ -1286,12 +1281,12 @@ xfs_mountfs(
 	INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
 	error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
 	if (error) {
-		cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error);
+		xfs_warn(mp, "Failed per-ag init: %d", error);
 		goto out_remove_uuid;
 	}
 
 	if (!sbp->sb_logblocks) {
-		cmn_err(CE_WARN, "XFS: no log defined");
+		xfs_warn(mp, "no log defined");
 		XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp);
 		error = XFS_ERROR(EFSCORRUPTED);
 		goto out_free_perag;
@@ -1304,7 +1299,7 @@ xfs_mountfs(
 			      XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
 			      XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
 	if (error) {
-		cmn_err(CE_WARN, "XFS: log mount failed");
+		xfs_warn(mp, "log mount failed");
 		goto out_free_perag;
 	}
 
@@ -1341,16 +1336,14 @@ xfs_mountfs(
 	 */
 	error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip);
 	if (error) {
-		cmn_err(CE_WARN, "XFS: failed to read root inode");
+		xfs_warn(mp, "failed to read root inode");
 		goto out_log_dealloc;
 	}
 
 	ASSERT(rip != NULL);
 
 	if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) {
-		cmn_err(CE_WARN, "XFS: corrupted root inode");
-		cmn_err(CE_WARN, "Device %s - root %llu is not a directory",
-			XFS_BUFTARG_NAME(mp->m_ddev_targp),
+		xfs_warn(mp, "corrupted root inode %llu: not a directory",
 			(unsigned long long)rip->i_ino);
 		xfs_iunlock(rip, XFS_ILOCK_EXCL);
 		XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
@@ -1370,7 +1363,7 @@ xfs_mountfs(
 		/*
 		 * Free up the root inode.
 		 */
-		cmn_err(CE_WARN, "XFS: failed to read RT inodes");
+		xfs_warn(mp, "failed to read RT inodes");
 		goto out_rele_rip;
 	}
 
@@ -1382,7 +1375,7 @@ xfs_mountfs(
 	if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
 		error = xfs_mount_log_sb(mp, mp->m_update_flags);
 		if (error) {
-			cmn_err(CE_WARN, "XFS: failed to write sb changes");
+			xfs_warn(mp, "failed to write sb changes");
 			goto out_rtunmount;
 		}
 	}
@@ -1403,10 +1396,7 @@ xfs_mountfs(
 		 * quotachecked license.
 		 */
 		if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) {
-			cmn_err(CE_NOTE,
-				"XFS: resetting qflags for filesystem %s",
-				mp->m_fsname);
-
+			xfs_notice(mp, "resetting quota flags");
 			error = xfs_mount_reset_sbqflags(mp);
 			if (error)
 				return error;
@@ -1420,7 +1410,7 @@ xfs_mountfs(
 	 */
 	error = xfs_log_mount_finish(mp);
 	if (error) {
-		cmn_err(CE_WARN, "XFS: log mount finish failed");
+		xfs_warn(mp, "log mount finish failed");
 		goto out_rtunmount;
 	}
 
@@ -1449,8 +1439,8 @@ xfs_mountfs(
 		resblks = xfs_default_resblks(mp);
 		error = xfs_reserve_blocks(mp, &resblks, NULL);
 		if (error)
-			cmn_err(CE_WARN, "XFS: Unable to allocate reserve "
-				"blocks. Continuing without a reserve pool.");
+			xfs_warn(mp,
+	"Unable to allocate reserve blocks. Continuing without reserve pool.");
 	}
 
 	return 0;
@@ -1539,12 +1529,12 @@ xfs_unmountfs(
 	resblks = 0;
 	error = xfs_reserve_blocks(mp, &resblks, NULL);
 	if (error)
-		cmn_err(CE_WARN, "XFS: Unable to free reserved block pool. "
+		xfs_warn(mp, "Unable to free reserved block pool. "
 				"Freespace may not be correct on next mount.");
 
 	error = xfs_log_sbcount(mp, 1);
 	if (error)
-		cmn_err(CE_WARN, "XFS: Unable to update superblock counters. "
+		xfs_warn(mp, "Unable to update superblock counters. "
 				"Freespace may not be correct on next mount.");
 	xfs_unmountfs_writesb(mp);
 	xfs_unmountfs_wait(mp); 		/* wait for async bufs */
@@ -2027,10 +2017,8 @@ xfs_dev_is_read_only(
 	if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
 	    xfs_readonly_buftarg(mp->m_logdev_targp) ||
 	    (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
-		cmn_err(CE_NOTE,
-			"XFS: %s required on read-only device.", message);
-		cmn_err(CE_NOTE,
-			"XFS: write access unavailable, cannot proceed.");
+		xfs_notice(mp, "%s required on read-only device.", message);
+		xfs_notice(mp, "write access unavailable, cannot proceed.");
 		return EROFS;
 	}
 	return 0;
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index fbff89344ba..8f76fdff4f4 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -2210,8 +2210,8 @@ xfs_rtmount_init(
 	if (sbp->sb_rblocks == 0)
 		return 0;
 	if (mp->m_rtdev_targp == NULL) {
-		cmn_err(CE_WARN,
-	"XFS: This filesystem has a realtime volume, use rtdev=device option");
+		xfs_warn(mp,
+	"Filesystem has a realtime volume, use rtdev=device option");
 		return XFS_ERROR(ENODEV);
 	}
 	mp->m_rsumlevels = sbp->sb_rextslog + 1;
@@ -2225,7 +2225,7 @@ xfs_rtmount_init(
 	 */
 	d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
 	if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) {
-		cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu",
+		xfs_warn(mp, "realtime mount -- %llu != %llu",
 			(unsigned long long) XFS_BB_TO_FSB(mp, d),
 			(unsigned long long) mp->m_sb.sb_rblocks);
 		return XFS_ERROR(EFBIG);
@@ -2234,7 +2234,7 @@ xfs_rtmount_init(
 					d - XFS_FSB_TO_BB(mp, 1),
 					XFS_FSB_TO_B(mp, 1), 0);
 	if (!bp) {
-		cmn_err(CE_WARN, "XFS: realtime device size check failed");
+		xfs_warn(mp, "realtime device size check failed");
 		return EIO;
 	}
 	xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index ff614c29b44..09e1f4f35e9 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -154,7 +154,7 @@ xfs_rtmount_init(
 	if (mp->m_sb.sb_rblocks == 0)
 		return 0;
 
-	cmn_err(CE_WARN, "XFS: Not built with CONFIG_XFS_RT");
+	xfs_warn(mp, "Not built with CONFIG_XFS_RT");
 	return ENOSYS;
 }
 # define xfs_rtmount_inodes(m)  (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index ae6f4961341..d6d6fdfe942 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -49,9 +49,9 @@ xfs_do_force_shutdown(
 	logerror = flags & SHUTDOWN_LOG_IO_ERROR;
 
 	if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
-		cmn_err(CE_NOTE, "xfs_force_shutdown(%s,0x%x) called from "
-				 "line %d of file %s.  Return address = 0x%p",
-			mp->m_fsname, flags, lnnum, fname, __return_address);
+		xfs_notice(mp,
+	"%s(0x%x) called from line %d of file %s.  Return address = 0x%p",
+			__func__, flags, lnnum, fname, __return_address);
 	}
 	/*
 	 * No need to duplicate efforts.
@@ -86,8 +86,8 @@ xfs_do_force_shutdown(
 		}
 	}
 	if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
-		cmn_err(CE_ALERT, "Please umount the filesystem, "
-				  "and rectify the problem(s)");
+		xfs_alert(mp,
+	"Please umount the filesystem and rectify the problem(s)");
 	}
 }
 
@@ -101,10 +101,9 @@ xfs_ioerror_alert(
 	xfs_buf_t		*bp,
 	xfs_daddr_t		blkno)
 {
-	cmn_err(CE_ALERT,
- "I/O error in filesystem (\"%s\") meta-data dev %s block 0x%llx"
- "       (\"%s\") error %d buf count %zd",
-		(!mp || !mp->m_fsname) ? "(fs name not set)" : mp->m_fsname,
+	xfs_alert(mp,
+		 "I/O error occurred: meta-data dev %s block 0x%llx"
+		 "       (\"%s\") error %d buf count %zd",
 		XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
 		(__uint64_t)blkno, func,
 		XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp));
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index c47918c302a..3bea6613233 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -305,7 +305,7 @@ xfs_trans_read_buf(
 			if (xfs_error_target == target) {
 				if (((xfs_req_num++) % xfs_error_mod) == 0) {
 					xfs_buf_relse(bp);
-					cmn_err(CE_DEBUG, "Returning error!\n");
+					xfs_debug(mp, "Returning error!");
 					return XFS_ERROR(EIO);
 				}
 			}
@@ -403,7 +403,7 @@ xfs_trans_read_buf(
 				xfs_force_shutdown(tp->t_mountp,
 						   SHUTDOWN_META_IO_ERROR);
 				xfs_buf_relse(bp);
-				cmn_err(CE_DEBUG, "Returning trans error!\n");
+				xfs_debug(mp, "Returning trans error!");
 				return XFS_ERROR(EIO);
 			}
 		}
@@ -427,7 +427,7 @@ shutdown_abort:
 	 */
 #if defined(DEBUG)
 	if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp))
-		cmn_err(CE_NOTE, "about to pop assert, bp == 0x%p", bp);
+		xfs_notice(mp, "about to pop assert, bp == 0x%p", bp);
 #endif
 	ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) !=
 				     (XBF_STALE|XBF_DELWRI));
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index e919ae1e9c6..37d8146ee15 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1189,9 +1189,8 @@ xfs_inactive(
 		 * inode might be lost for a long time or forever.
 		 */
 		if (!XFS_FORCED_SHUTDOWN(mp)) {
-			cmn_err(CE_NOTE,
-		"xfs_inactive:	xfs_ifree() returned an error = %d on %s",
-				error, mp->m_fsname);
+			xfs_notice(mp, "%s: xfs_ifree returned error %d",
+				__func__, error);
 			xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
 		}
 		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
-- 
cgit v1.2.3


From 9130090b5f04f7e7bc29b944e0b1ba494fff3f98 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 7 Mar 2011 10:09:35 +1100
Subject: xfs: kill support/debug.[ch]

The remaining functionality in debug.[ch] is effectively just assert
handling, conditional debug definitions and hex dumping. The hex
dumping and assert function can be moved into the new printk module,
while the rest can be moved into top-level header files. This allows
fs/xfs/support/debug.[ch] to be completely removed from the
codebase.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/Makefile                |  4 +---
 fs/xfs/linux-2.6/xfs_linux.h   | 22 ++++++++++++++++++++-
 fs/xfs/linux-2.6/xfs_message.c | 14 +++++++++++++
 fs/xfs/linux-2.6/xfs_message.h |  4 ++++
 fs/xfs/support/debug.c         | 33 -------------------------------
 fs/xfs/support/debug.h         | 45 ------------------------------------------
 fs/xfs/xfs_error.h             |  4 ----
 7 files changed, 40 insertions(+), 86 deletions(-)
 delete mode 100644 fs/xfs/support/debug.c
 delete mode 100644 fs/xfs/support/debug.h

(limited to 'fs')

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 077784ed6a7..fee7901841e 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -111,6 +111,4 @@ xfs-y				+= $(addprefix $(XFS_LINUX)/, \
 				   xfs_xattr.o)
 
 # Objects in support/
-xfs-y				+= $(addprefix support/, \
-				   debug.o \
-				   uuid.o)
+xfs-y				+= support/uuid.o
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 1189bfcbcd3..244be9cbfe7 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -39,7 +39,6 @@
 #include <mrlock.h>
 #include <time.h>
 
-#include <support/debug.h>
 #include <support/uuid.h>
 
 #include <linux/semaphore.h>
@@ -281,4 +280,25 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
 #define __arch_pack
 #endif
 
+#define ASSERT_ALWAYS(expr)	\
+	(unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
+
+#ifndef DEBUG
+#define ASSERT(expr)	((void)0)
+
+#ifndef STATIC
+# define STATIC static noinline
+#endif
+
+#else /* DEBUG */
+
+#define ASSERT(expr)	\
+	(unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
+
+#ifndef STATIC
+# define STATIC noinline
+#endif
+
+#endif /* DEBUG */
+
 #endif /* __XFS_LINUX__ */
diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/linux-2.6/xfs_message.c
index 6f3368eec25..8fe8cf69d80 100644
--- a/fs/xfs/linux-2.6/xfs_message.c
+++ b/fs/xfs/linux-2.6/xfs_message.c
@@ -117,3 +117,17 @@ xfs_alert_tag(
 
 	return r;
 }
+
+void
+assfail(char *expr, char *file, int line)
+{
+	xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d",
+		expr, file, line);
+	BUG();
+}
+
+void
+xfs_hex_dump(void *p, int length)
+{
+	print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
+}
diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/linux-2.6/xfs_message.h
index 8d2df017530..e77ffa16745 100644
--- a/fs/xfs/linux-2.6/xfs_message.h
+++ b/fs/xfs/linux-2.6/xfs_message.h
@@ -31,4 +31,8 @@ extern int xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
 #define xfs_debug(mp, fmt, ...)	(0)
 #endif
 
+extern void assfail(char *expr, char *f, int l);
+
+extern void xfs_hex_dump(void *p, int length);
+
 #endif	/* __XFS_MESSAGE_H */
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
deleted file mode 100644
index 79fae3b33bd..00000000000
--- a/fs/xfs/support/debug.c
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include <xfs.h>
-#include "debug.h"
-
-void
-assfail(char *expr, char *file, int line)
-{
-	printk(KERN_CRIT "Assertion failed: %s, file: %s, line: %d\n", expr,
-	       file, line);
-	BUG();
-}
-
-void
-xfs_hex_dump(void *p, int length)
-{
-	print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
-}
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
deleted file mode 100644
index db36be48a43..00000000000
--- a/fs/xfs/support/debug.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef	__XFS_SUPPORT_DEBUG_H__
-#define	__XFS_SUPPORT_DEBUG_H__
-
-#include <stdarg.h>
-
-extern void assfail(char *expr, char *f, int l);
-
-#define ASSERT_ALWAYS(expr)	\
-	(unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
-
-#ifndef DEBUG
-#define ASSERT(expr)	((void)0)
-
-#ifndef STATIC
-# define STATIC static noinline
-#endif
-
-#else /* DEBUG */
-
-#define ASSERT(expr)	\
-	(unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
-
-#ifndef STATIC
-# define STATIC noinline
-#endif
-
-#endif /* DEBUG */
-#endif  /* __XFS_SUPPORT_DEBUG_H__ */
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index e8360514c25..079a367f44e 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -158,8 +158,4 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
 #define		XFS_PTAG_SHUTDOWN_LOGERROR	0x00000040
 #define		XFS_PTAG_FSBLOCK_ZERO		0x00000080
 
-struct xfs_mount;
-
-extern void xfs_hex_dump(void *p, int length);
-
 #endif	/* __XFS_ERROR_H__ */
-- 
cgit v1.2.3


From b1bf862e9dad431175a1174379476299dbfdc017 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 28 Feb 2011 09:52:08 -0500
Subject: Btrfs: fix regressions in copy_from_user handling

Commit 914ee295af418e936ec20a08c1663eaabe4cd07a fixed deadlocks in
btrfs_file_write where we would catch page faults on pages we had
locked.

But, there were a few problems:

1) The x86-32 iov_iter_copy_from_user_atomic code always fails to copy
data when the amount to copy is more than 4K and the offset to start
copying from is not page aligned.  The result was btrfs_file_write
looping forever retrying the iov_iter_copy_from_user_atomic call.

We deal with this by changing btrfs_file_write to drop down to single
page copies when iov_iter_copy_from_user_atomic starts returning failure.

2) The btrfs_file_write code was leaking delalloc reservations when
iov_iter_copy_from_user_atomic returned zero.  The looping above would
result in the entire filesystem running out of delalloc reservations and
constantly trying to flush things to disk.

3) btrfs_file_write will lock down page cache pages, make sure
any writeback is finished, do the copy_from_user and then release them.
Before the loop runs we check the first and last pages in the write to
see if they are only being partially modified.  If the start or end of
the write isn't aligned, we make sure the corresponding pages are
up to date so that we don't introduce garbage into the file.

With the copy_from_user changes, we're allowing the VM to reclaim the
pages after a partial update from copy_from_user, but we're not
making sure the page cache page is up to date when we loop around to
resume the write.

We deal with this by pushing the up to date checks down into the page
prep code.  This fits better with how the rest of file_write works.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
Reported-by: Mitch Harder <mitch.harder@sabayonlinux.org>
cc: stable@kernel.org
---
 fs/btrfs/file.c | 101 +++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 59 insertions(+), 42 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 65338a1d14a..13664b315fe 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -761,6 +761,27 @@ out:
 	return 0;
 }
 
+/*
+ * on error we return an unlocked page and the error value
+ * on success we return a locked page and 0
+ */
+static int prepare_uptodate_page(struct page *page, u64 pos)
+{
+	int ret = 0;
+
+	if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) {
+		ret = btrfs_readpage(NULL, page);
+		if (ret)
+			return ret;
+		lock_page(page);
+		if (!PageUptodate(page)) {
+			unlock_page(page);
+			return -EIO;
+		}
+	}
+	return 0;
+}
+
 /*
  * this gets pages into the page cache and locks them down, it also properly
  * waits for data=ordered extents to finish before allowing the pages to be
@@ -776,6 +797,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
 	unsigned long index = pos >> PAGE_CACHE_SHIFT;
 	struct inode *inode = fdentry(file)->d_inode;
 	int err = 0;
+	int faili = 0;
 	u64 start_pos;
 	u64 last_pos;
 
@@ -793,15 +815,24 @@ again:
 	for (i = 0; i < num_pages; i++) {
 		pages[i] = grab_cache_page(inode->i_mapping, index + i);
 		if (!pages[i]) {
-			int c;
-			for (c = i - 1; c >= 0; c--) {
-				unlock_page(pages[c]);
-				page_cache_release(pages[c]);
-			}
-			return -ENOMEM;
+			faili = i - 1;
+			err = -ENOMEM;
+			goto fail;
+		}
+
+		if (i == 0)
+			err = prepare_uptodate_page(pages[i], pos);
+		if (i == num_pages - 1)
+			err = prepare_uptodate_page(pages[i],
+						    pos + write_bytes);
+		if (err) {
+			page_cache_release(pages[i]);
+			faili = i - 1;
+			goto fail;
 		}
 		wait_on_page_writeback(pages[i]);
 	}
+	err = 0;
 	if (start_pos < inode->i_size) {
 		struct btrfs_ordered_extent *ordered;
 		lock_extent_bits(&BTRFS_I(inode)->io_tree,
@@ -841,6 +872,14 @@ again:
 		WARN_ON(!PageLocked(pages[i]));
 	}
 	return 0;
+fail:
+	while (faili >= 0) {
+		unlock_page(pages[faili]);
+		page_cache_release(pages[faili]);
+		faili--;
+	}
+	return err;
+
 }
 
 static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
@@ -850,7 +889,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct page *pinned[2];
 	struct page **pages = NULL;
 	struct iov_iter i;
 	loff_t *ppos = &iocb->ki_pos;
@@ -871,9 +909,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 	will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
 		      (file->f_flags & O_DIRECT));
 
-	pinned[0] = NULL;
-	pinned[1] = NULL;
-
 	start_pos = pos;
 
 	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
@@ -961,32 +996,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 	first_index = pos >> PAGE_CACHE_SHIFT;
 	last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
 
-	/*
-	 * there are lots of better ways to do this, but this code
-	 * makes sure the first and last page in the file range are
-	 * up to date and ready for cow
-	 */
-	if ((pos & (PAGE_CACHE_SIZE - 1))) {
-		pinned[0] = grab_cache_page(inode->i_mapping, first_index);
-		if (!PageUptodate(pinned[0])) {
-			ret = btrfs_readpage(NULL, pinned[0]);
-			BUG_ON(ret);
-			wait_on_page_locked(pinned[0]);
-		} else {
-			unlock_page(pinned[0]);
-		}
-	}
-	if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
-		pinned[1] = grab_cache_page(inode->i_mapping, last_index);
-		if (!PageUptodate(pinned[1])) {
-			ret = btrfs_readpage(NULL, pinned[1]);
-			BUG_ON(ret);
-			wait_on_page_locked(pinned[1]);
-		} else {
-			unlock_page(pinned[1]);
-		}
-	}
-
 	while (iov_iter_count(&i) > 0) {
 		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
 		size_t write_bytes = min(iov_iter_count(&i),
@@ -1023,8 +1032,20 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 
 		copied = btrfs_copy_from_user(pos, num_pages,
 					   write_bytes, pages, &i);
-		dirty_pages = (copied + offset + PAGE_CACHE_SIZE - 1) >>
-				PAGE_CACHE_SHIFT;
+
+		/*
+		 * if we have trouble faulting in the pages, fall
+		 * back to one page at a time
+		 */
+		if (copied < write_bytes)
+			nrptrs = 1;
+
+		if (copied == 0)
+			dirty_pages = 0;
+		else
+			dirty_pages = (copied + offset +
+				       PAGE_CACHE_SIZE - 1) >>
+				       PAGE_CACHE_SHIFT;
 
 		if (num_pages > dirty_pages) {
 			if (copied > 0)
@@ -1068,10 +1089,6 @@ out:
 		err = ret;
 
 	kfree(pages);
-	if (pinned[0])
-		page_cache_release(pinned[0]);
-	if (pinned[1])
-		page_cache_release(pinned[1]);
 	*ppos = pos;
 
 	/*
-- 
cgit v1.2.3


From 31339acd07b4ba687906702085127895a56eb920 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 7 Mar 2011 11:10:24 -0500
Subject: Btrfs: deal with short returns from copy_from_user

When copy_from_user is only able to copy some of the bytes we requested,
we may end up creating a partially up to date page.  To avoid garbage in
the page, we need to treat a partial copy as a zero length copy.

This makes the rest of the file_write code drop the page and
retry the whole copy instead of marking the partially up to
date page as dirty.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
cc: stable@kernel.org
---
 fs/btrfs/file.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 13664b315fe..ab22ca4f237 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -69,6 +69,19 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
 
 		/* Flush processor's dcache for this page */
 		flush_dcache_page(page);
+
+		/*
+		 * if we get a partial write, we can end up with
+		 * partially up to date pages.  These add
+		 * a lot of complexity, so make sure they don't
+		 * happen by forcing this copy to be retried.
+		 *
+		 * The rest of the btrfs_file_write code will fall
+		 * back to page at a time copies after we return 0.
+		 */
+		if (!PageUptodate(page) && copied < count)
+			copied = 0;
+
 		iov_iter_advance(i, copied);
 		write_bytes -= copied;
 		total_copied += copied;
-- 
cgit v1.2.3


From d7433142b63d727b5a217c37b1a1468b116a9771 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Fri, 4 Mar 2011 16:04:08 -0600
Subject: ext3: Always set dx_node's fake_dirent explicitly.

(crossport of 1f7bebb9e911d870fa8f997ddff838e82b5715ea
by Andreas Schlick <schlick@lavabit.com>)

When ext3_dx_add_entry() has to split an index node, it has to ensure that
name_len of dx_node's fake_dirent is also zero, because otherwise e2fsck
won't recognise it as an intermediate htree node and consider the htree to
be corrupted.

CC: stable@kernel.org
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/namei.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index b27ba71810e..75c968eaf90 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1540,8 +1540,8 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			goto cleanup;
 		node2 = (struct dx_node *)(bh2->b_data);
 		entries2 = node2->entries;
+		memset(&node2->fake, 0, sizeof(struct fake_dirent));
 		node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize);
-		node2->fake.inode = 0;
 		BUFFER_TRACE(frame->bh, "get_write_access");
 		err = ext3_journal_get_write_access(handle, frame->bh);
 		if (err)
-- 
cgit v1.2.3


From 32b007b4e19b50ff4d27ea8b69cd6d744cfec86b Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Sun, 6 Mar 2011 19:11:03 -0500
Subject: nfsd4: fix bad pointer on failure to find delegation

In case of a nonempty list, the return on error here is obviously bogus;
it ends up being a pointer to the list head instead of to any valid
delegation on the list.

In particular, if nfsd4_delegreturn() hits this case, and you're quite unlucky,
then renew_client may oops, and it may take an embarrassingly long time to
figure out why.  Facepalm.

BUG: unable to handle kernel NULL pointer dereference at 0000000000000090
IP: [<ffffffff81292965>] nfsd4_delegreturn+0x125/0x200
...

Cc: stable@kernel.org
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 54b60bfceb8..7b566ec14e1 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2445,15 +2445,16 @@ nfs4_check_delegmode(struct nfs4_delegation *dp, int flags)
 static struct nfs4_delegation *
 find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
 {
-	struct nfs4_delegation *dp = NULL;
+	struct nfs4_delegation *dp;
 
 	spin_lock(&recall_lock);
-	list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) {
-		if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid)
-			break;
-	}
+	list_for_each_entry(dp, &fp->fi_delegations, dl_perfile)
+		if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) {
+			spin_unlock(&recall_lock);
+			return dp;
+		}
 	spin_unlock(&recall_lock);
-	return dp;
+	return NULL;
 }
 
 int share_access_to_flags(u32 share_access)
-- 
cgit v1.2.3


From f32cb53219a956b96a4cd1ee7c6b1b8a48d40e9f Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Mon, 17 Jan 2011 15:45:59 +0900
Subject: locks: use assign_type()

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/locks.c | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/locks.c b/fs/locks.c
index 0f3998291f7..2c2d3b804d6 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -415,17 +415,7 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,
 	fl->fl_ops = NULL;
 	fl->fl_lmops = NULL;
 
-	switch (l->l_type) {
-	case F_RDLCK:
-	case F_WRLCK:
-	case F_UNLCK:
-		fl->fl_type = l->l_type;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	return (0);
+	return assign_type(fl, l->l_type);
 }
 #endif
 
-- 
cgit v1.2.3


From 35079582e72efcabf7c70f3d3ee4f96e6f196606 Mon Sep 17 00:00:00 2001
From: Shan Wei <shanwei@cn.fujitsu.com>
Date: Fri, 14 Jan 2011 17:35:59 +0800
Subject: nfsd: kill unused macro definition

These macros had never been used for several years.
So, remove them.

Signed-off-by: Shan Wei <shanwei@cn.fujitsu.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/export.c    | 1 -
 fs/nfsd/nfs4idmap.c | 1 -
 fs/nfsd/nfs4state.c | 2 +-
 3 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 8b31e5f8795..ad000aeb21a 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -299,7 +299,6 @@ svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old)
 
 #define	EXPORT_HASHBITS		8
 #define	EXPORT_HASHMAX		(1<< EXPORT_HASHBITS)
-#define	EXPORT_HASHMASK		(EXPORT_HASHMAX -1)
 
 static struct cache_head *export_table[EXPORT_HASHMAX];
 
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 6d2c397d458..55780a22fdb 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -63,7 +63,6 @@ struct ent {
 
 #define ENT_HASHBITS          8
 #define ENT_HASHMAX           (1 << ENT_HASHBITS)
-#define ENT_HASHMASK          (ENT_HASHMAX - 1)
 
 static void
 ent_init(struct cache_head *cnew, struct cache_head *citm)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 54b60bfceb8..c4f2b0f63e4 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -148,7 +148,7 @@ static struct list_head	ownerstr_hashtbl[OWNER_HASH_SIZE];
 /* hash table for nfs4_file */
 #define FILE_HASH_BITS                   8
 #define FILE_HASH_SIZE                  (1 << FILE_HASH_BITS)
-#define FILE_HASH_MASK                  (FILE_HASH_SIZE - 1)
+
 /* hash table for (open)nfs4_stateid */
 #define STATEID_HASH_BITS              10
 #define STATEID_HASH_SIZE              (1 << STATEID_HASH_BITS)
-- 
cgit v1.2.3


From 46d4cef9cf54f2f8b15216e3f6dad69750c69e0c Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jj@chaosbits.net>
Date: Sun, 6 Mar 2011 00:30:35 +0100
Subject: NFSD, VFS: Remove dead code in nfsd_rename()

Currently we have the following code in fs/nfsd/vfs.c::nfsd_rename() :

	...
	host_err = nfsd_break_lease(odentry->d_inode);
	if (host_err)
		goto out_drop_write;
	if (ndentry->d_inode) {
		host_err = nfsd_break_lease(ndentry->d_inode);
		if (host_err)
			goto out_drop_write;
	}
	if (host_err)
		goto out_drop_write;
	...

'host_err' is guaranteed to be 0 by the time we test 'ndentry->d_inode'.
If 'host_err' becomes != 0 inside the 'if' statement, then we goto
'out_drop_write'. So, after the 'if' statement there is no way that
'host_err' can be anything but 0, so the test afterwards is just dead
code.
This patch removes the dead code.

Signed-off-by: Jesper Juhl <jj@chaosbits.net>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/vfs.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index da1d9701f8e..9cc626b70fb 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1749,8 +1749,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 		if (host_err)
 			goto out_drop_write;
 	}
-	if (host_err)
-		goto out_drop_write;
 	host_err = vfs_rename(fdir, odentry, tdir, ndentry);
 	if (!host_err) {
 		host_err = commit_metadata(tfhp);
-- 
cgit v1.2.3


From b0b0c0a26e846ae6646af9f59a3d2ea06b49cbc7 Mon Sep 17 00:00:00 2001
From: Kevin Coffman <kwc@citi.umich.edu>
Date: Wed, 2 Mar 2011 19:51:42 -0500
Subject: nfsd: add proc file listing kernel's gss_krb5 enctypes

Add a new proc file which lists the encryption types supported
by the kernel's gss_krb5 code.

Newer MIT Kerberos libraries support the assertion of acceptor
subkeys.  This enctype information allows user-land (svcgssd)
to request that the Kerberos libraries limit the encryption
types that it uses when generating the subkeys.

Signed-off-by: Kevin Coffman <kwc@citi.umich.edu>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfsctl.c | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 33b3e2b0677..35dcfa8eba2 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -12,13 +12,14 @@
 #include <linux/nfsd/syscall.h>
 #include <linux/lockd/lockd.h>
 #include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/gss_api.h>
 
 #include "idmap.h"
 #include "nfsd.h"
 #include "cache.h"
 
 /*
- *	We have a single directory with 9 nodes in it.
+ *	We have a single directory with several nodes in it.
  */
 enum {
 	NFSD_Root = 1,
@@ -42,6 +43,7 @@ enum {
 	NFSD_Versions,
 	NFSD_Ports,
 	NFSD_MaxBlkSize,
+	NFSD_SupportedEnctypes,
 	/*
 	 * The below MUST come last.  Otherwise we leave a hole in nfsd_files[]
 	 * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops
@@ -187,6 +189,32 @@ static struct file_operations export_features_operations = {
 	.release	= single_release,
 };
 
+/* Show krb5's gm_upcall_enctypes string, or nothing if unavailable. */
+static int supported_enctypes_show(struct seq_file *m, void *v)
+{
+	struct gss_api_mech *k5mech = gss_mech_get_by_name("krb5");
+
+	if (k5mech == NULL)
+		return 0;
+	/* Print via "%s" so the list is never parsed as a format string. */
+	if (k5mech->gm_upcall_enctypes != NULL)
+		seq_printf(m, "%s", k5mech->gm_upcall_enctypes);
+	gss_mech_put(k5mech);
+	return 0;
+}
+
+static int supported_enctypes_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, supported_enctypes_show, NULL);
+}
+
+static struct file_operations supported_enctypes_ops = {
+	.open		= supported_enctypes_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
 extern int nfsd_pool_stats_release(struct inode *inode, struct file *file);
 
@@ -1397,6 +1425,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 		[NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
 		[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
+		[NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO},
 #ifdef CONFIG_NFSD_V4
 		[NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
-- 
cgit v1.2.3


From 5ece3cafbd88d4da5c734e1810c4a2e6474b57b2 Mon Sep 17 00:00:00 2001
From: Mi Jinlong <mijinlong@cn.fujitsu.com>
Date: Fri, 18 Feb 2011 09:08:31 +0800
Subject: nfsd41: modify the members value of nfsd4_op_flags

With the current values of the nfsd4_op_flags members,
(ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS) equals ALLOWED_AS_FIRST_OP,
which is probably not what we want.

OP_PUTROOTFH, with op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
can't appear as the first operation without SEQUENCE ops.

This patch modifies the wrong values of ALLOWED_WITHOUT_FH etc., which
were introduced by f9bb94c4.

Cc: stable@kernel.org
Reviewed-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Mi Jinlong <mijinlong@cn.fujitsu.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4proc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index db52546143d..5fcb1396a7e 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -984,8 +984,8 @@ typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *,
 			      void *);
 enum nfsd4_op_flags {
 	ALLOWED_WITHOUT_FH = 1 << 0,	/* No current filehandle required */
-	ALLOWED_ON_ABSENT_FS = 2 << 0,	/* ops processed on absent fs */
-	ALLOWED_AS_FIRST_OP = 3 << 0,	/* ops reqired first in compound */
+	ALLOWED_ON_ABSENT_FS = 1 << 1,	/* ops processed on absent fs */
+	ALLOWED_AS_FIRST_OP = 1 << 2,	/* ops reqired first in compound */
 };
 
 struct nfsd4_operation {
-- 
cgit v1.2.3


From 25b18d39cca207f8559af1aac00313deda2d652e Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Fri, 11 Feb 2011 15:23:27 +0900
Subject: nilfs2: decrement inodes count only if raw inode was successfully
 deleted

This fixes the issue that the inodes count does not add up after removal
of a raw inode fails.  Hence, this prevents a possible underflow of the
inodes count.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/inode.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 2fd440d8d6b..3a6967d14e1 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -707,6 +707,7 @@ void nilfs_evict_inode(struct inode *inode)
 	struct nilfs_transaction_info ti;
 	struct super_block *sb = inode->i_sb;
 	struct nilfs_inode_info *ii = NILFS_I(inode);
+	int ret;
 
 	if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) {
 		if (inode->i_data.nrpages)
@@ -725,8 +726,9 @@ void nilfs_evict_inode(struct inode *inode)
 	nilfs_mark_inode_dirty(inode);
 	end_writeback(inode);
 
-	nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino);
-	atomic_dec(&ii->i_root->inodes_count);
+	ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino);
+	if (!ret)
+		atomic_dec(&ii->i_root->inodes_count);
 
 	nilfs_clear_inode(inode);
 
-- 
cgit v1.2.3


From 9954e7af14868b8b79e76b7b88daaf0b3866db33 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Wed, 23 Feb 2011 02:26:17 +0900
Subject: nilfs2: add free entries count only if clear bit operation succeeded

Three functions of the current persistent object allocator,
nilfs_palloc_commit_free_entry, nilfs_palloc_abort_alloc_entry, and
nilfs_palloc_freev, unconditionally increase a counter after performing
a clear-bit operation on a bitmap block.

If the clear bit operation overlapped, the counter will not add up.
This fixes the issue by making the counter operations conditional.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/alloc.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index d7fd696e595..0a0a66d98cc 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -521,8 +521,8 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
 				    group_offset, bitmap))
 		printk(KERN_WARNING "%s: entry number %llu already freed\n",
 		       __func__, (unsigned long long)req->pr_entry_nr);
-
-	nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
+	else
+		nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
 
 	kunmap(req->pr_bitmap_bh->b_page);
 	kunmap(req->pr_desc_bh->b_page);
@@ -558,8 +558,8 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
 				    group_offset, bitmap))
 		printk(KERN_WARNING "%s: entry number %llu already freed\n",
 		       __func__, (unsigned long long)req->pr_entry_nr);
-
-	nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
+	else
+		nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
 
 	kunmap(req->pr_bitmap_bh->b_page);
 	kunmap(req->pr_desc_bh->b_page);
@@ -665,7 +665,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
 		for (j = i, n = 0;
 		     (j < nitems) && nilfs_palloc_group_is_in(inode, group,
 							      entry_nrs[j]);
-		     j++, n++) {
+		     j++) {
 			nilfs_palloc_group(inode, entry_nrs[j], &group_offset);
 			if (!nilfs_clear_bit_atomic(
 				    nilfs_mdt_bgl_lock(inode, group),
@@ -674,6 +674,8 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
 				       "%s: entry number %llu already freed\n",
 				       __func__,
 				       (unsigned long long)entry_nrs[j]);
+			} else {
+				n++;
 			}
 		}
 		nilfs_palloc_group_desc_add_entries(inode, group, desc, n);
-- 
cgit v1.2.3


From f0c9f242f947a37675a883deca7f722cac935b0e Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Thu, 20 Jan 2011 02:09:52 +0900
Subject: nilfs2: use common file attribute macros

Replaces uses of own inode flags (i.e. NILFS_SECRM_FL, NILFS_UNRM_FL,
NILFS_COMPR_FL, and so forth) with common inode flags, and removes the
own flag declarations.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/dir.c   |  3 ---
 fs/nilfs2/inode.c | 14 +++++++-------
 2 files changed, 7 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 9d45773b79e..b72833a2cc1 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -440,7 +440,6 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
 	nilfs_commit_chunk(page, mapping, from, to);
 	nilfs_put_page(page);
 	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
-/*	NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
 }
 
 /*
@@ -531,7 +530,6 @@ got_it:
 	nilfs_set_de_type(de, inode);
 	nilfs_commit_chunk(page, page->mapping, from, to);
 	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
-/*	NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
 	nilfs_mark_inode_dirty(dir);
 	/* OFFSET_CACHE */
 out_put:
@@ -579,7 +577,6 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
 	dir->inode = 0;
 	nilfs_commit_chunk(page, mapping, from, to);
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
-/*	NILFS_I(inode)->i_flags &= ~NILFS_BTREE_FL; */
 out:
 	nilfs_put_page(page);
 	return err;
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 3a6967d14e1..f61f80c7f7c 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -317,9 +317,9 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
 
 	ii->i_flags = NILFS_I(dir)->i_flags;
 	if (S_ISLNK(mode))
-		ii->i_flags &= ~(NILFS_IMMUTABLE_FL | NILFS_APPEND_FL);
+		ii->i_flags &= ~(FS_IMMUTABLE_FL | FS_APPEND_FL);
 	if (!S_ISDIR(mode))
-		ii->i_flags &= ~NILFS_DIRSYNC_FL;
+		ii->i_flags &= ~FS_DIRSYNC_FL;
 
 	/* ii->i_file_acl = 0; */
 	/* ii->i_dir_acl = 0; */
@@ -359,17 +359,17 @@ void nilfs_set_inode_flags(struct inode *inode)
 
 	inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME |
 			    S_DIRSYNC);
-	if (flags & NILFS_SYNC_FL)
+	if (flags & FS_SYNC_FL)
 		inode->i_flags |= S_SYNC;
-	if (flags & NILFS_APPEND_FL)
+	if (flags & FS_APPEND_FL)
 		inode->i_flags |= S_APPEND;
-	if (flags & NILFS_IMMUTABLE_FL)
+	if (flags & FS_IMMUTABLE_FL)
 		inode->i_flags |= S_IMMUTABLE;
 #ifndef NILFS_ATIME_DISABLE
-	if (flags & NILFS_NOATIME_FL)
+	if (flags & FS_NOATIME_FL)
 #endif
 		inode->i_flags |= S_NOATIME;
-	if (flags & NILFS_DIRSYNC_FL)
+	if (flags & FS_DIRSYNC_FL)
 		inode->i_flags |= S_DIRSYNC;
 	mapping_set_gfp_mask(inode->i_mapping,
 			     mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
-- 
cgit v1.2.3


From 32f4aeb31583a85c1e9a5d6d485055c090cebbfb Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Thu, 20 Jan 2011 02:09:52 +0900
Subject: nilfs2: mark S_NOATIME on inodes only if NOATIME attribute is set

At present, nilfs marks S_NOATIME flag on all inodes.  This restricts
nilfs_set_inode_flags function so that it marks S_NOATIME only if a
given inode has an FS_NOATIME_FL flag.

Although nilfs does not support atime yet, touch_atime() still safely
returns on IS_NOATIME check since MS_NOATIME is always set on sb.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/inode.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index f61f80c7f7c..2e6ac8e9203 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -365,9 +365,7 @@ void nilfs_set_inode_flags(struct inode *inode)
 		inode->i_flags |= S_APPEND;
 	if (flags & FS_IMMUTABLE_FL)
 		inode->i_flags |= S_IMMUTABLE;
-#ifndef NILFS_ATIME_DISABLE
 	if (flags & FS_NOATIME_FL)
-#endif
 		inode->i_flags |= S_NOATIME;
 	if (flags & FS_DIRSYNC_FL)
 		inode->i_flags |= S_DIRSYNC;
-- 
cgit v1.2.3


From b253a3e4f2b8eed69b804952ef926df0ac788595 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Thu, 20 Jan 2011 02:09:53 +0900
Subject: nilfs2: tighten restrictions on inode flags

Like the ext2/3/4 filesystems used to, nilfs has few restrictions on
which flags may be set on which inodes.  Specifically, DIRSYNC may only
be set on directories, and IMMUTABLE and APPEND may not be set on
links.  Tighten that to disallow TOPDIR being set on non-directories,
and to allow only NODUMP and NOATIME to be set on inodes that are
neither regular files nor directories.

This introduces a flags masking function like those of extN and uses
it during inode creation.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/inode.c |  7 ++-----
 fs/nilfs2/nilfs.h | 17 +++++++++++++++++
 2 files changed, 19 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 2e6ac8e9203..2534af8d2b5 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -315,11 +315,8 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
 		/* No lock is needed; iget() ensures it. */
 	}
 
-	ii->i_flags = NILFS_I(dir)->i_flags;
-	if (S_ISLNK(mode))
-		ii->i_flags &= ~(FS_IMMUTABLE_FL | FS_APPEND_FL);
-	if (!S_ISDIR(mode))
-		ii->i_flags &= ~FS_DIRSYNC_FL;
+	ii->i_flags = nilfs_mask_flags(
+		mode, NILFS_I(dir)->i_flags & NILFS_FL_INHERITED);
 
 	/* ii->i_file_acl = 0; */
 	/* ii->i_dir_acl = 0; */
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 777e8fd0430..3e3acb1fdd2 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -212,6 +212,23 @@ static inline int nilfs_init_acl(struct inode *inode, struct inode *dir)
 
 #define NILFS_ATIME_DISABLE
 
+/* Flags that should be inherited by new inodes from their parent. */
+#define NILFS_FL_INHERITED						\
+	(FS_SECRM_FL | FS_UNRM_FL | FS_COMPR_FL | FS_SYNC_FL |		\
+	 FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL |\
+	 FS_COMPRBLK_FL | FS_NOCOMP_FL | FS_NOTAIL_FL | FS_DIRSYNC_FL)
+
+/* Mask out flags that are inappropriate for the given type of inode. */
+static inline __u32 nilfs_mask_flags(umode_t mode, __u32 flags)
+{
+	if (S_ISDIR(mode))
+		return flags;
+	else if (S_ISREG(mode))
+		return flags & ~(FS_DIRSYNC_FL | FS_TOPDIR_FL);
+	else
+		return flags & (FS_NODUMP_FL | FS_NOATIME_FL);
+}
+
 /* dir.c */
 extern int nilfs_add_link(struct dentry *, struct inode *);
 extern ino_t nilfs_inode_by_name(struct inode *, const struct qstr *);
-- 
cgit v1.2.3


From cde98f0f84ccff78e87235cb7b551747d6ad00de Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Thu, 20 Jan 2011 02:09:53 +0900
Subject: nilfs2: implement FS_IOC_GETFLAGS/SETFLAGS/GETVERSION

Add support for the standard attributes set via chattr and read via
lsattr.  These attributes are already in the flags value in the nilfs2
inode, but currently we don't have any ioctl commands that expose them
to the userland.

Collaterally, this adds the FS_IOC_GETVERSION ioctl for getting
i_generation, which allows users to list the file's generation number
with "lsattr -v".

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/ioctl.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)

(limited to 'fs')

diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 496738963fd..3aad6413aba 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -97,6 +97,70 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
 	return ret;
 }
 
+static int nilfs_ioctl_getflags(struct inode *inode, void __user *argp)
+{
+	unsigned int flags = NILFS_I(inode)->i_flags & FS_FL_USER_VISIBLE;
+
+	return put_user(flags, (int __user *)argp);
+}
+
+static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp,
+				void __user *argp)
+{
+	struct nilfs_transaction_info ti;
+	unsigned int flags, oldflags;
+	int ret;
+
+	if (!is_owner_or_cap(inode))
+		return -EACCES;
+
+	if (get_user(flags, (int __user *)argp))
+		return -EFAULT;
+
+	ret = mnt_want_write(filp->f_path.mnt);
+	if (ret)
+		return ret;
+
+	flags = nilfs_mask_flags(inode->i_mode, flags);
+
+	mutex_lock(&inode->i_mutex);
+
+	oldflags = NILFS_I(inode)->i_flags;
+
+	/*
+	 * The IMMUTABLE and APPEND_ONLY flags can only be changed by the
+	 * relevant capability.
+	 */
+	ret = -EPERM;
+	if (((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) &&
+	    !capable(CAP_LINUX_IMMUTABLE))
+		goto out;
+
+	ret = nilfs_transaction_begin(inode->i_sb, &ti, 0);
+	if (ret)
+		goto out;
+
+	NILFS_I(inode)->i_flags = (oldflags & ~FS_FL_USER_MODIFIABLE) |
+		(flags & FS_FL_USER_MODIFIABLE);
+
+	nilfs_set_inode_flags(inode);
+	inode->i_ctime = CURRENT_TIME;
+	if (IS_SYNC(inode))
+		nilfs_set_transaction_flag(NILFS_TI_SYNC);
+
+	nilfs_mark_inode_dirty(inode);
+	ret = nilfs_transaction_commit(inode->i_sb);
+out:
+	mutex_unlock(&inode->i_mutex);
+	mnt_drop_write(filp->f_path.mnt);
+	return ret;
+}
+
+static int nilfs_ioctl_getversion(struct inode *inode, void __user *argp)
+{
+	return put_user(inode->i_generation, (int __user *)argp);
+}
+
 static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
 				     unsigned int cmd, void __user *argp)
 {
@@ -666,6 +730,12 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	void __user *argp = (void __user *)arg;
 
 	switch (cmd) {
+	case FS_IOC_GETFLAGS:
+		return nilfs_ioctl_getflags(inode, argp);
+	case FS_IOC_SETFLAGS:
+		return nilfs_ioctl_setflags(inode, filp, argp);
+	case FS_IOC_GETVERSION:
+		return nilfs_ioctl_getversion(inode, argp);
 	case NILFS_IOCTL_CHANGE_CPMODE:
 		return nilfs_ioctl_change_cpmode(inode, filp, cmd, argp);
 	case NILFS_IOCTL_DELETE_CHECKPOINT:
-- 
cgit v1.2.3


From 828b1c50ae11e6dda68f8dfefe43b74c7182b157 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Thu, 3 Feb 2011 21:26:17 +0900
Subject: nilfs2: add compat ioctl

The current FS_IOC_GETFLAGS/SETFLAGS/GETVERSION will fail if the
application is 32-bit and the kernel is 64-bit.

This issue is avoidable by adding compat_ioctl method.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/dir.c   |  2 +-
 fs/nilfs2/file.c  |  2 +-
 fs/nilfs2/ioctl.c | 21 +++++++++++++++++++++
 fs/nilfs2/nilfs.h |  1 +
 4 files changed, 24 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index b72833a2cc1..3a1923943b1 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -681,7 +681,7 @@ const struct file_operations nilfs_dir_operations = {
 	.readdir	= nilfs_readdir,
 	.unlocked_ioctl	= nilfs_ioctl,
 #ifdef CONFIG_COMPAT
-	.compat_ioctl	= nilfs_ioctl,
+	.compat_ioctl	= nilfs_compat_ioctl,
 #endif	/* CONFIG_COMPAT */
 	.fsync		= nilfs_sync_file,
 
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 2f560c9fb80..7a5e4ab15c6 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -142,7 +142,7 @@ const struct file_operations nilfs_file_operations = {
 	.aio_write	= generic_file_aio_write,
 	.unlocked_ioctl	= nilfs_ioctl,
 #ifdef CONFIG_COMPAT
-	.compat_ioctl	= nilfs_ioctl,
+	.compat_ioctl	= nilfs_compat_ioctl,
 #endif	/* CONFIG_COMPAT */
 	.mmap		= nilfs_file_mmap,
 	.open		= generic_file_open,
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 3aad6413aba..d89173edd7f 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -26,6 +26,7 @@
 #include <linux/capability.h>	/* capable() */
 #include <linux/uaccess.h>	/* copy_from_user(), copy_to_user() */
 #include <linux/vmalloc.h>
+#include <linux/compat.h>	/* compat_ptr() */
 #include <linux/mount.h>	/* mnt_want_write(), mnt_drop_write() */
 #include <linux/nilfs2_fs.h>
 #include "nilfs.h"
@@ -766,3 +767,23 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return -ENOTTY;
 	}
 }
+
+#ifdef CONFIG_COMPAT
+long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	switch (cmd) {
+	case FS_IOC32_GETFLAGS:
+		cmd = FS_IOC_GETFLAGS;
+		break;
+	case FS_IOC32_SETFLAGS:
+		cmd = FS_IOC_SETFLAGS;
+		break;
+	case FS_IOC32_GETVERSION:
+		cmd = FS_IOC_GETVERSION;
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+	return nilfs_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 3e3acb1fdd2..45b1fd1d024 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -246,6 +246,7 @@ extern int nilfs_sync_file(struct file *, int);
 
 /* ioctl.c */
 long nilfs_ioctl(struct file *, unsigned int, unsigned long);
+long nilfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, struct nilfs_argv *,
 				       void **);
 
-- 
cgit v1.2.3


From 4138ec23820012009aecc2b02856c62872dd3c34 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Mon, 24 Jan 2011 00:28:22 +0900
Subject: nilfs2: append blocksize info to warnings during loading super blocks

At present, the same warning message can be output twice when nilfs
detected a problem on super blocks:

 NILFS warning: broken superblock. using spare superblock.
 NILFS warning: broken superblock. using spare superblock.
 ...

This is because these super blocks are reloaded with the block size
written in a super block if it differs from the first block size, but
this repetition looks somewhat confusing.  So, we hint at what is
going on by appending block size information to those messages.

Reported-by: Wakko Warner <wakko@animx.eu.org>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/the_nilfs.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index ad4ac607cf5..9098909d5ce 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -475,10 +475,13 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
 			return -EIO;
 		}
 		printk(KERN_WARNING
-		       "NILFS warning: unable to read primary superblock\n");
-	} else if (!sbp[1])
+		       "NILFS warning: unable to read primary superblock "
+		       "(blocksize = %d)\n", blocksize);
+	} else if (!sbp[1]) {
 		printk(KERN_WARNING
-		       "NILFS warning: unable to read secondary superblock\n");
+		       "NILFS warning: unable to read secondary superblock "
+		       "(blocksize = %d)\n", blocksize);
+	}
 
 	/*
 	 * Compare two super blocks and set 1 in swp if the secondary
@@ -505,7 +508,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
 
 	if (!valid[!swp])
 		printk(KERN_WARNING "NILFS warning: broken superblock. "
-		       "using spare superblock.\n");
+		       "using spare superblock (blocksize = %d).\n", blocksize);
 	if (swp)
 		nilfs_swap_super_block(nilfs);
 
-- 
cgit v1.2.3


From ae191838b0251d73b9d0a7254c6938406f5f6320 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Fri, 4 Feb 2011 01:19:38 +0900
Subject: nilfs2: optimize rec_len functions

This is a similar change to those in ext2/ext3 codebase (commit
40a063f6691ce937 and a4ae3094869f18e2, respectively).

The addition of 64k block capability in the rec_len_from_disk and
rec_len_to_disk functions added a bit of math overhead which slows
down file create workloads needlessly when the architecture cannot
even support 64k blocks.  This will cut the corner.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/ioctl.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index d89173edd7f..5471eed5ecc 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -28,6 +28,7 @@
 #include <linux/vmalloc.h>
 #include <linux/compat.h>	/* compat_ptr() */
 #include <linux/mount.h>	/* mnt_want_write(), mnt_drop_write() */
+#include <linux/buffer_head.h>
 #include <linux/nilfs2_fs.h>
 #include "nilfs.h"
 #include "segment.h"
-- 
cgit v1.2.3


From be667377a8b8cd73e1b923f33fb5be4034aa4bfa Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sat, 5 Mar 2011 00:19:32 +0900
Subject: nilfs2: record used amount of each checkpoint in checkpoint list

This records the number of used blocks per checkpoint in each
checkpoint entry of cpfile.  Even though userland tools can get the
block count via nilfs_get_cpinfo ioctl, it was not updated by the
nilfs2 kernel code.  This fixes the issue and makes it available for
userland tools to calculate used amount per checkpoint.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Cc: Jiro SEKIBA <jir@unicus.jp>
---
 fs/nilfs2/bmap.c   | 11 -----------
 fs/nilfs2/bmap.h   |  3 ---
 fs/nilfs2/btree.c  |  6 +++---
 fs/nilfs2/direct.c |  4 ++--
 fs/nilfs2/inode.c  | 18 ++++++++++++++++++
 fs/nilfs2/nilfs.h  |  2 ++
 6 files changed, 25 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 3ee67c67cc5..85447a2fab3 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -425,17 +425,6 @@ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap)
 /*
  * Internal use only
  */
-
-void nilfs_bmap_add_blocks(const struct nilfs_bmap *bmap, int n)
-{
-	inode_add_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
-}
-
-void nilfs_bmap_sub_blocks(const struct nilfs_bmap *bmap, int n)
-{
-	inode_sub_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
-}
-
 __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
 			      const struct buffer_head *bh)
 {
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index bde1c0aa2e1..40d9f453d31 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -240,9 +240,6 @@ __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
 __u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64);
 __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *);
 
-void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int);
-void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int);
-
 
 /* Assume that bmap semaphore is locked. */
 static inline int nilfs_bmap_dirty(const struct nilfs_bmap *bmap)
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 300c2bc00c3..d451ae0e0bf 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -1174,7 +1174,7 @@ static int nilfs_btree_insert(struct nilfs_bmap *btree, __u64 key, __u64 ptr)
 	if (ret < 0)
 		goto out;
 	nilfs_btree_commit_insert(btree, path, level, key, ptr);
-	nilfs_bmap_add_blocks(btree, stats.bs_nblocks);
+	nilfs_inode_add_blocks(btree->b_inode, stats.bs_nblocks);
 
  out:
 	nilfs_btree_free_path(path);
@@ -1511,7 +1511,7 @@ static int nilfs_btree_delete(struct nilfs_bmap *btree, __u64 key)
 	if (ret < 0)
 		goto out;
 	nilfs_btree_commit_delete(btree, path, level, dat);
-	nilfs_bmap_sub_blocks(btree, stats.bs_nblocks);
+	nilfs_inode_sub_blocks(btree->b_inode, stats.bs_nblocks);
 
 out:
 	nilfs_btree_free_path(path);
@@ -1776,7 +1776,7 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree,
 		return ret;
 	nilfs_btree_commit_convert_and_insert(btree, key, ptr, keys, ptrs, n,
 					      di, ni, bh);
-	nilfs_bmap_add_blocks(btree, stats.bs_nblocks);
+	nilfs_inode_add_blocks(btree->b_inode, stats.bs_nblocks);
 	return 0;
 }
 
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 324d80c5751..82f4865e86d 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -146,7 +146,7 @@ static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 		if (NILFS_BMAP_USE_VBN(bmap))
 			nilfs_bmap_set_target_v(bmap, key, req.bpr_ptr);
 
-		nilfs_bmap_add_blocks(bmap, 1);
+		nilfs_inode_add_blocks(bmap->b_inode, 1);
 	}
 	return ret;
 }
@@ -168,7 +168,7 @@ static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key)
 	if (!ret) {
 		nilfs_bmap_commit_end_ptr(bmap, &req, dat);
 		nilfs_direct_set_ptr(bmap, key, NILFS_BMAP_INVALID_PTR);
-		nilfs_bmap_sub_blocks(bmap, 1);
+		nilfs_inode_sub_blocks(bmap->b_inode, 1);
 	}
 	return ret;
 }
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 2534af8d2b5..22a816ba362 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -41,6 +41,24 @@ struct nilfs_iget_args {
 	int for_gc;
 };
 
+void nilfs_inode_add_blocks(struct inode *inode, int n)
+{
+	struct nilfs_root *root = NILFS_I(inode)->i_root;
+
+	inode_add_bytes(inode, (1 << inode->i_blkbits) * n);
+	if (root)
+		atomic_add(n, &root->blocks_count);
+}
+
+void nilfs_inode_sub_blocks(struct inode *inode, int n)
+{
+	struct nilfs_root *root = NILFS_I(inode)->i_root;
+
+	inode_sub_bytes(inode, (1 << inode->i_blkbits) * n);
+	if (root)
+		atomic_sub(n, &root->blocks_count);
+}
+
 /**
  * nilfs_get_block() - get a file block on the filesystem (callback function)
  * @inode - inode struct of the target file
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 45b1fd1d024..03ba4d88083 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -251,6 +251,8 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, struct nilfs_argv *,
 				       void **);
 
 /* inode.c */
+void nilfs_inode_add_blocks(struct inode *inode, int n);
+void nilfs_inode_sub_blocks(struct inode *inode, int n);
 extern struct inode *nilfs_new_inode(struct inode *, int);
 extern void nilfs_free_inode(struct inode *);
 extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
-- 
cgit v1.2.3


From dfef6dcd35cb4a251f6322ca9b2c06f0bb1aa1f4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 8 Mar 2011 01:25:28 -0500
Subject: unfuck proc_sysctl ->d_compare()

a) struct inode is not going to be freed under ->d_compare();
however, the thing PROC_I(inode)->sysctl points to just might.
Fortunately, it's enough to make freeing that sucker delayed,
provided that we don't step on its ->unregistering, clear
the pointer to it in PROC_I(inode) before dropping the reference
and check if it's NULL in ->d_compare().

b) I'm not sure that we *can* walk into NULL inode here (we recheck
dentry->seq between verifying that it's still hashed / fetching
dentry->d_inode and passing it to ->d_compare() and there's no
negative hashed dentries in /proc/sys/*), but if we can walk into
that, we really should not have ->d_compare() return 0 on it!
Said that, I really suspect that this check can be simply killed.
Nick?

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/inode.c       | 8 ++++++--
 fs/proc/proc_sysctl.c | 7 +++++--
 2 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 176ce4cda68..d6a7ca1fdac 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -27,6 +27,7 @@
 static void proc_evict_inode(struct inode *inode)
 {
 	struct proc_dir_entry *de;
+	struct ctl_table_header *head;
 
 	truncate_inode_pages(&inode->i_data, 0);
 	end_writeback(inode);
@@ -38,8 +39,11 @@ static void proc_evict_inode(struct inode *inode)
 	de = PROC_I(inode)->pde;
 	if (de)
 		pde_put(de);
-	if (PROC_I(inode)->sysctl)
-		sysctl_head_put(PROC_I(inode)->sysctl);
+	head = PROC_I(inode)->sysctl;
+	if (head) {
+		rcu_assign_pointer(PROC_I(inode)->sysctl, NULL);
+		sysctl_head_put(head);
+	}
 }
 
 struct vfsmount *proc_mnt;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 09a1f92a34e..8eb2522111c 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -408,15 +408,18 @@ static int proc_sys_compare(const struct dentry *parent,
 		const struct dentry *dentry, const struct inode *inode,
 		unsigned int len, const char *str, const struct qstr *name)
 {
+	struct ctl_table_header *head;
 	/* Although proc doesn't have negative dentries, rcu-walk means
 	 * that inode here can be NULL */
+	/* AV: can it, indeed? */
 	if (!inode)
-		return 0;
+		return 1;
 	if (name->len != len)
 		return 1;
 	if (memcmp(name->name, str, len))
 		return 1;
-	return !sysctl_is_seen(PROC_I(inode)->sysctl);
+	head = rcu_dereference(PROC_I(inode)->sysctl);
+	return !head || !sysctl_is_seen(head);
 }
 
 static const struct dentry_operations proc_sys_dentry_operations = {
-- 
cgit v1.2.3


From df677140281beb608f6748c341af7612f7bfe7a0 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Tue, 8 Mar 2011 08:28:01 +0100
Subject: block: biovec_slab vs. CONFIG_BLK_DEV_INTEGRITY

The block integrity subsystem no longer uses the bio_vec slabs so this
code can safely be compiled in.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/bio.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index 4bd454fa844..5694b756ed0 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -43,7 +43,7 @@ static mempool_t *bio_split_pool __read_mostly;
  * unsigned short
  */
 #define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
-struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
+static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
 	BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
 };
 #undef BV
@@ -1656,12 +1656,10 @@ static void __init biovec_init_slabs(void)
 		int size;
 		struct biovec_slab *bvs = bvec_slabs + i;
 
-#ifndef CONFIG_BLK_DEV_INTEGRITY
 		if (bvs->nr_vecs <= BIO_INLINE_VECS) {
 			bvs->slab = NULL;
 			continue;
 		}
-#endif
 
 		size = bvs->nr_vecs * sizeof(struct bio_vec);
 		bvs->slab = kmem_cache_create(bvs->name, size, 0,
-- 
cgit v1.2.3


From 3e8e2e0c8da1f1701a8014543c951c41751791cc Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Sun, 30 Jan 2011 18:58:32 +0200
Subject: UBIFS: incorporate maximum write size

Incorporate maximum write size into the UBIFS description data
structure. This patch just introduces new 'c->max_write_size'
and 'c->max_write_shift' fields as a preparation for the following
patches.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 19 +++++++++++++++++++
 fs/ubifs/ubifs.h |  5 +++++
 2 files changed, 24 insertions(+)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 703a62109cf..efc327b92f9 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -507,6 +507,8 @@ static int init_constants_early(struct ubifs_info *c)
 	c->half_leb_size = c->leb_size / 2;
 	c->min_io_size = c->di.min_io_size;
 	c->min_io_shift = fls(c->min_io_size) - 1;
+	c->max_write_size = c->di.max_write_size;
+	c->max_write_shift = fls(c->max_write_size) - 1;
 
 	if (c->leb_size < UBIFS_MIN_LEB_SZ) {
 		ubifs_err("too small LEBs (%d bytes), min. is %d bytes",
@@ -525,6 +527,18 @@ static int init_constants_early(struct ubifs_info *c)
 		return -EINVAL;
 	}
 
+	/*
+	 * Maximum write size has to be greater than or equal to min. I/O
+	 * size, and be a multiple of min. I/O size.
+	 */
+	if (c->max_write_size < c->min_io_size ||
+	    c->max_write_size % c->min_io_size ||
+	    !is_power_of_2(c->max_write_size)) {
+		ubifs_err("bad write buffer size %d for %d min. I/O unit",
+			  c->max_write_size, c->min_io_size);
+		return -EINVAL;
+	}
+
 	/*
 	 * UBIFS aligns all node to 8-byte boundary, so to make function in
 	 * io.c simpler, assume minimum I/O unit size to be 8 bytes if it is
@@ -533,6 +547,10 @@ static int init_constants_early(struct ubifs_info *c)
 	if (c->min_io_size < 8) {
 		c->min_io_size = 8;
 		c->min_io_shift = 3;
+		if (c->max_write_size < c->min_io_size) {
+			c->max_write_size = c->min_io_size;
+			c->max_write_shift = c->min_io_shift;
+		}
 	}
 
 	c->ref_node_alsz = ALIGN(UBIFS_REF_NODE_SZ, c->min_io_size);
@@ -1391,6 +1409,7 @@ static int mount_ubifs(struct ubifs_info *c)
 
 	dbg_msg("compiled on:         " __DATE__ " at " __TIME__);
 	dbg_msg("min. I/O unit size:  %d bytes", c->min_io_size);
+	dbg_msg("max. write size:     %d bytes", c->max_write_size);
 	dbg_msg("LEB size:            %d bytes (%d KiB)",
 		c->leb_size, c->leb_size >> 10);
 	dbg_msg("data journal heads:  %d",
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index d1823541f98..8b519499f14 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1024,6 +1024,9 @@ struct ubifs_debug_info;
  *
  * @min_io_size: minimal input/output unit size
  * @min_io_shift: number of bits in @min_io_size minus one
+ * @max_write_size: maximum amount of bytes the underlying flash can write at a
+ *                  time (MTD write buffer size)
+ * @max_write_shift: number of bits in @max_write_size minus one
  * @leb_size: logical eraseblock size in bytes
  * @half_leb_size: half LEB size
  * @idx_leb_size: how many bytes of an LEB are effectively available when it is
@@ -1270,6 +1273,8 @@ struct ubifs_info {
 
 	int min_io_size;
 	int min_io_shift;
+	int max_write_size;
+	int max_write_shift;
 	int leb_size;
 	int half_leb_size;
 	int idx_leb_size;
-- 
cgit v1.2.3


From ca2ec61d157f23ec24aaa200f8016ea0a8aeb617 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Mon, 14 Feb 2011 15:17:55 +0200
Subject: UBI: incorporate LEB offset information

Incorporate the LEB offset information into UBIFS. We'll use this
information in one of the next patches to figure out what are the
max. write size offsets relative to the PEB. So this patch is just
a preparation.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 1 +
 fs/ubifs/ubifs.h | 3 +++
 2 files changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index efc327b92f9..d4b4cb4596e 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -504,6 +504,7 @@ static int init_constants_early(struct ubifs_info *c)
 
 	c->leb_cnt = c->vi.size;
 	c->leb_size = c->vi.usable_leb_size;
+	c->leb_start = c->di.leb_start;
 	c->half_leb_size = c->leb_size / 2;
 	c->min_io_size = c->di.min_io_size;
 	c->min_io_shift = fls(c->min_io_size) - 1;
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 8b519499f14..942c1d3cb0d 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1028,6 +1028,8 @@ struct ubifs_debug_info;
  *                  time (MTD write buffer size)
  * @max_write_shift: number of bits in @max_write_size minus one
  * @leb_size: logical eraseblock size in bytes
+ * @leb_start: starting offset of logical eraseblocks within physical
+ *             eraseblocks
  * @half_leb_size: half LEB size
  * @idx_leb_size: how many bytes of an LEB are effectively available when it is
  *                used to store indexing nodes (@leb_size - @max_idx_node_sz)
@@ -1276,6 +1278,7 @@ struct ubifs_info {
 	int max_write_size;
 	int max_write_shift;
 	int leb_size;
+	int leb_start;
 	int half_leb_size;
 	int idx_leb_size;
 	int leb_cnt;
-- 
cgit v1.2.3


From 3c89f396dc78671cfbc1eb20ef1d5be6a9a02780 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Tue, 1 Feb 2011 19:02:49 +0200
Subject: UBIFS: introduce write-buffer size field

Currently we assume write-buffer size is always min_io_size. But
this is about to change and write-buffers may be of variable size.
Namely, they will be of max_write_size at the beginning, but will
get smaller when we are approaching the end of LEB.

This is a preparation patch which introduces 'size' field in
the write-buffer structure which carries the current write-buffer
size.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/io.c    | 28 +++++++++++++++++++---------
 fs/ubifs/ubifs.h |  2 ++
 2 files changed, 21 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index d1fe56203a1..7c2a014b59f 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -361,7 +361,10 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
 	dbg_io("LEB %d:%d, %d bytes, jhead %s",
 	       wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead));
 	ubifs_assert(!(wbuf->avail & 7));
-	ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size);
+	ubifs_assert(wbuf->offs + wbuf->size <= c->leb_size);
+	ubifs_assert(wbuf->size >= c->min_io_size);
+	ubifs_assert(wbuf->size <= c->max_write_size);
+	ubifs_assert(wbuf->size % c->min_io_size == 0);
 	ubifs_assert(!c->ro_media && !c->ro_mount);
 
 	if (c->ro_error)
@@ -369,10 +372,10 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
 
 	ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail);
 	err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
-			    c->min_io_size, wbuf->dtype);
+			    wbuf->size, wbuf->dtype);
 	if (err) {
 		ubifs_err("cannot write %d bytes to LEB %d:%d",
-			  c->min_io_size, wbuf->lnum, wbuf->offs);
+			  wbuf->size, wbuf->lnum, wbuf->offs);
 		dbg_dump_stack();
 		return err;
 	}
@@ -380,8 +383,9 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
 	dirt = wbuf->avail;
 
 	spin_lock(&wbuf->lock);
-	wbuf->offs += c->min_io_size;
+	wbuf->offs += wbuf->size;
 	wbuf->avail = c->min_io_size;
+	wbuf->size = c->min_io_size;
 	wbuf->used = 0;
 	wbuf->next_ino = 0;
 	spin_unlock(&wbuf->lock);
@@ -425,6 +429,7 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
 	wbuf->lnum = lnum;
 	wbuf->offs = offs;
 	wbuf->avail = c->min_io_size;
+	wbuf->size = c->min_io_size;
 	wbuf->used = 0;
 	spin_unlock(&wbuf->lock);
 	wbuf->dtype = dtype;
@@ -522,7 +527,10 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
 	ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt);
 	ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0);
 	ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size);
-	ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size);
+	ubifs_assert(wbuf->avail > 0 && wbuf->avail <= wbuf->size);
+	ubifs_assert(wbuf->size >= c->min_io_size);
+	ubifs_assert(wbuf->size <= c->max_write_size);
+	ubifs_assert(wbuf->size % c->min_io_size == 0);
 	ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
 	ubifs_assert(!c->ro_media && !c->ro_mount);
 
@@ -547,7 +555,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
 			dbg_io("flush jhead %s wbuf to LEB %d:%d",
 			       dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
 			err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf,
-					    wbuf->offs, c->min_io_size,
+					    wbuf->offs, wbuf->size,
 					    wbuf->dtype);
 			if (err)
 				goto out;
@@ -555,6 +563,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
 			spin_lock(&wbuf->lock);
 			wbuf->offs += c->min_io_size;
 			wbuf->avail = c->min_io_size;
+			wbuf->size = c->min_io_size;
 			wbuf->used = 0;
 			wbuf->next_ino = 0;
 			spin_unlock(&wbuf->lock);
@@ -577,11 +586,11 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
 	       dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
 	memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
 	err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
-			    c->min_io_size, wbuf->dtype);
+			    wbuf->size, wbuf->dtype);
 	if (err)
 		goto out;
 
-	offs = wbuf->offs + c->min_io_size;
+	offs = wbuf->offs + wbuf->size;
 	len -= wbuf->avail;
 	aligned_len -= wbuf->avail;
 	written = wbuf->avail;
@@ -618,6 +627,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
 	wbuf->offs = offs;
 	wbuf->used = aligned_len;
 	wbuf->avail = c->min_io_size - aligned_len;
+	wbuf->size = c->min_io_size;
 	wbuf->next_ino = 0;
 	spin_unlock(&wbuf->lock);
 
@@ -855,7 +865,7 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
 
 	wbuf->used = 0;
 	wbuf->lnum = wbuf->offs = -1;
-	wbuf->avail = c->min_io_size;
+	wbuf->avail = wbuf->size = c->min_io_size;
 	wbuf->dtype = UBI_UNKNOWN;
 	wbuf->sync_callback = NULL;
 	mutex_init(&wbuf->io_mutex);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 942c1d3cb0d..36249507848 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -646,6 +646,7 @@ typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c,
  * @offs: write-buffer offset in this logical eraseblock
  * @avail: number of bytes available in the write-buffer
  * @used:  number of used bytes in the write-buffer
+ * @size: write-buffer size (in [@c->min_io_size, @c->max_write_size] range)
  * @dtype: type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM,
  * %UBI_UNKNOWN)
  * @jhead: journal head the mutex belongs to (note, needed only to shut lockdep
@@ -680,6 +681,7 @@ struct ubifs_wbuf {
 	int offs;
 	int avail;
 	int used;
+	int size;
 	int dtype;
 	int jhead;
 	int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad);
-- 
cgit v1.2.3


From 6c7f74f703cc4baf053270a6e78a32f832f03445 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Sun, 6 Feb 2011 14:45:26 +0200
Subject: UBIFS: use max_write_size for write-buffers

Switch write-buffers from 'c->min_io_size' to 'c->max_write_size' which
presumably has to be more write speed-efficient. However, when write-buffer
is synchronized, write only the min. I/O units which contain the
data, do not write whole write-buffer. This is more space-efficient.

Additionally, this patch takes into account that the LEB might not start
from the max. write unit-aligned address.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/io.c | 181 ++++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 137 insertions(+), 44 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 7c2a014b59f..dfd168b7807 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -31,6 +31,26 @@
  * buffer is full or when it is not used for some time (by timer). This is
  * similar to the mechanism is used by JFFS2.
  *
+ * UBIFS distinguishes between minimum write size (@c->min_io_size) and maximum
+ * write size (@c->max_write_size). The latter is the maximum amount of bytes
+ * the underlying flash is able to program at a time, and writing in
+ * @c->max_write_size units should presumably be faster. Obviously,
+ * @c->min_io_size <= @c->max_write_size. Write-buffers are of
+ * @c->max_write_size bytes in size for maximum performance. However, when a
+ * write-buffer is flushed, only the portion of it (aligned to @c->min_io_size
+ * boundary) which contains data is written, not the whole write-buffer,
+ * because this is more space-efficient.
+ *
+ * This optimization adds a few complications to the code. Indeed, on the one
+ * hand, we want to write in optimal @c->max_write_size bytes chunks, which
+ * also means aligning writes at the @c->max_write_size bytes offsets. On the
+ * other hand, we do not want to waste space when synchronizing the write
+ * buffer, so during synchronization we write in smaller chunks. And this makes
+ * the next write offset to be not aligned to @c->max_write_size bytes. So we
+ * have to make sure that the write-buffer offset (@wbuf->offs) becomes aligned
+ * to @c->max_write_size bytes again. We do this by temporarily shrinking
+ * write-buffer size (@wbuf->size).
+ *
  * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by
  * mutexes defined inside these objects. Since sometimes upper-level code
  * has to lock the write-buffer (e.g. journal space reservation code), many
@@ -46,8 +66,8 @@
  * UBIFS uses padding when it pads to the next min. I/O unit. In this case it
  * uses padding nodes or padding bytes, if the padding node does not fit.
  *
- * All UBIFS nodes are protected by CRC checksums and UBIFS checks all nodes
- * every time they are read from the flash media.
+ * All UBIFS nodes are protected by CRC checksums and UBIFS checks CRC when
+ * they are read from the flash media.
  */
 
 #include <linux/crc32.h>
@@ -347,11 +367,17 @@ static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
  *
  * This function synchronizes write-buffer @buf and returns zero in case of
  * success or a negative error code in case of failure.
+ *
+ * Note, although write-buffers are of @c->max_write_size, this function does
+ * not necessarily write all @c->max_write_size bytes to the flash. Instead,
+ * if the write-buffer is only partially filled with data, only the used part
+ * of the write-buffer (aligned on @c->min_io_size boundary) is synchronized.
+ * This way we waste less space.
  */
 int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
 {
 	struct ubifs_info *c = wbuf->c;
-	int err, dirt;
+	int err, dirt, sync_len;
 
 	cancel_wbuf_timer_nolock(wbuf);
 	if (!wbuf->used || wbuf->lnum == -1)
@@ -366,26 +392,48 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
 	ubifs_assert(wbuf->size <= c->max_write_size);
 	ubifs_assert(wbuf->size % c->min_io_size == 0);
 	ubifs_assert(!c->ro_media && !c->ro_mount);
+	if (c->leb_size - wbuf->offs >= c->max_write_size)
+		ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size ));
 
 	if (c->ro_error)
 		return -EROFS;
 
-	ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail);
+	/*
+	 * Do not write whole write buffer but write only the minimum necessary
+	 * amount of min. I/O units.
+	 */
+	sync_len = ALIGN(wbuf->used, c->min_io_size);
+	dirt = sync_len - wbuf->used;
+	if (dirt)
+		ubifs_pad(c, wbuf->buf + wbuf->used, dirt);
 	err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
-			    wbuf->size, wbuf->dtype);
+			    sync_len, wbuf->dtype);
 	if (err) {
 		ubifs_err("cannot write %d bytes to LEB %d:%d",
-			  wbuf->size, wbuf->lnum, wbuf->offs);
+			  sync_len, wbuf->lnum, wbuf->offs);
 		dbg_dump_stack();
 		return err;
 	}
 
-	dirt = wbuf->avail;
-
 	spin_lock(&wbuf->lock);
-	wbuf->offs += wbuf->size;
-	wbuf->avail = c->min_io_size;
-	wbuf->size = c->min_io_size;
+	wbuf->offs += sync_len;
+	/*
+	 * Now @wbuf->offs is not necessarily aligned to @c->max_write_size.
+	 * But our goal is to optimize writes and make sure we write in
+	 * @c->max_write_size chunks and to @c->max_write_size-aligned offset.
+	 * Thus, if @wbuf->offs is not aligned to @c->max_write_size now, make
+	 * sure that @wbuf->offs + @wbuf->size is aligned to
+	 * @c->max_write_size. This way we make sure that after next
+	 * write-buffer flush we are again at the optimal offset (aligned to
+	 * @c->max_write_size).
+	 */
+	if (c->leb_size - wbuf->offs < c->max_write_size)
+		wbuf->size = c->leb_size - wbuf->offs;
+	else if (wbuf->offs & (c->max_write_size - 1))
+		wbuf->size = ALIGN(wbuf->offs, c->max_write_size) - wbuf->offs;
+	else
+		wbuf->size = c->max_write_size;
+	wbuf->avail = wbuf->size;
 	wbuf->used = 0;
 	wbuf->next_ino = 0;
 	spin_unlock(&wbuf->lock);
@@ -428,8 +476,13 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
 	spin_lock(&wbuf->lock);
 	wbuf->lnum = lnum;
 	wbuf->offs = offs;
-	wbuf->avail = c->min_io_size;
-	wbuf->size = c->min_io_size;
+	if (c->leb_size - wbuf->offs < c->max_write_size)
+		wbuf->size = c->leb_size - wbuf->offs;
+	else if (wbuf->offs & (c->max_write_size - 1))
+		wbuf->size = ALIGN(wbuf->offs, c->max_write_size) - wbuf->offs;
+	else
+		wbuf->size = c->max_write_size;
+	wbuf->avail = wbuf->size;
 	wbuf->used = 0;
 	spin_unlock(&wbuf->lock);
 	wbuf->dtype = dtype;
@@ -509,8 +562,9 @@ out_timers:
  *
  * This function writes data to flash via write-buffer @wbuf. This means that
  * the last piece of the node won't reach the flash media immediately if it
- * does not take whole minimal I/O unit. Instead, the node will sit in RAM
- * until the write-buffer is synchronized (e.g., by timer).
+ * does not take whole max. write unit (@c->max_write_size). Instead, the node
+ * will sit in RAM until the write-buffer is synchronized (e.g., by timer, or
+ * because more data are appended to the write-buffer).
  *
  * This function returns zero in case of success and a negative error code in
  * case of failure. If the node cannot be written because there is no more
@@ -533,6 +587,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
 	ubifs_assert(wbuf->size % c->min_io_size == 0);
 	ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
 	ubifs_assert(!c->ro_media && !c->ro_mount);
+	if (c->leb_size - wbuf->offs >= c->max_write_size)
+		ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size ));
 
 	if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) {
 		err = -ENOSPC;
@@ -561,9 +617,12 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
 				goto out;
 
 			spin_lock(&wbuf->lock);
-			wbuf->offs += c->min_io_size;
-			wbuf->avail = c->min_io_size;
-			wbuf->size = c->min_io_size;
+			wbuf->offs += wbuf->size;
+			if (c->leb_size - wbuf->offs >= c->max_write_size)
+				wbuf->size = c->max_write_size;
+			else
+				wbuf->size = c->leb_size - wbuf->offs;
+			wbuf->avail = wbuf->size;
 			wbuf->used = 0;
 			wbuf->next_ino = 0;
 			spin_unlock(&wbuf->lock);
@@ -577,33 +636,57 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
 		goto exit;
 	}
 
-	/*
-	 * The node is large enough and does not fit entirely within current
-	 * minimal I/O unit. We have to fill and flush write-buffer and switch
-	 * to the next min. I/O unit.
-	 */
-	dbg_io("flush jhead %s wbuf to LEB %d:%d",
-	       dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
-	memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
-	err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
-			    wbuf->size, wbuf->dtype);
-	if (err)
-		goto out;
+	offs = wbuf->offs;
+	written = 0;
 
-	offs = wbuf->offs + wbuf->size;
-	len -= wbuf->avail;
-	aligned_len -= wbuf->avail;
-	written = wbuf->avail;
+	if (wbuf->used) {
+		/*
+		 * The node is large enough and does not fit entirely within
+		 * current available space. We have to fill and flush
+		 * write-buffer and switch to the next max. write unit.
+		 */
+		dbg_io("flush jhead %s wbuf to LEB %d:%d",
+		       dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
+		memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
+		err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
+				    wbuf->size, wbuf->dtype);
+		if (err)
+			goto out;
+
+		offs += wbuf->size;
+		len -= wbuf->avail;
+		aligned_len -= wbuf->avail;
+		written += wbuf->avail;
+	} else if (wbuf->offs & (c->max_write_size - 1)) {
+		/*
+		 * The write-buffer offset is not aligned to
+		 * @c->max_write_size and @wbuf->size is less than
+		 * @c->max_write_size. Write @wbuf->size bytes to make sure the
+		 * following writes are done in optimal @c->max_write_size
+		 * chunks.
+		 */
+		dbg_io("write %d bytes to LEB %d:%d",
+		       wbuf->size, wbuf->lnum, wbuf->offs);
+		err = ubi_leb_write(c->ubi, wbuf->lnum, buf, wbuf->offs,
+				    wbuf->size, wbuf->dtype);
+		if (err)
+			goto out;
+
+		offs += wbuf->size;
+		len -= wbuf->size;
+		aligned_len -= wbuf->size;
+		written += wbuf->size;
+	}
 
 	/*
-	 * The remaining data may take more whole min. I/O units, so write the
-	 * remains multiple to min. I/O unit size directly to the flash media.
+	 * The remaining data may take more whole max. write units, so write the
+	 * remains multiple to max. write unit size directly to the flash media.
 	 * We align node length to 8-byte boundary because we anyway flash wbuf
 	 * if the remaining space is less than 8 bytes.
 	 */
-	n = aligned_len >> c->min_io_shift;
+	n = aligned_len >> c->max_write_shift;
 	if (n) {
-		n <<= c->min_io_shift;
+		n <<= c->max_write_shift;
 		dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs);
 		err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n,
 				    wbuf->dtype);
@@ -619,15 +702,18 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
 	if (aligned_len)
 		/*
 		 * And now we have what's left and what does not take whole
-		 * min. I/O unit, so write it to the write-buffer and we are
+		 * max. write unit, so write it to the write-buffer and we are
 		 * done.
 		 */
 		memcpy(wbuf->buf, buf + written, len);
 
 	wbuf->offs = offs;
+	if (c->leb_size - wbuf->offs >= c->max_write_size)
+		wbuf->size = c->max_write_size;
+	else
+		wbuf->size = c->leb_size - wbuf->offs;
+	wbuf->avail = wbuf->size - aligned_len;
 	wbuf->used = aligned_len;
-	wbuf->avail = c->min_io_size - aligned_len;
-	wbuf->size = c->min_io_size;
 	wbuf->next_ino = 0;
 	spin_unlock(&wbuf->lock);
 
@@ -851,11 +937,11 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
 {
 	size_t size;
 
-	wbuf->buf = kmalloc(c->min_io_size, GFP_KERNEL);
+	wbuf->buf = kmalloc(c->max_write_size, GFP_KERNEL);
 	if (!wbuf->buf)
 		return -ENOMEM;
 
-	size = (c->min_io_size / UBIFS_CH_SZ + 1) * sizeof(ino_t);
+	size = (c->max_write_size / UBIFS_CH_SZ + 1) * sizeof(ino_t);
 	wbuf->inodes = kmalloc(size, GFP_KERNEL);
 	if (!wbuf->inodes) {
 		kfree(wbuf->buf);
@@ -865,7 +951,14 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
 
 	wbuf->used = 0;
 	wbuf->lnum = wbuf->offs = -1;
-	wbuf->avail = wbuf->size = c->min_io_size;
+	/*
+	 * If the LEB starts at the max. write size aligned address, then
+	 * write-buffer size has to be set to @c->max_write_size. Otherwise,
+	 * set it to something smaller so that it ends at the closest max.
+	 * write size boundary.
+	 */
+	size = c->max_write_size - (c->leb_start % c->max_write_size);
+	wbuf->avail = wbuf->size = size;
 	wbuf->dtype = UBI_UNKNOWN;
 	wbuf->sync_callback = NULL;
 	mutex_init(&wbuf->io_mutex);
-- 
cgit v1.2.3


From 2765df7da540687c4d57ca840182122f074c5b9c Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Wed, 2 Feb 2011 09:22:54 +0200
Subject: UBIFS: use max_write_size during recovery

When recovering from unclean reboots UBIFS scans the journal and checks nodes.
If a corrupted node is found, UBIFS tries to check if this is the last node
in the LEB or not. This is done by checking if there are only 0xFF bytes
starting from the next min. I/O unit. However, since now we write in
c->max_write_size, we should actually check for 0xFFs starting from the
next max. write unit.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/recovery.c | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index e2714f8f05f..936f2cbfe6b 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -38,7 +38,7 @@
  * UBIFS writes only to erased LEBs, so it writes only to the flash space
  * containing only 0xFFs. UBIFS also always writes strictly from the beginning
  * of the LEB to the end. And UBIFS assumes that the underlying flash media
- * writes in @c->min_io_unit bytes at a time.
+ * writes in @c->max_write_size bytes at a time.
  *
  * Hence, if UBIFS finds a corrupted node at offset X, it expects only the min.
  * I/O unit corresponding to offset X to contain corrupted data, all the
@@ -379,8 +379,9 @@ int ubifs_write_rcvrd_mst_node(struct ubifs_info *c)
  * @offs: offset to check
  *
  * This function returns %1 if @offs was in the last write to the LEB whose data
- * is in @buf, otherwise %0 is returned.  The determination is made by checking
- * for subsequent empty space starting from the next @c->min_io_size boundary.
+ * is in @buf, otherwise %0 is returned. The determination is made by checking
+ * for subsequent empty space starting from the next @c->max_write_size
+ * boundary.
  */
 static int is_last_write(const struct ubifs_info *c, void *buf, int offs)
 {
@@ -388,10 +389,10 @@ static int is_last_write(const struct ubifs_info *c, void *buf, int offs)
 	uint8_t *p;
 
 	/*
-	 * Round up to the next @c->min_io_size boundary i.e. @offs is in the
-	 * last wbuf written. After that should be empty space.
+	 * Round up to the next @c->max_write_size boundary i.e. @offs is in
+	 * the last wbuf written. After that should be empty space.
 	 */
-	empty_offs = ALIGN(offs + 1, c->min_io_size);
+	empty_offs = ALIGN(offs + 1, c->max_write_size);
 	check_len = c->leb_size - empty_offs;
 	p = buf + empty_offs - offs;
 	return is_empty(p, check_len);
@@ -446,7 +447,7 @@ static int no_more_nodes(const struct ubifs_info *c, void *buf, int len,
 	int skip, dlen = le32_to_cpu(ch->len);
 
 	/* Check for empty space after the corrupt node's common header */
-	skip = ALIGN(offs + UBIFS_CH_SZ, c->min_io_size) - offs;
+	skip = ALIGN(offs + UBIFS_CH_SZ, c->max_write_size) - offs;
 	if (is_empty(buf + skip, len - skip))
 		return 1;
 	/*
@@ -458,7 +459,7 @@ static int no_more_nodes(const struct ubifs_info *c, void *buf, int len,
 		return 0;
 	}
 	/* Now we know the corrupt node's length we can skip over it */
-	skip = ALIGN(offs + dlen, c->min_io_size) - offs;
+	skip = ALIGN(offs + dlen, c->max_write_size) - offs;
 	/* After which there should be empty space */
 	if (is_empty(buf + skip, len - skip))
 		return 1;
@@ -857,12 +858,8 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
 static int recover_head(const struct ubifs_info *c, int lnum, int offs,
 			void *sbuf)
 {
-	int len, err;
+	int len = c->max_write_size, err;
 
-	if (c->min_io_size > 1)
-		len = c->min_io_size;
-	else
-		len = 512;
 	if (offs + len > c->leb_size)
 		len = c->leb_size - offs;
 
-- 
cgit v1.2.3


From ea8efc74bd0402b4d5f663d007b4e25fa29ea778 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 8 Mar 2011 11:54:40 -0500
Subject: Btrfs: make sure not to return overlapping extents to fiemap

The btrfs fiemap code was incorrectly returning duplicate or overlapping
extents in some cases.  cp was blindly trusting this result and we would
end up with a destination file that was bigger than the original because
some bytes were copied twice.

The fix here adjusts our offsets to make sure we're always moving
forward in the fiemap results.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 33 +++++++++++++++++++++++++++------
 1 file changed, 27 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ff45b80d90f..9fcb5ede6b7 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3046,17 +3046,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	}
 
 	while (!end) {
-		off = extent_map_end(em);
-		if (off >= max)
-			end = 1;
+		u64 offset_in_extent;
+
+		/* break if the extent we found is outside the range */
+		if (em->start >= max || extent_map_end(em) < off)
+			break;
+
+		/*
+		 * get_extent may return an extent that starts before our
+		 * requested range.  We have to make sure the ranges
+		 * we return to fiemap always move forward and don't
+		 * overlap, so adjust the offsets here
+		 */
+		em_start = max(em->start, off);
 
-		em_start = em->start;
-		em_len = em->len;
+		/*
+		 * record the offset from the start of the extent
+		 * for adjusting the disk offset below
+		 */
+		offset_in_extent = em_start - em->start;
 		em_end = extent_map_end(em);
+		em_len = em_end - em_start;
 		emflags = em->flags;
 		disko = 0;
 		flags = 0;
 
+		/*
+		 * bump off for our next call to get_extent
+		 */
+		off = extent_map_end(em);
+		if (off >= max)
+			end = 1;
+
 		if (em->block_start == EXTENT_MAP_LAST_BYTE) {
 			end = 1;
 			flags |= FIEMAP_EXTENT_LAST;
@@ -3067,7 +3088,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			flags |= (FIEMAP_EXTENT_DELALLOC |
 				  FIEMAP_EXTENT_UNKNOWN);
 		} else {
-			disko = em->block_start;
+			disko = em->block_start + offset_in_extent;
 		}
 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
 			flags |= FIEMAP_EXTENT_ENCODED;
-- 
cgit v1.2.3


From 529d7b2a7fa31e9f7d08bc790d232c3cbe64fa24 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Wed, 2 Mar 2011 23:48:33 -0500
Subject: nfsd4: minor nfs4state.c reshuffling

Minor cleanup in preparation for a bugfix--moving some code to avoid
forward references, etc.  No change in functionality.

Cc: stable@kernel.org
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 125 +++++++++++++++++++++++++---------------------------
 1 file changed, 61 insertions(+), 64 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c4f2b0f63e4..84d2dd327b2 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -316,64 +316,6 @@ static struct list_head	unconf_id_hashtbl[CLIENT_HASH_SIZE];
 static struct list_head client_lru;
 static struct list_head close_lru;
 
-static void unhash_generic_stateid(struct nfs4_stateid *stp)
-{
-	list_del(&stp->st_hash);
-	list_del(&stp->st_perfile);
-	list_del(&stp->st_perstateowner);
-}
-
-static void free_generic_stateid(struct nfs4_stateid *stp)
-{
-	put_nfs4_file(stp->st_file);
-	kmem_cache_free(stateid_slab, stp);
-}
-
-static void release_lock_stateid(struct nfs4_stateid *stp)
-{
-	struct file *file;
-
-	unhash_generic_stateid(stp);
-	file = find_any_file(stp->st_file);
-	if (file)
-		locks_remove_posix(file, (fl_owner_t)stp->st_stateowner);
-	free_generic_stateid(stp);
-}
-
-static void unhash_lockowner(struct nfs4_stateowner *sop)
-{
-	struct nfs4_stateid *stp;
-
-	list_del(&sop->so_idhash);
-	list_del(&sop->so_strhash);
-	list_del(&sop->so_perstateid);
-	while (!list_empty(&sop->so_stateids)) {
-		stp = list_first_entry(&sop->so_stateids,
-				struct nfs4_stateid, st_perstateowner);
-		release_lock_stateid(stp);
-	}
-}
-
-static void release_lockowner(struct nfs4_stateowner *sop)
-{
-	unhash_lockowner(sop);
-	nfs4_put_stateowner(sop);
-}
-
-static void
-release_stateid_lockowners(struct nfs4_stateid *open_stp)
-{
-	struct nfs4_stateowner *lock_sop;
-
-	while (!list_empty(&open_stp->st_lockowners)) {
-		lock_sop = list_entry(open_stp->st_lockowners.next,
-				struct nfs4_stateowner, so_perstateid);
-		/* list_del(&open_stp->st_lockowners);  */
-		BUG_ON(lock_sop->so_is_open_owner);
-		release_lockowner(lock_sop);
-	}
-}
-
 /*
  * We store the NONE, READ, WRITE, and BOTH bits separately in the
  * st_{access,deny}_bmap field of the stateid, in order to track not
@@ -446,6 +388,64 @@ static int nfs4_access_bmap_to_omode(struct nfs4_stateid *stp)
 	return nfs4_access_to_omode(access);
 }
 
+static void unhash_generic_stateid(struct nfs4_stateid *stp)
+{
+	list_del(&stp->st_hash);
+	list_del(&stp->st_perfile);
+	list_del(&stp->st_perstateowner);
+}
+
+static void free_generic_stateid(struct nfs4_stateid *stp)
+{
+	put_nfs4_file(stp->st_file);
+	kmem_cache_free(stateid_slab, stp);
+}
+
+static void release_lock_stateid(struct nfs4_stateid *stp)
+{
+	struct file *file;
+
+	unhash_generic_stateid(stp);
+	file = find_any_file(stp->st_file);
+	if (file)
+		locks_remove_posix(file, (fl_owner_t)stp->st_stateowner);
+	free_generic_stateid(stp);
+}
+
+static void unhash_lockowner(struct nfs4_stateowner *sop)
+{
+	struct nfs4_stateid *stp;
+
+	list_del(&sop->so_idhash);
+	list_del(&sop->so_strhash);
+	list_del(&sop->so_perstateid);
+	while (!list_empty(&sop->so_stateids)) {
+		stp = list_first_entry(&sop->so_stateids,
+				struct nfs4_stateid, st_perstateowner);
+		release_lock_stateid(stp);
+	}
+}
+
+static void release_lockowner(struct nfs4_stateowner *sop)
+{
+	unhash_lockowner(sop);
+	nfs4_put_stateowner(sop);
+}
+
+static void
+release_stateid_lockowners(struct nfs4_stateid *open_stp)
+{
+	struct nfs4_stateowner *lock_sop;
+
+	while (!list_empty(&open_stp->st_lockowners)) {
+		lock_sop = list_entry(open_stp->st_lockowners.next,
+				struct nfs4_stateowner, so_perstateid);
+		/* list_del(&open_stp->st_lockowners);  */
+		BUG_ON(lock_sop->so_is_open_owner);
+		release_lockowner(lock_sop);
+	}
+}
+
 static void release_open_stateid(struct nfs4_stateid *stp)
 {
 	int oflag = nfs4_access_bmap_to_omode(stp);
@@ -3764,7 +3764,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct file_lock conflock;
 	__be32 status = 0;
 	unsigned int strhashval;
-	unsigned int cmd;
 	int err;
 
 	dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n",
@@ -3851,8 +3850,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 				filp = find_readable_file(lock_stp->st_file);
 			}
 			file_lock.fl_type = F_RDLCK;
-			cmd = F_SETLK;
-		break;
+			break;
 		case NFS4_WRITE_LT:
 		case NFS4_WRITEW_LT:
 			if (find_writeable_file(lock_stp->st_file)) {
@@ -3860,8 +3858,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 				filp = find_writeable_file(lock_stp->st_file);
 			}
 			file_lock.fl_type = F_WRLCK;
-			cmd = F_SETLK;
-		break;
+			break;
 		default:
 			status = nfserr_inval;
 		goto out;
@@ -3885,7 +3882,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	* Note: locks.c uses the BKL to protect the inode's lock list.
 	*/
 
-	err = vfs_lock_file(filp, cmd, &file_lock, &conflock);
+	err = vfs_lock_file(filp, F_SETLK, &file_lock, &conflock);
 	switch (-err) {
 	case 0: /* success! */
 		update_stateid(&lock_stp->st_stateid);
-- 
cgit v1.2.3


From 0997b173609b9229ece28941c118a2a9b278796e Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Wed, 2 Mar 2011 18:01:35 -0500
Subject: nfsd4: fix struct file leak
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make sure we properly reference count the struct files that a lock
depends on, and release them when the lock stateid is released.

This fixes a major leak of struct files when using locking over nfsv4.

Cc: stable@kernel.org
Reported-by: Rick Koshi <nfs-bug-report@more-right-rudder.com>
Tested-by: Ivo Přikryl <prikryl@eurosat.cz>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 32 +++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 84d2dd327b2..c26dc31fb94 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -397,6 +397,9 @@ static void unhash_generic_stateid(struct nfs4_stateid *stp)
 
 static void free_generic_stateid(struct nfs4_stateid *stp)
 {
+	int oflag = nfs4_access_bmap_to_omode(stp);
+
+	nfs4_file_put_access(stp->st_file, oflag);
 	put_nfs4_file(stp->st_file);
 	kmem_cache_free(stateid_slab, stp);
 }
@@ -448,11 +451,8 @@ release_stateid_lockowners(struct nfs4_stateid *open_stp)
 
 static void release_open_stateid(struct nfs4_stateid *stp)
 {
-	int oflag = nfs4_access_bmap_to_omode(stp);
-
 	unhash_generic_stateid(stp);
 	release_stateid_lockowners(stp);
-	nfs4_file_put_access(stp->st_file, oflag);
 	free_generic_stateid(stp);
 }
 
@@ -3734,6 +3734,7 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc
 	stp->st_stateid.si_stateownerid = sop->so_id;
 	stp->st_stateid.si_fileid = fp->fi_id;
 	stp->st_stateid.si_generation = 0;
+	stp->st_access_bmap = 0;
 	stp->st_deny_bmap = open_stp->st_deny_bmap;
 	stp->st_openstp = open_stp;
 
@@ -3748,6 +3749,17 @@ check_lock_length(u64 offset, u64 length)
 	     LOFF_OVERFLOW(offset, length)));
 }
 
+static void get_lock_access(struct nfs4_stateid *lock_stp, u32 access)
+{
+	struct nfs4_file *fp = lock_stp->st_file;
+	int oflag = nfs4_access_to_omode(access);
+
+	if (test_bit(access, &lock_stp->st_access_bmap))
+		return;
+	nfs4_file_get_access(fp, oflag);
+	__set_bit(access, &lock_stp->st_access_bmap);
+}
+
 /*
  *  LOCK operation 
  */
@@ -3845,18 +3857,16 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	switch (lock->lk_type) {
 		case NFS4_READ_LT:
 		case NFS4_READW_LT:
-			if (find_readable_file(lock_stp->st_file)) {
-				nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_READ);
-				filp = find_readable_file(lock_stp->st_file);
-			}
+			filp = find_readable_file(lock_stp->st_file);
+			if (filp)
+				get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ);
 			file_lock.fl_type = F_RDLCK;
 			break;
 		case NFS4_WRITE_LT:
 		case NFS4_WRITEW_LT:
-			if (find_writeable_file(lock_stp->st_file)) {
-				nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_WRITE);
-				filp = find_writeable_file(lock_stp->st_file);
-			}
+			filp = find_writeable_file(lock_stp->st_file);
+			if (filp)
+				get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE);
 			file_lock.fl_type = F_WRLCK;
 			break;
 		default:
-- 
cgit v1.2.3


From 3ec07aa9522e3d5e9d5ede7bef946756e623a0a0 Mon Sep 17 00:00:00 2001
From: roel <roel.kluin@gmail.com>
Date: Tue, 8 Mar 2011 22:32:26 +0100
Subject: nfsd: wrong index used in inner loop

Index i was already used in the outer loop

Cc: stable@kernel.org
Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4xdr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 1275b865507..615f0a9f060 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1142,7 +1142,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
 
 	u32 dummy;
 	char *machine_name;
-	int i;
+	int i, j;
 	int nr_secflavs;
 
 	READ_BUF(16);
@@ -1215,7 +1215,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
 			READ_BUF(4);
 			READ32(dummy);
 			READ_BUF(dummy * 4);
-			for (i = 0; i < dummy; ++i)
+			for (j = 0; j < dummy; ++j)
 				READ32(dummy);
 			break;
 		case RPC_AUTH_GSS:
-- 
cgit v1.2.3


From 3b2ce58b0f3c1633750529713be0e467282abd78 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Wed, 9 Mar 2011 11:05:07 +0900
Subject: nilfs2: move mount options to nilfs object

This moves the mount options variable (s_mount_opt) from the
nilfs_sb_info struct to the nilfs object.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/sb.h        | 16 ---------------
 fs/nilfs2/segment.c   |  9 +++++----
 fs/nilfs2/super.c     | 56 ++++++++++++++++++++++++++-------------------------
 fs/nilfs2/the_nilfs.c |  4 ++--
 fs/nilfs2/the_nilfs.h | 18 +++++++++++++++++
 5 files changed, 54 insertions(+), 49 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
index 7a17715f215..d7346c949c8 100644
--- a/fs/nilfs2/sb.h
+++ b/fs/nilfs2/sb.h
@@ -34,8 +34,6 @@ struct nilfs_sc_info;
  * NILFS super-block data in memory
  */
 struct nilfs_sb_info {
-	/* Mount options */
-	unsigned long s_mount_opt;
 	uid_t s_resuid;
 	gid_t s_resgid;
 
@@ -68,18 +66,4 @@ static inline struct nilfs_sc_info *NILFS_SC(struct nilfs_sb_info *sbi)
 	return sbi->s_sc_info;
 }
 
-/*
- * Bit operations for the mount option
- */
-#define nilfs_clear_opt(sbi, opt)  \
-	do { (sbi)->s_mount_opt &= ~NILFS_MOUNT_##opt; } while (0)
-#define nilfs_set_opt(sbi, opt)  \
-	do { (sbi)->s_mount_opt |= NILFS_MOUNT_##opt; } while (0)
-#define nilfs_test_opt(sbi, opt)   ((sbi)->s_mount_opt & NILFS_MOUNT_##opt)
-#define nilfs_write_opt(sbi, mask, opt)					\
-	do { (sbi)->s_mount_opt =					\
-		(((sbi)->s_mount_opt & ~NILFS_MOUNT_##mask) |		\
-		 NILFS_MOUNT_##opt);					\
-	} while (0)
-
 #endif /* _NILFS_SB */
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 2de9f636792..851bcd3890c 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2298,6 +2298,7 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
 				  loff_t start, loff_t end)
 {
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+	struct the_nilfs *nilfs = sbi->s_nilfs;
 	struct nilfs_sc_info *sci = NILFS_SC(sbi);
 	struct nilfs_inode_info *ii;
 	struct nilfs_transaction_info ti;
@@ -2310,9 +2311,9 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
 
 	ii = NILFS_I(inode);
 	if (test_bit(NILFS_I_INODE_DIRTY, &ii->i_state) ||
-	    nilfs_test_opt(sbi, STRICT_ORDER) ||
+	    nilfs_test_opt(nilfs, STRICT_ORDER) ||
 	    test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) ||
-	    nilfs_discontinued(sbi->s_nilfs)) {
+	    nilfs_discontinued(nilfs)) {
 		nilfs_transaction_unlock(sbi);
 		err = nilfs_segctor_sync(sci);
 		return err;
@@ -2480,14 +2481,14 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
 		set_current_state(TASK_INTERRUPTIBLE);
 		schedule_timeout(sci->sc_interval);
 	}
-	if (nilfs_test_opt(sbi, DISCARD)) {
+	if (nilfs_test_opt(nilfs, DISCARD)) {
 		int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs,
 						 sci->sc_nfreesegs);
 		if (ret) {
 			printk(KERN_WARNING
 			       "NILFS warning: error %d on discard request, "
 			       "turning discards off for the device\n", ret);
-			nilfs_clear_opt(sbi, DISCARD);
+			nilfs_clear_opt(nilfs, DISCARD);
 		}
 	}
 
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 1673b3d9984..0576cb21d69 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -109,7 +109,7 @@ static void nilfs_set_error(struct nilfs_sb_info *sbi)
 void nilfs_error(struct super_block *sb, const char *function,
 		 const char *fmt, ...)
 {
-	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+	struct the_nilfs *nilfs = sbi->s_nilfs;
 	struct va_format vaf;
 	va_list args;
 
@@ -126,13 +126,13 @@ void nilfs_error(struct super_block *sb, const char *function,
 	if (!(sb->s_flags & MS_RDONLY)) {
 		nilfs_set_error(sbi);
 
-		if (nilfs_test_opt(sbi, ERRORS_RO)) {
+		if (nilfs_test_opt(nilfs, ERRORS_RO)) {
 			printk(KERN_CRIT "Remounting filesystem read-only\n");
 			sb->s_flags |= MS_RDONLY;
 		}
 	}
 
-	if (nilfs_test_opt(sbi, ERRORS_PANIC))
+	if (nilfs_test_opt(nilfs, ERRORS_PANIC))
 		panic("NILFS (device %s): panic forced after error\n",
 		      sb->s_id);
 }
@@ -196,7 +196,7 @@ static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag)
 
  retry:
 	set_buffer_dirty(nilfs->ns_sbh[0]);
-	if (nilfs_test_opt(sbi, BARRIER)) {
+	if (nilfs_test_opt(nilfs, BARRIER)) {
 		err = __sync_dirty_buffer(nilfs->ns_sbh[0],
 					  WRITE_SYNC | WRITE_FLUSH_FUA);
 	} else {
@@ -530,22 +530,22 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 {
 	struct super_block *sb = vfs->mnt_sb;
-	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
 	struct nilfs_root *root = NILFS_I(vfs->mnt_root->d_inode)->i_root;
 
-	if (!nilfs_test_opt(sbi, BARRIER))
+	if (!nilfs_test_opt(nilfs, BARRIER))
 		seq_puts(seq, ",nobarrier");
 	if (root->cno != NILFS_CPTREE_CURRENT_CNO)
 		seq_printf(seq, ",cp=%llu", (unsigned long long)root->cno);
-	if (nilfs_test_opt(sbi, ERRORS_PANIC))
+	if (nilfs_test_opt(nilfs, ERRORS_PANIC))
 		seq_puts(seq, ",errors=panic");
-	if (nilfs_test_opt(sbi, ERRORS_CONT))
+	if (nilfs_test_opt(nilfs, ERRORS_CONT))
 		seq_puts(seq, ",errors=continue");
-	if (nilfs_test_opt(sbi, STRICT_ORDER))
+	if (nilfs_test_opt(nilfs, STRICT_ORDER))
 		seq_puts(seq, ",order=strict");
-	if (nilfs_test_opt(sbi, NORECOVERY))
+	if (nilfs_test_opt(nilfs, NORECOVERY))
 		seq_puts(seq, ",norecovery");
-	if (nilfs_test_opt(sbi, DISCARD))
+	if (nilfs_test_opt(nilfs, DISCARD))
 		seq_puts(seq, ",discard");
 
 	return 0;
@@ -594,7 +594,7 @@ static match_table_t tokens = {
 
 static int parse_options(char *options, struct super_block *sb, int is_remount)
 {
-	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
 	char *p;
 	substring_t args[MAX_OPT_ARGS];
 
@@ -609,29 +609,29 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
 		token = match_token(p, tokens, args);
 		switch (token) {
 		case Opt_barrier:
-			nilfs_set_opt(sbi, BARRIER);
+			nilfs_set_opt(nilfs, BARRIER);
 			break;
 		case Opt_nobarrier:
-			nilfs_clear_opt(sbi, BARRIER);
+			nilfs_clear_opt(nilfs, BARRIER);
 			break;
 		case Opt_order:
 			if (strcmp(args[0].from, "relaxed") == 0)
 				/* Ordered data semantics */
-				nilfs_clear_opt(sbi, STRICT_ORDER);
+				nilfs_clear_opt(nilfs, STRICT_ORDER);
 			else if (strcmp(args[0].from, "strict") == 0)
 				/* Strict in-order semantics */
-				nilfs_set_opt(sbi, STRICT_ORDER);
+				nilfs_set_opt(nilfs, STRICT_ORDER);
 			else
 				return 0;
 			break;
 		case Opt_err_panic:
-			nilfs_write_opt(sbi, ERROR_MODE, ERRORS_PANIC);
+			nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_PANIC);
 			break;
 		case Opt_err_ro:
-			nilfs_write_opt(sbi, ERROR_MODE, ERRORS_RO);
+			nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_RO);
 			break;
 		case Opt_err_cont:
-			nilfs_write_opt(sbi, ERROR_MODE, ERRORS_CONT);
+			nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_CONT);
 			break;
 		case Opt_snapshot:
 			if (is_remount) {
@@ -642,13 +642,13 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
 			}
 			break;
 		case Opt_norecovery:
-			nilfs_set_opt(sbi, NORECOVERY);
+			nilfs_set_opt(nilfs, NORECOVERY);
 			break;
 		case Opt_discard:
-			nilfs_set_opt(sbi, DISCARD);
+			nilfs_set_opt(nilfs, DISCARD);
 			break;
 		case Opt_nodiscard:
-			nilfs_clear_opt(sbi, DISCARD);
+			nilfs_clear_opt(nilfs, DISCARD);
 			break;
 		default:
 			printk(KERN_ERR
@@ -660,10 +660,12 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
 }
 
 static inline void
-nilfs_set_default_options(struct nilfs_sb_info *sbi,
+nilfs_set_default_options(struct super_block *sb,
 			  struct nilfs_super_block *sbp)
 {
-	sbi->s_mount_opt =
+	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+
+	nilfs->ns_mount_opt =
 		NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER;
 }
 
@@ -736,7 +738,7 @@ int nilfs_store_magic_and_option(struct super_block *sb,
 	sb->s_flags |= MS_NOATIME;
 #endif
 
-	nilfs_set_default_options(sbi, sbp);
+	nilfs_set_default_options(sb, sbp);
 
 	sbi->s_resuid = le16_to_cpu(sbp->s_def_resuid);
 	sbi->s_resgid = le16_to_cpu(sbp->s_def_resgid);
@@ -1023,7 +1025,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 	int err;
 
 	old_sb_flags = sb->s_flags;
-	old_mount_opt = sbi->s_mount_opt;
+	old_mount_opt = nilfs->ns_mount_opt;
 
 	if (!parse_options(data, sb, 1)) {
 		err = -EINVAL;
@@ -1092,7 +1094,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 
  restore_opts:
 	sb->s_flags = old_sb_flags;
-	sbi->s_mount_opt = old_mount_opt;
+	nilfs->ns_mount_opt = old_mount_opt;
 	return err;
 }
 
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 9098909d5ce..d3775336a16 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -283,7 +283,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 	if (s_flags & MS_RDONLY) {
 		__u64 features;
 
-		if (nilfs_test_opt(sbi, NORECOVERY)) {
+		if (nilfs_test_opt(nilfs, NORECOVERY)) {
 			printk(KERN_INFO "NILFS: norecovery option specified. "
 			       "skipping roll-forward recovery\n");
 			goto skip_recovery;
@@ -305,7 +305,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 			goto failed_unload;
 		}
 		sbi->s_super->s_flags &= ~MS_RDONLY;
-	} else if (nilfs_test_opt(sbi, NORECOVERY)) {
+	} else if (nilfs_test_opt(nilfs, NORECOVERY)) {
 		printk(KERN_ERR "NILFS: recovery cancelled because norecovery "
 		       "option was specified for a read/write mount\n");
 		err = -EINVAL;
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index fd85e4c05c6..438278041d8 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -72,6 +72,7 @@ enum {
  * @ns_cptree: rb-tree of all mounted checkpoints (nilfs_root)
  * @ns_cptree_lock: lock protecting @ns_cptree
  * @ns_gc_inodes: dummy inodes to keep live blocks
+ * @ns_mount_opt: mount options
  * @ns_blocksize_bits: bit length of block size
  * @ns_blocksize: block size
  * @ns_nsegments: number of segments in filesystem
@@ -148,6 +149,9 @@ struct the_nilfs {
 	/* GC inode list */
 	struct list_head	ns_gc_inodes;
 
+	/* Mount options */
+	unsigned long		ns_mount_opt;
+
 	/* Disk layout information (static) */
 	unsigned int		ns_blocksize_bits;
 	unsigned int		ns_blocksize;
@@ -180,6 +184,20 @@ THE_NILFS_FNS(DISCONTINUED, discontinued)
 THE_NILFS_FNS(GC_RUNNING, gc_running)
 THE_NILFS_FNS(SB_DIRTY, sb_dirty)
 
+/*
+ * Mount option operations
+ */
+#define nilfs_clear_opt(nilfs, opt)  \
+	do { (nilfs)->ns_mount_opt &= ~NILFS_MOUNT_##opt; } while (0)
+#define nilfs_set_opt(nilfs, opt)  \
+	do { (nilfs)->ns_mount_opt |= NILFS_MOUNT_##opt; } while (0)
+#define nilfs_test_opt(nilfs, opt) ((nilfs)->ns_mount_opt & NILFS_MOUNT_##opt)
+#define nilfs_write_opt(nilfs, mask, opt)				\
+	do { (nilfs)->ns_mount_opt =					\
+		(((nilfs)->ns_mount_opt & ~NILFS_MOUNT_##mask) |	\
+		 NILFS_MOUNT_##opt);					\
+	} while (0)
+
 /**
  * struct nilfs_root - nilfs root object
  * @cno: checkpoint number
-- 
cgit v1.2.3


From 574e6c3145c5754141361c695b58736c294a8ae1 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Wed, 9 Mar 2011 11:05:07 +0900
Subject: nilfs2: move parameters on nilfs_sb_info into nilfs object

This moves four parameter variables of nilfs_sb_info (s_resuid,
s_resgid, s_interval and s_watermark) to the nilfs object.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/sb.h        |  7 -------
 fs/nilfs2/segment.c   |  9 +++++----
 fs/nilfs2/super.c     | 10 +++++-----
 fs/nilfs2/the_nilfs.h |  9 +++++++++
 4 files changed, 19 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
index d7346c949c8..0512521ba8a 100644
--- a/fs/nilfs2/sb.h
+++ b/fs/nilfs2/sb.h
@@ -34,13 +34,6 @@ struct nilfs_sc_info;
  * NILFS super-block data in memory
  */
 struct nilfs_sb_info {
-	uid_t s_resuid;
-	gid_t s_resgid;
-
-	unsigned long s_interval;	/* construction interval */
-	unsigned long s_watermark;	/* threshold of data amount
-					   for the segment construction */
-
 	/* Fundamental members */
 	struct super_block *s_super;	/* reverse pointer to super_block */
 	struct the_nilfs *s_nilfs;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 851bcd3890c..a32d9cb2842 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2676,6 +2676,7 @@ static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
 static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi,
 					       struct nilfs_root *root)
 {
+	struct the_nilfs *nilfs = sbi->s_nilfs;
 	struct nilfs_sc_info *sci;
 
 	sci = kzalloc(sizeof(*sci), GFP_KERNEL);
@@ -2703,10 +2704,10 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi,
 	sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ;
 	sci->sc_watermark = NILFS_SC_DEFAULT_WATERMARK;
 
-	if (sbi->s_interval)
-		sci->sc_interval = sbi->s_interval;
-	if (sbi->s_watermark)
-		sci->sc_watermark = sbi->s_watermark;
+	if (nilfs->ns_interval)
+		sci->sc_interval = nilfs->ns_interval;
+	if (nilfs->ns_watermark)
+		sci->sc_watermark = nilfs->ns_watermark;
 	return sci;
 }
 
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 0576cb21d69..2f17a2f9ef9 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -729,7 +729,7 @@ int nilfs_store_magic_and_option(struct super_block *sb,
 				 struct nilfs_super_block *sbp,
 				 char *data)
 {
-	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
 
 	sb->s_magic = le16_to_cpu(sbp->s_magic);
 
@@ -740,10 +740,10 @@ int nilfs_store_magic_and_option(struct super_block *sb,
 
 	nilfs_set_default_options(sb, sbp);
 
-	sbi->s_resuid = le16_to_cpu(sbp->s_def_resuid);
-	sbi->s_resgid = le16_to_cpu(sbp->s_def_resgid);
-	sbi->s_interval = le32_to_cpu(sbp->s_c_interval);
-	sbi->s_watermark = le32_to_cpu(sbp->s_c_block_max);
+	nilfs->ns_resuid = le16_to_cpu(sbp->s_def_resuid);
+	nilfs->ns_resgid = le16_to_cpu(sbp->s_def_resgid);
+	nilfs->ns_interval = le32_to_cpu(sbp->s_c_interval);
+	nilfs->ns_watermark = le32_to_cpu(sbp->s_c_block_max);
 
 	return !parse_options(data, sb, 0) ? -EINVAL : 0 ;
 }
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 438278041d8..4a9bf3913c9 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -73,6 +73,10 @@ enum {
  * @ns_cptree_lock: lock protecting @ns_cptree
  * @ns_gc_inodes: dummy inodes to keep live blocks
  * @ns_mount_opt: mount options
+ * @ns_resuid: uid for reserved blocks
+ * @ns_resgid: gid for reserved blocks
+ * @ns_interval: checkpoint creation interval
+ * @ns_watermark: watermark for the number of dirty buffers
  * @ns_blocksize_bits: bit length of block size
  * @ns_blocksize: block size
  * @ns_nsegments: number of segments in filesystem
@@ -152,6 +156,11 @@ struct the_nilfs {
 	/* Mount options */
 	unsigned long		ns_mount_opt;
 
+	uid_t			ns_resuid;
+	gid_t			ns_resgid;
+	unsigned long		ns_interval;
+	unsigned long		ns_watermark;
+
 	/* Disk layout information (static) */
 	unsigned int		ns_blocksize_bits;
 	unsigned int		ns_blocksize;
-- 
cgit v1.2.3


From 693dd321222f03b17668f88ceb0f7d518900191e Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Wed, 9 Mar 2011 11:05:07 +0900
Subject: nilfs2: move s_inode_lock and s_dirty_files into nilfs object

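Moves the s_inode_lock spinlock and the s_dirty_files list from the
nilfs_sb_info structure to the nilfs object.

This also renames nilfs_segctor_check_in_files() and
nilfs_segctor_check_out_files() to nilfs_segctor_collect_dirty_files()
and nilfs_segctor_drop_written_files(), respectively, and makes them
and nilfs_dispose_list() take the nilfs object instead of
nilfs_sb_info.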

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/inode.c     | 30 +++++++++----------
 fs/nilfs2/sb.h        |  3 --
 fs/nilfs2/segment.c   | 80 +++++++++++++++++++++++++--------------------------
 fs/nilfs2/super.c     |  3 --
 fs/nilfs2/the_nilfs.c |  2 ++
 fs/nilfs2/the_nilfs.h |  6 ++++
 6 files changed, 63 insertions(+), 61 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 22a816ba362..dd5d6d633ea 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -807,18 +807,18 @@ int nilfs_permission(struct inode *inode, int mask, unsigned int flags)
 
 int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
 {
-	struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
+	struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
 	struct nilfs_inode_info *ii = NILFS_I(inode);
 	int err;
 
-	spin_lock(&sbi->s_inode_lock);
+	spin_lock(&nilfs->ns_inode_lock);
 	if (ii->i_bh == NULL) {
-		spin_unlock(&sbi->s_inode_lock);
+		spin_unlock(&nilfs->ns_inode_lock);
 		err = nilfs_ifile_get_inode_block(ii->i_root->ifile,
 						  inode->i_ino, pbh);
 		if (unlikely(err))
 			return err;
-		spin_lock(&sbi->s_inode_lock);
+		spin_lock(&nilfs->ns_inode_lock);
 		if (ii->i_bh == NULL)
 			ii->i_bh = *pbh;
 		else {
@@ -829,36 +829,36 @@ int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
 		*pbh = ii->i_bh;
 
 	get_bh(*pbh);
-	spin_unlock(&sbi->s_inode_lock);
+	spin_unlock(&nilfs->ns_inode_lock);
 	return 0;
 }
 
 int nilfs_inode_dirty(struct inode *inode)
 {
 	struct nilfs_inode_info *ii = NILFS_I(inode);
-	struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
+	struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
 	int ret = 0;
 
 	if (!list_empty(&ii->i_dirty)) {
-		spin_lock(&sbi->s_inode_lock);
+		spin_lock(&nilfs->ns_inode_lock);
 		ret = test_bit(NILFS_I_DIRTY, &ii->i_state) ||
 			test_bit(NILFS_I_BUSY, &ii->i_state);
-		spin_unlock(&sbi->s_inode_lock);
+		spin_unlock(&nilfs->ns_inode_lock);
 	}
 	return ret;
 }
 
 int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
 {
-	struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
 	struct nilfs_inode_info *ii = NILFS_I(inode);
+	struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
 
-	atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks);
+	atomic_add(nr_dirty, &nilfs->ns_ndirtyblks);
 
 	if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state))
 		return 0;
 
-	spin_lock(&sbi->s_inode_lock);
+	spin_lock(&nilfs->ns_inode_lock);
 	if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
 	    !test_bit(NILFS_I_BUSY, &ii->i_state)) {
 		/* Because this routine may race with nilfs_dispose_list(),
@@ -866,18 +866,18 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
 		if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) {
 			/* This will happen when somebody is freeing
 			   this inode. */
-			nilfs_warning(sbi->s_super, __func__,
+			nilfs_warning(inode->i_sb, __func__,
 				      "cannot get inode (ino=%lu)\n",
 				      inode->i_ino);
-			spin_unlock(&sbi->s_inode_lock);
+			spin_unlock(&nilfs->ns_inode_lock);
 			return -EINVAL; /* NILFS_I_DIRTY may remain for
 					   freeing inode */
 		}
 		list_del(&ii->i_dirty);
-		list_add_tail(&ii->i_dirty, &sbi->s_dirty_files);
+		list_add_tail(&ii->i_dirty, &nilfs->ns_dirty_files);
 		set_bit(NILFS_I_QUEUED, &ii->i_state);
 	}
-	spin_unlock(&sbi->s_inode_lock);
+	spin_unlock(&nilfs->ns_inode_lock);
 	return 0;
 }
 
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
index 0512521ba8a..3232e75fab7 100644
--- a/fs/nilfs2/sb.h
+++ b/fs/nilfs2/sb.h
@@ -39,10 +39,7 @@ struct nilfs_sb_info {
 	struct the_nilfs *s_nilfs;
 
 	/* Segment constructor */
-	struct list_head s_dirty_files;	/* dirty files list */
 	struct nilfs_sc_info *s_sc_info; /* segment constructor info */
-	spinlock_t s_inode_lock;	/* Lock for the nilfs inode.
-					   It covers s_dirty_files list */
 
 	/* Inode allocator */
 	spinlock_t s_next_gen_lock;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index a32d9cb2842..6ac50d81184 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -104,8 +104,7 @@ struct nilfs_sc_operations {
 static void nilfs_segctor_start_timer(struct nilfs_sc_info *);
 static void nilfs_segctor_do_flush(struct nilfs_sc_info *, int);
 static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *);
-static void nilfs_dispose_list(struct nilfs_sb_info *, struct list_head *,
-			       int);
+static void nilfs_dispose_list(struct the_nilfs *, struct list_head *, int);
 
 #define nilfs_cnt32_gt(a, b)   \
 	(typecheck(__u32, a) && typecheck(__u32, b) && \
@@ -325,14 +324,15 @@ static void nilfs_transaction_lock(struct nilfs_sb_info *sbi,
 static void nilfs_transaction_unlock(struct nilfs_sb_info *sbi)
 {
 	struct nilfs_transaction_info *ti = current->journal_info;
+	struct the_nilfs *nilfs = sbi->s_nilfs;
 
 	BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
 	BUG_ON(ti->ti_count > 0);
 
-	up_write(&sbi->s_nilfs->ns_segctor_sem);
+	up_write(&nilfs->ns_segctor_sem);
 	current->journal_info = ti->ti_save;
 	if (!list_empty(&ti->ti_garbage))
-		nilfs_dispose_list(sbi, &ti->ti_garbage, 0);
+		nilfs_dispose_list(nilfs, &ti->ti_garbage, 0);
 }
 
 static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
@@ -714,7 +714,7 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode,
 	}
 }
 
-static void nilfs_dispose_list(struct nilfs_sb_info *sbi,
+static void nilfs_dispose_list(struct the_nilfs *nilfs,
 			       struct list_head *head, int force)
 {
 	struct nilfs_inode_info *ii, *n;
@@ -722,7 +722,7 @@ static void nilfs_dispose_list(struct nilfs_sb_info *sbi,
 	unsigned nv = 0;
 
 	while (!list_empty(head)) {
-		spin_lock(&sbi->s_inode_lock);
+		spin_lock(&nilfs->ns_inode_lock);
 		list_for_each_entry_safe(ii, n, head, i_dirty) {
 			list_del_init(&ii->i_dirty);
 			if (force) {
@@ -733,14 +733,14 @@ static void nilfs_dispose_list(struct nilfs_sb_info *sbi,
 			} else if (test_bit(NILFS_I_DIRTY, &ii->i_state)) {
 				set_bit(NILFS_I_QUEUED, &ii->i_state);
 				list_add_tail(&ii->i_dirty,
-					      &sbi->s_dirty_files);
+					      &nilfs->ns_dirty_files);
 				continue;
 			}
 			ivec[nv++] = ii;
 			if (nv == SC_N_INODEVEC)
 				break;
 		}
-		spin_unlock(&sbi->s_inode_lock);
+		spin_unlock(&nilfs->ns_inode_lock);
 
 		for (pii = ivec; nv > 0; pii++, nv--)
 			iput(&(*pii)->vfs_inode);
@@ -773,17 +773,17 @@ static int nilfs_segctor_clean(struct nilfs_sc_info *sci)
 
 static int nilfs_segctor_confirm(struct nilfs_sc_info *sci)
 {
-	struct nilfs_sb_info *sbi = sci->sc_sbi;
+	struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
 	int ret = 0;
 
-	if (nilfs_test_metadata_dirty(sbi->s_nilfs, sci->sc_root))
+	if (nilfs_test_metadata_dirty(nilfs, sci->sc_root))
 		set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
 
-	spin_lock(&sbi->s_inode_lock);
-	if (list_empty(&sbi->s_dirty_files) && nilfs_segctor_clean(sci))
+	spin_lock(&nilfs->ns_inode_lock);
+	if (list_empty(&nilfs->ns_dirty_files) && nilfs_segctor_clean(sci))
 		ret++;
 
-	spin_unlock(&sbi->s_inode_lock);
+	spin_unlock(&nilfs->ns_inode_lock);
 	return ret;
 }
 
@@ -1963,30 +1963,30 @@ static int nilfs_segctor_wait(struct nilfs_sc_info *sci)
 	return ret;
 }
 
-static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
-					struct nilfs_sb_info *sbi)
+static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci,
+					     struct the_nilfs *nilfs)
 {
 	struct nilfs_inode_info *ii, *n;
 	struct inode *ifile = sci->sc_root->ifile;
 
-	spin_lock(&sbi->s_inode_lock);
+	spin_lock(&nilfs->ns_inode_lock);
  retry:
-	list_for_each_entry_safe(ii, n, &sbi->s_dirty_files, i_dirty) {
+	list_for_each_entry_safe(ii, n, &nilfs->ns_dirty_files, i_dirty) {
 		if (!ii->i_bh) {
 			struct buffer_head *ibh;
 			int err;
 
-			spin_unlock(&sbi->s_inode_lock);
+			spin_unlock(&nilfs->ns_inode_lock);
 			err = nilfs_ifile_get_inode_block(
 				ifile, ii->vfs_inode.i_ino, &ibh);
 			if (unlikely(err)) {
-				nilfs_warning(sbi->s_super, __func__,
+				nilfs_warning(sci->sc_super, __func__,
 					      "failed to get inode block.\n");
 				return err;
 			}
 			nilfs_mdt_mark_buffer_dirty(ibh);
 			nilfs_mdt_mark_dirty(ifile);
-			spin_lock(&sbi->s_inode_lock);
+			spin_lock(&nilfs->ns_inode_lock);
 			if (likely(!ii->i_bh))
 				ii->i_bh = ibh;
 			else
@@ -1999,18 +1999,18 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
 		list_del(&ii->i_dirty);
 		list_add_tail(&ii->i_dirty, &sci->sc_dirty_files);
 	}
-	spin_unlock(&sbi->s_inode_lock);
+	spin_unlock(&nilfs->ns_inode_lock);
 
 	return 0;
 }
 
-static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci,
-					  struct nilfs_sb_info *sbi)
+static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
+					     struct the_nilfs *nilfs)
 {
 	struct nilfs_transaction_info *ti = current->journal_info;
 	struct nilfs_inode_info *ii, *n;
 
-	spin_lock(&sbi->s_inode_lock);
+	spin_lock(&nilfs->ns_inode_lock);
 	list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
 		if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) ||
 		    test_bit(NILFS_I_DIRTY, &ii->i_state))
@@ -2022,7 +2022,7 @@ static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci,
 		list_del(&ii->i_dirty);
 		list_add_tail(&ii->i_dirty, &ti->ti_garbage);
 	}
-	spin_unlock(&sbi->s_inode_lock);
+	spin_unlock(&nilfs->ns_inode_lock);
 }
 
 /*
@@ -2038,7 +2038,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 	sci->sc_stage.scnt = NILFS_ST_INIT;
 	sci->sc_cno = nilfs->ns_cno;
 
-	err = nilfs_segctor_check_in_files(sci, sbi);
+	err = nilfs_segctor_collect_dirty_files(sci, nilfs);
 	if (unlikely(err))
 		goto out;
 
@@ -2116,7 +2116,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 	} while (sci->sc_stage.scnt != NILFS_ST_DONE);
 
  out:
-	nilfs_segctor_check_out_files(sci, sbi);
+	nilfs_segctor_drop_written_files(sci, nilfs);
 	return err;
 
  failed_to_write:
@@ -2319,14 +2319,14 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
 		return err;
 	}
 
-	spin_lock(&sbi->s_inode_lock);
+	spin_lock(&nilfs->ns_inode_lock);
 	if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
 	    !test_bit(NILFS_I_BUSY, &ii->i_state)) {
-		spin_unlock(&sbi->s_inode_lock);
+		spin_unlock(&nilfs->ns_inode_lock);
 		nilfs_transaction_unlock(sbi);
 		return 0;
 	}
-	spin_unlock(&sbi->s_inode_lock);
+	spin_unlock(&nilfs->ns_inode_lock);
 	sci->sc_dsync_inode = ii;
 	sci->sc_dsync_start = start;
 	sci->sc_dsync_end = end;
@@ -2738,10 +2738,10 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
  */
 static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
 {
-	struct nilfs_sb_info *sbi = sci->sc_sbi;
+	struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
 	int flag;
 
-	up_write(&sbi->s_nilfs->ns_segctor_sem);
+	up_write(&nilfs->ns_segctor_sem);
 
 	spin_lock(&sci->sc_state_lock);
 	nilfs_segctor_kill_thread(sci);
@@ -2755,9 +2755,9 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
 	WARN_ON(!list_empty(&sci->sc_copied_buffers));
 
 	if (!list_empty(&sci->sc_dirty_files)) {
-		nilfs_warning(sbi->s_super, __func__,
+		nilfs_warning(sci->sc_super, __func__,
 			      "dirty file(s) after the final construction\n");
-		nilfs_dispose_list(sbi, &sci->sc_dirty_files, 1);
+		nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1);
 	}
 
 	WARN_ON(!list_empty(&sci->sc_segbufs));
@@ -2765,7 +2765,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
 
 	nilfs_put_root(sci->sc_root);
 
-	down_write(&sbi->s_nilfs->ns_segctor_sem);
+	down_write(&nilfs->ns_segctor_sem);
 
 	del_timer_sync(&sci->sc_timer);
 	kfree(sci);
@@ -2829,15 +2829,15 @@ void nilfs_detach_segment_constructor(struct nilfs_sb_info *sbi)
 	}
 
 	/* Force to free the list of dirty files */
-	spin_lock(&sbi->s_inode_lock);
-	if (!list_empty(&sbi->s_dirty_files)) {
-		list_splice_init(&sbi->s_dirty_files, &garbage_list);
+	spin_lock(&nilfs->ns_inode_lock);
+	if (!list_empty(&nilfs->ns_dirty_files)) {
+		list_splice_init(&nilfs->ns_dirty_files, &garbage_list);
 		nilfs_warning(sbi->s_super, __func__,
 			      "Non empty dirty list after the last "
 			      "segment construction\n");
 	}
-	spin_unlock(&sbi->s_inode_lock);
+	spin_unlock(&nilfs->ns_inode_lock);
 	up_write(&nilfs->ns_segctor_sem);
 
-	nilfs_dispose_list(sbi, &garbage_list, 1);
+	nilfs_dispose_list(nilfs, &garbage_list, 1);
 }
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 2f17a2f9ef9..6dc8b3cad12 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -943,9 +943,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
 	if (err)
 		goto failed_nilfs;
 
-	spin_lock_init(&sbi->s_inode_lock);
-	INIT_LIST_HEAD(&sbi->s_dirty_files);
-
 	/*
 	 * Following initialization is overlapped because
 	 * nilfs_sb_info structure has been cleared at the beginning.
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index d3775336a16..40239a932de 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -75,7 +75,9 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
 	nilfs->ns_bdev = bdev;
 	atomic_set(&nilfs->ns_ndirtyblks, 0);
 	init_rwsem(&nilfs->ns_sem);
+	INIT_LIST_HEAD(&nilfs->ns_dirty_files);
 	INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
+	spin_lock_init(&nilfs->ns_inode_lock);
 	spin_lock_init(&nilfs->ns_last_segment_lock);
 	nilfs->ns_cptree = RB_ROOT;
 	spin_lock_init(&nilfs->ns_cptree_lock);
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 4a9bf3913c9..6106ec5ad89 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -71,6 +71,8 @@ enum {
  * @ns_sufile: segusage file inode
  * @ns_cptree: rb-tree of all mounted checkpoints (nilfs_root)
  * @ns_cptree_lock: lock protecting @ns_cptree
+ * @ns_dirty_files: list of dirty files
+ * @ns_inode_lock: lock protecting @ns_dirty_files
  * @ns_gc_inodes: dummy inodes to keep live blocks
  * @ns_mount_opt: mount options
  * @ns_resuid: uid for reserved blocks
@@ -150,6 +152,10 @@ struct the_nilfs {
 	struct rb_root		ns_cptree;
 	spinlock_t		ns_cptree_lock;
 
+	/* Dirty inode list */
+	struct list_head	ns_dirty_files;
+	spinlock_t		ns_inode_lock;
+
 	/* GC inode list */
 	struct list_head	ns_gc_inodes;
 
-- 
cgit v1.2.3


From 9b1fc4e4973469dd3fab27ba5d78eca1cd5c13fe Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Wed, 9 Mar 2011 11:05:08 +0900
Subject: nilfs2: move next generation counter into nilfs object

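Moves the s_next_generation counter and the spinlock protecting it
from the nilfs_sb_info structure to the nilfs object.  The random
seeding of the counter moves from nilfs_fill_super() to init_nilfs(),
and the spinlock is now initialized in alloc_nilfs().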

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/inode.c     |  8 ++++----
 fs/nilfs2/sb.h        |  4 ----
 fs/nilfs2/super.c     | 11 -----------
 fs/nilfs2/the_nilfs.c |  5 +++++
 fs/nilfs2/the_nilfs.h |  6 ++++++
 5 files changed, 15 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index dd5d6d633ea..7a3dbe4f229 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -295,7 +295,7 @@ const struct address_space_operations nilfs_aops = {
 struct inode *nilfs_new_inode(struct inode *dir, int mode)
 {
 	struct super_block *sb = dir->i_sb;
-	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
 	struct inode *inode;
 	struct nilfs_inode_info *ii;
 	struct nilfs_root *root;
@@ -340,9 +340,9 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
 	/* ii->i_dir_acl = 0; */
 	ii->i_dir_start_lookup = 0;
 	nilfs_set_inode_flags(inode);
-	spin_lock(&sbi->s_next_gen_lock);
-	inode->i_generation = sbi->s_next_generation++;
-	spin_unlock(&sbi->s_next_gen_lock);
+	spin_lock(&nilfs->ns_next_gen_lock);
+	inode->i_generation = nilfs->ns_next_generation++;
+	spin_unlock(&nilfs->ns_next_gen_lock);
 	insert_inode_hash(inode);
 
 	err = nilfs_init_acl(inode, dir);
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
index 3232e75fab7..6f190dfdc64 100644
--- a/fs/nilfs2/sb.h
+++ b/fs/nilfs2/sb.h
@@ -40,10 +40,6 @@ struct nilfs_sb_info {
 
 	/* Segment constructor */
 	struct nilfs_sc_info *s_sc_info; /* segment constructor info */
-
-	/* Inode allocator */
-	spinlock_t s_next_gen_lock;
-	u32 s_next_generation;
 };
 
 static inline struct nilfs_sb_info *NILFS_SB(struct super_block *sb)
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 6dc8b3cad12..1368c4293c7 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -43,7 +43,6 @@
 #include <linux/init.h>
 #include <linux/blkdev.h>
 #include <linux/parser.h>
-#include <linux/random.h>
 #include <linux/crc32.h>
 #include <linux/vfs.h>
 #include <linux/writeback.h>
@@ -943,16 +942,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
 	if (err)
 		goto failed_nilfs;
 
-	/*
-	 * Following initialization is overlapped because
-	 * nilfs_sb_info structure has been cleared at the beginning.
-	 * But we reserve them to keep our interest and make ready
-	 * for the future change.
-	 */
-	get_random_bytes(&sbi->s_next_generation,
-			 sizeof(sbi->s_next_generation));
-	spin_lock_init(&sbi->s_next_gen_lock);
-
 	sb->s_op = &nilfs_sops;
 	sb->s_export_op = &nilfs_export_ops;
 	sb->s_root = NULL;
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 40239a932de..1bf695e887a 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -25,6 +25,7 @@
 #include <linux/slab.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
+#include <linux/random.h>
 #include <linux/crc32.h>
 #include "nilfs.h"
 #include "segment.h"
@@ -78,6 +79,7 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
 	INIT_LIST_HEAD(&nilfs->ns_dirty_files);
 	INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
 	spin_lock_init(&nilfs->ns_inode_lock);
+	spin_lock_init(&nilfs->ns_next_gen_lock);
 	spin_lock_init(&nilfs->ns_last_segment_lock);
 	nilfs->ns_cptree = RB_ROOT;
 	spin_lock_init(&nilfs->ns_cptree_lock);
@@ -593,6 +595,9 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
 	nilfs->ns_blocksize_bits = sb->s_blocksize_bits;
 	nilfs->ns_blocksize = blocksize;
 
+	get_random_bytes(&nilfs->ns_next_generation,
+			 sizeof(nilfs->ns_next_generation));
+
 	err = nilfs_store_disk_layout(nilfs, sbp);
 	if (err)
 		goto failed_sbh;
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 6106ec5ad89..3ecc968f212 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -74,6 +74,8 @@ enum {
  * @ns_dirty_files: list of dirty files
  * @ns_inode_lock: lock protecting @ns_dirty_files
  * @ns_gc_inodes: dummy inodes to keep live blocks
+ * @ns_next_generation: next generation number for inodes
+ * @ns_next_gen_lock: lock protecting @ns_next_generation
  * @ns_mount_opt: mount options
  * @ns_resuid: uid for reserved blocks
  * @ns_resgid: gid for reserved blocks
@@ -159,6 +161,10 @@ struct the_nilfs {
 	/* GC inode list */
 	struct list_head	ns_gc_inodes;
 
+	/* Inode allocator */
+	u32			ns_next_generation;
+	spinlock_t		ns_next_gen_lock;
+
 	/* Mount options */
 	unsigned long		ns_mount_opt;
 
-- 
cgit v1.2.3


From 3fd3fe5aeaa171a5638d2bb54a1a170eab7b7cdc Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Wed, 9 Mar 2011 11:05:08 +0900
Subject: nilfs2: move log writer onto nilfs object

Log writer is held by the nilfs_sb_info structure.  This moves it into
nilfs object and replaces all uses of NILFS_SC() accessor.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/sb.h        |  9 --------
 fs/nilfs2/segment.c   | 58 +++++++++++++++++++++++++--------------------------
 fs/nilfs2/the_nilfs.h |  6 +++++-
 3 files changed, 34 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
index 6f190dfdc64..44553f42eba 100644
--- a/fs/nilfs2/sb.h
+++ b/fs/nilfs2/sb.h
@@ -28,7 +28,6 @@
 #include <linux/fs.h>
 
 struct the_nilfs;
-struct nilfs_sc_info;
 
 /*
  * NILFS super-block data in memory
@@ -37,9 +36,6 @@ struct nilfs_sb_info {
 	/* Fundamental members */
 	struct super_block *s_super;	/* reverse pointer to super_block */
 	struct the_nilfs *s_nilfs;
-
-	/* Segment constructor */
-	struct nilfs_sc_info *s_sc_info; /* segment constructor info */
 };
 
 static inline struct nilfs_sb_info *NILFS_SB(struct super_block *sb)
@@ -47,9 +43,4 @@ static inline struct nilfs_sb_info *NILFS_SB(struct super_block *sb)
 	return sb->s_fs_info;
 }
 
-static inline struct nilfs_sc_info *NILFS_SC(struct nilfs_sb_info *sbi)
-{
-	return sbi->s_sc_info;
-}
-
 #endif /* _NILFS_SB */
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 6ac50d81184..e3d1785faf1 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -224,8 +224,7 @@ int nilfs_transaction_begin(struct super_block *sb,
 int nilfs_transaction_commit(struct super_block *sb)
 {
 	struct nilfs_transaction_info *ti = current->journal_info;
-	struct nilfs_sb_info *sbi;
-	struct nilfs_sc_info *sci;
+	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
 	int err = 0;
 
 	BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
@@ -234,16 +233,15 @@ int nilfs_transaction_commit(struct super_block *sb)
 		ti->ti_count--;
 		return 0;
 	}
-	sbi = NILFS_SB(sb);
-	sci = NILFS_SC(sbi);
-	if (sci != NULL) {
+	if (nilfs->ns_writer) {
+		struct nilfs_sc_info *sci = nilfs->ns_writer;
+
 		if (ti->ti_flags & NILFS_TI_COMMIT)
 			nilfs_segctor_start_timer(sci);
-		if (atomic_read(&sbi->s_nilfs->ns_ndirtyblks) >
-		    sci->sc_watermark)
+		if (atomic_read(&nilfs->ns_ndirtyblks) > sci->sc_watermark)
 			nilfs_segctor_do_flush(sci, 0);
 	}
-	up_read(&sbi->s_nilfs->ns_segctor_sem);
+	up_read(&nilfs->ns_segctor_sem);
 	current->journal_info = ti->ti_save;
 
 	if (ti->ti_flags & NILFS_TI_SYNC)
@@ -271,9 +269,8 @@ void nilfs_transaction_abort(struct super_block *sb)
 
 void nilfs_relax_pressure_in_lock(struct super_block *sb)
 {
-	struct nilfs_sb_info *sbi = NILFS_SB(sb);
-	struct nilfs_sc_info *sci = NILFS_SC(sbi);
-	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct nilfs_sc_info *sci = nilfs->ns_writer;
 
 	if (!sci || !sci->sc_flush_request)
 		return;
@@ -298,6 +295,8 @@ static void nilfs_transaction_lock(struct nilfs_sb_info *sbi,
 				   int gcflag)
 {
 	struct nilfs_transaction_info *cur_ti = current->journal_info;
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct nilfs_sc_info *sci = nilfs->ns_writer;
 
 	WARN_ON(cur_ti);
 	ti->ti_flags = NILFS_TI_WRITER;
@@ -308,11 +307,11 @@ static void nilfs_transaction_lock(struct nilfs_sb_info *sbi,
 	current->journal_info = ti;
 
 	for (;;) {
-		down_write(&sbi->s_nilfs->ns_segctor_sem);
-		if (!test_bit(NILFS_SC_PRIOR_FLUSH, &NILFS_SC(sbi)->sc_flags))
+		down_write(&nilfs->ns_segctor_sem);
+		if (!test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags))
 			break;
 
-		nilfs_segctor_do_immediate_flush(NILFS_SC(sbi));
+		nilfs_segctor_do_immediate_flush(sci);
 
 		up_write(&sbi->s_nilfs->ns_segctor_sem);
 		yield();
@@ -2169,8 +2168,8 @@ static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn)
  */
 void nilfs_flush_segment(struct super_block *sb, ino_t ino)
 {
-	struct nilfs_sb_info *sbi = NILFS_SB(sb);
-	struct nilfs_sc_info *sci = NILFS_SC(sbi);
+	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct nilfs_sc_info *sci = nilfs->ns_writer;
 
 	if (!sci || nilfs_doing_construction())
 		return;
@@ -2259,8 +2258,8 @@ static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err)
  */
 int nilfs_construct_segment(struct super_block *sb)
 {
-	struct nilfs_sb_info *sbi = NILFS_SB(sb);
-	struct nilfs_sc_info *sci = NILFS_SC(sbi);
+	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct nilfs_sc_info *sci = nilfs->ns_writer;
 	struct nilfs_transaction_info *ti;
 	int err;
 
@@ -2299,7 +2298,7 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
 {
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
 	struct the_nilfs *nilfs = sbi->s_nilfs;
-	struct nilfs_sc_info *sci = NILFS_SC(sbi);
+	struct nilfs_sc_info *sci = nilfs->ns_writer;
 	struct nilfs_inode_info *ii;
 	struct nilfs_transaction_info ti;
 	int err = 0;
@@ -2445,8 +2444,8 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
 			 void **kbufs)
 {
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
-	struct nilfs_sc_info *sci = NILFS_SC(sbi);
 	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct nilfs_sc_info *sci = nilfs->ns_writer;
 	struct nilfs_transaction_info ti;
 	int err;
 
@@ -2787,9 +2786,10 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
 int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi,
 				     struct nilfs_root *root)
 {
+	struct the_nilfs *nilfs = sbi->s_nilfs;
 	int err;
 
-	if (NILFS_SC(sbi)) {
+	if (nilfs->ns_writer) {
 		/*
 		 * This happens if the filesystem was remounted
 		 * read/write after nilfs_error degenerated it into a
@@ -2798,14 +2798,14 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi,
 		nilfs_detach_segment_constructor(sbi);
 	}
 
-	sbi->s_sc_info = nilfs_segctor_new(sbi, root);
-	if (!sbi->s_sc_info)
+	nilfs->ns_writer = nilfs_segctor_new(sbi, root);
+	if (!nilfs->ns_writer)
 		return -ENOMEM;
 
-	err = nilfs_segctor_start_thread(NILFS_SC(sbi));
+	err = nilfs_segctor_start_thread(nilfs->ns_writer);
 	if (err) {
-		kfree(sbi->s_sc_info);
-		sbi->s_sc_info = NULL;
+		kfree(nilfs->ns_writer);
+		nilfs->ns_writer = NULL;
 	}
 	return err;
 }
@@ -2823,9 +2823,9 @@ void nilfs_detach_segment_constructor(struct nilfs_sb_info *sbi)
 	LIST_HEAD(garbage_list);
 
 	down_write(&nilfs->ns_segctor_sem);
-	if (NILFS_SC(sbi)) {
-		nilfs_segctor_destroy(NILFS_SC(sbi));
-		sbi->s_sc_info = NULL;
+	if (nilfs->ns_writer) {
+		nilfs_segctor_destroy(nilfs->ns_writer);
+		nilfs->ns_writer = NULL;
 	}
 
 	/* Force to free the list of dirty files */
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 3ecc968f212..10521b97ded 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -33,6 +33,8 @@
 #include <linux/slab.h>
 #include "sb.h"
 
+struct nilfs_sc_info;
+
 /* the_nilfs struct */
 enum {
 	THE_NILFS_INIT = 0,     /* Information from super_block is set */
@@ -65,7 +67,8 @@ enum {
  * @ns_last_cno: checkpoint number of the latest segment
  * @ns_prot_seq: least sequence number of segments which must not be reclaimed
  * @ns_prev_seq: base sequence number used to decide if advance log cursor
- * @ns_segctor_sem: segment constructor semaphore
+ * @ns_writer: log writer
+ * @ns_segctor_sem: semaphore protecting log write
  * @ns_dat: DAT file inode
  * @ns_cpfile: checkpoint file inode
  * @ns_sufile: segusage file inode
@@ -140,6 +143,7 @@ struct the_nilfs {
 	u64			ns_prot_seq;
 	u64			ns_prev_seq;
 
+	struct nilfs_sc_info   *ns_writer;
 	struct rw_semaphore	ns_segctor_sem;
 
 	/*
-- 
cgit v1.2.3


From d96bbfa28aa7a1d5a5bf549026a594d7a273c5d7 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Wed, 9 Mar 2011 11:05:08 +0900
Subject: nilfs2: get rid of sc_sbi back pointer

Removes sci->sc_sbi which is a back pointer to nilfs_sb_info struct
from log writer object (nilfs_sc_info).

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/segment.c | 29 ++++++++++++-----------------
 fs/nilfs2/segment.h |  2 --
 2 files changed, 12 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index e3d1785faf1..b14788ec0d1 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -772,7 +772,7 @@ static int nilfs_segctor_clean(struct nilfs_sc_info *sci)
 
 static int nilfs_segctor_confirm(struct nilfs_sc_info *sci)
 {
-	struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
+	struct the_nilfs *nilfs = NILFS_SB(sci->sc_super)->s_nilfs;
 	int ret = 0;
 
 	if (nilfs_test_metadata_dirty(nilfs, sci->sc_root))
@@ -788,8 +788,7 @@ static int nilfs_segctor_confirm(struct nilfs_sc_info *sci)
 
 static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
 {
-	struct nilfs_sb_info *sbi = sci->sc_sbi;
-	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct the_nilfs *nilfs = NILFS_SB(sci->sc_super)->s_nilfs;
 
 	nilfs_mdt_clear_dirty(sci->sc_root->ifile);
 	nilfs_mdt_clear_dirty(nilfs->ns_cpfile);
@@ -799,7 +798,7 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
 
 static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
 {
-	struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
+	struct the_nilfs *nilfs = NILFS_SB(sci->sc_super)->s_nilfs;
 	struct buffer_head *bh_cp;
 	struct nilfs_checkpoint *raw_cp;
 	int err;
@@ -823,8 +822,7 @@ static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
 
 static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
 {
-	struct nilfs_sb_info *sbi = sci->sc_sbi;
-	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct the_nilfs *nilfs = NILFS_SB(sci->sc_super)->s_nilfs;
 	struct buffer_head *bh_cp;
 	struct nilfs_checkpoint *raw_cp;
 	int err;
@@ -1048,8 +1046,7 @@ static int nilfs_segctor_scan_file_dsync(struct nilfs_sc_info *sci,
 
 static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
 {
-	struct nilfs_sb_info *sbi = sci->sc_sbi;
-	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct the_nilfs *nilfs = NILFS_SB(sci->sc_super)->s_nilfs;
 	struct list_head *head;
 	struct nilfs_inode_info *ii;
 	size_t ndone;
@@ -1858,7 +1855,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 {
 	struct nilfs_segment_buffer *segbuf;
 	struct page *bd_page = NULL, *fs_page = NULL;
-	struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
+	struct the_nilfs *nilfs = NILFS_SB(sci->sc_super)->s_nilfs;
 	int update_sr = false;
 
 	list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) {
@@ -2029,8 +2026,7 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
  */
 static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 {
-	struct nilfs_sb_info *sbi = sci->sc_sbi;
-	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct the_nilfs *nilfs = NILFS_SB(sci->sc_super)->s_nilfs;
 	struct page *failed_page;
 	int err;
 
@@ -2388,7 +2384,7 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err)
  */
 static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
 {
-	struct nilfs_sb_info *sbi = sci->sc_sbi;
+	struct nilfs_sb_info *sbi = NILFS_SB(sci->sc_super);
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	struct nilfs_super_block **sbp;
 	int err = 0;
@@ -2501,7 +2497,7 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
 
 static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
 {
-	struct nilfs_sb_info *sbi = sci->sc_sbi;
+	struct nilfs_sb_info *sbi = NILFS_SB(sci->sc_super);
 	struct nilfs_transaction_info ti;
 
 	nilfs_transaction_lock(sbi, &ti, 0);
@@ -2561,7 +2557,7 @@ static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci)
 static int nilfs_segctor_thread(void *arg)
 {
 	struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
-	struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
+	struct the_nilfs *nilfs = NILFS_SB(sci->sc_super)->s_nilfs;
 	int timeout = 0;
 
 	sci->sc_timer.data = (unsigned long)current;
@@ -2682,7 +2678,6 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi,
 	if (!sci)
 		return NULL;
 
-	sci->sc_sbi = sbi;
 	sci->sc_super = sbi->s_super;
 
 	nilfs_get_root(root);
@@ -2717,7 +2712,7 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
 	/* The segctord thread was stopped and its timer was removed.
 	   But some tasks remain. */
 	do {
-		struct nilfs_sb_info *sbi = sci->sc_sbi;
+		struct nilfs_sb_info *sbi = NILFS_SB(sci->sc_super);
 		struct nilfs_transaction_info ti;
 
 		nilfs_transaction_lock(sbi, &ti, 0);
@@ -2737,7 +2732,7 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
  */
 static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
 {
-	struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
+	struct the_nilfs *nilfs = NILFS_SB(sci->sc_super)->s_nilfs;
 	int flag;
 
 	up_write(&nilfs->ns_segctor_sem);
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index cd8056e7cbe..9544aa97dd4 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -88,7 +88,6 @@ struct nilfs_segsum_pointer {
 /**
  * struct nilfs_sc_info - Segment constructor information
  * @sc_super: Back pointer to super_block struct
- * @sc_sbi: Back pointer to nilfs_sb_info struct
  * @sc_root: root object of the current filesystem tree
  * @sc_nblk_inc: Block count of current generation
  * @sc_dirty_files: List of files to be written
@@ -131,7 +130,6 @@ struct nilfs_segsum_pointer {
  */
 struct nilfs_sc_info {
 	struct super_block     *sc_super;
-	struct nilfs_sb_info   *sc_sbi;
 	struct nilfs_root      *sc_root;
 
 	unsigned long		sc_nblk_inc;
-- 
cgit v1.2.3


From f7545144c2e3d280139260df934043e0a6ccce6f Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Wed, 9 Mar 2011 11:05:08 +0900
Subject: nilfs2: use sb instance instead of nilfs_sb_info struct

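This replaces uses of the nilfs_sb_info struct (sbi) with direct
references to the super block instance (sb).  As part of the conversion,
nilfs_attach_segment_constructor() and nilfs_detach_segment_constructor()
are renamed to nilfs_attach_log_writer() and nilfs_detach_log_writer().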

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/nilfs.h     |  10 ++---
 fs/nilfs2/recovery.c  |  32 +++++++--------
 fs/nilfs2/segment.c   |  90 ++++++++++++++++++++-----------------------
 fs/nilfs2/segment.h   |  10 ++---
 fs/nilfs2/super.c     | 105 ++++++++++++++++++++++++--------------------------
 fs/nilfs2/the_nilfs.c |  22 +++++------
 fs/nilfs2/the_nilfs.h |   5 +--
 7 files changed, 129 insertions(+), 145 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 03ba4d88083..eba1aaa7fb7 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -295,11 +295,11 @@ extern int nilfs_check_feature_compatibility(struct super_block *,
 					     struct nilfs_super_block *);
 extern void nilfs_set_log_cursor(struct nilfs_super_block *,
 				 struct the_nilfs *);
-extern struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *,
-						      int flip);
-extern int nilfs_commit_super(struct nilfs_sb_info *, int);
-extern int nilfs_cleanup_super(struct nilfs_sb_info *);
-int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
+struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb,
+					       int flip);
+int nilfs_commit_super(struct super_block *sb, int flag);
+int nilfs_cleanup_super(struct super_block *sb);
+int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
 			    struct nilfs_root **root);
 int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno);
 
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 3dfcd3b7d38..ba4a64518f3 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -425,7 +425,7 @@ void nilfs_dispose_segment_list(struct list_head *head)
 }
 
 static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
-					      struct nilfs_sb_info *sbi,
+					      struct super_block *sb,
 					      struct nilfs_recovery_info *ri)
 {
 	struct list_head *head = &ri->ri_used_segments;
@@ -501,7 +501,7 @@ static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
 }
 
 static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
-				      struct nilfs_sb_info *sbi,
+				      struct super_block *sb,
 				      struct nilfs_root *root,
 				      struct list_head *head,
 				      unsigned long *nr_salvaged_blocks)
@@ -514,7 +514,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
 	int err = 0, err2 = 0;
 
 	list_for_each_entry_safe(rb, n, head, list) {
-		inode = nilfs_iget(sbi->s_super, root, rb->ino);
+		inode = nilfs_iget(sb, root, rb->ino);
 		if (IS_ERR(inode)) {
 			err = PTR_ERR(inode);
 			inode = NULL;
@@ -572,11 +572,11 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
  * nilfs_do_roll_forward - salvage logical segments newer than the latest
  * checkpoint
  * @nilfs: nilfs object
- * @sbi: nilfs_sb_info
+ * @sb: super block instance
  * @ri: pointer to a nilfs_recovery_info
  */
 static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
-				 struct nilfs_sb_info *sbi,
+				 struct super_block *sb,
 				 struct nilfs_root *root,
 				 struct nilfs_recovery_info *ri)
 {
@@ -648,7 +648,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 				goto failed;
 			if (flags & NILFS_SS_LOGEND) {
 				err = nilfs_recover_dsync_blocks(
-					nilfs, sbi, root, &dsync_blocks,
+					nilfs, sb, root, &dsync_blocks,
 					&nsalvaged_blocks);
 				if (unlikely(err))
 					goto failed;
@@ -681,7 +681,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 
 	if (nsalvaged_blocks) {
 		printk(KERN_INFO "NILFS (device %s): salvaged %lu blocks\n",
-		       sbi->s_super->s_id, nsalvaged_blocks);
+		       sb->s_id, nsalvaged_blocks);
 		ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE;
 	}
  out:
@@ -695,7 +695,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 	printk(KERN_ERR
 	       "NILFS (device %s): Error roll-forwarding "
 	       "(err=%d, pseg block=%llu). ",
-	       sbi->s_super->s_id, err, (unsigned long long)pseg_start);
+	       sb->s_id, err, (unsigned long long)pseg_start);
 	goto out;
 }
 
@@ -724,7 +724,7 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
 /**
  * nilfs_salvage_orphan_logs - salvage logs written after the latest checkpoint
  * @nilfs: nilfs object
- * @sbi: nilfs_sb_info
+ * @sb: super block instance
  * @ri: pointer to a nilfs_recovery_info struct to store search results.
  *
  * Return Value: On success, 0 is returned.  On error, one of the following
@@ -741,7 +741,7 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
  * %-ENOMEM - Insufficient memory available.
  */
 int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
-			      struct nilfs_sb_info *sbi,
+			      struct super_block *sb,
 			      struct nilfs_recovery_info *ri)
 {
 	struct nilfs_root *root;
@@ -750,32 +750,32 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
 	if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0)
 		return 0;
 
-	err = nilfs_attach_checkpoint(sbi, ri->ri_cno, true, &root);
+	err = nilfs_attach_checkpoint(sb, ri->ri_cno, true, &root);
 	if (unlikely(err)) {
 		printk(KERN_ERR
 		       "NILFS: error loading the latest checkpoint.\n");
 		return err;
 	}
 
-	err = nilfs_do_roll_forward(nilfs, sbi, root, ri);
+	err = nilfs_do_roll_forward(nilfs, sb, root, ri);
 	if (unlikely(err))
 		goto failed;
 
 	if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) {
-		err = nilfs_prepare_segment_for_recovery(nilfs, sbi, ri);
+		err = nilfs_prepare_segment_for_recovery(nilfs, sb, ri);
 		if (unlikely(err)) {
 			printk(KERN_ERR "NILFS: Error preparing segments for "
 			       "recovery.\n");
 			goto failed;
 		}
 
-		err = nilfs_attach_segment_constructor(sbi, root);
+		err = nilfs_attach_log_writer(sb, root);
 		if (unlikely(err))
 			goto failed;
 
 		set_nilfs_discontinued(nilfs);
-		err = nilfs_construct_segment(sbi->s_super);
-		nilfs_detach_segment_constructor(sbi);
+		err = nilfs_construct_segment(sb);
+		nilfs_detach_log_writer(sb);
 
 		if (unlikely(err)) {
 			printk(KERN_ERR "NILFS: Oops! recovery failed. "
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index b14788ec0d1..90e3130303a 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -181,7 +181,6 @@ int nilfs_transaction_begin(struct super_block *sb,
 			    struct nilfs_transaction_info *ti,
 			    int vacancy_check)
 {
-	struct nilfs_sb_info *sbi;
 	struct the_nilfs *nilfs;
 	int ret = nilfs_prepare_segment_lock(ti);
 
@@ -192,8 +191,7 @@ int nilfs_transaction_begin(struct super_block *sb,
 
 	vfs_check_frozen(sb, SB_FREEZE_WRITE);
 
-	sbi = NILFS_SB(sb);
-	nilfs = sbi->s_nilfs;
+	nilfs = NILFS_SB(sb)->s_nilfs;
 	down_read(&nilfs->ns_segctor_sem);
 	if (vacancy_check && nilfs_near_disk_full(nilfs)) {
 		up_read(&nilfs->ns_segctor_sem);
@@ -290,12 +288,12 @@ void nilfs_relax_pressure_in_lock(struct super_block *sb)
 	downgrade_write(&nilfs->ns_segctor_sem);
 }
 
-static void nilfs_transaction_lock(struct nilfs_sb_info *sbi,
+static void nilfs_transaction_lock(struct super_block *sb,
 				   struct nilfs_transaction_info *ti,
 				   int gcflag)
 {
 	struct nilfs_transaction_info *cur_ti = current->journal_info;
-	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
 	struct nilfs_sc_info *sci = nilfs->ns_writer;
 
 	WARN_ON(cur_ti);
@@ -313,17 +311,17 @@ static void nilfs_transaction_lock(struct nilfs_sb_info *sbi,
 
 		nilfs_segctor_do_immediate_flush(sci);
 
-		up_write(&sbi->s_nilfs->ns_segctor_sem);
+		up_write(&nilfs->ns_segctor_sem);
 		yield();
 	}
 	if (gcflag)
 		ti->ti_flags |= NILFS_TI_GC;
 }
 
-static void nilfs_transaction_unlock(struct nilfs_sb_info *sbi)
+static void nilfs_transaction_unlock(struct super_block *sb)
 {
 	struct nilfs_transaction_info *ti = current->journal_info;
-	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
 
 	BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
 	BUG_ON(ti->ti_count > 0);
@@ -2292,8 +2290,7 @@ int nilfs_construct_segment(struct super_block *sb)
 int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
 				  loff_t start, loff_t end)
 {
-	struct nilfs_sb_info *sbi = NILFS_SB(sb);
-	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
 	struct nilfs_sc_info *sci = nilfs->ns_writer;
 	struct nilfs_inode_info *ii;
 	struct nilfs_transaction_info ti;
@@ -2302,14 +2299,14 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
 	if (!sci)
 		return -EROFS;
 
-	nilfs_transaction_lock(sbi, &ti, 0);
+	nilfs_transaction_lock(sb, &ti, 0);
 
 	ii = NILFS_I(inode);
 	if (test_bit(NILFS_I_INODE_DIRTY, &ii->i_state) ||
 	    nilfs_test_opt(nilfs, STRICT_ORDER) ||
 	    test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) ||
 	    nilfs_discontinued(nilfs)) {
-		nilfs_transaction_unlock(sbi);
+		nilfs_transaction_unlock(sb);
 		err = nilfs_segctor_sync(sci);
 		return err;
 	}
@@ -2318,7 +2315,7 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
 	if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
 	    !test_bit(NILFS_I_BUSY, &ii->i_state)) {
 		spin_unlock(&nilfs->ns_inode_lock);
-		nilfs_transaction_unlock(sbi);
+		nilfs_transaction_unlock(sb);
 		return 0;
 	}
 	spin_unlock(&nilfs->ns_inode_lock);
@@ -2328,7 +2325,7 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
 
 	err = nilfs_segctor_do_construct(sci, SC_LSEG_DSYNC);
 
-	nilfs_transaction_unlock(sbi);
+	nilfs_transaction_unlock(sb);
 	return err;
 }
 
@@ -2384,8 +2381,7 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err)
  */
 static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
 {
-	struct nilfs_sb_info *sbi = NILFS_SB(sci->sc_super);
-	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct the_nilfs *nilfs = NILFS_SB(sci->sc_super)->s_nilfs;
 	struct nilfs_super_block **sbp;
 	int err = 0;
 
@@ -2403,11 +2399,12 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
 		    nilfs_discontinued(nilfs)) {
 			down_write(&nilfs->ns_sem);
 			err = -EIO;
-			sbp = nilfs_prepare_super(sbi,
+			sbp = nilfs_prepare_super(sci->sc_super,
 						  nilfs_sb_will_flip(nilfs));
 			if (likely(sbp)) {
 				nilfs_set_log_cursor(sbp[0], nilfs);
-				err = nilfs_commit_super(sbi, NILFS_SB_COMMIT);
+				err = nilfs_commit_super(sci->sc_super,
+							 NILFS_SB_COMMIT);
 			}
 			up_write(&nilfs->ns_sem);
 		}
@@ -2439,8 +2436,7 @@ nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head)
 int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
 			 void **kbufs)
 {
-	struct nilfs_sb_info *sbi = NILFS_SB(sb);
-	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
 	struct nilfs_sc_info *sci = nilfs->ns_writer;
 	struct nilfs_transaction_info ti;
 	int err;
@@ -2448,7 +2444,7 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
 	if (unlikely(!sci))
 		return -EROFS;
 
-	nilfs_transaction_lock(sbi, &ti, 1);
+	nilfs_transaction_lock(sb, &ti, 1);
 
 	err = nilfs_mdt_save_to_shadow_map(nilfs->ns_dat);
 	if (unlikely(err))
@@ -2491,16 +2487,15 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
 	sci->sc_freesegs = NULL;
 	sci->sc_nfreesegs = 0;
 	nilfs_mdt_clear_shadow_map(nilfs->ns_dat);
-	nilfs_transaction_unlock(sbi);
+	nilfs_transaction_unlock(sb);
 	return err;
 }
 
 static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
 {
-	struct nilfs_sb_info *sbi = NILFS_SB(sci->sc_super);
 	struct nilfs_transaction_info ti;
 
-	nilfs_transaction_lock(sbi, &ti, 0);
+	nilfs_transaction_lock(sci->sc_super, &ti, 0);
 	nilfs_segctor_construct(sci, mode);
 
 	/*
@@ -2511,7 +2506,7 @@ static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
 	if (test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags))
 		nilfs_segctor_start_timer(sci);
 
-	nilfs_transaction_unlock(sbi);
+	nilfs_transaction_unlock(sci->sc_super);
 }
 
 static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci)
@@ -2668,17 +2663,17 @@ static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
 /*
  * Setup & clean-up functions
  */
-static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi,
+static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb,
 					       struct nilfs_root *root)
 {
-	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
 	struct nilfs_sc_info *sci;
 
 	sci = kzalloc(sizeof(*sci), GFP_KERNEL);
 	if (!sci)
 		return NULL;
 
-	sci->sc_super = sbi->s_super;
+	sci->sc_super = sb;
 
 	nilfs_get_root(root);
 	sci->sc_root = root;
@@ -2712,12 +2707,11 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
 	/* The segctord thread was stopped and its timer was removed.
 	   But some tasks remain. */
 	do {
-		struct nilfs_sb_info *sbi = NILFS_SB(sci->sc_super);
 		struct nilfs_transaction_info ti;
 
-		nilfs_transaction_lock(sbi, &ti, 0);
+		nilfs_transaction_lock(sci->sc_super, &ti, 0);
 		ret = nilfs_segctor_construct(sci, SC_LSEG_SR);
-		nilfs_transaction_unlock(sbi);
+		nilfs_transaction_unlock(sci->sc_super);
 
 	} while (ret && retrycount-- > 0);
 }
@@ -2766,22 +2760,21 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
 }
 
 /**
- * nilfs_attach_segment_constructor - attach a segment constructor
- * @sbi: nilfs_sb_info
+ * nilfs_attach_log_writer - attach log writer
+ * @sb: super block instance
  * @root: root object of the current filesystem tree
  *
- * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info,
- * initializes it, and starts the segment constructor.
+ * This allocates a log writer object, initializes it, and starts the
+ * log writer.
  *
  * Return Value: On success, 0 is returned. On error, one of the following
  * negative error code is returned.
  *
  * %-ENOMEM - Insufficient memory available.
  */
-int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi,
-				     struct nilfs_root *root)
+int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root)
 {
-	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
 	int err;
 
 	if (nilfs->ns_writer) {
@@ -2790,10 +2783,10 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi,
 		 * read/write after nilfs_error degenerated it into a
 		 * read-only mount.
 		 */
-		nilfs_detach_segment_constructor(sbi);
+		nilfs_detach_log_writer(sb);
 	}
 
-	nilfs->ns_writer = nilfs_segctor_new(sbi, root);
+	nilfs->ns_writer = nilfs_segctor_new(sb, root);
 	if (!nilfs->ns_writer)
 		return -ENOMEM;
 
@@ -2806,15 +2799,15 @@ int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi,
 }
 
 /**
- * nilfs_detach_segment_constructor - destroy the segment constructor
- * @sbi: nilfs_sb_info
+ * nilfs_detach_log_writer - destroy log writer
+ * @sb: super block instance
  *
- * nilfs_detach_segment_constructor() kills the segment constructor daemon,
- * frees the struct nilfs_sc_info, and destroy the dirty file list.
+ * This kills log writer daemon, frees the log writer object, and
+ * destroys list of dirty files.
  */
-void nilfs_detach_segment_constructor(struct nilfs_sb_info *sbi)
+void nilfs_detach_log_writer(struct super_block *sb)
 {
-	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
 	LIST_HEAD(garbage_list);
 
 	down_write(&nilfs->ns_segctor_sem);
@@ -2827,9 +2820,8 @@ void nilfs_detach_segment_constructor(struct nilfs_sb_info *sbi)
 	spin_lock(&nilfs->ns_inode_lock);
 	if (!list_empty(&nilfs->ns_dirty_files)) {
 		list_splice_init(&nilfs->ns_dirty_files, &garbage_list);
-		nilfs_warning(sbi->s_super, __func__,
-			      "Non empty dirty list after the last "
-			      "segment construction\n");
+		nilfs_warning(sb, __func__,
+			      "Hit dirty file after stopped log writer\n");
 	}
 	spin_unlock(&nilfs->ns_inode_lock);
 	up_write(&nilfs->ns_segctor_sem);
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 9544aa97dd4..e01998e33b3 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -233,18 +233,16 @@ extern void nilfs_flush_segment(struct super_block *, ino_t);
 extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *,
 				void **);
 
-int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi,
-				     struct nilfs_root *root);
-extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *);
+int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root);
+void nilfs_detach_log_writer(struct super_block *sb);
 
 /* recovery.c */
 extern int nilfs_read_super_root_block(struct the_nilfs *, sector_t,
 				       struct buffer_head **, int);
 extern int nilfs_search_super_root(struct the_nilfs *,
 				   struct nilfs_recovery_info *);
-extern int nilfs_salvage_orphan_logs(struct the_nilfs *,
-				     struct nilfs_sb_info *,
-				     struct nilfs_recovery_info *);
+int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, struct super_block *sb,
+			      struct nilfs_recovery_info *ri);
 extern void nilfs_dispose_segment_list(struct list_head *);
 
 #endif /* _NILFS_SEGMENT_H */
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 1368c4293c7..a8cbd695441 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -71,23 +71,23 @@ struct kmem_cache *nilfs_transaction_cachep;
 struct kmem_cache *nilfs_segbuf_cachep;
 struct kmem_cache *nilfs_btree_path_cache;
 
-static int nilfs_setup_super(struct nilfs_sb_info *sbi, int is_mount);
+static int nilfs_setup_super(struct super_block *sb, int is_mount);
 static int nilfs_remount(struct super_block *sb, int *flags, char *data);
 
-static void nilfs_set_error(struct nilfs_sb_info *sbi)
+static void nilfs_set_error(struct super_block *sb)
 {
-	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
 	struct nilfs_super_block **sbp;
 
 	down_write(&nilfs->ns_sem);
 	if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
 		nilfs->ns_mount_state |= NILFS_ERROR_FS;
-		sbp = nilfs_prepare_super(sbi, 0);
+		sbp = nilfs_prepare_super(sb, 0);
 		if (likely(sbp)) {
 			sbp[0]->s_state |= cpu_to_le16(NILFS_ERROR_FS);
 			if (sbp[1])
 				sbp[1]->s_state |= cpu_to_le16(NILFS_ERROR_FS);
-			nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL);
+			nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL);
 		}
 	}
 	up_write(&nilfs->ns_sem);
@@ -108,7 +108,7 @@ static void nilfs_set_error(struct nilfs_sb_info *sbi)
 void nilfs_error(struct super_block *sb, const char *function,
 		 const char *fmt, ...)
 {
-	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
 	struct va_format vaf;
 	va_list args;
 
@@ -123,7 +123,7 @@ void nilfs_error(struct super_block *sb, const char *function,
 	va_end(args);
 
 	if (!(sb->s_flags & MS_RDONLY)) {
-		nilfs_set_error(sbi);
+		nilfs_set_error(sb);
 
 		if (nilfs_test_opt(nilfs, ERRORS_RO)) {
 			printk(KERN_CRIT "Remounting filesystem read-only\n");
@@ -188,9 +188,9 @@ void nilfs_destroy_inode(struct inode *inode)
 	call_rcu(&inode->i_rcu, nilfs_i_callback);
 }
 
-static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag)
+static int nilfs_sync_super(struct super_block *sb, int flag)
 {
-	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
 	int err;
 
  retry:
@@ -262,10 +262,10 @@ void nilfs_set_log_cursor(struct nilfs_super_block *sbp,
 	spin_unlock(&nilfs->ns_last_segment_lock);
 }
 
-struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *sbi,
+struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb,
 					       int flip)
 {
-	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
 	struct nilfs_super_block **sbp = nilfs->ns_sbp;
 
 	/* nilfs->ns_sem must be locked by the caller. */
@@ -275,7 +275,7 @@ struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *sbi,
 			memcpy(sbp[0], sbp[1], nilfs->ns_sbsize);
 		} else {
 			printk(KERN_CRIT "NILFS: superblock broke on dev %s\n",
-			       sbi->s_super->s_id);
+			       sb->s_id);
 			return NULL;
 		}
 	} else if (sbp[1] &&
@@ -289,9 +289,9 @@ struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *sbi,
 	return sbp;
 }
 
-int nilfs_commit_super(struct nilfs_sb_info *sbi, int flag)
+int nilfs_commit_super(struct super_block *sb, int flag)
 {
-	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
 	struct nilfs_super_block **sbp = nilfs->ns_sbp;
 	time_t t;
 
@@ -311,27 +311,28 @@ int nilfs_commit_super(struct nilfs_sb_info *sbi, int flag)
 					    nilfs->ns_sbsize));
 	}
 	clear_nilfs_sb_dirty(nilfs);
-	return nilfs_sync_super(sbi, flag);
+	return nilfs_sync_super(sb, flag);
 }
 
 /**
  * nilfs_cleanup_super() - write filesystem state for cleanup
- * @sbi: nilfs_sb_info to be unmounted or degraded to read-only
+ * @sb: super block instance to be unmounted or degraded to read-only
  *
  * This function restores state flags in the on-disk super block.
  * This will set "clean" flag (i.e. NILFS_VALID_FS) unless the
  * filesystem was not clean previously.
  */
-int nilfs_cleanup_super(struct nilfs_sb_info *sbi)
+int nilfs_cleanup_super(struct super_block *sb)
 {
+	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
 	struct nilfs_super_block **sbp;
 	int flag = NILFS_SB_COMMIT;
 	int ret = -EIO;
 
-	sbp = nilfs_prepare_super(sbi, 0);
+	sbp = nilfs_prepare_super(sb, 0);
 	if (sbp) {
-		sbp[0]->s_state = cpu_to_le16(sbi->s_nilfs->ns_mount_state);
-		nilfs_set_log_cursor(sbp[0], sbi->s_nilfs);
+		sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state);
+		nilfs_set_log_cursor(sbp[0], nilfs);
 		if (sbp[1] && sbp[0]->s_last_cno == sbp[1]->s_last_cno) {
 			/*
 			 * make the "clean" flag also to the opposite
@@ -341,7 +342,7 @@ int nilfs_cleanup_super(struct nilfs_sb_info *sbi)
 			sbp[1]->s_state = sbp[0]->s_state;
 			flag = NILFS_SB_COMMIT_ALL;
 		}
-		ret = nilfs_commit_super(sbi, flag);
+		ret = nilfs_commit_super(sb, flag);
 	}
 	return ret;
 }
@@ -351,11 +352,11 @@ static void nilfs_put_super(struct super_block *sb)
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 
-	nilfs_detach_segment_constructor(sbi);
+	nilfs_detach_log_writer(sb);
 
 	if (!(sb->s_flags & MS_RDONLY)) {
 		down_write(&nilfs->ns_sem);
-		nilfs_cleanup_super(sbi);
+		nilfs_cleanup_super(sb);
 		up_write(&nilfs->ns_sem);
 	}
 
@@ -371,8 +372,7 @@ static void nilfs_put_super(struct super_block *sb)
 
 static int nilfs_sync_fs(struct super_block *sb, int wait)
 {
-	struct nilfs_sb_info *sbi = NILFS_SB(sb);
-	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
 	struct nilfs_super_block **sbp;
 	int err = 0;
 
@@ -382,10 +382,10 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
 
 	down_write(&nilfs->ns_sem);
 	if (nilfs_sb_dirty(nilfs)) {
-		sbp = nilfs_prepare_super(sbi, nilfs_sb_will_flip(nilfs));
+		sbp = nilfs_prepare_super(sb, nilfs_sb_will_flip(nilfs));
 		if (likely(sbp)) {
 			nilfs_set_log_cursor(sbp[0], nilfs);
-			nilfs_commit_super(sbi, NILFS_SB_COMMIT);
+			nilfs_commit_super(sb, NILFS_SB_COMMIT);
 		}
 	}
 	up_write(&nilfs->ns_sem);
@@ -393,10 +393,10 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
 	return err;
 }
 
-int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
+int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
 			    struct nilfs_root **rootp)
 {
-	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
 	struct nilfs_root *root;
 	struct nilfs_checkpoint *raw_cp;
 	struct buffer_head *bh_cp;
@@ -425,7 +425,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
 		goto failed;
 	}
 
-	err = nilfs_ifile_read(sbi->s_super, root, nilfs->ns_inode_size,
+	err = nilfs_ifile_read(sb, root, nilfs->ns_inode_size,
 			       &raw_cp->cp_ifile_inode, &root->ifile);
 	if (err)
 		goto failed_bh;
@@ -449,8 +449,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
 
 static int nilfs_freeze(struct super_block *sb)
 {
-	struct nilfs_sb_info *sbi = NILFS_SB(sb);
-	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
 	int err;
 
 	if (sb->s_flags & MS_RDONLY)
@@ -458,21 +457,20 @@ static int nilfs_freeze(struct super_block *sb)
 
 	/* Mark super block clean */
 	down_write(&nilfs->ns_sem);
-	err = nilfs_cleanup_super(sbi);
+	err = nilfs_cleanup_super(sb);
 	up_write(&nilfs->ns_sem);
 	return err;
 }
 
 static int nilfs_unfreeze(struct super_block *sb)
 {
-	struct nilfs_sb_info *sbi = NILFS_SB(sb);
-	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
 
 	if (sb->s_flags & MS_RDONLY)
 		return 0;
 
 	down_write(&nilfs->ns_sem);
-	nilfs_setup_super(sbi, false);
+	nilfs_setup_super(sb, false);
 	up_write(&nilfs->ns_sem);
 	return 0;
 }
@@ -668,15 +666,15 @@ nilfs_set_default_options(struct super_block *sb,
 		NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER;
 }
 
-static int nilfs_setup_super(struct nilfs_sb_info *sbi, int is_mount)
+static int nilfs_setup_super(struct super_block *sb, int is_mount)
 {
-	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
 	struct nilfs_super_block **sbp;
 	int max_mnt_count;
 	int mnt_count;
 
 	/* nilfs->ns_sem must be locked by the caller. */
-	sbp = nilfs_prepare_super(sbi, 0);
+	sbp = nilfs_prepare_super(sb, 0);
 	if (!sbp)
 		return -EIO;
 
@@ -707,7 +705,7 @@ skip_mount_setup:
 	/* synchronize sbp[1] with sbp[0] */
 	if (sbp[1])
 		memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
-	return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL);
+	return nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL);
 }
 
 struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb,
@@ -841,7 +839,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
 		goto out;
 	}
 
-	ret = nilfs_attach_checkpoint(NILFS_SB(s), cno, false, &root);
+	ret = nilfs_attach_checkpoint(s, cno, false, &root);
 	if (ret) {
 		printk(KERN_ERR "NILFS: error loading snapshot "
 		       "(checkpoint number=%llu).\n",
@@ -938,7 +936,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
 	}
 	sbi->s_nilfs = nilfs;
 
-	err = init_nilfs(nilfs, sbi, (char *)data);
+	err = init_nilfs(nilfs, sb, (char *)data);
 	if (err)
 		goto failed_nilfs;
 
@@ -950,12 +948,12 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
 	bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
 	sb->s_bdi = bdi ? : &default_backing_dev_info;
 
-	err = load_nilfs(nilfs, sbi);
+	err = load_nilfs(nilfs, sb);
 	if (err)
 		goto failed_nilfs;
 
 	cno = nilfs_last_cno(nilfs);
-	err = nilfs_attach_checkpoint(sbi, cno, true, &fsroot);
+	err = nilfs_attach_checkpoint(sb, cno, true, &fsroot);
 	if (err) {
 		printk(KERN_ERR "NILFS: error loading last checkpoint "
 		       "(checkpoint number=%llu).\n", (unsigned long long)cno);
@@ -963,7 +961,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	if (!(sb->s_flags & MS_RDONLY)) {
-		err = nilfs_attach_segment_constructor(sbi, fsroot);
+		err = nilfs_attach_log_writer(sb, fsroot);
 		if (err)
 			goto failed_checkpoint;
 	}
@@ -976,14 +974,14 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
 
 	if (!(sb->s_flags & MS_RDONLY)) {
 		down_write(&nilfs->ns_sem);
-		nilfs_setup_super(sbi, true);
+		nilfs_setup_super(sb, true);
 		up_write(&nilfs->ns_sem);
 	}
 
 	return 0;
 
  failed_segctor:
-	nilfs_detach_segment_constructor(sbi);
+	nilfs_detach_log_writer(sb);
 
  failed_checkpoint:
 	nilfs_put_root(fsroot);
@@ -1004,8 +1002,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
 
 static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 {
-	struct nilfs_sb_info *sbi = NILFS_SB(sb);
-	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
 	unsigned long old_sb_flags;
 	unsigned long old_mount_opt;
 	int err;
@@ -1031,8 +1028,8 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
 		goto out;
 	if (*flags & MS_RDONLY) {
-		/* Shutting down the segment constructor */
-		nilfs_detach_segment_constructor(sbi);
+		/* Shutting down log writer */
+		nilfs_detach_log_writer(sb);
 		sb->s_flags |= MS_RDONLY;
 
 		/*
@@ -1040,7 +1037,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 		 * the RDONLY flag and then mark the partition as valid again.
 		 */
 		down_write(&nilfs->ns_sem);
-		nilfs_cleanup_super(sbi);
+		nilfs_cleanup_super(sb);
 		up_write(&nilfs->ns_sem);
 	} else {
 		__u64 features;
@@ -1067,12 +1064,12 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 		sb->s_flags &= ~MS_RDONLY;
 
 		root = NILFS_I(sb->s_root->d_inode)->i_root;
-		err = nilfs_attach_segment_constructor(sbi, root);
+		err = nilfs_attach_log_writer(sb, root);
 		if (err)
 			goto restore_opts;
 
 		down_write(&nilfs->ns_sem);
-		nilfs_setup_super(sbi, true);
+		nilfs_setup_super(sb, true);
 		up_write(&nilfs->ns_sem);
 	}
  out:
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 1bf695e887a..d2acd1a651f 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -201,16 +201,16 @@ static int nilfs_store_log_cursor(struct the_nilfs *nilfs,
 /**
  * load_nilfs - load and recover the nilfs
  * @nilfs: the_nilfs structure to be released
- * @sbi: nilfs_sb_info used to recover past segment
+ * @sb: super block isntance used to recover past segment
  *
  * load_nilfs() searches and load the latest super root,
  * attaches the last segment, and does recovery if needed.
  * The caller must call this exclusively for simultaneous mounts.
  */
-int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
+int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
 {
 	struct nilfs_recovery_info ri;
-	unsigned int s_flags = sbi->s_super->s_flags;
+	unsigned int s_flags = sb->s_flags;
 	int really_read_only = bdev_read_only(nilfs->ns_bdev);
 	int valid_fs = nilfs_valid_fs(nilfs);
 	int err;
@@ -275,7 +275,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 			goto scan_error;
 	}
 
-	err = nilfs_load_super_root(nilfs, sbi->s_super, ri.ri_super_root);
+	err = nilfs_load_super_root(nilfs, sb, ri.ri_super_root);
 	if (unlikely(err)) {
 		printk(KERN_ERR "NILFS: error loading super root.\n");
 		goto failed;
@@ -308,7 +308,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 			err = -EROFS;
 			goto failed_unload;
 		}
-		sbi->s_super->s_flags &= ~MS_RDONLY;
+		sb->s_flags &= ~MS_RDONLY;
 	} else if (nilfs_test_opt(nilfs, NORECOVERY)) {
 		printk(KERN_ERR "NILFS: recovery cancelled because norecovery "
 		       "option was specified for a read/write mount\n");
@@ -316,13 +316,13 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 		goto failed_unload;
 	}
 
-	err = nilfs_salvage_orphan_logs(nilfs, sbi, &ri);
+	err = nilfs_salvage_orphan_logs(nilfs, sb, &ri);
 	if (err)
 		goto failed_unload;
 
 	down_write(&nilfs->ns_sem);
 	nilfs->ns_mount_state |= NILFS_VALID_FS; /* set "clean" flag */
-	err = nilfs_cleanup_super(sbi);
+	err = nilfs_cleanup_super(sb);
 	up_write(&nilfs->ns_sem);
 
 	if (err) {
@@ -334,7 +334,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 
  skip_recovery:
 	nilfs_clear_recovery_info(&ri);
-	sbi->s_super->s_flags = s_flags;
+	sb->s_flags = s_flags;
 	return 0;
 
  scan_error:
@@ -348,7 +348,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 
  failed:
 	nilfs_clear_recovery_info(&ri);
-	sbi->s_super->s_flags = s_flags;
+	sb->s_flags = s_flags;
 	return err;
 }
 
@@ -526,7 +526,6 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
 /**
  * init_nilfs - initialize a NILFS instance.
  * @nilfs: the_nilfs structure
- * @sbi: nilfs_sb_info
  * @sb: super block
  * @data: mount options
  *
@@ -537,9 +536,8 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
  * Return Value: On success, 0 is returned. On error, a negative error
  * code is returned.
  */
-int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
+int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
 {
-	struct super_block *sb = sbi->s_super;
 	struct nilfs_super_block *sbp;
 	int blocksize;
 	int err;
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 10521b97ded..793bd272f9e 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -267,15 +267,14 @@ static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs)
 void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
 struct the_nilfs *alloc_nilfs(struct block_device *bdev);
 void destroy_nilfs(struct the_nilfs *nilfs);
-int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
-int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
+int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data);
+int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb);
 int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t);
 int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
 struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno);
 struct nilfs_root *nilfs_find_or_create_root(struct the_nilfs *nilfs,
 					     __u64 cno);
 void nilfs_put_root(struct nilfs_root *root);
-struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64);
 int nilfs_near_disk_full(struct the_nilfs *);
 void nilfs_fall_back_super_block(struct the_nilfs *);
 void nilfs_swap_super_block(struct the_nilfs *);
-- 
cgit v1.2.3


From b306419ae08d9def53f2142a37cc0a58622307a8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 8 Mar 2011 21:16:28 -0500
Subject: nd->inode is not set on the second attempt in path_walk()

We leave it at whatever it had been pointing to after the
first link_path_walk() had failed with -ESTALE.  Things
do not work well after that...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index a5e844fe4b2..a4689eb2df2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1546,6 +1546,7 @@ static int path_walk(const char *name, struct nameidata *nd)
 		/* nd->path had been dropped */
 		current->total_link_count = 0;
 		nd->path = save;
+		nd->inode = save.dentry->d_inode;
 		path_get(&nd->path);
 		nd->flags |= LOOKUP_REVAL;
 		result = link_path_walk(name, nd);
-- 
cgit v1.2.3


From e3154e9748f0f337e9f6ff9dc7d7bf24d426bd1a Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Wed, 9 Mar 2011 11:05:08 +0900
Subject: nilfs2: get rid of nilfs_sb_info structure

This directly uses sb->s_fs_info to keep a nilfs filesystem object and
fully removes the intermediate nilfs_sb_info structure.  With this
change, the hierarchy of on-memory structures of nilfs will be
simplified as follows:

Before:
  super_block
       -> nilfs_sb_info
             -> the_nilfs
                   -> cptree --+-> nilfs_root (current file system)
                               +-> nilfs_root (snapshot A)
                               +-> nilfs_root (snapshot B)
                               :
             -> nilfs_sc_info (log writer structure)
After:
  super_block
       -> the_nilfs
             -> cptree --+-> nilfs_root (current file system)
                         +-> nilfs_root (snapshot A)
                         +-> nilfs_root (snapshot B)
                         :
             -> nilfs_sc_info (log writer structure)

The reason we didn't design it this way from the beginning is that the
initial shape also differed from the above.  The early hierarchy was
composed of "per-mount-point" super_block -> nilfs_sb_info pairs and a
shared nilfs object.  In kernel 2.6.37, it was changed to the
current shape in order to unify super block instances into one per
device, and this cleanup became applicable as a result.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/bmap.c      |  1 -
 fs/nilfs2/file.c      |  2 +-
 fs/nilfs2/inode.c     | 10 ++++----
 fs/nilfs2/ioctl.c     | 23 +++++++++----------
 fs/nilfs2/mdt.h       |  2 +-
 fs/nilfs2/namei.c     |  2 +-
 fs/nilfs2/nilfs.h     |  3 +--
 fs/nilfs2/sb.h        | 46 -------------------------------------
 fs/nilfs2/segment.c   | 47 +++++++++++++++++++-------------------
 fs/nilfs2/segment.h   |  2 +-
 fs/nilfs2/super.c     | 63 +++++++++++++++++++--------------------------------
 fs/nilfs2/the_nilfs.h |  1 -
 12 files changed, 68 insertions(+), 134 deletions(-)
 delete mode 100644 fs/nilfs2/sb.h

(limited to 'fs')

diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 85447a2fab3..4723f04e9b1 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -25,7 +25,6 @@
 #include <linux/errno.h>
 #include "nilfs.h"
 #include "bmap.h"
-#include "sb.h"
 #include "btree.h"
 #include "direct.h"
 #include "btnode.h"
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 7a5e4ab15c6..93589fccdd9 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -59,7 +59,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct nilfs_transaction_info ti;
 	int ret;
 
-	if (unlikely(nilfs_near_disk_full(NILFS_SB(inode->i_sb)->s_nilfs)))
+	if (unlikely(nilfs_near_disk_full(inode->i_sb->s_fs_info)))
 		return VM_FAULT_SIGBUS; /* -ENOSPC */
 
 	lock_page(page);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 7a3dbe4f229..d5625be236a 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -295,7 +295,7 @@ const struct address_space_operations nilfs_aops = {
 struct inode *nilfs_new_inode(struct inode *dir, int mode)
 {
 	struct super_block *sb = dir->i_sb;
-	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 	struct inode *inode;
 	struct nilfs_inode_info *ii;
 	struct nilfs_root *root;
@@ -433,7 +433,7 @@ static int __nilfs_read_inode(struct super_block *sb,
 			      struct nilfs_root *root, unsigned long ino,
 			      struct inode *inode)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 	struct buffer_head *bh;
 	struct nilfs_inode *raw_inode;
 	int err;
@@ -807,7 +807,7 @@ int nilfs_permission(struct inode *inode, int mask, unsigned int flags)
 
 int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
 {
-	struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
+	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
 	struct nilfs_inode_info *ii = NILFS_I(inode);
 	int err;
 
@@ -836,7 +836,7 @@ int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
 int nilfs_inode_dirty(struct inode *inode)
 {
 	struct nilfs_inode_info *ii = NILFS_I(inode);
-	struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
+	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
 	int ret = 0;
 
 	if (!list_empty(&ii->i_dirty)) {
@@ -851,7 +851,7 @@ int nilfs_inode_dirty(struct inode *inode)
 int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
 {
 	struct nilfs_inode_info *ii = NILFS_I(inode);
-	struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
+	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
 
 	atomic_add(nr_dirty, &nilfs->ns_ndirtyblks);
 
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 5471eed5ecc..95c04c2f2b3 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -166,8 +166,7 @@ static int nilfs_ioctl_getversion(struct inode *inode, void __user *argp)
 static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
 				     unsigned int cmd, void __user *argp)
 {
-	struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
-	struct inode *cpfile = nilfs->ns_cpfile;
+	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
 	struct nilfs_transaction_info ti;
 	struct nilfs_cpmode cpmode;
 	int ret;
@@ -187,7 +186,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
 
 	nilfs_transaction_begin(inode->i_sb, &ti, 0);
 	ret = nilfs_cpfile_change_cpmode(
-		cpfile, cpmode.cm_cno, cpmode.cm_mode);
+		nilfs->ns_cpfile, cpmode.cm_cno, cpmode.cm_mode);
 	if (unlikely(ret < 0))
 		nilfs_transaction_abort(inode->i_sb);
 	else
@@ -203,7 +202,7 @@ static int
 nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,
 			      unsigned int cmd, void __user *argp)
 {
-	struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile;
+	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
 	struct nilfs_transaction_info ti;
 	__u64 cno;
 	int ret;
@@ -220,7 +219,7 @@ nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,
 		goto out;
 
 	nilfs_transaction_begin(inode->i_sb, &ti, 0);
-	ret = nilfs_cpfile_delete_checkpoint(cpfile, cno);
+	ret = nilfs_cpfile_delete_checkpoint(nilfs->ns_cpfile, cno);
 	if (unlikely(ret < 0))
 		nilfs_transaction_abort(inode->i_sb);
 	else
@@ -246,7 +245,7 @@ nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
 static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp,
 				  unsigned int cmd, void __user *argp)
 {
-	struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
+	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
 	struct nilfs_cpstat cpstat;
 	int ret;
 
@@ -277,7 +276,7 @@ nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
 static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp,
 				  unsigned int cmd, void __user *argp)
 {
-	struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
+	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
 	struct nilfs_sustat sustat;
 	int ret;
 
@@ -333,7 +332,7 @@ nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags,
 static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp,
 				  unsigned int cmd, void __user *argp)
 {
-	struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
+	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
 	struct nilfs_argv argv;
 	int ret;
 
@@ -402,7 +401,7 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb,
 				   struct nilfs_argv *argv, void *buf)
 {
 	size_t nmembs = argv->v_nmembs;
-	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 	struct inode *inode;
 	struct nilfs_vdesc *vdesc;
 	struct buffer_head *bh, *n;
@@ -616,7 +615,7 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
 		ret = PTR_ERR(kbufs[4]);
 		goto out;
 	}
-	nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
+	nilfs = inode->i_sb->s_fs_info;
 
 	for (n = 0; n < 4; n++) {
 		ret = -EINVAL;
@@ -689,7 +688,7 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
 		return ret;
 
 	if (argp != NULL) {
-		nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
+		nilfs = inode->i_sb->s_fs_info;
 		down_read(&nilfs->ns_segctor_sem);
 		cno = nilfs->ns_cno - 1;
 		up_read(&nilfs->ns_segctor_sem);
@@ -707,7 +706,7 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
 						  void *, size_t, size_t))
 
 {
-	struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
+	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
 	struct nilfs_argv argv;
 	int ret;
 
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index b13734bf352..ed68563ec70 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -66,7 +66,7 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
 
 static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode)
 {
-	return NILFS_SB(inode->i_sb)->s_nilfs;
+	return inode->i_sb->s_fs_info;
 }
 
 /* Default GFP flags using highmem */
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 161791d2645..546849b3e88 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -482,7 +482,7 @@ static struct dentry *nilfs_get_dentry(struct super_block *sb, u64 cno,
 	if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO)
 		return ERR_PTR(-ESTALE);
 
-	root = nilfs_lookup_root(NILFS_SB(sb)->s_nilfs, cno);
+	root = nilfs_lookup_root(sb->s_fs_info, cno);
 	if (!root)
 		return ERR_PTR(-ESTALE);
 
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index eba1aaa7fb7..856e8e4e0b7 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -30,7 +30,6 @@
 #include <linux/blkdev.h>
 #include <linux/nilfs2_fs.h>
 #include "the_nilfs.h"
-#include "sb.h"
 #include "bmap.h"
 
 /*
@@ -122,7 +121,7 @@ enum {
 #define NILFS_SYS_INO_BITS   \
   ((unsigned int)(1 << NILFS_ROOT_INO) | NILFS_MDT_INO_BITS)
 
-#define NILFS_FIRST_INO(sb)  (NILFS_SB(sb)->s_nilfs->ns_first_ino)
+#define NILFS_FIRST_INO(sb) (((struct the_nilfs *)sb->s_fs_info)->ns_first_ino)
 
 #define NILFS_MDT_INODE(sb, ino) \
   ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & (1 << (ino))))
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
deleted file mode 100644
index 44553f42eba..00000000000
--- a/fs/nilfs2/sb.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * sb.h - NILFS on-memory super block structure.
- *
- * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
- *
- */
-
-#ifndef _NILFS_SB
-#define _NILFS_SB
-
-#include <linux/types.h>
-#include <linux/fs.h>
-
-struct the_nilfs;
-
-/*
- * NILFS super-block data in memory
- */
-struct nilfs_sb_info {
-	/* Fundamental members */
-	struct super_block *s_super;	/* reverse pointer to super_block */
-	struct the_nilfs *s_nilfs;
-};
-
-static inline struct nilfs_sb_info *NILFS_SB(struct super_block *sb)
-{
-	return sb->s_fs_info;
-}
-
-#endif /* _NILFS_SB */
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 90e3130303a..afe4f218345 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -191,7 +191,7 @@ int nilfs_transaction_begin(struct super_block *sb,
 
 	vfs_check_frozen(sb, SB_FREEZE_WRITE);
 
-	nilfs = NILFS_SB(sb)->s_nilfs;
+	nilfs = sb->s_fs_info;
 	down_read(&nilfs->ns_segctor_sem);
 	if (vacancy_check && nilfs_near_disk_full(nilfs)) {
 		up_read(&nilfs->ns_segctor_sem);
@@ -222,7 +222,7 @@ int nilfs_transaction_begin(struct super_block *sb,
 int nilfs_transaction_commit(struct super_block *sb)
 {
 	struct nilfs_transaction_info *ti = current->journal_info;
-	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 	int err = 0;
 
 	BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
@@ -252,13 +252,14 @@ int nilfs_transaction_commit(struct super_block *sb)
 void nilfs_transaction_abort(struct super_block *sb)
 {
 	struct nilfs_transaction_info *ti = current->journal_info;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 
 	BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
 	if (ti->ti_count > 0) {
 		ti->ti_count--;
 		return;
 	}
-	up_read(&NILFS_SB(sb)->s_nilfs->ns_segctor_sem);
+	up_read(&nilfs->ns_segctor_sem);
 
 	current->journal_info = ti->ti_save;
 	if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
@@ -267,7 +268,7 @@ void nilfs_transaction_abort(struct super_block *sb)
 
 void nilfs_relax_pressure_in_lock(struct super_block *sb)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 	struct nilfs_sc_info *sci = nilfs->ns_writer;
 
 	if (!sci || !sci->sc_flush_request)
@@ -293,7 +294,7 @@ static void nilfs_transaction_lock(struct super_block *sb,
 				   int gcflag)
 {
 	struct nilfs_transaction_info *cur_ti = current->journal_info;
-	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 	struct nilfs_sc_info *sci = nilfs->ns_writer;
 
 	WARN_ON(cur_ti);
@@ -321,7 +322,7 @@ static void nilfs_transaction_lock(struct super_block *sb,
 static void nilfs_transaction_unlock(struct super_block *sb)
 {
 	struct nilfs_transaction_info *ti = current->journal_info;
-	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 
 	BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
 	BUG_ON(ti->ti_count > 0);
@@ -770,7 +771,7 @@ static int nilfs_segctor_clean(struct nilfs_sc_info *sci)
 
 static int nilfs_segctor_confirm(struct nilfs_sc_info *sci)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sci->sc_super)->s_nilfs;
+	struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
 	int ret = 0;
 
 	if (nilfs_test_metadata_dirty(nilfs, sci->sc_root))
@@ -786,7 +787,7 @@ static int nilfs_segctor_confirm(struct nilfs_sc_info *sci)
 
 static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sci->sc_super)->s_nilfs;
+	struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
 
 	nilfs_mdt_clear_dirty(sci->sc_root->ifile);
 	nilfs_mdt_clear_dirty(nilfs->ns_cpfile);
@@ -796,7 +797,7 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
 
 static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sci->sc_super)->s_nilfs;
+	struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
 	struct buffer_head *bh_cp;
 	struct nilfs_checkpoint *raw_cp;
 	int err;
@@ -820,7 +821,7 @@ static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
 
 static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sci->sc_super)->s_nilfs;
+	struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
 	struct buffer_head *bh_cp;
 	struct nilfs_checkpoint *raw_cp;
 	int err;
@@ -1044,7 +1045,7 @@ static int nilfs_segctor_scan_file_dsync(struct nilfs_sc_info *sci,
 
 static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sci->sc_super)->s_nilfs;
+	struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
 	struct list_head *head;
 	struct nilfs_inode_info *ii;
 	size_t ndone;
@@ -1853,7 +1854,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 {
 	struct nilfs_segment_buffer *segbuf;
 	struct page *bd_page = NULL, *fs_page = NULL;
-	struct the_nilfs *nilfs = NILFS_SB(sci->sc_super)->s_nilfs;
+	struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
 	int update_sr = false;
 
 	list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) {
@@ -2024,7 +2025,7 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
  */
 static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sci->sc_super)->s_nilfs;
+	struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
 	struct page *failed_page;
 	int err;
 
@@ -2162,7 +2163,7 @@ static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn)
  */
 void nilfs_flush_segment(struct super_block *sb, ino_t ino)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 	struct nilfs_sc_info *sci = nilfs->ns_writer;
 
 	if (!sci || nilfs_doing_construction())
@@ -2252,7 +2253,7 @@ static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err)
  */
 int nilfs_construct_segment(struct super_block *sb)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 	struct nilfs_sc_info *sci = nilfs->ns_writer;
 	struct nilfs_transaction_info *ti;
 	int err;
@@ -2290,7 +2291,7 @@ int nilfs_construct_segment(struct super_block *sb)
 int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
 				  loff_t start, loff_t end)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 	struct nilfs_sc_info *sci = nilfs->ns_writer;
 	struct nilfs_inode_info *ii;
 	struct nilfs_transaction_info ti;
@@ -2381,7 +2382,7 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err)
  */
 static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sci->sc_super)->s_nilfs;
+	struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
 	struct nilfs_super_block **sbp;
 	int err = 0;
 
@@ -2436,7 +2437,7 @@ nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head)
 int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
 			 void **kbufs)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 	struct nilfs_sc_info *sci = nilfs->ns_writer;
 	struct nilfs_transaction_info ti;
 	int err;
@@ -2552,7 +2553,7 @@ static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci)
 static int nilfs_segctor_thread(void *arg)
 {
 	struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
-	struct the_nilfs *nilfs = NILFS_SB(sci->sc_super)->s_nilfs;
+	struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
 	int timeout = 0;
 
 	sci->sc_timer.data = (unsigned long)current;
@@ -2666,7 +2667,7 @@ static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
 static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb,
 					       struct nilfs_root *root)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 	struct nilfs_sc_info *sci;
 
 	sci = kzalloc(sizeof(*sci), GFP_KERNEL);
@@ -2726,7 +2727,7 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
  */
 static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sci->sc_super)->s_nilfs;
+	struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
 	int flag;
 
 	up_write(&nilfs->ns_segctor_sem);
@@ -2774,7 +2775,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
  */
 int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 	int err;
 
 	if (nilfs->ns_writer) {
@@ -2807,7 +2808,7 @@ int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root)
  */
 void nilfs_detach_log_writer(struct super_block *sb)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 	LIST_HEAD(garbage_list);
 
 	down_write(&nilfs->ns_segctor_sem);
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index e01998e33b3..6c02a86745f 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -27,7 +27,7 @@
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
 #include <linux/nilfs2_fs.h>
-#include "sb.h"
+#include "nilfs.h"
 
 struct nilfs_root;
 
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index a8cbd695441..062cca06519 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -76,7 +76,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data);
 
 static void nilfs_set_error(struct super_block *sb)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 	struct nilfs_super_block **sbp;
 
 	down_write(&nilfs->ns_sem);
@@ -108,7 +108,7 @@ static void nilfs_set_error(struct super_block *sb)
 void nilfs_error(struct super_block *sb, const char *function,
 		 const char *fmt, ...)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 	struct va_format vaf;
 	va_list args;
 
@@ -190,7 +190,7 @@ void nilfs_destroy_inode(struct inode *inode)
 
 static int nilfs_sync_super(struct super_block *sb, int flag)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 	int err;
 
  retry:
@@ -265,7 +265,7 @@ void nilfs_set_log_cursor(struct nilfs_super_block *sbp,
 struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb,
 					       int flip)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 	struct nilfs_super_block **sbp = nilfs->ns_sbp;
 
 	/* nilfs->ns_sem must be locked by the caller. */
@@ -291,7 +291,7 @@ struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb,
 
 int nilfs_commit_super(struct super_block *sb, int flag)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 	struct nilfs_super_block **sbp = nilfs->ns_sbp;
 	time_t t;
 
@@ -324,7 +324,7 @@ int nilfs_commit_super(struct super_block *sb, int flag)
  */
 int nilfs_cleanup_super(struct super_block *sb)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 	struct nilfs_super_block **sbp;
 	int flag = NILFS_SB_COMMIT;
 	int ret = -EIO;
@@ -349,8 +349,7 @@ int nilfs_cleanup_super(struct super_block *sb)
 
 static void nilfs_put_super(struct super_block *sb)
 {
-	struct nilfs_sb_info *sbi = NILFS_SB(sb);
-	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 
 	nilfs_detach_log_writer(sb);
 
@@ -365,14 +364,12 @@ static void nilfs_put_super(struct super_block *sb)
 	iput(nilfs->ns_dat);
 
 	destroy_nilfs(nilfs);
-	sbi->s_super = NULL;
 	sb->s_fs_info = NULL;
-	kfree(sbi);
 }
 
 static int nilfs_sync_fs(struct super_block *sb, int wait)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 	struct nilfs_super_block **sbp;
 	int err = 0;
 
@@ -396,7 +393,7 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
 int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
 			    struct nilfs_root **rootp)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 	struct nilfs_root *root;
 	struct nilfs_checkpoint *raw_cp;
 	struct buffer_head *bh_cp;
@@ -449,7 +446,7 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
 
 static int nilfs_freeze(struct super_block *sb)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 	int err;
 
 	if (sb->s_flags & MS_RDONLY)
@@ -464,7 +461,7 @@ static int nilfs_freeze(struct super_block *sb)
 
 static int nilfs_unfreeze(struct super_block *sb)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 
 	if (sb->s_flags & MS_RDONLY)
 		return 0;
@@ -527,7 +524,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 {
 	struct super_block *sb = vfs->mnt_sb;
-	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 	struct nilfs_root *root = NILFS_I(vfs->mnt_root->d_inode)->i_root;
 
 	if (!nilfs_test_opt(nilfs, BARRIER))
@@ -591,7 +588,7 @@ static match_table_t tokens = {
 
 static int parse_options(char *options, struct super_block *sb, int is_remount)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 	char *p;
 	substring_t args[MAX_OPT_ARGS];
 
@@ -660,7 +657,7 @@ static inline void
 nilfs_set_default_options(struct super_block *sb,
 			  struct nilfs_super_block *sbp)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 
 	nilfs->ns_mount_opt =
 		NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER;
@@ -668,7 +665,7 @@ nilfs_set_default_options(struct super_block *sb,
 
 static int nilfs_setup_super(struct super_block *sb, int is_mount)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 	struct nilfs_super_block **sbp;
 	int max_mnt_count;
 	int mnt_count;
@@ -726,7 +723,7 @@ int nilfs_store_magic_and_option(struct super_block *sb,
 				 struct nilfs_super_block *sbp,
 				 char *data)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 
 	sb->s_magic = le16_to_cpu(sbp->s_magic);
 
@@ -821,7 +818,7 @@ static int nilfs_get_root_dentry(struct super_block *sb,
 static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
 				 struct dentry **root_dentry)
 {
-	struct the_nilfs *nilfs = NILFS_SB(s)->s_nilfs;
+	struct the_nilfs *nilfs = s->s_fs_info;
 	struct nilfs_root *root;
 	int ret;
 
@@ -873,7 +870,7 @@ static int nilfs_try_to_shrink_tree(struct dentry *root_dentry)
 
 int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 	struct nilfs_root *root;
 	struct inode *inode;
 	struct dentry *dentry;
@@ -886,7 +883,7 @@ int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno)
 		return true;	/* protect recent checkpoints */
 
 	ret = false;
-	root = nilfs_lookup_root(NILFS_SB(sb)->s_nilfs, cno);
+	root = nilfs_lookup_root(nilfs, cno);
 	if (root) {
 		inode = nilfs_ilookup(sb, root, NILFS_ROOT_INO);
 		if (inode) {
@@ -916,25 +913,16 @@ static int
 nilfs_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct the_nilfs *nilfs;
-	struct nilfs_sb_info *sbi;
 	struct nilfs_root *fsroot;
 	struct backing_dev_info *bdi;
 	__u64 cno;
 	int err;
 
-	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
-	if (!sbi)
+	nilfs = alloc_nilfs(sb->s_bdev);
+	if (!nilfs)
 		return -ENOMEM;
 
-	sb->s_fs_info = sbi;
-	sbi->s_super = sb;
-
-	nilfs = alloc_nilfs(sb->s_bdev);
-	if (!nilfs) {
-		err = -ENOMEM;
-		goto failed_sbi;
-	}
-	sbi->s_nilfs = nilfs;
+	sb->s_fs_info = nilfs;
 
 	err = init_nilfs(nilfs, sb, (char *)data);
 	if (err)
@@ -993,16 +981,12 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
 
  failed_nilfs:
 	destroy_nilfs(nilfs);
-
- failed_sbi:
-	sb->s_fs_info = NULL;
-	kfree(sbi);
 	return err;
 }
 
 static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 {
-	struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs;
+	struct the_nilfs *nilfs = sb->s_fs_info;
 	unsigned long old_sb_flags;
 	unsigned long old_mount_opt;
 	int err;
@@ -1083,7 +1067,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 
 struct nilfs_super_data {
 	struct block_device *bdev;
-	struct nilfs_sb_info *sbi;
 	__u64 cno;
 	int flags;
 };
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 793bd272f9e..f4968145c2a 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -31,7 +31,6 @@
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/slab.h>
-#include "sb.h"
 
 struct nilfs_sc_info;
 
-- 
cgit v1.2.3


From 662e3a551b468c7338f5291d7a00389fe85885e2 Mon Sep 17 00:00:00 2001
From: Abhijith Das <adas@redhat.com>
Date: Tue, 8 Mar 2011 10:40:42 -0500
Subject: GFS2: quota allows exceeding hard limit

Immediately after being synced to disk, cached quotas are zeroed out and a
subsequent access of the cached quotas results in incorrect zero values. This
meant that gfs2 assumed the actual usage to be the zero (or near-zero) usage
values it found in the cached quotas and comparison against warn/limits never
triggered a quota violation.

This patch adds a new flag QDF_REFRESH that is set after a sync so that the
cached quotas are forcefully refreshed from disk on a subsequent access on
seeing this flag set.

Resolves: rhbz#675944
Signed-off-by: Abhi Das <adas@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h | 1 +
 fs/gfs2/quota.c  | 8 +++++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 720c1e66b34..59aaaa05113 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -317,6 +317,7 @@ enum {
 	QDF_USER		= 0,
 	QDF_CHANGE		= 1,
 	QDF_LOCKED		= 2,
+	QDF_REFRESH		= 3,
 };
 
 struct gfs2_quota_data {
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 6ec964c31dc..e23d9864c41 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -834,6 +834,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 			goto out_end_trans;
 
 		do_qc(qd, -qd->qd_change_sync);
+		set_bit(QDF_REFRESH, &qd->qd_flags);
 	}
 
 	error = 0;
@@ -929,6 +930,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_alloc *al = ip->i_alloc;
+	struct gfs2_quota_data *qd;
 	unsigned int x;
 	int error = 0;
 
@@ -942,7 +944,11 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
 	     sort_qd, NULL);
 
 	for (x = 0; x < al->al_qd_num; x++) {
-		error = do_glock(al->al_qd[x], NO_FORCE, &al->al_qd_ghs[x]);
+		int force = NO_FORCE;
+		qd = al->al_qd[x];
+		if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags))
+			force = FORCE;
+		error = do_glock(qd, force, &al->al_qd_ghs[x]);
 		if (error)
 			break;
 	}
-- 
cgit v1.2.3


From fc0e38dae645f65424d1fb5d2a938aab8ce48a58 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 9 Mar 2011 10:58:04 +0000
Subject: GFS2: Fix glock deallocation race

This patch fixes a race in deallocating glocks which was introduced
in the RCU glock patch. We need to ensure that the glock count is
kept correct even in the case that there is a race to add a new
glock into the hash table. Also, to avoid having to wait for an
RCU grace period, the glock counter can be decremented before
call_rcu() is called.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c      | 10 ++++++++--
 fs/gfs2/glock.h      |  2 +-
 fs/gfs2/lock_dlm.c   |  4 ++--
 fs/gfs2/ops_fstype.c |  7 +------
 4 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index ddc3e1e3faa..3f45a14009b 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -103,16 +103,21 @@ static inline void spin_unlock_bucket(unsigned int hash)
 	__bit_spin_unlock(0, (unsigned long *)bl);
 }
 
-void gfs2_glock_free(struct rcu_head *rcu)
+static void gfs2_glock_dealloc(struct rcu_head *rcu)
 {
 	struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu);
-	struct gfs2_sbd *sdp = gl->gl_sbd;
 
 	if (gl->gl_ops->go_flags & GLOF_ASPACE)
 		kmem_cache_free(gfs2_glock_aspace_cachep, gl);
 	else
 		kmem_cache_free(gfs2_glock_cachep, gl);
+}
+
+void gfs2_glock_free(struct gfs2_glock *gl)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
 
+	call_rcu(&gl->gl_rcu, gfs2_glock_dealloc);
 	if (atomic_dec_and_test(&sdp->sd_glock_disposal))
 		wake_up(&sdp->sd_glock_wait);
 }
@@ -760,6 +765,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	if (tmp) {
 		spin_unlock_bucket(hash);
 		kmem_cache_free(cachep, gl);
+		atomic_dec(&sdp->sd_glock_disposal);
 		gl = tmp;
 	} else {
 		hlist_bl_add_head_rcu(&gl->gl_list, &gl_hash_table[hash]);
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index afa8bfea564..aea160690e9 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -230,7 +230,7 @@ extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
 extern void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
 extern void gfs2_glock_thaw(struct gfs2_sbd *sdp);
 extern void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
-extern void gfs2_glock_free(struct rcu_head *rcu);
+extern void gfs2_glock_free(struct gfs2_glock *gl);
 
 extern int __init gfs2_glock_init(void);
 extern void gfs2_glock_exit(void);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index c80485cb6f2..98c80d8c2a6 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -30,7 +30,7 @@ static void gdlm_ast(void *arg)
 
 	switch (gl->gl_lksb.sb_status) {
 	case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */
-		call_rcu(&gl->gl_rcu, gfs2_glock_free);
+		gfs2_glock_free(gl);
 		return;
 	case -DLM_ECANCEL: /* Cancel while getting lock */
 		ret |= LM_OUT_CANCELED;
@@ -165,7 +165,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
 	int error;
 
 	if (gl->gl_lksb.sb_lkid == 0) {
-		call_rcu(&gl->gl_rcu, gfs2_glock_free);
+		gfs2_glock_free(gl);
 		return;
 	}
 
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index a39c103ba49..67654d0ba15 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -928,14 +928,9 @@ static const match_table_t nolock_tokens = {
 	{ Opt_err, NULL },
 };
 
-static void nolock_put_lock(struct gfs2_glock *gl)
-{
-	call_rcu(&gl->gl_rcu, gfs2_glock_free);
-}
-
 static const struct lm_lockops nolock_ops = {
 	.lm_proto_name = "lock_nolock",
-	.lm_put_lock = nolock_put_lock,
+	.lm_put_lock = gfs2_glock_free,
 	.lm_tokens = &nolock_tokens,
 };
 
-- 
cgit v1.2.3


From 0a33443b38746f35fc4acc8a5af6c7099e03ea40 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 9 Mar 2011 11:14:32 +0000
Subject: GFS2: Remove potential race in flock code

This patch ensures that we always wait for glock demotion when
dropping flocks on a file in order to prevent any race
conditions associated with further flock calls or closing
the file.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/file.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 216ad2774a6..2878481f72a 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -981,8 +981,10 @@ static void do_unflock(struct file *file, struct file_lock *fl)
 
 	mutex_lock(&fp->f_fl_mutex);
 	flock_lock_file_wait(file, fl);
-	if (fl_gh->gh_gl)
-		gfs2_glock_dq_uninit(fl_gh);
+	if (fl_gh->gh_gl) {
+		gfs2_glock_dq_wait(fl_gh);
+		gfs2_holder_uninit(fl_gh);
+	}
 	mutex_unlock(&fp->f_fl_mutex);
 }
 
-- 
cgit v1.2.3


From 86fa8af69d8e90b7b40b8dab84d168527143ae20 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Fri, 4 Mar 2011 12:59:54 +0000
Subject: xfs: clean up the xfs_alloc_compute_aligned calling convention

Pass an xfs_alloc_arg structure to xfs_alloc_compute_aligned and derive
the alignment and minlen parameters from it.  This cleans up the existing
callers, and we'll need even more information from the xfs_alloc_arg
in subsequent patches.  Based on a patch from Dave Chinner.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_alloc.c | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index f3227984a9b..b5af10713dc 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -147,10 +147,9 @@ xfs_alloc_get_rec(
  */
 STATIC void
 xfs_alloc_compute_aligned(
+	xfs_alloc_arg_t	*args,		/* allocation argument structure */
 	xfs_agblock_t	foundbno,	/* starting block in found extent */
 	xfs_extlen_t	foundlen,	/* length in found extent */
-	xfs_extlen_t	alignment,	/* alignment for allocation */
-	xfs_extlen_t	minlen,		/* minimum length for allocation */
 	xfs_agblock_t	*resbno,	/* result block number */
 	xfs_extlen_t	*reslen)	/* result length */
 {
@@ -158,8 +157,8 @@ xfs_alloc_compute_aligned(
 	xfs_extlen_t	diff;
 	xfs_extlen_t	len;
 
-	if (alignment > 1 && foundlen >= minlen) {
-		bno = roundup(foundbno, alignment);
+	if (args->alignment > 1 && foundlen >= args->minlen) {
+		bno = roundup(foundbno, args->alignment);
 		diff = bno - foundbno;
 		len = diff >= foundlen ? 0 : foundlen - diff;
 	} else {
@@ -693,8 +692,7 @@ xfs_alloc_find_best_extent(
 		if (error)
 			goto error0;
 		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		xfs_alloc_compute_aligned(*sbno, *slen, args->alignment,
-					  args->minlen, &bno, slena);
+		xfs_alloc_compute_aligned(args, *sbno, *slen, &bno, slena);
 
 		/*
 		 * The good extent is closer than this one.
@@ -866,8 +864,8 @@ xfs_alloc_ag_vextent_near(
 			if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
 				goto error0;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-			xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment,
-					args->minlen, &ltbnoa, &ltlena);
+			xfs_alloc_compute_aligned(args, ltbno, ltlen,
+						  &ltbnoa, &ltlena);
 			if (ltlena < args->minlen)
 				continue;
 			args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
@@ -987,8 +985,8 @@ xfs_alloc_ag_vextent_near(
 			if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
 				goto error0;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-			xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment,
-					args->minlen, &ltbnoa, &ltlena);
+			xfs_alloc_compute_aligned(args, ltbno, ltlen,
+						  &ltbnoa, &ltlena);
 			if (ltlena >= args->minlen)
 				break;
 			if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
@@ -1003,8 +1001,8 @@ xfs_alloc_ag_vextent_near(
 			if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
 				goto error0;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-			xfs_alloc_compute_aligned(gtbno, gtlen, args->alignment,
-					args->minlen, &gtbnoa, &gtlena);
+			xfs_alloc_compute_aligned(args, gtbno, gtlen,
+						  &gtbnoa, &gtlena);
 			if (gtlena >= args->minlen)
 				break;
 			if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
@@ -1183,8 +1181,7 @@ xfs_alloc_ag_vextent_size(
 	 * once aligned; if not, we search left for something better.
 	 * This can't happen in the second case above.
 	 */
-	xfs_alloc_compute_aligned(fbno, flen, args->alignment, args->minlen,
-		&rbno, &rlen);
+	xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
 	rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
 	XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
 			(rlen <= flen && rbno + rlen <= fbno + flen), error0);
@@ -1209,8 +1206,8 @@ xfs_alloc_ag_vextent_size(
 			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 			if (flen < bestrlen)
 				break;
-			xfs_alloc_compute_aligned(fbno, flen, args->alignment,
-				args->minlen, &rbno, &rlen);
+			xfs_alloc_compute_aligned(args, fbno, flen,
+						  &rbno, &rlen);
 			rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
 			XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
 				(rlen <= flen && rbno + rlen <= fbno + flen),
-- 
cgit v1.2.3


From ecb6928fcf969b302929f109e175981df1dba697 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Fri, 4 Mar 2011 12:59:55 +0000
Subject: xfs: factor agf counter updates into a helper

Updating the AGF and transactions counters is duplicated between allocating
and freeing extents.  Factor the code into a common helper.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_alloc.c | 129 ++++++++++++++++++++++++++++-------------------------
 1 file changed, 68 insertions(+), 61 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index b5af10713dc..4bc3c649aee 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -463,6 +463,27 @@ xfs_alloc_read_agfl(
 	return 0;
 }
 
+STATIC int
+xfs_alloc_update_counters(
+	struct xfs_trans	*tp,
+	struct xfs_perag	*pag,
+	struct xfs_buf		*agbp,
+	long			len)
+{
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
+
+	pag->pagf_freeblks += len;
+	be32_add_cpu(&agf->agf_freeblks, len);
+
+	xfs_trans_agblocks_delta(tp, len);
+	if (unlikely(be32_to_cpu(agf->agf_freeblks) >
+		     be32_to_cpu(agf->agf_length)))
+		return EFSCORRUPTED;
+
+	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
+	return 0;
+}
+
 /*
  * Allocation group level functions.
  */
@@ -504,49 +525,44 @@ xfs_alloc_ag_vextent(
 		ASSERT(0);
 		/* NOTREACHED */
 	}
-	if (error)
+
+	if (error || args->agbno == NULLAGBLOCK)
 		return error;
-	/*
-	 * If the allocation worked, need to change the agf structure
-	 * (and log it), and the superblock.
-	 */
-	if (args->agbno != NULLAGBLOCK) {
-		xfs_agf_t	*agf;	/* allocation group freelist header */
-		long		slen = (long)args->len;
 
-		ASSERT(args->len >= args->minlen && args->len <= args->maxlen);
-		ASSERT(!(args->wasfromfl) || !args->isfl);
-		ASSERT(args->agbno % args->alignment == 0);
-		if (!(args->wasfromfl)) {
-
-			agf = XFS_BUF_TO_AGF(args->agbp);
-			be32_add_cpu(&agf->agf_freeblks, -(args->len));
-			xfs_trans_agblocks_delta(args->tp,
-						 -((long)(args->len)));
-			args->pag->pagf_freeblks -= args->len;
-			ASSERT(be32_to_cpu(agf->agf_freeblks) <=
-				be32_to_cpu(agf->agf_length));
-			xfs_alloc_log_agf(args->tp, args->agbp,
-						XFS_AGF_FREEBLKS);
-			/*
-			 * Search the busylist for these blocks and mark the
-			 * transaction as synchronous if blocks are found. This
-			 * avoids the need to block due to a synchronous log
-			 * force to ensure correct ordering as the synchronous
-			 * transaction will guarantee that for us.
-			 */
-			if (xfs_alloc_busy_search(args->mp, args->agno,
-						args->agbno, args->len))
-				xfs_trans_set_sync(args->tp);
-		}
-		if (!args->isfl)
-			xfs_trans_mod_sb(args->tp,
-				args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
-					XFS_TRANS_SB_FDBLOCKS, -slen);
-		XFS_STATS_INC(xs_allocx);
-		XFS_STATS_ADD(xs_allocb, args->len);
+	ASSERT(args->len >= args->minlen);
+	ASSERT(args->len <= args->maxlen);
+	ASSERT(!args->wasfromfl || !args->isfl);
+	ASSERT(args->agbno % args->alignment == 0);
+
+	if (!args->wasfromfl) {
+		error = xfs_alloc_update_counters(args->tp, args->pag,
+						  args->agbp,
+						  -((long)(args->len)));
+		if (error)
+			return error;
+
+		/*
+		 * Search the busylist for these blocks and mark the
+		 * transaction as synchronous if blocks are found. This
+		 * avoids the need to block due to a synchronous log
+		 * force to ensure correct ordering as the synchronous
+		 * transaction will guarantee that for us.
+		 */
+		if (xfs_alloc_busy_search(args->mp, args->agno,
+					args->agbno, args->len))
+			xfs_trans_set_sync(args->tp);
 	}
-	return 0;
+
+	if (!args->isfl) {
+		xfs_trans_mod_sb(args->tp, args->wasdel ?
+				 XFS_TRANS_SB_RES_FDBLOCKS :
+				 XFS_TRANS_SB_FDBLOCKS,
+				 -((long)(args->len)));
+	}
+
+	XFS_STATS_INC(xs_allocx);
+	XFS_STATS_ADD(xs_allocb, args->len);
+	return error;
 }
 
 /*
@@ -1385,6 +1401,7 @@ xfs_free_ag_extent(
 	xfs_mount_t	*mp;		/* mount point struct for filesystem */
 	xfs_agblock_t	nbno;		/* new starting block of freespace */
 	xfs_extlen_t	nlen;		/* new length of freespace */
+	xfs_perag_t	*pag;		/* per allocation group data */
 
 	mp = tp->t_mountp;
 	/*
@@ -1583,30 +1600,20 @@ xfs_free_ag_extent(
 	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
 	cnt_cur = NULL;
+
 	/*
 	 * Update the freespace totals in the ag and superblock.
 	 */
-	{
-		xfs_agf_t	*agf;
-		xfs_perag_t	*pag;		/* per allocation group data */
-
-		pag = xfs_perag_get(mp, agno);
-		pag->pagf_freeblks += len;
-		xfs_perag_put(pag);
-
-		agf = XFS_BUF_TO_AGF(agbp);
-		be32_add_cpu(&agf->agf_freeblks, len);
-		xfs_trans_agblocks_delta(tp, len);
-		XFS_WANT_CORRUPTED_GOTO(
-			be32_to_cpu(agf->agf_freeblks) <=
-			be32_to_cpu(agf->agf_length),
-			error0);
-		xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
-		if (!isfl)
-			xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
-		XFS_STATS_INC(xs_freex);
-		XFS_STATS_ADD(xs_freeb, len);
-	}
+	pag = xfs_perag_get(mp, agno);
+	error = xfs_alloc_update_counters(tp, pag, agbp, len);
+	xfs_perag_put(pag);
+	if (error)
+		goto error0;
+
+	if (!isfl)
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
+	XFS_STATS_INC(xs_freex);
+	XFS_STATS_ADD(xs_freeb, len);
 
 	trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
 
-- 
cgit v1.2.3


From facc31ddc3570a3a0d8951c94f16b898e01b464d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 9 Mar 2011 19:54:27 +0100
Subject: block: Don't implicitly trigger event check on disk_unblock_events()

Currently, disk_unblock_events() implicitly kicks an event check if the
block count reaches zero.  This behavior is not described in the
comment and hinders future changes.  Make the unblocker
explicitly check events by calling disk_check_events() as necessary.

This patch doesn't cause any behavior difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Kay Sievers <kay.sievers@vrfy.org>
---
 fs/block_dev.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 88928701959..fffdf86c175 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1446,6 +1446,7 @@ int blkdev_put(struct block_device *bdev, fmode_t mode)
 		if (bdev_free) {
 			if (bdev->bd_write_holder) {
 				disk_unblock_events(bdev->bd_disk);
+				disk_check_events(bdev->bd_disk);
 				bdev->bd_write_holder = false;
 			} else
 				disk_check_events(bdev->bd_disk);
-- 
cgit v1.2.3


From 6936217cc7e58573026bdba25b1bfb778e8f2267 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 9 Mar 2011 19:54:27 +0100
Subject: block: Don't check events on close unless it was blocked

The block event mechanism currently always checks events when the
device is being closed regardless of the open mode.  The intention was
to allow detection of EJECT_REQUEST when a device is closed whether
disk event polling is enabled or not.

This is unnecessary as, for devices of interest, events are checked
from either userland or kernel and in the former case ->check_events()
is performed on open of each poll attempt anyway.  Furthermore, this
unconditional event check on close makes the code susceptible to an
event loop if the block driver doesn't clear reported events correctly - an
event triggers userland to open and close the device which in turn
causes another event, rinse and repeat.

Check events on close only if it was blocked by excl write open.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Kay Sievers <kay.sievers@vrfy.org>
---
 fs/block_dev.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index fffdf86c175..7dd2c658d42 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1448,13 +1448,11 @@ int blkdev_put(struct block_device *bdev, fmode_t mode)
 				disk_unblock_events(bdev->bd_disk);
 				disk_check_events(bdev->bd_disk);
 				bdev->bd_write_holder = false;
-			} else
-				disk_check_events(bdev->bd_disk);
+			}
 		}
 
 		mutex_unlock(&bdev->bd_mutex);
-	} else
-		disk_check_events(bdev->bd_disk);
+	}
 
 	return __blkdev_put(bdev, mode, 0);
 }
-- 
cgit v1.2.3


From 69e02c59a7d962dced8047401b81a8d897e1702e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 9 Mar 2011 19:54:27 +0100
Subject: block: Don't check events while open is in progress

Not all block drivers clear events immediately after reporting.  Some
do so in ->revalidate_disk() or other steps during ->open().  There is
a slim chance event poll may happen between the clearing event check
from check_disk_change() and the actual clearing of the events which
would result in spurious events.

Block event checks while block device open is in progress.  There is
no need to kick explicit event check afterwards as events are always
checked during open.

-v2: The original patch could have called disk_unblock_events() with
     an already released or %NULL @disk causing oops.  Fixed by making
     sure references are put after disk_unblock_events() is called.
     It also makes the error path of __blkdev_get() a bit simpler.
     This problem was reported by Jens.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Kay Sievers <kay.sievers@vrfy.org>
---
 fs/block_dev.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 7dd2c658d42..d42cad2757a 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1087,6 +1087,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	if (!disk)
 		goto out;
 
+	disk_block_events(disk);
 	mutex_lock_nested(&bdev->bd_mutex, for_part);
 	if (!bdev->bd_openers) {
 		bdev->bd_disk = disk;
@@ -1108,10 +1109,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 					 */
 					disk_put_part(bdev->bd_part);
 					bdev->bd_part = NULL;
-					module_put(disk->fops->owner);
-					put_disk(disk);
 					bdev->bd_disk = NULL;
 					mutex_unlock(&bdev->bd_mutex);
+					disk_unblock_events(disk);
+					module_put(disk->fops->owner);
+					put_disk(disk);
 					goto restart;
 				}
 				if (ret)
@@ -1148,9 +1150,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 			bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
 		}
 	} else {
-		module_put(disk->fops->owner);
-		put_disk(disk);
-		disk = NULL;
 		if (bdev->bd_contains == bdev) {
 			if (bdev->bd_disk->fops->open) {
 				ret = bdev->bd_disk->fops->open(bdev, mode);
@@ -1160,11 +1159,15 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 			if (bdev->bd_invalidated)
 				rescan_partitions(bdev->bd_disk, bdev);
 		}
+		/* only one opener holds refs to the module and disk */
+		module_put(disk->fops->owner);
+		put_disk(disk);
 	}
 	bdev->bd_openers++;
 	if (for_part)
 		bdev->bd_part_count++;
 	mutex_unlock(&bdev->bd_mutex);
+	disk_unblock_events(disk);
 	return 0;
 
  out_clear:
@@ -1177,9 +1180,9 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	bdev->bd_contains = NULL;
  out_unlock_bdev:
 	mutex_unlock(&bdev->bd_mutex);
- out:
-	if (disk)
-		module_put(disk->fops->owner);
+	disk_unblock_events(disk);
+	module_put(disk->fops->owner);
 	put_disk(disk);
+ out:
 	bdput(bdev);
 
-- 
cgit v1.2.3


From 7eaceaccab5f40bbfda044629a6298616aeaed50 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Thu, 10 Mar 2011 08:52:07 +0100
Subject: block: remove per-queue plugging

Code has been converted over to the new explicit on-stack plugging,
and delay users have been converted to use the new API for that.
So let's kill off the old plugging along with aops->sync_page().

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/adfs/inode.c              |  1 -
 fs/affs/file.c               |  2 -
 fs/aio.c                     |  4 +-
 fs/befs/linuxvfs.c           |  1 -
 fs/bfs/file.c                |  1 -
 fs/block_dev.c               |  1 -
 fs/btrfs/disk-io.c           | 79 --------------------------------------
 fs/btrfs/inode.c             |  1 -
 fs/btrfs/volumes.c           | 91 ++++++--------------------------------------
 fs/buffer.c                  | 31 ++-------------
 fs/cifs/file.c               | 30 ---------------
 fs/direct-io.c               |  5 +--
 fs/efs/inode.c               |  1 -
 fs/exofs/inode.c             |  1 -
 fs/ext2/inode.c              |  2 -
 fs/ext3/inode.c              |  3 --
 fs/ext4/inode.c              |  4 --
 fs/fat/inode.c               |  1 -
 fs/freevxfs/vxfs_subr.c      |  1 -
 fs/fuse/inode.c              |  1 -
 fs/gfs2/aops.c               |  3 --
 fs/gfs2/meta_io.c            |  1 -
 fs/hfs/inode.c               |  2 -
 fs/hfsplus/inode.c           |  2 -
 fs/hpfs/file.c               |  1 -
 fs/isofs/inode.c             |  1 -
 fs/jfs/inode.c               |  1 -
 fs/jfs/jfs_metapage.c        |  1 -
 fs/logfs/dev_bdev.c          |  2 -
 fs/minix/inode.c             |  1 -
 fs/nilfs2/btnode.c           |  6 +--
 fs/nilfs2/gcinode.c          |  1 -
 fs/nilfs2/inode.c            |  1 -
 fs/nilfs2/mdt.c              |  9 +----
 fs/nilfs2/page.c             |  5 +--
 fs/nilfs2/page.h             |  3 +-
 fs/ntfs/aops.c               |  4 --
 fs/ntfs/compress.c           |  3 +-
 fs/ocfs2/aops.c              |  1 -
 fs/ocfs2/cluster/heartbeat.c |  4 --
 fs/omfs/file.c               |  1 -
 fs/qnx4/inode.c              |  1 -
 fs/reiserfs/inode.c          |  1 -
 fs/sysv/itree.c              |  1 -
 fs/ubifs/super.c             |  1 -
 fs/udf/file.c                |  1 -
 fs/udf/inode.c               |  1 -
 fs/ufs/inode.c               |  1 -
 fs/ufs/truncate.c            |  2 +-
 fs/xfs/linux-2.6/xfs_aops.c  |  1 -
 fs/xfs/linux-2.6/xfs_buf.c   | 13 +++----
 51 files changed, 32 insertions(+), 305 deletions(-)

(limited to 'fs')

diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 65794b8fe79..1cc84b27613 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -73,7 +73,6 @@ static sector_t _adfs_bmap(struct address_space *mapping, sector_t block)
 static const struct address_space_operations adfs_aops = {
 	.readpage	= adfs_readpage,
 	.writepage	= adfs_writepage,
-	.sync_page	= block_sync_page,
 	.write_begin	= adfs_write_begin,
 	.write_end	= generic_write_end,
 	.bmap		= _adfs_bmap
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 0a90dcd46de..acf321b70fc 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -429,7 +429,6 @@ static sector_t _affs_bmap(struct address_space *mapping, sector_t block)
 const struct address_space_operations affs_aops = {
 	.readpage = affs_readpage,
 	.writepage = affs_writepage,
-	.sync_page = block_sync_page,
 	.write_begin = affs_write_begin,
 	.write_end = generic_write_end,
 	.bmap = _affs_bmap
@@ -786,7 +785,6 @@ out:
 const struct address_space_operations affs_aops_ofs = {
 	.readpage = affs_readpage_ofs,
 	//.writepage = affs_writepage_ofs,
-	//.sync_page = affs_sync_page_ofs,
 	.write_begin = affs_write_begin_ofs,
 	.write_end = affs_write_end_ofs
 };
diff --git a/fs/aio.c b/fs/aio.c
index fc557a3be0a..c5ea494ea9e 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1550,9 +1550,11 @@ static void aio_batch_free(struct hlist_head *batch_hash)
 	struct hlist_node *pos, *n;
 	int i;
 
+	/*
+	 * TODO: kill this
+	 */
 	for (i = 0; i < AIO_BATCH_HASH_SIZE; i++) {
 		hlist_for_each_entry_safe(abe, pos, n, &batch_hash[i], list) {
-			blk_run_address_space(abe->mapping);
 			iput(abe->mapping->host);
 			hlist_del(&abe->list);
 			mempool_free(abe, abe_pool);
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index b1d0c794747..06457ed8f3e 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -75,7 +75,6 @@ static const struct inode_operations befs_dir_inode_operations = {
 
 static const struct address_space_operations befs_aops = {
 	.readpage	= befs_readpage,
-	.sync_page	= block_sync_page,
 	.bmap		= befs_bmap,
 };
 
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index eb67edd0f8e..f20e8a71062 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -186,7 +186,6 @@ static sector_t bfs_bmap(struct address_space *mapping, sector_t block)
 const struct address_space_operations bfs_aops = {
 	.readpage	= bfs_readpage,
 	.writepage	= bfs_writepage,
-	.sync_page	= block_sync_page,
 	.write_begin	= bfs_write_begin,
 	.write_end	= generic_write_end,
 	.bmap		= bfs_bmap,
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 4fb8a343153..fffc2c67239 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1520,7 +1520,6 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)
 static const struct address_space_operations def_blk_aops = {
 	.readpage	= blkdev_readpage,
 	.writepage	= blkdev_writepage,
-	.sync_page	= block_sync_page,
 	.write_begin	= blkdev_write_begin,
 	.write_end	= blkdev_write_end,
 	.writepages	= generic_writepages,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e1aa8d607bc..ada1f6bd0a5 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -847,7 +847,6 @@ static const struct address_space_operations btree_aops = {
 	.writepages	= btree_writepages,
 	.releasepage	= btree_releasepage,
 	.invalidatepage = btree_invalidatepage,
-	.sync_page	= block_sync_page,
 #ifdef CONFIG_MIGRATION
 	.migratepage	= btree_migratepage,
 #endif
@@ -1330,82 +1329,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 	return ret;
 }
 
-/*
- * this unplugs every device on the box, and it is only used when page
- * is null
- */
-static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
-	struct btrfs_device *device;
-	struct btrfs_fs_info *info;
-
-	info = (struct btrfs_fs_info *)bdi->unplug_io_data;
-	list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
-		if (!device->bdev)
-			continue;
-
-		bdi = blk_get_backing_dev_info(device->bdev);
-		if (bdi->unplug_io_fn)
-			bdi->unplug_io_fn(bdi, page);
-	}
-}
-
-static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
-	struct inode *inode;
-	struct extent_map_tree *em_tree;
-	struct extent_map *em;
-	struct address_space *mapping;
-	u64 offset;
-
-	/* the generic O_DIRECT read code does this */
-	if (1 || !page) {
-		__unplug_io_fn(bdi, page);
-		return;
-	}
-
-	/*
-	 * page->mapping may change at any time.  Get a consistent copy
-	 * and use that for everything below
-	 */
-	smp_mb();
-	mapping = page->mapping;
-	if (!mapping)
-		return;
-
-	inode = mapping->host;
-
-	/*
-	 * don't do the expensive searching for a small number of
-	 * devices
-	 */
-	if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) {
-		__unplug_io_fn(bdi, page);
-		return;
-	}
-
-	offset = page_offset(page);
-
-	em_tree = &BTRFS_I(inode)->extent_tree;
-	read_lock(&em_tree->lock);
-	em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
-	read_unlock(&em_tree->lock);
-	if (!em) {
-		__unplug_io_fn(bdi, page);
-		return;
-	}
-
-	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
-		free_extent_map(em);
-		__unplug_io_fn(bdi, page);
-		return;
-	}
-	offset = offset - em->start;
-	btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
-			  em->block_start + offset, page);
-	free_extent_map(em);
-}
-
 /*
  * If this fails, caller must call bdi_destroy() to get rid of the
  * bdi again.
@@ -1420,8 +1343,6 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 		return err;
 
 	bdi->ra_pages	= default_backing_dev_info.ra_pages;
-	bdi->unplug_io_fn	= btrfs_unplug_io_fn;
-	bdi->unplug_io_data	= info;
 	bdi->congested_fn	= btrfs_congested_fn;
 	bdi->congested_data	= info;
 	return 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index fb9bd7832b6..462e08e724b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7218,7 +7218,6 @@ static const struct address_space_operations btrfs_aops = {
 	.writepage	= btrfs_writepage,
 	.writepages	= btrfs_writepages,
 	.readpages	= btrfs_readpages,
-	.sync_page	= block_sync_page,
 	.direct_IO	= btrfs_direct_IO,
 	.invalidatepage = btrfs_invalidatepage,
 	.releasepage	= btrfs_releasepage,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index af7dbca1527..6e0e82a1b18 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -162,7 +162,6 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
 	struct bio *cur;
 	int again = 0;
 	unsigned long num_run;
-	unsigned long num_sync_run;
 	unsigned long batch_run = 0;
 	unsigned long limit;
 	unsigned long last_waited = 0;
@@ -173,11 +172,6 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
 	limit = btrfs_async_submit_limit(fs_info);
 	limit = limit * 2 / 3;
 
-	/* we want to make sure that every time we switch from the sync
-	 * list to the normal list, we unplug
-	 */
-	num_sync_run = 0;
-
 loop:
 	spin_lock(&device->io_lock);
 
@@ -223,15 +217,6 @@ loop_lock:
 
 	spin_unlock(&device->io_lock);
 
-	/*
-	 * if we're doing the regular priority list, make sure we unplug
-	 * for any high prio bios we've sent down
-	 */
-	if (pending_bios == &device->pending_bios && num_sync_run > 0) {
-		num_sync_run = 0;
-		blk_run_backing_dev(bdi, NULL);
-	}
-
 	while (pending) {
 
 		rmb();
@@ -259,19 +244,11 @@ loop_lock:
 
 		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
 
-		if (cur->bi_rw & REQ_SYNC)
-			num_sync_run++;
-
 		submit_bio(cur->bi_rw, cur);
 		num_run++;
 		batch_run++;
-		if (need_resched()) {
-			if (num_sync_run) {
-				blk_run_backing_dev(bdi, NULL);
-				num_sync_run = 0;
-			}
+		if (need_resched())
 			cond_resched();
-		}
 
 		/*
 		 * we made progress, there is more work to do and the bdi
@@ -304,13 +281,8 @@ loop_lock:
 				 * against it before looping
 				 */
 				last_waited = ioc->last_waited;
-				if (need_resched()) {
-					if (num_sync_run) {
-						blk_run_backing_dev(bdi, NULL);
-						num_sync_run = 0;
-					}
+				if (need_resched())
 					cond_resched();
-				}
 				continue;
 			}
 			spin_lock(&device->io_lock);
@@ -323,22 +295,6 @@ loop_lock:
 		}
 	}
 
-	if (num_sync_run) {
-		num_sync_run = 0;
-		blk_run_backing_dev(bdi, NULL);
-	}
-	/*
-	 * IO has already been through a long path to get here.  Checksumming,
-	 * async helper threads, perhaps compression.  We've done a pretty
-	 * good job of collecting a batch of IO and should just unplug
-	 * the device right away.
-	 *
-	 * This will help anyone who is waiting on the IO, they might have
-	 * already unplugged, but managed to do so before the bio they
-	 * cared about found its way down here.
-	 */
-	blk_run_backing_dev(bdi, NULL);
-
 	cond_resched();
 	if (again)
 		goto loop;
@@ -2948,7 +2904,7 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,
 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 			     u64 logical, u64 *length,
 			     struct btrfs_multi_bio **multi_ret,
-			     int mirror_num, struct page *unplug_page)
+			     int mirror_num)
 {
 	struct extent_map *em;
 	struct map_lookup *map;
@@ -2980,11 +2936,6 @@ again:
 	em = lookup_extent_mapping(em_tree, logical, *length);
 	read_unlock(&em_tree->lock);
 
-	if (!em && unplug_page) {
-		kfree(multi);
-		return 0;
-	}
-
 	if (!em) {
 		printk(KERN_CRIT "unable to find logical %llu len %llu\n",
 		       (unsigned long long)logical,
@@ -3040,13 +2991,13 @@ again:
 		*length = em->len - offset;
 	}
 
-	if (!multi_ret && !unplug_page)
+	if (!multi_ret)
 		goto out;
 
 	num_stripes = 1;
 	stripe_index = 0;
 	if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
-		if (unplug_page || (rw & REQ_WRITE))
+		if (rw & REQ_WRITE)
 			num_stripes = map->num_stripes;
 		else if (mirror_num)
 			stripe_index = mirror_num - 1;
@@ -3068,7 +3019,7 @@ again:
 		stripe_index = do_div(stripe_nr, factor);
 		stripe_index *= map->sub_stripes;
 
-		if (unplug_page || (rw & REQ_WRITE))
+		if (rw & REQ_WRITE)
 			num_stripes = map->sub_stripes;
 		else if (mirror_num)
 			stripe_index += mirror_num - 1;
@@ -3088,22 +3039,10 @@ again:
 	BUG_ON(stripe_index >= map->num_stripes);
 
 	for (i = 0; i < num_stripes; i++) {
-		if (unplug_page) {
-			struct btrfs_device *device;
-			struct backing_dev_info *bdi;
-
-			device = map->stripes[stripe_index].dev;
-			if (device->bdev) {
-				bdi = blk_get_backing_dev_info(device->bdev);
-				if (bdi->unplug_io_fn)
-					bdi->unplug_io_fn(bdi, unplug_page);
-			}
-		} else {
-			multi->stripes[i].physical =
-				map->stripes[stripe_index].physical +
-				stripe_offset + stripe_nr * map->stripe_len;
-			multi->stripes[i].dev = map->stripes[stripe_index].dev;
-		}
+		multi->stripes[i].physical =
+			map->stripes[stripe_index].physical +
+			stripe_offset + stripe_nr * map->stripe_len;
+		multi->stripes[i].dev = map->stripes[stripe_index].dev;
 		stripe_index++;
 	}
 	if (multi_ret) {
@@ -3121,7 +3060,7 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 		      struct btrfs_multi_bio **multi_ret, int mirror_num)
 {
 	return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
-				 mirror_num, NULL);
+				 mirror_num);
 }
 
 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -3189,14 +3128,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 	return 0;
 }
 
-int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
-		      u64 logical, struct page *page)
-{
-	u64 length = PAGE_CACHE_SIZE;
-	return __btrfs_map_block(map_tree, READ, logical, &length,
-				 NULL, 0, page);
-}
-
 static void end_bio_multi_stripe(struct bio *bio, int err)
 {
 	struct btrfs_multi_bio *multi = bio->bi_private;
diff --git a/fs/buffer.c b/fs/buffer.c
index 2219a76e2ca..f903f2e5b4f 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -54,23 +54,15 @@ init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
 }
 EXPORT_SYMBOL(init_buffer);
 
-static int sync_buffer(void *word)
+static int sleep_on_buffer(void *word)
 {
-	struct block_device *bd;
-	struct buffer_head *bh
-		= container_of(word, struct buffer_head, b_state);
-
-	smp_mb();
-	bd = bh->b_bdev;
-	if (bd)
-		blk_run_address_space(bd->bd_inode->i_mapping);
 	io_schedule();
 	return 0;
 }
 
 void __lock_buffer(struct buffer_head *bh)
 {
-	wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
+	wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer,
 							TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(__lock_buffer);
@@ -90,7 +82,7 @@ EXPORT_SYMBOL(unlock_buffer);
  */
 void __wait_on_buffer(struct buffer_head * bh)
 {
-	wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
+	wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(__wait_on_buffer);
 
@@ -749,7 +741,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 {
 	struct buffer_head *bh;
 	struct list_head tmp;
-	struct address_space *mapping, *prev_mapping = NULL;
+	struct address_space *mapping;
 	int err = 0, err2;
 
 	INIT_LIST_HEAD(&tmp);
@@ -783,10 +775,6 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 				 * wait_on_buffer() will do that for us
 				 * through sync_buffer().
 				 */
-				if (prev_mapping && prev_mapping != mapping)
-					blk_run_address_space(prev_mapping);
-				prev_mapping = mapping;
-
 				brelse(bh);
 				spin_lock(lock);
 			}
@@ -3138,17 +3126,6 @@ out:
 }
 EXPORT_SYMBOL(try_to_free_buffers);
 
-void block_sync_page(struct page *page)
-{
-	struct address_space *mapping;
-
-	smp_mb();
-	mapping = page_mapping(page);
-	if (mapping)
-		blk_run_backing_dev(mapping->backing_dev_info, page);
-}
-EXPORT_SYMBOL(block_sync_page);
-
 /*
  * There are no bdflush tunables left.  But distributions are
  * still running obsolete flush daemons, so we terminate them here.
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e964b1cd5dd..c27d236738f 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1569,34 +1569,6 @@ int cifs_fsync(struct file *file, int datasync)
 	return rc;
 }
 
-/* static void cifs_sync_page(struct page *page)
-{
-	struct address_space *mapping;
-	struct inode *inode;
-	unsigned long index = page->index;
-	unsigned int rpages = 0;
-	int rc = 0;
-
-	cFYI(1, "sync page %p", page);
-	mapping = page->mapping;
-	if (!mapping)
-		return 0;
-	inode = mapping->host;
-	if (!inode)
-		return; */
-
-/*	fill in rpages then
-	result = cifs_pagein_inode(inode, index, rpages); */ /* BB finish */
-
-/*	cFYI(1, "rpages is %d for sync page of Index %ld", rpages, index);
-
-#if 0
-	if (rc < 0)
-		return rc;
-	return 0;
-#endif
-} */
-
 /*
  * As file closes, flush all cached write data for this inode checking
  * for write behind errors.
@@ -2510,7 +2482,6 @@ const struct address_space_operations cifs_addr_ops = {
 	.set_page_dirty = __set_page_dirty_nobuffers,
 	.releasepage = cifs_release_page,
 	.invalidatepage = cifs_invalidate_page,
-	/* .sync_page = cifs_sync_page, */
 	/* .direct_IO = */
 };
 
@@ -2528,6 +2499,5 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
 	.set_page_dirty = __set_page_dirty_nobuffers,
 	.releasepage = cifs_release_page,
 	.invalidatepage = cifs_invalidate_page,
-	/* .sync_page = cifs_sync_page, */
 	/* .direct_IO = */
 };
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b044705eedd..df709b3b860 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1110,11 +1110,8 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	    ((rw & READ) || (dio->result == dio->size)))
 		ret = -EIOCBQUEUED;
 
-	if (ret != -EIOCBQUEUED) {
-		/* All IO is now issued, send it on its way */
-		blk_run_address_space(inode->i_mapping);
+	if (ret != -EIOCBQUEUED)
 		dio_await_completion(dio);
-	}
 
 	/*
 	 * Sync will always be dropping the final ref and completing the
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index a8e7797b947..9c13412e6c9 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -23,7 +23,6 @@ static sector_t _efs_bmap(struct address_space *mapping, sector_t block)
 }
 static const struct address_space_operations efs_aops = {
 	.readpage = efs_readpage,
-	.sync_page = block_sync_page,
 	.bmap = _efs_bmap
 };
 
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index a7555238c41..82b94c8f5d2 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -795,7 +795,6 @@ const struct address_space_operations exofs_aops = {
 	.direct_IO	= NULL, /* TODO: Should be trivial to do */
 
 	/* With these NULL has special meaning or default is not exported */
-	.sync_page	= NULL,
 	.get_xip_mem	= NULL,
 	.migratepage	= NULL,
 	.launder_page	= NULL,
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 40ad210a504..c47f706878b 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -860,7 +860,6 @@ const struct address_space_operations ext2_aops = {
 	.readpage		= ext2_readpage,
 	.readpages		= ext2_readpages,
 	.writepage		= ext2_writepage,
-	.sync_page		= block_sync_page,
 	.write_begin		= ext2_write_begin,
 	.write_end		= ext2_write_end,
 	.bmap			= ext2_bmap,
@@ -880,7 +879,6 @@ const struct address_space_operations ext2_nobh_aops = {
 	.readpage		= ext2_readpage,
 	.readpages		= ext2_readpages,
 	.writepage		= ext2_nobh_writepage,
-	.sync_page		= block_sync_page,
 	.write_begin		= ext2_nobh_write_begin,
 	.write_end		= nobh_write_end,
 	.bmap			= ext2_bmap,
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index ae94f6d949f..fe2541d250e 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1894,7 +1894,6 @@ static const struct address_space_operations ext3_ordered_aops = {
 	.readpage		= ext3_readpage,
 	.readpages		= ext3_readpages,
 	.writepage		= ext3_ordered_writepage,
-	.sync_page		= block_sync_page,
 	.write_begin		= ext3_write_begin,
 	.write_end		= ext3_ordered_write_end,
 	.bmap			= ext3_bmap,
@@ -1910,7 +1909,6 @@ static const struct address_space_operations ext3_writeback_aops = {
 	.readpage		= ext3_readpage,
 	.readpages		= ext3_readpages,
 	.writepage		= ext3_writeback_writepage,
-	.sync_page		= block_sync_page,
 	.write_begin		= ext3_write_begin,
 	.write_end		= ext3_writeback_write_end,
 	.bmap			= ext3_bmap,
@@ -1926,7 +1924,6 @@ static const struct address_space_operations ext3_journalled_aops = {
 	.readpage		= ext3_readpage,
 	.readpages		= ext3_readpages,
 	.writepage		= ext3_journalled_writepage,
-	.sync_page		= block_sync_page,
 	.write_begin		= ext3_write_begin,
 	.write_end		= ext3_journalled_write_end,
 	.set_page_dirty		= ext3_journalled_set_page_dirty,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9f7f9e49914..9297ad46c46 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3903,7 +3903,6 @@ static const struct address_space_operations ext4_ordered_aops = {
 	.readpage		= ext4_readpage,
 	.readpages		= ext4_readpages,
 	.writepage		= ext4_writepage,
-	.sync_page		= block_sync_page,
 	.write_begin		= ext4_write_begin,
 	.write_end		= ext4_ordered_write_end,
 	.bmap			= ext4_bmap,
@@ -3919,7 +3918,6 @@ static const struct address_space_operations ext4_writeback_aops = {
 	.readpage		= ext4_readpage,
 	.readpages		= ext4_readpages,
 	.writepage		= ext4_writepage,
-	.sync_page		= block_sync_page,
 	.write_begin		= ext4_write_begin,
 	.write_end		= ext4_writeback_write_end,
 	.bmap			= ext4_bmap,
@@ -3935,7 +3933,6 @@ static const struct address_space_operations ext4_journalled_aops = {
 	.readpage		= ext4_readpage,
 	.readpages		= ext4_readpages,
 	.writepage		= ext4_writepage,
-	.sync_page		= block_sync_page,
 	.write_begin		= ext4_write_begin,
 	.write_end		= ext4_journalled_write_end,
 	.set_page_dirty		= ext4_journalled_set_page_dirty,
@@ -3951,7 +3948,6 @@ static const struct address_space_operations ext4_da_aops = {
 	.readpages		= ext4_readpages,
 	.writepage		= ext4_writepage,
 	.writepages		= ext4_da_writepages,
-	.sync_page		= block_sync_page,
 	.write_begin		= ext4_da_write_begin,
 	.write_end		= ext4_da_write_end,
 	.bmap			= ext4_bmap,
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 86753fe10bd..f4ff09fb79b 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -236,7 +236,6 @@ static const struct address_space_operations fat_aops = {
 	.readpages	= fat_readpages,
 	.writepage	= fat_writepage,
 	.writepages	= fat_writepages,
-	.sync_page	= block_sync_page,
 	.write_begin	= fat_write_begin,
 	.write_end	= fat_write_end,
 	.direct_IO	= fat_direct_IO,
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index 1429f3ae1e8..5d318c44f85 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -44,7 +44,6 @@ static sector_t		vxfs_bmap(struct address_space *, sector_t);
 const struct address_space_operations vxfs_aops = {
 	.readpage =		vxfs_readpage,
 	.bmap =			vxfs_bmap,
-	.sync_page =		block_sync_page,
 };
 
 inline void
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 9e3f68cc1bd..09e8d51eeb6 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -868,7 +868,6 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
 
 	fc->bdi.name = "fuse";
 	fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
-	fc->bdi.unplug_io_fn = default_unplug_io_fn;
 	/* fuse does it's own writeback accounting */
 	fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
 
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 4f36f8832b9..2f87ad27efd 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -1116,7 +1116,6 @@ static const struct address_space_operations gfs2_writeback_aops = {
 	.writepages = gfs2_writeback_writepages,
 	.readpage = gfs2_readpage,
 	.readpages = gfs2_readpages,
-	.sync_page = block_sync_page,
 	.write_begin = gfs2_write_begin,
 	.write_end = gfs2_write_end,
 	.bmap = gfs2_bmap,
@@ -1132,7 +1131,6 @@ static const struct address_space_operations gfs2_ordered_aops = {
 	.writepage = gfs2_ordered_writepage,
 	.readpage = gfs2_readpage,
 	.readpages = gfs2_readpages,
-	.sync_page = block_sync_page,
 	.write_begin = gfs2_write_begin,
 	.write_end = gfs2_write_end,
 	.set_page_dirty = gfs2_set_page_dirty,
@@ -1150,7 +1148,6 @@ static const struct address_space_operations gfs2_jdata_aops = {
 	.writepages = gfs2_jdata_writepages,
 	.readpage = gfs2_readpage,
 	.readpages = gfs2_readpages,
-	.sync_page = block_sync_page,
 	.write_begin = gfs2_write_begin,
 	.write_end = gfs2_write_end,
 	.set_page_dirty = gfs2_set_page_dirty,
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 939739c7b3f..a566331db4e 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -94,7 +94,6 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
 const struct address_space_operations gfs2_meta_aops = {
 	.writepage = gfs2_aspace_writepage,
 	.releasepage = gfs2_releasepage,
-	.sync_page = block_sync_page,
 };
 
 /**
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index dffb4e99664..fff16c968e6 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -150,7 +150,6 @@ static int hfs_writepages(struct address_space *mapping,
 const struct address_space_operations hfs_btree_aops = {
 	.readpage	= hfs_readpage,
 	.writepage	= hfs_writepage,
-	.sync_page	= block_sync_page,
 	.write_begin	= hfs_write_begin,
 	.write_end	= generic_write_end,
 	.bmap		= hfs_bmap,
@@ -160,7 +159,6 @@ const struct address_space_operations hfs_btree_aops = {
 const struct address_space_operations hfs_aops = {
 	.readpage	= hfs_readpage,
 	.writepage	= hfs_writepage,
-	.sync_page	= block_sync_page,
 	.write_begin	= hfs_write_begin,
 	.write_end	= generic_write_end,
 	.bmap		= hfs_bmap,
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index a8df651747f..b248a6cfcad 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -146,7 +146,6 @@ static int hfsplus_writepages(struct address_space *mapping,
 const struct address_space_operations hfsplus_btree_aops = {
 	.readpage	= hfsplus_readpage,
 	.writepage	= hfsplus_writepage,
-	.sync_page	= block_sync_page,
 	.write_begin	= hfsplus_write_begin,
 	.write_end	= generic_write_end,
 	.bmap		= hfsplus_bmap,
@@ -156,7 +155,6 @@ const struct address_space_operations hfsplus_btree_aops = {
 const struct address_space_operations hfsplus_aops = {
 	.readpage	= hfsplus_readpage,
 	.writepage	= hfsplus_writepage,
-	.sync_page	= block_sync_page,
 	.write_begin	= hfsplus_write_begin,
 	.write_end	= generic_write_end,
 	.bmap		= hfsplus_bmap,
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index c0340887c7e..9e84257b3ad 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -120,7 +120,6 @@ static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block)
 const struct address_space_operations hpfs_aops = {
 	.readpage = hpfs_readpage,
 	.writepage = hpfs_writepage,
-	.sync_page = block_sync_page,
 	.write_begin = hpfs_write_begin,
 	.write_end = generic_write_end,
 	.bmap = _hpfs_bmap
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index a0f3833c0db..3db5ba4568f 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1158,7 +1158,6 @@ static sector_t _isofs_bmap(struct address_space *mapping, sector_t block)
 
 static const struct address_space_operations isofs_aops = {
 	.readpage = isofs_readpage,
-	.sync_page = block_sync_page,
 	.bmap = _isofs_bmap
 };
 
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 9978803ceed..eddbb373209 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -352,7 +352,6 @@ const struct address_space_operations jfs_aops = {
 	.readpages	= jfs_readpages,
 	.writepage	= jfs_writepage,
 	.writepages	= jfs_writepages,
-	.sync_page	= block_sync_page,
 	.write_begin	= jfs_write_begin,
 	.write_end	= nobh_write_end,
 	.bmap		= jfs_bmap,
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 48b44bd8267..6740d34cd82 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -583,7 +583,6 @@ static void metapage_invalidatepage(struct page *page, unsigned long offset)
 const struct address_space_operations jfs_metapage_aops = {
 	.readpage	= metapage_readpage,
 	.writepage	= metapage_writepage,
-	.sync_page	= block_sync_page,
 	.releasepage	= metapage_releasepage,
 	.invalidatepage	= metapage_invalidatepage,
 	.set_page_dirty	= __set_page_dirty_nobuffers,
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 723bc5bca09..1adc8d455f0 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -39,7 +39,6 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw)
 	bio.bi_end_io = request_complete;
 
 	submit_bio(rw, &bio);
-	generic_unplug_device(bdev_get_queue(bdev));
 	wait_for_completion(&complete);
 	return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO;
 }
@@ -168,7 +167,6 @@ static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len)
 	}
 	len = PAGE_ALIGN(len);
 	__bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
-	generic_unplug_device(bdev_get_queue(logfs_super(sb)->s_bdev));
 }
 
 
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index ae0b83f476a..adcdc0a4e18 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -399,7 +399,6 @@ static sector_t minix_bmap(struct address_space *mapping, sector_t block)
 static const struct address_space_operations minix_aops = {
 	.readpage = minix_readpage,
 	.writepage = minix_writepage,
-	.sync_page = block_sync_page,
 	.write_begin = minix_write_begin,
 	.write_end = generic_write_end,
 	.bmap = minix_bmap
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 388e9e8f528..f4f1c08807e 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -40,14 +40,10 @@ void nilfs_btnode_cache_init_once(struct address_space *btnc)
 	nilfs_mapping_init_once(btnc);
 }
 
-static const struct address_space_operations def_btnode_aops = {
-	.sync_page		= block_sync_page,
-};
-
 void nilfs_btnode_cache_init(struct address_space *btnc,
 			     struct backing_dev_info *bdi)
 {
-	nilfs_mapping_init(btnc, bdi, &def_btnode_aops);
+	nilfs_mapping_init(btnc, bdi);
 }
 
 void nilfs_btnode_cache_clear(struct address_space *btnc)
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index caf9a6a3fb5..1c2a3e23f8b 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -49,7 +49,6 @@
 #include "ifile.h"
 
 static const struct address_space_operations def_gcinode_aops = {
-	.sync_page		= block_sync_page,
 };
 
 /*
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 2fd440d8d6b..c89d5d1ea7c 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -262,7 +262,6 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 const struct address_space_operations nilfs_aops = {
 	.writepage		= nilfs_writepage,
 	.readpage		= nilfs_readpage,
-	.sync_page		= block_sync_page,
 	.writepages		= nilfs_writepages,
 	.set_page_dirty		= nilfs_set_page_dirty,
 	.readpages		= nilfs_readpages,
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 6a0e2a189f6..3fdb61d79c9 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -399,7 +399,6 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
 
 static const struct address_space_operations def_mdt_aops = {
 	.writepage		= nilfs_mdt_write_page,
-	.sync_page		= block_sync_page,
 };
 
 static const struct inode_operations def_mdt_iops;
@@ -438,10 +437,6 @@ void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
 	mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size);
 }
 
-static const struct address_space_operations shadow_map_aops = {
-	.sync_page		= block_sync_page,
-};
-
 /**
  * nilfs_mdt_setup_shadow_map - setup shadow map and bind it to metadata file
  * @inode: inode of the metadata file
@@ -455,9 +450,9 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode,
 
 	INIT_LIST_HEAD(&shadow->frozen_buffers);
 	nilfs_mapping_init_once(&shadow->frozen_data);
-	nilfs_mapping_init(&shadow->frozen_data, bdi, &shadow_map_aops);
+	nilfs_mapping_init(&shadow->frozen_data, bdi);
 	nilfs_mapping_init_once(&shadow->frozen_btnodes);
-	nilfs_mapping_init(&shadow->frozen_btnodes, bdi, &shadow_map_aops);
+	nilfs_mapping_init(&shadow->frozen_btnodes, bdi);
 	mi->mi_shadow = shadow;
 	return 0;
 }
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 0c432416cfe..3da37cc5de3 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -506,15 +506,14 @@ void nilfs_mapping_init_once(struct address_space *mapping)
 }
 
 void nilfs_mapping_init(struct address_space *mapping,
-			struct backing_dev_info *bdi,
-			const struct address_space_operations *aops)
+			struct backing_dev_info *bdi)
 {
 	mapping->host = NULL;
 	mapping->flags = 0;
 	mapping_set_gfp_mask(mapping, GFP_NOFS);
 	mapping->assoc_mapping = NULL;
 	mapping->backing_dev_info = bdi;
-	mapping->a_ops = aops;
+	mapping->a_ops = NULL;
 }
 
 /*
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index 622df27cd89..ba4d6fd40b0 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -63,8 +63,7 @@ void nilfs_copy_back_pages(struct address_space *, struct address_space *);
 void nilfs_clear_dirty_pages(struct address_space *);
 void nilfs_mapping_init_once(struct address_space *mapping);
 void nilfs_mapping_init(struct address_space *mapping,
-			struct backing_dev_info *bdi,
-			const struct address_space_operations *aops);
+			struct backing_dev_info *bdi);
 unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
 unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
 					    sector_t start_blk,
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index c3c2c7ac902..0b1e885b8cf 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1543,8 +1543,6 @@ err_out:
  */
 const struct address_space_operations ntfs_aops = {
 	.readpage	= ntfs_readpage,	/* Fill page with data. */
-	.sync_page	= block_sync_page,	/* Currently, just unplugs the
-						   disk request queue. */
 #ifdef NTFS_RW
 	.writepage	= ntfs_writepage,	/* Write dirty page to disk. */
 #endif /* NTFS_RW */
@@ -1560,8 +1558,6 @@ const struct address_space_operations ntfs_aops = {
  */
 const struct address_space_operations ntfs_mst_aops = {
 	.readpage	= ntfs_readpage,	/* Fill page with data. */
-	.sync_page	= block_sync_page,	/* Currently, just unplugs the
-						   disk request queue. */
 #ifdef NTFS_RW
 	.writepage	= ntfs_writepage,	/* Write dirty page to disk. */
 	.set_page_dirty	= __set_page_dirty_nobuffers,	/* Set the page dirty
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index 6551c7cbad9..ef9ed854255 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -698,8 +698,7 @@ lock_retry_remap:
 					"uptodate! Unplugging the disk queue "
 					"and rescheduling.");
 			get_bh(tbh);
-			blk_run_address_space(mapping);
-			schedule();
+			io_schedule();
 			put_bh(tbh);
 			if (unlikely(!buffer_uptodate(tbh)))
 				goto read_err;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 1fbb0e20131..daea0359e97 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2043,7 +2043,6 @@ const struct address_space_operations ocfs2_aops = {
 	.write_begin		= ocfs2_write_begin,
 	.write_end		= ocfs2_write_end,
 	.bmap			= ocfs2_bmap,
-	.sync_page		= block_sync_page,
 	.direct_IO		= ocfs2_direct_IO,
 	.invalidatepage		= ocfs2_invalidatepage,
 	.releasepage		= ocfs2_releasepage,
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index b108e863d8f..1adab287bd2 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -367,11 +367,7 @@ static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc,
 static void o2hb_wait_on_io(struct o2hb_region *reg,
 			    struct o2hb_bio_wait_ctxt *wc)
 {
-	struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping;
-
-	blk_run_address_space(mapping);
 	o2hb_bio_wait_dec(wc, 1);
-
 	wait_for_completion(&wc->wc_io_complete);
 }
 
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 8a6d34fa668..d738a7e493d 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -372,7 +372,6 @@ const struct address_space_operations omfs_aops = {
 	.readpages = omfs_readpages,
 	.writepage = omfs_writepage,
 	.writepages = omfs_writepages,
-	.sync_page = block_sync_page,
 	.write_begin = omfs_write_begin,
 	.write_end = generic_write_end,
 	.bmap = omfs_bmap,
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index e63b4171d58..2b0646613f5 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -335,7 +335,6 @@ static sector_t qnx4_bmap(struct address_space *mapping, sector_t block)
 static const struct address_space_operations qnx4_aops = {
 	.readpage	= qnx4_readpage,
 	.writepage	= qnx4_writepage,
-	.sync_page	= block_sync_page,
 	.write_begin	= qnx4_write_begin,
 	.write_end	= generic_write_end,
 	.bmap		= qnx4_bmap
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 0bae036831e..03674675f88 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -3212,7 +3212,6 @@ const struct address_space_operations reiserfs_address_space_operations = {
 	.readpages = reiserfs_readpages,
 	.releasepage = reiserfs_releasepage,
 	.invalidatepage = reiserfs_invalidatepage,
-	.sync_page = block_sync_page,
 	.write_begin = reiserfs_write_begin,
 	.write_end = reiserfs_write_end,
 	.bmap = reiserfs_aop_bmap,
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 9ca66276315..fa8d43c92bb 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -488,7 +488,6 @@ static sector_t sysv_bmap(struct address_space *mapping, sector_t block)
 const struct address_space_operations sysv_aops = {
 	.readpage = sysv_readpage,
 	.writepage = sysv_writepage,
-	.sync_page = block_sync_page,
 	.write_begin = sysv_write_begin,
 	.write_end = generic_write_end,
 	.bmap = sysv_bmap
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 6e11c2975dc..81368d4d4a2 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1979,7 +1979,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
 	 */
 	c->bdi.name = "ubifs",
 	c->bdi.capabilities = BDI_CAP_MAP_COPY;
-	c->bdi.unplug_io_fn = default_unplug_io_fn;
 	err  = bdi_init(&c->bdi);
 	if (err)
 		goto out_close;
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 89c78486cbb..94e4553491c 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -98,7 +98,6 @@ static int udf_adinicb_write_end(struct file *file,
 const struct address_space_operations udf_adinicb_aops = {
 	.readpage	= udf_adinicb_readpage,
 	.writepage	= udf_adinicb_writepage,
-	.sync_page	= block_sync_page,
 	.write_begin = simple_write_begin,
 	.write_end = udf_adinicb_write_end,
 };
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index c6a2e782b97..fa96fc0fe12 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -133,7 +133,6 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block)
 const struct address_space_operations udf_aops = {
 	.readpage	= udf_readpage,
 	.writepage	= udf_writepage,
-	.sync_page	= block_sync_page,
 	.write_begin		= udf_write_begin,
 	.write_end		= generic_write_end,
 	.bmap		= udf_bmap,
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 2b251f2093a..83b28444eb1 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -588,7 +588,6 @@ static sector_t ufs_bmap(struct address_space *mapping, sector_t block)
 const struct address_space_operations ufs_aops = {
 	.readpage = ufs_readpage,
 	.writepage = ufs_writepage,
-	.sync_page = block_sync_page,
 	.write_begin = ufs_write_begin,
 	.write_end = generic_write_end,
 	.bmap = ufs_bmap
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index a58f9155fc9..ff0e79276f2 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -481,7 +481,7 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size)
 			break;
 		if (IS_SYNC(inode) && (inode->i_state & I_DIRTY))
 			ufs_sync_inode (inode);
-		blk_run_address_space(inode->i_mapping);
+		blk_flush_plug(current);
 		yield();
 	}
 
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index ec7bbb5645b..83c1c20d145 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1495,7 +1495,6 @@ const struct address_space_operations xfs_address_space_operations = {
 	.readpages		= xfs_vm_readpages,
 	.writepage		= xfs_vm_writepage,
 	.writepages		= xfs_vm_writepages,
-	.sync_page		= block_sync_page,
 	.releasepage		= xfs_vm_releasepage,
 	.invalidatepage		= xfs_vm_invalidatepage,
 	.write_begin		= xfs_vm_write_begin,
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index ac1c7e8378d..4f8f53c4d42 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -991,7 +991,7 @@ xfs_buf_lock(
 	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
 		xfs_log_force(bp->b_target->bt_mount, 0);
 	if (atomic_read(&bp->b_io_remaining))
-		blk_run_address_space(bp->b_target->bt_mapping);
+		blk_flush_plug(current);
 	down(&bp->b_sema);
 	XB_SET_OWNER(bp);
 
@@ -1035,9 +1035,7 @@ xfs_buf_wait_unpin(
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		if (atomic_read(&bp->b_pin_count) == 0)
 			break;
-		if (atomic_read(&bp->b_io_remaining))
-			blk_run_address_space(bp->b_target->bt_mapping);
-		schedule();
+		io_schedule();
 	}
 	remove_wait_queue(&bp->b_waiters, &wait);
 	set_current_state(TASK_RUNNING);
@@ -1443,7 +1441,7 @@ xfs_buf_iowait(
 	trace_xfs_buf_iowait(bp, _RET_IP_);
 
 	if (atomic_read(&bp->b_io_remaining))
-		blk_run_address_space(bp->b_target->bt_mapping);
+		blk_flush_plug(current);
 	wait_for_completion(&bp->b_iowait);
 
 	trace_xfs_buf_iowait_done(bp, _RET_IP_);
@@ -1667,7 +1665,6 @@ xfs_mapping_buftarg(
 	struct inode		*inode;
 	struct address_space	*mapping;
 	static const struct address_space_operations mapping_aops = {
-		.sync_page = block_sync_page,
 		.migratepage = fail_migrate_page,
 	};
 
@@ -1948,7 +1945,7 @@ xfsbufd(
 			count++;
 		}
 		if (count)
-			blk_run_address_space(target->bt_mapping);
+			blk_flush_plug(current);
 
 	} while (!kthread_should_stop());
 
@@ -1996,7 +1993,7 @@ xfs_flush_buftarg(
 
 	if (wait) {
 		/* Expedite and wait for IO to complete. */
-		blk_run_address_space(target->bt_mapping);
+		blk_flush_plug(current);
 		while (!list_empty(&wait_list)) {
 			bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
 
-- 
cgit v1.2.3


From 2ed1a6bcf97a7ed787e6799bf2e80b9e6f51dca7 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Tue, 22 Jun 2010 12:52:14 +0200
Subject: fs: make mpage read/write_pages() plug

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/mpage.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/mpage.c b/fs/mpage.c
index d78455a81ec..0afc809e46e 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -364,6 +364,9 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
 	sector_t last_block_in_bio = 0;
 	struct buffer_head map_bh;
 	unsigned long first_logical_block = 0;
+	struct blk_plug plug;
+
+	blk_start_plug(&plug);
 
 	map_bh.b_state = 0;
 	map_bh.b_size = 0;
@@ -385,6 +388,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
 	BUG_ON(!list_empty(pages));
 	if (bio)
 		mpage_bio_submit(READ, bio);
+	blk_finish_plug(&plug);
 	return 0;
 }
 EXPORT_SYMBOL(mpage_readpages);
@@ -666,8 +670,11 @@ int
 mpage_writepages(struct address_space *mapping,
 		struct writeback_control *wbc, get_block_t get_block)
 {
+	struct blk_plug plug;
 	int ret;
 
+	blk_start_plug(&plug);
+
 	if (!get_block)
 		ret = generic_writepages(mapping, wbc);
 	else {
@@ -682,6 +689,7 @@ mpage_writepages(struct address_space *mapping,
 		if (mpd.bio)
 			mpage_bio_submit(WRITE, mpd.bio);
 	}
+	blk_finish_plug(&plug);
 	return ret;
 }
 EXPORT_SYMBOL(mpage_writepages);
-- 
cgit v1.2.3


From 9f5b9425468c85a901d863d241ba5c5dff9b23b8 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Thu, 1 Jul 2010 07:55:01 +0200
Subject: fs: make aio plug

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/aio.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index c5ea494ea9e..1476bed1c5f 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1660,6 +1660,7 @@ long do_io_submit(aio_context_t ctx_id, long nr,
 	long ret = 0;
 	int i;
 	struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, };
+	struct blk_plug plug;
 
 	if (unlikely(nr < 0))
 		return -EINVAL;
@@ -1676,6 +1677,8 @@ long do_io_submit(aio_context_t ctx_id, long nr,
 		return -EINVAL;
 	}
 
+	blk_start_plug(&plug);
+
 	/*
 	 * AKPM: should this return a partial result if some of the IOs were
 	 * successfully submitted?
@@ -1698,6 +1701,7 @@ long do_io_submit(aio_context_t ctx_id, long nr,
 		if (ret)
 			break;
 	}
+	blk_finish_plug(&plug);
 	aio_batch_free(batch_hash);
 
 	put_ioctx(ctx);
-- 
cgit v1.2.3


From cf15900e1209d5b46ec2d24643adbf561830935f Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Wed, 2 Mar 2011 20:12:18 -0500
Subject: aio: remove request submission batching

This should be useless now that we have on-stack plugging. So let's just
kill it.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/aio.c | 75 +++-------------------------------------------------------------
 1 file changed, 3 insertions(+), 72 deletions(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index 1476bed1c5f..020de5cb4a6 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -34,8 +34,6 @@
 #include <linux/security.h>
 #include <linux/eventfd.h>
 #include <linux/blkdev.h>
-#include <linux/mempool.h>
-#include <linux/hash.h>
 #include <linux/compat.h>
 
 #include <asm/kmap_types.h>
@@ -65,14 +63,6 @@ static DECLARE_WORK(fput_work, aio_fput_routine);
 static DEFINE_SPINLOCK(fput_lock);
 static LIST_HEAD(fput_head);
 
-#define AIO_BATCH_HASH_BITS	3 /* allocated on-stack, so don't go crazy */
-#define AIO_BATCH_HASH_SIZE	(1 << AIO_BATCH_HASH_BITS)
-struct aio_batch_entry {
-	struct hlist_node list;
-	struct address_space *mapping;
-};
-mempool_t *abe_pool;
-
 static void aio_kick_handler(struct work_struct *);
 static void aio_queue_work(struct kioctx *);
 
@@ -86,8 +76,7 @@ static int __init aio_setup(void)
 	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 
 	aio_wq = create_workqueue("aio");
-	abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry));
-	BUG_ON(!aio_wq || !abe_pool);
+	BUG_ON(!aio_wq);
 
 	pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
 
@@ -1512,59 +1501,8 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
 	return 0;
 }
 
-static void aio_batch_add(struct address_space *mapping,
-			  struct hlist_head *batch_hash)
-{
-	struct aio_batch_entry *abe;
-	struct hlist_node *pos;
-	unsigned bucket;
-
-	bucket = hash_ptr(mapping, AIO_BATCH_HASH_BITS);
-	hlist_for_each_entry(abe, pos, &batch_hash[bucket], list) {
-		if (abe->mapping == mapping)
-			return;
-	}
-
-	abe = mempool_alloc(abe_pool, GFP_KERNEL);
-
-	/*
-	 * we should be using igrab here, but
-	 * we don't want to hammer on the global
-	 * inode spinlock just to take an extra
-	 * reference on a file that we must already
-	 * have a reference to.
-	 *
-	 * When we're called, we always have a reference
-	 * on the file, so we must always have a reference
-	 * on the inode, so ihold() is safe here.
-	 */
-	ihold(mapping->host);
-	abe->mapping = mapping;
-	hlist_add_head(&abe->list, &batch_hash[bucket]);
-	return;
-}
-
-static void aio_batch_free(struct hlist_head *batch_hash)
-{
-	struct aio_batch_entry *abe;
-	struct hlist_node *pos, *n;
-	int i;
-
-	/*
-	 * TODO: kill this
-	 */
-	for (i = 0; i < AIO_BATCH_HASH_SIZE; i++) {
-		hlist_for_each_entry_safe(abe, pos, n, &batch_hash[i], list) {
-			iput(abe->mapping->host);
-			hlist_del(&abe->list);
-			mempool_free(abe, abe_pool);
-		}
-	}
-}
-
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
-			 struct iocb *iocb, struct hlist_head *batch_hash,
-			 bool compat)
+			 struct iocb *iocb, bool compat)
 {
 	struct kiocb *req;
 	struct file *file;
@@ -1638,11 +1576,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 			;
 	}
 	spin_unlock_irq(&ctx->ctx_lock);
-	if (req->ki_opcode == IOCB_CMD_PREAD ||
-	    req->ki_opcode == IOCB_CMD_PREADV ||
-	    req->ki_opcode == IOCB_CMD_PWRITE ||
-	    req->ki_opcode == IOCB_CMD_PWRITEV)
-		aio_batch_add(file->f_mapping, batch_hash);
 
 	aio_put_req(req);	/* drop extra ref to req */
 	return 0;
@@ -1659,7 +1592,6 @@ long do_io_submit(aio_context_t ctx_id, long nr,
 	struct kioctx *ctx;
 	long ret = 0;
 	int i;
-	struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, };
 	struct blk_plug plug;
 
 	if (unlikely(nr < 0))
@@ -1697,12 +1629,11 @@ long do_io_submit(aio_context_t ctx_id, long nr,
 			break;
 		}
 
-		ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash, compat);
+		ret = io_submit_one(ctx, user_iocb, &tmp, compat);
 		if (ret)
 			break;
 	}
 	blk_finish_plug(&plug);
-	aio_batch_free(batch_hash);
 
 	put_ioctx(ctx);
 	return i ? i : ret;
-- 
cgit v1.2.3


From 721a9602e6607417c6bc15b18e97a2f35266c690 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Wed, 9 Mar 2011 11:56:30 +0100
Subject: block: kill off REQ_UNPLUG

With the plugging now being explicitly controlled by the
submitter, callers need not pass down unplugging hints
to the block layer. If they want to unplug, it's because they
manually plugged on their own - in which case, they should just
unplug at will.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/btrfs/extent_io.c        |  2 +-
 fs/buffer.c                 | 14 ++++----------
 fs/direct-io.c              |  2 +-
 fs/ext4/page-io.c           |  3 +--
 fs/gfs2/log.c               |  4 ++--
 fs/gfs2/lops.c              | 12 ++++++------
 fs/gfs2/meta_io.c           |  2 +-
 fs/jbd/commit.c             |  2 +-
 fs/jbd2/commit.c            |  6 +++---
 fs/nilfs2/segbuf.c          |  2 +-
 fs/xfs/linux-2.6/xfs_aops.c |  3 +--
 11 files changed, 22 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 92ac5192c51..b76f7cd4740 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2182,7 +2182,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	unsigned long nr_written = 0;
 
 	if (wbc->sync_mode == WB_SYNC_ALL)
-		write_flags = WRITE_SYNC_PLUG;
+		write_flags = WRITE_SYNC;
 	else
 		write_flags = WRITE;
 
diff --git a/fs/buffer.c b/fs/buffer.c
index f903f2e5b4f..42534f67d71 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -767,7 +767,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 				 * still in flight on potentially older
 				 * contents.
 				 */
-				write_dirty_buffer(bh, WRITE_SYNC_PLUG);
+				write_dirty_buffer(bh, WRITE_SYNC);
 
 				/*
 				 * Kick off IO for the previous mapping. Note
@@ -1602,14 +1602,8 @@ EXPORT_SYMBOL(unmap_underlying_metadata);
  * prevents this contention from occurring.
  *
  * If block_write_full_page() is called with wbc->sync_mode ==
- * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this
- * causes the writes to be flagged as synchronous writes, but the
- * block device queue will NOT be unplugged, since usually many pages
- * will be pushed to the out before the higher-level caller actually
- * waits for the writes to be completed.  The various wait functions,
- * such as wait_on_writeback_range() will ultimately call sync_page()
- * which will ultimately call blk_run_backing_dev(), which will end up
- * unplugging the device queue.
+ * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
+ * causes the writes to be flagged as synchronous writes.
  */
 static int __block_write_full_page(struct inode *inode, struct page *page,
 			get_block_t *get_block, struct writeback_control *wbc,
@@ -1622,7 +1616,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	const unsigned blocksize = 1 << inode->i_blkbits;
 	int nr_underway = 0;
 	int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
-			WRITE_SYNC_PLUG : WRITE);
+			WRITE_SYNC : WRITE);
 
 	BUG_ON(!PageLocked(page));
 
diff --git a/fs/direct-io.c b/fs/direct-io.c
index df709b3b860..42608313609 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1173,7 +1173,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	struct dio *dio;
 
 	if (rw & WRITE)
-		rw = WRITE_ODIRECT_PLUG;
+		rw = WRITE_ODIRECT;
 
 	if (bdev)
 		bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 955cc309142..e2cd90e4bb7 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -310,8 +310,7 @@ static int io_submit_init(struct ext4_io_submit *io,
 	io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
 
 	io->io_bio = bio;
-	io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?
-			WRITE_SYNC_PLUG : WRITE);
+	io->io_op = (wbc->sync_mode == WB_SYNC_ALL ?  WRITE_SYNC : WRITE);
 	io->io_next_block = bh->b_blocknr;
 	return 0;
 }
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index eb01f3575e1..7f1c1120234 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -121,7 +121,7 @@ __acquires(&sdp->sd_log_lock)
 			lock_buffer(bh);
 			if (test_clear_buffer_dirty(bh)) {
 				bh->b_end_io = end_buffer_write_sync;
-				submit_bh(WRITE_SYNC_PLUG, bh);
+				submit_bh(WRITE_SYNC, bh);
 			} else {
 				unlock_buffer(bh);
 				brelse(bh);
@@ -647,7 +647,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp)
 		lock_buffer(bh);
 		if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) {
 			bh->b_end_io = end_buffer_write_sync;
-			submit_bh(WRITE_SYNC_PLUG, bh);
+			submit_bh(WRITE_SYNC, bh);
 		} else {
 			unlock_buffer(bh);
 			brelse(bh);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index bf33f822058..48b545a1979 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -200,7 +200,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
 		}
 
 		gfs2_log_unlock(sdp);
-		submit_bh(WRITE_SYNC_PLUG, bh);
+		submit_bh(WRITE_SYNC, bh);
 		gfs2_log_lock(sdp);
 
 		n = 0;
@@ -210,7 +210,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
 			gfs2_log_unlock(sdp);
 			lock_buffer(bd2->bd_bh);
 			bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
-			submit_bh(WRITE_SYNC_PLUG, bh);
+			submit_bh(WRITE_SYNC, bh);
 			gfs2_log_lock(sdp);
 			if (++n >= num)
 				break;
@@ -352,7 +352,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
 		sdp->sd_log_num_revoke--;
 
 		if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) {
-			submit_bh(WRITE_SYNC_PLUG, bh);
+			submit_bh(WRITE_SYNC, bh);
 
 			bh = gfs2_log_get_buf(sdp);
 			mh = (struct gfs2_meta_header *)bh->b_data;
@@ -369,7 +369,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
 	}
 	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
 
-	submit_bh(WRITE_SYNC_PLUG, bh);
+	submit_bh(WRITE_SYNC, bh);
 }
 
 static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
@@ -571,7 +571,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
 	ptr = bh_log_ptr(bh);
 	
 	get_bh(bh);
-	submit_bh(WRITE_SYNC_PLUG, bh);
+	submit_bh(WRITE_SYNC, bh);
 	gfs2_log_lock(sdp);
 	while(!list_empty(list)) {
 		bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list);
@@ -597,7 +597,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
 		} else {
 			bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh);
 		}
-		submit_bh(WRITE_SYNC_PLUG, bh1);
+		submit_bh(WRITE_SYNC, bh1);
 		gfs2_log_lock(sdp);
 		ptr += 2;
 	}
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index a566331db4e..867b713cba9 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -37,7 +37,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
 	struct buffer_head *bh, *head;
 	int nr_underway = 0;
 	int write_op = REQ_META |
-		(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC_PLUG : WRITE);
+		(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
 
 	BUG_ON(!PageLocked(page));
 	BUG_ON(!page_has_buffers(page));
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 34a4861c14b..66be299acb1 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -333,7 +333,7 @@ void journal_commit_transaction(journal_t *journal)
 	 * instead we rely on sync_buffer() doing the unplug for us.
 	 */
 	if (commit_transaction->t_synchronous_commit)
-		write_op = WRITE_SYNC_PLUG;
+		write_op = WRITE_SYNC;
 	spin_lock(&commit_transaction->t_handle_lock);
 	while (commit_transaction->t_updates) {
 		DEFINE_WAIT(wait);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index f3ad1598b20..3da1cc4346d 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -137,9 +137,9 @@ static int journal_submit_commit_record(journal_t *journal,
 	if (journal->j_flags & JBD2_BARRIER &&
 	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
 				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
-		ret = submit_bh(WRITE_SYNC_PLUG | WRITE_FLUSH_FUA, bh);
+		ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
 	else
-		ret = submit_bh(WRITE_SYNC_PLUG, bh);
+		ret = submit_bh(WRITE_SYNC, bh);
 
 	*cbh = bh;
 	return ret;
@@ -369,7 +369,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	 * instead we rely on sync_buffer() doing the unplug for us.
 	 */
 	if (commit_transaction->t_synchronous_commit)
-		write_op = WRITE_SYNC_PLUG;
+		write_op = WRITE_SYNC;
 	trace_jbd2_commit_locking(journal, commit_transaction);
 	stats.run.rs_wait = commit_transaction->t_max_wait;
 	stats.run.rs_locked = jiffies;
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 0f83e93935b..2853ff20f85 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -509,7 +509,7 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
 		 * Last BIO is always sent through the following
 		 * submission.
 		 */
-		rw |= REQ_SYNC | REQ_UNPLUG;
+		rw |= REQ_SYNC;
 		res = nilfs_segbuf_submit_bio(segbuf, &wi, rw);
 	}
 
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 83c1c20d145..6bbb0ee3325 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -413,8 +413,7 @@ xfs_submit_ioend_bio(
 	if (xfs_ioend_new_eof(ioend))
 		xfs_mark_inode_dirty(XFS_I(ioend->io_inode));
 
-	submit_bio(wbc->sync_mode == WB_SYNC_ALL ?
-		   WRITE_SYNC_PLUG : WRITE, bio);
+	submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
 }
 
 STATIC struct bio *
-- 
cgit v1.2.3


From ae50adcb0ac4cde67a7aec8ae67249d1b2be2948 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 16 Feb 2011 10:04:50 -0500
Subject: /proc/self is never going to be invalidated...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/base.c | 30 ------------------------------
 1 file changed, 30 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9d096e82b20..d49c4b5d2c3 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2620,35 +2620,6 @@ static const struct pid_entry proc_base_stuff[] = {
 		&proc_self_inode_operations, NULL, {}),
 };
 
-/*
- *	Exceptional case: normally we are not allowed to unhash a busy
- * directory. In this case, however, we can do it - no aliasing problems
- * due to the way we treat inodes.
- */
-static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd)
-{
-	struct inode *inode;
-	struct task_struct *task;
-
-	if (nd->flags & LOOKUP_RCU)
-		return -ECHILD;
-
-	inode = dentry->d_inode;
-	task = get_proc_task(inode);
-	if (task) {
-		put_task_struct(task);
-		return 1;
-	}
-	d_drop(dentry);
-	return 0;
-}
-
-static const struct dentry_operations proc_base_dentry_operations =
-{
-	.d_revalidate	= proc_base_revalidate,
-	.d_delete	= pid_delete_dentry,
-};
-
 static struct dentry *proc_base_instantiate(struct inode *dir,
 	struct dentry *dentry, struct task_struct *task, const void *ptr)
 {
@@ -2685,7 +2656,6 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
 	if (p->fop)
 		inode->i_fop = p->fop;
 	ei->op = p->op;
-	d_set_d_op(dentry, &proc_base_dentry_operations);
 	d_add(dentry, inode);
 	error = NULL;
 out:
-- 
cgit v1.2.3


From c78f4cc5e7d642c7009089817c12d8984e7ba872 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 16 Feb 2011 10:14:56 -0500
Subject: reiserfs xattr ->d_revalidate() shouldn't care about RCU

... it returns an error unconditionally

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/reiserfs/xattr.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 3cfb2e93364..5c11ca82b78 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -978,8 +978,6 @@ int reiserfs_permission(struct inode *inode, int mask, unsigned int flags)
 
 static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-	if (nd->flags & LOOKUP_RCU)
-		return -ECHILD;
 	return -EPERM;
 }
 
-- 
cgit v1.2.3


From 0eb980e31770cfeff6e27760b4692d595b8dbf28 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 10 Mar 2011 03:44:05 -0500
Subject: ceph: fix d_revalidate oopsen on NFS exports

can't blindly check nd->flags in ->d_revalidate()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ceph/dir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 099a58615b9..ebafa65a29b 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -993,7 +993,7 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *dir;
 
-	if (nd->flags & LOOKUP_RCU)
+	if (nd && nd->flags & LOOKUP_RCU)
 		return -ECHILD;
 
 	dir = dentry->d_parent->d_inode;
-- 
cgit v1.2.3


From 529c5f958f9e60abaa7407986034b17d17536bf2 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 10 Mar 2011 03:44:31 -0500
Subject: fuse: fix d_revalidate oopsen on NFS exports

can't blindly check nd->flags in ->d_revalidate()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fuse/dir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 83543b5ff94..8bd0ef9286c 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -158,7 +158,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 {
 	struct inode *inode;
 
-	if (nd->flags & LOOKUP_RCU)
+	if (nd && nd->flags & LOOKUP_RCU)
 		return -ECHILD;
 
 	inode = entry->d_inode;
-- 
cgit v1.2.3


From 53fe924161ff18d24c5c1c256549e9c1b9874827 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 10 Mar 2011 03:44:48 -0500
Subject: gfs2: fix d_revalidate oopsen on NFS exports

can't blindly check nd->flags in ->d_revalidate()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/gfs2/dentry.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 4a456338b87..0da8da2c991 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -44,7 +44,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
 	int error;
 	int had_lock = 0;
 
-	if (nd->flags & LOOKUP_RCU)
+	if (nd && nd->flags & LOOKUP_RCU)
 		return -ECHILD;
 
 	parent = dget_parent(dentry);
-- 
cgit v1.2.3


From 4714e63731a8a641b5e0ed5e2e2191c13bf2d71a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 10 Mar 2011 03:45:07 -0500
Subject: ocfs2: fix d_revalidate oopsen on NFS exports

can't blindly check nd->flags in ->d_revalidate()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ocfs2/dcache.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 6d80ecc7834..7eb90403fc8 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -56,7 +56,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
 	int ret = 0;    /* if all else fails, just return false */
 	struct ocfs2_super *osb;
 
-	if (nd->flags & LOOKUP_RCU)
+	if (nd && nd->flags & LOOKUP_RCU)
 		return -ECHILD;
 
 	inode = dentry->d_inode;
-- 
cgit v1.2.3


From 8ce84eeb5b40da21f20174dd25891a8409534237 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 10 Mar 2011 03:45:28 -0500
Subject: jfs: fix d_revalidate oopsen on NFS exports

can't blindly check nd->flags in ->d_revalidate()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/jfs/namei.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 81ead850ddb..5a2b269428a 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1600,7 +1600,7 @@ out:
 
 static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-	if (nd->flags & LOOKUP_RCU)
+	if (nd && nd->flags & LOOKUP_RCU)
 		return -ECHILD;
 	/*
 	 * This is not negative dentry. Always valid.
-- 
cgit v1.2.3


From 9177ada99d5e69fe91950b3ef5c23f2bcd109987 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 10 Mar 2011 03:45:49 -0500
Subject: fat: fix d_revalidate oopsen on NFS exports

can't blindly check nd->flags in ->d_revalidate()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fat/namei_vfat.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index f88f752babd..adae3fb7451 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -43,7 +43,7 @@ static int vfat_revalidate_shortname(struct dentry *dentry)
 
 static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-	if (nd->flags & LOOKUP_RCU)
+	if (nd && nd->flags & LOOKUP_RCU)
 		return -ECHILD;
 
 	/* This is not negative dentry. Always valid. */
@@ -54,7 +54,7 @@ static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
 
 static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
 {
-	if (nd->flags & LOOKUP_RCU)
+	if (nd && nd->flags & LOOKUP_RCU)
 		return -ECHILD;
 
 	/*
-- 
cgit v1.2.3


From 1ca551c6caae7b52178555cdedea6ca26444be46 Mon Sep 17 00:00:00 2001
From: Marco Stornelli <marco.stornelli@gmail.com>
Date: Sat, 5 Mar 2011 11:10:19 +0100
Subject: Check for immutable/append flag in fallocate path

In the fallocate path the kernel doesn't check for the immutable/append
flags. A race condition is possible in this scenario: an application
opens a file read/write and does some work; meanwhile root sets the
immutable flag on the file; the application can then still call
fallocate successfully. In addition, on an append-only file we allow
only the reserve operation, not any unreserve (hole punch) operation.

Signed-off-by: Marco Stornelli <marco.stornelli@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/open.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/open.c b/fs/open.c
index 5a2c6ebc22b..b47aab39c05 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -233,6 +233,14 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 
 	if (!(file->f_mode & FMODE_WRITE))
 		return -EBADF;
+
+	/* It's not possible punch hole on append only file */
+	if (mode & FALLOC_FL_PUNCH_HOLE && IS_APPEND(inode))
+		return -EPERM;
+
+	if (IS_IMMUTABLE(inode))
+		return -EPERM;
+
 	/*
 	 * Revalidate the write permissions, in case security policy has
 	 * changed since the files were opened.
-- 
cgit v1.2.3


From d891eedbc3b1b0fade8a9ce60cc0eba1cccb59e5 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@fieldses.org>
Date: Tue, 18 Jan 2011 15:45:09 -0500
Subject: fs/dcache: allow d_obtain_alias() to return unhashed dentries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Without this patch, inodes are not promptly freed on last close of an
unlinked file by an nfs client:

	client$ mount -tnfs4 server:/export/ /mnt/
	client$ tail -f /mnt/FOO
	...
	server$ df -i /export
	server$ rm /export/FOO
	(^C the tail -f)
	server$ df -i /export
	server$ echo 2 >/proc/sys/vm/drop_caches
	server$ df -i /export

the df's will show that the inode is not freed on the filesystem until
the last step, when it could have been freed after killing the client's
tail -f. On-disk data won't be deallocated either, leading to possible
spurious ENOSPC.

This occurs because when the client does the close, it arrives in a
compound with a putfh and a close, processed like:

	- putfh: look up the filehandle.  The only alias found for the
	  inode will be the DCACHE_UNHASHED alias referenced by the
	  filp; that alias is not used, so a new DCACHE_DISCONNECTED
	  dentry is created and returned instead.
	- close: closes the existing filp, whose dentry is destroyed
	  immediately by dput() since it's DCACHE_UNHASHED.
	- end of the compound: release the reference
	  to the current filehandle, and dput() the new
	  DCACHE_DISCONNECTED dentry, which gets put on the
	  unused list instead of being destroyed immediately.

Nick Piggin suggested fixing this by allowing d_obtain_alias to return
the unhashed dentry that is referenced by the filp, instead of making it
create a new dentry.

Leave __d_find_alias() alone to avoid changing behavior of other
callers.

Also nfsd doesn't need all the checks of __d_find_alias(); any dentry,
hashed or unhashed, disconnected or not, should work.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 2a6bd9a4ae9..611ffe928c0 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1523,6 +1523,28 @@ struct dentry * d_alloc_root(struct inode * root_inode)
 }
 EXPORT_SYMBOL(d_alloc_root);
 
+static struct dentry * __d_find_any_alias(struct inode *inode)
+{
+	struct dentry *alias;
+
+	if (list_empty(&inode->i_dentry))
+		return NULL;
+	alias = list_first_entry(&inode->i_dentry, struct dentry, d_alias);
+	__dget(alias);
+	return alias;
+}
+
+static struct dentry * d_find_any_alias(struct inode *inode)
+{
+	struct dentry *de;
+
+	spin_lock(&inode->i_lock);
+	de = __d_find_any_alias(inode);
+	spin_unlock(&inode->i_lock);
+	return de;
+}
+
+
 /**
  * d_obtain_alias - find or allocate a dentry for a given inode
  * @inode: inode to allocate the dentry for
@@ -1552,7 +1574,7 @@ struct dentry *d_obtain_alias(struct inode *inode)
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
 
-	res = d_find_alias(inode);
+	res = d_find_any_alias(inode);
 	if (res)
 		goto out_iput;
 
@@ -1565,7 +1587,7 @@ struct dentry *d_obtain_alias(struct inode *inode)
 
 
 	spin_lock(&inode->i_lock);
-	res = __d_find_alias(inode, 0);
+	res = __d_find_any_alias(inode);
 	if (res) {
 		spin_unlock(&inode->i_lock);
 		dput(tmp);
-- 
cgit v1.2.3


From b4966b7770349deb05e3dd2bd2c65d2d044abbbb Mon Sep 17 00:00:00 2001
From: Daniel J Blueman <daniel.blueman@gmail.com>
Date: Wed, 9 Mar 2011 16:46:42 +0000
Subject: btrfs: fix dip leak

The btrfs DIO code leaks dip structs when dip->csums allocation
fails; bio->bi_end_io isn't set at the point where the free_ordered
branch is consequently taken, thus bio_endio doesn't call the function
which would free it in the normal case. Fix this by freeing dip
explicitly before taking that branch.

Signed-off-by: Daniel J Blueman <daniel.blueman@gmail.com>
Acked-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 44b926646e3..e7a8303328b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6058,6 +6058,7 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
 	if (!skip_sum) {
 		dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
 		if (!dip->csums) {
+			kfree(dip);
 			ret = -ENOMEM;
 			goto free_ordered;
 		}
-- 
cgit v1.2.3


From 7e6b6465e6efbca3985258996be9c189da96c8bf Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Fri, 18 Feb 2011 09:21:17 +0000
Subject: btrfs: fix not enough reserved space

btrfs_link() inserts 3 items (inode ref, dir name item and dir index item)
into the b+ tree and updates 2 items (its inode and the parent's inode) in
the b+ tree. So we should reserve space for these 5 items, not 3.

Reported-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e7a8303328b..db67821ccac 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4823,10 +4823,11 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 		goto fail;
 
 	/*
-	 * 1 item for inode ref
+	 * 2 items for inode and inode ref
 	 * 2 items for dir items
+	 * 1 item for parent inode
 	 */
-	trans = btrfs_start_transaction(root, 3);
+	trans = btrfs_start_transaction(root, 5);
 	if (IS_ERR(trans)) {
 		err = PTR_ERR(trans);
 		goto fail;
-- 
cgit v1.2.3


From 8304d6f24cc1221392b6d61fa9d16631cbd6beb7 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Mon, 21 Feb 2011 14:58:21 -0600
Subject: dlm: record full callback state

Change how callbacks are recorded for locks.  Previously, information
about multiple callbacks was combined into a couple of variables that
indicated what the end result should be.  In some situations, we
could not tell from this combined state what the exact sequence of
callbacks was, and would end up either delivering the callbacks in
the wrong order or suppressing redundant callbacks incorrectly.  This
new approach records all the data for each callback, leaving no
uncertainty about what needs to be delivered.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/ast.c          | 257 ++++++++++++++++++++++++++++++++++++++------------
 fs/dlm/ast.h          |   7 +-
 fs/dlm/debug_fs.c     |   4 +-
 fs/dlm/dlm_internal.h |  35 ++++---
 fs/dlm/lock.c         |  38 ++++----
 fs/dlm/rcom.c         |   4 +-
 fs/dlm/user.c         | 185 +++++++++++++-----------------------
 fs/dlm/user.h         |   3 +-
 8 files changed, 311 insertions(+), 222 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 4314f0d48d8..abc49f29245 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -18,6 +18,7 @@
 
 #define WAKE_ASTS  0
 
+static uint64_t			ast_seq_count;
 static struct list_head		ast_queue;
 static spinlock_t		ast_queue_lock;
 static struct task_struct *	astd_task;
@@ -25,40 +26,186 @@ static unsigned long		astd_wakeflags;
 static struct mutex		astd_running;
 
 
+static void dlm_dump_lkb_callbacks(struct dlm_lkb *lkb)
+{
+	int i;
+
+	log_print("last_bast %x %llu flags %x mode %d sb %d %x",
+		  lkb->lkb_id,
+		  (unsigned long long)lkb->lkb_last_bast.seq,
+		  lkb->lkb_last_bast.flags,
+		  lkb->lkb_last_bast.mode,
+		  lkb->lkb_last_bast.sb_status,
+		  lkb->lkb_last_bast.sb_flags);
+
+	log_print("last_cast %x %llu flags %x mode %d sb %d %x",
+		  lkb->lkb_id,
+		  (unsigned long long)lkb->lkb_last_cast.seq,
+		  lkb->lkb_last_cast.flags,
+		  lkb->lkb_last_cast.mode,
+		  lkb->lkb_last_cast.sb_status,
+		  lkb->lkb_last_cast.sb_flags);
+
+	for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
+		log_print("cb %x %llu flags %x mode %d sb %d %x",
+			  lkb->lkb_id,
+			  (unsigned long long)lkb->lkb_callbacks[i].seq,
+			  lkb->lkb_callbacks[i].flags,
+			  lkb->lkb_callbacks[i].mode,
+			  lkb->lkb_callbacks[i].sb_status,
+			  lkb->lkb_callbacks[i].sb_flags);
+	}
+}
+
 void dlm_del_ast(struct dlm_lkb *lkb)
 {
 	spin_lock(&ast_queue_lock);
-	if (lkb->lkb_ast_type & (AST_COMP | AST_BAST))
-		list_del(&lkb->lkb_astqueue);
+	if (!list_empty(&lkb->lkb_astqueue))
+		list_del_init(&lkb->lkb_astqueue);
 	spin_unlock(&ast_queue_lock);
 }
 
-void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode)
+int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
+			 int status, uint32_t sbflags, uint64_t seq)
 {
+	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+	uint64_t prev_seq;
+	int prev_mode;
+	int i;
+
+	for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
+		if (lkb->lkb_callbacks[i].seq)
+			continue;
+
+		/*
+		 * Suppress some redundant basts here, do more on removal.
+		 * Don't even add a bast if the callback just before it
+		 * is a bast for the same mode or a more restrictive mode.
+		 * (the addional > PR check is needed for PR/CW inversion)
+		 */
+
+		if ((i > 0) && (flags & DLM_CB_BAST) &&
+		    (lkb->lkb_callbacks[i-1].flags & DLM_CB_BAST)) {
+
+			prev_seq = lkb->lkb_callbacks[i-1].seq;
+			prev_mode = lkb->lkb_callbacks[i-1].mode;
+
+			if ((prev_mode == mode) ||
+			    (prev_mode > mode && prev_mode > DLM_LOCK_PR)) {
+
+				log_debug(ls, "skip %x add bast %llu mode %d "
+					  "for bast %llu mode %d",
+					  lkb->lkb_id,
+					  (unsigned long long)seq,
+					  mode,
+					  (unsigned long long)prev_seq,
+					  prev_mode);
+				return 0;
+			}
+		}
+
+		lkb->lkb_callbacks[i].seq = seq;
+		lkb->lkb_callbacks[i].flags = flags;
+		lkb->lkb_callbacks[i].mode = mode;
+		lkb->lkb_callbacks[i].sb_status = status;
+		lkb->lkb_callbacks[i].sb_flags = (sbflags & 0x000000FF);
+		break;
+	}
+
+	if (i == DLM_CALLBACKS_SIZE) {
+		log_error(ls, "no callbacks %x %llu flags %x mode %d sb %d %x",
+			  lkb->lkb_id, (unsigned long long)seq,
+			  flags, mode, status, sbflags);
+		dlm_dump_lkb_callbacks(lkb);
+		return -1;
+	}
+
+	return 0;
+}
+
+int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb,
+			 struct dlm_callback *cb, int *resid)
+{
+	int i;
+
+	*resid = 0;
+
+	if (!lkb->lkb_callbacks[0].seq)
+		return -ENOENT;
+
+	/* oldest undelivered cb is callbacks[0] */
+
+	memcpy(cb, &lkb->lkb_callbacks[0], sizeof(struct dlm_callback));
+	memset(&lkb->lkb_callbacks[0], 0, sizeof(struct dlm_callback));
+
+	/* shift others down */
+
+	for (i = 1; i < DLM_CALLBACKS_SIZE; i++) {
+		if (!lkb->lkb_callbacks[i].seq)
+			break;
+		memcpy(&lkb->lkb_callbacks[i-1], &lkb->lkb_callbacks[i],
+		       sizeof(struct dlm_callback));
+		memset(&lkb->lkb_callbacks[i], 0, sizeof(struct dlm_callback));
+		(*resid)++;
+	}
+
+	/* if cb is a bast, it should be skipped if the blocking mode is
+	   compatible with the last granted mode */
+
+	if ((cb->flags & DLM_CB_BAST) && lkb->lkb_last_cast.seq) {
+		if (dlm_modes_compat(cb->mode, lkb->lkb_last_cast.mode)) {
+			cb->flags |= DLM_CB_SKIP;
+
+			log_debug(ls, "skip %x bast %llu mode %d "
+				  "for cast %llu mode %d",
+				  lkb->lkb_id,
+				  (unsigned long long)cb->seq,
+				  cb->mode,
+				  (unsigned long long)lkb->lkb_last_cast.seq,
+				  lkb->lkb_last_cast.mode);
+			return 0;
+		}
+	}
+
+	if (cb->flags & DLM_CB_CAST) {
+		memcpy(&lkb->lkb_last_cast, cb, sizeof(struct dlm_callback));
+		lkb->lkb_last_cast_time = ktime_get();
+	}
+
+	if (cb->flags & DLM_CB_BAST) {
+		memcpy(&lkb->lkb_last_bast, cb, sizeof(struct dlm_callback));
+		lkb->lkb_last_bast_time = ktime_get();
+	}
+
+	return 0;
+}
+
+void dlm_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, int status,
+		 uint32_t sbflags)
+{
+	uint64_t seq;
+	int rv;
+
+	spin_lock(&ast_queue_lock);
+
+	seq = ++ast_seq_count;
+
 	if (lkb->lkb_flags & DLM_IFL_USER) {
-		dlm_user_add_ast(lkb, type, mode);
+		spin_unlock(&ast_queue_lock);
+		dlm_user_add_ast(lkb, flags, mode, status, sbflags, seq);
 		return;
 	}
 
-	spin_lock(&ast_queue_lock);
-	if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
+	rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, seq);
+	if (rv < 0) {
+		spin_unlock(&ast_queue_lock);
+		return;
+	}
+
+	if (list_empty(&lkb->lkb_astqueue)) {
 		kref_get(&lkb->lkb_ref);
 		list_add_tail(&lkb->lkb_astqueue, &ast_queue);
-		lkb->lkb_ast_first = type;
 	}
-
-	/* sanity check, this should not happen */
-
-	if ((type == AST_COMP) && (lkb->lkb_ast_type & AST_COMP))
-		log_print("repeat cast %d castmode %d lock %x %s",
-			  mode, lkb->lkb_castmode,
-			  lkb->lkb_id, lkb->lkb_resource->res_name);
-
-	lkb->lkb_ast_type |= type;
-	if (type == AST_BAST)
-		lkb->lkb_bastmode = mode;
-	else
-		lkb->lkb_castmode = mode;
 	spin_unlock(&ast_queue_lock);
 
 	set_bit(WAKE_ASTS, &astd_wakeflags);
@@ -72,7 +219,8 @@ static void process_asts(void)
 	struct dlm_lkb *lkb;
 	void (*castfn) (void *astparam);
 	void (*bastfn) (void *astparam, int mode);
-	int type, first, bastmode, castmode, do_bast, do_cast, last_castmode;
+	struct dlm_callback callbacks[DLM_CALLBACKS_SIZE];
+	int i, rv, resid;
 
 repeat:
 	spin_lock(&ast_queue_lock);
@@ -83,54 +231,45 @@ repeat:
 		if (dlm_locking_stopped(ls))
 			continue;
 
-		list_del(&lkb->lkb_astqueue);
-		type = lkb->lkb_ast_type;
-		lkb->lkb_ast_type = 0;
-		first = lkb->lkb_ast_first;
-		lkb->lkb_ast_first = 0;
-		bastmode = lkb->lkb_bastmode;
-		castmode = lkb->lkb_castmode;
+		/* we remove from astqueue list and remove everything in
+		   lkb_callbacks before releasing the spinlock so empty
+		   lkb_astqueue is always consistent with empty lkb_callbacks */
+
+		list_del_init(&lkb->lkb_astqueue);
+
 		castfn = lkb->lkb_astfn;
 		bastfn = lkb->lkb_bastfn;
-		spin_unlock(&ast_queue_lock);
 
-		do_cast = (type & AST_COMP) && castfn;
-		do_bast = (type & AST_BAST) && bastfn;
+		memset(&callbacks, 0, sizeof(callbacks));
 
-		/* Skip a bast if its blocking mode is compatible with the
-		   granted mode of the preceding cast. */
+		for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
+			rv = dlm_rem_lkb_callback(ls, lkb, &callbacks[i], &resid);
+			if (rv < 0)
+				break;
+		}
+		spin_unlock(&ast_queue_lock);
 
-		if (do_bast) {
-			if (first == AST_COMP)
-				last_castmode = castmode;
-			else
-				last_castmode = lkb->lkb_castmode_done;
-			if (dlm_modes_compat(bastmode, last_castmode))
-				do_bast = 0;
+		if (resid) {
+			/* shouldn't happen, for loop should have removed all */
+			log_error(ls, "callback resid %d lkb %x",
+				  resid, lkb->lkb_id);
 		}
 
-		if (first == AST_COMP) {
-			if (do_cast)
-				castfn(lkb->lkb_astparam);
-			if (do_bast)
-				bastfn(lkb->lkb_astparam, bastmode);
-		} else if (first == AST_BAST) {
-			if (do_bast)
-				bastfn(lkb->lkb_astparam, bastmode);
-			if (do_cast)
+		for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
+			if (!callbacks[i].seq)
+				break;
+			if (callbacks[i].flags & DLM_CB_SKIP) {
+				continue;
+			} else if (callbacks[i].flags & DLM_CB_BAST) {
+				bastfn(lkb->lkb_astparam, callbacks[i].mode);
+			} else if (callbacks[i].flags & DLM_CB_CAST) {
+				lkb->lkb_lksb->sb_status = callbacks[i].sb_status;
+				lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags;
 				castfn(lkb->lkb_astparam);
-		} else {
-			log_error(ls, "bad ast_first %d ast_type %d",
-				  first, type);
+			}
 		}
 
-		if (do_cast)
-			lkb->lkb_castmode_done = castmode;
-		if (do_bast)
-			lkb->lkb_bastmode_done = bastmode;
-
-		/* this removes the reference added by dlm_add_ast
-		   and may result in the lkb being freed */
+		/* removes ref for ast_queue, may cause lkb to be freed */
 		dlm_put_lkb(lkb);
 
 		cond_resched();
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
index bcb1aaba519..8aa89c9b561 100644
--- a/fs/dlm/ast.h
+++ b/fs/dlm/ast.h
@@ -13,8 +13,13 @@
 #ifndef __ASTD_DOT_H__
 #define __ASTD_DOT_H__
 
-void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode);
 void dlm_del_ast(struct dlm_lkb *lkb);
+int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
+                         int status, uint32_t sbflags, uint64_t seq);
+int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb,
+                         struct dlm_callback *cb, int *resid);
+void dlm_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, int status,
+		 uint32_t sbflags);
 
 void dlm_astd_wake(void);
 int dlm_astd_start(void);
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 6b42ba807df..59779237e2b 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -257,12 +257,12 @@ static int print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
 			lkb->lkb_status,
 			lkb->lkb_grmode,
 			lkb->lkb_rqmode,
-			lkb->lkb_bastmode,
+			lkb->lkb_last_bast.mode,
 			rsb_lookup,
 			lkb->lkb_wait_type,
 			lkb->lkb_lvbseq,
 			(unsigned long long)ktime_to_ns(lkb->lkb_timestamp),
-			(unsigned long long)ktime_to_ns(lkb->lkb_time_bast));
+			(unsigned long long)ktime_to_ns(lkb->lkb_last_bast_time));
 	return rv;
 }
 
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index f632b58cd22..b9420491301 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -192,11 +192,6 @@ struct dlm_args {
  * lkb is a process copy, the nodeid specifies the lock master.
  */
 
-/* lkb_ast_type */
-
-#define AST_COMP		1
-#define AST_BAST		2
-
 /* lkb_status */
 
 #define DLM_LKSTS_WAITING	1
@@ -217,6 +212,20 @@ struct dlm_args {
 #define DLM_IFL_USER		0x00000001
 #define DLM_IFL_ORPHAN		0x00000002
 
+#define DLM_CALLBACKS_SIZE	6
+
+#define DLM_CB_CAST		0x00000001
+#define DLM_CB_BAST		0x00000002
+#define DLM_CB_SKIP		0x00000004
+
+struct dlm_callback {
+	uint64_t		seq;
+	uint32_t		flags;		/* DLM_CBF_ */
+	int			sb_status;	/* copy to lksb status */
+	uint8_t			sb_flags;	/* copy to lksb flags */
+	int8_t			mode; /* rq mode of bast, gr mode of cast */
+};
+
 struct dlm_lkb {
 	struct dlm_rsb		*lkb_resource;	/* the rsb */
 	struct kref		lkb_ref;
@@ -236,13 +245,6 @@ struct dlm_lkb {
 
 	int8_t			lkb_wait_type;	/* type of reply waiting for */
 	int8_t			lkb_wait_count;
-	int8_t			lkb_ast_type;	/* type of ast queued for */
-	int8_t			lkb_ast_first;	/* type of first ast queued */
-
-	int8_t			lkb_bastmode;	/* req mode of queued bast */
-	int8_t			lkb_castmode;	/* gr mode of queued cast */
-	int8_t			lkb_bastmode_done; /* last delivered bastmode */
-	int8_t			lkb_castmode_done; /* last delivered castmode */
 
 	struct list_head	lkb_idtbl_list;	/* lockspace lkbtbl */
 	struct list_head	lkb_statequeue;	/* rsb g/c/w list */
@@ -251,10 +253,15 @@ struct dlm_lkb {
 	struct list_head	lkb_astqueue;	/* need ast to be sent */
 	struct list_head	lkb_ownqueue;	/* list of locks for a process */
 	struct list_head	lkb_time_list;
-	ktime_t			lkb_time_bast;	/* for debugging */
 	ktime_t			lkb_timestamp;
 	unsigned long		lkb_timeout_cs;
 
+	struct dlm_callback	lkb_callbacks[DLM_CALLBACKS_SIZE];
+	struct dlm_callback	lkb_last_cast;
+	struct dlm_callback	lkb_last_bast;
+	ktime_t			lkb_last_cast_time;	/* for debugging */
+	ktime_t			lkb_last_bast_time;	/* for debugging */
+
 	char			*lkb_lvbptr;
 	struct dlm_lksb		*lkb_lksb;      /* caller's status block */
 	void			(*lkb_astfn) (void *astparam);
@@ -544,8 +551,6 @@ struct dlm_user_args {
 					  (dlm_user_proc) on the struct file,
 					  the process's locks point back to it*/
 	struct dlm_lksb		lksb;
-	int			old_mode;
-	int			update_user_lvb;
 	struct dlm_lksb __user	*user_lksb;
 	void __user		*castparam;
 	void __user		*castaddr;
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 64e5f3efdd8..04b8c449303 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -160,10 +160,10 @@ static const int __quecvt_compat_matrix[8][8] = {
 void dlm_print_lkb(struct dlm_lkb *lkb)
 {
 	printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n"
-	       "     status %d rqmode %d grmode %d wait_type %d ast_type %d\n",
+	       "     status %d rqmode %d grmode %d wait_type %d\n",
 	       lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
 	       lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode,
-	       lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type);
+	       lkb->lkb_grmode, lkb->lkb_wait_type);
 }
 
 static void dlm_print_rsb(struct dlm_rsb *r)
@@ -305,10 +305,7 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
 		rv = -EDEADLK;
 	}
 
-	lkb->lkb_lksb->sb_status = rv;
-	lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
-
-	dlm_add_ast(lkb, AST_COMP, lkb->lkb_grmode);
+	dlm_add_ast(lkb, DLM_CB_CAST, lkb->lkb_grmode, rv, lkb->lkb_sbflags);
 }
 
 static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -319,13 +316,10 @@ static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
 
 static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
 {
-	lkb->lkb_time_bast = ktime_get();
-
 	if (is_master_copy(lkb)) {
-		lkb->lkb_bastmode = rqmode; /* printed by debugfs */
 		send_bast(r, lkb, rqmode);
 	} else {
-		dlm_add_ast(lkb, AST_BAST, rqmode);
+		dlm_add_ast(lkb, DLM_CB_BAST, rqmode, 0, 0);
 	}
 }
 
@@ -600,6 +594,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
 	INIT_LIST_HEAD(&lkb->lkb_ownqueue);
 	INIT_LIST_HEAD(&lkb->lkb_rsb_lookup);
 	INIT_LIST_HEAD(&lkb->lkb_time_list);
+	INIT_LIST_HEAD(&lkb->lkb_astqueue);
 
 	get_random_bytes(&bucket, sizeof(bucket));
 	bucket &= (ls->ls_lkbtbl_size - 1);
@@ -2819,9 +2814,9 @@ static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
 	   not from lkb fields */
 
 	if (lkb->lkb_bastfn)
-		ms->m_asts |= AST_BAST;
+		ms->m_asts |= DLM_CB_BAST;
 	if (lkb->lkb_astfn)
-		ms->m_asts |= AST_COMP;
+		ms->m_asts |= DLM_CB_CAST;
 
 	/* compare with switch in create_message; send_remove() doesn't
 	   use send_args() */
@@ -3122,8 +3117,8 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
 	lkb->lkb_grmode = DLM_LOCK_IV;
 	lkb->lkb_rqmode = ms->m_rqmode;
 
-	lkb->lkb_bastfn = (ms->m_asts & AST_BAST) ? &fake_bastfn : NULL;
-	lkb->lkb_astfn = (ms->m_asts & AST_COMP) ? &fake_astfn : NULL;
+	lkb->lkb_bastfn = (ms->m_asts & DLM_CB_BAST) ? &fake_bastfn : NULL;
+	lkb->lkb_astfn = (ms->m_asts & DLM_CB_CAST) ? &fake_astfn : NULL;
 
 	if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
 		/* lkb was just created so there won't be an lvb yet */
@@ -4412,8 +4407,8 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
 	lkb->lkb_grmode = rl->rl_grmode;
 	/* don't set lkb_status because add_lkb wants to itself */
 
-	lkb->lkb_bastfn = (rl->rl_asts & AST_BAST) ? &fake_bastfn : NULL;
-	lkb->lkb_astfn = (rl->rl_asts & AST_COMP) ? &fake_astfn : NULL;
+	lkb->lkb_bastfn = (rl->rl_asts & DLM_CB_BAST) ? &fake_bastfn : NULL;
+	lkb->lkb_astfn = (rl->rl_asts & DLM_CB_CAST) ? &fake_astfn : NULL;
 
 	if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
 		int lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
@@ -4589,7 +4584,6 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
 	error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs,
 			      fake_astfn, ua, fake_bastfn, &args);
 	lkb->lkb_flags |= DLM_IFL_USER;
-	ua->old_mode = DLM_LOCK_IV;
 
 	if (error) {
 		__put_lkb(ls, lkb);
@@ -4658,7 +4652,6 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
 	ua->bastparam = ua_tmp->bastparam;
 	ua->bastaddr = ua_tmp->bastaddr;
 	ua->user_lksb = ua_tmp->user_lksb;
-	ua->old_mode = lkb->lkb_grmode;
 
 	error = set_lock_args(mode, &ua->lksb, flags, 0, timeout_cs,
 			      fake_astfn, ua, fake_bastfn, &args);
@@ -4917,8 +4910,9 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
 	}
 
 	list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) {
-		lkb->lkb_ast_type = 0;
-		list_del(&lkb->lkb_astqueue);
+		memset(&lkb->lkb_callbacks, 0,
+		       sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE);
+		list_del_init(&lkb->lkb_astqueue);
 		dlm_put_lkb(lkb);
 	}
 
@@ -4958,7 +4952,9 @@ static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
 
 	spin_lock(&proc->asts_spin);
 	list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) {
-		list_del(&lkb->lkb_astqueue);
+		memset(&lkb->lkb_callbacks, 0,
+		       sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE);
+		list_del_init(&lkb->lkb_astqueue);
 		dlm_put_lkb(lkb);
 	}
 	spin_unlock(&proc->asts_spin);
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 3c83a49a48a..f10a50f24e8 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -321,9 +321,9 @@ static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb,
 	rl->rl_wait_type = cpu_to_le16(lkb->lkb_wait_type);
 
 	if (lkb->lkb_bastfn)
-		rl->rl_asts |= AST_BAST;
+		rl->rl_asts |= DLM_CB_BAST;
 	if (lkb->lkb_astfn)
-		rl->rl_asts |= AST_COMP;
+		rl->rl_asts |= DLM_CB_CAST;
 
 	rl->rl_namelen = cpu_to_le16(r->res_length);
 	memcpy(rl->rl_name, r->res_name, r->res_length);
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 66d6c16bf44..d5ab3fe7c19 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -24,6 +24,7 @@
 #include "lock.h"
 #include "lvb_table.h"
 #include "user.h"
+#include "ast.h"
 
 static const char name_prefix[] = "dlm";
 static const struct file_operations device_fops;
@@ -152,19 +153,16 @@ static void compat_output(struct dlm_lock_result *res,
    not related to the lifetime of the lkb struct which is managed
    entirely by refcount. */
 
-static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type)
+static int lkb_is_endoflife(int mode, int status)
 {
-	switch (sb_status) {
+	switch (status) {
 	case -DLM_EUNLOCK:
 		return 1;
 	case -DLM_ECANCEL:
 	case -ETIMEDOUT:
 	case -EDEADLK:
-		if (lkb->lkb_grmode == DLM_LOCK_IV)
-			return 1;
-		break;
 	case -EAGAIN:
-		if (type == AST_COMP && lkb->lkb_grmode == DLM_LOCK_IV)
+		if (mode == DLM_LOCK_IV)
 			return 1;
 		break;
 	}
@@ -174,12 +172,13 @@ static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type)
 /* we could possibly check if the cancel of an orphan has resulted in the lkb
    being removed and then remove that lkb from the orphans list and free it */
 
-void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
+void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
+		      int status, uint32_t sbflags, uint64_t seq)
 {
 	struct dlm_ls *ls;
 	struct dlm_user_args *ua;
 	struct dlm_user_proc *proc;
-	int eol = 0, ast_type;
+	int rv;
 
 	if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD))
 		return;
@@ -200,49 +199,29 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
 	ua = lkb->lkb_ua;
 	proc = ua->proc;
 
-	if (type == AST_BAST && ua->bastaddr == NULL)
+	if ((flags & DLM_CB_BAST) && ua->bastaddr == NULL)
 		goto out;
 
+	if ((flags & DLM_CB_CAST) && lkb_is_endoflife(mode, status))
+		lkb->lkb_flags |= DLM_IFL_ENDOFLIFE;
+
 	spin_lock(&proc->asts_spin);
 
-	ast_type = lkb->lkb_ast_type;
-	lkb->lkb_ast_type |= type;
-	if (type == AST_BAST)
-		lkb->lkb_bastmode = mode;
-	else
-		lkb->lkb_castmode = mode;
+	rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, seq);
+	if (rv < 0) {
+		spin_unlock(&proc->asts_spin);
+		goto out;
+	}
 
-	if (!ast_type) {
+	if (list_empty(&lkb->lkb_astqueue)) {
 		kref_get(&lkb->lkb_ref);
 		list_add_tail(&lkb->lkb_astqueue, &proc->asts);
-		lkb->lkb_ast_first = type;
 		wake_up_interruptible(&proc->wait);
 	}
-	if (type == AST_COMP && (ast_type & AST_COMP))
-		log_debug(ls, "ast overlap %x status %x %x",
-			  lkb->lkb_id, ua->lksb.sb_status, lkb->lkb_flags);
-
-	eol = lkb_is_endoflife(lkb, ua->lksb.sb_status, type);
-	if (eol) {
-		lkb->lkb_flags |= DLM_IFL_ENDOFLIFE;
-	}
-
-	/* We want to copy the lvb to userspace when the completion
-	   ast is read if the status is 0, the lock has an lvb and
-	   lvb_ops says we should.  We could probably have set_lvb_lock()
-	   set update_user_lvb instead and not need old_mode */
-
-	if ((lkb->lkb_ast_type & AST_COMP) &&
-	    (lkb->lkb_lksb->sb_status == 0) &&
-	    lkb->lkb_lksb->sb_lvbptr &&
-	    dlm_lvb_operations[ua->old_mode + 1][lkb->lkb_grmode + 1])
-		ua->update_user_lvb = 1;
-	else
-		ua->update_user_lvb = 0;
-
 	spin_unlock(&proc->asts_spin);
 
-	if (eol) {
+	if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) {
+		/* N.B. spin_lock locks_spin, not asts_spin */
 		spin_lock(&proc->locks_spin);
 		if (!list_empty(&lkb->lkb_ownqueue)) {
 			list_del_init(&lkb->lkb_ownqueue);
@@ -705,8 +684,9 @@ static int device_close(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
-			       int mode, char __user *buf, size_t count)
+static int copy_result_to_user(struct dlm_user_args *ua, int compat,
+			       uint32_t flags, int mode, int copy_lvb,
+			       char __user *buf, size_t count)
 {
 #ifdef CONFIG_COMPAT
 	struct dlm_lock_result32 result32;
@@ -730,7 +710,7 @@ static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
 	   notes that a new blocking AST address and parameter are set even if
 	   the conversion fails, so maybe we should just do that. */
 
-	if (type == AST_BAST) {
+	if (flags & DLM_CB_BAST) {
 		result.user_astaddr = ua->bastaddr;
 		result.user_astparam = ua->bastparam;
 		result.bast_mode = mode;
@@ -750,8 +730,7 @@ static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
 	/* copy lvb to userspace if there is one, it's been updated, and
 	   the user buffer has space for it */
 
-	if (ua->update_user_lvb && ua->lksb.sb_lvbptr &&
-	    count >= len + DLM_USER_LVB_LEN) {
+	if (copy_lvb && ua->lksb.sb_lvbptr && count >= len + DLM_USER_LVB_LEN) {
 		if (copy_to_user(buf+len, ua->lksb.sb_lvbptr,
 				 DLM_USER_LVB_LEN)) {
 			error = -EFAULT;
@@ -801,13 +780,12 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
 	struct dlm_user_proc *proc = file->private_data;
 	struct dlm_lkb *lkb;
 	DECLARE_WAITQUEUE(wait, current);
-	int error = 0, removed;
-	int ret_type, ret_mode;
-	int bastmode, castmode, do_bast, do_cast;
+	struct dlm_callback cb;
+	int rv, resid, copy_lvb = 0;
 
 	if (count == sizeof(struct dlm_device_version)) {
-		error = copy_version_to_user(buf, count);
-		return error;
+		rv = copy_version_to_user(buf, count);
+		return rv;
 	}
 
 	if (!proc) {
@@ -854,92 +832,57 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
 		}
 	}
 
-	/* there may be both completion and blocking asts to return for
-	   the lkb, don't remove lkb from asts list unless no asts remain */
+	/* if we empty lkb_callbacks, we don't want to unlock the spinlock
+	   without removing lkb_astqueue; so empty lkb_astqueue is always
+	   consistent with empty lkb_callbacks */
 
 	lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue);
 
-	removed = 0;
-	ret_type = 0;
-	ret_mode = 0;
-	do_bast = lkb->lkb_ast_type & AST_BAST;
-	do_cast = lkb->lkb_ast_type & AST_COMP;
-	bastmode = lkb->lkb_bastmode;
-	castmode = lkb->lkb_castmode;
-
-	/* when both are queued figure out which to do first and
-	   switch first so the other goes in the next read */
-
-	if (do_cast && do_bast) {
-		if (lkb->lkb_ast_first == AST_COMP) {
-			ret_type = AST_COMP;
-			ret_mode = castmode;
-			lkb->lkb_ast_type &= ~AST_COMP;
-			lkb->lkb_ast_first = AST_BAST;
-		} else {
-			ret_type = AST_BAST;
-			ret_mode = bastmode;
-			lkb->lkb_ast_type &= ~AST_BAST;
-			lkb->lkb_ast_first = AST_COMP;
-		}
-	} else {
-		ret_type = lkb->lkb_ast_first;
-		ret_mode = (ret_type == AST_COMP) ? castmode : bastmode;
-		lkb->lkb_ast_type &= ~ret_type;
-		lkb->lkb_ast_first = 0;
+	rv = dlm_rem_lkb_callback(lkb->lkb_resource->res_ls, lkb, &cb, &resid);
+	if (rv < 0) {
+		/* this shouldn't happen; lkb should have been removed from
+		   list when resid was zero */
+		log_print("dlm_rem_lkb_callback empty %x", lkb->lkb_id);
+		list_del_init(&lkb->lkb_astqueue);
+		spin_unlock(&proc->asts_spin);
+		/* removes ref for proc->asts, may cause lkb to be freed */
+		dlm_put_lkb(lkb);
+		goto try_another;
 	}
+	if (!resid)
+		list_del_init(&lkb->lkb_astqueue);
+	spin_unlock(&proc->asts_spin);
 
-	/* if we're doing a bast but the bast is unnecessary, then
-	   switch to do nothing or do a cast if that was needed next */
-
-	if ((ret_type == AST_BAST) &&
-	    dlm_modes_compat(bastmode, lkb->lkb_castmode_done)) {
-		ret_type = 0;
-		ret_mode = 0;
-
-		if (do_cast) {
-			ret_type = AST_COMP;
-			ret_mode = castmode;
-			lkb->lkb_ast_type &= ~AST_COMP;
-			lkb->lkb_ast_first = 0;
-		}
+	if (cb.flags & DLM_CB_SKIP) {
+		/* removes ref for proc->asts, may cause lkb to be freed */
+		if (!resid)
+			dlm_put_lkb(lkb);
+		goto try_another;
 	}
 
-	if (lkb->lkb_ast_first != lkb->lkb_ast_type) {
-		log_print("device_read %x ast_first %x ast_type %x",
-			  lkb->lkb_id, lkb->lkb_ast_first, lkb->lkb_ast_type);
-	}
+	if (cb.flags & DLM_CB_CAST) {
+		int old_mode, new_mode;
 
-	if (!lkb->lkb_ast_type) {
-		list_del(&lkb->lkb_astqueue);
-		removed = 1;
-	}
-	spin_unlock(&proc->asts_spin);
+		old_mode = lkb->lkb_last_cast.mode;
+		new_mode = cb.mode;
 
-	if (ret_type) {
-		error = copy_result_to_user(lkb->lkb_ua,
-				test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
-				ret_type, ret_mode, buf, count);
+		if (!cb.sb_status && lkb->lkb_lksb->sb_lvbptr &&
+		    dlm_lvb_operations[old_mode + 1][new_mode + 1])
+			copy_lvb = 1;
 
-		if (ret_type == AST_COMP)
-			lkb->lkb_castmode_done = castmode;
-		if (ret_type == AST_BAST)
-			lkb->lkb_bastmode_done = bastmode;
+		lkb->lkb_lksb->sb_status = cb.sb_status;
+		lkb->lkb_lksb->sb_flags = cb.sb_flags;
 	}
 
-	/* removes reference for the proc->asts lists added by
-	   dlm_user_add_ast() and may result in the lkb being freed */
+	rv = copy_result_to_user(lkb->lkb_ua,
+				 test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
+				 cb.flags, cb.mode, copy_lvb, buf, count);
 
-	if (removed)
+	/* removes ref for proc->asts, may cause lkb to be freed */
+	if (!resid)
 		dlm_put_lkb(lkb);
 
-	/* the bast that was queued was eliminated (see unnecessary above),
-	   leaving nothing to return */
-
-	if (!ret_type)
-		goto try_another;
-
-	return error;
+	return rv;
 }
 
 static unsigned int device_poll(struct file *file, poll_table *wait)
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index f196091dd7f..00499ab8835 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -9,7 +9,8 @@
 #ifndef __USER_DOT_H__
 #define __USER_DOT_H__
 
-void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode);
+void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
+                      int status, uint32_t sbflags, uint64_t seq);
 int dlm_user_init(void);
 void dlm_user_exit(void);
 int dlm_device_deregister(struct dlm_ls *ls);
-- 
cgit v1.2.3


From e3853a90e218bcb2e48d3f403d0962bf54444f5f Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Thu, 10 Mar 2011 13:07:17 -0600
Subject: dlm: increase default hash table sizes

Make all three hash tables a consistent size of 1024
rather than 1024, 512, 256.  All three tables, for
resources, locks, and lock dir entries, will generally
be filled to the same order of magnitude.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/config.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index b54bca03d92..0d329ff8ed4 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -977,9 +977,9 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
 /* Config file defaults */
 #define DEFAULT_TCP_PORT       21064
 #define DEFAULT_BUFFER_SIZE     4096
-#define DEFAULT_RSBTBL_SIZE      256
+#define DEFAULT_RSBTBL_SIZE     1024
 #define DEFAULT_LKBTBL_SIZE     1024
-#define DEFAULT_DIRTBL_SIZE      512
+#define DEFAULT_DIRTBL_SIZE     1024
 #define DEFAULT_RECOVER_TIMER      5
 #define DEFAULT_TOSS_SECS         10
 #define DEFAULT_SCAN_SECS          5
-- 
cgit v1.2.3


From e43f055a953721ed1787a039ab5e720755596ea2 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Thu, 10 Mar 2011 13:22:34 -0600
Subject: dlm: use alloc_workqueue function

Replaces deprecated create_singlethread_workqueue().

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 2d8c87b951c..bffa1e73b9a 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1468,13 +1468,15 @@ static void work_stop(void)
 
 static int work_start(void)
 {
-	recv_workqueue = create_singlethread_workqueue("dlm_recv");
+	recv_workqueue = alloc_workqueue("dlm_recv",
+					 WQ_UNBOUND | WQ_MEM_RECLAIM, 1);
 	if (!recv_workqueue) {
 		log_print("can't start dlm_recv");
 		return -ENOMEM;
 	}
 
-	send_workqueue = create_singlethread_workqueue("dlm_send");
+	send_workqueue = alloc_workqueue("dlm_send",
+					 WQ_UNBOUND | WQ_MEM_RECLAIM, 1);
 	if (!send_workqueue) {
 		log_print("can't start dlm_send");
 		destroy_workqueue(recv_workqueue);
-- 
cgit v1.2.3


From bf294b41cefcb22fc3139e0f42c5b3f06728bd5e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 21 Feb 2011 11:05:41 -0800
Subject: SUNRPC: Close a race in __rpc_wait_for_completion_task()

Although they run as rpciod background tasks, under normal operation
(i.e. no SIGKILL), functions like nfs_sillyrename(), nfs4_proc_unlck()
and nfs4_do_close() want to be fully synchronous. This means that when we
exit, we want all references to the rpc_task to be gone, and we want
any dentry references etc. held by that task to be released.

For this reason these functions call __rpc_wait_for_completion_task(),
followed by rpc_put_task() in the expectation that the latter will be
releasing the last reference to the rpc_task, and thus ensuring that the
callback_ops->rpc_release() has been called synchronously.

This patch fixes a race which exists due to the fact that
rpciod calls rpc_complete_task() (in order to wake up the callers of
__rpc_wait_for_completion_task()) and then subsequently calls
rpc_put_task() without ensuring that these two steps are done atomically.

In order to avoid adding new spin locks, the patch uses the existing
waitqueue spin lock to order the rpc_task reference count releases between
the waiting process and rpciod.
The common case where nobody is waiting for completion is optimised for by
checking if the RPC_TASK_ASYNC flag is cleared and/or if the rpc_task
reference count is 1: in those cases we drop trying to grab the spin lock,
and immediately free up the rpc_task.

Those few processes that need to put the rpc_task from inside an
asynchronous context and that do not care about ordering are given a new
helper: rpc_put_task_async().

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 4 ++--
 fs/nfs/unlink.c   | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 1ff76acc7e9..d1ed67145cf 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4150,7 +4150,7 @@ static void nfs4_lock_release(void *calldata)
 		task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp,
 				data->arg.lock_seqid);
 		if (!IS_ERR(task))
-			rpc_put_task(task);
+			rpc_put_task_async(task);
 		dprintk("%s: cancelling lock!\n", __func__);
 	} else
 		nfs_free_seqid(data->arg.lock_seqid);
@@ -5227,7 +5227,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr
 	if (IS_ERR(task))
 		ret = PTR_ERR(task);
 	else
-		rpc_put_task(task);
+		rpc_put_task_async(task);
 	dprintk("<-- %s status=%d\n", __func__, ret);
 	return ret;
 }
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index e313a51acdd..6481d537d69 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -180,7 +180,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
 	task_setup_data.rpc_client = NFS_CLIENT(dir);
 	task = rpc_run_task(&task_setup_data);
 	if (!IS_ERR(task))
-		rpc_put_task(task);
+		rpc_put_task_async(task);
 	return 1;
 }
 
-- 
cgit v1.2.3


From d2224e7afbf2a6556f4f8f25bc0e96d99ec4d2bd Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Sun, 6 Mar 2011 17:14:13 +0000
Subject: nfs: close NFSv4 COMMIT vs. CLOSE race

I've been adding in more artificial delays in the NFSv4 commit and close
codepaths to uncover races. The kernel I'm testing has the patch to
close the race in __rpc_wait_for_completion_task that's in Trond's
cthon2011 branch. The reproducer I've been using does this in a loop:

	mkdir("DIR");
	fd = open("DIR/FILE", O_WRONLY|O_CREAT|O_EXCL, 0644);
	write(fd, "abcdefg", 7);
	close(fd);
	unlink("DIR/FILE");
	rmdir("DIR");

The above reproducer shouldn't result in any silly-renaming. However,
when I add a "msleep(100)" just after the nfs_commit_clear_lock call in
nfs_commit_release, I can almost always force one to occur. If I can
force it to occur with that, then it can happen without that delay
given the right timing.

nfs_commit_inode waits for the NFS_INO_COMMIT bit to clear when called
with FLUSH_SYNC set. nfs_commit_rpcsetup on the other hand does not wait
for the task to complete before putting its reference to it, so the last
reference gets put in rpc_release task and gets queued to a workqueue.

In this situation, the last open context reference may be put by the
COMMIT release instead of the close() syscall. The close() syscall
returns too quickly and the unlink runs while the d_count is still
high since the COMMIT release hasn't put its dentry reference yet.

Fix this by having nfs_commit_rpcsetup wait for the RPC call to complete
before putting the task reference when FLUSH_SYNC is set. With this, the
last reference is put by the process that's initiating the FLUSH_SYNC
commit and the race is closed.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/write.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index c8278f4046c..42b92d7a9cc 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1292,6 +1292,8 @@ static int nfs_commit_rpcsetup(struct list_head *head,
 	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task))
 		return PTR_ERR(task);
+	if (how & FLUSH_SYNC)
+		rpc_wait_for_completion_task(task);
 	rpc_put_task(task);
 	return 0;
 }
-- 
cgit v1.2.3


From b9f810570d9cc13177128e11a74e22d37aa68a1a Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <kernel@fomichev.me>
Date: Sat, 5 Feb 2011 23:13:01 +0000
Subject: nfs: add kmalloc return value check in decode_and_add_ds

add kmalloc return value check in decode_and_add_ds

Signed-off-by: Stanislav Fomichev <kernel@fomichev.me>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4filelayoutdev.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index f5c9b125e8c..b73c34375f6 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -219,6 +219,10 @@ decode_and_add_ds(__be32 **pp, struct inode *inode)
 		goto out_err;
 	}
 	buf = kmalloc(rlen + 1, GFP_KERNEL);
+	if (!buf) {
+		dprintk("%s: Not enough memory\n", __func__);
+		goto out_err;
+	}
 	buf[rlen] = '\0';
 	memcpy(buf, r_addr, rlen);
 
-- 
cgit v1.2.3


From 43b7c3f051dea504afccc39bcb56d8e26c2e0b77 Mon Sep 17 00:00:00 2001
From: Jovi Zhang <bookjovi@gmail.com>
Date: Wed, 2 Mar 2011 23:19:37 +0000
Subject: nfs: fix compilation warning

This commit fixes the following compilation warning:
linux-2.6/fs/nfs/nfs4proc.c:3265: warning: comparison of distinct pointer types lacks a cast

Signed-off-by: Jovi Zhang <bookjovi@gmail.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d1ed67145cf..b07d4e23b87 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3262,7 +3262,7 @@ static int buf_to_pages_noslab(const void *buf, size_t buflen,
 	spages = pages;
 
 	do {
-		len = min(PAGE_CACHE_SIZE, buflen);
+		len = min_t(size_t, PAGE_CACHE_SIZE, buflen);
 		newpage = alloc_page(GFP_KERNEL);
 
 		if (newpage == NULL)
-- 
cgit v1.2.3


From 3fa0b4e201d254b52a251fa348bd53e53000cff6 Mon Sep 17 00:00:00 2001
From: Frank Filz <ffilzlnx@us.ibm.com>
Date: Thu, 2 Dec 2010 19:31:23 +0000
Subject: (try3-resend) Fix nfs_compat_user_ino64 so it doesn't cause problems
 if bit 31 or 63 are set in fileid

The problem was use of an int32, which when converted to a uint64
is sign extended resulting in a fileid that doesn't fit in 32 bits
even though the intent of the function is to fit the fileid into
32 bits.

Signed-off-by: Frank Filz <ffilzlnx@us.ibm.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
[Trond: Added an include for compat.h]
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 1cc600e77bb..2f8e61816d7 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -37,6 +37,7 @@
 #include <linux/inet.h>
 #include <linux/nfs_xdr.h>
 #include <linux/slab.h>
+#include <linux/compat.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -89,7 +90,11 @@ int nfs_wait_bit_killable(void *word)
  */
 u64 nfs_compat_user_ino64(u64 fileid)
 {
-	int ino;
+#ifdef CONFIG_COMPAT
+	compat_ulong_t ino;
+#else	
+	unsigned long ino;
+#endif
 
 	if (enable_ino64)
 		return fileid;
-- 
cgit v1.2.3


From 7d6d63d6427090cbb1d282364b65b12634ca59bd Mon Sep 17 00:00:00 2001
From: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Date: Wed, 9 Mar 2011 13:13:44 -0500
Subject: NFSv4.1: Retry CREATE_SESSION on NFS4ERR_DELAY

Fix bug where we currently retry the EXCHANGEID call again, even though
we already have a valid clientid.  Instead, delay and retry the CREATE_SESSION
call.

Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index b07d4e23b87..d3c705aa71f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5028,10 +5028,20 @@ int nfs4_proc_create_session(struct nfs_client *clp)
 	int status;
 	unsigned *ptr;
 	struct nfs4_session *session = clp->cl_session;
+	long timeout = 0;
+	int err;
 
 	dprintk("--> %s clp=%p session=%p\n", __func__, clp, session);
 
-	status = _nfs4_proc_create_session(clp);
+	do {
+		status = _nfs4_proc_create_session(clp);
+		if (status == -NFS4ERR_DELAY) {
+			err = nfs4_delay(clp->cl_rpcclient, &timeout);
+			if (err)
+				status = err;
+		}
+	} while (status == -NFS4ERR_DELAY);
+
 	if (status)
 		goto out;
 
-- 
cgit v1.2.3


From 114f64b5f24abac33a42f4f1856eb3a9766d497e Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Wed, 9 Mar 2011 13:13:45 -0500
Subject: NFSv4: remove duplicate clientid in struct nfs_client

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4e2c168b6ee..94d50e86a12 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1660,7 +1660,7 @@ static void encode_create_session(struct xdr_stream *xdr,
 
 	p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12);
 	*p++ = cpu_to_be32(OP_CREATE_SESSION);
-	p = xdr_encode_hyper(p, clp->cl_ex_clid);
+	p = xdr_encode_hyper(p, clp->cl_clientid);
 	*p++ = cpu_to_be32(clp->cl_seqid);			/*Sequence id */
 	*p++ = cpu_to_be32(args->flags);			/*flags */
 
@@ -4694,7 +4694,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 	p = xdr_inline_decode(xdr, 8);
 	if (unlikely(!p))
 		goto out_overflow;
-	xdr_decode_hyper(p, &clp->cl_ex_clid);
+	xdr_decode_hyper(p, &clp->cl_clientid);
 	p = xdr_inline_decode(xdr, 12);
 	if (unlikely(!p))
 		goto out_overflow;
-- 
cgit v1.2.3


From c34c32ea97718bb24fc06158733580003ba89211 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Wed, 9 Mar 2011 13:13:46 -0500
Subject: NFSv4.1 reclaim complete must wait for completion

Signed-off-by: Andy Adamson <andros@netapp.com>
[Trond: fix whitespace errors]
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d3c705aa71f..0a13ae720dd 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5359,6 +5359,9 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
 		status = PTR_ERR(task);
 		goto out;
 	}
+	status = nfs4_wait_for_completion_rpc_task(task);
+	if (status == 0)
+		status = task->tk_status;
 	rpc_put_task(task);
 	return 0;
 out:
-- 
cgit v1.2.3


From d882962f6af2b484b62a7fb05ef959e1bf355fc4 Mon Sep 17 00:00:00 2001
From: "Matthew L. Creech" <mlcreech@gmail.com>
Date: Fri, 4 Mar 2011 17:55:02 -0500
Subject: UBIFS: handle allocation failures in UBIFS write path

Running kernel 2.6.37, my PPC-based device occasionally gets an
order-2 allocation failure in UBIFS, which causes the root FS to
become unwritable:

kswapd0: page allocation failure. order:2, mode:0x4050
Call Trace:
[c787dc30] [c00085b8] show_stack+0x7c/0x194 (unreliable)
[c787dc70] [c0061aec] __alloc_pages_nodemask+0x4f0/0x57c
[c787dd00] [c0061b98] __get_free_pages+0x20/0x50
[c787dd10] [c00e4f88] ubifs_jnl_write_data+0x54/0x200
[c787dd50] [c00e82d4] do_writepage+0x94/0x198
[c787dd90] [c00675e4] shrink_page_list+0x40c/0x77c
[c787de40] [c0067de0] shrink_inactive_list+0x1e0/0x370
[c787de90] [c0068224] shrink_zone+0x2b4/0x2b8
[c787df00] [c0068854] kswapd+0x408/0x5d4
[c787dfb0] [c0037bcc] kthread+0x80/0x84
[c787dff0] [c000ef44] kernel_thread+0x4c/0x68

Similar problems were encountered last April by Tomasz Stanislawski:

http://patchwork.ozlabs.org/patch/50965/

This patch implements Artem's suggested fix: fall back to a
mutex-protected static buffer, allocated at mount time.  I tested it
by forcing execution down the failure path, and didn't see any ill
effects.

Artem: massaged the patch a little, improved it so that we'd not
allocate the write reserve buffer when we are in R/O mode.

Signed-off-by: Matthew L. Creech <mlcreech@gmail.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/journal.c | 28 ++++++++++++++++++++++------
 fs/ubifs/super.c   | 18 ++++++++++++++++++
 fs/ubifs/ubifs.h   | 14 ++++++++++++++
 3 files changed, 54 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 914f1bd89e5..aed25e86422 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -690,7 +690,7 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
 {
 	struct ubifs_data_node *data;
 	int err, lnum, offs, compr_type, out_len;
-	int dlen = UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR;
+	int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1;
 	struct ubifs_inode *ui = ubifs_inode(inode);
 
 	dbg_jnl("ino %lu, blk %u, len %d, key %s",
@@ -698,9 +698,19 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
 		DBGKEY(key));
 	ubifs_assert(len <= UBIFS_BLOCK_SIZE);
 
-	data = kmalloc(dlen, GFP_NOFS);
-	if (!data)
-		return -ENOMEM;
+	data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN);
+	if (!data) {
+		/*
+		 * Fall-back to the write reserve buffer. Note, we might be
+		 * currently on the memory reclaim path, when the kernel is
+		 * trying to free some memory by writing out dirty pages. The
+		 * write reserve buffer helps us to guarantee that we are
+		 * always able to write the data.
+		 */
+		allocated = 0;
+		mutex_lock(&c->write_reserve_mutex);
+		data = c->write_reserve_buf;
+	}
 
 	data->ch.node_type = UBIFS_DATA_NODE;
 	key_write(c, key, &data->key);
@@ -736,7 +746,10 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
 		goto out_ro;
 
 	finish_reservation(c);
-	kfree(data);
+	if (!allocated)
+		mutex_unlock(&c->write_reserve_mutex);
+	else
+		kfree(data);
 	return 0;
 
 out_release:
@@ -745,7 +758,10 @@ out_ro:
 	ubifs_ro_mode(c, err);
 	finish_reservation(c);
 out_free:
-	kfree(data);
+	if (!allocated)
+		mutex_unlock(&c->write_reserve_mutex);
+	else
+		kfree(data);
 	return err;
 }
 
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index d4b4cb4596e..e360c7a71f9 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1213,6 +1213,13 @@ static int mount_ubifs(struct ubifs_info *c)
 	if (c->bulk_read == 1)
 		bu_init(c);
 
+	if (!c->ro_mount) {
+		c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ,
+					       GFP_KERNEL);
+		if (!c->write_reserve_buf)
+			goto out_free;
+	}
+
 	c->mounting = 1;
 
 	err = ubifs_read_superblock(c);
@@ -1482,6 +1489,7 @@ out_wbufs:
 out_cbuf:
 	kfree(c->cbuf);
 out_free:
+	kfree(c->write_reserve_buf);
 	kfree(c->bu.buf);
 	vfree(c->ileb_buf);
 	vfree(c->sbuf);
@@ -1520,6 +1528,7 @@ static void ubifs_umount(struct ubifs_info *c)
 	kfree(c->cbuf);
 	kfree(c->rcvrd_mst_node);
 	kfree(c->mst_node);
+	kfree(c->write_reserve_buf);
 	kfree(c->bu.buf);
 	vfree(c->ileb_buf);
 	vfree(c->sbuf);
@@ -1605,6 +1614,10 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 		goto out;
 	}
 
+	c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, GFP_KERNEL);
+	if (!c->write_reserve_buf)
+		goto out;
+
 	err = ubifs_lpt_init(c, 0, 1);
 	if (err)
 		goto out;
@@ -1669,6 +1682,8 @@ out:
 		c->bgt = NULL;
 	}
 	free_wbufs(c);
+	kfree(c->write_reserve_buf);
+	c->write_reserve_buf = NULL;
 	vfree(c->ileb_buf);
 	c->ileb_buf = NULL;
 	ubifs_lpt_free(c, 1);
@@ -1712,6 +1727,8 @@ static void ubifs_remount_ro(struct ubifs_info *c)
 	free_wbufs(c);
 	vfree(c->orph_buf);
 	c->orph_buf = NULL;
+	kfree(c->write_reserve_buf);
+	c->write_reserve_buf = NULL;
 	vfree(c->ileb_buf);
 	c->ileb_buf = NULL;
 	ubifs_lpt_free(c, 1);
@@ -1942,6 +1959,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
 	mutex_init(&c->mst_mutex);
 	mutex_init(&c->umount_mutex);
 	mutex_init(&c->bu_mutex);
+	mutex_init(&c->write_reserve_mutex);
 	init_waitqueue_head(&c->cmt_wq);
 	c->buds = RB_ROOT;
 	c->old_idx = RB_ROOT;
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 36249507848..8c40ad3c672 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -151,6 +151,12 @@
  */
 #define WORST_COMPR_FACTOR 2
 
+/*
+ * How much memory is needed for a buffer where we compress a data node.
+ */
+#define COMPRESSED_DATA_NODE_BUF_SZ \
+	(UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR)
+
 /* Maximum expected tree height for use by bottom_up_buf */
 #define BOTTOM_UP_HEIGHT 64
 
@@ -1005,6 +1011,11 @@ struct ubifs_debug_info;
  * @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu
  * @bu: pre-allocated bulk-read information
  *
+ * @write_reserve_mutex: protects @write_reserve_buf
+ * @write_reserve_buf: on the write path we allocate memory, which might
+ *                     sometimes be unavailable, in which case we use this
+ *                     write reserve buffer
+ *
  * @log_lebs: number of logical eraseblocks in the log
  * @log_bytes: log size in bytes
  * @log_last: last LEB of the log
@@ -1256,6 +1267,9 @@ struct ubifs_info {
 	struct mutex bu_mutex;
 	struct bu_info bu;
 
+	struct mutex write_reserve_mutex;
+	void *write_reserve_buf;
+
 	int log_lebs;
 	long long log_bytes;
 	int log_last;
-- 
cgit v1.2.3


From 6342aaebda9b94e3cd101ba13eee690ac6577124 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Tue, 8 Mar 2011 14:26:47 +0200
Subject: UBIFS: print max. index node size

Improve debugging messages by printing the maximum index node size
on mount.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index e360c7a71f9..e9585ad90f5 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1447,9 +1447,9 @@ static int mount_ubifs(struct ubifs_info *c)
 		UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ);
 	dbg_msg("node sizes:          ref %zu, cmt. start %zu, orph %zu",
 		UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ);
-	dbg_msg("max. node sizes:     data %zu, inode %zu dentry %zu",
+	dbg_msg("max. node sizes:     data %zu, inode %zu dentry %zu, idx %d",
 	        UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ,
-		UBIFS_MAX_DENT_NODE_SZ);
+		UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout));
 	dbg_msg("dead watermark:      %d", c->dead_wm);
 	dbg_msg("dark watermark:      %d", c->dark_wm);
 	dbg_msg("LEB overhead:        %d", c->leb_overhead);
-- 
cgit v1.2.3


From cce3f612fedcbeee61977497b99bbf68a4082b6b Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Wed, 9 Mar 2011 13:36:23 +0200
Subject: UBIFS: simplify UBIFS Kconfig menu

Remove debug message level and debug checks Kconfig options as they
proved to be useless anyway. We have sysfs interface which we can
use for fine-grained debugging messages and checks selection, see
Documentation/filesystems/ubifs.txt for more details.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/Kconfig | 23 ++++++++++-------------
 fs/ubifs/debug.c |  4 ++--
 fs/ubifs/debug.h | 22 ----------------------
 3 files changed, 12 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index 830e3f76f44..1d1859dc3de 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -44,23 +44,20 @@ config UBIFS_FS_ZLIB
 
 # Debugging-related stuff
 config UBIFS_FS_DEBUG
-	bool "Enable debugging"
+	bool "Enable debugging support"
 	depends on UBIFS_FS
 	select DEBUG_FS
 	select KALLSYMS_ALL
 	help
-	  This option enables UBIFS debugging.
-
-config UBIFS_FS_DEBUG_MSG_LVL
-	int "Default message level (0 = no extra messages, 3 = lots)"
-	depends on UBIFS_FS_DEBUG
-	default "0"
-	help
-	  This controls the amount of debugging messages produced by UBIFS.
-	  If reporting bugs, please try to have available a full dump of the
-	  messages at level 1 while the misbehaviour was occurring. Level 2
-	  may become necessary if level 1 messages were not enough to find the
-	  bug. Generally Level 3 should be avoided.
+	  This option enables UBIFS debugging support. It makes sure various
+	  assertions, self-checks, debugging messages and test modes are compiled
+	  in (this all is compiled out otherwise). Assertions are light-weight
+	  and this option also enables them. Self-checks, debugging messages and
+	  test modes are switched off by default. Thus, it is safe and actually
+	  recommended to have debugging support enabled, and it should not slow
+	  down UBIFS. You can then further enable / disable individual  debugging
+	  features using UBIFS module parameters and the corresponding sysfs
+	  interfaces.
 
 config UBIFS_FS_DEBUG_CHKS
 	bool "Enable extra checks"
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index bcb1acb7926..02c10dccdd6 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -43,8 +43,8 @@ DEFINE_SPINLOCK(dbg_lock);
 static char dbg_key_buf0[128];
 static char dbg_key_buf1[128];
 
-unsigned int ubifs_msg_flags = UBIFS_MSG_FLAGS_DEFAULT;
-unsigned int ubifs_chk_flags = UBIFS_CHK_FLAGS_DEFAULT;
+unsigned int ubifs_msg_flags;
+unsigned int ubifs_chk_flags;
 unsigned int ubifs_tst_flags;
 
 module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR);
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 69ebe472915..10190c18981 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -205,12 +205,6 @@ enum {
 	UBIFS_MSG_RCVRY = 0x1000,
 };
 
-/* Debugging message type flags for each default debug message level */
-#define UBIFS_MSG_LVL_0 0
-#define UBIFS_MSG_LVL_1 0x1
-#define UBIFS_MSG_LVL_2 0x7f
-#define UBIFS_MSG_LVL_3 0xffff
-
 /*
  * Debugging check flags (must match chk_names in debug.c).
  *
@@ -243,22 +237,6 @@ enum {
 	UBIFS_TST_RCVRY             = 0x4,
 };
 
-#if CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 1
-#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_1
-#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 2
-#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_2
-#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 3
-#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_3
-#else
-#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_0
-#endif
-
-#ifdef CONFIG_UBIFS_FS_DEBUG_CHKS
-#define UBIFS_CHK_FLAGS_DEFAULT 0xffffffff
-#else
-#define UBIFS_CHK_FLAGS_DEFAULT 0
-#endif
-
 extern spinlock_t dbg_lock;
 
 extern unsigned int ubifs_msg_flags;
-- 
cgit v1.2.3


From 2bcf002159c2aedd5c0ab5a21c3ea73fec87ff8d Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Thu, 10 Mar 2011 16:26:32 +0200
Subject: UBIFS: do not check data crc by default

Change the default UBIFS behavior WRT data CRC checking. Currently,
UBIFS checks data CRC when reading, which slows it down quite a bit,
and this is the default option. However, it looks like the average
user does not need this feature and would prefer faster read speed
over extra reliability. And this seems to be de-facto standard that
file-systems do not check data CRC every time they read from the
media.

Thus, make UBIFS default behavior so that it does not check data
CRC. This corresponds to the no_chk_data_crc mount option. Those users
who need extra protection can always enable it using the chk_data_crc
option.

Please, read more information about this feature here:
http://www.linux-mtd.infradead.org/doc/ubifs.html#L_checksumming

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index e9585ad90f5..1da5155a1be 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1977,6 +1977,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
 	INIT_LIST_HEAD(&c->old_buds);
 	INIT_LIST_HEAD(&c->orph_list);
 	INIT_LIST_HEAD(&c->orph_new);
+	c->no_chk_data_crc = 1;
 
 	c->vfs_sb = sb;
 	c->highest_inum = UBIFS_FIRST_INO;
-- 
cgit v1.2.3


From fa1bbdea300a15ec7c1186a5e53de33a5b1672f5 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Thu, 10 Mar 2011 11:41:57 -0500
Subject: GFS2: Optimize glock multiple-dequeue code

This is a small patch that optimizes multiple glock dequeue
operations.  It changes the unlock order to be more efficient
and makes it easier for lock debugging tools to unravel.  It
also eliminates the need for the temp variable x, although
that would likely be optimized out.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 3f45a14009b..8648409be45 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1248,10 +1248,8 @@ int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs)
 
 void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
 {
-	unsigned int x;
-
-	for (x = 0; x < num_gh; x++)
-		gfs2_glock_dq(&ghs[x]);
+	while (num_gh--)
+		gfs2_glock_dq(&ghs[num_gh]);
 }
 
 /**
@@ -1263,10 +1261,8 @@ void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
 
 void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
 {
-	unsigned int x;
-
-	for (x = 0; x < num_gh; x++)
-		gfs2_glock_dq_uninit(&ghs[x]);
+	while (num_gh--)
+		gfs2_glock_dq_uninit(&ghs[num_gh]);
 }
 
 void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
-- 
cgit v1.2.3


From e4a7b7b0c98efcdcc4c1f6eb10925dec1fbc4016 Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski <bmarzins@redhat.com>
Date: Fri, 11 Mar 2011 00:49:09 -0600
Subject: GFS2: fix block allocation check for fallocate

GFS2 fallocate wasn't properly checking if blocks were already allocated.
In write_empty_blocks(), if a page didn't have buffer_heads attached, GFS2
was always treating it as if there were no blocks allocated for that page.
GFS2 now calls gfs2_block_map() to check if the blocks are allocated before
writing them out.

Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/file.c | 56 +++++++++++++++++++++++++++++++-------------------------
 1 file changed, 31 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 2878481f72a..4074b952b05 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -622,8 +622,7 @@ static void empty_write_end(struct page *page, unsigned from,
 {
 	struct gfs2_inode *ip = GFS2_I(page->mapping->host);
 
-	page_zero_new_buffers(page, from, to);
-	flush_dcache_page(page);
+	zero_user(page, from, to-from);
 	mark_page_accessed(page);
 
 	if (!gfs2_is_writeback(ip))
@@ -632,36 +631,43 @@ static void empty_write_end(struct page *page, unsigned from,
 	block_commit_write(page, from, to);
 }
 
-static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
+static int needs_empty_write(sector_t block, struct inode *inode)
 {
-	unsigned start, end, next;
-	struct buffer_head *bh, *head;
 	int error;
+	struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
 
-	if (!page_has_buffers(page)) {
-		error = __block_write_begin(page, from, to - from, gfs2_block_map);
-		if (unlikely(error))
-			return error;
+	bh_map.b_size = 1 << inode->i_blkbits;
+	error = gfs2_block_map(inode, block, &bh_map, 0);
+	if (unlikely(error))
+		return error;
+	return !buffer_mapped(&bh_map);
+}
 
-		empty_write_end(page, from, to);
-		return 0;
-	}
+static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
+{
+	struct inode *inode = page->mapping->host;
+	unsigned start, end, next, blksize;
+	sector_t block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	int ret;
 
-	bh = head = page_buffers(page);
+	blksize = 1 << inode->i_blkbits;
 	next = end = 0;
 	while (next < from) {
-		next += bh->b_size;
-		bh = bh->b_this_page;
+		next += blksize;
+		block++;
 	}
 	start = next;
 	do {
-		next += bh->b_size;
-		if (buffer_mapped(bh)) {
+		next += blksize;
+		ret = needs_empty_write(block, inode);
+		if (unlikely(ret < 0))
+			return ret;
+		if (ret == 0) {
 			if (end) {
-				error = __block_write_begin(page, start, end - start,
-							    gfs2_block_map);
-				if (unlikely(error))
-					return error;
+				ret = __block_write_begin(page, start, end - start,
+							  gfs2_block_map);
+				if (unlikely(ret))
+					return ret;
 				empty_write_end(page, start, end);
 				end = 0;
 			}
@@ -669,13 +675,13 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
 		}
 		else
 			end = next;
-		bh = bh->b_this_page;
+		block++;
 	} while (next < to);
 
 	if (end) {
-		error = __block_write_begin(page, start, end - start, gfs2_block_map);
-		if (unlikely(error))
-			return error;
+		ret = __block_write_begin(page, start, end - start, gfs2_block_map);
+		if (unlikely(ret))
+			return ret;
 		empty_write_end(page, start, end);
 	}
 
-- 
cgit v1.2.3


From d6a079e82efd5fcbb1c7295f22e123c2cc748018 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 11 Mar 2011 11:52:25 +0000
Subject: GFS2: introduce AIL lock

The log lock is currently used to protect the AIL lists and
the movements of buffers into and out of them. The lists
are self contained and no log specific items outside the
lists are accessed when starting or emptying the AIL lists.

Hence the operation of the AIL does not require the protection
of the log lock so split them out into a new AIL specific lock
to reduce the amount of traffic on the log lock. This will
also reduce the amount of serialisation that occurs when
the gfs2_logd pushes on the AIL to move it forward.

This reduces the impact of log pushing on sequential write
throughput.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glops.c      | 10 ++++++++--
 fs/gfs2/incore.h     |  1 +
 fs/gfs2/log.c        | 30 ++++++++++++++++--------------
 fs/gfs2/lops.c       |  5 +++--
 fs/gfs2/ops_fstype.c |  1 +
 5 files changed, 29 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index ac5fac948f8..3754e3cbf02 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -56,20 +56,26 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
 	BUG_ON(current->journal_info);
 	current->journal_info = &tr;
 
-	gfs2_log_lock(sdp);
+	spin_lock(&sdp->sd_ail_lock);
 	while (!list_empty(head)) {
 		bd = list_entry(head->next, struct gfs2_bufdata,
 				bd_ail_gl_list);
 		bh = bd->bd_bh;
 		gfs2_remove_from_ail(bd);
+		spin_unlock(&sdp->sd_ail_lock);
+
 		bd->bd_bh = NULL;
 		bh->b_private = NULL;
 		bd->bd_blkno = bh->b_blocknr;
+		gfs2_log_lock(sdp);
 		gfs2_assert_withdraw(sdp, !buffer_busy(bh));
 		gfs2_trans_add_revoke(sdp, bd);
+		gfs2_log_unlock(sdp);
+
+		spin_lock(&sdp->sd_ail_lock);
 	}
 	gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
-	gfs2_log_unlock(sdp);
+	spin_unlock(&sdp->sd_ail_lock);
 
 	gfs2_trans_end(sdp);
 	gfs2_log_flush(sdp, NULL);
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 59aaaa05113..870a89d6d4d 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -651,6 +651,7 @@ struct gfs2_sbd {
 	unsigned int sd_log_flush_head;
 	u64 sd_log_flush_wrapped;
 
+	spinlock_t sd_ail_lock;
 	struct list_head sd_ail1_list;
 	struct list_head sd_ail2_list;
 	u64 sd_ail_sync_gen;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index eb01f3575e1..4e3c044934e 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -88,8 +88,8 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
  */
 
 static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
-__releases(&sdp->sd_log_lock)
-__acquires(&sdp->sd_log_lock)
+__releases(&sdp->sd_ail_lock)
+__acquires(&sdp->sd_ail_lock)
 {
 	struct gfs2_bufdata *bd, *s;
 	struct buffer_head *bh;
@@ -117,7 +117,7 @@ __acquires(&sdp->sd_log_lock)
 			list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);
 
 			get_bh(bh);
-			gfs2_log_unlock(sdp);
+			spin_unlock(&sdp->sd_ail_lock);
 			lock_buffer(bh);
 			if (test_clear_buffer_dirty(bh)) {
 				bh->b_end_io = end_buffer_write_sync;
@@ -126,7 +126,7 @@ __acquires(&sdp->sd_log_lock)
 				unlock_buffer(bh);
 				brelse(bh);
 			}
-			gfs2_log_lock(sdp);
+			spin_lock(&sdp->sd_ail_lock);
 
 			retry = 1;
 			break;
@@ -175,10 +175,10 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp)
 	struct gfs2_ail *ai;
 	int done = 0;
 
-	gfs2_log_lock(sdp);
+	spin_lock(&sdp->sd_ail_lock);
 	head = &sdp->sd_ail1_list;
 	if (list_empty(head)) {
-		gfs2_log_unlock(sdp);
+		spin_unlock(&sdp->sd_ail_lock);
 		return;
 	}
 	sync_gen = sdp->sd_ail_sync_gen++;
@@ -189,13 +189,13 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp)
 			if (ai->ai_sync_gen >= sync_gen)
 				continue;
 			ai->ai_sync_gen = sync_gen;
-			gfs2_ail1_start_one(sdp, ai); /* This may drop log lock */
+			gfs2_ail1_start_one(sdp, ai); /* This may drop ail lock */
 			done = 0;
 			break;
 		}
 	}
 
-	gfs2_log_unlock(sdp);
+	spin_unlock(&sdp->sd_ail_lock);
 }
 
 static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
@@ -203,7 +203,7 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
 	struct gfs2_ail *ai, *s;
 	int ret;
 
-	gfs2_log_lock(sdp);
+	spin_lock(&sdp->sd_ail_lock);
 
 	list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) {
 		if (gfs2_ail1_empty_one(sdp, ai, flags))
@@ -214,7 +214,7 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
 
 	ret = list_empty(&sdp->sd_ail1_list);
 
-	gfs2_log_unlock(sdp);
+	spin_unlock(&sdp->sd_ail_lock);
 
 	return ret;
 }
@@ -247,7 +247,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
 	int wrap = (new_tail < old_tail);
 	int a, b, rm;
 
-	gfs2_log_lock(sdp);
+	spin_lock(&sdp->sd_ail_lock);
 
 	list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) {
 		a = (old_tail <= ai->ai_first);
@@ -263,7 +263,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
 		kfree(ai);
 	}
 
-	gfs2_log_unlock(sdp);
+	spin_unlock(&sdp->sd_ail_lock);
 }
 
 /**
@@ -421,7 +421,7 @@ static unsigned int current_tail(struct gfs2_sbd *sdp)
 	struct gfs2_ail *ai;
 	unsigned int tail;
 
-	gfs2_log_lock(sdp);
+	spin_lock(&sdp->sd_ail_lock);
 
 	if (list_empty(&sdp->sd_ail1_list)) {
 		tail = sdp->sd_log_head;
@@ -430,7 +430,7 @@ static unsigned int current_tail(struct gfs2_sbd *sdp)
 		tail = ai->ai_first;
 	}
 
-	gfs2_log_unlock(sdp);
+	spin_unlock(&sdp->sd_ail_lock);
 
 	return tail;
 }
@@ -743,10 +743,12 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
 	sdp->sd_log_commited_databuf = 0;
 	sdp->sd_log_commited_revoke = 0;
 
+	spin_lock(&sdp->sd_ail_lock);
 	if (!list_empty(&ai->ai_ail1_list)) {
 		list_add(&ai->ai_list, &sdp->sd_ail1_list);
 		ai = NULL;
 	}
+	spin_unlock(&sdp->sd_ail_lock);
 	gfs2_log_unlock(sdp);
 	trace_gfs2_log_flush(sdp, 0);
 	up_write(&sdp->sd_log_flush_lock);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 11a73efa826..4295a6a0f1e 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -80,7 +80,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
 	mark_buffer_dirty(bh);
 	clear_buffer_pinned(bh);
 
-	gfs2_log_lock(sdp);
+	spin_lock(&sdp->sd_ail_lock);
 	if (bd->bd_ail) {
 		list_del(&bd->bd_ail_st_list);
 		brelse(bh);
@@ -91,10 +91,11 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
 	}
 	bd->bd_ail = ai;
 	list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
+	spin_unlock(&sdp->sd_ail_lock);
+
 	if (test_and_clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags))
 		gfs2_glock_schedule_for_reclaim(bd->bd_gl);
 	trace_gfs2_pin(bd, 0);
-	gfs2_log_unlock(sdp);
 	unlock_buffer(bh);
 	atomic_dec(&sdp->sd_log_pinned);
 }
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 67654d0ba15..42ef24355af 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -99,6 +99,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 
 	init_waitqueue_head(&sdp->sd_log_waitq);
 	init_waitqueue_head(&sdp->sd_logd_waitq);
+	spin_lock_init(&sdp->sd_ail_lock);
 	INIT_LIST_HEAD(&sdp->sd_ail1_list);
 	INIT_LIST_HEAD(&sdp->sd_ail2_list);
 
-- 
cgit v1.2.3


From 0400a6b0cb756f976bae32ae8db47bfa9853897c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 9 Mar 2011 16:00:53 -0500
Subject: NFSv4/4.1: Fix nfs4_schedule_state_recovery abuses

nfs4_schedule_state_recovery() should only be used when we need to force
the state manager to check the lease. If we just want to start the
state manager in order to handle a state recovery situation, we should be
using nfs4_schedule_state_manager().

This patch fixes the abuses of nfs4_schedule_state_recovery() by replacing
its use with a set of helper functions that do the right thing.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4_fs.h   |  8 +++++++-
 fs/nfs/nfs4proc.c  | 45 +++++++++++++++++++++++----------------------
 fs/nfs/nfs4state.c | 25 +++++++++++++++++++------
 3 files changed, 49 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 7a747407314..54ff900cb8f 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -298,6 +298,11 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
 #if defined(CONFIG_NFS_V4_1)
 struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
 struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp);
+extern void nfs4_schedule_session_recovery(struct nfs4_session *);
+#else
+static inline void nfs4_schedule_session_recovery(struct nfs4_session *session)
+{
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
@@ -307,8 +312,9 @@ extern void nfs4_put_open_state(struct nfs4_state *);
 extern void nfs4_close_state(struct path *, struct nfs4_state *, fmode_t);
 extern void nfs4_close_sync(struct path *, struct nfs4_state *, fmode_t);
 extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
-extern void nfs4_schedule_state_recovery(struct nfs_client *);
+extern void nfs4_schedule_lease_recovery(struct nfs_client *);
 extern void nfs4_schedule_state_manager(struct nfs_client *);
+extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
 extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state);
 extern int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state);
 extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 0a13ae720dd..411dc80d065 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -257,12 +257,13 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
 		case -NFS4ERR_OPENMODE:
 			if (state == NULL)
 				break;
-			nfs4_state_mark_reclaim_nograce(clp, state);
-			goto do_state_recovery;
+			nfs4_schedule_stateid_recovery(server, state);
+			goto wait_on_recovery;
 		case -NFS4ERR_STALE_STATEID:
 		case -NFS4ERR_STALE_CLIENTID:
 		case -NFS4ERR_EXPIRED:
-			goto do_state_recovery;
+			nfs4_schedule_lease_recovery(clp);
+			goto wait_on_recovery;
 #if defined(CONFIG_NFS_V4_1)
 		case -NFS4ERR_BADSESSION:
 		case -NFS4ERR_BADSLOT:
@@ -273,7 +274,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
 		case -NFS4ERR_SEQ_MISORDERED:
 			dprintk("%s ERROR: %d Reset session\n", __func__,
 				errorcode);
-			nfs4_schedule_state_recovery(clp);
+			nfs4_schedule_session_recovery(clp->cl_session);
 			exception->retry = 1;
 			break;
 #endif /* defined(CONFIG_NFS_V4_1) */
@@ -296,8 +297,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
 	}
 	/* We failed to handle the error */
 	return nfs4_map_errors(ret);
-do_state_recovery:
-	nfs4_schedule_state_recovery(clp);
+wait_on_recovery:
 	ret = nfs4_wait_clnt_recover(clp);
 	if (ret == 0)
 		exception->retry = 1;
@@ -1256,14 +1256,13 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
 			case -NFS4ERR_BAD_HIGH_SLOT:
 			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
 			case -NFS4ERR_DEADSESSION:
-				nfs4_schedule_state_recovery(
-					server->nfs_client);
+				nfs4_schedule_session_recovery(server->nfs_client->cl_session);
 				goto out;
 			case -NFS4ERR_STALE_CLIENTID:
 			case -NFS4ERR_STALE_STATEID:
 			case -NFS4ERR_EXPIRED:
 				/* Don't recall a delegation if it was lost */
-				nfs4_schedule_state_recovery(server->nfs_client);
+				nfs4_schedule_lease_recovery(server->nfs_client);
 				goto out;
 			case -ERESTARTSYS:
 				/*
@@ -1272,7 +1271,7 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
 				 */
 			case -NFS4ERR_ADMIN_REVOKED:
 			case -NFS4ERR_BAD_STATEID:
-				nfs4_state_mark_reclaim_nograce(server->nfs_client, state);
+				nfs4_schedule_stateid_recovery(server, state);
 			case -EKEYEXPIRED:
 				/*
 				 * User RPCSEC_GSS context has expired.
@@ -1588,7 +1587,7 @@ static int nfs4_recover_expired_lease(struct nfs_server *server)
 		if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
 		    !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state))
 			break;
-		nfs4_schedule_state_recovery(clp);
+		nfs4_schedule_state_manager(clp);
 		ret = -EIO;
 	}
 	return ret;
@@ -3179,7 +3178,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata)
 	if (task->tk_status < 0) {
 		/* Unless we're shutting down, schedule state recovery! */
 		if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0)
-			nfs4_schedule_state_recovery(clp);
+			nfs4_schedule_lease_recovery(clp);
 		return;
 	}
 	do_renew_lease(clp, timestamp);
@@ -3504,12 +3503,13 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 		case -NFS4ERR_OPENMODE:
 			if (state == NULL)
 				break;
-			nfs4_state_mark_reclaim_nograce(clp, state);
-			goto do_state_recovery;
+			nfs4_schedule_stateid_recovery(server, state);
+			goto wait_on_recovery;
 		case -NFS4ERR_STALE_STATEID:
 		case -NFS4ERR_STALE_CLIENTID:
 		case -NFS4ERR_EXPIRED:
-			goto do_state_recovery;
+			nfs4_schedule_lease_recovery(clp);
+			goto wait_on_recovery;
 #if defined(CONFIG_NFS_V4_1)
 		case -NFS4ERR_BADSESSION:
 		case -NFS4ERR_BADSLOT:
@@ -3520,7 +3520,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 		case -NFS4ERR_SEQ_MISORDERED:
 			dprintk("%s ERROR %d, Reset session\n", __func__,
 				task->tk_status);
-			nfs4_schedule_state_recovery(clp);
+			nfs4_schedule_session_recovery(clp->cl_session);
 			task->tk_status = 0;
 			return -EAGAIN;
 #endif /* CONFIG_NFS_V4_1 */
@@ -3537,9 +3537,8 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 	}
 	task->tk_status = nfs4_map_errors(task->tk_status);
 	return 0;
-do_state_recovery:
+wait_on_recovery:
 	rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
-	nfs4_schedule_state_recovery(clp);
 	if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
 		rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
 	task->tk_status = 0;
@@ -4406,12 +4405,14 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
 			case -NFS4ERR_EXPIRED:
 			case -NFS4ERR_STALE_CLIENTID:
 			case -NFS4ERR_STALE_STATEID:
+				nfs4_schedule_lease_recovery(server->nfs_client);
+				goto out;
 			case -NFS4ERR_BADSESSION:
 			case -NFS4ERR_BADSLOT:
 			case -NFS4ERR_BAD_HIGH_SLOT:
 			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
 			case -NFS4ERR_DEADSESSION:
-				nfs4_schedule_state_recovery(server->nfs_client);
+				nfs4_schedule_session_recovery(server->nfs_client->cl_session);
 				goto out;
 			case -ERESTARTSYS:
 				/*
@@ -4421,7 +4422,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
 			case -NFS4ERR_ADMIN_REVOKED:
 			case -NFS4ERR_BAD_STATEID:
 			case -NFS4ERR_OPENMODE:
-				nfs4_state_mark_reclaim_nograce(server->nfs_client, state);
+				nfs4_schedule_stateid_recovery(server, state);
 				err = 0;
 				goto out;
 			case -EKEYEXPIRED:
@@ -5150,7 +5151,7 @@ static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client
 		rpc_delay(task, NFS4_POLL_RETRY_MAX);
 		return -EAGAIN;
 	default:
-		nfs4_schedule_state_recovery(clp);
+		nfs4_schedule_lease_recovery(clp);
 	}
 	return 0;
 }
@@ -5291,7 +5292,7 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf
 		rpc_delay(task, NFS4_POLL_RETRY_MAX);
 		return -EAGAIN;
 	default:
-		nfs4_schedule_state_recovery(clp);
+		nfs4_schedule_lease_recovery(clp);
 	}
 	return 0;
 }
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index e6742b57a04..47c8dcdada8 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1007,9 +1007,9 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
 }
 
 /*
- * Schedule a state recovery attempt
+ * Schedule a lease recovery attempt
  */
-void nfs4_schedule_state_recovery(struct nfs_client *clp)
+void nfs4_schedule_lease_recovery(struct nfs_client *clp)
 {
 	if (!clp)
 		return;
@@ -1041,6 +1041,14 @@ int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *s
 	return 1;
 }
 
+void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_state *state)
+{
+	struct nfs_client *clp = server->nfs_client;
+
+	nfs4_state_mark_reclaim_nograce(clp, state);
+	nfs4_schedule_state_manager(clp);
+}
+
 static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops)
 {
 	struct inode *inode = state->inode;
@@ -1436,10 +1444,15 @@ static int nfs4_reclaim_lease(struct nfs_client *clp)
 }
 
 #ifdef CONFIG_NFS_V4_1
+void nfs4_schedule_session_recovery(struct nfs4_session *session)
+{
+	nfs4_schedule_lease_recovery(session->clp);
+}
+
 void nfs41_handle_recall_slot(struct nfs_client *clp)
 {
 	set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
-	nfs4_schedule_state_recovery(clp);
+	nfs4_schedule_state_manager(clp);
 }
 
 static void nfs4_reset_all_state(struct nfs_client *clp)
@@ -1447,7 +1460,7 @@ static void nfs4_reset_all_state(struct nfs_client *clp)
 	if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
 		clp->cl_boot_time = CURRENT_TIME;
 		nfs4_state_start_reclaim_nograce(clp);
-		nfs4_schedule_state_recovery(clp);
+		nfs4_schedule_state_manager(clp);
 	}
 }
 
@@ -1455,7 +1468,7 @@ static void nfs41_handle_server_reboot(struct nfs_client *clp)
 {
 	if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
 		nfs4_state_start_reclaim_reboot(clp);
-		nfs4_schedule_state_recovery(clp);
+		nfs4_schedule_state_manager(clp);
 	}
 }
 
@@ -1475,7 +1488,7 @@ static void nfs41_handle_cb_path_down(struct nfs_client *clp)
 {
 	nfs_expire_all_delegations(clp);
 	if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0)
-		nfs4_schedule_state_recovery(clp);
+		nfs4_schedule_state_manager(clp);
 }
 
 void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
-- 
cgit v1.2.3


From b4410c2f7f775b03da31566c05bb8d2383c7dc27 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 9 Mar 2011 16:00:55 -0500
Subject: NFSv4.1: Fix the handling of the SEQUENCE status bits

We want SEQUENCE status bits to be handled by the state manager in order
to avoid threading issues.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 411dc80d065..b0b1a556852 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -436,8 +436,8 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *
 		clp = res->sr_session->clp;
 		do_renew_lease(clp, timestamp);
 		/* Check sequence flags */
-		if (atomic_read(&clp->cl_count) > 1)
-			nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
+		if (res->sr_status_flags != 0)
+			nfs4_schedule_lease_recovery(clp);
 		break;
 	case -NFS4ERR_DELAY:
 		/* The server detected a resend of the RPC call and
@@ -5254,8 +5254,13 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
 		goto out;
 	}
 	ret = rpc_wait_for_completion_task(task);
-	if (!ret)
+	if (!ret) {
+		struct nfs4_sequence_res *res = task->tk_msg.rpc_resp;
+
+		if (task->tk_status == 0)
+			nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
 		ret = task->tk_status;
+	}
 	rpc_put_task(task);
 out:
 	dprintk("<-- %s status=%d\n", __func__, ret);
-- 
cgit v1.2.3


From ecac799a5ecc364006f0db6f2db15e77ed4d63e2 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 9 Mar 2011 16:00:56 -0500
Subject: NFSv4: Fix the setlk error handler

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index b0b1a556852..bf4e6d4b5bf 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4173,23 +4173,18 @@ static const struct rpc_call_ops nfs4_recover_lock_ops = {
 
 static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error)
 {
-	struct nfs_client *clp = server->nfs_client;
-	struct nfs4_state *state = lsp->ls_state;
-
 	switch (error) {
 	case -NFS4ERR_ADMIN_REVOKED:
 	case -NFS4ERR_BAD_STATEID:
-	case -NFS4ERR_EXPIRED:
+		lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
 		if (new_lock_owner != 0 ||
 		   (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
-			nfs4_state_mark_reclaim_nograce(clp, state);
-		lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
+			nfs4_schedule_stateid_recovery(server, lsp->ls_state);
 		break;
 	case -NFS4ERR_STALE_STATEID:
-		if (new_lock_owner != 0 ||
-		    (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
-			nfs4_state_mark_reclaim_reboot(clp, state);
 		lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
+	case -NFS4ERR_EXPIRED:
+		nfs4_schedule_lease_recovery(server->nfs_client);
 	};
 }
 
-- 
cgit v1.2.3


From f9feab1e180d1392f2f59d692826c6da2e57adf4 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 9 Mar 2011 16:12:46 -0500
Subject: NFSv4: nfs4_state_mark_reclaim_nograce() should be static

There are no more external users of nfs4_state_mark_reclaim_nograce() or
nfs4_state_mark_reclaim_reboot(), so mark them as static.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4_fs.h   | 2 --
 fs/nfs/nfs4state.c | 4 ++--
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 54ff900cb8f..1be36cf65bf 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -315,8 +315,6 @@ extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
 extern void nfs4_schedule_lease_recovery(struct nfs_client *);
 extern void nfs4_schedule_state_manager(struct nfs_client *);
 extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
-extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state);
-extern int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state);
 extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
 extern void nfs41_handle_recall_slot(struct nfs_client *clp);
 extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 47c8dcdada8..0592288f9f0 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1018,7 +1018,7 @@ void nfs4_schedule_lease_recovery(struct nfs_client *clp)
 	nfs4_schedule_state_manager(clp);
 }
 
-int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
+static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
 {
 
 	set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
@@ -1032,7 +1032,7 @@ int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *st
 	return 1;
 }
 
-int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
+static int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
 {
 	set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags);
 	clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
-- 
cgit v1.2.3


From 57df216bd8c8813a79a6a618e3d2ec937d532b86 Mon Sep 17 00:00:00 2001
From: Huang Weiyi <weiyi.huang@gmail.com>
Date: Tue, 8 Mar 2011 23:11:30 +0000
Subject: nfs4: remove duplicated #include

Remove duplicated #include('s) in
  fs/nfs/nfs4proc.c

Signed-off-by: Huang Weiyi <weiyi.huang@gmail.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index bf4e6d4b5bf..0a07e353a96 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -51,7 +51,6 @@
 #include <linux/sunrpc/bc_xprt.h>
 #include <linux/xattr.h>
 #include <linux/utsname.h>
-#include <linux/mm.h>
 
 #include "nfs4_fs.h"
 #include "delegation.h"
-- 
cgit v1.2.3


From 53d4737580535e073963b91ce87d4216e434fab5 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 11 Mar 2011 15:31:06 -0500
Subject: NFS: NFSROOT should default to "proto=udp"

There have been a number of recent reports that NFSROOT is no longer
working with default mount options, but fails only with certain NICs.

Brian Downing <bdowning@lavos.net> bisected to commit 56463e50 "NFS:
Use super.c for NFSROOT mount option parsing".  Among other things,
this commit changes the default mount options for NFSROOT to use TCP
instead of UDP as the underlying transport.

TCP seems less able to deal with NICs that are slow to initialize.
The system logs that have accompanied reports of problems all show
that NFSROOT attempts to establish a TCP connection before the NIC is
fully initialized, and thus the TCP connection attempt fails.

When a TCP connection attempt fails during a mount operation, the
NFS stack needs to fail the operation.  Usually user space knows how
and when to retry it.  The network layer does not report a distinct
error code for this particular failure mode.  Thus, there isn't a
clean way for the RPC client to see that it needs to retry in this
case, but not in others.

Because NFSROOT is used in some environments where it is not possible
to update the kernel command line to specify "udp", the proper thing
to do is change NFSROOT to use UDP by default, as it did before commit
56463e50.

To make it easier to see how to change default mount options for
NFSROOT and to distinguish default settings from mandatory settings,
I've adjusted a couple of areas to document the specifics.

root_nfs_cat() is also modified to deal with commas properly when
concatenating strings containing mount option lists.  This keeps
root_nfs_cat() call sites simpler, now that we may be concatenating
multiple mount option strings.

Tested-by: Brian Downing <bdowning@lavos.net>
Tested-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: <stable@kernel.org> # 2.6.37
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfsroot.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 903908a2002..c541093a5bf 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -86,11 +86,14 @@
 /* Default path we try to mount. "%s" gets replaced by our IP address */
 #define NFS_ROOT		"/tftpboot/%s"
 
+/* Default NFSROOT mount options. */
+#define NFS_DEF_OPTIONS		"udp"
+
 /* Parameters passed from the kernel command line */
 static char nfs_root_parms[256] __initdata = "";
 
 /* Text-based mount options passed to super.c */
-static char nfs_root_options[256] __initdata = "";
+static char nfs_root_options[256] __initdata = NFS_DEF_OPTIONS;
 
 /* Address of NFS server */
 static __be32 servaddr __initdata = htonl(INADDR_NONE);
@@ -160,8 +163,14 @@ static int __init root_nfs_copy(char *dest, const char *src,
 }
 
 static int __init root_nfs_cat(char *dest, const char *src,
-				  const size_t destlen)
+			       const size_t destlen)
 {
+	size_t len = strlen(dest);
+
+	if (len && dest[len - 1] != ',')
+		if (strlcat(dest, ",", destlen) > destlen)
+			return -1;
+
 	if (strlcat(dest, src, destlen) > destlen)
 		return -1;
 	return 0;
@@ -194,16 +203,6 @@ static int __init root_nfs_parse_options(char *incoming, char *exppath,
 		if (root_nfs_cat(nfs_root_options, incoming,
 						sizeof(nfs_root_options)))
 			return -1;
-
-	/*
-	 * Possibly prepare for more options to be appended
-	 */
-	if (nfs_root_options[0] != '\0' &&
-	    nfs_root_options[strlen(nfs_root_options)] != ',')
-		if (root_nfs_cat(nfs_root_options, ",",
-						sizeof(nfs_root_options)))
-			return -1;
-
 	return 0;
 }
 
@@ -217,7 +216,7 @@ static int __init root_nfs_parse_options(char *incoming, char *exppath,
  */
 static int __init root_nfs_data(char *cmdline)
 {
-	char addr_option[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1];
+	char mand_options[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1];
 	int len, retval = -1;
 	char *tmp = NULL;
 	const size_t tmplen = sizeof(nfs_export_path);
@@ -244,9 +243,9 @@ static int __init root_nfs_data(char *cmdline)
 	 * Append mandatory options for nfsroot so they override
 	 * what has come before
 	 */
-	snprintf(addr_option, sizeof(addr_option), "nolock,addr=%pI4",
+	snprintf(mand_options, sizeof(mand_options), "nolock,addr=%pI4",
 			&servaddr);
-	if (root_nfs_cat(nfs_root_options, addr_option,
+	if (root_nfs_cat(nfs_root_options, mand_options,
 						sizeof(nfs_root_options)))
 		goto out_optionstoolong;
 
-- 
cgit v1.2.3


From 38511722446993d926861696194c39ef135d85a4 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Thu, 3 Feb 2011 18:28:50 +0000
Subject: pnfs: avoid incorrect use of layout stateid

The code could violate the following from RFC5661, section 12.5.3:
"Once a client has no more layouts on a file, the layout stateid is no
longer valid and MUST NOT be used."

This can occur when a layout already has a lseg, starts another
non-overlapping LAYOUTGET, and a CB_LAYOUTRECALL for the existing lseg
is processed before we hit pnfs_layout_process().

Solve by setting, each time the client has no more lsegs for a file, a
flag which blocks further use of the layout and triggers its removal.

This also fixes a second bug which occurs in the same instance as
above.  If we actually use pnfs_layout_process, we add the new lseg to
the layout, but the layout has been removed from the nfs_client list
by the intervening CB_LAYOUTRECALL and will not be added back.  Thus
the newly acquired lseg will not be properly returned in the event of
a subsequent CB_LAYOUTRECALL.

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/pnfs.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 1b1bc1a0fb0..c8d9b2148cb 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -255,6 +255,9 @@ put_lseg_locked(struct pnfs_layout_segment *lseg,
 			list_del_init(&lseg->pls_layout->plh_layouts);
 			spin_unlock(&clp->cl_lock);
 			clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags);
+			set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
+			/* Matched by initial refcount set in alloc_init_layout_hdr */
+			put_layout_hdr_locked(lseg->pls_layout);
 		}
 		rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
 		list_add(&lseg->pls_list, tmp_list);
@@ -299,6 +302,11 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
 
 	dprintk("%s:Begin lo %p\n", __func__, lo);
 
+	if (list_empty(&lo->plh_segs)) {
+		if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags))
+			put_layout_hdr_locked(lo);
+		return 0;
+	}
 	list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
 		if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
 			dprintk("%s: freeing lseg %p iomode %d "
@@ -332,10 +340,8 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
 	spin_lock(&nfsi->vfs_inode.i_lock);
 	lo = nfsi->layout;
 	if (lo) {
-		set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags);
+		lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
 		mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
-		/* Matched by refcount set to 1 in alloc_init_layout_hdr */
-		put_layout_hdr_locked(lo);
 	}
 	spin_unlock(&nfsi->vfs_inode.i_lock);
 	pnfs_free_lseg_list(&tmp_list);
@@ -403,6 +409,7 @@ pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
 	    (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
 		return true;
 	return lo->plh_block_lgets ||
+		test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) ||
 		test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
 		(list_empty(&lo->plh_segs) &&
 		 (atomic_read(&lo->plh_outstanding) > lget));
-- 
cgit v1.2.3


From 9f52c2525e09854ed6aa4cbd83915a56226d86c1 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Thu, 3 Feb 2011 18:28:51 +0000
Subject: pnfs: do not need to clear NFS_LAYOUT_BULK_RECALL flag

We do not need to clear the NFS_LAYOUT_BULK_RECALL, as setting it
guarantees that NFS_LAYOUT_DESTROYED will be set once any outstanding
io is finished.

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/pnfs.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index c8d9b2148cb..c17edfbbaeb 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -254,7 +254,6 @@ put_lseg_locked(struct pnfs_layout_segment *lseg,
 			/* List does not take a reference, so no need for put here */
 			list_del_init(&lseg->pls_layout->plh_layouts);
 			spin_unlock(&clp->cl_lock);
-			clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags);
 			set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
 			/* Matched by initial refcount set in alloc_init_layout_hdr */
 			put_layout_hdr_locked(lseg->pls_layout);
@@ -754,7 +753,6 @@ pnfs_update_layout(struct inode *ino,
 			spin_lock(&clp->cl_lock);
 			list_del_init(&lo->plh_layouts);
 			spin_unlock(&clp->cl_lock);
-			clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
 		}
 		spin_unlock(&ino->i_lock);
 	}
-- 
cgit v1.2.3


From f49f9baac8f63de9cbc17a0a84e04060496e8e76 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Thu, 3 Feb 2011 18:28:52 +0000
Subject: pnfs: fix pnfs lock inversion of i_lock and cl_lock

The pnfs code was using throughout the lock order i_lock, cl_lock.
This conflicts with the nfs delegation code.  Rework the pnfs code
to avoid taking both locks simultaneously.

Currently the code takes the double lock to add/remove the layout to a
nfs_client list, while atomically checking that the list of lsegs is
empty.  To avoid this, we rely on existing serializations.  When a
layout is initialized with lseg count equal zero, LAYOUTGET's
openstateid serialization is in effect, making it safe to assume it
stays zero unless we change it.  And once a layout's lseg count drops
to zero, it is set as DESTROYED and so will stay at zero.

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/callback_proc.c |  2 +-
 fs/nfs/pnfs.c          | 42 +++++++++++++++++++++++++-----------------
 2 files changed, 26 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 89587573fe5..2f41dccea18 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -188,10 +188,10 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
 			rv = NFS4ERR_DELAY;
 		list_del_init(&lo->plh_bulk_recall);
 		spin_unlock(&ino->i_lock);
+		pnfs_free_lseg_list(&free_me_list);
 		put_layout_hdr(lo);
 		iput(ino);
 	}
-	pnfs_free_lseg_list(&free_me_list);
 	return rv;
 }
 
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index c17edfbbaeb..0f5b66f90d1 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -247,13 +247,6 @@ put_lseg_locked(struct pnfs_layout_segment *lseg,
 		BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
 		list_del(&lseg->pls_list);
 		if (list_empty(&lseg->pls_layout->plh_segs)) {
-			struct nfs_client *clp;
-
-			clp = NFS_SERVER(ino)->nfs_client;
-			spin_lock(&clp->cl_lock);
-			/* List does not take a reference, so no need for put here */
-			list_del_init(&lseg->pls_layout->plh_layouts);
-			spin_unlock(&clp->cl_lock);
 			set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
 			/* Matched by initial refcount set in alloc_init_layout_hdr */
 			put_layout_hdr_locked(lseg->pls_layout);
@@ -319,11 +312,27 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
 	return invalid - removed;
 }
 
+/* note free_me must contain lsegs from a single layout_hdr */
 void
 pnfs_free_lseg_list(struct list_head *free_me)
 {
 	struct pnfs_layout_segment *lseg, *tmp;
+	struct pnfs_layout_hdr *lo;
+
+	if (list_empty(free_me))
+		return;
 
+	lo = list_first_entry(free_me, struct pnfs_layout_segment,
+			      pls_list)->pls_layout;
+
+	if (test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) {
+		struct nfs_client *clp;
+
+		clp = NFS_SERVER(lo->plh_inode)->nfs_client;
+		spin_lock(&clp->cl_lock);
+		list_del_init(&lo->plh_layouts);
+		spin_unlock(&clp->cl_lock);
+	}
 	list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
 		list_del(&lseg->pls_list);
 		free_lseg(lseg);
@@ -705,6 +714,7 @@ pnfs_update_layout(struct inode *ino,
 	struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
 	struct pnfs_layout_hdr *lo;
 	struct pnfs_layout_segment *lseg = NULL;
+	bool first = false;
 
 	if (!pnfs_enabled_sb(NFS_SERVER(ino)))
 		return NULL;
@@ -735,7 +745,10 @@ pnfs_update_layout(struct inode *ino,
 	atomic_inc(&lo->plh_outstanding);
 
 	get_layout_hdr(lo);
-	if (list_empty(&lo->plh_segs)) {
+	if (list_empty(&lo->plh_segs))
+		first = true;
+	spin_unlock(&ino->i_lock);
+	if (first) {
 		/* The lo must be on the clp list if there is any
 		 * chance of a CB_LAYOUTRECALL(FILE) coming in.
 		 */
@@ -744,17 +757,12 @@ pnfs_update_layout(struct inode *ino,
 		list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
 		spin_unlock(&clp->cl_lock);
 	}
-	spin_unlock(&ino->i_lock);
 
 	lseg = send_layoutget(lo, ctx, iomode);
-	if (!lseg) {
-		spin_lock(&ino->i_lock);
-		if (list_empty(&lo->plh_segs)) {
-			spin_lock(&clp->cl_lock);
-			list_del_init(&lo->plh_layouts);
-			spin_unlock(&clp->cl_lock);
-		}
-		spin_unlock(&ino->i_lock);
+	if (!lseg && first) {
+		spin_lock(&clp->cl_lock);
+		list_del_init(&lo->plh_layouts);
+		spin_unlock(&clp->cl_lock);
 	}
 	atomic_dec(&lo->plh_outstanding);
 	put_layout_hdr(lo);
-- 
cgit v1.2.3


From 83762c56c1ba7c5b4b92fb32d570661633228bc6 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Fri, 11 Feb 2011 15:42:37 +0000
Subject: NFS: remove pointless if statement in nfs_direct_write_result

The code was doing nothing more in either branch of the if.

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 9943a75bb6d..f493bdd74f7 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -649,8 +649,7 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
 {
 	struct nfs_write_data *data = calldata;
 
-	if (nfs_writeback_done(task, data) != 0)
-		return;
+	nfs_writeback_done(task, data);
 }
 
 /*
-- 
cgit v1.2.3


From 136028967a283929c6f01518d0700b73fa622d56 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Fri, 11 Feb 2011 15:42:38 +0000
Subject: NFS: change nfs_writeback_done to return void

The return values are not used by any callers.

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/write.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 42b92d7a9cc..ae528b98b80 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1132,7 +1132,7 @@ static const struct rpc_call_ops nfs_write_full_ops = {
 /*
  * This function is called when the WRITE call is complete.
  */
-int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
+void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 {
 	struct nfs_writeargs	*argp = &data->args;
 	struct nfs_writeres	*resp = &data->res;
@@ -1151,7 +1151,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 	 */
 	status = NFS_PROTO(data->inode)->write_done(task, data);
 	if (status != 0)
-		return status;
+		return;
 	nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -1196,7 +1196,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 				argp->stable = NFS_FILE_SYNC;
 			}
 			nfs_restart_rpc(task, server->nfs_client);
-			return -EAGAIN;
+			return;
 		}
 		if (time_before(complain, jiffies)) {
 			printk(KERN_WARNING
@@ -1207,7 +1207,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 		/* Can't do anything about it except throw an error. */
 		task->tk_status = -EIO;
 	}
-	return 0;
+	return;
 }
 
 
-- 
cgit v1.2.3


From bf9c1387ca80deac792c9ecf1c64dfcc5d1cc768 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Tue, 1 Mar 2011 01:34:07 +0000
Subject: NFSv4.1: put_layout_hdr can remove nfsi->layout

Prevents an Oops triggered by a race between CB_LAYOUTRECALL and LAYOUTGET
on a pnfs_layout_hdr's first pnfs_layout_segment.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/pnfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 0f5b66f90d1..7d031cd7d92 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -768,7 +768,7 @@ pnfs_update_layout(struct inode *ino,
 	put_layout_hdr(lo);
 out:
 	dprintk("%s end, state 0x%lx lseg %p\n", __func__,
-		nfsi->layout->plh_flags, lseg);
+		nfsi->layout ? nfsi->layout->plh_flags : -1, lseg);
 	return lseg;
 out_unlock:
 	spin_unlock(&ino->i_lock);
-- 
cgit v1.2.3


From 45a52a02072b2a7e265f024cfdb00127e08dd9f2 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Tue, 1 Mar 2011 01:34:08 +0000
Subject: NFS move nfs_client initialization into nfs_get_client

Now nfs_get_client returns an nfs_client ready to be used no matter if it was
found or created.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c   | 56 +++++++++++++++++++++++++++++--------------------------
 fs/nfs/internal.h |  9 +++++++++
 fs/nfs/nfs3proc.c |  1 +
 fs/nfs/nfs4proc.c |  1 +
 fs/nfs/proc.c     |  1 +
 5 files changed, 42 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index bd3ca32879e..b9ed2a8bc26 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -481,7 +481,12 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
  * Look up a client by IP address and protocol version
  * - creates a new record if one doesn't yet exist
  */
-static struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
+static struct nfs_client *
+nfs_get_client(const struct nfs_client_initdata *cl_init,
+	       const struct rpc_timeout *timeparms,
+	       const char *ip_addr,
+	       rpc_authflavor_t authflavour,
+	       int noresvport)
 {
 	struct nfs_client *clp, *new = NULL;
 	int error;
@@ -512,6 +517,13 @@ install_client:
 	clp = new;
 	list_add(&clp->cl_share_link, &nfs_client_list);
 	spin_unlock(&nfs_client_lock);
+
+	error = cl_init->rpc_ops->init_client(clp, timeparms, ip_addr,
+					      authflavour, noresvport);
+	if (error < 0) {
+		nfs_put_client(clp);
+		return ERR_PTR(error);
+	}
 	dprintk("--> nfs_get_client() = %p [new]\n", clp);
 	return clp;
 
@@ -767,9 +779,9 @@ static int nfs_init_server_rpcclient(struct nfs_server *server,
 /*
  * Initialise an NFS2 or NFS3 client
  */
-static int nfs_init_client(struct nfs_client *clp,
-			   const struct rpc_timeout *timeparms,
-			   const struct nfs_parsed_mount_data *data)
+int nfs_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms,
+		    const char *ip_addr, rpc_authflavor_t authflavour,
+		    int noresvport)
 {
 	int error;
 
@@ -784,7 +796,7 @@ static int nfs_init_client(struct nfs_client *clp,
 	 * - RFC 2623, sec 2.3.2
 	 */
 	error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX,
-				      0, data->flags & NFS_MOUNT_NORESVPORT);
+				      0, noresvport);
 	if (error < 0)
 		goto error;
 	nfs_mark_client_ready(clp, NFS_CS_READY);
@@ -820,19 +832,17 @@ static int nfs_init_server(struct nfs_server *server,
 		cl_init.rpc_ops = &nfs_v3_clientops;
 #endif
 
+	nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
+			data->timeo, data->retrans);
+
 	/* Allocate or find a client reference we can use */
-	clp = nfs_get_client(&cl_init);
+	clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX,
+			     data->flags & NFS_MOUNT_NORESVPORT);
 	if (IS_ERR(clp)) {
 		dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
 		return PTR_ERR(clp);
 	}
 
-	nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
-			data->timeo, data->retrans);
-	error = nfs_init_client(clp, &timeparms, data);
-	if (error < 0)
-		goto error;
-
 	server->nfs_client = clp;
 
 	/* Initialise the client representation from the mount data */
@@ -1307,11 +1317,11 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
 /*
  * Initialise an NFS4 client record
  */
-static int nfs4_init_client(struct nfs_client *clp,
-		const struct rpc_timeout *timeparms,
-		const char *ip_addr,
-		rpc_authflavor_t authflavour,
-		int flags)
+int nfs4_init_client(struct nfs_client *clp,
+		     const struct rpc_timeout *timeparms,
+		     const char *ip_addr,
+		     rpc_authflavor_t authflavour,
+		     int noresvport)
 {
 	int error;
 
@@ -1325,7 +1335,7 @@ static int nfs4_init_client(struct nfs_client *clp,
 	clp->rpc_ops = &nfs_v4_clientops;
 
 	error = nfs_create_rpc_client(clp, timeparms, authflavour,
-				      1, flags & NFS_MOUNT_NORESVPORT);
+				      1, noresvport);
 	if (error < 0)
 		goto error;
 	strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
@@ -1378,22 +1388,16 @@ static int nfs4_set_client(struct nfs_server *server,
 	dprintk("--> nfs4_set_client()\n");
 
 	/* Allocate or find a client reference we can use */
-	clp = nfs_get_client(&cl_init);
+	clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour,
+			     server->flags & NFS_MOUNT_NORESVPORT);
 	if (IS_ERR(clp)) {
 		error = PTR_ERR(clp);
 		goto error;
 	}
-	error = nfs4_init_client(clp, timeparms, ip_addr, authflavour,
-					server->flags);
-	if (error < 0)
-		goto error_put;
 
 	server->nfs_client = clp;
 	dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp);
 	return 0;
-
-error_put:
-	nfs_put_client(clp);
 error:
 	dprintk("<-- nfs4_set_client() = xerror %d\n", error);
 	return error;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index cf9fdbdabc6..4d7b3a97e52 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -215,6 +215,10 @@ extern struct rpc_procinfo nfs4_procedures[];
 
 /* proc.c */
 void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
+extern int nfs_init_client(struct nfs_client *clp,
+			   const struct rpc_timeout *timeparms,
+			   const char *ip_addr, rpc_authflavor_t authflavour,
+			   int noresvport);
 
 /* dir.c */
 extern int nfs_access_cache_shrinker(struct shrinker *shrink,
@@ -274,6 +278,11 @@ extern int nfs_migrate_page(struct address_space *,
 #endif
 
 /* nfs4proc.c */
+extern int nfs4_init_client(struct nfs_client *clp,
+			    const struct rpc_timeout *timeparms,
+			    const char *ip_addr,
+			    rpc_authflavor_t authflavour,
+			    int noresvport);
 extern int _nfs4_call_sync(struct nfs_server *server,
 			   struct rpc_message *msg,
 			   struct nfs4_sequence_args *args,
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index ce939c062a5..d0c80d8b3f9 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -885,4 +885,5 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
 	.lock		= nfs3_proc_lock,
 	.clear_acl_cache = nfs3_forget_cached_acls,
 	.close_context	= nfs_close_context,
+	.init_client	= nfs_init_client,
 };
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 0a07e353a96..55a8fc2f3df 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5648,6 +5648,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
 	.clear_acl_cache = nfs4_zap_acl_attr,
 	.close_context  = nfs4_close_context,
 	.open_context	= nfs4_atomic_open,
+	.init_client	= nfs4_init_client,
 };
 
 static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 77d5e21c4ad..b8ec170f2a0 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -741,4 +741,5 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
 	.lock		= nfs_proc_lock,
 	.lock_check_bounds = nfs_lock_check_bounds,
 	.close_context	= nfs_close_context,
+	.init_client	= nfs_init_client,
 };
-- 
cgit v1.2.3


From 89d1ea65798953b251e399b17f32d31033889ae0 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Tue, 1 Mar 2011 01:34:09 +0000
Subject: NFSv4.1: send zero stateid seqid on v4.1 i/o

Data servers require a zero stateid seqid, and there is no advantage to not
doing the same for all NFSv4.1

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 94d50e86a12..a656b6e179b 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1384,7 +1384,7 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
 	hdr->replen += decode_putrootfh_maxsz;
 }
 
-static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx)
+static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx, int zero_seqid)
 {
 	nfs4_stateid stateid;
 	__be32 *p;
@@ -1392,6 +1392,8 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
 	p = reserve_space(xdr, NFS4_STATEID_SIZE);
 	if (ctx->state != NULL) {
 		nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid);
+		if (zero_seqid)
+			stateid.stateid.seqid = 0;
 		xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
 	} else
 		xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
@@ -1404,7 +1406,8 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
 	p = reserve_space(xdr, 4);
 	*p = cpu_to_be32(OP_READ);
 
-	encode_stateid(xdr, args->context, args->lock_context);
+	encode_stateid(xdr, args->context, args->lock_context,
+		       hdr->minorversion);
 
 	p = reserve_space(xdr, 12);
 	p = xdr_encode_hyper(p, args->offset);
@@ -1592,7 +1595,8 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
 	p = reserve_space(xdr, 4);
 	*p = cpu_to_be32(OP_WRITE);
 
-	encode_stateid(xdr, args->context, args->lock_context);
+	encode_stateid(xdr, args->context, args->lock_context,
+		       hdr->minorversion);
 
 	p = reserve_space(xdr, 16);
 	p = xdr_encode_hyper(p, args->offset);
-- 
cgit v1.2.3


From d3b4c9d76738df49a7db7682c2518a0ef9f7391d Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Tue, 1 Mar 2011 01:34:10 +0000
Subject: NFSv4.1: new flag for state renewal check

Data servers not sharing a session with the mount MDS always have an empty
cl_superblocks list.
Replace the cl_superblocks empty list check to see if it is time to shut down
renewd with the NFS_CS_STOP_RENEW bit which is not set by such a data server.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c     | 5 +++++
 fs/nfs/nfs4renewd.c | 6 +-----
 2 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index b9ed2a8bc26..a86698cd82f 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1019,14 +1019,19 @@ static void nfs_server_insert_lists(struct nfs_server *server)
 	spin_lock(&nfs_client_lock);
 	list_add_tail_rcu(&server->client_link, &clp->cl_superblocks);
 	list_add_tail(&server->master_link, &nfs_volume_list);
+	clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
 	spin_unlock(&nfs_client_lock);
 
 }
 
 static void nfs_server_remove_lists(struct nfs_server *server)
 {
+	struct nfs_client *clp = server->nfs_client;
+
 	spin_lock(&nfs_client_lock);
 	list_del_rcu(&server->client_link);
+	if (clp && list_empty(&clp->cl_superblocks))
+		set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
 	list_del(&server->master_link);
 	spin_unlock(&nfs_client_lock);
 
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 402143d75fc..df8e7f3ca56 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -64,12 +64,8 @@ nfs4_renew_state(struct work_struct *work)
 	ops = clp->cl_mvops->state_renewal_ops;
 	dprintk("%s: start\n", __func__);
 
-	rcu_read_lock();
-	if (list_empty(&clp->cl_superblocks)) {
-		rcu_read_unlock();
+	if (test_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state))
 		goto out;
-	}
-	rcu_read_unlock();
 
 	spin_lock(&clp->cl_lock);
 	lease = clp->cl_lease_time;
-- 
cgit v1.2.3


From d6fb79d433d0a34c36bdf74eaf90857193a6261f Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Tue, 1 Mar 2011 01:34:11 +0000
Subject: NFSv4.1: new flag for lease time check

Data servers cannot send nfs4_proc_get_lease_time, but still need to set up
state renewal. Add the NFS_CS_CHECK_LEASE_TIME bit to indicate if the lease
time can be checked.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c    | 9 +++++++++
 fs/nfs/nfs4state.c | 5 +++++
 2 files changed, 14 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index a86698cd82f..280d41f64a5 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1400,6 +1400,15 @@ static int nfs4_set_client(struct nfs_server *server,
 		goto error;
 	}
 
+	/*
+	 * Query for the lease time on clientid setup or renewal
+	 *
+	 * Note that this will be set on nfs_clients that were created
+	 * only for the DS role and did not set this bit, but now will
+	 * serve a dual role.
+	 */
+	set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state);
+
 	server->nfs_client = clp;
 	dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp);
 	return 0;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 0592288f9f0..69c83637312 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -153,6 +153,11 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
 	int status;
 	struct nfs_fsinfo fsinfo;
 
+	if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) {
+		nfs4_schedule_state_renewal(clp);
+		return 0;
+	}
+
 	status = nfs4_proc_get_lease_time(clp, &fsinfo);
 	if (status == 0) {
 		/* Update lease time and schedule renewal */
-- 
cgit v1.2.3


From 94de8b27d0dcb2608d56a7e5c2941b87e6da7ce3 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Tue, 1 Mar 2011 01:34:12 +0000
Subject: NFSv4.1: add MDS mount DS only check

The DS only role cannot be used to mount.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c  |  4 ++++
 fs/nfs/nfs4_fs.h | 13 +++++++++++++
 2 files changed, 17 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 280d41f64a5..d5c5bdfa423 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1453,6 +1453,10 @@ static int nfs4_server_common_setup(struct nfs_server *server,
 	BUG_ON(!server->nfs_client->rpc_ops);
 	BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
 
+	/* data servers support only a subset of NFSv4.1 */
+	if (is_ds_only_client(server->nfs_client))
+		return -EPROTONOSUPPORT;
+
 	fattr = nfs_alloc_fattr();
 	if (fattr == NULL)
 		return -ENOMEM;
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 1be36cf65bf..d4cfacc4000 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -259,6 +259,13 @@ extern int nfs4_proc_destroy_session(struct nfs4_session *);
 extern int nfs4_init_session(struct nfs_server *server);
 extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
 		struct nfs_fsinfo *fsinfo);
+
+static inline bool
+is_ds_only_client(struct nfs_client *clp)
+{
+	return (clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) ==
+		EXCHGID4_FLAG_USE_PNFS_DS;
+}
 #else /* CONFIG_NFS_v4_1 */
 static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
 {
@@ -276,6 +283,12 @@ static inline int nfs4_init_session(struct nfs_server *server)
 {
 	return 0;
 }
+
+static inline bool
+is_ds_only_client(struct nfs_client *clp)
+{
+	return false;
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
-- 
cgit v1.2.3


From d684d2ae10a4f95d3035abf698d7d611ff2cd279 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Tue, 1 Mar 2011 01:34:13 +0000
Subject: NFSv4.1: lseg refcounting

Prepare put_lseg and get_lseg to be called from the pNFS I/O code.
Pull common code from put_lseg_locked into put_lseg_common, to be called
from put_lseg.  Inline put_lseg_locked into its only caller.

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/pnfs.c | 62 +++++++++++++++++++++++++++++++++++++----------------------
 fs/nfs/pnfs.h | 20 +++++++++++++++++++
 2 files changed, 59 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 7d031cd7d92..3afa82e4543 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -230,32 +230,41 @@ static void free_lseg(struct pnfs_layout_segment *lseg)
 	put_layout_hdr(NFS_I(ino)->layout);
 }
 
-/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg
- * could sleep, so must be called outside of the lock.
- * Returns 1 if object was removed, otherwise return 0.
- */
-static int
-put_lseg_locked(struct pnfs_layout_segment *lseg,
-		struct list_head *tmp_list)
+static void
+put_lseg_common(struct pnfs_layout_segment *lseg)
+{
+	struct inode *inode = lseg->pls_layout->plh_inode;
+
+	BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+	list_del_init(&lseg->pls_list);
+	if (list_empty(&lseg->pls_layout->plh_segs)) {
+		set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
+		/* Matched by initial refcount set in alloc_init_layout_hdr */
+		put_layout_hdr_locked(lseg->pls_layout);
+	}
+	rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
+}
+
+static void
+put_lseg(struct pnfs_layout_segment *lseg)
 {
+	struct inode *inode;
+
+	if (!lseg)
+		return;
+
 	dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
 		atomic_read(&lseg->pls_refcount),
 		test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
-	if (atomic_dec_and_test(&lseg->pls_refcount)) {
-		struct inode *ino = lseg->pls_layout->plh_inode;
+	inode = lseg->pls_layout->plh_inode;
+	if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
+		LIST_HEAD(free_me);
 
-		BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
-		list_del(&lseg->pls_list);
-		if (list_empty(&lseg->pls_layout->plh_segs)) {
-			set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
-			/* Matched by initial refcount set in alloc_init_layout_hdr */
-			put_layout_hdr_locked(lseg->pls_layout);
-		}
-		rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
-		list_add(&lseg->pls_list, tmp_list);
-		return 1;
+		put_lseg_common(lseg);
+		list_add(&lseg->pls_list, &free_me);
+		spin_unlock(&inode->i_lock);
+		pnfs_free_lseg_list(&free_me);
 	}
-	return 0;
 }
 
 static bool
@@ -276,7 +285,13 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
 		 * list.  It will now be removed when all
 		 * outstanding io is finished.
 		 */
-		rv = put_lseg_locked(lseg, tmp_list);
+		dprintk("%s: lseg %p ref %d\n", __func__, lseg,
+			atomic_read(&lseg->pls_refcount));
+		if (atomic_dec_and_test(&lseg->pls_refcount)) {
+			put_lseg_common(lseg);
+			list_add(&lseg->pls_list, tmp_list);
+			rv = 1;
+		}
 	}
 	return rv;
 }
@@ -689,7 +704,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
 	list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
 		if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
 		    is_matching_lseg(lseg, iomode)) {
-			ret = lseg;
+			ret = get_lseg(lseg);
 			break;
 		}
 		if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
@@ -769,6 +784,7 @@ pnfs_update_layout(struct inode *ino,
 out:
 	dprintk("%s end, state 0x%lx lseg %p\n", __func__,
 		nfsi->layout ? nfsi->layout->plh_flags : -1, lseg);
+	put_lseg(lseg); /* STUB - callers currently ignore return value */
 	return lseg;
 out_unlock:
 	spin_unlock(&ino->i_lock);
@@ -821,7 +837,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 	}
 	init_lseg(lo, lseg);
 	lseg->pls_range = res->range;
-	*lgp->lsegpp = lseg;
+	*lgp->lsegpp = get_lseg(lseg);
 	pnfs_insert_layout(lo, lseg);
 
 	if (res->return_on_close) {
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index e2612ea0cbe..9a994bc9899 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -177,6 +177,16 @@ static inline int lo_fail_bit(u32 iomode)
 			 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
 }
 
+static inline struct pnfs_layout_segment *
+get_lseg(struct pnfs_layout_segment *lseg)
+{
+	if (lseg) {
+		atomic_inc(&lseg->pls_refcount);
+		smp_mb__after_atomic_inc();
+	}
+	return lseg;
+}
+
 /* Return true if a layout driver is being used for this mountpoint */
 static inline int pnfs_enabled_sb(struct nfs_server *nfss)
 {
@@ -193,6 +203,16 @@ static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
 {
 }
 
+static inline struct pnfs_layout_segment *
+get_lseg(struct pnfs_layout_segment *lseg)
+{
+	return NULL;
+}
+
+static inline void put_lseg(struct pnfs_layout_segment *lseg)
+{
+}
+
 static inline struct pnfs_layout_segment *
 pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
 		   enum pnfs_iomode access_type)
-- 
cgit v1.2.3


From 94ad1c80e28f9700c84b4d28d1e5302ddf63a6fd Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Tue, 1 Mar 2011 01:34:14 +0000
Subject: NFSv4.1: coelesce across layout stripes

Add a pg_test layout driver hook which is used to avoid coalescing I/O across
layout stripes.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Andy Adamson <andros@citi.umich.edu>
Signed-off-by: Dean Hildebrand <dhildeb@us.ibm.com>
Signed-off-by: Fred Isaman <iisaman@citi.umich.edu>
Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Oleg Drokin <green@linuxhacker.ru>
Signed-off-by: Tao Guo <guotao@nrchpc.ac.cn>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4filelayout.c | 26 ++++++++++++++++++++++++++
 fs/nfs/pagelist.c       | 12 ++++++++++--
 fs/nfs/pnfs.c           | 16 ++++++++++++++++
 fs/nfs/pnfs.h           | 12 ++++++++++++
 fs/nfs/read.c           |  1 +
 fs/nfs/write.c          |  3 +++
 6 files changed, 68 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 23f930caf1e..0efe8cbd9e3 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -252,6 +252,31 @@ filelayout_free_lseg(struct pnfs_layout_segment *lseg)
 	_filelayout_free_lseg(fl);
 }
 
+/*
+ * filelayout_pg_test(). Called by nfs_can_coalesce_requests()
+ *
+ * return 1 :  coalesce page
+ * return 0 :  don't coalesce page
+ */
+int
+filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
+		   struct nfs_page *req)
+{
+	u64 p_stripe, r_stripe;
+	u32 stripe_unit;
+
+	if (!pgio->pg_lseg)
+		return 1;
+	p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT;
+	r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT;
+	stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
+
+	do_div(p_stripe, stripe_unit);
+	do_div(r_stripe, stripe_unit);
+
+	return (p_stripe == r_stripe);
+}
+
 static struct pnfs_layoutdriver_type filelayout_type = {
 	.id = LAYOUT_NFSV4_1_FILES,
 	.name = "LAYOUT_NFSV4_1_FILES",
@@ -260,6 +285,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
 	.clear_layoutdriver = filelayout_clear_layoutdriver,
 	.alloc_lseg              = filelayout_alloc_lseg,
 	.free_lseg               = filelayout_free_lseg,
+	.pg_test		= filelayout_pg_test,
 };
 
 static int __init nfs4filelayout_init(void)
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index e1164e3f9e6..9b9a65c9bb4 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -226,6 +226,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
 	desc->pg_doio = doio;
 	desc->pg_ioflags = io_flags;
 	desc->pg_error = 0;
+	desc->pg_lseg = NULL;
 }
 
 /**
@@ -240,7 +241,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
  * Return 'true' if this is the case, else return 'false'.
  */
 static int nfs_can_coalesce_requests(struct nfs_page *prev,
-				     struct nfs_page *req)
+				     struct nfs_page *req,
+				     struct nfs_pageio_descriptor *pgio)
 {
 	if (req->wb_context->cred != prev->wb_context->cred)
 		return 0;
@@ -254,6 +256,12 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev,
 		return 0;
 	if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
 		return 0;
+	/*
+	 * Non-whole file layouts need to check that req is inside of
+	 * pgio->pg_lseg.
+	 */
+	if (pgio->pg_test && !pgio->pg_test(pgio, prev, req))
+		return 0;
 	return 1;
 }
 
@@ -286,7 +294,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
 		if (newlen > desc->pg_bsize)
 			return 0;
 		prev = nfs_list_entry(desc->pg_list.prev);
-		if (!nfs_can_coalesce_requests(prev, req))
+		if (!nfs_can_coalesce_requests(prev, req, desc))
 			return 0;
 	} else
 		desc->pg_base = req->wb_pgbase;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 3afa82e4543..330cee115de 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -858,6 +858,22 @@ out_forget_reply:
 	goto out;
 }
 
+static void
+pnfs_set_pg_test(struct inode *inode, struct nfs_pageio_descriptor *pgio)
+{
+	struct pnfs_layoutdriver_type *ld;
+
+	ld = NFS_SERVER(inode)->pnfs_curr_ld;
+	pgio->pg_test = (ld ? ld->pg_test : NULL);
+}
+
+void
+pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
+		  struct inode *inode)
+{
+	pnfs_set_pg_test(inode, pgio);
+}
+
 /*
  * Device ID cache. Currently supports one layout type per struct nfs_client.
  * Add layout type to the lookup key to expand to support multiple types.
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 9a994bc9899..db52d965857 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -30,6 +30,8 @@
 #ifndef FS_NFS_PNFS_H
 #define FS_NFS_PNFS_H
 
+#include <linux/nfs_page.h>
+
 enum {
 	NFS_LSEG_VALID = 0,	/* cleared when lseg is recalled/returned */
 	NFS_LSEG_ROC,		/* roc bit received from server */
@@ -65,6 +67,9 @@ struct pnfs_layoutdriver_type {
 	int (*clear_layoutdriver) (struct nfs_server *);
 	struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
 	void (*free_lseg) (struct pnfs_layout_segment *lseg);
+
+	/* test for nfs page cache coalescing */
+	int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
 };
 
 struct pnfs_layout_hdr {
@@ -151,6 +156,7 @@ pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
 		   enum pnfs_iomode access_type);
 void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
 void unset_pnfs_layoutdriver(struct nfs_server *);
+void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
 int pnfs_layout_process(struct nfs4_layoutget *lgp);
 void pnfs_free_lseg_list(struct list_head *tmp_list);
 void pnfs_destroy_layout(struct nfs_inode *);
@@ -250,6 +256,12 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
 {
 }
 
+static inline void
+pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino)
+{
+	pgio->pg_test = NULL;
+}
+
 #endif /* CONFIG_NFS_V4_1 */
 
 #endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index aedcaa7f291..2a2765975e1 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -626,6 +626,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
 		goto read_complete; /* all pages were read */
 
 	pnfs_update_layout(inode, desc.ctx, IOMODE_READ);
+	pnfs_pageio_init_read(&pgio, inode);
 	if (rsize < PAGE_CACHE_SIZE)
 		nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
 	else
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index ae528b98b80..40143c4747a 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -28,6 +28,7 @@
 #include "iostat.h"
 #include "nfs4_fs.h"
 #include "fscache.h"
+#include "pnfs.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PAGECACHE
 
@@ -982,6 +983,8 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
 {
 	size_t wsize = NFS_SERVER(inode)->wsize;
 
+	pgio->pg_test = NULL;
+
 	if (wsize < PAGE_CACHE_SIZE)
 		nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
 	else
-- 
cgit v1.2.3


From bae724ef95b0d0a1f4518f5451e7c8aabc41f820 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Tue, 1 Mar 2011 01:34:15 +0000
Subject: NFSv4.1: shift pnfs_update_layout locations

Move the pnfs_update_layout call location to nfs_pageio_do_add_request().
Grab the lseg sent in the doio function to nfs_read_rpcsetup and attach
it to each nfs_read_data so it can be sent to the layout driver.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Andy Adamson <andros@citi.umich.edu>
Signed-off-by: Dean Hildebrand <dhildeb@us.ibm.com>
Signed-off-by: Fred Isaman <iisaman@citi.umich.edu>
Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Oleg Drokin <green@linuxhacker.ru>
Signed-off-by: Tao Guo <guotao@nrchpc.ac.cn>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/file.c     |  4 ----
 fs/nfs/pagelist.c |  7 +++++--
 fs/nfs/pnfs.c     | 27 ++++++++++++++++-----------
 fs/nfs/pnfs.h     |  1 +
 fs/nfs/read.c     | 40 ++++++++++++++++++++++++----------------
 fs/nfs/write.c    |  4 ++--
 6 files changed, 48 insertions(+), 35 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 7bf029ef408..d85a534b15c 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -387,10 +387,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
 		file->f_path.dentry->d_name.name,
 		mapping->host->i_ino, len, (long long) pos);
 
-	pnfs_update_layout(mapping->host,
-			   nfs_file_open_context(file),
-			   IOMODE_RW);
-
 start:
 	/*
 	 * Prevent starvation issues if someone is doing a consistency
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 9b9a65c9bb4..45b0fb8add3 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -20,6 +20,7 @@
 #include <linux/nfs_mount.h>
 
 #include "internal.h"
+#include "pnfs.h"
 
 static struct kmem_cache *nfs_page_cachep;
 
@@ -213,7 +214,7 @@ nfs_wait_on_request(struct nfs_page *req)
  */
 void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
 		     struct inode *inode,
-		     int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int),
+		     int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int, struct pnfs_layout_segment *),
 		     size_t bsize,
 		     int io_flags)
 {
@@ -315,7 +316,9 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
 					  nfs_page_array_len(desc->pg_base,
 							     desc->pg_count),
 					  desc->pg_count,
-					  desc->pg_ioflags);
+					  desc->pg_ioflags,
+					  desc->pg_lseg);
+		desc->pg_lseg = NULL;
 		if (error < 0)
 			desc->pg_error = error;
 		else
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 330cee115de..77966ecb0a2 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -245,7 +245,7 @@ put_lseg_common(struct pnfs_layout_segment *lseg)
 	rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
 }
 
-static void
+void
 put_lseg(struct pnfs_layout_segment *lseg)
 {
 	struct inode *inode;
@@ -784,7 +784,6 @@ pnfs_update_layout(struct inode *ino,
 out:
 	dprintk("%s end, state 0x%lx lseg %p\n", __func__,
 		nfsi->layout ? nfsi->layout->plh_flags : -1, lseg);
-	put_lseg(lseg); /* STUB - callers currently ignore return value */
 	return lseg;
 out_unlock:
 	spin_unlock(&ino->i_lock);
@@ -858,20 +857,26 @@ out_forget_reply:
 	goto out;
 }
 
-static void
-pnfs_set_pg_test(struct inode *inode, struct nfs_pageio_descriptor *pgio)
+static int pnfs_read_pg_test(struct nfs_pageio_descriptor *pgio,
+			     struct nfs_page *prev,
+			     struct nfs_page *req)
 {
-	struct pnfs_layoutdriver_type *ld;
-
-	ld = NFS_SERVER(inode)->pnfs_curr_ld;
-	pgio->pg_test = (ld ? ld->pg_test : NULL);
+	if (pgio->pg_count == prev->wb_bytes) {
+		/* This is the first coalesce call for a series of nfs_pages */
+		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+						   prev->wb_context,
+						   IOMODE_READ);
+	}
+	return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
 }
 
 void
-pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
-		  struct inode *inode)
+pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
 {
-	pnfs_set_pg_test(inode, pgio);
+	struct pnfs_layoutdriver_type *ld;
+
+	ld = NFS_SERVER(inode)->pnfs_curr_ld;
+	pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL;
 }
 
 /*
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index db52d965857..5107d14db48 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -151,6 +151,7 @@ extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
 
 /* pnfs.c */
 void get_layout_hdr(struct pnfs_layout_hdr *lo);
+void put_lseg(struct pnfs_layout_segment *lseg);
 struct pnfs_layout_segment *
 pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
 		   enum pnfs_iomode access_type);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 2a2765975e1..6dc9eaf00e5 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -20,17 +20,17 @@
 #include <linux/nfs_page.h>
 
 #include <asm/system.h>
+#include "pnfs.h"
 
 #include "nfs4_fs.h"
 #include "internal.h"
 #include "iostat.h"
 #include "fscache.h"
-#include "pnfs.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PAGECACHE
 
-static int nfs_pagein_multi(struct inode *, struct list_head *, unsigned int, size_t, int);
-static int nfs_pagein_one(struct inode *, struct list_head *, unsigned int, size_t, int);
+static int nfs_pagein_multi(struct inode *, struct list_head *, unsigned int, size_t, int, struct pnfs_layout_segment *);
+static int nfs_pagein_one(struct inode *, struct list_head *, unsigned int, size_t, int, struct pnfs_layout_segment *);
 static const struct rpc_call_ops nfs_read_partial_ops;
 static const struct rpc_call_ops nfs_read_full_ops;
 
@@ -69,6 +69,7 @@ void nfs_readdata_free(struct nfs_read_data *p)
 
 static void nfs_readdata_release(struct nfs_read_data *rdata)
 {
+	put_lseg(rdata->lseg);
 	put_nfs_open_context(rdata->args.context);
 	nfs_readdata_free(rdata);
 }
@@ -121,7 +122,6 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 	len = nfs_page_length(page);
 	if (len == 0)
 		return nfs_return_empty_page(page);
-	pnfs_update_layout(inode, ctx, IOMODE_READ);
 	new = nfs_create_request(ctx, inode, page, 0, len);
 	if (IS_ERR(new)) {
 		unlock_page(page);
@@ -132,9 +132,9 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 
 	nfs_list_add_request(new, &one_request);
 	if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
-		nfs_pagein_multi(inode, &one_request, 1, len, 0);
+		nfs_pagein_multi(inode, &one_request, 1, len, 0, NULL);
 	else
-		nfs_pagein_one(inode, &one_request, 1, len, 0);
+		nfs_pagein_one(inode, &one_request, 1, len, 0, NULL);
 	return 0;
 }
 
@@ -160,7 +160,8 @@ static void nfs_readpage_release(struct nfs_page *req)
  */
 static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
 		const struct rpc_call_ops *call_ops,
-		unsigned int count, unsigned int offset)
+		unsigned int count, unsigned int offset,
+		struct pnfs_layout_segment *lseg)
 {
 	struct inode *inode = req->wb_context->path.dentry->d_inode;
 	int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
@@ -183,6 +184,7 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
 	data->req	  = req;
 	data->inode	  = inode;
 	data->cred	  = msg.rpc_cred;
+	data->lseg	  = get_lseg(lseg);
 
 	data->args.fh     = NFS_FH(inode);
 	data->args.offset = req_offset(req) + offset;
@@ -240,7 +242,7 @@ nfs_async_read_error(struct list_head *head)
  * won't see the new data until our attribute cache is updated.  This is more
  * or less conventional NFS client behavior.
  */
-static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags)
+static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags, struct pnfs_layout_segment *lseg)
 {
 	struct nfs_page *req = nfs_list_entry(head->next);
 	struct page *page = req->wb_page;
@@ -266,6 +268,8 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
 	} while(nbytes != 0);
 	atomic_set(&req->wb_complete, requests);
 
+	/* We know lseg==NULL */
+	lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_READ);
 	ClearPageError(page);
 	offset = 0;
 	nbytes = count;
@@ -280,12 +284,13 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
 		if (nbytes < rsize)
 			rsize = nbytes;
 		ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
-				  rsize, offset);
+					 rsize, offset, lseg);
 		if (ret == 0)
 			ret = ret2;
 		offset += rsize;
 		nbytes -= rsize;
 	} while (nbytes != 0);
+	put_lseg(lseg);
 
 	return ret;
 
@@ -300,7 +305,7 @@ out_bad:
 	return -ENOMEM;
 }
 
-static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags)
+static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags, struct pnfs_layout_segment *lseg)
 {
 	struct nfs_page		*req;
 	struct page		**pages;
@@ -308,8 +313,10 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned
 	int ret = -ENOMEM;
 
 	data = nfs_readdata_alloc(npages);
-	if (!data)
-		goto out_bad;
+	if (!data) {
+		nfs_async_read_error(head);
+		goto out;
+	}
 
 	pages = data->pagevec;
 	while (!list_empty(head)) {
@@ -320,10 +327,12 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned
 		*pages++ = req->wb_page;
 	}
 	req = nfs_list_entry(data->pages.next);
+	if ((!lseg) && list_is_singular(&data->pages))
+		lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_READ);
 
-	return nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0);
-out_bad:
-	nfs_async_read_error(head);
+	ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0, lseg);
+out:
+	put_lseg(lseg);
 	return ret;
 }
 
@@ -625,7 +634,6 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
 	if (ret == 0)
 		goto read_complete; /* all pages were read */
 
-	pnfs_update_layout(inode, desc.ctx, IOMODE_READ);
 	pnfs_pageio_init_read(&pgio, inode);
 	if (rsize < PAGE_CACHE_SIZE)
 		nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 40143c4747a..f033fa0d7d3 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -880,7 +880,7 @@ static void nfs_redirty_request(struct nfs_page *req)
  * Generate multiple small requests to write out a single
  * contiguous dirty area on one page.
  */
-static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how)
+static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how, struct pnfs_layout_segment *lseg)
 {
 	struct nfs_page *req = nfs_list_entry(head->next);
 	struct page *page = req->wb_page;
@@ -947,7 +947,7 @@ out_bad:
  * This is the case if nfs_updatepage detects a conflicting request
  * that has been written but not committed.
  */
-static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how)
+static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how, struct pnfs_layout_segment *lseg)
 {
 	struct nfs_page		*req;
 	struct page		**pages;
-- 
cgit v1.2.3


From 64419a9b20938d9070fdd8c58c2fa23c911915f8 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Tue, 1 Mar 2011 01:34:16 +0000
Subject: NFSv4.1: generic read

Separate the rpc run portion of nfs_read_rpcsetup into a new function
nfs_initiate_read that is called for normal NFS I/O.

Add a pNFS read_pagelist function that is called instead of nfs_initiate_read
for pNFS reads.

Signed-off-by: Andy Adamson <andros@citi.umich.edu>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Dean Hildebrand <dhildeb@us.ibm.com>
Signed-off-by: Fred Isaman <iisaman@citi.umich.edu>
Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Mike Sager <sager@netapp.com>
Signed-off-by: Mingyang Guo <guomingyang@nrchpc.ac.cn>
Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Signed-off-by: Tao Guo <guotao@nrchpc.ac.cn>
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/pnfs.c | 28 +++++++++++++++++++++++++
 fs/nfs/pnfs.h | 20 ++++++++++++++++++
 fs/nfs/read.c | 65 ++++++++++++++++++++++++++++++++++++-----------------------
 3 files changed, 88 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 77966ecb0a2..86c154bad5d 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -30,6 +30,7 @@
 #include <linux/nfs_fs.h>
 #include "internal.h"
 #include "pnfs.h"
+#include "iostat.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PNFS
 
@@ -879,6 +880,33 @@ pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
 	pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL;
 }
 
+/*
+ * Call the appropriate parallel I/O subsystem read function.
+ */
+enum pnfs_try_status
+pnfs_try_to_read_data(struct nfs_read_data *rdata,
+		       const struct rpc_call_ops *call_ops)
+{
+	struct inode *inode = rdata->inode;
+	struct nfs_server *nfss = NFS_SERVER(inode);
+	enum pnfs_try_status trypnfs;
+
+	rdata->mds_ops = call_ops;
+
+	dprintk("%s: Reading ino:%lu %u@%llu\n",
+		__func__, inode->i_ino, rdata->args.count, rdata->args.offset);
+
+	trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata);
+	if (trypnfs == PNFS_NOT_ATTEMPTED) {
+		put_lseg(rdata->lseg);
+		rdata->lseg = NULL;
+	} else {
+		nfs_inc_stats(inode, NFSIOS_PNFS_READ);
+	}
+	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
+	return trypnfs;
+}
+
 /*
  * Device ID cache. Currently supports one layout type per struct nfs_client.
  * Add layout type to the lookup key to expand to support multiple types.
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 5107d14db48..585023fabb5 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -45,6 +45,11 @@ struct pnfs_layout_segment {
 	struct pnfs_layout_hdr *pls_layout;
 };
 
+enum pnfs_try_status {
+	PNFS_ATTEMPTED     = 0,
+	PNFS_NOT_ATTEMPTED = 1,
+};
+
 #ifdef CONFIG_NFS_V4_1
 
 #define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
@@ -70,6 +75,12 @@ struct pnfs_layoutdriver_type {
 
 	/* test for nfs page cache coalescing */
 	int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
+
+	/*
+	 * Return PNFS_ATTEMPTED to indicate the layout code has attempted
+	 * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS
+	 */
+	enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data);
 };
 
 struct pnfs_layout_hdr {
@@ -157,6 +168,8 @@ pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
 		   enum pnfs_iomode access_type);
 void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
 void unset_pnfs_layoutdriver(struct nfs_server *);
+enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
+					    const struct rpc_call_ops *);
 void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
 int pnfs_layout_process(struct nfs4_layoutget *lgp);
 void pnfs_free_lseg_list(struct list_head *tmp_list);
@@ -227,6 +240,13 @@ pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
 	return NULL;
 }
 
+static inline enum pnfs_try_status
+pnfs_try_to_read_data(struct nfs_read_data *data,
+		      const struct rpc_call_ops *call_ops)
+{
+	return PNFS_NOT_ATTEMPTED;
+}
+
 static inline bool
 pnfs_roc(struct inode *ino)
 {
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 6dc9eaf00e5..4127a1c0eec 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -18,6 +18,8 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
+#include <linux/smp_lock.h>
+#include <linux/module.h>
 
 #include <asm/system.h>
 #include "pnfs.h"
@@ -155,25 +157,20 @@ static void nfs_readpage_release(struct nfs_page *req)
 	nfs_release_request(req);
 }
 
-/*
- * Set up the NFS read request struct
- */
-static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
-		const struct rpc_call_ops *call_ops,
-		unsigned int count, unsigned int offset,
-		struct pnfs_layout_segment *lseg)
+static int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
+		      const struct rpc_call_ops *call_ops)
 {
-	struct inode *inode = req->wb_context->path.dentry->d_inode;
+	struct inode *inode = data->inode;
 	int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
 	struct rpc_task *task;
 	struct rpc_message msg = {
 		.rpc_argp = &data->args,
 		.rpc_resp = &data->res,
-		.rpc_cred = req->wb_context->cred,
+		.rpc_cred = data->cred,
 	};
 	struct rpc_task_setup task_setup_data = {
 		.task = &data->task,
-		.rpc_client = NFS_CLIENT(inode),
+		.rpc_client = clnt,
 		.rpc_message = &msg,
 		.callback_ops = call_ops,
 		.callback_data = data,
@@ -181,9 +178,37 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
 		.flags = RPC_TASK_ASYNC | swap_flags,
 	};
 
+	/* Set up the initial task struct. */
+	NFS_PROTO(inode)->read_setup(data, &msg);
+
+	dprintk("NFS: %5u initiated read call (req %s/%lld, %u bytes @ "
+			"offset %llu)\n",
+			data->task.tk_pid,
+			inode->i_sb->s_id,
+			(long long)NFS_FILEID(inode),
+			data->args.count,
+			(unsigned long long)data->args.offset);
+
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	rpc_put_task(task);
+	return 0;
+}
+
+/*
+ * Set up the NFS read request struct
+ */
+static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
+		const struct rpc_call_ops *call_ops,
+		unsigned int count, unsigned int offset,
+		struct pnfs_layout_segment *lseg)
+{
+	struct inode *inode = req->wb_context->path.dentry->d_inode;
+
 	data->req	  = req;
 	data->inode	  = inode;
-	data->cred	  = msg.rpc_cred;
+	data->cred	  = req->wb_context->cred;
 	data->lseg	  = get_lseg(lseg);
 
 	data->args.fh     = NFS_FH(inode);
@@ -199,21 +224,11 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
 	data->res.eof     = 0;
 	nfs_fattr_init(&data->fattr);
 
-	/* Set up the initial task struct. */
-	NFS_PROTO(inode)->read_setup(data, &msg);
-
-	dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
-			data->task.tk_pid,
-			inode->i_sb->s_id,
-			(long long)NFS_FILEID(inode),
-			count,
-			(unsigned long long)data->args.offset);
+	if (data->lseg &&
+	    (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED))
+		return 0;
 
-	task = rpc_run_task(&task_setup_data);
-	if (IS_ERR(task))
-		return PTR_ERR(task);
-	rpc_put_task(task);
-	return 0;
+	return nfs_initiate_read(data, NFS_CLIENT(inode), call_ops);
 }
 
 static void
-- 
cgit v1.2.3


From d83217c13531fd59730d77b5c2284e90e56c0a50 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Tue, 1 Mar 2011 01:34:17 +0000
Subject: NFSv4.1: data server connection

Introduce a data server set_client and init session following the
nfs4_set_client and nfs4_init_session convention.

Once a new nfs_client is on the nfs_client_list, the nfs_client cl_cons_state
serializes access to creating an nfs_client struct with matching properties.

Use the new nfs_get_client() that initializes new clients.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c            | 41 +++++++++++++++++++++++++++++++
 fs/nfs/internal.h          |  5 ++++
 fs/nfs/nfs4_fs.h           | 12 +++++++++
 fs/nfs/nfs4filelayoutdev.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfs/nfs4proc.c          | 29 ++++++++++++++++++++--
 5 files changed, 146 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index d5c5bdfa423..6dd50ac5b54 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1417,6 +1417,47 @@ error:
 	return error;
 }
 
+/*
+ * Set up a pNFS Data Server client.
+ *
+ * Return any existing nfs_client that matches server address,port,version
+ * and minorversion.
+ *
+ * For a new nfs_client, use a soft mount (default), a low retrans and a
+ * low timeout interval so that if a connection is lost, we retry through
+ * the MDS.
+ */
+struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
+		const struct sockaddr *ds_addr,
+		int ds_addrlen, int ds_proto)
+{
+	struct nfs_client_initdata cl_init = {
+		.addr = ds_addr,
+		.addrlen = ds_addrlen,
+		.rpc_ops = &nfs_v4_clientops,
+		.proto = ds_proto,
+		.minorversion = mds_clp->cl_minorversion,
+	};
+	struct rpc_timeout ds_timeout = {
+		.to_initval = 15 * HZ,
+		.to_maxval = 15 * HZ,
+		.to_retries = 1,
+		.to_exponential = 1,
+	};
+	struct nfs_client *clp;
+
+	/*
+	 * Set an authflavor equal to the MDS value. Use the MDS nfs_client
+	 * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS
+	 * (section 13.1 RFC 5661).
+	 */
+	clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
+			     mds_clp->cl_rpcclient->cl_auth->au_flavor, 0);
+
+	dprintk("<-- %s %p\n", __func__, clp);
+	return clp;
+}
+EXPORT_SYMBOL(nfs4_set_ds_client);
 
 /*
  * Session has been established, and the client marked ready.
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 4d7b3a97e52..5cc92014259 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -148,6 +148,9 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *,
 					   struct nfs_fattr *);
 extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
 extern int nfs4_check_client_ready(struct nfs_client *clp);
+extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
+					     const struct sockaddr *ds_addr,
+					     int ds_addrlen, int ds_proto);
 #ifdef CONFIG_PROC_FS
 extern int __init nfs_fs_proc_init(void);
 extern void nfs_fs_proc_exit(void);
@@ -213,6 +216,8 @@ extern const u32 nfs41_maxwrite_overhead;
 extern struct rpc_procinfo nfs4_procedures[];
 #endif
 
+extern int nfs4_init_ds_session(struct nfs_client *clp);
+
 /* proc.c */
 void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
 extern int nfs_init_client(struct nfs_client *clp,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index d4cfacc4000..7058a9f75e7 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -266,6 +266,12 @@ is_ds_only_client(struct nfs_client *clp)
 	return (clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) ==
 		EXCHGID4_FLAG_USE_PNFS_DS;
 }
+
+static inline bool
+is_ds_client(struct nfs_client *clp)
+{
+	return clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS;
+}
 #else /* CONFIG_NFS_v4_1 */
 static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
 {
@@ -289,6 +295,12 @@ is_ds_only_client(struct nfs_client *clp)
 {
 	return false;
 }
+
+static inline bool
+is_ds_client(struct nfs_client *clp)
+{
+	return false;
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index b73c34375f6..8bc91fb8b6f 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -104,6 +104,67 @@ _data_server_lookup_locked(u32 ip_addr, u32 port)
 	return NULL;
 }
 
+/*
+ * Create an rpc connection to the nfs4_pnfs_ds data server
+ * Currently only support IPv4
+ */
+static int
+nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
+{
+	struct nfs_client *clp;
+	struct sockaddr_in sin;
+	int status = 0;
+
+	dprintk("--> %s ip:port %x:%hu au_flavor %d\n", __func__,
+		ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
+		mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor);
+
+	sin.sin_family = AF_INET;
+	sin.sin_addr.s_addr = ds->ds_ip_addr;
+	sin.sin_port = ds->ds_port;
+
+	clp = nfs4_set_ds_client(mds_srv->nfs_client, (struct sockaddr *)&sin,
+				 sizeof(sin), IPPROTO_TCP);
+	if (IS_ERR(clp)) {
+		status = PTR_ERR(clp);
+		goto out;
+	}
+
+	if ((clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) != 0) {
+		if (!is_ds_client(clp)) {
+			status = -ENODEV;
+			goto out_put;
+		}
+		ds->ds_clp = clp;
+		dprintk("%s [existing] ip=%x, port=%hu\n", __func__,
+			ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
+		goto out;
+	}
+
+	/*
+	 * Do not set NFS_CS_CHECK_LEASE_TIME; instead set the DS lease to
+	 * be equal to the MDS lease. Renewal is scheduled in create_session.
+	 */
+	spin_lock(&mds_srv->nfs_client->cl_lock);
+	clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time;
+	spin_unlock(&mds_srv->nfs_client->cl_lock);
+	clp->cl_last_renewal = jiffies;
+
+	/* New nfs_client */
+	status = nfs4_init_ds_session(clp);
+	if (status)
+		goto out_put;
+
+	ds->ds_clp = clp;
+	dprintk("%s [new] ip=%x, port=%hu\n", __func__, ntohl(ds->ds_ip_addr),
+		ntohs(ds->ds_port));
+out:
+	return status;
+out_put:
+	nfs_put_client(clp);
+	goto out;
+}
+
 static void
 destroy_ds(struct nfs4_pnfs_ds *ds)
 {
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 55a8fc2f3df..07d1a43f40f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1573,9 +1573,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
 	return 0;
 }
 
-static int nfs4_recover_expired_lease(struct nfs_server *server)
+static int nfs4_client_recover_expired_lease(struct nfs_client *clp)
 {
-	struct nfs_client *clp = server->nfs_client;
 	unsigned int loop;
 	int ret;
 
@@ -1592,6 +1591,11 @@ static int nfs4_recover_expired_lease(struct nfs_server *server)
 	return ret;
 }
 
+static int nfs4_recover_expired_lease(struct nfs_server *server)
+{
+	return nfs4_client_recover_expired_lease(server->nfs_client);
+}
+
 /*
  * OPEN_EXPIRED:
  * 	reclaim state on the server after a network partition.
@@ -5118,6 +5122,27 @@ int nfs4_init_session(struct nfs_server *server)
 	return ret;
 }
 
+int nfs4_init_ds_session(struct nfs_client *clp)
+{
+	struct nfs4_session *session = clp->cl_session;
+	int ret;
+
+	/* Nothing to do unless NFS4_SESSION_INITING was still set */
+	if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state))
+		return 0;
+
+	ret = nfs4_client_recover_expired_lease(clp);
+	if (ret)
+		return ret;
+
+	/* Test for the DS role */
+	if (!is_ds_client(clp))
+		return -ENODEV;
+	return nfs4_check_client_ready(clp);
+}
+EXPORT_SYMBOL_GPL(nfs4_init_ds_session);
+
+
 /*
  * Renew the cl_session lease.
  */
-- 
cgit v1.2.3


From cfe7f4120f8b1b9465c333d1e42efd4669b1799f Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Tue, 1 Mar 2011 01:34:18 +0000
Subject: NFSv4.1: filelayout i/o helpers

Prepare for filelayout_read_pagelist with helper functions that find the correct
data server, filehandle, and offset.

Signed-off-by: Andy Adamson <andros@citi.umich.edu>
Signed-off-by: Dean Hildebrand <dhildeb@us.ibm.com>
Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Marc Eshel <eshel@almaden.ibm.com>
Signed-off-by: Mike Sager <sager@netapp.com>
Signed-off-by: Oleg Drokin <green@linuxhacker.ru>
Signed-off-by: Tao Guo <guotao@nrchpc.ac.cn>
Signed-off-by: Tigran Mkrtchyan <tigran@anahit.desy.de>
Signed-off-by: Tigran Mkrtchyan <tigran.mkrtchyan@desy.de>
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4filelayout.c    | 34 +++++++++++++++++++++++
 fs/nfs/nfs4filelayout.h    |  7 +++++
 fs/nfs/nfs4filelayoutdev.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 108 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 0efe8cbd9e3..ed833705dce 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -66,6 +66,40 @@ filelayout_clear_layoutdriver(struct nfs_server *nfss)
 	return 0;
 }
 
+static loff_t
+filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg,
+			    loff_t offset)
+{
+	u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count;
+	u64 rel = offset - flseg->pattern_offset;
+	u64 stripe_no = rel;
+
+	/* do_div() needs a u64 lvalue: it divides in place, returns the rest */
+	do_div(stripe_no, stripe_width);
+
+	return stripe_no * flseg->stripe_unit + do_div(rel, flseg->stripe_unit);
+}
+
+/*
+ * Translate a file offset into the offset to use on the data server.
+ * Sparse stripes use the file offset unchanged, dense stripes are
+ * remapped by filelayout_get_dense_offset().
+ */
+static loff_t
+filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
+{
+	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+
+	if (flseg->stripe_type == STRIPE_SPARSE)
+		return offset;
+
+	if (flseg->stripe_type == STRIPE_DENSE)
+		return filelayout_get_dense_offset(flseg, offset);
+
+	/* Neither stripe type matched */
+	BUG();
+}
+
 /*
  * filelayout_check_layout()
  *
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index bbf60dd2ab9..9fef76e0493 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -83,9 +83,16 @@ FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
 			    generic_hdr);
 }
 
+extern struct nfs_fh *
+nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
+
 extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *);
 extern void print_ds(struct nfs4_pnfs_ds *ds);
 extern void print_deviceid(struct nfs4_deviceid *dev_id);
+u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
+u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
+struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
+					u32 ds_idx);
 extern struct nfs4_file_layout_dsaddr *
 nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id);
 struct nfs4_file_layout_dsaddr *
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index 8bc91fb8b6f..f466fed2f46 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -516,3 +516,70 @@ nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id)
 	return (d == NULL) ? NULL :
 		container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
 }
+
+/*
+ * Stripe index for a file offset:
+ *	j = ((offset - pattern_offset) / stripe_unit + first_stripe_index)
+ *		% dsaddr->stripe_count
+ */
+u32
+nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset)
+{
+	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+	u64 stripe_no = offset - flseg->pattern_offset;
+
+	do_div(stripe_no, flseg->stripe_unit);
+	stripe_no += flseg->first_stripe_index;
+	return do_div(stripe_no, flseg->dsaddr->stripe_count);
+}
+
+u32
+nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j)
+{
+	return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j];
+}
+
+struct nfs_fh *
+nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
+{
+	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
+
+	/* Anything but a sparse layout indexes the fh array by j itself */
+	if (flseg->stripe_type != STRIPE_SPARSE)
+		return flseg->fh_array[j];
+
+	/* Use the MDS OPEN fh set in nfs_read_rpcsetup */
+	if (flseg->num_fh == 0)
+		return NULL;
+	if (flseg->num_fh == 1)
+		return flseg->fh_array[0];
+
+	/* Sparse with several filehandles: index by data server */
+	return flseg->fh_array[nfs4_fl_calc_ds_index(lseg, j)];
+}
+
+/* Return the data server at ds_idx, connecting to it first if needed. */
+struct nfs4_pnfs_ds *
+nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
+{
+	struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;
+	struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
+	int err;
+
+	if (ds == NULL) {
+		printk(KERN_ERR "%s: No data server for offset index %u\n",
+			__func__, ds_idx);
+		return NULL;
+	}
+
+	if (!ds->ds_clp) {
+		err = nfs4_ds_connect(NFS_SERVER(lseg->pls_layout->plh_inode),
+				      ds);
+		if (err) {
+			printk(KERN_ERR "%s nfs4_ds_connect error %d\n",
+			       __func__, err);
+			return NULL;
+		}
+	}
+	return ds;
+}
-- 
cgit v1.2.3


From dc70d7b3189597f313df7bd2da849cfc39063b15 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Tue, 1 Mar 2011 01:34:19 +0000
Subject: NFSv4.1: filelayout read

Attempt a pNFS file layout read by setting up the nfs_read_data struct and
calling nfs_initiate_read with the data server rpc client and the
filelayout rpc call ops.

Error handling is implemented in a subsequent patch.

Signed-off-by: Andy Adamson <andros@citi.umich.edu>
Signed-off-by: Dean Hildebrand <dhildeb@us.ibm.com>
Signed-off-by: Fred Isaman <iisaman@citi.umich.edu>
Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Mingyang Guo <guomingyang@nrchpc.ac.cn>
Signed-off-by: Oleg Drokin <green@linuxhacker.ru>
Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Tested-by: Guo Mingyang <guomingyang@nrchpc.ac.cn>
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/internal.h       |  2 ++
 fs/nfs/nfs4_fs.h        |  3 ++
 fs/nfs/nfs4filelayout.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfs/nfs4proc.c       |  3 +-
 fs/nfs/read.c           |  3 +-
 5 files changed, 91 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 5cc92014259..5e9df992cd7 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -271,6 +271,8 @@ extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);
 #endif
 
 /* read.c */
+extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
+			     const struct rpc_call_ops *call_ops);
 extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
 
 /* write.c */
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 7058a9f75e7..c64be1cff08 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -252,6 +252,9 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser
 extern int nfs4_setup_sequence(const struct nfs_server *server,
 		struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
 		int cache_reply, struct rpc_task *task);
+extern int nfs41_setup_sequence(struct nfs4_session *session,
+		struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
+		int cache_reply, struct rpc_task *task);
 extern void nfs4_destroy_session(struct nfs4_session *session);
 extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
 extern int nfs4_proc_create_session(struct nfs_client *);
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index ed833705dce..3608411653d 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -100,6 +100,87 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
 	BUG();
 }
 
+/*
+ * Call ops for the async read/write cases
+ * In the case of dense layouts, the offset needs to be reset to its
+ * original value.
+ */
+static void filelayout_read_prepare(struct rpc_task *task, void *data)
+{
+	struct nfs_read_data *rdata = (struct nfs_read_data *)data;
+
+	if (nfs41_setup_sequence(rdata->ds_clp->cl_session,
+				&rdata->args.seq_args, &rdata->res.seq_res,
+				0, task))
+		return;
+
+	rpc_call_start(task);
+}
+
+static void filelayout_read_call_done(struct rpc_task *task, void *data)
+{
+	struct nfs_read_data *rdata = (struct nfs_read_data *)data;
+
+	dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
+
+	/* Note this may cause RPC to be resent */
+	rdata->mds_ops->rpc_call_done(task, data);
+}
+
+static void filelayout_read_release(void *data)
+{
+	struct nfs_read_data *rdata = (struct nfs_read_data *)data;
+
+	rdata->mds_ops->rpc_release(data);
+}
+
+struct rpc_call_ops filelayout_read_call_ops = {
+	.rpc_call_prepare = filelayout_read_prepare,
+	.rpc_call_done = filelayout_read_call_done,
+	.rpc_release = filelayout_read_release,
+};
+
+static enum pnfs_try_status
+filelayout_read_pagelist(struct nfs_read_data *data)
+{
+	struct pnfs_layout_segment *lseg = data->lseg;
+	loff_t mds_off = data->args.offset;
+	struct nfs4_pnfs_ds *ds;
+	struct nfs_fh *ds_fh;
+	u32 j;
+	int err;
+
+	dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
+		__func__, data->inode->i_ino,
+		data->args.pgbase, (size_t)data->args.count, mds_off);
+
+	/* Find (and if necessary connect to) the DS for this byte range */
+	j = nfs4_fl_calc_j_index(lseg, mds_off);
+	ds = nfs4_fl_prepare_ds(lseg, nfs4_fl_calc_ds_index(lseg, j));
+	if (ds == NULL) {
+		printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
+		return PNFS_NOT_ATTEMPTED;
+	}
+	dprintk("%s USE DS:ip %x %hu\n", __func__,
+		ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
+
+	/* No multipath support. Use first DS */
+	data->ds_clp = ds->ds_clp;
+	/* A NULL filehandle leaves the one already in the args in place */
+	ds_fh = nfs4_fl_select_ds_fh(lseg, j);
+	if (ds_fh != NULL)
+		data->args.fh = ds_fh;
+
+	data->args.offset = filelayout_get_dserver_offset(lseg, mds_off);
+	data->mds_offset = mds_off;
+
+	/* Perform an asynchronous read to ds */
+	err = nfs_initiate_read(data, ds->ds_clp->cl_rpcclient,
+				&filelayout_read_call_ops);
+	BUG_ON(err != 0);
+	return PNFS_ATTEMPTED;
+}
+
 /*
  * filelayout_check_layout()
  *
@@ -320,6 +401,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
 	.alloc_lseg              = filelayout_alloc_lseg,
 	.free_lseg               = filelayout_free_lseg,
 	.pg_test		= filelayout_pg_test,
+	.read_pagelist		= filelayout_read_pagelist,
 };
 
 static int __init nfs4filelayout_init(void)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 07d1a43f40f..d0962393330 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -505,7 +505,7 @@ out:
 	return ret_id;
 }
 
-static int nfs41_setup_sequence(struct nfs4_session *session,
+int nfs41_setup_sequence(struct nfs4_session *session,
 				struct nfs4_sequence_args *args,
 				struct nfs4_sequence_res *res,
 				int cache_reply,
@@ -571,6 +571,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
 	res->sr_status = 1;
 	return 0;
 }
+EXPORT_SYMBOL_GPL(nfs41_setup_sequence);
 
 int nfs4_setup_sequence(const struct nfs_server *server,
 			struct nfs4_sequence_args *args,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 4127a1c0eec..f4d0fcffcb5 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -157,7 +157,7 @@ static void nfs_readpage_release(struct nfs_page *req)
 	nfs_release_request(req);
 }
 
-static int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
+int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
 		      const struct rpc_call_ops *call_ops)
 {
 	struct inode *inode = data->inode;
@@ -195,6 +195,7 @@ static int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
 	rpc_put_task(task);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(nfs_initiate_read);
 
 /*
  * Set up the NFS read request struct
-- 
cgit v1.2.3


From cbdabc7f8bf14ca1d40ab1cb86f64b3bc09716e8 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Tue, 1 Mar 2011 01:34:20 +0000
Subject: NFSv4.1: filelayout async error handler

Use our own async error handler.
Mark the layout as failed and retry i/o through the MDS on specified errors.

Update the mds_offset in nfs_readpage_retry so that a failed short-read retry
to a DS gets correctly resent through the MDS.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/internal.h       |  1 +
 fs/nfs/nfs4filelayout.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfs/nfs4proc.c       | 35 +++++++++++++++++----
 fs/nfs/nfs4state.c      |  1 +
 fs/nfs/read.c           |  1 +
 5 files changed, 113 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 5e9df992cd7..1a3228e9ea2 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -285,6 +285,7 @@ extern int nfs_migrate_page(struct address_space *,
 #endif
 
 /* nfs4proc.c */
+extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data);
 extern int nfs4_init_client(struct nfs_client *clp,
 			    const struct rpc_timeout *timeparms,
 			    const char *ip_addr,
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 3608411653d..6a424c19abe 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -40,6 +40,8 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>");
 MODULE_DESCRIPTION("The NFSv4 file layout driver");
 
+#define FILELAYOUT_POLL_RETRY_MAX     (15*HZ)
+
 static int
 filelayout_set_layoutdriver(struct nfs_server *nfss)
 {
@@ -100,6 +102,83 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
 	BUG();
 }
 
+/* Set the layout fail bit matching this segment's iomode (RW, else READ) */
+static void
+filelayout_set_lo_fail(struct pnfs_layout_segment *lseg)
+{
+	if (lseg->pls_range.iomode != IOMODE_RW) {
+		dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
+		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
+		return;
+	}
+	dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
+	set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
+}
+
+static int filelayout_async_handle_error(struct rpc_task *task,
+					 struct nfs4_state *state,
+					 struct nfs_client *clp,
+					 int *reset)
+{
+	int err = task->tk_status;
+
+	if (err >= 0)
+		return 0;
+
+	*reset = 0;	/* raised below when the i/o must go back to the MDS */
+	switch (err) {
+	case -NFS4ERR_BADSESSION:
+	case -NFS4ERR_BADSLOT:
+	case -NFS4ERR_BAD_HIGH_SLOT:
+	case -NFS4ERR_DEADSESSION:
+	case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+	case -NFS4ERR_SEQ_FALSE_RETRY:
+	case -NFS4ERR_SEQ_MISORDERED:
+		dprintk("%s ERROR %d, Reset session. Exchangeid "
+			"flags 0x%x\n", __func__, err,
+			clp->cl_exchange_flags);
+		nfs4_schedule_session_recovery(clp->cl_session);
+		break;
+	case -NFS4ERR_DELAY:
+	case -NFS4ERR_GRACE:
+	case -EKEYEXPIRED:
+		rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX);
+		break;
+	default:
+		dprintk("%s DS error. Retry through MDS %d\n", __func__, err);
+		*reset = 1;
+		break;
+	}
+	task->tk_status = 0;
+	return -EAGAIN;
+}
+
+/* NFS_PROTO call done callback routines */
+
+static int filelayout_read_done_cb(struct rpc_task *task,
+				struct nfs_read_data *data)
+{
+	struct nfs_client *clp = data->ds_clp;
+	int reset = 0;
+
+	dprintk("%s DS read\n", __func__);
+
+	if (filelayout_async_handle_error(task, data->args.context->state,
+					  clp, &reset) != -EAGAIN)
+		return 0;
+
+	dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
+		__func__, data->ds_clp, data->ds_clp->cl_session);
+	if (reset) {
+		/* Give up on the DS: fail the layout and go through the MDS */
+		filelayout_set_lo_fail(data->lseg);
+		nfs4_reset_read(task, data);
+		clp = NFS_SERVER(data->inode)->nfs_client;
+	}
+	nfs_restart_rpc(task, clp);
+	return -EAGAIN;
+}
+
 /*
  * Call ops for the async read/write cases
  * In the case of dense layouts, the offset needs to be reset to its
@@ -109,6 +188,8 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data)
 {
 	struct nfs_read_data *rdata = (struct nfs_read_data *)data;
 
+	rdata->read_done_cb = filelayout_read_done_cb;
+
 	if (nfs41_setup_sequence(rdata->ds_clp->cl_session,
 				&rdata->args.seq_args, &rdata->res.seq_res,
 				0, task))
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d0962393330..1dc80903944 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3074,15 +3074,10 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
 	return err;
 }
 
-static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
+static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
 {
 	struct nfs_server *server = NFS_SERVER(data->inode);
 
-	dprintk("--> %s\n", __func__);
-
-	if (!nfs4_sequence_done(task, &data->res.seq_res))
-		return -EAGAIN;
-
 	if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
 		nfs_restart_rpc(task, server->nfs_client);
 		return -EAGAIN;
@@ -3094,12 +3089,40 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
 	return 0;
 }
 
+static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
+{
+
+	dprintk("--> %s\n", __func__);
+
+	if (!nfs4_sequence_done(task, &data->res.seq_res))
+		return -EAGAIN;
+
+	return data->read_done_cb(task, data);
+}
+
 static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
 {
 	data->timestamp   = jiffies;
+	data->read_done_cb = nfs4_read_done_cb;
 	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
 }
 
+/* Undo the DS setup of a read so that it can be resent through the MDS. */
+void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data)
+{
+	dprintk("%s Reset task for i/o through MDS\n", __func__);
+	put_lseg(data->lseg);
+	data->lseg = NULL;
+	/* offsets will differ in the dense stripe case */
+	data->args.offset = data->mds_offset;
+	data->ds_clp = NULL;
+	data->args.fh     = NFS_FH(data->inode);
+	data->read_done_cb = nfs4_read_done_cb;
+	task->tk_ops = data->mds_ops;
+	rpc_task_reset_client(task, NFS_CLIENT(data->inode));
+}
+EXPORT_SYMBOL_GPL(nfs4_reset_read);
+
 static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
 {
 	struct inode *inode = data->inode;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 69c83637312..ab1bf5bb021 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1453,6 +1453,7 @@ void nfs4_schedule_session_recovery(struct nfs4_session *session)
 {
 	nfs4_schedule_lease_recovery(session->clp);
 }
+EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
 
 void nfs41_handle_recall_slot(struct nfs_client *clp)
 {
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index f4d0fcffcb5..f40c7f4dc16 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -391,6 +391,7 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
 		return;
 
 	/* Yes, so retry the read at the end of the data */
+	data->mds_offset += resp->count;
 	argp->offset += resp->count;
 	argp->pgbase += resp->count;
 	argp->count -= resp->count;
-- 
cgit v1.2.3


From ea8eecdd11ee6becd09c095c8efa88aa7df95961 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 1 Mar 2011 01:34:21 +0000
Subject: NFSv4.1 move deviceid cache to filelayout driver

No need for generic cache with only one user.
Keep a simple hash of deviceids in the filelayout driver.

Signed-off-by: Christoph Hellwig <hch@infradead.org>
Acked-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4filelayout.c    |  46 +++-----------
 fs/nfs/nfs4filelayout.h    |   8 ++-
 fs/nfs/nfs4filelayoutdev.c | 106 +++++++++++++++++++++++---------
 fs/nfs/pnfs.c              | 147 +--------------------------------------------
 fs/nfs/pnfs.h              |  48 ---------------
 5 files changed, 92 insertions(+), 263 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 6a424c19abe..a922e75af42 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -42,32 +42,6 @@ MODULE_DESCRIPTION("The NFSv4 file layout driver");
 
 #define FILELAYOUT_POLL_RETRY_MAX     (15*HZ)
 
-static int
-filelayout_set_layoutdriver(struct nfs_server *nfss)
-{
-	int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client,
-						nfs4_fl_free_deviceid_callback);
-	if (status) {
-		printk(KERN_WARNING "%s: deviceid cache could not be "
-			"initialized\n", __func__);
-		return status;
-	}
-	dprintk("%s: deviceid cache has been initialized successfully\n",
-		__func__);
-	return 0;
-}
-
-/* Clear out the layout by destroying its device list */
-static int
-filelayout_clear_layoutdriver(struct nfs_server *nfss)
-{
-	dprintk("--> %s\n", __func__);
-
-	if (nfss->nfs_client->cl_devid_cache)
-		pnfs_put_deviceid_cache(nfss->nfs_client);
-	return 0;
-}
-
 static loff_t
 filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg,
 			    loff_t offset)
@@ -295,7 +269,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
 	}
 
 	/* find and reference the deviceid */
-	dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id);
+	dsaddr = nfs4_fl_find_get_deviceid(id);
 	if (dsaddr == NULL) {
 		dsaddr = get_device_info(lo->plh_inode, id);
 		if (dsaddr == NULL)
@@ -330,7 +304,7 @@ out:
 	dprintk("--> %s returns %d\n", __func__, status);
 	return status;
 out_put:
-	pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, &dsaddr->deviceid);
+	nfs4_fl_put_deviceid(dsaddr);
 	goto out;
 }
 
@@ -439,12 +413,10 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
 static void
 filelayout_free_lseg(struct pnfs_layout_segment *lseg)
 {
-	struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->plh_inode);
 	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
 
 	dprintk("--> %s\n", __func__);
-	pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache,
-			  &fl->dsaddr->deviceid);
+	nfs4_fl_put_deviceid(fl->dsaddr);
 	_filelayout_free_lseg(fl);
 }
 
@@ -474,13 +446,11 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
 }
 
 static struct pnfs_layoutdriver_type filelayout_type = {
-	.id = LAYOUT_NFSV4_1_FILES,
-	.name = "LAYOUT_NFSV4_1_FILES",
-	.owner = THIS_MODULE,
-	.set_layoutdriver = filelayout_set_layoutdriver,
-	.clear_layoutdriver = filelayout_clear_layoutdriver,
-	.alloc_lseg              = filelayout_alloc_lseg,
-	.free_lseg               = filelayout_free_lseg,
+	.id			= LAYOUT_NFSV4_1_FILES,
+	.name			= "LAYOUT_NFSV4_1_FILES",
+	.owner			= THIS_MODULE,
+	.alloc_lseg		= filelayout_alloc_lseg,
+	.free_lseg		= filelayout_free_lseg,
 	.pg_test		= filelayout_pg_test,
 	.read_pagelist		= filelayout_read_pagelist,
 };
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 9fef76e0493..23f1e1e2a0f 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -56,7 +56,9 @@ struct nfs4_pnfs_ds {
 };
 
 struct nfs4_file_layout_dsaddr {
-	struct pnfs_deviceid_node	deviceid;
+	struct hlist_node		node;
+	struct nfs4_deviceid		deviceid;
+	atomic_t			ref;
 	u32				stripe_count;
 	u8				*stripe_indices;
 	u32				ds_num;
@@ -86,7 +88,6 @@ FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
 extern struct nfs_fh *
 nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
 
-extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *);
 extern void print_ds(struct nfs4_pnfs_ds *ds);
 extern void print_deviceid(struct nfs4_deviceid *dev_id);
 u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
@@ -94,7 +95,8 @@ u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
 struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
 					u32 ds_idx);
 extern struct nfs4_file_layout_dsaddr *
-nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id);
+nfs4_fl_find_get_deviceid(struct nfs4_deviceid *dev_id);
+extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
 struct nfs4_file_layout_dsaddr *
 get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id);
 
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index f466fed2f46..f594ca35a99 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -36,6 +36,30 @@
 
 #define NFSDBG_FACILITY		NFSDBG_PNFS_LD
 
+/*
+ * Device ID RCU cache. A device ID is unique per client ID and layout type.
+ */
+#define NFS4_FL_DEVICE_ID_HASH_BITS	5
+#define NFS4_FL_DEVICE_ID_HASH_SIZE	(1 << NFS4_FL_DEVICE_ID_HASH_BITS)
+#define NFS4_FL_DEVICE_ID_HASH_MASK	(NFS4_FL_DEVICE_ID_HASH_SIZE - 1)
+
+/* Hash the raw deviceid bytes (x = x * 37 + byte) to a cache bucket */
+static inline u32
+nfs4_fl_deviceid_hash(struct nfs4_deviceid *id)
+{
+	const unsigned char *bytes = (const unsigned char *)id->data;
+	unsigned int i;
+	u32 hash = 0;
+
+	for (i = 0; i < NFS4_DEVICEID4_SIZE; i++)
+		hash = hash * 37 + bytes[i];
+
+	return hash & NFS4_FL_DEVICE_ID_HASH_MASK;
+}
+
+static struct hlist_head filelayout_deviceid_cache[NFS4_FL_DEVICE_ID_HASH_SIZE];
+static DEFINE_SPINLOCK(filelayout_deviceid_lock);
+
 /*
  * Data server cache
  *
@@ -183,7 +207,7 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
 	struct nfs4_pnfs_ds *ds;
 	int i;
 
-	print_deviceid(&dsaddr->deviceid.de_id);
+	print_deviceid(&dsaddr->deviceid);
 
 	for (i = 0; i < dsaddr->ds_num; i++) {
 		ds = dsaddr->ds_list[i];
@@ -200,15 +224,6 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
 	kfree(dsaddr);
 }
 
-void
-nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device)
-{
-	struct nfs4_file_layout_dsaddr *dsaddr =
-		container_of(device, struct nfs4_file_layout_dsaddr, deviceid);
-
-	nfs4_fl_free_deviceid(dsaddr);
-}
-
 static struct nfs4_pnfs_ds *
 nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port)
 {
@@ -361,7 +376,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev)
 	dsaddr->stripe_count = cnt;
 	dsaddr->ds_num = num;
 
-	memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id));
+	memcpy(&dsaddr->deviceid, &pdev->dev_id, sizeof(pdev->dev_id));
 
 	/* Go back an read stripe indices */
 	p = indicesp;
@@ -411,28 +426,37 @@ out_err:
 }
 
 /*
- * Decode the opaque device specified in 'dev'
- * and add it to the list of available devices.
- * If the deviceid is already cached, nfs4_add_deviceid will return
- * a pointer to the cached struct and throw away the new.
+ * Decode the opaque device specified in 'dev' and add it to the cache of
+ * available devices.
  */
-static struct nfs4_file_layout_dsaddr*
+static struct nfs4_file_layout_dsaddr *
 decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
 {
-	struct nfs4_file_layout_dsaddr *dsaddr;
-	struct pnfs_deviceid_node *d;
+	struct nfs4_file_layout_dsaddr *d, *new;
+	long hash;
 
-	dsaddr = decode_device(inode, dev);
-	if (!dsaddr) {
+	new = decode_device(inode, dev);
+	if (!new) {
 		printk(KERN_WARNING "%s: Could not decode or add device\n",
 			__func__);
 		return NULL;
 	}
 
-	d = pnfs_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache,
-			      &dsaddr->deviceid);
+	spin_lock(&filelayout_deviceid_lock);
+	d = nfs4_fl_find_get_deviceid(&new->deviceid);
+	if (d) {
+		spin_unlock(&filelayout_deviceid_lock);
+		nfs4_fl_free_deviceid(new);
+		return d;
+	}
+
+	INIT_HLIST_NODE(&new->node);
+	atomic_set(&new->ref, 1);
+	hash = nfs4_fl_deviceid_hash(&new->deviceid);
+	hlist_add_head_rcu(&new->node, &filelayout_deviceid_cache[hash]);
+	spin_unlock(&filelayout_deviceid_lock);
 
-	return container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
+	return new;
 }
 
 /*
@@ -507,14 +531,38 @@ out_free:
 	return dsaddr;
 }
 
-struct nfs4_file_layout_dsaddr *
-nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id)
+void
+nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
 {
-	struct pnfs_deviceid_node *d;
+	if (atomic_dec_and_lock(&dsaddr->ref, &filelayout_deviceid_lock)) {
+		hlist_del_rcu(&dsaddr->node);
+		spin_unlock(&filelayout_deviceid_lock);
+
+		synchronize_rcu();
+		nfs4_fl_free_deviceid(dsaddr);
+	}
+}
 
-	d = pnfs_find_get_deviceid(clp->cl_devid_cache, id);
-	return (d == NULL) ? NULL :
-		container_of(d, struct nfs4_file_layout_dsaddr, deviceid);
+struct nfs4_file_layout_dsaddr *
+nfs4_fl_find_get_deviceid(struct nfs4_deviceid *id)
+{
+	struct nfs4_file_layout_dsaddr *d;
+	struct hlist_node *n;
+	long hash = nfs4_fl_deviceid_hash(id);
+
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(d, n, &filelayout_deviceid_cache[hash], node) {
+		if (!memcmp(&d->deviceid, id, sizeof(*id))) {
+			if (!atomic_inc_not_zero(&d->ref))
+				goto fail;
+			rcu_read_unlock();
+			return d;
+		}
+	}
+fail:
+	rcu_read_unlock();
+	return NULL;
 }
 
 /*
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 86c154bad5d..1f4c153441a 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -75,10 +75,8 @@ find_pnfs_driver(u32 id)
 void
 unset_pnfs_layoutdriver(struct nfs_server *nfss)
 {
-	if (nfss->pnfs_curr_ld) {
-		nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
+	if (nfss->pnfs_curr_ld)
 		module_put(nfss->pnfs_curr_ld->owner);
-	}
 	nfss->pnfs_curr_ld = NULL;
 }
 
@@ -116,13 +114,7 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
 		goto out_no_driver;
 	}
 	server->pnfs_curr_ld = ld_type;
-	if (ld_type->set_layoutdriver(server)) {
-		printk(KERN_ERR
-		       "%s: Error initializing mount point for layout driver %u.\n",
-		       __func__, id);
-		module_put(ld_type->owner);
-		goto out_no_driver;
-	}
+
 	dprintk("%s: pNFS module for %u set\n", __func__, id);
 	return;
 
@@ -906,138 +898,3 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata,
 	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
 	return trypnfs;
 }
-
-/*
- * Device ID cache. Currently supports one layout type per struct nfs_client.
- * Add layout type to the lookup key to expand to support multiple types.
- */
-int
-pnfs_alloc_init_deviceid_cache(struct nfs_client *clp,
-			 void (*free_callback)(struct pnfs_deviceid_node *))
-{
-	struct pnfs_deviceid_cache *c;
-
-	c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL);
-	if (!c)
-		return -ENOMEM;
-	spin_lock(&clp->cl_lock);
-	if (clp->cl_devid_cache != NULL) {
-		atomic_inc(&clp->cl_devid_cache->dc_ref);
-		dprintk("%s [kref [%d]]\n", __func__,
-			atomic_read(&clp->cl_devid_cache->dc_ref));
-		kfree(c);
-	} else {
-		/* kzalloc initializes hlists */
-		spin_lock_init(&c->dc_lock);
-		atomic_set(&c->dc_ref, 1);
-		c->dc_free_callback = free_callback;
-		clp->cl_devid_cache = c;
-		dprintk("%s [new]\n", __func__);
-	}
-	spin_unlock(&clp->cl_lock);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache);
-
-/*
- * Called from pnfs_layoutdriver_type->free_lseg
- * last layout segment reference frees deviceid
- */
-void
-pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
-		  struct pnfs_deviceid_node *devid)
-{
-	struct nfs4_deviceid *id = &devid->de_id;
-	struct pnfs_deviceid_node *d;
-	struct hlist_node *n;
-	long h = nfs4_deviceid_hash(id);
-
-	dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref));
-	if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock))
-		return;
-
-	hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node)
-		if (!memcmp(&d->de_id, id, sizeof(*id))) {
-			hlist_del_rcu(&d->de_node);
-			spin_unlock(&c->dc_lock);
-			synchronize_rcu();
-			c->dc_free_callback(devid);
-			return;
-		}
-	spin_unlock(&c->dc_lock);
-	/* Why wasn't it found in  the list? */
-	BUG();
-}
-EXPORT_SYMBOL_GPL(pnfs_put_deviceid);
-
-/* Find and reference a deviceid */
-struct pnfs_deviceid_node *
-pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id)
-{
-	struct pnfs_deviceid_node *d;
-	struct hlist_node *n;
-	long hash = nfs4_deviceid_hash(id);
-
-	dprintk("--> %s hash %ld\n", __func__, hash);
-	rcu_read_lock();
-	hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
-		if (!memcmp(&d->de_id, id, sizeof(*id))) {
-			if (!atomic_inc_not_zero(&d->de_ref)) {
-				goto fail;
-			} else {
-				rcu_read_unlock();
-				return d;
-			}
-		}
-	}
-fail:
-	rcu_read_unlock();
-	return NULL;
-}
-EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid);
-
-/*
- * Add a deviceid to the cache.
- * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new
- */
-struct pnfs_deviceid_node *
-pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new)
-{
-	struct pnfs_deviceid_node *d;
-	long hash = nfs4_deviceid_hash(&new->de_id);
-
-	dprintk("--> %s hash %ld\n", __func__, hash);
-	spin_lock(&c->dc_lock);
-	d = pnfs_find_get_deviceid(c, &new->de_id);
-	if (d) {
-		spin_unlock(&c->dc_lock);
-		dprintk("%s [discard]\n", __func__);
-		c->dc_free_callback(new);
-		return d;
-	}
-	INIT_HLIST_NODE(&new->de_node);
-	atomic_set(&new->de_ref, 1);
-	hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
-	spin_unlock(&c->dc_lock);
-	dprintk("%s [new]\n", __func__);
-	return new;
-}
-EXPORT_SYMBOL_GPL(pnfs_add_deviceid);
-
-void
-pnfs_put_deviceid_cache(struct nfs_client *clp)
-{
-	struct pnfs_deviceid_cache *local = clp->cl_devid_cache;
-
-	dprintk("--> %s ({%d})\n", __func__, atomic_read(&local->dc_ref));
-	if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) {
-		int i;
-		/* Verify cache is empty */
-		for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++)
-			BUG_ON(!hlist_empty(&local->dc_deviceids[i]));
-		clp->cl_devid_cache = NULL;
-		spin_unlock(&clp->cl_lock);
-		kfree(local);
-	}
-}
-EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 585023fabb5..acbb7780207 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -68,8 +68,6 @@ struct pnfs_layoutdriver_type {
 	const u32 id;
 	const char *name;
 	struct module *owner;
-	int (*set_layoutdriver) (struct nfs_server *);
-	int (*clear_layoutdriver) (struct nfs_server *);
 	struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
 	void (*free_lseg) (struct pnfs_layout_segment *lseg);
 
@@ -106,52 +104,6 @@ struct pnfs_device {
 	unsigned int  pglen;
 };
 
-/*
- * Device ID RCU cache. A device ID is unique per client ID and layout type.
- */
-#define NFS4_DEVICE_ID_HASH_BITS	5
-#define NFS4_DEVICE_ID_HASH_SIZE	(1 << NFS4_DEVICE_ID_HASH_BITS)
-#define NFS4_DEVICE_ID_HASH_MASK	(NFS4_DEVICE_ID_HASH_SIZE - 1)
-
-static inline u32
-nfs4_deviceid_hash(struct nfs4_deviceid *id)
-{
-	unsigned char *cptr = (unsigned char *)id->data;
-	unsigned int nbytes = NFS4_DEVICEID4_SIZE;
-	u32 x = 0;
-
-	while (nbytes--) {
-		x *= 37;
-		x += *cptr++;
-	}
-	return x & NFS4_DEVICE_ID_HASH_MASK;
-}
-
-struct pnfs_deviceid_node {
-	struct hlist_node	de_node;
-	struct nfs4_deviceid	de_id;
-	atomic_t		de_ref;
-};
-
-struct pnfs_deviceid_cache {
-	spinlock_t		dc_lock;
-	atomic_t		dc_ref;
-	void			(*dc_free_callback)(struct pnfs_deviceid_node *);
-	struct hlist_head	dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE];
-};
-
-extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *,
-			void (*free_callback)(struct pnfs_deviceid_node *));
-extern void pnfs_put_deviceid_cache(struct nfs_client *);
-extern struct pnfs_deviceid_node *pnfs_find_get_deviceid(
-				struct pnfs_deviceid_cache *,
-				struct nfs4_deviceid *);
-extern struct pnfs_deviceid_node *pnfs_add_deviceid(
-				struct pnfs_deviceid_cache *,
-				struct pnfs_deviceid_node *);
-extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
-			      struct pnfs_deviceid_node *devid);
-
 extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
 extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
 
-- 
cgit v1.2.3


From 568e8c494ded95a28c5dd8b79b4d3ffb95b6d845 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Tue, 1 Mar 2011 01:34:22 +0000
Subject: NFSv4.1: turn off pNFS on ds connection failure

If a data server is unavailable, go through MDS.

Mark the deviceid containing the data server as a negative cache entry.
Do not try to connect to any data server on a deviceid marked as a negative
cache entry. Mark any layout that tries to use the marked deviceid as failed.

Inodes with a layout marked as failed will not use the layout for I/O, and
will not perform any more layoutgets.
Inodes without a layout will still do layoutget, but the layout will get
marked as failed immediately.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4filelayout.c    |  4 +++-
 fs/nfs/nfs4filelayout.h    |  4 ++++
 fs/nfs/nfs4filelayoutdev.c | 28 ++++++++++++++++++++++++----
 fs/nfs/pnfs.c              |  9 +++++----
 4 files changed, 36 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index a922e75af42..0040a5ee620 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -214,7 +214,9 @@ filelayout_read_pagelist(struct nfs_read_data *data)
 	idx = nfs4_fl_calc_ds_index(lseg, j);
 	ds = nfs4_fl_prepare_ds(lseg, idx);
 	if (!ds) {
-		printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
+		/* Either layout fh index faulty, or ds connect failed */
+		set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
+		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
 		return PNFS_NOT_ATTEMPTED;
 	}
 	dprintk("%s USE DS:ip %x %hu\n", __func__,
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 23f1e1e2a0f..ee0c907742b 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -55,10 +55,14 @@ struct nfs4_pnfs_ds {
 	atomic_t		ds_count;
 };
 
+/* nfs4_file_layout_dsaddr flags */
+#define NFS4_DEVICE_ID_NEG_ENTRY	0x00000001
+
 struct nfs4_file_layout_dsaddr {
 	struct hlist_node		node;
 	struct nfs4_deviceid		deviceid;
 	atomic_t			ref;
+	unsigned long			flags;
 	u32				stripe_count;
 	u8				*stripe_indices;
 	u32				ds_num;
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index f594ca35a99..68143c162e3 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -606,6 +606,21 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
 	return flseg->fh_array[i];
 }
 
+static void
+filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr,
+			       int err, u32 ds_addr)
+{
+	u32 *p = (u32 *)&dsaddr->deviceid;
+
+	printk(KERN_ERR "NFS: data server %x connection error %d."
+		" Deviceid [%x%x%x%x] marked out of use.\n",
+		ds_addr, err, p[0], p[1], p[2], p[3]);
+
+	spin_lock(&filelayout_deviceid_lock);
+	dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY;
+	spin_unlock(&filelayout_deviceid_lock);
+}
+
 struct nfs4_pnfs_ds *
 nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
 {
@@ -619,13 +634,18 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
 	}
 
 	if (!ds->ds_clp) {
+		struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
 		int err;
 
-		err = nfs4_ds_connect(NFS_SERVER(lseg->pls_layout->plh_inode),
-					  dsaddr->ds_list[ds_idx]);
+		if (dsaddr->flags & NFS4_DEVICE_ID_NEG_ENTRY) {
+			/* Already tried to connect, don't try again */
+			dprintk("%s Deviceid marked out of use\n", __func__);
+			return NULL;
+		}
+		err = nfs4_ds_connect(s, ds);
 		if (err) {
-			printk(KERN_ERR "%s nfs4_ds_connect error %d\n",
-			       __func__, err);
+			filelayout_mark_devid_negative(dsaddr, err,
+						       ntohl(ds->ds_ip_addr));
 			return NULL;
 		}
 	}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 1f4c153441a..3e545144a0b 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -739,15 +739,16 @@ pnfs_update_layout(struct inode *ino,
 		dprintk("%s matches recall, use MDS\n", __func__);
 		goto out_unlock;
 	}
-	/* Check to see if the layout for the given range already exists */
-	lseg = pnfs_find_lseg(lo, iomode);
-	if (lseg)
-		goto out_unlock;
 
 	/* if LAYOUTGET already failed once we don't try again */
 	if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
 		goto out_unlock;
 
+	/* Check to see if the layout for the given range already exists */
+	lseg = pnfs_find_lseg(lo, iomode);
+	if (lseg)
+		goto out_unlock;
+
 	if (pnfs_layoutgets_blocked(lo, NULL, 0))
 		goto out_unlock;
 	atomic_inc(&lo->plh_outstanding);
-- 
cgit v1.2.3


From d138d5d17be6a60d883e8bd4e22bc218d3adfab3 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Thu, 3 Mar 2011 15:13:41 +0000
Subject: NFSv4.1: rearrange nfs_write_rpcsetup

Reorder nfs_write_rpcsetup, preparing for a pnfs entry point.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/write.c | 82 ++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 46 insertions(+), 36 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f033fa0d7d3..ae035990941 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -782,25 +782,21 @@ static int flush_task_priority(int how)
 	return RPC_PRIORITY_NORMAL;
 }
 
-/*
- * Set up the argument/result storage required for the RPC call.
- */
-static int nfs_write_rpcsetup(struct nfs_page *req,
-		struct nfs_write_data *data,
-		const struct rpc_call_ops *call_ops,
-		unsigned int count, unsigned int offset,
-		int how)
+static int nfs_initiate_write(struct nfs_write_data *data,
+		       struct rpc_clnt *clnt,
+		       const struct rpc_call_ops *call_ops,
+		       int how)
 {
-	struct inode *inode = req->wb_context->path.dentry->d_inode;
+	struct inode *inode = data->inode;
 	int priority = flush_task_priority(how);
 	struct rpc_task *task;
 	struct rpc_message msg = {
 		.rpc_argp = &data->args,
 		.rpc_resp = &data->res,
-		.rpc_cred = req->wb_context->cred,
+		.rpc_cred = data->cred,
 	};
 	struct rpc_task_setup task_setup_data = {
-		.rpc_client = NFS_CLIENT(inode),
+		.rpc_client = clnt,
 		.task = &data->task,
 		.rpc_message = &msg,
 		.callback_ops = call_ops,
@@ -811,12 +807,49 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 	};
 	int ret = 0;
 
+	/* Set up the initial task struct.  */
+	NFS_PROTO(inode)->write_setup(data, &msg);
+
+	dprintk("NFS: %5u initiated write call "
+		"(req %s/%lld, %u bytes @ offset %llu)\n",
+		data->task.tk_pid,
+		inode->i_sb->s_id,
+		(long long)NFS_FILEID(inode),
+		data->args.count,
+		(unsigned long long)data->args.offset);
+
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task)) {
+		ret = PTR_ERR(task);
+		goto out;
+	}
+	if (how & FLUSH_SYNC) {
+		ret = rpc_wait_for_completion_task(task);
+		if (ret == 0)
+			ret = task->tk_status;
+	}
+	rpc_put_task(task);
+out:
+	return ret;
+}
+
+/*
+ * Set up the argument/result storage required for the RPC call.
+ */
+static int nfs_write_rpcsetup(struct nfs_page *req,
+		struct nfs_write_data *data,
+		const struct rpc_call_ops *call_ops,
+		unsigned int count, unsigned int offset,
+		int how)
+{
+	struct inode *inode = req->wb_context->path.dentry->d_inode;
+
 	/* Set up the RPC argument and reply structs
 	 * NB: take care not to mess about with data->commit et al. */
 
 	data->req = req;
 	data->inode = inode = req->wb_context->path.dentry->d_inode;
-	data->cred = msg.rpc_cred;
+	data->cred = req->wb_context->cred;
 
 	data->args.fh     = NFS_FH(inode);
 	data->args.offset = req_offset(req) + offset;
@@ -837,30 +870,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 	data->res.verf    = &data->verf;
 	nfs_fattr_init(&data->fattr);
 
-	/* Set up the initial task struct.  */
-	NFS_PROTO(inode)->write_setup(data, &msg);
-
-	dprintk("NFS: %5u initiated write call "
-		"(req %s/%lld, %u bytes @ offset %llu)\n",
-		data->task.tk_pid,
-		inode->i_sb->s_id,
-		(long long)NFS_FILEID(inode),
-		count,
-		(unsigned long long)data->args.offset);
-
-	task = rpc_run_task(&task_setup_data);
-	if (IS_ERR(task)) {
-		ret = PTR_ERR(task);
-		goto out;
-	}
-	if (how & FLUSH_SYNC) {
-		ret = rpc_wait_for_completion_task(task);
-		if (ret == 0)
-			ret = task->tk_status;
-	}
-	rpc_put_task(task);
-out:
-	return ret;
+	return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how);
 }
 
 /* If a nfs_flush_* function fails, it should remove reqs from @head and
-- 
cgit v1.2.3


From b029bc9b0880cbaf999f580c0ea8f06dd274fc77 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Thu, 3 Mar 2011 15:13:42 +0000
Subject: NFSv4.1: add callback to nfs4_write_done

Add callback that pnfs layout driver can use to do its own handling
of data server WRITE response.

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 1dc80903944..15248549c89 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3123,13 +3123,10 @@ void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data)
 }
 EXPORT_SYMBOL_GPL(nfs4_reset_read);
 
-static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
+static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data)
 {
 	struct inode *inode = data->inode;
 	
-	if (!nfs4_sequence_done(task, &data->res.seq_res))
-		return -EAGAIN;
-
 	if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
 		nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
 		return -EAGAIN;
@@ -3141,11 +3138,20 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
 	return 0;
 }
 
+static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
+{
+	if (!nfs4_sequence_done(task, &data->res.seq_res))
+		return -EAGAIN;
+	return data->write_done_cb(task, data);
+}
+
 static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
 {
 	struct nfs_server *server = NFS_SERVER(data->inode);
 
 	data->args.bitmask = server->cache_consistency_bitmask;
+	if (!data->write_done_cb)
+		data->write_done_cb = nfs4_write_done_cb;
 	data->res.server = server;
 	data->timestamp   = jiffies;
 
-- 
cgit v1.2.3


From 5053aa568d4017aeb1fa35247d4ad96be262920f Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Thu, 3 Mar 2011 15:13:43 +0000
Subject: NFSv4.1: Send lseg down into nfs_write_rpcsetup

We grab the lseg sent in from the doio function and attach it to
each struct nfs_write_data created.  This is how the lseg will be
sent to the layout driver.

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/write.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index ae035990941..72b0ec0bb0e 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -97,6 +97,7 @@ void nfs_writedata_free(struct nfs_write_data *p)
 
 static void nfs_writedata_release(struct nfs_write_data *wdata)
 {
+	put_lseg(wdata->lseg);
 	put_nfs_open_context(wdata->args.context);
 	nfs_writedata_free(wdata);
 }
@@ -840,6 +841,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 		struct nfs_write_data *data,
 		const struct rpc_call_ops *call_ops,
 		unsigned int count, unsigned int offset,
+		struct pnfs_layout_segment *lseg,
 		int how)
 {
 	struct inode *inode = req->wb_context->path.dentry->d_inode;
@@ -850,6 +852,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 	data->req = req;
 	data->inode = inode = req->wb_context->path.dentry->d_inode;
 	data->cred = req->wb_context->cred;
+	data->lseg = get_lseg(lseg);
 
 	data->args.fh     = NFS_FH(inode);
 	data->args.offset = req_offset(req) + offset;
@@ -930,7 +933,7 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
 		if (nbytes < wsize)
 			wsize = nbytes;
 		ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops,
-				   wsize, offset, how);
+					  wsize, offset, lseg, how);
 		if (ret == 0)
 			ret = ret2;
 		offset += wsize;
@@ -978,7 +981,7 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
 	req = nfs_list_entry(data->pages.next);
 
 	/* Set up the argument struct */
-	return nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, how);
+	return nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, lseg, how);
  out_bad:
 	while (!list_empty(head)) {
 		req = nfs_list_entry(head->next);
-- 
cgit v1.2.3


From 44b83799a922a153957c65ccfc985a8c902958c8 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Thu, 3 Mar 2011 15:13:44 +0000
Subject: NFSv4.1: trigger LAYOUTGET for writes

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/pnfs.c  | 22 ++++++++++++++++++++++
 fs/nfs/pnfs.h  |  7 +++++++
 fs/nfs/write.c | 32 ++++++++++++++++++++------------
 3 files changed, 49 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 3e545144a0b..5f205d31d96 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -873,6 +873,28 @@ pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
 	pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL;
 }
 
+static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio,
+			      struct nfs_page *prev,
+			      struct nfs_page *req)
+{
+	if (pgio->pg_count == prev->wb_bytes) {
+		/* This is first coelesce call for a series of nfs_pages */
+		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+						   prev->wb_context,
+						   IOMODE_RW);
+	}
+	return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
+}
+
+void
+pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode)
+{
+	struct pnfs_layoutdriver_type *ld;
+
+	ld = NFS_SERVER(inode)->pnfs_curr_ld;
+	pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL;
+}
+
 /*
  * Call the appropriate parallel I/O subsystem read function.
  */
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index acbb7780207..1d4e6317fa9 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -123,6 +123,7 @@ void unset_pnfs_layoutdriver(struct nfs_server *);
 enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
 					    const struct rpc_call_ops *);
 void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
+void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
 int pnfs_layout_process(struct nfs4_layoutget *lgp);
 void pnfs_free_lseg_list(struct list_head *tmp_list);
 void pnfs_destroy_layout(struct nfs_inode *);
@@ -235,6 +236,12 @@ pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino)
 	pgio->pg_test = NULL;
 }
 
+static inline void
+pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino)
+{
+	pgio->pg_test = NULL;
+}
+
 #endif /* CONFIG_NFS_V4_1 */
 
 #endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 72b0ec0bb0e..49c4784c24e 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -919,6 +919,8 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
 	} while (nbytes != 0);
 	atomic_set(&req->wb_complete, requests);
 
+	BUG_ON(lseg);
+	lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_RW);
 	ClearPageError(page);
 	offset = 0;
 	nbytes = count;
@@ -940,6 +942,7 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
 		nbytes -= wsize;
 	} while (nbytes != 0);
 
+	put_lseg(lseg);
 	return ret;
 
 out_bad:
@@ -965,11 +968,18 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
 	struct nfs_page		*req;
 	struct page		**pages;
 	struct nfs_write_data	*data;
+	int ret;
 
 	data = nfs_writedata_alloc(npages);
-	if (!data)
-		goto out_bad;
-
+	if (!data) {
+		while (!list_empty(head)) {
+			req = nfs_list_entry(head->next);
+			nfs_list_remove_request(req);
+			nfs_redirty_request(req);
+		}
+		ret = -ENOMEM;
+		goto out;
+	}
 	pages = data->pagevec;
 	while (!list_empty(head)) {
 		req = nfs_list_entry(head->next);
@@ -979,16 +989,14 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
 		*pages++ = req->wb_page;
 	}
 	req = nfs_list_entry(data->pages.next);
+	if ((!lseg) && list_is_singular(&data->pages))
+		lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_RW);
 
 	/* Set up the argument struct */
-	return nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, lseg, how);
- out_bad:
-	while (!list_empty(head)) {
-		req = nfs_list_entry(head->next);
-		nfs_list_remove_request(req);
-		nfs_redirty_request(req);
-	}
-	return -ENOMEM;
+	ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, lseg, how);
+out:
+	put_lseg(lseg); /* Cleans any gotten in ->pg_test */
+	return ret;
 }
 
 static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
@@ -996,7 +1004,7 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
 {
 	size_t wsize = NFS_SERVER(inode)->wsize;
 
-	pgio->pg_test = NULL;
+	pnfs_pageio_init_write(pgio, inode);
 
 	if (wsize < PAGE_CACHE_SIZE)
 		nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
-- 
cgit v1.2.3


From 0382b74409c6b9ef12c952b50bb44f557a361a43 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Thu, 3 Mar 2011 15:13:45 +0000
Subject: NFSv4.1: implement generic pnfs layer write switch

Signed-off-by: Andy Adamson <andros@citi.umich.edu>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Dean Hildebrand <dhildeb@us.ibm.com>
Signed-off-by: Fred Isaman <iisaman@citi.umich.edu>
Signed-off-by: J. Bruce Fields <bfields@fieldses.org>
Signed-off-by: Mike Sager <sager@netapp.com>
Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Signed-off-by: Tao Guo <guotao@nrchpc.ac.cn>
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4filelayout.c |  7 +++++++
 fs/nfs/pnfs.c           | 24 ++++++++++++++++++++++++
 fs/nfs/pnfs.h           | 10 ++++++++++
 fs/nfs/write.c          |  4 ++++
 4 files changed, 45 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 0040a5ee620..9d21bfeec88 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -238,6 +238,12 @@ filelayout_read_pagelist(struct nfs_read_data *data)
 	return PNFS_ATTEMPTED;
 }
 
+static enum pnfs_try_status
+filelayout_write_pagelist(struct nfs_write_data *data, int sync)
+{
+	return PNFS_NOT_ATTEMPTED;
+}
+
 /*
  * filelayout_check_layout()
  *
@@ -455,6 +461,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
 	.free_lseg		= filelayout_free_lseg,
 	.pg_test		= filelayout_pg_test,
 	.read_pagelist		= filelayout_read_pagelist,
+	.write_pagelist		= filelayout_write_pagelist,
 };
 
 static int __init nfs4filelayout_init(void)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 5f205d31d96..f38813a0a29 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -895,6 +895,30 @@ pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode)
 	pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL;
 }
 
+enum pnfs_try_status
+pnfs_try_to_write_data(struct nfs_write_data *wdata,
+			const struct rpc_call_ops *call_ops, int how)
+{
+	struct inode *inode = wdata->inode;
+	enum pnfs_try_status trypnfs;
+	struct nfs_server *nfss = NFS_SERVER(inode);
+
+	wdata->mds_ops = call_ops;
+
+	dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
+		inode->i_ino, wdata->args.count, wdata->args.offset, how);
+
+	trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how);
+	if (trypnfs == PNFS_NOT_ATTEMPTED) {
+		put_lseg(wdata->lseg);
+		wdata->lseg = NULL;
+	} else
+		nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
+
+	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
+	return trypnfs;
+}
+
 /*
  * Call the appropriate parallel I/O subsystem read function.
  */
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 1d4e6317fa9..6380b9405bc 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -79,6 +79,7 @@ struct pnfs_layoutdriver_type {
 	 * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS
 	 */
 	enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data);
+	enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how);
 };
 
 struct pnfs_layout_hdr {
@@ -120,6 +121,8 @@ pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
 		   enum pnfs_iomode access_type);
 void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
 void unset_pnfs_layoutdriver(struct nfs_server *);
+enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
+					     const struct rpc_call_ops *, int);
 enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
 					    const struct rpc_call_ops *);
 void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
@@ -200,6 +203,13 @@ pnfs_try_to_read_data(struct nfs_read_data *data,
 	return PNFS_NOT_ATTEMPTED;
 }
 
+static inline enum pnfs_try_status
+pnfs_try_to_write_data(struct nfs_write_data *data,
+		       const struct rpc_call_ops *call_ops, int how)
+{
+	return PNFS_NOT_ATTEMPTED;
+}
+
 static inline bool
 pnfs_roc(struct inode *ino)
 {
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 49c4784c24e..df99c5b0ee6 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -873,6 +873,10 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
 	data->res.verf    = &data->verf;
 	nfs_fattr_init(&data->fattr);
 
+	if (data->lseg &&
+	    (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED))
+		return 0;
+
 	return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how);
 }
 
-- 
cgit v1.2.3


From 7ffd10640dc008f6d5a375bd6450755745c63c7d Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Thu, 3 Mar 2011 15:13:46 +0000
Subject: NFSv4.1: remove GETATTR from ds writes

Any WRITE compound directed to a data server needs to have the
GETATTR calls suppressed.

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 6 +++++-
 fs/nfs/nfs4xdr.c  | 8 +++++---
 2 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 15248549c89..da902123ec5 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3149,7 +3149,11 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
 {
 	struct nfs_server *server = NFS_SERVER(data->inode);
 
-	data->args.bitmask = server->cache_consistency_bitmask;
+	if (data->lseg) {
+		data->args.bitmask = NULL;
+		data->res.fattr = NULL;
+	} else
+		data->args.bitmask = server->cache_consistency_bitmask;
 	if (!data->write_done_cb)
 		data->write_done_cb = nfs4_write_done_cb;
 	data->res.server = server;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index a656b6e179b..0f2dcfb41f2 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2275,7 +2275,8 @@ static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_putfh(xdr, args->fh, &hdr);
 	encode_write(xdr, args, &hdr);
 	req->rq_snd_buf.flags |= XDRBUF_WRITE;
-	encode_getfattr(xdr, args->bitmask, &hdr);
+	if (args->bitmask)
+		encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_nops(&hdr);
 }
 
@@ -5694,8 +5695,9 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	status = decode_write(xdr, res);
 	if (status)
 		goto out;
-	decode_getfattr(xdr, res->fattr, res->server,
-			!RPC_IS_ASYNC(rqstp->rq_task));
+	if (res->fattr)
+		decode_getfattr(xdr, res->fattr, res->server,
+				!RPC_IS_ASYNC(rqstp->rq_task));
 	if (!status)
 		status = res->count;
 out:
-- 
cgit v1.2.3


From a69aef1496726ed88386dad65abfcc8cd3195304 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Thu, 3 Mar 2011 15:13:47 +0000
Subject: NFSv4.1: pnfs filelayout driver write

Allows the pnfs filelayout driver to write to the data servers.

Note that COMMIT to data servers will be implemented in a future
patch.  To avoid improper behavior, for the moment any WRITE to a data
server that would also require a COMMIT to the data server is sent
NFS_FILE_SYNC.

Signed-off-by: Andy Adamson <andros@citi.umich.edu>
Signed-off-by: Dean Hildebrand <dhildeb@us.ibm.com>
Signed-off-by: Fred Isaman <iisaman@citi.umich.edu>
Signed-off-by: Mingyang Guo <guomingyang@nrchpc.ac.cn>
Signed-off-by: Oleg Drokin <green@linuxhacker.ru>
Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/internal.h       |   5 +++
 fs/nfs/nfs4filelayout.c | 101 +++++++++++++++++++++++++++++++++++++++++++++++-
 fs/nfs/nfs4proc.c       |  17 ++++++++
 fs/nfs/write.c          |   5 ++-
 4 files changed, 126 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 1a3228e9ea2..d1ddc23c404 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -276,6 +276,10 @@ extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
 extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
 
 /* write.c */
+extern int nfs_initiate_write(struct nfs_write_data *data,
+			      struct rpc_clnt *clnt,
+			      const struct rpc_call_ops *call_ops,
+			      int how);
 extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
 #ifdef CONFIG_MIGRATION
 extern int nfs_migrate_page(struct address_space *,
@@ -291,6 +295,7 @@ extern int nfs4_init_client(struct nfs_client *clp,
 			    const char *ip_addr,
 			    rpc_authflavor_t authflavour,
 			    int noresvport);
+extern void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data);
 extern int _nfs4_call_sync(struct nfs_server *server,
 			   struct rpc_message *msg,
 			   struct nfs4_sequence_args *args,
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 9d21bfeec88..7e1d4571b7b 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -189,12 +189,69 @@ static void filelayout_read_release(void *data)
 	rdata->mds_ops->rpc_release(data);
 }
 
+static int filelayout_write_done_cb(struct rpc_task *task,
+				struct nfs_write_data *data)
+{
+	int reset = 0;
+
+	if (filelayout_async_handle_error(task, data->args.context->state,
+					  data->ds_clp, &reset) == -EAGAIN) {
+		struct nfs_client *clp;
+
+		dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
+			__func__, data->ds_clp, data->ds_clp->cl_session);
+		if (reset) {
+			filelayout_set_lo_fail(data->lseg);
+			nfs4_reset_write(task, data);
+			clp = NFS_SERVER(data->inode)->nfs_client;
+		} else
+			clp = data->ds_clp;
+		nfs_restart_rpc(task, clp);
+		return -EAGAIN;
+	}
+
+	return 0;
+}
+
+static void filelayout_write_prepare(struct rpc_task *task, void *data)
+{
+	struct nfs_write_data *wdata = (struct nfs_write_data *)data;
+
+	if (nfs41_setup_sequence(wdata->ds_clp->cl_session,
+				&wdata->args.seq_args, &wdata->res.seq_res,
+				0, task))
+		return;
+
+	rpc_call_start(task);
+}
+
+static void filelayout_write_call_done(struct rpc_task *task, void *data)
+{
+	struct nfs_write_data *wdata = (struct nfs_write_data *)data;
+
+	/* Note this may cause RPC to be resent */
+	wdata->mds_ops->rpc_call_done(task, data);
+}
+
+static void filelayout_write_release(void *data)
+{
+	struct nfs_write_data *wdata = (struct nfs_write_data *)data;
+
+	wdata->mds_ops->rpc_release(data);
+}
+
 struct rpc_call_ops filelayout_read_call_ops = {
 	.rpc_call_prepare = filelayout_read_prepare,
 	.rpc_call_done = filelayout_read_call_done,
 	.rpc_release = filelayout_read_release,
 };
 
+struct rpc_call_ops filelayout_write_call_ops = {
+	.rpc_call_prepare = filelayout_write_prepare,
+	.rpc_call_done = filelayout_write_call_done,
+	.rpc_release = filelayout_write_release,
+};
+
 static enum pnfs_try_status
 filelayout_read_pagelist(struct nfs_read_data *data)
 {
@@ -238,10 +295,52 @@ filelayout_read_pagelist(struct nfs_read_data *data)
 	return PNFS_ATTEMPTED;
 }
 
+/* Perform async writes. */
 static enum pnfs_try_status
 filelayout_write_pagelist(struct nfs_write_data *data, int sync)
 {
-	return PNFS_NOT_ATTEMPTED;
+	struct pnfs_layout_segment *lseg = data->lseg;
+	struct nfs4_pnfs_ds *ds;
+	loff_t offset = data->args.offset;
+	u32 j, idx;
+	struct nfs_fh *fh;
+	int status;
+
+	/* Retrieve the correct rpc_client for the byte range */
+	j = nfs4_fl_calc_j_index(lseg, offset);
+	idx = nfs4_fl_calc_ds_index(lseg, j);
+	ds = nfs4_fl_prepare_ds(lseg, idx);
+	if (!ds) {
+		printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
+		set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
+		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
+		return PNFS_NOT_ATTEMPTED;
+	}
+	dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu\n", __func__,
+		data->inode->i_ino, sync, (size_t) data->args.count, offset,
+		ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
+
+	/* We can't handle commit to ds yet */
+	if (!FILELAYOUT_LSEG(lseg)->commit_through_mds)
+		data->args.stable = NFS_FILE_SYNC;
+
+	data->write_done_cb = filelayout_write_done_cb;
+	data->ds_clp = ds->ds_clp;
+	fh = nfs4_fl_select_ds_fh(lseg, j);
+	if (fh)
+		data->args.fh = fh;
+	/*
+	 * Get the file offset on the dserver. Set the write offset to
+	 * this offset and save the original offset.
+	 */
+	data->args.offset = filelayout_get_dserver_offset(lseg, offset);
+	data->mds_offset = offset;
+
+	/* Perform an asynchronous write */
+	status = nfs_initiate_write(data, ds->ds_clp->cl_rpcclient,
+				    &filelayout_write_call_ops, sync);
+	BUG_ON(status != 0);
+	return PNFS_ATTEMPTED;
 }
 
 /*
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index da902123ec5..7b4b9f3e984 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3145,6 +3145,23 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
 	return data->write_done_cb(task, data);
 }
 
+/* Reset the nfs_write_data to send the write to the MDS. */
+void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data)
+{
+	dprintk("%s Reset task for i/o through\n", __func__);
+	put_lseg(data->lseg);
+	data->lseg          = NULL;
+	data->ds_clp        = NULL;
+	data->write_done_cb = nfs4_write_done_cb;
+	data->args.fh       = NFS_FH(data->inode);
+	data->args.bitmask  = data->res.server->cache_consistency_bitmask;
+	data->args.offset   = data->mds_offset;
+	data->res.fattr     = &data->fattr;
+	task->tk_ops        = data->mds_ops;
+	rpc_task_reset_client(task, NFS_CLIENT(data->inode));
+}
+EXPORT_SYMBOL_GPL(nfs4_reset_write);
+
 static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
 {
 	struct nfs_server *server = NFS_SERVER(data->inode);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index df99c5b0ee6..ee62ddf60e7 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -783,7 +783,7 @@ static int flush_task_priority(int how)
 	return RPC_PRIORITY_NORMAL;
 }
 
-static int nfs_initiate_write(struct nfs_write_data *data,
+int nfs_initiate_write(struct nfs_write_data *data,
 		       struct rpc_clnt *clnt,
 		       const struct rpc_call_ops *call_ops,
 		       int how)
@@ -833,6 +833,7 @@ static int nfs_initiate_write(struct nfs_write_data *data,
 out:
 	return ret;
 }
+EXPORT_SYMBOL_GPL(nfs_initiate_write);
 
 /*
  * Set up the argument/result storage required for the RPC call.
@@ -1194,6 +1195,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 		 */
 		static unsigned long    complain;
 
+		/* Note this will print the MDS for a DS write */
 		if (time_before(complain, jiffies)) {
 			dprintk("NFS:       faulty NFS server %s:"
 				" (committed = %d) != (stable = %d)\n",
@@ -1214,6 +1216,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
 			/* Was this an NFSv2 write or an NFSv3 stable write? */
 			if (resp->verf->committed != NFS_UNSTABLE) {
 				/* Resend from where the server left off */
+				data->mds_offset += resp->count;
 				argp->offset += resp->count;
 				argp->pgbase += resp->count;
 				argp->count -= resp->count;
-- 
cgit v1.2.3


From c76069bda0f17cd3e153e54d9ac01242909c6b15 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Thu, 3 Mar 2011 15:13:48 +0000
Subject: NFSv4.1: rearrange ->doio args

This will make it possible to clear the lseg pointer in the same
function as it is put, instead of in the caller nfs_pageio_doio().

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/pagelist.c | 10 ++--------
 fs/nfs/read.c     | 42 +++++++++++++++++++++++++-----------------
 fs/nfs/write.c    | 28 ++++++++++++++++------------
 3 files changed, 43 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 45b0fb8add3..9f628746f5c 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -214,7 +214,7 @@ nfs_wait_on_request(struct nfs_page *req)
  */
 void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
 		     struct inode *inode,
-		     int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int, struct pnfs_layout_segment *),
+		     int (*doio)(struct nfs_pageio_descriptor *),
 		     size_t bsize,
 		     int io_flags)
 {
@@ -311,13 +311,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
 static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
 {
 	if (!list_empty(&desc->pg_list)) {
-		int error = desc->pg_doio(desc->pg_inode,
-					  &desc->pg_list,
-					  nfs_page_array_len(desc->pg_base,
-							     desc->pg_count),
-					  desc->pg_count,
-					  desc->pg_ioflags,
-					  desc->pg_lseg);
+		int error = desc->pg_doio(desc);
 		desc->pg_lseg = NULL;
 		if (error < 0)
 			desc->pg_error = error;
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index f40c7f4dc16..ab9c7768b7c 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -31,8 +31,8 @@
 
 #define NFSDBG_FACILITY		NFSDBG_PAGECACHE
 
-static int nfs_pagein_multi(struct inode *, struct list_head *, unsigned int, size_t, int, struct pnfs_layout_segment *);
-static int nfs_pagein_one(struct inode *, struct list_head *, unsigned int, size_t, int, struct pnfs_layout_segment *);
+static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc);
+static int nfs_pagein_one(struct nfs_pageio_descriptor *desc);
 static const struct rpc_call_ops nfs_read_partial_ops;
 static const struct rpc_call_ops nfs_read_full_ops;
 
@@ -117,9 +117,9 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
 int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 		       struct page *page)
 {
-	LIST_HEAD(one_request);
 	struct nfs_page	*new;
 	unsigned int len;
+	struct nfs_pageio_descriptor pgio;
 
 	len = nfs_page_length(page);
 	if (len == 0)
@@ -132,11 +132,14 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 	if (len < PAGE_CACHE_SIZE)
 		zero_user_segment(page, len, PAGE_CACHE_SIZE);
 
-	nfs_list_add_request(new, &one_request);
+	nfs_pageio_init(&pgio, inode, NULL, 0, 0);
+	nfs_list_add_request(new, &pgio.pg_list);
+	pgio.pg_count = len;
+
 	if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
-		nfs_pagein_multi(inode, &one_request, 1, len, 0, NULL);
+		nfs_pagein_multi(&pgio);
 	else
-		nfs_pagein_one(inode, &one_request, 1, len, 0, NULL);
+		nfs_pagein_one(&pgio);
 	return 0;
 }
 
@@ -258,20 +261,21 @@ nfs_async_read_error(struct list_head *head)
  * won't see the new data until our attribute cache is updated.  This is more
  * or less conventional NFS client behavior.
  */
-static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags, struct pnfs_layout_segment *lseg)
+static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc)
 {
-	struct nfs_page *req = nfs_list_entry(head->next);
+	struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
 	struct page *page = req->wb_page;
 	struct nfs_read_data *data;
-	size_t rsize = NFS_SERVER(inode)->rsize, nbytes;
+	size_t rsize = NFS_SERVER(desc->pg_inode)->rsize, nbytes;
 	unsigned int offset;
 	int requests = 0;
 	int ret = 0;
+	struct pnfs_layout_segment *lseg;
 	LIST_HEAD(list);
 
 	nfs_list_remove_request(req);
 
-	nbytes = count;
+	nbytes = desc->pg_count;
 	do {
 		size_t len = min(nbytes,rsize);
 
@@ -284,11 +288,11 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
 	} while(nbytes != 0);
 	atomic_set(&req->wb_complete, requests);
 
-	/* We know lseg==NULL */
-	lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_READ);
+	BUG_ON(desc->pg_lseg != NULL);
+	lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ);
 	ClearPageError(page);
 	offset = 0;
-	nbytes = count;
+	nbytes = desc->pg_count;
 	do {
 		int ret2;
 
@@ -321,14 +325,17 @@ out_bad:
 	return -ENOMEM;
 }
 
-static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags, struct pnfs_layout_segment *lseg)
+static int nfs_pagein_one(struct nfs_pageio_descriptor *desc)
 {
 	struct nfs_page		*req;
 	struct page		**pages;
 	struct nfs_read_data	*data;
+	struct list_head *head = &desc->pg_list;
+	struct pnfs_layout_segment *lseg = desc->pg_lseg;
 	int ret = -ENOMEM;
 
-	data = nfs_readdata_alloc(npages);
+	data = nfs_readdata_alloc(nfs_page_array_len(desc->pg_base,
+						     desc->pg_count));
 	if (!data) {
 		nfs_async_read_error(head);
 		goto out;
@@ -344,9 +351,10 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned
 	}
 	req = nfs_list_entry(data->pages.next);
 	if ((!lseg) && list_is_singular(&data->pages))
-		lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_READ);
+		lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ);
 
-	ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0, lseg);
+	ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count,
+				0, lseg);
 out:
 	put_lseg(lseg);
 	return ret;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index ee62ddf60e7..b74200a2f75 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -898,20 +898,21 @@ static void nfs_redirty_request(struct nfs_page *req)
  * Generate multiple small requests to write out a single
  * contiguous dirty area on one page.
  */
-static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how, struct pnfs_layout_segment *lseg)
+static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)
 {
-	struct nfs_page *req = nfs_list_entry(head->next);
+	struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
 	struct page *page = req->wb_page;
 	struct nfs_write_data *data;
-	size_t wsize = NFS_SERVER(inode)->wsize, nbytes;
+	size_t wsize = NFS_SERVER(desc->pg_inode)->wsize, nbytes;
 	unsigned int offset;
 	int requests = 0;
 	int ret = 0;
+	struct pnfs_layout_segment *lseg;
 	LIST_HEAD(list);
 
 	nfs_list_remove_request(req);
 
-	nbytes = count;
+	nbytes = desc->pg_count;
 	do {
 		size_t len = min(nbytes, wsize);
 
@@ -924,11 +925,11 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
 	} while (nbytes != 0);
 	atomic_set(&req->wb_complete, requests);
 
-	BUG_ON(lseg);
-	lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_RW);
+	BUG_ON(desc->pg_lseg);
+	lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW);
 	ClearPageError(page);
 	offset = 0;
-	nbytes = count;
+	nbytes = desc->pg_count;
 	do {
 		int ret2;
 
@@ -940,7 +941,7 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
 		if (nbytes < wsize)
 			wsize = nbytes;
 		ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops,
-					  wsize, offset, lseg, how);
+					  wsize, offset, lseg, desc->pg_ioflags);
 		if (ret == 0)
 			ret = ret2;
 		offset += wsize;
@@ -968,14 +969,17 @@ out_bad:
  * This is the case if nfs_updatepage detects a conflicting request
  * that has been written but not committed.
  */
-static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how, struct pnfs_layout_segment *lseg)
+static int nfs_flush_one(struct nfs_pageio_descriptor *desc)
 {
 	struct nfs_page		*req;
 	struct page		**pages;
 	struct nfs_write_data	*data;
+	struct list_head *head = &desc->pg_list;
+	struct pnfs_layout_segment *lseg = desc->pg_lseg;
 	int ret;
 
-	data = nfs_writedata_alloc(npages);
+	data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base,
+						      desc->pg_count));
 	if (!data) {
 		while (!list_empty(head)) {
 			req = nfs_list_entry(head->next);
@@ -995,10 +999,10 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
 	}
 	req = nfs_list_entry(data->pages.next);
 	if ((!lseg) && list_is_singular(&data->pages))
-		lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_RW);
+		lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW);
 
 	/* Set up the argument struct */
-	ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, lseg, how);
+	ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags);
 out:
 	put_lseg(lseg); /* Cleans any gotten in ->pg_test */
 	return ret;
-- 
cgit v1.2.3


From 36fe432d33e078caee5c954e15e929819c2cacae Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Thu, 3 Mar 2011 15:13:49 +0000
Subject: NFSv4.1: Clear lseg pointer in ->doio function

Now that we have access to the pointer, clear it immediately after
the put, instead of in the caller.

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/pagelist.c | 1 -
 fs/nfs/read.c     | 2 ++
 fs/nfs/write.c    | 2 ++
 3 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 9f628746f5c..23e79441066 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -312,7 +312,6 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
 {
 	if (!list_empty(&desc->pg_list)) {
 		int error = desc->pg_doio(desc);
-		desc->pg_lseg = NULL;
 		if (error < 0)
 			desc->pg_error = error;
 		else
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index ab9c7768b7c..4b764c6048d 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -311,6 +311,7 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc)
 		nbytes -= rsize;
 	} while (nbytes != 0);
 	put_lseg(lseg);
+	desc->pg_lseg = NULL;
 
 	return ret;
 
@@ -357,6 +358,7 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc)
 				0, lseg);
 out:
 	put_lseg(lseg);
+	desc->pg_lseg = NULL;
 	return ret;
 }
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index b74200a2f75..47a3ad63e0d 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -949,6 +949,7 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)
 	} while (nbytes != 0);
 
 	put_lseg(lseg);
+	desc->pg_lseg = NULL;
 	return ret;
 
 out_bad:
@@ -1005,6 +1006,7 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc)
 	ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags);
 out:
 	put_lseg(lseg); /* Cleans any gotten in ->pg_test */
+	desc->pg_lseg = NULL;
 	return ret;
 }
 
-- 
cgit v1.2.3


From 75247affd7930cc3dcf57f850f0d7898379ef3b3 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Tue, 22 Feb 2011 15:56:01 -0800
Subject: NFSv4.1: reject zero layout with zeroed stripe unit

Allowing stripe_unit==0 causes the client to crash later on
when dividing by zero.

Reported-by: Marc Eshel <eshel@almaden.ibm.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4filelayout.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 7e1d4571b7b..42855846481 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -369,8 +369,8 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
 		goto out;
 	}
 
-	if (fl->stripe_unit % PAGE_SIZE) {
-		dprintk("%s Stripe unit (%u) not page aligned\n",
+	if (!fl->stripe_unit || fl->stripe_unit % PAGE_SIZE) {
+		dprintk("%s Invalid stripe unit (%u)\n",
 			__func__, fl->stripe_unit);
 		goto out;
 	}
-- 
cgit v1.2.3


From 5cf36cfdc8caa2724738ad0842c5c3dd02f309dc Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 22 Feb 2011 15:44:31 -0800
Subject: NFSv4: If the server sends us a numeric uid/gid then accept it

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/idmap.c | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 18696882f1c..cbe6e2fa8ce 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -33,6 +33,24 @@
  *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+
+static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res)
+{
+	unsigned long val;
+	char buf[16];
+
+	if (memchr(name, '@', namelen) != NULL || namelen >= sizeof(buf))
+		return 0;
+	memcpy(buf, name, namelen);
+	buf[namelen] = '\0';
+	if (strict_strtoul(buf, 0, &val) != 0)
+		return 0;
+	*res = val;
+	return 1;
+}
 
 #ifdef CONFIG_NFS_USE_NEW_IDMAPPER
 
@@ -42,7 +60,6 @@
 #include <linux/keyctl.h>
 #include <linux/key-type.h>
 #include <linux/rcupdate.h>
-#include <linux/kernel.h>
 #include <linux/err.h>
 
 #include <keys/user-type.h>
@@ -221,11 +238,15 @@ static int nfs_idmap_lookup_id(const char *name, size_t namelen,
 
 int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid)
 {
+	if (nfs_map_string_to_numeric(name, namelen, uid))
+		return 0;
 	return nfs_idmap_lookup_id(name, namelen, "uid", uid);
 }
 
 int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *gid)
 {
+	if (nfs_map_string_to_numeric(name, namelen, gid))
+		return 0;
 	return nfs_idmap_lookup_id(name, namelen, "gid", gid);
 }
 
@@ -243,7 +264,6 @@ int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t bu
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/init.h>
-#include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/socket.h>
 #include <linux/in.h>
@@ -699,6 +719,8 @@ int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen
 {
 	struct idmap *idmap = clp->cl_idmap;
 
+	if (nfs_map_string_to_numeric(name, namelen, uid))
+		return 0;
 	return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid);
 }
 
@@ -706,6 +728,8 @@ int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namele
 {
 	struct idmap *idmap = clp->cl_idmap;
 
+	if (nfs_map_string_to_numeric(name, namelen, uid))
+		return 0;
 	return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid);
 }
 
-- 
cgit v1.2.3


From f0b851689a5da2354f19bcbbac30cd2cab45c4a1 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 22 Feb 2011 15:44:31 -0800
Subject: NFSv4: Send unmapped uid/gids to the server if the idmapper fails

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/idmap.c | 30 ++++++++++++++++++++++++++----
 1 file changed, 26 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index cbe6e2fa8ce..8518573c3ff 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -52,6 +52,11 @@ static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *re
 	return 1;
 }
 
+static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen)
+{
+	return snprintf(buf, buflen, "%u", id);
+}
+
 #ifdef CONFIG_NFS_USE_NEW_IDMAPPER
 
 #include <linux/slab.h>
@@ -252,11 +257,20 @@ int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namele
 
 int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
 {
-	return nfs_idmap_lookup_name(uid, "user", buf, buflen);
+	int ret;
+	ret = nfs_idmap_lookup_name(uid, "user", buf, buflen);
+	if (ret < 0)
+		ret = nfs_map_numeric_to_string(uid, buf, buflen);
+	return ret;
 }
 int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t buflen)
 {
-	return nfs_idmap_lookup_name(gid, "group", buf, buflen);
+	int ret;
+
+	ret = nfs_idmap_lookup_name(gid, "group", buf, buflen);
+	if (ret < 0)
+		ret = nfs_map_numeric_to_string(gid, buf, buflen);
+	return ret;
 }
 
 #else  /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */
@@ -736,14 +750,22 @@ int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namele
 int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
 {
 	struct idmap *idmap = clp->cl_idmap;
+	int ret;
 
-	return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf);
+	ret = nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf);
+	if (ret < 0)
+		ret = nfs_map_numeric_to_string(uid, buf, buflen);
+	return ret;
 }
 int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
 {
 	struct idmap *idmap = clp->cl_idmap;
+	int ret;
 
-	return nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf);
+	ret = nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf);
+	if (ret < 0)
+		ret = nfs_map_numeric_to_string(uid, buf, buflen);
+	return ret;
 }
 
 #endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
-- 
cgit v1.2.3


From e4fd72a17d2703cfd626c55893ac4ca7e7d81ce9 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 22 Feb 2011 15:44:31 -0800
Subject: NFSv4: cleanup idmapper functions to take an nfs_server argument

...instead of the nfs_client.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/idmap.c   | 24 ++++++++++++------------
 fs/nfs/nfs4xdr.c | 18 ++++++++----------
 2 files changed, 20 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 8518573c3ff..e2d579d458f 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -241,21 +241,21 @@ static int nfs_idmap_lookup_id(const char *name, size_t namelen,
 	return ret;
 }
 
-int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid)
+int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
 {
 	if (nfs_map_string_to_numeric(name, namelen, uid))
 		return 0;
 	return nfs_idmap_lookup_id(name, namelen, "uid", uid);
 }
 
-int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *gid)
+int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *gid)
 {
 	if (nfs_map_string_to_numeric(name, namelen, gid))
 		return 0;
 	return nfs_idmap_lookup_id(name, namelen, "gid", gid);
 }
 
-int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
+int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
 {
 	int ret;
 	ret = nfs_idmap_lookup_name(uid, "user", buf, buflen);
@@ -263,7 +263,7 @@ int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buf
 		ret = nfs_map_numeric_to_string(uid, buf, buflen);
 	return ret;
 }
-int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t buflen)
+int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen)
 {
 	int ret;
 
@@ -729,27 +729,27 @@ static unsigned int fnvhash32(const void *buf, size_t buflen)
 	return hash;
 }
 
-int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid)
+int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
 {
-	struct idmap *idmap = clp->cl_idmap;
+	struct idmap *idmap = server->nfs_client->cl_idmap;
 
 	if (nfs_map_string_to_numeric(name, namelen, uid))
 		return 0;
 	return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid);
 }
 
-int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid)
+int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
 {
-	struct idmap *idmap = clp->cl_idmap;
+	struct idmap *idmap = server->nfs_client->cl_idmap;
 
 	if (nfs_map_string_to_numeric(name, namelen, uid))
 		return 0;
 	return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid);
 }
 
-int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
+int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
 {
-	struct idmap *idmap = clp->cl_idmap;
+	struct idmap *idmap = server->nfs_client->cl_idmap;
 	int ret;
 
 	ret = nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf);
@@ -757,9 +757,9 @@ int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buf
 		ret = nfs_map_numeric_to_string(uid, buf, buflen);
 	return ret;
 }
-int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen)
+int nfs_map_gid_to_group(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
 {
-	struct idmap *idmap = clp->cl_idmap;
+	struct idmap *idmap = server->nfs_client->cl_idmap;
 	int ret;
 
 	ret = nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 0f2dcfb41f2..686c21d8c52 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -844,7 +844,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 	if (iap->ia_valid & ATTR_MODE)
 		len += 4;
 	if (iap->ia_valid & ATTR_UID) {
-		owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name, IDMAP_NAMESZ);
+		owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ);
 		if (owner_namelen < 0) {
 			dprintk("nfs: couldn't resolve uid %d to string\n",
 					iap->ia_uid);
@@ -856,7 +856,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
 		len += 4 + (XDR_QUADLEN(owner_namelen) << 2);
 	}
 	if (iap->ia_valid & ATTR_GID) {
-		owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group, IDMAP_NAMESZ);
+		owner_grouplen = nfs_map_gid_to_group(server, iap->ia_gid, owner_group, IDMAP_NAMESZ);
 		if (owner_grouplen < 0) {
 			dprintk("nfs: couldn't resolve gid %d to string\n",
 					iap->ia_gid);
@@ -3387,7 +3387,7 @@ out_overflow:
 }
 
 static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
-		struct nfs_client *clp, uint32_t *uid, int may_sleep)
+		const struct nfs_server *server, uint32_t *uid, int may_sleep)
 {
 	uint32_t len;
 	__be32 *p;
@@ -3407,7 +3407,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
 		if (!may_sleep) {
 			/* do nothing */
 		} else if (len < XDR_MAX_NETOBJ) {
-			if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0)
+			if (nfs_map_name_to_uid(server, (char *)p, len, uid) == 0)
 				ret = NFS_ATTR_FATTR_OWNER;
 			else
 				dprintk("%s: nfs_map_name_to_uid failed!\n",
@@ -3425,7 +3425,7 @@ out_overflow:
 }
 
 static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
-		struct nfs_client *clp, uint32_t *gid, int may_sleep)
+		const struct nfs_server *server, uint32_t *gid, int may_sleep)
 {
 	uint32_t len;
 	__be32 *p;
@@ -3445,7 +3445,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
 		if (!may_sleep) {
 			/* do nothing */
 		} else if (len < XDR_MAX_NETOBJ) {
-			if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0)
+			if (nfs_map_group_to_gid(server, (char *)p, len, gid) == 0)
 				ret = NFS_ATTR_FATTR_GROUP;
 			else
 				dprintk("%s: nfs_map_group_to_gid failed!\n",
@@ -3944,14 +3944,12 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
 		goto xdr_error;
 	fattr->valid |= status;
 
-	status = decode_attr_owner(xdr, bitmap, server->nfs_client,
-			&fattr->uid, may_sleep);
+	status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, may_sleep);
 	if (status < 0)
 		goto xdr_error;
 	fattr->valid |= status;
 
-	status = decode_attr_group(xdr, bitmap, server->nfs_client,
-			&fattr->gid, may_sleep);
+	status = decode_attr_group(xdr, bitmap, server, &fattr->gid, may_sleep);
 	if (status < 0)
 		goto xdr_error;
 	fattr->valid |= status;
-- 
cgit v1.2.3


From 3ddeb7c5c61d0d6bfd837487d3454ffdb788bb91 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 22 Feb 2011 15:44:31 -0800
Subject: NFSv4: Propagate the error NFS4ERR_BADOWNER to nfs4_do_setattr

This will be required in order to switch uid/gid mapping back on if the
admin has tried to disable it.

Note that we also propagate NFS4ERR_BADNAME at the same time, in order to
work around a Linux server bug.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 3 +++
 fs/nfs/nfs4xdr.c  | 2 --
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 7b4b9f3e984..8f3ada04ea1 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -85,6 +85,9 @@ static int nfs4_map_errors(int err)
 	switch (err) {
 	case -NFS4ERR_RESOURCE:
 		return -EREMOTEIO;
+	case -NFS4ERR_BADOWNER:
+	case -NFS4ERR_BADNAME:
+		return -EINVAL;
 	default:
 		dprintk("%s could not handle NFSv4 error %d\n",
 				__func__, -err);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 686c21d8c52..0cf560f7788 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -6171,8 +6171,6 @@ static struct {
 	{ NFS4ERR_DQUOT,	-EDQUOT		},
 	{ NFS4ERR_STALE,	-ESTALE		},
 	{ NFS4ERR_BADHANDLE,	-EBADHANDLE	},
-	{ NFS4ERR_BADOWNER,	-EINVAL		},
-	{ NFS4ERR_BADNAME,	-EINVAL		},
 	{ NFS4ERR_BAD_COOKIE,	-EBADCOOKIE	},
 	{ NFS4ERR_NOTSUPP,	-ENOTSUPP	},
 	{ NFS4ERR_TOOSMALL,	-ETOOSMALL	},
-- 
cgit v1.2.3


From b064eca2cf6440bf9d5843b24cc4010624031694 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 22 Feb 2011 15:44:32 -0800
Subject: NFSv4: Send unmapped uid/gids to the server when using auth_sys

The new behaviour is enabled using the new module parameter
'nfs4_disable_idmapping'.

Note that if the server rejects an unmapped uid or gid, then
the client will automatically switch back to using the idmapper.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c   | 16 ++++++++++++++++
 fs/nfs/idmap.c    | 24 ++++++++++++++++--------
 fs/nfs/nfs4proc.c | 15 ++++++++++++++-
 3 files changed, 46 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 6dd50ac5b54..139be9647d8 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -81,6 +81,11 @@ retry:
 }
 #endif /* CONFIG_NFS_V4 */
 
+/*
+ * Turn off NFSv4 uid/gid mapping when using AUTH_SYS
+ */
+static int nfs4_disable_idmapping = 0;
+
 /*
  * RPC cruft for NFS
  */
@@ -1567,6 +1572,13 @@ static int nfs4_init_server(struct nfs_server *server,
 	if (error < 0)
 		goto error;
 
+	/*
+	 * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower
+	 * authentication.
+	 */
+	if (nfs4_disable_idmapping && data->auth_flavors[0] == RPC_AUTH_UNIX)
+		server->caps |= NFS_CAP_UIDGID_NOMAP;
+
 	if (data->rsize)
 		server->rsize = nfs_block_size(data->rsize, NULL);
 	if (data->wsize)
@@ -1984,3 +1996,7 @@ void nfs_fs_proc_exit(void)
 }
 
 #endif /* CONFIG_PROC_FS */
+
+module_param(nfs4_disable_idmapping, bool, 0644);
+MODULE_PARM_DESC(nfs4_disable_idmapping,
+		"Turn off NFSv4 idmapping when using 'sec=sys'");
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index e2d579d458f..79664a1025a 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -61,6 +61,9 @@ static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen)
 
 #include <linux/slab.h>
 #include <linux/cred.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/nfs4.h>
+#include <linux/nfs_fs_sb.h>
 #include <linux/nfs_idmap.h>
 #include <linux/keyctl.h>
 #include <linux/key-type.h>
@@ -257,17 +260,20 @@ int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size
 
 int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
 {
-	int ret;
-	ret = nfs_idmap_lookup_name(uid, "user", buf, buflen);
+	int ret = -EINVAL;
+
+	if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
+		ret = nfs_idmap_lookup_name(uid, "user", buf, buflen);
 	if (ret < 0)
 		ret = nfs_map_numeric_to_string(uid, buf, buflen);
 	return ret;
 }
 int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen)
 {
-	int ret;
+	int ret = -EINVAL;
 
-	ret = nfs_idmap_lookup_name(gid, "group", buf, buflen);
+	if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
+		ret = nfs_idmap_lookup_name(gid, "group", buf, buflen);
 	if (ret < 0)
 		ret = nfs_map_numeric_to_string(gid, buf, buflen);
 	return ret;
@@ -750,9 +756,10 @@ int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size
 int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
 {
 	struct idmap *idmap = server->nfs_client->cl_idmap;
-	int ret;
+	int ret = -EINVAL;
 
-	ret = nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf);
+	if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
+		ret = nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf);
 	if (ret < 0)
 		ret = nfs_map_numeric_to_string(uid, buf, buflen);
 	return ret;
@@ -760,9 +767,10 @@ int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, s
 int nfs_map_gid_to_group(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
 {
 	struct idmap *idmap = server->nfs_client->cl_idmap;
-	int ret;
+	int ret = -EINVAL;
 
-	ret = nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf);
+	if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
+		ret = nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf);
 	if (ret < 0)
 		ret = nfs_map_numeric_to_string(uid, buf, buflen);
 	return ret;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8f3ada04ea1..1d84e7088af 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -244,7 +244,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
 /* This is the error handling routine for processes that are allowed
  * to sleep.
  */
-static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
+static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
 {
 	struct nfs_client *clp = server->nfs_client;
 	struct nfs4_state *state = exception->state;
@@ -296,6 +296,19 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
 				break;
 		case -NFS4ERR_OLD_STATEID:
 			exception->retry = 1;
+			break;
+		case -NFS4ERR_BADOWNER:
+			/* The following works around a Linux server bug! */
+		case -NFS4ERR_BADNAME:
+			if (server->caps & NFS_CAP_UIDGID_NOMAP) {
+				server->caps &= ~NFS_CAP_UIDGID_NOMAP;
+				exception->retry = 1;
+				printk(KERN_WARNING "NFS: v4 server %s "
+						"does not accept raw "
+						"uid/gids. "
+						"Reenabling the idmapper.\n",
+						server->nfs_client->cl_hostname);
+			}
 	}
 	/* We failed to handle the error */
 	return nfs4_map_errors(ret);
-- 
cgit v1.2.3


From 7ec10f26e1fd5fcceb9c96e508c1292a816199f7 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Tue, 22 Feb 2011 00:28:34 +0300
Subject: NFS: account direct-io into task io accounting

Account NFS direct-io reads and writes into Task I/O Accounting.
Do it before completion to handle aio.

NFS has an unusual direct-io implementation,
thus accounting in the generic code does not work.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index f493bdd74f7..8eea2536671 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -45,6 +45,7 @@
 #include <linux/pagemap.h>
 #include <linux/kref.h>
 #include <linux/slab.h>
+#include <linux/task_io_accounting_ops.h>
 
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
@@ -937,6 +938,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
 	if (retval)
 		goto out;
 
+	task_io_account_read(count);
+
 	retval = nfs_direct_read(iocb, iov, nr_segs, pos);
 	if (retval > 0)
 		iocb->ki_pos = pos + retval;
@@ -998,6 +1001,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	if (retval)
 		goto out;
 
+	task_io_account_write(count);
+
 	retval = nfs_direct_write(iocb, iov, nr_segs, pos, count);
 
 	if (retval > 0)
-- 
cgit v1.2.3


From c12bacec458bef16d843c052f38422862f3da8fe Mon Sep 17 00:00:00 2001
From: Rob Landley <rlandley@parallels.com>
Date: Wed, 9 Mar 2011 15:54:13 -0600
Subject: cleanup: save 60 lines/100 bytes by combining two mostly duplicate
 functions.

Eliminate two mostly duplicate functions (nfs_parse_simple_hostname()
and nfs_parse_protected_hostname()) and instead just make the calling
function (nfs_parse_devname()) do everything.

Signed-off-by: Rob Landley <rlandley@parallels.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 129 +++++++++++++++------------------------------------------
 1 file changed, 33 insertions(+), 96 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index b68c8607770..a74e9740190 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1665,99 +1665,59 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
 	return nfs_walk_authlist(args, &request);
 }
 
-static int nfs_parse_simple_hostname(const char *dev_name,
-				     char **hostname, size_t maxnamlen,
-				     char **export_path, size_t maxpathlen)
+/*
+ * Split "dev_name" into "hostname:export_path".
+ *
+ * The leftmost colon demarks the split between the server's hostname
+ * and the export path.  If the hostname starts with a left square
+ * bracket, then it may contain colons.
+ *
+ * Note: caller frees hostname and export path, even on error.
+ */
+static int nfs_parse_devname(const char *dev_name,
+			     char **hostname, size_t maxnamlen,
+			     char **export_path, size_t maxpathlen)
 {
 	size_t len;
-	char *colon, *comma;
-
-	colon = strchr(dev_name, ':');
-	if (colon == NULL)
-		goto out_bad_devname;
-
-	len = colon - dev_name;
-	if (len > maxnamlen)
-		goto out_hostname;
-
-	/* N.B. caller will free nfs_server.hostname in all cases */
-	*hostname = kstrndup(dev_name, len, GFP_KERNEL);
-	if (!*hostname)
-		goto out_nomem;
+	char *end;
 
-	/* kill possible hostname list: not supported */
-	comma = strchr(*hostname, ',');
-	if (comma != NULL) {
-		if (comma == *hostname)
+	/* Is the host name protected with square brackets? */
+	if (*dev_name == '[') {
+		end = strchr(++dev_name, ']');
+		if (end == NULL || end[1] != ':')
 			goto out_bad_devname;
-		*comma = '\0';
-	}
-
-	colon++;
-	len = strlen(colon);
-	if (len > maxpathlen)
-		goto out_path;
-	*export_path = kstrndup(colon, len, GFP_KERNEL);
-	if (!*export_path)
-		goto out_nomem;
-
-	dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path);
-	return 0;
 
-out_bad_devname:
-	dfprintk(MOUNT, "NFS: device name not in host:path format\n");
-	return -EINVAL;
-
-out_nomem:
-	dfprintk(MOUNT, "NFS: not enough memory to parse device name\n");
-	return -ENOMEM;
-
-out_hostname:
-	dfprintk(MOUNT, "NFS: server hostname too long\n");
-	return -ENAMETOOLONG;
-
-out_path:
-	dfprintk(MOUNT, "NFS: export pathname too long\n");
-	return -ENAMETOOLONG;
-}
-
-/*
- * Hostname has square brackets around it because it contains one or
- * more colons.  We look for the first closing square bracket, and a
- * colon must follow it.
- */
-static int nfs_parse_protected_hostname(const char *dev_name,
-					char **hostname, size_t maxnamlen,
-					char **export_path, size_t maxpathlen)
-{
-	size_t len;
-	char *start, *end;
+		len = end - dev_name;
+		end++;
+	} else {
+		char *comma;
 
-	start = (char *)(dev_name + 1);
+		end = strchr(dev_name, ':');
+		if (end == NULL)
+			goto out_bad_devname;
+		len = end - dev_name;
 
-	end = strchr(start, ']');
-	if (end == NULL)
-		goto out_bad_devname;
-	if (*(end + 1) != ':')
-		goto out_bad_devname;
+		/* kill possible hostname list: not supported */
+		comma = strchr(dev_name, ',');
+		if (comma != NULL && comma < end)
+			*comma = 0;
+	}
 
-	len = end - start;
 	if (len > maxnamlen)
 		goto out_hostname;
 
 	/* N.B. caller will free nfs_server.hostname in all cases */
-	*hostname = kstrndup(start, len, GFP_KERNEL);
+	*hostname = kstrndup(dev_name, len, GFP_KERNEL);
 	if (*hostname == NULL)
 		goto out_nomem;
-
-	end += 2;
-	len = strlen(end);
+	len = strlen(++end);
 	if (len > maxpathlen)
 		goto out_path;
 	*export_path = kstrndup(end, len, GFP_KERNEL);
 	if (!*export_path)
 		goto out_nomem;
 
+	dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path);
 	return 0;
 
 out_bad_devname:
@@ -1777,29 +1737,6 @@ out_path:
 	return -ENAMETOOLONG;
 }
 
-/*
- * Split "dev_name" into "hostname:export_path".
- *
- * The leftmost colon demarks the split between the server's hostname
- * and the export path.  If the hostname starts with a left square
- * bracket, then it may contain colons.
- *
- * Note: caller frees hostname and export path, even on error.
- */
-static int nfs_parse_devname(const char *dev_name,
-			     char **hostname, size_t maxnamlen,
-			     char **export_path, size_t maxpathlen)
-{
-	if (*dev_name == '[')
-		return nfs_parse_protected_hostname(dev_name,
-						    hostname, maxnamlen,
-						    export_path, maxpathlen);
-
-	return nfs_parse_simple_hostname(dev_name,
-					 hostname, maxnamlen,
-					 export_path, maxpathlen);
-}
-
 /*
  * Validate the NFS2/NFS3 mount data
  * - fills in the mount root filehandle
-- 
cgit v1.2.3


From c5cb09b6f898609922f9b873661f6cbc26cb29e1 Mon Sep 17 00:00:00 2001
From: Rob Landley <rlandley@parallels.com>
Date: Wed, 9 Mar 2011 16:02:37 -0600
Subject: Cleanup: Factor out some cut-and-paste code.

Factor out some cut-and-paste code in options parsing.
Saves about 800 bytes on x86-64.

Signed-off-by: Rob Landley <rlandley@parallels.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 155 ++++++++++++++++-----------------------------------------
 1 file changed, 44 insertions(+), 111 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index a74e9740190..7e13e1a6b39 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -979,6 +979,27 @@ static int nfs_parse_security_flavors(char *value,
 	return 1;
 }
 
+static int nfs_get_option_str(substring_t args[], char **option)
+{
+	kfree(*option);	/* release the previously stored string, if any */
+	*option = match_strdup(args);
+	return !*option;	/* nonzero if match_strdup() returned NULL */
+}
+
+static int nfs_get_option_ul(substring_t args[], unsigned long *option)
+{
+	int rc;		/* strict_strtoul() status, handed back to the caller */
+	char *string;	/* temporary copy of the matched token, freed below */
+
+	string = match_strdup(args);
+	if (string == NULL)
+		return -ENOMEM;	/* callers only test for a nonzero return */
+	rc = strict_strtoul(string, 10, option);	/* parse as base 10 */
+	kfree(string);
+
+	return rc;
+}
+
 /*
  * Error-check and convert a string of mount options from user space into
  * a data structure.  The whole mount string is processed; bad options are
@@ -1127,155 +1148,82 @@ static int nfs_parse_mount_options(char *raw,
 		 * options that take numeric values
 		 */
 		case Opt_port:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0 || option > USHRT_MAX)
+			if (nfs_get_option_ul(args, &option) ||
+			    option > USHRT_MAX)
 				goto out_invalid_value;
 			mnt->nfs_server.port = option;
 			break;
 		case Opt_rsize:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0)
+			if (nfs_get_option_ul(args, &option))
 				goto out_invalid_value;
 			mnt->rsize = option;
 			break;
 		case Opt_wsize:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0)
+			if (nfs_get_option_ul(args, &option))
 				goto out_invalid_value;
 			mnt->wsize = option;
 			break;
 		case Opt_bsize:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0)
+			if (nfs_get_option_ul(args, &option))
 				goto out_invalid_value;
 			mnt->bsize = option;
 			break;
 		case Opt_timeo:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0 || option == 0)
+			if (nfs_get_option_ul(args, &option) || option == 0)
 				goto out_invalid_value;
 			mnt->timeo = option;
 			break;
 		case Opt_retrans:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0 || option == 0)
+			if (nfs_get_option_ul(args, &option) || option == 0)
 				goto out_invalid_value;
 			mnt->retrans = option;
 			break;
 		case Opt_acregmin:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0)
+			if (nfs_get_option_ul(args, &option))
 				goto out_invalid_value;
 			mnt->acregmin = option;
 			break;
 		case Opt_acregmax:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0)
+			if (nfs_get_option_ul(args, &option))
 				goto out_invalid_value;
 			mnt->acregmax = option;
 			break;
 		case Opt_acdirmin:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0)
+			if (nfs_get_option_ul(args, &option))
 				goto out_invalid_value;
 			mnt->acdirmin = option;
 			break;
 		case Opt_acdirmax:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0)
+			if (nfs_get_option_ul(args, &option))
 				goto out_invalid_value;
 			mnt->acdirmax = option;
 			break;
 		case Opt_actimeo:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0)
+			if (nfs_get_option_ul(args, &option))
 				goto out_invalid_value;
 			mnt->acregmin = mnt->acregmax =
 			mnt->acdirmin = mnt->acdirmax = option;
 			break;
 		case Opt_namelen:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0)
+			if (nfs_get_option_ul(args, &option))
 				goto out_invalid_value;
 			mnt->namlen = option;
 			break;
 		case Opt_mountport:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0 || option > USHRT_MAX)
+			if (nfs_get_option_ul(args, &option) ||
+			    option > USHRT_MAX)
 				goto out_invalid_value;
 			mnt->mount_server.port = option;
 			break;
 		case Opt_mountvers:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0 ||
+			if (nfs_get_option_ul(args, &option) ||
 			    option < NFS_MNT_VERSION ||
 			    option > NFS_MNT3_VERSION)
 				goto out_invalid_value;
 			mnt->mount_server.version = option;
 			break;
 		case Opt_nfsvers:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0)
+			if (nfs_get_option_ul(args, &option))
 				goto out_invalid_value;
 			switch (option) {
 			case NFS2_VERSION:
@@ -1295,12 +1243,7 @@ static int nfs_parse_mount_options(char *raw,
 			}
 			break;
 		case Opt_minorversion:
-			string = match_strdup(args);
-			if (string == NULL)
-				goto out_nomem;
-			rc = strict_strtoul(string, 10, &option);
-			kfree(string);
-			if (rc != 0)
+			if (nfs_get_option_ul(args, &option))
 				goto out_invalid_value;
 			if (option > NFS4_MAX_MINOR_VERSION)
 				goto out_invalid_value;
@@ -1336,21 +1279,18 @@ static int nfs_parse_mount_options(char *raw,
 			case Opt_xprt_udp:
 				mnt->flags &= ~NFS_MOUNT_TCP;
 				mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
-				kfree(string);
 				break;
 			case Opt_xprt_tcp6:
 				protofamily = AF_INET6;
 			case Opt_xprt_tcp:
 				mnt->flags |= NFS_MOUNT_TCP;
 				mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
-				kfree(string);
 				break;
 			case Opt_xprt_rdma:
 				/* vector side protocols to TCP */
 				mnt->flags |= NFS_MOUNT_TCP;
 				mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
 				xprt_load_transport(string);
-				kfree(string);
 				break;
 			default:
 				dfprintk(MOUNT, "NFS:   unrecognized "
@@ -1358,6 +1298,7 @@ static int nfs_parse_mount_options(char *raw,
 				kfree(string);
 				return 0;
 			}
+			kfree(string);
 			break;
 		case Opt_mountproto:
 			string = match_strdup(args);
@@ -1400,18 +1341,13 @@ static int nfs_parse_mount_options(char *raw,
 				goto out_invalid_address;
 			break;
 		case Opt_clientaddr:
-			string = match_strdup(args);
-			if (string == NULL)
+			if (nfs_get_option_str(args, &mnt->client_address))
 				goto out_nomem;
-			kfree(mnt->client_address);
-			mnt->client_address = string;
 			break;
 		case Opt_mounthost:
-			string = match_strdup(args);
-			if (string == NULL)
+			if (nfs_get_option_str(args,
+					       &mnt->mount_server.hostname))
 				goto out_nomem;
-			kfree(mnt->mount_server.hostname);
-			mnt->mount_server.hostname = string;
 			break;
 		case Opt_mountaddr:
 			string = match_strdup(args);
@@ -1451,11 +1387,8 @@ static int nfs_parse_mount_options(char *raw,
 			};
 			break;
 		case Opt_fscache_uniq:
-			string = match_strdup(args);
-			if (string == NULL)
+			if (nfs_get_option_str(args, &mnt->fscache_uniq))
 				goto out_nomem;
-			kfree(mnt->fscache_uniq);
-			mnt->fscache_uniq = string;
 			mnt->options |= NFS_OPTION_FSCACHE;
 			break;
 		case Opt_local_lock:
-- 
cgit v1.2.3


From 0c9ba97318fc9a905bcc1419b6966de061203a70 Mon Sep 17 00:00:00 2001
From: Alex Elder <aelder@sgi.com>
Date: Fri, 11 Mar 2011 12:39:51 +0000
Subject: xfs: don't name variables "panic"

The new xfs_alert_tag() used a variable named "panic",
and that is to be avoided.  Rename it.

Signed-off-by: Alex Elder <aelder@sgi.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/linux-2.6/xfs_message.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/linux-2.6/xfs_message.c
index 8fe8cf69d80..508e06fd7d1 100644
--- a/fs/xfs/linux-2.6/xfs_message.c
+++ b/fs/xfs/linux-2.6/xfs_message.c
@@ -96,13 +96,13 @@ xfs_alert_tag(
 {
 	struct va_format	vaf;
 	va_list			args;
-	int			panic = 0;
-	int			 r;
+	int			do_panic = 0;
+	int			r;
 
 	if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
 		xfs_printk(KERN_ALERT, mp,
 			"XFS: Transforming an alert into a BUG.");
-		panic = 1;
+		do_panic = 1;
 	}
 
 	va_start(args, fmt);
@@ -113,7 +113,7 @@ xfs_alert_tag(
 	r = __xfs_printk(KERN_ALERT, mp, &vaf);
 	va_end(args);
 
-	BUG_ON(panic);
+	BUG_ON(do_panic);
 
 	return r;
 }
-- 
cgit v1.2.3


From 36e39c40b3facc9b489a13f1d301fc53ff6960a3 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Sat, 12 Mar 2011 07:08:42 -0500
Subject: Btrfs: break out of shrink_delalloc earlier

Josef had changed shrink_delalloc to exit after three shrink
attempts, which wasn't quite enough because new writers could
race in and steal free space.

But it also fixed deadlocks and stalls as we tried to recover
delalloc reservations.  The code was tweaked to loop 1024
times, and would reset the counter any time a small amount
of progress was made.  This was too drastic, and with a
lot of writers we can end up stuck in shrink_delalloc forever.

The shrink_delalloc loop is fairly complex because the caller is looping
too, and the caller will go ahead and force a transaction commit to make
sure we reclaim space.

This reworks things to exit shrink_delalloc when we've forced some
writeback and the delalloc reservations have gone down.  This means
the writeback has not just started but has also finished at
least some of the metadata changes required to reclaim delalloc
space.

If we've got this wrong, we're returning ENOSPC too early, which
is a big improvement over the current behavior of hanging the machine.

Test 224 in xfstests hammers on this nicely, and with 1000 writers
trying to fill a 1GB drive we get our first ENOSPC at 93% full.  The
other writers are able to continue until we get 100%.

This is a worst case test for btrfs because the 1000 writers are doing
small IO, and the small FS size means we don't have a lot of room
for metadata chunks.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  9 +++++++++
 fs/btrfs/extent-tree.c | 35 +++++++++++++++++++++++------------
 2 files changed, 32 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 28188a786da..8b4b9d158a0 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -729,6 +729,15 @@ struct btrfs_space_info {
 	u64 disk_total;		/* total bytes on disk, takes mirrors into
 				   account */
 
+	/*
+	 * we bump reservation progress every time we decrement
+	 * bytes_reserved.  This way people waiting for reservations
+	 * know something good has happened and they can check
+	 * for progress.  The number here isn't to be trusted, it
+	 * just shows reclaim activity
+	 */
+	unsigned long reservation_progress;
+
 	int full;		/* indicates that we cannot allocate any more
 				   chunks for this space */
 	int force_alloc;	/* set if we need to force a chunk alloc for
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 100e409e905..f1db57d4a01 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3343,15 +3343,16 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 	u64 max_reclaim;
 	u64 reclaimed = 0;
 	long time_left;
-	int pause = 1;
 	int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
 	int loops = 0;
+	unsigned long progress;
 
 	block_rsv = &root->fs_info->delalloc_block_rsv;
 	space_info = block_rsv->space_info;
 
 	smp_mb();
 	reserved = space_info->bytes_reserved;
+	progress = space_info->reservation_progress;
 
 	if (reserved == 0)
 		return 0;
@@ -3366,31 +3367,36 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 		writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
 
 		spin_lock(&space_info->lock);
-		if (reserved > space_info->bytes_reserved) {
-			loops = 0;
+		if (reserved > space_info->bytes_reserved)
 			reclaimed += reserved - space_info->bytes_reserved;
-		} else {
-			loops++;
-		}
 		reserved = space_info->bytes_reserved;
 		spin_unlock(&space_info->lock);
 
+		loops++;
+
 		if (reserved == 0 || reclaimed >= max_reclaim)
 			break;
 
 		if (trans && trans->transaction->blocked)
 			return -EAGAIN;
 
-		__set_current_state(TASK_INTERRUPTIBLE);
-		time_left = schedule_timeout(pause);
+		time_left = schedule_timeout_interruptible(1);
 
 		/* We were interrupted, exit */
 		if (time_left)
 			break;
 
-		pause <<= 1;
-		if (pause > HZ / 10)
-			pause = HZ / 10;
+		/* we've kicked the IO a few times, if anything has been freed,
+		 * exit.  There is no sense in looping here for a long time
+		 * when we really need to commit the transaction, or there are
+		 * just too many writers without enough free space
+		 */
+
+		if (loops > 3) {
+			smp_mb();
+			if (progress != space_info->reservation_progress)
+				break;
+		}
 
 	}
 	return reclaimed >= to_reclaim;
@@ -3613,6 +3619,7 @@ void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
 		if (num_bytes) {
 			spin_lock(&space_info->lock);
 			space_info->bytes_reserved -= num_bytes;
+			space_info->reservation_progress++;
 			spin_unlock(&space_info->lock);
 		}
 	}
@@ -3845,6 +3852,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
 	if (block_rsv->reserved >= block_rsv->size) {
 		num_bytes = block_rsv->reserved - block_rsv->size;
 		sinfo->bytes_reserved -= num_bytes;
+		sinfo->reservation_progress++;
 		block_rsv->reserved = block_rsv->size;
 		block_rsv->full = 1;
 	}
@@ -4006,7 +4014,6 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 		to_reserve = 0;
 	}
 	spin_unlock(&BTRFS_I(inode)->accounting_lock);
-
 	to_reserve += calc_csum_metadata_size(inode, num_bytes);
 	ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
 	if (ret)
@@ -4134,6 +4141,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			btrfs_set_block_group_used(&cache->item, old_val);
 			cache->reserved -= num_bytes;
 			cache->space_info->bytes_reserved -= num_bytes;
+			cache->space_info->reservation_progress++;
 			cache->space_info->bytes_used += num_bytes;
 			cache->space_info->disk_used += num_bytes * factor;
 			spin_unlock(&cache->lock);
@@ -4185,6 +4193,7 @@ static int pin_down_extent(struct btrfs_root *root,
 	if (reserved) {
 		cache->reserved -= num_bytes;
 		cache->space_info->bytes_reserved -= num_bytes;
+		cache->space_info->reservation_progress++;
 	}
 	spin_unlock(&cache->lock);
 	spin_unlock(&cache->space_info->lock);
@@ -4235,6 +4244,7 @@ static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
 				space_info->bytes_readonly += num_bytes;
 			cache->reserved -= num_bytes;
 			space_info->bytes_reserved -= num_bytes;
+			space_info->reservation_progress++;
 		}
 		spin_unlock(&cache->lock);
 		spin_unlock(&space_info->lock);
@@ -4713,6 +4723,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 		if (ret) {
 			spin_lock(&cache->space_info->lock);
 			cache->space_info->bytes_reserved -= buf->len;
+			cache->space_info->reservation_progress++;
 			spin_unlock(&cache->space_info->lock);
 		}
 		goto out;
-- 
cgit v1.2.3


From 586ce098a23b6ab7383df853a84ae3d48dc889aa Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 13 Mar 2011 01:50:58 -0500
Subject: compat breakage in preadv() and pwritev()

Fix for a dumb preadv()/pwritev() compat bug - unlike the native
variants, compat_... ones forget to check FMODE_P{READ,WRITE}, so e.g.
on pipe the native preadv() will fail with -ESPIPE and compat one will
act as readv() and succeed.  Not critical, but it's a clear bug with trivial
fix.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/compat.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index f6fd0a00e6c..691c3fd8ce1 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1228,7 +1228,9 @@ compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec,
 	file = fget_light(fd, &fput_needed);
 	if (!file)
 		return -EBADF;
-	ret = compat_readv(file, vec, vlen, &pos);
+	ret = -ESPIPE;
+	if (file->f_mode & FMODE_PREAD)
+		ret = compat_readv(file, vec, vlen, &pos);
 	fput_light(file, fput_needed);
 	return ret;
 }
@@ -1285,7 +1287,9 @@ compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec,
 	file = fget_light(fd, &fput_needed);
 	if (!file)
 		return -EBADF;
-	ret = compat_writev(file, vec, vlen, &pos);
+	ret = -ESPIPE;
+	if (file->f_mode & FMODE_PWRITE)
+		ret = compat_writev(file, vec, vlen, &pos);
 	fput_light(file, fput_needed);
 	return ret;
 }
-- 
cgit v1.2.3


From c44ed965be7a84afaa07543c04eb97a5dfe93422 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Sun, 13 Mar 2011 23:24:46 +0000
Subject: compat breakage in preadv() and pwritev()

Fix for a dumb preadv()/pwritev() compat bug - unlike the native
variants, the compat_...  ones forget to check FMODE_P{READ,WRITE}, so
e.g.  on pipe the native preadv() will fail with -ESPIPE and compat one
will act as readv() and succeed.

Not critical, but it's a clear bug with trivial fix, so IMO it's OK for
-final.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/compat.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index f6fd0a00e6c..691c3fd8ce1 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1228,7 +1228,9 @@ compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec,
 	file = fget_light(fd, &fput_needed);
 	if (!file)
 		return -EBADF;
-	ret = compat_readv(file, vec, vlen, &pos);
+	ret = -ESPIPE;
+	if (file->f_mode & FMODE_PREAD)
+		ret = compat_readv(file, vec, vlen, &pos);
 	fput_light(file, fput_needed);
 	return ret;
 }
@@ -1285,7 +1287,9 @@ compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec,
 	file = fget_light(fd, &fput_needed);
 	if (!file)
 		return -EBADF;
-	ret = compat_writev(file, vec, vlen, &pos);
+	ret = -ESPIPE;
+	if (file->f_mode & FMODE_PWRITE)
+		ret = compat_writev(file, vec, vlen, &pos);
 	fput_light(file, fput_needed);
 	return ret;
 }
-- 
cgit v1.2.3


From c618e87a5fd02aaad006c12d5a80a231dfa39250 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 14 Mar 2011 12:40:29 +0000
Subject: GFS2: Update to AIL list locking

The previous patch missed a couple of places where the AIL list
needed locking, so this fixes up those places, plus a comment
is corrected too.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: Dave Chinner <dchinner@redhat.com>
---
 fs/gfs2/log.c     | 2 +-
 fs/gfs2/lops.c    | 2 ++
 fs/gfs2/meta_io.c | 2 ++
 3 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 4e3c044934e..e7ed31f858d 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -67,7 +67,7 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
  * @mapping: The associated mapping (maybe NULL)
  * @bd: The gfs2_bufdata to remove
  *
- * The log lock _must_ be held when calling this function
+ * The ail lock _must_ be held when calling this function
  *
  */
 
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 4295a6a0f1e..e919abf25ec 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -51,8 +51,10 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
 	/* If this buffer is in the AIL and it has already been written
 	 * to in-place disk block, remove it from the AIL.
 	 */
+	spin_lock(&sdp->sd_ail_lock);
 	if (bd->bd_ail)
 		list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
+	spin_unlock(&sdp->sd_ail_lock);
 	get_bh(bh);
 	atomic_inc(&sdp->sd_log_pinned);
 	trace_gfs2_pin(bd, 1);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 939739c7b3f..01d97f48655 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -326,6 +326,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
 		brelse(bh);
 	}
 	if (bd) {
+		spin_lock(&sdp->sd_ail_lock);
 		if (bd->bd_ail) {
 			gfs2_remove_from_ail(bd);
 			bh->b_private = NULL;
@@ -333,6 +334,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
 			bd->bd_blkno = bh->b_blocknr;
 			gfs2_trans_add_revoke(sdp, bd);
 		}
+		spin_unlock(&sdp->sd_ail_lock);
 	}
 	clear_buffer_dirty(bh);
 	clear_buffer_uptodate(bh);
-- 
cgit v1.2.3


From c9c6cac0c2bdbda42e7b804838648d0bc60ddb13 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 16 Feb 2011 15:15:47 -0500
Subject: kill path_lookup()

all remaining callers pass LOOKUP_PARENT to it, so
flags argument can die; renamed to kern_path_parent()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c              | 7 +++----
 fs/ocfs2/refcounttree.c | 2 +-
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index a4689eb2df2..1d6bc815155 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1747,10 +1747,9 @@ static int do_path_lookup(int dfd, const char *name,
 	return retval;
 }
 
-int path_lookup(const char *name, unsigned int flags,
-			struct nameidata *nd)
+int kern_path_parent(const char *name, struct nameidata *nd)
 {
-	return do_path_lookup(AT_FDCWD, name, flags, nd);
+	return do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, nd);
 }
 
 int kern_path(const char *name, unsigned int flags, struct path *path)
@@ -3586,7 +3585,7 @@ EXPORT_SYMBOL(page_readlink);
 EXPORT_SYMBOL(__page_symlink);
 EXPORT_SYMBOL(page_symlink);
 EXPORT_SYMBOL(page_symlink_inode_operations);
-EXPORT_SYMBOL(path_lookup);
+EXPORT_SYMBOL(kern_path_parent);
 EXPORT_SYMBOL(kern_path);
 EXPORT_SYMBOL(vfs_path_lookup);
 EXPORT_SYMBOL(inode_permission);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 19ebc5aad39..29623da133c 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4379,7 +4379,7 @@ static int ocfs2_user_path_parent(const char __user *path,
 	if (IS_ERR(s))
 		return PTR_ERR(s);
 
-	error = path_lookup(s, LOOKUP_PARENT, nd);
+	error = kern_path_parent(s, nd);
 	if (error)
 		putname(s);
 	else
-- 
cgit v1.2.3


From 52094c8a0610cf57920ad4c6c57470ae2ccbbd25 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 21 Feb 2011 21:34:47 -0500
Subject: take RCU-dependent stuff around exec_permission() into a new helper

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 1d6bc815155..8c704465f6c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1322,6 +1322,18 @@ fail:
 	return PTR_ERR(dentry);
 }
 
+static inline int may_lookup(struct nameidata *nd)
+{
+	if (nd->flags & LOOKUP_RCU) {
+		int err = exec_permission(nd->inode, IPERM_FLAG_RCU);
+		if (err != -ECHILD)
+			return err;
+		if (nameidata_drop_rcu(nd))
+			return -ECHILD;
+	}
+	return exec_permission(nd->inode, 0);
+}
+
 /*
  * Name resolution.
  * This is the basic name resolution function, turning a pathname into
@@ -1352,17 +1364,8 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		unsigned int c;
 
 		nd->flags |= LOOKUP_CONTINUE;
-		if (nd->flags & LOOKUP_RCU) {
-			err = exec_permission(nd->inode, IPERM_FLAG_RCU);
-			if (err == -ECHILD) {
-				if (nameidata_drop_rcu(nd))
-					return -ECHILD;
-				goto exec_again;
-			}
-		} else {
-exec_again:
-			err = exec_permission(nd->inode, 0);
-		}
+
+		err = may_lookup(nd);
  		if (err)
 			break;
 
-- 
cgit v1.2.3


From ee0827cd6b42b0385dc1a116cd853ac1b739f711 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 21 Feb 2011 23:38:09 -0500
Subject: sanitize path_walk() mess

New helper: path_lookupat().  Basically, what do_path_lookup() boils down to,
modulo the -ECHILD/-ESTALE handler.  path_walk* family is gone; vfs_path_lookup()
is using link_path_walk() directly, do_path_lookup() and do_filp_open()
are using path_lookupat().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 148 +++++++++++++++++++++++--------------------------------------
 1 file changed, 56 insertions(+), 92 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 8c704465f6c..f5de5bb1a61 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1520,59 +1520,6 @@ return_err:
 	return err;
 }
 
-static inline int path_walk_rcu(const char *name, struct nameidata *nd)
-{
-	current->total_link_count = 0;
-
-	return link_path_walk(name, nd);
-}
-
-static inline int path_walk_simple(const char *name, struct nameidata *nd)
-{
-	current->total_link_count = 0;
-
-	return link_path_walk(name, nd);
-}
-
-static int path_walk(const char *name, struct nameidata *nd)
-{
-	struct path save = nd->path;
-	int result;
-
-	current->total_link_count = 0;
-
-	/* make sure the stuff we saved doesn't go away */
-	path_get(&save);
-
-	result = link_path_walk(name, nd);
-	if (result == -ESTALE) {
-		/* nd->path had been dropped */
-		current->total_link_count = 0;
-		nd->path = save;
-		nd->inode = save.dentry->d_inode;
-		path_get(&nd->path);
-		nd->flags |= LOOKUP_REVAL;
-		result = link_path_walk(name, nd);
-	}
-
-	path_put(&save);
-
-	return result;
-}
-
-static void path_finish_rcu(struct nameidata *nd)
-{
-	if (nd->flags & LOOKUP_RCU) {
-		/* RCU dangling. Cancel it. */
-		nd->flags &= ~LOOKUP_RCU;
-		nd->root.mnt = NULL;
-		rcu_read_unlock();
-		br_read_unlock(vfsmount_lock);
-	}
-	if (nd->file)
-		fput(nd->file);
-}
-
 static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
 {
 	int retval = 0;
@@ -1697,7 +1644,7 @@ out_fail:
 }
 
 /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
-static int do_path_lookup(int dfd, const char *name,
+static int path_lookupat(int dfd, const char *name,
 				unsigned int flags, struct nameidata *nd)
 {
 	int retval;
@@ -1716,29 +1663,45 @@ static int do_path_lookup(int dfd, const char *name,
 	 * be handled by restarting a traditional ref-walk (which will always
 	 * be able to complete).
 	 */
-	retval = path_init_rcu(dfd, name, flags, nd);
+	if (flags & LOOKUP_RCU)
+		retval = path_init_rcu(dfd, name, flags, nd);
+	else
+		retval = path_init(dfd, name, flags, nd);
+
 	if (unlikely(retval))
 		return retval;
-	retval = path_walk_rcu(name, nd);
-	path_finish_rcu(nd);
+
+	current->total_link_count = 0;
+	retval = link_path_walk(name, nd);
+
+	if (nd->flags & LOOKUP_RCU) {
+		/* RCU dangling. Cancel it. */
+		nd->flags &= ~LOOKUP_RCU;
+		nd->root.mnt = NULL;
+		rcu_read_unlock();
+		br_read_unlock(vfsmount_lock);
+	}
+
+	if (nd->file) {
+		fput(nd->file);
+		nd->file = NULL;
+	}
+
 	if (nd->root.mnt) {
 		path_put(&nd->root);
 		nd->root.mnt = NULL;
 	}
+	return retval;
+}
 
-	if (unlikely(retval == -ECHILD || retval == -ESTALE)) {
-		/* slower, locked walk */
-		if (retval == -ESTALE)
-			flags |= LOOKUP_REVAL;
-		retval = path_init(dfd, name, flags, nd);
-		if (unlikely(retval))
-			return retval;
-		retval = path_walk(name, nd);
-		if (nd->root.mnt) {
-			path_put(&nd->root);
-			nd->root.mnt = NULL;
-		}
-	}
+static int do_path_lookup(int dfd, const char *name,
+				unsigned int flags, struct nameidata *nd)
+{
+	int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
+	if (unlikely(retval == -ECHILD))
+		retval = path_lookupat(dfd, name, flags, nd);
+	if (unlikely(retval == -ESTALE))
+		retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);
 
 	if (likely(!retval)) {
 		if (unlikely(!audit_dummy_context())) {
@@ -1746,7 +1709,6 @@ static int do_path_lookup(int dfd, const char *name,
 				audit_inode(name, nd->path.dentry);
 		}
 	}
-
 	return retval;
 }
 
@@ -1776,7 +1738,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 		    const char *name, unsigned int flags,
 		    struct nameidata *nd)
 {
-	int retval;
+	int result;
 
 	/* same as do_path_lookup */
 	nd->last_type = LAST_ROOT;
@@ -1790,15 +1752,27 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 	path_get(&nd->root);
 	nd->inode = nd->path.dentry->d_inode;
 
-	retval = path_walk(name, nd);
-	if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
+	current->total_link_count = 0;
+
+	result = link_path_walk(name, nd);
+	if (result == -ESTALE) {
+		/* nd->path had been dropped */
+		current->total_link_count = 0;
+		nd->path.dentry = dentry;
+		nd->path.mnt = mnt;
+		nd->inode = dentry->d_inode;
+		path_get(&nd->path);
+		nd->flags |= LOOKUP_REVAL;
+		result = link_path_walk(name, nd);
+	}
+	if (unlikely(!result && !audit_dummy_context() && nd->path.dentry &&
 				nd->inode))
 		audit_inode(name, nd->path.dentry);
 
 	path_put(&nd->root);
 	nd->root.mnt = NULL;
 
-	return retval;
+	return result;
 }
 
 static struct dentry *__lookup_hash(struct qstr *name,
@@ -2483,24 +2457,14 @@ out_filp2:
 
 creat:
 	/* OK, have to create the file. Find the parent. */
-	error = path_init_rcu(dfd, pathname,
-			LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
-	if (error)
-		goto out_filp;
-	error = path_walk_rcu(pathname, &nd);
-	path_finish_rcu(&nd);
-	if (unlikely(error == -ECHILD || error == -ESTALE)) {
-		/* slower, locked walk */
-		if (error == -ESTALE) {
+	error = path_lookupat(dfd, pathname, LOOKUP_PARENT | LOOKUP_RCU, &nd);
+	if (unlikely(error == -ECHILD))
+		error = path_lookupat(dfd, pathname, LOOKUP_PARENT, &nd);
+	if (unlikely(error == -ESTALE)) {
 reval:
-			flags |= LOOKUP_REVAL;
-		}
-		error = path_init(dfd, pathname,
-				LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
-		if (error)
-			goto out_filp;
-
-		error = path_walk_simple(pathname, &nd);
+		flags |= LOOKUP_REVAL;
+		error = path_lookupat(dfd, pathname,
+				LOOKUP_PARENT | LOOKUP_REVAL, &nd);
 	}
 	if (unlikely(error))
 		goto out_filp;
-- 
cgit v1.2.3


From e41f7d4ee5bdb00da7d327a00b0ab9c4a2e9eaa3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 22 Feb 2011 14:02:58 -0500
Subject: merge path_init and path_init_rcu

Actual dependency on whether we want RCU or not is in 3 small areas
(as it ought to be) and everything around those is the same in both
versions.  Since each function has only one caller and those callers
are on two sides of if (flags & LOOKUP_RCU), it's easier and cleaner
to merge them and pull the checks inside.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 118 ++++++++++++++++++-------------------------------------------
 1 file changed, 35 insertions(+), 83 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index f5de5bb1a61..b9e537980ef 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1520,45 +1520,44 @@ return_err:
 	return err;
 }
 
-static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
+static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
 {
 	int retval = 0;
 	int fput_needed;
 	struct file *file;
 
 	nd->last_type = LAST_ROOT; /* if there are only slashes... */
-	nd->flags = flags | LOOKUP_RCU;
+	nd->flags = flags;
 	nd->depth = 0;
 	nd->root.mnt = NULL;
 	nd->file = NULL;
 
 	if (*name=='/') {
-		struct fs_struct *fs = current->fs;
-		unsigned seq;
-
-		br_read_lock(vfsmount_lock);
-		rcu_read_lock();
-
-		do {
-			seq = read_seqcount_begin(&fs->seq);
-			nd->root = fs->root;
-			nd->path = nd->root;
-			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
-		} while (read_seqcount_retry(&fs->seq, seq));
-
+		if (flags & LOOKUP_RCU) {
+			br_read_lock(vfsmount_lock);
+			rcu_read_lock();
+			set_root_rcu(nd);
+		} else {
+			set_root(nd);
+			path_get(&nd->root);
+		}
+		nd->path = nd->root;
 	} else if (dfd == AT_FDCWD) {
-		struct fs_struct *fs = current->fs;
-		unsigned seq;
-
-		br_read_lock(vfsmount_lock);
-		rcu_read_lock();
+		if (flags & LOOKUP_RCU) {
+			struct fs_struct *fs = current->fs;
+			unsigned seq;
 
-		do {
-			seq = read_seqcount_begin(&fs->seq);
-			nd->path = fs->pwd;
-			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
-		} while (read_seqcount_retry(&fs->seq, seq));
+			br_read_lock(vfsmount_lock);
+			rcu_read_lock();
 
+			do {
+				seq = read_seqcount_begin(&fs->seq);
+				nd->path = fs->pwd;
+				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+			} while (read_seqcount_retry(&fs->seq, seq));
+		} else {
+			get_fs_pwd(current->fs, &nd->path);
+		}
 	} else {
 		struct dentry *dentry;
 
@@ -1578,62 +1577,18 @@ static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct n
 			goto fput_fail;
 
 		nd->path = file->f_path;
-		if (fput_needed)
-			nd->file = file;
-
-		nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
-		br_read_lock(vfsmount_lock);
-		rcu_read_lock();
+		if (flags & LOOKUP_RCU) {
+			if (fput_needed)
+				nd->file = file;
+			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+			br_read_lock(vfsmount_lock);
+			rcu_read_lock();
+		} else {
+			path_get(&file->f_path);
+			fput_light(file, fput_needed);
+		}
 	}
-	nd->inode = nd->path.dentry->d_inode;
-	return 0;
 
-fput_fail:
-	fput_light(file, fput_needed);
-out_fail:
-	return retval;
-}
-
-static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
-{
-	int retval = 0;
-	int fput_needed;
-	struct file *file;
-
-	nd->last_type = LAST_ROOT; /* if there are only slashes... */
-	nd->flags = flags;
-	nd->depth = 0;
-	nd->root.mnt = NULL;
-
-	if (*name=='/') {
-		set_root(nd);
-		nd->path = nd->root;
-		path_get(&nd->root);
-	} else if (dfd == AT_FDCWD) {
-		get_fs_pwd(current->fs, &nd->path);
-	} else {
-		struct dentry *dentry;
-
-		file = fget_light(dfd, &fput_needed);
-		retval = -EBADF;
-		if (!file)
-			goto out_fail;
-
-		dentry = file->f_path.dentry;
-
-		retval = -ENOTDIR;
-		if (!S_ISDIR(dentry->d_inode->i_mode))
-			goto fput_fail;
-
-		retval = file_permission(file, MAY_EXEC);
-		if (retval)
-			goto fput_fail;
-
-		nd->path = file->f_path;
-		path_get(&file->f_path);
-
-		fput_light(file, fput_needed);
-	}
 	nd->inode = nd->path.dentry->d_inode;
 	return 0;
 
@@ -1663,10 +1618,7 @@ static int path_lookupat(int dfd, const char *name,
 	 * be handled by restarting a traditional ref-walk (which will always
 	 * be able to complete).
 	 */
-	if (flags & LOOKUP_RCU)
-		retval = path_init_rcu(dfd, name, flags, nd);
-	else
-		retval = path_init(dfd, name, flags, nd);
+	retval = path_init(dfd, name, flags, nd);
 
 	if (unlikely(retval))
 		return retval;
-- 
cgit v1.2.3


From fe479a580dc9c737c4eb49ff7fdb31d41d2c7003 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 22 Feb 2011 15:10:03 -0500
Subject: merge component type recognition

no need to do it in three places...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 48 ++++++++++++++++++++++--------------------------
 1 file changed, 22 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index b9e537980ef..4521b5ff7c9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1362,6 +1362,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		unsigned long hash;
 		struct qstr this;
 		unsigned int c;
+		int type;
 
 		nd->flags |= LOOKUP_CONTINUE;
 
@@ -1381,6 +1382,16 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		this.len = name - (const char *) this.name;
 		this.hash = end_name_hash(hash);
 
+		type = LAST_NORM;
+		if (this.name[0] == '.') switch (this.len) {
+			case 2:
+				if (this.name[1] == '.')
+					type = LAST_DOTDOT;
+				break;
+			case 1:
+				type = LAST_DOT;
+		}
+
 		/* remove trailing slashes? */
 		if (!c)
 			goto last_component;
@@ -1393,21 +1404,17 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		 * to be able to know about the current root directory and
 		 * parent relationships.
 		 */
-		if (this.name[0] == '.') switch (this.len) {
-			default:
-				break;
-			case 2:
-				if (this.name[1] != '.')
-					break;
+		if (unlikely(type != LAST_NORM)) {
+			if (type == LAST_DOTDOT) {
 				if (nd->flags & LOOKUP_RCU) {
 					if (follow_dotdot_rcu(nd))
 						return -ECHILD;
 				} else
 					follow_dotdot(nd);
-				/* fallthrough */
-			case 1:
-				continue;
+			}
+			continue;
 		}
+
 		/* This does the actual lookups.. */
 		err = do_lookup(nd, &this, &next, &inode);
 		if (err)
@@ -1441,20 +1448,15 @@ last_component:
 		nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
 		if (lookup_flags & LOOKUP_PARENT)
 			goto lookup_parent;
-		if (this.name[0] == '.') switch (this.len) {
-			default:
-				break;
-			case 2:
-				if (this.name[1] != '.')
-					break;
+		if (unlikely(type != LAST_NORM)) {
+			if (type == LAST_DOTDOT) {
 				if (nd->flags & LOOKUP_RCU) {
 					if (follow_dotdot_rcu(nd))
 						return -ECHILD;
 				} else
 					follow_dotdot(nd);
-				/* fallthrough */
-			case 1:
-				goto return_reval;
+			}
+			goto return_reval;
 		}
 		err = do_lookup(nd, &this, &next, &inode);
 		if (err)
@@ -1480,14 +1482,8 @@ last_component:
 		goto return_base;
 lookup_parent:
 		nd->last = this;
-		nd->last_type = LAST_NORM;
-		if (this.name[0] != '.')
-			goto return_base;
-		if (this.len == 1)
-			nd->last_type = LAST_DOT;
-		else if (this.len == 2 && this.name[1] == '.')
-			nd->last_type = LAST_DOTDOT;
-		else
+		nd->last_type = type;
+		if (type == LAST_NORM)
 			goto return_base;
 return_reval:
 		/*
-- 
cgit v1.2.3


From 16c2cd7179881d5dd87779512ca5a0d657c64f62 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 22 Feb 2011 15:50:10 -0500
Subject: untangle the "need_reval_dot" mess

instead of ad-hackery around need_reval_dot(), do the following:
set a flag (LOOKUP_JUMPED) in the beginning of path, on absolute
symlink traversal, on ".." and on procfs-style symlinks.  Clear on
normal components, leave unchanged on ".".  Non-nested callers of
link_path_walk() call handle_reval_path(), which checks that flag
is set and that fs does want the final revalidate thing, then does
->d_revalidate().  In link_path_walk() all the return_reval stuff
is gone.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 107 +++++++++++++++++++++++++------------------------------------
 1 file changed, 44 insertions(+), 63 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 4521b5ff7c9..450b686e968 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -613,19 +613,8 @@ do_revalidate_rcu(struct dentry *dentry, struct nameidata *nd)
 	return dentry;
 }
 
-static inline int need_reval_dot(struct dentry *dentry)
-{
-	if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
-		return 0;
-
-	if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
-		return 0;
-
-	return 1;
-}
-
 /*
- * force_reval_path - force revalidation of a dentry
+ * handle_reval_path - force revalidation of a dentry
  *
  * In some situations the path walking code will trust dentries without
  * revalidating them. This causes problems for filesystems that depend on
@@ -639,27 +628,28 @@ static inline int need_reval_dot(struct dentry *dentry)
  * invalidate the dentry. It's up to the caller to handle putting references
  * to the path if necessary.
  */
-static int
-force_reval_path(struct path *path, struct nameidata *nd)
+static inline int handle_reval_path(struct nameidata *nd)
 {
+	struct dentry *dentry = nd->path.dentry;
 	int status;
-	struct dentry *dentry = path->dentry;
 
-	/*
-	 * only check on filesystems where it's possible for the dentry to
-	 * become stale.
-	 */
-	if (!need_reval_dot(dentry))
+	if (likely(!(nd->flags & LOOKUP_JUMPED)))
+		return 0;
+
+	if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
 		return 0;
 
+	if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
+		return 0;
+
+	/* Note: we do not d_invalidate() */
 	status = d_revalidate(dentry, nd);
 	if (status > 0)
 		return 0;
 
-	if (!status) {
-		d_invalidate(dentry);
+	if (!status)
 		status = -ESTALE;
-	}
+
 	return status;
 }
 
@@ -728,6 +718,7 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
 		path_put(&nd->path);
 		nd->path = nd->root;
 		path_get(&nd->root);
+		nd->flags |= LOOKUP_JUMPED;
 	}
 	nd->inode = nd->path.dentry->d_inode;
 
@@ -779,11 +770,8 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
 		error = 0;
 		if (s)
 			error = __vfs_follow_link(nd, s);
-		else if (nd->last_type == LAST_BIND) {
-			error = force_reval_path(&nd->path, nd);
-			if (error)
-				path_put(&nd->path);
-		}
+		else if (nd->last_type == LAST_BIND)
+			nd->flags |= LOOKUP_JUMPED;
 	}
 	return error;
 }
@@ -1351,7 +1339,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 	while (*name=='/')
 		name++;
 	if (!*name)
-		goto return_reval;
+		goto return_base;
 
 	if (nd->depth)
 		lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
@@ -1385,12 +1373,16 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		type = LAST_NORM;
 		if (this.name[0] == '.') switch (this.len) {
 			case 2:
-				if (this.name[1] == '.')
+				if (this.name[1] == '.') {
 					type = LAST_DOTDOT;
+					nd->flags |= LOOKUP_JUMPED;
+				}
 				break;
 			case 1:
 				type = LAST_DOT;
 		}
+		if (likely(type == LAST_NORM))
+			nd->flags &= ~LOOKUP_JUMPED;
 
 		/* remove trailing slashes? */
 		if (!c)
@@ -1456,7 +1448,7 @@ last_component:
 				} else
 					follow_dotdot(nd);
 			}
-			goto return_reval;
+			goto return_base;
 		}
 		err = do_lookup(nd, &this, &next, &inode);
 		if (err)
@@ -1483,24 +1475,6 @@ last_component:
 lookup_parent:
 		nd->last = this;
 		nd->last_type = type;
-		if (type == LAST_NORM)
-			goto return_base;
-return_reval:
-		/*
-		 * We bypassed the ordinary revalidation routines.
-		 * We may need to check the cached dentry for staleness.
-		 */
-		if (need_reval_dot(nd->path.dentry)) {
-			if (nameidata_drop_rcu_last_maybe(nd))
-				return -ECHILD;
-			/* Note: we do not d_invalidate() */
-			err = d_revalidate(nd->path.dentry, nd);
-			if (!err)
-				err = -ESTALE;
-			if (err < 0)
-				break;
-			return 0;
-		}
 return_base:
 		if (nameidata_drop_rcu_last_maybe(nd))
 			return -ECHILD;
@@ -1523,7 +1497,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei
 	struct file *file;
 
 	nd->last_type = LAST_ROOT; /* if there are only slashes... */
-	nd->flags = flags;
+	nd->flags = flags | LOOKUP_JUMPED;
 	nd->depth = 0;
 	nd->root.mnt = NULL;
 	nd->file = NULL;
@@ -1630,6 +1604,9 @@ static int path_lookupat(int dfd, const char *name,
 		br_read_unlock(vfsmount_lock);
 	}
 
+	if (!retval)
+		retval = handle_reval_path(nd);
+
 	if (nd->file) {
 		fput(nd->file);
 		nd->file = NULL;
@@ -1690,7 +1667,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 
 	/* same as do_path_lookup */
 	nd->last_type = LAST_ROOT;
-	nd->flags = flags;
+	nd->flags = flags | LOOKUP_JUMPED;
 	nd->depth = 0;
 
 	nd->path.dentry = dentry;
@@ -1703,6 +1680,8 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 	current->total_link_count = 0;
 
 	result = link_path_walk(name, nd);
+	if (!result)
+		result = handle_reval_path(nd);
 	if (result == -ESTALE) {
 		/* nd->path had been dropped */
 		current->total_link_count = 0;
@@ -1710,8 +1689,11 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 		nd->path.mnt = mnt;
 		nd->inode = dentry->d_inode;
 		path_get(&nd->path);
-		nd->flags |= LOOKUP_REVAL;
+		nd->flags = flags | LOOKUP_JUMPED | LOOKUP_REVAL;
+
 		result = link_path_walk(name, nd);
+		if (!result)
+			result = handle_reval_path(nd);
 	}
 	if (unlikely(!result && !audit_dummy_context() && nd->path.dentry &&
 				nd->inode))
@@ -2198,30 +2180,29 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 {
 	struct dentry *dir = nd->path.dentry;
 	struct file *filp;
-	int error = -EISDIR;
+	int error;
 
 	switch (nd->last_type) {
 	case LAST_DOTDOT:
 		follow_dotdot(nd);
 		dir = nd->path.dentry;
 	case LAST_DOT:
-		if (need_reval_dot(dir)) {
-			int status = d_revalidate(nd->path.dentry, nd);
-			if (!status)
-				status = -ESTALE;
-			if (status < 0) {
-				error = status;
-				goto exit;
-			}
-		}
 		/* fallthrough */
 	case LAST_ROOT:
+		error = handle_reval_path(nd);
+		if (error)
+			goto exit;
+		error = -EISDIR;
 		goto exit;
 	case LAST_BIND:
+		error = handle_reval_path(nd);
+		if (error)
+			goto exit;
 		audit_inode(pathname, dir);
 		goto ok;
 	}
 
+	error = -EISDIR;
 	/* trailing slashes? */
 	if (nd->last.name[nd->last.len])
 		goto exit;
@@ -2422,7 +2403,7 @@ reval:
 	/*
 	 * We have the parent and last component.
 	 */
-	nd.flags = flags;
+	nd.flags = (nd.flags & ~LOOKUP_PARENT) | flags;
 	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
 	while (unlikely(!filp)) { /* trailing symlink */
 		struct path link = path;
-- 
cgit v1.2.3


From 086e183a641109033420e0b26ddecb6f4abb4c89 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 22 Feb 2011 20:56:27 -0500
Subject: pull dropping RCU on success of link_path_walk() into path_lookupat()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 30 ++++++++++++------------------
 1 file changed, 12 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 450b686e968..8f10a9ff9f6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -539,14 +539,6 @@ err_unlock:
 	return -ECHILD;
 }
 
-/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing.  */
-static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd)
-{
-	if (likely(nd->flags & LOOKUP_RCU))
-		return nameidata_drop_rcu_last(nd);
-	return 0;
-}
-
 /**
  * release_open_intent - free up open intent resources
  * @nd: pointer to nameidata
@@ -1339,7 +1331,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 	while (*name=='/')
 		name++;
 	if (!*name)
-		goto return_base;
+		return 0;
 
 	if (nd->depth)
 		lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
@@ -1448,7 +1440,7 @@ last_component:
 				} else
 					follow_dotdot(nd);
 			}
-			goto return_base;
+			return 0;
 		}
 		err = do_lookup(nd, &this, &next, &inode);
 		if (err)
@@ -1471,13 +1463,10 @@ last_component:
 			if (!nd->inode->i_op->lookup)
 				break;
 		}
-		goto return_base;
+		return 0;
 lookup_parent:
 		nd->last = this;
 		nd->last_type = type;
-return_base:
-		if (nameidata_drop_rcu_last_maybe(nd))
-			return -ECHILD;
 		return 0;
 out_dput:
 		if (!(nd->flags & LOOKUP_RCU))
@@ -1598,10 +1587,15 @@ static int path_lookupat(int dfd, const char *name,
 
 	if (nd->flags & LOOKUP_RCU) {
 		/* RCU dangling. Cancel it. */
-		nd->flags &= ~LOOKUP_RCU;
-		nd->root.mnt = NULL;
-		rcu_read_unlock();
-		br_read_unlock(vfsmount_lock);
+		if (!retval) {
+			if (nameidata_drop_rcu_last(nd))
+				retval = -ECHILD;
+		} else {
+			nd->flags &= ~LOOKUP_RCU;
+			nd->root.mnt = NULL;
+			rcu_read_unlock();
+			br_read_unlock(vfsmount_lock);
+		}
 	}
 
 	if (!retval)
-- 
cgit v1.2.3


From 36f3b4f69070fee7c647bab5dc4408990bb3606c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 22 Feb 2011 21:24:38 -0500
Subject: pull security_inode_follow_link() into __do_follow_link()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 8f10a9ff9f6..f956567270b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -754,6 +754,13 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
 	if (link->mnt == nd->path.mnt)
 		mntget(link->mnt);
 
+	error = security_inode_follow_link(link->dentry, nd);
+	if (error) {
+		*p = ERR_PTR(error); /* no ->put_link(), please */
+		path_put(&nd->path);
+		return error;
+	}
+
 	nd->last_type = LAST_BIND;
 	*p = dentry->d_inode->i_op->follow_link(dentry, nd);
 	error = PTR_ERR(*p);
@@ -791,9 +798,6 @@ static inline int do_follow_link(struct inode *inode, struct path *path, struct
 		goto loop;
 	BUG_ON(nd->depth >= MAX_NESTED_LINKS);
 	cond_resched();
-	err = security_inode_follow_link(path->dentry, nd);
-	if (err)
-		goto loop;
 	current->link_count++;
 	current->total_link_count++;
 	nd->depth++;
@@ -2420,9 +2424,6 @@ reval:
 		 * just set LAST_BIND.
 		 */
 		nd.flags |= LOOKUP_PARENT;
-		error = security_inode_follow_link(link.dentry, &nd);
-		if (error)
-			goto exit_dput;
 		error = __do_follow_link(&link, &nd, &cookie);
 		if (unlikely(error)) {
 			if (!IS_ERR(cookie) && linki->i_op->put_link)
-- 
cgit v1.2.3


From f1afe9efc84476ca42fbb7301a441021063eead7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 22 Feb 2011 22:27:28 -0500
Subject: clean up the failure exits after __do_follow_link() in do_filp_open()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index f956567270b..e0f59031be8 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2426,15 +2426,12 @@ reval:
 		nd.flags |= LOOKUP_PARENT;
 		error = __do_follow_link(&link, &nd, &cookie);
 		if (unlikely(error)) {
-			if (!IS_ERR(cookie) && linki->i_op->put_link)
-				linki->i_op->put_link(link.dentry, &nd, cookie);
-			/* nd.path had been dropped */
-			nd.path = link;
-			goto out_path;
+			filp = ERR_PTR(error);
+		} else {
+			nd.flags &= ~LOOKUP_PARENT;
+			filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
 		}
-		nd.flags &= ~LOOKUP_PARENT;
-		filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
-		if (linki->i_op->put_link)
+		if (!IS_ERR(cookie) && linki->i_op->put_link)
 			linki->i_op->put_link(link.dentry, &nd, cookie);
 		path_put(&link);
 	}
-- 
cgit v1.2.3


From c3e380b0b3cfa613189fb91513efd88a65e1d9d8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 23 Feb 2011 13:39:45 -0500
Subject: Collect "operation mode" arguments of do_last() into a structure

No point messing with passing shitloads of "operation mode" arguments
to do_last() one by one, especially since they are not going to change
during do_filp_open().  Collect them into a struct, fill it and pass
to do_last() by reference.

Make sure that lookup intent flags are correctly set and removed - we
want them for do_last(), but they make no sense for __do_follow_link().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 57 +++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 35 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index e0f59031be8..5e4206f4537 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2169,17 +2169,26 @@ exit:
 	return ERR_PTR(error);
 }
 
+struct open_flags {
+	int open_flag;
+	int mode;
+	int acc_mode;
+	int intent;
+};
+
 /*
  * Handle O_CREAT case for do_filp_open
  */
 static struct file *do_last(struct nameidata *nd, struct path *path,
-			    int open_flag, int acc_mode,
-			    int mode, const char *pathname)
+			    const struct open_flags *op, const char *pathname)
 {
 	struct dentry *dir = nd->path.dentry;
 	struct file *filp;
 	int error;
 
+	nd->flags &= ~LOOKUP_PARENT;
+	nd->flags |= op->intent;
+
 	switch (nd->last_type) {
 	case LAST_DOTDOT:
 		follow_dotdot(nd);
@@ -2233,7 +2242,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		error = mnt_want_write(nd->path.mnt);
 		if (error)
 			goto exit_mutex_unlock;
-		error = __open_namei_create(nd, path, open_flag, mode);
+		error = __open_namei_create(nd, path, op->open_flag, op->mode);
 		if (error) {
 			mnt_drop_write(nd->path.mnt);
 			goto exit;
@@ -2242,7 +2251,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		mnt_drop_write(nd->path.mnt);
 		path_put(&nd->path);
 		if (!IS_ERR(filp)) {
-			error = ima_file_check(filp, acc_mode);
+			error = ima_file_check(filp, op->acc_mode);
 			if (error) {
 				fput(filp);
 				filp = ERR_PTR(error);
@@ -2258,7 +2267,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	audit_inode(pathname, path->dentry);
 
 	error = -EEXIST;
-	if (open_flag & O_EXCL)
+	if (op->open_flag & O_EXCL)
 		goto exit_dput;
 
 	error = follow_managed(path, nd->flags);
@@ -2278,7 +2287,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	if (S_ISDIR(nd->inode->i_mode))
 		goto exit;
 ok:
-	filp = finish_open(nd, open_flag, acc_mode);
+	filp = finish_open(nd, op->open_flag, op->acc_mode);
 	return filp;
 
 exit_mutex_unlock:
@@ -2304,7 +2313,8 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	struct path path;
 	int count = 0;
 	int flag = open_to_namei_flags(open_flag);
-	int flags;
+	int flags = 0;
+	struct open_flags op;
 
 	if (!(open_flag & O_CREAT))
 		mode = 0;
@@ -2321,6 +2331,8 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	if (open_flag & __O_SYNC)
 		open_flag |= O_DSYNC;
 
+	op.open_flag = open_flag;
+
 	if (!acc_mode)
 		acc_mode = MAY_OPEN | ACC_MODE(open_flag);
 
@@ -2333,12 +2345,15 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	if (open_flag & O_APPEND)
 		acc_mode |= MAY_APPEND;
 
-	flags = LOOKUP_OPEN;
+	op.acc_mode = acc_mode;
+
+	op.intent = LOOKUP_OPEN;
 	if (open_flag & O_CREAT) {
-		flags |= LOOKUP_CREATE;
+		op.intent |= LOOKUP_CREATE;
 		if (open_flag & O_EXCL)
-			flags |= LOOKUP_EXCL;
+			op.intent |= LOOKUP_EXCL;
 	}
+
 	if (open_flag & O_DIRECTORY)
 		flags |= LOOKUP_DIRECTORY;
 	if (!(open_flag & O_NOFOLLOW))
@@ -2357,7 +2372,7 @@ struct file *do_filp_open(int dfd, const char *pathname,
 		goto creat;
 
 	/* !O_CREAT, simple open */
-	error = do_path_lookup(dfd, pathname, flags, &nd);
+	error = do_path_lookup(dfd, pathname, flags | op.intent, &nd);
 	if (unlikely(error))
 		goto out_filp2;
 	error = -ELOOP;
@@ -2384,14 +2399,14 @@ out_filp2:
 
 creat:
 	/* OK, have to create the file. Find the parent. */
-	error = path_lookupat(dfd, pathname, LOOKUP_PARENT | LOOKUP_RCU, &nd);
+	error = path_lookupat(dfd, pathname,
+			LOOKUP_PARENT | LOOKUP_RCU | flags, &nd);
 	if (unlikely(error == -ECHILD))
-		error = path_lookupat(dfd, pathname, LOOKUP_PARENT, &nd);
+		error = path_lookupat(dfd, pathname, LOOKUP_PARENT | flags, &nd);
 	if (unlikely(error == -ESTALE)) {
 reval:
 		flags |= LOOKUP_REVAL;
-		error = path_lookupat(dfd, pathname,
-				LOOKUP_PARENT | LOOKUP_REVAL, &nd);
+		error = path_lookupat(dfd, pathname, LOOKUP_PARENT | flags, &nd);
 	}
 	if (unlikely(error))
 		goto out_filp;
@@ -2401,8 +2416,7 @@ reval:
 	/*
 	 * We have the parent and last component.
 	 */
-	nd.flags = (nd.flags & ~LOOKUP_PARENT) | flags;
-	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
+	filp = do_last(&nd, &path, &op, pathname);
 	while (unlikely(!filp)) { /* trailing symlink */
 		struct path link = path;
 		struct inode *linki = link.dentry->d_inode;
@@ -2424,13 +2438,12 @@ reval:
 		 * just set LAST_BIND.
 		 */
 		nd.flags |= LOOKUP_PARENT;
+		nd.flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
 		error = __do_follow_link(&link, &nd, &cookie);
-		if (unlikely(error)) {
+		if (unlikely(error))
 			filp = ERR_PTR(error);
-		} else {
-			nd.flags &= ~LOOKUP_PARENT;
-			filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
-		}
+		else
+			filp = do_last(&nd, &path, &op, pathname);
 		if (!IS_ERR(cookie) && linki->i_op->put_link)
 			linki->i_op->put_link(link.dentry, &nd, cookie);
 		path_put(&link);
-- 
cgit v1.2.3


From 47c805dc2d2dff686962f5f0baa6bac2d703ba19 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 23 Feb 2011 17:44:09 -0500
Subject: switch do_filp_open() to struct open_flags

take calculation of open_flags from open(2) arguments into a new helper
in fs/open.c, move filp_open() over there, have it and do_sys_open()
use that helper, switch exec.c callers of do_filp_open() to explicit
(and constant) struct open_flags.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/exec.c     | 18 ++++++++----
 fs/internal.h |  8 ++++++
 fs/namei.c    | 88 ++++++-----------------------------------------------------
 fs/open.c     | 73 ++++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 101 insertions(+), 86 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 52a447d9b6a..ba99e1abb1a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -115,13 +115,16 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
 	struct file *file;
 	char *tmp = getname(library);
 	int error = PTR_ERR(tmp);
+	static const struct open_flags uselib_flags = {
+		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
+		.acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN,
+		.intent = LOOKUP_OPEN
+	};
 
 	if (IS_ERR(tmp))
 		goto out;
 
-	file = do_filp_open(AT_FDCWD, tmp,
-				O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0,
-				MAY_READ | MAY_EXEC | MAY_OPEN);
+	file = do_filp_open(AT_FDCWD, tmp, &uselib_flags, LOOKUP_FOLLOW);
 	putname(tmp);
 	error = PTR_ERR(file);
 	if (IS_ERR(file))
@@ -721,10 +724,13 @@ struct file *open_exec(const char *name)
 {
 	struct file *file;
 	int err;
+	static const struct open_flags open_exec_flags = {
+		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
+		.acc_mode = MAY_EXEC | MAY_OPEN,
+		.intent = LOOKUP_OPEN
+	};
 
-	file = do_filp_open(AT_FDCWD, name,
-				O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0,
-				MAY_EXEC | MAY_OPEN);
+	file = do_filp_open(AT_FDCWD, name, &open_exec_flags, LOOKUP_FOLLOW);
 	if (IS_ERR(file))
 		goto out;
 
diff --git a/fs/internal.h b/fs/internal.h
index 9b976b57d7f..6fdbdf2c604 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -106,6 +106,14 @@ extern void put_super(struct super_block *sb);
 struct nameidata;
 extern struct file *nameidata_to_filp(struct nameidata *);
 extern void release_open_intent(struct nameidata *);
+struct open_flags {
+	int open_flag;
+	int mode;
+	int acc_mode;
+	int intent;
+};
+extern struct file *do_filp_open(int dfd, const char *pathname,
+		const struct open_flags *op, int lookup_flags);
 
 /*
  * inode.c
diff --git a/fs/namei.c b/fs/namei.c
index 5e4206f4537..9c7fa946abe 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2169,13 +2169,6 @@ exit:
 	return ERR_PTR(error);
 }
 
-struct open_flags {
-	int open_flag;
-	int mode;
-	int acc_mode;
-	int intent;
-};
-
 /*
  * Handle O_CREAT case for do_filp_open
  */
@@ -2305,74 +2298,28 @@ exit:
  * open_to_namei_flags() for more details.
  */
 struct file *do_filp_open(int dfd, const char *pathname,
-		int open_flag, int mode, int acc_mode)
+		const struct open_flags *op, int flags)
 {
 	struct file *filp;
 	struct nameidata nd;
 	int error;
 	struct path path;
 	int count = 0;
-	int flag = open_to_namei_flags(open_flag);
-	int flags = 0;
-	struct open_flags op;
-
-	if (!(open_flag & O_CREAT))
-		mode = 0;
-
-	/* Must never be set by userspace */
-	open_flag &= ~FMODE_NONOTIFY;
-
-	/*
-	 * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
-	 * check for O_DSYNC if the need any syncing at all we enforce it's
-	 * always set instead of having to deal with possibly weird behaviour
-	 * for malicious applications setting only __O_SYNC.
-	 */
-	if (open_flag & __O_SYNC)
-		open_flag |= O_DSYNC;
-
-	op.open_flag = open_flag;
-
-	if (!acc_mode)
-		acc_mode = MAY_OPEN | ACC_MODE(open_flag);
-
-	/* O_TRUNC implies we need access checks for write permissions */
-	if (open_flag & O_TRUNC)
-		acc_mode |= MAY_WRITE;
-
-	/* Allow the LSM permission hook to distinguish append 
-	   access from general write access. */
-	if (open_flag & O_APPEND)
-		acc_mode |= MAY_APPEND;
-
-	op.acc_mode = acc_mode;
-
-	op.intent = LOOKUP_OPEN;
-	if (open_flag & O_CREAT) {
-		op.intent |= LOOKUP_CREATE;
-		if (open_flag & O_EXCL)
-			op.intent |= LOOKUP_EXCL;
-	}
-
-	if (open_flag & O_DIRECTORY)
-		flags |= LOOKUP_DIRECTORY;
-	if (!(open_flag & O_NOFOLLOW))
-		flags |= LOOKUP_FOLLOW;
 
 	filp = get_empty_filp();
 	if (!filp)
 		return ERR_PTR(-ENFILE);
 
-	filp->f_flags = open_flag;
+	filp->f_flags = op->open_flag;
 	nd.intent.open.file = filp;
-	nd.intent.open.flags = flag;
-	nd.intent.open.create_mode = mode;
+	nd.intent.open.flags = open_to_namei_flags(op->open_flag);
+	nd.intent.open.create_mode = op->mode;
 
-	if (open_flag & O_CREAT)
+	if (op->open_flag & O_CREAT)
 		goto creat;
 
 	/* !O_CREAT, simple open */
-	error = do_path_lookup(dfd, pathname, flags | op.intent, &nd);
+	error = do_path_lookup(dfd, pathname, flags | op->intent, &nd);
 	if (unlikely(error))
 		goto out_filp2;
 	error = -ELOOP;
@@ -2386,7 +2333,7 @@ struct file *do_filp_open(int dfd, const char *pathname,
 			goto out_path2;
 	}
 	audit_inode(pathname, nd.path.dentry);
-	filp = finish_open(&nd, open_flag, acc_mode);
+	filp = finish_open(&nd, op->open_flag, op->acc_mode);
 out2:
 	release_open_intent(&nd);
 	return filp;
@@ -2416,7 +2363,7 @@ reval:
 	/*
 	 * We have the parent and last component.
 	 */
-	filp = do_last(&nd, &path, &op, pathname);
+	filp = do_last(&nd, &path, op, pathname);
 	while (unlikely(!filp)) { /* trailing symlink */
 		struct path link = path;
 		struct inode *linki = link.dentry->d_inode;
@@ -2443,7 +2390,7 @@ reval:
 		if (unlikely(error))
 			filp = ERR_PTR(error);
 		else
-			filp = do_last(&nd, &path, &op, pathname);
+			filp = do_last(&nd, &path, op, pathname);
 		if (!IS_ERR(cookie) && linki->i_op->put_link)
 			linki->i_op->put_link(link.dentry, &nd, cookie);
 		path_put(&link);
@@ -2465,23 +2412,6 @@ out_filp:
 	goto out;
 }
 
-/**
- * filp_open - open file and return file pointer
- *
- * @filename:	path to open
- * @flags:	open flags as per the open(2) second argument
- * @mode:	mode for the new file if O_CREAT is set, else ignored
- *
- * This is the helper to open a file from kernelspace if you really
- * have to.  But in generally you should not do this, so please move
- * along, nothing to see here..
- */
-struct file *filp_open(const char *filename, int flags, int mode)
-{
-	return do_filp_open(AT_FDCWD, filename, flags, mode, 0);
-}
-EXPORT_SYMBOL(filp_open);
-
 /**
  * lookup_create - lookup a dentry, creating it if it doesn't exist
  * @nd: nameidata info
diff --git a/fs/open.c b/fs/open.c
index b47aab39c05..d05e18c60ba 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -890,15 +890,86 @@ void fd_install(unsigned int fd, struct file *file)
 
 EXPORT_SYMBOL(fd_install);
 
+static inline int build_open_flags(int flags, int mode, struct open_flags *op)
+{
+	int lookup_flags = 0;
+	int acc_mode;
+
+	if (!(flags & O_CREAT))
+		mode = 0;
+	op->mode = mode;
+
+	/* Must never be set by userspace */
+	flags &= ~FMODE_NONOTIFY;
+
+	/*
+	 * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
+	 * check for O_DSYNC if the need any syncing at all we enforce it's
+	 * always set instead of having to deal with possibly weird behaviour
+	 * for malicious applications setting only __O_SYNC.
+	 */
+	if (flags & __O_SYNC)
+		flags |= O_DSYNC;
+
+	op->open_flag = flags;
+
+	acc_mode = MAY_OPEN | ACC_MODE(flags);
+
+	/* O_TRUNC implies we need access checks for write permissions */
+	if (flags & O_TRUNC)
+		acc_mode |= MAY_WRITE;
+
+	/* Allow the LSM permission hook to distinguish append
+	   access from general write access. */
+	if (flags & O_APPEND)
+		acc_mode |= MAY_APPEND;
+
+	op->acc_mode = acc_mode;
+
+	op->intent = LOOKUP_OPEN;
+	if (flags & O_CREAT) {
+		op->intent |= LOOKUP_CREATE;
+		if (flags & O_EXCL)
+			op->intent |= LOOKUP_EXCL;
+	}
+
+	if (flags & O_DIRECTORY)
+		lookup_flags |= LOOKUP_DIRECTORY;
+	if (!(flags & O_NOFOLLOW))
+		lookup_flags |= LOOKUP_FOLLOW;
+	return lookup_flags;
+}
+
+/**
+ * filp_open - open file and return file pointer
+ *
+ * @filename:	path to open
+ * @flags:	open flags as per the open(2) second argument
+ * @mode:	mode for the new file if O_CREAT is set, else ignored
+ *
+ * This is the helper to open a file from kernelspace if you really
+ * have to.  But in generally you should not do this, so please move
+ * along, nothing to see here..
+ */
+struct file *filp_open(const char *filename, int flags, int mode)
+{
+	struct open_flags op;
+	int lookup = build_open_flags(flags, mode, &op);
+	return do_filp_open(AT_FDCWD, filename, &op, lookup);
+}
+EXPORT_SYMBOL(filp_open);
+
 long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
 {
+	struct open_flags op;
+	int lookup = build_open_flags(flags, mode, &op);
 	char *tmp = getname(filename);
 	int fd = PTR_ERR(tmp);
 
 	if (!IS_ERR(tmp)) {
 		fd = get_unused_fd_flags(flags);
 		if (fd >= 0) {
-			struct file *f = do_filp_open(dfd, tmp, flags, mode, 0);
+			struct file *f = do_filp_open(dfd, tmp, &op, lookup);
 			if (IS_ERR(f)) {
 				put_unused_fd(fd);
 				fd = PTR_ERR(f);
-- 
cgit v1.2.3


From 13aab428a73d3200b9283b61b7fdf5713181ac66 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 23 Feb 2011 17:54:08 -0500
Subject: separate -ESTALE/-ECHILD retries in do_filp_open() from real work

new helper: path_openat().  Does what do_filp_open() does, except
that it tries only the walk mode (RCU/normal/force revalidation)
it had been told to.

Both create and non-create branches are using path_lookupat() now.
Fixed the double audit_inode() in non-create branch.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 49 ++++++++++++++++++++-----------------------------
 1 file changed, 20 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 9c7fa946abe..01a17dd2f15 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2292,19 +2292,14 @@ exit:
 	return ERR_PTR(error);
 }
 
-/*
- * Note that the low bits of the passed in "open_flag"
- * are not the same as in the local variable "flag". See
- * open_to_namei_flags() for more details.
- */
-struct file *do_filp_open(int dfd, const char *pathname,
+static struct file *path_openat(int dfd, const char *pathname,
 		const struct open_flags *op, int flags)
 {
 	struct file *filp;
 	struct nameidata nd;
-	int error;
 	struct path path;
 	int count = 0;
+	int error;
 
 	filp = get_empty_filp();
 	if (!filp)
@@ -2319,42 +2314,27 @@ struct file *do_filp_open(int dfd, const char *pathname,
 		goto creat;
 
 	/* !O_CREAT, simple open */
-	error = do_path_lookup(dfd, pathname, flags | op->intent, &nd);
+	error = path_lookupat(dfd, pathname, flags | op->intent, &nd);
 	if (unlikely(error))
-		goto out_filp2;
+		goto out_filp;
 	error = -ELOOP;
 	if (!(nd.flags & LOOKUP_FOLLOW)) {
 		if (nd.inode->i_op->follow_link)
-			goto out_path2;
+			goto out_path;
 	}
 	error = -ENOTDIR;
 	if (nd.flags & LOOKUP_DIRECTORY) {
 		if (!nd.inode->i_op->lookup)
-			goto out_path2;
+			goto out_path;
 	}
 	audit_inode(pathname, nd.path.dentry);
 	filp = finish_open(&nd, op->open_flag, op->acc_mode);
-out2:
 	release_open_intent(&nd);
 	return filp;
 
-out_path2:
-	path_put(&nd.path);
-out_filp2:
-	filp = ERR_PTR(error);
-	goto out2;
-
 creat:
 	/* OK, have to create the file. Find the parent. */
-	error = path_lookupat(dfd, pathname,
-			LOOKUP_PARENT | LOOKUP_RCU | flags, &nd);
-	if (unlikely(error == -ECHILD))
-		error = path_lookupat(dfd, pathname, LOOKUP_PARENT | flags, &nd);
-	if (unlikely(error == -ESTALE)) {
-reval:
-		flags |= LOOKUP_REVAL;
-		error = path_lookupat(dfd, pathname, LOOKUP_PARENT | flags, &nd);
-	}
+	error = path_lookupat(dfd, pathname, LOOKUP_PARENT | flags, &nd);
 	if (unlikely(error))
 		goto out_filp;
 	if (unlikely(!audit_dummy_context()))
@@ -2398,8 +2378,6 @@ reval:
 out:
 	if (nd.root.mnt)
 		path_put(&nd.root);
-	if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL))
-		goto reval;
 	release_open_intent(&nd);
 	return filp;
 
@@ -2412,6 +2390,19 @@ out_filp:
 	goto out;
 }
 
+struct file *do_filp_open(int dfd, const char *pathname,
+		const struct open_flags *op, int flags)
+{
+	struct file *filp;
+
+	filp = path_openat(dfd, pathname, op, flags | LOOKUP_RCU);
+	if (unlikely(filp == ERR_PTR(-ECHILD)))
+		filp = path_openat(dfd, pathname, op, flags);
+	if (unlikely(filp == ERR_PTR(-ESTALE)))
+		filp = path_openat(dfd, pathname, op, flags | LOOKUP_REVAL);
+	return filp;
+}
+
 /**
  * lookup_create - lookup a dentry, creating it if it doesn't exist
  * @nd: nameidata info
-- 
cgit v1.2.3


From 7bc055d1d524f209bf49d8b9cb220712dd7df4ed Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 23 Feb 2011 19:41:31 -0500
Subject: kill out_dput: in link_path_walk()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 01a17dd2f15..fea36369dc8 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1407,22 +1407,19 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		err = do_lookup(nd, &this, &next, &inode);
 		if (err)
 			break;
-		err = -ENOENT;
-		if (!inode)
-			goto out_dput;
 
-		if (inode->i_op->follow_link) {
+		if (inode && inode->i_op->follow_link) {
 			err = do_follow_link(inode, &next, nd);
 			if (err)
 				goto return_err;
 			nd->inode = nd->path.dentry->d_inode;
-			err = -ENOENT;
-			if (!nd->inode)
-				break;
 		} else {
 			path_to_nameidata(&next, nd);
 			nd->inode = inode;
 		}
+		err = -ENOENT;
+		if (!nd->inode)
+			break;
 		err = -ENOTDIR; 
 		if (!nd->inode->i_op->lookup)
 			break;
@@ -1472,10 +1469,6 @@ lookup_parent:
 		nd->last = this;
 		nd->last_type = type;
 		return 0;
-out_dput:
-		if (!(nd->flags & LOOKUP_RCU))
-			path_put_conditional(&next, nd);
-		break;
 	}
 	if (!(nd->flags & LOOKUP_RCU))
 		path_put(&nd->path);
-- 
cgit v1.2.3


From 9856fa1b281eccdc9f8d94d716e96818c675e78e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 Mar 2011 14:22:06 -0500
Subject: pull handling of . and .. into inlined helper

getting LOOKUP_RCU checks out of link_path_walk()...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index fea36369dc8..d29f91e8ff3 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1318,6 +1318,18 @@ static inline int may_lookup(struct nameidata *nd)
 	return exec_permission(nd->inode, 0);
 }
 
+static inline int handle_dots(struct nameidata *nd, int type)
+{
+	if (type == LAST_DOTDOT) {
+		if (nd->flags & LOOKUP_RCU) {
+			if (follow_dotdot_rcu(nd))
+				return -ECHILD;
+		} else
+			follow_dotdot(nd);
+	}
+	return 0;
+}
+
 /*
  * Name resolution.
  * This is the basic name resolution function, turning a pathname into
@@ -1393,13 +1405,8 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		 * parent relationships.
 		 */
 		if (unlikely(type != LAST_NORM)) {
-			if (type == LAST_DOTDOT) {
-				if (nd->flags & LOOKUP_RCU) {
-					if (follow_dotdot_rcu(nd))
-						return -ECHILD;
-				} else
-					follow_dotdot(nd);
-			}
+			if (handle_dots(nd, type))
+				return -ECHILD;
 			continue;
 		}
 
@@ -1434,13 +1441,8 @@ last_component:
 		if (lookup_flags & LOOKUP_PARENT)
 			goto lookup_parent;
 		if (unlikely(type != LAST_NORM)) {
-			if (type == LAST_DOTDOT) {
-				if (nd->flags & LOOKUP_RCU) {
-					if (follow_dotdot_rcu(nd))
-						return -ECHILD;
-				} else
-					follow_dotdot(nd);
-			}
+			if (handle_dots(nd, type))
+				return -ECHILD;
 			return 0;
 		}
 		err = do_lookup(nd, &this, &next, &inode);
-- 
cgit v1.2.3


From 4455ca6223cc59cbc0a75f4be8bce9e84cc0d6b8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 Mar 2011 14:28:10 -0500
Subject: clear RCU on all failure exits from link_path_walk()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index d29f91e8ff3..f09887a4583 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1405,8 +1405,9 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		 * parent relationships.
 		 */
 		if (unlikely(type != LAST_NORM)) {
-			if (handle_dots(nd, type))
-				return -ECHILD;
+			err = handle_dots(nd, type);
+			if (err)
+				goto return_err;
 			continue;
 		}
 
@@ -1441,8 +1442,9 @@ last_component:
 		if (lookup_flags & LOOKUP_PARENT)
 			goto lookup_parent;
 		if (unlikely(type != LAST_NORM)) {
-			if (handle_dots(nd, type))
-				return -ECHILD;
+			err = handle_dots(nd, type);
+			if (err)
+				goto return_err;
 			return 0;
 		}
 		err = do_lookup(nd, &this, &next, &inode);
@@ -1475,6 +1477,12 @@ lookup_parent:
 	if (!(nd->flags & LOOKUP_RCU))
 		path_put(&nd->path);
 return_err:
+	if (nd->flags & LOOKUP_RCU) {
+		nd->flags &= ~LOOKUP_RCU;
+		nd->root.mnt = NULL;
+		rcu_read_unlock();
+		br_read_unlock(vfsmount_lock);
+	}
 	return err;
 }
 
@@ -1585,16 +1593,10 @@ static int path_lookupat(int dfd, const char *name,
 	retval = link_path_walk(name, nd);
 
 	if (nd->flags & LOOKUP_RCU) {
-		/* RCU dangling. Cancel it. */
-		if (!retval) {
-			if (nameidata_drop_rcu_last(nd))
-				retval = -ECHILD;
-		} else {
-			nd->flags &= ~LOOKUP_RCU;
-			nd->root.mnt = NULL;
-			rcu_read_unlock();
-			br_read_unlock(vfsmount_lock);
-		}
+		/* went all way through without dropping RCU */
+		BUG_ON(retval);
+		if (nameidata_drop_rcu_last(nd))
+			retval = -ECHILD;
 	}
 
 	if (!retval)
-- 
cgit v1.2.3


From ef7562d5283a91da3ba5c14de3221f47b7f08823 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 Mar 2011 14:35:59 -0500
Subject: make handle_dots() leave RCU mode on error

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index f09887a4583..ea14bfb0478 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1052,7 +1052,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 
 			seq = read_seqcount_begin(&parent->d_seq);
 			if (read_seqcount_retry(&old->d_seq, nd->seq))
-				return -ECHILD;
+				goto failed;
 			inode = parent->d_inode;
 			nd->path.dentry = parent;
 			nd->seq = seq;
@@ -1065,8 +1065,14 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 	}
 	__follow_mount_rcu(nd, &nd->path, &inode, true);
 	nd->inode = inode;
-
 	return 0;
+
+failed:
+	nd->flags &= ~LOOKUP_RCU;
+	nd->root.mnt = NULL;
+	rcu_read_unlock();
+	br_read_unlock(vfsmount_lock);
+	return -ECHILD;
 }
 
 /*
@@ -1405,9 +1411,8 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		 * parent relationships.
 		 */
 		if (unlikely(type != LAST_NORM)) {
-			err = handle_dots(nd, type);
-			if (err)
-				goto return_err;
+			if (handle_dots(nd, type))
+				return -ECHILD;
 			continue;
 		}
 
@@ -1441,12 +1446,8 @@ last_component:
 		nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
 		if (lookup_flags & LOOKUP_PARENT)
 			goto lookup_parent;
-		if (unlikely(type != LAST_NORM)) {
-			err = handle_dots(nd, type);
-			if (err)
-				goto return_err;
-			return 0;
-		}
+		if (unlikely(type != LAST_NORM))
+			return handle_dots(nd, type);
 		err = do_lookup(nd, &this, &next, &inode);
 		if (err)
 			break;
-- 
cgit v1.2.3


From a7472baba22dd5d68580f528374f93421b33667e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 Mar 2011 14:39:30 -0500
Subject: make nameidata_dentry_drop_rcu_maybe() always leave RCU mode

Now we have do_follow_link() guaranteed to leave without dangling RCU
and the next step will get the LOOKUP_RCU logic completely out of
link_path_walk().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index ea14bfb0478..53bba7c1a52 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -498,8 +498,15 @@ err_root:
 /* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing.  */
 static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry)
 {
-	if (nd->flags & LOOKUP_RCU)
-		return nameidata_dentry_drop_rcu(nd, dentry);
+	if (nd->flags & LOOKUP_RCU) {
+		if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) {
+			nd->flags &= ~LOOKUP_RCU;
+			nd->root.mnt = NULL;
+			rcu_read_unlock();
+			br_read_unlock(vfsmount_lock);
+			return -ECHILD;
+		}
+	}
 	return 0;
 }
 
@@ -1424,7 +1431,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		if (inode && inode->i_op->follow_link) {
 			err = do_follow_link(inode, &next, nd);
 			if (err)
-				goto return_err;
+				return err;
 			nd->inode = nd->path.dentry->d_inode;
 		} else {
 			path_to_nameidata(&next, nd);
@@ -1455,7 +1462,7 @@ last_component:
 		    (lookup_flags & LOOKUP_FOLLOW)) {
 			err = do_follow_link(inode, &next, nd);
 			if (err)
-				goto return_err;
+				return err;
 			nd->inode = nd->path.dentry->d_inode;
 		} else {
 			path_to_nameidata(&next, nd);
@@ -1477,7 +1484,6 @@ lookup_parent:
 	}
 	if (!(nd->flags & LOOKUP_RCU))
 		path_put(&nd->path);
-return_err:
 	if (nd->flags & LOOKUP_RCU) {
 		nd->flags &= ~LOOKUP_RCU;
 		nd->root.mnt = NULL;
-- 
cgit v1.2.3


From 951361f954596bd134d4270df834f47d151f98a6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 Mar 2011 14:44:37 -0500
Subject: get rid of the last LOOKUP_RCU dependencies in link_path_walk()

New helper: terminate_walk().  An error has happened during pathname
resolution and we either drop nd->path or terminate RCU, depending
on the mode we had been in.  After that, nd is essentially empty.
Switch link_path_walk() to using that for cleanup.

Now the top-level logic in link_path_walk() is back to sanity.  RCU
dependencies are in the lower-level functions.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 53bba7c1a52..85f6e39b403 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1343,6 +1343,18 @@ static inline int handle_dots(struct nameidata *nd, int type)
 	return 0;
 }
 
+static void terminate_walk(struct nameidata *nd)
+{
+	if (!(nd->flags & LOOKUP_RCU)) {
+		path_put(&nd->path);
+	} else {
+		nd->flags &= ~LOOKUP_RCU;
+		nd->root.mnt = NULL;
+		rcu_read_unlock();
+		br_read_unlock(vfsmount_lock);
+	}
+}
+
 /*
  * Name resolution.
  * This is the basic name resolution function, turning a pathname into
@@ -1482,14 +1494,7 @@ lookup_parent:
 		nd->last_type = type;
 		return 0;
 	}
-	if (!(nd->flags & LOOKUP_RCU))
-		path_put(&nd->path);
-	if (nd->flags & LOOKUP_RCU) {
-		nd->flags &= ~LOOKUP_RCU;
-		nd->root.mnt = NULL;
-		rcu_read_unlock();
-		br_read_unlock(vfsmount_lock);
-	}
+	terminate_walk(nd);
 	return err;
 }
 
-- 
cgit v1.2.3


From 70e9b3571107b88674cd55ae4bed33f76261e7d3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 5 Mar 2011 21:12:22 -0500
Subject: get rid of nd->file

Don't stash the struct file * used as starting point of walk in nameidata;
pass file ** to path_init() instead.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 85f6e39b403..a260a306daf 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1498,7 +1498,8 @@ lookup_parent:
 	return err;
 }
 
-static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
+static int path_init(int dfd, const char *name, unsigned int flags,
+		     struct nameidata *nd, struct file **fp)
 {
 	int retval = 0;
 	int fput_needed;
@@ -1508,7 +1509,6 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei
 	nd->flags = flags | LOOKUP_JUMPED;
 	nd->depth = 0;
 	nd->root.mnt = NULL;
-	nd->file = NULL;
 
 	if (*name=='/') {
 		if (flags & LOOKUP_RCU) {
@@ -1557,7 +1557,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei
 		nd->path = file->f_path;
 		if (flags & LOOKUP_RCU) {
 			if (fput_needed)
-				nd->file = file;
+				*fp = file;
 			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
 			br_read_lock(vfsmount_lock);
 			rcu_read_lock();
@@ -1580,6 +1580,7 @@ out_fail:
 static int path_lookupat(int dfd, const char *name,
 				unsigned int flags, struct nameidata *nd)
 {
+	struct file *base = NULL;
 	int retval;
 
 	/*
@@ -1596,7 +1597,7 @@ static int path_lookupat(int dfd, const char *name,
 	 * be handled by restarting a traditional ref-walk (which will always
 	 * be able to complete).
 	 */
-	retval = path_init(dfd, name, flags, nd);
+	retval = path_init(dfd, name, flags, nd, &base);
 
 	if (unlikely(retval))
 		return retval;
@@ -1614,10 +1615,8 @@ static int path_lookupat(int dfd, const char *name,
 	if (!retval)
 		retval = handle_reval_path(nd);
 
-	if (nd->file) {
-		fput(nd->file);
-		nd->file = NULL;
-	}
+	if (base)
+		fput(base);
 
 	if (nd->root.mnt) {
 		path_put(&nd->root);
-- 
cgit v1.2.3


From fe2d35ff0d18a2c93993b0d7d46f846ff4331b72 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 5 Mar 2011 22:58:25 -0500
Subject: switch non-create side of open() to use of do_last()

Instead of path_lookupat() doing trailing symlink resolution,
use the same scheme as on the O_CREAT side.  Walk with
LOOKUP_PARENT, then (in do_last()) look the final component
up, then either open it or return error or, if it's a symlink,
give the symlink back to path_openat() to be resolved there.

The really messy complication here is RCU.  We don't want to drop
out of RCU mode before the final lookup, since we don't want to
bounce parent directory ->d_count without a good reason.

Result is _not_ pretty; later in the series we'll clean it up.
For now we are roughly back where we'd been before the revert
done by Nick's series - top-level logic of path_openat() is
cleaned up, do_last() does actual opening, symlink resolution is
done uniformly.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 100 +++++++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 67 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index a260a306daf..9595b4a55c3 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2178,13 +2178,14 @@ exit:
 }
 
 /*
- * Handle O_CREAT case for do_filp_open
+ * Handle the last step of open()
  */
 static struct file *do_last(struct nameidata *nd, struct path *path,
 			    const struct open_flags *op, const char *pathname)
 {
 	struct dentry *dir = nd->path.dentry;
 	struct file *filp;
+	struct inode *inode;
 	int error;
 
 	nd->flags &= ~LOOKUP_PARENT;
@@ -2192,17 +2193,27 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 
 	switch (nd->last_type) {
 	case LAST_DOTDOT:
-		follow_dotdot(nd);
-		dir = nd->path.dentry;
 	case LAST_DOT:
+		error = handle_dots(nd, nd->last_type);
+		if (error)
+			return ERR_PTR(error);
 		/* fallthrough */
 	case LAST_ROOT:
+		if (nd->flags & LOOKUP_RCU) {
+			if (nameidata_drop_rcu_last(nd))
+				return ERR_PTR(-ECHILD);
+		}
 		error = handle_reval_path(nd);
 		if (error)
 			goto exit;
-		error = -EISDIR;
-		goto exit;
+		audit_inode(pathname, nd->path.dentry);
+		if (op->open_flag & O_CREAT) {
+			error = -EISDIR;
+			goto exit;
+		}
+		goto ok;
 	case LAST_BIND:
+		/* can't be RCU mode here */
 		error = handle_reval_path(nd);
 		if (error)
 			goto exit;
@@ -2210,6 +2221,51 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		goto ok;
 	}
 
+	if (!(op->open_flag & O_CREAT)) {
+		if (nd->last.name[nd->last.len])
+			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+		/* we _can_ be in RCU mode here */
+		error = do_lookup(nd, &nd->last, path, &inode);
+		if (error) {
+			terminate_walk(nd);
+			return ERR_PTR(error);
+		}
+		if (!inode) {
+			path_to_nameidata(path, nd);
+			terminate_walk(nd);
+			return ERR_PTR(-ENOENT);
+		}
+		if (unlikely(inode->i_op->follow_link)) {
+			/* We drop rcu-walk here */
+			if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
+				return ERR_PTR(-ECHILD);
+			return NULL;
+		}
+		path_to_nameidata(path, nd);
+		nd->inode = inode;
+		/* sayonara */
+		if (nd->flags & LOOKUP_RCU) {
+			if (nameidata_drop_rcu_last(nd))
+				return ERR_PTR(-ECHILD);
+		}
+
+		error = -ENOTDIR;
+		if (nd->flags & LOOKUP_DIRECTORY) {
+			if (!inode->i_op->lookup)
+				goto exit;
+		}
+		audit_inode(pathname, nd->path.dentry);
+		goto ok;
+	}
+
+	/* create side of things */
+
+	if (nd->flags & LOOKUP_RCU) {
+		if (nameidata_drop_rcu_last(nd))
+			return ERR_PTR(-ECHILD);
+	}
+
+	audit_inode(pathname, dir);
 	error = -EISDIR;
 	/* trailing slashes? */
 	if (nd->last.name[nd->last.len])
@@ -2303,6 +2359,7 @@ exit:
 static struct file *path_openat(int dfd, const char *pathname,
 		const struct open_flags *op, int flags)
 {
+	struct file *base = NULL;
 	struct file *filp;
 	struct nameidata nd;
 	struct path path;
@@ -2318,39 +2375,15 @@ static struct file *path_openat(int dfd, const char *pathname,
 	nd.intent.open.flags = open_to_namei_flags(op->open_flag);
 	nd.intent.open.create_mode = op->mode;
 
-	if (op->open_flag & O_CREAT)
-		goto creat;
-
-	/* !O_CREAT, simple open */
-	error = path_lookupat(dfd, pathname, flags | op->intent, &nd);
+	error = path_init(dfd, pathname, flags | LOOKUP_PARENT, &nd, &base);
 	if (unlikely(error))
 		goto out_filp;
-	error = -ELOOP;
-	if (!(nd.flags & LOOKUP_FOLLOW)) {
-		if (nd.inode->i_op->follow_link)
-			goto out_path;
-	}
-	error = -ENOTDIR;
-	if (nd.flags & LOOKUP_DIRECTORY) {
-		if (!nd.inode->i_op->lookup)
-			goto out_path;
-	}
-	audit_inode(pathname, nd.path.dentry);
-	filp = finish_open(&nd, op->open_flag, op->acc_mode);
-	release_open_intent(&nd);
-	return filp;
 
-creat:
-	/* OK, have to create the file. Find the parent. */
-	error = path_lookupat(dfd, pathname, LOOKUP_PARENT | flags, &nd);
+	current->total_link_count = 0;
+	error = link_path_walk(pathname, &nd);
 	if (unlikely(error))
 		goto out_filp;
-	if (unlikely(!audit_dummy_context()))
-		audit_inode(pathname, nd.path.dentry);
 
-	/*
-	 * We have the parent and last component.
-	 */
 	filp = do_last(&nd, &path, op, pathname);
 	while (unlikely(!filp)) { /* trailing symlink */
 		struct path link = path;
@@ -2386,12 +2419,13 @@ creat:
 out:
 	if (nd.root.mnt)
 		path_put(&nd.root);
+	if (base)
+		fput(base);
 	release_open_intent(&nd);
 	return filp;
 
 exit_dput:
 	path_put_conditional(&path, &nd);
-out_path:
 	path_put(&nd.path);
 out_filp:
 	filp = ERR_PTR(error);
-- 
cgit v1.2.3


From 6a96ba54418be740303765c0f52be028573cb99a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 7 Mar 2011 23:49:20 -0500
Subject: kill __lookup_one_len()

only one caller left

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 41 +++++++++++++++--------------------------
 1 file changed, 15 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 9595b4a55c3..f6f3ef47bc7 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1759,28 +1759,6 @@ static struct dentry *lookup_hash(struct nameidata *nd)
 	return __lookup_hash(&nd->last, nd->path.dentry, nd);
 }
 
-static int __lookup_one_len(const char *name, struct qstr *this,
-		struct dentry *base, int len)
-{
-	unsigned long hash;
-	unsigned int c;
-
-	this->name = name;
-	this->len = len;
-	if (!len)
-		return -EACCES;
-
-	hash = init_name_hash();
-	while (len--) {
-		c = *(const unsigned char *)name++;
-		if (c == '/' || c == '\0')
-			return -EACCES;
-		hash = partial_name_hash(c, hash);
-	}
-	this->hash = end_name_hash(hash);
-	return 0;
-}
-
 /**
  * lookup_one_len - filesystem helper to lookup single pathname component
  * @name:	pathname component to lookup
@@ -1794,14 +1772,25 @@ static int __lookup_one_len(const char *name, struct qstr *this,
  */
 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 {
-	int err;
 	struct qstr this;
+	unsigned long hash;
+	unsigned int c;
 
 	WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
 
-	err = __lookup_one_len(name, &this, base, len);
-	if (err)
-		return ERR_PTR(err);
+	this.name = name;
+	this.len = len;
+	if (!len)
+		return ERR_PTR(-EACCES);
+
+	hash = init_name_hash();
+	while (len--) {
+		c = *(const unsigned char *)name++;
+		if (c == '/' || c == '\0')
+			return ERR_PTR(-EACCES);
+		hash = partial_name_hash(c, hash);
+	}
+	this.hash = end_name_hash(hash);
 
 	return __lookup_hash(&this, base, NULL);
 }
-- 
cgit v1.2.3


From 5a202bcd75bbd2397136397961babbd8463416af Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 8 Mar 2011 14:17:44 -0500
Subject: sanitize pathname component hash calculation

Lift it to lookup_one_len() and link_path_walk() resp. into the
same place where we calculated default hash function of the same
name.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 42 +++++++++++++++++++-----------------------
 1 file changed, 19 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index f6f3ef47bc7..d1a5dfeaf99 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1216,16 +1216,6 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
 	struct inode *dir;
 	int err;
 
-	/*
-	 * See if the low-level filesystem might want
-	 * to use its own hash..
-	 */
-	if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
-		err = parent->d_op->d_hash(parent, nd->inode, name);
-		if (err < 0)
-			return err;
-	}
-
 	/*
 	 * Rename seqlock is not required here because in the off chance
 	 * of a false negative due to a concurrent rename, we're going to
@@ -1414,8 +1404,16 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 			case 1:
 				type = LAST_DOT;
 		}
-		if (likely(type == LAST_NORM))
+		if (likely(type == LAST_NORM)) {
+			struct dentry *parent = nd->path.dentry;
 			nd->flags &= ~LOOKUP_JUMPED;
+			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
+				err = parent->d_op->d_hash(parent, nd->inode,
+							   &this);
+				if (err < 0)
+					break;
+			}
+		}
 
 		/* remove trailing slashes? */
 		if (!c)
@@ -1722,17 +1720,6 @@ static struct dentry *__lookup_hash(struct qstr *name,
 	if (err)
 		return ERR_PTR(err);
 
-	/*
-	 * See if the low-level filesystem might want
-	 * to use its own hash..
-	 */
-	if (base->d_flags & DCACHE_OP_HASH) {
-		err = base->d_op->d_hash(base, inode, name);
-		dentry = ERR_PTR(err);
-		if (err < 0)
-			goto out;
-	}
-
 	/*
 	 * Don't bother with __d_lookup: callers are for creat as
 	 * well as unlink, so a lot of the time it would cost
@@ -1745,7 +1732,7 @@ static struct dentry *__lookup_hash(struct qstr *name,
 
 	if (!dentry)
 		dentry = d_alloc_and_lookup(base, name, nd);
-out:
+
 	return dentry;
 }
 
@@ -1791,6 +1778,15 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 		hash = partial_name_hash(c, hash);
 	}
 	this.hash = end_name_hash(hash);
+	/*
+	 * See if the low-level filesystem might want
+	 * to use its own hash..
+	 */
+	if (base->d_flags & DCACHE_OP_HASH) {
+		int err = base->d_op->d_hash(base, base->d_inode, &this);
+		if (err < 0)
+			return ERR_PTR(err);
+	}
 
 	return __lookup_hash(&this, base, NULL);
 }
-- 
cgit v1.2.3


From 0f9d1a10c341020617e5b1c7f9c16f6a070438ec Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 9 Mar 2011 00:13:14 -0500
Subject: expand finish_open() in its only caller

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 90 ++++++++++++++++++++++++++------------------------------------
 1 file changed, 38 insertions(+), 52 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index d1a5dfeaf99..1f561dc495a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2111,57 +2111,6 @@ static int open_will_truncate(int flag, struct inode *inode)
 	return (flag & O_TRUNC);
 }
 
-static struct file *finish_open(struct nameidata *nd,
-				int open_flag, int acc_mode)
-{
-	struct file *filp;
-	int will_truncate;
-	int error;
-
-	will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode);
-	if (will_truncate) {
-		error = mnt_want_write(nd->path.mnt);
-		if (error)
-			goto exit;
-	}
-	error = may_open(&nd->path, acc_mode, open_flag);
-	if (error) {
-		if (will_truncate)
-			mnt_drop_write(nd->path.mnt);
-		goto exit;
-	}
-	filp = nameidata_to_filp(nd);
-	if (!IS_ERR(filp)) {
-		error = ima_file_check(filp, acc_mode);
-		if (error) {
-			fput(filp);
-			filp = ERR_PTR(error);
-		}
-	}
-	if (!IS_ERR(filp)) {
-		if (will_truncate) {
-			error = handle_truncate(filp);
-			if (error) {
-				fput(filp);
-				filp = ERR_PTR(error);
-			}
-		}
-	}
-	/*
-	 * It is now safe to drop the mnt write
-	 * because the filp has had a write taken
-	 * on its behalf.
-	 */
-	if (will_truncate)
-		mnt_drop_write(nd->path.mnt);
-	path_put(&nd->path);
-	return filp;
-
-exit:
-	path_put(&nd->path);
-	return ERR_PTR(error);
-}
-
 /*
  * Handle the last step of open()
  */
@@ -2169,6 +2118,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 			    const struct open_flags *op, const char *pathname)
 {
 	struct dentry *dir = nd->path.dentry;
+	int will_truncate;
 	struct file *filp;
 	struct inode *inode;
 	int error;
@@ -2329,7 +2279,43 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	if (S_ISDIR(nd->inode->i_mode))
 		goto exit;
 ok:
-	filp = finish_open(nd, op->open_flag, op->acc_mode);
+	will_truncate = open_will_truncate(op->open_flag, nd->path.dentry->d_inode);
+	if (will_truncate) {
+		error = mnt_want_write(nd->path.mnt);
+		if (error)
+			goto exit;
+	}
+	error = may_open(&nd->path, op->acc_mode, op->open_flag);
+	if (error) {
+		if (will_truncate)
+			mnt_drop_write(nd->path.mnt);
+		goto exit;
+	}
+	filp = nameidata_to_filp(nd);
+	if (!IS_ERR(filp)) {
+		error = ima_file_check(filp, op->acc_mode);
+		if (error) {
+			fput(filp);
+			filp = ERR_PTR(error);
+		}
+	}
+	if (!IS_ERR(filp)) {
+		if (will_truncate) {
+			error = handle_truncate(filp);
+			if (error) {
+				fput(filp);
+				filp = ERR_PTR(error);
+			}
+		}
+	}
+	/*
+	 * It is now safe to drop the mnt write
+	 * because the filp has had a write taken
+	 * on its behalf.
+	 */
+	if (will_truncate)
+		mnt_drop_write(nd->path.mnt);
+	path_put(&nd->path);
 	return filp;
 
 exit_mutex_unlock:
-- 
cgit v1.2.3


From 9b44f1b3928b6f41532c9a1dc9a6fc665989ad5b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 9 Mar 2011 00:17:27 -0500
Subject: move may_open() from __open_namei_create() to do_last()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 1f561dc495a..def63e7c058 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2069,11 +2069,7 @@ out_unlock:
 	mutex_unlock(&dir->d_inode->i_mutex);
 	dput(nd->path.dentry);
 	nd->path.dentry = path->dentry;
-
-	if (error)
-		return error;
-	/* Don't check for write permission, don't truncate */
-	return may_open(&nd->path, 0, open_flag & ~O_TRUNC);
+	return error;
 }
 
 /*
@@ -2239,6 +2235,12 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 			mnt_drop_write(nd->path.mnt);
 			goto exit;
 		}
+		/* Don't check for write permission, don't truncate */
+		error = may_open(&nd->path, 0, op->open_flag & ~O_TRUNC);
+		if (error) {
+			mnt_drop_write(nd->path.mnt);
+			goto exit;
+		}
 		filp = nameidata_to_filp(nd);
 		mnt_drop_write(nd->path.mnt);
 		path_put(&nd->path);
-- 
cgit v1.2.3


From ca344a894b41a133dab07dfbbdf652c053f6658c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 9 Mar 2011 00:36:45 -0500
Subject: do_last: unify may_open() call and everything after it

We have a bunch of diverging codepaths in do_last(); some of
them converge, but the case of having to create a new file
duplicates large part of common tail of the rest and exits
separately.  Massage them so that they could be merged.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 59 ++++++++++++++++++++++-------------------------------------
 1 file changed, 22 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index def63e7c058..63844776484 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2114,7 +2114,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 			    const struct open_flags *op, const char *pathname)
 {
 	struct dentry *dir = nd->path.dentry;
+	int open_flag = op->open_flag;
 	int will_truncate;
+	int want_write = 0;
+	int skip_perm = 0;
 	struct file *filp;
 	struct inode *inode;
 	int error;
@@ -2138,7 +2141,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		if (error)
 			goto exit;
 		audit_inode(pathname, nd->path.dentry);
-		if (op->open_flag & O_CREAT) {
+		if (open_flag & O_CREAT) {
 			error = -EISDIR;
 			goto exit;
 		}
@@ -2152,7 +2155,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		goto ok;
 	}
 
-	if (!(op->open_flag & O_CREAT)) {
+	if (!(open_flag & O_CREAT)) {
 		if (nd->last.name[nd->last.len])
 			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
 		/* we _can_ be in RCU mode here */
@@ -2230,28 +2233,15 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		error = mnt_want_write(nd->path.mnt);
 		if (error)
 			goto exit_mutex_unlock;
-		error = __open_namei_create(nd, path, op->open_flag, op->mode);
-		if (error) {
-			mnt_drop_write(nd->path.mnt);
+		want_write = 1;
+		will_truncate = 0;
+		error = __open_namei_create(nd, path, open_flag, op->mode);
+		if (error)
 			goto exit;
-		}
 		/* Don't check for write permission, don't truncate */
-		error = may_open(&nd->path, 0, op->open_flag & ~O_TRUNC);
-		if (error) {
-			mnt_drop_write(nd->path.mnt);
-			goto exit;
-		}
-		filp = nameidata_to_filp(nd);
-		mnt_drop_write(nd->path.mnt);
-		path_put(&nd->path);
-		if (!IS_ERR(filp)) {
-			error = ima_file_check(filp, op->acc_mode);
-			if (error) {
-				fput(filp);
-				filp = ERR_PTR(error);
-			}
-		}
-		return filp;
+		open_flag &= ~O_TRUNC;
+		skip_perm = 1;
+		goto common;
 	}
 
 	/*
@@ -2261,7 +2251,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	audit_inode(pathname, path->dentry);
 
 	error = -EEXIST;
-	if (op->open_flag & O_EXCL)
+	if (open_flag & O_EXCL)
 		goto exit_dput;
 
 	error = follow_managed(path, nd->flags);
@@ -2281,18 +2271,17 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	if (S_ISDIR(nd->inode->i_mode))
 		goto exit;
 ok:
-	will_truncate = open_will_truncate(op->open_flag, nd->path.dentry->d_inode);
+	will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode);
 	if (will_truncate) {
 		error = mnt_want_write(nd->path.mnt);
 		if (error)
 			goto exit;
+		want_write = 1;
 	}
-	error = may_open(&nd->path, op->acc_mode, op->open_flag);
-	if (error) {
-		if (will_truncate)
-			mnt_drop_write(nd->path.mnt);
+common:
+	error = may_open(&nd->path, skip_perm ? 0 : op->acc_mode, open_flag);
+	if (error)
 		goto exit;
-	}
 	filp = nameidata_to_filp(nd);
 	if (!IS_ERR(filp)) {
 		error = ima_file_check(filp, op->acc_mode);
@@ -2310,12 +2299,8 @@ ok:
 			}
 		}
 	}
-	/*
-	 * It is now safe to drop the mnt write
-	 * because the filp has had a write taken
-	 * on its behalf.
-	 */
-	if (will_truncate)
+out:
+	if (want_write)
 		mnt_drop_write(nd->path.mnt);
 	path_put(&nd->path);
 	return filp;
@@ -2325,8 +2310,8 @@ exit_mutex_unlock:
 exit_dput:
 	path_put_conditional(path, nd);
 exit:
-	path_put(&nd->path);
-	return ERR_PTR(error);
+	filp = ERR_PTR(error);
+	goto out;
 }
 
 static struct file *path_openat(int dfd, const char *pathname,
-- 
cgit v1.2.3


From 6c0d46c493217cf48999b3f8808910ae534aa085 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 9 Mar 2011 00:59:59 -0500
Subject: fold __open_namei_create() and open_will_truncate() into do_last()

... and clean up a bit more

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 74 ++++++++++++++++++++++----------------------------------------
 1 file changed, 26 insertions(+), 48 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 63844776484..441f1106de0 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2048,30 +2048,6 @@ static int handle_truncate(struct file *filp)
 	return error;
 }
 
-/*
- * Be careful about ever adding any more callers of this
- * function.  Its flags must be in the namei format, not
- * what get passed to sys_open().
- */
-static int __open_namei_create(struct nameidata *nd, struct path *path,
-				int open_flag, int mode)
-{
-	int error;
-	struct dentry *dir = nd->path.dentry;
-
-	if (!IS_POSIXACL(dir->d_inode))
-		mode &= ~current_umask();
-	error = security_path_mknod(&nd->path, path->dentry, mode, 0);
-	if (error)
-		goto out_unlock;
-	error = vfs_create(dir->d_inode, path->dentry, mode, nd);
-out_unlock:
-	mutex_unlock(&dir->d_inode->i_mutex);
-	dput(nd->path.dentry);
-	nd->path.dentry = path->dentry;
-	return error;
-}
-
 /*
  * Note that while the flag value (low two bits) for sys_open means:
  *	00 - read-only
@@ -2096,17 +2072,6 @@ static inline int open_to_namei_flags(int flag)
 	return flag;
 }
 
-static int open_will_truncate(int flag, struct inode *inode)
-{
-	/*
-	 * We'll never write to the fs underlying
-	 * a device file.
-	 */
-	if (special_file(inode->i_mode))
-		return 0;
-	return (flag & O_TRUNC);
-}
-
 /*
  * Handle the last step of open()
  */
@@ -2114,8 +2079,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 			    const struct open_flags *op, const char *pathname)
 {
 	struct dentry *dir = nd->path.dentry;
+	struct dentry *dentry;
 	int open_flag = op->open_flag;
-	int will_truncate;
+	int will_truncate = open_flag & O_TRUNC;
 	int want_write = 0;
 	int skip_perm = 0;
 	struct file *filp;
@@ -2207,25 +2173,29 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 
 	mutex_lock(&dir->d_inode->i_mutex);
 
-	path->dentry = lookup_hash(nd);
-	path->mnt = nd->path.mnt;
-
-	error = PTR_ERR(path->dentry);
-	if (IS_ERR(path->dentry)) {
+	dentry = lookup_hash(nd);
+	error = PTR_ERR(dentry);
+	if (IS_ERR(dentry)) {
 		mutex_unlock(&dir->d_inode->i_mutex);
 		goto exit;
 	}
 
+	path->dentry = dentry;
+	path->mnt = nd->path.mnt;
+
 	if (IS_ERR(nd->intent.open.file)) {
 		error = PTR_ERR(nd->intent.open.file);
 		goto exit_mutex_unlock;
 	}
 
 	/* Negative dentry, just create the file */
-	if (!path->dentry->d_inode) {
+	if (!dentry->d_inode) {
+		int mode = op->mode;
+		if (!IS_POSIXACL(dir->d_inode))
+			mode &= ~current_umask();
 		/*
 		 * This write is needed to ensure that a
-		 * ro->rw transition does not occur between
+		 * rw->ro transition does not occur between
 		 * the time when the file is created and when
 		 * a permanent write count is taken through
 		 * the 'struct file' in nameidata_to_filp().
@@ -2234,13 +2204,19 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		if (error)
 			goto exit_mutex_unlock;
 		want_write = 1;
-		will_truncate = 0;
-		error = __open_namei_create(nd, path, open_flag, op->mode);
-		if (error)
-			goto exit;
 		/* Don't check for write permission, don't truncate */
 		open_flag &= ~O_TRUNC;
+		will_truncate = 0;
 		skip_perm = 1;
+		error = security_path_mknod(&nd->path, dentry, mode, 0);
+		if (error)
+			goto exit_mutex_unlock;
+		error = vfs_create(dir->d_inode, dentry, mode, nd);
+		if (error)
+			goto exit_mutex_unlock;
+		mutex_unlock(&dir->d_inode->i_mutex);
+		dput(nd->path.dentry);
+		nd->path.dentry = dentry;
 		goto common;
 	}
 
@@ -2271,7 +2247,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	if (S_ISDIR(nd->inode->i_mode))
 		goto exit;
 ok:
-	will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode);
+	if (!S_ISREG(nd->inode->i_mode))
+		will_truncate = 0;
+
 	if (will_truncate) {
 		error = mnt_want_write(nd->path.mnt);
 		if (error)
-- 
cgit v1.2.3


From f374ed5fa8afed8590deaae5dc147422e0e1a6d9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 9 Mar 2011 01:34:45 -0500
Subject: do_last: kill a rudiment of old ->d_revalidate() workaround

There used to be a time when ->d_revalidate() couldn't return an error.
So intents code had lookup_instantiate_filp() stash ERR_PTR(error)
in nd->intent.open.file and had it checked after lookup_hash(), to
catch the otherwise silent failures.  That had been introduced by
commit 4af4c52f34606bdaab6930a845550c6fb02078a4.  These days
->d_revalidate() can and does propagate errors back to callers
explicitly, so this check isn't needed anymore.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 441f1106de0..6972e761286 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2183,11 +2183,6 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	path->dentry = dentry;
 	path->mnt = nd->path.mnt;
 
-	if (IS_ERR(nd->intent.open.file)) {
-		error = PTR_ERR(nd->intent.open.file);
-		goto exit_mutex_unlock;
-	}
-
 	/* Negative dentry, just create the file */
 	if (!dentry->d_inode) {
 		int mode = op->mode;
-- 
cgit v1.2.3


From 40b39136f07279fdc868a36cba050f4e84ce0ace Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 9 Mar 2011 16:22:18 -0500
Subject: path_openat: clean ELOOP handling a bit

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 6972e761286..ca9a06a6570 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2320,11 +2320,12 @@ static struct file *path_openat(int dfd, const char *pathname,
 		struct path link = path;
 		struct inode *linki = link.dentry->d_inode;
 		void *cookie;
-		error = -ELOOP;
-		if (!(nd.flags & LOOKUP_FOLLOW))
-			goto exit_dput;
-		if (count++ == 32)
-			goto exit_dput;
+		if (!(nd.flags & LOOKUP_FOLLOW) || count++ == 32) {
+			path_put_conditional(&path, &nd);
+			path_put(&nd.path);
+			filp = ERR_PTR(-ELOOP);
+			break;
+		}
 		/*
 		 * This is subtle. Instead of calling do_follow_link() we do
 		 * the thing by hands. The reason is that this way we have zero
@@ -2355,9 +2356,6 @@ out:
 	release_open_intent(&nd);
 	return filp;
 
-exit_dput:
-	path_put_conditional(&path, &nd);
-	path_put(&nd.path);
 out_filp:
 	filp = ERR_PTR(error);
 	goto out;
-- 
cgit v1.2.3


From 5a18fff2090c3af830d699c8ccb230498a1e37e5 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 11 Mar 2011 04:44:53 -0500
Subject: untangle do_lookup()

That thing has devolved into rat's nest of gotos; sane use of unlikely()
gets rid of that horror and gives much more readable structure:
	* make a fast attempt to find a dentry; false negatives are OK.
In RCU mode if everything went fine, we are done, otherwise just drop
out of RCU.  If we'd done (RCU) ->d_revalidate() and it had not refused
outright (i.e. didn't give us -ECHILD), remember its result.
	* now we are not in RCU mode and hopefully have a dentry.  If we
do not, lock parent, do full d_lookup() and if that has not found anything,
allocate and call ->lookup().  If we'd done that ->lookup(), remember that
dentry is good and we don't need to revalidate it.
	* now we have a dentry.  If it has ->d_revalidate() and we can't
skip it, call it.
	* hopefully dentry is good; if not, either fail (in case of error)
or try to invalidate it.  If d_invalidate() has succeeded, drop it and
retry everything as if original attempt had not found a dentry.
	* now we can finish it up - deal with mountpoint crossing and
automount.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 141 ++++++++++++++++++++++++-------------------------------------
 1 file changed, 56 insertions(+), 85 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index ca9a06a6570..0bebd13e5cb 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -589,29 +589,6 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
 	return dentry;
 }
 
-static inline struct dentry *
-do_revalidate_rcu(struct dentry *dentry, struct nameidata *nd)
-{
-	int status = d_revalidate(dentry, nd);
-	if (likely(status > 0))
-		return dentry;
-	if (status == -ECHILD) {
-		if (nameidata_dentry_drop_rcu(nd, dentry))
-			return ERR_PTR(-ECHILD);
-		return do_revalidate(dentry, nd);
-	}
-	if (status < 0)
-		return ERR_PTR(status);
-	/* Don't d_invalidate in rcu-walk mode */
-	if (nameidata_dentry_drop_rcu(nd, dentry))
-		return ERR_PTR(-ECHILD);
-	if (!d_invalidate(dentry)) {
-		dput(dentry);
-		dentry = NULL;
-	}
-	return dentry;
-}
-
 /*
  * handle_reval_path - force revalidation of a dentry
  *
@@ -1213,7 +1190,8 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
 {
 	struct vfsmount *mnt = nd->path.mnt;
 	struct dentry *dentry, *parent = nd->path.dentry;
-	struct inode *dir;
+	int need_reval = 1;
+	int status = 1;
 	int err;
 
 	/*
@@ -1223,48 +1201,74 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
 	 */
 	if (nd->flags & LOOKUP_RCU) {
 		unsigned seq;
-
 		*inode = nd->inode;
 		dentry = __d_lookup_rcu(parent, name, &seq, inode);
-		if (!dentry) {
-			if (nameidata_drop_rcu(nd))
-				return -ECHILD;
-			goto need_lookup;
-		}
+		if (!dentry)
+			goto unlazy;
+
 		/* Memory barrier in read_seqcount_begin of child is enough */
 		if (__read_seqcount_retry(&parent->d_seq, nd->seq))
 			return -ECHILD;
-
 		nd->seq = seq;
+
 		if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
-			dentry = do_revalidate_rcu(dentry, nd);
-			if (!dentry)
-				goto need_lookup;
-			if (IS_ERR(dentry))
-				goto fail;
-			if (!(nd->flags & LOOKUP_RCU))
-				goto done;
+			status = d_revalidate(dentry, nd);
+			if (unlikely(status <= 0)) {
+				if (status != -ECHILD)
+					need_reval = 0;
+				goto unlazy;
+			}
 		}
 		path->mnt = mnt;
 		path->dentry = dentry;
 		if (likely(__follow_mount_rcu(nd, path, inode, false)))
 			return 0;
-		if (nameidata_drop_rcu(nd))
-			return -ECHILD;
-		/* fallthru */
+unlazy:
+		if (dentry) {
+			if (nameidata_dentry_drop_rcu(nd, dentry))
+				return -ECHILD;
+		} else {
+			if (nameidata_drop_rcu(nd))
+				return -ECHILD;
+		}
+	} else {
+		dentry = __d_lookup(parent, name);
 	}
-	dentry = __d_lookup(parent, name);
-	if (!dentry)
-		goto need_lookup;
-found:
-	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
-		dentry = do_revalidate(dentry, nd);
-		if (!dentry)
-			goto need_lookup;
-		if (IS_ERR(dentry))
-			goto fail;
+
+retry:
+	if (unlikely(!dentry)) {
+		struct inode *dir = parent->d_inode;
+		BUG_ON(nd->inode != dir);
+
+		mutex_lock(&dir->i_mutex);
+		dentry = d_lookup(parent, name);
+		if (likely(!dentry)) {
+			dentry = d_alloc_and_lookup(parent, name, nd);
+			if (IS_ERR(dentry)) {
+				mutex_unlock(&dir->i_mutex);
+				return PTR_ERR(dentry);
+			}
+			/* known good */
+			need_reval = 0;
+			status = 1;
+		}
+		mutex_unlock(&dir->i_mutex);
+	}
+	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
+		status = d_revalidate(dentry, nd);
+	if (unlikely(status <= 0)) {
+		if (status < 0) {
+			dput(dentry);
+			return status;
+		}
+		if (!d_invalidate(dentry)) {
+			dput(dentry);
+			dentry = NULL;
+			need_reval = 1;
+			goto retry;
+		}
 	}
-done:
+
 	path->mnt = mnt;
 	path->dentry = dentry;
 	err = follow_managed(path, nd->flags);
@@ -1274,39 +1278,6 @@ done:
 	}
 	*inode = path->dentry->d_inode;
 	return 0;
-
-need_lookup:
-	dir = parent->d_inode;
-	BUG_ON(nd->inode != dir);
-
-	mutex_lock(&dir->i_mutex);
-	/*
-	 * First re-do the cached lookup just in case it was created
-	 * while we waited for the directory semaphore, or the first
-	 * lookup failed due to an unrelated rename.
-	 *
-	 * This could use version numbering or similar to avoid unnecessary
-	 * cache lookups, but then we'd have to do the first lookup in the
-	 * non-racy way. However in the common case here, everything should
-	 * be hot in cache, so would it be a big win?
-	 */
-	dentry = d_lookup(parent, name);
-	if (likely(!dentry)) {
-		dentry = d_alloc_and_lookup(parent, name, nd);
-		mutex_unlock(&dir->i_mutex);
-		if (IS_ERR(dentry))
-			goto fail;
-		goto done;
-	}
-	/*
-	 * Uhhuh! Nasty case: the cache was re-populated while
-	 * we waited on the semaphore. Need to revalidate.
-	 */
-	mutex_unlock(&dir->i_mutex);
-	goto found;
-
-fail:
-	return PTR_ERR(dentry);
 }
 
 static inline int may_lookup(struct nameidata *nd)
-- 
cgit v1.2.3


From 5b6ca027d85b7438c84b78a54ccdc2e53f2909cd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 9 Mar 2011 23:04:47 -0500
Subject: reduce vfs_path_lookup() to do_path_lookup()

New lookup flag: LOOKUP_ROOT.  nd->root is set (and held) by caller,
path_init() starts walking from that place and all pathname resolution
machinery never drops nd->root if that flag is set.  That turns
vfs_path_lookup() into a special case of do_path_lookup() *and*
gets us down to 3 callers of link_path_walk(), making it finally
feasible to rip the handling of trailing symlink out of link_path_walk().
That will not only simplify the living hell out of it, but make life
much simpler for unionfs merge.  Trailing symlink handling will
become iterative, which is a good thing for stack footprint in
a lot of situations as well.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 95 ++++++++++++++++++++++++++++----------------------------------
 1 file changed, 43 insertions(+), 52 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 0bebd13e5cb..8ee7785d564 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -401,9 +401,11 @@ static int nameidata_drop_rcu(struct nameidata *nd)
 {
 	struct fs_struct *fs = current->fs;
 	struct dentry *dentry = nd->path.dentry;
+	int want_root = 0;
 
 	BUG_ON(!(nd->flags & LOOKUP_RCU));
-	if (nd->root.mnt) {
+	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
+		want_root = 1;
 		spin_lock(&fs->lock);
 		if (nd->root.mnt != fs->root.mnt ||
 				nd->root.dentry != fs->root.dentry)
@@ -414,7 +416,7 @@ static int nameidata_drop_rcu(struct nameidata *nd)
 		goto err;
 	BUG_ON(nd->inode != dentry->d_inode);
 	spin_unlock(&dentry->d_lock);
-	if (nd->root.mnt) {
+	if (want_root) {
 		path_get(&nd->root);
 		spin_unlock(&fs->lock);
 	}
@@ -427,7 +429,7 @@ static int nameidata_drop_rcu(struct nameidata *nd)
 err:
 	spin_unlock(&dentry->d_lock);
 err_root:
-	if (nd->root.mnt)
+	if (want_root)
 		spin_unlock(&fs->lock);
 	return -ECHILD;
 }
@@ -454,9 +456,11 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
 {
 	struct fs_struct *fs = current->fs;
 	struct dentry *parent = nd->path.dentry;
+	int want_root = 0;
 
 	BUG_ON(!(nd->flags & LOOKUP_RCU));
-	if (nd->root.mnt) {
+	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
+		want_root = 1;
 		spin_lock(&fs->lock);
 		if (nd->root.mnt != fs->root.mnt ||
 				nd->root.dentry != fs->root.dentry)
@@ -476,7 +480,7 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
 	parent->d_count++;
 	spin_unlock(&dentry->d_lock);
 	spin_unlock(&parent->d_lock);
-	if (nd->root.mnt) {
+	if (want_root) {
 		path_get(&nd->root);
 		spin_unlock(&fs->lock);
 	}
@@ -490,7 +494,7 @@ err:
 	spin_unlock(&dentry->d_lock);
 	spin_unlock(&parent->d_lock);
 err_root:
-	if (nd->root.mnt)
+	if (want_root)
 		spin_unlock(&fs->lock);
 	return -ECHILD;
 }
@@ -501,7 +505,8 @@ static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct d
 	if (nd->flags & LOOKUP_RCU) {
 		if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) {
 			nd->flags &= ~LOOKUP_RCU;
-			nd->root.mnt = NULL;
+			if (!(nd->flags & LOOKUP_ROOT))
+				nd->root.mnt = NULL;
 			rcu_read_unlock();
 			br_read_unlock(vfsmount_lock);
 			return -ECHILD;
@@ -525,7 +530,8 @@ static int nameidata_drop_rcu_last(struct nameidata *nd)
 
 	BUG_ON(!(nd->flags & LOOKUP_RCU));
 	nd->flags &= ~LOOKUP_RCU;
-	nd->root.mnt = NULL;
+	if (!(nd->flags & LOOKUP_ROOT))
+		nd->root.mnt = NULL;
 	spin_lock(&dentry->d_lock);
 	if (!__d_rcu_to_refcount(dentry, nd->seq))
 		goto err_unlock;
@@ -1053,7 +1059,8 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 
 failed:
 	nd->flags &= ~LOOKUP_RCU;
-	nd->root.mnt = NULL;
+	if (!(nd->flags & LOOKUP_ROOT))
+		nd->root.mnt = NULL;
 	rcu_read_unlock();
 	br_read_unlock(vfsmount_lock);
 	return -ECHILD;
@@ -1310,7 +1317,8 @@ static void terminate_walk(struct nameidata *nd)
 		path_put(&nd->path);
 	} else {
 		nd->flags &= ~LOOKUP_RCU;
-		nd->root.mnt = NULL;
+		if (!(nd->flags & LOOKUP_ROOT))
+			nd->root.mnt = NULL;
 		rcu_read_unlock();
 		br_read_unlock(vfsmount_lock);
 	}
@@ -1477,6 +1485,25 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 	nd->last_type = LAST_ROOT; /* if there are only slashes... */
 	nd->flags = flags | LOOKUP_JUMPED;
 	nd->depth = 0;
+	if (flags & LOOKUP_ROOT) {
+		struct inode *inode = nd->root.dentry->d_inode;
+		if (!inode->i_op->lookup)
+			return -ENOTDIR;
+		retval = inode_permission(inode, MAY_EXEC);
+		if (retval)
+			return retval;
+		nd->path = nd->root;
+		nd->inode = inode;
+		if (flags & LOOKUP_RCU) {
+			br_read_lock(vfsmount_lock);
+			rcu_read_lock();
+			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+		} else {
+			path_get(&nd->path);
+		}
+		return 0;
+	}
+
 	nd->root.mnt = NULL;
 
 	if (*name=='/') {
@@ -1587,7 +1614,7 @@ static int path_lookupat(int dfd, const char *name,
 	if (base)
 		fput(base);
 
-	if (nd->root.mnt) {
+	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
 		path_put(&nd->root);
 		nd->root.mnt = NULL;
 	}
@@ -1638,46 +1665,10 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 		    const char *name, unsigned int flags,
 		    struct nameidata *nd)
 {
-	int result;
-
-	/* same as do_path_lookup */
-	nd->last_type = LAST_ROOT;
-	nd->flags = flags | LOOKUP_JUMPED;
-	nd->depth = 0;
-
-	nd->path.dentry = dentry;
-	nd->path.mnt = mnt;
-	path_get(&nd->path);
-	nd->root = nd->path;
-	path_get(&nd->root);
-	nd->inode = nd->path.dentry->d_inode;
-
-	current->total_link_count = 0;
-
-	result = link_path_walk(name, nd);
-	if (!result)
-		result = handle_reval_path(nd);
-	if (result == -ESTALE) {
-		/* nd->path had been dropped */
-		current->total_link_count = 0;
-		nd->path.dentry = dentry;
-		nd->path.mnt = mnt;
-		nd->inode = dentry->d_inode;
-		path_get(&nd->path);
-		nd->flags = flags | LOOKUP_JUMPED | LOOKUP_REVAL;
-
-		result = link_path_walk(name, nd);
-		if (!result)
-			result = handle_reval_path(nd);
-	}
-	if (unlikely(!result && !audit_dummy_context() && nd->path.dentry &&
-				nd->inode))
-		audit_inode(name, nd->path.dentry);
-
-	path_put(&nd->root);
-	nd->root.mnt = NULL;
-
-	return result;
+	nd->root.dentry = dentry;
+	nd->root.mnt = mnt;
+	/* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */
+	return do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, nd);
 }
 
 static struct dentry *__lookup_hash(struct qstr *name,
@@ -2320,7 +2311,7 @@ static struct file *path_openat(int dfd, const char *pathname,
 		path_put(&link);
 	}
 out:
-	if (nd.root.mnt)
+	if (nd.root.mnt && !(nd.flags & LOOKUP_ROOT))
 		path_put(&nd.root);
 	if (base)
 		fput(base);
-- 
cgit v1.2.3


From 73d049a40fc6269189c4e2ba6792cb5dd054883c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 11 Mar 2011 12:08:24 -0500
Subject: open-style analog of vfs_path_lookup()

new function: file_open_root(dentry, mnt, name, flags) opens the file
vfs_path_lookup() would arrive at.

Note that name can be empty; in that case the usual requirement that
dentry should be a directory is lifted.

open-coded equivalents switched to it; may_open() got down to exactly
one caller and became static.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/internal.h |  2 ++
 fs/namei.c    | 80 ++++++++++++++++++++++++++++++++++++++---------------------
 fs/nfsctl.c   | 21 +++-------------
 fs/open.c     | 14 +++++++++++
 4 files changed, 72 insertions(+), 45 deletions(-)

(limited to 'fs')

diff --git a/fs/internal.h b/fs/internal.h
index 6fdbdf2c604..52abc5287f5 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -114,6 +114,8 @@ struct open_flags {
 };
 extern struct file *do_filp_open(int dfd, const char *pathname,
 		const struct open_flags *op, int lookup_flags);
+extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
+		const char *, const struct open_flags *, int lookup_flags);
 
 /*
  * inode.c
diff --git a/fs/namei.c b/fs/namei.c
index 8ee7785d564..abc8d2df121 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1487,11 +1487,13 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 	nd->depth = 0;
 	if (flags & LOOKUP_ROOT) {
 		struct inode *inode = nd->root.dentry->d_inode;
-		if (!inode->i_op->lookup)
-			return -ENOTDIR;
-		retval = inode_permission(inode, MAY_EXEC);
-		if (retval)
-			return retval;
+		if (*name) {
+			if (!inode->i_op->lookup)
+				return -ENOTDIR;
+			retval = inode_permission(inode, MAY_EXEC);
+			if (retval)
+				return retval;
+		}
 		nd->path = nd->root;
 		nd->inode = inode;
 		if (flags & LOOKUP_RCU) {
@@ -1937,7 +1939,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	return error;
 }
 
-int may_open(struct path *path, int acc_mode, int flag)
+static int may_open(struct path *path, int acc_mode, int flag)
 {
 	struct dentry *dentry = path->dentry;
 	struct inode *inode = dentry->d_inode;
@@ -2250,11 +2252,10 @@ exit:
 }
 
 static struct file *path_openat(int dfd, const char *pathname,
-		const struct open_flags *op, int flags)
+		struct nameidata *nd, const struct open_flags *op, int flags)
 {
 	struct file *base = NULL;
 	struct file *filp;
-	struct nameidata nd;
 	struct path path;
 	int count = 0;
 	int error;
@@ -2264,27 +2265,27 @@ static struct file *path_openat(int dfd, const char *pathname,
 		return ERR_PTR(-ENFILE);
 
 	filp->f_flags = op->open_flag;
-	nd.intent.open.file = filp;
-	nd.intent.open.flags = open_to_namei_flags(op->open_flag);
-	nd.intent.open.create_mode = op->mode;
+	nd->intent.open.file = filp;
+	nd->intent.open.flags = open_to_namei_flags(op->open_flag);
+	nd->intent.open.create_mode = op->mode;
 
-	error = path_init(dfd, pathname, flags | LOOKUP_PARENT, &nd, &base);
+	error = path_init(dfd, pathname, flags | LOOKUP_PARENT, nd, &base);
 	if (unlikely(error))
 		goto out_filp;
 
 	current->total_link_count = 0;
-	error = link_path_walk(pathname, &nd);
+	error = link_path_walk(pathname, nd);
 	if (unlikely(error))
 		goto out_filp;
 
-	filp = do_last(&nd, &path, op, pathname);
+	filp = do_last(nd, &path, op, pathname);
 	while (unlikely(!filp)) { /* trailing symlink */
 		struct path link = path;
 		struct inode *linki = link.dentry->d_inode;
 		void *cookie;
-		if (!(nd.flags & LOOKUP_FOLLOW) || count++ == 32) {
-			path_put_conditional(&path, &nd);
-			path_put(&nd.path);
+		if (!(nd->flags & LOOKUP_FOLLOW) || count++ == 32) {
+			path_put_conditional(&path, nd);
+			path_put(&nd->path);
 			filp = ERR_PTR(-ELOOP);
 			break;
 		}
@@ -2299,23 +2300,23 @@ static struct file *path_openat(int dfd, const char *pathname,
 		 * have to putname() it when we are done. Procfs-like symlinks
 		 * just set LAST_BIND.
 		 */
-		nd.flags |= LOOKUP_PARENT;
-		nd.flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
-		error = __do_follow_link(&link, &nd, &cookie);
+		nd->flags |= LOOKUP_PARENT;
+		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
+		error = __do_follow_link(&link, nd, &cookie);
 		if (unlikely(error))
 			filp = ERR_PTR(error);
 		else
-			filp = do_last(&nd, &path, op, pathname);
+			filp = do_last(nd, &path, op, pathname);
 		if (!IS_ERR(cookie) && linki->i_op->put_link)
-			linki->i_op->put_link(link.dentry, &nd, cookie);
+			linki->i_op->put_link(link.dentry, nd, cookie);
 		path_put(&link);
 	}
 out:
-	if (nd.root.mnt && !(nd.flags & LOOKUP_ROOT))
-		path_put(&nd.root);
+	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT))
+		path_put(&nd->root);
 	if (base)
 		fput(base);
-	release_open_intent(&nd);
+	release_open_intent(nd);
 	return filp;
 
 out_filp:
@@ -2326,16 +2327,39 @@ out_filp:
 struct file *do_filp_open(int dfd, const char *pathname,
 		const struct open_flags *op, int flags)
 {
+	struct nameidata nd;
 	struct file *filp;
 
-	filp = path_openat(dfd, pathname, op, flags | LOOKUP_RCU);
+	filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
 	if (unlikely(filp == ERR_PTR(-ECHILD)))
-		filp = path_openat(dfd, pathname, op, flags);
+		filp = path_openat(dfd, pathname, &nd, op, flags);
 	if (unlikely(filp == ERR_PTR(-ESTALE)))
-		filp = path_openat(dfd, pathname, op, flags | LOOKUP_REVAL);
+		filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL);
 	return filp;
 }
 
+struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
+		const char *name, const struct open_flags *op, int flags)
+{
+	struct nameidata nd;
+	struct file *file;
+
+	nd.root.mnt = mnt;
+	nd.root.dentry = dentry;
+
+	flags |= LOOKUP_ROOT;
+
+	if (dentry->d_inode->i_op->follow_link)
+		return ERR_PTR(-ELOOP);
+
+	file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU);
+	if (unlikely(file == ERR_PTR(-ECHILD)))
+		file = path_openat(-1, name, &nd, op, flags);
+	if (unlikely(file == ERR_PTR(-ESTALE)))
+		file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL);
+	return file;
+}
+
 /**
  * lookup_create - lookup a dentry, creating it if it doesn't exist
  * @nd: nameidata info
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index bf9cbd242dd..124e8fcb0dd 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -22,30 +22,17 @@
 
 static struct file *do_open(char *name, int flags)
 {
-	struct nameidata nd;
 	struct vfsmount *mnt;
-	int error;
+	struct file *file;
 
 	mnt = do_kern_mount("nfsd", 0, "nfsd", NULL);
 	if (IS_ERR(mnt))
 		return (struct file *)mnt;
 
-	error = vfs_path_lookup(mnt->mnt_root, mnt, name, 0, &nd);
-	mntput(mnt);	/* drop do_kern_mount reference */
-	if (error)
-		return ERR_PTR(error);
-
-	if (flags == O_RDWR)
-		error = may_open(&nd.path, MAY_READ|MAY_WRITE, flags);
-	else
-		error = may_open(&nd.path, MAY_WRITE, flags);
+	file = file_open_root(mnt->mnt_root, mnt, name, flags);
 
-	if (!error)
-		return dentry_open(nd.path.dentry, nd.path.mnt, flags,
-				   current_cred());
-
-	path_put(&nd.path);
-	return ERR_PTR(error);
+	mntput(mnt);	/* drop do_kern_mount reference */
+	return file;
 }
 
 static struct {
diff --git a/fs/open.c b/fs/open.c
index d05e18c60ba..48afc5c139d 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -959,6 +959,20 @@ struct file *filp_open(const char *filename, int flags, int mode)
 }
 EXPORT_SYMBOL(filp_open);
 
+struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
+			    const char *filename, int flags)
+{
+	struct open_flags op;
+	int lookup = build_open_flags(flags, 0, &op);
+	if (flags & O_CREAT)
+		return ERR_PTR(-EINVAL);
+	if (!filename && (flags & O_DIRECTORY))
+		if (!dentry->d_inode->i_op->lookup)
+			return ERR_PTR(-ENOTDIR);
+	return do_file_open_root(dentry, mnt, filename, &op, lookup);
+}
+EXPORT_SYMBOL(file_open_root);
+
 long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
 {
 	struct open_flags op;
-- 
cgit v1.2.3


From c8b91accfa1059d5565443193d89572eca2f5dd6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 12 Mar 2011 10:41:39 -0500
Subject: clean statfs-like syscalls up

New helpers: user_statfs() and fd_statfs(), taking userland pathname and
descriptor resp. and filling struct kstatfs.  Syscalls of statfs family
(native, compat and foreign - osf and hpux on alpha and parisc resp.)
switched to those.  Removes some boilerplate code, simplifies cleanup
on errors...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/compat.c |  48 ++++-------------
 fs/statfs.c | 176 ++++++++++++++++++++++++++++--------------------------------
 2 files changed, 91 insertions(+), 133 deletions(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index 691c3fd8ce1..a071775f3bb 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -262,35 +262,19 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
  */
 asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf)
 {
-	struct path path;
-	int error;
-
-	error = user_path(pathname, &path);
-	if (!error) {
-		struct kstatfs tmp;
-		error = vfs_statfs(&path, &tmp);
-		if (!error)
-			error = put_compat_statfs(buf, &tmp);
-		path_put(&path);
-	}
+	struct kstatfs tmp;
+	int error = user_statfs(pathname, &tmp);
+	if (!error)
+		error = put_compat_statfs(buf, &tmp);
 	return error;
 }
 
 asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf)
 {
-	struct file * file;
 	struct kstatfs tmp;
-	int error;
-
-	error = -EBADF;
-	file = fget(fd);
-	if (!file)
-		goto out;
-	error = vfs_statfs(&file->f_path, &tmp);
+	int error = fd_statfs(fd, &tmp);
 	if (!error)
 		error = put_compat_statfs(buf, &tmp);
-	fput(file);
-out:
 	return error;
 }
 
@@ -329,41 +313,29 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat
 
 asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf)
 {
-	struct path path;
+	struct kstatfs tmp;
 	int error;
 
 	if (sz != sizeof(*buf))
 		return -EINVAL;
 
-	error = user_path(pathname, &path);
-	if (!error) {
-		struct kstatfs tmp;
-		error = vfs_statfs(&path, &tmp);
-		if (!error)
-			error = put_compat_statfs64(buf, &tmp);
-		path_put(&path);
-	}
+	error = user_statfs(pathname, &tmp);
+	if (!error)
+		error = put_compat_statfs64(buf, &tmp);
 	return error;
 }
 
 asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf)
 {
-	struct file * file;
 	struct kstatfs tmp;
 	int error;
 
 	if (sz != sizeof(*buf))
 		return -EINVAL;
 
-	error = -EBADF;
-	file = fget(fd);
-	if (!file)
-		goto out;
-	error = vfs_statfs(&file->f_path, &tmp);
+	error = fd_statfs(fd, &tmp);
 	if (!error)
 		error = put_compat_statfs64(buf, &tmp);
-	fput(file);
-out:
 	return error;
 }
 
diff --git a/fs/statfs.c b/fs/statfs.c
index 30ea8c8a996..8244924dec5 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -73,149 +73,135 @@ int vfs_statfs(struct path *path, struct kstatfs *buf)
 }
 EXPORT_SYMBOL(vfs_statfs);
 
-static int do_statfs_native(struct path *path, struct statfs *buf)
+int user_statfs(const char __user *pathname, struct kstatfs *st)
 {
-	struct kstatfs st;
-	int retval;
+	struct path path;
+	int error = user_path(pathname, &path);
+	if (!error) {
+		error = vfs_statfs(&path, st);
+		path_put(&path);
+	}
+	return error;
+}
 
-	retval = vfs_statfs(path, &st);
-	if (retval)
-		return retval;
+int fd_statfs(int fd, struct kstatfs *st)
+{
+	struct file *file = fget(fd);
+	int error = -EBADF;
+	if (file) {
+		error = vfs_statfs(&file->f_path, st);
+		fput(file);
+	}
+	return error;
+}
 
-	if (sizeof(*buf) == sizeof(st))
-		memcpy(buf, &st, sizeof(st));
+static int do_statfs_native(struct kstatfs *st, struct statfs __user *p)
+{
+	struct statfs buf;
+
+	if (sizeof(buf) == sizeof(*st))
+		memcpy(&buf, st, sizeof(*st));
 	else {
-		if (sizeof buf->f_blocks == 4) {
-			if ((st.f_blocks | st.f_bfree | st.f_bavail |
-			     st.f_bsize | st.f_frsize) &
+		if (sizeof buf.f_blocks == 4) {
+			if ((st->f_blocks | st->f_bfree | st->f_bavail |
+			     st->f_bsize | st->f_frsize) &
 			    0xffffffff00000000ULL)
 				return -EOVERFLOW;
 			/*
 			 * f_files and f_ffree may be -1; it's okay to stuff
 			 * that into 32 bits
 			 */
-			if (st.f_files != -1 &&
-			    (st.f_files & 0xffffffff00000000ULL))
+			if (st->f_files != -1 &&
+			    (st->f_files & 0xffffffff00000000ULL))
 				return -EOVERFLOW;
-			if (st.f_ffree != -1 &&
-			    (st.f_ffree & 0xffffffff00000000ULL))
+			if (st->f_ffree != -1 &&
+			    (st->f_ffree & 0xffffffff00000000ULL))
 				return -EOVERFLOW;
 		}
 
-		buf->f_type = st.f_type;
-		buf->f_bsize = st.f_bsize;
-		buf->f_blocks = st.f_blocks;
-		buf->f_bfree = st.f_bfree;
-		buf->f_bavail = st.f_bavail;
-		buf->f_files = st.f_files;
-		buf->f_ffree = st.f_ffree;
-		buf->f_fsid = st.f_fsid;
-		buf->f_namelen = st.f_namelen;
-		buf->f_frsize = st.f_frsize;
-		buf->f_flags = st.f_flags;
-		memset(buf->f_spare, 0, sizeof(buf->f_spare));
+		buf.f_type = st->f_type;
+		buf.f_bsize = st->f_bsize;
+		buf.f_blocks = st->f_blocks;
+		buf.f_bfree = st->f_bfree;
+		buf.f_bavail = st->f_bavail;
+		buf.f_files = st->f_files;
+		buf.f_ffree = st->f_ffree;
+		buf.f_fsid = st->f_fsid;
+		buf.f_namelen = st->f_namelen;
+		buf.f_frsize = st->f_frsize;
+		buf.f_flags = st->f_flags;
+		memset(buf.f_spare, 0, sizeof(buf.f_spare));
 	}
+	if (copy_to_user(p, &buf, sizeof(buf)))
+		return -EFAULT;
 	return 0;
 }
 
-static int do_statfs64(struct path *path, struct statfs64 *buf)
+static int do_statfs64(struct kstatfs *st, struct statfs64 __user *p)
 {
-	struct kstatfs st;
-	int retval;
-
-	retval = vfs_statfs(path, &st);
-	if (retval)
-		return retval;
-
-	if (sizeof(*buf) == sizeof(st))
-		memcpy(buf, &st, sizeof(st));
+	struct statfs64 buf;
+	if (sizeof(buf) == sizeof(*st))
+		memcpy(&buf, st, sizeof(*st));
 	else {
-		buf->f_type = st.f_type;
-		buf->f_bsize = st.f_bsize;
-		buf->f_blocks = st.f_blocks;
-		buf->f_bfree = st.f_bfree;
-		buf->f_bavail = st.f_bavail;
-		buf->f_files = st.f_files;
-		buf->f_ffree = st.f_ffree;
-		buf->f_fsid = st.f_fsid;
-		buf->f_namelen = st.f_namelen;
-		buf->f_frsize = st.f_frsize;
-		buf->f_flags = st.f_flags;
-		memset(buf->f_spare, 0, sizeof(buf->f_spare));
+		buf.f_type = st->f_type;
+		buf.f_bsize = st->f_bsize;
+		buf.f_blocks = st->f_blocks;
+		buf.f_bfree = st->f_bfree;
+		buf.f_bavail = st->f_bavail;
+		buf.f_files = st->f_files;
+		buf.f_ffree = st->f_ffree;
+		buf.f_fsid = st->f_fsid;
+		buf.f_namelen = st->f_namelen;
+		buf.f_frsize = st->f_frsize;
+		buf.f_flags = st->f_flags;
+		memset(buf.f_spare, 0, sizeof(buf.f_spare));
 	}
+	if (copy_to_user(p, &buf, sizeof(buf)))
+		return -EFAULT;
 	return 0;
 }
 
 SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
 {
-	struct path path;
-	int error;
-
-	error = user_path(pathname, &path);
-	if (!error) {
-		struct statfs tmp;
-		error = do_statfs_native(&path, &tmp);
-		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-			error = -EFAULT;
-		path_put(&path);
-	}
+	struct kstatfs st;
+	int error = user_statfs(pathname, &st);
+	if (!error)
+		error = do_statfs_native(&st, buf);
 	return error;
 }
 
 SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
 {
-	struct path path;
-	long error;
-
+	struct kstatfs st;
+	int error;
 	if (sz != sizeof(*buf))
 		return -EINVAL;
-	error = user_path(pathname, &path);
-	if (!error) {
-		struct statfs64 tmp;
-		error = do_statfs64(&path, &tmp);
-		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-			error = -EFAULT;
-		path_put(&path);
-	}
+	error = user_statfs(pathname, &st);
+	if (!error)
+		error = do_statfs64(&st, buf);
 	return error;
 }
 
 SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
 {
-	struct file *file;
-	struct statfs tmp;
-	int error;
-
-	error = -EBADF;
-	file = fget(fd);
-	if (!file)
-		goto out;
-	error = do_statfs_native(&file->f_path, &tmp);
-	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-		error = -EFAULT;
-	fput(file);
-out:
+	struct kstatfs st;
+	int error = fd_statfs(fd, &st);
+	if (!error)
+		error = do_statfs_native(&st, buf);
 	return error;
 }
 
 SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
 {
-	struct file *file;
-	struct statfs64 tmp;
+	struct kstatfs st;
 	int error;
 
 	if (sz != sizeof(*buf))
 		return -EINVAL;
 
-	error = -EBADF;
-	file = fget(fd);
-	if (!file)
-		goto out;
-	error = do_statfs64(&file->f_path, &tmp);
-	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-		error = -EFAULT;
-	fput(file);
-out:
+	error = fd_statfs(fd, &st);
+	if (!error)
+		error = do_statfs64(&st, buf);
 	return error;
 }
 
-- 
cgit v1.2.3


From 5fe0c2378884e68beb532f5890cc0e3539ac747b Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 29 Jan 2011 18:43:25 +0530
Subject: exportfs: Return the minimum required handle size

The exportfs encode handle function should return the minimum required
handle size. This helps the user find out the handle size by passing a
handle size of 0 in the first step and then redoing the call with
the returned handle size value.

Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/export.c             | 8 ++++++--
 fs/exportfs/expfs.c           | 9 +++++++--
 fs/fat/inode.c                | 4 +++-
 fs/fuse/inode.c               | 4 +++-
 fs/gfs2/export.c              | 8 ++++++--
 fs/isofs/export.c             | 8 ++++++--
 fs/ocfs2/export.c             | 8 ++++++--
 fs/reiserfs/inode.c           | 7 ++++++-
 fs/udf/namei.c                | 7 ++++++-
 fs/xfs/linux-2.6/xfs_export.c | 4 +++-
 10 files changed, 52 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index ff27d7a477b..b4ffad859ad 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -21,9 +21,13 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
 	int len = *max_len;
 	int type;
 
-	if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) ||
-	    (connectable && len < BTRFS_FID_SIZE_CONNECTABLE))
+	if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
+		*max_len = BTRFS_FID_SIZE_CONNECTABLE;
 		return 255;
+	} else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
+		*max_len = BTRFS_FID_SIZE_NON_CONNECTABLE;
+		return 255;
+	}
 
 	len  = BTRFS_FID_SIZE_NON_CONNECTABLE;
 	type = FILEID_BTRFS_WITHOUT_PARENT;
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 4b6825740dd..cfe55731b6d 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -320,9 +320,14 @@ static int export_encode_fh(struct dentry *dentry, struct fid *fid,
 	struct inode * inode = dentry->d_inode;
 	int len = *max_len;
 	int type = FILEID_INO32_GEN;
-	
-	if (len < 2 || (connectable && len < 4))
+
+	if (connectable && (len < 4)) {
+		*max_len = 4;
+		return 255;
+	} else if (len < 2) {
+		*max_len = 2;
 		return 255;
+	}
 
 	len = 2;
 	fid->i32.ino = inode->i_ino;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 86753fe10bd..0e277ec4b61 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -757,8 +757,10 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable)
 	struct inode *inode =  de->d_inode;
 	u32 ipos_h, ipos_m, ipos_l;
 
-	if (len < 5)
+	if (len < 5) {
+		*lenp = 5;
 		return 255; /* no room */
+	}
 
 	ipos_h = MSDOS_I(inode)->i_pos >> 8;
 	ipos_m = (MSDOS_I(inode)->i_pos & 0xf0) << 24;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 9e3f68cc1bd..051b1a08452 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -637,8 +637,10 @@ static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
 	u64 nodeid;
 	u32 generation;
 
-	if (*max_len < len)
+	if (*max_len < len) {
+		*max_len = len;
 		return  255;
+	}
 
 	nodeid = get_fuse_inode(inode)->nodeid;
 	generation = inode->i_generation;
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 9023db8184f..b5a5e60df0d 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -36,9 +36,13 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len,
 	struct super_block *sb = inode->i_sb;
 	struct gfs2_inode *ip = GFS2_I(inode);
 
-	if (*len < GFS2_SMALL_FH_SIZE ||
-	    (connectable && *len < GFS2_LARGE_FH_SIZE))
+	if (connectable && (*len < GFS2_LARGE_FH_SIZE)) {
+		*len = GFS2_LARGE_FH_SIZE;
 		return 255;
+	} else if (*len < GFS2_SMALL_FH_SIZE) {
+		*len = GFS2_SMALL_FH_SIZE;
+		return 255;
+	}
 
 	fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32);
 	fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF);
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index ed752cb3847..dd4687ff30d 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -124,9 +124,13 @@ isofs_export_encode_fh(struct dentry *dentry,
 	 * offset of the inode and the upper 16 bits of fh32[1] to
 	 * hold the offset of the parent.
 	 */
-
-	if (len < 3 || (connectable && len < 5))
+	if (connectable && (len < 5)) {
+		*max_len = 5;
+		return 255;
+	} else if (len < 3) {
+		*max_len = 3;
 		return 255;
+	}
 
 	len = 3;
 	fh32[0] = ei->i_iget5_block;
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 5dbc3062b4f..254652a9b54 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -197,8 +197,12 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
 		   dentry->d_name.len, dentry->d_name.name,
 		   fh, len, connectable);
 
-	if (len < 3 || (connectable && len < 6)) {
-		mlog(ML_ERROR, "fh buffer is too small for encoding\n");
+	if (connectable && (len < 6)) {
+		*max_len = 6;
+		type = 255;
+		goto bail;
+	} else if (len < 3) {
+		*max_len = 3;
 		type = 255;
 		goto bail;
 	}
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 0bae036831e..1bba24bad82 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1593,8 +1593,13 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
 	struct inode *inode = dentry->d_inode;
 	int maxlen = *lenp;
 
-	if (maxlen < 3)
+	if (need_parent && (maxlen < 5)) {
+		*lenp = 5;
 		return 255;
+	} else if (maxlen < 3) {
+		*lenp = 3;
+		return 255;
+	}
 
 	data[0] = inode->i_ino;
 	data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index b7c338d5e9d..f1dce848ef9 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -1286,8 +1286,13 @@ static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp,
 	struct fid *fid = (struct fid *)fh;
 	int type = FILEID_UDF_WITHOUT_PARENT;
 
-	if (len < 3 || (connectable && len < 5))
+	if (connectable && (len < 5)) {
+		*lenp = 5;
+		return 255;
+	} else if (len < 3) {
+		*lenp = 3;
 		return 255;
+	}
 
 	*lenp = 3;
 	fid->udf.block = location.logicalBlockNum;
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index fc0114da7fd..f4f878fc008 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -89,8 +89,10 @@ xfs_fs_encode_fh(
 	 * seven combinations work.  The real answer is "don't use v2".
 	 */
 	len = xfs_fileid_length(fileid_type);
-	if (*max_len < len)
+	if (*max_len < len) {
+		*max_len = len;
 		return 255;
+	}
 	*max_len = len;
 
 	switch (fileid_type) {
-- 
cgit v1.2.3


From 6c474f7bc1ef70bc05ce486e7e7e524124122de8 Mon Sep 17 00:00:00 2001
From: Maxim <maxim.patlasov@gmail.com>
Date: Mon, 14 Mar 2011 13:19:21 +0000
Subject: GFS2: Adding missing unlock_page()

gfs2_write_begin() calls grab_cache_page_write_begin(), which returns a *locked*
page. The corresponding error-handling path lacks an unlock_page() call:

> out:
> 	if (error == 0)
> 		return 0;
>
> 	page_cache_release(page);

The whole system hangs if gfs2_unstuff_dinode(), called from gfs2_write_begin(),
fails for some reason.

Reported-by: Maxim <maxim.patlasov@gmail.com>
Signed-off-by: Maxim <maxim.patlasov@gmail.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/aops.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 4f36f8832b9..aad77e4f61b 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -695,6 +695,7 @@ out:
 	if (error == 0)
 		return 0;
 
+	unlock_page(page);
 	page_cache_release(page);
 
 	gfs2_trans_end(sdp);
-- 
cgit v1.2.3


From 1eafbfeb7bdf59cfe173304c76188f3fd5f1fd05 Mon Sep 17 00:00:00 2001
From: Timo Warns <Warns@pre-sense.de>
Date: Mon, 14 Mar 2011 14:59:33 +0100
Subject: Fix corrupted OSF partition table parsing

The kernel automatically evaluates partition tables of storage devices.
The code for evaluating OSF partitions contains a bug that leaks data
from kernel heap memory to userspace for certain corrupted OSF
partitions.

In more detail:

  for (i = 0 ; i < le16_to_cpu(label->d_npartitions); i++, partition++) {

iterates from 0 to d_npartitions - 1, where d_npartitions is read from
the partition table without validation and partition is a pointer to an
array of at most 8 d_partitions.

Add the proper and obvious validation.

Signed-off-by: Timo Warns <warns@pre-sense.de>
Cc: stable@kernel.org
[ Changed the patch trivially to not repeat the whole le16_to_cpu()
  thing, and to use an explicit constant for the magic value '8' ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/partitions/osf.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c
index 48cec7cbca1..be03a0b08b4 100644
--- a/fs/partitions/osf.c
+++ b/fs/partitions/osf.c
@@ -10,10 +10,13 @@
 #include "check.h"
 #include "osf.h"
 
+#define MAX_OSF_PARTITIONS 8
+
 int osf_partition(struct parsed_partitions *state)
 {
 	int i;
 	int slot = 1;
+	unsigned int npartitions;
 	Sector sect;
 	unsigned char *data;
 	struct disklabel {
@@ -45,7 +48,7 @@ int osf_partition(struct parsed_partitions *state)
 			u8  p_fstype;
 			u8  p_frag;
 			__le16 p_cpg;
-		} d_partitions[8];
+		} d_partitions[MAX_OSF_PARTITIONS];
 	} * label;
 	struct d_partition * partition;
 
@@ -63,7 +66,12 @@ int osf_partition(struct parsed_partitions *state)
 		put_dev_sector(sect);
 		return 0;
 	}
-	for (i = 0 ; i < le16_to_cpu(label->d_npartitions); i++, partition++) {
+	npartitions = le16_to_cpu(label->d_npartitions);
+	if (npartitions > MAX_OSF_PARTITIONS) {
+		put_dev_sector(sect);
+		return 0;
+	}
+	for (i = 0 ; i < npartitions; i++, partition++) {
 		if (slot == state->limit)
 		        break;
 		if (le32_to_cpu(partition->p_size))
-- 
cgit v1.2.3


From f52e0c11305aa09ed56cad97ffc8f0cdc3d78b5d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 14 Mar 2011 18:56:51 -0400
Subject: New AT_... flag: AT_EMPTY_PATH

For name_to_handle_at(2) we'll want an ...at()-style syscall that
would also be usable for non-directory descriptors (with an empty relative
pathname).  Introduce new flag (AT_EMPTY_PATH) to deal with that and
corresponding LOOKUP_EMPTY; teach user_path_at() and path_init() to
deal with the latter.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index abc8d2df121..83e92bab79a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -136,7 +136,7 @@ static int do_getname(const char __user *filename, char *page)
 	return retval;
 }
 
-char * getname(const char __user * filename)
+static char *getname_flags(const char __user * filename, int flags)
 {
 	char *tmp, *result;
 
@@ -147,14 +147,21 @@ char * getname(const char __user * filename)
 
 		result = tmp;
 		if (retval < 0) {
-			__putname(tmp);
-			result = ERR_PTR(retval);
+			if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
+				__putname(tmp);
+				result = ERR_PTR(retval);
+			}
 		}
 	}
 	audit_getname(result);
 	return result;
 }
 
+char *getname(const char __user * filename)
+{
+	return getname_flags(filename, 0);
+}
+
 #ifdef CONFIG_AUDITSYSCALL
 void putname(const char *name)
 {
@@ -1544,13 +1551,15 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 
 		dentry = file->f_path.dentry;
 
-		retval = -ENOTDIR;
-		if (!S_ISDIR(dentry->d_inode->i_mode))
-			goto fput_fail;
+		if (*name) {
+			retval = -ENOTDIR;
+			if (!S_ISDIR(dentry->d_inode->i_mode))
+				goto fput_fail;
 
-		retval = file_permission(file, MAY_EXEC);
-		if (retval)
-			goto fput_fail;
+			retval = file_permission(file, MAY_EXEC);
+			if (retval)
+				goto fput_fail;
+		}
 
 		nd->path = file->f_path;
 		if (flags & LOOKUP_RCU) {
@@ -1759,7 +1768,7 @@ int user_path_at(int dfd, const char __user *name, unsigned flags,
 		 struct path *path)
 {
 	struct nameidata nd;
-	char *tmp = getname(name);
+	char *tmp = getname_flags(name, flags);
 	int err = PTR_ERR(tmp);
 	if (!IS_ERR(tmp)) {
 
-- 
cgit v1.2.3


From 0a5e5f122c756d1c1a6ca712eda76ea8664e5fd9 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 14 Mar 2011 20:57:44 -0400
Subject: nfsd: fix compile error

"fs/built-in.o: In function `supported_enctypes_show':
nfsctl.c:(.text+0x7beb0): undefined reference to `gss_mech_get_by_name'
nfsctl.c:(.text+0x7bebc): undefined reference to `gss_mech_put'
"

Reported-by: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfsctl.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 35dcfa8eba2..1f5eae40f34 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -189,6 +189,7 @@ static struct file_operations export_features_operations = {
 	.release	= single_release,
 };
 
+#ifdef CONFIG_SUNRPC_GSS
 static int supported_enctypes_show(struct seq_file *m, void *v)
 {
 	struct gss_api_mech *k5mech;
@@ -214,6 +215,7 @@ static struct file_operations supported_enctypes_ops = {
 	.llseek		= seq_lseek,
 	.release	= single_release,
 };
+#endif /* CONFIG_SUNRPC_GSS */
 
 extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
 extern int nfsd_pool_stats_release(struct inode *inode, struct file *file);
@@ -1425,7 +1427,9 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 		[NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
 		[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
+#ifdef CONFIG_SUNRPC_GSS
 		[NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO},
+#endif /* CONFIG_SUNRPC_GSS */
 #ifdef CONFIG_NFSD_V4
 		[NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
-- 
cgit v1.2.3


From 990d6c2d7aee921e3bce22b2d6a750fd552262be Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 29 Jan 2011 18:43:26 +0530
Subject: vfs: Add name to file handle conversion support

The syscall also returns the mount id, which can be used
to look up file system specific information such as the uuid
in /proc/<pid>/mountinfo

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/Kconfig   |   2 +-
 fs/Makefile  |   2 ++
 fs/fhandle.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 110 insertions(+), 1 deletion(-)
 create mode 100644 fs/fhandle.c

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 3db9caa57ed..7cb53aafac1 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -47,7 +47,7 @@ config FS_POSIX_ACL
 	def_bool n
 
 config EXPORTFS
-	tristate
+	bool
 
 config FILE_LOCKING
 	bool "Enable POSIX file locking API" if EXPERT
diff --git a/fs/Makefile b/fs/Makefile
index a7f7cef0c0c..ba01202844c 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -48,6 +48,8 @@ obj-$(CONFIG_FS_POSIX_ACL)	+= posix_acl.o xattr_acl.o
 obj-$(CONFIG_NFS_COMMON)	+= nfs_common/
 obj-$(CONFIG_GENERIC_ACL)	+= generic_acl.o
 
+obj-$(CONFIG_FHANDLE)		+= fhandle.o
+
 obj-y				+= quota/
 
 obj-$(CONFIG_PROC_FS)		+= proc/
diff --git a/fs/fhandle.c b/fs/fhandle.c
new file mode 100644
index 00000000000..9f79e743a84
--- /dev/null
+++ b/fs/fhandle.c
@@ -0,0 +1,107 @@
+#include <linux/syscalls.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/exportfs.h>
+#include <asm/uaccess.h>
+#include "internal.h"
+
+static long do_sys_name_to_handle(struct path *path,
+				  struct file_handle __user *ufh,
+				  int __user *mnt_id)
+{
+	long retval;
+	struct file_handle f_handle;
+	int handle_dwords, handle_bytes;
+	struct file_handle *handle = NULL;
+
+	/*
+	 * We need t make sure wether the file system
+	 * support decoding of the file handle
+	 */
+	if (!path->mnt->mnt_sb->s_export_op ||
+	    !path->mnt->mnt_sb->s_export_op->fh_to_dentry)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle)))
+		return -EFAULT;
+
+	if (f_handle.handle_bytes > MAX_HANDLE_SZ)
+		return -EINVAL;
+
+	handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
+			 GFP_KERNEL);
+	if (!handle)
+		return -ENOMEM;
+
+	/* convert handle size to  multiple of sizeof(u32) */
+	handle_dwords = f_handle.handle_bytes >> 2;
+
+	/* we ask for a non connected handle */
+	retval = exportfs_encode_fh(path->dentry,
+				    (struct fid *)handle->f_handle,
+				    &handle_dwords,  0);
+	handle->handle_type = retval;
+	/* convert handle size to bytes */
+	handle_bytes = handle_dwords * sizeof(u32);
+	handle->handle_bytes = handle_bytes;
+	if ((handle->handle_bytes > f_handle.handle_bytes) ||
+	    (retval == 255) || (retval == -ENOSPC)) {
+		/* As per old exportfs_encode_fh documentation
+		 * we could return ENOSPC to indicate overflow
+		 * But file system returned 255 always. So handle
+		 * both the values
+		 */
+		/*
+		 * set the handle size to zero so we copy only
+		 * non variable part of the file_handle
+		 */
+		handle_bytes = 0;
+		retval = -EOVERFLOW;
+	} else
+		retval = 0;
+	/* copy the mount id */
+	if (copy_to_user(mnt_id, &path->mnt->mnt_id, sizeof(*mnt_id)) ||
+	    copy_to_user(ufh, handle,
+			 sizeof(struct file_handle) + handle_bytes))
+		retval = -EFAULT;
+	kfree(handle);
+	return retval;
+}
+
+/**
+ * sys_name_to_handle_at: convert name to handle
+ * @dfd: directory relative to which name is interpreted if not absolute
+ * @name: name that should be converted to handle.
+ * @handle: resulting file handle
+ * @mnt_id: mount id of the file system containing the file
+ * @flag: flag value to indicate whether to follow symlink or not
+ *
+ * @handle->handle_size indicate the space available to store the
+ * variable part of the file handle in bytes. If there is not
+ * enough space, the field is updated to return the minimum
+ * value required.
+ */
+SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
+		struct file_handle __user *, handle, int __user *, mnt_id,
+		int, flag)
+{
+	struct path path;
+	int lookup_flags;
+	int err;
+
+	if ((flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
+		return -EINVAL;
+
+	lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0;
+	if (flag & AT_EMPTY_PATH)
+		lookup_flags |= LOOKUP_EMPTY;
+	err = user_path_at(dfd, name, lookup_flags, &path);
+	if (!err) {
+		err = do_sys_name_to_handle(&path, handle, mnt_id);
+		path_put(&path);
+	}
+	return err;
+}
-- 
cgit v1.2.3


From becfd1f37544798cbdfd788f32c827160fab98c1 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 29 Jan 2011 18:43:26 +0530
Subject: vfs: Add open by file handle support

[AV: duplicate of open() guts removed; file_open_root() used instead]

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/compat.c         |  13 +++++
 fs/exportfs/expfs.c |   2 +
 fs/fhandle.c        | 158 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/internal.h       |   3 +
 4 files changed, 176 insertions(+)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index a071775f3bb..c6d31a3bab8 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -2284,3 +2284,16 @@ asmlinkage long compat_sys_timerfd_gettime(int ufd,
 }
 
 #endif /* CONFIG_TIMERFD */
+
+#ifdef CONFIG_FHANDLE
+/*
+ * Exactly like fs/open.c:sys_open_by_handle_at(), except that it
+ * doesn't set the O_LARGEFILE flag.
+ */
+asmlinkage long
+compat_sys_open_by_handle_at(int mountdirfd,
+			     struct file_handle __user *handle, int flags)
+{
+	return do_handle_open(mountdirfd, handle, flags);
+}
+#endif
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index cfe55731b6d..b05acb79613 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -374,6 +374,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
 	/*
 	 * Try to get any dentry for the given file handle from the filesystem.
 	 */
+	if (!nop || !nop->fh_to_dentry)
+		return ERR_PTR(-ESTALE);
 	result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type);
 	if (!result)
 		result = ERR_PTR(-ESTALE);
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 9f79e743a84..bf93ad2bee0 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -5,6 +5,8 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/exportfs.h>
+#include <linux/fs_struct.h>
+#include <linux/fsnotify.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -105,3 +107,159 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
 	}
 	return err;
 }
+
+static struct vfsmount *get_vfsmount_from_fd(int fd)
+{
+	struct path path;
+
+	if (fd == AT_FDCWD) {
+		struct fs_struct *fs = current->fs;
+		spin_lock(&fs->lock);
+		path = fs->pwd;
+		mntget(path.mnt);
+		spin_unlock(&fs->lock);
+	} else {
+		int fput_needed;
+		struct file *file = fget_light(fd, &fput_needed);
+		if (!file)
+			return ERR_PTR(-EBADF);
+		path = file->f_path;
+		mntget(path.mnt);
+		fput_light(file, fput_needed);
+	}
+	return path.mnt;
+}
+
+static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
+{
+	return 1;
+}
+
+static int do_handle_to_path(int mountdirfd, struct file_handle *handle,
+			     struct path *path)
+{
+	int retval = 0;
+	int handle_dwords;
+
+	path->mnt = get_vfsmount_from_fd(mountdirfd);
+	if (IS_ERR(path->mnt)) {
+		retval = PTR_ERR(path->mnt);
+		goto out_err;
+	}
+	/* change the handle size to multiple of sizeof(u32) */
+	handle_dwords = handle->handle_bytes >> 2;
+	path->dentry = exportfs_decode_fh(path->mnt,
+					  (struct fid *)handle->f_handle,
+					  handle_dwords, handle->handle_type,
+					  vfs_dentry_acceptable, NULL);
+	if (IS_ERR(path->dentry)) {
+		retval = PTR_ERR(path->dentry);
+		goto out_mnt;
+	}
+	return 0;
+out_mnt:
+	mntput(path->mnt);
+out_err:
+	return retval;
+}
+
+static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
+		   struct path *path)
+{
+	int retval = 0;
+	struct file_handle f_handle;
+	struct file_handle *handle = NULL;
+
+	/*
+	 * With handle we don't look at the execute bit on the
+	 * the directory. Ideally we would like CAP_DAC_SEARCH.
+	 * But we don't have that
+	 */
+	if (!capable(CAP_DAC_READ_SEARCH)) {
+		retval = -EPERM;
+		goto out_err;
+	}
+	if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) {
+		retval = -EFAULT;
+		goto out_err;
+	}
+	if ((f_handle.handle_bytes > MAX_HANDLE_SZ) ||
+	    (f_handle.handle_bytes == 0)) {
+		retval = -EINVAL;
+		goto out_err;
+	}
+	handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
+			 GFP_KERNEL);
+	if (!handle) {
+		retval = -ENOMEM;
+		goto out_err;
+	}
+	/* copy the full handle */
+	if (copy_from_user(handle, ufh,
+			   sizeof(struct file_handle) +
+			   f_handle.handle_bytes)) {
+		retval = -EFAULT;
+		goto out_handle;
+	}
+
+	retval = do_handle_to_path(mountdirfd, handle, path);
+
+out_handle:
+	kfree(handle);
+out_err:
+	return retval;
+}
+
+long do_handle_open(int mountdirfd,
+		    struct file_handle __user *ufh, int open_flag)
+{
+	long retval = 0;
+	struct path path;
+	struct file *file;
+	int fd;
+
+	retval = handle_to_path(mountdirfd, ufh, &path);
+	if (retval)
+		return retval;
+
+	fd = get_unused_fd_flags(open_flag);
+	if (fd < 0) {
+		path_put(&path);
+		return fd;
+	}
+	file = file_open_root(path.dentry, path.mnt, "", open_flag);
+	if (IS_ERR(file)) {
+		put_unused_fd(fd);
+		retval =  PTR_ERR(file);
+	} else {
+		retval = fd;
+		fsnotify_open(file);
+		fd_install(fd, file);
+	}
+	path_put(&path);
+	return retval;
+}
+
+/**
+ * sys_open_by_handle_at: Open the file handle
+ * @mountdirfd: directory file descriptor
+ * @handle: file handle to be opened
+ * @flag: open flags.
+ *
+ * @mountdirfd indicate the directory file descriptor
+ * of the mount point. file handle is decoded relative
+ * to the vfsmount pointed by the @mountdirfd. @flags
+ * value is same as the open(2) flags.
+ */
+SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd,
+		struct file_handle __user *, handle,
+		int, flags)
+{
+	long ret;
+
+	if (force_o_largefile())
+		flags |= O_LARGEFILE;
+
+	ret = do_handle_open(mountdirfd, handle, flags);
+	return ret;
+}
diff --git a/fs/internal.h b/fs/internal.h
index 52abc5287f5..f3d15de44b1 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -117,6 +117,9 @@ extern struct file *do_filp_open(int dfd, const char *pathname,
 extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
 		const char *, const struct open_flags *, int lookup_flags);
 
+extern long do_handle_open(int mountdirfd,
+			   struct file_handle __user *ufh, int open_flag);
+
 /*
  * inode.c
  */
-- 
cgit v1.2.3


From aae8a97d3ec30788790d1720b71d76fd8eb44b73 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 29 Jan 2011 18:43:27 +0530
Subject: fs: Don't allow to create hardlink for deleted file

Add an inode->i_nlink == 0 check in the VFS. Some of the file systems
do this internally. A followup patch will remove those instances.
This is needed to ensure that with link by handle we don't allow
creating a hardlink to an unlinked file. The check also prevents a race
between unlink and link.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 83e92bab79a..33be51a2ddb 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2906,7 +2906,11 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 		return error;
 
 	mutex_lock(&inode->i_mutex);
-	error = dir->i_op->link(old_dentry, dir, new_dentry);
+	/* Make sure we don't allow creating hardlink to an unlinked file */
+	if (inode->i_nlink == 0)
+		error =  -ENOENT;
+	else
+		error = dir->i_op->link(old_dentry, dir, new_dentry);
 	mutex_unlock(&inode->i_mutex);
 	if (!error)
 		fsnotify_link(dir, inode, new_dentry);
-- 
cgit v1.2.3


From f17b6042073e7000a90063f7edbca59a5bd1caa2 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 29 Jan 2011 18:43:30 +0530
Subject: fs: Remove i_nlink check from file system link callback

Now that the VFS checks for inode->i_nlink == 0 and returns the proper
error, remove the similar checks from the file systems.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/inode.c    |  3 ---
 fs/ext3/namei.c     |  7 -------
 fs/ext4/namei.c     |  7 -------
 fs/jfs/namei.c      |  3 ---
 fs/reiserfs/namei.c |  4 ----
 fs/ubifs/dir.c      | 18 ------------------
 6 files changed, 42 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0efdb65953c..c23f050f47c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4806,9 +4806,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	int err;
 	int drop_inode = 0;
 
-	if (inode->i_nlink == 0)
-		return -ENOENT;
-
 	/* do not allow sys_link's with other subvols of the same device */
 	if (root->objectid != BTRFS_I(inode)->root->objectid)
 		return -EPERM;
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index b27ba71810e..561f6925626 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2253,13 +2253,6 @@ static int ext3_link (struct dentry * old_dentry,
 
 	dquot_initialize(dir);
 
-	/*
-	 * Return -ENOENT if we've raced with unlink and i_nlink is 0.  Doing
-	 * otherwise has the potential to corrupt the orphan inode list.
-	 */
-	if (inode->i_nlink == 0)
-		return -ENOENT;
-
 retry:
 	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
 					EXT3_INDEX_EXTRA_TRANS_BLOCKS);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 5485390d32c..e781b7ea563 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2304,13 +2304,6 @@ static int ext4_link(struct dentry *old_dentry,
 
 	dquot_initialize(dir);
 
-	/*
-	 * Return -ENOENT if we've raced with unlink and i_nlink is 0.  Doing
-	 * otherwise has the potential to corrupt the orphan inode list.
-	 */
-	if (inode->i_nlink == 0)
-		return -ENOENT;
-
 retry:
 	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
 					EXT4_INDEX_EXTRA_TRANS_BLOCKS);
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 5a2b269428a..3f04a180493 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -809,9 +809,6 @@ static int jfs_link(struct dentry *old_dentry,
 	if (ip->i_nlink == JFS_LINK_MAX)
 		return -EMLINK;
 
-	if (ip->i_nlink == 0)
-		return -ENOENT;
-
 	dquot_initialize(dir);
 
 	tid = txBegin(ip->i_sb, 0);
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 68fdf45cc6c..4b2eb564fda 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -1122,10 +1122,6 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
 		reiserfs_write_unlock(dir->i_sb);
 		return -EMLINK;
 	}
-	if (inode->i_nlink == 0) {
-		reiserfs_write_unlock(dir->i_sb);
-		return -ENOENT;
-	}
 
 	/* inc before scheduling so reiserfs_unlink knows we are here */
 	inc_nlink(inode);
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 14f64b689d7..7217d67a80a 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -522,24 +522,6 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
 	ubifs_assert(mutex_is_locked(&dir->i_mutex));
 	ubifs_assert(mutex_is_locked(&inode->i_mutex));
 
-	/*
-	 * Return -ENOENT if we've raced with unlink and i_nlink is 0.  Doing
-	 * otherwise has the potential to corrupt the orphan inode list.
-	 *
-	 * Indeed, consider a scenario when 'vfs_link(dirA/fileA)' and
-	 * 'vfs_unlink(dirA/fileA, dirB/fileB)' race. 'vfs_link()' does not
-	 * lock 'dirA->i_mutex', so this is possible. Both of the functions
-	 * lock 'fileA->i_mutex' though. Suppose 'vfs_unlink()' wins, and takes
-	 * 'fileA->i_mutex' mutex first. Suppose 'fileA->i_nlink' is 1. In this
-	 * case 'ubifs_unlink()' will drop the last reference, and put 'inodeA'
-	 * to the list of orphans. After this, 'vfs_link()' will link
-	 * 'dirB/fileB' to 'inodeA'. This is a problem because, for example,
-	 * the subsequent 'vfs_unlink(dirB/fileB)' will add the same inode
-	 * to the list of orphans.
-	 */
-	 if (inode->i_nlink == 0)
-		 return -ENOENT;
-
 	err = dbg_check_synced_i_size(inode);
 	if (err)
 		return err;
-- 
cgit v1.2.3


From 93f1c20bc8cdb757be50566eff88d65c3b26881f Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 29 Jan 2011 18:43:38 +0530
Subject: vfs: Export file system uuid via /proc/<pid>/mountinfo

We add a per superblock uuid field. File systems should
update the uuid in the fill_super callback

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index d1edf26025d..dffe6f49ab9 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1002,6 +1002,18 @@ const struct seq_operations mounts_op = {
 	.show	= show_vfsmnt
 };
 
+static int uuid_is_nil(u8 *uuid)
+{
+	int i;
+	u8  *cp = (u8 *)uuid;
+
+	for (i = 0; i < 16; i++) {
+		if (*cp++)
+			return 0;
+	}
+	return 1;
+}
+
 static int show_mountinfo(struct seq_file *m, void *v)
 {
 	struct proc_mounts *p = m->private;
@@ -1040,6 +1052,10 @@ static int show_mountinfo(struct seq_file *m, void *v)
 	if (IS_MNT_UNBINDABLE(mnt))
 		seq_puts(m, " unbindable");
 
+	if (!uuid_is_nil(mnt->mnt_sb->s_uuid))
+		/* print the uuid */
+		seq_printf(m, " uuid:%pU", mnt->mnt_sb->s_uuid);
+
 	/* Filesystem specific data */
 	seq_puts(m, " - ");
 	show_type(m, sb);
-- 
cgit v1.2.3


From 03cb5f03dcb26846fcad345d8c15aae91579a53d Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 29 Jan 2011 18:43:39 +0530
Subject: ext3: Copy fs UUID to superblock.

File system UUID is made available to application
via  /proc/<pid>/mountinfo

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext3/super.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 85c8cc8f247..9cc19a1dea8 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1936,6 +1936,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	sb->s_qcop = &ext3_qctl_operations;
 	sb->dq_op = &ext3_quota_operations;
 #endif
+	memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
 	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
 	mutex_init(&sbi->s_orphan_lock);
 	mutex_init(&sbi->s_resize_lock);
-- 
cgit v1.2.3


From f2fa2ffc2046fdc35f96366d1ec8675f4d578522 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 29 Jan 2011 18:43:40 +0530
Subject: ext4: Copy fs UUID to superblock

File system UUID is made available to application
via  /proc/<pid>/mountinfo

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext4/super.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f6a318f836b..5977b356a43 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3415,6 +3415,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_qcop = &ext4_qctl_operations;
 	sb->dq_op = &ext4_quota_operations;
 #endif
+	memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
+
 	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
 	mutex_init(&sbi->s_orphan_lock);
 	mutex_init(&sbi->s_resize_lock);
-- 
cgit v1.2.3


From 1abf0c718f15a56a0a435588d1b104c7a37dc9bd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 13 Mar 2011 03:51:11 -0400
Subject: New kind of open files - "location only".

New flag for open(2) - O_PATH.  Semantics:
	* pathname is resolved, but the file itself is _NOT_ opened
as far as filesystem is concerned.
	* almost all operations on the resulting descriptors shall
fail with -EBADF.  Exceptions are:
	1) operations on descriptors themselves (i.e.
		close(), dup(), dup2(), dup3(), fcntl(fd, F_DUPFD),
		fcntl(fd, F_DUPFD_CLOEXEC, ...), fcntl(fd, F_GETFD),
		fcntl(fd, F_SETFD, ...))
	2) fcntl(fd, F_GETFL), for a common non-destructive way to
		check if descriptor is open
	3) "dfd" arguments of ...at(2) syscalls, i.e. the starting
		points of pathname resolution
	* closing such descriptor does *NOT* affect dnotify or
posix locks.
	* permissions are checked as usual along the way to file;
no permission checks are applied to the file itself.  Of course,
giving such thing to syscall will result in permission checks (at
the moment it means checking that starting point of ....at() is
a directory and caller has exec permissions on it).

fget() and fget_light() return NULL on such descriptors; use of
fget_raw() and fget_raw_light() is needed to get them.  That protects
existing code from dealing with those things.

There are two things still missing (they come in the next commits):
one is handling of symlinks (right now we refuse to open them that
way; see the next commit for semantics related to those) and another
is descriptor passing via SCM_RIGHTS datagrams.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fcntl.c      | 37 ++++++++++++++++++++++++++++++++-----
 fs/file_table.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 fs/namei.c      |  2 +-
 fs/open.c       | 35 +++++++++++++++++++++++++++++------
 4 files changed, 110 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/fcntl.c b/fs/fcntl.c
index cb1026181bd..6c82e5bac03 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -131,7 +131,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
 SYSCALL_DEFINE1(dup, unsigned int, fildes)
 {
 	int ret = -EBADF;
-	struct file *file = fget(fildes);
+	struct file *file = fget_raw(fildes);
 
 	if (file) {
 		ret = get_unused_fd();
@@ -426,15 +426,35 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 	return err;
 }
 
+static int check_fcntl_cmd(unsigned cmd)
+{
+	switch (cmd) {
+	case F_DUPFD:
+	case F_DUPFD_CLOEXEC:
+	case F_GETFD:
+	case F_SETFD:
+	case F_GETFL:
+		return 1;
+	}
+	return 0;
+}
+
 SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
 {	
 	struct file *filp;
 	long err = -EBADF;
 
-	filp = fget(fd);
+	filp = fget_raw(fd);
 	if (!filp)
 		goto out;
 
+	if (unlikely(filp->f_mode & FMODE_PATH)) {
+		if (!check_fcntl_cmd(cmd)) {
+			fput(filp);
+			goto out;
+		}
+	}
+
 	err = security_file_fcntl(filp, cmd, arg);
 	if (err) {
 		fput(filp);
@@ -456,10 +476,17 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
 	long err;
 
 	err = -EBADF;
-	filp = fget(fd);
+	filp = fget_raw(fd);
 	if (!filp)
 		goto out;
 
+	if (unlikely(filp->f_mode & FMODE_PATH)) {
+		if (!check_fcntl_cmd(cmd)) {
+			fput(filp);
+			goto out;
+		}
+	}
+
 	err = security_file_fcntl(filp, cmd, arg);
 	if (err) {
 		fput(filp);
@@ -808,14 +835,14 @@ static int __init fcntl_init(void)
 	 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
 	 * is defined as O_NONBLOCK on some platforms and not on others.
 	 */
-	BUILD_BUG_ON(18 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
+	BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
 		O_RDONLY	| O_WRONLY	| O_RDWR	|
 		O_CREAT		| O_EXCL	| O_NOCTTY	|
 		O_TRUNC		| O_APPEND	| /* O_NONBLOCK	| */
 		__O_SYNC	| O_DSYNC	| FASYNC	|
 		O_DIRECT	| O_LARGEFILE	| O_DIRECTORY	|
 		O_NOFOLLOW	| O_NOATIME	| O_CLOEXEC	|
-		__FMODE_EXEC
+		__FMODE_EXEC	| O_PATH
 		));
 
 	fasync_cache = kmem_cache_create("fasync_cache",
diff --git a/fs/file_table.c b/fs/file_table.c
index eb36b6b17e2..3c16e1ca163 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -276,11 +276,10 @@ struct file *fget(unsigned int fd)
 	rcu_read_lock();
 	file = fcheck_files(files, fd);
 	if (file) {
-		if (!atomic_long_inc_not_zero(&file->f_count)) {
-			/* File object ref couldn't be taken */
-			rcu_read_unlock();
-			return NULL;
-		}
+		/* File object ref couldn't be taken */
+		if (file->f_mode & FMODE_PATH ||
+		    !atomic_long_inc_not_zero(&file->f_count))
+			file = NULL;
 	}
 	rcu_read_unlock();
 
@@ -289,6 +288,23 @@ struct file *fget(unsigned int fd)
 
 EXPORT_SYMBOL(fget);
 
+struct file *fget_raw(unsigned int fd)
+{
+	struct file *file;
+	struct files_struct *files = current->files;
+
+	rcu_read_lock();
+	file = fcheck_files(files, fd);
+	if (file) {
+		/* File object ref couldn't be taken */
+		if (!atomic_long_inc_not_zero(&file->f_count))
+			file = NULL;
+	}
+	rcu_read_unlock();
+
+	return file;
+}
+
 /*
  * Lightweight file lookup - no refcnt increment if fd table isn't shared.
  *
@@ -310,6 +326,33 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
 	struct file *file;
 	struct files_struct *files = current->files;
 
+	*fput_needed = 0;
+	if (atomic_read(&files->count) == 1) {
+		file = fcheck_files(files, fd);
+		if (file && (file->f_mode & FMODE_PATH))
+			file = NULL;
+	} else {
+		rcu_read_lock();
+		file = fcheck_files(files, fd);
+		if (file) {
+			if (!(file->f_mode & FMODE_PATH) &&
+			    atomic_long_inc_not_zero(&file->f_count))
+				*fput_needed = 1;
+			else
+				/* Didn't get the reference, someone's freed */
+				file = NULL;
+		}
+		rcu_read_unlock();
+	}
+
+	return file;
+}
+
+struct file *fget_raw_light(unsigned int fd, int *fput_needed)
+{
+	struct file *file;
+	struct files_struct *files = current->files;
+
 	*fput_needed = 0;
 	if (atomic_read(&files->count) == 1) {
 		file = fcheck_files(files, fd);
diff --git a/fs/namei.c b/fs/namei.c
index 33be51a2ddb..e1d9f90d977 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1544,7 +1544,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 	} else {
 		struct dentry *dentry;
 
-		file = fget_light(dfd, &fput_needed);
+		file = fget_raw_light(dfd, &fput_needed);
 		retval = -EBADF;
 		if (!file)
 			goto out_fail;
diff --git a/fs/open.c b/fs/open.c
index 48afc5c139d..14a51de01f5 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -669,11 +669,16 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
 					int (*open)(struct inode *, struct file *),
 					const struct cred *cred)
 {
+	static const struct file_operations empty_fops = {};
 	struct inode *inode;
 	int error;
 
 	f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
 				FMODE_PREAD | FMODE_PWRITE;
+
+	if (unlikely(f->f_flags & O_PATH))
+		f->f_mode = FMODE_PATH;
+
 	inode = dentry->d_inode;
 	if (f->f_mode & FMODE_WRITE) {
 		error = __get_file_write_access(inode, mnt);
@@ -687,9 +692,15 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
 	f->f_path.dentry = dentry;
 	f->f_path.mnt = mnt;
 	f->f_pos = 0;
-	f->f_op = fops_get(inode->i_fop);
 	file_sb_list_add(f, inode->i_sb);
 
+	if (unlikely(f->f_mode & FMODE_PATH)) {
+		f->f_op = &empty_fops;
+		return f;
+	}
+
+	f->f_op = fops_get(inode->i_fop);
+
 	error = security_dentry_open(f, cred);
 	if (error)
 		goto cleanup_all;
@@ -911,9 +922,18 @@ static inline int build_open_flags(int flags, int mode, struct open_flags *op)
 	if (flags & __O_SYNC)
 		flags |= O_DSYNC;
 
-	op->open_flag = flags;
+	/*
+	 * If we have O_PATH in the open flag. Then we
+	 * cannot have anything other than the below set of flags
+	 */
+	if (flags & O_PATH) {
+		flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
+		acc_mode = 0;
+	} else {
+		acc_mode = MAY_OPEN | ACC_MODE(flags);
+	}
 
-	acc_mode = MAY_OPEN | ACC_MODE(flags);
+	op->open_flag = flags;
 
 	/* O_TRUNC implies we need access checks for write permissions */
 	if (flags & O_TRUNC)
@@ -926,7 +946,8 @@ static inline int build_open_flags(int flags, int mode, struct open_flags *op)
 
 	op->acc_mode = acc_mode;
 
-	op->intent = LOOKUP_OPEN;
+	op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;
+
 	if (flags & O_CREAT) {
 		op->intent |= LOOKUP_CREATE;
 		if (flags & O_EXCL)
@@ -1053,8 +1074,10 @@ int filp_close(struct file *filp, fl_owner_t id)
 	if (filp->f_op && filp->f_op->flush)
 		retval = filp->f_op->flush(filp, id);
 
-	dnotify_flush(filp, id);
-	locks_remove_posix(filp, id);
+	if (likely(!(filp->f_mode & FMODE_PATH))) {
+		dnotify_flush(filp, id);
+		locks_remove_posix(filp, id);
+	}
 	fput(filp);
 	return retval;
 }
-- 
cgit v1.2.3


From bcda76524cd1fa32af748536f27f674a13e56700 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 13 Mar 2011 16:42:14 -0400
Subject: Allow O_PATH for symlinks

At that point we can do almost nothing with them.  They can be opened
with O_PATH, we can manipulate such descriptors with dup(), etc. and
we can see them in /proc/*/{fd,fdinfo}/*.

We can't (and won't be able to) follow /proc/*/fd/* symlinks for those;
there's simply not enough information for pathname resolution to go on
from such point - to resolve a symlink we need to know which directory
does it live in.

We will be able to do useful things with them after the next commit, though -
readlinkat() and fchownat() will be possible to use with dfd being an
O_PATH-opened symlink and empty relative pathname.  Combined with
open_by_handle() it'll give us a way to do readlink-by-handle and
lchown-by-handle without messing with more redundant syscalls.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index e1d9f90d977..9d4f3270017 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -766,8 +766,14 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
 		error = 0;
 		if (s)
 			error = __vfs_follow_link(nd, s);
-		else if (nd->last_type == LAST_BIND)
+		else if (nd->last_type == LAST_BIND) {
 			nd->flags |= LOOKUP_JUMPED;
+			if (nd->path.dentry->d_inode->i_op->follow_link) {
+				/* stepped on a _really_ weird one */
+				path_put(&nd->path);
+				error = -ELOOP;
+			}
+		}
 	}
 	return error;
 }
@@ -1954,6 +1960,10 @@ static int may_open(struct path *path, int acc_mode, int flag)
 	struct inode *inode = dentry->d_inode;
 	int error;
 
+	/* O_PATH? */
+	if (!acc_mode)
+		return 0;
+
 	if (!inode)
 		return -ENOENT;
 
@@ -2056,7 +2066,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	int open_flag = op->open_flag;
 	int will_truncate = open_flag & O_TRUNC;
 	int want_write = 0;
-	int skip_perm = 0;
+	int acc_mode = op->acc_mode;
 	struct file *filp;
 	struct inode *inode;
 	int error;
@@ -2095,8 +2105,11 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	}
 
 	if (!(open_flag & O_CREAT)) {
+		int symlink_ok = 0;
 		if (nd->last.name[nd->last.len])
 			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+		if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
+			symlink_ok = 1;
 		/* we _can_ be in RCU mode here */
 		error = do_lookup(nd, &nd->last, path, &inode);
 		if (error) {
@@ -2108,7 +2121,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 			terminate_walk(nd);
 			return ERR_PTR(-ENOENT);
 		}
-		if (unlikely(inode->i_op->follow_link)) {
+		if (unlikely(inode->i_op->follow_link && !symlink_ok)) {
 			/* We drop rcu-walk here */
 			if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
 				return ERR_PTR(-ECHILD);
@@ -2175,7 +2188,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		/* Don't check for write permission, don't truncate */
 		open_flag &= ~O_TRUNC;
 		will_truncate = 0;
-		skip_perm = 1;
+		acc_mode = MAY_OPEN;
 		error = security_path_mknod(&nd->path, dentry, mode, 0);
 		if (error)
 			goto exit_mutex_unlock;
@@ -2225,7 +2238,7 @@ ok:
 		want_write = 1;
 	}
 common:
-	error = may_open(&nd->path, skip_perm ? 0 : op->acc_mode, open_flag);
+	error = may_open(&nd->path, acc_mode, open_flag);
 	if (error)
 		goto exit;
 	filp = nameidata_to_filp(nd);
@@ -2358,7 +2371,7 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
 
 	flags |= LOOKUP_ROOT;
 
-	if (dentry->d_inode->i_op->follow_link)
+	if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
 		return ERR_PTR(-ELOOP);
 
 	file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU);
-- 
cgit v1.2.3


From 65cfc6722361570bfe255698d9cd4dccaf47570d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 13 Mar 2011 15:56:26 -0400
Subject: readlinkat(), fchownat() and fstatat() with empty relative pathnames

For readlinkat() we simply allow empty pathname; it will fail unless
we have dfd equal to O_PATH-opened symlink, so we are outside of
POSIX scope here.  For fchownat() and fstatat() we allow AT_EMPTY_PATH;
let the caller explicitly ask for such behaviour.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/open.c | 10 ++++++----
 fs/stat.c |  7 +++++--
 2 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/open.c b/fs/open.c
index 14a51de01f5..3cac0bda46d 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -573,13 +573,15 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
 {
 	struct path path;
 	int error = -EINVAL;
-	int follow;
+	int lookup_flags;
 
-	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
+	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
 		goto out;
 
-	follow = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
-	error = user_path_at(dfd, filename, follow, &path);
+	lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
+	if (flag & AT_EMPTY_PATH)
+		lookup_flags |= LOOKUP_EMPTY;
+	error = user_path_at(dfd, filename, lookup_flags, &path);
 	if (error)
 		goto out;
 	error = mnt_want_write(path.mnt);
diff --git a/fs/stat.c b/fs/stat.c
index d5c61cf2b70..961039121cb 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -75,13 +75,16 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
 	int error = -EINVAL;
 	int lookup_flags = 0;
 
-	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) != 0)
+	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
+		      AT_EMPTY_PATH)) != 0)
 		goto out;
 
 	if (!(flag & AT_SYMLINK_NOFOLLOW))
 		lookup_flags |= LOOKUP_FOLLOW;
 	if (flag & AT_NO_AUTOMOUNT)
 		lookup_flags |= LOOKUP_NO_AUTOMOUNT;
+	if (flag & AT_EMPTY_PATH)
+		lookup_flags |= LOOKUP_EMPTY;
 
 	error = user_path_at(dfd, filename, lookup_flags, &path);
 	if (error)
@@ -297,7 +300,7 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
 	if (bufsiz <= 0)
 		return -EINVAL;
 
-	error = user_path_at(dfd, pathname, 0, &path);
+	error = user_path_at(dfd, pathname, LOOKUP_EMPTY, &path);
 	if (!error) {
 		struct inode *inode = path.dentry->d_inode;
 
-- 
cgit v1.2.3


From 326be7b484843988afe57566b627fb7a70beac56 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 13 Mar 2011 17:08:22 -0400
Subject: Allow passing O_PATH descriptors via SCM_RIGHTS datagrams

Just need to make sure that AF_UNIX garbage collector won't
confuse O_PATHed socket on filesystem for real AF_UNIX opened
socket.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file_table.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/file_table.c b/fs/file_table.c
index 3c16e1ca163..74a9544ac77 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -305,6 +305,8 @@ struct file *fget_raw(unsigned int fd)
 	return file;
 }
 
+EXPORT_SYMBOL(fget_raw);
+
 /*
  * Lightweight file lookup - no refcnt increment if fd table isn't shared.
  *
-- 
cgit v1.2.3


From 7e32d02613a72a39ba01638337c609a9a866c653 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 15 Mar 2011 08:32:14 +0000
Subject: GFS2: Don't use _raw version of RCU dereference

As per RCU glock patch review comments, don't use the _raw
version of this function here.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 fs/gfs2/glock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 8648409be45..85044b41824 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1723,7 +1723,7 @@ static inline struct gfs2_glock *glock_hash_chain(unsigned hash)
 
 static inline struct gfs2_glock *glock_hash_next(struct gfs2_glock *gl)
 {
-	return hlist_bl_entry(rcu_dereference_raw(gl->gl_list.next),
+	return hlist_bl_entry(rcu_dereference(gl->gl_list.next),
 			      struct gfs2_glock, gl_list);
 }
 
-- 
cgit v1.2.3


From 2c722c9a47d1369e2685b85288e78c469a081238 Mon Sep 17 00:00:00 2001
From: Tobias Klauser <tklauser@distanz.ch>
Date: Thu, 9 Dec 2010 15:55:21 +0100
Subject: exofs: Remove redundant unlikely()

IS_ERR() already implies unlikely(), so it can be omitted here.

Signed-off-by: Tobias Klauser <tklauser@distanz.ch>
---
 fs/exofs/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 8c6c4669b38..78f5ad633d3 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -495,7 +495,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
 		}
 
 		od = osduld_info_lookup(&odi);
-		if (unlikely(IS_ERR(od))) {
+		if (IS_ERR(od)) {
 			ret = PTR_ERR(od);
 			EXOFS_ERR("ERROR: device requested is not found "
 				  "osd_name-%s =>%d\n", odi.osdname, ret);
-- 
cgit v1.2.3


From 8f68cd42d85f31fb58dd2cabf3ff4aad0a2bafd9 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Tue, 15 Mar 2011 18:37:09 +1100
Subject: nfs: BKL is no longer needed, so remove the include

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/read.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 4b764c6048d..7cded2b12a0 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -18,7 +18,6 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
-#include <linux/smp_lock.h>
 #include <linux/module.h>
 
 #include <asm/system.h>
-- 
cgit v1.2.3


From 0a935519cca83f26dc15e7577fa6c2b39606a4ac Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Tue, 26 Oct 2010 17:52:41 +0200
Subject: exofs: Trivial: fix some indentation and debug prints

I stumbled on some of these prints in log files so, might
just submit the fixes.

* All i_ino prints in exofs should be hex
* All OSD_ERR prints should end with a "\n"

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/dir.c | 33 ++++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index dcc941d82d6..d0941c6a1f7 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -124,7 +124,7 @@ out:
 
 Ebadsize:
 	EXOFS_ERR("ERROR [exofs_check_page]: "
-		"size of directory #%lu is not a multiple of chunk size",
+		"size of directory(0x%lx) is not a multiple of chunk size\n",
 		dir->i_ino
 	);
 	goto fail;
@@ -142,8 +142,8 @@ Espan:
 	goto bad_entry;
 bad_entry:
 	EXOFS_ERR(
-		"ERROR [exofs_check_page]: bad entry in directory #%lu: %s - "
-		"offset=%lu, inode=%llu, rec_len=%d, name_len=%d",
+		"ERROR [exofs_check_page]: bad entry in directory(0x%lx): %s - "
+		"offset=%lu, inode=0x%llu, rec_len=%d, name_len=%d\n",
 		dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
 		_LLU(le64_to_cpu(p->inode_no)),
 		rec_len, p->name_len);
@@ -151,8 +151,8 @@ bad_entry:
 Eend:
 	p = (struct exofs_dir_entry *)(kaddr + offs);
 	EXOFS_ERR("ERROR [exofs_check_page]: "
-		"entry in directory #%lu spans the page boundary"
-		"offset=%lu, inode=%llu",
+		"entry in directory(0x%lx) spans the page boundary"
+		"offset=%lu, inode=0x%llx\n",
 		dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
 		_LLU(le64_to_cpu(p->inode_no)));
 fail:
@@ -261,9 +261,8 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		struct page *page = exofs_get_page(inode, n);
 
 		if (IS_ERR(page)) {
-			EXOFS_ERR("ERROR: "
-				   "bad page in #%lu",
-				   inode->i_ino);
+			EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n",
+				  inode->i_ino);
 			filp->f_pos += PAGE_CACHE_SIZE - offset;
 			return PTR_ERR(page);
 		}
@@ -283,7 +282,8 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		for (; (char *)de <= limit; de = exofs_next_entry(de)) {
 			if (de->rec_len == 0) {
 				EXOFS_ERR("ERROR: "
-					"zero-length directory entry");
+				     "zero-length entry in directory(0x%lx)\n",
+				     inode->i_ino);
 				exofs_put_page(page);
 				return -EIO;
 			}
@@ -342,9 +342,9 @@ struct exofs_dir_entry *exofs_find_entry(struct inode *dir,
 			kaddr += exofs_last_byte(dir, n) - reclen;
 			while ((char *) de <= kaddr) {
 				if (de->rec_len == 0) {
-					EXOFS_ERR(
-						"ERROR: exofs_find_entry: "
-						"zero-length directory entry");
+					EXOFS_ERR("ERROR: zero-length entry in "
+						  "directory(0x%lx)\n",
+						  dir->i_ino);
 					exofs_put_page(page);
 					goto out;
 				}
@@ -472,7 +472,8 @@ int exofs_add_link(struct dentry *dentry, struct inode *inode)
 			}
 			if (de->rec_len == 0) {
 				EXOFS_ERR("ERROR: exofs_add_link: "
-					"zero-length directory entry");
+				      "zero-length entry in directory(0x%lx)\n",
+				      inode->i_ino);
 				err = -EIO;
 				goto out_unlock;
 			}
@@ -491,7 +492,8 @@ int exofs_add_link(struct dentry *dentry, struct inode *inode)
 		exofs_put_page(page);
 	}
 
-	EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=%p", dentry, inode);
+	EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=0x%lx\n",
+		  dentry, inode->i_ino);
 	return -EINVAL;
 
 got_it:
@@ -542,7 +544,8 @@ int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
 	while (de < dir) {
 		if (de->rec_len == 0) {
 			EXOFS_ERR("ERROR: exofs_delete_entry:"
-				"zero-length directory entry");
+				  "zero-length entry in directory(0x%lx)\n",
+				  inode->i_ino);
 			err = -EIO;
 			goto out;
 		}
-- 
cgit v1.2.3


From a8f1418f9e9bd4c487a7b703ff26c5dd5ceb2bf3 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Mon, 22 Nov 2010 18:02:45 +0200
Subject: exofs: Optimize read_4_write

Don't attempt a read past i_size, just zero the page and be
done with it.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/inode.c | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index a7555238c41..c8f58a96e59 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -350,8 +350,10 @@ static int readpage_strip(void *data, struct page *page)
 
 		if (!pcol->read_4_write)
 			unlock_page(page);
-		EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page,"
-			     " splitting\n", inode->i_ino, page->index);
+		EXOFS_DBGMSG("readpage_strip(0x%lx) empty page len=%zx "
+			     "read_4_write=%d index=0x%lx end_index=0x%lx "
+			     "splitting\n", inode->i_ino, len,
+			     pcol->read_4_write, page->index, end_index);
 
 		return read_exec(pcol);
 	}
@@ -722,11 +724,28 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
 
 	 /* read modify write */
 	if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
+		loff_t i_size = i_size_read(mapping->host);
+		pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+		size_t rlen;
+
+		if (page->index < end_index)
+			rlen = PAGE_CACHE_SIZE;
+		else if (page->index == end_index)
+			rlen = i_size & ~PAGE_CACHE_MASK;
+		else
+			rlen = 0;
+
+		if (!rlen) {
+			clear_highpage(page);
+			SetPageUptodate(page);
+			goto out;
+		}
+
 		ret = _readpage(page, true);
 		if (ret) {
 			/*SetPageError was done by _readpage. Is it ok?*/
 			unlock_page(page);
-			EXOFS_DBGMSG("__readpage_filler failed\n");
+			EXOFS_DBGMSG("__readpage failed\n");
 		}
 	}
 out:
-- 
cgit v1.2.3


From 97178b7b6c84bd14660b89474d27931a1ea65c66 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Thu, 25 Nov 2010 12:47:15 +0200
Subject: exofs: simple fsync race fix

It is incorrect to test inode dirty bits without participating in the inode
writeback protocol. Inode writeback sets I_SYNC and clears I_DIRTY_?, then
writes out the particular bits, then clears I_SYNC when it is done. BTW. it
may not completely write all pages out, so I_DIRTY_PAGES would get set
again.

This is a standard pattern used throughout the kernel's writeback caches
(I_SYNC ~= I_WRITEBACK, if that makes it clearer).

And so it is not possible to determine an inode's dirty status just by
checking I_DIRTY bits. Especially not for the purpose of data integrity
syncs.

Missing the check for these bits means that fsync can complete while
writeback to the inode is underway. Inode writeback functions get this
right, so call into them rather than try to shortcut things by testing
dirty state improperly.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/file.c  | 5 -----
 fs/exofs/inode.c | 3 ++-
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index b905c79b4f0..4c0d6bac914 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -48,11 +48,6 @@ static int exofs_file_fsync(struct file *filp, int datasync)
 	struct inode *inode = filp->f_mapping->host;
 	struct super_block *sb;
 
-	if (!(inode->i_state & I_DIRTY))
-		return 0;
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		return 0;
-
 	ret = sync_inode_metadata(inode, 1);
 
 	/* This is a good place to write the sb */
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index c8f58a96e59..fb9d3805610 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -1290,7 +1290,8 @@ out:
 
 int exofs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
-	return exofs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
+	/* FIXME: fix fsync and use wbc->sync_mode == WB_SYNC_ALL */
+	return exofs_update_inode(inode, 1);
 }
 
 /*
-- 
cgit v1.2.3


From 66cd6cad4919f980dd21307d0150ff251762a264 Mon Sep 17 00:00:00 2001
From: "bharrosh@panasas.com" <bharrosh@panasas.com>
Date: Thu, 7 Oct 2010 14:28:18 -0400
Subject: exofs: Override read-ahead to align on stripe_size

* Set all inode->i_mapping->backing_dev_info to point to
  the per super-block sb->s_bdi.

* Calculating a read_ahead that is:
  - preferably 2 stripes long
    (Future patch will add a mount option to override this)
  - Minimum 128K aligned up to stripe-size
  - Capped to maximum-IO-sizes rounded down to stripe_size.
    (Max sizes are governed by max bio-size that fits in a page
     times number-of-devices)

CC: Marc Dionne <marc.c.dionne@gmail.com>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/exofs.h |  2 ++
 fs/exofs/inode.c | 19 +++++++++++++++----
 fs/exofs/super.c | 18 ++++++++++++++++++
 3 files changed, 35 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 2dc925fa101..99fcb9126a9 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -256,6 +256,8 @@ static inline int exofs_oi_read(struct exofs_i_info *oi,
 }
 
 /* inode.c               */
+unsigned exofs_max_io_pages(struct exofs_layout *layout,
+			    unsigned expected_pages);
 int exofs_setattr(struct dentry *, struct iattr *);
 int exofs_write_begin(struct file *file, struct address_space *mapping,
 		loff_t pos, unsigned len, unsigned flags,
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index fb9d3805610..681b3cb9b4d 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -43,6 +43,17 @@ enum { BIO_MAX_PAGES_KMALLOC =
 		PAGE_SIZE / sizeof(struct page *),
 };
 
+unsigned exofs_max_io_pages(struct exofs_layout *layout,
+			    unsigned expected_pages)
+{
+	unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC);
+
+	/* TODO: easily support bio chaining */
+	pages =  min_t(unsigned, pages,
+		       layout->group_width * BIO_MAX_PAGES_KMALLOC);
+	return pages;
+}
+
 struct page_collect {
 	struct exofs_sb_info *sbi;
 	struct inode *inode;
@@ -97,8 +108,7 @@ static void _pcol_reset(struct page_collect *pcol)
 
 static int pcol_try_alloc(struct page_collect *pcol)
 {
-	unsigned pages = min_t(unsigned, pcol->expected_pages,
-			  MAX_PAGES_KMALLOC);
+	unsigned pages;
 
 	if (!pcol->ios) { /* First time allocate io_state */
 		int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios);
@@ -108,8 +118,7 @@ static int pcol_try_alloc(struct page_collect *pcol)
 	}
 
 	/* TODO: easily support bio chaining */
-	pages =  min_t(unsigned, pages,
-		       pcol->sbi->layout.group_width * BIO_MAX_PAGES_KMALLOC);
+	pages =  exofs_max_io_pages(&pcol->sbi->layout, pcol->expected_pages);
 
 	for (; pages; pages >>= 1) {
 		pcol->pages = kmalloc(pages * sizeof(struct page *),
@@ -1049,6 +1058,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 		memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
 	}
 
+	inode->i_mapping->backing_dev_info = sb->s_bdi;
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_op = &exofs_file_inode_operations;
 		inode->i_fop = &exofs_file_operations;
@@ -1149,6 +1159,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
 
 	sbi = sb->s_fs_info;
 
+	inode->i_mapping->backing_dev_info = sb->s_bdi;
 	sb->s_dirt = 1;
 	inode_init_owner(inode, dir, mode);
 	inode->i_ino = sbi->s_nextid++;
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 78f5ad633d3..e87510f4749 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -390,6 +390,23 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
 	return 0;
 }
 
+static unsigned __ra_pages(struct exofs_layout *layout)
+{
+	const unsigned _MIN_RA = 32; /* min 128K read-ahead */
+	unsigned ra_pages = layout->group_width * layout->stripe_unit /
+				PAGE_SIZE;
+	unsigned max_io_pages = exofs_max_io_pages(layout, ~0);
+
+	ra_pages *= 2; /* two stripes */
+	if (ra_pages < _MIN_RA)
+		ra_pages = roundup(_MIN_RA, ra_pages / 2);
+
+	if (ra_pages > max_io_pages)
+		ra_pages = max_io_pages;
+
+	return ra_pages;
+}
+
 /* @odi is valid only as long as @fscb_dev is valid */
 static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
 			     struct osd_dev_info *odi)
@@ -623,6 +640,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	/* set up operation vectors */
+	sbi->bdi.ra_pages = __ra_pages(&sbi->layout);
 	sb->s_bdi = &sbi->bdi;
 	sb->s_fs_info = sbi;
 	sb->s_op = &exofs_sops;
-- 
cgit v1.2.3


From 9ed96484311b89360b80a4181d856cbdb21630fd Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Mon, 31 Jan 2011 14:32:14 +0200
Subject: exofs: Add option to mount by osdname

If /dev/osd* devices are shuffled because more devices
were added, and/or login order has changed, it is hard to
mount the FS you want.

Add an option to mount by osdname. osdname is any osd-device's
osdname as specified to the mkfs.exofs command when formatting
the osd-devices.
The new mount format is:
	OPT="osdname=$UUID0,pid=$PID,_netdev"
	mount -t exofs -o $OPT $DEV_OSD0 $MOUNTDIR

if "osdname=" is specified in options above $DEV_OSD0 is
ignored and can be empty.

Also while at it: Removed some old unused Opt_* enums.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/super.c | 31 +++++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index e87510f4749..474989eeb7d 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -48,6 +48,7 @@
  * struct to hold what we get from mount options
  */
 struct exofs_mountopt {
+	bool is_osdname;
 	const char *dev_name;
 	uint64_t pid;
 	int timeout;
@@ -56,7 +57,7 @@ struct exofs_mountopt {
 /*
  * exofs-specific mount-time options.
  */
-enum { Opt_pid, Opt_to, Opt_mkfs, Opt_format, Opt_err };
+enum { Opt_name, Opt_pid, Opt_to, Opt_err };
 
 /*
  * Our mount-time options.  These should ideally be 64-bit unsigned, but the
@@ -64,6 +65,7 @@ enum { Opt_pid, Opt_to, Opt_mkfs, Opt_format, Opt_err };
  * sufficient for most applications now.
  */
 static match_table_t tokens = {
+	{Opt_name, "osdname=%s"},
 	{Opt_pid, "pid=%u"},
 	{Opt_to, "to=%u"},
 	{Opt_err, NULL}
@@ -94,6 +96,14 @@ static int parse_options(char *options, struct exofs_mountopt *opts)
 
 		token = match_token(p, tokens, args);
 		switch (token) {
+		case Opt_name:
+			opts->dev_name = match_strdup(&args[0]);
+			if (unlikely(!opts->dev_name)) {
+				EXOFS_ERR("Error allocating dev_name");
+				return -ENOMEM;
+			}
+			opts->is_osdname = true;
+			break;
 		case Opt_pid:
 			if (0 == match_strlcpy(str, &args[0], sizeof(str)))
 				return -EINVAL;
@@ -575,9 +585,17 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 		goto free_bdi;
 
 	/* use mount options to fill superblock */
-	od = osduld_path_lookup(opts->dev_name);
+	if (opts->is_osdname) {
+		struct osd_dev_info odi = {.systemid_len = 0};
+
+		odi.osdname_len = strlen(opts->dev_name);
+		odi.osdname = (u8 *)opts->dev_name;
+		od = osduld_info_lookup(&odi);
+	} else {
+		od = osduld_path_lookup(opts->dev_name);
+	}
 	if (IS_ERR(od)) {
-		ret = PTR_ERR(od);
+		ret = -EINVAL;
 		goto free_sbi;
 	}
 
@@ -670,6 +688,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 
 	_exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0],
 			    sbi->layout.s_pid);
+	if (opts->is_osdname)
+		kfree(opts->dev_name);
 	return 0;
 
 free_sbi:
@@ -678,6 +698,8 @@ free_bdi:
 	EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
 		  opts->dev_name, sbi->layout.s_pid, ret);
 	exofs_free_sbi(sbi);
+	if (opts->is_osdname)
+		kfree(opts->dev_name);
 	return ret;
 }
 
@@ -695,7 +717,8 @@ static struct dentry *exofs_mount(struct file_system_type *type,
 	if (ret)
 		return ERR_PTR(ret);
 
-	opts.dev_name = dev_name;
+	if (!opts.dev_name)
+		opts.dev_name = dev_name;
 	return mount_nodev(type, flags, &opts, exofs_fill_super);
 }
 
-- 
cgit v1.2.3


From 1cea312ad49d9cb964179a784fedb1fcfe396283 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Thu, 3 Feb 2011 17:53:25 +0200
Subject: exofs: Write sbi->s_nextid as part of the Create command

Before when creating a new inode, we'd set the sb->s_dirt flag,
and sometime later the system would write out s_nextid as part
of the sb_info. Also on inode sync we would force the sb sync
as well.

Define the s_nextid as a new partition attribute and set it
every time we create a new object.
At mount we read it from its new place.

We now never set sb->s_dirt anywhere in exofs. write_super
is actually never called. The call to exofs_write_super from
exofs_put_super is also removed because the VFS always calls
->sync_fs before calling ->put_super twice.

To stay backward-and-forward compatible we also write the old
s_nextid in the super_block object at unmount, and support zero
length attribute on mount.

This also fixes a BUG where in layouts when group_width was not
a divisor of EXOFS_SUPER_ID (0x10000) the s_nextid was not read
from the device it was written to. This happened because of the sliding
window layout trick, and because the read was always done from device 0
but the write was done via the raid engine, which might slide the device
view. Now we read and write through the raid engine.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/common.h |  18 +++++++-
 fs/exofs/exofs.h  |   4 +-
 fs/exofs/file.c   |  11 +----
 fs/exofs/inode.c  |   4 +-
 fs/exofs/super.c  | 135 +++++++++++++++++++++++++++++++++++++++++++++++-------
 5 files changed, 141 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/exofs/common.h b/fs/exofs/common.h
index f0d520312d8..5e74ad3d400 100644
--- a/fs/exofs/common.h
+++ b/fs/exofs/common.h
@@ -53,10 +53,14 @@
 #define EXOFS_ROOT_ID	0x10002	/* object ID for root directory */
 
 /* exofs Application specific page/attribute */
+/* Inode attrs */
 # define EXOFS_APAGE_FS_DATA	(OSD_APAGE_APP_DEFINED_FIRST + 3)
 # define EXOFS_ATTR_INODE_DATA	1
 # define EXOFS_ATTR_INODE_FILE_LAYOUT	2
 # define EXOFS_ATTR_INODE_DIR_LAYOUT	3
+/* Partition attrs */
+# define EXOFS_APAGE_SB_DATA	(0xF0000000U + 3)
+# define EXOFS_ATTR_SB_STATS	1
 
 /*
  * The maximum number of files we can have is limited by the size of the
@@ -86,8 +90,8 @@ enum {
  */
 enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1};
 struct exofs_fscb {
-	__le64  s_nextid;	/* Highest object ID used */
-	__le64  s_numfiles;	/* Number of files on fs */
+	__le64  s_nextid;	/* Only used after mkfs */
+	__le64  s_numfiles;	/* Only used after mkfs */
 	__le32	s_version;	/* == EXOFS_FSCB_VER */
 	__le16  s_magic;	/* Magic signature */
 	__le16  s_newfs;	/* Non-zero if this is a new fs */
@@ -97,6 +101,16 @@ struct exofs_fscb {
 	__le64	s_dev_table_count; /* == 0 means no dev_table */
 } __packed;
 
+/*
+ * This struct is set on the FS partition's attributes.
+ * [EXOFS_APAGE_SB_DATA, EXOFS_ATTR_SB_STATS] and is written together
+ * with the create command, to atomically persist the sb writeable information.
+ */
+struct exofs_sb_stats {
+	__le64  s_nextid;	/* Highest object ID used */
+	__le64  s_numfiles;	/* Number of files on fs */
+} __packed;
+
 /*
  * Describes the raid used in the FS. It is part of the device table.
  * This here is taken from the pNFS-objects definition. In exofs we
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 99fcb9126a9..c965806c282 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -77,7 +77,7 @@ struct exofs_layout {
  * our extension to the in-memory superblock
  */
 struct exofs_sb_info {
-	struct exofs_fscb s_fscb;		/* Written often, pre-allocate*/
+	struct exofs_sb_stats s_ess;		/* Written often, pre-allocate*/
 	int		s_timeout;		/* timeout for OSD operations */
 	uint64_t	s_nextid;		/* highest object ID used     */
 	uint32_t	s_numfiles;		/* number of files on fs      */
@@ -281,7 +281,7 @@ int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
 		    struct inode *);
 
 /* super.c               */
-int exofs_sync_fs(struct super_block *sb, int wait);
+int exofs_sbi_write_stats(struct exofs_sb_info *sbi);
 
 /*********************
  * operation vectors *
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 4c0d6bac914..45ca323d836 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -45,17 +45,8 @@ static int exofs_release_file(struct inode *inode, struct file *filp)
 static int exofs_file_fsync(struct file *filp, int datasync)
 {
 	int ret;
-	struct inode *inode = filp->f_mapping->host;
-	struct super_block *sb;
-
-	ret = sync_inode_metadata(inode, 1);
-
-	/* This is a good place to write the sb */
-	/* TODO: Sechedule an sb-sync on create */
-	sb = inode->i_sb;
-	if (sb->s_dirt)
-		exofs_sync_fs(sb, 1);
 
+	ret = sync_inode_metadata(filp->f_mapping->host, 1);
 	return ret;
 }
 
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 681b3cb9b4d..0c713cfbebf 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -1102,6 +1102,7 @@ int __exofs_wait_obj_created(struct exofs_i_info *oi)
 	}
 	return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
 }
+
 /*
  * Callback function from exofs_new_inode().  The important thing is that we
  * set the obj_created flag so that other methods know that the object exists on
@@ -1160,7 +1161,6 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
 	sbi = sb->s_fs_info;
 
 	inode->i_mapping->backing_dev_info = sb->s_bdi;
-	sb->s_dirt = 1;
 	inode_init_owner(inode, dir, mode);
 	inode->i_ino = sbi->s_nextid++;
 	inode->i_blkbits = EXOFS_BLKSHIFT;
@@ -1171,6 +1171,8 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
 	spin_unlock(&sbi->s_next_gen_lock);
 	insert_inode_hash(inode);
 
+	exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */
+
 	mark_inode_dirty(inode);
 
 	ret = exofs_get_io_state(&sbi->layout, &ios);
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 474989eeb7d..5eb0851e548 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -213,6 +213,101 @@ static void destroy_inodecache(void)
 static const struct super_operations exofs_sops;
 static const struct export_operations exofs_export_ops;
 
+static const struct osd_attr g_attr_sb_stats = ATTR_DEF(
+	EXOFS_APAGE_SB_DATA,
+	EXOFS_ATTR_SB_STATS,
+	sizeof(struct exofs_sb_stats));
+
+static int __sbi_read_stats(struct exofs_sb_info *sbi)
+{
+	struct osd_attr attrs[] = {
+		[0] = g_attr_sb_stats,
+	};
+	struct exofs_io_state *ios;
+	int ret;
+
+	ret = exofs_get_io_state(&sbi->layout, &ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
+		return ret;
+	}
+
+	ios->cred = sbi->s_cred;
+
+	ios->in_attr = attrs;
+	ios->in_attr_len = ARRAY_SIZE(attrs);
+
+	ret = exofs_sbi_read(ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("Error reading super_block stats => %d\n", ret);
+		goto out;
+	}
+
+	ret = extract_attr_from_ios(ios, &attrs[0]);
+	if (ret) {
+		EXOFS_ERR("%s: extract_attr of sb_stats failed\n", __func__);
+		goto out;
+	}
+	if (attrs[0].len) {
+		struct exofs_sb_stats *ess;
+
+		if (unlikely(attrs[0].len != sizeof(*ess))) {
+			EXOFS_ERR("%s: Wrong version of exofs_sb_stats "
+				  "size(%d) != expected(%zd)\n",
+				  __func__, attrs[0].len, sizeof(*ess));
+			goto out;
+		}
+
+		ess = attrs[0].val_ptr;
+		sbi->s_nextid = le64_to_cpu(ess->s_nextid);
+		sbi->s_numfiles = le32_to_cpu(ess->s_numfiles);
+	}
+
+out:
+	exofs_put_io_state(ios);
+	return ret;
+}
+
+static void stats_done(struct exofs_io_state *ios, void *p)
+{
+	exofs_put_io_state(ios);
+	/* Good thanks nothing to do anymore */
+}
+
+/* Asynchronously write the stats attribute; returns 0 or an error code */
+int exofs_sbi_write_stats(struct exofs_sb_info *sbi)
+{
+	struct osd_attr attrs[] = {
+		[0] = g_attr_sb_stats,
+	};
+	struct exofs_io_state *ios;
+	int ret;
+
+	ret = exofs_get_io_state(&sbi->layout, &ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
+		return ret;
+	}
+	/* s_numfiles is read back with le32_to_cpu() in __sbi_read_stats() */
+	sbi->s_ess.s_nextid   = cpu_to_le64(sbi->s_nextid);
+	sbi->s_ess.s_numfiles = cpu_to_le32(sbi->s_numfiles);
+	attrs[0].val_ptr = &sbi->s_ess;
+	/* stats_done() is the ios->done callback and puts the ios */
+	ios->cred = sbi->s_cred;
+	ios->done = stats_done;
+	ios->private = sbi;
+	ios->out_attr = attrs;
+	ios->out_attr_len = ARRAY_SIZE(attrs);
+
+	ret = exofs_sbi_write(ios);
+	if (unlikely(ret)) {
+		EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__);
+		exofs_put_io_state(ios);
+	}
+
+	return ret;
+}
+
 /*
  * Write the superblock to the OSD
  */
@@ -223,18 +318,25 @@ int exofs_sync_fs(struct super_block *sb, int wait)
 	struct exofs_io_state *ios;
 	int ret = -ENOMEM;
 
-	lock_super(sb);
+	fscb = kmalloc(sizeof(*fscb), GFP_KERNEL);
+	if (unlikely(!fscb))
+		return -ENOMEM;
+
 	sbi = sb->s_fs_info;
-	fscb = &sbi->s_fscb;
 
+	/* NOTE: We no longer dirty the super_block anywhere in exofs. The
+	 * reason we write the fscb here on unmount is so we can stay backwards
+	 * compatible with fscb->s_version == 1. (What we are not compatible
+	 * with is if a new version FS crashed and then we try to mount an old
+	 * version). Otherwise the exofs_fscb is read-only from mkfs time. All
+	 * the writeable info is set in exofs_sbi_write_stats() above.
+	 */
 	ret = exofs_get_io_state(&sbi->layout, &ios);
-	if (ret)
+	if (unlikely(ret))
 		goto out;
 
-	/* Note: We only write the changing part of the fscb. .i.e upto the
-	 *       the fscb->s_dev_table_oid member. There is no read-modify-write
-	 *       here.
-	 */
+	lock_super(sb);
+
 	ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
 	memset(fscb, 0, ios->length);
 	fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
@@ -249,16 +351,17 @@ int exofs_sync_fs(struct super_block *sb, int wait)
 	ios->cred = sbi->s_cred;
 
 	ret = exofs_sbi_write(ios);
-	if (unlikely(ret)) {
+	if (unlikely(ret))
 		EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__);
-		goto out;
-	}
-	sb->s_dirt = 0;
+	else
+		sb->s_dirt = 0;
 
+
+	unlock_super(sb);
 out:
 	EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
 	exofs_put_io_state(ios);
-	unlock_super(sb);
+	kfree(fscb);
 	return ret;
 }
 
@@ -302,9 +405,6 @@ static void exofs_put_super(struct super_block *sb)
 	int num_pend;
 	struct exofs_sb_info *sbi = sb->s_fs_info;
 
-	if (sb->s_dirt)
-		exofs_write_super(sb);
-
 	/* make sure there are no pending commands */
 	for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
 	     num_pend = atomic_read(&sbi->s_curr_pending)) {
@@ -629,6 +729,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 		goto free_sbi;
 
 	sb->s_magic = le16_to_cpu(fscb.s_magic);
+	/* NOTE: we read below to be backward compatible with old versions */
 	sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
 	sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles);
 
@@ -639,7 +740,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 		ret = -EINVAL;
 		goto free_sbi;
 	}
-	if (le32_to_cpu(fscb.s_version) != EXOFS_FSCB_VER) {
+	if (le32_to_cpu(fscb.s_version) > EXOFS_FSCB_VER) {
 		EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n",
 			  EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version));
 		ret = -EINVAL;
@@ -657,6 +758,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 			goto free_sbi;
 	}
 
+	__sbi_read_stats(sbi);
+
 	/* set up operation vectors */
 	sbi->bdi.ra_pages = __ra_pages(&sbi->layout);
 	sb->s_bdi = &sbi->bdi;
-- 
cgit v1.2.3


From a49fb4c3d035ab516507b31ec3bd49263caee14d Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Mon, 7 Feb 2011 18:12:15 +0200
Subject: exofs: deprecate the commands pending counter

One leftover from the days of IBM's original code is an SB counter
that counts in-flight asynchronous commands, together with a piece of
code that waits for the counter to reach zero at unmount. I guess it
might have been needed back then because of a missing reference or
something similar.

I'm not removing it yet, but I am adding a warning message in case this
counter is ever non-zero at unmount. If I never see it trigger, or get
a report of it triggering, I'll remove the counter for good.
(I had this print as a debug output for a long time and never saw it
 trigger.)

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/super.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 5eb0851e548..06065bd37fc 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -409,6 +409,10 @@ static void exofs_put_super(struct super_block *sb)
 	for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
 	     num_pend = atomic_read(&sbi->s_curr_pending)) {
 		wait_queue_head_t wq;
+
+		printk(KERN_NOTICE "%s: !!Pending operations in flight. "
+		       "This is a BUG. please report to osd-dev@open-osd.org\n",
+		       __func__);
 		init_waitqueue_head(&wq);
 		wait_event_timeout(wq,
 				  (atomic_read(&sbi->s_curr_pending) == 0),
-- 
cgit v1.2.3


From c61fa0d6d9d466356ffa89fa1c1a9a1cd726fab4 Mon Sep 17 00:00:00 2001
From: "Venkateswararao Jujjuri (JV)" <jvrao@linux.vnet.ibm.com>
Date: Thu, 13 Jan 2011 15:28:39 -0800
Subject: [fs/9p] Plug potential acl leak

In v9fs_get_acl(), if __v9fs_get_acl() succeeds for only one of
dacl/pacl, the one that was obtained is never released.

Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/9p/acl.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 02a2cf61631..291ff7be27f 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -71,11 +71,15 @@ int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
 	if (!IS_ERR(dacl) && !IS_ERR(pacl)) {
 		set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl);
 		set_cached_acl(inode, ACL_TYPE_ACCESS, pacl);
-		posix_acl_release(dacl);
-		posix_acl_release(pacl);
 	} else
 		retval = -EIO;
 
+	if (!IS_ERR(dacl))
+		posix_acl_release(dacl);
+
+	if (!IS_ERR(pacl))
+		posix_acl_release(pacl);
+
 	return retval;
 }
 
-- 
cgit v1.2.3


From d344b0fb72e00339625464c5a29711906fa70b8b Mon Sep 17 00:00:00 2001
From: "Venkateswararao Jujjuri (JV)" <jvrao@linux.vnet.ibm.com>
Date: Thu, 13 Jan 2011 16:33:00 -0800
Subject: [fs/9p] Initialize cached acls both in cached/uncached mode.

With create/mkdir/mknod in non-cached mode we initialize the inode using
v9fs_get_inode(). v9fs_get_inode() doesn't initialize the inode's cached
ACL values to NULL, which trips the BUG_ON in v9fs_get_cached_acl().
The fix is to initialize the ACLs to NULL rather than leave them in the
ACL_NOT_CACHED state.

Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/9p/acl.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 291ff7be27f..0a2e480477a 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -132,6 +132,10 @@ static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl)
 	struct inode *inode = dentry->d_inode;
 
 	set_cached_acl(inode, type, acl);
+
+	if (!acl)
+		return 0;
+
 	/* Set a setxattr request to server */
 	size = posix_acl_xattr_size(acl->a_count);
 	buffer = kmalloc(size, GFP_KERNEL);
@@ -181,10 +185,8 @@ int v9fs_acl_chmod(struct dentry *dentry)
 int v9fs_set_create_acl(struct dentry *dentry,
 			struct posix_acl *dpacl, struct posix_acl *pacl)
 {
-	if (dpacl)
-		v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl);
-	if (pacl)
-		v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl);
+	v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl);
+	v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl);
 	posix_acl_release(dpacl);
 	posix_acl_release(pacl);
 	return 0;
-- 
cgit v1.2.3


From 9332685dffed3b402816c3564342f3e2df0c83ef Mon Sep 17 00:00:00 2001
From: "Venkateswararao Jujjuri (JV)" <jvrao@linux.vnet.ibm.com>
Date: Fri, 14 Jan 2011 15:24:59 -0800
Subject: [fs/9p] Ignore acl mount option when CONFIG_9P_FS_POSIX_ACL is not
 defined.

If the kernel is not compiled with CONFIG_9P_FS_POSIX_ACL and the
mount option to enable ACLs is specified, the current code fails the
mount. This patch brings the behavior in line with other filesystems
like ext3 by proceeding with the mount and logging a warning to syslog.

Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/v9fs.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 2f77cd33ba8..d34f2937df6 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -198,10 +198,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 				v9ses->flags |= V9FS_ACCESS_CLIENT;
 #else
 				P9_DPRINTK(P9_DEBUG_ERROR,
-					"access=client option not supported\n");
-				kfree(s);
-				ret = -EINVAL;
-				goto free_and_return;
+					"Not defined CONFIG_9P_FS_POSIX_ACL. "
+					"Ignoring access=client option\n");
 #endif
 			} else {
 				v9ses->flags |= V9FS_ACCESS_SINGLE;
-- 
cgit v1.2.3


From e782ef71097e832f62256370a2fe231b9fba96cf Mon Sep 17 00:00:00 2001
From: "Venkateswararao Jujjuri (JV)" <jvrao@linux.vnet.ibm.com>
Date: Tue, 25 Jan 2011 15:40:54 -0800
Subject: [fs/9P] Add posixacl mount option

The mount option access=client is overloaded as it assumes acl too.
Adding posixacl option to enable POSIX ACLs makes it explicit and clear.
Also it is convenient in the future to add other types of acls like richacls.

Ideally, the access mode 'client' should be just like V9FS_ACCESS_USER
except it underscores the location of access check.
Traditional 9P protocol lets the server perform access checks but with
this mode, all the access checks will be performed on the client itself.
Server just follows the client's directive.

Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/acl.c       |  8 +++++---
 fs/9p/v9fs.c      | 27 ++++++++++++++++++++-------
 fs/9p/v9fs.h      |  6 +++++-
 fs/9p/vfs_super.c |  2 +-
 4 files changed, 31 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 0a2e480477a..1ee3434239c 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -59,7 +59,8 @@ int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
 	struct v9fs_session_info *v9ses;
 
 	v9ses = v9fs_inode2v9ses(inode);
-	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) {
+	if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) ||
+			((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) {
 		set_cached_acl(inode, ACL_TYPE_DEFAULT, NULL);
 		set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
 		return 0;
@@ -104,9 +105,10 @@ int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags)
 		return -ECHILD;
 
 	v9ses = v9fs_inode2v9ses(inode);
-	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) {
+	if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) ||
+			((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) {
 		/*
-		 * On access = client mode get the acl
+		 * On access = client  and acl = on mode get the acl
 		 * values from the server
 		 */
 		return 0;
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index d34f2937df6..f5a3200877d 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -55,7 +55,7 @@ enum {
 	/* Cache options */
 	Opt_cache_loose, Opt_fscache,
 	/* Access options */
-	Opt_access,
+	Opt_access, Opt_posixacl,
 	/* Error token */
 	Opt_err
 };
@@ -73,6 +73,7 @@ static const match_table_t tokens = {
 	{Opt_fscache, "fscache"},
 	{Opt_cachetag, "cachetag=%s"},
 	{Opt_access, "access=%s"},
+	{Opt_posixacl, "posixacl"},
 	{Opt_err, NULL}
 };
 
@@ -194,13 +195,7 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 			else if (strcmp(s, "any") == 0)
 				v9ses->flags |= V9FS_ACCESS_ANY;
 			else if (strcmp(s, "client") == 0) {
-#ifdef CONFIG_9P_FS_POSIX_ACL
 				v9ses->flags |= V9FS_ACCESS_CLIENT;
-#else
-				P9_DPRINTK(P9_DEBUG_ERROR,
-					"Not defined CONFIG_9P_FS_POSIX_ACL. "
-					"Ignoring access=client option\n");
-#endif
 			} else {
 				v9ses->flags |= V9FS_ACCESS_SINGLE;
 				v9ses->uid = simple_strtoul(s, &e, 10);
@@ -210,6 +205,16 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 			kfree(s);
 			break;
 
+		case Opt_posixacl:
+#ifdef CONFIG_9P_FS_POSIX_ACL
+			v9ses->flags |= V9FS_POSIX_ACL;
+#else
+			P9_DPRINTK(P9_DEBUG_ERROR,
+					"Not defined CONFIG_9P_FS_POSIX_ACL. "
+					"Ignoring posixacl option\n");
+#endif
+			break;
+
 		default:
 			continue;
 		}
@@ -304,6 +309,14 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 		v9ses->flags |= V9FS_ACCESS_ANY;
 		v9ses->uid = ~0;
 	}
+	if (!v9fs_proto_dotl(v9ses) ||
+		!((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) {
+		/*
+		 * We support ACL checks on client only if the protocol is
+		 * 9P2000.L and access is V9FS_ACCESS_CLIENT.
+		 */
+		v9ses->flags &= ~V9FS_ACL_MASK;
+	}
 
 	fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, ~0,
 							v9ses->aname);
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index c4b5d8864f0..6fa3cf5547d 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -28,8 +28,10 @@
  * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions
  * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy
  * @V9FS_ACCESS_USER: a new attach will be issued for every user (default)
+ * @V9FS_ACCESS_CLIENT: Just like user, but access check is performed on client.
  * @V9FS_ACCESS_ANY: use a single attach for all users
  * @V9FS_ACCESS_MASK: bit mask of different ACCESS options
+ * @V9FS_POSIX_ACL: POSIX ACLs are enforced
  *
  * Session flags reflect options selected by users at mount time
  */
@@ -37,13 +39,15 @@
 			 V9FS_ACCESS_USER |   \
 			 V9FS_ACCESS_CLIENT)
 #define V9FS_ACCESS_MASK V9FS_ACCESS_ANY
+#define V9FS_ACL_MASK V9FS_POSIX_ACL
 
 enum p9_session_flags {
 	V9FS_PROTO_2000U	= 0x01,
 	V9FS_PROTO_2000L	= 0x02,
 	V9FS_ACCESS_SINGLE	= 0x04,
 	V9FS_ACCESS_USER	= 0x08,
-	V9FS_ACCESS_CLIENT	= 0x10
+	V9FS_ACCESS_CLIENT	= 0x10,
+	V9FS_POSIX_ACL		= 0x20
 };
 
 /* possible values of ->cache */
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index dbaabe3b813..4f14be585d6 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -91,7 +91,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 	    MS_NOATIME;
 
 #ifdef CONFIG_9P_FS_POSIX_ACL
-	if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)
+	if ((v9ses->flags & V9FS_ACL_MASK) == V9FS_POSIX_ACL)
 		sb->s_flags |= MS_POSIXACL;
 #endif
 
-- 
cgit v1.2.3


From 6752a1ebd13f69b9d5ff08914fe29ee2813cbeea Mon Sep 17 00:00:00 2001
From: "Venkateswararao Jujjuri (JV)" <jvrao@linux.vnet.ibm.com>
Date: Wed, 26 Jan 2011 16:20:35 -0800
Subject: [fs/9p] Make access=client default in 9p2000.L protocol

Current code sets access=user as default for all protocol versions.
This patch changes the default to "client" for 9P2000.L (dotl) only.

User can always specify particular access mode with -o access= option.
No change there.

Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/v9fs.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index f5a3200877d..738be8f6994 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -263,19 +263,12 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	list_add(&v9ses->slist, &v9fs_sessionlist);
 	spin_unlock(&v9fs_sessionlist_lock);
 
-	v9ses->flags = V9FS_ACCESS_USER;
 	strcpy(v9ses->uname, V9FS_DEFUSER);
 	strcpy(v9ses->aname, V9FS_DEFANAME);
 	v9ses->uid = ~0;
 	v9ses->dfltuid = V9FS_DEFUID;
 	v9ses->dfltgid = V9FS_DEFGID;
 
-	rc = v9fs_parse_options(v9ses, data);
-	if (rc < 0) {
-		retval = rc;
-		goto error;
-	}
-
 	v9ses->clnt = p9_client_create(dev_name, data);
 	if (IS_ERR(v9ses->clnt)) {
 		retval = PTR_ERR(v9ses->clnt);
@@ -284,10 +277,20 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 		goto error;
 	}
 
-	if (p9_is_proto_dotl(v9ses->clnt))
+	v9ses->flags = V9FS_ACCESS_USER;
+
+	if (p9_is_proto_dotl(v9ses->clnt)) {
+		v9ses->flags = V9FS_ACCESS_CLIENT;
 		v9ses->flags |= V9FS_PROTO_2000L;
-	else if (p9_is_proto_dotu(v9ses->clnt))
+	} else if (p9_is_proto_dotu(v9ses->clnt)) {
 		v9ses->flags |= V9FS_PROTO_2000U;
+	}
+
+	rc = v9fs_parse_options(v9ses, data);
+	if (rc < 0) {
+		retval = rc;
+		goto error;
+	}
 
 	v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
 
-- 
cgit v1.2.3


From 29236f4e18dde0c772968b6ce965d0365fe3fe4e Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Feb 2011 17:03:54 +0530
Subject: fs/9p: set the cached file_operations struct during inode init

With the old code we were not setting the file->f_op
with cached file operations during creat.

(format correction by jvrao@linux.vnet.ibm.com)

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/v9fs_vfs.h  |  2 ++
 fs/9p/vfs_file.c  | 18 ++++--------------
 fs/9p/vfs_inode.c | 11 +++++++++--
 3 files changed, 15 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index b789f8e597e..e4d5540cbb7 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -45,6 +45,8 @@ extern const struct file_operations v9fs_dir_operations;
 extern const struct file_operations v9fs_dir_operations_dotl;
 extern const struct dentry_operations v9fs_dentry_operations;
 extern const struct dentry_operations v9fs_cached_dentry_operations;
+extern const struct file_operations v9fs_cached_file_operations;
+extern const struct file_operations v9fs_cached_file_operations_dotl;
 
 #ifdef CONFIG_9P_FSCACHE
 struct inode *v9fs_alloc_inode(struct super_block *sb);
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 240c3067439..6a671000263 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -44,9 +44,6 @@
 #include "fid.h"
 #include "cache.h"
 
-static const struct file_operations v9fs_cached_file_operations;
-static const struct file_operations v9fs_cached_file_operations_dotl;
-
 /**
  * v9fs_file_open - open a file (or directory)
  * @inode: inode to be opened
@@ -89,19 +86,12 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 	}
 
 	file->private_data = fid;
+#ifdef CONFIG_9P_FSCACHE
 	if ((fid->qid.version) && (v9ses->cache)) {
 		P9_DPRINTK(P9_DEBUG_VFS, "cached");
-		/* enable cached file options */
-		if(file->f_op == &v9fs_file_operations)
-			file->f_op = &v9fs_cached_file_operations;
-		else if (file->f_op == &v9fs_file_operations_dotl)
-			file->f_op = &v9fs_cached_file_operations_dotl;
-
-#ifdef CONFIG_9P_FSCACHE
 		v9fs_cache_inode_set_cookie(inode, file);
-#endif
 	}
-
+#endif
 	return 0;
 }
 
@@ -505,7 +495,7 @@ int v9fs_file_fsync_dotl(struct file *filp, int datasync)
 	return retval;
 }
 
-static const struct file_operations v9fs_cached_file_operations = {
+const struct file_operations v9fs_cached_file_operations = {
 	.llseek = generic_file_llseek,
 	.read = do_sync_read,
 	.aio_read = generic_file_aio_read,
@@ -517,7 +507,7 @@ static const struct file_operations v9fs_cached_file_operations = {
 	.fsync = v9fs_file_fsync,
 };
 
-static const struct file_operations v9fs_cached_file_operations_dotl = {
+const struct file_operations v9fs_cached_file_operations_dotl = {
 	.llseek = generic_file_llseek,
 	.read = do_sync_read,
 	.aio_read = generic_file_aio_read,
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index b76a40bdf4c..83af2b7e65d 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -292,10 +292,17 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 	case S_IFREG:
 		if (v9fs_proto_dotl(v9ses)) {
 			inode->i_op = &v9fs_file_inode_operations_dotl;
-			inode->i_fop = &v9fs_file_operations_dotl;
+			if (v9ses->cache)
+				inode->i_fop =
+					&v9fs_cached_file_operations_dotl;
+			else
+				inode->i_fop = &v9fs_file_operations_dotl;
 		} else {
 			inode->i_op = &v9fs_file_inode_operations;
-			inode->i_fop = &v9fs_file_operations;
+			if (v9ses->cache)
+				inode->i_fop = &v9fs_cached_file_operations;
+			else
+				inode->i_fop = &v9fs_file_operations;
 		}
 
 		break;
-- 
cgit v1.2.3


From 46848de0249470e50d87af6d7f9d41cdff3e43f5 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Feb 2011 17:03:55 +0530
Subject: fs/9p: set fs cache cookie in create path also

We need to call v9fs_cache_inode_set_cookie in create
path also

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_file.c       | 4 +---
 fs/9p/vfs_inode.c      | 4 ++++
 fs/9p/vfs_inode_dotl.c | 4 ++++
 3 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 6a671000263..ce1eae48a12 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -87,10 +87,8 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 
 	file->private_data = fid;
 #ifdef CONFIG_9P_FSCACHE
-	if ((fid->qid.version) && (v9ses->cache)) {
-		P9_DPRINTK(P9_DEBUG_VFS, "cached");
+	if (v9ses->cache)
 		v9fs_cache_inode_set_cookie(inode, file);
-	}
 #endif
 	return 0;
 }
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 83af2b7e65d..95f55011aca 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -608,6 +608,10 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 		}
 
 		filp->private_data = fid;
+#ifdef CONFIG_9P_FSCACHE
+		if (v9ses->cache)
+			v9fs_cache_inode_set_cookie(dentry->d_inode, filp);
+#endif
 	} else
 		p9_client_clunk(fid);
 
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index fe3ffa9aace..fbe957268f0 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -226,6 +226,10 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 		return PTR_ERR(filp);
 	}
 	filp->private_data = ofid;
+#ifdef CONFIG_9P_FSCACHE
+	if (v9ses->cache)
+		v9fs_cache_inode_set_cookie(inode, filp);
+#endif
 	return 0;
 
 error:
-- 
cgit v1.2.3


From 20656a49ef37d8f44ed1e0b47d132197f9628adc Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Feb 2011 17:03:55 +0530
Subject: fs/9p: increment inode->i_count in cached mode.

We need to ihold even in cached mode

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_inode_dotl.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index fbe957268f0..265f5834498 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -636,13 +636,8 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
 		v9fs_stat2inode_dotl(st, old_dentry->d_inode);
 
 		kfree(st);
-	} else {
-		/* Caching disabled. No need to get upto date stat info.
-		 * This dentry will be released immediately. So, just hold the
-		 * inode
-		 */
-		ihold(old_dentry->d_inode);
 	}
+	ihold(old_dentry->d_inode);
 	d_instantiate(dentry, old_dentry->d_inode);
 
 	return err;
-- 
cgit v1.2.3


From 2efda7998bbc50589d28f18fddfb0c44d412128e Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Feb 2011 17:03:56 +0530
Subject: fs/9p: [fscache] wait for page write in cached mode

We need to call fscache_wait_on_page_write in launder_page
for fscache

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/cache.c    | 11 +++++++++++
 fs/9p/cache.h    | 16 ++++++++++++++--
 fs/9p/vfs_addr.c |  2 ++
 3 files changed, 27 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 0dbe0d139ac..610913d42a2 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -461,3 +461,14 @@ void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page)
 	if (ret != 0)
 		v9fs_uncache_page(inode, page);
 }
+
+/*
+ * wait for a page to complete writing to the cache
+ */
+void __v9fs_fscache_wait_on_page_write(struct inode *inode, struct page *page)
+{
+	const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
+	if (PageFsCache(page))
+		fscache_wait_on_page_write(vcookie->fscache, page);
+}
diff --git a/fs/9p/cache.h b/fs/9p/cache.h
index a94192bfaee..ec16fcdc3a6 100644
--- a/fs/9p/cache.h
+++ b/fs/9p/cache.h
@@ -64,8 +64,8 @@ extern int __v9fs_readpages_from_fscache(struct inode *inode,
 					 struct list_head *pages,
 					 unsigned *nr_pages);
 extern void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page);
-
-
+extern void __v9fs_fscache_wait_on_page_write(struct inode *inode,
+					      struct page *page);
 /**
  * v9fs_cache_register - Register v9fs file system with the cache
  */
@@ -131,6 +131,12 @@ static inline void v9fs_vcookie_set_qid(struct inode *inode,
 	spin_unlock(&vcookie->lock);
 }
 
+static inline void v9fs_fscache_wait_on_page_write(struct inode *inode,
+						   struct page *page)
+{
+	return __v9fs_fscache_wait_on_page_write(inode, page);
+}
+
 #else /* CONFIG_9P_FSCACHE */
 
 static inline int v9fs_cache_register(void)
@@ -172,5 +178,11 @@ static inline void v9fs_vcookie_set_qid(struct inode *inode,
 					struct p9_qid *qid)
 {}
 
+static inline void v9fs_fscache_wait_on_page_write(struct inode *inode,
+						   struct page *page)
+{
+	return;
+}
+
 #endif /* CONFIG_9P_FSCACHE */
 #endif /* _9P_CACHE_H */
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index b7f2a8e3863..637bd703e28 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -151,6 +151,8 @@ static void v9fs_invalidate_page(struct page *page, unsigned long offset)
 
 static int v9fs_launder_page(struct page *page)
 {
+	struct inode *inode = page->mapping->host;
+	v9fs_fscache_wait_on_page_write(inode, page);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 17311779ac3dcd06f8ef727a06969c439e116a20 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Feb 2011 17:03:56 +0530
Subject: fs/9p: Add read write helper function

We add read write helper function here which will
be used later by the mmap patch

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/v9fs_vfs.h |   4 ++-
 fs/9p/vfs_file.c | 102 +++++++++++++++++++++++++++++++++----------------------
 2 files changed, 64 insertions(+), 42 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index e4d5540cbb7..c44aaa8bd2a 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -64,8 +64,10 @@ void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
 int v9fs_uflags2omode(int uflags, int extended);
 
 ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
+ssize_t v9fs_fid_readn(struct p9_fid *, char *, char __user *, u32, u64);
 void v9fs_blank_wstat(struct p9_wstat *wstat);
 int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *);
 int v9fs_file_fsync_dotl(struct file *filp, int datasync);
-
+ssize_t v9fs_file_write_internal(struct inode *, struct p9_fid *,
+				 const char __user *, size_t, loff_t *, int);
 #define P9_LOCK_TIMEOUT (30*HZ)
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index ce1eae48a12..6e1e8f43eda 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -323,25 +323,22 @@ out_err:
 }
 
 /**
- * v9fs_file_readn - read from a file
- * @filp: file pointer to read
+ * v9fs_fid_readn - read from a fid
+ * @fid: fid to read
  * @data: data buffer to read data into
  * @udata: user data buffer to read data into
  * @count: size of buffer
  * @offset: offset at which to read data
  *
  */
-
 ssize_t
-v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
+v9fs_fid_readn(struct p9_fid *fid, char *data, char __user *udata, u32 count,
 	       u64 offset)
 {
 	int n, total, size;
-	struct p9_fid *fid = filp->private_data;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid,
-					(long long unsigned) offset, count);
-
+		   (long long unsigned) offset, count);
 	n = 0;
 	total = 0;
 	size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ;
@@ -366,6 +363,22 @@ v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
 	return total;
 }
 
+/**
+ * v9fs_file_readn - read from a file
+ * @filp: file pointer to read
+ * @data: data buffer to read data into
+ * @udata: user data buffer to read data into
+ * @count: size of buffer
+ * @offset: offset at which to read data
+ *
+ */
+ssize_t
+v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
+	       u64 offset)
+{
+	return v9fs_fid_readn(filp->private_data, data, udata, count, offset);
+}
+
 /**
  * v9fs_file_read - read from a file
  * @filp: file pointer to read
@@ -398,45 +411,21 @@ v9fs_file_read(struct file *filp, char __user *udata, size_t count,
 	return ret;
 }
 
-/**
- * v9fs_file_write - write to a file
- * @filp: file pointer to write
- * @data: data buffer to write data from
- * @count: size of buffer
- * @offset: offset at which to write data
- *
- */
-
-static ssize_t
-v9fs_file_write(struct file *filp, const char __user * data,
-		size_t count, loff_t * offset)
+ssize_t
+v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid,
+			 const char __user *data, size_t count,
+			 loff_t *offset, int invalidate)
 {
-	ssize_t retval;
-	size_t total = 0;
 	int n;
-	struct p9_fid *fid;
+	size_t total = 0;
 	struct p9_client *clnt;
-	struct inode *inode = filp->f_path.dentry->d_inode;
 	loff_t origin = *offset;
 	unsigned long pg_start, pg_end;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
 		(int)count, (int)*offset);
 
-	fid = filp->private_data;
 	clnt = fid->clnt;
-
-	retval = generic_write_checks(filp, &origin, &count, 0);
-	if (retval)
-		goto out;
-
-	retval = -EINVAL;
-	if ((ssize_t) count < 0)
-		goto out;
-	retval = 0;
-	if (!count)
-		goto out;
-
 	do {
 		n = p9_client_write(fid, NULL, data+total, origin+total, count);
 		if (n <= 0)
@@ -445,7 +434,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
 		total += n;
 	} while (count > 0);
 
-	if (total > 0) {
+	if (invalidate && (total > 0)) {
 		pg_start = origin >> PAGE_CACHE_SHIFT;
 		pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT;
 		if (inode->i_mapping && inode->i_mapping->nrpages)
@@ -455,11 +444,42 @@ v9fs_file_write(struct file *filp, const char __user * data,
 		i_size_write(inode, i_size_read(inode) + total);
 		inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9;
 	}
-
 	if (n < 0)
-		retval = n;
-	else
-		retval = total;
+		return n;
+
+	return total;
+}
+
+/**
+ * v9fs_file_write - write to a file
+ * @filp: file pointer to write
+ * @data: data buffer to write data from
+ * @count: size of buffer
+ * @offset: offset at which to write data
+ *
+ */
+static ssize_t
+v9fs_file_write(struct file *filp, const char __user * data,
+		size_t count, loff_t *offset)
+{
+	ssize_t retval = 0;
+	loff_t origin = *offset;
+
+
+	retval = generic_write_checks(filp, &origin, &count, 0);
+	if (retval)
+		goto out;
+
+	retval = -EINVAL;
+	if ((ssize_t) count < 0)
+		goto out;
+	retval = 0;
+	if (!count)
+		goto out;
+
+	return v9fs_file_write_internal(filp->f_path.dentry->d_inode,
+					filp->private_data,
+					data, count, offset, 1);
 out:
 	return retval;
 }
-- 
cgit v1.2.3


From 3cf387d780944305839f5b27c51f225444ba4d27 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Feb 2011 17:03:57 +0530
Subject: fs/9p: Add fid to inode in cached mode

The fid attached to the inode is opened in O_RDWR mode and is used
only for dirty page writeback.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/fid.c            | 26 ++++++++++++++++++++++++++
 fs/9p/fid.h            |  1 +
 fs/9p/vfs_file.c       | 19 +++++++++++++++++++
 fs/9p/vfs_inode.c      | 22 +++++++++++++++++++++-
 fs/9p/vfs_inode_dotl.c | 18 ++++++++++++++++--
 5 files changed, 83 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index b00223c99d7..9d6a5d3bfe1 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -261,3 +261,29 @@ struct p9_fid *v9fs_fid_clone(struct dentry *dentry)
 	ret = p9_client_walk(fid, 0, NULL, 1);
 	return ret;
 }
+
+
+struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)
+{
+	int err;
+	struct p9_fid *fid;
+
+	fid = v9fs_fid_clone(dentry);
+	if (IS_ERR(fid))
+		goto error_out;
+	/*
+	 * The writeback fid is only used to write back dirty
+	 * pages. We always open the fid in read-write mode so
+	 * that a partial page write, which results in a page
+	 * read, can work.
+	 * FIXME!!: we should make the fid owned by uid = 0
+	 */
+	err = p9_client_open(fid, O_RDWR);
+	if (err < 0) {
+		p9_client_clunk(fid);
+		fid = ERR_PTR(err);
+		goto error_out;
+	}
+error_out:
+	return fid;
+}
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index c3bbd6af996..c058f1c7656 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -45,3 +45,4 @@ struct v9fs_dentry {
 struct p9_fid *v9fs_fid_lookup(struct dentry *dentry);
 struct p9_fid *v9fs_fid_clone(struct dentry *dentry);
 int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid);
+struct p9_fid *v9fs_writeback_fid(struct dentry *dentry);
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 6e1e8f43eda..e966f15f92e 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -86,11 +86,30 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 	}
 
 	file->private_data = fid;
+	if (v9ses->cache && !inode->i_private) {
+		/*
+		 * clone a fid and add it to inode->i_private
+		 * we do it during open time instead of
+		 * page dirty time via write_begin/page_mkwrite
+		 * because we want write after unlink usecase
+		 * to work.
+		 */
+		fid = v9fs_writeback_fid(file->f_path.dentry);
+		if (IS_ERR(fid)) {
+			err = PTR_ERR(fid);
+			goto out_error;
+		}
+		inode->i_private = (void *) fid;
+	}
 #ifdef CONFIG_9P_FSCACHE
 	if (v9ses->cache)
 		v9fs_cache_inode_set_cookie(inode, file);
 #endif
 	return 0;
+out_error:
+	p9_client_clunk(file->private_data);
+	file->private_data = NULL;
+	return err;
 }
 
 /**
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 95f55011aca..a0d65a39872 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -417,6 +417,11 @@ void v9fs_evict_inode(struct inode *inode)
 #ifdef CONFIG_9P_FSCACHE
 	v9fs_cache_inode_put_cookie(inode);
 #endif
+	/* clunk the fid stashed in inode->i_private */
+	if (inode->i_private) {
+		p9_client_clunk((struct p9_fid *)inode->i_private);
+		inode->i_private = NULL;
+	}
 }
 
 struct inode *
@@ -578,7 +583,7 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	u32 perm;
 	int flags;
 	struct v9fs_session_info *v9ses;
-	struct p9_fid *fid;
+	struct p9_fid *fid, *inode_fid;
 	struct file *filp;
 
 	err = 0;
@@ -601,6 +606,21 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 
 	/* if we are opening a file, assign the open fid to the file */
 	if (nd && nd->flags & LOOKUP_OPEN) {
+		if (v9ses->cache && !dentry->d_inode->i_private) {
+			/*
+			 * clone a fid and add it to inode->i_private
+			 * we do it during open time instead of
+			 * page dirty time via write_begin/page_mkwrite
+			 * because we want write after unlink usecase
+			 * to work.
+			 */
+			inode_fid = v9fs_writeback_fid(dentry);
+			if (IS_ERR(inode_fid)) {
+				err = PTR_ERR(inode_fid);
+				goto error;
+			}
+			dentry->d_inode->i_private = (void *) inode_fid;
+		}
 		filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
 		if (IS_ERR(filp)) {
 			err = PTR_ERR(filp);
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 265f5834498..1327464bb2b 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -142,7 +142,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 	mode_t mode;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid = NULL;
-	struct p9_fid *dfid, *ofid;
+	struct p9_fid *dfid, *ofid, *inode_fid;
 	struct file *filp;
 	struct p9_qid qid;
 	struct inode *inode;
@@ -218,7 +218,21 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 
 	/* Now set the ACL based on the default value */
 	v9fs_set_create_acl(dentry, dacl, pacl);
-
+	if (v9ses->cache && !inode->i_private) {
+		/*
+		 * clone a fid and add it to inode->i_private
+		 * we do it during open time instead of
+		 * page dirty time via write_begin/page_mkwrite
+		 * because we want write after unlink usecase
+		 * to work.
+		 */
+		inode_fid = v9fs_writeback_fid(dentry);
+		if (IS_ERR(inode_fid)) {
+			err = PTR_ERR(inode_fid);
+			goto error;
+		}
+		inode->i_private = (void *) inode_fid;
+	}
 	/* Since we are opening a file, assign the open fid to the file */
 	filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
 	if (IS_ERR(filp)) {
-- 
cgit v1.2.3


From 7263cebed9fadad719063fdc8bba7085cf2c080d Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Feb 2011 17:03:58 +0530
Subject: fs/9p: Add buffered write support for v9fs.

We can now support writeable mmaps.
Based on the original patch from Badari Pulavarty <pbadari@us.ibm.com>

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_addr.c | 183 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 fs/9p/vfs_file.c |  54 ++++++++++++++--
 2 files changed, 218 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 637bd703e28..566684ce55e 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -39,16 +39,16 @@
 #include "v9fs.h"
 #include "v9fs_vfs.h"
 #include "cache.h"
+#include "fid.h"
 
 /**
- * v9fs_vfs_readpage - read an entire page in from 9P
+ * v9fs_fid_readpage - read an entire page in from 9P
  *
- * @filp: file being read
+ * @fid: fid being read
  * @page: structure to page
  *
  */
-
-static int v9fs_vfs_readpage(struct file *filp, struct page *page)
+static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page)
 {
 	int retval;
 	loff_t offset;
@@ -67,7 +67,7 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page)
 	buffer = kmap(page);
 	offset = page_offset(page);
 
-	retval = v9fs_file_readn(filp, buffer, NULL, PAGE_CACHE_SIZE, offset);
+	retval = v9fs_fid_readn(fid, buffer, NULL, PAGE_CACHE_SIZE, offset);
 	if (retval < 0) {
 		v9fs_uncache_page(inode, page);
 		goto done;
@@ -86,6 +86,19 @@ done:
 	return retval;
 }
 
+/**
+ * v9fs_vfs_readpage - read an entire page in from 9P
+ *
+ * @filp: file being read
+ * @page: structure to page
+ *
+ */
+
+static int v9fs_vfs_readpage(struct file *filp, struct page *page)
+{
+	return v9fs_fid_readpage(filp->private_data, page);
+}
+
 /**
  * v9fs_vfs_readpages - read a set of pages from 9P
  *
@@ -124,7 +137,6 @@ static int v9fs_release_page(struct page *page, gfp_t gfp)
 {
 	if (PagePrivate(page))
 		return 0;
-
 	return v9fs_fscache_release_page(page, gfp);
 }
 
@@ -137,22 +149,87 @@ static int v9fs_release_page(struct page *page, gfp_t gfp)
 
 static void v9fs_invalidate_page(struct page *page, unsigned long offset)
 {
+	/*
+	 * If called with zero offset, we should release
+	 * the private state associated with the page
+	 */
 	if (offset == 0)
 		v9fs_fscache_invalidate_page(page);
 }
 
+static int v9fs_vfs_writepage_locked(struct page *page)
+{
+	char *buffer;
+	int retval, len;
+	loff_t offset, size;
+	mm_segment_t old_fs;
+	struct inode *inode = page->mapping->host;
+
+	size = i_size_read(inode);
+	if (page->index == size >> PAGE_CACHE_SHIFT)
+		len = size & ~PAGE_CACHE_MASK;
+	else
+		len = PAGE_CACHE_SIZE;
+
+	set_page_writeback(page);
+
+	buffer = kmap(page);
+	offset = page_offset(page);
+
+	old_fs = get_fs();
+	set_fs(get_ds());
+	/* We should have i_private always set */
+	BUG_ON(!inode->i_private);
+
+	retval = v9fs_file_write_internal(inode,
+					  (struct p9_fid *)inode->i_private,
+					  (__force const char __user *)buffer,
+					  len, &offset, 0);
+	if (retval > 0)
+		retval = 0;
+
+	set_fs(old_fs);
+	kunmap(page);
+	end_page_writeback(page);
+	return retval;
+}
+
+static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	int retval;
+
+	retval = v9fs_vfs_writepage_locked(page);
+	if (retval < 0) {
+		if (retval == -EAGAIN) {
+			redirty_page_for_writepage(wbc, page);
+			retval = 0;
+		} else {
+			SetPageError(page);
+			mapping_set_error(page->mapping, retval);
+		}
+	} else
+		retval = 0;
+
+	unlock_page(page);
+	return retval;
+}
+
 /**
  * v9fs_launder_page - Writeback a dirty page
- * Since the writes go directly to the server, we simply return a 0
- * here to indicate success.
- *
  * Returns 0 on success.
  */
 
 static int v9fs_launder_page(struct page *page)
 {
+	int retval;
 	struct inode *inode = page->mapping->host;
+
 	v9fs_fscache_wait_on_page_write(inode, page);
+	if (clear_page_dirty_for_io(page)) {
+		retval = v9fs_vfs_writepage_locked(page);
+		if (retval)
+			return retval;
+	}
 	return 0;
 }
 
@@ -178,6 +255,11 @@ static int v9fs_launder_page(struct page *page)
 ssize_t v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 		loff_t pos, unsigned long nr_segs)
 {
+	/*
+	 * FIXME
+	 * Now that we do caching with cache mode enabled, we need
+	 * to support direct IO
+	 */
 	P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) "
 			"off/no(%lld/%lu) EINVAL\n",
 			iocb->ki_filp->f_path.dentry->d_name.name,
@@ -185,11 +267,82 @@ ssize_t v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 
 	return -EINVAL;
 }
+
+static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
+			    loff_t pos, unsigned len, unsigned flags,
+			    struct page **pagep, void **fsdata)
+{
+	int retval = 0;
+	struct page *page;
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	struct inode *inode = mapping->host;
+
+start:
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page) {
+		retval = -ENOMEM;
+		goto out;
+	}
+	BUG_ON(!inode->i_private);
+	if (PageUptodate(page))
+		goto out;
+
+	if (len == PAGE_CACHE_SIZE)
+		goto out;
+
+	retval = v9fs_fid_readpage(inode->i_private, page);
+	page_cache_release(page);
+	if (!retval)
+		goto start;
+out:
+	*pagep = page;
+	return retval;
+}
+
+static int v9fs_write_end(struct file *filp, struct address_space *mapping,
+			  loff_t pos, unsigned len, unsigned copied,
+			  struct page *page, void *fsdata)
+{
+	loff_t last_pos = pos + copied;
+	struct inode *inode = page->mapping->host;
+
+	if (unlikely(copied < len)) {
+		/*
+		 * zero out the rest of the area
+		 */
+		unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+
+		zero_user(page, from + copied, len - copied);
+		flush_dcache_page(page);
+	}
+
+	if (!PageUptodate(page))
+		SetPageUptodate(page);
+	/*
+	 * No need to use i_size_read() here, the i_size
+	 * cannot change under us because we hold the i_mutex.
+	 */
+	if (last_pos > inode->i_size) {
+		inode_add_bytes(inode, last_pos - inode->i_size);
+		i_size_write(inode, last_pos);
+	}
+	set_page_dirty(page);
+	unlock_page(page);
+	page_cache_release(page);
+
+	return copied;
+}
+
+
 const struct address_space_operations v9fs_addr_operations = {
-      .readpage = v9fs_vfs_readpage,
-      .readpages = v9fs_vfs_readpages,
-      .releasepage = v9fs_release_page,
-      .invalidatepage = v9fs_invalidate_page,
-      .launder_page = v9fs_launder_page,
-      .direct_IO = v9fs_direct_IO,
+	.readpage = v9fs_vfs_readpage,
+	.readpages = v9fs_vfs_readpages,
+	.set_page_dirty = __set_page_dirty_nobuffers,
+	.writepage = v9fs_vfs_writepage,
+	.write_begin = v9fs_write_begin,
+	.write_end = v9fs_write_end,
+	.releasepage = v9fs_release_page,
+	.invalidatepage = v9fs_invalidate_page,
+	.launder_page = v9fs_launder_page,
+	.direct_IO = v9fs_direct_IO,
 };
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index e966f15f92e..f7b571ddf99 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -44,6 +44,8 @@
 #include "fid.h"
 #include "cache.h"
 
+static const struct vm_operations_struct v9fs_file_vm_ops;
+
 /**
  * v9fs_file_open - open a file (or directory)
  * @inode: inode to be opened
@@ -503,6 +505,7 @@ out:
 	return retval;
 }
 
+
 static int v9fs_file_fsync(struct file *filp, int datasync)
 {
 	struct p9_fid *fid;
@@ -532,28 +535,71 @@ int v9fs_file_fsync_dotl(struct file *filp, int datasync)
 	return retval;
 }
 
+static int
+v9fs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	int retval;
+
+	retval = generic_file_mmap(file, vma);
+	if (!retval)
+		vma->vm_ops = &v9fs_file_vm_ops;
+
+	return retval;
+}
+
+static int
+v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct page *page = vmf->page;
+	struct file *filp = vma->vm_file;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+
+
+	P9_DPRINTK(P9_DEBUG_VFS, "page %p fid %lx\n",
+		   page, (unsigned long)filp->private_data);
+
+	/* make sure the cache has finished storing the page */
+	v9fs_fscache_wait_on_page_write(inode, page);
+	BUG_ON(!inode->i_private);
+	lock_page(page);
+	if (page->mapping != inode->i_mapping)
+		goto out_unlock;
+
+	return VM_FAULT_LOCKED;
+out_unlock:
+	unlock_page(page);
+	return VM_FAULT_NOPAGE;
+}
+
+static const struct vm_operations_struct v9fs_file_vm_ops = {
+	.fault = filemap_fault,
+	.page_mkwrite = v9fs_vm_page_mkwrite,
+};
+
 const struct file_operations v9fs_cached_file_operations = {
 	.llseek = generic_file_llseek,
 	.read = do_sync_read,
+	.write = do_sync_write,
 	.aio_read = generic_file_aio_read,
-	.write = v9fs_file_write,
+	.aio_write = generic_file_aio_write,
 	.open = v9fs_file_open,
 	.release = v9fs_dir_release,
 	.lock = v9fs_file_lock,
-	.mmap = generic_file_readonly_mmap,
+	.mmap = v9fs_file_mmap,
 	.fsync = v9fs_file_fsync,
 };
 
 const struct file_operations v9fs_cached_file_operations_dotl = {
 	.llseek = generic_file_llseek,
 	.read = do_sync_read,
+	.write = do_sync_write,
 	.aio_read = generic_file_aio_read,
-	.write = v9fs_file_write,
+	.aio_write = generic_file_aio_write,
 	.open = v9fs_file_open,
 	.release = v9fs_dir_release,
 	.lock = v9fs_file_lock_dotl,
 	.flock = v9fs_file_flock_dotl,
-	.mmap = generic_file_readonly_mmap,
+	.mmap = v9fs_file_mmap,
 	.fsync = v9fs_file_fsync_dotl,
 };
 
-- 
cgit v1.2.3


From a950a65264ab9e9b96e7093e3358ffa84799f99c Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Feb 2011 17:03:58 +0530
Subject: fs/9p: Clarify cached dentry delete operation

Update the comment to indicate that we don't want to cache
negative dentries.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_dentry.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index 233b7d4ffe5..a4ae4be5ab5 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -63,20 +63,15 @@ static int v9fs_dentry_delete(const struct dentry *dentry)
  * v9fs_cached_dentry_delete - called when dentry refcount equals 0
  * @dentry:  dentry in question
  *
- * Only return 1 if our inode is invalid.  Only non-synthetic files
- * (ones without mtime == 0) should be calling this function.
- *
  */
-
 static int v9fs_cached_dentry_delete(const struct dentry *dentry)
 {
-	struct inode *inode = dentry->d_inode;
-	P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
-									dentry);
+	P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n",
+		   dentry->d_name.name, dentry);
 
-	if(!inode)
+	/* Don't cache negative dentries */
+	if (!dentry->d_inode)
 		return 1;
-
 	return 0;
 }
 
-- 
cgit v1.2.3


From db5841d4a505d1ecb087dc37462926a80511ae8b Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Feb 2011 17:03:59 +0530
Subject: fs/9p: Mark file system with MS_SYNCHRONOUS only if it is not cached
 mode

We should not mark the file system synchronous if it is mounted with a cache=* option.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_super.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 4f14be585d6..76f867cf23f 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -87,8 +87,9 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 		sb->s_op = &v9fs_super_ops;
 	sb->s_bdi = &v9ses->bdi;
 
-	sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
-	    MS_NOATIME;
+	sb->s_flags = flags | MS_ACTIVE | MS_DIRSYNC | MS_NOATIME;
+	if (!v9ses->cache)
+		sb->s_flags |= MS_SYNCHRONOUS;
 
 #ifdef CONFIG_9P_FS_POSIX_ACL
 	if ((v9ses->flags & V9FS_ACL_MASK) == V9FS_POSIX_ACL)
-- 
cgit v1.2.3


From 00ea2df43e0a68a90bb6055cc48965b2c970228d Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Feb 2011 17:04:00 +0530
Subject: fs/9p: Implement syncfs call back for 9Pfs

FIXME!! what about dotu ?

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/v9fs.h      |  1 +
 fs/9p/vfs_super.c | 33 ++++++++++++++++++++++++---------
 2 files changed, 25 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 6fa3cf5547d..21bd803a3cc 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -113,6 +113,7 @@ struct v9fs_session_info {
 	struct list_head slist; /* list of sessions registered with v9fs */
 	struct backing_dev_info bdi;
 	struct rw_semaphore rename_sem;
+	struct p9_fid *root_fid; /* Used for file system sync */
 };
 
 struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 76f867cf23f..6985e2a7a11 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -152,7 +152,6 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
 		retval = PTR_ERR(inode);
 		goto release_sb;
 	}
-
 	root = d_alloc_root(inode);
 	if (!root) {
 		iput(inode);
@@ -184,10 +183,21 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
 		p9stat_free(st);
 		kfree(st);
 	}
+	v9fs_fid_add(root, fid);
 	retval = v9fs_get_acl(inode, fid);
 	if (retval)
 		goto release_sb;
-	v9fs_fid_add(root, fid);
+	/*
+	 * Add the root fid to session info. This is used
+	 * for file system sync. We want a cloned fid here
+	 * so that we can do a sync_filesystem after a
+	 * shrink_dcache_for_umount
+	 */
+	v9ses->root_fid = v9fs_fid_clone(root);
+	if (IS_ERR(v9ses->root_fid)) {
+		retval = PTR_ERR(v9ses->root_fid);
+		goto release_sb;
+	}
 
 	P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
 	return dget(sb->s_root);
@@ -198,15 +208,11 @@ close_session:
 	v9fs_session_close(v9ses);
 	kfree(v9ses);
 	return ERR_PTR(retval);
-
 release_sb:
 	/*
-	 * we will do the session_close and root dentry release
-	 * in the below call. But we need to clunk fid, because we haven't
-	 * attached the fid to dentry so it won't get clunked
-	 * automatically.
+	 * we will do the session_close and root dentry
+	 * release in the below call.
 	 */
-	p9_client_clunk(fid);
 	deactivate_locked_super(sb);
 	return ERR_PTR(retval);
 }
@@ -224,7 +230,7 @@ static void v9fs_kill_super(struct super_block *s)
 	P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s);
 
 	kill_anon_super(s);
-
+	p9_client_clunk(v9ses->root_fid);
 	v9fs_session_cancel(v9ses);
 	v9fs_session_close(v9ses);
 	kfree(v9ses);
@@ -277,6 +283,14 @@ done:
 	return res;
 }
 
+static int v9fs_sync_fs(struct super_block *sb, int wait)
+{
+	struct v9fs_session_info *v9ses = sb->s_fs_info;
+
+	P9_DPRINTK(P9_DEBUG_VFS, "v9fs_sync_fs: super_block %p\n", sb);
+	return p9_client_sync_fs(v9ses->root_fid);
+}
+
 static const struct super_operations v9fs_super_ops = {
 #ifdef CONFIG_9P_FSCACHE
 	.alloc_inode = v9fs_alloc_inode,
@@ -293,6 +307,7 @@ static const struct super_operations v9fs_super_ops_dotl = {
 	.alloc_inode = v9fs_alloc_inode,
 	.destroy_inode = v9fs_destroy_inode,
 #endif
+	.sync_fs = v9fs_sync_fs,
 	.statfs = v9fs_statfs,
 	.evict_inode = v9fs_evict_inode,
 	.show_options = generic_show_options,
-- 
cgit v1.2.3


From 62d810b424e434a38ad6b17fb93cd5748692a026 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Feb 2011 17:04:00 +0530
Subject: fs/9p: We need not writeback dirty pages during close

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_dir.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index b84ebe8cefe..9c2bdda5cd9 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -295,7 +295,6 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
 	P9_DPRINTK(P9_DEBUG_VFS,
 			"v9fs_dir_release: inode: %p filp: %p fid: %d\n",
 			inode, filp, fid ? fid->fid : -1);
-	filemap_write_and_wait(inode->i_mapping);
 	if (fid)
 		p9_client_clunk(fid);
 	return 0;
-- 
cgit v1.2.3


From 5ffc0cb308f69cea36058d308d911f26ee59316e Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Feb 2011 17:04:01 +0530
Subject: fs/9p: Add inode hashing

Until now, 9p did not add inodes to the inode hash. We need to do that
to get sync to work; otherwise __mark_inode_dirty will not
add the inode to the super block's dirty list.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/acl.c            |   2 +-
 fs/9p/v9fs_vfs.h       |   2 +
 fs/9p/vfs_inode.c      | 121 ++++++++++++++++++++++++++++++-------------------
 fs/9p/vfs_inode_dotl.c |  69 ++++++++++++++++++----------
 4 files changed, 124 insertions(+), 70 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 1ee3434239c..51545529637 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -21,8 +21,8 @@
 #include <linux/posix_acl_xattr.h>
 #include "xattr.h"
 #include "acl.h"
-#include "v9fs_vfs.h"
 #include "v9fs.h"
+#include "v9fs_vfs.h"
 
 static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
 {
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index c44aaa8bd2a..ab72e66c2e0 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -54,6 +54,8 @@ void v9fs_destroy_inode(struct inode *inode);
 #endif
 
 struct inode *v9fs_get_inode(struct super_block *sb, int mode);
+int v9fs_init_inode(struct v9fs_session_info *v9ses,
+		    struct inode *inode, int mode);
 void v9fs_evict_inode(struct inode *inode);
 ino_t v9fs_qid2ino(struct p9_qid *qid);
 void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index a0d65a39872..445dd283dc1 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -243,26 +243,10 @@ void v9fs_destroy_inode(struct inode *inode)
 }
 #endif
 
-/**
- * v9fs_get_inode - helper function to setup an inode
- * @sb: superblock
- * @mode: mode to setup inode with
- *
- */
-
-struct inode *v9fs_get_inode(struct super_block *sb, int mode)
+int v9fs_init_inode(struct v9fs_session_info *v9ses,
+		    struct inode *inode, int mode)
 {
-	int err;
-	struct inode *inode;
-	struct v9fs_session_info *v9ses = sb->s_fs_info;
-
-	P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
-
-	inode = new_inode(sb);
-	if (!inode) {
-		P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
-		return ERR_PTR(-ENOMEM);
-	}
+	int err = 0;
 
 	inode_init_owner(inode, NULL, mode);
 	inode->i_blocks = 0;
@@ -306,7 +290,6 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 		}
 
 		break;
-
 	case S_IFLNK:
 		if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) {
 			P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with "
@@ -342,12 +325,37 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 		err = -EINVAL;
 		goto error;
 	}
+error:
+	return err;
 
-	return inode;
+}
 
-error:
-	iput(inode);
-	return ERR_PTR(err);
+/**
+ * v9fs_get_inode - helper function to setup an inode
+ * @sb: superblock
+ * @mode: mode to setup inode with
+ *
+ */
+
+struct inode *v9fs_get_inode(struct super_block *sb, int mode)
+{
+	int err;
+	struct inode *inode;
+	struct v9fs_session_info *v9ses = sb->s_fs_info;
+
+	P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
+
+	inode = new_inode(sb);
+	if (!inode) {
+		P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
+		return ERR_PTR(-ENOMEM);
+	}
+	err = v9fs_init_inode(v9ses, inode, mode);
+	if (err) {
+		iput(inode);
+		return ERR_PTR(err);
+	}
+	return inode;
 }
 
 /*
@@ -424,39 +432,60 @@ void v9fs_evict_inode(struct inode *inode)
 	}
 }
 
-struct inode *
-v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid,
-	struct super_block *sb)
+static struct inode *v9fs_qid_iget(struct super_block *sb,
+				   struct p9_qid *qid,
+				   struct p9_wstat *st)
 {
-	int err, umode;
-	struct inode *ret = NULL;
-	struct p9_wstat *st;
-
-	st = p9_client_stat(fid);
-	if (IS_ERR(st))
-		return ERR_CAST(st);
+	int retval, umode;
+	unsigned long i_ino;
+	struct inode *inode;
+	struct v9fs_session_info *v9ses = sb->s_fs_info;
 
+	i_ino = v9fs_qid2ino(qid);
+	inode = iget_locked(sb, i_ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+	/*
+	 * initialize the inode with the stat info
+	 * FIXME!! we may need support for stale inodes
+	 * later.
+	 */
 	umode = p9mode2unixmode(v9ses, st->mode);
-	ret = v9fs_get_inode(sb, umode);
-	if (IS_ERR(ret)) {
-		err = PTR_ERR(ret);
+	retval = v9fs_init_inode(v9ses, inode, umode);
+	if (retval)
 		goto error;
-	}
-
-	v9fs_stat2inode(st, ret, sb);
-	ret->i_ino = v9fs_qid2ino(&st->qid);
 
+	v9fs_stat2inode(st, inode, sb);
 #ifdef CONFIG_9P_FSCACHE
 	v9fs_vcookie_set_qid(ret, &st->qid);
-	v9fs_cache_inode_get_cookie(ret);
+	v9fs_cache_inode_get_cookie(inode);
 #endif
-	p9stat_free(st);
-	kfree(st);
-	return ret;
+	unlock_new_inode(inode);
+	return inode;
 error:
+	unlock_new_inode(inode);
+	iput(inode);
+	return ERR_PTR(retval);
+
+}
+
+struct inode *
+v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+	   struct super_block *sb)
+{
+	struct p9_wstat *st;
+	struct inode *inode = NULL;
+
+	st = p9_client_stat(fid);
+	if (IS_ERR(st))
+		return ERR_CAST(st);
+
+	inode = v9fs_qid_iget(sb, &st->qid, st);
 	p9stat_free(st);
 	kfree(st);
-	return ERR_PTR(err);
+	return inode;
 }
 
 /**
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 1327464bb2b..6f62320eda8 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -86,40 +86,63 @@ static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
 	return dentry;
 }
 
+static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
+					struct p9_qid *qid,
+					struct p9_fid *fid,
+					struct p9_stat_dotl *st)
+{
+	int retval;
+	unsigned long i_ino;
+	struct inode *inode;
+	struct v9fs_session_info *v9ses = sb->s_fs_info;
+
+	i_ino = v9fs_qid2ino(qid);
+	inode = iget_locked(sb, i_ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+	/*
+	 * initialize the inode with the stat info
+	 * FIXME!! we may need support for stale inodes
+	 * later.
+	 */
+	retval = v9fs_init_inode(v9ses, inode, st->st_mode);
+	if (retval)
+		goto error;
+
+	v9fs_stat2inode_dotl(st, inode);
+#ifdef CONFIG_9P_FSCACHE
+	v9fs_vcookie_set_qid(inode, &st->qid);
+	v9fs_cache_inode_get_cookie(inode);
+#endif
+	retval = v9fs_get_acl(inode, fid);
+	if (retval)
+		goto error;
+
+	unlock_new_inode(inode);
+	return inode;
+error:
+	unlock_new_inode(inode);
+	iput(inode);
+	return ERR_PTR(retval);
+
+}
+
 struct inode *
 v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 	struct super_block *sb)
 {
-	struct inode *ret = NULL;
-	int err;
 	struct p9_stat_dotl *st;
+	struct inode *inode = NULL;
 
 	st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
 	if (IS_ERR(st))
 		return ERR_CAST(st);
 
-	ret = v9fs_get_inode(sb, st->st_mode);
-	if (IS_ERR(ret)) {
-		err = PTR_ERR(ret);
-		goto error;
-	}
-
-	v9fs_stat2inode_dotl(st, ret);
-	ret->i_ino = v9fs_qid2ino(&st->qid);
-#ifdef CONFIG_9P_FSCACHE
-	v9fs_vcookie_set_qid(ret, &st->qid);
-	v9fs_cache_inode_get_cookie(ret);
-#endif
-	err = v9fs_get_acl(ret, fid);
-	if (err) {
-		iput(ret);
-		goto error;
-	}
-	kfree(st);
-	return ret;
-error:
+	inode = v9fs_qid_iget_dotl(sb, &st->qid, fid, st);
 	kfree(st);
-	return ERR_PTR(err);
+	return inode;
 }
 
 /**
-- 
cgit v1.2.3


From a12119087bd803d3fa0b067ee18497e2e5d064cd Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Feb 2011 17:04:01 +0530
Subject: fs/9p: Don't set stat.st_blocks based on nrpages

simple_getattr sets stat.st_blocks to a value
derived from nrpages, which is not correct for 9p.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_inode.c      | 7 ++++---
 fs/9p/vfs_inode_dotl.c | 7 ++++---
 2 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 445dd283dc1..304904b4061 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -891,9 +891,10 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
 	err = -EPERM;
 	v9ses = v9fs_inode2v9ses(dentry->d_inode);
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
-		return simple_getattr(mnt, dentry, stat);
-
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+		generic_fillattr(dentry->d_inode, stat);
+		return 0;
+	}
 	fid = v9fs_fid_lookup(dentry);
 	if (IS_ERR(fid))
 		return PTR_ERR(fid);
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 6f62320eda8..a2a3d7edb17 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -387,9 +387,10 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
 	P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
 	err = -EPERM;
 	v9ses = v9fs_inode2v9ses(dentry->d_inode);
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
-		return simple_getattr(mnt, dentry, stat);
-
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
+		generic_fillattr(dentry->d_inode, stat);
+		return 0;
+	}
 	fid = v9fs_fid_lookup(dentry);
 	if (IS_ERR(fid))
 		return PTR_ERR(fid);
-- 
cgit v1.2.3


From a78ce05d5d342297b66122eda8add4eefa21f8a8 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Feb 2011 17:04:02 +0530
Subject: fs/9p: Add v9fs_inode

Switch the fscache code to v9fs_inode. We will later use
v9fs_inode in cache=loose mode to track the inode cache
validity timeout, i.e. if we find an inode in cache older
than a specific jiffies range we will consider it stale.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/cache.c          | 197 ++++++++++++++++++-------------------------------
 fs/9p/cache.h          |  54 ++------------
 fs/9p/v9fs.c           |  58 +++++++++++++++
 fs/9p/v9fs.h           |  37 +++++++---
 fs/9p/v9fs_vfs.h       |   4 +-
 fs/9p/vfs_inode.c      |  36 ++++-----
 fs/9p/vfs_inode_dotl.c |  14 ++--
 fs/9p/vfs_super.c      |   4 -
 8 files changed, 184 insertions(+), 220 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 610913d42a2..5b335c5086a 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -33,67 +33,11 @@
 
 #define CACHETAG_LEN  11
 
-struct kmem_cache *vcookie_cache;
-
 struct fscache_netfs v9fs_cache_netfs = {
 	.name 		= "9p",
 	.version 	= 0,
 };
 
-static void init_once(void *foo)
-{
-	struct v9fs_cookie *vcookie = (struct v9fs_cookie *) foo;
-	vcookie->fscache = NULL;
-	vcookie->qid = NULL;
-	inode_init_once(&vcookie->inode);
-}
-
-/**
- * v9fs_init_vcookiecache - initialize a cache for vcookies to maintain
- *			    vcookie to inode mapping
- *
- * Returns 0 on success.
- */
-
-static int v9fs_init_vcookiecache(void)
-{
-	vcookie_cache = kmem_cache_create("vcookie_cache",
-					  sizeof(struct v9fs_cookie),
-					  0, (SLAB_RECLAIM_ACCOUNT|
-					      SLAB_MEM_SPREAD),
-					  init_once);
-	if (!vcookie_cache)
-		return -ENOMEM;
-
-	return 0;
-}
-
-/**
- * v9fs_destroy_vcookiecache - destroy the cache of vcookies
- *
- */
-
-static void v9fs_destroy_vcookiecache(void)
-{
-	kmem_cache_destroy(vcookie_cache);
-}
-
-int __v9fs_cache_register(void)
-{
-	int ret;
-	ret = v9fs_init_vcookiecache();
-	if (ret < 0)
-		return ret;
-
-	return fscache_register_netfs(&v9fs_cache_netfs);
-}
-
-void __v9fs_cache_unregister(void)
-{
-	v9fs_destroy_vcookiecache();
-	fscache_unregister_netfs(&v9fs_cache_netfs);
-}
-
 /**
  * v9fs_random_cachetag - Generate a random tag to be associated
  *			  with a new cache session.
@@ -133,9 +77,9 @@ static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data,
 }
 
 const struct fscache_cookie_def v9fs_cache_session_index_def = {
-	.name 		= "9P.session",
-	.type 		= FSCACHE_COOKIE_TYPE_INDEX,
-	.get_key 	= v9fs_cache_session_get_key,
+	.name		= "9P.session",
+	.type		= FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key	= v9fs_cache_session_get_key,
 };
 
 void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses)
@@ -163,33 +107,33 @@ void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses)
 static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data,
 					 void *buffer, uint16_t bufmax)
 {
-	const struct v9fs_cookie *vcookie = cookie_netfs_data;
-	memcpy(buffer, &vcookie->qid->path, sizeof(vcookie->qid->path));
-
-	P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &vcookie->inode,
-		   vcookie->qid->path);
-	return sizeof(vcookie->qid->path);
+	const struct v9fs_inode *v9inode = cookie_netfs_data;
+	memcpy(buffer, &v9inode->fscache_key->path,
+	       sizeof(v9inode->fscache_key->path));
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &v9inode->vfs_inode,
+		   v9inode->fscache_key->path);
+	return sizeof(v9inode->fscache_key->path);
 }
 
 static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data,
 				      uint64_t *size)
 {
-	const struct v9fs_cookie *vcookie = cookie_netfs_data;
-	*size = i_size_read(&vcookie->inode);
+	const struct v9fs_inode *v9inode = cookie_netfs_data;
+	*size = i_size_read(&v9inode->vfs_inode);
 
-	P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &vcookie->inode,
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &v9inode->vfs_inode,
 		   *size);
 }
 
 static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data,
 					 void *buffer, uint16_t buflen)
 {
-	const struct v9fs_cookie *vcookie = cookie_netfs_data;
-	memcpy(buffer, &vcookie->qid->version, sizeof(vcookie->qid->version));
-
-	P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &vcookie->inode,
-		   vcookie->qid->version);
-	return sizeof(vcookie->qid->version);
+	const struct v9fs_inode *v9inode = cookie_netfs_data;
+	memcpy(buffer, &v9inode->fscache_key->version,
+	       sizeof(v9inode->fscache_key->version));
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &v9inode->vfs_inode,
+		   v9inode->fscache_key->version);
+	return sizeof(v9inode->fscache_key->version);
 }
 
 static enum
@@ -197,13 +141,13 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
 					    const void *buffer,
 					    uint16_t buflen)
 {
-	const struct v9fs_cookie *vcookie = cookie_netfs_data;
+	const struct v9fs_inode *v9inode = cookie_netfs_data;
 
-	if (buflen != sizeof(vcookie->qid->version))
+	if (buflen != sizeof(v9inode->fscache_key->version))
 		return FSCACHE_CHECKAUX_OBSOLETE;
 
-	if (memcmp(buffer, &vcookie->qid->version,
-		   sizeof(vcookie->qid->version)))
+	if (memcmp(buffer, &v9inode->fscache_key->version,
+		   sizeof(v9inode->fscache_key->version)))
 		return FSCACHE_CHECKAUX_OBSOLETE;
 
 	return FSCACHE_CHECKAUX_OKAY;
@@ -211,7 +155,7 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
 
 static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data)
 {
-	struct v9fs_cookie *vcookie = cookie_netfs_data;
+	struct v9fs_inode *v9inode = cookie_netfs_data;
 	struct pagevec pvec;
 	pgoff_t first;
 	int loop, nr_pages;
@@ -220,7 +164,7 @@ static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data)
 	first = 0;
 
 	for (;;) {
-		nr_pages = pagevec_lookup(&pvec, vcookie->inode.i_mapping,
+		nr_pages = pagevec_lookup(&pvec, v9inode->vfs_inode.i_mapping,
 					  first,
 					  PAGEVEC_SIZE - pagevec_count(&pvec));
 		if (!nr_pages)
@@ -249,115 +193,114 @@ const struct fscache_cookie_def v9fs_cache_inode_index_def = {
 
 void v9fs_cache_inode_get_cookie(struct inode *inode)
 {
-	struct v9fs_cookie *vcookie;
+	struct v9fs_inode *v9inode;
 	struct v9fs_session_info *v9ses;
 
 	if (!S_ISREG(inode->i_mode))
 		return;
 
-	vcookie = v9fs_inode2cookie(inode);
-	if (vcookie->fscache)
+	v9inode = V9FS_I(inode);
+	if (v9inode->fscache)
 		return;
 
 	v9ses = v9fs_inode2v9ses(inode);
-	vcookie->fscache = fscache_acquire_cookie(v9ses->fscache,
+	v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
 						  &v9fs_cache_inode_index_def,
-						  vcookie);
+						  v9inode);
 
 	P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode,
-		   vcookie->fscache);
+		   v9inode->fscache);
 }
 
 void v9fs_cache_inode_put_cookie(struct inode *inode)
 {
-	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+	struct v9fs_inode *v9inode = V9FS_I(inode);
 
-	if (!vcookie->fscache)
+	if (!v9inode->fscache)
 		return;
 	P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode,
-		   vcookie->fscache);
+		   v9inode->fscache);
 
-	fscache_relinquish_cookie(vcookie->fscache, 0);
-	vcookie->fscache = NULL;
+	fscache_relinquish_cookie(v9inode->fscache, 0);
+	v9inode->fscache = NULL;
 }
 
 void v9fs_cache_inode_flush_cookie(struct inode *inode)
 {
-	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+	struct v9fs_inode *v9inode = V9FS_I(inode);
 
-	if (!vcookie->fscache)
+	if (!v9inode->fscache)
 		return;
 	P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode,
-		   vcookie->fscache);
+		   v9inode->fscache);
 
-	fscache_relinquish_cookie(vcookie->fscache, 1);
-	vcookie->fscache = NULL;
+	fscache_relinquish_cookie(v9inode->fscache, 1);
+	v9inode->fscache = NULL;
 }
 
 void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp)
 {
-	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+	struct v9fs_inode *v9inode = V9FS_I(inode);
 	struct p9_fid *fid;
 
-	if (!vcookie->fscache)
+	if (!v9inode->fscache)
 		return;
 
-	spin_lock(&vcookie->lock);
+	spin_lock(&v9inode->fscache_lock);
 	fid = filp->private_data;
 	if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
 		v9fs_cache_inode_flush_cookie(inode);
 	else
 		v9fs_cache_inode_get_cookie(inode);
 
-	spin_unlock(&vcookie->lock);
+	spin_unlock(&v9inode->fscache_lock);
 }
 
 void v9fs_cache_inode_reset_cookie(struct inode *inode)
 {
-	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+	struct v9fs_inode *v9inode = V9FS_I(inode);
 	struct v9fs_session_info *v9ses;
 	struct fscache_cookie *old;
 
-	if (!vcookie->fscache)
+	if (!v9inode->fscache)
 		return;
 
-	old = vcookie->fscache;
+	old = v9inode->fscache;
 
-	spin_lock(&vcookie->lock);
-	fscache_relinquish_cookie(vcookie->fscache, 1);
+	spin_lock(&v9inode->fscache_lock);
+	fscache_relinquish_cookie(v9inode->fscache, 1);
 
 	v9ses = v9fs_inode2v9ses(inode);
-	vcookie->fscache = fscache_acquire_cookie(v9ses->fscache,
+	v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
 						  &v9fs_cache_inode_index_def,
-						  vcookie);
-
+						  v9inode);
 	P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p",
-		   inode, old, vcookie->fscache);
+		   inode, old, v9inode->fscache);
 
-	spin_unlock(&vcookie->lock);
+	spin_unlock(&v9inode->fscache_lock);
 }
 
 int __v9fs_fscache_release_page(struct page *page, gfp_t gfp)
 {
 	struct inode *inode = page->mapping->host;
-	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+	struct v9fs_inode *v9inode = V9FS_I(inode);
 
-	BUG_ON(!vcookie->fscache);
+	BUG_ON(!v9inode->fscache);
 
-	return fscache_maybe_release_page(vcookie->fscache, page, gfp);
+	return fscache_maybe_release_page(v9inode->fscache, page, gfp);
 }
 
 void __v9fs_fscache_invalidate_page(struct page *page)
 {
 	struct inode *inode = page->mapping->host;
-	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+	struct v9fs_inode *v9inode = V9FS_I(inode);
 
-	BUG_ON(!vcookie->fscache);
+	BUG_ON(!v9inode->fscache);
 
 	if (PageFsCache(page)) {
-		fscache_wait_on_page_write(vcookie->fscache, page);
+		fscache_wait_on_page_write(v9inode->fscache, page);
 		BUG_ON(!PageLocked(page));
-		fscache_uncache_page(vcookie->fscache, page);
+		fscache_uncache_page(v9inode->fscache, page);
 	}
 }
 
@@ -380,13 +323,13 @@ static void v9fs_vfs_readpage_complete(struct page *page, void *data,
 int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page)
 {
 	int ret;
-	const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+	const struct v9fs_inode *v9inode = V9FS_I(inode);
 
 	P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
-	if (!vcookie->fscache)
+	if (!v9inode->fscache)
 		return -ENOBUFS;
 
-	ret = fscache_read_or_alloc_page(vcookie->fscache,
+	ret = fscache_read_or_alloc_page(v9inode->fscache,
 					 page,
 					 v9fs_vfs_readpage_complete,
 					 NULL,
@@ -418,13 +361,13 @@ int __v9fs_readpages_from_fscache(struct inode *inode,
 				  unsigned *nr_pages)
 {
 	int ret;
-	const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+	const struct v9fs_inode *v9inode = V9FS_I(inode);
 
 	P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages);
-	if (!vcookie->fscache)
+	if (!v9inode->fscache)
 		return -ENOBUFS;
 
-	ret = fscache_read_or_alloc_pages(vcookie->fscache,
+	ret = fscache_read_or_alloc_pages(v9inode->fscache,
 					  mapping, pages, nr_pages,
 					  v9fs_vfs_readpage_complete,
 					  NULL,
@@ -453,10 +396,10 @@ int __v9fs_readpages_from_fscache(struct inode *inode,
 void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page)
 {
 	int ret;
-	const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+	const struct v9fs_inode *v9inode = V9FS_I(inode);
 
 	P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
-	ret = fscache_write_page(vcookie->fscache, page, GFP_KERNEL);
+	ret = fscache_write_page(v9inode->fscache, page, GFP_KERNEL);
 	P9_DPRINTK(P9_DEBUG_FSC, "ret =  %d", ret);
 	if (ret != 0)
 		v9fs_uncache_page(inode, page);
@@ -467,8 +410,8 @@ void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page)
  */
 void __v9fs_fscache_wait_on_page_write(struct inode *inode, struct page *page)
 {
-	const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+	const struct v9fs_inode *v9inode = V9FS_I(inode);
 	P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
 	if (PageFsCache(page))
-		fscache_wait_on_page_write(vcookie->fscache, page);
+		fscache_wait_on_page_write(v9inode->fscache, page);
 }
diff --git a/fs/9p/cache.h b/fs/9p/cache.h
index ec16fcdc3a6..049507a5b01 100644
--- a/fs/9p/cache.h
+++ b/fs/9p/cache.h
@@ -25,20 +25,6 @@
 #include <linux/fscache.h>
 #include <linux/spinlock.h>
 
-extern struct kmem_cache *vcookie_cache;
-
-struct v9fs_cookie {
-	spinlock_t lock;
-	struct inode inode;
-	struct fscache_cookie *fscache;
-	struct p9_qid *qid;
-};
-
-static inline struct v9fs_cookie *v9fs_inode2cookie(const struct inode *inode)
-{
-	return container_of(inode, struct v9fs_cookie, inode);
-}
-
 extern struct fscache_netfs v9fs_cache_netfs;
 extern const struct fscache_cookie_def v9fs_cache_session_index_def;
 extern const struct fscache_cookie_def v9fs_cache_inode_index_def;
@@ -66,21 +52,6 @@ extern int __v9fs_readpages_from_fscache(struct inode *inode,
 extern void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page);
 extern void __v9fs_fscache_wait_on_page_write(struct inode *inode,
 					      struct page *page);
-/**
- * v9fs_cache_register - Register v9fs file system with the cache
- */
-static inline int v9fs_cache_register(void)
-{
-	return __v9fs_cache_register();
-}
-
-/**
- * v9fs_cache_unregister - Unregister v9fs from the cache
- */
-static inline void v9fs_cache_unregister(void)
-{
-	__v9fs_cache_unregister();
-}
 
 static inline int v9fs_fscache_release_page(struct page *page,
 					    gfp_t gfp)
@@ -117,18 +88,18 @@ static inline void v9fs_readpage_to_fscache(struct inode *inode,
 
 static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
 {
-	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
-	fscache_uncache_page(vcookie->fscache, page);
+	struct v9fs_inode *v9inode = V9FS_I(inode);
+	fscache_uncache_page(v9inode->fscache, page);
 	BUG_ON(PageFsCache(page));
 }
 
-static inline void v9fs_vcookie_set_qid(struct inode *inode,
+static inline void v9fs_fscache_set_key(struct inode *inode,
 					struct p9_qid *qid)
 {
-	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
-	spin_lock(&vcookie->lock);
-	vcookie->qid = qid;
-	spin_unlock(&vcookie->lock);
+	struct v9fs_inode *v9inode = V9FS_I(inode);
+	spin_lock(&v9inode->fscache_lock);
+	v9inode->fscache_key = qid;
+	spin_unlock(&v9inode->fscache_lock);
 }
 
 static inline void v9fs_fscache_wait_on_page_write(struct inode *inode,
@@ -139,13 +110,6 @@ static inline void v9fs_fscache_wait_on_page_write(struct inode *inode,
 
 #else /* CONFIG_9P_FSCACHE */
 
-static inline int v9fs_cache_register(void)
-{
-	return 1;
-}
-
-static inline void v9fs_cache_unregister(void) {}
-
 static inline int v9fs_fscache_release_page(struct page *page,
 					    gfp_t gfp) {
 	return 1;
@@ -174,10 +138,6 @@ static inline void v9fs_readpage_to_fscache(struct inode *inode,
 static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
 {}
 
-static inline void v9fs_vcookie_set_qid(struct inode *inode,
-					struct p9_qid *qid)
-{}
-
 static inline void v9fs_fscache_wait_on_page_write(struct inode *inode,
 						   struct page *page)
 {
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 738be8f6994..c82b017f51f 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -39,6 +39,7 @@
 
 static DEFINE_SPINLOCK(v9fs_sessionlist_lock);
 static LIST_HEAD(v9fs_sessionlist);
+struct kmem_cache *v9fs_inode_cache;
 
 /*
  * Option Parsing (code inspired by NFS code)
@@ -481,6 +482,63 @@ static void v9fs_sysfs_cleanup(void)
 	kobject_put(v9fs_kobj);
 }
 
+static void v9fs_inode_init_once(void *foo)
+{
+	struct v9fs_inode *v9inode = (struct v9fs_inode *)foo;
+#ifdef CONFIG_9P_FSCACHE
+	v9inode->fscache = NULL;
+	v9inode->fscache_key = NULL;
+#endif
+	inode_init_once(&v9inode->vfs_inode);
+}
+
+/**
+ * v9fs_init_inode_cache - initialize a cache for 9P
+ * Returns 0 on success.
+ */
+static int v9fs_init_inode_cache(void)
+{
+	v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache",
+					  sizeof(struct v9fs_inode),
+					  0, (SLAB_RECLAIM_ACCOUNT|
+					      SLAB_MEM_SPREAD),
+					  v9fs_inode_init_once);
+	if (!v9fs_inode_cache)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/**
+ * v9fs_destroy_inode_cache - destroy the cache of 9P inode
+ *
+ */
+static void v9fs_destroy_inode_cache(void)
+{
+	kmem_cache_destroy(v9fs_inode_cache);
+}
+
+static int v9fs_cache_register(void)
+{
+	int ret;
+	ret = v9fs_init_inode_cache();
+	if (ret < 0)
+		return ret;
+#ifdef CONFIG_9P_FSCACHE
+	return fscache_register_netfs(&v9fs_cache_netfs);
+#else
+	return ret;
+#endif
+}
+
+static void v9fs_cache_unregister(void)
+{
+	v9fs_destroy_inode_cache();
+#ifdef CONFIG_9P_FSCACHE
+	fscache_unregister_netfs(&v9fs_cache_netfs);
+#endif
+}
+
 /**
  * init_v9fs - Initialize module
  *
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 21bd803a3cc..ce59d151206 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -116,6 +116,20 @@ struct v9fs_session_info {
 	struct p9_fid *root_fid; /* Used for file system sync */
 };
 
+struct v9fs_inode {
+#ifdef CONFIG_9P_FSCACHE
+	spinlock_t fscache_lock;
+	struct fscache_cookie *fscache;
+	struct p9_qid *fscache_key;
+#endif
+	struct inode vfs_inode;
+};
+
+static inline struct v9fs_inode *V9FS_I(const struct inode *inode)
+{
+	return container_of(inode, struct v9fs_inode, vfs_inode);
+}
+
 struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
 									char *);
 extern void v9fs_session_close(struct v9fs_session_info *v9ses);
@@ -129,16 +143,15 @@ extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			struct inode *new_dir, struct dentry *new_dentry);
 extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd,
 			void *p);
-extern struct inode *v9fs_inode(struct v9fs_session_info *v9ses,
-			struct p9_fid *fid,
-			struct super_block *sb);
-
+extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses,
+					 struct p9_fid *fid,
+					 struct super_block *sb);
 extern const struct inode_operations v9fs_dir_inode_operations_dotl;
 extern const struct inode_operations v9fs_file_inode_operations_dotl;
 extern const struct inode_operations v9fs_symlink_inode_operations_dotl;
-extern struct inode *v9fs_inode_dotl(struct v9fs_session_info *v9ses,
-			struct p9_fid *fid,
-			struct super_block *sb);
+extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses,
+					      struct p9_fid *fid,
+					      struct super_block *sb);
 
 /* other default globals */
 #define V9FS_PORT	564
@@ -163,7 +176,7 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
 }
 
 /**
- * v9fs_inode_from_fid - Helper routine to populate an inode by
+ * v9fs_get_inode_from_fid - Helper routine to populate an inode by
  * issuing a attribute request
  * @v9ses: session information
  * @fid: fid to issue attribute request for
@@ -171,11 +184,11 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
  *
  */
 static inline struct inode *
-v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
-				struct super_block *sb)
+v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+			struct super_block *sb)
 {
 	if (v9fs_proto_dotl(v9ses))
-		return v9fs_inode_dotl(v9ses, fid, sb);
+		return v9fs_inode_from_fid_dotl(v9ses, fid, sb);
 	else
-		return v9fs_inode(v9ses, fid, sb);
+		return v9fs_inode_from_fid(v9ses, fid, sb);
 }
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index ab72e66c2e0..ed9fd00566f 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -47,12 +47,10 @@ extern const struct dentry_operations v9fs_dentry_operations;
 extern const struct dentry_operations v9fs_cached_dentry_operations;
 extern const struct file_operations v9fs_cached_file_operations;
 extern const struct file_operations v9fs_cached_file_operations_dotl;
+extern struct kmem_cache *v9fs_inode_cache;
 
-#ifdef CONFIG_9P_FSCACHE
 struct inode *v9fs_alloc_inode(struct super_block *sb);
 void v9fs_destroy_inode(struct inode *inode);
-#endif
-
 struct inode *v9fs_get_inode(struct super_block *sb, int mode);
 int v9fs_init_inode(struct v9fs_session_info *v9ses,
 		    struct inode *inode, int mode);
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 304904b4061..411c70a88b3 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -203,26 +203,23 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
 	wstat->extension = NULL;
 }
 
-#ifdef CONFIG_9P_FSCACHE
 /**
  * v9fs_alloc_inode - helper function to allocate an inode
- * This callback is executed before setting up the inode so that we
- * can associate a vcookie with each inode.
  *
  */
-
 struct inode *v9fs_alloc_inode(struct super_block *sb)
 {
-	struct v9fs_cookie *vcookie;
-	vcookie = (struct v9fs_cookie *)kmem_cache_alloc(vcookie_cache,
-							 GFP_KERNEL);
-	if (!vcookie)
+	struct v9fs_inode *v9inode;
+	v9inode = (struct v9fs_inode *)kmem_cache_alloc(v9fs_inode_cache,
+							GFP_KERNEL);
+	if (!v9inode)
 		return NULL;
-
-	vcookie->fscache = NULL;
-	vcookie->qid = NULL;
-	spin_lock_init(&vcookie->lock);
-	return &vcookie->inode;
+#ifdef CONFIG_9P_FSCACHE
+	v9inode->fscache = NULL;
+	v9inode->fscache_key = NULL;
+	spin_lock_init(&v9inode->fscache_lock);
+#endif
+	return &v9inode->vfs_inode;
 }
 
 /**
@@ -234,14 +231,13 @@ static void v9fs_i_callback(struct rcu_head *head)
 {
 	struct inode *inode = container_of(head, struct inode, i_rcu);
 	INIT_LIST_HEAD(&inode->i_dentry);
-	kmem_cache_free(vcookie_cache, v9fs_inode2cookie(inode));
+	kmem_cache_free(v9fs_inode_cache, V9FS_I(inode));
 }
 
 void v9fs_destroy_inode(struct inode *inode)
 {
 	call_rcu(&inode->i_rcu, v9fs_i_callback);
 }
-#endif
 
 int v9fs_init_inode(struct v9fs_session_info *v9ses,
 		    struct inode *inode, int mode)
@@ -459,7 +455,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb,
 
 	v9fs_stat2inode(st, inode, sb);
 #ifdef CONFIG_9P_FSCACHE
-	v9fs_vcookie_set_qid(ret, &st->qid);
+	v9fs_fscache_set_key(inode, &st->qid);
 	v9fs_cache_inode_get_cookie(inode);
 #endif
 	unlock_new_inode(inode);
@@ -472,8 +468,8 @@ error:
 }
 
 struct inode *
-v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid,
-	   struct super_block *sb)
+v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+		    struct super_block *sb)
 {
 	struct p9_wstat *st;
 	struct inode *inode = NULL;
@@ -572,7 +568,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	}
 
 	/* instantiate inode and assign the unopened fid to the dentry */
-	inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+	inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
@@ -747,7 +743,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 		return ERR_PTR(result);
 	}
 
-	inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+	inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
 	if (IS_ERR(inode)) {
 		result = PTR_ERR(inode);
 		inode = NULL;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index a2a3d7edb17..21523f27f5d 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -113,7 +113,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
 
 	v9fs_stat2inode_dotl(st, inode);
 #ifdef CONFIG_9P_FSCACHE
-	v9fs_vcookie_set_qid(inode, &st->qid);
+	v9fs_fscache_set_key(inode, &st->qid);
 	v9fs_cache_inode_get_cookie(inode);
 #endif
 	retval = v9fs_get_acl(inode, fid);
@@ -130,8 +130,8 @@ error:
 }
 
 struct inode *
-v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
-	struct super_block *sb)
+v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+			 struct super_block *sb)
 {
 	struct p9_stat_dotl *st;
 	struct inode *inode = NULL;
@@ -228,7 +228,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 		fid = NULL;
 		goto error;
 	}
-	inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+	inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
@@ -341,7 +341,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
 			goto error;
 		}
 
-		inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+		inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
 		if (IS_ERR(inode)) {
 			err = PTR_ERR(inode);
 			P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
@@ -588,7 +588,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
 		}
 
 		/* instantiate inode and assign the unopened fid to dentry */
-		inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+		inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
 		if (IS_ERR(inode)) {
 			err = PTR_ERR(inode);
 			P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
@@ -747,7 +747,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
 			goto error;
 		}
 
-		inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
+		inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
 		if (IS_ERR(inode)) {
 			err = PTR_ERR(inode);
 			P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 6985e2a7a11..6c812d1c58f 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -292,10 +292,8 @@ static int v9fs_sync_fs(struct super_block *sb, int wait)
 }
 
 static const struct super_operations v9fs_super_ops = {
-#ifdef CONFIG_9P_FSCACHE
 	.alloc_inode = v9fs_alloc_inode,
 	.destroy_inode = v9fs_destroy_inode,
-#endif
 	.statfs = simple_statfs,
 	.evict_inode = v9fs_evict_inode,
 	.show_options = generic_show_options,
@@ -303,10 +301,8 @@ static const struct super_operations v9fs_super_ops = {
 };
 
 static const struct super_operations v9fs_super_ops_dotl = {
-#ifdef CONFIG_9P_FSCACHE
 	.alloc_inode = v9fs_alloc_inode,
 	.destroy_inode = v9fs_destroy_inode,
-#endif
 	.sync_fs = v9fs_sync_fs,
 	.statfs = v9fs_statfs,
 	.evict_inode = v9fs_evict_inode,
-- 
cgit v1.2.3


From 6b39f6d22fbf67cf795c105b4d67c64e9c352ca4 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Feb 2011 17:04:03 +0530
Subject: fs/9p: Move writeback fid to v9fs_inode

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/v9fs.h           |  1 +
 fs/9p/vfs_addr.c       | 14 +++++++++-----
 fs/9p/vfs_file.c       | 12 ++++++++----
 fs/9p/vfs_inode.c      | 21 +++++++++++++--------
 fs/9p/vfs_inode_dotl.c | 17 ++++++++++-------
 5 files changed, 41 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index ce59d151206..71cab8e3a57 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -122,6 +122,7 @@ struct v9fs_inode {
 	struct fscache_cookie *fscache;
 	struct p9_qid *fscache_key;
 #endif
+	struct p9_fid *writeback_fid;
 	struct inode vfs_inode;
 };
 
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 566684ce55e..ee455526ca5 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -163,8 +163,10 @@ static int v9fs_vfs_writepage_locked(struct page *page)
 	int retval, len;
 	loff_t offset, size;
 	mm_segment_t old_fs;
+	struct v9fs_inode *v9inode;
 	struct inode *inode = page->mapping->host;
 
+	v9inode = V9FS_I(inode);
 	size = i_size_read(inode);
 	if (page->index == size >> PAGE_CACHE_SHIFT)
 		len = size & ~PAGE_CACHE_MASK;
@@ -178,11 +180,11 @@ static int v9fs_vfs_writepage_locked(struct page *page)
 
 	old_fs = get_fs();
 	set_fs(get_ds());
-	/* We should have i_private always set */
-	BUG_ON(!inode->i_private);
+	/* We should have writeback_fid always set */
+	BUG_ON(!v9inode->writeback_fid);
 
 	retval = v9fs_file_write_internal(inode,
-					  (struct p9_fid *)inode->i_private,
+					  v9inode->writeback_fid,
 					  (__force const char __user *)buffer,
 					  len, &offset, 0);
 	if (retval > 0)
@@ -274,23 +276,25 @@ static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
 {
 	int retval = 0;
 	struct page *page;
+	struct v9fs_inode *v9inode;
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 	struct inode *inode = mapping->host;
 
+	v9inode = V9FS_I(inode);
 start:
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page) {
 		retval = -ENOMEM;
 		goto out;
 	}
-	BUG_ON(!inode->i_private);
+	BUG_ON(!v9inode->writeback_fid);
 	if (PageUptodate(page))
 		goto out;
 
 	if (len == PAGE_CACHE_SIZE)
 		goto out;
 
-	retval = v9fs_fid_readpage(inode->i_private, page);
+	retval = v9fs_fid_readpage(v9inode->writeback_fid, page);
 	page_cache_release(page);
 	if (!retval)
 		goto start;
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index f7b571ddf99..98c4307a1f0 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -56,11 +56,13 @@ static const struct vm_operations_struct v9fs_file_vm_ops;
 int v9fs_file_open(struct inode *inode, struct file *file)
 {
 	int err;
+	struct v9fs_inode *v9inode;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid;
 	int omode;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file);
+	v9inode = V9FS_I(inode);
 	v9ses = v9fs_inode2v9ses(inode);
 	if (v9fs_proto_dotl(v9ses))
 		omode = file->f_flags;
@@ -88,9 +90,9 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 	}
 
 	file->private_data = fid;
-	if (v9ses->cache && !inode->i_private) {
+	if (v9ses->cache && !v9inode->writeback_fid) {
 		/*
-		 * clone a fid and add it to inode->i_private
+		 * clone a fid and add it to writeback_fid
 		 * we do it during open time instead of
 		 * page dirty time via write_begin/page_mkwrite
 		 * because we want write after unlink usecase
@@ -101,7 +103,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 			err = PTR_ERR(fid);
 			goto out_error;
 		}
-		inode->i_private = (void *) fid;
+		v9inode->writeback_fid = (void *) fid;
 	}
 #ifdef CONFIG_9P_FSCACHE
 	if (v9ses->cache)
@@ -550,6 +552,7 @@ v9fs_file_mmap(struct file *file, struct vm_area_struct *vma)
 static int
 v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+	struct v9fs_inode *v9inode;
 	struct page *page = vmf->page;
 	struct file *filp = vma->vm_file;
 	struct inode *inode = filp->f_path.dentry->d_inode;
@@ -558,9 +561,10 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	P9_DPRINTK(P9_DEBUG_VFS, "page %p fid %lx\n",
 		   page, (unsigned long)filp->private_data);
 
+	v9inode = V9FS_I(inode);
 	/* make sure the cache has finished storing the page */
 	v9fs_fscache_wait_on_page_write(inode, page);
-	BUG_ON(!inode->i_private);
+	BUG_ON(!v9inode->writeback_fid);
 	lock_page(page);
 	if (page->mapping != inode->i_mapping)
 		goto out_unlock;
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 411c70a88b3..fdc086d0744 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -219,6 +219,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb)
 	v9inode->fscache_key = NULL;
 	spin_lock_init(&v9inode->fscache_lock);
 #endif
+	v9inode->writeback_fid = NULL;
 	return &v9inode->vfs_inode;
 }
 
@@ -414,6 +415,8 @@ error:
  */
 void v9fs_evict_inode(struct inode *inode)
 {
+	struct v9fs_inode *v9inode = V9FS_I(inode);
+
 	truncate_inode_pages(inode->i_mapping, 0);
 	end_writeback(inode);
 	filemap_fdatawrite(inode->i_mapping);
@@ -421,10 +424,10 @@ void v9fs_evict_inode(struct inode *inode)
 #ifdef CONFIG_9P_FSCACHE
 	v9fs_cache_inode_put_cookie(inode);
 #endif
-	/* clunk the fid stashed in inode->i_private */
-	if (inode->i_private) {
-		p9_client_clunk((struct p9_fid *)inode->i_private);
-		inode->i_private = NULL;
+	/* clunk the fid stashed in writeback_fid */
+	if (v9inode->writeback_fid) {
+		p9_client_clunk(v9inode->writeback_fid);
+		v9inode->writeback_fid = NULL;
 	}
 }
 
@@ -607,9 +610,10 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	int err;
 	u32 perm;
 	int flags;
+	struct file *filp;
+	struct v9fs_inode *v9inode;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid, *inode_fid;
-	struct file *filp;
 
 	err = 0;
 	fid = NULL;
@@ -631,9 +635,10 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 
 	/* if we are opening a file, assign the open fid to the file */
 	if (nd && nd->flags & LOOKUP_OPEN) {
-		if (v9ses->cache && !dentry->d_inode->i_private) {
+		v9inode = V9FS_I(dentry->d_inode);
+		if (v9ses->cache && !v9inode->writeback_fid) {
 			/*
-			 * clone a fid and add it to inode->i_private
+			 * clone a fid and add it to writeback_fid
 			 * we do it during open time instead of
 			 * page dirty time via write_begin/page_mkwrite
 			 * because we want write after unlink usecase
@@ -644,7 +649,7 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 				err = PTR_ERR(inode_fid);
 				goto error;
 			}
-			dentry->d_inode->i_private = (void *) inode_fid;
+			v9inode->writeback_fid = (void *) inode_fid;
 		}
 		filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
 		if (IS_ERR(filp)) {
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 21523f27f5d..984594123ab 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -159,16 +159,17 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 		struct nameidata *nd)
 {
 	int err = 0;
-	char *name = NULL;
 	gid_t gid;
 	int flags;
 	mode_t mode;
-	struct v9fs_session_info *v9ses;
-	struct p9_fid *fid = NULL;
-	struct p9_fid *dfid, *ofid, *inode_fid;
+	char *name = NULL;
 	struct file *filp;
 	struct p9_qid qid;
 	struct inode *inode;
+	struct p9_fid *fid = NULL;
+	struct v9fs_inode *v9inode;
+	struct p9_fid *dfid, *ofid, *inode_fid;
+	struct v9fs_session_info *v9ses;
 	struct posix_acl *pacl = NULL, *dacl = NULL;
 
 	v9ses = v9fs_inode2v9ses(dir);
@@ -241,9 +242,11 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 
 	/* Now set the ACL based on the default value */
 	v9fs_set_create_acl(dentry, dacl, pacl);
-	if (v9ses->cache && !inode->i_private) {
+
+	v9inode = V9FS_I(inode);
+	if (v9ses->cache && !v9inode->writeback_fid) {
 		/*
-		 * clone a fid and add it to inode->i_private
+		 * clone a fid and add it to writeback_fid
 		 * we do it during open time instead of
 		 * page dirty time via write_begin/page_mkwrite
 		 * because we want write after unlink usecase
@@ -254,7 +257,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 			err = PTR_ERR(inode_fid);
 			goto error;
 		}
-		inode->i_private = (void *) inode_fid;
+		v9inode->writeback_fid = (void *) inode_fid;
 	}
 	/* Since we are opening a file, assign the open fid to the file */
 	filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
-- 
cgit v1.2.3


From 6b365604ca8c1b67f0d5a816cf5acf2dc3edc229 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Feb 2011 17:04:03 +0530
Subject: fs/9p: set default readahead pages in cached mode

We want to enable readahead in cached mode

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_super.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 6c812d1c58f..f84d625b879 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -86,6 +86,8 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 	} else
 		sb->s_op = &v9fs_super_ops;
 	sb->s_bdi = &v9ses->bdi;
+	if (v9ses->cache)
+		sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_CACHE_SIZE;
 
 	sb->s_flags = flags | MS_ACTIVE | MS_DIRSYNC | MS_NOATIME;
 	if (!v9ses->cache)
-- 
cgit v1.2.3


From fa6ea16160c72c448e2728dab4b6b0a133fdfc98 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Feb 2011 17:04:04 +0530
Subject: fs/9p: Fix inode i_size update in file_write

Only update inode i_size when we write towards end of file.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_file.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 98c4307a1f0..3cff25e759e 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -440,6 +440,7 @@ v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid,
 			 loff_t *offset, int invalidate)
 {
 	int n;
+	loff_t i_size;
 	size_t total = 0;
 	struct p9_client *clnt;
 	loff_t origin = *offset;
@@ -464,8 +465,11 @@ v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid,
 			invalidate_inode_pages2_range(inode->i_mapping,
 						      pg_start, pg_end);
 		*offset += total;
-		i_size_write(inode, i_size_read(inode) + total);
-		inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9;
+		i_size = i_size_read(inode);
+		if (*offset > i_size) {
+			inode_add_bytes(inode, *offset - i_size);
+			i_size_write(inode, *offset);
+		}
 	}
 	if (n < 0)
 		return n;
-- 
cgit v1.2.3


From e959b54901e835f062ac8d44107bc543b66f0364 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Feb 2011 17:04:04 +0530
Subject: fs/9p: Add direct IO support in cached mode

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_addr.c |   5 ++-
 fs/9p/vfs_file.c | 117 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 116 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index ee455526ca5..2524e4cbb8e 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -254,8 +254,9 @@ static int v9fs_launder_page(struct page *page)
  * with an error.
  *
  */
-ssize_t v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
-		loff_t pos, unsigned long nr_segs)
+static ssize_t
+v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
+	       loff_t pos, unsigned long nr_segs)
 {
 	/*
 	 * FIXME
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 3cff25e759e..78bcb97c342 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -579,15 +579,124 @@ out_unlock:
 	return VM_FAULT_NOPAGE;
 }
 
+/* O_DIRECT read path: flush the cached range, then call v9fs_file_read(). */
+static ssize_t
+v9fs_direct_read(struct file *filp, char __user *udata, size_t count,
+		 loff_t *offsetp)
+{
+	struct address_space *mapping = filp->f_mapping;
+	loff_t pos = *offsetp;
+
+	if (!count)
+		return 0;
+	/*
+	 * Only when the read starts below the current i_size: write back and
+	 * wait on [pos, pos + count - 1] first.  The flush result is not
+	 * checked.
+	 */
+	if (pos < i_size_read(mapping->host))
+		filemap_write_and_wait_range(mapping, pos, pos + count - 1);
+
+	return v9fs_file_read(filp, udata, count, offsetp);
+}
+
+/**
+ * v9fs_cached_file_read - cached-mode read, dispatching on O_DIRECT
+ * @filp: file pointer to read
+ * @data: user buffer to read data into
+ * @count: size of buffer
+ * @offset: offset at which to read data
+ *
+ */
+static ssize_t
+v9fs_cached_file_read(struct file *filp, char __user *data, size_t count,
+		      loff_t *offset)
+{
+	if (!(filp->f_flags & O_DIRECT))
+		return do_sync_read(filp, data, count, offset);
+	return v9fs_direct_read(filp, data, count, offset);
+}
+
+static ssize_t
+v9fs_direct_write(struct file *filp, const char __user * data,
+		  size_t count, loff_t *offsetp)
+{
+	loff_t offset;
+	ssize_t retval;
+	struct inode *inode;
+	struct address_space *mapping;
+
+	offset = *offsetp;
+	mapping = filp->f_mapping;
+	inode = mapping->host;
+	if (!count)
+		return 0;
+
+	mutex_lock(&inode->i_mutex);
+	retval = filemap_write_and_wait_range(mapping, offset,
+					      offset + count - 1);
+	if (retval)
+		goto err_out;
+	/*
+	 * After a direct write, buffered reads must refetch the new data, so
+	 * drop the clean cached pages covering the region we are about to
+	 * write.  This is done *before* the write so that, if invalidation
+	 * fails with -EBUSY, we can still fall back to a buffered write.
+	 */
+	if (mapping->nrpages) {
+		pgoff_t pg_start = offset >> PAGE_CACHE_SHIFT;
+		pgoff_t pg_end   = (offset + count - 1) >> PAGE_CACHE_SHIFT;
+
+		retval = invalidate_inode_pages2_range(mapping,
+							pg_start, pg_end);
+		/*
+		 * -EBUSY (a page could not be invalidated): fall back
+		 * to buffered write.  Any other error is returned.
+		 */
+		if (retval) {
+			if (retval == -EBUSY)
+				goto buff_write;
+			goto err_out;
+		}
+	}
+	retval = v9fs_file_write(filp, data, count, offsetp);
+err_out:
+	mutex_unlock(&inode->i_mutex);
+	return retval;
+
+buff_write:
+	mutex_unlock(&inode->i_mutex);
+	return do_sync_write(filp, data, count, offsetp);
+}
+
+/**
+ * v9fs_cached_file_write - cached-mode write, dispatching on O_DIRECT
+ * @filp: file pointer to write
+ * @data: user buffer holding the data to write
+ * @count: size of buffer
+ * @offset: offset at which to write data
+ *
+ * O_DIRECT files use v9fs_direct_write(); all others use do_sync_write().
+ */
+static ssize_t
+v9fs_cached_file_write(struct file *filp, const char __user * data,
+		       size_t count, loff_t *offset)
+{
+	if (!(filp->f_flags & O_DIRECT))
+		return do_sync_write(filp, data, count, offset);
+	return v9fs_direct_write(filp, data, count, offset);
+}
+
 static const struct vm_operations_struct v9fs_file_vm_ops = {
 	.fault = filemap_fault,
 	.page_mkwrite = v9fs_vm_page_mkwrite,
 };
 
+
 const struct file_operations v9fs_cached_file_operations = {
 	.llseek = generic_file_llseek,
-	.read = do_sync_read,
-	.write = do_sync_write,
+	.read = v9fs_cached_file_read,
+	.write = v9fs_cached_file_write,
 	.aio_read = generic_file_aio_read,
 	.aio_write = generic_file_aio_write,
 	.open = v9fs_file_open,
@@ -599,8 +708,8 @@ const struct file_operations v9fs_cached_file_operations = {
 
 const struct file_operations v9fs_cached_file_operations_dotl = {
 	.llseek = generic_file_llseek,
-	.read = do_sync_read,
-	.write = do_sync_write,
+	.read = v9fs_cached_file_read,
+	.write = v9fs_cached_file_write,
 	.aio_read = generic_file_aio_read,
 	.aio_write = generic_file_aio_write,
 	.open = v9fs_file_open,
-- 
cgit v1.2.3


From edd73cf544849c10e0fbc294ca2171f6c28b4093 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Feb 2011 17:04:05 +0530
Subject: fs/9p: Add drop_inode 9p callback

We want to immediately drop the inode in non cached mode

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_super.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'fs')

diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index f84d625b879..3fdaeb3c6ed 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -293,6 +293,20 @@ static int v9fs_sync_fs(struct super_block *sb, int wait)
 	return p9_client_sync_fs(v9ses->root_fid);
 }
 
+/* Without a cache always drop the inode; else use generic_drop_inode(). */
+static int v9fs_drop_inode(struct inode *inode)
+{
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
+
+	/*
+	 * In non cached mode always drop the inode because we want the
+	 * inode attributes to always match those on the server.
+	 */
+	if (!v9ses->cache)
+		return 1;
+	return generic_drop_inode(inode);
+}
+
 static const struct super_operations v9fs_super_ops = {
 	.alloc_inode = v9fs_alloc_inode,
 	.destroy_inode = v9fs_destroy_inode,
@@ -307,6 +321,7 @@ static const struct super_operations v9fs_super_ops_dotl = {
 	.destroy_inode = v9fs_destroy_inode,
 	.sync_fs = v9fs_sync_fs,
 	.statfs = v9fs_statfs,
+	.drop_inode = v9fs_drop_inode,
 	.evict_inode = v9fs_evict_inode,
 	.show_options = generic_show_options,
 	.umount_begin = v9fs_umount_begin,
-- 
cgit v1.2.3


From b271ec47bc11deacb6a0373ee29965ab628e74b2 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Feb 2011 17:04:05 +0530
Subject: fs/9p: Update link count correctly on different file system
 operations

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_inode.c      | 32 ++++++++++++++++++++++++++++----
 fs/9p/vfs_inode_dotl.c |  2 +-
 2 files changed, 29 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index fdc086d0744..334ad12a7bb 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -510,8 +510,17 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
 		return PTR_ERR(v9fid);
 
 	retval = p9_client_remove(v9fid);
-	if (!retval)
-		drop_nlink(file_inode);
+	if (!retval) {
+		/*
+		 * directories on unlink should have zero
+		 * link count
+		 */
+		if (rmdir) {
+			clear_nlink(file_inode);
+			drop_nlink(dir);
+		} else
+			drop_nlink(file_inode);
+	}
 	return retval;
 }
 
@@ -697,7 +706,8 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (IS_ERR(fid)) {
 		err = PTR_ERR(fid);
 		fid = NULL;
-	}
+	} else
+		inc_nlink(dir);
 
 	if (fid)
 		p9_client_clunk(fid);
@@ -809,6 +819,7 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		struct inode *new_dir, struct dentry *new_dentry)
 {
 	struct inode *old_inode;
+	struct inode *new_inode;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *oldfid;
 	struct p9_fid *olddirfid;
@@ -819,6 +830,7 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	P9_DPRINTK(P9_DEBUG_VFS, "\n");
 	retval = 0;
 	old_inode = old_dentry->d_inode;
+	new_inode = new_dentry->d_inode;
 	v9ses = v9fs_inode2v9ses(old_inode);
 	oldfid = v9fs_fid_lookup(old_dentry);
 	if (IS_ERR(oldfid))
@@ -859,9 +871,21 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	retval = p9_client_wstat(oldfid, &wstat);
 
 clunk_newdir:
-	if (!retval)
+	if (!retval) {
+		if (new_inode) {
+			if (S_ISDIR(new_inode->i_mode))
+				clear_nlink(new_inode);
+			else
+				drop_nlink(new_inode);
+		}
+		if (S_ISDIR(old_inode->i_mode)) {
+			if (!new_inode)
+				inc_nlink(new_dir);
+			drop_nlink(old_dir);
+		}
 		/* successful rename */
 		d_move(old_dentry, new_dentry);
+	}
 	up_write(&v9ses->rename_sem);
 	p9_client_clunk(newdirfid);
 
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 984594123ab..c6d9677dcb6 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -371,7 +371,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
 	}
 	/* Now set the ACL based on the default value */
 	v9fs_set_create_acl(dentry, dacl, pacl);
-
+	inc_nlink(dir);
 error:
 	if (fid)
 		p9_client_clunk(fid);
-- 
cgit v1.2.3


From 0e432703aac3b187dd88d81ac23282f7b1c71002 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Feb 2011 17:04:06 +0530
Subject: fs/9p: Initialize root inode number for dotl

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 3fdaeb3c6ed..1d99b185bed 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -168,7 +168,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
 			retval = PTR_ERR(st);
 			goto release_sb;
 		}
-
+		root->d_inode->i_ino = v9fs_qid2ino(&st->qid);
 		v9fs_stat2inode_dotl(st, root->d_inode);
 		kfree(st);
 	} else {
-- 
cgit v1.2.3


From b3cbea03b4edbd6b625dbf813bf8c30c22213cb7 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Feb 2011 17:04:06 +0530
Subject: fs/9p: Add support for marking inode attribute invalid

With cached mode some of the file system operation result
in updating inode attributes (ctime). Add support for
marking inode attribute invalid in such cases so that
we fetch the updated inode attribute on dentry revalidation.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/v9fs.h           |  4 ++++
 fs/9p/v9fs_vfs.h       | 10 ++++++++++
 fs/9p/vfs_dentry.c     | 34 ++++++++++++++++++++++++++++++++++
 fs/9p/vfs_inode.c      | 29 +++++++++++++++++++++++++++++
 fs/9p/vfs_inode_dotl.c | 27 +++++++++++++++++++++++++++
 5 files changed, 104 insertions(+)

(limited to 'fs')

diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 71cab8e3a57..cfdc05527f8 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -116,12 +116,16 @@ struct v9fs_session_info {
 	struct p9_fid *root_fid; /* Used for file system sync */
 };
 
+/* cache_validity flags */
+#define V9FS_INO_INVALID_ATTR 0x01
+
 struct v9fs_inode {
 #ifdef CONFIG_9P_FSCACHE
 	spinlock_t fscache_lock;
 	struct fscache_cookie *fscache;
 	struct p9_qid *fscache_key;
 #endif
+	unsigned int cache_validity;
 	struct p9_fid *writeback_fid;
 	struct inode vfs_inode;
 };
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index ed9fd00566f..591807f2018 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -70,4 +70,14 @@ int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *);
 int v9fs_file_fsync_dotl(struct file *filp, int datasync);
 ssize_t v9fs_file_write_internal(struct inode *, struct p9_fid *,
 				 const char __user *, size_t, loff_t *, int);
+int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode);
+int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode);
+/* Set V9FS_INO_INVALID_ATTR in the cache_validity flags of @inode. */
+static inline void v9fs_invalidate_inode_attr(struct inode *inode)
+{
+	struct v9fs_inode *v9inode = V9FS_I(inode);
+
+	v9inode->cache_validity |= V9FS_INO_INVALID_ATTR;
+}
+
 #define P9_LOCK_TIMEOUT (30*HZ)
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index a4ae4be5ab5..b6a3b9f7fe4 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -100,7 +100,41 @@ static void v9fs_dentry_release(struct dentry *dentry)
 	}
 }
 
+/*
+ * d_revalidate: refetch stale attributes; > 0 valid, 0 invalid, < 0 error.
+ */
+static int v9fs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	struct p9_fid *fid;
+	struct inode *inode;
+	struct v9fs_inode *v9inode;
+
+	if (nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+	inode = dentry->d_inode;
+	if (!inode)
+		goto out_valid;
+	v9inode = V9FS_I(inode);
+	if (v9inode->cache_validity & V9FS_INO_INVALID_ATTR) {
+		int retval;
+		struct v9fs_session_info *v9ses;
+		fid = v9fs_fid_lookup(dentry);
+		if (IS_ERR(fid))
+			return PTR_ERR(fid);
+		v9ses = v9fs_inode2v9ses(inode);
+		if (v9fs_proto_dotl(v9ses))
+			retval = v9fs_refresh_inode_dotl(fid, inode);
+		else
+			retval = v9fs_refresh_inode(fid, inode);
+		if (retval < 0)	/* 0 means refreshed: report valid below */
+			return retval;
+	}
+out_valid:
+	return 1;
+}
+
 const struct dentry_operations v9fs_cached_dentry_operations = {
+	.d_revalidate = v9fs_lookup_revalidate,
 	.d_delete = v9fs_cached_dentry_delete,
 	.d_release = v9fs_dentry_release,
 };
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 334ad12a7bb..a28fe9fa20a 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -220,6 +220,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb)
 	spin_lock_init(&v9inode->fscache_lock);
 #endif
 	v9inode->writeback_fid = NULL;
+	v9inode->cache_validity = 0;
 	return &v9inode->vfs_inode;
 }
 
@@ -1010,6 +1011,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 	char tag_name[14];
 	unsigned int i_nlink;
 	struct v9fs_session_info *v9ses = sb->s_fs_info;
+	struct v9fs_inode *v9inode = V9FS_I(inode);
 
 	inode->i_nlink = 1;
 
@@ -1069,6 +1071,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 
 	/* not real number of blocks, but 512 byte ones ... */
 	inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9;
+	v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR;
 }
 
 /**
@@ -1323,6 +1326,32 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 	return retval;
 }
 
+/*
+ * Re-read the attributes of @inode via p9_client_stat() on @fid.
+ * Returns 0 on success or the error from p9_client_stat().
+ */
+int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
+{
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
+	struct p9_wstat *st = p9_client_stat(fid);
+	loff_t saved_size;
+
+	if (IS_ERR(st))
+		return PTR_ERR(st);
+
+	spin_lock(&inode->i_lock);
+	/* cached mode keeps the local i_size: we may have cached data */
+	saved_size = inode->i_size;
+	v9fs_stat2inode(st, inode, inode->i_sb);
+	if (v9ses->cache)
+		inode->i_size = saved_size;
+	spin_unlock(&inode->i_lock);
+
+	p9stat_free(st);
+	kfree(st);
+	return 0;
+}
+
 static const struct inode_operations v9fs_dir_inode_operations_dotu = {
 	.create = v9fs_vfs_create,
 	.lookup = v9fs_vfs_lookup,
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index c6d9677dcb6..5c04d66afb1 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -484,6 +484,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
 void
 v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
 {
+	struct v9fs_inode *v9inode = V9FS_I(inode);
 
 	if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
 		inode->i_atime.tv_sec = stat->st_atime_sec;
@@ -542,6 +543,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
 	/* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
 	 * because the inode structure does not have fields for them.
 	 */
+	v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR;
 }
 
 static int
@@ -822,6 +824,31 @@ ndset:
 	return NULL;
 }
 
+/*
+ * dotl variant: re-read the attributes of @inode via getattr on @fid.
+ * Returns 0 on success or the error from p9_client_getattr_dotl().
+ */
+int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
+{
+	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
+	struct p9_stat_dotl *st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
+	loff_t saved_size;
+
+	if (IS_ERR(st))
+		return PTR_ERR(st);
+
+	spin_lock(&inode->i_lock);
+	/* cached mode keeps the local i_size: we may have cached data */
+	saved_size = inode->i_size;
+	v9fs_stat2inode_dotl(st, inode);
+	if (v9ses->cache)
+		inode->i_size = saved_size;
+	spin_unlock(&inode->i_lock);
+
+	kfree(st);
+	return 0;
+}
+
 const struct inode_operations v9fs_dir_inode_operations_dotl = {
 	.create = v9fs_vfs_create_dotl,
 	.lookup = v9fs_vfs_lookup,
-- 
cgit v1.2.3


From 3bc86de317f9e0edbbae142c9a5490a2eca517ce Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Feb 2011 17:04:07 +0530
Subject: fs/9p: mark inode attribute invalid on rename, unlink and setattr

rename, unlink and setattr can result in update of inode attribute.
So mark the cached copy invalid

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_inode.c      | 4 ++++
 fs/9p/vfs_inode_dotl.c | 1 +
 2 files changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index a28fe9fa20a..3e3ffe3ad1a 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -521,6 +521,7 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
 			drop_nlink(dir);
 		} else
 			drop_nlink(file_inode);
+		v9fs_invalidate_inode_attr(file_inode);
 	}
 	return retval;
 }
@@ -884,6 +885,8 @@ clunk_newdir:
 				inc_nlink(new_dir);
 			drop_nlink(old_dir);
 		}
+		v9fs_invalidate_inode_attr(old_inode);
+
 		/* successful rename */
 		d_move(old_dentry, new_dentry);
 	}
@@ -983,6 +986,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
 	if (retval < 0)
 		return retval;
 
+	v9fs_invalidate_inode_attr(dentry->d_inode);
 	if ((iattr->ia_valid & ATTR_SIZE) &&
 	    iattr->ia_size != i_size_read(dentry->d_inode)) {
 		retval = vmtruncate(dentry->d_inode, iattr->ia_size);
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 5c04d66afb1..8d5f7e32863 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -455,6 +455,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
 	if (retval < 0)
 		return retval;
 
+	v9fs_invalidate_inode_attr(dentry->d_inode);
 	if ((iattr->ia_valid & ATTR_SIZE) &&
 	    iattr->ia_size != i_size_read(dentry->d_inode)) {
 		retval = vmtruncate(dentry->d_inode, iattr->ia_size);
-- 
cgit v1.2.3


From 823fcfd42297acaf28892b3d2aabef2a121449c2 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Feb 2011 17:04:07 +0530
Subject: fs/9p: Add . and .. dentry revalidation flag

We need to revalidate . and .. entries also

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 1d99b185bed..09fd08d1606 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -332,5 +332,5 @@ struct file_system_type v9fs_fs_type = {
 	.mount = v9fs_mount,
 	.kill_sb = v9fs_kill_super,
 	.owner = THIS_MODULE,
-	.fs_flags = FS_RENAME_DOES_D_MOVE,
+	.fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT,
 };
-- 
cgit v1.2.3


From d28c61f0e08aab9e3a2d3430e75f97937c5fe5fd Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Feb 2011 17:04:08 +0530
Subject: fs/9p: Mark directory inode invalid for many directory inode
 operations

On a successful directory operation we would have changed the directory
inode attributes. So mark them invalid

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_inode.c      | 23 ++++++++++++++++-------
 fs/9p/vfs_inode_dotl.c | 23 ++++++++++++++---------
 2 files changed, 30 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 3e3ffe3ad1a..c072bb97e95 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -499,8 +499,8 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
 {
 	int retval;
-	struct inode *file_inode;
 	struct p9_fid *v9fid;
+	struct inode *file_inode;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file,
 		rmdir);
@@ -521,7 +521,9 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
 			drop_nlink(dir);
 		} else
 			drop_nlink(file_inode);
+
 		v9fs_invalidate_inode_attr(file_inode);
+		v9fs_invalidate_inode_attr(dir);
 	}
 	return retval;
 }
@@ -644,6 +646,7 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 		goto error;
 	}
 
+	v9fs_invalidate_inode_attr(dir);
 	/* if we are opening a file, assign the open fid to the file */
 	if (nd && nd->flags & LOOKUP_OPEN) {
 		v9inode = V9FS_I(dentry->d_inode);
@@ -697,8 +700,8 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
 	int err;
 	u32 perm;
-	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid;
+	struct v9fs_session_info *v9ses;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
 	err = 0;
@@ -708,8 +711,10 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (IS_ERR(fid)) {
 		err = PTR_ERR(fid);
 		fid = NULL;
-	} else
+	} else {
 		inc_nlink(dir);
+		v9fs_invalidate_inode_attr(dir);
+	}
 
 	if (fid)
 		p9_client_clunk(fid);
@@ -820,6 +825,7 @@ int
 v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		struct inode *new_dir, struct dentry *new_dentry)
 {
+	int retval;
 	struct inode *old_inode;
 	struct inode *new_inode;
 	struct v9fs_session_info *v9ses;
@@ -827,7 +833,6 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct p9_fid *olddirfid;
 	struct p9_fid *newdirfid;
 	struct p9_wstat wstat;
-	int retval;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "\n");
 	retval = 0;
@@ -886,6 +891,8 @@ clunk_newdir:
 			drop_nlink(old_dir);
 		}
 		v9fs_invalidate_inode_attr(old_inode);
+		v9fs_invalidate_inode_attr(old_dir);
+		v9fs_invalidate_inode_attr(new_dir);
 
 		/* successful rename */
 		d_move(old_dentry, new_dentry);
@@ -1208,8 +1215,8 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
 	int mode, const char *extension)
 {
 	u32 perm;
-	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid;
+	struct v9fs_session_info *v9ses;
 
 	v9ses = v9fs_inode2v9ses(dir);
 	if (!v9fs_proto_dotu(v9ses)) {
@@ -1223,6 +1230,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(fid))
 		return PTR_ERR(fid);
 
+	v9fs_invalidate_inode_attr(dir);
 	p9_client_clunk(fid);
 	return 0;
 }
@@ -1259,8 +1267,8 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
 	      struct dentry *dentry)
 {
 	int retval;
-	struct p9_fid *oldfid;
 	char *name;
+	struct p9_fid *oldfid;
 
 	P9_DPRINTK(P9_DEBUG_VFS,
 		" %lu,%s,%s\n", dir->i_ino, dentry->d_name.name,
@@ -1279,7 +1287,8 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
 	sprintf(name, "%d\n", oldfid->fid);
 	retval = v9fs_vfs_mkspecial(dir, dentry, P9_DMLINK, name);
 	__putname(name);
-
+	if (!retval)
+		v9fs_invalidate_inode_attr(dir);
 clunk_fid:
 	p9_client_clunk(oldfid);
 	return retval;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 8d5f7e32863..4d4c70e0708 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -220,6 +220,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 				err);
 		goto error;
 	}
+	v9fs_invalidate_inode_attr(dir);
 
 	/* instantiate inode and assign the unopened fid to the dentry */
 	fid = p9_client_walk(dfid, 1, &name, 1);
@@ -372,6 +373,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
 	/* Now set the ACL based on the default value */
 	v9fs_set_create_acl(dentry, dacl, pacl);
 	inc_nlink(dir);
+	v9fs_invalidate_inode_attr(dir);
 error:
 	if (fid)
 		p9_client_clunk(fid);
@@ -551,14 +553,14 @@ static int
 v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
 		const char *symname)
 {
-	struct v9fs_session_info *v9ses;
-	struct p9_fid *dfid;
-	struct p9_fid *fid = NULL;
-	struct inode *inode;
-	struct p9_qid qid;
-	char *name;
 	int err;
 	gid_t gid;
+	char *name;
+	struct p9_qid qid;
+	struct inode *inode;
+	struct p9_fid *dfid;
+	struct p9_fid *fid = NULL;
+	struct v9fs_session_info *v9ses;
 
 	name = (char *) dentry->d_name.name;
 	P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
@@ -582,6 +584,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
 		goto error;
 	}
 
+	v9fs_invalidate_inode_attr(dir);
 	if (v9ses->cache) {
 		/* Now walk from the parent so we can get an unopened fid. */
 		fid = p9_client_walk(dfid, 1, &name, 1);
@@ -636,10 +639,10 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
 		struct dentry *dentry)
 {
 	int err;
-	struct p9_fid *dfid, *oldfid;
 	char *name;
-	struct v9fs_session_info *v9ses;
 	struct dentry *dir_dentry;
+	struct p9_fid *dfid, *oldfid;
+	struct v9fs_session_info *v9ses;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
 			dir->i_ino, old_dentry->d_name.name,
@@ -664,6 +667,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
 		return err;
 	}
 
+	v9fs_invalidate_inode_attr(dir);
 	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
 		/* Get the latest stat info from server. */
 		struct p9_fid *fid;
@@ -700,12 +704,12 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
 		dev_t rdev)
 {
 	int err;
+	gid_t gid;
 	char *name;
 	mode_t mode;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid = NULL, *dfid = NULL;
 	struct inode *inode;
-	gid_t gid;
 	struct p9_qid qid;
 	struct dentry *dir_dentry;
 	struct posix_acl *dacl = NULL, *pacl = NULL;
@@ -742,6 +746,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
 	if (err < 0)
 		goto error;
 
+	v9fs_invalidate_inode_attr(dir);
 	/* instantiate inode and assign the unopened fid to the dentry */
 	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
 		fid = p9_client_walk(dfid, 1, &name, 1);
-- 
cgit v1.2.3


From 23b08e97f2c0d68c2a46a11e4fd8a4686d7351ee Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Feb 2011 17:04:08 +0530
Subject: fs/9p: Workaround vfs rename rehash bug

This is similar to what ceph, ocfs2 and nfs do:
http://kerneltrap.org/mailarchive/linux-fsdevel/2008/4/18/1498534

Maybe we should get the vfs fixed.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_inode.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index c072bb97e95..8e9d60345bc 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -884,6 +884,11 @@ clunk_newdir:
 				clear_nlink(new_inode);
 			else
 				drop_nlink(new_inode);
+			/*
+			 * Work around vfs rename rehash bug with
+			 * FS_RENAME_DOES_D_MOVE
+			 */
+			v9fs_invalidate_inode_attr(new_inode);
 		}
 		if (S_ISDIR(old_inode->i_mode)) {
 			if (!new_inode)
-- 
cgit v1.2.3


From e0459f57b8b3bbabf6f11f73da4d17abb0c159de Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Feb 2011 17:04:09 +0530
Subject: fs/9p: Prevent multiple inclusion of same header

Add the necessary #ifndef/#endif blocks to avoid multiple inclusion of the same headers

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/fid.h      | 4 +++-
 fs/9p/v9fs.h     | 4 ++++
 fs/9p/v9fs_vfs.h | 6 ++++--
 3 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index c058f1c7656..bb0b6e7f58f 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -19,7 +19,8 @@
  *  Boston, MA  02111-1301  USA
  *
  */
-
+#ifndef FS_9P_FID_H
+#define FS_9P_FID_H
 #include <linux/list.h>
 
 /**
@@ -46,3 +47,4 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry);
 struct p9_fid *v9fs_fid_clone(struct dentry *dentry);
 int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid);
 struct p9_fid *v9fs_writeback_fid(struct dentry *dentry);
+#endif
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index cfdc05527f8..bd8496db135 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -20,6 +20,9 @@
  *  Boston, MA  02111-1301  USA
  *
  */
+#ifndef FS_9P_V9FS_H
+#define FS_9P_V9FS_H
+
 #include <linux/backing-dev.h>
 
 /**
@@ -197,3 +200,4 @@ v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 	else
 		return v9fs_inode_from_fid(v9ses, fid, sb);
 }
+#endif
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 591807f2018..4014160903a 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -20,6 +20,8 @@
  *  Boston, MA  02111-1301  USA
  *
  */
+#ifndef FS_9P_V9FS_VFS_H
+#define FS_9P_V9FS_VFS_H
 
 /* plan9 semantics are that created files are implicitly opened.
  * But linux semantics are that you call create, then open.
@@ -36,6 +38,7 @@
  * unlink calls remove, which is an implicit clunk. So we have to track
  * that kind of thing so that we don't try to clunk a dead fid.
  */
+#define P9_LOCK_TIMEOUT (30*HZ)
 
 extern struct file_system_type v9fs_fs_type;
 extern const struct address_space_operations v9fs_addr_operations;
@@ -79,5 +82,4 @@ static inline void v9fs_invalidate_inode_attr(struct inode *inode)
 	v9inode->cache_validity |= V9FS_INO_INVALID_ATTR;
 	return;
 }
-
-#define P9_LOCK_TIMEOUT (30*HZ)
+#endif
-- 
cgit v1.2.3


From c06c066a083aa0a336d6b2b1ed502dd0b8488ac7 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Feb 2011 17:04:09 +0530
Subject: fs/9p: Properly update inode attributes on link

With caching enabled, we need to make sure we don't
update inode->i_size via stat2inode because we could
have dirty data which is not yet written to the server

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_inode.c      |  4 +++-
 fs/9p/vfs_inode_dotl.c | 10 +---------
 2 files changed, 4 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 8e9d60345bc..524d255a574 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1292,8 +1292,10 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
 	sprintf(name, "%d\n", oldfid->fid);
 	retval = v9fs_vfs_mkspecial(dir, dentry, P9_DMLINK, name);
 	__putname(name);
-	if (!retval)
+	if (!retval) {
+		v9fs_refresh_inode(oldfid, old_dentry->d_inode);
 		v9fs_invalidate_inode_attr(dir);
+	}
 clunk_fid:
 	p9_client_clunk(oldfid);
 	return retval;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 4d4c70e0708..81bb4c2a0b8 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -671,19 +671,11 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
 	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
 		/* Get the latest stat info from server. */
 		struct p9_fid *fid;
-		struct p9_stat_dotl *st;
-
 		fid = v9fs_fid_lookup(old_dentry);
 		if (IS_ERR(fid))
 			return PTR_ERR(fid);
 
-		st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
-		if (IS_ERR(st))
-			return PTR_ERR(st);
-
-		v9fs_stat2inode_dotl(st, old_dentry->d_inode);
-
-		kfree(st);
+		v9fs_refresh_inode_dotl(fid, old_dentry->d_inode);
 	}
 	ihold(old_dentry->d_inode);
 	d_instantiate(dentry, old_dentry->d_inode);
-- 
cgit v1.2.3


From f10fc50f1adaf40fdd5da6dd154ecb464b468e2f Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Feb 2011 17:04:10 +0530
Subject: fs/9p: call vmtruncate before setattr 9p opeation

We need to call vmtruncate before the 9p setattr operation; otherwise we
could write back some dirty pages between the setattr with ATTR_SIZE and the
vmtruncate, causing some truncated pages to be written back to the server.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_inode.c      | 10 ++++------
 fs/9p/vfs_inode_dotl.c |  9 ++++-----
 2 files changed, 8 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 524d255a574..2555776920a 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -993,18 +993,16 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
 		if (iattr->ia_valid & ATTR_GID)
 			wstat.n_gid = iattr->ia_gid;
 	}
-
-	retval = p9_client_wstat(fid, &wstat);
-	if (retval < 0)
-		return retval;
-
-	v9fs_invalidate_inode_attr(dentry->d_inode);
 	if ((iattr->ia_valid & ATTR_SIZE) &&
 	    iattr->ia_size != i_size_read(dentry->d_inode)) {
 		retval = vmtruncate(dentry->d_inode, iattr->ia_size);
 		if (retval)
 			return retval;
 	}
+	retval = p9_client_wstat(fid, &wstat);
+	if (retval < 0)
+		return retval;
+	v9fs_invalidate_inode_attr(dentry->d_inode);
 
 	setattr_copy(dentry->d_inode, iattr);
 	mark_inode_dirty(dentry->d_inode);
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 81bb4c2a0b8..bcd0ee2e549 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -453,17 +453,16 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
 	if (IS_ERR(fid))
 		return PTR_ERR(fid);
 
-	retval = p9_client_setattr(fid, &p9attr);
-	if (retval < 0)
-		return retval;
-
-	v9fs_invalidate_inode_attr(dentry->d_inode);
 	if ((iattr->ia_valid & ATTR_SIZE) &&
 	    iattr->ia_size != i_size_read(dentry->d_inode)) {
 		retval = vmtruncate(dentry->d_inode, iattr->ia_size);
 		if (retval)
 			return retval;
 	}
+	retval = p9_client_setattr(fid, &p9attr);
+	if (retval < 0)
+		return retval;
+	v9fs_invalidate_inode_attr(dentry->d_inode);
 
 	setattr_copy(dentry->d_inode, iattr);
 	mark_inode_dirty(dentry->d_inode);
-- 
cgit v1.2.3


From 3dc5436aa5719be029bbc2bef0e5cb0312e4ff88 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Feb 2011 17:04:11 +0530
Subject: fs/9p: Writeback dirty data before setattr

Changing file attributes can result in making the file read-only,
so flush the dirty pages before doing that.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_inode.c      | 4 ++++
 fs/9p/vfs_inode_dotl.c | 4 ++++
 2 files changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 2555776920a..8a2c232f708 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -999,6 +999,10 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
 		if (retval)
 			return retval;
 	}
+	/* Write all dirty data */
+	if (S_ISREG(dentry->d_inode->i_mode))
+		filemap_write_and_wait(dentry->d_inode->i_mapping);
+
 	retval = p9_client_wstat(fid, &wstat);
 	if (retval < 0)
 		return retval;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index bcd0ee2e549..67c138e94fe 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -459,6 +459,10 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
 		if (retval)
 			return retval;
 	}
+	/* Write all dirty data */
+	if (S_ISREG(dentry->d_inode->i_mode))
+		filemap_write_and_wait(dentry->d_inode->i_mapping);
+
 	retval = p9_client_setattr(fid, &p9attr);
 	if (retval < 0)
 		return retval;
-- 
cgit v1.2.3


From 7c9e592e1f6a994d2903c9b055e488ec90f58159 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 28 Feb 2011 17:04:11 +0530
Subject: fs/9p: Make the writeback_fid owned by root

Changes to make sure writeback fid is owned by root

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/fid.c | 92 +++++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 56 insertions(+), 36 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 9d6a5d3bfe1..cd63e002d82 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -125,46 +125,17 @@ err_out:
 	return -ENOMEM;
 }
 
-/**
- * v9fs_fid_lookup - lookup for a fid, try to walk if not found
- * @dentry: dentry to look for fid in
- *
- * Look for a fid in the specified dentry for the current user.
- * If no fid is found, try to create one walking from a fid from the parent
- * dentry (if it has one), or the root dentry. If the user haven't accessed
- * the fs yet, attach now and walk from the root.
- */
-
-struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
+static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
+					       uid_t uid, int any)
 {
-	int i, n, l, clone, any, access;
-	u32 uid;
-	struct p9_fid *fid, *old_fid = NULL;
 	struct dentry *ds;
-	struct v9fs_session_info *v9ses;
 	char **wnames, *uname;
+	int i, n, l, clone, access;
+	struct v9fs_session_info *v9ses;
+	struct p9_fid *fid, *old_fid = NULL;
 
 	v9ses = v9fs_inode2v9ses(dentry->d_inode);
 	access = v9ses->flags & V9FS_ACCESS_MASK;
-	switch (access) {
-	case V9FS_ACCESS_SINGLE:
-	case V9FS_ACCESS_USER:
-	case V9FS_ACCESS_CLIENT:
-		uid = current_fsuid();
-		any = 0;
-		break;
-
-	case V9FS_ACCESS_ANY:
-		uid = v9ses->uid;
-		any = 1;
-		break;
-
-	default:
-		uid = ~0;
-		any = 0;
-		break;
-	}
-
 	fid = v9fs_fid_find(dentry, uid, any);
 	if (fid)
 		return fid;
@@ -250,6 +221,45 @@ err_out:
 	return fid;
 }
 
+/**
+ * v9fs_fid_lookup - lookup for a fid, try to walk if not found
+ * @dentry: dentry to look for fid in
+ *
+ * Look for a fid in the specified dentry for the current user.
+ * If no fid is found, try to create one walking from a fid from the parent
+ * dentry (if it has one), or the root dentry. If the user haven't accessed
+ * the fs yet, attach now and walk from the root.
+ */
+
+struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
+{
+	uid_t uid;
+	int  any, access;
+	struct v9fs_session_info *v9ses;
+
+	v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	access = v9ses->flags & V9FS_ACCESS_MASK;
+	switch (access) {
+	case V9FS_ACCESS_SINGLE:
+	case V9FS_ACCESS_USER:
+	case V9FS_ACCESS_CLIENT:
+		uid = current_fsuid();
+		any = 0;
+		break;
+
+	case V9FS_ACCESS_ANY:
+		uid = v9ses->uid;
+		any = 1;
+		break;
+
+	default:
+		uid = ~0;
+		any = 0;
+		break;
+	}
+	return v9fs_fid_lookup_with_uid(dentry, uid, any);
+}
+
 struct p9_fid *v9fs_fid_clone(struct dentry *dentry)
 {
 	struct p9_fid *fid, *ret;
@@ -262,13 +272,24 @@ struct p9_fid *v9fs_fid_clone(struct dentry *dentry)
 	return ret;
 }
 
+static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, uid_t uid)
+{
+	struct p9_fid *fid, *ret;
+
+	fid = v9fs_fid_lookup_with_uid(dentry, uid, 0);
+	if (IS_ERR(fid))
+		return fid;
+
+	ret = p9_client_walk(fid, 0, NULL, 1);
+	return ret;
+}
 
 struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)
 {
 	int err;
 	struct p9_fid *fid;
 
-	fid = v9fs_fid_clone(dentry);
+	fid = v9fs_fid_clone_with_uid(dentry, 0);
 	if (IS_ERR(fid))
 		goto error_out;
 	/*
@@ -276,7 +297,6 @@ struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)
 	 * dirty pages. We always request for the open fid in read-write
 	 * mode so that a partial page write which result in page
 	 * read can work.
-	 * FIXME!!: we should make the fid owned by uid = 0
 	 */
 	err = p9_client_open(fid, O_RDWR);
 	if (err < 0) {
-- 
cgit v1.2.3


From 09adc80c611bb8902daa8ccfe34dbbc009d6befe Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 4 Feb 2011 21:38:47 -0800
Subject: ceph: preserve I_COMPLETE across rename

d_move puts the renamed dentry at the end of d_subdirs, screwing with our
cached dentry directory offsets.  We were just clearing I_COMPLETE to avoid
any possibility of trouble.  However, assigning the renamed dentry an
offset at the end of the directory (to match its new d_subdirs position)
is sufficient to maintain correct behavior and hold onto I_COMPLETE.

This is especially important for workloads like rsync, which renames files
into place.  Before, we would lose I_COMPLETE and do MDS lookups for each
file.  With this patch we only talk to the MDS on create and rename.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 193bfa5e9cb..60456361e07 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1030,9 +1030,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 			dout("fill_trace doing d_move %p -> %p\n",
 			     req->r_old_dentry, dn);
 
-			/* d_move screws up d_subdirs order */
-			ceph_i_clear(dir, CEPH_I_COMPLETE);
-
 			d_move(req->r_old_dentry, dn);
 			dout(" src %p '%.*s' dst %p '%.*s'\n",
 			     req->r_old_dentry,
@@ -1044,12 +1041,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 			   rehashing bug in vfs_rename_dir */
 			ceph_invalidate_dentry_lease(dn);
 
-			/* take overwritten dentry's readdir offset */
-			dout("dn %p gets %p offset %lld (old offset %lld)\n",
-			     req->r_old_dentry, dn, ceph_dentry(dn)->offset,
+			/*
+			 * d_move() puts the renamed dentry at the end of
+			 * d_subdirs.  We need to assign it an appropriate
+			 * directory offset so we can behave when holding
+			 * I_COMPLETE.
+			 */
+			ceph_set_dentry_offset(req->r_old_dentry);
+			dout("dn %p gets new offset %lld\n", req->r_old_dentry, 
 			     ceph_dentry(req->r_old_dentry)->offset);
-			ceph_dentry(req->r_old_dentry)->offset =
-				ceph_dentry(dn)->offset;
 
 			dn = req->r_old_dentry;  /* use old_dentry */
 			in = dn->d_inode;
-- 
cgit v1.2.3


From 11a7b371b64ef39fc5fb1b6f2218eef7c4d035e3 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 29 Jan 2011 18:43:42 +0530
Subject: fs: allow AT_EMPTY_PATH in linkat(), limit that to
 CAP_DAC_READ_SEARCH

We don't want to allow different applications to create private hardlinks
using an fd passed to them via SCM_RIGHTS, so limit the use of a null relative
name in the linkat syscall to processes with CAP_DAC_READ_SEARCH.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/namei.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 9d4f3270017..c9b7f5b7e92 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2945,15 +2945,27 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
 	struct dentry *new_dentry;
 	struct nameidata nd;
 	struct path old_path;
+	int how = 0;
 	int error;
 	char *to;
 
-	if ((flags & ~AT_SYMLINK_FOLLOW) != 0)
+	if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
 		return -EINVAL;
+	/*
+	 * To use null names we require CAP_DAC_READ_SEARCH
+	 * This ensures that not everyone will be able to create
+	 * handlink using the passed filedescriptor.
+	 */
+	if (flags & AT_EMPTY_PATH) {
+		if (!capable(CAP_DAC_READ_SEARCH))
+			return -ENOENT;
+		how = LOOKUP_EMPTY;
+	}
+
+	if (flags & AT_SYMLINK_FOLLOW)
+		how |= LOOKUP_FOLLOW;
 
-	error = user_path_at(olddfd, oldname,
-			     flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
-			     &old_path);
+	error = user_path_at(olddfd, oldname, how, &old_path);
 	if (error)
 		return error;
 
-- 
cgit v1.2.3


From ce57dfc1791221ef58b6d6b8f5437fccefc4e187 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 13 Mar 2011 19:58:58 -0400
Subject: pull handling of one pathname component into a helper

new helper: walk_component().  Handles everything except symlinks;
returns negative on error, 0 on success and 1 on symlinks we decided
to follow.  Drops out of RCU mode on such symlinks.

link_path_walk() and do_last() switched to using that.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 123 +++++++++++++++++++++++++++----------------------------------
 1 file changed, 55 insertions(+), 68 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index c9b7f5b7e92..549bbe2f25c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -785,16 +785,11 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
  * Without that kind of total limit, nasty chains of consecutive
  * symlinks can cause almost arbitrarily long lookups. 
  */
-static inline int do_follow_link(struct inode *inode, struct path *path, struct nameidata *nd)
+static inline int do_follow_link(struct path *path, struct nameidata *nd)
 {
 	void *cookie;
 	int err = -ELOOP;
 
-	/* We drop rcu-walk here */
-	if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
-		return -ECHILD;
-	BUG_ON(inode != path->dentry->d_inode);
-
 	if (current->link_count >= MAX_NESTED_LINKS)
 		goto loop;
 	if (current->total_link_count >= 40)
@@ -1337,6 +1332,39 @@ static void terminate_walk(struct nameidata *nd)
 	}
 }
 
+static inline int walk_component(struct nameidata *nd, struct path *path,
+		struct qstr *name, int type, int follow)
+{
+	struct inode *inode;
+	int err;
+	/*
+	 * "." and ".." are special - ".." especially so because it has
+	 * to be able to know about the current root directory and
+	 * parent relationships.
+	 */
+	if (unlikely(type != LAST_NORM))
+		return handle_dots(nd, type);
+	err = do_lookup(nd, name, path, &inode);
+	if (unlikely(err)) {
+		terminate_walk(nd);
+		return err;
+	}
+	if (!inode) {
+		path_to_nameidata(path, nd);
+		terminate_walk(nd);
+		return -ENOENT;
+	}
+	if (unlikely(inode->i_op->follow_link) && follow) {
+		if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
+			return -ECHILD;
+		BUG_ON(inode != path->dentry->d_inode);
+		return 1;
+	}
+	path_to_nameidata(path, nd);
+	nd->inode = inode;
+	return 0;
+}
+
 /*
  * Name resolution.
  * This is the basic name resolution function, turning a pathname into
@@ -1361,7 +1389,6 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 
 	/* At this point we know we have a real path component. */
 	for(;;) {
-		struct inode *inode;
 		unsigned long hash;
 		struct qstr this;
 		unsigned int c;
@@ -1414,34 +1441,16 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		if (!*name)
 			goto last_with_slashes;
 
-		/*
-		 * "." and ".." are special - ".." especially so because it has
-		 * to be able to know about the current root directory and
-		 * parent relationships.
-		 */
-		if (unlikely(type != LAST_NORM)) {
-			if (handle_dots(nd, type))
-				return -ECHILD;
-			continue;
-		}
-
-		/* This does the actual lookups.. */
-		err = do_lookup(nd, &this, &next, &inode);
-		if (err)
-			break;
+		err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
+		if (err < 0)
+			return err;
 
-		if (inode && inode->i_op->follow_link) {
-			err = do_follow_link(inode, &next, nd);
+		if (err) {
+			err = do_follow_link(&next, nd);
 			if (err)
 				return err;
 			nd->inode = nd->path.dentry->d_inode;
-		} else {
-			path_to_nameidata(&next, nd);
-			nd->inode = inode;
 		}
-		err = -ENOENT;
-		if (!nd->inode)
-			break;
 		err = -ENOTDIR; 
 		if (!nd->inode->i_op->lookup)
 			break;
@@ -1453,36 +1462,27 @@ last_with_slashes:
 last_component:
 		/* Clear LOOKUP_CONTINUE iff it was previously unset */
 		nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
-		if (lookup_flags & LOOKUP_PARENT)
-			goto lookup_parent;
-		if (unlikely(type != LAST_NORM))
-			return handle_dots(nd, type);
-		err = do_lookup(nd, &this, &next, &inode);
-		if (err)
-			break;
-		if (inode && unlikely(inode->i_op->follow_link) &&
-		    (lookup_flags & LOOKUP_FOLLOW)) {
-			err = do_follow_link(inode, &next, nd);
+		if (lookup_flags & LOOKUP_PARENT) {
+			nd->last = this;
+			nd->last_type = type;
+			return 0;
+		}
+		err = walk_component(nd, &next, &this, type,
+					lookup_flags & LOOKUP_FOLLOW);
+		if (err < 0)
+			return err;
+		if (err) {
+			err = do_follow_link(&next, nd);
 			if (err)
 				return err;
 			nd->inode = nd->path.dentry->d_inode;
-		} else {
-			path_to_nameidata(&next, nd);
-			nd->inode = inode;
 		}
-		err = -ENOENT;
-		if (!nd->inode)
-			break;
 		if (lookup_flags & LOOKUP_DIRECTORY) {
 			err = -ENOTDIR; 
 			if (!nd->inode->i_op->lookup)
 				break;
 		}
 		return 0;
-lookup_parent:
-		nd->last = this;
-		nd->last_type = type;
-		return 0;
 	}
 	terminate_walk(nd);
 	return err;
@@ -2068,7 +2068,6 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	int want_write = 0;
 	int acc_mode = op->acc_mode;
 	struct file *filp;
-	struct inode *inode;
 	int error;
 
 	nd->flags &= ~LOOKUP_PARENT;
@@ -2111,24 +2110,12 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
 			symlink_ok = 1;
 		/* we _can_ be in RCU mode here */
-		error = do_lookup(nd, &nd->last, path, &inode);
-		if (error) {
-			terminate_walk(nd);
+		error = walk_component(nd, path, &nd->last, LAST_NORM,
+					!symlink_ok);
+		if (error < 0)
 			return ERR_PTR(error);
-		}
-		if (!inode) {
-			path_to_nameidata(path, nd);
-			terminate_walk(nd);
-			return ERR_PTR(-ENOENT);
-		}
-		if (unlikely(inode->i_op->follow_link && !symlink_ok)) {
-			/* We drop rcu-walk here */
-			if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
-				return ERR_PTR(-ECHILD);
+		if (error) /* symlink */
 			return NULL;
-		}
-		path_to_nameidata(path, nd);
-		nd->inode = inode;
 		/* sayonara */
 		if (nd->flags & LOOKUP_RCU) {
 			if (nameidata_drop_rcu_last(nd))
@@ -2137,7 +2124,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 
 		error = -ENOTDIR;
 		if (nd->flags & LOOKUP_DIRECTORY) {
-			if (!inode->i_op->lookup)
+			if (!nd->inode->i_op->lookup)
 				goto exit;
 		}
 		audit_inode(pathname, nd->path.dentry);
-- 
cgit v1.2.3


From b21041d0f72899ed815bd2cbf7275339c74737b6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 14 Mar 2011 20:01:51 -0400
Subject: update nd->inode in __do_follow_link() instead of after
 do_follow_link()

... and note that we only need to do it for LAST_BIND symlinks

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 549bbe2f25c..9e7b18a8be6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -768,7 +768,8 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
 			error = __vfs_follow_link(nd, s);
 		else if (nd->last_type == LAST_BIND) {
 			nd->flags |= LOOKUP_JUMPED;
-			if (nd->path.dentry->d_inode->i_op->follow_link) {
+			nd->inode = nd->path.dentry->d_inode;
+			if (nd->inode->i_op->follow_link) {
 				/* stepped on a _really_ weird one */
 				path_put(&nd->path);
 				error = -ELOOP;
@@ -1449,7 +1450,6 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 			err = do_follow_link(&next, nd);
 			if (err)
 				return err;
-			nd->inode = nd->path.dentry->d_inode;
 		}
 		err = -ENOTDIR; 
 		if (!nd->inode->i_op->lookup)
@@ -1475,7 +1475,6 @@ last_component:
 			err = do_follow_link(&next, nd);
 			if (err)
 				return err;
-			nd->inode = nd->path.dentry->d_inode;
 		}
 		if (lookup_flags & LOOKUP_DIRECTORY) {
 			err = -ENOTDIR; 
-- 
cgit v1.2.3


From bd92d7fed877ed1e6997e4f3f13dbcd872947653 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 14 Mar 2011 19:54:59 -0400
Subject: Make trailing symlink resolution in path_lookupat() iterative

Now the only caller of link_path_walk() that does *not* pass
LOOKUP_PARENT is do_follow_link()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 53 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 9e7b18a8be6..a3431639e16 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1588,12 +1588,23 @@ out_fail:
 	return retval;
 }
 
+static inline int lookup_last(struct nameidata *nd, struct path *path)
+{
+	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
+		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+
+	nd->flags &= ~LOOKUP_PARENT;
+	return walk_component(nd, path, &nd->last, nd->last_type,
+					nd->flags & LOOKUP_FOLLOW);
+}
+
 /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
 static int path_lookupat(int dfd, const char *name,
 				unsigned int flags, struct nameidata *nd)
 {
 	struct file *base = NULL;
-	int retval;
+	struct path path;
+	int err;
 
 	/*
 	 * Path walking is largely split up into 2 different synchronisation
@@ -1609,23 +1620,55 @@ static int path_lookupat(int dfd, const char *name,
 	 * be handled by restarting a traditional ref-walk (which will always
 	 * be able to complete).
 	 */
-	retval = path_init(dfd, name, flags, nd, &base);
+	err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base);
 
-	if (unlikely(retval))
-		return retval;
+	if (unlikely(err))
+		return err;
 
 	current->total_link_count = 0;
-	retval = link_path_walk(name, nd);
+	err = link_path_walk(name, nd);
+
+	if (!err && !(flags & LOOKUP_PARENT)) {
+		int count = 0;
+		err = lookup_last(nd, &path);
+		while (err > 0) {
+			void *cookie;
+			struct path link = path;
+			struct inode *inode = link.dentry->d_inode;
+
+			if (count++ > 32) {
+				path_put_conditional(&path, nd);
+				path_put(&nd->path);
+				err = -ELOOP;
+				break;
+			}
+			cond_resched();
+			nd->flags |= LOOKUP_PARENT;
+			err = __do_follow_link(&link, nd, &cookie);
+			if (!err)
+				err = lookup_last(nd, &path);
+			if (!IS_ERR(cookie) && inode->i_op->put_link)
+				inode->i_op->put_link(link.dentry, nd, cookie);
+			path_put(&link);
+		}
+	}
 
 	if (nd->flags & LOOKUP_RCU) {
 		/* went all way through without dropping RCU */
-		BUG_ON(retval);
+		BUG_ON(err);
 		if (nameidata_drop_rcu_last(nd))
-			retval = -ECHILD;
+			err = -ECHILD;
 	}
 
-	if (!retval)
-		retval = handle_reval_path(nd);
+	if (!err)
+		err = handle_reval_path(nd);
+
+	if (!err && nd->flags & LOOKUP_DIRECTORY) {
+		if (!nd->inode->i_op->lookup) {
+			path_put(&nd->path);
+			return -ENOTDIR;
+		}
+	}
 
 	if (base)
 		fput(base);
@@ -1634,7 +1677,7 @@ static int path_lookupat(int dfd, const char *name,
 		path_put(&nd->root);
 		nd->root.mnt = NULL;
 	}
-	return retval;
+	return err;
 }
 
 static int do_path_lookup(int dfd, const char *name,
-- 
cgit v1.2.3


From ce0525449da56444948c368f52e10f3db0465338 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 14 Mar 2011 21:28:04 -0400
Subject: simplify link_path_walk() tail

Now that link_path_walk() is called without LOOKUP_PARENT
only from do_follow_link(), we can simplify the checks in
last component handling.  First of all, checking if we'd
arrived to a directory is not needed - the caller will check
it anyway.  And LOOKUP_FOLLOW is guaranteed to be there,
since we only get to that place with nd->depth > 0.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index a3431639e16..9575d003969 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1467,8 +1467,7 @@ last_component:
 			nd->last_type = type;
 			return 0;
 		}
-		err = walk_component(nd, &next, &this, type,
-					lookup_flags & LOOKUP_FOLLOW);
+		err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
 		if (err < 0)
 			return err;
 		if (err) {
@@ -1476,11 +1475,6 @@ last_component:
 			if (err)
 				return err;
 		}
-		if (lookup_flags & LOOKUP_DIRECTORY) {
-			err = -ENOTDIR; 
-			if (!nd->inode->i_op->lookup)
-				break;
-		}
 		return 0;
 	}
 	terminate_walk(nd);
-- 
cgit v1.2.3


From b356379a020bb7197603118bb1cbc903963aa198 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 14 Mar 2011 21:54:55 -0400
Subject: Turn resolution of trailing symlinks iterative everywhere

The last remaining place (resolution of nested symlink) converted
to the loop of the same kind we have in path_lookupat() and
path_openat().

Note that we still *do* have a recursion in pathname resolution;
can't avoid it, really.  However, it's strictly for nested symlinks
now - i.e. ones in the middle of a pathname.

link_path_walk() has lost the tail now - it always walks everything
except the last component.

do_follow_link() renamed to nested_symlink() and moved down.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 104 +++++++++++++++++++++++++++++--------------------------------
 1 file changed, 50 insertions(+), 54 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 9575d003969..017c3fa3a08 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -779,40 +779,6 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
 	return error;
 }
 
-/*
- * This limits recursive symlink follows to 8, while
- * limiting consecutive symlinks to 40.
- *
- * Without that kind of total limit, nasty chains of consecutive
- * symlinks can cause almost arbitrarily long lookups. 
- */
-static inline int do_follow_link(struct path *path, struct nameidata *nd)
-{
-	void *cookie;
-	int err = -ELOOP;
-
-	if (current->link_count >= MAX_NESTED_LINKS)
-		goto loop;
-	if (current->total_link_count >= 40)
-		goto loop;
-	BUG_ON(nd->depth >= MAX_NESTED_LINKS);
-	cond_resched();
-	current->link_count++;
-	current->total_link_count++;
-	nd->depth++;
-	err = __do_follow_link(path, nd, &cookie);
-	if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link)
-		path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie);
-	path_put(path);
-	current->link_count--;
-	nd->depth--;
-	return err;
-loop:
-	path_put_conditional(path, nd);
-	path_put(&nd->path);
-	return err;
-}
-
 static int follow_up_rcu(struct path *path)
 {
 	struct vfsmount *parent;
@@ -1366,6 +1332,52 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
 	return 0;
 }
 
+/*
+ * This limits recursive symlink follows to 8, while
+ * limiting consecutive symlinks to 40.
+ *
+ * Without that kind of total limit, nasty chains of consecutive
+ * symlinks can cause almost arbitrarily long lookups.
+ */
+static inline int nested_symlink(struct path *path, struct nameidata *nd)
+{
+	int res;
+
+	BUG_ON(nd->depth >= MAX_NESTED_LINKS);
+	if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
+		path_put_conditional(path, nd);
+		path_put(&nd->path);
+		return -ELOOP;
+	}
+
+	nd->depth++;
+	current->link_count++;
+
+	do {
+		struct path link = *path;
+		void *cookie;
+		if (unlikely(current->total_link_count >= 40)) {
+			path_put_conditional(path, nd);
+			path_put(&nd->path);
+			res = -ELOOP;
+			break;
+		}
+		cond_resched();
+		current->total_link_count++;
+		res = __do_follow_link(&link, nd, &cookie);
+		if (!res)
+			res = walk_component(nd, path, &nd->last,
+					     nd->last_type, LOOKUP_FOLLOW);
+		if (!IS_ERR(cookie) && link.dentry->d_inode->i_op->put_link)
+			link.dentry->d_inode->i_op->put_link(link.dentry, nd, cookie);
+		path_put(&link);
+	} while (res > 0);
+
+	current->link_count--;
+	nd->depth--;
+	return res;
+}
+
 /*
  * Name resolution.
  * This is the basic name resolution function, turning a pathname into
@@ -1385,9 +1397,6 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 	if (!*name)
 		return 0;
 
-	if (nd->depth)
-		lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
-
 	/* At this point we know we have a real path component. */
 	for(;;) {
 		unsigned long hash;
@@ -1440,14 +1449,14 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 			goto last_component;
 		while (*++name == '/');
 		if (!*name)
-			goto last_with_slashes;
+			goto last_component;
 
 		err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
 		if (err < 0)
 			return err;
 
 		if (err) {
-			err = do_follow_link(&next, nd);
+			err = nested_symlink(&next, nd);
 			if (err)
 				return err;
 		}
@@ -1457,24 +1466,11 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		continue;
 		/* here ends the main loop */
 
-last_with_slashes:
-		lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
 last_component:
 		/* Clear LOOKUP_CONTINUE iff it was previously unset */
 		nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
-		if (lookup_flags & LOOKUP_PARENT) {
-			nd->last = this;
-			nd->last_type = type;
-			return 0;
-		}
-		err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
-		if (err < 0)
-			return err;
-		if (err) {
-			err = do_follow_link(&next, nd);
-			if (err)
-				return err;
-		}
+		nd->last = this;
+		nd->last_type = type;
 		return 0;
 	}
 	terminate_walk(nd);
-- 
cgit v1.2.3


From 574197e0de46a8a4db5c54ef7b65e43ffa8873a7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 14 Mar 2011 22:20:34 -0400
Subject: tidy the trailing symlinks traversal up

* pull the handling of current->total_link_count into
__do_follow_link()
* put the common "do ->put_link() if needed and path_put() the link"
  stuff into a helper (put_link(nd, link, cookie))
* rename __do_follow_link() to follow_link(), while we are at it

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 71 +++++++++++++++++++++++---------------------------------------
 1 file changed, 26 insertions(+), 45 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 017c3fa3a08..0a601cae23d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -737,14 +737,31 @@ static inline void path_to_nameidata(const struct path *path,
 	nd->path.dentry = path->dentry;
 }
 
+static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
+{
+	struct inode *inode = link->dentry->d_inode;
+	if (!IS_ERR(cookie) && inode->i_op->put_link)
+		inode->i_op->put_link(link->dentry, nd, cookie);
+	path_put(link);
+}
+
 static __always_inline int
-__do_follow_link(const struct path *link, struct nameidata *nd, void **p)
+follow_link(struct path *link, struct nameidata *nd, void **p)
 {
 	int error;
 	struct dentry *dentry = link->dentry;
 
 	BUG_ON(nd->flags & LOOKUP_RCU);
 
+	if (unlikely(current->total_link_count >= 40)) {
+		*p = ERR_PTR(-ELOOP); /* no ->put_link(), please */
+		path_put_conditional(link, nd);
+		path_put(&nd->path);
+		return -ELOOP;
+	}
+	cond_resched();
+	current->total_link_count++;
+
 	touch_atime(link->mnt, dentry);
 	nd_set_link(nd, NULL);
 
@@ -1356,21 +1373,12 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd)
 	do {
 		struct path link = *path;
 		void *cookie;
-		if (unlikely(current->total_link_count >= 40)) {
-			path_put_conditional(path, nd);
-			path_put(&nd->path);
-			res = -ELOOP;
-			break;
-		}
-		cond_resched();
-		current->total_link_count++;
-		res = __do_follow_link(&link, nd, &cookie);
+
+		res = follow_link(&link, nd, &cookie);
 		if (!res)
 			res = walk_component(nd, path, &nd->last,
 					     nd->last_type, LOOKUP_FOLLOW);
-		if (!IS_ERR(cookie) && link.dentry->d_inode->i_op->put_link)
-			link.dentry->d_inode->i_op->put_link(link.dentry, nd, cookie);
-		path_put(&link);
+		put_link(nd, &link, cookie);
 	} while (res > 0);
 
 	current->link_count--;
@@ -1619,27 +1627,15 @@ static int path_lookupat(int dfd, const char *name,
 	err = link_path_walk(name, nd);
 
 	if (!err && !(flags & LOOKUP_PARENT)) {
-		int count = 0;
 		err = lookup_last(nd, &path);
 		while (err > 0) {
 			void *cookie;
 			struct path link = path;
-			struct inode *inode = link.dentry->d_inode;
-
-			if (count++ > 32) {
-				path_put_conditional(&path, nd);
-				path_put(&nd->path);
-				err = -ELOOP;
-				break;
-			}
-			cond_resched();
 			nd->flags |= LOOKUP_PARENT;
-			err = __do_follow_link(&link, nd, &cookie);
+			err = follow_link(&link, nd, &cookie);
 			if (!err)
 				err = lookup_last(nd, &path);
-			if (!IS_ERR(cookie) && inode->i_op->put_link)
-				inode->i_op->put_link(link.dentry, nd, cookie);
-			path_put(&link);
+			put_link(nd, &link, cookie);
 		}
 	}
 
@@ -2298,7 +2294,6 @@ static struct file *path_openat(int dfd, const char *pathname,
 	struct file *base = NULL;
 	struct file *filp;
 	struct path path;
-	int count = 0;
 	int error;
 
 	filp = get_empty_filp();
@@ -2322,35 +2317,21 @@ static struct file *path_openat(int dfd, const char *pathname,
 	filp = do_last(nd, &path, op, pathname);
 	while (unlikely(!filp)) { /* trailing symlink */
 		struct path link = path;
-		struct inode *linki = link.dentry->d_inode;
 		void *cookie;
-		if (!(nd->flags & LOOKUP_FOLLOW) || count++ == 32) {
+		if (!(nd->flags & LOOKUP_FOLLOW)) {
 			path_put_conditional(&path, nd);
 			path_put(&nd->path);
 			filp = ERR_PTR(-ELOOP);
 			break;
 		}
-		/*
-		 * This is subtle. Instead of calling do_follow_link() we do
-		 * the thing by hands. The reason is that this way we have zero
-		 * link_count and path_walk() (called from ->follow_link)
-		 * honoring LOOKUP_PARENT.  After that we have the parent and
-		 * last component, i.e. we are in the same situation as after
-		 * the first path_walk().  Well, almost - if the last component
-		 * is normal we get its copy stored in nd->last.name and we will
-		 * have to putname() it when we are done. Procfs-like symlinks
-		 * just set LAST_BIND.
-		 */
 		nd->flags |= LOOKUP_PARENT;
 		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
-		error = __do_follow_link(&link, nd, &cookie);
+		error = follow_link(&link, nd, &cookie);
 		if (unlikely(error))
 			filp = ERR_PTR(error);
 		else
 			filp = do_last(nd, &path, op, pathname);
-		if (!IS_ERR(cookie) && linki->i_op->put_link)
-			linki->i_op->put_link(link.dentry, nd, cookie);
-		path_put(&link);
+		put_link(nd, &link, cookie);
 	}
 out:
 	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT))
-- 
cgit v1.2.3


From c826cb7dfce80512c26c984350077a25046bd215 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 15 Mar 2011 15:29:21 -0700
Subject: dcache.c: create helper function for duplicated functionality

This creates a helper function for the "try to ascend into the parent
directory" case, which was written out in triplicate before.  With all
the locking and subtle sequence number stuff, we really don't want to
duplicate that kind of code.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dcache.c | 88 ++++++++++++++++++++++++++-----------------------------------
 1 file changed, 37 insertions(+), 51 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 611ffe928c0..361882a14cc 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1011,6 +1011,34 @@ void shrink_dcache_for_umount(struct super_block *sb)
 	}
 }
 
+/*
+ * This tries to ascend one level of parenthood, but
+ * we can race with renaming, so we need to re-check
+ * the parenthood after dropping the lock and check
+ * that the sequence number still matches.
+ */
+static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq)
+{
+	struct dentry *new = old->d_parent;
+
+	rcu_read_lock();
+	spin_unlock(&old->d_lock);
+	spin_lock(&new->d_lock);
+
+	/*
+	 * might go back up the wrong parent if we have had a rename
+	 * or deletion
+	 */
+	if (new != old->d_parent ||
+		 (!locked && read_seqretry(&rename_lock, seq))) {
+		spin_unlock(&new->d_lock);
+		new = NULL;
+	}
+	rcu_read_unlock();
+	return new;
+}
+
+
 /*
  * Search for at least 1 mount point in the dentry's subdirs.
  * We descend to the next level whenever the d_subdirs
@@ -1066,24 +1094,10 @@ resume:
 	 * All done at this level ... ascend and resume the search.
 	 */
 	if (this_parent != parent) {
-		struct dentry *tmp;
-		struct dentry *child;
-
-		tmp = this_parent->d_parent;
-		rcu_read_lock();
-		spin_unlock(&this_parent->d_lock);
-		child = this_parent;
-		this_parent = tmp;
-		spin_lock(&this_parent->d_lock);
-		/* might go back up the wrong parent if we have had a rename
-		 * or deletion */
-		if (this_parent != child->d_parent ||
-			 (!locked && read_seqretry(&rename_lock, seq))) {
-			spin_unlock(&this_parent->d_lock);
-			rcu_read_unlock();
+		struct dentry *child = this_parent;
+		this_parent = try_to_ascend(this_parent, locked, seq);
+		if (!this_parent)
 			goto rename_retry;
-		}
-		rcu_read_unlock();
 		next = child->d_u.d_child.next;
 		goto resume;
 	}
@@ -1181,24 +1195,10 @@ resume:
 	 * All done at this level ... ascend and resume the search.
 	 */
 	if (this_parent != parent) {
-		struct dentry *tmp;
-		struct dentry *child;
-
-		tmp = this_parent->d_parent;
-		rcu_read_lock();
-		spin_unlock(&this_parent->d_lock);
-		child = this_parent;
-		this_parent = tmp;
-		spin_lock(&this_parent->d_lock);
-		/* might go back up the wrong parent if we have had a rename
-		 * or deletion */
-		if (this_parent != child->d_parent ||
-			(!locked && read_seqretry(&rename_lock, seq))) {
-			spin_unlock(&this_parent->d_lock);
-			rcu_read_unlock();
+		struct dentry *child = this_parent;
+		this_parent = try_to_ascend(this_parent, locked, seq);
+		if (!this_parent)
 			goto rename_retry;
-		}
-		rcu_read_unlock();
 		next = child->d_u.d_child.next;
 		goto resume;
 	}
@@ -2942,28 +2942,14 @@ resume:
 		spin_unlock(&dentry->d_lock);
 	}
 	if (this_parent != root) {
-		struct dentry *tmp;
-		struct dentry *child;
-
-		tmp = this_parent->d_parent;
+		struct dentry *child = this_parent;
 		if (!(this_parent->d_flags & DCACHE_GENOCIDE)) {
 			this_parent->d_flags |= DCACHE_GENOCIDE;
 			this_parent->d_count--;
 		}
-		rcu_read_lock();
-		spin_unlock(&this_parent->d_lock);
-		child = this_parent;
-		this_parent = tmp;
-		spin_lock(&this_parent->d_lock);
-		/* might go back up the wrong parent if we have had a rename
-		 * or deletion */
-		if (this_parent != child->d_parent ||
-			 (!locked && read_seqretry(&rename_lock, seq))) {
-			spin_unlock(&this_parent->d_lock);
-			rcu_read_unlock();
+		this_parent = try_to_ascend(this_parent, locked, seq);
+		if (!this_parent)
 			goto rename_retry;
-		}
-		rcu_read_unlock();
 		next = child->d_u.d_child.next;
 		goto resume;
 	}
-- 
cgit v1.2.3


From c83ce989cb5ff86575821992ea82c4df5c388ebc Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 15 Mar 2011 13:36:43 -0400
Subject: VFS: Fix the nfs sillyrename regression in kernel 2.6.38

The new vfs locking scheme introduced in 2.6.38 breaks NFS sillyrename
because the latter relies on being able to determine the parent
directory of the dentry in the ->iput() callback in order to send the
appropriate unlink rpc call.

Looking at the code that cares about races with dput(), there doesn't
seem to be anything that specifically uses d_parent as a test for
whether or not there is a race:
  - __d_lookup_rcu(), __d_lookup() all test for d_hashed() after d_parent
  - shrink_dcache_for_umount() is safe since nothing else can rearrange
    the dentries in that super block.
  - have_submount(), select_parent() and d_genocide() can test for a
    deletion if we set the DCACHE_DISCONNECTED flag when the dentry
    is removed from the parent's d_subdirs list.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@kernel.org (2.6.38, needs commit c826cb7dfce8 "dcache.c:
	create helper function for duplicated functionality" )
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dcache.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 361882a14cc..a39fe47c466 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -296,8 +296,12 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
 	__releases(parent->d_lock)
 	__releases(dentry->d_inode->i_lock)
 {
-	dentry->d_parent = NULL;
 	list_del(&dentry->d_u.d_child);
+	/*
+	 * Inform try_to_ascend() that we are no longer attached to the
+	 * dentry tree
+	 */
+	dentry->d_flags |= DCACHE_DISCONNECTED;
 	if (parent)
 		spin_unlock(&parent->d_lock);
 	dentry_iput(dentry);
@@ -1030,6 +1034,7 @@ static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq
 	 * or deletion
 	 */
 	if (new != old->d_parent ||
+		 (old->d_flags & DCACHE_DISCONNECTED) ||
 		 (!locked && read_seqretry(&rename_lock, seq))) {
 		spin_unlock(&new->d_lock);
 		new = NULL;
-- 
cgit v1.2.3


From 44cff8a9ee8a974f9e931df910688e7fc1f0b0f9 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Tue, 15 Mar 2011 22:09:55 +0000
Subject: Squashfs: handle corruption of directory structure

Handle the rare case where a directory metadata block is uncompressed and
corrupted, leading to a kernel oops in directory scanning (memcpy).
Normally corruption is detected at the decompression stage and dealt with
then, however, this will not happen if:

- metadata isn't compressed (users can optionally request no metadata
  compression), or
- the compressed metadata block was larger than the original, in which
  case the uncompressed version was used, or
- the data was corrupt after decompression

This patch fixes this by adding some sanity checks against known maximum
values.

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/dir.c   |  9 +++++++++
 fs/squashfs/namei.c | 12 ++++++++++++
 2 files changed, 21 insertions(+)

(limited to 'fs')

diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 0dc340aa2be..3f79cd1d0c1 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -172,6 +172,11 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
 		length += sizeof(dirh);
 
 		dir_count = le32_to_cpu(dirh.count) + 1;
+
+		/* dir_count should never be larger than 256 */
+		if (dir_count > 256)
+			goto failed_read;
+
 		while (dir_count--) {
 			/*
 			 * Read directory entry.
@@ -183,6 +188,10 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
 
 			size = le16_to_cpu(dire->size) + 1;
 
+			/* size should never be larger than SQUASHFS_NAME_LEN */
+			if (size > SQUASHFS_NAME_LEN)
+				goto failed_read;
+
 			err = squashfs_read_metadata(inode->i_sb, dire->name,
 					&block, &offset, size);
 			if (err < 0)
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 7a9464d08cf..5d922a6701a 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -176,6 +176,11 @@ static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry,
 		length += sizeof(dirh);
 
 		dir_count = le32_to_cpu(dirh.count) + 1;
+
+		/* dir_count should never be larger than 256 */
+		if (dir_count > 256)
+			goto data_error;
+
 		while (dir_count--) {
 			/*
 			 * Read directory entry.
@@ -187,6 +192,10 @@ static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry,
 
 			size = le16_to_cpu(dire->size) + 1;
 
+			/* size should never be larger than SQUASHFS_NAME_LEN */
+			if (size > SQUASHFS_NAME_LEN)
+				goto data_error;
+
 			err = squashfs_read_metadata(dir->i_sb, dire->name,
 					&block, &offset, size);
 			if (err < 0)
@@ -228,6 +237,9 @@ exit_lookup:
 	d_add(dentry, inode);
 	return ERR_PTR(0);
 
+data_error:
+	err = -EIO;
+
 read_failure:
 	ERROR("Unable to read directory block [%llx:%x]\n",
 		squashfs_i(dir)->start + msblk->directory_table,
-- 
cgit v1.2.3


From 0e794589e588a88d34e339feee50c72606fb21a7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 16 Mar 2011 02:45:02 -0400
Subject: fix follow_link() breakage

commit 574197e0de46a8a4db5c54ef7b65e43ffa8873a7 had a missing
piece, breaking the loop detection ;-/

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 0a601cae23d..b912b7abe74 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -753,9 +753,11 @@ follow_link(struct path *link, struct nameidata *nd, void **p)
 
 	BUG_ON(nd->flags & LOOKUP_RCU);
 
+	if (link->mnt == nd->path.mnt)
+		mntget(link->mnt);
+
 	if (unlikely(current->total_link_count >= 40)) {
 		*p = ERR_PTR(-ELOOP); /* no ->put_link(), please */
-		path_put_conditional(link, nd);
 		path_put(&nd->path);
 		return -ELOOP;
 	}
@@ -765,9 +767,6 @@ follow_link(struct path *link, struct nameidata *nd, void **p)
 	touch_atime(link->mnt, dentry);
 	nd_set_link(nd, NULL);
 
-	if (link->mnt == nd->path.mnt)
-		mntget(link->mnt);
-
 	error = security_inode_follow_link(link->dentry, nd);
 	if (error) {
 		*p = ERR_PTR(error); /* no ->put_link(), please */
-- 
cgit v1.2.3


From 73d9aec3fd212d7bf8af5aa1eca9c79f8a90ad5a Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Fri, 11 Mar 2011 15:39:09 +0200
Subject: UBIFS: allocate dump buffer on demand

Instead of using pre-allocated 'c->dbg->buf' buffer in
'dbg_dump_leb()', dynamically allocate it when needed. The intent
is to get rid of the pre-allocated 'c->dbg->buf' buffer and save
128KiB of RAM (or more if PEB size is larger). Indeed, currently we
allocate this memory even if the user never enables any self-check,
which is wasteful.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/debug.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 02c10dccdd6..c2e5c08a9d3 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -810,16 +810,24 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
 {
 	struct ubifs_scan_leb *sleb;
 	struct ubifs_scan_node *snod;
+	void *buf;
 
 	if (dbg_failure_mode)
 		return;
 
 	printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
 	       current->pid, lnum);
-	sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0);
+
+	buf = __vmalloc(c->leb_size, GFP_KERNEL | GFP_NOFS, PAGE_KERNEL);
+	if (!buf) {
+		ubifs_err("cannot allocate memory for dumping LEB %d", lnum);
+		return;
+	}
+
+	sleb = ubifs_scan(c, lnum, 0, buf, 0);
 	if (IS_ERR(sleb)) {
 		ubifs_err("scan error %d", (int)PTR_ERR(sleb));
-		return;
+		goto out;
 	}
 
 	printk(KERN_DEBUG "LEB %d has %d nodes ending at %d\n", lnum,
@@ -835,6 +843,9 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
 	printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n",
 	       current->pid, lnum);
 	ubifs_scan_destroy(sleb);
+
+out:
+	vfree(buf);
 	return;
 }
 
-- 
cgit v1.2.3


From cd5f7485bbbbfeea4363b535abeaa01df6942c66 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Fri, 11 Mar 2011 15:50:37 +0200
Subject: UBIFS: allocate scanning buffer on demand

Instead of using pre-allocated 'c->dbg->buf' buffer in
'scan_check_cb()', dynamically allocate it when needed. The intent
is to get rid of the pre-allocated 'c->dbg->buf' buffer and save
128KiB of RAM (or more if PEB size is larger). Indeed, currently we
allocate this memory even if the user never enables any self-check,
which is wasteful.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/lprops.c | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 4d4ca388889..c7b25e2f776 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -1035,7 +1035,8 @@ static int scan_check_cb(struct ubifs_info *c,
 	struct ubifs_scan_leb *sleb;
 	struct ubifs_scan_node *snod;
 	struct ubifs_lp_stats *lst = &data->lst;
-	int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty;
+	int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret;
+	void *buf = NULL;
 
 	cat = lp->flags & LPROPS_CAT_MASK;
 	if (cat != LPROPS_UNCAT) {
@@ -1093,7 +1094,13 @@ static int scan_check_cb(struct ubifs_info *c,
 		}
 	}
 
-	sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0);
+	buf = __vmalloc(c->leb_size, GFP_KERNEL | GFP_NOFS, PAGE_KERNEL);
+	if (!buf) {
+		ubifs_err("cannot allocate memory to scan LEB %d", lnum);
+		goto out;
+	}
+
+	sleb = ubifs_scan(c, lnum, 0, buf, 0);
 	if (IS_ERR(sleb)) {
 		/*
 		 * After an unclean unmount, empty and freeable LEBs
@@ -1105,7 +1112,8 @@ static int scan_check_cb(struct ubifs_info *c,
 			lst->empty_lebs += 1;
 			lst->total_free += c->leb_size;
 			lst->total_dark += ubifs_calc_dark(c, c->leb_size);
-			return LPT_SCAN_CONTINUE;
+			ret = LPT_SCAN_CONTINUE;
+			goto exit;
 		}
 
 		if (lp->free + lp->dirty == c->leb_size &&
@@ -1115,10 +1123,12 @@ static int scan_check_cb(struct ubifs_info *c,
 			lst->total_free  += lp->free;
 			lst->total_dirty += lp->dirty;
 			lst->total_dark  +=  ubifs_calc_dark(c, c->leb_size);
-			return LPT_SCAN_CONTINUE;
+			ret = LPT_SCAN_CONTINUE;
+			goto exit;
 		}
 		data->err = PTR_ERR(sleb);
-		return LPT_SCAN_STOP;
+		ret = LPT_SCAN_STOP;
+		goto exit;
 	}
 
 	is_idx = -1;
@@ -1236,7 +1246,10 @@ static int scan_check_cb(struct ubifs_info *c,
 	}
 
 	ubifs_scan_destroy(sleb);
-	return LPT_SCAN_CONTINUE;
+	ret = LPT_SCAN_CONTINUE;
+exit:
+	vfree(buf);
+	return ret;
 
 out_print:
 	ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, "
@@ -1246,6 +1259,7 @@ out_print:
 out_destroy:
 	ubifs_scan_destroy(sleb);
 out:
+	vfree(buf);
 	data->err = -EINVAL;
 	return LPT_SCAN_STOP;
 }
-- 
cgit v1.2.3


From 6fb324a4b0c3c9297cd569bd125ed691f2f98d57 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Fri, 11 Mar 2011 15:56:38 +0200
Subject: UBIFS: allocate ltab checking buffer on demand

Instead of using pre-allocated 'c->dbg->buf' buffer in
'dbg_check_ltab_lnum()', dynamically allocate it when needed. The
intent is to get rid of the pre-allocated 'c->dbg->buf' buffer and
save 128KiB of RAM (or more if PEB size is larger). Indeed,
currently we allocate this memory even if the user never enables
any self-check, which is wasteful.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/lpt_commit.c | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 5c90dec5db0..62a38d9c55e 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -1628,29 +1628,35 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
 {
 	int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len;
 	int ret;
-	void *buf = c->dbg->buf;
+	void *buf, *p;
 
 	if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
 		return 0;
 
+	buf = p = __vmalloc(c->leb_size, GFP_KERNEL | GFP_NOFS, PAGE_KERNEL);
+	if (!buf) {
+		ubifs_err("cannot allocate memory for ltab checking");
+		return 0;
+	}
+
 	dbg_lp("LEB %d", lnum);
 	err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
 	if (err) {
 		dbg_msg("ubi_read failed, LEB %d, error %d", lnum, err);
-		return err;
+		goto out;
 	}
 	while (1) {
-		if (!is_a_node(c, buf, len)) {
+		if (!is_a_node(c, p, len)) {
 			int i, pad_len;
 
-			pad_len = get_pad_len(c, buf, len);
+			pad_len = get_pad_len(c, p, len);
 			if (pad_len) {
-				buf += pad_len;
+				p += pad_len;
 				len -= pad_len;
 				dirty += pad_len;
 				continue;
 			}
-			if (!dbg_is_all_ff(buf, len)) {
+			if (!dbg_is_all_ff(p, len)) {
 				dbg_msg("invalid empty space in LEB %d at %d",
 					lnum, c->leb_size - len);
 				err = -EINVAL;
@@ -1668,16 +1674,21 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
 					lnum, dirty, c->ltab[i].dirty);
 				err = -EINVAL;
 			}
-			return err;
+			goto out;
 		}
-		node_type = get_lpt_node_type(c, buf, &node_num);
+		node_type = get_lpt_node_type(c, p, &node_num);
 		node_len = get_lpt_node_len(c, node_type);
 		ret = dbg_is_node_dirty(c, node_type, lnum, c->leb_size - len);
 		if (ret == 1)
 			dirty += node_len;
-		buf += node_len;
+		p += node_len;
 		len -= node_len;
 	}
+
+	err = 0;
+out:
+	vfree(buf);
+	return err;
 }
 
 /**
-- 
cgit v1.2.3


From cab95d446cb766062fa7e2e7e326035d7c65b803 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Fri, 11 Mar 2011 16:58:39 +0200
Subject: UBIFS: allocate lpt dump buffer on demand

Instead of using pre-allocated 'c->dbg->buf' buffer in
'dump_lpt_leb()', dynamically allocate it when needed. The intent
is to get rid of the pre-allocated 'c->dbg->buf' buffer and save
128KiB of RAM (or more if PEB size is larger). Indeed, currently we
allocate this memory even if the user never enables any self-check,
which is wasteful.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/lpt_commit.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 62a38d9c55e..0a3c2c3f5c4 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -1881,25 +1881,31 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
 static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
 {
 	int err, len = c->leb_size, node_type, node_num, node_len, offs;
-	void *buf = c->dbg->buf;
+	void *buf, *p;
 
 	printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
 	       current->pid, lnum);
+	buf = p = __vmalloc(c->leb_size, GFP_KERNEL | GFP_NOFS, PAGE_KERNEL);
+	if (!buf) {
+		ubifs_err("cannot allocate memory to dump LPT");
+		return;
+	}
+
 	err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
 	if (err) {
 		ubifs_err("cannot read LEB %d, error %d", lnum, err);
-		return;
+		goto out;
 	}
 	while (1) {
 		offs = c->leb_size - len;
-		if (!is_a_node(c, buf, len)) {
+		if (!is_a_node(c, p, len)) {
 			int pad_len;
 
-			pad_len = get_pad_len(c, buf, len);
+			pad_len = get_pad_len(c, p, len);
 			if (pad_len) {
 				printk(KERN_DEBUG "LEB %d:%d, pad %d bytes\n",
 				       lnum, offs, pad_len);
-				buf += pad_len;
+				p += pad_len;
 				len -= pad_len;
 				continue;
 			}
@@ -1909,7 +1915,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
 			break;
 		}
 
-		node_type = get_lpt_node_type(c, buf, &node_num);
+		node_type = get_lpt_node_type(c, p, &node_num);
 		switch (node_type) {
 		case UBIFS_LPT_PNODE:
 		{
@@ -1934,7 +1940,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
 			else
 				printk(KERN_DEBUG "LEB %d:%d, nnode, ",
 				       lnum, offs);
-			err = ubifs_unpack_nnode(c, buf, &nnode);
+			err = ubifs_unpack_nnode(c, p, &nnode);
 			for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
 				printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum,
 				       nnode.nbranch[i].offs);
@@ -1955,15 +1961,18 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
 			break;
 		default:
 			ubifs_err("LPT node type %d not recognized", node_type);
-			return;
+			goto out;
 		}
 
-		buf += node_len;
+		p += node_len;
 		len -= node_len;
 	}
 
 	printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n",
 	       current->pid, lnum);
+out:
+	vfree(buf);
+	return;
 }
 
 /**
-- 
cgit v1.2.3


From f5cf319cf32d2284b3fbc24f3c526e2a9363b4ac Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Fri, 11 Mar 2011 17:11:25 +0200
Subject: UBIFS: allocate orphans scan buffer on demand

Instead of using pre-allocated 'c->dbg->buf' buffer in
'dbg_scan_orphans()', dynamically allocate it when needed. The intent
is to get rid of the pre-allocated 'c->dbg->buf' buffer and save
128KiB of RAM (or more if PEB size is larger). Indeed, currently we
allocate this memory even if the user never enables any self-check,
which is wasteful.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/orphan.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 82009c74b6a..2cdbd31641d 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -892,15 +892,22 @@ static int dbg_read_orphans(struct check_info *ci, struct ubifs_scan_leb *sleb)
 static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
 {
 	int lnum, err = 0;
+	void *buf;
 
 	/* Check no-orphans flag and skip this if no orphans */
 	if (c->no_orphs)
 		return 0;
 
+	buf = __vmalloc(c->leb_size, GFP_KERNEL | GFP_NOFS, PAGE_KERNEL);
+	if (!buf) {
+		ubifs_err("cannot allocate memory to check orphans");
+		return 0;
+	}
+
 	for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
 		struct ubifs_scan_leb *sleb;
 
-		sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0);
+		sleb = ubifs_scan(c, lnum, 0, buf, 0);
 		if (IS_ERR(sleb)) {
 			err = PTR_ERR(sleb);
 			break;
@@ -912,6 +919,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
 			break;
 	}
 
+	vfree(buf);
 	return err;
 }
 
-- 
cgit v1.2.3


From 7c83cc91ab1505e53ebfb99b1ea19ed1cf01c1b0 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Fri, 11 Mar 2011 17:15:55 +0200
Subject: UBIFS: save 128KiB or more RAM

When debugging is enabled, we allocate a buffer of PEB size for
various debugging purposes. However, now all users of this buffer
are gone and we can safely remove it and save 128KiB or more RAM.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/debug.c | 9 ---------
 fs/ubifs/debug.h | 2 --
 2 files changed, 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index c2e5c08a9d3..01c2b028e52 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2701,16 +2701,8 @@ int ubifs_debugging_init(struct ubifs_info *c)
 	if (!c->dbg)
 		return -ENOMEM;
 
-	c->dbg->buf = vmalloc(c->leb_size);
-	if (!c->dbg->buf)
-		goto out;
-
 	failure_mode_init(c);
 	return 0;
-
-out:
-	kfree(c->dbg);
-	return -ENOMEM;
 }
 
 /**
@@ -2720,7 +2712,6 @@ out:
 void ubifs_debugging_exit(struct ubifs_info *c)
 {
 	failure_mode_exit(c);
-	vfree(c->dbg->buf);
 	kfree(c->dbg);
 }
 
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 10190c18981..4efbba78669 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -27,7 +27,6 @@
 
 /**
  * ubifs_debug_info - per-FS debugging information.
- * @buf: a buffer of LEB size, used for various purposes
  * @old_zroot: old index root - used by 'dbg_check_old_index()'
  * @old_zroot_level: old index root level - used by 'dbg_check_old_index()'
  * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()'
@@ -54,7 +53,6 @@
  * dfs_dump_tnc: "dump TNC" debugfs knob
  */
 struct ubifs_debug_info {
-	void *buf;
 	struct ubifs_zbranch old_zroot;
 	int old_zroot_level;
 	unsigned long long old_zroot_sqnum;
-- 
cgit v1.2.3


From 5d630e43284fdb0613e4e7e7dd906f27bc25b6af Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Mon, 14 Mar 2011 17:55:40 +0200
Subject: UBIFS: clean-up commentaries

Clean-up commentaries in debug.h and remove references to non-existing
symbols.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/debug.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 4efbba78669..919f0de29d8 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -171,7 +171,7 @@ const char *dbg_key_str1(const struct ubifs_info *c,
 #define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__)
 
 /*
- * Debugging message type flags (must match msg_type_names in debug.c).
+ * Debugging message type flags.
  *
  * UBIFS_MSG_GEN: general messages
  * UBIFS_MSG_JNL: journal messages
@@ -204,7 +204,7 @@ enum {
 };
 
 /*
- * Debugging check flags (must match chk_names in debug.c).
+ * Debugging check flags.
  *
  * UBIFS_CHK_GEN: general checks
  * UBIFS_CHK_TNC: check TNC
@@ -225,7 +225,7 @@ enum {
 };
 
 /*
- * Special testing flags (must match tst_names in debug.c).
+ * Special testing flags.
  *
  * UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method
  * UBIFS_TST_RCVRY: failure mode for recovery testing
-- 
cgit v1.2.3


From bab1d9444d9a147f1dc3478dd06c16f490227f3e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 15 Mar 2011 21:51:24 +0100
Subject: prune back iprune_sem

iprune_sem is continuously giving us lockdep warnings because we do take it in
read mode in the reclaim path, but we're also doing non-NOFS allocations under
it taken in write mode.

Taking a bit deeper look at it I think it's fixable quite trivially:

 - for invalidate_inodes we do not need iprune_sem at all.  We have an active
   reference on the superblock, so the filesystem is not going away until it
   has finished.
 - for evict_inodes we do need it, to make sure prune_icache has done its
   work before we tear down the superblock.  But there is no reason to
   hold it over the actual reclaim operation - it's enough to cycle through
   it after the actual reclaim to make sure we wait for any pending
   prune_icache to complete.  We just have to remove the WARN_ON for
   otherwise busy inodes as they can actually happen now.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c | 32 ++++++++++++++------------------
 1 file changed, 14 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index 0647d80accf..9910c039f02 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -84,16 +84,13 @@ static struct hlist_head *inode_hashtable __read_mostly;
 DEFINE_SPINLOCK(inode_lock);
 
 /*
- * iprune_sem provides exclusion between the kswapd or try_to_free_pages
- * icache shrinking path, and the umount path.  Without this exclusion,
- * by the time prune_icache calls iput for the inode whose pages it has
- * been invalidating, or by the time it calls clear_inode & destroy_inode
- * from its final dispose_list, the struct super_block they refer to
- * (for inode->i_sb->s_op) may already have been freed and reused.
+ * iprune_sem provides exclusion between the icache shrinking and the
+ * umount path.
  *
- * We make this an rwsem because the fastpath is icache shrinking. In
- * some cases a filesystem may be doing a significant amount of work in
- * its inode reclaim code, so this should improve parallelism.
+ * We don't actually need it to protect anything in the umount path,
+ * but only need to cycle through it to make sure any inode that
+ * prune_icache took off the LRU list has been fully torn down by the
+ * time we are past evict_inodes.
  */
 static DECLARE_RWSEM(iprune_sem);
 
@@ -516,17 +513,12 @@ void evict_inodes(struct super_block *sb)
 	struct inode *inode, *next;
 	LIST_HEAD(dispose);
 
-	down_write(&iprune_sem);
-
 	spin_lock(&inode_lock);
 	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
 		if (atomic_read(&inode->i_count))
 			continue;
-
-		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
-			WARN_ON(1);
+		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
 			continue;
-		}
 
 		inode->i_state |= I_FREEING;
 
@@ -542,6 +534,13 @@ void evict_inodes(struct super_block *sb)
 	spin_unlock(&inode_lock);
 
 	dispose_list(&dispose);
+
+	/*
+	 * Cycle through iprune_sem to make sure any inode that prune_icache
+	 * moved off the list before we took the lock has been fully torn
+	 * down.
+	 */
+	down_write(&iprune_sem);
 	up_write(&iprune_sem);
 }
 
@@ -561,8 +560,6 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
 	struct inode *inode, *next;
 	LIST_HEAD(dispose);
 
-	down_write(&iprune_sem);
-
 	spin_lock(&inode_lock);
 	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
 		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
@@ -590,7 +587,6 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
 	spin_unlock(&inode_lock);
 
 	dispose_list(&dispose);
-	up_write(&iprune_sem);
 
 	return busy;
 }
-- 
cgit v1.2.3


From 34d211a2d5df4984a35b18d8ccacbe1d10abb067 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 16 Mar 2011 08:04:07 -0700
Subject: Increase OSF partition limit from 8 to 18

It turns out that while a maximum of 8 partitions may be what people
"should" have had, you can actually fit up to 18 entries(*) in a sector.

And some people clearly were taking advantage of that, like Michael
Cree, who had ten partitions on one of his OSF disks.

(*) The OSF partition data starts at byte offset 64 in the first sector,
    and the array of 16-byte partition entries start at offset 148 in
    the on-disk partition structure.

Reported-by: Michael Cree <mcree@orcon.net.nz>
Cc: stable@kernel.org (v2.6.38)
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/partitions/osf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c
index be03a0b08b4..764b86a0196 100644
--- a/fs/partitions/osf.c
+++ b/fs/partitions/osf.c
@@ -10,7 +10,7 @@
 #include "check.h"
 #include "osf.h"
 
-#define MAX_OSF_PARTITIONS 8
+#define MAX_OSF_PARTITIONS 18
 
 int osf_partition(struct parsed_partitions *state)
 {
-- 
cgit v1.2.3


From d2b217439fd1d8f7857175f063113f4d1d66306c Mon Sep 17 00:00:00 2001
From: Mi Jinlong <mijinlong@cn.fujitsu.com>
Date: Thu, 10 Mar 2011 17:43:37 +0800
Subject: nfs41: make sure nfs server return right ca_maxresponsesize_cached

According to rfc5661,

  ca_maxresponsesize_cached:

     Like ca_maxresponsesize, but the maximum size of a reply that
     will be stored in the reply cache (Section 2.10.6.1).  For each
     channel, the server MAY decrease this value, but MUST NOT
     increase it.

the latest kernel (2.6.38-rc8) may increase the value because it ignores the
request's ca_maxresponsesize_cached value. We should not ignore it.

Signed-off-by: Mi Jinlong <mijinlong@cn.fujitsu.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c26dc31fb94..a20827804c5 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -608,7 +608,8 @@ static void init_forechannel_attrs(struct nfsd4_channel_attrs *new, struct nfsd4
 	u32 maxrpc = nfsd_serv->sv_max_mesg;
 
 	new->maxreqs = numslots;
-	new->maxresp_cached = slotsize + NFSD_MIN_HDR_SEQ_SZ;
+	new->maxresp_cached = min_t(u32, req->maxresp_cached,
+					slotsize + NFSD_MIN_HDR_SEQ_SZ);
 	new->maxreq_sz = min_t(u32, req->maxreq_sz, maxrpc);
 	new->maxresp_sz = min_t(u32, req->maxresp_sz, maxrpc);
 	new->maxops = min_t(u32, req->maxops, NFSD_MAX_OPS_PER_COMPOUND);
-- 
cgit v1.2.3


From 60ed8cf78f886753e454b671841c0a3a0e55e915 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Wed, 16 Mar 2011 18:17:54 +0100
Subject: fix cdev leak on O_PATH final fput()

__fput doesn't need a cdev_put() for O_PATH handles.

Signed-off-by: mszeredi@suse.cz
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file_table.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/file_table.c b/fs/file_table.c
index 74a9544ac77..db30d3497f1 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -246,8 +246,10 @@ static void __fput(struct file *file)
 		file->f_op->release(inode, file);
 	security_file_free(file);
 	ima_file_free(file);
-	if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL))
+	if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
+		     !(file->f_mode & FMODE_PATH))) {
 		cdev_put(inode->i_cdev);
+	}
 	fops_put(file->f_op);
 	put_pid(file->f_owner.pid);
 	file_sb_list_del(file);
-- 
cgit v1.2.3


From 0d5839ad05acd0fe2a84a39f33ac5efdf634a5a5 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 16 Mar 2011 05:27:27 -0400
Subject: nfs: propagate devname to nfs{,4}_get_root()

step 1 of ->mnt_devname fixes: make sure we have the value of devname
available in ..._get_root().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfs/getroot.c  |  6 ++++--
 fs/nfs/internal.h |  6 ++++--
 fs/nfs/super.c    | 10 +++++-----
 3 files changed, 13 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b5ffe8fa291..4d6e5a317e6 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -75,7 +75,8 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
 /*
  * get an NFS2/NFS3 root dentry from the root filehandle
  */
-struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
+struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
+			    const char *devname)
 {
 	struct nfs_server *server = NFS_SB(sb);
 	struct nfs_fsinfo fsinfo;
@@ -169,7 +170,8 @@ out:
 /*
  * get an NFS4 root dentry from the root filehandle
  */
-struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
+struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh,
+			     const char *devname)
 {
 	struct nfs_server *server = NFS_SB(sb);
 	struct nfs_fattr *fattr = NULL;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index cf9fdbdabc6..9e5a003ccc5 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -254,9 +254,11 @@ extern char *nfs_path(const char *base,
 extern struct vfsmount *nfs_d_automount(struct path *path);
 
 /* getroot.c */
-extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *);
+extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *,
+				   const char *);
 #ifdef CONFIG_NFS_V4
-extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *);
+extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *,
+				    const char *);
 
 extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);
 #endif
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index b68c8607770..1d81032b226 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2336,7 +2336,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 			s, data ? data->fscache_uniq : NULL, NULL);
 	}
 
-	mntroot = nfs_get_root(s, mntfh);
+	mntroot = nfs_get_root(s, mntfh, dev_name);
 	if (IS_ERR(mntroot)) {
 		error = PTR_ERR(mntroot);
 		goto error_splat_super;
@@ -2450,7 +2450,7 @@ nfs_xdev_mount(struct file_system_type *fs_type, int flags,
 		nfs_fscache_get_super_cookie(s, NULL, data);
 	}
 
-	mntroot = nfs_get_root(s, data->fh);
+	mntroot = nfs_get_root(s, data->fh, dev_name);
 	if (IS_ERR(mntroot)) {
 		error = PTR_ERR(mntroot);
 		goto error_splat_super;
@@ -2718,7 +2718,7 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags,
 			s, data ? data->fscache_uniq : NULL, NULL);
 	}
 
-	mntroot = nfs4_get_root(s, mntfh);
+	mntroot = nfs4_get_root(s, mntfh, dev_name);
 	if (IS_ERR(mntroot)) {
 		error = PTR_ERR(mntroot);
 		goto error_splat_super;
@@ -3033,7 +3033,7 @@ nfs4_xdev_mount(struct file_system_type *fs_type, int flags,
 		nfs_fscache_get_super_cookie(s, NULL, data);
 	}
 
-	mntroot = nfs4_get_root(s, data->fh);
+	mntroot = nfs4_get_root(s, data->fh, dev_name);
 	if (IS_ERR(mntroot)) {
 		error = PTR_ERR(mntroot);
 		goto error_splat_super;
@@ -3120,7 +3120,7 @@ nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags,
 		nfs_fscache_get_super_cookie(s, NULL, data);
 	}
 
-	mntroot = nfs4_get_root(s, mntfh);
+	mntroot = nfs4_get_root(s, mntfh, dev_name);
 	if (IS_ERR(mntroot)) {
 		error = PTR_ERR(mntroot);
 		goto error_splat_super;
-- 
cgit v1.2.3


From b1942c5f8cf3bea3a3c88a7498ae4c4361f31afe Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 16 Mar 2011 05:44:14 -0400
Subject: nfs: store devname at disconnected NFS roots

part 2: make sure that disconnected roots have corresponding mnt_devname
values stashed into them.

Have nfs*_get_root() stuff a copy of devname into ->d_fsdata of the
found root, provided that it is disconnected.

Have ->d_release() free it when dentry goes away.

Have the places where NFS uses ->d_fsdata for sillyrename (and that
can *never* happen to a disconnected root - dentry will be attached
to its parent) free old devname copies if they find those.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfs/dir.c     | 13 +++++++++++++
 fs/nfs/getroot.c | 36 ++++++++++++++++++++++++++++++++----
 fs/nfs/unlink.c  | 20 ++++++++++++++++++++
 3 files changed, 65 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 2c3eb33b904..abdf38d5971 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1169,11 +1169,23 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
 	iput(inode);
 }
 
+static void nfs_d_release(struct dentry *dentry)
+{
+	/* free cached devname value, if it survived that far */
+	if (unlikely(dentry->d_fsdata)) {
+		if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
+			WARN_ON(1);
+		else
+			kfree(dentry->d_fsdata);
+	}
+}
+
 const struct dentry_operations nfs_dentry_operations = {
 	.d_revalidate	= nfs_lookup_revalidate,
 	.d_delete	= nfs_dentry_delete,
 	.d_iput		= nfs_dentry_iput,
 	.d_automount	= nfs_d_automount,
+	.d_release	= nfs_d_release,
 };
 
 static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
@@ -1248,6 +1260,7 @@ const struct dentry_operations nfs4_dentry_operations = {
 	.d_delete	= nfs_dentry_delete,
 	.d_iput		= nfs_dentry_iput,
 	.d_automount	= nfs_d_automount,
+	.d_release	= nfs_d_release,
 };
 
 /*
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 4d6e5a317e6..1084792bc0f 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -82,12 +82,18 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
 	struct nfs_fsinfo fsinfo;
 	struct dentry *ret;
 	struct inode *inode;
+	void *name = kstrdup(devname, GFP_KERNEL);
 	int error;
 
+	if (!name)
+		return ERR_PTR(-ENOMEM);
+
 	/* get the actual root for this mount */
 	fsinfo.fattr = nfs_alloc_fattr();
-	if (fsinfo.fattr == NULL)
+	if (fsinfo.fattr == NULL) {
+		kfree(name);
 		return ERR_PTR(-ENOMEM);
+	}
 
 	error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);
 	if (error < 0) {
@@ -120,7 +126,15 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
 	}
 
 	security_d_instantiate(ret, inode);
+	spin_lock(&ret->d_lock);
+	if (IS_ROOT(ret) && !(ret->d_flags & DCACHE_NFSFS_RENAMED)) {
+		ret->d_fsdata = name;
+		name = NULL;
+	}
+	spin_unlock(&ret->d_lock);
 out:
+	if (name)
+		kfree(name);
 	nfs_free_fattr(fsinfo.fattr);
 	return ret;
 }
@@ -177,21 +191,28 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh,
 	struct nfs_fattr *fattr = NULL;
 	struct dentry *ret;
 	struct inode *inode;
+	void *name = kstrdup(devname, GFP_KERNEL);
 	int error;
 
 	dprintk("--> nfs4_get_root()\n");
 
+	if (!name)
+		return ERR_PTR(-ENOMEM);
+
 	/* get the info about the server and filesystem */
 	error = nfs4_server_capabilities(server, mntfh);
 	if (error < 0) {
 		dprintk("nfs_get_root: getcaps error = %d\n",
 			-error);
+		kfree(name);
 		return ERR_PTR(error);
 	}
 
 	fattr = nfs_alloc_fattr();
-	if (fattr == NULL)
-		return ERR_PTR(-ENOMEM);;
+	if (fattr == NULL) {
+		kfree(name);
+		return ERR_PTR(-ENOMEM);
+	}
 
 	/* get the actual root for this mount */
 	error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr);
@@ -225,8 +246,15 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh,
 	}
 
 	security_d_instantiate(ret, inode);
-
+	spin_lock(&ret->d_lock);
+	if (IS_ROOT(ret) && !(ret->d_flags & DCACHE_NFSFS_RENAMED)) {
+		ret->d_fsdata = name;
+		name = NULL;
+	}
+	spin_unlock(&ret->d_lock);
 out:
+	if (name)
+		kfree(name);
 	nfs_free_fattr(fattr);
 	dprintk("<-- nfs4_get_root()\n");
 	return ret;
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 6481d537d69..8d6864c2a5f 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -148,6 +148,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
 	alias = d_lookup(parent, &data->args.name);
 	if (alias != NULL) {
 		int ret = 0;
+		void *devname_garbage = NULL;
 
 		/*
 		 * Hey, we raced with lookup... See if we need to transfer
@@ -157,6 +158,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
 		spin_lock(&alias->d_lock);
 		if (alias->d_inode != NULL &&
 		    !(alias->d_flags & DCACHE_NFSFS_RENAMED)) {
+			devname_garbage = alias->d_fsdata;
 			alias->d_fsdata = data;
 			alias->d_flags |= DCACHE_NFSFS_RENAMED;
 			ret = 1;
@@ -164,6 +166,13 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
 		spin_unlock(&alias->d_lock);
 		nfs_dec_sillycount(dir);
 		dput(alias);
+		/*
+		 * If we'd displaced old cached devname, free it.  At that
+		 * point dentry is definitely not a root, so we won't need
+		 * that anymore.
+		 */
+		if (devname_garbage)
+			kfree(devname_garbage);
 		return ret;
 	}
 	data->dir = igrab(dir);
@@ -252,6 +261,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
 {
 	struct nfs_unlinkdata *data;
 	int status = -ENOMEM;
+	void *devname_garbage = NULL;
 
 	data = kzalloc(sizeof(*data), GFP_KERNEL);
 	if (data == NULL)
@@ -269,8 +279,16 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
 	if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
 		goto out_unlock;
 	dentry->d_flags |= DCACHE_NFSFS_RENAMED;
+	devname_garbage = dentry->d_fsdata;
 	dentry->d_fsdata = data;
 	spin_unlock(&dentry->d_lock);
+	/*
+	 * If we'd displaced old cached devname, free it.  At that
+	 * point dentry is definitely not a root, so we won't need
+	 * that anymore.
+	 */
+	if (devname_garbage)
+		kfree(devname_garbage);
 	return 0;
 out_unlock:
 	spin_unlock(&dentry->d_lock);
@@ -299,6 +317,7 @@ nfs_complete_unlink(struct dentry *dentry, struct inode *inode)
 	if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
 		dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
 		data = dentry->d_fsdata;
+		dentry->d_fsdata = NULL;
 	}
 	spin_unlock(&dentry->d_lock);
 
@@ -315,6 +334,7 @@ nfs_cancel_async_unlink(struct dentry *dentry)
 		struct nfs_unlinkdata *data = dentry->d_fsdata;
 
 		dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
+		dentry->d_fsdata = NULL;
 		spin_unlock(&dentry->d_lock);
 		nfs_free_unlinkdata(data);
 		return;
-- 
cgit v1.2.3


From b514f872f86d4b0c13fed74a1fe1f7ab500c4fd0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 16 Mar 2011 06:26:11 -0400
Subject: nfs: make nfs_path() work without vfsmount

part 3: now we have everything to get nfs_path() just by dentry -
just follow to (disconnected) root and pick the rest of the thing
there.

Start killing propagation of struct vfsmount * on the paths that
used to bring it to nfs_path().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfs/internal.h      | 15 +++++------
 fs/nfs/namespace.c     | 69 ++++++++++++++++++++++++++++++++------------------
 fs/nfs/nfs4namespace.c | 43 ++++++++++++++-----------------
 fs/nfs/super.c         |  9 +++----
 4 files changed, 74 insertions(+), 62 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9e5a003ccc5..f0234118d04 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -163,10 +163,10 @@ static inline void nfs_fs_proc_exit(void)
 
 /* nfs4namespace.c */
 #ifdef CONFIG_NFS_V4
-extern struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry);
+extern struct vfsmount *nfs_do_refmount(struct super_block *sb, struct dentry *dentry);
 #else
 static inline
-struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry)
+struct vfsmount *nfs_do_refmount(struct super_block *sb, struct dentry *dentry)
 {
 	return ERR_PTR(-ENOENT);
 }
@@ -247,9 +247,7 @@ extern void nfs_sb_active(struct super_block *sb);
 extern void nfs_sb_deactive(struct super_block *sb);
 
 /* namespace.c */
-extern char *nfs_path(const char *base,
-		      const struct dentry *droot,
-		      const struct dentry *dentry,
+extern char *nfs_path(char **p, struct dentry *dentry,
 		      char *buffer, ssize_t buflen);
 extern struct vfsmount *nfs_d_automount(struct path *path);
 
@@ -290,12 +288,11 @@ extern int _nfs4_call_sync_session(struct nfs_server *server,
 /*
  * Determine the device name as a string
  */
-static inline char *nfs_devname(const struct vfsmount *mnt_parent,
-				const struct dentry *dentry,
+static inline char *nfs_devname(struct dentry *dentry,
 				char *buffer, ssize_t buflen)
 {
-	return nfs_path(mnt_parent->mnt_devname, mnt_parent->mnt_root,
-			dentry, buffer, buflen);
+	char *dummy;
+	return nfs_path(&dummy, dentry, buffer, buflen);
 }
 
 /*
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index f32b8603dca..859cdaba4c1 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -25,33 +25,31 @@ static LIST_HEAD(nfs_automount_list);
 static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts);
 int nfs_mountpoint_expiry_timeout = 500 * HZ;
 
-static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
-					const struct dentry *dentry,
+static struct vfsmount *nfs_do_submount(struct super_block *sb,
+					struct dentry *dentry,
 					struct nfs_fh *fh,
 					struct nfs_fattr *fattr);
 
 /*
  * nfs_path - reconstruct the path given an arbitrary dentry
- * @base - arbitrary string to prepend to the path
- * @droot - pointer to root dentry for mountpoint
+ * @p - used to return pointer to the end of devname part of path
  * @dentry - pointer to dentry
  * @buffer - result buffer
  * @buflen - length of buffer
  *
- * Helper function for constructing the path from the
- * root dentry to an arbitrary hashed dentry.
+ * Helper function for constructing the server pathname
+ * by arbitrary hashed dentry.
  *
  * This is mainly for use in figuring out the path on the
- * server side when automounting on top of an existing partition.
+ * server side when automounting on top of an existing partition
+ * and in generating /proc/mounts and friends.
  */
-char *nfs_path(const char *base,
-	       const struct dentry *droot,
-	       const struct dentry *dentry,
-	       char *buffer, ssize_t buflen)
+char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen)
 {
 	char *end;
 	int namelen;
 	unsigned seq;
+	const char *base;
 
 rename_retry:
 	end = buffer+buflen;
@@ -60,7 +58,10 @@ rename_retry:
 
 	seq = read_seqbegin(&rename_lock);
 	rcu_read_lock();
-	while (!IS_ROOT(dentry) && dentry != droot) {
+	while (1) {
+		spin_lock(&dentry->d_lock);
+		if (IS_ROOT(dentry))
+			break;
 		namelen = dentry->d_name.len;
 		buflen -= namelen + 1;
 		if (buflen < 0)
@@ -68,27 +69,47 @@ rename_retry:
 		end -= namelen;
 		memcpy(end, dentry->d_name.name, namelen);
 		*--end = '/';
+		spin_unlock(&dentry->d_lock);
 		dentry = dentry->d_parent;
 	}
-	rcu_read_unlock();
-	if (read_seqretry(&rename_lock, seq))
+	if (read_seqretry(&rename_lock, seq)) {
+		spin_unlock(&dentry->d_lock);
+		rcu_read_unlock();
 		goto rename_retry;
+	}
 	if (*end != '/') {
-		if (--buflen < 0)
+		if (--buflen < 0) {
+			spin_unlock(&dentry->d_lock);
+			rcu_read_unlock();
 			goto Elong;
+		}
 		*--end = '/';
 	}
+	*p = end;
+	base = dentry->d_fsdata;
+	if (!base) {
+		spin_unlock(&dentry->d_lock);
+		rcu_read_unlock();
+		WARN_ON(1);
+		return end;
+	}
 	namelen = strlen(base);
 	/* Strip off excess slashes in base string */
 	while (namelen > 0 && base[namelen - 1] == '/')
 		namelen--;
 	buflen -= namelen;
-	if (buflen < 0)
+	if (buflen < 0) {
+		spin_unlock(&dentry->d_lock);
+		rcu_read_unlock();
 		goto Elong;
+	}
 	end -= namelen;
 	memcpy(end, base, namelen);
+	spin_unlock(&dentry->d_lock);
+	rcu_read_unlock();
 	return end;
 Elong_unlock:
+	spin_unlock(&dentry->d_lock);
 	rcu_read_unlock();
 	if (read_seqretry(&rename_lock, seq))
 		goto rename_retry;
@@ -143,9 +164,9 @@ struct vfsmount *nfs_d_automount(struct path *path)
 	}
 
 	if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
-		mnt = nfs_do_refmount(path->mnt, path->dentry);
+		mnt = nfs_do_refmount(path->mnt->mnt_sb, path->dentry);
 	else
-		mnt = nfs_do_submount(path->mnt, path->dentry, fh, fattr);
+		mnt = nfs_do_submount(path->mnt->mnt_sb, path->dentry, fh, fattr);
 	if (IS_ERR(mnt))
 		goto out;
 
@@ -209,19 +230,19 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
 
 /**
  * nfs_do_submount - set up mountpoint when crossing a filesystem boundary
- * @mnt_parent - mountpoint of parent directory
+ * @sb - superblock of parent directory
  * @dentry - parent directory
  * @fh - filehandle for new root dentry
  * @fattr - attributes for new root inode
  *
  */
-static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
-					const struct dentry *dentry,
+static struct vfsmount *nfs_do_submount(struct super_block *sb,
+					struct dentry *dentry,
 					struct nfs_fh *fh,
 					struct nfs_fattr *fattr)
 {
 	struct nfs_clone_mount mountdata = {
-		.sb = mnt_parent->mnt_sb,
+		.sb = sb,
 		.dentry = dentry,
 		.fh = fh,
 		.fattr = fattr,
@@ -237,11 +258,11 @@ static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
 			dentry->d_name.name);
 	if (page == NULL)
 		goto out;
-	devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE);
+	devname = nfs_devname(dentry, page, PAGE_SIZE);
 	mnt = (struct vfsmount *)devname;
 	if (IS_ERR(devname))
 		goto free_page;
-	mnt = nfs_do_clone_mount(NFS_SB(mnt_parent->mnt_sb), devname, &mountdata);
+	mnt = nfs_do_clone_mount(NFS_SB(sb), devname, &mountdata);
 free_page:
 	free_page((unsigned long)page);
 out:
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 3c2a1724fbd..46942e2680a 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -54,33 +54,29 @@ Elong:
 /*
  * Determine the mount path as a string
  */
-static char *nfs4_path(const struct vfsmount *mnt_parent,
-		       const struct dentry *dentry,
-		       char *buffer, ssize_t buflen)
+static char *nfs4_path(struct dentry *dentry, char *buffer, ssize_t buflen)
 {
-	const char *srvpath;
-
-	srvpath = strchr(mnt_parent->mnt_devname, ':');
-	if (srvpath)
-		srvpath++;
-	else
-		srvpath = mnt_parent->mnt_devname;
-
-	return nfs_path(srvpath, mnt_parent->mnt_root, dentry, buffer, buflen);
+	char *limit;
+	char *path = nfs_path(&limit, dentry, buffer, buflen);
+	if (!IS_ERR(path)) {
+		char *colon = strchr(path, ':');
+		if (colon && colon < limit)
+			path = colon + 1;
+	}
+	return path;
 }
 
 /*
  * Check that fs_locations::fs_root [RFC3530 6.3] is a prefix for what we
  * believe to be the server path to this dentry
  */
-static int nfs4_validate_fspath(const struct vfsmount *mnt_parent,
-				const struct dentry *dentry,
+static int nfs4_validate_fspath(struct dentry *dentry,
 				const struct nfs4_fs_locations *locations,
 				char *page, char *page2)
 {
 	const char *path, *fs_path;
 
-	path = nfs4_path(mnt_parent, dentry, page, PAGE_SIZE);
+	path = nfs4_path(dentry, page, PAGE_SIZE);
 	if (IS_ERR(path))
 		return PTR_ERR(path);
 
@@ -165,20 +161,20 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
 
 /**
  * nfs_follow_referral - set up mountpoint when hitting a referral on moved error
- * @mnt_parent - mountpoint of parent directory
+ * @sb - superblock of parent directory
  * @dentry - parent directory
  * @locations - array of NFSv4 server location information
  *
  */
-static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
-					    const struct dentry *dentry,
+static struct vfsmount *nfs_follow_referral(struct super_block *sb,
+					    struct dentry *dentry,
 					    const struct nfs4_fs_locations *locations)
 {
 	struct vfsmount *mnt = ERR_PTR(-ENOENT);
 	struct nfs_clone_mount mountdata = {
-		.sb = mnt_parent->mnt_sb,
+		.sb = sb,
 		.dentry = dentry,
-		.authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor,
+		.authflavor = NFS_SB(sb)->client->cl_auth->au_flavor,
 	};
 	char *page = NULL, *page2 = NULL;
 	int loc, error;
@@ -198,7 +194,7 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
 		goto out;
 
 	/* Ensure fs path is a prefix of current dentry path */
-	error = nfs4_validate_fspath(mnt_parent, dentry, locations, page, page2);
+	error = nfs4_validate_fspath(dentry, locations, page, page2);
 	if (error < 0) {
 		mnt = ERR_PTR(error);
 		goto out;
@@ -225,11 +221,10 @@ out:
 
 /*
  * nfs_do_refmount - handle crossing a referral on server
- * @mnt_parent - mountpoint of referral
  * @dentry - dentry of referral
  *
  */
-struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry)
+struct vfsmount *nfs_do_refmount(struct super_block *sb, struct dentry *dentry)
 {
 	struct vfsmount *mnt = ERR_PTR(-ENOMEM);
 	struct dentry *parent;
@@ -262,7 +257,7 @@ struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentr
 	    fs_locations->fs_path.ncomponents <= 0)
 		goto out_free;
 
-	mnt = nfs_follow_referral(mnt_parent, dentry, fs_locations);
+	mnt = nfs_follow_referral(sb, dentry, fs_locations);
 out_free:
 	__free_page(page);
 	kfree(fs_locations);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 1d81032b226..a6ab483c9ad 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2771,16 +2771,15 @@ static struct vfsmount *nfs_do_root_mount(struct file_system_type *fs_type,
 	return root_mnt;
 }
 
-static void nfs_fix_devname(const struct path *path, struct vfsmount *mnt)
+static void nfs_fix_devname(struct dentry *dentry, struct vfsmount *mnt)
 {
 	char *page = (char *) __get_free_page(GFP_KERNEL);
 	char *devname, *tmp;
+	char *dummy;
 
 	if (page == NULL)
 		return;
-	devname = nfs_path(path->mnt->mnt_devname,
-			path->mnt->mnt_root, path->dentry,
-			page, PAGE_SIZE);
+	devname = nfs_path(&dummy, dentry, page, PAGE_SIZE);
 	if (IS_ERR(devname))
 		goto out_freepage;
 	tmp = kstrdup(devname, GFP_KERNEL);
@@ -2894,7 +2893,7 @@ static int nfs_follow_remote_path(struct vfsmount *root_mnt,
 	mnt_target->mnt_root = dget(nd->path.dentry);
 
 	/* Correct the device pathname */
-	nfs_fix_devname(&nd->path, mnt_target);
+	nfs_fix_devname(nd->path.dentry, mnt_target);
 
 	path_put(&nd->path);
 	kfree(nd);
-- 
cgit v1.2.3


From f8ad9c4bae99854c961ca79ed130a0d11d9ab53c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 16 Mar 2011 06:32:07 -0400
Subject: nfs: nfs_do_{ref,sub}mount() superblock argument is redundant

It's always equal to dentry->d_sb

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfs/internal.h      |  4 ++--
 fs/nfs/namespace.c     | 15 ++++++---------
 fs/nfs/nfs4namespace.c | 12 +++++-------
 3 files changed, 13 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index f0234118d04..e94ad22da5d 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -163,10 +163,10 @@ static inline void nfs_fs_proc_exit(void)
 
 /* nfs4namespace.c */
 #ifdef CONFIG_NFS_V4
-extern struct vfsmount *nfs_do_refmount(struct super_block *sb, struct dentry *dentry);
+extern struct vfsmount *nfs_do_refmount(struct dentry *dentry);
 #else
 static inline
-struct vfsmount *nfs_do_refmount(struct super_block *sb, struct dentry *dentry)
+struct vfsmount *nfs_do_refmount(struct dentry *dentry)
 {
 	return ERR_PTR(-ENOENT);
 }
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 859cdaba4c1..c0b8344db0c 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -25,8 +25,7 @@ static LIST_HEAD(nfs_automount_list);
 static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts);
 int nfs_mountpoint_expiry_timeout = 500 * HZ;
 
-static struct vfsmount *nfs_do_submount(struct super_block *sb,
-					struct dentry *dentry,
+static struct vfsmount *nfs_do_submount(struct dentry *dentry,
 					struct nfs_fh *fh,
 					struct nfs_fattr *fattr);
 
@@ -164,9 +163,9 @@ struct vfsmount *nfs_d_automount(struct path *path)
 	}
 
 	if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
-		mnt = nfs_do_refmount(path->mnt->mnt_sb, path->dentry);
+		mnt = nfs_do_refmount(path->dentry);
 	else
-		mnt = nfs_do_submount(path->mnt->mnt_sb, path->dentry, fh, fattr);
+		mnt = nfs_do_submount(path->dentry, fh, fattr);
 	if (IS_ERR(mnt))
 		goto out;
 
@@ -230,19 +229,17 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
 
 /**
  * nfs_do_submount - set up mountpoint when crossing a filesystem boundary
- * @sb - superblock of parent directory
  * @dentry - parent directory
  * @fh - filehandle for new root dentry
  * @fattr - attributes for new root inode
  *
  */
-static struct vfsmount *nfs_do_submount(struct super_block *sb,
-					struct dentry *dentry,
+static struct vfsmount *nfs_do_submount(struct dentry *dentry,
 					struct nfs_fh *fh,
 					struct nfs_fattr *fattr)
 {
 	struct nfs_clone_mount mountdata = {
-		.sb = sb,
+		.sb = dentry->d_sb,
 		.dentry = dentry,
 		.fh = fh,
 		.fattr = fattr,
@@ -262,7 +259,7 @@ static struct vfsmount *nfs_do_submount(struct super_block *sb,
 	mnt = (struct vfsmount *)devname;
 	if (IS_ERR(devname))
 		goto free_page;
-	mnt = nfs_do_clone_mount(NFS_SB(sb), devname, &mountdata);
+	mnt = nfs_do_clone_mount(NFS_SB(dentry->d_sb), devname, &mountdata);
 free_page:
 	free_page((unsigned long)page);
 out:
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 46942e2680a..bb80c49b653 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -161,20 +161,18 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
 
 /**
  * nfs_follow_referral - set up mountpoint when hitting a referral on moved error
- * @sb - superblock of parent directory
  * @dentry - parent directory
  * @locations - array of NFSv4 server location information
  *
  */
-static struct vfsmount *nfs_follow_referral(struct super_block *sb,
-					    struct dentry *dentry,
+static struct vfsmount *nfs_follow_referral(struct dentry *dentry,
 					    const struct nfs4_fs_locations *locations)
 {
 	struct vfsmount *mnt = ERR_PTR(-ENOENT);
 	struct nfs_clone_mount mountdata = {
-		.sb = sb,
+		.sb = dentry->d_sb,
 		.dentry = dentry,
-		.authflavor = NFS_SB(sb)->client->cl_auth->au_flavor,
+		.authflavor = NFS_SB(dentry->d_sb)->client->cl_auth->au_flavor,
 	};
 	char *page = NULL, *page2 = NULL;
 	int loc, error;
@@ -224,7 +222,7 @@ out:
  * @dentry - dentry of referral
  *
  */
-struct vfsmount *nfs_do_refmount(struct super_block *sb, struct dentry *dentry)
+struct vfsmount *nfs_do_refmount(struct dentry *dentry)
 {
 	struct vfsmount *mnt = ERR_PTR(-ENOMEM);
 	struct dentry *parent;
@@ -257,7 +255,7 @@ struct vfsmount *nfs_do_refmount(struct super_block *sb, struct dentry *dentry)
 	    fs_locations->fs_path.ncomponents <= 0)
 		goto out_free;
 
-	mnt = nfs_follow_referral(sb, dentry, fs_locations);
+	mnt = nfs_follow_referral(dentry, fs_locations);
 out_free:
 	__free_page(page);
 	kfree(fs_locations);
-- 
cgit v1.2.3


From c7f404b40a3665d9f4e9a927cc5c1ee0479ed8f9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 16 Mar 2011 06:59:40 -0400
Subject: vfs: new superblock methods to override /proc/*/mount{s,info}

a) ->show_devname(m, mnt) - what to put into devname columns in mounts,
mountinfo and mountstats
b) ->show_path(m, mnt) - what to put into relative path column in mountinfo

Leaving those NULL gives old behaviour.  NFS switched to using those.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 39 ++++++++++++++++++++++++++++++---------
 fs/nfs/super.c | 28 ++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index dffe6f49ab9..75d843ae46d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -978,7 +978,13 @@ static int show_vfsmnt(struct seq_file *m, void *v)
 	int err = 0;
 	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
 
-	mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
+	if (mnt->mnt_sb->s_op->show_devname) {
+		err = mnt->mnt_sb->s_op->show_devname(m, mnt);
+		if (err)
+			goto out;
+	} else {
+		mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
+	}
 	seq_putc(m, ' ');
 	seq_path(m, &mnt_path, " \t\n\\");
 	seq_putc(m, ' ');
@@ -1025,7 +1031,12 @@ static int show_mountinfo(struct seq_file *m, void *v)
 
 	seq_printf(m, "%i %i %u:%u ", mnt->mnt_id, mnt->mnt_parent->mnt_id,
 		   MAJOR(sb->s_dev), MINOR(sb->s_dev));
-	seq_dentry(m, mnt->mnt_root, " \t\n\\");
+	if (sb->s_op->show_path)
+		err = sb->s_op->show_path(m, mnt);
+	else
+		seq_dentry(m, mnt->mnt_root, " \t\n\\");
+	if (err)
+		goto out;
 	seq_putc(m, ' ');
 	seq_path_root(m, &mnt_path, &root, " \t\n\\");
 	if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) {
@@ -1060,7 +1071,12 @@ static int show_mountinfo(struct seq_file *m, void *v)
 	seq_puts(m, " - ");
 	show_type(m, sb);
 	seq_putc(m, ' ');
-	mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
+	if (sb->s_op->show_devname)
+		err = sb->s_op->show_devname(m, mnt);
+	else
+		mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
+	if (err)
+		goto out;
 	seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw");
 	err = show_sb_opts(m, sb);
 	if (err)
@@ -1086,11 +1102,15 @@ static int show_vfsstat(struct seq_file *m, void *v)
 	int err = 0;
 
 	/* device */
-	if (mnt->mnt_devname) {
-		seq_puts(m, "device ");
-		mangle(m, mnt->mnt_devname);
-	} else
-		seq_puts(m, "no device");
+	if (mnt->mnt_sb->s_op->show_devname) {
+		err = mnt->mnt_sb->s_op->show_devname(m, mnt);
+	} else {
+		if (mnt->mnt_devname) {
+			seq_puts(m, "device ");
+			mangle(m, mnt->mnt_devname);
+		} else
+			seq_puts(m, "no device");
+	}
 
 	/* mount point */
 	seq_puts(m, " mounted on ");
@@ -1104,7 +1124,8 @@ static int show_vfsstat(struct seq_file *m, void *v)
 	/* optional statistics */
 	if (mnt->mnt_sb->s_op->show_stats) {
 		seq_putc(m, ' ');
-		err = mnt->mnt_sb->s_op->show_stats(m, mnt);
+		if (!err)
+			err = mnt->mnt_sb->s_op->show_stats(m, mnt);
 	}
 
 	seq_putc(m, '\n');
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index a6ab483c9ad..79bc61fe286 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -263,6 +263,8 @@ static match_table_t nfs_local_lock_tokens = {
 static void nfs_umount_begin(struct super_block *);
 static int  nfs_statfs(struct dentry *, struct kstatfs *);
 static int  nfs_show_options(struct seq_file *, struct vfsmount *);
+static int  nfs_show_devname(struct seq_file *, struct vfsmount *);
+static int  nfs_show_path(struct seq_file *, struct vfsmount *);
 static int  nfs_show_stats(struct seq_file *, struct vfsmount *);
 static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *);
 static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type,
@@ -296,6 +298,8 @@ static const struct super_operations nfs_sops = {
 	.evict_inode	= nfs_evict_inode,
 	.umount_begin	= nfs_umount_begin,
 	.show_options	= nfs_show_options,
+	.show_devname	= nfs_show_devname,
+	.show_path	= nfs_show_path,
 	.show_stats	= nfs_show_stats,
 	.remount_fs	= nfs_remount,
 };
@@ -366,6 +370,8 @@ static const struct super_operations nfs4_sops = {
 	.evict_inode	= nfs4_evict_inode,
 	.umount_begin	= nfs_umount_begin,
 	.show_options	= nfs_show_options,
+	.show_devname	= nfs_show_devname,
+	.show_path	= nfs_show_path,
 	.show_stats	= nfs_show_stats,
 	.remount_fs	= nfs_remount,
 };
@@ -726,6 +732,28 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
 	return 0;
 }
 
+static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt)
+{
+	char *page = (char *) __get_free_page(GFP_KERNEL);
+	char *devname, *dummy;
+	int err = 0;
+	if (!page)
+		return -ENOMEM;
+	devname = nfs_path(&dummy, mnt->mnt_root, page, PAGE_SIZE);
+	if (IS_ERR(devname))
+		err = PTR_ERR(devname);
+	else
+		seq_escape(m, devname, " \t\n\\");
+	free_page((unsigned long)page);
+	return err;
+}
+
+static int nfs_show_path(struct seq_file *m, struct vfsmount *mnt)
+{
+	seq_puts(m, "/");
+	return 0;
+}
+
 /*
  * Present statistical information for this VFS mountpoint
  */
-- 
cgit v1.2.3


From fd462fb51db46c84bea0fc377c11b9a7e16bc1a0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 16 Mar 2011 07:05:45 -0400
Subject: nfs: stop mangling ->mnt_devname on NFS

Now we can do that - nobody cares about its value anymore.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfs/super.c | 23 -----------------------
 1 file changed, 23 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 79bc61fe286..3f967cee407 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2799,26 +2799,6 @@ static struct vfsmount *nfs_do_root_mount(struct file_system_type *fs_type,
 	return root_mnt;
 }
 
-static void nfs_fix_devname(struct dentry *dentry, struct vfsmount *mnt)
-{
-	char *page = (char *) __get_free_page(GFP_KERNEL);
-	char *devname, *tmp;
-	char *dummy;
-
-	if (page == NULL)
-		return;
-	devname = nfs_path(&dummy, dentry, page, PAGE_SIZE);
-	if (IS_ERR(devname))
-		goto out_freepage;
-	tmp = kstrdup(devname, GFP_KERNEL);
-	if (tmp == NULL)
-		goto out_freepage;
-	kfree(mnt->mnt_devname);
-	mnt->mnt_devname = tmp;
-out_freepage:
-	free_page((unsigned long)page);
-}
-
 struct nfs_referral_count {
 	struct list_head list;
 	const struct task_struct *task;
@@ -2920,9 +2900,6 @@ static int nfs_follow_remote_path(struct vfsmount *root_mnt,
 	mnt_target->mnt_sb = s;
 	mnt_target->mnt_root = dget(nd->path.dentry);
 
-	/* Correct the device pathname */
-	nfs_fix_devname(nd->path.dentry, mnt_target);
-
 	path_put(&nd->path);
 	kfree(nd);
 	down_write(&s->s_umount);
-- 
cgit v1.2.3


From 011949811b946bd3b72fca71200f197c6168a5f8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 16 Mar 2011 07:25:36 -0400
Subject: nfs: switch NFS from ->get_sb() to ->mount()

The last remaining instances of ->get_sb() can be converted to ->mount()
now - nothing in them uses new vfsmount anymore.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfs/super.c | 132 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 66 insertions(+), 66 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 3f967cee407..d3286583009 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -266,7 +266,8 @@ static int  nfs_show_options(struct seq_file *, struct vfsmount *);
 static int  nfs_show_devname(struct seq_file *, struct vfsmount *);
 static int  nfs_show_path(struct seq_file *, struct vfsmount *);
 static int  nfs_show_stats(struct seq_file *, struct vfsmount *);
-static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *);
+static struct dentry *nfs_fs_mount(struct file_system_type *,
+		int, const char *, void *);
 static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type,
 		int flags, const char *dev_name, void *raw_data);
 static void nfs_put_super(struct super_block *);
@@ -276,7 +277,7 @@ static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
 static struct file_system_type nfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfs",
-	.get_sb		= nfs_get_sb,
+	.mount		= nfs_fs_mount,
 	.kill_sb	= nfs_kill_super,
 	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -307,16 +308,16 @@ static const struct super_operations nfs_sops = {
 #ifdef CONFIG_NFS_V4
 static int nfs4_validate_text_mount_data(void *options,
 	struct nfs_parsed_mount_data *args, const char *dev_name);
-static int nfs4_try_mount(int flags, const char *dev_name,
-	struct nfs_parsed_mount_data *data, struct vfsmount *mnt);
-static int nfs4_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static struct dentry *nfs4_try_mount(int flags, const char *dev_name,
+	struct nfs_parsed_mount_data *data);
+static struct dentry *nfs4_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data);
 static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *raw_data);
 static struct dentry *nfs4_xdev_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *raw_data);
-static int nfs4_referral_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
+static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data);
 static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *raw_data);
 static void nfs4_kill_super(struct super_block *sb);
@@ -324,7 +325,7 @@ static void nfs4_kill_super(struct super_block *sb);
 static struct file_system_type nfs4_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfs4",
-	.get_sb		= nfs4_get_sb,
+	.mount		= nfs4_mount,
 	.kill_sb	= nfs4_kill_super,
 	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -356,7 +357,7 @@ static struct file_system_type nfs4_remote_referral_fs_type = {
 struct file_system_type nfs4_referral_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfs4",
-	.get_sb		= nfs4_referral_get_sb,
+	.mount		= nfs4_referral_mount,
 	.kill_sb	= nfs4_kill_super,
 	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
@@ -2295,19 +2296,19 @@ static int nfs_bdi_register(struct nfs_server *server)
 	return bdi_register_dev(&server->backing_dev_info, server->s_dev);
 }
 
-static int nfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
+static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data)
 {
 	struct nfs_server *server = NULL;
 	struct super_block *s;
 	struct nfs_parsed_mount_data *data;
 	struct nfs_fh *mntfh;
-	struct dentry *mntroot;
+	struct dentry *mntroot = ERR_PTR(-ENOMEM);
 	int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
 	struct nfs_sb_mountdata sb_mntdata = {
 		.mntflags = flags,
 	};
-	int error = -ENOMEM;
+	int error;
 
 	data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION);
 	mntfh = nfs_alloc_fhandle();
@@ -2318,12 +2319,14 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 
 	/* Validate the mount data */
 	error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name);
-	if (error < 0)
+	if (error < 0) {
+		mntroot = ERR_PTR(error);
 		goto out;
+	}
 
 #ifdef CONFIG_NFS_V4
 	if (data->version == 4) {
-		error = nfs4_try_mount(flags, dev_name, data, mnt);
+		mntroot = nfs4_try_mount(flags, dev_name, data);
 		kfree(data->client_address);
 		kfree(data->nfs_server.export_path);
 		goto out;
@@ -2333,7 +2336,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	/* Get a volume representation */
 	server = nfs_create_server(data, mntfh);
 	if (IS_ERR(server)) {
-		error = PTR_ERR(server);
+		mntroot = ERR_CAST(server);
 		goto out;
 	}
 	sb_mntdata.server = server;
@@ -2344,7 +2347,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	/* Get a superblock - note that we may end up sharing one that already exists */
 	s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata);
 	if (IS_ERR(s)) {
-		error = PTR_ERR(s);
+		mntroot = ERR_CAST(s);
 		goto out_err_nosb;
 	}
 
@@ -2353,8 +2356,10 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 		server = NULL;
 	} else {
 		error = nfs_bdi_register(server);
-		if (error)
+		if (error) {
+			mntroot = ERR_PTR(error);
 			goto error_splat_bdi;
+		}
 	}
 
 	if (!s->s_root) {
@@ -2365,19 +2370,14 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	}
 
 	mntroot = nfs_get_root(s, mntfh, dev_name);
-	if (IS_ERR(mntroot)) {
-		error = PTR_ERR(mntroot);
+	if (IS_ERR(mntroot))
 		goto error_splat_super;
-	}
 
 	error = security_sb_set_mnt_opts(s, &data->lsm_opts);
 	if (error)
 		goto error_splat_root;
 
 	s->s_flags |= MS_ACTIVE;
-	mnt->mnt_sb = s;
-	mnt->mnt_root = mntroot;
-	error = 0;
 
 out:
 	kfree(data->nfs_server.hostname);
@@ -2387,7 +2387,7 @@ out:
 out_free_fh:
 	nfs_free_fhandle(mntfh);
 	kfree(data);
-	return error;
+	return mntroot;
 
 out_err_nosb:
 	nfs_free_server(server);
@@ -2395,6 +2395,7 @@ out_err_nosb:
 
 error_splat_root:
 	dput(mntroot);
+	mntroot = ERR_PTR(error);
 error_splat_super:
 	if (server && !s->s_root)
 		bdi_unregister(&server->backing_dev_info);
@@ -2865,17 +2866,18 @@ static void nfs_referral_loop_unprotect(void)
 	kfree(p);
 }
 
-static int nfs_follow_remote_path(struct vfsmount *root_mnt,
-		const char *export_path, struct vfsmount *mnt_target)
+static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt,
+		const char *export_path)
 {
 	struct nameidata *nd = NULL;
 	struct mnt_namespace *ns_private;
 	struct super_block *s;
+	struct dentry *dentry;
 	int ret;
 
 	nd = kmalloc(sizeof(*nd), GFP_KERNEL);
 	if (nd == NULL)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	ns_private = create_mnt_ns(root_mnt);
 	ret = PTR_ERR(ns_private);
@@ -2897,29 +2899,27 @@ static int nfs_follow_remote_path(struct vfsmount *root_mnt,
 
 	s = nd->path.mnt->mnt_sb;
 	atomic_inc(&s->s_active);
-	mnt_target->mnt_sb = s;
-	mnt_target->mnt_root = dget(nd->path.dentry);
+	dentry = dget(nd->path.dentry);
 
 	path_put(&nd->path);
 	kfree(nd);
 	down_write(&s->s_umount);
-	return 0;
+	return dentry;
 out_put_mnt_ns:
 	put_mnt_ns(ns_private);
 out_mntput:
 	mntput(root_mnt);
 out_err:
 	kfree(nd);
-	return ret;
+	return ERR_PTR(ret);
 }
 
-static int nfs4_try_mount(int flags, const char *dev_name,
-			 struct nfs_parsed_mount_data *data,
-			 struct vfsmount *mnt)
+static struct dentry *nfs4_try_mount(int flags, const char *dev_name,
+			 struct nfs_parsed_mount_data *data)
 {
 	char *export_path;
 	struct vfsmount *root_mnt;
-	int error;
+	struct dentry *res;
 
 	dfprintk(MOUNT, "--> nfs4_try_mount()\n");
 
@@ -2929,26 +2929,25 @@ static int nfs4_try_mount(int flags, const char *dev_name,
 			data->nfs_server.hostname);
 	data->nfs_server.export_path = export_path;
 
-	error = PTR_ERR(root_mnt);
-	if (IS_ERR(root_mnt))
-		goto out;
-
-	error = nfs_follow_remote_path(root_mnt, export_path, mnt);
+	res = ERR_CAST(root_mnt);
+	if (!IS_ERR(root_mnt))
+		res = nfs_follow_remote_path(root_mnt, export_path);
 
-out:
-	dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n", error,
-			error != 0 ? " [error]" : "");
-	return error;
+	dfprintk(MOUNT, "<-- nfs4_try_mount() = %ld%s\n",
+			IS_ERR(res) ? PTR_ERR(res) : 0,
+			IS_ERR(res) ? " [error]" : "");
+	return res;
 }
 
 /*
  * Get the superblock for an NFS4 mountpoint
  */
-static int nfs4_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
+static struct dentry *nfs4_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data)
 {
 	struct nfs_parsed_mount_data *data;
 	int error = -ENOMEM;
+	struct dentry *res = ERR_PTR(-ENOMEM);
 
 	data = nfs_alloc_parsed_mount_data(4);
 	if (data == NULL)
@@ -2956,10 +2955,14 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
 
 	/* Validate the mount data */
 	error = nfs4_validate_mount_data(raw_data, data, dev_name);
-	if (error < 0)
+	if (error < 0) {
+		res = ERR_PTR(error);
 		goto out;
+	}
 
-	error = nfs4_try_mount(flags, dev_name, data, mnt);
+	res = nfs4_try_mount(flags, dev_name, data);
+	if (IS_ERR(res))
+		error = PTR_ERR(res);
 
 out:
 	kfree(data->client_address);
@@ -2968,9 +2971,9 @@ out:
 	kfree(data->fscache_uniq);
 out_free_data:
 	kfree(data);
-	dprintk("<-- nfs4_get_sb() = %d%s\n", error,
+	dprintk("<-- nfs4_mount() = %d%s\n", error,
 			error != 0 ? " [error]" : "");
-	return error;
+	return res;
 }
 
 static void nfs4_kill_super(struct super_block *sb)
@@ -3164,16 +3167,15 @@ error_splat_bdi:
 /*
  * Create an NFS4 server record on referral traversal
  */
-static int nfs4_referral_get_sb(struct file_system_type *fs_type,
-		int flags, const char *dev_name, void *raw_data,
-		struct vfsmount *mnt)
+static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *raw_data)
 {
 	struct nfs_clone_mount *data = raw_data;
 	char *export_path;
 	struct vfsmount *root_mnt;
-	int error;
+	struct dentry *res;
 
-	dprintk("--> nfs4_referral_get_sb()\n");
+	dprintk("--> nfs4_referral_mount()\n");
 
 	export_path = data->mnt_path;
 	data->mnt_path = "/";
@@ -3182,15 +3184,13 @@ static int nfs4_referral_get_sb(struct file_system_type *fs_type,
 			flags, data, data->hostname);
 	data->mnt_path = export_path;
 
-	error = PTR_ERR(root_mnt);
-	if (IS_ERR(root_mnt))
-		goto out;
-
-	error = nfs_follow_remote_path(root_mnt, export_path, mnt);
-out:
-	dprintk("<-- nfs4_referral_get_sb() = %d%s\n", error,
-			error != 0 ? " [error]" : "");
-	return error;
+	res = ERR_CAST(root_mnt);
+	if (!IS_ERR(root_mnt))
+		res = nfs_follow_remote_path(root_mnt, export_path);
+	dprintk("<-- nfs4_referral_mount() = %ld%s\n",
+			IS_ERR(res) ? PTR_ERR(res) : 0,
+			IS_ERR(res) ? " [error]" : "");
+	return res;
 }
 
 #endif /* CONFIG_NFS_V4 */
-- 
cgit v1.2.3


From 1a102ff92579edeff5e3d5d3c76ca49977898f00 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 16 Mar 2011 09:07:58 -0400
Subject: vfs: bury ->get_sb()

This is an ex-parrot.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/super.c | 67 ++++++--------------------------------------------------------
 1 file changed, 6 insertions(+), 61 deletions(-)

(limited to 'fs')

diff --git a/fs/super.c b/fs/super.c
index 7e9dd4cc2c0..4bae0ef6110 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -843,23 +843,6 @@ error:
 }
 EXPORT_SYMBOL(mount_bdev);
 
-int get_sb_bdev(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data,
-	int (*fill_super)(struct super_block *, void *, int),
-	struct vfsmount *mnt)
-{
-	struct dentry *root;
-
-	root = mount_bdev(fs_type, flags, dev_name, data, fill_super);
-	if (IS_ERR(root))
-		return PTR_ERR(root);
-	mnt->mnt_root = root;
-	mnt->mnt_sb = root->d_sb;
-	return 0;
-}
-
-EXPORT_SYMBOL(get_sb_bdev);
-
 void kill_block_super(struct super_block *sb)
 {
 	struct block_device *bdev = sb->s_bdev;
@@ -897,22 +880,6 @@ struct dentry *mount_nodev(struct file_system_type *fs_type,
 }
 EXPORT_SYMBOL(mount_nodev);
 
-int get_sb_nodev(struct file_system_type *fs_type,
-	int flags, void *data,
-	int (*fill_super)(struct super_block *, void *, int),
-	struct vfsmount *mnt)
-{
-	struct dentry *root;
-
-	root = mount_nodev(fs_type, flags, data, fill_super);
-	if (IS_ERR(root))
-		return PTR_ERR(root);
-	mnt->mnt_root = root;
-	mnt->mnt_sb = root->d_sb;
-	return 0;
-}
-EXPORT_SYMBOL(get_sb_nodev);
-
 static int compare_single(struct super_block *s, void *p)
 {
 	return 1;
@@ -943,22 +910,6 @@ struct dentry *mount_single(struct file_system_type *fs_type,
 }
 EXPORT_SYMBOL(mount_single);
 
-int get_sb_single(struct file_system_type *fs_type,
-	int flags, void *data,
-	int (*fill_super)(struct super_block *, void *, int),
-	struct vfsmount *mnt)
-{
-	struct dentry *root;
-	root = mount_single(fs_type, flags, data, fill_super);
-	if (IS_ERR(root))
-		return PTR_ERR(root);
-	mnt->mnt_root = root;
-	mnt->mnt_sb = root->d_sb;
-	return 0;
-}
-
-EXPORT_SYMBOL(get_sb_single);
-
 struct vfsmount *
 vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
 {
@@ -988,19 +939,13 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
 			goto out_free_secdata;
 	}
 
-	if (type->mount) {
-		root = type->mount(type, flags, name, data);
-		if (IS_ERR(root)) {
-			error = PTR_ERR(root);
-			goto out_free_secdata;
-		}
-		mnt->mnt_root = root;
-		mnt->mnt_sb = root->d_sb;
-	} else {
-		error = type->get_sb(type, flags, name, data, mnt);
-		if (error < 0)
-			goto out_free_secdata;
+	root = type->mount(type, flags, name, data);
+	if (IS_ERR(root)) {
+		error = PTR_ERR(root);
+		goto out_free_secdata;
 	}
+	mnt->mnt_root = root;
+	mnt->mnt_sb = root->d_sb;
 	BUG_ON(!mnt->mnt_sb);
 	WARN_ON(!mnt->mnt_sb->s_bdi);
 	mnt->mnt_sb->s_flags |= MS_BORN;
-- 
cgit v1.2.3


From 4ee2491ed8569f370bf4c1a4c046a6efb8032bd2 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Thu, 17 Mar 2011 10:51:40 +0100
Subject: fs: make fsync_buffers_list() plug

It used WRITE_SYNC_PLUG before and potentially submits a batch
of IO, so let's enable plugging for this case.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/buffer.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 42534f67d71..2e6b1a387b7 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -743,8 +743,10 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 	struct list_head tmp;
 	struct address_space *mapping;
 	int err = 0, err2;
+	struct blk_plug plug;
 
 	INIT_LIST_HEAD(&tmp);
+	blk_start_plug(&plug);
 
 	spin_lock(lock);
 	while (!list_empty(list)) {
@@ -781,6 +783,10 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 		}
 	}
 
+	spin_unlock(lock);
+	blk_finish_plug(&plug);
+	spin_lock(lock);
+
 	while (!list_empty(&tmp)) {
 		bh = BH_ENTRY(tmp.prev);
 		get_bh(bh);
-- 
cgit v1.2.3


From 65ab80279d7c323ce151e858d951e2684df72a97 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Thu, 17 Mar 2011 10:56:45 +0100
Subject: jbd: finish conversion from WRITE_SYNC_PLUG to WRITE_SYNC and
 explicit plugging

'write_op' was still used, even though it was always WRITE_SYNC now.
Add plugging around the cases where it submits IO, and flush them
before we end up waiting for that IO.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/jbd/commit.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 66be299acb1..da871ee084d 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -20,6 +20,7 @@
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/bio.h>
+#include <linux/blkdev.h>
 
 /*
  * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -294,7 +295,7 @@ void journal_commit_transaction(journal_t *journal)
 	int first_tag = 0;
 	int tag_flag;
 	int i;
-	int write_op = WRITE_SYNC;
+	struct blk_plug plug;
 
 	/*
 	 * First job: lock down the current transaction and wait for
@@ -327,13 +328,6 @@ void journal_commit_transaction(journal_t *journal)
 	spin_lock(&journal->j_state_lock);
 	commit_transaction->t_state = T_LOCKED;
 
-	/*
-	 * Use plugged writes here, since we want to submit several before
-	 * we unplug the device. We don't do explicit unplugging in here,
-	 * instead we rely on sync_buffer() doing the unplug for us.
-	 */
-	if (commit_transaction->t_synchronous_commit)
-		write_op = WRITE_SYNC;
 	spin_lock(&commit_transaction->t_handle_lock);
 	while (commit_transaction->t_updates) {
 		DEFINE_WAIT(wait);
@@ -418,8 +412,10 @@ void journal_commit_transaction(journal_t *journal)
 	 * Now start flushing things to disk, in the order they appear
 	 * on the transaction lists.  Data blocks go first.
 	 */
+	blk_start_plug(&plug);
 	err = journal_submit_data_buffers(journal, commit_transaction,
-					  write_op);
+					  WRITE_SYNC);
+	blk_finish_plug(&plug);
 
 	/*
 	 * Wait for all previously submitted IO to complete.
@@ -480,7 +476,9 @@ void journal_commit_transaction(journal_t *journal)
 		err = 0;
 	}
 
-	journal_write_revoke_records(journal, commit_transaction, write_op);
+	blk_start_plug(&plug);
+
+	journal_write_revoke_records(journal, commit_transaction, WRITE_SYNC);
 
 	/*
 	 * If we found any dirty or locked buffers, then we should have
@@ -650,7 +648,7 @@ start_journal_io:
 				clear_buffer_dirty(bh);
 				set_buffer_uptodate(bh);
 				bh->b_end_io = journal_end_buffer_io_sync;
-				submit_bh(write_op, bh);
+				submit_bh(WRITE_SYNC, bh);
 			}
 			cond_resched();
 
@@ -661,6 +659,8 @@ start_journal_io:
 		}
 	}
 
+	blk_finish_plug(&plug);
+
 	/* Lo and behold: we have just managed to send a transaction to
            the log.  Before we can commit it, wait for the IO so far to
            complete.  Control buffers being written are on the
-- 
cgit v1.2.3


From 82f04ab47e1d94d78503591a7460b2cad9601ede Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Thu, 17 Mar 2011 11:01:52 +0100
Subject: jbd2: finish conversion from WRITE_SYNC_PLUG to WRITE_SYNC and
 explicit plugging

'write_op' was still used, even though it was always WRITE_SYNC now.
Add plugging around the cases where it submits IO, and flush them
before we end up waiting for that IO.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/jbd2/commit.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 3da1cc4346d..fa36d7662b2 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -329,7 +329,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	int tag_bytes = journal_tag_bytes(journal);
 	struct buffer_head *cbh = NULL; /* For transactional checksums */
 	__u32 crc32_sum = ~0;
-	int write_op = WRITE_SYNC;
+	struct blk_plug plug;
 
 	/*
 	 * First job: lock down the current transaction and wait for
@@ -363,13 +363,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	write_lock(&journal->j_state_lock);
 	commit_transaction->t_state = T_LOCKED;
 
-	/*
-	 * Use plugged writes here, since we want to submit several before
-	 * we unplug the device. We don't do explicit unplugging in here,
-	 * instead we rely on sync_buffer() doing the unplug for us.
-	 */
-	if (commit_transaction->t_synchronous_commit)
-		write_op = WRITE_SYNC;
 	trace_jbd2_commit_locking(journal, commit_transaction);
 	stats.run.rs_wait = commit_transaction->t_max_wait;
 	stats.run.rs_locked = jiffies;
@@ -469,8 +462,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	if (err)
 		jbd2_journal_abort(journal, err);
 
+	blk_start_plug(&plug);
 	jbd2_journal_write_revoke_records(journal, commit_transaction,
-					  write_op);
+					  WRITE_SYNC);
+	blk_finish_plug(&plug);
 
 	jbd_debug(3, "JBD: commit phase 2\n");
 
@@ -497,6 +492,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	err = 0;
 	descriptor = NULL;
 	bufs = 0;
+	blk_start_plug(&plug);
 	while (commit_transaction->t_buffers) {
 
 		/* Find the next buffer to be journaled... */
@@ -658,7 +654,7 @@ start_journal_io:
 				clear_buffer_dirty(bh);
 				set_buffer_uptodate(bh);
 				bh->b_end_io = journal_end_buffer_io_sync;
-				submit_bh(write_op, bh);
+				submit_bh(WRITE_SYNC, bh);
 			}
 			cond_resched();
 			stats.run.rs_blocks_logged += bufs;
@@ -699,6 +695,8 @@ start_journal_io:
 			__jbd2_journal_abort_hard(journal);
 	}
 
+	blk_finish_plug(&plug);
+
 	/* Lo and behold: we have just managed to send a transaction to
            the log.  Before we can commit it, wait for the IO so far to
            complete.  Control buffers being written are on the
-- 
cgit v1.2.3


From a91a2785b200864aef2270ed6a3babac7a253a20 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Thu, 17 Mar 2011 11:11:05 +0100
Subject: block: Require subsystems to explicitly allocate bio_set integrity
 mempool

MD and DM create a new bio_set for every metadevice. Each bio_set has an
integrity mempool attached regardless of whether the metadevice is
capable of passing integrity metadata. This is a waste of memory.

Instead we defer the allocation decision to MD and DM since we know at
metadevice creation time whether integrity passthrough is needed or not.

Automatic integrity mempool allocation can then be removed from
bioset_create() and we make an explicit integrity allocation for the
fs_bio_set.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Reported-by: Zdenek Kabelac <zkabelac@redhat.com>
Acked-by: Mike Snitzer <snizer@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/bio-integrity.c | 3 +++
 fs/bio.c           | 6 +++---
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index e49cce234c6..9c5e6b2cd11 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -761,6 +761,9 @@ int bioset_integrity_create(struct bio_set *bs, int pool_size)
 {
 	unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES);
 
+	if (bs->bio_integrity_pool)
+		return 0;
+
 	bs->bio_integrity_pool =
 		mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab);
 
diff --git a/fs/bio.c b/fs/bio.c
index 5694b756ed0..85e2eabb1f8 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1636,9 +1636,6 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
 	if (!bs->bio_pool)
 		goto bad;
 
-	if (bioset_integrity_create(bs, pool_size))
-		goto bad;
-
 	if (!biovec_create_pools(bs, pool_size))
 		return bs;
 
@@ -1682,6 +1679,9 @@ static int __init init_bio(void)
 	if (!fs_bio_set)
 		panic("bio: can't allocate bios\n");
 
+	if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE))
+		panic("bio: can't create integrity pool\n");
+
 	bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES,
 						     sizeof(struct bio_pair));
 	if (!bio_split_pool)
-- 
cgit v1.2.3


From 95f28604a65b1c40b6c6cd95e58439cd7ded3add Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Thu, 17 Mar 2011 11:13:12 +0100
Subject: fs: assign sb->s_bdi to default_backing_dev_info if the bdi is going
 away

We don't have proper reference counting for this yet, so we run into
cases where the device is pulled and we OOPS on flushing the fs data.
This happens even though the dirty inodes have already been
migrated to the default_backing_dev_info.

Reported-by: Torsten Hilbrich <torsten.hilbrich@secunet.com>
Tested-by: Torsten Hilbrich <torsten.hilbrich@secunet.com>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/super.c | 2 ++
 fs/sync.c  | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/super.c b/fs/super.c
index 7e9dd4cc2c0..0d89e93f654 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -71,6 +71,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
 #else
 		INIT_LIST_HEAD(&s->s_files);
 #endif
+		s->s_bdi = &default_backing_dev_info;
 		INIT_LIST_HEAD(&s->s_instances);
 		INIT_HLIST_BL_HEAD(&s->s_anon);
 		INIT_LIST_HEAD(&s->s_inodes);
@@ -1003,6 +1004,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
 	}
 	BUG_ON(!mnt->mnt_sb);
 	WARN_ON(!mnt->mnt_sb->s_bdi);
+	WARN_ON(mnt->mnt_sb->s_bdi == &default_backing_dev_info);
 	mnt->mnt_sb->s_flags |= MS_BORN;
 
 	error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata);
diff --git a/fs/sync.c b/fs/sync.c
index ba76b9623e7..412dc89163d 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -33,7 +33,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
 	 * This should be safe, as we require bdi backing to actually
 	 * write out data in the first place
 	 */
-	if (!sb->s_bdi || sb->s_bdi == &noop_backing_dev_info)
+	if (sb->s_bdi == &noop_backing_dev_info)
 		return 0;
 
 	if (sb->s_qcop && sb->s_qcop->quota_sync)
@@ -79,7 +79,7 @@ EXPORT_SYMBOL_GPL(sync_filesystem);
 
 static void sync_one_sb(struct super_block *sb, void *arg)
 {
-	if (!(sb->s_flags & MS_RDONLY) && sb->s_bdi)
+	if (!(sb->s_flags & MS_RDONLY))
 		__sync_filesystem(sb, *(int *)arg);
 }
 /*
-- 
cgit v1.2.3


From 0ccd234ca04b09a156f8771af316ac9de2fa7312 Mon Sep 17 00:00:00 2001
From: matt mooney <mfm@muteddisk.com>
Date: Fri, 14 Jan 2011 06:12:35 -0800
Subject: fs: change to new flag variable

Replace EXTRA_CFLAGS with ccflags-y. And change ntfs-objs to ntfs-y
for cleaner conditional inclusion.

Signed-off-by: matt mooney <mfm@muteddisk.com>
Acked-by: WANG Cong <xiyou.wangcong@gmail.com>
Signed-off-by: Michal Marek <mmarek@suse.cz>
---
 fs/affs/Makefile        |  2 +-
 fs/coda/Makefile        |  2 +-
 fs/gfs2/Makefile        |  2 +-
 fs/jfs/Makefile         |  2 +-
 fs/ncpfs/Makefile       |  2 +-
 fs/ntfs/Makefile        | 19 +++++++------------
 fs/ocfs2/Makefile       |  4 ++--
 fs/ocfs2/dlm/Makefile   |  2 +-
 fs/ocfs2/dlmfs/Makefile |  2 +-
 fs/reiserfs/Makefile    |  4 +---
 fs/xfs/Makefile         |  7 ++-----
 11 files changed, 19 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/affs/Makefile b/fs/affs/Makefile
index b2c4f54446f..3988b4a7833 100644
--- a/fs/affs/Makefile
+++ b/fs/affs/Makefile
@@ -2,7 +2,7 @@
 # Makefile for the Linux affs filesystem routines.
 #
 
-#EXTRA_CFLAGS=-DDEBUG=1
+#ccflags-y := -DDEBUG=1
 
 obj-$(CONFIG_AFFS_FS) += affs.o
 
diff --git a/fs/coda/Makefile b/fs/coda/Makefile
index 6c22e61da39..1bab69a0d34 100644
--- a/fs/coda/Makefile
+++ b/fs/coda/Makefile
@@ -9,4 +9,4 @@ coda-objs := psdev.o cache.o cnode.o inode.o dir.o file.o upcall.o \
 
 # If you want debugging output, please uncomment the following line.
 
-# EXTRA_CFLAGS += -DDEBUG -DDEBUG_SMB_MALLOC=1
+# ccflags-y := -DDEBUG -DDEBUG_SMB_MALLOC=1
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index 21f7e46da4c..f3d23ef4e87 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,4 +1,4 @@
-EXTRA_CFLAGS := -I$(src)
+ccflags-y := -I$(src)
 obj-$(CONFIG_GFS2_FS) += gfs2.o
 gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \
 	glops.o inode.o log.o lops.o main.o meta_io.o \
diff --git a/fs/jfs/Makefile b/fs/jfs/Makefile
index 3adb6395e42..a58fa72d7e5 100644
--- a/fs/jfs/Makefile
+++ b/fs/jfs/Makefile
@@ -13,4 +13,4 @@ jfs-y    := super.o file.o inode.o namei.o jfs_mount.o jfs_umount.o \
 
 jfs-$(CONFIG_JFS_POSIX_ACL) += acl.o
 
-EXTRA_CFLAGS += -D_JFS_4K
+ccflags-y := -D_JFS_4K
diff --git a/fs/ncpfs/Makefile b/fs/ncpfs/Makefile
index 68ea095100a..c66af563f2c 100644
--- a/fs/ncpfs/Makefile
+++ b/fs/ncpfs/Makefile
@@ -11,6 +11,6 @@ ncpfs-$(CONFIG_NCPFS_EXTRAS)   += symlink.o
 ncpfs-$(CONFIG_NCPFS_NFS_NS)   += symlink.o
 
 # If you want debugging output, please uncomment the following line
-# EXTRA_CFLAGS += -DDEBUG_NCP=1
+# ccflags-y := -DDEBUG_NCP=1
 
 CFLAGS_ncplib_kernel.o := -finline-functions
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index 4ff028fcfd6..30206b23843 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -2,18 +2,13 @@
 
 obj-$(CONFIG_NTFS_FS) += ntfs.o
 
-ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
-	     index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
-	     unistr.o upcase.o
+ntfs-y := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
+	  index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
+	  unistr.o upcase.o
 
-EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.30\"
+ntfs-$(CONFIG_NTFS_RW) += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o
 
-ifeq ($(CONFIG_NTFS_DEBUG),y)
-EXTRA_CFLAGS += -DDEBUG
-endif
+ccflags-y := -DNTFS_VERSION=\"2.1.30\"
+ccflags-$(CONFIG_NTFS_DEBUG)	+= -DDEBUG
+ccflags-$(CONFIG_NTFS_RW)	+= -DNTFS_RW
 
-ifeq ($(CONFIG_NTFS_RW),y)
-EXTRA_CFLAGS += -DNTFS_RW
-
-ntfs-objs += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o
-endif
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 07d9fd85435..d8a0313e99e 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -1,6 +1,6 @@
-EXTRA_CFLAGS += -Ifs/ocfs2
+ccflags-y := -Ifs/ocfs2
 
-EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES
+ccflags-y += -DCATCH_BH_JBD_RACES
 
 obj-$(CONFIG_OCFS2_FS) += 	\
 	ocfs2.o			\
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index dcebf0d920f..c8a044efbb1 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,4 +1,4 @@
-EXTRA_CFLAGS += -Ifs/ocfs2
+ccflags-y := -Ifs/ocfs2
 
 obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
 
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile
index df69b4856d0..f14be89a670 100644
--- a/fs/ocfs2/dlmfs/Makefile
+++ b/fs/ocfs2/dlmfs/Makefile
@@ -1,4 +1,4 @@
-EXTRA_CFLAGS += -Ifs/ocfs2
+ccflags-y := -Ifs/ocfs2
 
 obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
 
diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile
index 792b3cb2cd1..3c3b0016511 100644
--- a/fs/reiserfs/Makefile
+++ b/fs/reiserfs/Makefile
@@ -31,9 +31,7 @@ endif
 # and causing a panic. Since this behavior only affects ppc32, this ifeq
 # will work around it. If any other architecture displays this behavior,
 # add it here.
-ifeq ($(CONFIG_PPC32),y)
-EXTRA_CFLAGS := $(call cc-ifversion, -lt, 0400, -O1)
-endif
+ccflags-$(CONFIG_PPC32) := $(call cc-ifversion, -lt, 0400, -O1)
 
 TAGS:
 	etags *.c
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index faca4499709..82b1371775c 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -16,14 +16,11 @@
 # Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 #
 
-EXTRA_CFLAGS +=	 -I$(src) -I$(src)/linux-2.6
+ccflags-y := -I$(src) -I$(src)/linux-2.6
+ccflags-$(CONFIG_XFS_DEBUG) += -g
 
 XFS_LINUX := linux-2.6
 
-ifeq ($(CONFIG_XFS_DEBUG),y)
-	EXTRA_CFLAGS += -g
-endif
-
 obj-$(CONFIG_XFS_FS)		+= xfs.o
 
 xfs-y				+= linux-2.6/xfs_trace.o
-- 
cgit v1.2.3


From 9ae78bcc000168251f893b1bf92a848308187695 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 17 Mar 2011 12:52:33 -0400
Subject: nfsd4: fix comment and remove unused nfs4_file fields

A couple fields here were left over from a previous version of a patch,
and are no longer used.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/state.h | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 2d31224b07b..6bd2f3c21f2 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -367,16 +367,12 @@ struct nfs4_file {
 	struct list_head	fi_delegations;
 	/* One each for O_RDONLY, O_WRONLY, O_RDWR: */
 	struct file *		fi_fds[3];
-	/* One each for O_RDONLY, O_WRONLY: */
-	atomic_t		fi_access[2];
 	/*
-	 * Each open stateid contributes 1 to either fi_readers or
-	 * fi_writers, or both, depending on the open mode.  A
-	 * delegation also takes an fi_readers reference.  Lock
-	 * stateid's take none.
+	 * Each open or lock stateid contributes 1 to either
+	 * fi_access[O_RDONLY], fi_access[O_WRONLY], or both, depending
+	 * on open or lock mode:
 	 */
-	atomic_t		fi_readers;
-	atomic_t		fi_writers;
+	atomic_t		fi_access[2];
 	struct file		*fi_deleg_file;
 	struct file_lock	*fi_lease;
 	atomic_t		fi_delegees;
-- 
cgit v1.2.3


From 5a02ab7c3c4580f94d13c683721039855b67cda6 Mon Sep 17 00:00:00 2001
From: Mi Jinlong <mijinlong@cn.fujitsu.com>
Date: Fri, 11 Mar 2011 12:13:55 +0800
Subject: nfsd: wrong index used in inner loop

We must not use dummy as both the inner loop's bound and its read target.
After the first iteration, READ32(dummy) overwrites dummy, corrupting the bound.

Signed-off-by: Mi Jinlong <mijinlong@cn.fujitsu.com>
[bfields@redhat.com: Trond points out READ_BUF alone is sufficient.]
Cc: stable@kernel.org
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4xdr.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 615f0a9f060..c6766af00d9 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1142,7 +1142,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
 
 	u32 dummy;
 	char *machine_name;
-	int i, j;
+	int i;
 	int nr_secflavs;
 
 	READ_BUF(16);
@@ -1215,8 +1215,6 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
 			READ_BUF(4);
 			READ32(dummy);
 			READ_BUF(dummy * 4);
-			for (j = 0; j < dummy; ++j)
-				READ32(dummy);
 			break;
 		case RPC_AUTH_GSS:
 			dprintk("RPC_AUTH_GSS callback secflavor "
@@ -1232,7 +1230,6 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
 			READ_BUF(4);
 			READ32(dummy);
 			READ_BUF(dummy);
-			p += XDR_QUADLEN(dummy);
 			break;
 		default:
 			dprintk("Illegal callback secflavor\n");
-- 
cgit v1.2.3


From 474a00ee1306eb7e82329fdc28b6471a99facba1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 17 Mar 2011 21:31:32 -0400
Subject: kill simple_set_mnt()

not needed anymore, since all users (->get_sb() instances) are gone.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index d7513485c1f..a2a01a104ab 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -466,14 +466,6 @@ static void __mnt_unmake_readonly(struct vfsmount *mnt)
 	br_write_unlock(vfsmount_lock);
 }
 
-void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
-{
-	mnt->mnt_sb = sb;
-	mnt->mnt_root = dget(sb->s_root);
-}
-
-EXPORT_SYMBOL(simple_set_mnt);
-
 void free_vfsmnt(struct vfsmount *mnt)
 {
 	kfree(mnt->mnt_devname);
-- 
cgit v1.2.3


From fbe0aa1f3d16fac5b641c0c1697371dcbe45b569 Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@gmail.com>
Date: Thu, 17 Mar 2011 16:29:15 -0700
Subject: Some fixes for pstore

1) Change from ->get_sb() to ->mount()
2) Use mount_single() instead of mount_nodev()
3) Pulled in ramfs_get_inode() & trimmed to what I need for pstore
4) Drop the ugly pstore_writefile(). Just save data using kmalloc() and
   provide a pstore_file_read() that uses simple_read_from_buffer().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pstore/inode.c | 116 ++++++++++++++++++++++++++----------------------------
 1 file changed, 56 insertions(+), 60 deletions(-)

(limited to 'fs')

diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 549d245d0b4..08342232cb1 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -40,9 +40,29 @@
 struct pstore_private {
 	u64	id;
 	int	(*erase)(u64);
+	ssize_t	size;
+	char	data[];
 };
 
-#define pstore_get_inode ramfs_get_inode
+static int pstore_file_open(struct inode *inode, struct file *file)
+{
+	file->private_data = inode->i_private;
+	return 0;
+}
+
+static ssize_t pstore_file_read(struct file *file, char __user *userbuf,
+						size_t count, loff_t *ppos)
+{
+	struct pstore_private *ps = file->private_data;
+
+	return simple_read_from_buffer(userbuf, count, ppos, ps->data, ps->size);
+}
+
+static const struct file_operations pstore_file_operations = {
+	.open	= pstore_file_open,
+	.read	= pstore_file_read,
+	.llseek	= default_llseek,
+};
 
 /*
  * When a file is unlinked from our file system we call the
@@ -63,6 +83,30 @@ static const struct inode_operations pstore_dir_inode_operations = {
 	.unlink		= pstore_unlink,
 };
 
+static struct inode *pstore_get_inode(struct super_block *sb,
+					const struct inode *dir, int mode, dev_t dev)
+{
+	struct inode *inode = new_inode(sb);
+
+	if (inode) {
+		inode->i_ino = get_next_ino();
+		inode->i_uid = inode->i_gid = 0;
+		inode->i_mode = mode;
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		switch (mode & S_IFMT) {
+		case S_IFREG:
+			inode->i_fop = &pstore_file_operations;
+			break;
+		case S_IFDIR:
+			inode->i_op = &pstore_dir_inode_operations;
+			inode->i_fop = &simple_dir_operations;
+			inc_nlink(inode);
+			break;
+		}
+	}
+	return inode;
+}
+
 static const struct super_operations pstore_ops = {
 	.statfs		= simple_statfs,
 	.drop_inode	= generic_delete_inode,
@@ -70,37 +114,10 @@ static const struct super_operations pstore_ops = {
 };
 
 static struct super_block *pstore_sb;
-static struct vfsmount *pstore_mnt;
 
 int pstore_is_mounted(void)
 {
-	return pstore_mnt != NULL;
-}
-
-/*
- * Set up a file structure as if we had opened this file and
- * write our data to it.
- */
-static int pstore_writefile(struct inode *inode, struct dentry *dentry,
-	char *data, size_t size)
-{
-	struct file f;
-	ssize_t n;
-	mm_segment_t old_fs = get_fs();
-
-	memset(&f, '0', sizeof f);
-	f.f_mapping = inode->i_mapping;
-	f.f_path.dentry = dentry;
-	f.f_path.mnt = pstore_mnt;
-	f.f_pos = 0;
-	f.f_op = inode->i_fop;
-	set_fs(KERNEL_DS);
-	n = do_sync_write(&f, data, size, &f.f_pos);
-	set_fs(old_fs);
-
-	fsnotify_modify(&f);
-
-	return n == size;
+	return pstore_sb != NULL;
 }
 
 /*
@@ -123,8 +140,7 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
 	inode = pstore_get_inode(pstore_sb, root->d_inode, S_IFREG | 0444, 0);
 	if (!inode)
 		goto fail;
-	inode->i_uid = inode->i_gid = 0;
-	private = kmalloc(sizeof *private, GFP_KERNEL);
+	private = kmalloc(sizeof *private + size, GFP_KERNEL);
 	if (!private)
 		goto fail_alloc;
 	private->id = id;
@@ -152,28 +168,19 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
 	if (IS_ERR(dentry))
 		goto fail_lockedalloc;
 
-	d_add(dentry, inode);
-
-	mutex_unlock(&root->d_inode->i_mutex);
-
-	if (!pstore_writefile(inode, dentry, data, size))
-		goto fail_write;
+	memcpy(private->data, data, size);
+	inode->i_size = private->size = size;
 
 	inode->i_private = private;
 
 	if (time.tv_sec)
 		inode->i_mtime = inode->i_ctime = time;
 
-	return 0;
+	d_add(dentry, inode);
 
-fail_write:
-	kfree(private);
-	inode->i_nlink--;
-	mutex_lock(&root->d_inode->i_mutex);
-	d_delete(dentry);
-	dput(dentry);
 	mutex_unlock(&root->d_inode->i_mutex);
-	goto fail;
+
+	return 0;
 
 fail_lockedalloc:
 	mutex_unlock(&root->d_inode->i_mutex);
@@ -225,32 +232,21 @@ fail:
 	return err;
 }
 
-static int pstore_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *pstore_mount(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
 {
-	struct dentry *root;
-
-	root = mount_nodev(fs_type, flags, data, pstore_fill_super);
-	if (IS_ERR(root))
-		return -ENOMEM;
-
-	mnt->mnt_root = root;
-	mnt->mnt_sb = root->d_sb;
-	pstore_mnt = mnt;
-
-	return 0;
+	return mount_single(fs_type, flags, data, pstore_fill_super);
 }
 
 static void pstore_kill_sb(struct super_block *sb)
 {
 	kill_litter_super(sb);
 	pstore_sb = NULL;
-	pstore_mnt = NULL;
 }
 
 static struct file_system_type pstore_fs_type = {
 	.name		= "pstore",
-	.get_sb		= pstore_get_sb,
+	.mount		= pstore_mount,
 	.kill_sb	= pstore_kill_sb,
 };
 
-- 
cgit v1.2.3


From 9d412a43c3b26e1e549319e5eec26f0829f9f74d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 17 Mar 2011 22:08:28 -0400
Subject: vfs: split off vfsmount-related parts of vfs_kern_mount()

new function: mount_fs().  Does all work done by vfs_kern_mount()
except the allocation and filling of vfsmount; returns root dentry
or ERR_PTR().

vfs_kern_mount() switched to using it and taken to fs/namespace.c,
along with its wrappers.

alloc_vfsmnt()/free_vfsmnt() made static.

functions in namespace.c slightly reordered.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/internal.h  |   5 +-
 fs/namespace.c | 153 ++++++++++++++++++++++++++++++++++++++++++---------------
 fs/super.c     |  96 ++++++------------------------------
 3 files changed, 132 insertions(+), 122 deletions(-)

(limited to 'fs')

diff --git a/fs/internal.h b/fs/internal.h
index f3d15de44b1..17191546d52 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -12,6 +12,7 @@
 #include <linux/lglock.h>
 
 struct super_block;
+struct file_system_type;
 struct linux_binprm;
 struct path;
 
@@ -61,8 +62,6 @@ extern int check_unsafe_exec(struct linux_binprm *);
 extern int copy_mount_options(const void __user *, unsigned long *);
 extern int copy_mount_string(const void __user *, char **);
 
-extern void free_vfsmnt(struct vfsmount *);
-extern struct vfsmount *alloc_vfsmnt(const char *);
 extern unsigned int mnt_get_count(struct vfsmount *mnt);
 extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int);
 extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
@@ -99,6 +98,8 @@ extern struct file *get_empty_filp(void);
 extern int do_remount_sb(struct super_block *, int, void *, int);
 extern void __put_super(struct super_block *sb);
 extern void put_super(struct super_block *sb);
+extern struct dentry *mount_fs(struct file_system_type *,
+			       int, const char *, void *);
 
 /*
  * open.c
diff --git a/fs/namespace.c b/fs/namespace.c
index a2a01a104ab..453529f72df 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -196,7 +196,7 @@ unsigned int mnt_get_count(struct vfsmount *mnt)
 #endif
 }
 
-struct vfsmount *alloc_vfsmnt(const char *name)
+static struct vfsmount *alloc_vfsmnt(const char *name)
 {
 	struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
 	if (mnt) {
@@ -466,7 +466,7 @@ static void __mnt_unmake_readonly(struct vfsmount *mnt)
 	br_write_unlock(vfsmount_lock);
 }
 
-void free_vfsmnt(struct vfsmount *mnt)
+static void free_vfsmnt(struct vfsmount *mnt)
 {
 	kfree(mnt->mnt_devname);
 	mnt_free_id(mnt);
@@ -670,6 +670,36 @@ static struct vfsmount *skip_mnt_tree(struct vfsmount *p)
 	return p;
 }
 
+struct vfsmount *
+vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
+{
+	struct vfsmount *mnt;
+	struct dentry *root;
+
+	if (!type)
+		return ERR_PTR(-ENODEV);
+
+	mnt = alloc_vfsmnt(name);
+	if (!mnt)
+		return ERR_PTR(-ENOMEM);
+
+	if (flags & MS_KERNMOUNT)
+		mnt->mnt_flags = MNT_INTERNAL;
+
+	root = mount_fs(type, flags, name, data);
+	if (IS_ERR(root)) {
+		free_vfsmnt(mnt);
+		return ERR_CAST(root);
+	}
+
+	mnt->mnt_root = root;
+	mnt->mnt_sb = root->d_sb;
+	mnt->mnt_mountpoint = mnt->mnt_root;
+	mnt->mnt_parent = mnt;
+	return mnt;
+}
+EXPORT_SYMBOL_GPL(vfs_kern_mount);
+
 static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 					int flag)
 {
@@ -1905,7 +1935,81 @@ out:
 	return err;
 }
 
-static int do_add_mount(struct vfsmount *, struct path *, int);
+static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
+{
+	int err;
+	const char *subtype = strchr(fstype, '.');
+	if (subtype) {
+		subtype++;
+		err = -EINVAL;
+		if (!subtype[0])
+			goto err;
+	} else
+		subtype = "";
+
+	mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
+	err = -ENOMEM;
+	if (!mnt->mnt_sb->s_subtype)
+		goto err;
+	return mnt;
+
+ err:
+	mntput(mnt);
+	return ERR_PTR(err);
+}
+
+struct vfsmount *
+do_kern_mount(const char *fstype, int flags, const char *name, void *data)
+{
+	struct file_system_type *type = get_fs_type(fstype);
+	struct vfsmount *mnt;
+	if (!type)
+		return ERR_PTR(-ENODEV);
+	mnt = vfs_kern_mount(type, flags, name, data);
+	if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
+	    !mnt->mnt_sb->s_subtype)
+		mnt = fs_set_subtype(mnt, fstype);
+	put_filesystem(type);
+	return mnt;
+}
+EXPORT_SYMBOL_GPL(do_kern_mount);
+
+/*
+ * add a mount into a namespace's mount tree
+ */
+static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags)
+{
+	int err;
+
+	mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
+
+	down_write(&namespace_sem);
+	/* Something was mounted here while we slept */
+	err = follow_down(path, true);
+	if (err < 0)
+		goto unlock;
+
+	err = -EINVAL;
+	if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
+		goto unlock;
+
+	/* Refuse the same filesystem on the same mount point */
+	err = -EBUSY;
+	if (path->mnt->mnt_sb == newmnt->mnt_sb &&
+	    path->mnt->mnt_root == path->dentry)
+		goto unlock;
+
+	err = -EINVAL;
+	if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
+		goto unlock;
+
+	newmnt->mnt_flags = mnt_flags;
+	err = graft_tree(newmnt, path);
+
+unlock:
+	up_write(&namespace_sem);
+	return err;
+}
 
 /*
  * create a new mount for userspace and request it to be added into the
@@ -1965,43 +2069,6 @@ fail:
 	return err;
 }
 
-/*
- * add a mount into a namespace's mount tree
- */
-static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags)
-{
-	int err;
-
-	mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
-
-	down_write(&namespace_sem);
-	/* Something was mounted here while we slept */
-	err = follow_down(path, true);
-	if (err < 0)
-		goto unlock;
-
-	err = -EINVAL;
-	if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
-		goto unlock;
-
-	/* Refuse the same filesystem on the same mount point */
-	err = -EBUSY;
-	if (path->mnt->mnt_sb == newmnt->mnt_sb &&
-	    path->mnt->mnt_root == path->dentry)
-		goto unlock;
-
-	err = -EINVAL;
-	if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
-		goto unlock;
-
-	newmnt->mnt_flags = mnt_flags;
-	err = graft_tree(newmnt, path);
-
-unlock:
-	up_write(&namespace_sem);
-	return err;
-}
-
 /**
  * mnt_set_expiry - Put a mount on an expiration list
  * @mnt: The mount to list.
@@ -2660,3 +2727,9 @@ void put_mnt_ns(struct mnt_namespace *ns)
 	kfree(ns);
 }
 EXPORT_SYMBOL(put_mnt_ns);
+
+struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
+{
+	return vfs_kern_mount(type, MS_KERNMOUNT, type->name, data);
+}
+EXPORT_SYMBOL_GPL(kern_mount_data);
diff --git a/fs/super.c b/fs/super.c
index 4bae0ef6110..e8486490826 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -910,29 +910,18 @@ struct dentry *mount_single(struct file_system_type *fs_type,
 }
 EXPORT_SYMBOL(mount_single);
 
-struct vfsmount *
-vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
+struct dentry *
+mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
 {
-	struct vfsmount *mnt;
 	struct dentry *root;
+	struct super_block *sb;
 	char *secdata = NULL;
-	int error;
-
-	if (!type)
-		return ERR_PTR(-ENODEV);
-
-	error = -ENOMEM;
-	mnt = alloc_vfsmnt(name);
-	if (!mnt)
-		goto out;
-
-	if (flags & MS_KERNMOUNT)
-		mnt->mnt_flags = MNT_INTERNAL;
+	int error = -ENOMEM;
 
 	if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
 		secdata = alloc_secdata();
 		if (!secdata)
-			goto out_mnt;
+			goto out;
 
 		error = security_sb_copy_data(data, secdata);
 		if (error)
@@ -944,13 +933,12 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
 		error = PTR_ERR(root);
 		goto out_free_secdata;
 	}
-	mnt->mnt_root = root;
-	mnt->mnt_sb = root->d_sb;
-	BUG_ON(!mnt->mnt_sb);
-	WARN_ON(!mnt->mnt_sb->s_bdi);
-	mnt->mnt_sb->s_flags |= MS_BORN;
+	sb = root->d_sb;
+	BUG_ON(!sb);
+	WARN_ON(!sb->s_bdi);
+	sb->s_flags |= MS_BORN;
 
-	error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata);
+	error = security_sb_kern_mount(sb, flags, secdata);
 	if (error)
 		goto out_sb;
 
@@ -961,27 +949,21 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
 	 * violate this rule. This warning should be either removed or
 	 * converted to a BUG() in 2.6.34.
 	 */
-	WARN((mnt->mnt_sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
-		"negative value (%lld)\n", type->name, mnt->mnt_sb->s_maxbytes);
+	WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
+		"negative value (%lld)\n", type->name, sb->s_maxbytes);
 
-	mnt->mnt_mountpoint = mnt->mnt_root;
-	mnt->mnt_parent = mnt;
-	up_write(&mnt->mnt_sb->s_umount);
+	up_write(&sb->s_umount);
 	free_secdata(secdata);
-	return mnt;
+	return root;
 out_sb:
-	dput(mnt->mnt_root);
-	deactivate_locked_super(mnt->mnt_sb);
+	dput(root);
+	deactivate_locked_super(sb);
 out_free_secdata:
 	free_secdata(secdata);
-out_mnt:
-	free_vfsmnt(mnt);
 out:
 	return ERR_PTR(error);
 }
 
-EXPORT_SYMBOL_GPL(vfs_kern_mount);
-
 /**
  * freeze_super - lock the filesystem and force it into a consistent state
  * @sb: the super to lock
@@ -1071,49 +1053,3 @@ out:
 	return 0;
 }
 EXPORT_SYMBOL(thaw_super);
-
-static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
-{
-	int err;
-	const char *subtype = strchr(fstype, '.');
-	if (subtype) {
-		subtype++;
-		err = -EINVAL;
-		if (!subtype[0])
-			goto err;
-	} else
-		subtype = "";
-
-	mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
-	err = -ENOMEM;
-	if (!mnt->mnt_sb->s_subtype)
-		goto err;
-	return mnt;
-
- err:
-	mntput(mnt);
-	return ERR_PTR(err);
-}
-
-struct vfsmount *
-do_kern_mount(const char *fstype, int flags, const char *name, void *data)
-{
-	struct file_system_type *type = get_fs_type(fstype);
-	struct vfsmount *mnt;
-	if (!type)
-		return ERR_PTR(-ENODEV);
-	mnt = vfs_kern_mount(type, flags, name, data);
-	if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
-	    !mnt->mnt_sb->s_subtype)
-		mnt = fs_set_subtype(mnt, fstype);
-	put_filesystem(type);
-	return mnt;
-}
-EXPORT_SYMBOL_GPL(do_kern_mount);
-
-struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
-{
-	return vfs_kern_mount(type, MS_KERNMOUNT, type->name, data);
-}
-
-EXPORT_SYMBOL_GPL(kern_mount_data);
-- 
cgit v1.2.3


From 27cb1572e3e6bb1f8cf6bb3d74c914a87b131792 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 18 Mar 2011 08:29:36 -0400
Subject: fix deadlock in pivot_root()

Don't hold vfsmount_lock over the loop traversing ->mnt_parent;
do check_mnt(new.mnt) under namespace_sem instead; combined with
namespace_sem held over all that code it'll guarantee the stability
of ->mnt_parent chain all the way to the root.

Doing check_mnt() outside of namespace_sem in case of pivot_root()
is wrong anyway.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 453529f72df..46cc26b5aaf 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2569,9 +2569,6 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	error = user_path_dir(new_root, &new);
 	if (error)
 		goto out0;
-	error = -EINVAL;
-	if (!check_mnt(new.mnt))
-		goto out1;
 
 	error = user_path_dir(put_old, &old);
 	if (error)
@@ -2591,7 +2588,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 		IS_MNT_SHARED(new.mnt->mnt_parent) ||
 		IS_MNT_SHARED(root.mnt->mnt_parent))
 		goto out2;
-	if (!check_mnt(root.mnt))
+	if (!check_mnt(root.mnt) || !check_mnt(new.mnt))
 		goto out2;
 	error = -ENOENT;
 	if (cant_mount(old.dentry))
@@ -2615,19 +2612,19 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 		goto out2; /* not attached */
 	/* make sure we can reach put_old from new_root */
 	tmp = old.mnt;
-	br_write_lock(vfsmount_lock);
 	if (tmp != new.mnt) {
 		for (;;) {
 			if (tmp->mnt_parent == tmp)
-				goto out3; /* already mounted on put_old */
+				goto out2; /* already mounted on put_old */
 			if (tmp->mnt_parent == new.mnt)
 				break;
 			tmp = tmp->mnt_parent;
 		}
 		if (!is_subdir(tmp->mnt_mountpoint, new.dentry))
-			goto out3;
+			goto out2;
 	} else if (!is_subdir(old.dentry, new.dentry))
-		goto out3;
+		goto out2;
+	br_write_lock(vfsmount_lock);
 	detach_mnt(new.mnt, &parent_path);
 	detach_mnt(root.mnt, &root_parent);
 	/* mount old root on put_old */
@@ -2650,9 +2647,6 @@ out1:
 	path_put(&new);
 out0:
 	return error;
-out3:
-	br_write_unlock(vfsmount_lock);
-	goto out2;
 }
 
 static void __init init_mount_tree(void)
-- 
cgit v1.2.3


From b12cea9198fa99ffd3de1776c323bc7464d26b44 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 18 Mar 2011 08:55:38 -0400
Subject: change the locking order for namespace_sem

Have it nested inside ->i_mutex.  Instead of using follow_down()
under namespace_sem, followed by grabbing i_mutex and checking that
the mountpoint-to-be is not dead, do the following:
	grab i_mutex
	check that it's not dead
	grab namespace_sem
	see if anything is mounted there
	if not, we've won
	otherwise
		drop locks
		put_path on what we had
		replace with what's mounted
		retry everything with new mountpoint to be

New helper (lock_mount()) does that.  do_add_mount(), do_move_mount(),
do_loopback() and pivot_root() switched to it; in case of the last
two that eliminates a race we used to have - original code didn't
do follow_down().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 133 +++++++++++++++++++++++++++++++--------------------------
 1 file changed, 73 insertions(+), 60 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 46cc26b5aaf..9263995bf6a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1663,9 +1663,35 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
 	return err;
 }
 
+static int lock_mount(struct path *path)
+{
+	struct vfsmount *mnt;
+retry:
+	mutex_lock(&path->dentry->d_inode->i_mutex);
+	if (unlikely(cant_mount(path->dentry))) {
+		mutex_unlock(&path->dentry->d_inode->i_mutex);
+		return -ENOENT;
+	}
+	down_write(&namespace_sem);
+	mnt = lookup_mnt(path);
+	if (likely(!mnt))
+		return 0;
+	up_write(&namespace_sem);
+	mutex_unlock(&path->dentry->d_inode->i_mutex);
+	path_put(path);
+	path->mnt = mnt;
+	path->dentry = dget(mnt->mnt_root);
+	goto retry;
+}
+
+static void unlock_mount(struct path *path)
+{
+	up_write(&namespace_sem);
+	mutex_unlock(&path->dentry->d_inode->i_mutex);
+}
+
 static int graft_tree(struct vfsmount *mnt, struct path *path)
 {
-	int err;
 	if (mnt->mnt_sb->s_flags & MS_NOUSER)
 		return -EINVAL;
 
@@ -1673,16 +1699,10 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
 	      S_ISDIR(mnt->mnt_root->d_inode->i_mode))
 		return -ENOTDIR;
 
-	err = -ENOENT;
-	mutex_lock(&path->dentry->d_inode->i_mutex);
-	if (cant_mount(path->dentry))
-		goto out_unlock;
+	if (d_unlinked(path->dentry))
+		return -ENOENT;
 
-	if (!d_unlinked(path->dentry))
-		err = attach_recursive_mnt(mnt, path, NULL);
-out_unlock:
-	mutex_unlock(&path->dentry->d_inode->i_mutex);
-	return err;
+	return attach_recursive_mnt(mnt, path, NULL);
 }
 
 /*
@@ -1745,6 +1765,7 @@ static int do_change_type(struct path *path, int flag)
 static int do_loopback(struct path *path, char *old_name,
 				int recurse)
 {
+	LIST_HEAD(umount_list);
 	struct path old_path;
 	struct vfsmount *mnt = NULL;
 	int err = mount_is_safe(path);
@@ -1756,13 +1777,16 @@ static int do_loopback(struct path *path, char *old_name,
 	if (err)
 		return err;
 
-	down_write(&namespace_sem);
+	err = lock_mount(path);
+	if (err)
+		goto out;
+
 	err = -EINVAL;
 	if (IS_MNT_UNBINDABLE(old_path.mnt))
-		goto out;
+		goto out2;
 
 	if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
-		goto out;
+		goto out2;
 
 	err = -ENOMEM;
 	if (recurse)
@@ -1771,20 +1795,18 @@ static int do_loopback(struct path *path, char *old_name,
 		mnt = clone_mnt(old_path.mnt, old_path.dentry, 0);
 
 	if (!mnt)
-		goto out;
+		goto out2;
 
 	err = graft_tree(mnt, path);
 	if (err) {
-		LIST_HEAD(umount_list);
-
 		br_write_lock(vfsmount_lock);
 		umount_tree(mnt, 0, &umount_list);
 		br_write_unlock(vfsmount_lock);
-		release_mounts(&umount_list);
 	}
-
+out2:
+	unlock_mount(path);
+	release_mounts(&umount_list);
 out:
-	up_write(&namespace_sem);
 	path_put(&old_path);
 	return err;
 }
@@ -1873,18 +1895,12 @@ static int do_move_mount(struct path *path, char *old_name)
 	if (err)
 		return err;
 
-	down_write(&namespace_sem);
-	err = follow_down(path, true);
+	err = lock_mount(path);
 	if (err < 0)
 		goto out;
 
 	err = -EINVAL;
 	if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
-		goto out;
-
-	err = -ENOENT;
-	mutex_lock(&path->dentry->d_inode->i_mutex);
-	if (cant_mount(path->dentry))
 		goto out1;
 
 	if (d_unlinked(path->dentry))
@@ -1926,9 +1942,8 @@ static int do_move_mount(struct path *path, char *old_name)
 	 * automatically */
 	list_del_init(&old_path.mnt->mnt_expire);
 out1:
-	mutex_unlock(&path->dentry->d_inode->i_mutex);
+	unlock_mount(path);
 out:
-	up_write(&namespace_sem);
 	if (!err)
 		path_put(&parent_path);
 	path_put(&old_path);
@@ -1983,11 +1998,9 @@ static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flag
 
 	mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
 
-	down_write(&namespace_sem);
-	/* Something was mounted here while we slept */
-	err = follow_down(path, true);
-	if (err < 0)
-		goto unlock;
+	err = lock_mount(path);
+	if (err)
+		return err;
 
 	err = -EINVAL;
 	if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
@@ -2007,7 +2020,7 @@ static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flag
 	err = graft_tree(newmnt, path);
 
 unlock:
-	up_write(&namespace_sem);
+	unlock_mount(path);
 	return err;
 }
 
@@ -2575,55 +2588,53 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 		goto out1;
 
 	error = security_sb_pivotroot(&old, &new);
-	if (error) {
-		path_put(&old);
-		goto out1;
-	}
+	if (error)
+		goto out2;
 
 	get_fs_root(current->fs, &root);
-	down_write(&namespace_sem);
-	mutex_lock(&old.dentry->d_inode->i_mutex);
+	error = lock_mount(&old);
+	if (error)
+		goto out3;
+
 	error = -EINVAL;
 	if (IS_MNT_SHARED(old.mnt) ||
 		IS_MNT_SHARED(new.mnt->mnt_parent) ||
 		IS_MNT_SHARED(root.mnt->mnt_parent))
-		goto out2;
+		goto out4;
 	if (!check_mnt(root.mnt) || !check_mnt(new.mnt))
-		goto out2;
+		goto out4;
 	error = -ENOENT;
-	if (cant_mount(old.dentry))
-		goto out2;
 	if (d_unlinked(new.dentry))
-		goto out2;
+		goto out4;
 	if (d_unlinked(old.dentry))
-		goto out2;
+		goto out4;
 	error = -EBUSY;
 	if (new.mnt == root.mnt ||
 	    old.mnt == root.mnt)
-		goto out2; /* loop, on the same file system  */
+		goto out4; /* loop, on the same file system  */
 	error = -EINVAL;
 	if (root.mnt->mnt_root != root.dentry)
-		goto out2; /* not a mountpoint */
+		goto out4; /* not a mountpoint */
 	if (root.mnt->mnt_parent == root.mnt)
-		goto out2; /* not attached */
+		goto out4; /* not attached */
 	if (new.mnt->mnt_root != new.dentry)
-		goto out2; /* not a mountpoint */
+		goto out4; /* not a mountpoint */
 	if (new.mnt->mnt_parent == new.mnt)
-		goto out2; /* not attached */
+		goto out4; /* not attached */
 	/* make sure we can reach put_old from new_root */
 	tmp = old.mnt;
 	if (tmp != new.mnt) {
 		for (;;) {
 			if (tmp->mnt_parent == tmp)
-				goto out2; /* already mounted on put_old */
+				goto out4; /* already mounted on put_old */
 			if (tmp->mnt_parent == new.mnt)
 				break;
 			tmp = tmp->mnt_parent;
 		}
 		if (!is_subdir(tmp->mnt_mountpoint, new.dentry))
-			goto out2;
+			goto out4;
 	} else if (!is_subdir(old.dentry, new.dentry))
-		goto out2;
+		goto out4;
 	br_write_lock(vfsmount_lock);
 	detach_mnt(new.mnt, &parent_path);
 	detach_mnt(root.mnt, &root_parent);
@@ -2634,14 +2645,16 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	touch_mnt_namespace(current->nsproxy->mnt_ns);
 	br_write_unlock(vfsmount_lock);
 	chroot_fs_refs(&root, &new);
-
 	error = 0;
-	path_put(&root_parent);
-	path_put(&parent_path);
-out2:
-	mutex_unlock(&old.dentry->d_inode->i_mutex);
-	up_write(&namespace_sem);
+out4:
+	unlock_mount(&old);
+	if (!error) {
+		path_put(&root_parent);
+		path_put(&parent_path);
+	}
+out3:
 	path_put(&root);
+out2:
 	path_put(&old);
 out1:
 	path_put(&new);
-- 
cgit v1.2.3


From 7cc90cc3ffe22a0d81b8d605b20a82ec7911012d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 18 Mar 2011 09:04:20 -0400
Subject: don't pass 'mounting_here' flag to follow_down()

it's always false now

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c    | 4 ++--
 fs/nfsd/vfs.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index b912b7abe74..e092648a068 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1065,7 +1065,7 @@ failed:
  * Care must be taken as namespace_sem may be held (indicated by mounting_here
  * being true).
  */
-int follow_down(struct path *path, bool mounting_here)
+int follow_down(struct path *path)
 {
 	unsigned managed;
 	int ret;
@@ -1086,7 +1086,7 @@ int follow_down(struct path *path, bool mounting_here)
 			BUG_ON(!path->dentry->d_op);
 			BUG_ON(!path->dentry->d_op->d_manage);
 			ret = path->dentry->d_op->d_manage(
-				path->dentry, mounting_here, false);
+				path->dentry, false, false);
 			if (ret < 0)
 				return ret == -EISDIR ? 0 : ret;
 		}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index da1d9701f8e..ff93025ae2f 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -87,7 +87,7 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
 			    .dentry = dget(dentry)};
 	int err = 0;
 
-	err = follow_down(&path, false);
+	err = follow_down(&path);
 	if (err < 0)
 		goto out;
 
-- 
cgit v1.2.3


From 1aed3e4204dd787d53b3cd6363eb63bb4900c38e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 18 Mar 2011 09:09:02 -0400
Subject: lose 'mounting_here' argument in ->d_manage()

it's always false...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/root.c | 6 +++---
 fs/namei.c        | 7 +++----
 2 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 014e7aba3b0..e6f84d26f4c 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -36,7 +36,7 @@ static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
 static int autofs4_dir_open(struct inode *inode, struct file *file);
 static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
 static struct vfsmount *autofs4_d_automount(struct path *);
-static int autofs4_d_manage(struct dentry *, bool, bool);
+static int autofs4_d_manage(struct dentry *, bool);
 static void autofs4_dentry_release(struct dentry *);
 
 const struct file_operations autofs4_root_operations = {
@@ -446,7 +446,7 @@ done:
 	return NULL;
 }
 
-int autofs4_d_manage(struct dentry *dentry, bool mounting_here, bool rcu_walk)
+int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 
@@ -454,7 +454,7 @@ int autofs4_d_manage(struct dentry *dentry, bool mounting_here, bool rcu_walk)
 		dentry, dentry->d_name.len, dentry->d_name.name);
 
 	/* The daemon never waits. */
-	if (autofs4_oz_mode(sbi) || mounting_here) {
+	if (autofs4_oz_mode(sbi)) {
 		if (!d_mountpoint(dentry))
 			return -EISDIR;
 		return 0;
diff --git a/fs/namei.c b/fs/namei.c
index e092648a068..5a9a6c3094d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -933,8 +933,7 @@ static int follow_managed(struct path *path, unsigned flags)
 		if (managed & DCACHE_MANAGE_TRANSIT) {
 			BUG_ON(!path->dentry->d_op);
 			BUG_ON(!path->dentry->d_op->d_manage);
-			ret = path->dentry->d_op->d_manage(path->dentry,
-							   false, false);
+			ret = path->dentry->d_op->d_manage(path->dentry, false);
 			if (ret < 0)
 				return ret == -EISDIR ? 0 : ret;
 		}
@@ -999,7 +998,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 		struct vfsmount *mounted;
 		if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
 		    !reverse_transit &&
-		    path->dentry->d_op->d_manage(path->dentry, false, true) < 0)
+		    path->dentry->d_op->d_manage(path->dentry, true) < 0)
 			return false;
 		mounted = __lookup_mnt(path->mnt, path->dentry, 1);
 		if (!mounted)
@@ -1086,7 +1085,7 @@ int follow_down(struct path *path)
 			BUG_ON(!path->dentry->d_op);
 			BUG_ON(!path->dentry->d_op->d_manage);
 			ret = path->dentry->d_op->d_manage(
-				path->dentry, false, false);
+				path->dentry, false);
 			if (ret < 0)
 				return ret == -EISDIR ? 0 : ret;
 		}
-- 
cgit v1.2.3


From 24ff6663ccfdaf088dfa7acae489cb11ed4f43c4 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 18 Nov 2010 20:52:55 -0500
Subject: fs: call security_d_instantiate in d_obtain_alias V2

While trying to track down some NFS problems with BTRFS, I kept noticing I was
getting -EACCES for no apparent reason.  Eric Paris and printk() helped me
figure out that it was SELinux that was giving me grief, with the following
denial

type=AVC msg=audit(1290013638.413:95): avc:  denied  { 0x800000 } for  pid=1772
comm="nfsd" name="" dev=sda1 ino=256 scontext=system_u:system_r:kernel_t:s0
tcontext=system_u:object_r:unlabeled_t:s0 tclass=file

Turns out this is because in d_obtain_alias if we can't find an alias we create
one and do all the normal instantiation stuff, but we don't do the
security_d_instantiate.

Usually we are protected from getting a hashed dentry that hasn't yet run
security_d_instantiate() by the parent's i_mutex, but obviously this isn't an
option there, so in order to deal with the case that a second thread comes in
and finds our new dentry before we get to run security_d_instantiate(), we go
ahead and call it if we find a dentry already.  Eric assures me that this is ok
as the code checks to see if the dentry has been initialized already so calling
security_d_instantiate() against the same dentry multiple times is ok.  With
this patch I'm no longer getting errant -EACCES values.

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index a39fe47c466..1baddc1cec4 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1612,10 +1612,13 @@ struct dentry *d_obtain_alias(struct inode *inode)
 	__bit_spin_unlock(0, (unsigned long *)&tmp->d_sb->s_anon.first);
 	spin_unlock(&tmp->d_lock);
 	spin_unlock(&inode->i_lock);
+	security_d_instantiate(tmp, inode);
 
 	return tmp;
 
  out_iput:
+	if (res && !IS_ERR(res))
+		security_d_instantiate(res, inode);
 	iput(inode);
 	return res;
 }
-- 
cgit v1.2.3


From 4345caba340f051e10847924fc078ae18ed6695c Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Sat, 19 Mar 2011 13:53:31 +0100
Subject: block: NULL dereference on error path in __blkdev_get()

"disk" is always NULL when we goto out.  There was a check for this
before, but it was removed in 69e02c59a7d9 "block: Don't check events
while open is in progress".

Signed-off-by: Dan Carpenter <error27@gmail.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@carl>
---
 fs/block_dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index fbe05cbdd69..7d02afb2b7f 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1181,9 +1181,9 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
  out_unlock_bdev:
 	mutex_unlock(&bdev->bd_mutex);
 	disk_unblock_events(disk);
- out:
 	module_put(disk->fops->owner);
 	put_disk(disk);
+ out:
 	bdput(bdev);
 
 	return ret;
-- 
cgit v1.2.3


From 27a4f7e61e1eb4f18737926f4a66db7c48349fea Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Mon, 17 Jan 2011 00:48:17 +0900
Subject: vfs: cleanup do_vfs_ioctl()

Move declaration of 'inode' to beginning of the function. Since it
is referenced directly or indirectly (in case of FIFREEZE/FITHAW/
FS_IOC_FIEMAP) it's not harmful IMHO. And remove unnecessary casts
using 'argp' instead.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ioctl.c | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ioctl.c b/fs/ioctl.c
index 1eebeb72b20..1d9b9fcb2db 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -548,6 +548,7 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
 {
 	int error = 0;
 	int __user *argp = (int __user *)arg;
+	struct inode *inode = filp->f_path.dentry->d_inode;
 
 	switch (cmd) {
 	case FIOCLEX:
@@ -567,13 +568,11 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
 		break;
 
 	case FIOQSIZE:
-		if (S_ISDIR(filp->f_path.dentry->d_inode->i_mode) ||
-		    S_ISREG(filp->f_path.dentry->d_inode->i_mode) ||
-		    S_ISLNK(filp->f_path.dentry->d_inode->i_mode)) {
-			loff_t res =
-				inode_get_bytes(filp->f_path.dentry->d_inode);
-			error = copy_to_user((loff_t __user *)arg, &res,
-					     sizeof(res)) ? -EFAULT : 0;
+		if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) ||
+		    S_ISLNK(inode->i_mode)) {
+			loff_t res = inode_get_bytes(inode);
+			error = copy_to_user(argp, &res, sizeof(res)) ?
+					-EFAULT : 0;
 		} else
 			error = -ENOTTY;
 		break;
@@ -590,14 +589,10 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
 		return ioctl_fiemap(filp, arg);
 
 	case FIGETBSZ:
-	{
-		struct inode *inode = filp->f_path.dentry->d_inode;
-		int __user *p = (int __user *)arg;
-		return put_user(inode->i_sb->s_blocksize, p);
-	}
+		return put_user(inode->i_sb->s_blocksize, argp);
 
 	default:
-		if (S_ISREG(filp->f_path.dentry->d_inode->i_mode))
+		if (S_ISREG(inode->i_mode))
 			error = file_ioctl(filp, cmd, arg);
 		else
 			error = vfs_ioctl(filp, cmd, arg);
-- 
cgit v1.2.3


From 2c3d44dc4a1262168ef31bef22b3aa554c0572d8 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Fri, 21 Jan 2011 13:59:59 +0900
Subject: select: remove unused MAX_SELECT_SECONDS

Remove the leftover from the commit 8ff3e8e85fa6 ("select:
switch select() and poll() over to hrtimers").

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/compat.c | 3 ---
 fs/select.c | 3 ---
 2 files changed, 6 deletions(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index c6d31a3bab8..72fe6cda910 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1671,9 +1671,6 @@ int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
  * Update: ERESTARTSYS breaks at least the xview clock binary, so
  * I'm trying ERESTARTNOHAND which restart only when you want to.
  */
-#define MAX_SELECT_SECONDS \
-	((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
-
 int compat_core_sys_select(int n, compat_ulong_t __user *inp,
 	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
 	struct timespec *end_time)
diff --git a/fs/select.c b/fs/select.c
index e56560d2b08..d33418fdc85 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -517,9 +517,6 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
  * Update: ERESTARTSYS breaks at least the xview clock binary, so
  * I'm trying ERESTARTNOHAND which restart only when you want to.
  */
-#define MAX_SELECT_SECONDS \
-	((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
-
 int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 			   fd_set __user *exp, struct timespec *end_time)
 {
-- 
cgit v1.2.3


From eaae668d01e15435cf977cced3975ccc436257fc Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben@decadent.org.uk>
Date: Tue, 15 Feb 2011 12:48:09 +0000
Subject: fs/inode: Fix kernel-doc format for inode_init_owner

Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index 9910c039f02..16fefd373fc 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1715,7 +1715,7 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
 EXPORT_SYMBOL(init_special_inode);
 
 /**
- * Init uid,gid,mode for new inode according to posix standards
+ * inode_init_owner - Init uid,gid,mode for new inode according to posix standards
  * @inode: New inode
  * @dir: Directory inode
  * @mode: mode of the new inode
-- 
cgit v1.2.3


From ff38c083ad3bb8dbbed80aa9090fcc96bc4af7db Mon Sep 17 00:00:00 2001
From: David Jenni <dave.j@gmx.ch>
Date: Wed, 23 Feb 2011 16:51:05 +0100
Subject: Filesystem: fifo: Fixed coding style issue.

Fixed coding style issue.

Signed-off-by: David Jenni <dave.j@gmx.ch>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fifo.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fifo.c b/fs/fifo.c
index 4e303c22d5e..b1a524d798e 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -66,8 +66,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
 				/* suppress POLLHUP until we have
 				 * seen a writer */
 				filp->f_version = pipe->w_counter;
-			} else 
-			{
+			} else {
 				wait_for_partner(inode, &pipe->w_counter);
 				if(signal_pending(current))
 					goto err_rd;
-- 
cgit v1.2.3


From 1bef82917c74249ff21982127e57defd6ca2bb1b Mon Sep 17 00:00:00 2001
From: Holger Hans Peter Freyther <holger@freyther.de>
Date: Thu, 24 Feb 2011 17:46:49 +0100
Subject: Small typo fix...

Hi,

I was backporting the coredump over pipe feature and noticed this small typo,
I wish I would have something bigger to contribute...

>From 15d6080e0ed4267da103c706917a33b1015e8804 Mon Sep 17 00:00:00 2001
From: Holger Hans Peter Freyther <holger@moiji-mobile.com>
Date: Thu, 24 Feb 2011 17:42:50 +0100
Subject: [PATCH] fs: Fix a small typo in the comment

The function is called umh_pipe_setup not uhm_pipe_setup.

Signed-off-by: Holger Hans Peter Freyther <holger@moiji-mobile.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/exec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index ba99e1abb1a..5e62d26a4fe 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1875,7 +1875,7 @@ static void wait_for_dump_helpers(struct file *file)
 
 
 /*
- * uhm_pipe_setup
+ * umh_pipe_setup
  * helper function to customize the process used
  * to collect the core in userspace.  Specifically
  * it sets up a pipe and installs it as fd 0 (stdin)
-- 
cgit v1.2.3


From b7ed78f56575074f29ec99d8984f347f6c99c914 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 10 Mar 2011 11:31:30 -0800
Subject: introduce sys_syncfs to sync a single file system

It is frequently useful to sync a single file system, instead of all
mounted file systems via sync(2):

 - On machines with many mounts, it is not at all uncommon for some of
   them to hang (e.g. unresponsive NFS server).  sync(2) will get stuck on
   those and may never get to the one you do care about (e.g., /).
 - Some applications write lots of data to the file system and then
   want to make sure it is flushed to disk.  Calling fsync(2) on each
   file introduces unnecessary ordering constraints that result in a large
   amount of sub-optimal writeback/flush/commit behavior by the file
   system.

There are currently two ways (that I know of) to sync a single super_block:

 - BLKFLSBUF ioctl on the block device: That also invalidates the bdev
   mapping, which isn't usually desirable, and doesn't work for non-block
   file systems.
 - 'mount -o remount,rw' will call sync_filesystem as an artifact of the
   current implementation.  Relying on this little-known side effect for
   something like data safety sounds foolish.

Both of these approaches require root privileges, which some applications
do not have (nor should they need?) given that sync(2) is an unprivileged
operation.

This patch introduces a new system call syncfs(2) that takes an fd and
syncs only the file system it references.  Maybe someday we can

 $ sync /some/path

and not get

 sync: ignoring all arguments

The syscall is motivated by comments by Al and Christoph at the last LSF.
syncfs(2) seems like an appropriate name given statfs(2).

A similar ioctl was also proposed a while back, see
	http://marc.info/?l=linux-fsdevel&m=127970513829285&w=2

Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/sync.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'fs')

diff --git a/fs/sync.c b/fs/sync.c
index ba76b9623e7..92ca208777d 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -7,6 +7,7 @@
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/module.h>
+#include <linux/namei.h>
 #include <linux/sched.h>
 #include <linux/writeback.h>
 #include <linux/syscalls.h>
@@ -128,6 +129,29 @@ void emergency_sync(void)
 	}
 }
 
+/*
+ * sync a single super
+ */
+SYSCALL_DEFINE1(syncfs, int, fd)
+{
+	struct file *file;
+	struct super_block *sb;
+	int ret;
+	int fput_needed;
+
+	file = fget_light(fd, &fput_needed);
+	if (!file)
+		return -EBADF;
+	sb = file->f_dentry->d_sb;
+
+	down_read(&sb->s_umount);
+	ret = sync_filesystem(sb);
+	up_read(&sb->s_umount);
+
+	fput_light(file, fput_needed);
+	return ret;
+}
+
 /**
  * vfs_fsync_range - helper to sync a range of data & metadata to disk
  * @file:		file to sync
-- 
cgit v1.2.3


From a872d5101008b091035d579897bccefdeff70def Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Fri, 18 Mar 2011 11:44:48 -0700
Subject: pstore: fix leaking ->i_private

Move kfree() of i_private out of ->unlink() and into ->evict_inode()

Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pstore/inode.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 08342232cb1..f777f2902c4 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -73,11 +73,16 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry)
 	struct pstore_private *p = dentry->d_inode->i_private;
 
 	p->erase(p->id);
-	kfree(p);
 
 	return simple_unlink(dir, dentry);
 }
 
+static void pstore_evict_inode(struct inode *inode)
+{
+	end_writeback(inode);
+	kfree(inode->i_private);
+}
+
 static const struct inode_operations pstore_dir_inode_operations = {
 	.lookup		= simple_lookup,
 	.unlink		= pstore_unlink,
@@ -110,6 +115,7 @@ static struct inode *pstore_get_inode(struct super_block *sb,
 static const struct super_operations pstore_ops = {
 	.statfs		= simple_statfs,
 	.drop_inode	= generic_delete_inode,
+	.evict_inode	= pstore_evict_inode,
 	.show_options	= generic_show_options,
 };
 
-- 
cgit v1.2.3


From 1c34092adf1feaba25b7c739cc4def2751f4fa05 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Sun, 20 Mar 2011 14:22:07 +0300
Subject: nfs: lock() vs unlock() typo

These should be spin_unlock() instead of spin_lock().  It's a typo.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfs/namespace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index c0b8344db0c..bf1c68009ff 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -98,7 +98,7 @@ rename_retry:
 		namelen--;
 	buflen -= namelen;
 	if (buflen < 0) {
-		spin_lock(&dentry->d_lock);
+		spin_unlock(&dentry->d_lock);
 		rcu_read_unlock();
 		goto Elong;
 	}
@@ -108,7 +108,7 @@ rename_retry:
 	rcu_read_unlock();
 	return end;
 Elong_unlock:
-	spin_lock(&dentry->d_lock);
+	spin_unlock(&dentry->d_lock);
 	rcu_read_unlock();
 	if (read_seqretry(&rename_lock, seq))
 		goto rename_retry;
-- 
cgit v1.2.3


From aa597bc1f9476d0527e35d6dd9b481422e8205a0 Mon Sep 17 00:00:00 2001
From: Andrey Vagin <avagin@openvz.org>
Date: Tue, 8 Feb 2011 00:14:52 +0300
Subject: fs: devpts_pty_new() return -ENOMEM if dentry allocation failed

In this case nobody can open the slave end, so it is better to return an
error from devpts_pty_new().

Now we do not need to check the return value of d_find_alias() in
devpts_pty_kill(), because the dentry always exists.

Signed-off-by: Andrey Vagin <avagin@openvz.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/devpts/inode.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 1bb547c9cad..c6bd815dc79 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -479,6 +479,7 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
 	struct dentry *root = sb->s_root;
 	struct pts_fs_info *fsi = DEVPTS_SB(sb);
 	struct pts_mount_opts *opts = &fsi->mount_opts;
+	int ret = 0;
 	char s[12];
 
 	/* We're supposed to be given the slave end of a pty */
@@ -504,11 +505,14 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
 	if (!IS_ERR(dentry)) {
 		d_add(dentry, inode);
 		fsnotify_create(root->d_inode, dentry);
+	} else {
+		iput(inode);
+		ret = -ENOMEM;
 	}
 
 	mutex_unlock(&root->d_inode->i_mutex);
 
-	return 0;
+	return ret;
 }
 
 struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number)
@@ -544,17 +548,12 @@ void devpts_pty_kill(struct tty_struct *tty)
 	mutex_lock(&root->d_inode->i_mutex);
 
 	dentry = d_find_alias(inode);
-	if (IS_ERR(dentry))
-		goto out;
-
-	if (dentry) {
-		inode->i_nlink--;
-		d_delete(dentry);
-		dput(dentry);	/* d_alloc_name() in devpts_pty_new() */
-	}
 
+	inode->i_nlink--;
+	d_delete(dentry);
+	dput(dentry);	/* d_alloc_name() in devpts_pty_new() */
 	dput(dentry);		/* d_find_alias above */
-out:
+
 	mutex_unlock(&root->d_inode->i_mutex);
 }
 
-- 
cgit v1.2.3


From c212f9aaf9101a037fb7f59e75e639437e11d758 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Wed, 19 Jan 2011 21:08:41 +0900
Subject: fs: Use BUG_ON(!mnt) at dentry_open().

dentry_open() requires callers to pass a valid vfsmount.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/open.c | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/open.c b/fs/open.c
index f83ca80cc59..b52cf013ffa 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -835,17 +835,8 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags,
 
 	validate_creds(cred);
 
-	/*
-	 * We must always pass in a valid mount pointer.   Historically
-	 * callers got away with not passing it, but we must enforce this at
-	 * the earliest possible point now to avoid strange problems deep in the
-	 * filesystem stack.
-	 */
-	if (!mnt) {
-		printk(KERN_WARNING "%s called with NULL vfsmount\n", __func__);
-		dump_stack();
-		return ERR_PTR(-EINVAL);
-	}
+	/* We must always pass in a valid mount pointer. */
+	BUG_ON(!mnt);
 
 	error = -ENFILE;
 	f = get_empty_filp();
-- 
cgit v1.2.3


From 69b195be51620d72956acbf3029adad5765695dc Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Mon, 21 Mar 2011 08:32:53 -0400
Subject: bfs: fix bitmap size argument to find_first_zero_bit()

The usage of find_first_zero_bit() in bfs_create() is wrong for two
reasons.

The bitmap size argument to find_first_zero_bit() is info->si_lasti but
the correct bitmap size is info->si_lasti + 1 as info->si_lasti is the
last valid index in info->si_imap bitmap.

Another problem is that it is impossible to detect that info->si_imap
bitmap is full because there is an off-by-one bug in the return value
check for find_first_zero_bit().  If no zero bits exist in info->si_imap,
find_first_zero_bit() returns info->si_lasti.  But the check can't catch
it due to the off-by-one.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Acked-by: "Tigran A. Aivazian" <tigran@aivazian.fsnet.co.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/bfs/dir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 685ecff3ab3..b14cebfd904 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -97,7 +97,7 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	if (!inode)
 		return -ENOSPC;
 	mutex_lock(&info->bfs_lock);
-	ino = find_first_zero_bit(info->si_imap, info->si_lasti);
+	ino = find_first_zero_bit(info->si_imap, info->si_lasti + 1);
 	if (ino > info->si_lasti) {
 		mutex_unlock(&info->bfs_lock);
 		iput(inode);
-- 
cgit v1.2.3


From 07d5f69b457019eda4ca568923b1d62b7ada89e1 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Mon, 21 Mar 2011 13:58:05 +0100
Subject: fuse: reduce size of struct fuse_request

Reduce the size of struct fuse_request by removing cuse_init_out from
the request structure and allocating it dynamically instead.

CC: Tejun Heo <tj@kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/cuse.c   | 12 ++++++++++--
 fs/fuse/fuse_i.h |  1 -
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 3e87cce5837..124b697ef09 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -305,7 +305,7 @@ static void cuse_gendev_release(struct device *dev)
 static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 {
 	struct cuse_conn *cc = fc_to_cc(fc);
-	struct cuse_init_out *arg = &req->misc.cuse_init_out;
+	struct cuse_init_out *arg = req->out.args[0].value;
 	struct page *page = req->pages[0];
 	struct cuse_devinfo devinfo = { };
 	struct device *dev;
@@ -384,6 +384,7 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 	dev_set_uevent_suppress(dev, 0);
 	kobject_uevent(&dev->kobj, KOBJ_ADD);
 out:
+	kfree(arg);
 	__free_page(page);
 	return;
 
@@ -405,6 +406,7 @@ static int cuse_send_init(struct cuse_conn *cc)
 	struct page *page;
 	struct fuse_conn *fc = &cc->fc;
 	struct cuse_init_in *arg;
+	void *outarg;
 
 	BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE);
 
@@ -419,6 +421,10 @@ static int cuse_send_init(struct cuse_conn *cc)
 	if (!page)
 		goto err_put_req;
 
+	outarg = kzalloc(sizeof(struct cuse_init_out), GFP_KERNEL);
+	if (!outarg)
+		goto err_free_page;
+
 	arg = &req->misc.cuse_init_in;
 	arg->major = FUSE_KERNEL_VERSION;
 	arg->minor = FUSE_KERNEL_MINOR_VERSION;
@@ -429,7 +435,7 @@ static int cuse_send_init(struct cuse_conn *cc)
 	req->in.args[0].value = arg;
 	req->out.numargs = 2;
 	req->out.args[0].size = sizeof(struct cuse_init_out);
-	req->out.args[0].value = &req->misc.cuse_init_out;
+	req->out.args[0].value = outarg;
 	req->out.args[1].size = CUSE_INIT_INFO_MAX;
 	req->out.argvar = 1;
 	req->out.argpages = 1;
@@ -440,6 +446,8 @@ static int cuse_send_init(struct cuse_conn *cc)
 
 	return 0;
 
+err_free_page:
+	__free_page(page);
 err_put_req:
 	fuse_put_request(fc, req);
 err:
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index d4286947bc2..b788becada7 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -272,7 +272,6 @@ struct fuse_req {
 		struct fuse_init_in init_in;
 		struct fuse_init_out init_out;
 		struct cuse_init_in cuse_init_in;
-		struct cuse_init_out cuse_init_out;
 		struct {
 			struct fuse_read_in in;
 			u64 attr_ver;
-- 
cgit v1.2.3


From 357ccf2b69bcefa650a54db83702381d1c9d6959 Mon Sep 17 00:00:00 2001
From: Bryan Green <bryan@grid-net.com>
Date: Tue, 1 Mar 2011 16:43:52 -0800
Subject: fuse: wakeup pollers on connection release/abort

If a fuse dev connection is broken, wake up any
processes that are blocking, in a poll system call,
on one of the files in the now defunct filesystem.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/dev.c  | 17 +++++++++++++++++
 fs/fuse/file.c |  2 +-
 2 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index cf8d28d1fba..213d3cf4f5e 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1910,6 +1910,21 @@ __acquires(fc->lock)
 		kfree(dequeue_forget(fc, 1, NULL));
 }
 
+static void end_polls(struct fuse_conn *fc)
+{
+	struct rb_node *p;
+
+	p = rb_first(&fc->polled_files);
+
+	while (p) {
+		struct fuse_file *ff;
+		ff = rb_entry(p, struct fuse_file, polled_node);
+		wake_up_interruptible_all(&ff->poll_wait);
+
+		p = rb_next(p);
+	}
+}
+
 /*
  * Abort all requests.
  *
@@ -1937,6 +1952,7 @@ void fuse_abort_conn(struct fuse_conn *fc)
 		fc->blocked = 0;
 		end_io_requests(fc);
 		end_queued_requests(fc);
+		end_polls(fc);
 		wake_up_all(&fc->waitq);
 		wake_up_all(&fc->blocked_waitq);
 		kill_fasync(&fc->fasync, SIGIO, POLL_IN);
@@ -1953,6 +1969,7 @@ int fuse_dev_release(struct inode *inode, struct file *file)
 		fc->connected = 0;
 		fc->blocked = 0;
 		end_queued_requests(fc);
+		end_polls(fc);
 		wake_up_all(&fc->blocked_waitq);
 		spin_unlock(&fc->lock);
 		fuse_conn_put(fc);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 9e0832dbb1e..6ea00734984 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -222,7 +222,7 @@ static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
 		rb_erase(&ff->polled_node, &fc->polled_files);
 	spin_unlock(&fc->lock);
 
-	wake_up_interruptible_sync(&ff->poll_wait);
+	wake_up_interruptible_all(&ff->poll_wait);
 
 	inarg->fh = ff->fh;
 	inarg->flags = flags;
-- 
cgit v1.2.3


From 19690ddb65dbfc7be1b411fce12d3332acefbfb5 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Mon, 21 Mar 2011 13:58:06 +0100
Subject: fuse: make fuse_permission() RCU aware

Only bail out of fuse_permission() on IPERM_FLAG_RCU when blocking is
actually necessary.

CC: Nick Piggin <npiggin@gmail.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/dir.c | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 8bd0ef9286c..3b84b913b16 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -970,6 +970,14 @@ static int fuse_access(struct inode *inode, int mask)
 	return err;
 }
 
+static int fuse_perm_getattr(struct inode *inode, int flags)
+{
+	if (flags & IPERM_FLAG_RCU)
+		return -ECHILD;
+
+	return fuse_do_getattr(inode, NULL, NULL);
+}
+
 /*
  * Check permission.  The two basic access models of FUSE are:
  *
@@ -989,9 +997,6 @@ static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
 	bool refreshed = false;
 	int err = 0;
 
-	if (flags & IPERM_FLAG_RCU)
-		return -ECHILD;
-
 	if (!fuse_allow_task(fc, current))
 		return -EACCES;
 
@@ -1000,9 +1005,15 @@ static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
 	 */
 	if ((fc->flags & FUSE_DEFAULT_PERMISSIONS) ||
 	    ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) {
-		err = fuse_update_attributes(inode, NULL, NULL, &refreshed);
-		if (err)
-			return err;
+		struct fuse_inode *fi = get_fuse_inode(inode);
+
+		if (fi->i_time < get_jiffies_64()) {
+			refreshed = true;
+
+			err = fuse_perm_getattr(inode, flags);
+			if (err)
+				return err;
+		}
 	}
 
 	if (fc->flags & FUSE_DEFAULT_PERMISSIONS) {
@@ -1012,7 +1023,7 @@ static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
 		   attributes.  This is also needed, because the root
 		   node will at first have no permissions */
 		if (err == -EACCES && !refreshed) {
-			err = fuse_do_getattr(inode, NULL, NULL);
+			err = fuse_perm_getattr(inode, flags);
 			if (!err)
 				err = generic_permission(inode, mask,
 							flags, NULL);
@@ -1023,13 +1034,16 @@ static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
 		   noticed immediately, only after the attribute
 		   timeout has expired */
 	} else if (mask & (MAY_ACCESS | MAY_CHDIR)) {
+		if (flags & IPERM_FLAG_RCU)
+			return -ECHILD;
+
 		err = fuse_access(inode, mask);
 	} else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) {
 		if (!(inode->i_mode & S_IXUGO)) {
 			if (refreshed)
 				return -EACCES;
 
-			err = fuse_do_getattr(inode, NULL, NULL);
+			err = fuse_perm_getattr(inode, flags);
 			if (!err && !(inode->i_mode & S_IXUGO))
 				return -EACCES;
 		}
-- 
cgit v1.2.3


From e7c0a167860620bd2938366896964f729ddaeaaa Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Mon, 21 Mar 2011 13:58:06 +0100
Subject: fuse: make fuse_dentry_revalidate() RCU aware

Only bail out of fuse_dentry_revalidate() on LOOKUP_RCU when blocking
is actually necessary.

CC: Nick Piggin <npiggin@gmail.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/dir.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 3b84b913b16..c6ba49bd95b 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -158,10 +158,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 {
 	struct inode *inode;
 
-	if (nd && nd->flags & LOOKUP_RCU)
-		return -ECHILD;
-
-	inode = entry->d_inode;
+	inode = ACCESS_ONCE(entry->d_inode);
 	if (inode && is_bad_inode(inode))
 		return 0;
 	else if (fuse_dentry_time(entry) < get_jiffies_64()) {
@@ -177,6 +174,9 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 		if (!inode)
 			return 0;
 
+		if (nd->flags & LOOKUP_RCU)
+			return -ECHILD;
+
 		fc = get_fuse_conn(inode);
 		req = fuse_get_req(fc);
 		if (IS_ERR(req))
-- 
cgit v1.2.3


From 0f60f240d522772467c7d2cebedb910748c78ed4 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 21 Mar 2011 14:28:58 +0000
Subject: FS: lookup_mnt() is only used in the core fs routines now

lookup_mnt() is only used in the core fs routines now, so it doesn't need to
be globally declared anymore.  It isn't exported to modules at the moment, so
nothing that can be modularised seems to be using it.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/internal.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/internal.h b/fs/internal.h
index 17191546d52..8318059b42c 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -64,6 +64,7 @@ extern int copy_mount_string(const void __user *, char **);
 
 extern unsigned int mnt_get_count(struct vfsmount *mnt);
 extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int);
+extern struct vfsmount *lookup_mnt(struct path *);
 extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
 				struct vfsmount *);
 extern void release_mounts(struct list_head *);
-- 
cgit v1.2.3


From 21f3b5f1bbc3c27e82a8c9fc9861fa20bcb31f26 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 19 Jan 2011 09:45:22 -0800
Subject: ceph: remove debugfs debug cruft

Whoops!

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/debugfs.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 08f65faac11..0dba6915712 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -210,8 +210,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
 	if (!fsc->debugfs_congestion_kb)
 		goto out;
 
-	dout("a\n");
-
 	snprintf(name, sizeof(name), "../../bdi/%s",
 		 dev_name(fsc->backing_dev_info.dev));
 	fsc->debugfs_bdi =
@@ -221,7 +219,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
 	if (!fsc->debugfs_bdi)
 		goto out;
 
-	dout("b\n");
 	fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
 					0600,
 					fsc->client->debugfs_dir,
@@ -230,7 +227,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
 	if (!fsc->debugfs_mdsmap)
 		goto out;
 
-	dout("ca\n");
 	fsc->debugfs_mdsc = debugfs_create_file("mdsc",
 						0600,
 						fsc->client->debugfs_dir,
@@ -239,7 +235,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
 	if (!fsc->debugfs_mdsc)
 		goto out;
 
-	dout("da\n");
 	fsc->debugfs_caps = debugfs_create_file("caps",
 						   0400,
 						   fsc->client->debugfs_dir,
@@ -248,7 +243,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
 	if (!fsc->debugfs_caps)
 		goto out;
 
-	dout("ea\n");
 	fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
 					0600,
 					fsc->client->debugfs_dir,
-- 
cgit v1.2.3


From ad1fee96cbaf873520064252c5dc3212c9844861 Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Fri, 21 Jan 2011 16:44:03 -0800
Subject: ceph: add ino32 mount option

The ino32 mount option forces the ceph fs to report 32 bit
ino values.  This is useful for 64 bit kernels with 32 bit userspace.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
---
 fs/ceph/dir.c   | 11 ++++++----
 fs/ceph/inode.c |  9 +++++++-
 fs/ceph/super.c |  5 +++++
 fs/ceph/super.h | 65 +++++++++++++++++++++++++++++++++++++++------------------
 4 files changed, 65 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index ebafa65a29b..cbe875d3a52 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -161,7 +161,7 @@ more:
 	filp->f_pos = di->offset;
 	err = filldir(dirent, dentry->d_name.name,
 		      dentry->d_name.len, di->offset,
-		      dentry->d_inode->i_ino,
+		      ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino),
 		      dentry->d_inode->i_mode >> 12);
 
 	if (last) {
@@ -245,15 +245,17 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 		dout("readdir off 0 -> '.'\n");
 		if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
-			    inode->i_ino, inode->i_mode >> 12) < 0)
+			    ceph_translate_ino(inode->i_sb, inode->i_ino),
+			    inode->i_mode >> 12) < 0)
 			return 0;
 		filp->f_pos = 1;
 		off = 1;
 	}
 	if (filp->f_pos == 1) {
+		ino_t ino = filp->f_dentry->d_parent->d_inode->i_ino;
 		dout("readdir off 1 -> '..'\n");
 		if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
-			    filp->f_dentry->d_parent->d_inode->i_ino,
+			    ceph_translate_ino(inode->i_sb, ino),
 			    inode->i_mode >> 12) < 0)
 			return 0;
 		filp->f_pos = 2;
@@ -377,7 +379,8 @@ more:
 		if (filldir(dirent,
 			    rinfo->dir_dname[off - fi->offset],
 			    rinfo->dir_dname_len[off - fi->offset],
-			    pos, ino, ftype) < 0) {
+			    pos,
+			    ceph_translate_ino(inode->i_sb, ino), ftype) < 0) {
 			dout("filldir stopping us...\n");
 			return 0;
 		}
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 60456361e07..b54c97da1c4 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -36,6 +36,13 @@ static void ceph_vmtruncate_work(struct work_struct *work);
 /*
  * find or create an inode, given the ceph ino number
  */
+static int ceph_set_ino_cb(struct inode *inode, void *data)
+{
+	ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
+	inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
+	return 0;
+}
+
 struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
 {
 	struct inode *inode;
@@ -1809,7 +1816,7 @@ int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL);
 	if (!err) {
 		generic_fillattr(inode, stat);
-		stat->ino = inode->i_ino;
+		stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
 		if (ceph_snap(inode) != CEPH_NOSNAP)
 			stat->dev = ceph_snap(inode);
 		else
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 9c5085465a6..e39ea78c489 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -131,6 +131,7 @@ enum {
 	Opt_rbytes,
 	Opt_norbytes,
 	Opt_noasyncreaddir,
+	Opt_ino32,
 };
 
 static match_table_t fsopt_tokens = {
@@ -150,6 +151,7 @@ static match_table_t fsopt_tokens = {
 	{Opt_rbytes, "rbytes"},
 	{Opt_norbytes, "norbytes"},
 	{Opt_noasyncreaddir, "noasyncreaddir"},
+	{Opt_ino32, "ino32"},
 	{-1, NULL}
 };
 
@@ -225,6 +227,9 @@ static int parse_fsopt_token(char *c, void *private)
 	case Opt_noasyncreaddir:
 		fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
 		break;
+	case Opt_ino32:
+		fsopt->flags |= CEPH_MOUNT_OPT_INO32;
+		break;
 	default:
 		BUG_ON(token);
 	}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 20b907d76ae..5405c903704 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -27,6 +27,7 @@
 #define CEPH_MOUNT_OPT_DIRSTAT         (1<<4) /* `cat dirname` for stats */
 #define CEPH_MOUNT_OPT_RBYTES          (1<<5) /* dir st_bytes = rbytes */
 #define CEPH_MOUNT_OPT_NOASYNCREADDIR  (1<<7) /* no dcache readdir */
+#define CEPH_MOUNT_OPT_INO32           (1<<8) /* 32 bit inos */
 
 #define CEPH_MOUNT_OPT_DEFAULT    (CEPH_MOUNT_OPT_RBYTES)
 
@@ -319,6 +320,16 @@ static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
 	return container_of(inode, struct ceph_inode_info, vfs_inode);
 }
 
+static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
+{
+	return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
+}
+
+static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
+{
+	return (struct ceph_fs_client *)sb->s_fs_info;
+}
+
 static inline struct ceph_vino ceph_vino(struct inode *inode)
 {
 	return ceph_inode(inode)->i_vino;
@@ -327,19 +338,49 @@ static inline struct ceph_vino ceph_vino(struct inode *inode)
 /*
  * ino_t is <64 bits on many architectures, blech.
  *
- * don't include snap in ino hash, at least for now.
+ *               i_ino (kernel inode)   st_ino (userspace)
+ * i386          32                     32
+ * x86_64+ino32  64                     32
+ * x86_64        64                     64
+ */
+static inline u32 ceph_ino_to_ino32(ino_t ino)
+{
+	ino ^= ino >> (sizeof(ino) * 8 - 32);
+	if (!ino)
+		ino = 1;
+	return ino;
+}
+
+/*
+ * kernel i_ino value
  */
 static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
 {
 	ino_t ino = (ino_t)vino.ino;  /* ^ (vino.snap << 20); */
 #if BITS_PER_LONG == 32
-	ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
-	if (!ino)
-		ino = 1;
+	ino = ceph_ino_to_ino32(ino);
 #endif
 	return ino;
 }
 
+/*
+ * user-visible ino (stat, filldir)
+ */
+#if BITS_PER_LONG == 32
+static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
+{
+	return ino;
+}
+#else
+static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
+{
+	if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))
+		ino = ceph_ino_to_ino32(ino);
+	return ino;
+}
+#endif
+
+
 /* for printf-style formatting */
 #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
 
@@ -428,13 +469,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
 	return ((loff_t)frag << 32) | (loff_t)off;
 }
 
-static inline int ceph_set_ino_cb(struct inode *inode, void *data)
-{
-	ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
-	inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
-	return 0;
-}
-
 /*
  * caps helpers
  */
@@ -503,15 +537,6 @@ extern void ceph_reservation_status(struct ceph_fs_client *client,
 				    int *total, int *avail, int *used,
 				    int *reserved, int *min);
 
-static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
-{
-	return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
-}
-
-static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
-{
-	return (struct ceph_fs_client *)sb->s_fs_info;
-}
 
 
 /*
-- 
cgit v1.2.3


From 80456f8672f7e69d05c01627da03587dc1ea1603 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 10 Mar 2011 13:33:26 -0800
Subject: ceph: move readahead default to fs/ceph from libceph

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/super.c | 4 ++--
 fs/ceph/super.h | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index e39ea78c489..a9e78b4a258 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -293,7 +293,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
         fsopt->sb_flags = flags;
         fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
 
-        fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
+        fsopt->rsize = CEPH_RSIZE_DEFAULT;
         fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
 	fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
 	fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
@@ -375,7 +375,7 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
 
 	if (fsopt->wsize)
 		seq_printf(m, ",wsize=%d", fsopt->wsize);
-	if (fsopt->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
+	if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
 		seq_printf(m, ",rsize=%d", fsopt->rsize);
 	if (fsopt->congestion_kb != default_congestion_kb())
 		seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 5405c903704..619fe719968 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -36,6 +36,7 @@
 #define ceph_test_mount_opt(fsc, opt) \
 	(!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
 
+#define CEPH_RSIZE_DEFAULT             (512*1024) /* readahead */
 #define CEPH_MAX_READDIR_DEFAULT        1024
 #define CEPH_MAX_READDIR_BYTES_DEFAULT  (512*1024)
 #define CEPH_SNAPDIRNAME_DEFAULT        ".snap"
-- 
cgit v1.2.3


From 78a255654fa7f01945dea0dcedcf5113b3ad9f93 Mon Sep 17 00:00:00 2001
From: Henry C Chang <henry.cy.chang@gmail.com>
Date: Tue, 15 Mar 2011 09:18:01 +0000
Subject: ceph: remove request from unsafe list if it is canceled/timed out

This fixes the list corruption warning like this:

------------[ cut here ]------------
WARNING: at lib/list_debug.c:30 __list_add+0x68/0x81()
Hardware name: X8DTU
list_add corruption. prev->next should be next (ffff880618931250), but was (null). (prev=ffff880c188b9130).
Modules linked in: nfsd lockd nfs_acl auth_rpcgss exportfs ceph libceph libcrc32c sunrpc ipv6 fuse igb i2c_i801 ioatdma i2c_core iTCO_wdt iTCO_vendor_support joydev dca serio_raw usb_storage [last unloaded: scsi_wait_scan]
Pid: 10977, comm: smbd Tainted: G        W  2.6.32.23-170.Elaster.xendom0.fc12.x86_64 #1
Call Trace:
[<ffffffff8105753c>] warn_slowpath_common+0x7c/0x94
[<ffffffff810575ab>] warn_slowpath_fmt+0x41/0x43
[<ffffffff812351a3>] __list_add+0x68/0x81
[<ffffffffa014799d>] ceph_aio_write+0x614/0x8a2 [ceph]
[<ffffffff8111d2a0>] do_sync_write+0xe8/0x125
[<ffffffff81075a1f>] ? autoremove_wake_function+0x0/0x39
[<ffffffff811f21ec>] ? selinux_file_permission+0x5c/0xb3
[<ffffffff811e8521>] ? security_file_permission+0x16/0x18
[<ffffffff8111d864>] vfs_write+0xae/0x10b
[<ffffffff8111d91b>] sys_pwrite64+0x5a/0x76
[<ffffffff81012d32>] system_call_fastpath+0x16/0x1b
---[ end trace 08573eb9f07ff6f4 ]---

Signed-off-by: Henry C Chang <henry_c_chang@tcloudcomputing.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/file.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 7d0e4a82d89..db5d8630974 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -568,7 +568,14 @@ more:
 			spin_unlock(&ci->i_unsafe_lock);
 			ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
 		}
+		
 		ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+		if (ret < 0 && req->r_safe_callback) {
+			spin_lock(&ci->i_unsafe_lock);
+			list_del_init(&req->r_unsafe_item);
+			spin_unlock(&ci->i_unsafe_lock);
+			ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
+		}
 	}
 
 	if (file->f_flags & O_DIRECT)
-- 
cgit v1.2.3


From 49bcb93236ce1c60d9b7eb21a0aea1999f4d8709 Mon Sep 17 00:00:00 2001
From: Henry C Chang <henry.cy.chang@gmail.com>
Date: Tue, 15 Mar 2011 09:18:02 +0000
Subject: ceph: add request to the tail of unsafe write list

In sync_write_wait(), we assume that the newest request is at the
tail of unsafe write list. We should maintain the semantics here.

Signed-off-by: Henry C Chang <henry_c_chang@tcloudcomputing.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/file.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index db5d8630974..159b512d5a2 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -564,7 +564,8 @@ more:
 			 * start_request so that a tid has been assigned.
 			 */
 			spin_lock(&ci->i_unsafe_lock);
-			list_add(&req->r_unsafe_item, &ci->i_unsafe_writes);
+			list_add_tail(&req->r_unsafe_item,
+				      &ci->i_unsafe_writes);
 			spin_unlock(&ci->i_unsafe_lock);
 			ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
 		}
-- 
cgit v1.2.3


From 147851d2dc4d2be2f60d40276d12d7ef82f8a7ce Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 15 Mar 2011 14:57:41 -0700
Subject: ceph: rename dentry_release -> d_release, fix comment

Just for consistency's sake.  Fix obsolete comment too.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index cbe875d3a52..1a867a3601a 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1027,14 +1027,13 @@ out_touch:
 }
 
 /*
- * When a dentry is released, clear the dir I_COMPLETE if it was part
- * of the current dir gen or if this is in the snapshot namespace.
+ * Release our ceph_dentry_info.
  */
-static void ceph_dentry_release(struct dentry *dentry)
+static void ceph_d_release(struct dentry *dentry)
 {
 	struct ceph_dentry_info *di = ceph_dentry(dentry);
 
-	dout("dentry_release %p\n", dentry);
+	dout("d_release %p\n", dentry);
 	if (di) {
 		ceph_dentry_lru_del(dentry);
 		if (di->lease_session)
@@ -1259,14 +1258,14 @@ const struct inode_operations ceph_dir_iops = {
 
 const struct dentry_operations ceph_dentry_ops = {
 	.d_revalidate = ceph_d_revalidate,
-	.d_release = ceph_dentry_release,
+	.d_release = ceph_d_release,
 };
 
 const struct dentry_operations ceph_snapdir_dentry_ops = {
 	.d_revalidate = ceph_snapdir_d_revalidate,
-	.d_release = ceph_dentry_release,
+	.d_release = ceph_d_release,
 };
 
 const struct dentry_operations ceph_snap_dentry_ops = {
-	.d_release = ceph_dentry_release,
+	.d_release = ceph_d_release,
 };
-- 
cgit v1.2.3


From 366f7e7a79b19bd8c4e8f55fdf12b81538d1a7a4 Mon Sep 17 00:00:00 2001
From: "Luck, Tony" <tony.luck@intel.com>
Date: Fri, 18 Mar 2011 15:33:43 -0700
Subject: pstore: use mount option instead sysfs to tweak kmsg_bytes

/sys/fs is a somewhat strange way to tweak what could more
obviously be tuned with a mount option.

Suggested-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/pstore/inode.c    | 68 +++++++++++++++++++++++++++++++++++-----------------
 fs/pstore/internal.h |  3 +--
 fs/pstore/platform.c | 16 +++----------
 3 files changed, 50 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index f777f2902c4..977ed272384 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -27,6 +27,7 @@
 #include <linux/string.h>
 #include <linux/mount.h>
 #include <linux/ramfs.h>
+#include <linux/parser.h>
 #include <linux/sched.h>
 #include <linux/magic.h>
 #include <linux/pstore.h>
@@ -112,10 +113,52 @@ static struct inode *pstore_get_inode(struct super_block *sb,
 	return inode;
 }
 
+enum {
+	Opt_kmsg_bytes, Opt_err
+};
+
+static const match_table_t tokens = {
+	{Opt_kmsg_bytes, "kmsg_bytes=%u"},
+	{Opt_err, NULL}
+};
+
+static void parse_options(char *options)
+{
+	char		*p;
+	substring_t	args[MAX_OPT_ARGS];
+	int		option;
+
+	if (!options)
+		return;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_kmsg_bytes:
+			if (!match_int(&args[0], &option))
+				pstore_set_kmsg_bytes(option);
+			break;
+		}
+	}
+}
+
+static int pstore_remount(struct super_block *sb, int *flags, char *data)
+{
+	parse_options(data);
+
+	return 0;
+}
+
 static const struct super_operations pstore_ops = {
 	.statfs		= simple_statfs,
 	.drop_inode	= generic_delete_inode,
 	.evict_inode	= pstore_evict_inode,
+	.remount_fs	= pstore_remount,
 	.show_options	= generic_show_options,
 };
 
@@ -215,6 +258,8 @@ int pstore_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_op		= &pstore_ops;
 	sb->s_time_gran		= 1;
 
+	parse_options(data);
+
 	inode = pstore_get_inode(sb, NULL, S_IFDIR | 0755, 0);
 	if (!inode) {
 		err = -ENOMEM;
@@ -258,28 +303,7 @@ static struct file_system_type pstore_fs_type = {
 
 static int __init init_pstore_fs(void)
 {
-	int rc = 0;
-	struct kobject *pstorefs_kobj;
-
-	pstorefs_kobj = kobject_create_and_add("pstore", fs_kobj);
-	if (!pstorefs_kobj) {
-		rc = -ENOMEM;
-		goto done;
-	}
-
-	rc = sysfs_create_file(pstorefs_kobj, &pstore_kmsg_bytes_attr.attr);
-	if (rc)
-		goto done1;
-
-	rc = register_filesystem(&pstore_fs_type);
-	if (rc == 0)
-		goto done;
-
-	sysfs_remove_file(pstorefs_kobj, &pstore_kmsg_bytes_attr.attr);
-done1:
-	kobject_put(pstorefs_kobj);
-done:
-	return rc;
+	return register_filesystem(&pstore_fs_type);
 }
 module_init(init_pstore_fs)
 
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index 76c26d2fab2..8c9f23eb164 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -1,7 +1,6 @@
+extern void	pstore_set_kmsg_bytes(int);
 extern void	pstore_get_records(void);
 extern int	pstore_mkfile(enum pstore_type_id, char *psname, u64 id,
 			      char *data, size_t size,
 			      struct timespec time, int (*erase)(u64));
 extern int	pstore_is_mounted(void);
-
-extern struct kobj_attribute pstore_kmsg_bytes_attr;
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 705fdf8abf6..ce9ad84d5dd 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -37,24 +37,14 @@
 static DEFINE_SPINLOCK(pstore_lock);
 static struct pstore_info *psinfo;
 
-/* How much of the console log to snapshot. /sys/fs/pstore/kmsg_bytes */
+/* How much of the console log to snapshot */
 static unsigned long kmsg_bytes = 10240;
 
-static ssize_t b_show(struct kobject *kobj,
-		      struct kobj_attribute *attr, char *buf)
+void pstore_set_kmsg_bytes(int bytes)
 {
-	return snprintf(buf, PAGE_SIZE, "%lu\n", kmsg_bytes);
+	kmsg_bytes = bytes;
 }
 
-static ssize_t b_store(struct kobject *kobj, struct kobj_attribute *attr,
-		       const char *buf, size_t count)
-{
-	return (sscanf(buf, "%lu", &kmsg_bytes) > 0) ? count : 0;
-}
-
-struct kobj_attribute pstore_kmsg_bytes_attr =
-	__ATTR(kmsg_bytes, S_IRUGO | S_IWUSR, b_show, b_store);
-
 /* Tag each group of saved records with a sequence number */
 static int	oopscount;
 
-- 
cgit v1.2.3


From 1e9bb8808ac11094d711d20d580e7b45a4992d0c Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Tue, 22 Mar 2011 08:35:35 +0100
Subject: block: fix non-atomic access to genhd inflight structures

After the stack plugging introduction, these are called lockless.
Ensure that the counters are updated atomically.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/partitions/check.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 9c21119512b..ac546975031 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -290,7 +290,8 @@ ssize_t part_inflight_show(struct device *dev,
 {
 	struct hd_struct *p = dev_to_part(dev);
 
-	return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]);
+	return sprintf(buf, "%8u %8u\n", atomic_read(&p->in_flight[0]),
+		atomic_read(&p->in_flight[1]));
 }
 
 #ifdef CONFIG_FAIL_MAKE_REQUEST
-- 
cgit v1.2.3


From 5a7e0a8cf50cf905403f5a498e86d1f97cfcf51b Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 8 Mar 2011 16:39:46 +0530
Subject: fs/9p: Fix race in initializing writeback fid

When two processes open the same file, we can end up with both of them
allocating the writeback_fid. Add a new mutex that can be used to
synchronize v9fs_inode member values.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/v9fs.h           | 1 +
 fs/9p/vfs_file.c       | 3 +++
 fs/9p/vfs_inode.c      | 4 ++++
 fs/9p/vfs_inode_dotl.c | 3 +++
 4 files changed, 11 insertions(+)

(limited to 'fs')

diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index bd8496db135..89657e86516 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -130,6 +130,7 @@ struct v9fs_inode {
 #endif
 	unsigned int cache_validity;
 	struct p9_fid *writeback_fid;
+	struct mutex v_mutex;
 	struct inode vfs_inode;
 };
 
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 78bcb97c342..3337d58d0fc 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -90,6 +90,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 	}
 
 	file->private_data = fid;
+	mutex_lock(&v9inode->v_mutex);
 	if (v9ses->cache && !v9inode->writeback_fid) {
 		/*
 		 * clone a fid and add it to writeback_fid
@@ -101,10 +102,12 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 		fid = v9fs_writeback_fid(file->f_path.dentry);
 		if (IS_ERR(fid)) {
 			err = PTR_ERR(fid);
+			mutex_unlock(&v9inode->v_mutex);
 			goto out_error;
 		}
 		v9inode->writeback_fid = (void *) fid;
 	}
+	mutex_unlock(&v9inode->v_mutex);
 #ifdef CONFIG_9P_FSCACHE
 	if (v9ses->cache)
 		v9fs_cache_inode_set_cookie(inode, file);
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 8a2c232f708..c6cef2495f0 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -221,6 +221,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb)
 #endif
 	v9inode->writeback_fid = NULL;
 	v9inode->cache_validity = 0;
+	mutex_init(&v9inode->v_mutex);
 	return &v9inode->vfs_inode;
 }
 
@@ -650,6 +651,7 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	/* if we are opening a file, assign the open fid to the file */
 	if (nd && nd->flags & LOOKUP_OPEN) {
 		v9inode = V9FS_I(dentry->d_inode);
+		mutex_lock(&v9inode->v_mutex);
 		if (v9ses->cache && !v9inode->writeback_fid) {
 			/*
 			 * clone a fid and add it to writeback_fid
@@ -661,10 +663,12 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 			inode_fid = v9fs_writeback_fid(dentry);
 			if (IS_ERR(inode_fid)) {
 				err = PTR_ERR(inode_fid);
+				mutex_unlock(&v9inode->v_mutex);
 				goto error;
 			}
 			v9inode->writeback_fid = (void *) inode_fid;
 		}
+		mutex_unlock(&v9inode->v_mutex);
 		filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
 		if (IS_ERR(filp)) {
 			err = PTR_ERR(filp);
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 67c138e94fe..327c578c7ba 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -245,6 +245,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 	v9fs_set_create_acl(dentry, dacl, pacl);
 
 	v9inode = V9FS_I(inode);
+	mutex_lock(&v9inode->v_mutex);
 	if (v9ses->cache && !v9inode->writeback_fid) {
 		/*
 		 * clone a fid and add it to writeback_fid
@@ -256,10 +257,12 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 		inode_fid = v9fs_writeback_fid(dentry);
 		if (IS_ERR(inode_fid)) {
 			err = PTR_ERR(inode_fid);
+			mutex_unlock(&v9inode->v_mutex);
 			goto error;
 		}
 		v9inode->writeback_fid = (void *) inode_fid;
 	}
+	mutex_unlock(&v9inode->v_mutex);
 	/* Since we are opening a file, assign the open fid to the file */
 	filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
 	if (IS_ERR(filp)) {
-- 
cgit v1.2.3


From 059c138bc79320bd7d6fba91bad4d50eeec9c31f Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 8 Mar 2011 16:39:48 +0530
Subject: fs/9p: Use truncate_setsize instead of vmtruncate

Convert vmtruncate usage to truncate_setsize. We also write back all
dirty pages before doing 9p operations and call truncate_setsize on success.
This ensures that we continue sanely if the truncate fails on the server. The
disadvantage is that we now write back content that gets thrown away later
as part of the truncate.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_inode.c      | 16 ++++++++++------
 fs/9p/vfs_inode_dotl.c | 12 +++++-------
 2 files changed, 15 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index c6cef2495f0..0afbbb450fa 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -971,6 +971,10 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
 	struct p9_wstat wstat;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "\n");
+	retval = inode_change_ok(dentry->d_inode, iattr);
+	if (retval)
+		return retval;
+
 	retval = -EPERM;
 	v9ses = v9fs_inode2v9ses(dentry->d_inode);
 	fid = v9fs_fid_lookup(dentry);
@@ -997,12 +1001,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
 		if (iattr->ia_valid & ATTR_GID)
 			wstat.n_gid = iattr->ia_gid;
 	}
-	if ((iattr->ia_valid & ATTR_SIZE) &&
-	    iattr->ia_size != i_size_read(dentry->d_inode)) {
-		retval = vmtruncate(dentry->d_inode, iattr->ia_size);
-		if (retval)
-			return retval;
-	}
+
 	/* Write all dirty data */
 	if (S_ISREG(dentry->d_inode->i_mode))
 		filemap_write_and_wait(dentry->d_inode->i_mapping);
@@ -1010,6 +1009,11 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
 	retval = p9_client_wstat(fid, &wstat);
 	if (retval < 0)
 		return retval;
+
+	if ((iattr->ia_valid & ATTR_SIZE) &&
+	    iattr->ia_size != i_size_read(dentry->d_inode))
+		truncate_setsize(dentry->d_inode, iattr->ia_size);
+
 	v9fs_invalidate_inode_attr(dentry->d_inode);
 
 	setattr_copy(dentry->d_inode, iattr);
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 327c578c7ba..0a0ac30d51d 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -456,12 +456,6 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
 	if (IS_ERR(fid))
 		return PTR_ERR(fid);
 
-	if ((iattr->ia_valid & ATTR_SIZE) &&
-	    iattr->ia_size != i_size_read(dentry->d_inode)) {
-		retval = vmtruncate(dentry->d_inode, iattr->ia_size);
-		if (retval)
-			return retval;
-	}
 	/* Write all dirty data */
 	if (S_ISREG(dentry->d_inode->i_mode))
 		filemap_write_and_wait(dentry->d_inode->i_mapping);
@@ -469,8 +463,12 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
 	retval = p9_client_setattr(fid, &p9attr);
 	if (retval < 0)
 		return retval;
-	v9fs_invalidate_inode_attr(dentry->d_inode);
 
+	if ((iattr->ia_valid & ATTR_SIZE) &&
+	    iattr->ia_size != i_size_read(dentry->d_inode))
+		truncate_setsize(dentry->d_inode, iattr->ia_size);
+
+	v9fs_invalidate_inode_attr(dentry->d_inode);
 	setattr_copy(dentry->d_inode, iattr);
 	mark_inode_dirty(dentry->d_inode);
 	if (iattr->ia_valid & ATTR_MODE) {
-- 
cgit v1.2.3


From ea59bb759b8fd240860c37026ab1b998d26ac285 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 8 Mar 2011 16:39:49 +0530
Subject: fs/9p: Open writeback fid in O_SYNC mode

Older versions of the protocol don't support the tsyncfs operation,
so for them force an O_SYNC flag on the server.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/fid.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index cd63e002d82..99a1a7b0863 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -286,9 +286,11 @@ static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, uid_t uid)
 
 struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)
 {
-	int err;
+	int err, flags;
 	struct p9_fid *fid;
+	struct v9fs_session_info *v9ses;
 
+	v9ses = v9fs_inode2v9ses(dentry->d_inode);
 	fid = v9fs_fid_clone_with_uid(dentry, 0);
 	if (IS_ERR(fid))
 		goto error_out;
@@ -297,8 +299,17 @@ struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)
 	 * dirty pages. We always request for the open fid in read-write
 	 * mode so that a partial page write which result in page
 	 * read can work.
+	 *
+	 * we don't have a tsyncfs operation for older version
+	 * of protocol. So make sure the write back fid is
+	 * opened in O_SYNC mode.
 	 */
-	err = p9_client_open(fid, O_RDWR);
+	if (!v9fs_proto_dotl(v9ses))
+		flags = O_RDWR | O_SYNC;
+	else
+		flags = O_RDWR;
+
+	err = p9_client_open(fid, flags);
 	if (err < 0) {
 		p9_client_clunk(fid);
 		fid = ERR_PTR(err);
-- 
cgit v1.2.3


From 7add697a3d271aa7080513f92dab190c75174b7e Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 8 Mar 2011 16:39:49 +0530
Subject: fs/9p: Attach writeback_fid on first open with WR flag

We don't need a writeback fid if we are only doing an O_RDONLY open.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_file.c       | 3 ++-
 fs/9p/vfs_inode.c      | 3 ++-
 fs/9p/vfs_inode_dotl.c | 3 ++-
 3 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 3337d58d0fc..6997eb62fbb 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -91,7 +91,8 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 
 	file->private_data = fid;
 	mutex_lock(&v9inode->v_mutex);
-	if (v9ses->cache && !v9inode->writeback_fid) {
+	if (v9ses->cache && !v9inode->writeback_fid &&
+	    ((file->f_flags & O_ACCMODE) != O_RDONLY)) {
 		/*
 		 * clone a fid and add it to writeback_fid
 		 * we do it during open time instead of
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 0afbbb450fa..774a20af01e 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -652,7 +652,8 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	if (nd && nd->flags & LOOKUP_OPEN) {
 		v9inode = V9FS_I(dentry->d_inode);
 		mutex_lock(&v9inode->v_mutex);
-		if (v9ses->cache && !v9inode->writeback_fid) {
+		if (v9ses->cache && !v9inode->writeback_fid &&
+		    ((flags & O_ACCMODE) != O_RDONLY)) {
 			/*
 			 * clone a fid and add it to writeback_fid
 			 * we do it during open time instead of
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 0a0ac30d51d..8b616dc3567 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -246,7 +246,8 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 
 	v9inode = V9FS_I(inode);
 	mutex_lock(&v9inode->v_mutex);
-	if (v9ses->cache && !v9inode->writeback_fid) {
+	if (v9ses->cache && !v9inode->writeback_fid &&
+	    ((flags & O_ACCMODE) != O_RDONLY)) {
 		/*
 		 * clone a fid and add it to writeback_fid
 		 * we do it during open time instead of
-- 
cgit v1.2.3


From 42869c8adae72366fc6c4f3924ce3d6c3735c4a3 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 8 Mar 2011 16:39:50 +0530
Subject: fs/9p: Add v9fs_dentry2v9ses

Add the new static inline helper v9fs_dentry2v9ses() and use it in place
of v9fs_inode2v9ses(dentry->d_inode).

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/acl.c            | 4 ++--
 fs/9p/fid.c            | 6 +++---
 fs/9p/v9fs.h           | 5 +++++
 fs/9p/vfs_inode.c      | 6 +++---
 fs/9p/vfs_inode_dotl.c | 4 ++--
 fs/9p/vfs_super.c      | 2 +-
 6 files changed, 16 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 51545529637..33aa116732c 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -262,7 +262,7 @@ static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
 	if (strcmp(name, "") != 0)
 		return -EINVAL;
 
-	v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	v9ses = v9fs_dentry2v9ses(dentry);
 	/*
 	 * We allow set/get/list of acl when access=client is not specified
 	 */
@@ -312,7 +312,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
 	if (strcmp(name, "") != 0)
 		return -EINVAL;
 
-	v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	v9ses = v9fs_dentry2v9ses(dentry);
 	/*
 	 * set the attribute on the remote. Without even looking at the
 	 * xattr value. We leave it to the server to validate
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 99a1a7b0863..0ee594569dc 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -134,7 +134,7 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid, *old_fid = NULL;
 
-	v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	v9ses = v9fs_dentry2v9ses(dentry);
 	access = v9ses->flags & V9FS_ACCESS_MASK;
 	fid = v9fs_fid_find(dentry, uid, any);
 	if (fid)
@@ -237,7 +237,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 	int  any, access;
 	struct v9fs_session_info *v9ses;
 
-	v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	v9ses = v9fs_dentry2v9ses(dentry);
 	access = v9ses->flags & V9FS_ACCESS_MASK;
 	switch (access) {
 	case V9FS_ACCESS_SINGLE:
@@ -290,7 +290,7 @@ struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)
 	struct p9_fid *fid;
 	struct v9fs_session_info *v9ses;
 
-	v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	v9ses = v9fs_dentry2v9ses(dentry);
 	fid = v9fs_fid_clone_with_uid(dentry, 0);
 	if (IS_ERR(fid))
 		goto error_out;
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 89657e86516..9665c2b840e 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -174,6 +174,11 @@ static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode)
 	return (inode->i_sb->s_fs_info);
 }
 
+static inline struct v9fs_session_info *v9fs_dentry2v9ses(struct dentry *dentry)
+{
+	return dentry->d_sb->s_fs_info;
+}
+
 static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses)
 {
 	return v9ses->flags & V9FS_PROTO_2000U;
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 774a20af01e..7f6c6770319 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -936,7 +936,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 
 	P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
 	err = -EPERM;
-	v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	v9ses = v9fs_dentry2v9ses(dentry);
 	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
 		generic_fillattr(dentry->d_inode, stat);
 		return 0;
@@ -977,7 +977,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
 		return retval;
 
 	retval = -EPERM;
-	v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	v9ses = v9fs_dentry2v9ses(dentry);
 	fid = v9fs_fid_lookup(dentry);
 	if(IS_ERR(fid))
 		return PTR_ERR(fid);
@@ -1139,7 +1139,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
 
 	P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
 	retval = -EPERM;
-	v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	v9ses = v9fs_dentry2v9ses(dentry);
 	fid = v9fs_fid_lookup(dentry);
 	if (IS_ERR(fid))
 		return PTR_ERR(fid);
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 8b616dc3567..ffbb113d5f3 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -395,7 +395,7 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
 
 	P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
 	err = -EPERM;
-	v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	v9ses = v9fs_dentry2v9ses(dentry);
 	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
 		generic_fillattr(dentry->d_inode, stat);
 		return 0;
@@ -452,7 +452,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
 	p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
 
 	retval = -EPERM;
-	v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	v9ses = v9fs_dentry2v9ses(dentry);
 	fid = v9fs_fid_lookup(dentry);
 	if (IS_ERR(fid))
 		return PTR_ERR(fid);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 09fd08d1606..f3eed3383e4 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -262,7 +262,7 @@ static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
 		goto done;
 	}
 
-	v9ses = v9fs_inode2v9ses(dentry->d_inode);
+	v9ses = v9fs_dentry2v9ses(dentry);
 	if (v9fs_proto_dotl(v9ses)) {
 		res = p9_client_statfs(fid, &rs);
 		if (res == 0) {
-- 
cgit v1.2.3


From aaf0ef1d2bce05cfd06cf29c96a6973df4d0a6a8 Mon Sep 17 00:00:00 2001
From: "M. Mohan Kumar" <mohan@in.ibm.com>
Date: Wed, 16 Mar 2011 21:40:49 +0530
Subject: 9p: use the updated offset given by generic_write_checks

Without this fix, even if a file is opened in O_APPEND mode, data will be
written at the current file position instead of at the end of the file.

Signed-off-by: M. Mohan Kumar <mohan@in.ibm.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/vfs_file.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 6997eb62fbb..ffed55817f0 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -508,9 +508,12 @@ v9fs_file_write(struct file *filp, const char __user * data,
 	if (!count)
 		goto out;
 
-	return v9fs_file_write_internal(filp->f_path.dentry->d_inode,
+	retval = v9fs_file_write_internal(filp->f_path.dentry->d_inode,
 					filp->private_data,
-					data, count, offset, 1);
+					data, count, &origin, 1);
+	/* update offset on successful write */
+	if (retval > 0)
+		*offset = origin;
 out:
 	return retval;
 }
-- 
cgit v1.2.3


From 117a91e0f25fd7698e20ac3dfa62086be3dc82a3 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Tue, 22 Mar 2011 23:01:26 +0000
Subject: Squashfs: Use vmalloc rather than kmalloc for zlib workspace

Bugzilla bug 31422 reports occasional "page allocation failure. order:4"
at Squashfs mount time.  Fix this by making zlib workspace allocation
use vmalloc rather than kmalloc.

Reported-by: Mehmet Giritli <mehmet@giritli.eu>
Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/zlib_wrapper.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 195b0d035e9..517688b32ff 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -26,6 +26,7 @@
 #include <linux/buffer_head.h>
 #include <linux/slab.h>
 #include <linux/zlib.h>
+#include <linux/vmalloc.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
@@ -37,8 +38,7 @@ static void *zlib_init(struct squashfs_sb_info *dummy, void *buff, int len)
 	z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL);
 	if (stream == NULL)
 		goto failed;
-	stream->workspace = kmalloc(zlib_inflate_workspacesize(),
-		GFP_KERNEL);
+	stream->workspace = vmalloc(zlib_inflate_workspacesize());
 	if (stream->workspace == NULL)
 		goto failed;
 
@@ -56,7 +56,7 @@ static void zlib_free(void *strm)
 	z_stream *stream = strm;
 
 	if (stream)
-		kfree(stream->workspace);
+		vfree(stream->workspace);
 	kfree(stream);
 }
 
-- 
cgit v1.2.3


From 9f6af27fb693c633cc46d36fc1d85efe6d6fbc17 Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Tue, 22 Mar 2011 16:01:49 -0700
Subject: pstore: cleanups to pstore_dump()

pstore_dump() can be called with many different "reason" codes. Save
the name of the code in the persistent store record.

Also, it is only worthwhile calling pstore_mkfile() for KMSG_DUMP_OOPS,
since that is the only reason after which the kernel will continue running.

Reviewed-by: Seiji Aguchi <seiji.aguchi@hds.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 fs/pstore/platform.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index ce9ad84d5dd..f835a25625f 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -48,6 +48,10 @@ void pstore_set_kmsg_bytes(int bytes)
 /* Tag each group of saved records with a sequence number */
 static int	oopscount;
 
+static char *reason_str[] = {
+	"Oops", "Panic", "Kexec", "Restart", "Halt", "Poweroff", "Emergency"
+};
+
 /*
  * callback from kmsg_dump. (s2,l2) has the most recently
  * written bytes, older bytes are in (s1,l1). Save as much
@@ -61,15 +65,20 @@ static void pstore_dump(struct kmsg_dumper *dumper,
 	unsigned long	s1_start, s2_start;
 	unsigned long	l1_cpy, l2_cpy;
 	unsigned long	size, total = 0;
-	char		*dst;
+	char		*dst, *why;
 	u64		id;
 	int		hsize, part = 1;
 
+	if (reason < ARRAY_SIZE(reason_str))
+		why = reason_str[reason];
+	else
+		why = "Unknown";
+
 	mutex_lock(&psinfo->buf_mutex);
 	oopscount++;
 	while (total < kmsg_bytes) {
 		dst = psinfo->buf;
-		hsize = sprintf(dst, "Oops#%d Part%d\n", oopscount, part++);
+		hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, part++);
 		size = psinfo->bufsize - hsize;
 		dst += hsize;
 
@@ -86,7 +95,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
 		memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy);
 
 		id = psinfo->write(PSTORE_TYPE_DMESG, hsize + l1_cpy + l2_cpy);
-		if (pstore_is_mounted())
+		if (reason == KMSG_DUMP_OOPS && pstore_is_mounted())
 			pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id,
 				      psinfo->buf, hsize + l1_cpy + l2_cpy,
 				      CURRENT_TIME, psinfo->erase);
-- 
cgit v1.2.3


From ef6a3c63112e865d632ff7c478ba7c7160cad0d1 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Tue, 22 Mar 2011 16:30:52 -0700
Subject: mm: add replace_page_cache_page() function

This function basically does:

     remove_from_page_cache(old);
     page_cache_release(old);
     add_to_page_cache_locked(new);

Except it does this atomically, so there's no possibility for the "add" to
fail because of a race.

If memory cgroups are enabled, then the memory cgroup charge is also moved
from the old page to the new.

This function is currently used by fuse to move pages into the page cache
on read, instead of copying the page contents.

[minchan.kim@gmail.com: add freepage() hook to replace_page_cache_page()]
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fuse/dev.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 213d3cf4f5e..640fc229df1 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -737,14 +737,12 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
 	if (WARN_ON(PageMlocked(oldpage)))
 		goto out_fallback_unlock;
 
-	remove_from_page_cache(oldpage);
-	page_cache_release(oldpage);
-
-	err = add_to_page_cache_locked(newpage, mapping, index, GFP_KERNEL);
+	err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL);
 	if (err) {
-		printk(KERN_WARNING "fuse_try_move_page: failed to add page");
-		goto out_fallback_unlock;
+		unlock_page(newpage);
+		return err;
 	}
+
 	page_cache_get(newpage);
 
 	if (!(buf->flags & PIPE_BUF_FLAG_LRU))
-- 
cgit v1.2.3


From bd65cb86c98a79bc61afd0d80166005f125e9064 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan.kim@gmail.com>
Date: Tue, 22 Mar 2011 16:30:54 -0700
Subject: mm: hugetlbfs: change remove_from_page_cache

This patch series changes remove_from_page_cache()'s page ref counting
rule.  The page cache ref count is now decreased in delete_from_page_cache(),
so callers no longer need to drop the page reference themselves.

Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Cc: William Irwin <wli@holomorphy.com>
Acked-by: Hugh Dickins <hughd@google.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hugetlbfs/inode.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 9885082b470..b9eeb1cd03f 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -332,8 +332,7 @@ static void truncate_huge_page(struct page *page)
 {
 	cancel_dirty_page(page, /* No IO accounting for huge pages? */0);
 	ClearPageUptodate(page);
-	remove_from_page_cache(page);
-	put_page(page);
+	delete_from_page_cache(page);
 }
 
 static void truncate_hugepages(struct inode *inode, loff_t lstart)
-- 
cgit v1.2.3


From 033193275b3ffcfe7f3fde7b569f3d207f6cd6a0 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave@linux.vnet.ibm.com>
Date: Tue, 22 Mar 2011 16:32:56 -0700
Subject: pagewalk: only split huge pages when necessary

Right now, if a mm_walk has either ->pte_entry or ->pmd_entry set, it will
unconditionally split any transparent huge pages it runs into.  In
practice, that means that anyone doing a

	cat /proc/$pid/smaps

will unconditionally break down every huge page in the process and depend
on khugepaged to re-collapse it later.  This is fairly suboptimal.

This patch changes that behavior.  It teaches each ->pmd_entry handler
(there are five) that they must break down the THPs themselves.  Also, the
_generic_ code will never break down a THP unless a ->pte_entry handler is
actually set.

This means that the ->pmd_entry handlers can now choose to deal with THPs
without breaking them down.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: David Rientjes <rientjes@google.com>
Reviewed-by: Eric B Munson <emunson@mgebm.net>
Tested-by: Eric B Munson <emunson@mgebm.net>
Cc: Michael J Wolf <mjwolf@us.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 60b914860f8..78fd3621f56 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -343,6 +343,8 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	struct page *page;
 	int mapcount;
 
+	split_huge_page_pmd(walk->mm, pmd);
+
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; pte++, addr += PAGE_SIZE) {
 		ptent = *pte;
@@ -467,6 +469,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 	spinlock_t *ptl;
 	struct page *page;
 
+	split_huge_page_pmd(walk->mm, pmd);
+
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; pte++, addr += PAGE_SIZE) {
 		ptent = *pte;
@@ -623,6 +627,8 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	pte_t *pte;
 	int err = 0;
 
+	split_huge_page_pmd(walk->mm, pmd);
+
 	/* find the first VMA at or above 'addr' */
 	vma = find_vma(walk->mm, addr);
 	for (; addr != end; addr += PAGE_SIZE) {
-- 
cgit v1.2.3


From ae11c4d9f646064cf086e2f8cd4b3c475df7739c Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave@linux.vnet.ibm.com>
Date: Tue, 22 Mar 2011 16:32:58 -0700
Subject: smaps: break out smaps_pte_entry() from smaps_pte_range()

We will use smaps_pte_entry() in a moment to handle both small and
transparent large pages.  But, we must break it out of smaps_pte_range()
first.

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: David Rientjes <rientjes@google.com>
Reviewed-by: Eric B Munson <emunson@mgebm.net>
Tested-by: Eric B Munson <emunson@mgebm.net>
Cc: Michael J Wolf <mjwolf@us.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 87 +++++++++++++++++++++++++++++-------------------------
 1 file changed, 47 insertions(+), 40 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 78fd3621f56..5cd06fa3106 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -333,56 +333,63 @@ struct mem_size_stats {
 	u64 pss;
 };
 
-static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
-			   struct mm_walk *walk)
+
+static void smaps_pte_entry(pte_t ptent, unsigned long addr,
+		struct mm_walk *walk)
 {
 	struct mem_size_stats *mss = walk->private;
 	struct vm_area_struct *vma = mss->vma;
-	pte_t *pte, ptent;
-	spinlock_t *ptl;
 	struct page *page;
 	int mapcount;
 
-	split_huge_page_pmd(walk->mm, pmd);
-
-	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
-	for (; addr != end; pte++, addr += PAGE_SIZE) {
-		ptent = *pte;
+	if (is_swap_pte(ptent)) {
+		mss->swap += PAGE_SIZE;
+		return;
+	}
 
-		if (is_swap_pte(ptent)) {
-			mss->swap += PAGE_SIZE;
-			continue;
-		}
+	if (!pte_present(ptent))
+		return;
+
+	page = vm_normal_page(vma, addr, ptent);
+	if (!page)
+		return;
+
+	if (PageAnon(page))
+		mss->anonymous += PAGE_SIZE;
+
+	mss->resident += PAGE_SIZE;
+	/* Accumulate the size in pages that have been accessed. */
+	if (pte_young(ptent) || PageReferenced(page))
+		mss->referenced += PAGE_SIZE;
+	mapcount = page_mapcount(page);
+	if (mapcount >= 2) {
+		if (pte_dirty(ptent) || PageDirty(page))
+			mss->shared_dirty += PAGE_SIZE;
+		else
+			mss->shared_clean += PAGE_SIZE;
+		mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
+	} else {
+		if (pte_dirty(ptent) || PageDirty(page))
+			mss->private_dirty += PAGE_SIZE;
+		else
+			mss->private_clean += PAGE_SIZE;
+		mss->pss += (PAGE_SIZE << PSS_SHIFT);
+	}
+}
 
-		if (!pte_present(ptent))
-			continue;
+static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+			   struct mm_walk *walk)
+{
+	struct mem_size_stats *mss = walk->private;
+	struct vm_area_struct *vma = mss->vma;
+	pte_t *pte;
+	spinlock_t *ptl;
 
-		page = vm_normal_page(vma, addr, ptent);
-		if (!page)
-			continue;
+	split_huge_page_pmd(walk->mm, pmd);
 
-		if (PageAnon(page))
-			mss->anonymous += PAGE_SIZE;
-
-		mss->resident += PAGE_SIZE;
-		/* Accumulate the size in pages that have been accessed. */
-		if (pte_young(ptent) || PageReferenced(page))
-			mss->referenced += PAGE_SIZE;
-		mapcount = page_mapcount(page);
-		if (mapcount >= 2) {
-			if (pte_dirty(ptent) || PageDirty(page))
-				mss->shared_dirty += PAGE_SIZE;
-			else
-				mss->shared_clean += PAGE_SIZE;
-			mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
-		} else {
-			if (pte_dirty(ptent) || PageDirty(page))
-				mss->private_dirty += PAGE_SIZE;
-			else
-				mss->private_clean += PAGE_SIZE;
-			mss->pss += (PAGE_SIZE << PSS_SHIFT);
-		}
-	}
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	for (; addr != end; pte++, addr += PAGE_SIZE)
+		smaps_pte_entry(*pte, addr, walk);
 	pte_unmap_unlock(pte - 1, ptl);
 	cond_resched();
 	return 0;
-- 
cgit v1.2.3


From 3c9acc7849b1eab7ffc75e933404c5f32865d9a2 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave@linux.vnet.ibm.com>
Date: Tue, 22 Mar 2011 16:32:59 -0700
Subject: smaps: pass pte size argument in to smaps_pte_entry()

Add an argument to the new smaps_pte_entry() function to let it account in
things other than PAGE_SIZE units.  I changed all of the PAGE_SIZE sites,
even though not all of them can be reached for transparent huge pages,
just so this will continue to work without changes as THPs are improved.

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: David Rientjes <rientjes@google.com>
Reviewed-by: Eric B Munson <emunson@mgebm.net>
Tested-by: Eric B Munson <emunson@mgebm.net>
Cc: Michael J Wolf <mjwolf@us.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 5cd06fa3106..d7e2af33407 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -335,7 +335,7 @@ struct mem_size_stats {
 
 
 static void smaps_pte_entry(pte_t ptent, unsigned long addr,
-		struct mm_walk *walk)
+		unsigned long ptent_size, struct mm_walk *walk)
 {
 	struct mem_size_stats *mss = walk->private;
 	struct vm_area_struct *vma = mss->vma;
@@ -343,7 +343,7 @@ static void smaps_pte_entry(pte_t ptent, unsigned long addr,
 	int mapcount;
 
 	if (is_swap_pte(ptent)) {
-		mss->swap += PAGE_SIZE;
+		mss->swap += ptent_size;
 		return;
 	}
 
@@ -355,25 +355,25 @@ static void smaps_pte_entry(pte_t ptent, unsigned long addr,
 		return;
 
 	if (PageAnon(page))
-		mss->anonymous += PAGE_SIZE;
+		mss->anonymous += ptent_size;
 
-	mss->resident += PAGE_SIZE;
+	mss->resident += ptent_size;
 	/* Accumulate the size in pages that have been accessed. */
 	if (pte_young(ptent) || PageReferenced(page))
-		mss->referenced += PAGE_SIZE;
+		mss->referenced += ptent_size;
 	mapcount = page_mapcount(page);
 	if (mapcount >= 2) {
 		if (pte_dirty(ptent) || PageDirty(page))
-			mss->shared_dirty += PAGE_SIZE;
+			mss->shared_dirty += ptent_size;
 		else
-			mss->shared_clean += PAGE_SIZE;
-		mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
+			mss->shared_clean += ptent_size;
+		mss->pss += (ptent_size << PSS_SHIFT) / mapcount;
 	} else {
 		if (pte_dirty(ptent) || PageDirty(page))
-			mss->private_dirty += PAGE_SIZE;
+			mss->private_dirty += ptent_size;
 		else
-			mss->private_clean += PAGE_SIZE;
-		mss->pss += (PAGE_SIZE << PSS_SHIFT);
+			mss->private_clean += ptent_size;
+		mss->pss += (ptent_size << PSS_SHIFT);
 	}
 }
 
@@ -389,7 +389,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; pte++, addr += PAGE_SIZE)
-		smaps_pte_entry(*pte, addr, walk);
+		smaps_pte_entry(*pte, addr, PAGE_SIZE, walk);
 	pte_unmap_unlock(pte - 1, ptl);
 	cond_resched();
 	return 0;
-- 
cgit v1.2.3


From 22e057c5923e60debad318cbeaee33033b110bc8 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave@linux.vnet.ibm.com>
Date: Tue, 22 Mar 2011 16:33:00 -0700
Subject: smaps: teach smaps_pte_range() about THP pmds

This adds code to explicitly detect and handle pmd_trans_huge() pmds.  It
then passes HPAGE_PMD_SIZE units in to the smaps_pte_entry() function instead
of PAGE_SIZE.

This means that using /proc/$pid/smaps now will no longer cause THPs to be
broken down in to small pages.

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Reviewed-by: Eric B Munson <emunson@mgebm.net>
Tested-by: Eric B Munson <emunson@mgebm.net>
Acked-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Michael J Wolf <mjwolf@us.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index d7e2af33407..26f9cc00102 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1,5 +1,6 @@
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
+#include <linux/huge_mm.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
 #include <linux/highmem.h>
@@ -7,6 +8,7 @@
 #include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/mempolicy.h>
+#include <linux/rmap.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
 
@@ -385,8 +387,25 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	pte_t *pte;
 	spinlock_t *ptl;
 
-	split_huge_page_pmd(walk->mm, pmd);
-
+	spin_lock(&walk->mm->page_table_lock);
+	if (pmd_trans_huge(*pmd)) {
+		if (pmd_trans_splitting(*pmd)) {
+			spin_unlock(&walk->mm->page_table_lock);
+			wait_split_huge_page(vma->anon_vma, pmd);
+		} else {
+			smaps_pte_entry(*(pte_t *)pmd, addr,
+					HPAGE_PMD_SIZE, walk);
+			spin_unlock(&walk->mm->page_table_lock);
+			return 0;
+		}
+	} else {
+		spin_unlock(&walk->mm->page_table_lock);
+	}
+	/*
+	 * The mmap_sem held all the way back in m_start() is what
+	 * keeps khugepaged out of here and from collapsing things
+	 * in here.
+	 */
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; pte++, addr += PAGE_SIZE)
 		smaps_pte_entry(*pte, addr, PAGE_SIZE, walk);
-- 
cgit v1.2.3


From 4031a219d8913da40ade5a6e5b538cc61e975cc8 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave@linux.vnet.ibm.com>
Date: Tue, 22 Mar 2011 16:33:01 -0700
Subject: smaps: have smaps show transparent huge pages

Now that the mere act of _looking_ at /proc/$pid/smaps will not destroy
transparent huge pages, tell how much of the VMA is actually mapped with
them.

This way, we can make sure that we're getting THPs where we
expect to see them.

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: David Rientjes <rientjes@google.com>
Reviewed-by: Eric B Munson <emunson@mgebm.net>
Tested-by: Eric B Munson <emunson@mgebm.net>
Cc: Michael J Wolf <mjwolf@us.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 26f9cc00102..93381aae936 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -331,6 +331,7 @@ struct mem_size_stats {
 	unsigned long private_dirty;
 	unsigned long referenced;
 	unsigned long anonymous;
+	unsigned long anonymous_thp;
 	unsigned long swap;
 	u64 pss;
 };
@@ -396,6 +397,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 			smaps_pte_entry(*(pte_t *)pmd, addr,
 					HPAGE_PMD_SIZE, walk);
 			spin_unlock(&walk->mm->page_table_lock);
+			mss->anonymous_thp += HPAGE_PMD_SIZE;
 			return 0;
 		}
 	} else {
@@ -444,6 +446,7 @@ static int show_smap(struct seq_file *m, void *v)
 		   "Private_Dirty:  %8lu kB\n"
 		   "Referenced:     %8lu kB\n"
 		   "Anonymous:      %8lu kB\n"
+		   "AnonHugePages:  %8lu kB\n"
 		   "Swap:           %8lu kB\n"
 		   "KernelPageSize: %8lu kB\n"
 		   "MMUPageSize:    %8lu kB\n"
@@ -457,6 +460,7 @@ static int show_smap(struct seq_file *m, void *v)
 		   mss.private_dirty >> 10,
 		   mss.referenced >> 10,
 		   mss.anonymous >> 10,
+		   mss.anonymous_thp >> 10,
 		   mss.swap >> 10,
 		   vma_kernel_pagesize(vma) >> 10,
 		   vma_mmu_pagesize(vma) >> 10,
-- 
cgit v1.2.3


From 80cdc6dae76ea67d2b21bdca8df17ef47251eb8b Mon Sep 17 00:00:00 2001
From: Mandeep Singh Baines <msb@chromium.org>
Date: Tue, 22 Mar 2011 16:33:54 -0700
Subject: fs: use appropriate printk priority levels

printk()s without a priority level default to KERN_WARNING.  To reduce
noise at KERN_WARNING, this patch sets the priority level appropriately
for unleveled printk()s.  This should be useful to folks that look at
dmesg warnings closely.

Signed-off-by: Mandeep Singh Baines <msb@chromium.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/bio.c       | 2 +-
 fs/namespace.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index 4bd454fa844..4cf2a52fbc5 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -111,7 +111,7 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
 	if (!slab)
 		goto out_unlock;
 
-	printk("bio: create slab <%s> at %d\n", bslab->name, entry);
+	printk(KERN_INFO "bio: create slab <%s> at %d\n", bslab->name, entry);
 	bslab->slab = slab;
 	bslab->slab_ref = 1;
 	bslab->slab_size = sz;
diff --git a/fs/namespace.c b/fs/namespace.c
index 9263995bf6a..7dba2ed0342 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2701,7 +2701,7 @@ void __init mnt_init(void)
 	if (!mount_hashtable)
 		panic("Failed to allocate mount hash table\n");
 
-	printk("Mount-cache hash table entries: %lu\n", HASH_SIZE);
+	printk(KERN_INFO "Mount-cache hash table entries: %lu\n", HASH_SIZE);
 
 	for (u = 0; u < HASH_SIZE; u++)
 		INIT_LIST_HEAD(&mount_hashtable[u]);
-- 
cgit v1.2.3


From 3fb0e584a68cd1c5085e69be441f2ad032aaee72 Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Tue, 22 Mar 2011 16:34:46 -0700
Subject: epoll: move ready event check into proper inline

Move the event readiness check into a proper inline, and use it uniformly
inside ep_poll() code.  Events in the ->ovflist are no less ready than the
ones in ->rdllist.

Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Shawn Bohrer <shawn.bohrer@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index ff12f7ac73e..57298d092f5 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -315,6 +315,19 @@ static void ep_nested_calls_init(struct nested_calls *ncalls)
 	spin_lock_init(&ncalls->lock);
 }
 
+/**
+ * ep_events_available - Checks if ready events might be available.
+ *
+ * @ep: Pointer to the eventpoll context.
+ *
+ * Returns: Returns a value different than zero if ready events are available,
+ *          or zero otherwise.
+ */
+static inline int ep_events_available(struct eventpoll *ep)
+{
+	return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
+}
+
 /**
  * ep_call_nested - Perform a bound (possibly) nested call, by checking
  *                  that the recursion limit is not exceeded, and that
@@ -1158,7 +1171,7 @@ retry:
 	spin_lock_irqsave(&ep->lock, flags);
 
 	res = 0;
-	if (list_empty(&ep->rdllist)) {
+	if (!ep_events_available(ep)) {
 		/*
 		 * We don't have any available event to return to the caller.
 		 * We need to sleep here, and we will be wake up by
@@ -1174,7 +1187,7 @@ retry:
 			 * to TASK_INTERRUPTIBLE before doing the checks.
 			 */
 			set_current_state(TASK_INTERRUPTIBLE);
-			if (!list_empty(&ep->rdllist) || timed_out)
+			if (ep_events_available(ep) || timed_out)
 				break;
 			if (signal_pending(current)) {
 				res = -EINTR;
@@ -1192,7 +1205,7 @@ retry:
 		set_current_state(TASK_RUNNING);
 	}
 	/* Is it worth to try to dig for events ? */
-	eavail = !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
+	eavail = ep_events_available(ep);
 
 	spin_unlock_irqrestore(&ep->lock, flags);
 
-- 
cgit v1.2.3


From f4d93ad74c18143abd3067ca3c8ffba7d00addf4 Mon Sep 17 00:00:00 2001
From: Shawn Bohrer <shawn.bohrer@gmail.com>
Date: Tue, 22 Mar 2011 16:34:47 -0700
Subject: epoll: fix compiler warning and optimize the non-blocking path

Add a comment to ep_poll(), rename labels a bit more clearly, fix a warning of
unused variable from gcc and optimize the non-blocking path a little.

Hinted-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>

hannes@cmpxchg.org:

: The non-blocking ep_poll path optimization introduced skipping over the
: return value setup.
:
: Initialize it properly, my userspace gets upset by epoll_wait() returning
: random things.
:
: In addition, remove the reinitialization at the fetch_events label, the
: return value is guaranteed to be zero when execution reaches there.

[hannes@cmpxchg.org: fix initialization]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Shawn Bohrer <shawn.bohrer@gmail.com>
Acked-by: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 33 ++++++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 57298d092f5..ed38801b57a 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1148,12 +1148,29 @@ static inline struct timespec ep_set_mstimeout(long ms)
 	return timespec_add_safe(now, ts);
 }
 
+/**
+ * ep_poll - Retrieves ready events, and delivers them to the caller supplied
+ *           event buffer.
+ *
+ * @ep: Pointer to the eventpoll context.
+ * @events: Pointer to the userspace buffer where the ready events should be
+ *          stored.
+ * @maxevents: Size (in terms of number of events) of the caller event buffer.
+ * @timeout: Maximum timeout for the ready events fetch operation, in
+ *           milliseconds. If the @timeout is zero, the function will not block,
+ *           while if the @timeout is less than zero, the function will block
+ *           until at least one event has been retrieved (or an error
+ *           occurred).
+ *
+ * Returns: Returns the number of ready events which have been fetched, or an
+ *          error code, in case of error.
+ */
 static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		   int maxevents, long timeout)
 {
-	int res, eavail, timed_out = 0;
+	int res = 0, eavail, timed_out = 0;
 	unsigned long flags;
-	long slack;
+	long slack = 0;
 	wait_queue_t wait;
 	ktime_t expires, *to = NULL;
 
@@ -1164,13 +1181,18 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		to = &expires;
 		*to = timespec_to_ktime(end_time);
 	} else if (timeout == 0) {
+		/*
+		 * Avoid the unnecessary trip to the wait queue loop, if the
+		 * caller specified a non blocking operation.
+		 */
 		timed_out = 1;
+		spin_lock_irqsave(&ep->lock, flags);
+		goto check_events;
 	}
 
-retry:
+fetch_events:
 	spin_lock_irqsave(&ep->lock, flags);
 
-	res = 0;
 	if (!ep_events_available(ep)) {
 		/*
 		 * We don't have any available event to return to the caller.
@@ -1204,6 +1226,7 @@ retry:
 
 		set_current_state(TASK_RUNNING);
 	}
+check_events:
 	/* Is it worth to try to dig for events ? */
 	eavail = ep_events_available(ep);
 
@@ -1216,7 +1239,7 @@ retry:
 	 */
 	if (!res && eavail &&
 	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
-		goto retry;
+		goto fetch_events;
 
 	return res;
 }
-- 
cgit v1.2.3


From 1a530a6f23f7dca336311ef60c9ca26f3dc63688 Mon Sep 17 00:00:00 2001
From: David Daney <ddaney@caviumnetworks.com>
Date: Tue, 22 Mar 2011 16:34:48 -0700
Subject: binfmt_elf: quiet GCC-4.6 'set but not used' warning in
 load_elf_binary()

With GCC-4.6 we get warnings about things being 'set but not used'.

In load_elf_binary() this can happen with reloc_func_desc if ELF_PLAT_INIT
is defined, but doesn't use the reloc_func_desc argument.

Quiet the warning/error by marking reloc_func_desc as __maybe_unused.

Signed-off-by: David Daney <ddaney@caviumnetworks.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_elf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index d5b640ba6cb..b2fae009a4b 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -570,7 +570,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	unsigned long elf_entry;
 	unsigned long interp_load_addr = 0;
 	unsigned long start_code, end_code, start_data, end_data;
-	unsigned long reloc_func_desc = 0;
+	unsigned long reloc_func_desc __maybe_unused = 0;
 	int executable_stack = EXSTACK_DEFAULT;
 	unsigned long def_flags = 0;
 	struct {
-- 
cgit v1.2.3


From 0bc825d240abcaf5ed6e9d59b44215b51718ef5b Mon Sep 17 00:00:00 2001
From: Rakib Mullick <rakib.mullick@gmail.com>
Date: Tue, 22 Mar 2011 16:35:00 -0700
Subject: codafs: fix compile warning when CONFIG_SYSCTL=n

When CONFIG_SYSCTL=n, we get the following warning:

fs/coda/sysctl.c:18: warning: `coda_table' defined but not used

Fix the warning by making sure coda_table and its callee function are in
the same context.  Also clean up the code by removing extra #ifdef.

[akpm@linux-foundation.org: remove unneeded stub macros]
Signed-off-by: Rakib Mullick <rakib.mullick@gmail.com>
Cc: Jan Harkes <jaharkes@cs.cmu.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/coda/sysctl.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index c6405ce3c50..06d27a41807 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -13,7 +13,6 @@
 
 #ifdef CONFIG_SYSCTL
 static struct ctl_table_header *fs_table_header;
-#endif
 
 static ctl_table coda_table[] = {
 	{
@@ -40,7 +39,6 @@ static ctl_table coda_table[] = {
 	{}
 };
 
-#ifdef CONFIG_SYSCTL
 static ctl_table fs_table[] = {
 	{
 		.procname	= "coda",
@@ -49,22 +47,18 @@ static ctl_table fs_table[] = {
 	},
 	{}
 };
-#endif
 
 void coda_sysctl_init(void)
 {
-#ifdef CONFIG_SYSCTL
 	if ( !fs_table_header )
 		fs_table_header = register_sysctl_table(fs_table);
-#endif
 }
 
 void coda_sysctl_clean(void)
 {
-#ifdef CONFIG_SYSCTL
 	if ( fs_table_header ) {
 		unregister_sysctl_table(fs_table_header);
 		fs_table_header = NULL;
 	}
-#endif
 }
+#endif
-- 
cgit v1.2.3


From 2f09719af705db56032ae480a2d9c32c2a3fcbd3 Mon Sep 17 00:00:00 2001
From: Stuart Swales <stuart.swales.croftnuisk@gmail.com>
Date: Tue, 22 Mar 2011 16:35:04 -0700
Subject: adfs: fix E+/F+ dir size > 2048 crashing kernel

Kernel crashes in fs/adfs module when accessing directories with a large
number of objects on mounted Acorn ADFS E+/F+ format discs (or images) as
the existing code writes off the end of the fixed array of struct
buffer_head pointers.

Additionally, each directory access that didn't crash would leak a buffer
as nr_buffers was not adjusted correctly for E+/F+ discs (was always left
as one less than required).

The patch fixes this by allocating a dynamically-sized set of struct
buffer_head pointers if necessary for the E+/F+ case (many directories
still do in fact fit in 2048 bytes) and sets the correct nr_buffers so
that all buffers are released.

Addresses https://bugzilla.kernel.org/show_bug.cgi?id=26072

Tested by tar'ing the contents of my RISC PC's E+ format 20Gb HDD which
contains a number of large directories that previously crashed the kernel.

Signed-off-by: Stuart Swales <stuart.swales.croftnuisk@gmail.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/adfs/adfs.h      |   4 +++
 fs/adfs/dir_fplus.c | 101 +++++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 84 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index 2ff622f6f54..58588ddb178 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -79,6 +79,10 @@ struct adfs_dir {
 
 	int			nr_buffers;
 	struct buffer_head	*bh[4];
+
+	/* big directories need allocated buffers */
+	struct buffer_head	**bh_fplus;
+
 	unsigned int		pos;
 	unsigned int		parent_id;
 
diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c
index 1796bb352d0..a7f41da8115 100644
--- a/fs/adfs/dir_fplus.c
+++ b/fs/adfs/dir_fplus.c
@@ -8,6 +8,7 @@
  * published by the Free Software Foundation.
  */
 #include <linux/buffer_head.h>
+#include <linux/slab.h>
 #include "adfs.h"
 #include "dir_fplus.h"
 
@@ -22,30 +23,53 @@ adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct
 
 	dir->nr_buffers = 0;
 
+	/* start off using fixed bh set - only alloc for big dirs */
+	dir->bh_fplus = &dir->bh[0];
+
 	block = __adfs_block_map(sb, id, 0);
 	if (!block) {
 		adfs_error(sb, "dir object %X has a hole at offset 0", id);
 		goto out;
 	}
 
-	dir->bh[0] = sb_bread(sb, block);
-	if (!dir->bh[0])
+	dir->bh_fplus[0] = sb_bread(sb, block);
+	if (!dir->bh_fplus[0])
 		goto out;
 	dir->nr_buffers += 1;
 
-	h = (struct adfs_bigdirheader *)dir->bh[0]->b_data;
+	h = (struct adfs_bigdirheader *)dir->bh_fplus[0]->b_data;
 	size = le32_to_cpu(h->bigdirsize);
 	if (size != sz) {
-		printk(KERN_WARNING "adfs: adfs_fplus_read: directory header size\n"
-				" does not match directory size\n");
+		printk(KERN_WARNING "adfs: adfs_fplus_read:"
+					" directory header size %X\n"
+					" does not match directory size %X\n",
+					size, sz);
 	}
 
 	if (h->bigdirversion[0] != 0 || h->bigdirversion[1] != 0 ||
 	    h->bigdirversion[2] != 0 || size & 2047 ||
-	    h->bigdirstartname != cpu_to_le32(BIGDIRSTARTNAME))
+	    h->bigdirstartname != cpu_to_le32(BIGDIRSTARTNAME)) {
+		printk(KERN_WARNING "adfs: dir object %X has"
+					" malformed dir header\n", id);
 		goto out;
+	}
 
 	size >>= sb->s_blocksize_bits;
+	if (size > sizeof(dir->bh)/sizeof(dir->bh[0])) {
+		/* this directory is too big for fixed bh set, must allocate */
+		struct buffer_head **bh_fplus =
+			kzalloc(size * sizeof(struct buffer_head *),
+				GFP_KERNEL);
+		if (!bh_fplus) {
+			adfs_error(sb, "not enough memory for"
+					" dir object %X (%d blocks)", id, size);
+			goto out;
+		}
+		dir->bh_fplus = bh_fplus;
+		/* copy over the pointer to the block that we've already read */
+		dir->bh_fplus[0] = dir->bh[0];
+	}
+
 	for (blk = 1; blk < size; blk++) {
 		block = __adfs_block_map(sb, id, blk);
 		if (!block) {
@@ -53,25 +77,44 @@ adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct
 			goto out;
 		}
 
-		dir->bh[blk] = sb_bread(sb, block);
-		if (!dir->bh[blk])
+		dir->bh_fplus[blk] = sb_bread(sb, block);
+		if (!dir->bh_fplus[blk]) {
+			adfs_error(sb,	"dir object %X failed read for"
+					" offset %d, mapped block %X",
+					id, blk, block);
 			goto out;
-		dir->nr_buffers = blk;
+		}
+
+		dir->nr_buffers += 1;
 	}
 
-	t = (struct adfs_bigdirtail *)(dir->bh[size - 1]->b_data + (sb->s_blocksize - 8));
+	t = (struct adfs_bigdirtail *)
+		(dir->bh_fplus[size - 1]->b_data + (sb->s_blocksize - 8));
 
 	if (t->bigdirendname != cpu_to_le32(BIGDIRENDNAME) ||
 	    t->bigdirendmasseq != h->startmasseq ||
-	    t->reserved[0] != 0 || t->reserved[1] != 0)
+	    t->reserved[0] != 0 || t->reserved[1] != 0) {
+		printk(KERN_WARNING "adfs: dir object %X has "
+					"malformed dir end\n", id);
 		goto out;
+	}
 
 	dir->parent_id = le32_to_cpu(h->bigdirparent);
 	dir->sb = sb;
 	return 0;
+
 out:
-	for (i = 0; i < dir->nr_buffers; i++)
-		brelse(dir->bh[i]);
+	if (dir->bh_fplus) {
+		for (i = 0; i < dir->nr_buffers; i++)
+			brelse(dir->bh_fplus[i]);
+
+		if (&dir->bh[0] != dir->bh_fplus)
+			kfree(dir->bh_fplus);
+
+		dir->bh_fplus = NULL;
+	}
+
+	dir->nr_buffers = 0;
 	dir->sb = NULL;
 	return ret;
 }
@@ -79,7 +122,8 @@ out:
 static int
 adfs_fplus_setpos(struct adfs_dir *dir, unsigned int fpos)
 {
-	struct adfs_bigdirheader *h = (struct adfs_bigdirheader *)dir->bh[0]->b_data;
+	struct adfs_bigdirheader *h =
+		(struct adfs_bigdirheader *) dir->bh_fplus[0]->b_data;
 	int ret = -ENOENT;
 
 	if (fpos <= le32_to_cpu(h->bigdirentries)) {
@@ -102,21 +146,27 @@ dir_memcpy(struct adfs_dir *dir, unsigned int offset, void *to, int len)
 	partial = sb->s_blocksize - offset;
 
 	if (partial >= len)
-		memcpy(to, dir->bh[buffer]->b_data + offset, len);
+		memcpy(to, dir->bh_fplus[buffer]->b_data + offset, len);
 	else {
 		char *c = (char *)to;
 
 		remainder = len - partial;
 
-		memcpy(c, dir->bh[buffer]->b_data + offset, partial);
-		memcpy(c + partial, dir->bh[buffer + 1]->b_data, remainder);
+		memcpy(c,
+			dir->bh_fplus[buffer]->b_data + offset,
+			partial);
+
+		memcpy(c + partial,
+			dir->bh_fplus[buffer + 1]->b_data,
+			remainder);
 	}
 }
 
 static int
 adfs_fplus_getnext(struct adfs_dir *dir, struct object_info *obj)
 {
-	struct adfs_bigdirheader *h = (struct adfs_bigdirheader *)dir->bh[0]->b_data;
+	struct adfs_bigdirheader *h =
+		(struct adfs_bigdirheader *) dir->bh_fplus[0]->b_data;
 	struct adfs_bigdirentry bde;
 	unsigned int offset;
 	int i, ret = -ENOENT;
@@ -160,7 +210,7 @@ adfs_fplus_sync(struct adfs_dir *dir)
 	int i;
 
 	for (i = dir->nr_buffers - 1; i >= 0; i--) {
-		struct buffer_head *bh = dir->bh[i];
+		struct buffer_head *bh = dir->bh_fplus[i];
 		sync_dirty_buffer(bh);
 		if (buffer_req(bh) && !buffer_uptodate(bh))
 			err = -EIO;
@@ -174,8 +224,17 @@ adfs_fplus_free(struct adfs_dir *dir)
 {
 	int i;
 
-	for (i = 0; i < dir->nr_buffers; i++)
-		brelse(dir->bh[i]);
+	if (dir->bh_fplus) {
+		for (i = 0; i < dir->nr_buffers; i++)
+			brelse(dir->bh_fplus[i]);
+
+		if (&dir->bh[0] != dir->bh_fplus)
+			kfree(dir->bh_fplus);
+
+		dir->bh_fplus = NULL;
+	}
+
+	dir->nr_buffers = 0;
 	dir->sb = NULL;
 }
 
-- 
cgit v1.2.3


From 7a9730af9c596749425a98eba136152e5be4602a Mon Sep 17 00:00:00 2001
From: Stuart Swales <stuart.swales.croftnuisk@gmail.com>
Date: Tue, 22 Mar 2011 16:35:05 -0700
Subject: adfs: improve timestamp precision

ADFS (FileCore) storage complies with the RISC OS timestamp specification
(40-bit centiseconds since 01 Jan 1900 00:00:00).  It is desirable that
stored timestamp precision be maintained to facilitate a precise copy of
data and metadata from a hard disc (or image thereof) into a RISC OS
emulator (such as RPCEmu).

This patch implements a full-precision conversion from ADFS to Unix
timestamp as the existing driver, for ease of calculation with old 32-bit
compilers, uses the common trick of shifting the 40-bits representing
centiseconds around into 32-bits representing seconds thereby losing
precision.

Signed-off-by: Stuart Swales <stuart.swales.croftnuisk@gmail.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/adfs/inode.c | 40 +++++++++++++++++++---------------------
 1 file changed, 19 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 09fe40198d1..16d7ef2dffe 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -174,50 +174,48 @@ adfs_mode2atts(struct super_block *sb, struct inode *inode)
 
 /*
  * Convert an ADFS time to Unix time.  ADFS has a 40-bit centi-second time
- * referenced to 1 Jan 1900 (til 2248)
+ * referenced to 1 Jan 1900 (til 2248) so we need to discard 2208988800 seconds
+ * of time to convert from RISC OS epoch to Unix epoch.
  */
 static void
 adfs_adfs2unix_time(struct timespec *tv, struct inode *inode)
 {
 	unsigned int high, low;
+	/* 01 Jan 1970 00:00:00 (Unix epoch) as nanoseconds since
+	 * 01 Jan 1900 00:00:00 (RISC OS epoch)
+	 */
+	static const s64 nsec_unix_epoch_diff_risc_os_epoch =
+							2208988800000000000LL;
+	s64 nsec;
 
 	if (ADFS_I(inode)->stamped == 0)
 		goto cur_time;
 
-	high = ADFS_I(inode)->loadaddr << 24;
-	low  = ADFS_I(inode)->execaddr;
+	high = ADFS_I(inode)->loadaddr & 0xFF; /* top 8 bits of timestamp */
+	low  = ADFS_I(inode)->execaddr;    /* bottom 32 bits of timestamp */
 
-	high |= low >> 8;
-	low  &= 255;
+	/* convert 40-bit centi-seconds to 32-bit seconds
+	 * going via nanoseconds to retain precision
+	 */
+	nsec = (((s64) high << 32) | (s64) low) * 10000000; /* cs to ns */
 
 	/* Files dated pre  01 Jan 1970 00:00:00. */
-	if (high < 0x336e996a)
+	if (nsec < nsec_unix_epoch_diff_risc_os_epoch)
 		goto too_early;
 
-	/* Files dated post 18 Jan 2038 03:14:05. */
-	if (high >= 0x656e9969)
-		goto too_late;
-
-	/* discard 2208988800 (0x336e996a00) seconds of time */
-	high -= 0x336e996a;
+	/* convert from RISC OS to Unix epoch */
+	nsec -= nsec_unix_epoch_diff_risc_os_epoch;
 
-	/* convert 40-bit centi-seconds to 32-bit seconds */
-	tv->tv_sec = (((high % 100) << 8) + low) / 100 + (high / 100 << 8);
-	tv->tv_nsec = 0;
+	*tv = ns_to_timespec(nsec);
 	return;
 
  cur_time:
-	*tv = CURRENT_TIME_SEC;
+	*tv = CURRENT_TIME;
 	return;
 
  too_early:
 	tv->tv_sec = tv->tv_nsec = 0;
 	return;
-
- too_late:
-	tv->tv_sec = 0x7ffffffd;
-	tv->tv_nsec = 0;
-	return;
 }
 
 /*
-- 
cgit v1.2.3


From da23ef0549d4205ca9b576cf6cce9a80d0c3e43a Mon Sep 17 00:00:00 2001
From: Stuart Swales <stuart.swales.croftnuisk@gmail.com>
Date: Tue, 22 Mar 2011 16:35:06 -0700
Subject: adfs: add hexadecimal filetype suffix option

ADFS (FileCore) storage complies with the RISC OS filetype specification
(12 bits of file type information is stored in the file load address,
rather than using a file extension).  The existing driver largely ignores
this information and does not present it to the end user.

It is desirable that stored filetypes be made visible to the end user to
facilitate a precise copy of data and metadata from a hard disc (or image
thereof) into a RISC OS emulator (such as RPCEmu) or to a network share
which can be accessed by real Acorn systems.

This patch implements a per-mount filetype suffix option (use -o
ftsuffix=1) to present any filetype as a ,xyz hexadecimal suffix on each
file.  This type suffix is compatible with that used by RISC OS systems
that access network servers using NFS client software and by RPCemu's host
filing system.

Signed-off-by: Stuart Swales <stuart.swales.croftnuisk@gmail.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/adfs/adfs.h      | 21 +++++++++++++++++++--
 fs/adfs/dir_f.c     | 23 ++++++++++++++++++++---
 fs/adfs/dir_fplus.c | 18 ++++++++++++++++++
 fs/adfs/inode.c     | 22 ++++------------------
 fs/adfs/super.c     | 23 ++++++++++++++++++++---
 5 files changed, 81 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index 58588ddb178..a8a58d864f9 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -50,6 +50,7 @@ struct adfs_sb_info {
 	gid_t		s_gid;		/* owner gid				 */
 	umode_t		s_owner_mask;	/* ADFS owner perm -> unix perm		 */
 	umode_t		s_other_mask;	/* ADFS other perm -> unix perm		 */
+	int		s_ftsuffix;	/* ,xyz hex filetype suffix option */
 
 	__u32		s_ids_per_zone;	/* max. no ids in one zone		 */
 	__u32		s_idlen;	/* length of ID in map			 */
@@ -93,7 +94,7 @@ struct adfs_dir {
 /*
  * This is the overall maximum name length
  */
-#define ADFS_MAX_NAME_LEN	256
+#define ADFS_MAX_NAME_LEN	(256 + 4) /* +4 for ,xyz hex filetype suffix */
 struct object_info {
 	__u32		parent_id;		/* parent object id	*/
 	__u32		file_id;		/* object id		*/
@@ -101,10 +102,26 @@ struct object_info {
 	__u32		execaddr;		/* execution address	*/
 	__u32		size;			/* size			*/
 	__u8		attr;			/* RISC OS attributes	*/
-	unsigned char	name_len;		/* name length		*/
+	unsigned int	name_len;		/* name length		*/
 	char		name[ADFS_MAX_NAME_LEN];/* file name		*/
+
+	/* RISC OS file type (12-bit: derived from loadaddr) */
+	__u16		filetype;
 };
 
+/* RISC OS 12-bit filetype converts to ,xyz hex filename suffix */
+static inline int append_filetype_suffix(char *buf, __u16 filetype)
+{
+	if (filetype == -1)
+		return 0;
+
+	*buf++ = ',';
+	*buf++ = hex_asc_lo(filetype >> 8);
+	*buf++ = hex_asc_lo(filetype >> 4);
+	*buf++ = hex_asc_lo(filetype >> 0);
+	return 4;
+}
+
 struct adfs_dir_ops {
 	int	(*read)(struct super_block *sb, unsigned int id, unsigned int sz, struct adfs_dir *dir);
 	int	(*setpos)(struct adfs_dir *dir, unsigned int fpos);
diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c
index bafc71222e2..4bbe853ee50 100644
--- a/fs/adfs/dir_f.c
+++ b/fs/adfs/dir_f.c
@@ -52,7 +52,6 @@ static inline int adfs_readname(char *buf, char *ptr, int maxlen)
 			*buf++ = *ptr;
 		ptr++;
 	}
-	*buf = '\0';
 
 	return buf - old_buf;
 }
@@ -208,7 +207,8 @@ release_buffers:
  * convert a disk-based directory entry to a Linux ADFS directory entry
  */
 static inline void
-adfs_dir2obj(struct object_info *obj, struct adfs_direntry *de)
+adfs_dir2obj(struct adfs_dir *dir, struct object_info *obj,
+	struct adfs_direntry *de)
 {
 	obj->name_len =	adfs_readname(obj->name, de->dirobname, ADFS_F_NAME_LEN);
 	obj->file_id  = adfs_readval(de->dirinddiscadd, 3);
@@ -216,6 +216,23 @@ adfs_dir2obj(struct object_info *obj, struct adfs_direntry *de)
 	obj->execaddr = adfs_readval(de->direxec, 4);
 	obj->size     = adfs_readval(de->dirlen,  4);
 	obj->attr     = de->newdiratts;
+	obj->filetype = -1;
+
+	/*
+	 * object is a file and is filetyped and timestamped?
+	 * RISC OS 12-bit filetype is stored in load_address[19:8]
+	 */
+	if ((0 == (obj->attr & ADFS_NDA_DIRECTORY)) &&
+		(0xfff00000 == (0xfff00000 & obj->loadaddr))) {
+		obj->filetype = (__u16) ((0x000fff00 & obj->loadaddr) >> 8);
+
+		/* optionally append the ,xyz hex filetype suffix */
+		if (ADFS_SB(dir->sb)->s_ftsuffix)
+			obj->name_len +=
+				append_filetype_suffix(
+					&obj->name[obj->name_len],
+					obj->filetype);
+	}
 }
 
 /*
@@ -260,7 +277,7 @@ __adfs_dir_get(struct adfs_dir *dir, int pos, struct object_info *obj)
 	if (!de.dirobname[0])
 		return -ENOENT;
 
-	adfs_dir2obj(obj, &de);
+	adfs_dir2obj(dir, obj, &de);
 
 	return 0;
 }
diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c
index a7f41da8115..d9e3bee4e65 100644
--- a/fs/adfs/dir_fplus.c
+++ b/fs/adfs/dir_fplus.c
@@ -197,6 +197,24 @@ adfs_fplus_getnext(struct adfs_dir *dir, struct object_info *obj)
 		if (obj->name[i] == '/')
 			obj->name[i] = '.';
 
+	obj->filetype = -1;
+
+	/*
+	 * object is a file and is filetyped and timestamped?
+	 * RISC OS 12-bit filetype is stored in load_address[19:8]
+	 */
+	if ((0 == (obj->attr & ADFS_NDA_DIRECTORY)) &&
+		(0xfff00000 == (0xfff00000 & obj->loadaddr))) {
+		obj->filetype = (__u16) ((0x000fff00 & obj->loadaddr) >> 8);
+
+		/* optionally append the ,xyz hex filetype suffix */
+		if (ADFS_SB(dir->sb)->s_ftsuffix)
+			obj->name_len +=
+				append_filetype_suffix(
+					&obj->name[obj->name_len],
+					obj->filetype);
+	}
+
 	dir->pos += 1;
 	ret = 0;
 out:
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 16d7ef2dffe..92444e94f84 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -78,26 +78,13 @@ static const struct address_space_operations adfs_aops = {
 	.bmap		= _adfs_bmap
 };
 
-static inline unsigned int
-adfs_filetype(struct inode *inode)
-{
-	unsigned int type;
-
-	if (ADFS_I(inode)->stamped)
-		type = (ADFS_I(inode)->loadaddr >> 8) & 0xfff;
-	else
-		type = (unsigned int) -1;
-
-	return type;
-}
-
 /*
  * Convert ADFS attributes and filetype to Linux permission.
  */
 static umode_t
 adfs_atts2mode(struct super_block *sb, struct inode *inode)
 {
-	unsigned int filetype, attr = ADFS_I(inode)->attr;
+	unsigned int attr = ADFS_I(inode)->attr;
 	umode_t mode, rmask;
 	struct adfs_sb_info *asb = ADFS_SB(sb);
 
@@ -106,9 +93,7 @@ adfs_atts2mode(struct super_block *sb, struct inode *inode)
 		return S_IFDIR | S_IXUGO | mode;
 	}
 
-	filetype = adfs_filetype(inode);
-
-	switch (filetype) {
+	switch (ADFS_I(inode)->filetype) {
 	case 0xfc0:	/* LinkFS */
 		return S_IFLNK|S_IRWXUGO;
 
@@ -277,7 +262,8 @@ adfs_iget(struct super_block *sb, struct object_info *obj)
 	ADFS_I(inode)->loadaddr  = obj->loadaddr;
 	ADFS_I(inode)->execaddr  = obj->execaddr;
 	ADFS_I(inode)->attr      = obj->attr;
-	ADFS_I(inode)->stamped	  = ((obj->loadaddr & 0xfff00000) == 0xfff00000);
+	ADFS_I(inode)->filetype  = obj->filetype;
+	ADFS_I(inode)->stamped   = ((obj->loadaddr & 0xfff00000) == 0xfff00000);
 
 	inode->i_mode	 = adfs_atts2mode(sb, inode);
 	adfs_adfs2unix_time(&inode->i_mtime, inode);
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 06d7388b477..c8bf36a1996 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -138,17 +138,20 @@ static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
 		seq_printf(seq, ",ownmask=%o", asb->s_owner_mask);
 	if (asb->s_other_mask != ADFS_DEFAULT_OTHER_MASK)
 		seq_printf(seq, ",othmask=%o", asb->s_other_mask);
+	if (asb->s_ftsuffix != 0)
+		seq_printf(seq, ",ftsuffix=%u", asb->s_ftsuffix);
 
 	return 0;
 }
 
-enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_err};
+enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_ftsuffix, Opt_err};
 
 static const match_table_t tokens = {
 	{Opt_uid, "uid=%u"},
 	{Opt_gid, "gid=%u"},
 	{Opt_ownmask, "ownmask=%o"},
 	{Opt_othmask, "othmask=%o"},
+	{Opt_ftsuffix, "ftsuffix=%u"},
 	{Opt_err, NULL}
 };
 
@@ -189,6 +192,11 @@ static int parse_options(struct super_block *sb, char *options)
 				return -EINVAL;
 			asb->s_other_mask = option;
 			break;
+		case Opt_ftsuffix:
+			if (match_int(args, &option))
+				return -EINVAL;
+			asb->s_ftsuffix = option;
+			break;
 		default:
 			printk("ADFS-fs: unrecognised mount option \"%s\" "
 					"or missing value\n", p);
@@ -366,6 +374,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
 	asb->s_gid = 0;
 	asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK;
 	asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK;
+	asb->s_ftsuffix = 0;
 
 	if (parse_options(sb, data))
 		goto error;
@@ -445,11 +454,13 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
 
 	root_obj.parent_id = root_obj.file_id = le32_to_cpu(dr->root);
 	root_obj.name_len  = 0;
-	root_obj.loadaddr  = 0;
-	root_obj.execaddr  = 0;
+	/* Set root object date as 01 Jan 1987 00:00:00 */
+	root_obj.loadaddr  = 0xfff0003f;
+	root_obj.execaddr  = 0xec22c000;
 	root_obj.size	   = ADFS_NEWDIR_SIZE;
 	root_obj.attr	   = ADFS_NDA_DIRECTORY   | ADFS_NDA_OWNER_READ |
 			     ADFS_NDA_OWNER_WRITE | ADFS_NDA_PUBLIC_READ;
+	root_obj.filetype  = -1;
 
 	/*
 	 * If this is a F+ disk with variable length directories,
@@ -463,6 +474,12 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
 		asb->s_dir     = &adfs_f_dir_ops;
 		asb->s_namelen = ADFS_F_NAME_LEN;
 	}
+	/*
+	 * ,xyz hex filetype suffix may be added by driver
+	 * to files that have valid RISC OS filetype
+	 */
+	if (asb->s_ftsuffix)
+		asb->s_namelen += 4;
 
 	sb->s_d_op = &adfs_dentry_operations;
 	root = adfs_iget(sb, &root_obj);
-- 
cgit v1.2.3


From e91f90bb0bb10be9cc8efd09a3cf4ecffcad0db1 Mon Sep 17 00:00:00 2001
From: Roland Dreier <roland@purestorage.com>
Date: Tue, 22 Mar 2011 16:35:10 -0700
Subject: aio: wake all waiters when destroying ctx

The test program below will hang because io_getevents() uses
add_wait_queue_exclusive(), which means the wake_up() in io_destroy() only
wakes up one of the threads.  Fix this by using wake_up_all() in the aio
code paths where we want to make sure no one gets stuck.

	// t.c -- compile with gcc -lpthread -laio t.c

	#include <libaio.h>
	#include <pthread.h>
	#include <stdio.h>
	#include <unistd.h>

	static const int nthr = 2;

	void *getev(void *ctx)
	{
		struct io_event ev;
		io_getevents(ctx, 1, 1, &ev, NULL);
		printf("io_getevents returned\n");
		return NULL;
	}

	int main(int argc, char *argv[])
	{
		io_context_t ctx = 0;
		pthread_t thread[nthr];
		int i;

		io_setup(1024, &ctx);

		for (i = 0; i < nthr; ++i)
			pthread_create(&thread[i], NULL, getev, ctx);

		sleep(1);

		io_destroy(ctx);

		for (i = 0; i < nthr; ++i)
			pthread_join(thread[i], NULL);

		return 0;
	}

Signed-off-by: Roland Dreier <roland@purestorage.com>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/aio.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index 7f54f43b8f7..ebb6a22e4e1 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -520,7 +520,7 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
 	ctx->reqs_active--;
 
 	if (unlikely(!ctx->reqs_active && ctx->dead))
-		wake_up(&ctx->wait);
+		wake_up_all(&ctx->wait);
 }
 
 static void aio_fput_routine(struct work_struct *data)
@@ -1229,7 +1229,7 @@ static void io_destroy(struct kioctx *ioctx)
 	 * by other CPUs at this point.  Right now, we rely on the
 	 * locking done by the above calls to ensure this consistency.
 	 */
-	wake_up(&ioctx->wait);
+	wake_up_all(&ioctx->wait);
 	put_ioctx(ioctx);	/* once for the lookup */
 }
 
-- 
cgit v1.2.3


From b12d12596992f608f5506a8dabe4d1299594bd1e Mon Sep 17 00:00:00 2001
From: Andrey Vagin <avagin@openvz.org>
Date: Tue, 22 Mar 2011 16:35:11 -0700
Subject: fs/devpts/inode.c: correctly check d_alloc_name() return code in
 devpts_pty_new()

d_alloc_name() returns NULL on error, but devpts_pty_new() checks its
result with IS_ERR() as if it were an errno-encoded pointer.

Addresses http://bugzilla.openvz.org/show_bug.cgi?id=1758

Signed-off-by: Andrey Vagin <avagin@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/devpts/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index c6bd815dc79..2f27e578d46 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -502,7 +502,7 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
 	mutex_lock(&root->d_inode->i_mutex);
 
 	dentry = d_alloc_name(root, s);
-	if (!IS_ERR(dentry)) {
+	if (dentry) {
 		d_add(dentry, inode);
 		fsnotify_create(root->d_inode, dentry);
 	} else {
-- 
cgit v1.2.3


From 565d76cb7d5fd7cb010fd690602280a69ab116ef Mon Sep 17 00:00:00 2001
From: Jim Keniston <jkenisto@linux.vnet.ibm.com>
Date: Tue, 22 Mar 2011 16:35:12 -0700
Subject: zlib: slim down zlib_deflate() workspace when possible

Instead of always creating a huge (268K) deflate_workspace with the
maximum compression parameters (windowBits=15, memLevel=8), allow the
caller to obtain a smaller workspace by specifying smaller parameter
values.

For example, when capturing oops and panic reports to a medium with
limited capacity, such as NVRAM, compression may be the only way to
capture the whole report.  In this case, a small workspace (24K works
fine) is a win, whether you allocate the workspace when you need it (i.e.,
during an oops or panic) or at boot time.

I've verified that this patch works with all accepted values of windowBits
(positive and negative), memLevel, and compression level.

Signed-off-by: Jim Keniston <jkenisto@us.ibm.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: David Miller <davem@davemloft.net>
Cc: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/btrfs/zlib.c       | 3 ++-
 fs/jffs2/compr_zlib.c | 7 ++++---
 fs/logfs/compr.c      | 2 +-
 3 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index f5ec2d44150..faccd47c6c4 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -57,7 +57,8 @@ static struct list_head *zlib_alloc_workspace(void)
 	if (!workspace)
 		return ERR_PTR(-ENOMEM);
 
-	workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
+	workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize(
+						MAX_WBITS, MAX_MEM_LEVEL));
 	workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
 	workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
 	if (!workspace->def_strm.workspace ||
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index fd05a0b9431..5a001020c54 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -40,12 +40,13 @@ static z_stream inf_strm, def_strm;
 
 static int __init alloc_workspaces(void)
 {
-	def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
+	def_strm.workspace = vmalloc(zlib_deflate_workspacesize(MAX_WBITS,
+							MAX_MEM_LEVEL));
 	if (!def_strm.workspace) {
-		printk(KERN_WARNING "Failed to allocate %d bytes for deflate workspace\n", zlib_deflate_workspacesize());
+		printk(KERN_WARNING "Failed to allocate %d bytes for deflate workspace\n", zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL));
 		return -ENOMEM;
 	}
-	D1(printk(KERN_DEBUG "Allocated %d bytes for deflate workspace\n", zlib_deflate_workspacesize()));
+	D1(printk(KERN_DEBUG "Allocated %d bytes for deflate workspace\n", zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL)));
 	inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
 	if (!inf_strm.workspace) {
 		printk(KERN_WARNING "Failed to allocate %d bytes for inflate workspace\n", zlib_inflate_workspacesize());
diff --git a/fs/logfs/compr.c b/fs/logfs/compr.c
index 44bbfd249ab..961f02b86d9 100644
--- a/fs/logfs/compr.c
+++ b/fs/logfs/compr.c
@@ -81,7 +81,7 @@ error:
 
 int __init logfs_compr_init(void)
 {
-	size_t size = max(zlib_deflate_workspacesize(),
+	size_t size = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
 			zlib_inflate_workspacesize());
 	stream.workspace = vmalloc(size);
 	if (!stream.workspace)
-- 
cgit v1.2.3


From bd23a539d0733c9f9ec3f9fc628491fad2658e82 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 23 Mar 2011 09:56:30 -0400
Subject: fix leaks in path_lookupat()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 5a9a6c3094d..a4dfac650c3 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1644,13 +1644,16 @@ static int path_lookupat(int dfd, const char *name,
 			err = -ECHILD;
 	}
 
-	if (!err)
+	if (!err) {
 		err = handle_reval_path(nd);
+		if (err)
+			path_put(&nd->path);
+	}
 
 	if (!err && nd->flags & LOOKUP_DIRECTORY) {
 		if (!nd->inode->i_op->lookup) {
 			path_put(&nd->path);
-			return -ENOTDIR;
+			err = -ENOTDIR;
 		}
 	}
 
-- 
cgit v1.2.3


From 26ec3c646e75ce7a69fda429d68fcbdcd5eacc62 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 15 Feb 2011 21:24:05 -0500
Subject: make sessionid permissions in /proc/*/task/* match those in /proc/*

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/base.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index d49c4b5d2c3..b77236de6d8 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3161,7 +3161,7 @@ static const struct pid_entry tid_base_stuff[] = {
 	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
 #ifdef CONFIG_AUDITSYSCALL
 	REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
-	REG("sessionid",  S_IRUSR, proc_sessionid_operations),
+	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
 #endif
 #ifdef CONFIG_FAULT_INJECTION
 	REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
-- 
cgit v1.2.3


From ca6b0bf0e086513b9ee5efc0aa5770ecb57778af Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 15 Feb 2011 22:04:37 -0500
Subject: pagemap: close races with suid execve

just use mm_for_maps()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/base.c     | 4 ++--
 fs/proc/task_mmu.c | 7 ++-----
 2 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index b77236de6d8..df73573e12c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2797,7 +2797,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_PROC_PAGE_MONITOR
 	REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
 	REG("smaps",      S_IRUGO, proc_smaps_operations),
-	REG("pagemap",    S_IRUSR, proc_pagemap_operations),
+	REG("pagemap",    S_IRUGO, proc_pagemap_operations),
 #endif
 #ifdef CONFIG_SECURITY
 	DIR("attr",       S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
@@ -3133,7 +3133,7 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_PROC_PAGE_MONITOR
 	REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
 	REG("smaps",     S_IRUGO, proc_smaps_operations),
-	REG("pagemap",    S_IRUSR, proc_pagemap_operations),
+	REG("pagemap",    S_IRUGO, proc_pagemap_operations),
 #endif
 #ifdef CONFIG_SECURITY
 	DIR("attr",      S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 60b914860f8..c966413c139 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -729,7 +729,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 		goto out;
 
 	ret = -EACCES;
-	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+	mm = mm_for_maps(task);
+	if (!mm)
 		goto out_task;
 
 	ret = -EINVAL;
@@ -742,10 +743,6 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 	if (!count)
 		goto out_task;
 
-	mm = get_task_mm(task);
-	if (!mm)
-		goto out_task;
-
 	pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
 	pm.buffer = kmalloc(pm.len, GFP_TEMPORARY);
 	ret = -ENOMEM;
-- 
cgit v1.2.3


From ec6fd8a4355cda81cd9f06bebc048e83eb514ac7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 15 Feb 2011 22:22:54 -0500
Subject: report errors in /proc/*/*map* sanely

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/base.c       |  8 +++++---
 fs/proc/task_mmu.c   | 10 +++++-----
 fs/proc/task_nommu.c |  6 +++---
 3 files changed, 13 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index df73573e12c..c2828110208 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -225,15 +225,17 @@ static int check_mem_permission(struct task_struct *task)
 struct mm_struct *mm_for_maps(struct task_struct *task)
 {
 	struct mm_struct *mm;
+	int err;
 
-	if (mutex_lock_killable(&task->signal->cred_guard_mutex))
-		return NULL;
+	err =  mutex_lock_killable(&task->signal->cred_guard_mutex);
+	if (err)
+		return ERR_PTR(err);
 
 	mm = get_task_mm(task);
 	if (mm && mm != current->mm &&
 			!ptrace_may_access(task, PTRACE_MODE_READ)) {
 		mmput(mm);
-		mm = NULL;
+		mm = ERR_PTR(-EACCES);
 	}
 	mutex_unlock(&task->signal->cred_guard_mutex);
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c966413c139..8fed0f88fbf 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -119,11 +119,11 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 
 	priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
 	if (!priv->task)
-		return NULL;
+		return ERR_PTR(-ESRCH);
 
 	mm = mm_for_maps(priv->task);
-	if (!mm)
-		return NULL;
+	if (!mm || IS_ERR(mm))
+		return mm;
 	down_read(&mm->mmap_sem);
 
 	tail_vma = get_gate_vma(priv->task);
@@ -728,9 +728,9 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 	if (!task)
 		goto out;
 
-	ret = -EACCES;
 	mm = mm_for_maps(task);
-	if (!mm)
+	ret = PTR_ERR(mm);
+	if (!mm || IS_ERR(mm))
 		goto out_task;
 
 	ret = -EINVAL;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index b535d3e5d5f..980de547c07 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -199,13 +199,13 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 	/* pin the task and mm whilst we play with them */
 	priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
 	if (!priv->task)
-		return NULL;
+		return ERR_PTR(-ESRCH);
 
 	mm = mm_for_maps(priv->task);
-	if (!mm) {
+	if (!mm || IS_ERR(mm)) {
 		put_task_struct(priv->task);
 		priv->task = NULL;
-		return NULL;
+		return mm;
 	}
 	down_read(&mm->mmap_sem);
 
-- 
cgit v1.2.3


From d6f64b89d7ff22ce05896ab4a93a653e8d0b123d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 15 Feb 2011 22:26:01 -0500
Subject: close race in /proc/*/environ

Switch to mm_for_maps().  Maybe we ought to make it r--r--r--,
since we do checks on IO anyway...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/base.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index c2828110208..fc471b8766d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -919,20 +919,18 @@ static ssize_t environ_read(struct file *file, char __user *buf,
 	if (!task)
 		goto out_no_task;
 
-	if (!ptrace_may_access(task, PTRACE_MODE_READ))
-		goto out;
-
 	ret = -ENOMEM;
 	page = (char *)__get_free_page(GFP_TEMPORARY);
 	if (!page)
 		goto out;
 
-	ret = 0;
 
-	mm = get_task_mm(task);
-	if (!mm)
+	mm = mm_for_maps(task);
+	ret = PTR_ERR(mm);
+	if (!mm || IS_ERR(mm))
 		goto out_free;
 
+	ret = 0;
 	while (count > 0) {
 		int this_len, retval, max_len;
 
-- 
cgit v1.2.3


From 2fadaef41283aad7100fa73f01998cddaca25833 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 15 Feb 2011 22:52:11 -0500
Subject: auxv: require the target to be traceable (or yourself)

same as for environ, except that we didn't do any checks to
prevent access after suid execve

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/base.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index fc471b8766d..e94b58b496f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -281,9 +281,9 @@ out:
 
 static int proc_pid_auxv(struct task_struct *task, char *buffer)
 {
-	int res = 0;
-	struct mm_struct *mm = get_task_mm(task);
-	if (mm) {
+	struct mm_struct *mm = mm_for_maps(task);
+	int res = PTR_ERR(mm);
+	if (mm && !IS_ERR(mm)) {
 		unsigned int nwords = 0;
 		do {
 			nwords += 2;
-- 
cgit v1.2.3


From 31db58b3ab432f72ea76be58b12e6ffaf627d5db Mon Sep 17 00:00:00 2001
From: Stephen Wilson <wilsons@start.ca>
Date: Sun, 13 Mar 2011 15:49:15 -0400
Subject: mm: arch: make get_gate_vma take an mm_struct instead of a
 task_struct

Morally, the presence of a gate vma is more an attribute of a particular mm than
a particular task.  Moreover, dropping the dependency on task_struct will help
make both existing and future operations on mm's more flexible and convenient.

Signed-off-by: Stephen Wilson <wilsons@start.ca>
Reviewed-by: Michel Lespinasse <walken@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/binfmt_elf.c    | 2 +-
 fs/proc/task_mmu.c | 8 +++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index d5b640ba6cb..bbabdcce117 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1906,7 +1906,7 @@ static int elf_core_dump(struct coredump_params *cprm)
 	segs = current->mm->map_count;
 	segs += elf_core_extra_phdrs();
 
-	gate_vma = get_gate_vma(current);
+	gate_vma = get_gate_vma(current->mm);
 	if (gate_vma != NULL)
 		segs++;
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 8fed0f88fbf..e73314afc53 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -126,7 +126,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 		return mm;
 	down_read(&mm->mmap_sem);
 
-	tail_vma = get_gate_vma(priv->task);
+	tail_vma = get_gate_vma(priv->task->mm);
 	priv->tail_vma = tail_vma;
 
 	/* Start with last addr hint */
@@ -277,7 +277,8 @@ static int show_map(struct seq_file *m, void *v)
 	show_map_vma(m, vma);
 
 	if (m->count < m->size)  /* vma is copied successfully */
-		m->version = (vma != get_gate_vma(task))? vma->vm_start: 0;
+		m->version = (vma != get_gate_vma(task->mm))
+			? vma->vm_start : 0;
 	return 0;
 }
 
@@ -436,7 +437,8 @@ static int show_smap(struct seq_file *m, void *v)
 			(unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
 
 	if (m->count < m->size)  /* vma is copied successfully */
-		m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
+		m->version = (vma != get_gate_vma(task->mm))
+			? vma->vm_start : 0;
 	return 0;
 }
 
-- 
cgit v1.2.3


From 26947f8c8f9598209001cdcd31bb2162a2e54691 Mon Sep 17 00:00:00 2001
From: Stephen Wilson <wilsons@start.ca>
Date: Sun, 13 Mar 2011 15:49:21 -0400
Subject: proc: disable mem_write after exec

This change makes mem_write() observe the same constraints as mem_read().  This
is particularly important for mem_write as an accidental leak of the fd across
an exec could result in arbitrary modification of the target process' memory.
IOW, /proc/pid/mem is implicitly close-on-exec.

Signed-off-by: Stephen Wilson <wilsons@start.ca>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/base.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index e94b58b496f..9af49a3984f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -850,6 +850,10 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
 	if (check_mem_permission(task))
 		goto out;
 
+	copied = -EIO;
+	if (file->private_data != (void *)((long)current->self_exec_id))
+		goto out;
+
 	copied = -ENOMEM;
 	page = (char *)__get_free_page(GFP_TEMPORARY);
 	if (!page)
-- 
cgit v1.2.3


From 18f661bcf898742212182d75f22f05b048cc04bb Mon Sep 17 00:00:00 2001
From: Stephen Wilson <wilsons@start.ca>
Date: Sun, 13 Mar 2011 15:49:22 -0400
Subject: proc: hold cred_guard_mutex in check_mem_permission()

Avoid a potential race where the task execs and we get its new ->mm but
check against the old credentials in ptrace_may_access().

The mutex is held by factoring the body of the function out into a helper,
__check_mem_permission(), which check_mem_permission() now calls with
cred_guard_mutex taken.  Doing this refactoring now simplifies upcoming
changes and minimizes churn in the diffs.

Signed-off-by: Stephen Wilson <wilsons@start.ca>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/base.c | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9af49a3984f..013f116b322 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -191,10 +191,7 @@ static int proc_root_link(struct inode *inode, struct path *path)
 	return result;
 }
 
-/*
- * Return zero if current may access user memory in @task, -error if not.
- */
-static int check_mem_permission(struct task_struct *task)
+static int __check_mem_permission(struct task_struct *task)
 {
 	/*
 	 * A task can always look at itself, in case it chooses
@@ -222,6 +219,27 @@ static int check_mem_permission(struct task_struct *task)
 	return -EPERM;
 }
 
+/*
+ * Return zero if current may access user memory in @task, -error if not.
+ */
+static int check_mem_permission(struct task_struct *task)
+{
+	int err;
+
+	/*
+	 * Avoid racing if task exec's as we might get a new mm but validate
+	 * against old credentials.
+	 */
+	err = mutex_lock_killable(&task->signal->cred_guard_mutex);
+	if (err)
+		return err;
+
+	err = __check_mem_permission(task);
+	mutex_unlock(&task->signal->cred_guard_mutex);
+
+	return err;
+}
+
 struct mm_struct *mm_for_maps(struct task_struct *task)
 {
 	struct mm_struct *mm;
-- 
cgit v1.2.3


From 8b0db9db19858b08c46a84540acfd35f6e6487b8 Mon Sep 17 00:00:00 2001
From: Stephen Wilson <wilsons@start.ca>
Date: Sun, 13 Mar 2011 15:49:23 -0400
Subject: proc: make check_mem_permission() return an mm_struct on success

This change allows us to take advantage of access_remote_vm(), which in turn
eliminates a security issue with the mem_write() implementation.

The previous implementation of mem_write() was insecure since the target task
could exec a setuid-root binary between the permission check and the actual
write.  Holding a reference to the target mm_struct eliminates this
vulnerability.

Signed-off-by: Stephen Wilson <wilsons@start.ca>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/base.c | 58 ++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 34 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 013f116b322..e34c3c33b2d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -191,14 +191,20 @@ static int proc_root_link(struct inode *inode, struct path *path)
 	return result;
 }
 
-static int __check_mem_permission(struct task_struct *task)
+static struct mm_struct *__check_mem_permission(struct task_struct *task)
 {
+	struct mm_struct *mm;
+
+	mm = get_task_mm(task);
+	if (!mm)
+		return ERR_PTR(-EINVAL);
+
 	/*
 	 * A task can always look at itself, in case it chooses
 	 * to use system calls instead of load instructions.
 	 */
 	if (task == current)
-		return 0;
+		return mm;
 
 	/*
 	 * If current is actively ptrace'ing, and would also be
@@ -210,20 +216,23 @@ static int __check_mem_permission(struct task_struct *task)
 		match = (tracehook_tracer_task(task) == current);
 		rcu_read_unlock();
 		if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH))
-			return 0;
+			return mm;
 	}
 
 	/*
 	 * Noone else is allowed.
 	 */
-	return -EPERM;
+	mmput(mm);
+	return ERR_PTR(-EPERM);
 }
 
 /*
- * Return zero if current may access user memory in @task, -error if not.
+ * If current may access user memory in @task return a reference to the
+ * corresponding mm, otherwise ERR_PTR.
  */
-static int check_mem_permission(struct task_struct *task)
+static struct mm_struct *check_mem_permission(struct task_struct *task)
 {
+	struct mm_struct *mm;
 	int err;
 
 	/*
@@ -232,12 +241,12 @@ static int check_mem_permission(struct task_struct *task)
 	 */
 	err = mutex_lock_killable(&task->signal->cred_guard_mutex);
 	if (err)
-		return err;
+		return ERR_PTR(err);
 
-	err = __check_mem_permission(task);
+	mm = __check_mem_permission(task);
 	mutex_unlock(&task->signal->cred_guard_mutex);
 
-	return err;
+	return mm;
 }
 
 struct mm_struct *mm_for_maps(struct task_struct *task)
@@ -795,18 +804,14 @@ static ssize_t mem_read(struct file * file, char __user * buf,
 	if (!task)
 		goto out_no_task;
 
-	if (check_mem_permission(task))
-		goto out;
-
 	ret = -ENOMEM;
 	page = (char *)__get_free_page(GFP_TEMPORARY);
 	if (!page)
 		goto out;
 
-	ret = 0;
- 
-	mm = get_task_mm(task);
-	if (!mm)
+	mm = check_mem_permission(task);
+	ret = PTR_ERR(mm);
+	if (IS_ERR(mm))
 		goto out_free;
 
 	ret = -EIO;
@@ -820,8 +825,8 @@ static ssize_t mem_read(struct file * file, char __user * buf,
 		int this_len, retval;
 
 		this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
-		retval = access_process_vm(task, src, page, this_len, 0);
-		if (!retval || check_mem_permission(task)) {
+		retval = access_remote_vm(mm, src, page, this_len, 0);
+		if (!retval) {
 			if (!ret)
 				ret = -EIO;
 			break;
@@ -860,22 +865,25 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
 	char *page;
 	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
 	unsigned long dst = *ppos;
+	struct mm_struct *mm;
 
 	copied = -ESRCH;
 	if (!task)
 		goto out_no_task;
 
-	if (check_mem_permission(task))
-		goto out;
+	mm = check_mem_permission(task);
+	copied = PTR_ERR(mm);
+	if (IS_ERR(mm))
+		goto out_task;
 
 	copied = -EIO;
 	if (file->private_data != (void *)((long)current->self_exec_id))
-		goto out;
+		goto out_mm;
 
 	copied = -ENOMEM;
 	page = (char *)__get_free_page(GFP_TEMPORARY);
 	if (!page)
-		goto out;
+		goto out_mm;
 
 	copied = 0;
 	while (count > 0) {
@@ -886,7 +894,7 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
 			copied = -EFAULT;
 			break;
 		}
-		retval = access_process_vm(task, dst, page, this_len, 1);
+		retval = access_remote_vm(mm, dst, page, this_len, 1);
 		if (!retval) {
 			if (!copied)
 				copied = -EIO;
@@ -899,7 +907,9 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
 	}
 	*ppos = dst;
 	free_page((unsigned long) page);
-out:
+out_mm:
+	mmput(mm);
+out_task:
 	put_task_struct(task);
 out_no_task:
 	return copied;
-- 
cgit v1.2.3


From 198214a7ee50375fa71a65e518341980cfd4b2f0 Mon Sep 17 00:00:00 2001
From: Stephen Wilson <wilsons@start.ca>
Date: Sun, 13 Mar 2011 15:49:24 -0400
Subject: proc: enable writing to /proc/pid/mem

With recent changes there is no longer a security hazard with writing to
/proc/pid/mem.  Remove the #ifdef.

Signed-off-by: Stephen Wilson <wilsons@start.ca>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/base.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index e34c3c33b2d..bc15df390ec 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -854,10 +854,6 @@ out_no_task:
 	return ret;
 }
 
-#define mem_write NULL
-
-#ifndef mem_write
-/* This is a security hazard */
 static ssize_t mem_write(struct file * file, const char __user *buf,
 			 size_t count, loff_t *ppos)
 {
@@ -914,7 +910,6 @@ out_task:
 out_no_task:
 	return copied;
 }
-#endif
 
 loff_t mem_lseek(struct file *file, loff_t offset, int orig)
 {
-- 
cgit v1.2.3


From a9712bc12c40c172e393f85a9b2ba8db4bf59509 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 23 Mar 2011 15:52:50 -0400
Subject: deal with races in /proc/*/{syscall,stack,personality}

All of those are rw-r--r-- and all are broken for suid - if you open
a file before the target does suid-root exec, you'll still be able
to access it.  For personality it's not a big deal, but for syscall
and stack it's a real problem.

Fix: check that the task is traceable by you at the time of read().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/base.c | 69 ++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 50 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index bc15df390ec..7d5bb8b9a4f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -347,6 +347,23 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
 }
 #endif /* CONFIG_KALLSYMS */
 
+static int lock_trace(struct task_struct *task)
+{
+	int err = mutex_lock_killable(&task->signal->cred_guard_mutex);
+	if (err)
+		return err;
+	if (!ptrace_may_access(task, PTRACE_MODE_ATTACH)) {
+		mutex_unlock(&task->signal->cred_guard_mutex);
+		return -EPERM;
+	}
+	return 0;
+}
+
+static void unlock_trace(struct task_struct *task)
+{
+	mutex_unlock(&task->signal->cred_guard_mutex);
+}
+
 #ifdef CONFIG_STACKTRACE
 
 #define MAX_STACK_TRACE_DEPTH	64
@@ -356,6 +373,7 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
 {
 	struct stack_trace trace;
 	unsigned long *entries;
+	int err;
 	int i;
 
 	entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL);
@@ -366,15 +384,20 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
 	trace.max_entries	= MAX_STACK_TRACE_DEPTH;
 	trace.entries		= entries;
 	trace.skip		= 0;
-	save_stack_trace_tsk(task, &trace);
 
-	for (i = 0; i < trace.nr_entries; i++) {
-		seq_printf(m, "[<%p>] %pS\n",
-			   (void *)entries[i], (void *)entries[i]);
+	err = lock_trace(task);
+	if (!err) {
+		save_stack_trace_tsk(task, &trace);
+
+		for (i = 0; i < trace.nr_entries; i++) {
+			seq_printf(m, "[<%p>] %pS\n",
+				   (void *)entries[i], (void *)entries[i]);
+		}
+		unlock_trace(task);
 	}
 	kfree(entries);
 
-	return 0;
+	return err;
 }
 #endif
 
@@ -537,18 +560,22 @@ static int proc_pid_syscall(struct task_struct *task, char *buffer)
 {
 	long nr;
 	unsigned long args[6], sp, pc;
+	int res = lock_trace(task);
+	if (res)
+		return res;
 
 	if (task_current_syscall(task, &nr, args, 6, &sp, &pc))
-		return sprintf(buffer, "running\n");
-
-	if (nr < 0)
-		return sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc);
-
-	return sprintf(buffer,
+		res = sprintf(buffer, "running\n");
+	else if (nr < 0)
+		res = sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc);
+	else
+		res = sprintf(buffer,
 		       "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n",
 		       nr,
 		       args[0], args[1], args[2], args[3], args[4], args[5],
 		       sp, pc);
+	unlock_trace(task);
+	return res;
 }
 #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
 
@@ -2775,8 +2802,12 @@ static int proc_tgid_io_accounting(struct task_struct *task, char *buffer)
 static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
 				struct pid *pid, struct task_struct *task)
 {
-	seq_printf(m, "%08x\n", task->personality);
-	return 0;
+	int err = lock_trace(task);
+	if (!err) {
+		seq_printf(m, "%08x\n", task->personality);
+		unlock_trace(task);
+	}
+	return err;
 }
 
 /*
@@ -2795,7 +2826,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 	REG("environ",    S_IRUSR, proc_environ_operations),
 	INF("auxv",       S_IRUSR, proc_pid_auxv),
 	ONE("status",     S_IRUGO, proc_pid_status),
-	ONE("personality", S_IRUSR, proc_pid_personality),
+	ONE("personality", S_IRUGO, proc_pid_personality),
 	INF("limits",	  S_IRUGO, proc_pid_limits),
 #ifdef CONFIG_SCHED_DEBUG
 	REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
@@ -2805,7 +2836,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 #endif
 	REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
-	INF("syscall",    S_IRUSR, proc_pid_syscall),
+	INF("syscall",    S_IRUGO, proc_pid_syscall),
 #endif
 	INF("cmdline",    S_IRUGO, proc_pid_cmdline),
 	ONE("stat",       S_IRUGO, proc_tgid_stat),
@@ -2833,7 +2864,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 	INF("wchan",      S_IRUGO, proc_pid_wchan),
 #endif
 #ifdef CONFIG_STACKTRACE
-	ONE("stack",      S_IRUSR, proc_pid_stack),
+	ONE("stack",      S_IRUGO, proc_pid_stack),
 #endif
 #ifdef CONFIG_SCHEDSTATS
 	INF("schedstat",  S_IRUGO, proc_pid_schedstat),
@@ -3135,14 +3166,14 @@ static const struct pid_entry tid_base_stuff[] = {
 	REG("environ",   S_IRUSR, proc_environ_operations),
 	INF("auxv",      S_IRUSR, proc_pid_auxv),
 	ONE("status",    S_IRUGO, proc_pid_status),
-	ONE("personality", S_IRUSR, proc_pid_personality),
+	ONE("personality", S_IRUGO, proc_pid_personality),
 	INF("limits",	 S_IRUGO, proc_pid_limits),
 #ifdef CONFIG_SCHED_DEBUG
 	REG("sched",     S_IRUGO|S_IWUSR, proc_pid_sched_operations),
 #endif
 	REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
-	INF("syscall",   S_IRUSR, proc_pid_syscall),
+	INF("syscall",   S_IRUGO, proc_pid_syscall),
 #endif
 	INF("cmdline",   S_IRUGO, proc_pid_cmdline),
 	ONE("stat",      S_IRUGO, proc_tid_stat),
@@ -3169,7 +3200,7 @@ static const struct pid_entry tid_base_stuff[] = {
 	INF("wchan",     S_IRUGO, proc_pid_wchan),
 #endif
 #ifdef CONFIG_STACKTRACE
-	ONE("stack",      S_IRUSR, proc_pid_stack),
+	ONE("stack",      S_IRUGO, proc_pid_stack),
 #endif
 #ifdef CONFIG_SCHEDSTATS
 	INF("schedstat", S_IRUGO, proc_pid_schedstat),
-- 
cgit v1.2.3


From 135a9fcf45f8d41b4ab008114792f4f6c6572675 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Wed, 23 Mar 2011 16:41:43 -0700
Subject: fs/adfs/adfs.h: fix unsigned comparison

fs/adfs/adfs.h: In function 'append_filetype_suffix':
fs/adfs/adfs.h:115: warning: comparison is always false due to limited range of data type

Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Stuart Swales <stuart.swales.croftnuisk@gmail.com>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/adfs/adfs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index a8a58d864f9..718ac1f440c 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -112,7 +112,7 @@ struct object_info {
 /* RISC OS 12-bit filetype converts to ,xyz hex filename suffix */
 static inline int append_filetype_suffix(char *buf, __u16 filetype)
 {
-	if (filetype == -1)
+	if (filetype == 0xffff)	/* no explicit 12-bit file type was set */
 		return 0;
 
 	*buf++ = ',';
-- 
cgit v1.2.3


From 50e0168cc3406fc1a04d57eb08f500a173d1660d Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Wed, 23 Mar 2011 16:42:07 -0700
Subject: ext4: use little-endian bitops

In preparation for removing ext2 non-atomic bit operations from
asm/bitops.h, convert the ext2 non-atomic bit operations used by ext4
to little-endian bit operations.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Acked-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext4/ext4.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3aa0b72b3b9..4daaf2b753f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -923,14 +923,14 @@ struct ext4_inode_info {
 #define test_opt2(sb, opt)		(EXT4_SB(sb)->s_mount_opt2 & \
 					 EXT4_MOUNT2_##opt)
 
-#define ext4_set_bit			ext2_set_bit
+#define ext4_set_bit			__test_and_set_bit_le
 #define ext4_set_bit_atomic		ext2_set_bit_atomic
-#define ext4_clear_bit			ext2_clear_bit
+#define ext4_clear_bit			__test_and_clear_bit_le
 #define ext4_clear_bit_atomic		ext2_clear_bit_atomic
-#define ext4_test_bit			ext2_test_bit
-#define ext4_find_first_zero_bit	ext2_find_first_zero_bit
-#define ext4_find_next_zero_bit		ext2_find_next_zero_bit
-#define ext4_find_next_bit		ext2_find_next_bit
+#define ext4_test_bit			test_bit_le
+#define ext4_find_first_zero_bit	find_first_zero_bit_le
+#define ext4_find_next_zero_bit		find_next_zero_bit_le
+#define ext4_find_next_bit		find_next_bit_le
 
 /*
  * Maximal mount counts between two filesystem checks
-- 
cgit v1.2.3


From c4354d0d6812ad6729ac33d3c8bc64585cfdb890 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Wed, 23 Mar 2011 16:42:08 -0700
Subject: ocfs2: use little-endian bitops

In preparation for removing ext2 non-atomic bit operations from
asm/bitops.h, convert the ext2 non-atomic bit operations used by ocfs2
to little-endian bit operations.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Acked-by: Joel Becker <joel.becker@oracle.com>
Cc: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/ocfs2.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 51cd6898e7f..1a97ba1ec3f 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -831,18 +831,18 @@ static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb,
 
 static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
 {
-	ext2_set_bit(bit, bitmap);
+	__test_and_set_bit_le(bit, bitmap);
 }
 #define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr))
 
 static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap)
 {
-	ext2_clear_bit(bit, bitmap);
+	__test_and_clear_bit_le(bit, bitmap);
 }
 #define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr))
 
-#define ocfs2_test_bit ext2_test_bit
-#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
-#define ocfs2_find_next_bit ext2_find_next_bit
+#define ocfs2_test_bit test_bit_le
+#define ocfs2_find_next_zero_bit find_next_zero_bit_le
+#define ocfs2_find_next_bit find_next_bit_le
 #endif  /* OCFS2_H */
 
-- 
cgit v1.2.3


From a49ebbabb084d345991b72818a119616431416f2 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Wed, 23 Mar 2011 16:42:08 -0700
Subject: nilfs2: use little-endian bitops

In preparation for removing ext2 non-atomic bit operations from
asm/bitops.h, convert the ext2 non-atomic bit operations used by nilfs2
to little-endian bit operations.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Acked-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nilfs2/alloc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 9af34a7e6e1..f5fde36b9e2 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -74,7 +74,7 @@ int nilfs_palloc_freev(struct inode *, __u64 *, size_t);
 
 #define nilfs_set_bit_atomic		ext2_set_bit_atomic
 #define nilfs_clear_bit_atomic		ext2_clear_bit_atomic
-#define nilfs_find_next_zero_bit	ext2_find_next_zero_bit
+#define nilfs_find_next_zero_bit	find_next_zero_bit_le
 
 /*
  * persistent object allocator cache
-- 
cgit v1.2.3


From 9ad1e1e405fb2c1ff35f2ec67cc6ba4c6765f192 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Wed, 23 Mar 2011 16:42:11 -0700
Subject: udf: use little-endian bitops

In preparation for removing ext2 non-atomic bit operations from
asm/bitops.h, convert the ext2 non-atomic bit operations used by udf
to little-endian bit operations.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/udf/balloc.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 8994dd04166..95518a9f589 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -27,11 +27,10 @@
 #include "udf_i.h"
 #include "udf_sb.h"
 
-#define udf_clear_bit(nr, addr) ext2_clear_bit(nr, addr)
-#define udf_set_bit(nr, addr) ext2_set_bit(nr, addr)
-#define udf_test_bit(nr, addr) ext2_test_bit(nr, addr)
-#define udf_find_next_one_bit(addr, size, offset) \
-		ext2_find_next_bit((unsigned long *)(addr), size, offset)
+#define udf_clear_bit	__test_and_clear_bit_le
+#define udf_set_bit	__test_and_set_bit_le
+#define udf_test_bit	test_bit_le
+#define udf_find_next_one_bit	find_next_bit_le
 
 static int read_block_bitmap(struct super_block *sb,
 			     struct udf_bitmap *bitmap, unsigned int block,
-- 
cgit v1.2.3


From 3cdc7125c364b2baad8aba69c058b26d3dca5f52 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Wed, 23 Mar 2011 16:42:11 -0700
Subject: ufs: use little-endian bitops

In preparation for removing ext2 non-atomic bit operations from
asm/bitops.h, convert the ext2 non-atomic bit operations used by ufs
to little-endian bit operations.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Cc: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ufs/util.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 9f8775ce381..95417592824 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -408,7 +408,7 @@ static inline unsigned _ubh_find_next_zero_bit_(
 	for (;;) {
 		count = min_t(unsigned int, size + offset, uspi->s_bpf);
 		size -= count - offset;
-		pos = ext2_find_next_zero_bit (ubh->bh[base]->b_data, count, offset);
+		pos = find_next_zero_bit_le(ubh->bh[base]->b_data, count, offset);
 		if (pos < count || !size)
 			break;
 		base++;
-- 
cgit v1.2.3


From f312eff8164879e04923d41e9dd23e7850937d85 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Wed, 23 Mar 2011 16:42:14 -0700
Subject: bitops: remove ext2 non-atomic bitops from asm/bitops.h

As a result of the conversions, there are no users of ext2 non-atomic bit
operations except for the ext2 filesystem itself.  Now we can put them into
architecture-independent code in the ext2 filesystem, and remove them from
asm/bitops.h for all architectures.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext2/ext2.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 1b48c337087..645be9e7ee4 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -174,3 +174,9 @@ ext2_group_first_block_no(struct super_block *sb, unsigned long group_no)
 	return group_no * (ext2_fsblk_t)EXT2_BLOCKS_PER_GROUP(sb) +
 		le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block);
 }
+
+#define ext2_set_bit	__test_and_set_bit_le
+#define ext2_clear_bit	__test_and_clear_bit_le
+#define ext2_test_bit	test_bit_le
+#define ext2_find_first_zero_bit	find_first_zero_bit_le
+#define ext2_find_next_zero_bit		find_next_zero_bit_le
-- 
cgit v1.2.3


From 61f2e7b0f474225b4226772830ae4b29a3a21f8d Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Wed, 23 Mar 2011 16:42:16 -0700
Subject: bitops: remove minix bitops from asm/bitops.h

minix bit operations are only used by the minix filesystem and are of no use
to other modules.  The byte order of the inode and block bitmaps differs
between architectures, as shown below:

m68k:
	big-endian 16bit indexed bitmaps

h8300, microblaze, s390, sparc, m68knommu:
	big-endian 32 or 64bit indexed bitmaps

m32r, mips, sh, xtensa:
	big-endian 32 or 64bit indexed bitmaps for big-endian mode
	little-endian bitmaps for little-endian mode

Others:
	little-endian bitmaps

In order to move minix bit operations from asm/bitops.h to architecture
independent code in minix filesystem, this provides two config options.

CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED is only selected by m68k.
CONFIG_MINIX_FS_NATIVE_ENDIAN is selected by the architectures which use
native byte order bitmaps (h8300, microblaze, s390, sparc, m68knommu,
m32r, mips, sh, xtensa).  The architectures which always use little-endian
bitmaps do not select these options.

Finally, we can remove minix bit operations from asm/bitops.h for all
architectures.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Greg Ungerer <gerg@uclinux.org>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Roman Zippel <zippel@linux-m68k.org>
Cc: Andreas Schwab <schwab@linux-m68k.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Michal Simek <monstr@monstr.eu>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Hirokazu Takata <takata@linux-m32r.org>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Acked-by: Paul Mundt <lethal@linux-sh.org>
Cc: Chris Zankel <chris@zankel.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/minix/Kconfig |  8 ++++++
 fs/minix/minix.h | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 82 insertions(+)

(limited to 'fs')

diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig
index 0fd7ca99426..6624684dd5d 100644
--- a/fs/minix/Kconfig
+++ b/fs/minix/Kconfig
@@ -15,3 +15,11 @@ config MINIX_FS
 	  module will be called minix.  Note that the file system of your root
 	  partition (the one containing the directory /) cannot be compiled as
 	  a module.
+
+config MINIX_FS_NATIVE_ENDIAN
+	def_bool MINIX_FS
+	depends on H8300 || M32R || MICROBLAZE || MIPS || S390 || SUPERH || SPARC || XTENSA || (M68K && !MMU)
+
+config MINIX_FS_BIG_ENDIAN_16BIT_INDEXED
+	def_bool MINIX_FS
+	depends on M68K && MMU
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 407b1c84911..341e2122879 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -88,4 +88,78 @@ static inline struct minix_inode_info *minix_i(struct inode *inode)
 	return list_entry(inode, struct minix_inode_info, vfs_inode);
 }
 
+#if defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) && \
+	defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED)
+
+#error Minix file system byte order broken
+
+#elif defined(CONFIG_MINIX_FS_NATIVE_ENDIAN)
+
+/*
+ * big-endian 32 or 64 bit indexed bitmaps on big-endian system or
+ * little-endian bitmaps on little-endian system
+ */
+
+#define minix_test_and_set_bit(nr, addr)	\
+	__test_and_set_bit((nr), (unsigned long *)(addr))
+#define minix_set_bit(nr, addr)		\
+	__set_bit((nr), (unsigned long *)(addr))
+#define minix_test_and_clear_bit(nr, addr) \
+	__test_and_clear_bit((nr), (unsigned long *)(addr))
+#define minix_test_bit(nr, addr)		\
+	test_bit((nr), (unsigned long *)(addr))
+#define minix_find_first_zero_bit(addr, size) \
+	find_first_zero_bit((unsigned long *)(addr), (size))
+
+#elif defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED)
+
+/*
+ * big-endian 16bit indexed bitmaps
+ */
+
+static inline int minix_find_first_zero_bit(const void *vaddr, unsigned size)
+{
+	const unsigned short *p = vaddr, *addr = vaddr;
+	unsigned short num;
+
+	if (!size)
+		return 0;
+
+	size = (size >> 4) + ((size & 15) > 0);
+	while (*p++ == 0xffff) {
+		if (--size == 0)
+			return (p - addr) << 4;
+	}
+
+	num = *--p;
+	return ((p - addr) << 4) + ffz(num);
+}
+
+#define minix_test_and_set_bit(nr, addr)	\
+	__test_and_set_bit((nr) ^ 16, (unsigned long *)(addr))
+#define minix_set_bit(nr, addr)	\
+	__set_bit((nr) ^ 16, (unsigned long *)(addr))
+#define minix_test_and_clear_bit(nr, addr)	\
+	__test_and_clear_bit((nr) ^ 16, (unsigned long *)(addr))
+
+static inline int minix_test_bit(int nr, const void *vaddr)
+{
+	const unsigned short *p = vaddr;
+	return (p[nr >> 4] & (1U << (nr & 15))) != 0;
+}
+
+#else
+
+/*
+ * little-endian bitmaps
+ */
+
+#define minix_test_and_set_bit	__test_and_set_bit_le
+#define minix_set_bit		__set_bit_le
+#define minix_test_and_clear_bit	__test_and_clear_bit_le
+#define minix_test_bit	test_bit_le
+#define minix_find_first_zero_bit	find_first_zero_bit_le
+
+#endif
+
 #endif /* FS_MINIX_H */
-- 
cgit v1.2.3


From 51e031496d50f87ff519a63cfd4fc2f415f03336 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@openvz.org>
Date: Wed, 23 Mar 2011 16:42:48 -0700
Subject: proc: hide kernel addresses via %pK in /proc/<pid>/stack

This file is readable by the task owner.  Hide kernel addresses from
unprivileged users, leaving them only function names and offsets.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Acked-by: Kees Cook <kees.cook@canonical.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index d49c4b5d2c3..c3af15e9c07 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -340,7 +340,7 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
 	save_stack_trace_tsk(task, &trace);
 
 	for (i = 0; i < trace.nr_entries; i++) {
-		seq_printf(m, "[<%p>] %pS\n",
+		seq_printf(m, "[<%pK>] %pS\n",
 			   (void *)entries[i], (void *)entries[i]);
 	}
 	kfree(entries);
-- 
cgit v1.2.3


From 0db0c01b53a1a421513f91573241aabafb87802a Mon Sep 17 00:00:00 2001
From: Aaro Koskinen <aaro.koskinen@nokia.com>
Date: Wed, 23 Mar 2011 16:42:50 -0700
Subject: procfs: fix /proc/<pid>/maps heap check

The current code fails to print the "[heap]" marking if the heap is split
into multiple mappings.

Fix the check so that the marking is displayed in all possible cases:
	1. vma matches exactly the heap
	2. the heap vma is merged e.g. with bss
	3. the heap vma is split e.g. due to locked pages

Test cases. In all cases, the process should have mapping(s) with
[heap] marking:

	(1) vma matches exactly the heap

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/types.h>

	int main (void)
	{
		if (sbrk(4096) != (void *)-1) {
			printf("check /proc/%d/maps\n", (int)getpid());
			while (1)
				sleep(1);
		}
		return 0;
	}

	# ./test1
	check /proc/553/maps
	[1] + Stopped                    ./test1
	# cat /proc/553/maps | head -4
	00008000-00009000 r-xp 00000000 01:00 3113640    /test1
	00010000-00011000 rw-p 00000000 01:00 3113640    /test1
	00011000-00012000 rw-p 00000000 00:00 0          [heap]
	4006f000-40070000 rw-p 00000000 00:00 0

	(2) the heap vma is merged

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/types.h>

	char foo[4096] = "foo";
	char bar[4096];

	int main (void)
	{
		if (sbrk(4096) != (void *)-1) {
			printf("check /proc/%d/maps\n", (int)getpid());
			while (1)
				sleep(1);
		}
		return 0;
	}

	# ./test2
	check /proc/556/maps
	[2] + Stopped                    ./test2
	# cat /proc/556/maps | head -4
	00008000-00009000 r-xp 00000000 01:00 3116312    /test2
	00010000-00012000 rw-p 00000000 01:00 3116312    /test2
	00012000-00014000 rw-p 00000000 00:00 0          [heap]
	4004a000-4004b000 rw-p 00000000 00:00 0

	(3) the heap vma is split (this fails without the patch)

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/mman.h>
	#include <sys/types.h>

	int main (void)
	{
		if ((sbrk(4096) != (void *)-1) && !mlockall(MCL_FUTURE) &&
		    (sbrk(4096) != (void *)-1)) {
			printf("check /proc/%d/maps\n", (int)getpid());
			while (1)
				sleep(1);
		}
		return 0;
	}

	# ./test3
	check /proc/559/maps
	[1] + Stopped                    ./test3
	# cat /proc/559/maps|head -4
	00008000-00009000 r-xp 00000000 01:00 3119108    /test3
	00010000-00011000 rw-p 00000000 01:00 3119108    /test3
	00011000-00012000 rw-p 00000000 00:00 0          [heap]
	00012000-00013000 rw-p 00000000 00:00 0          [heap]

It looks like the bug has been there forever, and since it only results in
some information missing from a procfile, it does not fulfil the -stable
"critical issue" criteria.

Signed-off-by: Aaro Koskinen <aaro.koskinen@nokia.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 93381aae936..636f1a1fdf8 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -251,8 +251,8 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 		const char *name = arch_vma_name(vma);
 		if (!name) {
 			if (mm) {
-				if (vma->vm_start <= mm->start_brk &&
-						vma->vm_end >= mm->brk) {
+				if (vma->vm_start <= mm->brk &&
+						vma->vm_end >= mm->start_brk) {
 					name = "[heap]";
 				} else if (vma->vm_start <= mm->start_stack &&
 					   vma->vm_end >= mm->start_stack) {
-- 
cgit v1.2.3


From fc3d8767b2b6de955579852d7a150f1734265eaf Mon Sep 17 00:00:00 2001
From: Jovi Zhang <bookjovi@gmail.com>
Date: Wed, 23 Mar 2011 16:42:51 -0700
Subject: procfs: fix some wrong error code usage

[root@wei 1]# cat /proc/1/mem
cat: /proc/1/mem: No such process

The error code -ESRCH is wrong in this situation: the process exists, but
the caller is not permitted to access it.  Return -EPERM instead.

Signed-off-by: Jovi Zhang <bookjovi@gmail.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index c3af15e9c07..daba1365325 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -775,7 +775,8 @@ static ssize_t mem_read(struct file * file, char __user * buf,
 	if (!task)
 		goto out_no_task;
 
-	if (check_mem_permission(task))
+	ret = check_mem_permission(task);
+	if (ret)
 		goto out;
 
 	ret = -ENOMEM;
@@ -845,7 +846,8 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
 	if (!task)
 		goto out_no_task;
 
-	if (check_mem_permission(task))
+	copied = check_mem_permission(task);
+	if (copied)
 		goto out;
 
 	copied = -ENOMEM;
@@ -917,6 +919,7 @@ static ssize_t environ_read(struct file *file, char __user *buf,
 	if (!task)
 		goto out_no_task;
 
+	ret = -EPERM;
 	if (!ptrace_may_access(task, PTRACE_MODE_READ))
 		goto out;
 
-- 
cgit v1.2.3


From 312ec7e50c4d3f40b3762af651d1aa79a67f556a Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 23 Mar 2011 16:42:52 -0700
Subject: proc: make struct proc_dir_entry::namelen unsigned int

1. namelen is declared "unsigned short", which hints at "maybe space savings".
   Indeed in 2.4 struct proc_dir_entry looked like:

        struct proc_dir_entry {
                unsigned short low_ino;
                unsigned short namelen;

   Now low_ino is "unsigned int", so any savings have been gone for a long
   time.  There are not so many "struct proc_dir_entry" instances that we
   need to worry about its size, anyway.

2. Converting from unsigned short to int/unsigned int can only create
   problems, so we had better play it safe.

Space is not really conserved, because of natural alignment for the next
field.  sizeof(struct proc_dir_entry) remains the same.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/generic.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 01e07f2a188..f1281339b6f 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -28,7 +28,7 @@
 
 DEFINE_SPINLOCK(proc_subdir_lock);
 
-static int proc_match(int len, const char *name, struct proc_dir_entry *de)
+static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de)
 {
 	if (de->namelen != len)
 		return 0;
@@ -303,7 +303,7 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
 {
 	const char     		*cp = name, *next;
 	struct proc_dir_entry	*de;
-	int			len;
+	unsigned int		len;
 
 	de = *ret;
 	if (!de)
@@ -602,7 +602,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
 {
 	struct proc_dir_entry *ent = NULL;
 	const char *fn = name;
-	int len;
+	unsigned int len;
 
 	/* make sure name is valid */
 	if (!name || !strlen(name)) goto out;
@@ -786,7 +786,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
 	struct proc_dir_entry **p;
 	struct proc_dir_entry *de = NULL;
 	const char *fn = name;
-	int len;
+	unsigned int len;
 
 	spin_lock(&proc_subdir_lock);
 	if (__xlate_proc_name(name, &parent, &fn) != 0) {
-- 
cgit v1.2.3


From 5883f57ca0008ffc93e09cbb9847a1928e50c6f3 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees.cook@canonical.com>
Date: Wed, 23 Mar 2011 16:42:53 -0700
Subject: proc: protect mm start_code/end_code in /proc/pid/stat

While mm->start_stack was protected from cross-uid viewing (commit
f83ce3e6b02d5 ("proc: avoid information leaks to non-privileged
processes")), the start_code and end_code values were not.  This would
allow the text location of a PIE binary to leak, defeating ASLR.

Note that the value "1" is used instead of "0" for a protected value since
"ps", "killall", and likely other readers of /proc/pid/stat, take
start_code of "0" to mean a kernel thread and will misbehave.  Thanks to
Brad Spengler for pointing this out.

Addresses CVE-2011-0726

Signed-off-by: Kees Cook <kees.cook@canonical.com>
Cc: <stable@kernel.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Eugene Teo <eugeneteo@kernel.sg>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Brad Spengler <spender@grsecurity.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/array.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 7c99c1cf7e5..5e4f776b091 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -489,8 +489,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		vsize,
 		mm ? get_mm_rss(mm) : 0,
 		rsslim,
-		mm ? mm->start_code : 0,
-		mm ? mm->end_code : 0,
+		mm ? (permitted ? mm->start_code : 1) : 0,
+		mm ? (permitted ? mm->end_code : 1) : 0,
 		(permitted && mm) ? mm->start_stack : 0,
 		esp,
 		eip,
-- 
cgit v1.2.3


From cb16e95fa2996743a6e80a665ed2ed0590bd38cf Mon Sep 17 00:00:00 2001
From: Petr Holasek <pholasek@redhat.com>
Date: Wed, 23 Mar 2011 16:43:09 -0700
Subject: sysctl: add some missing input constraint checks

Add boundaries of allowed input ranges for: dirty_expire_centisecs,
drop_caches, overcommit_memory, page-cluster and panic_on_oom.

Signed-off-by: Petr Holasek <pholasek@redhat.com>
Acked-by: Dave Young <hidave.darkstar@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/drop_caches.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 2195c213ab2..816f88e6b9c 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -45,7 +45,11 @@ static void drop_slab(void)
 int drop_caches_sysctl_handler(ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
-	proc_dointvec_minmax(table, write, buffer, length, ppos);
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (ret)
+		return ret;
 	if (write) {
 		if (sysctl_drop_caches & 1)
 			iterate_supers(drop_pagecache_sb, NULL);
-- 
cgit v1.2.3


From 4308eebbeb2026827d4492ce8c23d99f7f144a82 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 23 Mar 2011 16:43:13 -0700
Subject: pidns: call pid_ns_prepare_proc() from create_pid_namespace()

Reorganize proc_get_sb() so it can be called before the struct pid of the
first process is allocated.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@free.fr>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Serge E. Hallyn <serge@hallyn.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/root.c | 25 +++++++------------------
 1 file changed, 7 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/root.c b/fs/proc/root.c
index ef9fa8e24ad..e5e2bfa7a03 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -43,17 +43,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
 	struct pid_namespace *ns;
 	struct proc_inode *ei;
 
-	if (proc_mnt) {
-		/* Seed the root directory with a pid so it doesn't need
-		 * to be special in base.c.  I would do this earlier but
-		 * the only task alive when /proc is mounted the first time
-		 * is the init_task and it doesn't have any pids.
-		 */
-		ei = PROC_I(proc_mnt->mnt_sb->s_root->d_inode);
-		if (!ei->pid)
-			ei->pid = find_get_pid(1);
-	}
-
 	if (flags & MS_KERNMOUNT)
 		ns = (struct pid_namespace *)data;
 	else
@@ -71,16 +60,16 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
 			return ERR_PTR(err);
 		}
 
-		ei = PROC_I(sb->s_root->d_inode);
-		if (!ei->pid) {
-			rcu_read_lock();
-			ei->pid = get_pid(find_pid_ns(1, ns));
-			rcu_read_unlock();
-		}
-
 		sb->s_flags |= MS_ACTIVE;
 	}
 
+	ei = PROC_I(sb->s_root->d_inode);
+	if (!ei->pid) {
+		rcu_read_lock();
+		ei->pid = get_pid(find_pid_ns(1, ns));
+		rcu_read_unlock();
+	}
+
 	return dget(sb->s_root);
 }
 
-- 
cgit v1.2.3


From 52e9fc76d0d4b1e8adeee736172c6c23180059b2 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 23 Mar 2011 16:43:14 -0700
Subject: procfs: kill the global proc_mnt variable

After the previous cleanup in proc_get_sb() the global proc_mnt has no
reason to exist, so kill it.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@free.fr>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Serge E. Hallyn <serge@hallyn.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/inode.c    | 2 --
 fs/proc/internal.h | 1 -
 fs/proc/root.c     | 7 ++++---
 3 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index d6a7ca1fdac..d15aa1b1cc8 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -46,8 +46,6 @@ static void proc_evict_inode(struct inode *inode)
 	}
 }
 
-struct vfsmount *proc_mnt;
-
 static struct kmem_cache * proc_inode_cachep;
 
 static struct inode *proc_alloc_inode(struct super_block *sb)
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 9ad561ded40..c03e8d3a3a5 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -107,7 +107,6 @@ static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
 }
 void pde_put(struct proc_dir_entry *pde);
 
-extern struct vfsmount *proc_mnt;
 int proc_fill_super(struct super_block *);
 struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
 
diff --git a/fs/proc/root.c b/fs/proc/root.c
index e5e2bfa7a03..a9000e9cfee 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -90,19 +90,20 @@ static struct file_system_type proc_fs_type = {
 
 void __init proc_root_init(void)
 {
+	struct vfsmount *mnt;
 	int err;
 
 	proc_init_inodecache();
 	err = register_filesystem(&proc_fs_type);
 	if (err)
 		return;
-	proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
-	if (IS_ERR(proc_mnt)) {
+	mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
+	if (IS_ERR(mnt)) {
 		unregister_filesystem(&proc_fs_type);
 		return;
 	}
 
-	init_pid_ns.proc_mnt = proc_mnt;
+	init_pid_ns.proc_mnt = mnt;
 	proc_symlink("mounts", NULL, "self/mounts");
 
 	proc_net_init();
-- 
cgit v1.2.3


From e795b71799ff0b27365020c9ddaa25d0d83f99c8 Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serge@hallyn.com>
Date: Wed, 23 Mar 2011 16:43:25 -0700
Subject: userns: userns: check user namespace for task->file uid equivalence
 checks

Cheat for now and say all files belong to init_user_ns.  Next step will be
to let superblocks belong to a user_ns, and derive inode_userns(inode)
from inode->i_sb->s_user_ns.  Finally we'll introduce more flexible
arrangements.

Changelog:
	Feb 15: make is_owner_or_cap take const struct inode
	Feb 23: make is_owner_or_cap bool

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Serge E. Hallyn <serge.hallyn@canonical.com>
Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: Daniel Lezcano <daniel.lezcano@free.fr>
Acked-by: David Howells <dhowells@redhat.com>
Cc: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/inode.c | 17 +++++++++++++++++
 fs/namei.c | 21 ++++++++++++++++-----
 2 files changed, 33 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index 16fefd373fc..a21d5a938a1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -25,6 +25,7 @@
 #include <linux/async.h>
 #include <linux/posix_acl.h>
 #include <linux/ima.h>
+#include <linux/cred.h>
 
 /*
  * This is needed for the following functions:
@@ -1733,3 +1734,19 @@ void inode_init_owner(struct inode *inode, const struct inode *dir,
 	inode->i_mode = mode;
 }
 EXPORT_SYMBOL(inode_init_owner);
+
+/*
+ * return true if current either has CAP_FOWNER to the
+ * file, or owns the file.
+ */
+bool is_owner_or_cap(const struct inode *inode)
+{
+	struct user_namespace *ns = inode_userns(inode);
+
+	if (current_user_ns() == ns && current_fsuid() == inode->i_uid)
+		return true;
+	if (ns_capable(ns, CAP_FOWNER))
+		return true;
+	return false;
+}
+EXPORT_SYMBOL(is_owner_or_cap);
diff --git a/fs/namei.c b/fs/namei.c
index 5a9a6c3094d..dbb45a652ae 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -183,6 +183,9 @@ static int acl_permission_check(struct inode *inode, int mask, unsigned int flag
 
 	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
 
+	if (current_user_ns() != inode_userns(inode))
+		goto other_perms;
+
 	if (current_fsuid() == inode->i_uid)
 		mode >>= 6;
 	else {
@@ -196,6 +199,7 @@ static int acl_permission_check(struct inode *inode, int mask, unsigned int flag
 			mode >>= 3;
 	}
 
+other_perms:
 	/*
 	 * If the DACs are ok we don't need any capability check.
 	 */
@@ -237,7 +241,7 @@ int generic_permission(struct inode *inode, int mask, unsigned int flags,
 	 * Executable DACs are overridable if at least one exec bit is set.
 	 */
 	if (!(mask & MAY_EXEC) || execute_ok(inode))
-		if (capable(CAP_DAC_OVERRIDE))
+		if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE))
 			return 0;
 
 	/*
@@ -245,7 +249,7 @@ int generic_permission(struct inode *inode, int mask, unsigned int flags,
 	 */
 	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
 	if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE)))
-		if (capable(CAP_DAC_READ_SEARCH))
+		if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH))
 			return 0;
 
 	return -EACCES;
@@ -654,6 +658,7 @@ static inline int handle_reval_path(struct nameidata *nd)
 static inline int exec_permission(struct inode *inode, unsigned int flags)
 {
 	int ret;
+	struct user_namespace *ns = inode_userns(inode);
 
 	if (inode->i_op->permission) {
 		ret = inode->i_op->permission(inode, MAY_EXEC, flags);
@@ -666,7 +671,8 @@ static inline int exec_permission(struct inode *inode, unsigned int flags)
 	if (ret == -ECHILD)
 		return ret;
 
-	if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))
+	if (ns_capable(ns, CAP_DAC_OVERRIDE) ||
+			ns_capable(ns, CAP_DAC_READ_SEARCH))
 		goto ok;
 
 	return ret;
@@ -1842,11 +1848,15 @@ static inline int check_sticky(struct inode *dir, struct inode *inode)
 
 	if (!(dir->i_mode & S_ISVTX))
 		return 0;
+	if (current_user_ns() != inode_userns(inode))
+		goto other_userns;
 	if (inode->i_uid == fsuid)
 		return 0;
 	if (dir->i_uid == fsuid)
 		return 0;
-	return !capable(CAP_FOWNER);
+
+other_userns:
+	return !ns_capable(inode_userns(inode), CAP_FOWNER);
 }
 
 /*
@@ -2440,7 +2450,8 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 	if (error)
 		return error;
 
-	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
+	if ((S_ISCHR(mode) || S_ISBLK(mode)) &&
+	    !ns_capable(inode_userns(dir), CAP_MKNOD))
 		return -EPERM;
 
 	if (!dir->i_op->mknod)
-- 
cgit v1.2.3


From 2e1496707560ecf98e9b0604622c0990f94861d3 Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serge@hallyn.com>
Date: Wed, 23 Mar 2011 16:43:26 -0700
Subject: userns: rename is_owner_or_cap to inode_owner_or_capable

And give it a kernel-doc comment.

[akpm@linux-foundation.org: btrfs changed in linux-next]
Signed-off-by: Serge E. Hallyn <serge.hallyn@canonical.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Daniel Lezcano <daniel.lezcano@free.fr>
Acked-by: David Howells <dhowells@redhat.com>
Cc: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/9p/acl.c             |  2 +-
 fs/attr.c               |  4 ++--
 fs/btrfs/acl.c          |  2 +-
 fs/btrfs/ioctl.c        |  4 ++--
 fs/ext2/acl.c           |  2 +-
 fs/ext2/ioctl.c         |  6 +++---
 fs/ext3/acl.c           |  2 +-
 fs/ext3/ioctl.c         |  6 +++---
 fs/ext4/acl.c           |  2 +-
 fs/ext4/ioctl.c         |  8 ++++----
 fs/fcntl.c              |  2 +-
 fs/generic_acl.c        |  2 +-
 fs/gfs2/file.c          |  2 +-
 fs/hfsplus/ioctl.c      |  2 +-
 fs/inode.c              | 13 ++++++++-----
 fs/jffs2/acl.c          |  2 +-
 fs/jfs/ioctl.c          |  2 +-
 fs/jfs/xattr.c          |  2 +-
 fs/logfs/file.c         |  2 +-
 fs/namei.c              |  2 +-
 fs/nilfs2/ioctl.c       |  2 +-
 fs/ocfs2/acl.c          |  2 +-
 fs/ocfs2/ioctl.c        |  2 +-
 fs/reiserfs/ioctl.c     |  4 ++--
 fs/reiserfs/xattr_acl.c |  2 +-
 fs/ubifs/ioctl.c        |  2 +-
 fs/utimes.c             |  2 +-
 fs/xattr.c              |  2 +-
 28 files changed, 45 insertions(+), 42 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 33aa116732c..535ab6eccb1 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -323,7 +323,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
 
 	if (S_ISLNK(inode->i_mode))
 		return -EOPNOTSUPP;
-	if (!is_owner_or_cap(inode))
+	if (!inode_owner_or_capable(inode))
 		return -EPERM;
 	if (value) {
 		/* update the cached acl value */
diff --git a/fs/attr.c b/fs/attr.c
index 7ca41811afa..1007ed61631 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -59,7 +59,7 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
 
 	/* Make sure a caller can chmod. */
 	if (ia_valid & ATTR_MODE) {
-		if (!is_owner_or_cap(inode))
+		if (!inode_owner_or_capable(inode))
 			return -EPERM;
 		/* Also check the setgid bit! */
 		if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
@@ -69,7 +69,7 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
 
 	/* Check for setting the inode time. */
 	if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) {
-		if (!is_owner_or_cap(inode))
+		if (!inode_owner_or_capable(inode))
 			return -EPERM;
 	}
 
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 9c949348510..de34bfad9ec 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -170,7 +170,7 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
 	int ret;
 	struct posix_acl *acl = NULL;
 
-	if (!is_owner_or_cap(dentry->d_inode))
+	if (!inode_owner_or_capable(dentry->d_inode))
 		return -EPERM;
 
 	if (!IS_POSIXACL(dentry->d_inode))
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 5fdb2abc4fa..d1bace3df9b 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -158,7 +158,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 		      FS_SYNC_FL | FS_DIRSYNC_FL))
 		return -EOPNOTSUPP;
 
-	if (!is_owner_or_cap(inode))
+	if (!inode_owner_or_capable(inode))
 		return -EACCES;
 
 	mutex_lock(&inode->i_mutex);
@@ -1077,7 +1077,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
 	if (flags & ~BTRFS_SUBVOL_RDONLY)
 		return -EOPNOTSUPP;
 
-	if (!is_owner_or_cap(inode))
+	if (!inode_owner_or_capable(inode))
 		return -EACCES;
 
 	down_write(&root->fs_info->subvol_sem);
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 7b4180554a6..abea5a17c76 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -406,7 +406,7 @@ ext2_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
 		return -EINVAL;
 	if (!test_opt(dentry->d_sb, POSIX_ACL))
 		return -EOPNOTSUPP;
-	if (!is_owner_or_cap(dentry->d_inode))
+	if (!inode_owner_or_capable(dentry->d_inode))
 		return -EPERM;
 
 	if (value) {
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index e7431309bdc..f81e250ac5c 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -39,7 +39,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		if (ret)
 			return ret;
 
-		if (!is_owner_or_cap(inode)) {
+		if (!inode_owner_or_capable(inode)) {
 			ret = -EACCES;
 			goto setflags_out;
 		}
@@ -89,7 +89,7 @@ setflags_out:
 	case EXT2_IOC_GETVERSION:
 		return put_user(inode->i_generation, (int __user *) arg);
 	case EXT2_IOC_SETVERSION:
-		if (!is_owner_or_cap(inode))
+		if (!inode_owner_or_capable(inode))
 			return -EPERM;
 		ret = mnt_want_write(filp->f_path.mnt);
 		if (ret)
@@ -115,7 +115,7 @@ setflags_out:
 		if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
 			return -ENOTTY;
 
-		if (!is_owner_or_cap(inode))
+		if (!inode_owner_or_capable(inode))
 			return -EACCES;
 
 		if (get_user(rsv_window_size, (int __user *)arg))
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index e4fa49e6c53..9d021c0d472 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -435,7 +435,7 @@ ext3_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
 		return -EINVAL;
 	if (!test_opt(inode->i_sb, POSIX_ACL))
 		return -EOPNOTSUPP;
-	if (!is_owner_or_cap(inode))
+	if (!inode_owner_or_capable(inode))
 		return -EPERM;
 
 	if (value) {
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index fc080dd561f..f4090bd2f34 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -38,7 +38,7 @@ long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		unsigned int oldflags;
 		unsigned int jflag;
 
-		if (!is_owner_or_cap(inode))
+		if (!inode_owner_or_capable(inode))
 			return -EACCES;
 
 		if (get_user(flags, (int __user *) arg))
@@ -123,7 +123,7 @@ flags_out:
 		__u32 generation;
 		int err;
 
-		if (!is_owner_or_cap(inode))
+		if (!inode_owner_or_capable(inode))
 			return -EPERM;
 
 		err = mnt_want_write(filp->f_path.mnt);
@@ -192,7 +192,7 @@ setversion_out:
 		if (err)
 			return err;
 
-		if (!is_owner_or_cap(inode)) {
+		if (!inode_owner_or_capable(inode)) {
 			err = -EACCES;
 			goto setrsvsz_out;
 		}
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index e0270d1f8d8..21eacd7b7d7 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -433,7 +433,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
 		return -EINVAL;
 	if (!test_opt(inode->i_sb, POSIX_ACL))
 		return -EOPNOTSUPP;
-	if (!is_owner_or_cap(inode))
+	if (!inode_owner_or_capable(inode))
 		return -EPERM;
 
 	if (value) {
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index eb3bc2fe647..a84faa110bc 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -38,7 +38,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		unsigned int oldflags;
 		unsigned int jflag;
 
-		if (!is_owner_or_cap(inode))
+		if (!inode_owner_or_capable(inode))
 			return -EACCES;
 
 		if (get_user(flags, (int __user *) arg))
@@ -146,7 +146,7 @@ flags_out:
 		__u32 generation;
 		int err;
 
-		if (!is_owner_or_cap(inode))
+		if (!inode_owner_or_capable(inode))
 			return -EPERM;
 
 		err = mnt_want_write(filp->f_path.mnt);
@@ -298,7 +298,7 @@ mext_out:
 	case EXT4_IOC_MIGRATE:
 	{
 		int err;
-		if (!is_owner_or_cap(inode))
+		if (!inode_owner_or_capable(inode))
 			return -EACCES;
 
 		err = mnt_want_write(filp->f_path.mnt);
@@ -320,7 +320,7 @@ mext_out:
 	case EXT4_IOC_ALLOC_DA_BLKS:
 	{
 		int err;
-		if (!is_owner_or_cap(inode))
+		if (!inode_owner_or_capable(inode))
 			return -EACCES;
 
 		err = mnt_want_write(filp->f_path.mnt);
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 6c82e5bac03..22764c7c838 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -159,7 +159,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 
 	/* O_NOATIME can only be set by the owner or superuser */
 	if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME))
-		if (!is_owner_or_cap(inode))
+		if (!inode_owner_or_capable(inode))
 			return -EPERM;
 
 	/* required for strict SunOS emulation */
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 06c48a89183..8f26d1a5891 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -74,7 +74,7 @@ generic_acl_set(struct dentry *dentry, const char *name, const void *value,
 		return -EINVAL;
 	if (S_ISLNK(inode->i_mode))
 		return -EOPNOTSUPP;
-	if (!is_owner_or_cap(inode))
+	if (!inode_owner_or_capable(inode))
 		return -EPERM;
 	if (value) {
 		acl = posix_acl_from_xattr(value, size);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 4074b952b05..b2682e073ee 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -221,7 +221,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
 		goto out_drop_write;
 
 	error = -EACCES;
-	if (!is_owner_or_cap(inode))
+	if (!inode_owner_or_capable(inode))
 		goto out;
 
 	error = 0;
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 508ce662ce1..fbaa6690c8e 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -47,7 +47,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
 	if (err)
 		goto out;
 
-	if (!is_owner_or_cap(inode)) {
+	if (!inode_owner_or_capable(inode)) {
 		err = -EACCES;
 		goto out_drop_write;
 	}
diff --git a/fs/inode.c b/fs/inode.c
index a21d5a938a1..0b3da4a7770 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1735,11 +1735,14 @@ void inode_init_owner(struct inode *inode, const struct inode *dir,
 }
 EXPORT_SYMBOL(inode_init_owner);
 
-/*
- * return true if current either has CAP_FOWNER to the
- * file, or owns the file.
+/**
+ * inode_owner_or_capable - check current task permissions to inode
+ * @inode: inode being checked
+ *
+ * Return true if current either has CAP_FOWNER to the inode, or
+ * owns the file.
  */
-bool is_owner_or_cap(const struct inode *inode)
+bool inode_owner_or_capable(const struct inode *inode)
 {
 	struct user_namespace *ns = inode_userns(inode);
 
@@ -1749,4 +1752,4 @@ bool is_owner_or_cap(const struct inode *inode)
 		return true;
 	return false;
 }
-EXPORT_SYMBOL(is_owner_or_cap);
+EXPORT_SYMBOL(inode_owner_or_capable);
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 95b79672150..828a0e1ea43 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -402,7 +402,7 @@ static int jffs2_acl_setxattr(struct dentry *dentry, const char *name,
 
 	if (name[0] != '\0')
 		return -EINVAL;
-	if (!is_owner_or_cap(dentry->d_inode))
+	if (!inode_owner_or_capable(dentry->d_inode))
 		return -EPERM;
 
 	if (value) {
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index afe222bf300..6f98a186677 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -72,7 +72,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		if (err)
 			return err;
 
-		if (!is_owner_or_cap(inode)) {
+		if (!inode_owner_or_capable(inode)) {
 			err = -EACCES;
 			goto setflags_out;
 		}
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 3fa4c32272d..24838f1eeee 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -678,7 +678,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name,
 	struct posix_acl *acl;
 	int rc;
 
-	if (!is_owner_or_cap(inode))
+	if (!inode_owner_or_capable(inode))
 		return -EPERM;
 
 	/*
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index e86376b87af..c2ad7028def 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -196,7 +196,7 @@ long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		if (IS_RDONLY(inode))
 			return -EROFS;
 
-		if (!is_owner_or_cap(inode))
+		if (!inode_owner_or_capable(inode))
 			return -EACCES;
 
 		err = get_user(flags, (int __user *)arg);
diff --git a/fs/namei.c b/fs/namei.c
index dbb45a652ae..fc858b1124c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2036,7 +2036,7 @@ static int may_open(struct path *path, int acc_mode, int flag)
 	}
 
 	/* O_NOATIME can only be set by the owner or superuser */
-	if (flag & O_NOATIME && !is_owner_or_cap(inode))
+	if (flag & O_NOATIME && !inode_owner_or_capable(inode))
 		return -EPERM;
 
 	/*
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 95c04c2f2b3..f2469ba6246 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -113,7 +113,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp,
 	unsigned int flags, oldflags;
 	int ret;
 
-	if (!is_owner_or_cap(inode))
+	if (!inode_owner_or_capable(inode))
 		return -EACCES;
 
 	if (get_user(flags, (int __user *)argp))
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 704f6b1742f..90f2729b7a5 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -497,7 +497,7 @@ static int ocfs2_xattr_set_acl(struct dentry *dentry, const char *name,
 	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
 		return -EOPNOTSUPP;
 
-	if (!is_owner_or_cap(inode))
+	if (!inode_owner_or_capable(inode))
 		return -EPERM;
 
 	if (value) {
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 7a486819615..09de77ce002 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -82,7 +82,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
 	}
 
 	status = -EACCES;
-	if (!is_owner_or_cap(inode))
+	if (!inode_owner_or_capable(inode))
 		goto bail_unlock;
 
 	if (!S_ISDIR(inode->i_mode))
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 79265fdc317..4e153051bc7 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -59,7 +59,7 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			if (err)
 				break;
 
-			if (!is_owner_or_cap(inode)) {
+			if (!inode_owner_or_capable(inode)) {
 				err = -EPERM;
 				goto setflags_out;
 			}
@@ -103,7 +103,7 @@ setflags_out:
 		err = put_user(inode->i_generation, (int __user *)arg);
 		break;
 	case REISERFS_IOC_SETVERSION:
-		if (!is_owner_or_cap(inode)) {
+		if (!inode_owner_or_capable(inode)) {
 			err = -EPERM;
 			break;
 		}
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 90d2fcb67a3..3dc38f1206f 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -26,7 +26,7 @@ posix_acl_set(struct dentry *dentry, const char *name, const void *value,
 	size_t jcreate_blocks;
 	if (!reiserfs_posixacl(inode->i_sb))
 		return -EOPNOTSUPP;
-	if (!is_owner_or_cap(inode))
+	if (!inode_owner_or_capable(inode))
 		return -EPERM;
 
 	if (value) {
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 8aacd64957a..548acf494af 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -160,7 +160,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		if (IS_RDONLY(inode))
 			return -EROFS;
 
-		if (!is_owner_or_cap(inode))
+		if (!inode_owner_or_capable(inode))
 			return -EACCES;
 
 		if (get_user(flags, (int __user *) arg))
diff --git a/fs/utimes.c b/fs/utimes.c
index 179b5869065..ba653f3dc1b 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -95,7 +95,7 @@ static int utimes_common(struct path *path, struct timespec *times)
                 if (IS_IMMUTABLE(inode))
 			goto mnt_drop_write_and_out;
 
-		if (!is_owner_or_cap(inode)) {
+		if (!inode_owner_or_capable(inode)) {
 			error = inode_permission(inode, MAY_WRITE);
 			if (error)
 				goto mnt_drop_write_and_out;
diff --git a/fs/xattr.c b/fs/xattr.c
index 01bb8135e14..a19acdb81cd 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -59,7 +59,7 @@ xattr_permission(struct inode *inode, const char *name, int mask)
 		if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
 			return -EPERM;
 		if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
-		    (mask & MAY_WRITE) && !is_owner_or_cap(inode))
+		    (mask & MAY_WRITE) && !inode_owner_or_capable(inode))
 			return -EPERM;
 	}
 
-- 
cgit v1.2.3


From fc5e58c0c4fd86881ec8ba8e46e41a07e25dc7a6 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Thu, 24 Mar 2011 16:14:26 +0200
Subject: UBIFS: use GFP_NOFS properly

This patch fixes a brown-paperbag bug which was introduced by me:
I used incorrect "GFP_KERNEL | GFP_NOFS" allocation flags to make
sure my allocations do not cause write-back. But the correct form
is "GFP_NOFS".

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/debug.c      | 2 +-
 fs/ubifs/lprops.c     | 2 +-
 fs/ubifs/lpt_commit.c | 4 ++--
 fs/ubifs/orphan.c     | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 01c2b028e52..f25a7339f80 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -818,7 +818,7 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
 	printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
 	       current->pid, lnum);
 
-	buf = __vmalloc(c->leb_size, GFP_KERNEL | GFP_NOFS, PAGE_KERNEL);
+	buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
 	if (!buf) {
 		ubifs_err("cannot allocate memory for dumping LEB %d", lnum);
 		return;
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index c7b25e2f776..0ee0847f242 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -1094,7 +1094,7 @@ static int scan_check_cb(struct ubifs_info *c,
 		}
 	}
 
-	buf = __vmalloc(c->leb_size, GFP_KERNEL | GFP_NOFS, PAGE_KERNEL);
+	buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
 	if (!buf) {
 		ubifs_err("cannot allocate memory to scan LEB %d", lnum);
 		goto out;
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 0a3c2c3f5c4..0c9c69bd983 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -1633,7 +1633,7 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
 	if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
 		return 0;
 
-	buf = p = __vmalloc(c->leb_size, GFP_KERNEL | GFP_NOFS, PAGE_KERNEL);
+	buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
 	if (!buf) {
 		ubifs_err("cannot allocate memory for ltab checking");
 		return 0;
@@ -1885,7 +1885,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
 
 	printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
 	       current->pid, lnum);
-	buf = p = __vmalloc(c->leb_size, GFP_KERNEL | GFP_NOFS, PAGE_KERNEL);
+	buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
 	if (!buf) {
 		ubifs_err("cannot allocate memory to dump LPT");
 		return;
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 2cdbd31641d..09df318e368 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -898,7 +898,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
 	if (c->no_orphs)
 		return 0;
 
-	buf = __vmalloc(c->leb_size, GFP_KERNEL | GFP_NOFS, PAGE_KERNEL);
+	buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
 	if (!buf) {
 		ubifs_err("cannot allocate memory to check orphans");
 		return 0;
-- 
cgit v1.2.3


From 9d523cafbe0dab5a2b873ecd85c37fec9d1368f3 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Mon, 21 Mar 2011 16:16:29 +0200
Subject: UBIFS: kill CONFIG_UBIFS_FS_DEBUG_CHKS

Simplify the UBIFS configuration menu and kill the option to enable
self-checks at compile time. We do not really need it because the same can
be done at run time using the module parameters or the corresponding sysfs
interfaces. And there is value in simplifying the kernel configuration
menu, which is becoming increasingly large.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/Kconfig | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index 1d1859dc3de..d7440904be1 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -58,12 +58,3 @@ config UBIFS_FS_DEBUG
 	  down UBIFS. You can then further enable / disable individual  debugging
 	  features using UBIFS module parameters and the corresponding sysfs
 	  interfaces.
-
-config UBIFS_FS_DEBUG_CHKS
-	bool "Enable extra checks"
-	depends on UBIFS_FS_DEBUG
-	help
-	  If extra checks are enabled UBIFS will check the consistency of its
-	  internal data structures during operation. However, UBIFS performance
-	  is dramatically slower when this option is selected especially if the
-	  file system is large.
-- 
cgit v1.2.3


From 6ed09c34b7984a978a73a855f4c2e6662acc8bdb Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Wed, 23 Mar 2011 10:32:58 +0200
Subject: UBIFS: fix assertion warning and refine comments

This patch fixes the following UBIFS assertion warning:

UBIFS assert failed in do_readpage at 115 (pid 199)
[<b00321b8>] (unwind_backtrace+0x0/0xdc) from [<af025118>]
(do_readpage+0x108/0x594 [ubifs])
[<af025118>] (do_readpage+0x108/0x594 [ubifs]) from [<af025764>]
(ubifs_write_end+0x1c0/0x2e8 [ubifs])
[<af025764>] (ubifs_write_end+0x1c0/0x2e8 [ubifs]) from
[<b00a0164>] (generic_file_buffered_write+0x18c/0x270)
[<b00a0164>] (generic_file_buffered_write+0x18c/0x270) from
[<b00a08d4>] (__generic_file_aio_write+0x478/0x4c0)
[<b00a08d4>] (__generic_file_aio_write+0x478/0x4c0) from
[<b00a0984>] (generic_file_aio_write+0x68/0xc8)
[<b00a0984>] (generic_file_aio_write+0x68/0xc8) from
[<af024a78>] (ubifs_aio_write+0x178/0x1d8 [ubifs])
[<af024a78>] (ubifs_aio_write+0x178/0x1d8 [ubifs]) from
[<b00d104c>] (do_sync_write+0xb0/0x100)
[<b00d104c>] (do_sync_write+0xb0/0x100) from [<b00d1abc>]
(vfs_write+0xac/0x154)
[<b00d1abc>] (vfs_write+0xac/0x154) from [<b00d1c10>]
(sys_write+0x3c/0x68)
[<b00d1c10>] (sys_write+0x3c/0x68) from [<b002d9a0>]
(ret_fast_syscall+0x0/0x2c)

The 'PG_checked' flag is used to indicate that the page does not
supposedly exist on the media (e.g., a hole or a page beyond the
inode size), so it requires slightly bigger budget, because we have
to account the indexing size increase. And this flag basically
tells that the budget for this page has to be "new page budget".
The "new page budget" is slightly bigger than the "existing page
budget".

The 'do_readpage()' function has the following assertion which
sometimes is hit: 'ubifs_assert(!PageChecked(page))'. Obviously,
the meaning of this assertion is: "I should not be asked to read
a page which does not exist on the media".

However, in 'ubifs_write_begin()' we have a small "trick". Notice,
that VFS may write pages which were not read yet, so the page data
were not loaded from the media to the page cache yet. If VFS tells
that it is going to change only some part of the page, we obviously
have to load it from the media. However, if VFS tells that it is
going to change whole page, we do not read it from the media for
optimization purposes.

However, since we do not read it, we do not know if it exists on
the media or not (a hole, etc). So we set the 'PG_checked' flag
to this page to force bigger budget, just in case.

So 'ubifs_write_begin()' sets 'PG_checked'. Then we are in
'ubifs_write_end()'. And VFS tells us: "hey, for some reason I
changed my mind and did not change the whole page". Frankly, I do not
know why this happens, but I hit this somehow on an ARM platform.
And this is extremely rare.

So in this case UBIFS does the following:

1. Cancels allocated budget.
2. Loads the page from the media by calling 'do_readpage()'.
3. Asks VFS to repeat the whole write operation from the very
   beginning (call '->write_begin()' again, etc).

And the assertion warning is hit at step 2 - remember we have
the 'PG_checked' flag set for this page, and 'do_readpage()' does not
like this. So this patch fixes the problem by adding step 1.5:
clearing the 'PG_checked' flag before calling 'do_readpage()'.

All in all, this patch does not fix any functionality issue, but it
silences UBIFS false positive warning which may happen in very very
rare cases.

And while on it, this patch also improves a commentary which explains
the reasons of setting the 'PG_checked' flag for the page. The old
commentary was a bit difficult to understand.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/file.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index d77db7e3648..28be1e6a65e 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -448,10 +448,12 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 		if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) {
 			/*
 			 * We change whole page so no need to load it. But we
-			 * have to set the @PG_checked flag to make the further
-			 * code know that the page is new. This might be not
-			 * true, but it is better to budget more than to read
-			 * the page from the media.
+			 * do not know whether this page exists on the media or
+			 * not, so we assume the latter because it requires
+			 * larger budget. The assumption is that it is better
+			 * to budget a bit more than to read the page from the
+			 * media. Thus, we are setting the @PG_checked flag
+			 * here.
 			 */
 			SetPageChecked(page);
 			skipped_read = 1;
@@ -559,6 +561,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
 		dbg_gen("copied %d instead of %d, read page and repeat",
 			copied, len);
 		cancel_budget(c, page, ui, appending);
+		ClearPageChecked(page);
 
 		/*
 		 * Return 0 to force VFS to repeat the whole operation, or the
-- 
cgit v1.2.3


From 62a7375e5d77d654695297c4b39d5d740d901184 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Fri, 25 Mar 2011 01:51:02 +0800
Subject: vfs - check non-mountpoint dentry might block in __follow_mount_rcu()

When following a mount in rcu-walk mode we must check if the incoming dentry
is telling us it may need to block, even if it isn't actually a mountpoint.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index d0066e17d45..3cb616d38d9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -992,6 +992,12 @@ int follow_down_one(struct path *path)
 	return 0;
 }
 
+static inline bool managed_dentry_might_block(struct dentry *dentry)
+{
+	return (dentry->d_flags & DCACHE_MANAGE_TRANSIT &&
+		dentry->d_op->d_manage(dentry, true) < 0);
+}
+
 /*
  * Skip to top of mountpoint pile in rcuwalk mode.  We abort the rcu-walk if we
  * meet a managed dentry and we're not walking to "..".  True is returned to
@@ -1000,19 +1006,26 @@ int follow_down_one(struct path *path)
 static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 			       struct inode **inode, bool reverse_transit)
 {
-	while (d_mountpoint(path->dentry)) {
+	for (;;) {
 		struct vfsmount *mounted;
-		if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
-		    !reverse_transit &&
-		    path->dentry->d_op->d_manage(path->dentry, true) < 0)
+		/*
+		 * Don't forget we might have a non-mountpoint managed dentry
+		 * that wants to block transit.
+		 */
+		*inode = path->dentry->d_inode;
+		if (!reverse_transit &&
+		     unlikely(managed_dentry_might_block(path->dentry)))
 			return false;
+
+		if (!d_mountpoint(path->dentry))
+			break;
+
 		mounted = __lookup_mnt(path->mnt, path->dentry, 1);
 		if (!mounted)
 			break;
 		path->mnt = mounted;
 		path->dentry = mounted->mnt_root;
 		nd->seq = read_seqcount_begin(&path->dentry->d_seq);
-		*inode = path->dentry->d_inode;
 	}
 
 	if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
-- 
cgit v1.2.3


From 3c3199852905ceb90a70e98777e71d369a5f0823 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Fri, 25 Mar 2011 01:51:08 +0800
Subject: autofs4 - reinstate last used update on access

When direct (and offset) mounts were introduced the last used
timeout could no longer be updated in ->d_revalidate(). This is
because covered direct mounts would be followed over without calling
the autofs file system. As a result the definition of the busyness
check for all entries was changed to be "actually busy" being an open
file or working directory within the automount. But now we have a call
back in the follow so the last used update on any access can be
re-instated. This requires DCACHE_MANAGE_TRANSIT to always be set.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/expire.c | 13 ++-----------
 fs/autofs4/root.c   | 35 ++++++++++++-----------------------
 2 files changed, 14 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index f43100b9662..c896dd6c1ea 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -294,7 +294,6 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
 		spin_unlock(&sbi->fs_lock);
 		return NULL;
 	}
-	managed_dentry_set_transit(root);
 	if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
 		struct autofs_info *ino = autofs4_dentry_ino(root);
 		ino->flags |= AUTOFS_INF_EXPIRING;
@@ -302,7 +301,6 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
 		spin_unlock(&sbi->fs_lock);
 		return root;
 	}
-	managed_dentry_clear_transit(root);
 	spin_unlock(&sbi->fs_lock);
 	dput(root);
 
@@ -341,8 +339,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 		ino = autofs4_dentry_ino(dentry);
 		/* No point expiring a pending mount */
 		if (ino->flags & AUTOFS_INF_PENDING)
-			goto cont;
-		managed_dentry_set_transit(dentry);
+			goto next;
 
 		/*
 		 * Case 1: (i) indirect mount or top level pseudo direct mount
@@ -402,8 +399,6 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 			}
 		}
 next:
-		managed_dentry_clear_transit(dentry);
-cont:
 		spin_unlock(&sbi->fs_lock);
 	}
 	return NULL;
@@ -484,8 +479,6 @@ int autofs4_expire_run(struct super_block *sb,
 	spin_lock(&sbi->fs_lock);
 	ino = autofs4_dentry_ino(dentry);
 	ino->flags &= ~AUTOFS_INF_EXPIRING;
-	if (!d_unhashed(dentry))
-		managed_dentry_clear_transit(dentry);
 	complete_all(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
 
@@ -513,9 +506,7 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 		spin_lock(&sbi->fs_lock);
 		ino->flags &= ~AUTOFS_INF_EXPIRING;
 		spin_lock(&dentry->d_lock);
-		if (ret)
-			__managed_dentry_clear_transit(dentry);
-		else {
+		if (!ret) {
 			if ((IS_ROOT(dentry) ||
 			    (autofs_type_indirect(sbi->type) &&
 			     IS_ROOT(dentry->d_parent))) &&
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index e6f84d26f4c..3a93d355248 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -275,17 +275,16 @@ static int autofs4_mount_wait(struct dentry *dentry)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
-	int status;
+	int status = 0;
 
 	if (ino->flags & AUTOFS_INF_PENDING) {
 		DPRINTK("waiting for mount name=%.*s",
 			dentry->d_name.len, dentry->d_name.name);
 		status = autofs4_wait(sbi, dentry, NFY_MOUNT);
 		DPRINTK("mount wait done status=%d", status);
-		ino->last_used = jiffies;
-		return status;
 	}
-	return 0;
+	ino->last_used = jiffies;
+	return status;
 }
 
 static int do_expire_wait(struct dentry *dentry)
@@ -319,9 +318,12 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path)
 	 */
 	if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
 		struct dentry *parent = dentry->d_parent;
+		struct autofs_info *ino;
 		struct dentry *new = d_lookup(parent, &dentry->d_name);
 		if (!new)
 			return NULL;
+		ino = autofs4_dentry_ino(new);
+		ino->last_used = jiffies;
 		dput(path->dentry);
 		path->dentry = new;
 	}
@@ -338,18 +340,6 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
 	DPRINTK("dentry=%p %.*s",
 		dentry, dentry->d_name.len, dentry->d_name.name);
 
-	/*
-	 * Someone may have manually umounted this or it was a submount
-	 * that has gone away.
-	 */
-	spin_lock(&dentry->d_lock);
-	if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
-		if (!(dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
-		     (dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
-			__managed_dentry_set_transit(path->dentry);
-	}
-	spin_unlock(&dentry->d_lock);
-
 	/* The daemon never triggers a mount. */
 	if (autofs4_oz_mode(sbi))
 		return NULL;
@@ -418,18 +408,17 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
 done:
 	if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
 		/*
-		 * Any needed mounting has been completed and the path updated
-		 * so turn this into a normal dentry so we don't continually
-		 * call ->d_automount() and ->d_manage().
-		 */
-		spin_lock(&dentry->d_lock);
-		__managed_dentry_clear_transit(dentry);
-		/*
+		 * Any needed mounting has been completed and the path
+		 * updated so clear DCACHE_NEED_AUTOMOUNT so we don't
+		 * call ->d_automount() on rootless multi-mounts since
+		 * it can lead to an incorrect ELOOP error return.
+		 *
 		 * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and
 		 * symlinks as in all other cases the dentry will be covered by
 		 * an actual mount so ->d_automount() won't be called during
 		 * the follow.
 		 */
+		spin_lock(&dentry->d_lock);
 		if ((!d_mountpoint(dentry) &&
 		    !list_empty(&dentry->d_subdirs)) ||
 		    (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)))
-- 
cgit v1.2.3


From f9398c233e3201874395eea8558eb616fb198648 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Fri, 25 Mar 2011 01:51:14 +0800
Subject: autofs4 - fix dentry leak in autofs4_expire_direct()

There is a missing dput() when returning from autofs4_expire_direct()
when we see that the dentry is already a pending mount.

Signed-off-by: Ian Kent <raven@themaw.net>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/expire.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index c896dd6c1ea..c403abcc725 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -290,10 +290,8 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
 	spin_lock(&sbi->fs_lock);
 	ino = autofs4_dentry_ino(root);
 	/* No point expiring a pending mount */
-	if (ino->flags & AUTOFS_INF_PENDING) {
-		spin_unlock(&sbi->fs_lock);
-		return NULL;
-	}
+	if (ino->flags & AUTOFS_INF_PENDING)
+		goto out;
 	if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
 		struct autofs_info *ino = autofs4_dentry_ino(root);
 		ino->flags |= AUTOFS_INF_EXPIRING;
@@ -301,6 +299,7 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
 		spin_unlock(&sbi->fs_lock);
 		return root;
 	}
+out:
 	spin_unlock(&sbi->fs_lock);
 	dput(root);
 
-- 
cgit v1.2.3


From d4a85e35d1465da055264407d8395e84483084e6 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Fri, 25 Mar 2011 01:51:20 +0800
Subject: autofs4 - fix autofs4_expire_indirect() traversal

The vfs-scale changes changed the traversal used in
autofs4_expire_indirect() from a list to a depth first tree traversal
which isn't right.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/expire.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 51 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index c403abcc725..bc482e07b92 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -86,6 +86,56 @@ done:
 	return status;
 }
 
+/*
+ * Calculate and dget next entry in the subdirs list under root.
+ */
+static struct dentry *get_next_positive_subdir(struct dentry *prev,
+						struct dentry *root)
+{
+	struct list_head *next;
+	struct dentry *p, *q;
+
+	spin_lock(&autofs4_lock);
+
+	if (prev == NULL) {
+		spin_lock(&root->d_lock);
+		prev = dget_dlock(root);
+		next = prev->d_subdirs.next;
+		p = prev;
+		goto start;
+	}
+
+	p = prev;
+	spin_lock(&p->d_lock);
+again:
+	next = p->d_u.d_child.next;
+start:
+	if (next == &root->d_subdirs) {
+		spin_unlock(&p->d_lock);
+		spin_unlock(&autofs4_lock);
+		dput(prev);
+		return NULL;
+	}
+
+	q = list_entry(next, struct dentry, d_u.d_child);
+
+	spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED);
+	/* Negative dentry - try next */
+	if (!simple_positive(q)) {
+		spin_unlock(&p->d_lock);
+		p = q;
+		goto again;
+	}
+	dget_dlock(q);
+	spin_unlock(&q->d_lock);
+	spin_unlock(&p->d_lock);
+	spin_unlock(&autofs4_lock);
+
+	dput(prev);
+
+	return q;
+}
+
 /*
  * Calculate and dget next entry in top down tree traversal.
  */
@@ -333,7 +383,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 	timeout = sbi->exp_timeout;
 
 	dentry = NULL;
-	while ((dentry = get_next_positive_dentry(dentry, root))) {
+	while ((dentry = get_next_positive_subdir(dentry, root))) {
 		spin_lock(&sbi->fs_lock);
 		ino = autofs4_dentry_ino(dentry);
 		/* No point expiring a pending mount */
-- 
cgit v1.2.3


From 83fb96bfc792e5ca693e53f7fd878d51b8493da8 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Fri, 25 Mar 2011 01:51:25 +0800
Subject: autofs4 - fix d_manage() return on rcu-walk

The daemon never needs to block and, in the rcu-walk case an error
return isn't used, so always return zero.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/root.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 3a93d355248..ebbfa0ce6d7 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -444,6 +444,8 @@ int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
 
 	/* The daemon never waits. */
 	if (autofs4_oz_mode(sbi)) {
+		if (rcu_walk)
+			return 0;
 		if (!d_mountpoint(dentry))
 			return -EISDIR;
 		return 0;
-- 
cgit v1.2.3


From e7854723d0f3626f260c880d8db8e5136f29db19 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Fri, 25 Mar 2011 01:51:31 +0800
Subject: autofs4 - remove autofs4_lock

The autofs4_lock introduced by the rcu-walk changes has unnecessarily
broad scope. The locking is better handled by the per-autofs super
block lookup_lock.

Signed-off-by: Ian Kent <raven@themaw.net>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/autofs_i.h |  2 --
 fs/autofs4/expire.c   | 18 ++++++++++--------
 fs/autofs4/root.c     | 25 +++++++------------------
 fs/autofs4/waitq.c    |  6 +++---
 4 files changed, 20 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 54f92379272..475f9c597cb 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -61,8 +61,6 @@ do {							\
 		current->pid, __func__, ##args);	\
 } while (0)
 
-extern spinlock_t autofs4_lock;
-
 /* Unified info structure.  This is pointed to by both the dentry and
    inode structures.  Each file in the filesystem has an instance of this
    structure.  It holds a reference to the dentry, so dentries are never
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index bc482e07b92..450f529a4ea 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -92,10 +92,11 @@ done:
 static struct dentry *get_next_positive_subdir(struct dentry *prev,
 						struct dentry *root)
 {
+	struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
 	struct list_head *next;
 	struct dentry *p, *q;
 
-	spin_lock(&autofs4_lock);
+	spin_lock(&sbi->lookup_lock);
 
 	if (prev == NULL) {
 		spin_lock(&root->d_lock);
@@ -112,7 +113,7 @@ again:
 start:
 	if (next == &root->d_subdirs) {
 		spin_unlock(&p->d_lock);
-		spin_unlock(&autofs4_lock);
+		spin_unlock(&sbi->lookup_lock);
 		dput(prev);
 		return NULL;
 	}
@@ -129,7 +130,7 @@ start:
 	dget_dlock(q);
 	spin_unlock(&q->d_lock);
 	spin_unlock(&p->d_lock);
-	spin_unlock(&autofs4_lock);
+	spin_unlock(&sbi->lookup_lock);
 
 	dput(prev);
 
@@ -142,13 +143,14 @@ start:
 static struct dentry *get_next_positive_dentry(struct dentry *prev,
 						struct dentry *root)
 {
+	struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
 	struct list_head *next;
 	struct dentry *p, *ret;
 
 	if (prev == NULL)
 		return dget(root);
 
-	spin_lock(&autofs4_lock);
+	spin_lock(&sbi->lookup_lock);
 relock:
 	p = prev;
 	spin_lock(&p->d_lock);
@@ -160,7 +162,7 @@ again:
 
 			if (p == root) {
 				spin_unlock(&p->d_lock);
-				spin_unlock(&autofs4_lock);
+				spin_unlock(&sbi->lookup_lock);
 				dput(prev);
 				return NULL;
 			}
@@ -190,7 +192,7 @@ again:
 	dget_dlock(ret);
 	spin_unlock(&ret->d_lock);
 	spin_unlock(&p->d_lock);
-	spin_unlock(&autofs4_lock);
+	spin_unlock(&sbi->lookup_lock);
 
 	dput(prev);
 
@@ -459,13 +461,13 @@ found:
 	ino->flags |= AUTOFS_INF_EXPIRING;
 	init_completion(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
-	spin_lock(&autofs4_lock);
+	spin_lock(&sbi->lookup_lock);
 	spin_lock(&expired->d_parent->d_lock);
 	spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED);
 	list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
 	spin_unlock(&expired->d_lock);
 	spin_unlock(&expired->d_parent->d_lock);
-	spin_unlock(&autofs4_lock);
+	spin_unlock(&sbi->lookup_lock);
 	return expired;
 }
 
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index ebbfa0ce6d7..96804a17bbd 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -23,8 +23,6 @@
 
 #include "autofs_i.h"
 
-DEFINE_SPINLOCK(autofs4_lock);
-
 static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
 static int autofs4_dir_unlink(struct inode *,struct dentry *);
 static int autofs4_dir_rmdir(struct inode *,struct dentry *);
@@ -125,15 +123,15 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
 	 * autofs file system so just let the libfs routines handle
 	 * it.
 	 */
-	spin_lock(&autofs4_lock);
+	spin_lock(&sbi->lookup_lock);
 	spin_lock(&dentry->d_lock);
 	if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
 		spin_unlock(&dentry->d_lock);
-		spin_unlock(&autofs4_lock);
+		spin_unlock(&sbi->lookup_lock);
 		return -ENOENT;
 	}
 	spin_unlock(&dentry->d_lock);
-	spin_unlock(&autofs4_lock);
+	spin_unlock(&sbi->lookup_lock);
 
 out:
 	return dcache_dir_open(inode, file);
@@ -171,7 +169,6 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
 	const unsigned char *str = name->name;
 	struct list_head *p, *head;
 
-	spin_lock(&autofs4_lock);
 	spin_lock(&sbi->lookup_lock);
 	head = &sbi->active_list;
 	list_for_each(p, head) {
@@ -204,14 +201,12 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
 			dget_dlock(active);
 			spin_unlock(&active->d_lock);
 			spin_unlock(&sbi->lookup_lock);
-			spin_unlock(&autofs4_lock);
 			return active;
 		}
 next:
 		spin_unlock(&active->d_lock);
 	}
 	spin_unlock(&sbi->lookup_lock);
-	spin_unlock(&autofs4_lock);
 
 	return NULL;
 }
@@ -226,7 +221,6 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
 	const unsigned char *str = name->name;
 	struct list_head *p, *head;
 
-	spin_lock(&autofs4_lock);
 	spin_lock(&sbi->lookup_lock);
 	head = &sbi->expiring_list;
 	list_for_each(p, head) {
@@ -259,14 +253,12 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
 			dget_dlock(expiring);
 			spin_unlock(&expiring->d_lock);
 			spin_unlock(&sbi->lookup_lock);
-			spin_unlock(&autofs4_lock);
 			return expiring;
 		}
 next:
 		spin_unlock(&expiring->d_lock);
 	}
 	spin_unlock(&sbi->lookup_lock);
-	spin_unlock(&autofs4_lock);
 
 	return NULL;
 }
@@ -603,12 +595,12 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
 
 	dir->i_mtime = CURRENT_TIME;
 
-	spin_lock(&autofs4_lock);
-	autofs4_add_expiring(dentry);
+	spin_lock(&sbi->lookup_lock);
+	__autofs4_add_expiring(dentry);
 	spin_lock(&dentry->d_lock);
 	__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
-	spin_unlock(&autofs4_lock);
+	spin_unlock(&sbi->lookup_lock);
 
 	return 0;
 }
@@ -677,20 +669,17 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
 	if (!autofs4_oz_mode(sbi))
 		return -EACCES;
 
-	spin_lock(&autofs4_lock);
 	spin_lock(&sbi->lookup_lock);
 	spin_lock(&dentry->d_lock);
 	if (!list_empty(&dentry->d_subdirs)) {
 		spin_unlock(&dentry->d_lock);
 		spin_unlock(&sbi->lookup_lock);
-		spin_unlock(&autofs4_lock);
 		return -ENOTEMPTY;
 	}
 	__autofs4_add_expiring(dentry);
-	spin_unlock(&sbi->lookup_lock);
 	__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
-	spin_unlock(&autofs4_lock);
+	spin_unlock(&sbi->lookup_lock);
 
 	if (sbi->version < 5)
 		autofs_clear_leaf_automount_flags(dentry);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 56010056b2e..25435987d6a 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -197,12 +197,12 @@ rename_retry:
 
 	seq = read_seqbegin(&rename_lock);
 	rcu_read_lock();
-	spin_lock(&autofs4_lock);
+	spin_lock(&sbi->fs_lock);
 	for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent)
 		len += tmp->d_name.len + 1;
 
 	if (!len || --len > NAME_MAX) {
-		spin_unlock(&autofs4_lock);
+		spin_unlock(&sbi->fs_lock);
 		rcu_read_unlock();
 		if (read_seqretry(&rename_lock, seq))
 			goto rename_retry;
@@ -218,7 +218,7 @@ rename_retry:
 		p -= tmp->d_name.len;
 		strncpy(p, tmp->d_name.name, tmp->d_name.len);
 	}
-	spin_unlock(&autofs4_lock);
+	spin_unlock(&sbi->fs_lock);
 	rcu_read_unlock();
 	if (read_seqretry(&rename_lock, seq))
 		goto rename_retry;
-- 
cgit v1.2.3


From 3dc8fe4dca9cd3e4aa828ed36451e2bcfd2350da Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jj@chaosbits.net>
Date: Fri, 25 Mar 2011 01:51:37 +0800
Subject: autofs4: Do not potentially dereference NULL pointer returned by
 fget() in autofs_dev_ioctl_setpipefd()

In fs/autofs4/dev-ioctl.c::autofs_dev_ioctl_setpipefd() we call fget(),
which may return NULL, but we do not explicitly test for that NULL return
so we may end up dereferencing a NULL pointer - bad.

When I originally submitted this patch I had chosen EBUSY as the return
value to use if this happens. Ian Kent was kind enough to explain why that
would most likely be wrong and why EBADF should most likely be used
instead. This version of the patch uses EBADF.

Signed-off-by: Jesper Juhl <jj@chaosbits.net>
Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/dev-ioctl.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 1442da4860e..509fe1eb66a 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -372,6 +372,10 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
 		return -EBUSY;
 	} else {
 		struct file *pipe = fget(pipefd);
+		if (!pipe) {
+			err = -EBADF;
+			goto out;
+		}
 		if (!pipe->f_op || !pipe->f_op->write) {
 			err = -EPIPE;
 			fput(pipe);
-- 
cgit v1.2.3


From 250df6ed274d767da844a5d9f05720b804240197 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 22 Mar 2011 22:23:36 +1100
Subject: fs: protect inode->i_state with inode->i_lock

Protect inode state transitions and validity checks with the
inode->i_lock. This enables us to make inode state transitions
independently of the inode_lock and is the first step to peeling
away the inode_lock from the code.

This requires that __iget() is done atomically with i_state checks
during list traversals so that we don't race with another thread
marking the inode I_FREEING between the state check and grabbing the
reference.

Also remove the unlock_new_inode() memory barrier optimisation
required to avoid taking the inode_lock when clearing I_NEW.
Simplify the code by simply taking the inode->i_lock around the
state change and wakeup. Because the wakeup is no longer tricky,
remove the wake_up_inode() function and open code the wakeup where
necessary.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/block_dev.c         |   2 +
 fs/buffer.c            |   2 +-
 fs/drop_caches.c       |   9 ++-
 fs/fs-writeback.c      |  44 +++++++++++----
 fs/inode.c             | 150 ++++++++++++++++++++++++++++++++++---------------
 fs/notify/inode_mark.c |  21 +++++--
 fs/quota/dquot.c       |  13 +++--
 7 files changed, 169 insertions(+), 72 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 88928701959..bc39b18cf3d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -56,9 +56,11 @@ static void bdev_inode_switch_bdi(struct inode *inode,
 			struct backing_dev_info *dst)
 {
 	spin_lock(&inode_lock);
+	spin_lock(&inode->i_lock);
 	inode->i_data.backing_dev_info = dst;
 	if (inode->i_state & I_DIRTY)
 		list_move(&inode->i_wb_list, &dst->wb.b_dirty);
+	spin_unlock(&inode->i_lock);
 	spin_unlock(&inode_lock);
 }
 
diff --git a/fs/buffer.c b/fs/buffer.c
index 2219a76e2ca..da666f3148f 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1144,7 +1144,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
  * inode list.
  *
  * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
- * mapping->tree_lock and the global inode_lock.
+ * mapping->tree_lock and mapping->host->i_lock.
  */
 void mark_buffer_dirty(struct buffer_head *bh)
 {
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 816f88e6b9c..6c6f73ba086 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -18,11 +18,14 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
 
 	spin_lock(&inode_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
-			continue;
-		if (inode->i_mapping->nrpages == 0)
+		spin_lock(&inode->i_lock);
+		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+		    (inode->i_mapping->nrpages == 0)) {
+			spin_unlock(&inode->i_lock);
 			continue;
+		}
 		__iget(inode);
+		spin_unlock(&inode->i_lock);
 		spin_unlock(&inode_lock);
 		invalidate_mapping_pages(inode->i_mapping, 0, -1);
 		iput(toput_inode);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 59c6e495678..efd1ebe879c 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -306,10 +306,12 @@ static void inode_wait_for_writeback(struct inode *inode)
 	wait_queue_head_t *wqh;
 
 	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
-	 while (inode->i_state & I_SYNC) {
+	while (inode->i_state & I_SYNC) {
+		spin_unlock(&inode->i_lock);
 		spin_unlock(&inode_lock);
 		__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
 		spin_lock(&inode_lock);
+		spin_lock(&inode->i_lock);
 	}
 }
 
@@ -333,6 +335,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	unsigned dirty;
 	int ret;
 
+	spin_lock(&inode->i_lock);
 	if (!atomic_read(&inode->i_count))
 		WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
 	else
@@ -348,6 +351,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 		 * completed a full scan of b_io.
 		 */
 		if (wbc->sync_mode != WB_SYNC_ALL) {
+			spin_unlock(&inode->i_lock);
 			requeue_io(inode);
 			return 0;
 		}
@@ -363,6 +367,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	/* Set I_SYNC, reset I_DIRTY_PAGES */
 	inode->i_state |= I_SYNC;
 	inode->i_state &= ~I_DIRTY_PAGES;
+	spin_unlock(&inode->i_lock);
 	spin_unlock(&inode_lock);
 
 	ret = do_writepages(mapping, wbc);
@@ -384,8 +389,10 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	 * write_inode()
 	 */
 	spin_lock(&inode_lock);
+	spin_lock(&inode->i_lock);
 	dirty = inode->i_state & I_DIRTY;
 	inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+	spin_unlock(&inode->i_lock);
 	spin_unlock(&inode_lock);
 	/* Don't write the inode if only I_DIRTY_PAGES was set */
 	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
@@ -395,6 +402,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	}
 
 	spin_lock(&inode_lock);
+	spin_lock(&inode->i_lock);
 	inode->i_state &= ~I_SYNC;
 	if (!(inode->i_state & I_FREEING)) {
 		if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
@@ -436,6 +444,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 		}
 	}
 	inode_sync_complete(inode);
+	spin_unlock(&inode->i_lock);
 	return ret;
 }
 
@@ -506,7 +515,9 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 		 * kind does not need peridic writeout yet, and for the latter
 		 * kind writeout is handled by the freer.
 		 */
+		spin_lock(&inode->i_lock);
 		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+			spin_unlock(&inode->i_lock);
 			requeue_io(inode);
 			continue;
 		}
@@ -515,10 +526,14 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 		 * Was this inode dirtied after sync_sb_inodes was called?
 		 * This keeps sync from extra jobs and livelock.
 		 */
-		if (inode_dirtied_after(inode, wbc->wb_start))
+		if (inode_dirtied_after(inode, wbc->wb_start)) {
+			spin_unlock(&inode->i_lock);
 			return 1;
+		}
 
 		__iget(inode);
+		spin_unlock(&inode->i_lock);
+
 		pages_skipped = wbc->pages_skipped;
 		writeback_single_inode(inode, wbc);
 		if (wbc->pages_skipped != pages_skipped) {
@@ -724,7 +739,9 @@ static long wb_writeback(struct bdi_writeback *wb,
 		if (!list_empty(&wb->b_more_io))  {
 			inode = wb_inode(wb->b_more_io.prev);
 			trace_wbc_writeback_wait(&wbc, wb->bdi);
+			spin_lock(&inode->i_lock);
 			inode_wait_for_writeback(inode);
+			spin_unlock(&inode->i_lock);
 		}
 		spin_unlock(&inode_lock);
 	}
@@ -1017,6 +1034,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 		block_dump___mark_inode_dirty(inode);
 
 	spin_lock(&inode_lock);
+	spin_lock(&inode->i_lock);
 	if ((inode->i_state & flags) != flags) {
 		const int was_dirty = inode->i_state & I_DIRTY;
 
@@ -1028,7 +1046,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 		 * superblock list, based upon its state.
 		 */
 		if (inode->i_state & I_SYNC)
-			goto out;
+			goto out_unlock_inode;
 
 		/*
 		 * Only add valid (hashed) inodes to the superblock's
@@ -1036,11 +1054,12 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 		 */
 		if (!S_ISBLK(inode->i_mode)) {
 			if (inode_unhashed(inode))
-				goto out;
+				goto out_unlock_inode;
 		}
 		if (inode->i_state & I_FREEING)
-			goto out;
+			goto out_unlock_inode;
 
+		spin_unlock(&inode->i_lock);
 		/*
 		 * If the inode was already on b_dirty/b_io/b_more_io, don't
 		 * reposition it (that would break b_dirty time-ordering).
@@ -1065,7 +1084,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 			inode->dirtied_when = jiffies;
 			list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
 		}
+		goto out;
 	}
+out_unlock_inode:
+	spin_unlock(&inode->i_lock);
 out:
 	spin_unlock(&inode_lock);
 
@@ -1111,14 +1133,16 @@ static void wait_sb_inodes(struct super_block *sb)
 	 * we still have to wait for that writeout.
 	 */
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		struct address_space *mapping;
+		struct address_space *mapping = inode->i_mapping;
 
-		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
-			continue;
-		mapping = inode->i_mapping;
-		if (mapping->nrpages == 0)
+		spin_lock(&inode->i_lock);
+		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+		    (mapping->nrpages == 0)) {
+			spin_unlock(&inode->i_lock);
 			continue;
+		}
 		__iget(inode);
+		spin_unlock(&inode->i_lock);
 		spin_unlock(&inode_lock);
 		/*
 		 * We hold a reference to 'inode' so it couldn't have
diff --git a/fs/inode.c b/fs/inode.c
index 0b3da4a7770..14b12c4ee02 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -27,6 +27,17 @@
 #include <linux/ima.h>
 #include <linux/cred.h>
 
+/*
+ * inode locking rules.
+ *
+ * inode->i_lock protects:
+ *   inode->i_state, inode->i_hash, __iget()
+ *
+ * Lock ordering:
+ * inode_lock
+ *   inode->i_lock
+ */
+
 /*
  * This is needed for the following functions:
  *  - inode_has_buffers
@@ -137,15 +148,6 @@ int proc_nr_inodes(ctl_table *table, int write,
 }
 #endif
 
-static void wake_up_inode(struct inode *inode)
-{
-	/*
-	 * Prevent speculative execution through spin_unlock(&inode_lock);
-	 */
-	smp_mb();
-	wake_up_bit(&inode->i_state, __I_NEW);
-}
-
 /**
  * inode_init_always - perform inode structure intialisation
  * @sb: superblock inode belongs to
@@ -336,7 +338,7 @@ static void init_once(void *foo)
 }
 
 /*
- * inode_lock must be held
+ * inode->i_lock must be held
  */
 void __iget(struct inode *inode)
 {
@@ -413,7 +415,9 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval)
 	struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
 
 	spin_lock(&inode_lock);
+	spin_lock(&inode->i_lock);
 	hlist_add_head(&inode->i_hash, b);
+	spin_unlock(&inode->i_lock);
 	spin_unlock(&inode_lock);
 }
 EXPORT_SYMBOL(__insert_inode_hash);
@@ -438,7 +442,9 @@ static void __remove_inode_hash(struct inode *inode)
 void remove_inode_hash(struct inode *inode)
 {
 	spin_lock(&inode_lock);
+	spin_lock(&inode->i_lock);
 	hlist_del_init(&inode->i_hash);
+	spin_unlock(&inode->i_lock);
 	spin_unlock(&inode_lock);
 }
 EXPORT_SYMBOL(remove_inode_hash);
@@ -495,7 +501,9 @@ static void dispose_list(struct list_head *head)
 		__inode_sb_list_del(inode);
 		spin_unlock(&inode_lock);
 
-		wake_up_inode(inode);
+		spin_lock(&inode->i_lock);
+		wake_up_bit(&inode->i_state, __I_NEW);
+		spin_unlock(&inode->i_lock);
 		destroy_inode(inode);
 	}
 }
@@ -518,10 +526,17 @@ void evict_inodes(struct super_block *sb)
 	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
 		if (atomic_read(&inode->i_count))
 			continue;
-		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
+
+		spin_lock(&inode->i_lock);
+		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+			spin_unlock(&inode->i_lock);
 			continue;
+		}
 
 		inode->i_state |= I_FREEING;
+		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
+			inodes_stat.nr_unused--;
+		spin_unlock(&inode->i_lock);
 
 		/*
 		 * Move the inode off the IO lists and LRU once I_FREEING is
@@ -529,8 +544,6 @@ void evict_inodes(struct super_block *sb)
 		 */
 		list_move(&inode->i_lru, &dispose);
 		list_del_init(&inode->i_wb_list);
-		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
-			inodes_stat.nr_unused--;
 	}
 	spin_unlock(&inode_lock);
 
@@ -563,18 +576,26 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
 
 	spin_lock(&inode_lock);
 	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
-		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
+		spin_lock(&inode->i_lock);
+		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+			spin_unlock(&inode->i_lock);
 			continue;
+		}
 		if (inode->i_state & I_DIRTY && !kill_dirty) {
+			spin_unlock(&inode->i_lock);
 			busy = 1;
 			continue;
 		}
 		if (atomic_read(&inode->i_count)) {
+			spin_unlock(&inode->i_lock);
 			busy = 1;
 			continue;
 		}
 
 		inode->i_state |= I_FREEING;
+		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
+			inodes_stat.nr_unused--;
+		spin_unlock(&inode->i_lock);
 
 		/*
 		 * Move the inode off the IO lists and LRU once I_FREEING is
@@ -582,8 +603,6 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
 		 */
 		list_move(&inode->i_lru, &dispose);
 		list_del_init(&inode->i_wb_list);
-		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
-			inodes_stat.nr_unused--;
 	}
 	spin_unlock(&inode_lock);
 
@@ -641,8 +660,10 @@ static void prune_icache(int nr_to_scan)
 		 * Referenced or dirty inodes are still in use. Give them
 		 * another pass through the LRU as we canot reclaim them now.
 		 */
+		spin_lock(&inode->i_lock);
 		if (atomic_read(&inode->i_count) ||
 		    (inode->i_state & ~I_REFERENCED)) {
+			spin_unlock(&inode->i_lock);
 			list_del_init(&inode->i_lru);
 			inodes_stat.nr_unused--;
 			continue;
@@ -650,12 +671,14 @@ static void prune_icache(int nr_to_scan)
 
 		/* recently referenced inodes get one more pass */
 		if (inode->i_state & I_REFERENCED) {
-			list_move(&inode->i_lru, &inode_lru);
 			inode->i_state &= ~I_REFERENCED;
+			spin_unlock(&inode->i_lock);
+			list_move(&inode->i_lru, &inode_lru);
 			continue;
 		}
 		if (inode_has_buffers(inode) || inode->i_data.nrpages) {
 			__iget(inode);
+			spin_unlock(&inode->i_lock);
 			spin_unlock(&inode_lock);
 			if (remove_inode_buffers(inode))
 				reap += invalidate_mapping_pages(&inode->i_data,
@@ -666,11 +689,15 @@ static void prune_icache(int nr_to_scan)
 			if (inode != list_entry(inode_lru.next,
 						struct inode, i_lru))
 				continue;	/* wrong inode or list_empty */
-			if (!can_unuse(inode))
+			spin_lock(&inode->i_lock);
+			if (!can_unuse(inode)) {
+				spin_unlock(&inode->i_lock);
 				continue;
+			}
 		}
 		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state |= I_FREEING;
+		spin_unlock(&inode->i_lock);
 
 		/*
 		 * Move the inode off the IO lists and LRU once I_FREEING is
@@ -737,11 +764,13 @@ repeat:
 			continue;
 		if (!test(inode, data))
 			continue;
+		spin_lock(&inode->i_lock);
 		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
 			__wait_on_freeing_inode(inode);
 			goto repeat;
 		}
 		__iget(inode);
+		spin_unlock(&inode->i_lock);
 		return inode;
 	}
 	return NULL;
@@ -763,11 +792,13 @@ repeat:
 			continue;
 		if (inode->i_sb != sb)
 			continue;
+		spin_lock(&inode->i_lock);
 		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
 			__wait_on_freeing_inode(inode);
 			goto repeat;
 		}
 		__iget(inode);
+		spin_unlock(&inode->i_lock);
 		return inode;
 	}
 	return NULL;
@@ -832,14 +863,23 @@ struct inode *new_inode(struct super_block *sb)
 	inode = alloc_inode(sb);
 	if (inode) {
 		spin_lock(&inode_lock);
-		__inode_sb_list_add(inode);
+		spin_lock(&inode->i_lock);
 		inode->i_state = 0;
+		spin_unlock(&inode->i_lock);
+		__inode_sb_list_add(inode);
 		spin_unlock(&inode_lock);
 	}
 	return inode;
 }
 EXPORT_SYMBOL(new_inode);
 
+/**
+ * unlock_new_inode - clear the I_NEW state and wake up any waiters
+ * @inode:	new inode to unlock
+ *
+ * Called when the inode is fully initialised to clear the new state of the
+ * inode and wake up anyone waiting for the inode to finish initialisation.
+ */
 void unlock_new_inode(struct inode *inode)
 {
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -859,19 +899,11 @@ void unlock_new_inode(struct inode *inode)
 		}
 	}
 #endif
-	/*
-	 * This is special!  We do not need the spinlock when clearing I_NEW,
-	 * because we're guaranteed that nobody else tries to do anything about
-	 * the state of the inode when it is locked, as we just created it (so
-	 * there can be no old holders that haven't tested I_NEW).
-	 * However we must emit the memory barrier so that other CPUs reliably
-	 * see the clearing of I_NEW after the other inode initialisation has
-	 * completed.
-	 */
-	smp_mb();
+	spin_lock(&inode->i_lock);
 	WARN_ON(!(inode->i_state & I_NEW));
 	inode->i_state &= ~I_NEW;
-	wake_up_inode(inode);
+	wake_up_bit(&inode->i_state, __I_NEW);
+	spin_unlock(&inode->i_lock);
 }
 EXPORT_SYMBOL(unlock_new_inode);
 
@@ -900,9 +932,11 @@ static struct inode *get_new_inode(struct super_block *sb,
 			if (set(inode, data))
 				goto set_failed;
 
+			spin_lock(&inode->i_lock);
+			inode->i_state = I_NEW;
 			hlist_add_head(&inode->i_hash, head);
+			spin_unlock(&inode->i_lock);
 			__inode_sb_list_add(inode);
-			inode->i_state = I_NEW;
 			spin_unlock(&inode_lock);
 
 			/* Return the locked inode with I_NEW set, the
@@ -947,9 +981,11 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
 		old = find_inode_fast(sb, head, ino);
 		if (!old) {
 			inode->i_ino = ino;
+			spin_lock(&inode->i_lock);
+			inode->i_state = I_NEW;
 			hlist_add_head(&inode->i_hash, head);
+			spin_unlock(&inode->i_lock);
 			__inode_sb_list_add(inode);
-			inode->i_state = I_NEW;
 			spin_unlock(&inode_lock);
 
 			/* Return the locked inode with I_NEW set, the
@@ -1034,15 +1070,19 @@ EXPORT_SYMBOL(iunique);
 struct inode *igrab(struct inode *inode)
 {
 	spin_lock(&inode_lock);
-	if (!(inode->i_state & (I_FREEING|I_WILL_FREE)))
+	spin_lock(&inode->i_lock);
+	if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) {
 		__iget(inode);
-	else
+		spin_unlock(&inode->i_lock);
+	} else {
+		spin_unlock(&inode->i_lock);
 		/*
 		 * Handle the case where s_op->clear_inode is not been
 		 * called yet, and somebody is calling igrab
 		 * while the inode is getting freed.
 		 */
 		inode = NULL;
+	}
 	spin_unlock(&inode_lock);
 	return inode;
 }
@@ -1271,7 +1311,6 @@ int insert_inode_locked(struct inode *inode)
 	ino_t ino = inode->i_ino;
 	struct hlist_head *head = inode_hashtable + hash(sb, ino);
 
-	inode->i_state |= I_NEW;
 	while (1) {
 		struct hlist_node *node;
 		struct inode *old = NULL;
@@ -1281,16 +1320,23 @@ int insert_inode_locked(struct inode *inode)
 				continue;
 			if (old->i_sb != sb)
 				continue;
-			if (old->i_state & (I_FREEING|I_WILL_FREE))
+			spin_lock(&old->i_lock);
+			if (old->i_state & (I_FREEING|I_WILL_FREE)) {
+				spin_unlock(&old->i_lock);
 				continue;
+			}
 			break;
 		}
 		if (likely(!node)) {
+			spin_lock(&inode->i_lock);
+			inode->i_state |= I_NEW;
 			hlist_add_head(&inode->i_hash, head);
+			spin_unlock(&inode->i_lock);
 			spin_unlock(&inode_lock);
 			return 0;
 		}
 		__iget(old);
+		spin_unlock(&old->i_lock);
 		spin_unlock(&inode_lock);
 		wait_on_inode(old);
 		if (unlikely(!inode_unhashed(old))) {
@@ -1308,8 +1354,6 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
 	struct super_block *sb = inode->i_sb;
 	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
 
-	inode->i_state |= I_NEW;
-
 	while (1) {
 		struct hlist_node *node;
 		struct inode *old = NULL;
@@ -1320,16 +1364,23 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
 				continue;
 			if (!test(old, data))
 				continue;
-			if (old->i_state & (I_FREEING|I_WILL_FREE))
+			spin_lock(&old->i_lock);
+			if (old->i_state & (I_FREEING|I_WILL_FREE)) {
+				spin_unlock(&old->i_lock);
 				continue;
+			}
 			break;
 		}
 		if (likely(!node)) {
+			spin_lock(&inode->i_lock);
+			inode->i_state |= I_NEW;
 			hlist_add_head(&inode->i_hash, head);
+			spin_unlock(&inode->i_lock);
 			spin_unlock(&inode_lock);
 			return 0;
 		}
 		__iget(old);
+		spin_unlock(&old->i_lock);
 		spin_unlock(&inode_lock);
 		wait_on_inode(old);
 		if (unlikely(!inode_unhashed(old))) {
@@ -1375,6 +1426,9 @@ static void iput_final(struct inode *inode)
 	const struct super_operations *op = inode->i_sb->s_op;
 	int drop;
 
+	spin_lock(&inode->i_lock);
+	WARN_ON(inode->i_state & I_NEW);
+
 	if (op && op->drop_inode)
 		drop = op->drop_inode(inode);
 	else
@@ -1386,21 +1440,23 @@ static void iput_final(struct inode *inode)
 			if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
 				inode_lru_list_add(inode);
 			}
+			spin_unlock(&inode->i_lock);
 			spin_unlock(&inode_lock);
 			return;
 		}
-		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state |= I_WILL_FREE;
+		spin_unlock(&inode->i_lock);
 		spin_unlock(&inode_lock);
 		write_inode_now(inode, 1);
 		spin_lock(&inode_lock);
+		spin_lock(&inode->i_lock);
 		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state &= ~I_WILL_FREE;
 		__remove_inode_hash(inode);
 	}
 
-	WARN_ON(inode->i_state & I_NEW);
 	inode->i_state |= I_FREEING;
+	spin_unlock(&inode->i_lock);
 
 	/*
 	 * Move the inode off the IO lists and LRU once I_FREEING is
@@ -1413,8 +1469,10 @@ static void iput_final(struct inode *inode)
 	spin_unlock(&inode_lock);
 	evict(inode);
 	remove_inode_hash(inode);
-	wake_up_inode(inode);
+	spin_lock(&inode->i_lock);
+	wake_up_bit(&inode->i_state, __I_NEW);
 	BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
+	spin_unlock(&inode->i_lock);
 	destroy_inode(inode);
 }
 
@@ -1611,9 +1669,8 @@ EXPORT_SYMBOL(inode_wait);
  * to recheck inode state.
  *
  * It doesn't matter if I_NEW is not set initially, a call to
- * wake_up_inode() after removing from the hash list will DTRT.
- *
- * This is called with inode_lock held.
+ * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
+ * will DTRT.
  */
 static void __wait_on_freeing_inode(struct inode *inode)
 {
@@ -1621,6 +1678,7 @@ static void __wait_on_freeing_inode(struct inode *inode)
 	DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
 	wq = bit_waitqueue(&inode->i_state, __I_NEW);
 	prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+	spin_unlock(&inode->i_lock);
 	spin_unlock(&inode_lock);
 	schedule();
 	finish_wait(wq, &wait.wait);
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 4c29fcf557d..4dd53fb4412 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -254,8 +254,11 @@ void fsnotify_unmount_inodes(struct list_head *list)
 		 * I_WILL_FREE, or I_NEW which is fine because by that point
 		 * the inode cannot have any associated watches.
 		 */
-		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
+		spin_lock(&inode->i_lock);
+		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
+			spin_unlock(&inode->i_lock);
 			continue;
+		}
 
 		/*
 		 * If i_count is zero, the inode cannot have any watches and
@@ -263,8 +266,10 @@ void fsnotify_unmount_inodes(struct list_head *list)
 		 * evict all inodes with zero i_count from icache which is
 		 * unnecessarily violent and may in fact be illegal to do.
 		 */
-		if (!atomic_read(&inode->i_count))
+		if (!atomic_read(&inode->i_count)) {
+			spin_unlock(&inode->i_lock);
 			continue;
+		}
 
 		need_iput_tmp = need_iput;
 		need_iput = NULL;
@@ -274,13 +279,17 @@ void fsnotify_unmount_inodes(struct list_head *list)
 			__iget(inode);
 		else
 			need_iput_tmp = NULL;
+		spin_unlock(&inode->i_lock);
 
 		/* In case the dropping of a reference would nuke next_i. */
 		if ((&next_i->i_sb_list != list) &&
-		    atomic_read(&next_i->i_count) &&
-		    !(next_i->i_state & (I_FREEING | I_WILL_FREE))) {
-			__iget(next_i);
-			need_iput = next_i;
+		    atomic_read(&next_i->i_count)) {
+			spin_lock(&next_i->i_lock);
+			if (!(next_i->i_state & (I_FREEING | I_WILL_FREE))) {
+				__iget(next_i);
+				need_iput = next_i;
+			}
+			spin_unlock(&next_i->i_lock);
 		}
 
 		/*
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index a2a622e079f..a1470fda366 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -902,18 +902,19 @@ static void add_dquot_ref(struct super_block *sb, int type)
 
 	spin_lock(&inode_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
+		spin_lock(&inode->i_lock);
+		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
+		    !atomic_read(&inode->i_writecount) ||
+		    !dqinit_needed(inode, type)) {
+			spin_unlock(&inode->i_lock);
 			continue;
+		}
 #ifdef CONFIG_QUOTA_DEBUG
 		if (unlikely(inode_get_rsv_space(inode) > 0))
 			reserved = 1;
 #endif
-		if (!atomic_read(&inode->i_writecount))
-			continue;
-		if (!dqinit_needed(inode, type))
-			continue;
-
 		__iget(inode);
+		spin_unlock(&inode->i_lock);
 		spin_unlock(&inode_lock);
 
 		iput(old_inode);
-- 
cgit v1.2.3


From b2b2af8e614b4dcd8aca1369d82ce5ad0461a7b1 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 22 Mar 2011 22:23:37 +1100
Subject: fs: factor inode disposal

We have a couple of places that dispose of inodes. Factor the
disposal into evict() to isolate this code and make it simpler to
peel away the inode_lock from the code.

While doing this, change the logic flow in iput_final() to separate
the different cases that need to be handled to make the transitions
the inode goes through more obvious.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c | 104 ++++++++++++++++++++++++-------------------------------------
 1 file changed, 41 insertions(+), 63 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index 14b12c4ee02..f752a959254 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -422,17 +422,6 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval)
 }
 EXPORT_SYMBOL(__insert_inode_hash);
 
-/**
- *	__remove_inode_hash - remove an inode from the hash
- *	@inode: inode to unhash
- *
- *	Remove an inode from the superblock.
- */
-static void __remove_inode_hash(struct inode *inode)
-{
-	hlist_del_init(&inode->i_hash);
-}
-
 /**
  *	remove_inode_hash - remove an inode from the hash
  *	@inode: inode to unhash
@@ -462,10 +451,31 @@ void end_writeback(struct inode *inode)
 }
 EXPORT_SYMBOL(end_writeback);
 
+/*
+ * Free the inode passed in, removing it from the lists it is still connected
+ * to. We remove any pages still attached to the inode and wait for any IO that
+ * is still in progress before finally destroying the inode.
+ *
+ * An inode must already be marked I_FREEING so that we avoid the inode being
+ * moved back onto lists if we race with other code that manipulates the lists
+ * (e.g. writeback_single_inode). The caller is responsible for setting this.
+ *
+ * An inode must already be removed from the LRU list before being evicted from
+ * the cache. This should occur atomically with setting the I_FREEING state
+ * flag, so no inodes here should ever be on the LRU when being evicted.
+ */
 static void evict(struct inode *inode)
 {
 	const struct super_operations *op = inode->i_sb->s_op;
 
+	BUG_ON(!(inode->i_state & I_FREEING));
+	BUG_ON(!list_empty(&inode->i_lru));
+
+	spin_lock(&inode_lock);
+	list_del_init(&inode->i_wb_list);
+	__inode_sb_list_del(inode);
+	spin_unlock(&inode_lock);
+
 	if (op->evict_inode) {
 		op->evict_inode(inode);
 	} else {
@@ -477,6 +487,15 @@ static void evict(struct inode *inode)
 		bd_forget(inode);
 	if (S_ISCHR(inode->i_mode) && inode->i_cdev)
 		cd_forget(inode);
+
+	remove_inode_hash(inode);
+
+	spin_lock(&inode->i_lock);
+	wake_up_bit(&inode->i_state, __I_NEW);
+	BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
+	spin_unlock(&inode->i_lock);
+
+	destroy_inode(inode);
 }
 
 /*
@@ -495,16 +514,6 @@ static void dispose_list(struct list_head *head)
 		list_del_init(&inode->i_lru);
 
 		evict(inode);
-
-		spin_lock(&inode_lock);
-		__remove_inode_hash(inode);
-		__inode_sb_list_del(inode);
-		spin_unlock(&inode_lock);
-
-		spin_lock(&inode->i_lock);
-		wake_up_bit(&inode->i_state, __I_NEW);
-		spin_unlock(&inode->i_lock);
-		destroy_inode(inode);
 	}
 }
 
@@ -537,13 +546,7 @@ void evict_inodes(struct super_block *sb)
 		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
 			inodes_stat.nr_unused--;
 		spin_unlock(&inode->i_lock);
-
-		/*
-		 * Move the inode off the IO lists and LRU once I_FREEING is
-		 * set so that it won't get moved back on there if it is dirty.
-		 */
 		list_move(&inode->i_lru, &dispose);
-		list_del_init(&inode->i_wb_list);
 	}
 	spin_unlock(&inode_lock);
 
@@ -596,13 +599,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
 		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
 			inodes_stat.nr_unused--;
 		spin_unlock(&inode->i_lock);
-
-		/*
-		 * Move the inode off the IO lists and LRU once I_FREEING is
-		 * set so that it won't get moved back on there if it is dirty.
-		 */
 		list_move(&inode->i_lru, &dispose);
-		list_del_init(&inode->i_wb_list);
 	}
 	spin_unlock(&inode_lock);
 
@@ -699,12 +696,7 @@ static void prune_icache(int nr_to_scan)
 		inode->i_state |= I_FREEING;
 		spin_unlock(&inode->i_lock);
 
-		/*
-		 * Move the inode off the IO lists and LRU once I_FREEING is
-		 * set so that it won't get moved back on there if it is dirty.
-		 */
 		list_move(&inode->i_lru, &freeable);
-		list_del_init(&inode->i_wb_list);
 		inodes_stat.nr_unused--;
 	}
 	if (current_is_kswapd())
@@ -1434,16 +1426,16 @@ static void iput_final(struct inode *inode)
 	else
 		drop = generic_drop_inode(inode);
 
+	if (!drop && (sb->s_flags & MS_ACTIVE)) {
+		inode->i_state |= I_REFERENCED;
+		if (!(inode->i_state & (I_DIRTY|I_SYNC)))
+			inode_lru_list_add(inode);
+		spin_unlock(&inode->i_lock);
+		spin_unlock(&inode_lock);
+		return;
+	}
+
 	if (!drop) {
-		if (sb->s_flags & MS_ACTIVE) {
-			inode->i_state |= I_REFERENCED;
-			if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
-				inode_lru_list_add(inode);
-			}
-			spin_unlock(&inode->i_lock);
-			spin_unlock(&inode_lock);
-			return;
-		}
 		inode->i_state |= I_WILL_FREE;
 		spin_unlock(&inode->i_lock);
 		spin_unlock(&inode_lock);
@@ -1452,28 +1444,14 @@ static void iput_final(struct inode *inode)
 		spin_lock(&inode->i_lock);
 		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state &= ~I_WILL_FREE;
-		__remove_inode_hash(inode);
 	}
 
 	inode->i_state |= I_FREEING;
-	spin_unlock(&inode->i_lock);
-
-	/*
-	 * Move the inode off the IO lists and LRU once I_FREEING is
-	 * set so that it won't get moved back on there if it is dirty.
-	 */
 	inode_lru_list_del(inode);
-	list_del_init(&inode->i_wb_list);
-
-	__inode_sb_list_del(inode);
+	spin_unlock(&inode->i_lock);
 	spin_unlock(&inode_lock);
+
 	evict(inode);
-	remove_inode_hash(inode);
-	spin_lock(&inode->i_lock);
-	wake_up_bit(&inode->i_state, __I_NEW);
-	BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
-	spin_unlock(&inode->i_lock);
-	destroy_inode(inode);
 }
 
 /**
-- 
cgit v1.2.3


From 02afc410f363f98ac4f186341e38dcec13fc0e60 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 22 Mar 2011 22:23:38 +1100
Subject: fs: Lock the inode LRU list separately

Introduce the inode_lru_lock to protect the inode_lru list. This
lock is nested inside the inode->i_lock to allow the inode to be
added to the LRU list in iput_final without needing to deal with
lock inversions. This keeps iput_final() clean and neat.

Further, where marking the inode I_FREEING and removing it from the
LRU, move the LRU list manipulation within the inode->i_lock to keep
the list manipulation consistent with iput_final. This also means
that most of the open coded LRU list removal + unused inode
accounting can now use the inode_lru_list_del() wrapper, which
cleans the code up further.

However, this locking change means that the LRU traversal in
prune_icache() inverts this lock ordering and needs to use trylock
semantics on the inode->i_lock to avoid deadlocking. In these cases,
if we fail to lock the inode we move it to the back of the LRU to
prevent spinning on it.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c | 39 ++++++++++++++++++++++++++++++---------
 1 file changed, 30 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index f752a959254..b19cb6ee6ca 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -32,10 +32,13 @@
  *
  * inode->i_lock protects:
  *   inode->i_state, inode->i_hash, __iget()
+ * inode_lru_lock protects:
+ *   inode_lru, inode->i_lru
  *
  * Lock ordering:
  * inode_lock
  *   inode->i_lock
+ *     inode_lru_lock
  */
 
 /*
@@ -85,6 +88,7 @@ static unsigned int i_hash_shift __read_mostly;
  */
 
 static LIST_HEAD(inode_lru);
+static DEFINE_SPINLOCK(inode_lru_lock);
 static struct hlist_head *inode_hashtable __read_mostly;
 
 /*
@@ -356,18 +360,22 @@ EXPORT_SYMBOL(ihold);
 
 static void inode_lru_list_add(struct inode *inode)
 {
+	spin_lock(&inode_lru_lock);
 	if (list_empty(&inode->i_lru)) {
 		list_add(&inode->i_lru, &inode_lru);
 		inodes_stat.nr_unused++;
 	}
+	spin_unlock(&inode_lru_lock);
 }
 
 static void inode_lru_list_del(struct inode *inode)
 {
+	spin_lock(&inode_lru_lock);
 	if (!list_empty(&inode->i_lru)) {
 		list_del_init(&inode->i_lru);
 		inodes_stat.nr_unused--;
 	}
+	spin_unlock(&inode_lru_lock);
 }
 
 static inline void __inode_sb_list_add(struct inode *inode)
@@ -543,10 +551,9 @@ void evict_inodes(struct super_block *sb)
 		}
 
 		inode->i_state |= I_FREEING;
-		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
-			inodes_stat.nr_unused--;
+		inode_lru_list_del(inode);
 		spin_unlock(&inode->i_lock);
-		list_move(&inode->i_lru, &dispose);
+		list_add(&inode->i_lru, &dispose);
 	}
 	spin_unlock(&inode_lock);
 
@@ -596,10 +603,9 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
 		}
 
 		inode->i_state |= I_FREEING;
-		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
-			inodes_stat.nr_unused--;
+		inode_lru_list_del(inode);
 		spin_unlock(&inode->i_lock);
-		list_move(&inode->i_lru, &dispose);
+		list_add(&inode->i_lru, &dispose);
 	}
 	spin_unlock(&inode_lock);
 
@@ -623,7 +629,7 @@ static int can_unuse(struct inode *inode)
 
 /*
  * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
- * temporary list and then are freed outside inode_lock by dispose_list().
+ * temporary list and then are freed outside inode_lru_lock by dispose_list().
  *
  * Any inodes which are pinned purely because of attached pagecache have their
  * pagecache removed.  If the inode has metadata buffers attached to
@@ -645,6 +651,7 @@ static void prune_icache(int nr_to_scan)
 
 	down_read(&iprune_sem);
 	spin_lock(&inode_lock);
+	spin_lock(&inode_lru_lock);
 	for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
 		struct inode *inode;
 
@@ -653,11 +660,20 @@ static void prune_icache(int nr_to_scan)
 
 		inode = list_entry(inode_lru.prev, struct inode, i_lru);
 
+		/*
+		 * we are inverting the inode_lru_lock/inode->i_lock here,
+		 * so use a trylock. If we fail to get the lock, just move the
+		 * inode to the back of the list so we don't spin on it.
+		 */
+		if (!spin_trylock(&inode->i_lock)) {
+			list_move(&inode->i_lru, &inode_lru);
+			continue;
+		}
+
 		/*
 		 * Referenced or dirty inodes are still in use. Give them
 		 * another pass through the LRU as we canot reclaim them now.
 		 */
-		spin_lock(&inode->i_lock);
 		if (atomic_read(&inode->i_count) ||
 		    (inode->i_state & ~I_REFERENCED)) {
 			spin_unlock(&inode->i_lock);
@@ -676,17 +692,21 @@ static void prune_icache(int nr_to_scan)
 		if (inode_has_buffers(inode) || inode->i_data.nrpages) {
 			__iget(inode);
 			spin_unlock(&inode->i_lock);
+			spin_unlock(&inode_lru_lock);
 			spin_unlock(&inode_lock);
 			if (remove_inode_buffers(inode))
 				reap += invalidate_mapping_pages(&inode->i_data,
 								0, -1);
 			iput(inode);
 			spin_lock(&inode_lock);
+			spin_lock(&inode_lru_lock);
 
 			if (inode != list_entry(inode_lru.next,
 						struct inode, i_lru))
 				continue;	/* wrong inode or list_empty */
-			spin_lock(&inode->i_lock);
+			/* avoid lock inversions with trylock */
+			if (!spin_trylock(&inode->i_lock))
+				continue;
 			if (!can_unuse(inode)) {
 				spin_unlock(&inode->i_lock);
 				continue;
@@ -703,6 +723,7 @@ static void prune_icache(int nr_to_scan)
 		__count_vm_events(KSWAPD_INODESTEAL, reap);
 	else
 		__count_vm_events(PGINODESTEAL, reap);
+	spin_unlock(&inode_lru_lock);
 	spin_unlock(&inode_lock);
 
 	dispose_list(&freeable);
-- 
cgit v1.2.3


From f283c86afe6aa70b733d1ecebad5d9464943b774 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 22 Mar 2011 22:23:39 +1100
Subject: fs: remove inode_lock from iput_final and prune_icache

Now that inode state changes are protected by the inode->i_lock and
the inode LRU manipulations by the inode_lru_lock, we can remove the
inode_lock from prune_icache and the initial part of iput_final().

Instead of using the inode_lock to protect the inode during
iput_final, use the inode->i_lock. This protects the inode
against new references being taken while we change the inode state
to I_FREEING, as well as preventing prune_icache from grabbing the
inode while we are manipulating it. Hence we no longer need the
inode_lock in iput_final prior to setting I_FREEING on the inode.

For prune_icache, we no longer need the inode_lock to protect the
LRU list, and the inodes themselves are protected against freeing
races by the inode->i_lock. Hence we can lift the inode_lock from
prune_icache as well.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c       | 17 +++--------------
 fs/logfs/inode.c |  2 +-
 2 files changed, 4 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index b19cb6ee6ca..389f5a24759 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -650,7 +650,6 @@ static void prune_icache(int nr_to_scan)
 	unsigned long reap = 0;
 
 	down_read(&iprune_sem);
-	spin_lock(&inode_lock);
 	spin_lock(&inode_lru_lock);
 	for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
 		struct inode *inode;
@@ -676,8 +675,8 @@ static void prune_icache(int nr_to_scan)
 		 */
 		if (atomic_read(&inode->i_count) ||
 		    (inode->i_state & ~I_REFERENCED)) {
-			spin_unlock(&inode->i_lock);
 			list_del_init(&inode->i_lru);
+			spin_unlock(&inode->i_lock);
 			inodes_stat.nr_unused--;
 			continue;
 		}
@@ -685,20 +684,18 @@ static void prune_icache(int nr_to_scan)
 		/* recently referenced inodes get one more pass */
 		if (inode->i_state & I_REFERENCED) {
 			inode->i_state &= ~I_REFERENCED;
-			spin_unlock(&inode->i_lock);
 			list_move(&inode->i_lru, &inode_lru);
+			spin_unlock(&inode->i_lock);
 			continue;
 		}
 		if (inode_has_buffers(inode) || inode->i_data.nrpages) {
 			__iget(inode);
 			spin_unlock(&inode->i_lock);
 			spin_unlock(&inode_lru_lock);
-			spin_unlock(&inode_lock);
 			if (remove_inode_buffers(inode))
 				reap += invalidate_mapping_pages(&inode->i_data,
 								0, -1);
 			iput(inode);
-			spin_lock(&inode_lock);
 			spin_lock(&inode_lru_lock);
 
 			if (inode != list_entry(inode_lru.next,
@@ -724,7 +721,6 @@ static void prune_icache(int nr_to_scan)
 	else
 		__count_vm_events(PGINODESTEAL, reap);
 	spin_unlock(&inode_lru_lock);
-	spin_unlock(&inode_lock);
 
 	dispose_list(&freeable);
 	up_read(&iprune_sem);
@@ -1082,7 +1078,6 @@ EXPORT_SYMBOL(iunique);
 
 struct inode *igrab(struct inode *inode)
 {
-	spin_lock(&inode_lock);
 	spin_lock(&inode->i_lock);
 	if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) {
 		__iget(inode);
@@ -1096,7 +1091,6 @@ struct inode *igrab(struct inode *inode)
 		 */
 		inode = NULL;
 	}
-	spin_unlock(&inode_lock);
 	return inode;
 }
 EXPORT_SYMBOL(igrab);
@@ -1439,7 +1433,6 @@ static void iput_final(struct inode *inode)
 	const struct super_operations *op = inode->i_sb->s_op;
 	int drop;
 
-	spin_lock(&inode->i_lock);
 	WARN_ON(inode->i_state & I_NEW);
 
 	if (op && op->drop_inode)
@@ -1452,16 +1445,13 @@ static void iput_final(struct inode *inode)
 		if (!(inode->i_state & (I_DIRTY|I_SYNC)))
 			inode_lru_list_add(inode);
 		spin_unlock(&inode->i_lock);
-		spin_unlock(&inode_lock);
 		return;
 	}
 
 	if (!drop) {
 		inode->i_state |= I_WILL_FREE;
 		spin_unlock(&inode->i_lock);
-		spin_unlock(&inode_lock);
 		write_inode_now(inode, 1);
-		spin_lock(&inode_lock);
 		spin_lock(&inode->i_lock);
 		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state &= ~I_WILL_FREE;
@@ -1470,7 +1460,6 @@ static void iput_final(struct inode *inode)
 	inode->i_state |= I_FREEING;
 	inode_lru_list_del(inode);
 	spin_unlock(&inode->i_lock);
-	spin_unlock(&inode_lock);
 
 	evict(inode);
 }
@@ -1489,7 +1478,7 @@ void iput(struct inode *inode)
 	if (inode) {
 		BUG_ON(inode->i_state & I_CLEAR);
 
-		if (atomic_dec_and_lock(&inode->i_count, &inode_lock))
+		if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock))
 			iput_final(inode);
 	}
 }
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 03b8c240aed..edfea7a3a74 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -293,7 +293,7 @@ static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	return ret;
 }
 
-/* called with inode_lock held */
+/* called with inode->i_lock held */
 static int logfs_drop_inode(struct inode *inode)
 {
 	struct logfs_super *super = logfs_super(inode->i_sb);
-- 
cgit v1.2.3


From 55fa6091d83160ca772fc37cebae45d42695a708 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 22 Mar 2011 22:23:40 +1100
Subject: fs: move i_sb_list out from under inode_lock

Protect the per-sb inode list with a new global lock
inode_sb_list_lock and use it to protect the list manipulations and
traversals. This lock replaces the inode_lock as the inodes on the
list can be validity checked while holding the inode->i_lock and
hence the inode_lock is no longer needed to protect the list.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/drop_caches.c       |  9 +++++----
 fs/fs-writeback.c      | 21 +++++++++++----------
 fs/inode.c             | 43 +++++++++++++++++++++++--------------------
 fs/internal.h          |  2 ++
 fs/notify/inode_mark.c | 20 ++++++++++----------
 fs/quota/dquot.c       | 28 ++++++++++++++++------------
 6 files changed, 67 insertions(+), 56 deletions(-)

(limited to 'fs')

diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 6c6f73ba086..98b77c89494 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -8,6 +8,7 @@
 #include <linux/writeback.h>
 #include <linux/sysctl.h>
 #include <linux/gfp.h>
+#include "internal.h"
 
 /* A global variable is a bit ugly, but it keeps the code simple */
 int sysctl_drop_caches;
@@ -16,7 +17,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
 {
 	struct inode *inode, *toput_inode = NULL;
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_sb_list_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 		spin_lock(&inode->i_lock);
 		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
@@ -26,13 +27,13 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
 		}
 		__iget(inode);
 		spin_unlock(&inode->i_lock);
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode_sb_list_lock);
 		invalidate_mapping_pages(inode->i_mapping, 0, -1);
 		iput(toput_inode);
 		toput_inode = inode;
-		spin_lock(&inode_lock);
+		spin_lock(&inode_sb_list_lock);
 	}
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_sb_list_lock);
 	iput(toput_inode);
 }
 
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index efd1ebe879c..5de56a2182b 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1123,7 +1123,7 @@ static void wait_sb_inodes(struct super_block *sb)
 	 */
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_sb_list_lock);
 
 	/*
 	 * Data integrity sync. Must wait for all pages under writeback,
@@ -1143,14 +1143,15 @@ static void wait_sb_inodes(struct super_block *sb)
 		}
 		__iget(inode);
 		spin_unlock(&inode->i_lock);
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode_sb_list_lock);
+
 		/*
-		 * We hold a reference to 'inode' so it couldn't have
-		 * been removed from s_inodes list while we dropped the
-		 * inode_lock.  We cannot iput the inode now as we can
-		 * be holding the last reference and we cannot iput it
-		 * under inode_lock. So we keep the reference and iput
-		 * it later.
+		 * We hold a reference to 'inode' so it couldn't have been
+		 * removed from s_inodes list while we dropped the
+		 * inode_sb_list_lock.  We cannot iput the inode now as we can
+		 * be holding the last reference and we cannot iput it under
+		 * inode_sb_list_lock. So we keep the reference and iput it
+		 * later.
 		 */
 		iput(old_inode);
 		old_inode = inode;
@@ -1159,9 +1160,9 @@ static void wait_sb_inodes(struct super_block *sb)
 
 		cond_resched();
 
-		spin_lock(&inode_lock);
+		spin_lock(&inode_sb_list_lock);
 	}
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_sb_list_lock);
 	iput(old_inode);
 }
 
diff --git a/fs/inode.c b/fs/inode.c
index 389f5a24759..785b1ab23ff 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -34,10 +34,15 @@
  *   inode->i_state, inode->i_hash, __iget()
  * inode_lru_lock protects:
  *   inode_lru, inode->i_lru
+ * inode_sb_list_lock protects:
+ *   sb->s_inodes, inode->i_sb_list
  *
  * Lock ordering:
  * inode_lock
  *   inode->i_lock
+ *
+ * inode_sb_list_lock
+ *   inode->i_lock
  *     inode_lru_lock
  */
 
@@ -99,6 +104,8 @@ static struct hlist_head *inode_hashtable __read_mostly;
  */
 DEFINE_SPINLOCK(inode_lock);
 
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
+
 /*
  * iprune_sem provides exclusion between the icache shrinking and the
  * umount path.
@@ -378,26 +385,23 @@ static void inode_lru_list_del(struct inode *inode)
 	spin_unlock(&inode_lru_lock);
 }
 
-static inline void __inode_sb_list_add(struct inode *inode)
-{
-	list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
-}
-
 /**
  * inode_sb_list_add - add inode to the superblock list of inodes
  * @inode: inode to add
  */
 void inode_sb_list_add(struct inode *inode)
 {
-	spin_lock(&inode_lock);
-	__inode_sb_list_add(inode);
-	spin_unlock(&inode_lock);
+	spin_lock(&inode_sb_list_lock);
+	list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
+	spin_unlock(&inode_sb_list_lock);
 }
 EXPORT_SYMBOL_GPL(inode_sb_list_add);
 
-static inline void __inode_sb_list_del(struct inode *inode)
+static inline void inode_sb_list_del(struct inode *inode)
 {
+	spin_lock(&inode_sb_list_lock);
 	list_del_init(&inode->i_sb_list);
+	spin_unlock(&inode_sb_list_lock);
 }
 
 static unsigned long hash(struct super_block *sb, unsigned long hashval)
@@ -481,9 +485,10 @@ static void evict(struct inode *inode)
 
 	spin_lock(&inode_lock);
 	list_del_init(&inode->i_wb_list);
-	__inode_sb_list_del(inode);
 	spin_unlock(&inode_lock);
 
+	inode_sb_list_del(inode);
+
 	if (op->evict_inode) {
 		op->evict_inode(inode);
 	} else {
@@ -539,7 +544,7 @@ void evict_inodes(struct super_block *sb)
 	struct inode *inode, *next;
 	LIST_HEAD(dispose);
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_sb_list_lock);
 	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
 		if (atomic_read(&inode->i_count))
 			continue;
@@ -555,7 +560,7 @@ void evict_inodes(struct super_block *sb)
 		spin_unlock(&inode->i_lock);
 		list_add(&inode->i_lru, &dispose);
 	}
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_sb_list_lock);
 
 	dispose_list(&dispose);
 
@@ -584,7 +589,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
 	struct inode *inode, *next;
 	LIST_HEAD(dispose);
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_sb_list_lock);
 	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
 		spin_lock(&inode->i_lock);
 		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
@@ -607,7 +612,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
 		spin_unlock(&inode->i_lock);
 		list_add(&inode->i_lru, &dispose);
 	}
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_sb_list_lock);
 
 	dispose_list(&dispose);
 
@@ -867,16 +872,14 @@ struct inode *new_inode(struct super_block *sb)
 {
 	struct inode *inode;
 
-	spin_lock_prefetch(&inode_lock);
+	spin_lock_prefetch(&inode_sb_list_lock);
 
 	inode = alloc_inode(sb);
 	if (inode) {
-		spin_lock(&inode_lock);
 		spin_lock(&inode->i_lock);
 		inode->i_state = 0;
 		spin_unlock(&inode->i_lock);
-		__inode_sb_list_add(inode);
-		spin_unlock(&inode_lock);
+		inode_sb_list_add(inode);
 	}
 	return inode;
 }
@@ -945,7 +948,7 @@ static struct inode *get_new_inode(struct super_block *sb,
 			inode->i_state = I_NEW;
 			hlist_add_head(&inode->i_hash, head);
 			spin_unlock(&inode->i_lock);
-			__inode_sb_list_add(inode);
+			inode_sb_list_add(inode);
 			spin_unlock(&inode_lock);
 
 			/* Return the locked inode with I_NEW set, the
@@ -994,7 +997,7 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
 			inode->i_state = I_NEW;
 			hlist_add_head(&inode->i_hash, head);
 			spin_unlock(&inode->i_lock);
-			__inode_sb_list_add(inode);
+			inode_sb_list_add(inode);
 			spin_unlock(&inode_lock);
 
 			/* Return the locked inode with I_NEW set, the
diff --git a/fs/internal.h b/fs/internal.h
index 8318059b42c..7013ae0c88c 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -125,6 +125,8 @@ extern long do_handle_open(int mountdirfd,
 /*
  * inode.c
  */
+extern spinlock_t inode_sb_list_lock;
+
 extern int get_nr_dirty_inodes(void);
 extern void evict_inodes(struct super_block *);
 extern int invalidate_inodes(struct super_block *, bool);
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 4dd53fb4412..fb3b3c5ef0e 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -29,6 +29,8 @@
 #include <linux/fsnotify_backend.h>
 #include "fsnotify.h"
 
+#include "../internal.h"
+
 /*
  * Recalculate the mask of events relevant to a given inode locked.
  */
@@ -237,15 +239,14 @@ out:
  * fsnotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
  * @list: list of inodes being unmounted (sb->s_inodes)
  *
- * Called with inode_lock held, protecting the unmounting super block's list
- * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay.
- * We temporarily drop inode_lock, however, and CAN block.
+ * Called during unmount with no locks held, so needs to be safe against
+ * concurrent modifiers. We temporarily drop inode_sb_list_lock and CAN block.
  */
 void fsnotify_unmount_inodes(struct list_head *list)
 {
 	struct inode *inode, *next_i, *need_iput = NULL;
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_sb_list_lock);
 	list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
 		struct inode *need_iput_tmp;
 
@@ -293,12 +294,11 @@ void fsnotify_unmount_inodes(struct list_head *list)
 		}
 
 		/*
-		 * We can safely drop inode_lock here because we hold
+		 * We can safely drop inode_sb_list_lock here because we hold
 		 * references on both inode and next_i.  Also no new inodes
-		 * will be added since the umount has begun.  Finally,
-		 * iprune_mutex keeps shrink_icache_memory() away.
+		 * will be added since the umount has begun.
 		 */
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode_sb_list_lock);
 
 		if (need_iput_tmp)
 			iput(need_iput_tmp);
@@ -310,7 +310,7 @@ void fsnotify_unmount_inodes(struct list_head *list)
 
 		iput(inode);
 
-		spin_lock(&inode_lock);
+		spin_lock(&inode_sb_list_lock);
 	}
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_sb_list_lock);
 }
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index a1470fda366..fcc8ae75d87 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -76,7 +76,7 @@
 #include <linux/buffer_head.h>
 #include <linux/capability.h>
 #include <linux/quotaops.h>
-#include <linux/writeback.h> /* for inode_lock, oddly enough.. */
+#include "../internal.h" /* ugh */
 
 #include <asm/uaccess.h>
 
@@ -900,7 +900,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
 	int reserved = 0;
 #endif
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_sb_list_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 		spin_lock(&inode->i_lock);
 		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
@@ -915,19 +915,23 @@ static void add_dquot_ref(struct super_block *sb, int type)
 #endif
 		__iget(inode);
 		spin_unlock(&inode->i_lock);
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode_sb_list_lock);
 
 		iput(old_inode);
 		__dquot_initialize(inode, type);
-		/* We hold a reference to 'inode' so it couldn't have been
-		 * removed from s_inodes list while we dropped the inode_lock.
-		 * We cannot iput the inode now as we can be holding the last
-		 * reference and we cannot iput it under inode_lock. So we
-		 * keep the reference and iput it later. */
+
+		/*
+		 * We hold a reference to 'inode' so it couldn't have been
+		 * removed from s_inodes list while we dropped the
+		 * inode_sb_list_lock. We cannot iput the inode now as we can be
+		 * holding the last reference and we cannot iput it under
+		 * inode_sb_list_lock. So we keep the reference and iput it
+		 * later.
+		 */
 		old_inode = inode;
-		spin_lock(&inode_lock);
+		spin_lock(&inode_sb_list_lock);
 	}
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_sb_list_lock);
 	iput(old_inode);
 
 #ifdef CONFIG_QUOTA_DEBUG
@@ -1008,7 +1012,7 @@ static void remove_dquot_ref(struct super_block *sb, int type,
 	struct inode *inode;
 	int reserved = 0;
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_sb_list_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 		/*
 		 *  We have to scan also I_NEW inodes because they can already
@@ -1022,7 +1026,7 @@ static void remove_dquot_ref(struct super_block *sb, int type,
 			remove_inode_dquot_ref(inode, type, tofree_head);
 		}
 	}
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_sb_list_lock);
 #ifdef CONFIG_QUOTA_DEBUG
 	if (reserved) {
 		printk(KERN_WARNING "VFS (%s): Writes happened after quota"
-- 
cgit v1.2.3


From a66979abad090b2765a6c6790c9fdeab996833f2 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 22 Mar 2011 22:23:41 +1100
Subject: fs: move i_wb_list out from under inode_lock

Protect the inode writeback list with a new global lock
inode_wb_list_lock and use it to protect the list manipulations and
traversals. This lock replaces the inode_lock as the inodes on the
list can be validity checked while holding the inode->i_lock and
hence the inode_lock is no longer needed to protect the list.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/block_dev.c    |  4 +--
 fs/fs-writeback.c | 76 ++++++++++++++++++++++++++++++++-----------------------
 fs/inode.c        | 12 ++++++---
 fs/internal.h     |  5 ++++
 4 files changed, 59 insertions(+), 38 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index bc39b18cf3d..2bbc0e62102 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -55,13 +55,13 @@ EXPORT_SYMBOL(I_BDEV);
 static void bdev_inode_switch_bdi(struct inode *inode,
 			struct backing_dev_info *dst)
 {
-	spin_lock(&inode_lock);
+	spin_lock(&inode_wb_list_lock);
 	spin_lock(&inode->i_lock);
 	inode->i_data.backing_dev_info = dst;
 	if (inode->i_state & I_DIRTY)
 		list_move(&inode->i_wb_list, &dst->wb.b_dirty);
 	spin_unlock(&inode->i_lock);
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_wb_list_lock);
 }
 
 static sector_t max_block(struct block_device *bdev)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5de56a2182b..ed800656356 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -175,6 +175,17 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
 	spin_unlock_bh(&bdi->wb_lock);
 }
 
+/*
+ * Remove the inode from the writeback list it is on.
+ */
+void inode_wb_list_del(struct inode *inode)
+{
+	spin_lock(&inode_wb_list_lock);
+	list_del_init(&inode->i_wb_list);
+	spin_unlock(&inode_wb_list_lock);
+}
+
+
 /*
  * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
  * furthest end of its superblock's dirty-inode list.
@@ -188,6 +199,7 @@ static void redirty_tail(struct inode *inode)
 {
 	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
 
+	assert_spin_locked(&inode_wb_list_lock);
 	if (!list_empty(&wb->b_dirty)) {
 		struct inode *tail;
 
@@ -205,14 +217,17 @@ static void requeue_io(struct inode *inode)
 {
 	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
 
+	assert_spin_locked(&inode_wb_list_lock);
 	list_move(&inode->i_wb_list, &wb->b_more_io);
 }
 
 static void inode_sync_complete(struct inode *inode)
 {
 	/*
-	 * Prevent speculative execution through spin_unlock(&inode_lock);
+	 * Prevent speculative execution through
+	 * spin_unlock(&inode_wb_list_lock);
 	 */
+
 	smp_mb();
 	wake_up_bit(&inode->i_state, __I_SYNC);
 }
@@ -286,6 +301,7 @@ static void move_expired_inodes(struct list_head *delaying_queue,
  */
 static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
 {
+	assert_spin_locked(&inode_wb_list_lock);
 	list_splice_init(&wb->b_more_io, &wb->b_io);
 	move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
 }
@@ -308,25 +324,23 @@ static void inode_wait_for_writeback(struct inode *inode)
 	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
 	while (inode->i_state & I_SYNC) {
 		spin_unlock(&inode->i_lock);
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode_wb_list_lock);
 		__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
-		spin_lock(&inode_lock);
+		spin_lock(&inode_wb_list_lock);
 		spin_lock(&inode->i_lock);
 	}
 }
 
 /*
- * Write out an inode's dirty pages.  Called under inode_lock.  Either the
- * caller has ref on the inode (either via __iget or via syscall against an fd)
- * or the inode has I_WILL_FREE set (via generic_forget_inode)
+ * Write out an inode's dirty pages.  Called under inode_wb_list_lock.  Either
+ * the caller has an active reference on the inode or the inode has I_WILL_FREE
+ * set.
  *
  * If `wait' is set, wait on the writeout.
  *
  * The whole writeout design is quite complex and fragile.  We want to avoid
  * starvation of particular inodes when others are being redirtied, prevent
  * livelocks, etc.
- *
- * Called under inode_lock.
  */
 static int
 writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
@@ -368,7 +382,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	inode->i_state |= I_SYNC;
 	inode->i_state &= ~I_DIRTY_PAGES;
 	spin_unlock(&inode->i_lock);
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_wb_list_lock);
 
 	ret = do_writepages(mapping, wbc);
 
@@ -388,12 +402,10 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	 * due to delalloc, clear dirty metadata flags right before
 	 * write_inode()
 	 */
-	spin_lock(&inode_lock);
 	spin_lock(&inode->i_lock);
 	dirty = inode->i_state & I_DIRTY;
 	inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
 	spin_unlock(&inode->i_lock);
-	spin_unlock(&inode_lock);
 	/* Don't write the inode if only I_DIRTY_PAGES was set */
 	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
 		int err = write_inode(inode, wbc);
@@ -401,7 +413,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 			ret = err;
 	}
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_wb_list_lock);
 	spin_lock(&inode->i_lock);
 	inode->i_state &= ~I_SYNC;
 	if (!(inode->i_state & I_FREEING)) {
@@ -543,10 +555,10 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 			 */
 			redirty_tail(inode);
 		}
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode_wb_list_lock);
 		iput(inode);
 		cond_resched();
-		spin_lock(&inode_lock);
+		spin_lock(&inode_wb_list_lock);
 		if (wbc->nr_to_write <= 0) {
 			wbc->more_io = 1;
 			return 1;
@@ -565,7 +577,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
 
 	if (!wbc->wb_start)
 		wbc->wb_start = jiffies; /* livelock avoidance */
-	spin_lock(&inode_lock);
+	spin_lock(&inode_wb_list_lock);
 	if (!wbc->for_kupdate || list_empty(&wb->b_io))
 		queue_io(wb, wbc->older_than_this);
 
@@ -583,7 +595,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
 		if (ret)
 			break;
 	}
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_wb_list_lock);
 	/* Leave any unwritten inodes on b_io */
 }
 
@@ -592,11 +604,11 @@ static void __writeback_inodes_sb(struct super_block *sb,
 {
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_wb_list_lock);
 	if (!wbc->for_kupdate || list_empty(&wb->b_io))
 		queue_io(wb, wbc->older_than_this);
 	writeback_sb_inodes(sb, wb, wbc, true);
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_wb_list_lock);
 }
 
 /*
@@ -735,7 +747,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 		 * become available for writeback. Otherwise
 		 * we'll just busyloop.
 		 */
-		spin_lock(&inode_lock);
+		spin_lock(&inode_wb_list_lock);
 		if (!list_empty(&wb->b_more_io))  {
 			inode = wb_inode(wb->b_more_io.prev);
 			trace_wbc_writeback_wait(&wbc, wb->bdi);
@@ -743,7 +755,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 			inode_wait_for_writeback(inode);
 			spin_unlock(&inode->i_lock);
 		}
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode_wb_list_lock);
 	}
 
 	return wrote;
@@ -1009,7 +1021,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 {
 	struct super_block *sb = inode->i_sb;
 	struct backing_dev_info *bdi = NULL;
-	bool wakeup_bdi = false;
 
 	/*
 	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
@@ -1033,7 +1044,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 	if (unlikely(block_dump))
 		block_dump___mark_inode_dirty(inode);
 
-	spin_lock(&inode_lock);
 	spin_lock(&inode->i_lock);
 	if ((inode->i_state & flags) != flags) {
 		const int was_dirty = inode->i_state & I_DIRTY;
@@ -1059,12 +1069,12 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 		if (inode->i_state & I_FREEING)
 			goto out_unlock_inode;
 
-		spin_unlock(&inode->i_lock);
 		/*
 		 * If the inode was already on b_dirty/b_io/b_more_io, don't
 		 * reposition it (that would break b_dirty time-ordering).
 		 */
 		if (!was_dirty) {
+			bool wakeup_bdi = false;
 			bdi = inode_to_bdi(inode);
 
 			if (bdi_cap_writeback_dirty(bdi)) {
@@ -1081,18 +1091,20 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 					wakeup_bdi = true;
 			}
 
+			spin_unlock(&inode->i_lock);
+			spin_lock(&inode_wb_list_lock);
 			inode->dirtied_when = jiffies;
 			list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
+			spin_unlock(&inode_wb_list_lock);
+
+			if (wakeup_bdi)
+				bdi_wakeup_thread_delayed(bdi);
+			return;
 		}
-		goto out;
 	}
 out_unlock_inode:
 	spin_unlock(&inode->i_lock);
-out:
-	spin_unlock(&inode_lock);
 
-	if (wakeup_bdi)
-		bdi_wakeup_thread_delayed(bdi);
 }
 EXPORT_SYMBOL(__mark_inode_dirty);
 
@@ -1296,9 +1308,9 @@ int write_inode_now(struct inode *inode, int sync)
 		wbc.nr_to_write = 0;
 
 	might_sleep();
-	spin_lock(&inode_lock);
+	spin_lock(&inode_wb_list_lock);
 	ret = writeback_single_inode(inode, &wbc);
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_wb_list_lock);
 	if (sync)
 		inode_sync_wait(inode);
 	return ret;
@@ -1320,9 +1332,9 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	int ret;
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_wb_list_lock);
 	ret = writeback_single_inode(inode, wbc);
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_wb_list_lock);
 	return ret;
 }
 EXPORT_SYMBOL(sync_inode);
diff --git a/fs/inode.c b/fs/inode.c
index 785b1ab23ff..239fdc08719 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -26,6 +26,7 @@
 #include <linux/posix_acl.h>
 #include <linux/ima.h>
 #include <linux/cred.h>
+#include "internal.h"
 
 /*
  * inode locking rules.
@@ -36,6 +37,8 @@
  *   inode_lru, inode->i_lru
  * inode_sb_list_lock protects:
  *   sb->s_inodes, inode->i_sb_list
+ * inode_wb_list_lock protects:
+ *   bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
  *
  * Lock ordering:
  * inode_lock
@@ -44,6 +47,9 @@
  * inode_sb_list_lock
  *   inode->i_lock
  *     inode_lru_lock
+ *
+ * inode_wb_list_lock
+ *   inode->i_lock
  */
 
 /*
@@ -105,6 +111,7 @@ static struct hlist_head *inode_hashtable __read_mostly;
 DEFINE_SPINLOCK(inode_lock);
 
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
 
 /*
  * iprune_sem provides exclusion between the icache shrinking and the
@@ -483,10 +490,7 @@ static void evict(struct inode *inode)
 	BUG_ON(!(inode->i_state & I_FREEING));
 	BUG_ON(!list_empty(&inode->i_lru));
 
-	spin_lock(&inode_lock);
-	list_del_init(&inode->i_wb_list);
-	spin_unlock(&inode_lock);
-
+	inode_wb_list_del(inode);
 	inode_sb_list_del(inode);
 
 	if (op->evict_inode) {
diff --git a/fs/internal.h b/fs/internal.h
index 7013ae0c88c..b29c46e4e32 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -127,6 +127,11 @@ extern long do_handle_open(int mountdirfd,
  */
 extern spinlock_t inode_sb_list_lock;
 
+/*
+ * fs-writeback.c
+ */
+extern void inode_wb_list_del(struct inode *inode);
+
 extern int get_nr_dirty_inodes(void);
 extern void evict_inodes(struct super_block *);
 extern int invalidate_inodes(struct super_block *, bool);
-- 
cgit v1.2.3


From 67a23c494621ff1d5431c3bc320947865b224625 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 22 Mar 2011 22:23:42 +1100
Subject: fs: rename inode_lock to inode_hash_lock

All that remains of the inode_lock is protecting the inode hash list
manipulation and traversals. Rename the inode_lock to
inode_hash_lock to reflect its actual function.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c                | 111 +++++++++++++++++++++++++---------------------
 fs/notify/inode_mark.c    |   1 -
 fs/notify/mark.c          |   1 -
 fs/notify/vfsmount_mark.c |   1 -
 fs/ntfs/inode.c           |   4 +-
 5 files changed, 63 insertions(+), 55 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index 239fdc08719..f9ee4928358 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -39,10 +39,10 @@
  *   sb->s_inodes, inode->i_sb_list
  * inode_wb_list_lock protects:
  *   bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
+ * inode_hash_lock protects:
+ *   inode_hashtable, inode->i_hash
  *
  * Lock ordering:
- * inode_lock
- *   inode->i_lock
  *
  * inode_sb_list_lock
  *   inode->i_lock
@@ -50,6 +50,13 @@
  *
  * inode_wb_list_lock
  *   inode->i_lock
+ *
+ * inode_hash_lock
+ *   inode_sb_list_lock
+ *   inode->i_lock
+ *
+ * iunique_lock
+ *   inode_hash_lock
  */
 
 /*
@@ -85,6 +92,8 @@
 
 static unsigned int i_hash_mask __read_mostly;
 static unsigned int i_hash_shift __read_mostly;
+static struct hlist_head *inode_hashtable __read_mostly;
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
 
 /*
  * Each inode can be on two separate lists. One is
@@ -100,15 +109,6 @@ static unsigned int i_hash_shift __read_mostly;
 
 static LIST_HEAD(inode_lru);
 static DEFINE_SPINLOCK(inode_lru_lock);
-static struct hlist_head *inode_hashtable __read_mostly;
-
-/*
- * A simple spinlock to protect the list manipulations.
- *
- * NOTE! You also have to own the lock if you change
- * the i_state of an inode while it is in use..
- */
-DEFINE_SPINLOCK(inode_lock);
 
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
@@ -433,11 +433,11 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval)
 {
 	struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_hash_lock);
 	spin_lock(&inode->i_lock);
 	hlist_add_head(&inode->i_hash, b);
 	spin_unlock(&inode->i_lock);
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_hash_lock);
 }
 EXPORT_SYMBOL(__insert_inode_hash);
 
@@ -449,11 +449,11 @@ EXPORT_SYMBOL(__insert_inode_hash);
  */
 void remove_inode_hash(struct inode *inode)
 {
-	spin_lock(&inode_lock);
+	spin_lock(&inode_hash_lock);
 	spin_lock(&inode->i_lock);
 	hlist_del_init(&inode->i_hash);
 	spin_unlock(&inode->i_lock);
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_hash_lock);
 }
 EXPORT_SYMBOL(remove_inode_hash);
 
@@ -778,11 +778,15 @@ static struct inode *find_inode(struct super_block *sb,
 
 repeat:
 	hlist_for_each_entry(inode, node, head, i_hash) {
-		if (inode->i_sb != sb)
+		spin_lock(&inode->i_lock);
+		if (inode->i_sb != sb) {
+			spin_unlock(&inode->i_lock);
 			continue;
-		if (!test(inode, data))
+		}
+		if (!test(inode, data)) {
+			spin_unlock(&inode->i_lock);
 			continue;
-		spin_lock(&inode->i_lock);
+		}
 		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
 			__wait_on_freeing_inode(inode);
 			goto repeat;
@@ -806,11 +810,15 @@ static struct inode *find_inode_fast(struct super_block *sb,
 
 repeat:
 	hlist_for_each_entry(inode, node, head, i_hash) {
-		if (inode->i_ino != ino)
+		spin_lock(&inode->i_lock);
+		if (inode->i_ino != ino) {
+			spin_unlock(&inode->i_lock);
 			continue;
-		if (inode->i_sb != sb)
+		}
+		if (inode->i_sb != sb) {
+			spin_unlock(&inode->i_lock);
 			continue;
-		spin_lock(&inode->i_lock);
+		}
 		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
 			__wait_on_freeing_inode(inode);
 			goto repeat;
@@ -924,7 +932,7 @@ void unlock_new_inode(struct inode *inode)
 EXPORT_SYMBOL(unlock_new_inode);
 
 /*
- * This is called without the inode lock held.. Be careful.
+ * This is called without the inode hash lock held.. Be careful.
  *
  * We no longer cache the sb_flags in i_flags - see fs.h
  *	-- rmk@arm.uk.linux.org
@@ -941,7 +949,7 @@ static struct inode *get_new_inode(struct super_block *sb,
 	if (inode) {
 		struct inode *old;
 
-		spin_lock(&inode_lock);
+		spin_lock(&inode_hash_lock);
 		/* We released the lock, so.. */
 		old = find_inode(sb, head, test, data);
 		if (!old) {
@@ -953,7 +961,7 @@ static struct inode *get_new_inode(struct super_block *sb,
 			hlist_add_head(&inode->i_hash, head);
 			spin_unlock(&inode->i_lock);
 			inode_sb_list_add(inode);
-			spin_unlock(&inode_lock);
+			spin_unlock(&inode_hash_lock);
 
 			/* Return the locked inode with I_NEW set, the
 			 * caller is responsible for filling in the contents
@@ -966,7 +974,7 @@ static struct inode *get_new_inode(struct super_block *sb,
 		 * us. Use the old inode instead of the one we just
 		 * allocated.
 		 */
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode_hash_lock);
 		destroy_inode(inode);
 		inode = old;
 		wait_on_inode(inode);
@@ -974,7 +982,7 @@ static struct inode *get_new_inode(struct super_block *sb,
 	return inode;
 
 set_failed:
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_hash_lock);
 	destroy_inode(inode);
 	return NULL;
 }
@@ -992,7 +1000,7 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
 	if (inode) {
 		struct inode *old;
 
-		spin_lock(&inode_lock);
+		spin_lock(&inode_hash_lock);
 		/* We released the lock, so.. */
 		old = find_inode_fast(sb, head, ino);
 		if (!old) {
@@ -1002,7 +1010,7 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
 			hlist_add_head(&inode->i_hash, head);
 			spin_unlock(&inode->i_lock);
 			inode_sb_list_add(inode);
-			spin_unlock(&inode_lock);
+			spin_unlock(&inode_hash_lock);
 
 			/* Return the locked inode with I_NEW set, the
 			 * caller is responsible for filling in the contents
@@ -1015,7 +1023,7 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
 		 * us. Use the old inode instead of the one we just
 		 * allocated.
 		 */
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode_hash_lock);
 		destroy_inode(inode);
 		inode = old;
 		wait_on_inode(inode);
@@ -1036,10 +1044,14 @@ static int test_inode_iunique(struct super_block *sb, unsigned long ino)
 	struct hlist_node *node;
 	struct inode *inode;
 
+	spin_lock(&inode_hash_lock);
 	hlist_for_each_entry(inode, node, b, i_hash) {
-		if (inode->i_ino == ino && inode->i_sb == sb)
+		if (inode->i_ino == ino && inode->i_sb == sb) {
+			spin_unlock(&inode_hash_lock);
 			return 0;
+		}
 	}
+	spin_unlock(&inode_hash_lock);
 
 	return 1;
 }
@@ -1069,7 +1081,6 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved)
 	static unsigned int counter;
 	ino_t res;
 
-	spin_lock(&inode_lock);
 	spin_lock(&iunique_lock);
 	do {
 		if (counter <= max_reserved)
@@ -1077,7 +1088,6 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved)
 		res = counter++;
 	} while (!test_inode_iunique(sb, res));
 	spin_unlock(&iunique_lock);
-	spin_unlock(&inode_lock);
 
 	return res;
 }
@@ -1119,7 +1129,7 @@ EXPORT_SYMBOL(igrab);
  *
  * Otherwise NULL is returned.
  *
- * Note, @test is called with the inode_lock held, so can't sleep.
+ * Note, @test is called with the inode_hash_lock held, so can't sleep.
  */
 static struct inode *ifind(struct super_block *sb,
 		struct hlist_head *head, int (*test)(struct inode *, void *),
@@ -1127,15 +1137,15 @@ static struct inode *ifind(struct super_block *sb,
 {
 	struct inode *inode;
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_hash_lock);
 	inode = find_inode(sb, head, test, data);
 	if (inode) {
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode_hash_lock);
 		if (likely(wait))
 			wait_on_inode(inode);
 		return inode;
 	}
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_hash_lock);
 	return NULL;
 }
 
@@ -1159,14 +1169,14 @@ static struct inode *ifind_fast(struct super_block *sb,
 {
 	struct inode *inode;
 
-	spin_lock(&inode_lock);
+	spin_lock(&inode_hash_lock);
 	inode = find_inode_fast(sb, head, ino);
 	if (inode) {
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode_hash_lock);
 		wait_on_inode(inode);
 		return inode;
 	}
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_hash_lock);
 	return NULL;
 }
 
@@ -1189,7 +1199,7 @@ static struct inode *ifind_fast(struct super_block *sb,
  *
  * Otherwise NULL is returned.
  *
- * Note, @test is called with the inode_lock held, so can't sleep.
+ * Note, @test is called with the inode_hash_lock held, so can't sleep.
  */
 struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
 		int (*test)(struct inode *, void *), void *data)
@@ -1217,7 +1227,7 @@ EXPORT_SYMBOL(ilookup5_nowait);
  *
  * Otherwise NULL is returned.
  *
- * Note, @test is called with the inode_lock held, so can't sleep.
+ * Note, @test is called with the inode_hash_lock held, so can't sleep.
  */
 struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
 		int (*test)(struct inode *, void *), void *data)
@@ -1268,7 +1278,8 @@ EXPORT_SYMBOL(ilookup);
  * inode and this is returned locked, hashed, and with the I_NEW flag set. The
  * file system gets to fill it in before unlocking it via unlock_new_inode().
  *
- * Note both @test and @set are called with the inode_lock held, so can't sleep.
+ * Note both @test and @set are called with the inode_hash_lock held, so can't
+ * sleep.
  */
 struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
 		int (*test)(struct inode *, void *),
@@ -1328,7 +1339,7 @@ int insert_inode_locked(struct inode *inode)
 	while (1) {
 		struct hlist_node *node;
 		struct inode *old = NULL;
-		spin_lock(&inode_lock);
+		spin_lock(&inode_hash_lock);
 		hlist_for_each_entry(old, node, head, i_hash) {
 			if (old->i_ino != ino)
 				continue;
@@ -1346,12 +1357,12 @@ int insert_inode_locked(struct inode *inode)
 			inode->i_state |= I_NEW;
 			hlist_add_head(&inode->i_hash, head);
 			spin_unlock(&inode->i_lock);
-			spin_unlock(&inode_lock);
+			spin_unlock(&inode_hash_lock);
 			return 0;
 		}
 		__iget(old);
 		spin_unlock(&old->i_lock);
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode_hash_lock);
 		wait_on_inode(old);
 		if (unlikely(!inode_unhashed(old))) {
 			iput(old);
@@ -1372,7 +1383,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
 		struct hlist_node *node;
 		struct inode *old = NULL;
 
-		spin_lock(&inode_lock);
+		spin_lock(&inode_hash_lock);
 		hlist_for_each_entry(old, node, head, i_hash) {
 			if (old->i_sb != sb)
 				continue;
@@ -1390,12 +1401,12 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
 			inode->i_state |= I_NEW;
 			hlist_add_head(&inode->i_hash, head);
 			spin_unlock(&inode->i_lock);
-			spin_unlock(&inode_lock);
+			spin_unlock(&inode_hash_lock);
 			return 0;
 		}
 		__iget(old);
 		spin_unlock(&old->i_lock);
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode_hash_lock);
 		wait_on_inode(old);
 		if (unlikely(!inode_unhashed(old))) {
 			iput(old);
@@ -1674,10 +1685,10 @@ static void __wait_on_freeing_inode(struct inode *inode)
 	wq = bit_waitqueue(&inode->i_state, __I_NEW);
 	prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
 	spin_unlock(&inode->i_lock);
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_hash_lock);
 	schedule();
 	finish_wait(wq, &wait.wait);
-	spin_lock(&inode_lock);
+	spin_lock(&inode_hash_lock);
 }
 
 static __initdata unsigned long ihash_entries;
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index fb3b3c5ef0e..07ea8d3e6ea 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -22,7 +22,6 @@
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/spinlock.h>
-#include <linux/writeback.h> /* for inode_lock */
 
 #include <asm/atomic.h>
 
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 325185e514b..50c00856f73 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -91,7 +91,6 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/srcu.h>
-#include <linux/writeback.h> /* for inode_lock */
 
 #include <asm/atomic.h>
 
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index 85eebff6d0d..e86577d6c5c 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -23,7 +23,6 @@
 #include <linux/mount.h>
 #include <linux/mutex.h>
 #include <linux/spinlock.h>
-#include <linux/writeback.h> /* for inode_lock */
 
 #include <asm/atomic.h>
 
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index a627ed82c0a..0b56c6b7ec0 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -54,7 +54,7 @@
  *
  * Return 1 if the attributes match and 0 if not.
  *
- * NOTE: This function runs with the inode_lock spin lock held so it is not
+ * NOTE: This function runs with the inode->i_lock spin lock held so it is not
  * allowed to sleep.
  */
 int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
@@ -98,7 +98,7 @@ int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
  *
  * Return 0 on success and -errno on error.
  *
- * NOTE: This function runs with the inode_lock spin lock held so it is not
+ * NOTE: This function runs with the inode->i_lock spin lock held so it is not
  * allowed to sleep. (Hence the GFP_ATOMIC allocation.)
  */
 static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na)
-- 
cgit v1.2.3


From 0f1b1fd86f6fd662e04da3e82a6780b226fcd0d1 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 22 Mar 2011 22:23:43 +1100
Subject: fs: pull inode->i_lock up out of writeback_single_inode

First thing we do in writeback_single_inode() is take the i_lock and
the last thing we do is drop it. A caller already holds the i_lock,
so pull the i_lock out of writeback_single_inode() to reduce the
round trips on this lock during inode writeback.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fs-writeback.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ed800656356..b5ed541fb13 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -332,9 +332,9 @@ static void inode_wait_for_writeback(struct inode *inode)
 }
 
 /*
- * Write out an inode's dirty pages.  Called under inode_wb_list_lock.  Either
- * the caller has an active reference on the inode or the inode has I_WILL_FREE
- * set.
+ * Write out an inode's dirty pages.  Called under inode_wb_list_lock and
+ * inode->i_lock.  Either the caller has an active reference on the inode or
+ * the inode has I_WILL_FREE set.
  *
  * If `wait' is set, wait on the writeout.
  *
@@ -349,7 +349,9 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	unsigned dirty;
 	int ret;
 
-	spin_lock(&inode->i_lock);
+	assert_spin_locked(&inode_wb_list_lock);
+	assert_spin_locked(&inode->i_lock);
+
 	if (!atomic_read(&inode->i_count))
 		WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
 	else
@@ -365,7 +367,6 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 		 * completed a full scan of b_io.
 		 */
 		if (wbc->sync_mode != WB_SYNC_ALL) {
-			spin_unlock(&inode->i_lock);
 			requeue_io(inode);
 			return 0;
 		}
@@ -456,7 +457,6 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 		}
 	}
 	inode_sync_complete(inode);
-	spin_unlock(&inode->i_lock);
 	return ret;
 }
 
@@ -544,7 +544,6 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 		}
 
 		__iget(inode);
-		spin_unlock(&inode->i_lock);
 
 		pages_skipped = wbc->pages_skipped;
 		writeback_single_inode(inode, wbc);
@@ -555,6 +554,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 			 */
 			redirty_tail(inode);
 		}
+		spin_unlock(&inode->i_lock);
 		spin_unlock(&inode_wb_list_lock);
 		iput(inode);
 		cond_resched();
@@ -1309,7 +1309,9 @@ int write_inode_now(struct inode *inode, int sync)
 
 	might_sleep();
 	spin_lock(&inode_wb_list_lock);
+	spin_lock(&inode->i_lock);
 	ret = writeback_single_inode(inode, &wbc);
+	spin_unlock(&inode->i_lock);
 	spin_unlock(&inode_wb_list_lock);
 	if (sync)
 		inode_sync_wait(inode);
@@ -1333,7 +1335,9 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
 	int ret;
 
 	spin_lock(&inode_wb_list_lock);
+	spin_lock(&inode->i_lock);
 	ret = writeback_single_inode(inode, wbc);
+	spin_unlock(&inode->i_lock);
 	spin_unlock(&inode_wb_list_lock);
 	return ret;
 }
-- 
cgit v1.2.3


From 0b2d0724e26a335cd326eb7ad552c109116a8795 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 23 Mar 2011 15:03:28 -0400
Subject: fs: simplify iget & friends

Merge get_new_inode/get_new_inode_fast into iget5_locked/iget_locked
as those were the only callers.  Remove the internal ifind/ifind_fast
helpers - ifind_fast only had a single caller, and ifind had two
callers wanting it to do different things.  Also clean up the comments
in this area to focus on information important to a developer trying
to use it, instead of overloading them with implementation details.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c | 262 ++++++++++++++++++++-----------------------------------------
 1 file changed, 83 insertions(+), 179 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index f9ee4928358..05a1f75ae79 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -931,20 +931,42 @@ void unlock_new_inode(struct inode *inode)
 }
 EXPORT_SYMBOL(unlock_new_inode);
 
-/*
- * This is called without the inode hash lock held.. Be careful.
+/**
+ * iget5_locked - obtain an inode from a mounted file system
+ * @sb:		super block of file system
+ * @hashval:	hash value (usually inode number) to get
+ * @test:	callback used for comparisons between inodes
+ * @set:	callback used to initialize a new struct inode
+ * @data:	opaque data pointer to pass to @test and @set
+ *
+ * Search for the inode specified by @hashval and @data in the inode cache,
+ * and if present return it with an increased reference count. This is
+ * a generalized version of iget_locked() for file systems where the inode
+ * number is not sufficient for unique identification of an inode.
+ *
+ * If the inode is not in cache, allocate a new inode and return it locked,
+ * hashed, and with the I_NEW flag set. The file system gets to fill it in
+ * before unlocking it via unlock_new_inode().
  *
- * We no longer cache the sb_flags in i_flags - see fs.h
- *	-- rmk@arm.uk.linux.org
+ * Note both @test and @set are called with the inode_hash_lock held, so can't
+ * sleep.
  */
-static struct inode *get_new_inode(struct super_block *sb,
-				struct hlist_head *head,
-				int (*test)(struct inode *, void *),
-				int (*set)(struct inode *, void *),
-				void *data)
+struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
+		int (*test)(struct inode *, void *),
+		int (*set)(struct inode *, void *), void *data)
 {
+	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
 	struct inode *inode;
 
+	spin_lock(&inode_hash_lock);
+	inode = find_inode(sb, head, test, data);
+	spin_unlock(&inode_hash_lock);
+
+	if (inode) {
+		wait_on_inode(inode);
+		return inode;
+	}
+
 	inode = alloc_inode(sb);
 	if (inode) {
 		struct inode *old;
@@ -986,16 +1008,34 @@ set_failed:
 	destroy_inode(inode);
 	return NULL;
 }
+EXPORT_SYMBOL(iget5_locked);
 
-/*
- * get_new_inode_fast is the fast path version of get_new_inode, see the
- * comment at iget_locked for details.
+/**
+ * iget_locked - obtain an inode from a mounted file system
+ * @sb:		super block of file system
+ * @ino:	inode number to get
+ *
+ * Search for the inode specified by @ino in the inode cache and if present
+ * return it with an increased reference count. This is for file systems
+ * where the inode number is sufficient for unique identification of an inode.
+ *
+ * If the inode is not in cache, allocate a new inode and return it locked,
+ * hashed, and with the I_NEW flag set.  The file system gets to fill it in
+ * before unlocking it via unlock_new_inode().
  */
-static struct inode *get_new_inode_fast(struct super_block *sb,
-				struct hlist_head *head, unsigned long ino)
+struct inode *iget_locked(struct super_block *sb, unsigned long ino)
 {
+	struct hlist_head *head = inode_hashtable + hash(sb, ino);
 	struct inode *inode;
 
+	spin_lock(&inode_hash_lock);
+	inode = find_inode_fast(sb, head, ino);
+	spin_unlock(&inode_hash_lock);
+	if (inode) {
+		wait_on_inode(inode);
+		return inode;
+	}
+
 	inode = alloc_inode(sb);
 	if (inode) {
 		struct inode *old;
@@ -1030,6 +1070,7 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
 	}
 	return inode;
 }
+EXPORT_SYMBOL(iget_locked);
 
 /*
  * search the inode cache for a matching inode number.
@@ -1113,100 +1154,32 @@ struct inode *igrab(struct inode *inode)
 EXPORT_SYMBOL(igrab);
 
 /**
- * ifind - internal function, you want ilookup5() or iget5().
+ * ilookup5_nowait - search for an inode in the inode cache
  * @sb:		super block of file system to search
- * @head:       the head of the list to search
+ * @hashval:	hash value (usually inode number) to search for
  * @test:	callback used for comparisons between inodes
  * @data:	opaque data pointer to pass to @test
- * @wait:	if true wait for the inode to be unlocked, if false do not
- *
- * ifind() searches for the inode specified by @data in the inode
- * cache. This is a generalized version of ifind_fast() for file systems where
- * the inode number is not sufficient for unique identification of an inode.
  *
+ * Search for the inode specified by @hashval and @data in the inode cache.
  * If the inode is in the cache, the inode is returned with an incremented
  * reference count.
  *
- * Otherwise NULL is returned.
+ * Note: I_NEW is not waited upon so you have to be very careful what you do
+ * with the returned inode.  You probably should be using ilookup5() instead.
  *
- * Note, @test is called with the inode_hash_lock held, so can't sleep.
+ * Note: @test is called with the inode_hash_lock held, so can't sleep.
  */
-static struct inode *ifind(struct super_block *sb,
-		struct hlist_head *head, int (*test)(struct inode *, void *),
-		void *data, const int wait)
+struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
+		int (*test)(struct inode *, void *), void *data)
 {
+	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
 	struct inode *inode;
 
 	spin_lock(&inode_hash_lock);
 	inode = find_inode(sb, head, test, data);
-	if (inode) {
-		spin_unlock(&inode_hash_lock);
-		if (likely(wait))
-			wait_on_inode(inode);
-		return inode;
-	}
-	spin_unlock(&inode_hash_lock);
-	return NULL;
-}
-
-/**
- * ifind_fast - internal function, you want ilookup() or iget().
- * @sb:		super block of file system to search
- * @head:       head of the list to search
- * @ino:	inode number to search for
- *
- * ifind_fast() searches for the inode @ino in the inode cache. This is for
- * file systems where the inode number is sufficient for unique identification
- * of an inode.
- *
- * If the inode is in the cache, the inode is returned with an incremented
- * reference count.
- *
- * Otherwise NULL is returned.
- */
-static struct inode *ifind_fast(struct super_block *sb,
-		struct hlist_head *head, unsigned long ino)
-{
-	struct inode *inode;
-
-	spin_lock(&inode_hash_lock);
-	inode = find_inode_fast(sb, head, ino);
-	if (inode) {
-		spin_unlock(&inode_hash_lock);
-		wait_on_inode(inode);
-		return inode;
-	}
 	spin_unlock(&inode_hash_lock);
-	return NULL;
-}
-
-/**
- * ilookup5_nowait - search for an inode in the inode cache
- * @sb:		super block of file system to search
- * @hashval:	hash value (usually inode number) to search for
- * @test:	callback used for comparisons between inodes
- * @data:	opaque data pointer to pass to @test
- *
- * ilookup5() uses ifind() to search for the inode specified by @hashval and
- * @data in the inode cache. This is a generalized version of ilookup() for
- * file systems where the inode number is not sufficient for unique
- * identification of an inode.
- *
- * If the inode is in the cache, the inode is returned with an incremented
- * reference count.  Note, the inode lock is not waited upon so you have to be
- * very careful what you do with the returned inode.  You probably should be
- * using ilookup5() instead.
- *
- * Otherwise NULL is returned.
- *
- * Note, @test is called with the inode_hash_lock held, so can't sleep.
- */
-struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
-		int (*test)(struct inode *, void *), void *data)
-{
-	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
 
-	return ifind(sb, head, test, data, 0);
+	return inode;
 }
 EXPORT_SYMBOL(ilookup5_nowait);
 
@@ -1217,24 +1190,24 @@ EXPORT_SYMBOL(ilookup5_nowait);
  * @test:	callback used for comparisons between inodes
  * @data:	opaque data pointer to pass to @test
  *
- * ilookup5() uses ifind() to search for the inode specified by @hashval and
- * @data in the inode cache. This is a generalized version of ilookup() for
- * file systems where the inode number is not sufficient for unique
- * identification of an inode.
- *
- * If the inode is in the cache, the inode lock is waited upon and the inode is
+ * Search for the inode specified by @hashval and @data in the inode cache,
+ * and if the inode is in the cache, return the inode with an incremented
+ * reference count.  Waits on I_NEW before returning the inode.
  * returned with an incremented reference count.
  *
- * Otherwise NULL is returned.
+ * This is a generalized version of ilookup() for file systems where the
+ * inode number is not sufficient for unique identification of an inode.
  *
- * Note, @test is called with the inode_hash_lock held, so can't sleep.
+ * Note: @test is called with the inode_hash_lock held, so can't sleep.
  */
 struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
 		int (*test)(struct inode *, void *), void *data)
 {
-	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+	struct inode *inode = ilookup5_nowait(sb, hashval, test, data);
 
-	return ifind(sb, head, test, data, 1);
+	if (inode)
+		wait_on_inode(inode);
+	return inode;
 }
 EXPORT_SYMBOL(ilookup5);
 
@@ -1243,92 +1216,23 @@ EXPORT_SYMBOL(ilookup5);
  * @sb:		super block of file system to search
  * @ino:	inode number to search for
  *
- * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache.
- * This is for file systems where the inode number is sufficient for unique
- * identification of an inode.
- *
- * If the inode is in the cache, the inode is returned with an incremented
- * reference count.
- *
- * Otherwise NULL is returned.
+ * Search for the inode @ino in the inode cache, and if the inode is in the
+ * cache, the inode is returned with an incremented reference count.
  */
 struct inode *ilookup(struct super_block *sb, unsigned long ino)
 {
 	struct hlist_head *head = inode_hashtable + hash(sb, ino);
-
-	return ifind_fast(sb, head, ino);
-}
-EXPORT_SYMBOL(ilookup);
-
-/**
- * iget5_locked - obtain an inode from a mounted file system
- * @sb:		super block of file system
- * @hashval:	hash value (usually inode number) to get
- * @test:	callback used for comparisons between inodes
- * @set:	callback used to initialize a new struct inode
- * @data:	opaque data pointer to pass to @test and @set
- *
- * iget5_locked() uses ifind() to search for the inode specified by @hashval
- * and @data in the inode cache and if present it is returned with an increased
- * reference count. This is a generalized version of iget_locked() for file
- * systems where the inode number is not sufficient for unique identification
- * of an inode.
- *
- * If the inode is not in cache, get_new_inode() is called to allocate a new
- * inode and this is returned locked, hashed, and with the I_NEW flag set. The
- * file system gets to fill it in before unlocking it via unlock_new_inode().
- *
- * Note both @test and @set are called with the inode_hash_lock held, so can't
- * sleep.
- */
-struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
-		int (*test)(struct inode *, void *),
-		int (*set)(struct inode *, void *), void *data)
-{
-	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
 	struct inode *inode;
 
-	inode = ifind(sb, head, test, data, 1);
-	if (inode)
-		return inode;
-	/*
-	 * get_new_inode() will do the right thing, re-trying the search
-	 * in case it had to block at any point.
-	 */
-	return get_new_inode(sb, head, test, set, data);
-}
-EXPORT_SYMBOL(iget5_locked);
-
-/**
- * iget_locked - obtain an inode from a mounted file system
- * @sb:		super block of file system
- * @ino:	inode number to get
- *
- * iget_locked() uses ifind_fast() to search for the inode specified by @ino in
- * the inode cache and if present it is returned with an increased reference
- * count. This is for file systems where the inode number is sufficient for
- * unique identification of an inode.
- *
- * If the inode is not in cache, get_new_inode_fast() is called to allocate a
- * new inode and this is returned locked, hashed, and with the I_NEW flag set.
- * The file system gets to fill it in before unlocking it via
- * unlock_new_inode().
- */
-struct inode *iget_locked(struct super_block *sb, unsigned long ino)
-{
-	struct hlist_head *head = inode_hashtable + hash(sb, ino);
-	struct inode *inode;
+	spin_lock(&inode_hash_lock);
+	inode = find_inode_fast(sb, head, ino);
+	spin_unlock(&inode_hash_lock);
 
-	inode = ifind_fast(sb, head, ino);
 	if (inode)
-		return inode;
-	/*
-	 * get_new_inode_fast() will do the right thing, re-trying the search
-	 * in case it had to block at any point.
-	 */
-	return get_new_inode_fast(sb, head, ino);
+		wait_on_inode(inode);
+	return inode;
 }
-EXPORT_SYMBOL(iget_locked);
+EXPORT_SYMBOL(ilookup);
 
 int insert_inode_locked(struct inode *inode)
 {
-- 
cgit v1.2.3