From fb6a82c94a9c69adfb6b9f6ce9f84be36884e471 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Tue, 11 Apr 2006 20:12:10 -0400
Subject: [PATCH] jffs2: fix printk warnings

Fix printk format warnings in jffs2.

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/readinode.c | 6 +++---
 fs/jffs2/summary.c   | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index f1695642d0f..e1acce8fb2b 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -204,7 +204,7 @@ static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 
 	tn = jffs2_alloc_tmp_dnode_info();
 	if (!tn) {
-		JFFS2_ERROR("failed to allocate tn (%d bytes).\n", sizeof(*tn));
+		JFFS2_ERROR("failed to allocate tn (%zu bytes).\n", sizeof(*tn));
 		return -ENOMEM;
 	}
 
@@ -434,7 +434,7 @@ static int read_more(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref,
 	}
 
 	if (retlen < len) {
-		JFFS2_ERROR("short read at %#08x: %d instead of %d.\n",
+		JFFS2_ERROR("short read at %#08x: %zu instead of %d.\n",
 				offs, retlen, len);
 		return -EIO;
 	}
@@ -542,7 +542,7 @@ static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_inf
 		}
 
 		if (retlen < len) {
-			JFFS2_ERROR("short read at %#08x: %d instead of %d.\n", ref_offset(ref), retlen, len);
+			JFFS2_ERROR("short read at %#08x: %zu instead of %d.\n", ref_offset(ref), retlen, len);
 			err = -EIO;
 			goto free_out;
 		}
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index fb9cec61fcf..3240d62d970 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -655,7 +655,7 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
 
 
 	if (ret || (retlen != infosize)) {
-		JFFS2_WARNING("Write of %zd bytes at 0x%08x failed. returned %d, retlen %zd\n",
+		JFFS2_WARNING("Write of %d bytes at 0x%08x failed. returned %d, retlen %zu\n",
 			infosize, jeb->offset + c->sector_size - jeb->free_size, ret, retlen);
 
 		c->summary->sum_size = JFFS2_SUMMARY_NOSUM_SIZE;
-- 
cgit v1.2.3


From d96fb997c6174f98a2a0a98200f99ac13b053bd6 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Mon, 17 Apr 2006 00:19:48 +0100
Subject: [JFFS2] Fix race in post-mount node checking

For a while now, we've postponed CRC-checking of data nodes to be done
by the GC thread, instead of being done while the user is waiting for
mount to finish. The GC thread would iterate through all the inodes on
the system and check each of their data nodes. It would skip over inodes
which had already been used or were already being read in by
read_inode(), because their data nodes would have been examined anyway.

However, we could sometimes reach the end of the for-each-inode loop and
still have some unchecked space left, if an inode we'd skipped was
_still_ in the process of being read. This fixes that race by actually
waiting for read_inode() to finish rather than just moving on.

Thanks to Ladislav Michl for coming up with a reproducible test case and
helping to track it down.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/gc.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index f9ffece453a..967fb2cf8e2 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -181,6 +181,10 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
 			   and trigger the BUG() above while we haven't yet
 			   finished checking all its nodes */
 			D1(printk(KERN_DEBUG "Waiting for ino #%u to finish reading\n", ic->ino));
+			/* We need to come back again for the _same_ inode. We've
+			 made no progress in this case, but that should be OK */
+			c->checked_ino--;
+
 			up(&c->alloc_sem);
 			sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock);
 			return 0;
-- 
cgit v1.2.3


From 373d5e71833978fe3d91264d86857762bb92cfe2 Mon Sep 17 00:00:00 2001
From: Richard Purdie <rpurdie@rpsys.net>
Date: Tue, 18 Apr 2006 02:05:46 +0100
Subject: JFFS2: Return an error for long filenames

Return an error if a name is too long for JFFS2 rather than
corrupting data.

Signed-off-by: Richard Purdie <rpurdie@rpsys.net>
---
 fs/jffs2/dir.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 8bc7a5018e4..f92840a3a52 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -78,6 +78,9 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target,
 
 	D1(printk(KERN_DEBUG "jffs2_lookup()\n"));
 
+	if (target->d_name.len > JFFS2_MAX_NAME_LEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
 	dir_f = JFFS2_INODE_INFO(dir_i);
 	c = JFFS2_SB_INFO(dir_i->i_sb);
 
-- 
cgit v1.2.3


From 52b5108ca7490c0609e4dbddd8439bc03d702c99 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Fri, 21 Apr 2006 13:15:57 +0100
Subject: [RBTREE] Update ext3 to use rb_parent() accessor macro.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/ext3/dir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index f37528ed222..fbb0d4ed07d 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -284,7 +284,7 @@ static void free_rb_tree_fname(struct rb_root *root)
 		 * beginning of the loop and try to free the parent
 		 * node.
 		 */
-		parent = n->rb_parent;
+		parent = rb_parent(n);
 		fname = rb_entry(n, struct fname, rb_hash);
 		while (fname) {
 			struct fname * old = fname;
-- 
cgit v1.2.3


From c569882b2e70a0c4eac99acdb39b493549041ba1 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Fri, 21 Apr 2006 13:17:24 +0100
Subject: [RBTREE] Update eventpoll.c to use rb_parent() accessor macro.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/eventpoll.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 1b4491cdd11..2695337d4d6 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -337,20 +337,20 @@ static inline int ep_cmp_ffd(struct epoll_filefd *p1,
 /* Special initialization for the rb-tree node to detect linkage */
 static inline void ep_rb_initnode(struct rb_node *n)
 {
-	n->rb_parent = n;
+	rb_set_parent(n, n);
 }
 
 /* Removes a node from the rb-tree and marks it for a fast is-linked check */
 static inline void ep_rb_erase(struct rb_node *n, struct rb_root *r)
 {
 	rb_erase(n, r);
-	n->rb_parent = n;
+	rb_set_parent(n, n);
 }
 
 /* Fast check to verify that the item is linked to the main rb-tree */
 static inline int ep_rb_linked(struct rb_node *n)
 {
-	return n->rb_parent != n;
+	return rb_parent(n) != n;
 }
 
 /*
-- 
cgit v1.2.3


From 21f1d5fc592e145574dede8debe9603334d08fde Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Fri, 21 Apr 2006 13:17:57 +0100
Subject: [RBTREE] Update JFFS2 to use rb_parent() accessor macro.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/nodelist.h  |  1 -
 fs/jffs2/readinode.c | 18 +++++++++---------
 2 files changed, 9 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 23a67bb3052..131b67b6c35 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -299,7 +299,6 @@ static inline struct jffs2_node_frag *frag_last(struct rb_root *root)
 	return rb_entry(node, struct jffs2_node_frag, rb);
 }
 
-#define rb_parent(rb) ((rb)->rb_parent)
 #define frag_next(frag) rb_entry(rb_next(&(frag)->rb), struct jffs2_node_frag, rb)
 #define frag_prev(frag) rb_entry(rb_prev(&(frag)->rb), struct jffs2_node_frag, rb)
 #define frag_parent(frag) rb_entry(rb_parent(&(frag)->rb), struct jffs2_node_frag, rb)
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index f1695642d0f..6f4a7d846e8 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -66,7 +66,7 @@ static void jffs2_free_tmp_dnode_info_list(struct rb_root *list)
 			jffs2_free_full_dnode(tn->fn);
 			jffs2_free_tmp_dnode_info(tn);
 
-			this = this->rb_parent;
+			this = rb_parent(this);
 			if (!this)
 				break;
 
@@ -679,12 +679,12 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
 			jffs2_mark_node_obsolete(c, fn->raw);
 
 		BUG_ON(rb->rb_left);
-		if (rb->rb_parent && rb->rb_parent->rb_left == rb) {
+		if (rb_parent(rb) && rb_parent(rb)->rb_left == rb) {
 			/* We were then left-hand child of our parent. We need
 			 * to move our own right-hand child into our place. */
 			repl_rb = rb->rb_right;
 			if (repl_rb)
-				repl_rb->rb_parent = rb->rb_parent;
+				rb_set_parent(repl_rb, rb_parent(rb));
 		} else
 			repl_rb = NULL;
 
@@ -692,14 +692,14 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
 
 		/* Remove the spent tn from the tree; don't bother rebalancing
 		 * but put our right-hand child in our own place. */
-		if (tn->rb.rb_parent) {
-			if (tn->rb.rb_parent->rb_left == &tn->rb)
-				tn->rb.rb_parent->rb_left = repl_rb;
-			else if (tn->rb.rb_parent->rb_right == &tn->rb)
-				tn->rb.rb_parent->rb_right = repl_rb;
+		if (rb_parent(&tn->rb)) {
+			if (rb_parent(&tn->rb)->rb_left == &tn->rb)
+				rb_parent(&tn->rb)->rb_left = repl_rb;
+			else if (rb_parent(&tn->rb)->rb_right == &tn->rb)
+				rb_parent(&tn->rb)->rb_right = repl_rb;
 			else BUG();
 		} else if (tn->rb.rb_right)
-			tn->rb.rb_right->rb_parent = NULL;
+			rb_set_parent(tn->rb.rb_right, NULL);
 
 		jffs2_free_tmp_dnode_info(tn);
 		if (ret) {
-- 
cgit v1.2.3


From cbb9a56177b16294ed347ba7fcb1c66c8adb5dc4 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Wed, 3 May 2006 13:07:27 +0100
Subject: Move jffs2_fs_i.h and jffs2_fs_sb.h from include/linux/ to fs/jffs2/

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/compr.h       |   4 +-
 fs/jffs2/dir.c         |   4 +-
 fs/jffs2/jffs2_fs_i.h  |  50 ++++++++++++++++++++
 fs/jffs2/jffs2_fs_sb.h | 122 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/jffs2/nodelist.h    |   4 +-
 5 files changed, 178 insertions(+), 6 deletions(-)
 create mode 100644 fs/jffs2/jffs2_fs_i.h
 create mode 100644 fs/jffs2/jffs2_fs_sb.h

(limited to 'fs')

diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h
index a77e830d85c..509b8b1c081 100644
--- a/fs/jffs2/compr.h
+++ b/fs/jffs2/compr.h
@@ -23,8 +23,8 @@
 #include <linux/errno.h>
 #include <linux/fs.h>
 #include <linux/jffs2.h>
-#include <linux/jffs2_fs_i.h>
-#include <linux/jffs2_fs_sb.h>
+#include "jffs2_fs_i.h"
+#include "jffs2_fs_sb.h"
 #include "nodelist.h"
 
 #define JFFS2_RUBINMIPS_PRIORITY 10
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index f92840a3a52..1c8e8c0f6ce 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -17,8 +17,8 @@
 #include <linux/fs.h>
 #include <linux/crc32.h>
 #include <linux/jffs2.h>
-#include <linux/jffs2_fs_i.h>
-#include <linux/jffs2_fs_sb.h>
+#include "jffs2_fs_i.h"
+#include "jffs2_fs_sb.h"
 #include <linux/time.h>
 #include "nodelist.h"
 
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h
new file mode 100644
index 00000000000..ad565bf9dcc
--- /dev/null
+++ b/fs/jffs2/jffs2_fs_i.h
@@ -0,0 +1,50 @@
+/* $Id: jffs2_fs_i.h,v 1.19 2005/11/07 11:14:52 gleixner Exp $ */
+
+#ifndef _JFFS2_FS_I
+#define _JFFS2_FS_I
+
+#include <linux/version.h>
+#include <linux/rbtree.h>
+#include <asm/semaphore.h>
+
+struct jffs2_inode_info {
+	/* We need an internal mutex similar to inode->i_mutex.
+	   Unfortunately, we can't used the existing one, because
+	   either the GC would deadlock, or we'd have to release it
+	   before letting GC proceed. Or we'd have to put ugliness
+	   into the GC code so it didn't attempt to obtain the i_mutex
+	   for the inode(s) which are already locked */
+	struct semaphore sem;
+
+	/* The highest (datanode) version number used for this ino */
+	uint32_t highest_version;
+
+	/* List of data fragments which make up the file */
+	struct rb_root fragtree;
+
+	/* There may be one datanode which isn't referenced by any of the
+	   above fragments, if it contains a metadata update but no actual
+	   data - or if this is a directory inode */
+	/* This also holds the _only_ dnode for symlinks/device nodes,
+	   etc. */
+	struct jffs2_full_dnode *metadata;
+
+	/* Directory entries */
+	struct jffs2_full_dirent *dents;
+
+	/* The target path if this is the inode of a symlink */
+	unsigned char *target;
+
+	/* Some stuff we just have to keep in-core at all times, for each inode. */
+	struct jffs2_inode_cache *inocache;
+
+	uint16_t flags;
+	uint8_t usercompr;
+#if !defined (__ECOS)
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,5,2)
+	struct inode vfs_inode;
+#endif
+#endif
+};
+
+#endif /* _JFFS2_FS_I */
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
new file mode 100644
index 00000000000..4bcfb557022
--- /dev/null
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -0,0 +1,122 @@
+/* $Id: jffs2_fs_sb.h,v 1.54 2005/09/21 13:37:34 dedekind Exp $ */
+
+#ifndef _JFFS2_FS_SB
+#define _JFFS2_FS_SB
+
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <linux/completion.h>
+#include <asm/semaphore.h>
+#include <linux/timer.h>
+#include <linux/wait.h>
+#include <linux/list.h>
+#include <linux/rwsem.h>
+
+#define JFFS2_SB_FLAG_RO 1
+#define JFFS2_SB_FLAG_SCANNING 2 /* Flash scanning is in progress */
+#define JFFS2_SB_FLAG_BUILDING 4 /* File system building is in progress */
+
+struct jffs2_inodirty;
+
+/* A struct for the overall file system control.  Pointers to
+   jffs2_sb_info structs are named `c' in the source code.
+   Nee jffs_control
+*/
+struct jffs2_sb_info {
+	struct mtd_info *mtd;
+
+	uint32_t highest_ino;
+	uint32_t checked_ino;
+
+	unsigned int flags;
+
+	struct task_struct *gc_task;	/* GC task struct */
+	struct completion gc_thread_start; /* GC thread start completion */
+	struct completion gc_thread_exit; /* GC thread exit completion port */
+
+	struct semaphore alloc_sem;	/* Used to protect all the following
+					   fields, and also to protect against
+					   out-of-order writing of nodes. And GC. */
+	uint32_t cleanmarker_size;	/* Size of an _inline_ CLEANMARKER
+					 (i.e. zero for OOB CLEANMARKER */
+
+	uint32_t flash_size;
+	uint32_t used_size;
+	uint32_t dirty_size;
+	uint32_t wasted_size;
+	uint32_t free_size;
+	uint32_t erasing_size;
+	uint32_t bad_size;
+	uint32_t sector_size;
+	uint32_t unchecked_size;
+
+	uint32_t nr_free_blocks;
+	uint32_t nr_erasing_blocks;
+
+	/* Number of free blocks there must be before we... */
+	uint8_t resv_blocks_write;	/* ... allow a normal filesystem write */
+	uint8_t resv_blocks_deletion;	/* ... allow a normal filesystem deletion */
+	uint8_t resv_blocks_gctrigger;	/* ... wake up the GC thread */
+	uint8_t resv_blocks_gcbad;	/* ... pick a block from the bad_list to GC */
+	uint8_t resv_blocks_gcmerge;	/* ... merge pages when garbage collecting */
+
+	uint32_t nospc_dirty_size;
+
+	uint32_t nr_blocks;
+	struct jffs2_eraseblock *blocks;	/* The whole array of blocks. Used for getting blocks
+						 * from the offset (blocks[ofs / sector_size]) */
+	struct jffs2_eraseblock *nextblock;	/* The block we're currently filling */
+
+	struct jffs2_eraseblock *gcblock;	/* The block we're currently garbage-collecting */
+
+	struct list_head clean_list;		/* Blocks 100% full of clean data */
+	struct list_head very_dirty_list;	/* Blocks with lots of dirty space */
+	struct list_head dirty_list;		/* Blocks with some dirty space */
+	struct list_head erasable_list;		/* Blocks which are completely dirty, and need erasing */
+	struct list_head erasable_pending_wbuf_list;	/* Blocks which need erasing but only after the current wbuf is flushed */
+	struct list_head erasing_list;		/* Blocks which are currently erasing */
+	struct list_head erase_pending_list;	/* Blocks which need erasing now */
+	struct list_head erase_complete_list;	/* Blocks which are erased and need the clean marker written to them */
+	struct list_head free_list;		/* Blocks which are free and ready to be used */
+	struct list_head bad_list;		/* Bad blocks. */
+	struct list_head bad_used_list;		/* Bad blocks with valid data in. */
+
+	spinlock_t erase_completion_lock;	/* Protect free_list and erasing_list
+						   against erase completion handler */
+	wait_queue_head_t erase_wait;		/* For waiting for erases to complete */
+
+	wait_queue_head_t inocache_wq;
+	struct jffs2_inode_cache **inocache_list;
+	spinlock_t inocache_lock;
+
+	/* Sem to allow jffs2_garbage_collect_deletion_dirent to
+	   drop the erase_completion_lock while it's holding a pointer
+	   to an obsoleted node. I don't like this. Alternatives welcomed. */
+	struct semaphore erase_free_sem;
+
+	uint32_t wbuf_pagesize; /* 0 for NOR and other flashes with no wbuf */
+
+#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
+	/* Write-behind buffer for NAND flash */
+	unsigned char *wbuf;
+	uint32_t wbuf_ofs;
+	uint32_t wbuf_len;
+	struct jffs2_inodirty *wbuf_inodes;
+
+	struct rw_semaphore wbuf_sem;	/* Protects the write buffer */
+
+	/* Information about out-of-band area usage... */
+	struct nand_oobinfo *oobinfo;
+	uint32_t badblock_pos;
+	uint32_t fsdata_pos;
+	uint32_t fsdata_len;
+#endif
+
+	struct jffs2_summary *summary;		/* Summary information */
+
+	/* OS-private pointer for getting back to master superblock info */
+	void *os_priv;
+};
+
+#endif /* _JFFS2_FB_SB */
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 23a67bb3052..f6645afe88e 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -18,8 +18,8 @@
 #include <linux/fs.h>
 #include <linux/types.h>
 #include <linux/jffs2.h>
-#include <linux/jffs2_fs_sb.h>
-#include <linux/jffs2_fs_i.h>
+#include "jffs2_fs_sb.h"
+#include "jffs2_fs_i.h"
 #include "summary.h"
 
 #ifdef __ECOS
-- 
cgit v1.2.3


From 422138dd68202fbd8ca9fb0df65e92d733249374 Mon Sep 17 00:00:00 2001
From: Dmitry Bazhenov <atrey@emcraft.com>
Date: Fri, 5 May 2006 22:46:49 +0100
Subject: [JFFS2] Fix race in setting file attributes

It seems like there is a potential race in the function jffs2_do_setattr()
in the case when attributes of a symlink are updated. The symlink metadata
is read without having f->sem locked.

The following patch should fix the race.

Signed-off-by: Dmitry Bazhenov <atrey@emcraft.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/fs.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 09e5d10b884..ea1f37d4fc5 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -56,15 +56,20 @@ static int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
 		mdatalen = sizeof(dev);
 		D1(printk(KERN_DEBUG "jffs2_setattr(): Writing %d bytes of kdev_t\n", mdatalen));
 	} else if (S_ISLNK(inode->i_mode)) {
+		down(&f->sem);
 		mdatalen = f->metadata->size;
 		mdata = kmalloc(f->metadata->size, GFP_USER);
-		if (!mdata)
+		if (!mdata) {
+			up(&f->sem);
 			return -ENOMEM;
+		}
 		ret = jffs2_read_dnode(c, f, f->metadata, mdata, 0, mdatalen);
 		if (ret) {
+			up(&f->sem);
 			kfree(mdata);
 			return ret;
 		}
+		up(&f->sem);
 		D1(printk(KERN_DEBUG "jffs2_setattr(): Writing %d bytes of symlink target\n", mdatalen));
 	}
 
-- 
cgit v1.2.3


From 7e59f2ccd7fc2424b2e70132ac613d16acae37da Mon Sep 17 00:00:00 2001
From: Domen Puncer <domen@coderock.org>
Date: Fri, 12 May 2006 11:51:46 +0100
Subject: [JFFS2] Remove obsolete histo.h

This file hasn't actually been used since the very early days of JFFS2
when Arjan was playing with compression methods. It can go now.

Signed-off-by: Domen Puncer <domen@coderock.org>
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/histo.h | 3 ---
 1 file changed, 3 deletions(-)
 delete mode 100644 fs/jffs2/histo.h

(limited to 'fs')

diff --git a/fs/jffs2/histo.h b/fs/jffs2/histo.h
deleted file mode 100644
index 22a93a08210..00000000000
--- a/fs/jffs2/histo.h
+++ /dev/null
@@ -1,3 +0,0 @@
-/* This file provides the bit-probabilities for the input file */
-#define BIT_DIVIDER 629
-static int bits[9] = { 179,167,183,165,159,198,178,119,}; /* ia32 .so files */
-- 
cgit v1.2.3


From 20ffdcb00a792073f6e620dc2c644b3c8fbab528 Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jesper.juhl@gmail.com>
Date: Fri, 12 May 2006 11:55:51 +0100
Subject: [JFFS2] Remove number of pointer dereferences in fs/jffs2/summary.c

Reduce the nr.  of pointer dereferences in fs/jffs2/summary.c

Benefits:
 - micro speed optimization due to fewer pointer derefs
 - generated code is slightly smaller
 - better readability

(The first two sound like a compiler problem but I'll go with the third. dwmw2).

Signed-off-by: Jesper Juhl <jesper.juhl@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/summary.c | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 3240d62d970..7b0ed77a4c3 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -581,16 +581,17 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
 	wpage = c->summary->sum_buf;
 
 	while (c->summary->sum_num) {
+		temp = c->summary->sum_list_head;
 
-		switch (je16_to_cpu(c->summary->sum_list_head->u.nodetype)) {
+		switch (je16_to_cpu(temp->u.nodetype)) {
 			case JFFS2_NODETYPE_INODE: {
 				struct jffs2_sum_inode_flash *sino_ptr = wpage;
 
-				sino_ptr->nodetype = c->summary->sum_list_head->i.nodetype;
-				sino_ptr->inode = c->summary->sum_list_head->i.inode;
-				sino_ptr->version = c->summary->sum_list_head->i.version;
-				sino_ptr->offset = c->summary->sum_list_head->i.offset;
-				sino_ptr->totlen = c->summary->sum_list_head->i.totlen;
+				sino_ptr->nodetype = temp->i.nodetype;
+				sino_ptr->inode = temp->i.inode;
+				sino_ptr->version = temp->i.version;
+				sino_ptr->offset = temp->i.offset;
+				sino_ptr->totlen = temp->i.totlen;
 
 				wpage += JFFS2_SUMMARY_INODE_SIZE;
 
@@ -600,19 +601,19 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
 			case JFFS2_NODETYPE_DIRENT: {
 				struct jffs2_sum_dirent_flash *sdrnt_ptr = wpage;
 
-				sdrnt_ptr->nodetype = c->summary->sum_list_head->d.nodetype;
-				sdrnt_ptr->totlen = c->summary->sum_list_head->d.totlen;
-				sdrnt_ptr->offset = c->summary->sum_list_head->d.offset;
-				sdrnt_ptr->pino = c->summary->sum_list_head->d.pino;
-				sdrnt_ptr->version = c->summary->sum_list_head->d.version;
-				sdrnt_ptr->ino = c->summary->sum_list_head->d.ino;
-				sdrnt_ptr->nsize = c->summary->sum_list_head->d.nsize;
-				sdrnt_ptr->type = c->summary->sum_list_head->d.type;
+				sdrnt_ptr->nodetype = temp->d.nodetype;
+				sdrnt_ptr->totlen = temp->d.totlen;
+				sdrnt_ptr->offset = temp->d.offset;
+				sdrnt_ptr->pino = temp->d.pino;
+				sdrnt_ptr->version = temp->d.version;
+				sdrnt_ptr->ino = temp->d.ino;
+				sdrnt_ptr->nsize = temp->d.nsize;
+				sdrnt_ptr->type = temp->d.type;
 
-				memcpy(sdrnt_ptr->name, c->summary->sum_list_head->d.name,
-							c->summary->sum_list_head->d.nsize);
+				memcpy(sdrnt_ptr->name, temp->d.name,
+							temp->d.nsize);
 
-				wpage += JFFS2_SUMMARY_DIRENT_SIZE(c->summary->sum_list_head->d.nsize);
+				wpage += JFFS2_SUMMARY_DIRENT_SIZE(temp->d.nsize);
 
 				break;
 			}
@@ -622,8 +623,7 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
 			}
 		}
 
-		temp = c->summary->sum_list_head;
-		c->summary->sum_list_head = c->summary->sum_list_head->u.next;
+		c->summary->sum_list_head = temp->u.next;
 		kfree(temp);
 
 		c->summary->sum_num--;
-- 
cgit v1.2.3


From aa98d7cf59b5b0764d3502662053489585faf2fe Mon Sep 17 00:00:00 2001
From: KaiGai Kohei <kaigai@ak.jp.nec.com>
Date: Sat, 13 May 2006 15:09:47 +0900
Subject: [JFFS2][XATTR] XATTR support on JFFS2 (version. 5)

This attached patches provide xattr support including POSIX-ACL and
SELinux support on JFFS2 (version.5).

There are some significant differences from previous version posted
at last December.
The biggest change is addition of EBS(Erase Block Summary) support.
Currently, both kernel and usermode utility (sumtool) can recognize
xattr nodes which have JFFS2_NODETYPE_XATTR/_XREF nodetype.

In addition, some bugs are fixed.
- A potential race condition was fixed.
- Unexpected fail when updating a xattr by same name/value pair was fixed.
- A bug when removing xattr name/value pair was fixed.

The fundamental structures (such as using two new nodetypes and exclusion
mechanism by rwsem) are unchanged. But most of implementation were reviewed
and updated if necessary.
Espacially, we had to change several internal implementations related to
load_xattr_datum() to avoid a potential race condition.

[1/2] xattr_on_jffs2.kernel.version-5.patch
[2/2] xattr_on_jffs2.utils.version-5.patch

Signed-off-by: KaiGai Kohei <kaigai@ak.jp.nec.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/Kconfig               |   38 ++
 fs/jffs2/Makefile        |    3 +
 fs/jffs2/acl.c           |  483 ++++++++++++++++++
 fs/jffs2/acl.h           |   46 ++
 fs/jffs2/build.c         |    2 +
 fs/jffs2/debug.h         |    6 +
 fs/jffs2/dir.c           |   62 ++-
 fs/jffs2/file.c          |    7 +-
 fs/jffs2/fs.c            |   11 +-
 fs/jffs2/gc.c            |   16 +-
 fs/jffs2/jffs2_fs_i.h    |    5 +
 fs/jffs2/jffs2_fs_sb.h   |   10 +
 fs/jffs2/malloc.c        |   68 ++-
 fs/jffs2/nodelist.c      |    1 +
 fs/jffs2/nodelist.h      |   21 +-
 fs/jffs2/os-linux.h      |    4 +
 fs/jffs2/readinode.c     |    1 +
 fs/jffs2/scan.c          |  168 ++++++
 fs/jffs2/security.c      |   82 +++
 fs/jffs2/summary.c       |  191 +++++++
 fs/jffs2/summary.h       |   42 ++
 fs/jffs2/super.c         |    6 +-
 fs/jffs2/symlink.c       |    7 +-
 fs/jffs2/write.c         |    2 +-
 fs/jffs2/xattr.c         | 1271 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/jffs2/xattr.h         |  120 +++++
 fs/jffs2/xattr_trusted.c |   51 ++
 fs/jffs2/xattr_user.c    |   51 ++
 28 files changed, 2760 insertions(+), 15 deletions(-)
 create mode 100644 fs/jffs2/acl.c
 create mode 100644 fs/jffs2/acl.h
 create mode 100644 fs/jffs2/security.c
 create mode 100644 fs/jffs2/xattr.c
 create mode 100644 fs/jffs2/xattr.h
 create mode 100644 fs/jffs2/xattr_trusted.c
 create mode 100644 fs/jffs2/xattr_user.c

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index f9b5842c8d2..2496ccbe260 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1075,6 +1075,44 @@ config JFFS2_FS_DEBUG
 	  If reporting bugs, please try to have available a full dump of the
 	  messages at debug level 1 while the misbehaviour was occurring.
 
+config JFFS2_FS_XATTR
+	bool "JFFS2 XATTR support"
+	depends on JFFS2_FS
+	default n
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page, or visit
+	  <http://acl.bestbits.at/> for details).
+	  
+	  If unsure, say N.
+
+config JFFS2_FS_POSIX_ACL
+	bool "JFFS2 POSIX Access Control Lists"
+	depends on JFFS2_FS_XATTR
+	default y
+	select FS_POSIX_ACL
+	help
+	  Posix Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+	  
+	  To learn more about Access Control Lists, visit the Posix ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+	  
+	  If you don't know what Access Control Lists are, say N
+
+config JFFS2_FS_SECURITY
+	bool "JFFS2 Security Labels"
+	depends on JFFS2_FS_XATTR
+	default y
+	help
+	  Security labels support alternative access control models
+	  implemented by security modules like SELinux.  This option
+	  enables an extended attribute handler for file security
+	  labels in the jffs2 filesystem.
+	  
+	  If you are not using a security module that requires using
+	  extended attributes for file security labels, say N.
+
 config JFFS2_FS_WRITEBUFFER
 	bool "JFFS2 write-buffering support"
 	depends on JFFS2_FS
diff --git a/fs/jffs2/Makefile b/fs/jffs2/Makefile
index 77dc5561a04..7f28ee0bd13 100644
--- a/fs/jffs2/Makefile
+++ b/fs/jffs2/Makefile
@@ -12,6 +12,9 @@ jffs2-y	+= symlink.o build.o erase.o background.o fs.o writev.o
 jffs2-y	+= super.o debug.o
 
 jffs2-$(CONFIG_JFFS2_FS_WRITEBUFFER)	+= wbuf.o
+jffs2-$(CONFIG_JFFS2_FS_XATTR)		+= xattr.o xattr_trusted.o xattr_user.o
+jffs2-$(CONFIG_JFFS2_FS_SECURITY)	+= security.o
+jffs2-$(CONFIG_JFFS2_FS_POSIX_ACL)	+= acl.o
 jffs2-$(CONFIG_JFFS2_RUBIN)	+= compr_rubin.o
 jffs2-$(CONFIG_JFFS2_RTIME)	+= compr_rtime.o
 jffs2-$(CONFIG_JFFS2_ZLIB)	+= compr_zlib.o
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
new file mode 100644
index 00000000000..080bb51e4b6
--- /dev/null
+++ b/fs/jffs2/acl.c
@@ -0,0 +1,483 @@
+/*-------------------------------------------------------------------------*
+ *  File: fs/jffs2/acl.c
+ *  POSIX ACL support on JFFS2 FileSystem
+ *
+ *  Implemented by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *  Copyright (C) 2006 NEC Corporation
+ *
+ *  For licensing information, see the file 'LICENCE' in the jffs2 directory.
+ *-------------------------------------------------------------------------*/
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/crc32.h>
+#include <linux/jffs2.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/mtd/mtd.h>
+#include "nodelist.h"
+
+static size_t jffs2_acl_size(int count)
+{
+	if (count <= 4) {
+		return sizeof(jffs2_acl_header)
+		       + count * sizeof(jffs2_acl_entry_short);
+	} else {
+		return sizeof(jffs2_acl_header)
+		       + 4 * sizeof(jffs2_acl_entry_short)
+		       + (count - 4) * sizeof(jffs2_acl_entry);
+	}
+}
+
+static int jffs2_acl_count(size_t size)
+{
+	size_t s;
+
+	size -= sizeof(jffs2_acl_header);
+	s = size - 4 * sizeof(jffs2_acl_entry_short);
+	if (s < 0) {
+		if (size % sizeof(jffs2_acl_entry_short))
+			return -1;
+		return size / sizeof(jffs2_acl_entry_short);
+	} else {
+		if (s % sizeof(jffs2_acl_entry))
+			return -1;
+		return s / sizeof(jffs2_acl_entry) + 4;
+	}
+}
+
+static struct posix_acl *jffs2_acl_from_medium(const void *value, size_t size)
+{
+	const char *end = (char *)value + size;
+	struct posix_acl *acl;
+	uint32_t ver;
+	int i, count;
+
+	if (!value)
+		return NULL;
+	if (size < sizeof(jffs2_acl_header))
+		return ERR_PTR(-EINVAL);
+	ver = je32_to_cpu(((jffs2_acl_header *)value)->a_version);
+	if (ver != JFFS2_ACL_VERSION) {
+		JFFS2_WARNING("Invalid ACL version. (=%u)\n", ver);
+		return ERR_PTR(-EINVAL);
+	}
+
+	value = (char *)value + sizeof(jffs2_acl_header);
+	count = jffs2_acl_count(size);
+	if (count < 0)
+		return ERR_PTR(-EINVAL);
+	if (count == 0)
+		return NULL;
+
+	acl = posix_acl_alloc(count, GFP_KERNEL);
+	if (!acl)
+		return ERR_PTR(-ENOMEM);
+
+	for (i=0; i < count; i++) {
+		jffs2_acl_entry *entry = (jffs2_acl_entry *)value;
+		if ((char *)value + sizeof(jffs2_acl_entry_short) > end)
+			goto fail;
+		acl->a_entries[i].e_tag = je16_to_cpu(entry->e_tag);
+		acl->a_entries[i].e_perm = je16_to_cpu(entry->e_perm);
+		switch (acl->a_entries[i].e_tag) {
+			case ACL_USER_OBJ:
+			case ACL_GROUP_OBJ:
+			case ACL_MASK:
+			case ACL_OTHER:
+				value = (char *)value + sizeof(jffs2_acl_entry_short);
+				acl->a_entries[i].e_id = ACL_UNDEFINED_ID;
+				break;
+
+			case ACL_USER:
+			case ACL_GROUP:
+				value = (char *)value + sizeof(jffs2_acl_entry);
+				if ((char *)value > end)
+					goto fail;
+				acl->a_entries[i].e_id = je32_to_cpu(entry->e_id);
+				break;
+
+			default:
+				goto fail;
+		}
+	}
+	if (value != end)
+		goto fail;
+	return acl;
+ fail:
+	posix_acl_release(acl);
+	return ERR_PTR(-EINVAL);
+}
+
+static void *jffs2_acl_to_medium(const struct posix_acl *acl, size_t *size)
+{
+	jffs2_acl_header *jffs2_acl;
+	char *e;
+	size_t i;
+
+	*size = jffs2_acl_size(acl->a_count);
+	jffs2_acl = (jffs2_acl_header *)kmalloc(sizeof(jffs2_acl_header)
+						+ acl->a_count * sizeof(jffs2_acl_entry),
+						GFP_KERNEL);
+	if (!jffs2_acl)
+		return ERR_PTR(-ENOMEM);
+	jffs2_acl->a_version = cpu_to_je32(JFFS2_ACL_VERSION);
+	e = (char *)jffs2_acl + sizeof(jffs2_acl_header);
+	for (i=0; i < acl->a_count; i++) {
+		jffs2_acl_entry *entry = (jffs2_acl_entry *)e;
+		entry->e_tag = cpu_to_je16(acl->a_entries[i].e_tag);
+		entry->e_perm = cpu_to_je16(acl->a_entries[i].e_perm);
+		switch(acl->a_entries[i].e_tag) {
+			case ACL_USER:
+			case ACL_GROUP:
+				entry->e_id = cpu_to_je32(acl->a_entries[i].e_id);
+				e += sizeof(jffs2_acl_entry);
+				break;
+
+			case ACL_USER_OBJ:
+			case ACL_GROUP_OBJ:
+			case ACL_MASK:
+			case ACL_OTHER:
+				e += sizeof(jffs2_acl_entry_short);
+				break;
+
+			default:
+				goto fail;
+		}
+	}
+	return (char *)jffs2_acl;
+ fail:
+	kfree(jffs2_acl);
+	return ERR_PTR(-EINVAL);
+}
+
+static struct posix_acl *jffs2_iget_acl(struct inode *inode, struct posix_acl **i_acl)
+{
+	struct posix_acl *acl = JFFS2_ACL_NOT_CACHED;
+
+	spin_lock(&inode->i_lock);
+	if (*i_acl != JFFS2_ACL_NOT_CACHED)
+		acl = posix_acl_dup(*i_acl);
+	spin_unlock(&inode->i_lock);
+	return acl;
+}
+
+static void jffs2_iset_acl(struct inode *inode, struct posix_acl **i_acl, struct posix_acl *acl)
+{
+	spin_lock(&inode->i_lock);
+	if (*i_acl != JFFS2_ACL_NOT_CACHED)
+		posix_acl_release(*i_acl);
+	*i_acl = posix_acl_dup(acl);
+	spin_unlock(&inode->i_lock);
+}
+
+static struct posix_acl *jffs2_get_acl(struct inode *inode, int type)
+{
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+	struct posix_acl *acl;
+	char *value = NULL;
+	int rc, xprefix;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		acl = jffs2_iget_acl(inode, &f->i_acl_access);
+		if (acl != JFFS2_ACL_NOT_CACHED)
+			return acl;
+		xprefix = JFFS2_XPREFIX_ACL_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		acl = jffs2_iget_acl(inode, &f->i_acl_default);
+		if (acl != JFFS2_ACL_NOT_CACHED)
+			return acl;
+		xprefix = JFFS2_XPREFIX_ACL_DEFAULT;
+		break;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+	rc = do_jffs2_getxattr(inode, xprefix, "", NULL, 0);
+	if (rc > 0) {
+		value = kmalloc(rc, GFP_KERNEL);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		rc = do_jffs2_getxattr(inode, xprefix, "", value, rc);
+	}
+	if (rc > 0) {
+		acl = jffs2_acl_from_medium(value, rc);
+	} else if (rc == -ENODATA || rc == -ENOSYS) {
+		acl = NULL;
+	} else {
+		acl = ERR_PTR(rc);
+	}
+	if (value)
+		kfree(value);
+	if (!IS_ERR(acl)) {
+		switch (type) {
+		case ACL_TYPE_ACCESS:
+			jffs2_iset_acl(inode, &f->i_acl_access, acl);
+			break;
+		case ACL_TYPE_DEFAULT:
+			jffs2_iset_acl(inode, &f->i_acl_default, acl);
+			break;
+		}
+	}
+	return acl;
+}
+
+static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
+{
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+	size_t size = 0;
+	char *value = NULL;
+	int rc, xprefix;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		xprefix = JFFS2_XPREFIX_ACL_ACCESS;
+		if (acl) {
+			mode_t mode = inode->i_mode;
+			rc = posix_acl_equiv_mode(acl, &mode);
+			if (rc < 0)
+				return rc;
+			if (inode->i_mode != mode) {
+				inode->i_mode = mode;
+				jffs2_dirty_inode(inode);
+			}
+			if (rc == 0)
+				acl = NULL;
+		}
+		break;
+	case ACL_TYPE_DEFAULT:
+		xprefix = JFFS2_XPREFIX_ACL_DEFAULT;
+		if (!S_ISDIR(inode->i_mode))
+			return acl ? -EACCES : 0;
+		break;
+	default:
+		return -EINVAL;
+	}
+	if (acl) {
+		value = jffs2_acl_to_medium(acl, &size);
+		if (IS_ERR(value))
+			return PTR_ERR(value);
+	}
+
+	rc = do_jffs2_setxattr(inode, xprefix, "", value, size, 0);
+	if (value)
+		kfree(value);
+	if (!rc) {
+		switch(type) {
+		case ACL_TYPE_ACCESS:
+			jffs2_iset_acl(inode, &f->i_acl_access, acl);
+			break;
+		case ACL_TYPE_DEFAULT:
+			jffs2_iset_acl(inode, &f->i_acl_default, acl);
+			break;
+		}
+	}
+	return rc;
+}
+
+static int jffs2_check_acl(struct inode *inode, int mask)
+{
+	struct posix_acl *acl;
+	int rc;
+
+	acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl) {
+		rc = posix_acl_permission(inode, acl, mask);
+		posix_acl_release(acl);
+		return rc;
+	}
+	return -EAGAIN;
+}
+
+int jffs2_permission(struct inode *inode, int mask, struct nameidata *nd)
+{
+	return generic_permission(inode, mask, jffs2_check_acl);
+}
+
+int jffs2_init_acl(struct inode *inode, struct inode *dir)
+{
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+	struct posix_acl *acl = NULL, *clone;
+	mode_t mode;
+	int rc = 0;
+
+	f->i_acl_access = JFFS2_ACL_NOT_CACHED;
+	f->i_acl_default = JFFS2_ACL_NOT_CACHED;
+	if (!S_ISLNK(inode->i_mode)) {
+		acl = jffs2_get_acl(dir, ACL_TYPE_DEFAULT);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		if (!acl)
+			inode->i_mode &= ~current->fs->umask;
+	}
+	if (acl) {
+		if (S_ISDIR(inode->i_mode)) {
+			rc = jffs2_set_acl(inode, ACL_TYPE_DEFAULT, acl);
+			if (rc)
+				goto cleanup;
+		}
+		clone = posix_acl_clone(acl, GFP_KERNEL);
+		rc = -ENOMEM;
+		if (!clone)
+			goto cleanup;
+		mode = inode->i_mode;
+		rc = posix_acl_create_masq(clone, &mode);
+		if (rc >= 0) {
+			inode->i_mode = mode;
+			if (rc > 0)
+				rc = jffs2_set_acl(inode, ACL_TYPE_ACCESS, clone);
+		}
+		posix_acl_release(clone);
+	}
+ cleanup:
+	posix_acl_release(acl);
+	return rc;
+}
+
+void jffs2_clear_acl(struct inode *inode)
+{
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+
+	if (f->i_acl_access && f->i_acl_access != JFFS2_ACL_NOT_CACHED) {
+		posix_acl_release(f->i_acl_access);
+		f->i_acl_access = JFFS2_ACL_NOT_CACHED;
+	}
+	if (f->i_acl_default && f->i_acl_default != JFFS2_ACL_NOT_CACHED) {
+		posix_acl_release(f->i_acl_default);
+		f->i_acl_default = JFFS2_ACL_NOT_CACHED;
+	}
+}
+
+int jffs2_acl_chmod(struct inode *inode)
+{
+	struct posix_acl *acl, *clone;
+	int rc;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+	acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS);
+	if (IS_ERR(acl) || !acl)
+		return PTR_ERR(acl);
+	clone = posix_acl_clone(acl, GFP_KERNEL);
+	posix_acl_release(acl);
+	if (!clone)
+		return -ENOMEM;
+	rc = posix_acl_chmod_masq(clone, inode->i_mode);
+	if (!rc)
+		rc = jffs2_set_acl(inode, ACL_TYPE_ACCESS, clone);
+	posix_acl_release(clone);
+	return rc;
+}
+
+static size_t jffs2_acl_access_listxattr(struct inode *inode, char *list, size_t list_size,
+					 const char *name, size_t name_len)
+{
+	const int retlen = sizeof(POSIX_ACL_XATTR_ACCESS);
+
+	if (list && retlen <= list_size)
+		strcpy(list, POSIX_ACL_XATTR_ACCESS);
+	return retlen;
+}
+
+static size_t jffs2_acl_default_listxattr(struct inode *inode, char *list, size_t list_size,
+					  const char *name, size_t name_len)
+{
+	const int retlen = sizeof(POSIX_ACL_XATTR_DEFAULT);
+
+	if (list && retlen <= list_size)
+		strcpy(list, POSIX_ACL_XATTR_DEFAULT);
+	return retlen;
+}
+
+static int jffs2_acl_getxattr(struct inode *inode, int type, void *buffer, size_t size)
+{
+	struct posix_acl *acl;
+	int rc;
+
+	acl = jffs2_get_acl(inode, type);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (!acl)
+		return -ENODATA;
+	rc = posix_acl_to_xattr(acl, buffer, size);
+	posix_acl_release(acl);
+
+	return rc;
+}
+
+static int jffs2_acl_access_getxattr(struct inode *inode, const char *name, void *buffer, size_t size)
+{
+	if (name[0] != '\0')
+		return -EINVAL;
+	return jffs2_acl_getxattr(inode, ACL_TYPE_ACCESS, buffer, size);
+}
+
+static int jffs2_acl_default_getxattr(struct inode *inode, const char *name, void *buffer, size_t size)
+{
+	if (name[0] != '\0')
+		return -EINVAL;
+	return jffs2_acl_getxattr(inode, ACL_TYPE_DEFAULT, buffer, size);
+}
+
+static int jffs2_acl_setxattr(struct inode *inode, int type, const void *value, size_t size)
+{
+	struct posix_acl *acl;
+	int rc;
+
+	if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+		return -EPERM;
+
+	if (value) {
+		acl = posix_acl_from_xattr(value, size);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		if (acl) {
+			rc = posix_acl_valid(acl);
+			if (rc)
+				goto out;
+		}
+	} else {
+		acl = NULL;
+	}
+	rc = jffs2_set_acl(inode, type, acl);
+ out:
+	posix_acl_release(acl);
+	return rc;
+}
+
+static int jffs2_acl_access_setxattr(struct inode *inode, const char *name,
+				     const void *buffer, size_t size, int flags)
+{
+	if (name[0] != '\0')
+		return -EINVAL;
+	return jffs2_acl_setxattr(inode, ACL_TYPE_ACCESS, buffer, size);
+}
+
+static int jffs2_acl_default_setxattr(struct inode *inode, const char *name,
+				      const void *buffer, size_t size, int flags)
+{
+	if (name[0] != '\0')
+		return -EINVAL;
+	return jffs2_acl_setxattr(inode, ACL_TYPE_DEFAULT, buffer, size);
+}
+
+struct xattr_handler jffs2_acl_access_xattr_handler = {
+	.prefix	= POSIX_ACL_XATTR_ACCESS,
+	.list	= jffs2_acl_access_listxattr,
+	.get	= jffs2_acl_access_getxattr,
+	.set	= jffs2_acl_access_setxattr,
+};
+
+struct xattr_handler jffs2_acl_default_xattr_handler = {
+	.prefix	= POSIX_ACL_XATTR_DEFAULT,
+	.list	= jffs2_acl_default_listxattr,
+	.get	= jffs2_acl_default_getxattr,
+	.set	= jffs2_acl_default_setxattr,
+};
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
new file mode 100644
index 00000000000..c98610b4e81
--- /dev/null
+++ b/fs/jffs2/acl.h
@@ -0,0 +1,46 @@
+/*-------------------------------------------------------------------------*
+ *  File: fs/jffs2/acl.h
+ *  POSIX ACL support on JFFS2 FileSystem
+ *
+ *  Implemented by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *  Copyright (C) 2006 NEC Corporation
+ *
+ *  For licensing information, see the file 'LICENCE' in the jffs2 directory.
+ *-------------------------------------------------------------------------*/
+typedef struct {
+	jint16_t	e_tag;
+	jint16_t	e_perm;
+	jint32_t	e_id;
+} jffs2_acl_entry;
+
+typedef struct {
+	jint16_t	e_tag;
+	jint16_t	e_perm;
+} jffs2_acl_entry_short;
+
+typedef struct {
+	jint32_t	a_version;
+} jffs2_acl_header;
+
+#ifdef __KERNEL__
+#ifdef CONFIG_JFFS2_FS_POSIX_ACL
+
+#define JFFS2_ACL_NOT_CACHED ((void *)-1)
+
+extern int jffs2_permission(struct inode *, int, struct nameidata *);
+extern int jffs2_acl_chmod(struct inode *);
+extern int jffs2_init_acl(struct inode *, struct inode *);
+extern void jffs2_clear_acl(struct inode *);
+
+extern struct xattr_handler jffs2_acl_access_xattr_handler;
+extern struct xattr_handler jffs2_acl_default_xattr_handler;
+
+#else
+
+#define jffs2_permission NULL
+#define jffs2_acl_chmod(inode)		(0)
+#define jffs2_init_acl(inode,dir)	(0)
+#define jffs2_clear_acl(inode)
+
+#endif	/* CONFIG_JFFS2_FS_POSIX_ACL */
+#endif	/* __KERNEL__ */
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index 70f7a896c04..02826967ab5 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -160,6 +160,7 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c)
 		ic->scan_dents = NULL;
 		cond_resched();
 	}
+	jffs2_build_xattr_subsystem(c);
 	c->flags &= ~JFFS2_SB_FLAG_BUILDING;
 
 	dbg_fsbuild("FS build complete\n");
@@ -178,6 +179,7 @@ exit:
 				jffs2_free_full_dirent(fd);
 			}
 		}
+		jffs2_clear_xattr_subsystem(c);
 	}
 
 	return ret;
diff --git a/fs/jffs2/debug.h b/fs/jffs2/debug.h
index 162af6dfe29..5fa494a792b 100644
--- a/fs/jffs2/debug.h
+++ b/fs/jffs2/debug.h
@@ -171,6 +171,12 @@
 #define dbg_memalloc(fmt, ...)
 #endif
 
+/* Watch the XATTR subsystem */
+#ifdef JFFS2_DBG_XATTR_MESSAGES
+#define dbg_xattr(fmt, ...)  JFFS2_DEBUG(fmt, ##__VA_ARGS__)
+#else
+#define dbg_xattr(fmt, ...)
+#endif 
 
 /* "Sanity" checks */
 void
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 1c8e8c0f6ce..f1b18b99a3c 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -57,7 +57,12 @@ struct inode_operations jffs2_dir_inode_operations =
 	.rmdir =	jffs2_rmdir,
 	.mknod =	jffs2_mknod,
 	.rename =	jffs2_rename,
+	.permission =	jffs2_permission,
 	.setattr =	jffs2_setattr,
+	.setxattr =	jffs2_setxattr,
+	.getxattr =	jffs2_getxattr,
+	.listxattr =	jffs2_listxattr,
+	.removexattr =	jffs2_removexattr
 };
 
 /***********************************************************************/
@@ -209,12 +214,15 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode,
 	ret = jffs2_do_create(c, dir_f, f, ri,
 			      dentry->d_name.name, dentry->d_name.len);
 
-	if (ret) {
-		make_bad_inode(inode);
-		iput(inode);
-		jffs2_free_raw_inode(ri);
-		return ret;
-	}
+	if (ret)
+		goto fail;
+
+	ret = jffs2_init_security(inode, dir_i);
+	if (ret)
+		goto fail;
+	ret = jffs2_init_acl(inode, dir_i);
+	if (ret)
+		goto fail;
 
 	dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime));
 
@@ -224,6 +232,12 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode,
 	D1(printk(KERN_DEBUG "jffs2_create: Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n",
 		  inode->i_ino, inode->i_mode, inode->i_nlink, f->inocache->nlink, inode->i_mapping->nrpages));
 	return 0;
+
+ fail:
+	make_bad_inode(inode);
+	iput(inode);
+	jffs2_free_raw_inode(ri);
+	return ret;
 }
 
 /***********************************************************************/
@@ -374,6 +388,18 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 	up(&f->sem);
 
 	jffs2_complete_reservation(c);
+
+	ret = jffs2_init_security(inode, dir_i);
+	if (ret) {
+		jffs2_clear_inode(inode);
+		return ret;
+	}
+	ret = jffs2_init_acl(inode, dir_i);
+	if (ret) {
+		jffs2_clear_inode(inode);
+		return ret;
+	}
+
 	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &phys_ofs, &alloclen,
 				ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
 	if (ret) {
@@ -504,6 +530,18 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
 	up(&f->sem);
 
 	jffs2_complete_reservation(c);
+
+	ret = jffs2_init_security(inode, dir_i);
+	if (ret) {
+		jffs2_clear_inode(inode);
+		return ret;
+	}
+	ret = jffs2_init_acl(inode, dir_i);
+	if (ret) {
+		jffs2_clear_inode(inode);
+		return ret;
+	}
+
 	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &phys_ofs, &alloclen,
 				ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
 	if (ret) {
@@ -660,6 +698,18 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
 	up(&f->sem);
 
 	jffs2_complete_reservation(c);
+
+	ret = jffs2_init_security(inode, dir_i);
+	if (ret) {
+		jffs2_clear_inode(inode);
+		return ret;
+	}
+	ret = jffs2_init_acl(inode, dir_i);
+	if (ret) {
+		jffs2_clear_inode(inode);
+		return ret;
+	}
+
 	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &phys_ofs, &alloclen,
 				ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
 	if (ret) {
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 9f4171213e5..e92187f34d5 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -54,7 +54,12 @@ const struct file_operations jffs2_file_operations =
 
 struct inode_operations jffs2_file_inode_operations =
 {
-	.setattr =	jffs2_setattr
+	.permission =	jffs2_permission,
+	.setattr =	jffs2_setattr,
+	.setxattr =	jffs2_setxattr,
+	.getxattr =	jffs2_getxattr,
+	.listxattr =	jffs2_listxattr,
+	.removexattr =	jffs2_removexattr
 };
 
 struct address_space_operations jffs2_file_address_operations =
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index ea1f37d4fc5..4607cdc4c46 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -185,7 +185,12 @@ static int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
 
 int jffs2_setattr(struct dentry *dentry, struct iattr *iattr)
 {
-	return jffs2_do_setattr(dentry->d_inode, iattr);
+	int rc;
+
+	rc = jffs2_do_setattr(dentry->d_inode, iattr);
+	if (!rc && (iattr->ia_valid & ATTR_MODE))
+		rc = jffs2_acl_chmod(dentry->d_inode);
+	return rc;
 }
 
 int jffs2_statfs(struct super_block *sb, struct kstatfs *buf)
@@ -224,6 +229,7 @@ void jffs2_clear_inode (struct inode *inode)
 
 	D1(printk(KERN_DEBUG "jffs2_clear_inode(): ino #%lu mode %o\n", inode->i_ino, inode->i_mode));
 
+	jffs2_xattr_delete_inode(c, f->inocache);
 	jffs2_do_clear_inode(c, f);
 }
 
@@ -497,6 +503,8 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
 	}
 	memset(c->inocache_list, 0, INOCACHE_HASHSIZE * sizeof(struct jffs2_inode_cache *));
 
+	jffs2_init_xattr_subsystem(c);
+
 	if ((ret = jffs2_do_mount_fs(c)))
 		goto out_inohash;
 
@@ -531,6 +539,7 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
 	else
 		kfree(c->blocks);
  out_inohash:
+	jffs2_clear_xattr_subsystem(c);
 	kfree(c->inocache_list);
  out_wbuf:
 	jffs2_flash_cleanup(c);
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 967fb2cf8e2..4ea1b7f0ae7 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -125,6 +125,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
 	struct jffs2_eraseblock *jeb;
 	struct jffs2_raw_node_ref *raw;
 	int ret = 0, inum, nlink;
+	int xattr = 0;
 
 	if (down_interruptible(&c->alloc_sem))
 		return -EINTR;
@@ -138,7 +139,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
 		   the node CRCs etc. Do it now. */
 
 		/* checked_ino is protected by the alloc_sem */
-		if (c->checked_ino > c->highest_ino) {
+		if (c->checked_ino > c->highest_ino && xattr) {
 			printk(KERN_CRIT "Checked all inodes but still 0x%x bytes of unchecked space?\n",
 			       c->unchecked_size);
 			jffs2_dbg_dump_block_lists_nolock(c);
@@ -148,6 +149,9 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
 
 		spin_unlock(&c->erase_completion_lock);
 
+		if (!xattr)
+			xattr = jffs2_verify_xattr(c);
+
 		spin_lock(&c->inocache_lock);
 
 		ic = jffs2_get_ino_cache(c, c->checked_ino++);
@@ -262,6 +266,16 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
 
 	ic = jffs2_raw_ref_to_ic(raw);
 
+	/* When 'ic' refers xattr_datum/xattr_ref, this node is GCed as xattr.
+	   We can decide whether this node is inode or xattr by ic->class.
+	   ret = 0 : ic is xattr_datum/xattr_ref, and GC was SUCCESSED.
+	   ret < 0 : ic is xattr_datum/xattr_ref, but GC was FAILED.
+	   ret > 0 : ic is NOT xattr_datum/xattr_ref.
+	*/
+	ret = jffs2_garbage_collect_xattr(c, ic);
+	if (ret <= 0)
+		goto release_sem;
+
 	/* We need to hold the inocache. Either the erase_completion_lock or
 	   the inocache_lock are sufficient; we trade down since the inocache_lock
 	   causes less contention. */
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h
index ad565bf9dcc..2e0cc8e00b8 100644
--- a/fs/jffs2/jffs2_fs_i.h
+++ b/fs/jffs2/jffs2_fs_i.h
@@ -5,6 +5,7 @@
 
 #include <linux/version.h>
 #include <linux/rbtree.h>
+#include <linux/posix_acl.h>
 #include <asm/semaphore.h>
 
 struct jffs2_inode_info {
@@ -45,6 +46,10 @@ struct jffs2_inode_info {
 	struct inode vfs_inode;
 #endif
 #endif
+#ifdef CONFIG_JFFS2_FS_POSIX_ACL
+	struct posix_acl *i_acl_access;
+	struct posix_acl *i_acl_default;
+#endif
 };
 
 #endif /* _JFFS2_FS_I */
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 4bcfb557022..3b4e0edd6db 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -115,6 +115,16 @@ struct jffs2_sb_info {
 
 	struct jffs2_summary *summary;		/* Summary information */
 
+#ifdef CONFIG_JFFS2_FS_XATTR
+#define XATTRINDEX_HASHSIZE	(57)
+	uint32_t highest_xid;
+	struct list_head xattrindex[XATTRINDEX_HASHSIZE];
+	struct list_head xattr_temp;
+	struct list_head xattr_unchecked;
+	struct rw_semaphore xattr_sem;
+	uint32_t xdatum_mem_usage;
+	uint32_t xdatum_mem_threshold;
+#endif
 	/* OS-private pointer for getting back to master superblock info */
 	void *os_priv;
 };
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index 036cbd11c00..3d5b7ecfbf8 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -26,6 +26,10 @@ static kmem_cache_t *tmp_dnode_info_slab;
 static kmem_cache_t *raw_node_ref_slab;
 static kmem_cache_t *node_frag_slab;
 static kmem_cache_t *inode_cache_slab;
+#ifdef CONFIG_JFFS2_FS_XATTR
+static kmem_cache_t *xattr_datum_cache;
+static kmem_cache_t *xattr_ref_cache;
+#endif
 
 int __init jffs2_create_slab_caches(void)
 {
@@ -68,8 +72,24 @@ int __init jffs2_create_slab_caches(void)
 	inode_cache_slab = kmem_cache_create("jffs2_inode_cache",
 					     sizeof(struct jffs2_inode_cache),
 					     0, 0, NULL, NULL);
-	if (inode_cache_slab)
-		return 0;
+	if (!inode_cache_slab)
+		goto err;
+
+#ifdef CONFIG_JFFS2_FS_XATTR
+	xattr_datum_cache = kmem_cache_create("jffs2_xattr_datum",
+					     sizeof(struct jffs2_xattr_datum),
+					     0, 0, NULL, NULL);
+	if (!xattr_datum_cache)
+		goto err;
+
+	xattr_ref_cache = kmem_cache_create("jffs2_xattr_ref",
+					   sizeof(struct jffs2_xattr_ref),
+					   0, 0, NULL, NULL);
+	if (!xattr_ref_cache)
+		goto err;
+#endif
+
+	return 0;
  err:
 	jffs2_destroy_slab_caches();
 	return -ENOMEM;
@@ -91,6 +111,12 @@ void jffs2_destroy_slab_caches(void)
 		kmem_cache_destroy(node_frag_slab);
 	if(inode_cache_slab)
 		kmem_cache_destroy(inode_cache_slab);
+#ifdef CONFIG_JFFS2_FS_XATTR
+	if (xattr_datum_cache)
+		kmem_cache_destroy(xattr_datum_cache);
+	if (xattr_ref_cache)
+		kmem_cache_destroy(xattr_ref_cache);
+#endif
 }
 
 struct jffs2_full_dirent *jffs2_alloc_full_dirent(int namesize)
@@ -205,3 +231,41 @@ void jffs2_free_inode_cache(struct jffs2_inode_cache *x)
 	dbg_memalloc("%p\n", x);
 	kmem_cache_free(inode_cache_slab, x);
 }
+
+#ifdef CONFIG_JFFS2_FS_XATTR
+struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void)
+{
+	struct jffs2_xattr_datum *xd;
+	xd = kmem_cache_alloc(xattr_datum_cache, GFP_KERNEL);
+	dbg_memalloc("%p\n", xd);
+
+	memset(xd, 0, sizeof(struct jffs2_xattr_datum));
+	xd->class = RAWNODE_CLASS_XATTR_DATUM;
+	INIT_LIST_HEAD(&xd->xindex);
+	return xd;
+}
+
+void jffs2_free_xattr_datum(struct jffs2_xattr_datum *xd)
+{
+	dbg_memalloc("%p\n", xd);
+	kmem_cache_free(xattr_datum_cache, xd);
+}
+
+struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void)
+{
+	struct jffs2_xattr_ref *ref;
+	ref = kmem_cache_alloc(xattr_ref_cache, GFP_KERNEL);
+	dbg_memalloc("%p\n", ref);
+
+	memset(ref, 0, sizeof(struct jffs2_xattr_ref));
+	ref->class = RAWNODE_CLASS_XATTR_REF;
+	INIT_LIST_HEAD(&ref->ilist);
+	return ref;
+}
+
+void jffs2_free_xattr_ref(struct jffs2_xattr_ref *ref)
+{
+	dbg_memalloc("%p\n", ref);
+	kmem_cache_free(xattr_ref_cache, ref);
+}
+#endif
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index d4d0c41490c..9c575733659 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -938,6 +938,7 @@ void jffs2_free_ino_caches(struct jffs2_sb_info *c)
 		this = c->inocache_list[i];
 		while (this) {
 			next = this->next;
+			jffs2_xattr_free_inode(c, this);
 			jffs2_free_inode_cache(this);
 			this = next;
 		}
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index f6645afe88e..6f6279cf490 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -20,6 +20,8 @@
 #include <linux/jffs2.h>
 #include "jffs2_fs_sb.h"
 #include "jffs2_fs_i.h"
+#include "xattr.h"
+#include "acl.h"
 #include "summary.h"
 
 #ifdef __ECOS
@@ -107,11 +109,16 @@ struct jffs2_inode_cache {
 		temporary lists of dirents, and later must be set to
 		NULL to mark the end of the raw_node_ref->next_in_ino
 		chain. */
+	u8 class;	/* It's used for identification */
+	u8 flags;
+	uint16_t state;
 	struct jffs2_inode_cache *next;
 	struct jffs2_raw_node_ref *nodes;
 	uint32_t ino;
 	int nlink;
-	int state;
+#ifdef CONFIG_JFFS2_FS_XATTR
+	struct list_head ilist;
+#endif
 };
 
 /* Inode states for 'state' above. We need the 'GC' state to prevent
@@ -125,6 +132,12 @@ struct jffs2_inode_cache {
 #define INO_STATE_READING	5	/* In read_inode() */
 #define INO_STATE_CLEARING	6	/* In clear_inode() */
 
+#define INO_FLAGS_XATTR_CHECKED	0x01	/* has no duplicate xattr_ref */
+
+#define RAWNODE_CLASS_INODE_CACHE	0
+#define RAWNODE_CLASS_XATTR_DATUM	1
+#define RAWNODE_CLASS_XATTR_REF		2
+
 #define INOCACHE_HASHSIZE 128
 
 /*
@@ -374,6 +387,12 @@ struct jffs2_node_frag *jffs2_alloc_node_frag(void);
 void jffs2_free_node_frag(struct jffs2_node_frag *);
 struct jffs2_inode_cache *jffs2_alloc_inode_cache(void);
 void jffs2_free_inode_cache(struct jffs2_inode_cache *);
+#ifdef CONFIG_JFFS2_FS_XATTR
+struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void);
+void jffs2_free_xattr_datum(struct jffs2_xattr_datum *);
+struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void);
+void jffs2_free_xattr_ref(struct jffs2_xattr_ref *);
+#endif
 
 /* gc.c */
 int jffs2_garbage_collect_pass(struct jffs2_sb_info *c);
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index d307cf54862..9936ae23f8d 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -60,6 +60,10 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
 	f->target = NULL;
 	f->flags = 0;
 	f->usercompr = 0;
+#ifdef CONFIG_JFFS2_FS_POSIX_ACL
+	f->i_acl_access = JFFS2_ACL_NOT_CACHED;
+	f->i_acl_default = JFFS2_ACL_NOT_CACHED;
+#endif
 }
 
 
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index e1acce8fb2b..61ccdf4f104 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -902,6 +902,7 @@ int jffs2_do_read_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
 		f->inocache->ino = f->inocache->nlink = 1;
 		f->inocache->nodes = (struct jffs2_raw_node_ref *)f->inocache;
 		f->inocache->state = INO_STATE_READING;
+		init_xattr_inode_cache(f->inocache);
 		jffs2_add_ino_cache(c, f->inocache);
 	}
 	if (!f->inocache) {
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index cf55b221fc2..f09689e320f 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -306,6 +306,136 @@ int jffs2_scan_classify_jeb(struct jffs2_sb_info *c, struct jffs2_eraseblock *je
 		return BLK_STATE_ALLDIRTY;
 }
 
+#ifdef CONFIG_JFFS2_FS_XATTR
+static int jffs2_scan_xattr_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+				 struct jffs2_raw_xattr *rx, uint32_t ofs,
+				 struct jffs2_summary *s)
+{
+	struct jffs2_xattr_datum *xd;
+	struct jffs2_raw_node_ref *raw;
+	uint32_t totlen, crc;
+
+	crc = crc32(0, rx, sizeof(struct jffs2_raw_xattr) - 4);
+	if (crc != je32_to_cpu(rx->node_crc)) {
+		if (je32_to_cpu(rx->node_crc) != 0xffffffff)
+			JFFS2_WARNING("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
+				      ofs, je32_to_cpu(rx->node_crc), crc);
+		DIRTY_SPACE(je32_to_cpu(rx->totlen));
+		return 0;
+	}
+
+	totlen = PAD(sizeof(*rx) + rx->name_len + 1 + je16_to_cpu(rx->value_len));
+	if (totlen != je32_to_cpu(rx->totlen)) {
+		JFFS2_WARNING("node length mismatch at %#08x, read=%u, calc=%u\n",
+			      ofs, je32_to_cpu(rx->totlen), totlen);
+		DIRTY_SPACE(je32_to_cpu(rx->totlen));
+		return 0;
+	}
+
+	raw =  jffs2_alloc_raw_node_ref();
+	if (!raw)
+		return -ENOMEM;
+
+	xd = jffs2_setup_xattr_datum(c, je32_to_cpu(rx->xid), je32_to_cpu(rx->version));
+	if (IS_ERR(xd)) {
+		jffs2_free_raw_node_ref(raw);
+		if (PTR_ERR(xd) == -EEXIST) {
+			DIRTY_SPACE(PAD(je32_to_cpu(rx->totlen)));
+			return 0;
+		}
+		return PTR_ERR(xd);
+	}
+	xd->xprefix = rx->xprefix;
+	xd->name_len = rx->name_len;
+	xd->value_len = je16_to_cpu(rx->value_len);
+	xd->data_crc = je32_to_cpu(rx->data_crc);
+	xd->node = raw;
+
+	raw->__totlen = totlen;
+	raw->flash_offset = ofs | REF_PRISTINE;
+	raw->next_phys = NULL;
+	raw->next_in_ino = (void *)xd;
+	if (!jeb->first_node)
+		jeb->first_node = raw;
+	if (jeb->last_node)
+		jeb->last_node->next_phys = raw;
+	jeb->last_node = raw;
+
+	USED_SPACE(PAD(je32_to_cpu(rx->totlen)));
+	if (jffs2_sum_active())
+		jffs2_sum_add_xattr_mem(s, rx, ofs - jeb->offset);
+	dbg_xattr("scaning xdatum at %#08x (xid=%u, version=%u)\n",
+		  ofs, xd->xid, xd->version);
+	return 0;
+}
+
+static int jffs2_scan_xref_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+				struct jffs2_raw_xref *rr, uint32_t ofs,
+				struct jffs2_summary *s)
+{
+	struct jffs2_xattr_ref *ref;
+	struct jffs2_raw_node_ref *raw;
+	uint32_t crc;
+
+	crc = crc32(0, rr, sizeof(*rr) - 4);
+	if (crc != je32_to_cpu(rr->node_crc)) {
+		if (je32_to_cpu(rr->node_crc) != 0xffffffff)
+			JFFS2_WARNING("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
+				      ofs, je32_to_cpu(rr->node_crc), crc);
+		DIRTY_SPACE(PAD(je32_to_cpu(rr->totlen)));
+		return 0;
+	}
+
+	if (PAD(sizeof(struct jffs2_raw_xref)) != je32_to_cpu(rr->totlen)) {
+		JFFS2_WARNING("node length mismatch at %#08x, read=%u, calc=%u\n",
+			      ofs, je32_to_cpu(rr->totlen),
+			      PAD(sizeof(struct jffs2_raw_xref)));
+		DIRTY_SPACE(je32_to_cpu(rr->totlen));
+		return 0;
+	}
+
+	ref = jffs2_alloc_xattr_ref();
+	if (!ref)
+		return -ENOMEM;
+
+	raw =  jffs2_alloc_raw_node_ref();
+	if (!raw) {
+		jffs2_free_xattr_ref(ref);
+		return -ENOMEM;
+	}
+
+	/* BEFORE jffs2_build_xattr_subsystem() called, 
+	 * ref->xid is used to store 32bit xid, xd is not used
+	 * ref->ino is used to store 32bit inode-number, ic is not used
+	 * Thoes variables are declared as union, thus using those
+	 * are exclusive. In a similar way, ref->ilist is temporarily
+	 * used to chain all xattr_ref object. It's re-chained to
+	 * jffs2_inode_cache in jffs2_build_xattr_subsystem() correctly.
+	 */
+	ref->node = raw;
+	ref->ino = je32_to_cpu(rr->ino);
+	ref->xid = je32_to_cpu(rr->xid);
+	list_add_tail(&ref->ilist, &c->xattr_temp);
+
+	raw->__totlen = PAD(je32_to_cpu(rr->totlen));
+	raw->flash_offset = ofs | REF_PRISTINE;
+	raw->next_phys = NULL;
+	raw->next_in_ino = (void *)ref;
+	if (!jeb->first_node)
+		jeb->first_node = raw;
+	if (jeb->last_node)
+		jeb->last_node->next_phys = raw;
+	jeb->last_node = raw;
+
+	USED_SPACE(PAD(je32_to_cpu(rr->totlen)));	
+	if (jffs2_sum_active())
+		jffs2_sum_add_xref_mem(s, rr, ofs - jeb->offset);
+	dbg_xattr("scan xref at %#08x (xid=%u, ino=%u)\n",
+		  ofs, ref->xid, ref->ino);
+	return 0;
+}
+#endif
+
 static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
 				unsigned char *buf, uint32_t buf_size, struct jffs2_summary *s) {
 	struct jffs2_unknown_node *node;
@@ -614,6 +744,43 @@ scan_more:
 			ofs += PAD(je32_to_cpu(node->totlen));
 			break;
 
+#ifdef CONFIG_JFFS2_FS_XATTR
+		case JFFS2_NODETYPE_XATTR:
+			if (buf_ofs + buf_len < ofs + je32_to_cpu(node->totlen)) {
+				buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs);
+				D1(printk(KERN_DEBUG "Fewer than %d bytes (xattr node)"
+					  " left to end of buf. Reading 0x%x at 0x%08x\n",
+					  je32_to_cpu(node->totlen), buf_len, ofs));
+				err = jffs2_fill_scan_buf(c, buf, ofs, buf_len);
+				if (err)
+					return err;
+				buf_ofs = ofs;
+				node = (void *)buf;
+			}
+			err = jffs2_scan_xattr_node(c, jeb, (void *)node, ofs, s);
+			if (err)
+				return err;
+			ofs += PAD(je32_to_cpu(node->totlen));
+			break;
+		case JFFS2_NODETYPE_XREF:
+			if (buf_ofs + buf_len < ofs + je32_to_cpu(node->totlen)) {
+				buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs);
+				D1(printk(KERN_DEBUG "Fewer than %d bytes (xref node)"
+					  " left to end of buf. Reading 0x%x at 0x%08x\n",
+					  je32_to_cpu(node->totlen), buf_len, ofs));
+				err = jffs2_fill_scan_buf(c, buf, ofs, buf_len);
+				if (err)
+					return err;
+				buf_ofs = ofs;
+				node = (void *)buf;
+			}
+			err = jffs2_scan_xref_node(c, jeb, (void *)node, ofs, s);
+			if (err)
+				return err;
+			ofs += PAD(je32_to_cpu(node->totlen));
+			break;
+#endif	/* CONFIG_JFFS2_FS_XATTR */
+
 		case JFFS2_NODETYPE_CLEANMARKER:
 			D1(printk(KERN_DEBUG "CLEANMARKER node found at 0x%08x\n", ofs));
 			if (je32_to_cpu(node->totlen) != c->cleanmarker_size) {
@@ -721,6 +888,7 @@ struct jffs2_inode_cache *jffs2_scan_make_ino_cache(struct jffs2_sb_info *c, uin
 
 	ic->ino = ino;
 	ic->nodes = (void *)ic;
+	init_xattr_inode_cache(ic);
 	jffs2_add_ino_cache(c, ic);
 	if (ino == 1)
 		ic->nlink = 1;
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
new file mode 100644
index 00000000000..4b6c3b22524
--- /dev/null
+++ b/fs/jffs2/security.c
@@ -0,0 +1,82 @@
+/*-------------------------------------------------------------------------*
+ *  File: fs/jffs2/security.c
+ *  Security Labels support on JFFS2 FileSystem
+ *
+ *  Implemented by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *  Copyright (C) 2006 NEC Corporation
+ *
+ *  For licensing information, see the file 'LICENCE' in the jffs2 directory.
+ *-------------------------------------------------------------------------*/
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/crc32.h>
+#include <linux/jffs2.h>
+#include <linux/xattr.h>
+#include <linux/mtd/mtd.h>
+#include <linux/security.h>
+#include "nodelist.h"
+
+/* ---- Initial Security Label Attachment -------------- */
+int jffs2_init_security(struct inode *inode, struct inode *dir)
+{
+	int rc;
+	size_t len;
+	void *value;
+	char *name;
+
+	rc = security_inode_init_security(inode, dir, &name, &value, &len);
+	if (rc) {
+		if (rc == -EOPNOTSUPP)
+			return 0;
+		return rc;
+	}
+	rc = do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, name, value, len, 0);
+
+        kfree(name);
+        kfree(value);
+        return rc;
+}
+
+/* ---- XATTR Handler for "security.*" ----------------- */
+static int jffs2_security_getxattr(struct inode *inode, const char *name,
+				   void *buffer, size_t size)
+{
+	if (!strcmp(name, ""))
+		return -EINVAL;
+
+	return do_jffs2_getxattr(inode, JFFS2_XPREFIX_SECURITY, name, buffer, size);
+}
+
+static int jffs2_security_setxattr(struct inode *inode, const char *name, const void *buffer,
+				   size_t size, int flags)
+{
+	if (!strcmp(name, ""))
+		return -EINVAL;
+
+	return do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, name, buffer, size, flags);
+}
+
+static size_t jffs2_security_listxattr(struct inode *inode, char *list, size_t list_size,
+				       const char *name, size_t name_len)
+{
+	size_t retlen = XATTR_SECURITY_PREFIX_LEN + name_len + 1;
+
+	if (list && retlen <= list_size) {
+		strcpy(list, XATTR_SECURITY_PREFIX);
+		strcpy(list + XATTR_SECURITY_PREFIX_LEN, name);
+	}
+
+	return retlen;
+}
+
+struct xattr_handler jffs2_security_xattr_handler = {
+	.prefix = XATTR_SECURITY_PREFIX,
+	.list = jffs2_security_listxattr,
+	.set = jffs2_security_setxattr,
+	.get = jffs2_security_getxattr
+};
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 7b0ed77a4c3..5d9ec8e3652 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -5,6 +5,7 @@
  *                     Zoltan Sogor <weth@inf.u-szeged.hu>,
  *                     Patrik Kluba <pajko@halom.u-szeged.hu>,
  *                     University of Szeged, Hungary
+ *               2005  KaiGai Kohei <kaigai@ak.jp.nec.com>
  *
  * For licensing information, see the file 'LICENCE' in this directory.
  *
@@ -81,6 +82,19 @@ static int jffs2_sum_add_mem(struct jffs2_summary *s, union jffs2_sum_mem *item)
 			dbg_summary("dirent (%u) added to summary\n",
 						je32_to_cpu(item->d.ino));
 			break;
+#ifdef CONFIG_JFFS2_FS_XATTR
+		case JFFS2_NODETYPE_XATTR:
+			s->sum_size += JFFS2_SUMMARY_XATTR_SIZE;
+			s->sum_num++;
+			dbg_summary("xattr (xid=%u, version=%u) added to summary\n",
+				    je32_to_cpu(item->x.xid), je32_to_cpu(item->x.version));
+			break;
+		case JFFS2_NODETYPE_XREF:
+			s->sum_size += JFFS2_SUMMARY_XREF_SIZE;
+			s->sum_num++;
+			dbg_summary("xref added to summary\n");
+			break;
+#endif
 		default:
 			JFFS2_WARNING("UNKNOWN node type %u\n",
 					    je16_to_cpu(item->u.nodetype));
@@ -141,6 +155,40 @@ int jffs2_sum_add_dirent_mem(struct jffs2_summary *s, struct jffs2_raw_dirent *r
 	return jffs2_sum_add_mem(s, (union jffs2_sum_mem *)temp);
 }
 
+#ifdef CONFIG_JFFS2_FS_XATTR
+int jffs2_sum_add_xattr_mem(struct jffs2_summary *s, struct jffs2_raw_xattr *rx, uint32_t ofs)
+{
+	struct jffs2_sum_xattr_mem *temp;
+
+	temp = kmalloc(sizeof(struct jffs2_sum_xattr_mem), GFP_KERNEL);
+	if (!temp)
+		return -ENOMEM;
+
+	temp->nodetype = rx->nodetype;
+	temp->xid = rx->xid;
+	temp->version = rx->version;
+	temp->offset = cpu_to_je32(ofs);
+	temp->totlen = rx->totlen;
+	temp->next = NULL;
+
+	return jffs2_sum_add_mem(s, (union jffs2_sum_mem *)temp);
+}
+
+int jffs2_sum_add_xref_mem(struct jffs2_summary *s, struct jffs2_raw_xref *rr, uint32_t ofs)
+{
+	struct jffs2_sum_xref_mem *temp;
+
+	temp = kmalloc(sizeof(struct jffs2_sum_xref_mem), GFP_KERNEL);
+	if (!temp)
+		return -ENOMEM;
+
+	temp->nodetype = rr->nodetype;
+	temp->offset = cpu_to_je32(ofs);
+	temp->next = NULL;
+
+	return jffs2_sum_add_mem(s, (union jffs2_sum_mem *)temp);
+}
+#endif
 /* Cleanup every collected summary information */
 
 static void jffs2_sum_clean_collected(struct jffs2_summary *s)
@@ -259,7 +307,40 @@ int jffs2_sum_add_kvec(struct jffs2_sb_info *c, const struct kvec *invecs,
 
 			return jffs2_sum_add_mem(c->summary, (union jffs2_sum_mem *)temp);
 		}
+#ifdef CONFIG_JFFS2_FS_XATTR
+		case JFFS2_NODETYPE_XATTR: {
+			struct jffs2_sum_xattr_mem *temp;
+			if (je32_to_cpu(node->x.version) == 0xffffffff)
+				return 0;
+			temp = kmalloc(sizeof(struct jffs2_sum_xattr_mem), GFP_KERNEL);
+			if (!temp)
+				goto no_mem;
 
+			temp->nodetype = node->x.nodetype;
+			temp->xid = node->x.xid;
+			temp->version = node->x.version;
+			temp->totlen = node->x.totlen;
+			temp->offset = cpu_to_je32(ofs);
+			temp->next = NULL;
+
+			return jffs2_sum_add_mem(c->summary, (union jffs2_sum_mem *)temp);
+		}
+		case JFFS2_NODETYPE_XREF: {
+			struct jffs2_sum_xref_mem *temp;
+
+			if (je32_to_cpu(node->r.ino) == 0xffffffff
+			    && je32_to_cpu(node->r.xid) == 0xffffffff)
+				return 0;
+			temp = kmalloc(sizeof(struct jffs2_sum_xref_mem), GFP_KERNEL);
+			if (!temp)
+				goto no_mem;
+			temp->nodetype = node->r.nodetype;
+			temp->offset = cpu_to_je32(ofs);
+			temp->next = NULL;
+
+			return jffs2_sum_add_mem(c->summary, (union jffs2_sum_mem *)temp);
+		}
+#endif
 		case JFFS2_NODETYPE_PADDING:
 			dbg_summary("node PADDING\n");
 			c->summary->sum_padded += je32_to_cpu(node->u.totlen);
@@ -408,8 +489,94 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 
 				break;
 			}
+#ifdef CONFIG_JFFS2_FS_XATTR
+			case JFFS2_NODETYPE_XATTR: {
+				struct jffs2_xattr_datum *xd;
+				struct jffs2_sum_xattr_flash *spx;
+				uint32_t ofs;
+
+				spx = (struct jffs2_sum_xattr_flash *)sp;
+				ofs = jeb->offset + je32_to_cpu(spx->offset);
+				dbg_summary("xattr at %#08x (xid=%u, version=%u)\n", ofs,
+					    je32_to_cpu(spx->xid), je32_to_cpu(spx->version));
+				raw = jffs2_alloc_raw_node_ref();
+				if (!raw) {
+					JFFS2_NOTICE("allocation of node reference failed\n");
+					kfree(summary);
+					return -ENOMEM;
+				}
+				xd = jffs2_setup_xattr_datum(c, je32_to_cpu(spx->xid),
+								je32_to_cpu(spx->version));
+				if (IS_ERR(xd)) {
+					JFFS2_NOTICE("allocation of xattr_datum failed\n");
+					jffs2_free_raw_node_ref(raw);
+					kfree(summary);
+					return PTR_ERR(xd);
+				}
+				xd->node = raw;
 
+				raw->flash_offset = ofs | REF_UNCHECKED;
+				raw->__totlen = PAD(je32_to_cpu(spx->totlen));
+				raw->next_phys = NULL;
+				raw->next_in_ino = (void *)xd;
+				if (!jeb->first_node)
+					jeb->first_node = raw;
+				if (jeb->last_node)
+					jeb->last_node->next_phys = raw;
+				jeb->last_node = raw;
+
+				*pseudo_random += je32_to_cpu(spx->xid);
+				UNCHECKED_SPACE(je32_to_cpu(spx->totlen));
+				sp += JFFS2_SUMMARY_XATTR_SIZE;
+
+				break;
+			}
+			case JFFS2_NODETYPE_XREF: {
+				struct jffs2_xattr_ref *ref;
+				struct jffs2_sum_xref_flash *spr;
+				uint32_t ofs;
+
+				spr = (struct jffs2_sum_xref_flash *)sp;
+				ofs = jeb->offset + je32_to_cpu(spr->offset);
+				dbg_summary("xref at %#08x (xid=%u, ino=%u)\n", ofs,
+					    je32_to_cpu(spr->xid), je32_to_cpu(spr->ino));
+				raw = jffs2_alloc_raw_node_ref();
+				if (!raw) {
+					JFFS2_NOTICE("allocation of node reference failed\n");
+					kfree(summary);
+					return -ENOMEM;
+				}
+				ref = jffs2_alloc_xattr_ref();
+				if (!ref) {
+					JFFS2_NOTICE("allocation of xattr_datum failed\n");
+					jffs2_free_raw_node_ref(raw);
+					kfree(summary);
+					return -ENOMEM;
+				}
+				ref->ino = 0xfffffffe;
+				ref->xid = 0xfffffffd;
+				ref->node = raw;
+				list_add_tail(&ref->ilist, &c->xattr_temp);
+
+				raw->__totlen = PAD(sizeof(struct jffs2_raw_xref));
+				raw->flash_offset = ofs | REF_UNCHECKED;
+				raw->next_phys = NULL;
+				raw->next_in_ino = (void *)ref;
+				if (!jeb->first_node)
+					jeb->first_node = raw;
+				if (jeb->last_node)
+					jeb->last_node->next_phys = raw;
+				jeb->last_node = raw;
+
+				UNCHECKED_SPACE(PAD(sizeof(struct jffs2_raw_xref)));
+				*pseudo_random += ofs;
+				sp += JFFS2_SUMMARY_XREF_SIZE;
+
+				break;
+			}
+#endif
 			default : {
+printk("nodetype = %#04x\n",je16_to_cpu(((struct jffs2_sum_unknown_flash *)sp)->nodetype));
 				JFFS2_WARNING("Unsupported node type found in summary! Exiting...");
 				kfree(summary);
 				return -EIO;
@@ -617,7 +784,31 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
 
 				break;
 			}
+#ifdef CONFIG_JFFS2_FS_XATTR
+			case JFFS2_NODETYPE_XATTR: {
+				struct jffs2_sum_xattr_flash *sxattr_ptr = wpage;
+
+				temp = c->summary->sum_list_head;
+				sxattr_ptr->nodetype = temp->x.nodetype;
+				sxattr_ptr->xid = temp->x.xid;
+				sxattr_ptr->version = temp->x.version;
+				sxattr_ptr->offset = temp->x.offset;
+				sxattr_ptr->totlen = temp->x.totlen;
+
+				wpage += JFFS2_SUMMARY_XATTR_SIZE;
+				break;
+			}
+			case JFFS2_NODETYPE_XREF: {
+				struct jffs2_sum_xref_flash *sxref_ptr = wpage;
+
+				temp = c->summary->sum_list_head;
+				sxref_ptr->nodetype = temp->r.nodetype;
+				sxref_ptr->offset = temp->r.offset;
 
+				wpage += JFFS2_SUMMARY_XREF_SIZE;
+				break;
+			}
+#endif
 			default : {
 				BUG();	/* unknown node in summary information */
 			}
diff --git a/fs/jffs2/summary.h b/fs/jffs2/summary.h
index b7a678be170..a3b66c18aae 100644
--- a/fs/jffs2/summary.h
+++ b/fs/jffs2/summary.h
@@ -45,6 +45,8 @@
 #define JFFS2_SUMMARY_NOSUM_SIZE 0xffffffff
 #define JFFS2_SUMMARY_INODE_SIZE (sizeof(struct jffs2_sum_inode_flash))
 #define JFFS2_SUMMARY_DIRENT_SIZE(x) (sizeof(struct jffs2_sum_dirent_flash) + (x))
+#define JFFS2_SUMMARY_XATTR_SIZE (sizeof(struct jffs2_sum_xattr_flash))
+#define JFFS2_SUMMARY_XREF_SIZE (sizeof(struct jffs2_sum_xref_flash))
 
 /* Summary structures used on flash */
 
@@ -75,11 +77,28 @@ struct jffs2_sum_dirent_flash
 	uint8_t name[0];	/* dirent name */
 } __attribute__((packed));
 
+struct jffs2_sum_xattr_flash
+{
+	jint16_t nodetype;	/* == JFFS2_NODETYPE_XATR */
+	jint32_t xid;		/* xattr identifier */
+	jint32_t version;	/* version number */
+	jint32_t offset;	/* offset on jeb */
+	jint32_t totlen;	/* node length */
+} __attribute__((packed));
+
+struct jffs2_sum_xref_flash
+{
+	jint16_t nodetype;	/* == JFFS2_NODETYPE_XREF */
+	jint32_t offset;	/* offset on jeb */
+} __attribute__((packed));
+
 union jffs2_sum_flash
 {
 	struct jffs2_sum_unknown_flash u;
 	struct jffs2_sum_inode_flash i;
 	struct jffs2_sum_dirent_flash d;
+	struct jffs2_sum_xattr_flash x;
+	struct jffs2_sum_xref_flash r;
 };
 
 /* Summary structures used in the memory */
@@ -114,11 +133,30 @@ struct jffs2_sum_dirent_mem
 	uint8_t name[0];	/* dirent name */
 } __attribute__((packed));
 
+struct jffs2_sum_xattr_mem
+{
+	union jffs2_sum_mem *next;
+	jint16_t nodetype;
+	jint32_t xid;
+	jint32_t version;
+	jint32_t offset;
+	jint32_t totlen;
+} __attribute__((packed));
+
+struct jffs2_sum_xref_mem
+{
+	union jffs2_sum_mem *next;
+	jint16_t nodetype;
+	jint32_t offset;
+} __attribute__((packed));
+
 union jffs2_sum_mem
 {
 	struct jffs2_sum_unknown_mem u;
 	struct jffs2_sum_inode_mem i;
 	struct jffs2_sum_dirent_mem d;
+	struct jffs2_sum_xattr_mem x;
+	struct jffs2_sum_xref_mem r;
 };
 
 /* Summary related information stored in superblock */
@@ -159,6 +197,8 @@ int jffs2_sum_write_sumnode(struct jffs2_sb_info *c);
 int jffs2_sum_add_padding_mem(struct jffs2_summary *s, uint32_t size);
 int jffs2_sum_add_inode_mem(struct jffs2_summary *s, struct jffs2_raw_inode *ri, uint32_t ofs);
 int jffs2_sum_add_dirent_mem(struct jffs2_summary *s, struct jffs2_raw_dirent *rd, uint32_t ofs);
+int jffs2_sum_add_xattr_mem(struct jffs2_summary *s, struct jffs2_raw_xattr *rx, uint32_t ofs);
+int jffs2_sum_add_xref_mem(struct jffs2_summary *s, struct jffs2_raw_xref *rr, uint32_t ofs);
 int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
 			uint32_t ofs, uint32_t *pseudo_random);
 
@@ -176,6 +216,8 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 #define jffs2_sum_add_padding_mem(a,b)
 #define jffs2_sum_add_inode_mem(a,b,c)
 #define jffs2_sum_add_dirent_mem(a,b,c)
+#define jffs2_sum_add_xattr_mem(a,b,c)
+#define jffs2_sum_add_xref_mem(a,b,c)
 #define jffs2_sum_scan_sumnode(a,b,c,d) (0)
 
 #endif /* CONFIG_JFFS2_SUMMARY */
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index ffd8e84b22c..c8b539ee7d8 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -151,7 +151,10 @@ static struct super_block *jffs2_get_sb_mtd(struct file_system_type *fs_type,
 
 	sb->s_op = &jffs2_super_operations;
 	sb->s_flags = flags | MS_NOATIME;
-
+	sb->s_xattr = jffs2_xattr_handlers;
+#ifdef CONFIG_JFFS2_FS_POSIX_ACL
+	sb->s_flags |= MS_POSIXACL;
+#endif
 	ret = jffs2_do_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
 
 	if (ret) {
@@ -293,6 +296,7 @@ static void jffs2_put_super (struct super_block *sb)
 		kfree(c->blocks);
 	jffs2_flash_cleanup(c);
 	kfree(c->inocache_list);
+	jffs2_clear_xattr_subsystem(c);
 	if (c->mtd->sync)
 		c->mtd->sync(c->mtd);
 
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index d55754fe892..fc211b6e9b0 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -24,7 +24,12 @@ struct inode_operations jffs2_symlink_inode_operations =
 {
 	.readlink =	generic_readlink,
 	.follow_link =	jffs2_follow_link,
-	.setattr =	jffs2_setattr
+	.permission =	jffs2_permission,
+	.setattr =	jffs2_setattr,
+	.setxattr =	jffs2_setxattr,
+	.getxattr =	jffs2_getxattr,
+	.listxattr =	jffs2_listxattr,
+	.removexattr =	jffs2_removexattr
 };
 
 static void *jffs2_follow_link(struct dentry *dentry, struct nameidata *nd)
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index 1342f0158e9..d5c78195f3b 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -36,7 +36,7 @@ int jffs2_do_new_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, uint
 	f->inocache->nlink = 1;
 	f->inocache->nodes = (struct jffs2_raw_node_ref *)f->inocache;
 	f->inocache->state = INO_STATE_PRESENT;
-
+	init_xattr_inode_cache(f->inocache);
 
 	jffs2_add_ino_cache(c, f->inocache);
 	D1(printk(KERN_DEBUG "jffs2_do_new_inode(): Assigned ino# %d\n", f->inocache->ino));
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
new file mode 100644
index 00000000000..c9a185c54ce
--- /dev/null
+++ b/fs/jffs2/xattr.c
@@ -0,0 +1,1271 @@
+/* -------------------------------------------------------------------------
+ *  File: fs/jffs2/xattr.c
+ *  XATTR support on JFFS2 FileSystem
+ *
+ *  Implemented by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *  Copyright (C) 2006 NEC Corporation
+ *
+ *  For licensing information, see the file 'LICENCE' in the jffs2 directory.
+ * ------------------------------------------------------------------------- */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/crc32.h>
+#include <linux/jffs2.h>
+#include <linux/xattr.h>
+#include <linux/mtd/mtd.h>
+#include "nodelist.h"
+/* -------- xdatum related functions ----------------
+ * xattr_datum_hashkey(xprefix, xname, xvalue, xsize)
+ *   is used to calcurate xdatum hashkey. The reminder of hashkey into XATTRINDEX_HASHSIZE is
+ *   the index of the xattr name/value pair cache (c->xattrindex).
+ * unload_xattr_datum(c, xd)
+ *   is used to release xattr name/value pair and detach from c->xattrindex.
+ * reclaim_xattr_datum(c)
+ *   is used to reclaim xattr name/value pairs on the xattr name/value pair cache when
+ *   memory usage by cache is over c->xdatum_mem_threshold. Currentry, this threshold 
+ *   is hard coded as 32KiB.
+ * delete_xattr_datum_node(c, xd)
+ *   is used to delete a jffs2 node is dominated by xdatum. When EBS(Erase Block Summary) is
+ *   enabled, it overwrites the obsolete node by myself.
+ * delete_xattr_datum(c, xd)
+ *   is used to delete jffs2_xattr_datum object. It must be called with 0-value of reference
+ *   counter. (It means how many jffs2_xattr_ref object refers this xdatum.)
+ * do_verify_xattr_datum(c, xd)
+ *   is used to load the xdatum informations without name/value pair from the medium.
+ *   It's necessary once, because those informations are not collected during mounting
+ *   process when EBS is enabled.
+ *   0 will be returned, if success. An negative return value means recoverable error, and
+ *   positive return value means unrecoverable error. Thus, caller must remove this xdatum
+ *   and xref when it returned positive value.
+ * do_load_xattr_datum(c, xd)
+ *   is used to load name/value pair from the medium.
+ *   The meanings of return value is same as do_verify_xattr_datum().
+ * load_xattr_datum(c, xd)
+ *   is used to be as a wrapper of do_verify_xattr_datum() and do_load_xattr_datum().
+ *   If xd need to call do_verify_xattr_datum() at first, it's called before calling
+ *   do_load_xattr_datum(). The meanings of return value is same as do_verify_xattr_datum().
+ * save_xattr_datum(c, xd, phys_ofs)
+ *   is used to write xdatum to medium. xd->version will be incremented.
+ * create_xattr_datum(c, xprefix, xname, xvalue, xsize, phys_ofs)
+ *   is used to create new xdatum and write to medium.
+ * -------------------------------------------------- */
+
+static uint32_t xattr_datum_hashkey(int xprefix, const char *xname, const char *xvalue, int xsize)
+{
+	int name_len = strlen(xname);
+
+	return crc32(xprefix, xname, name_len) ^ crc32(xprefix, xvalue, xsize);
+}
+
+static void unload_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* must be called under down_write(xattr_sem) */
+	D1(dbg_xattr("%s: xid=%u, version=%u\n", __FUNCTION__, xd->xid, xd->version));
+	if (xd->xname) {
+		c->xdatum_mem_usage -= (xd->name_len + 1 + xd->value_len);
+		kfree(xd->xname);
+	}
+
+	list_del_init(&xd->xindex);
+	xd->hashkey = 0;
+	xd->xname = NULL;
+	xd->xvalue = NULL;
+}
+
+static void reclaim_xattr_datum(struct jffs2_sb_info *c)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_xattr_datum *xd, *_xd;
+	uint32_t target, before;
+	static int index = 0;
+	int count;
+
+	if (c->xdatum_mem_threshold > c->xdatum_mem_usage)
+		return;
+
+	before = c->xdatum_mem_usage;
+	target = c->xdatum_mem_usage * 4 / 5; /* 20% reduction */
+	for (count = 0; count < XATTRINDEX_HASHSIZE; count++) {
+		list_for_each_entry_safe(xd, _xd, &c->xattrindex[index], xindex) {
+			if (xd->flags & JFFS2_XFLAGS_HOT) {
+				xd->flags &= ~JFFS2_XFLAGS_HOT;
+			} else if (!(xd->flags & JFFS2_XFLAGS_BIND)) {
+				unload_xattr_datum(c, xd);
+			}
+			if (c->xdatum_mem_usage <= target)
+				goto out;
+		}
+		index = (index+1) % XATTRINDEX_HASHSIZE;
+	}
+ out:
+	JFFS2_NOTICE("xdatum_mem_usage from %u byte to %u byte (%u byte reclaimed)\n",
+		     before, c->xdatum_mem_usage, before - c->xdatum_mem_usage);
+}
+
+static void delete_xattr_datum_node(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_raw_xattr rx;
+	uint32_t length;
+	int rc;
+
+	if (!xd->node) {
+		JFFS2_WARNING("xdatum (xid=%u) is removed twice.\n", xd->xid);
+		return;
+	}
+	if (jffs2_sum_active()) {
+		memset(&rx, 0xff, sizeof(struct jffs2_raw_xattr));
+		rc = jffs2_flash_read(c, ref_offset(xd->node),
+				      sizeof(struct jffs2_unknown_node),
+				      &length, (char *)&rx);
+		if (rc || length != sizeof(struct jffs2_unknown_node)) {
+			JFFS2_ERROR("jffs2_flash_read()=%d, req=%u, read=%u at %#08x\n",
+				    rc, sizeof(struct jffs2_unknown_node),
+				    length, ref_offset(xd->node));
+		}
+		rc = jffs2_flash_write(c, ref_offset(xd->node), sizeof(rx),
+				       &length, (char *)&rx);
+		if (rc || length != sizeof(struct jffs2_raw_xattr)) {
+			JFFS2_ERROR("jffs2_flash_write()=%d, req=%u, wrote=%u ar %#08x\n",
+				    rc, sizeof(rx), length, ref_offset(xd->node));
+		}
+	}
+	spin_lock(&c->erase_completion_lock);
+	xd->node->next_in_ino = NULL;
+	spin_unlock(&c->erase_completion_lock);
+	jffs2_mark_node_obsolete(c, xd->node);
+	xd->node = NULL;
+}
+
+static void delete_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* must be called under down_write(xattr_sem) */
+	BUG_ON(xd->refcnt);
+
+	unload_xattr_datum(c, xd);
+	if (xd->node) {
+		delete_xattr_datum_node(c, xd);
+		xd->node = NULL;
+	}
+	jffs2_free_xattr_datum(xd);
+}
+
+static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_eraseblock *jeb;
+	struct jffs2_raw_xattr rx;
+	size_t readlen;
+	uint32_t crc, totlen;
+	int rc;
+
+	BUG_ON(!xd->node);
+	BUG_ON(ref_flags(xd->node) != REF_UNCHECKED);
+
+	rc = jffs2_flash_read(c, ref_offset(xd->node), sizeof(rx), &readlen, (char *)&rx);
+	if (rc || readlen != sizeof(rx)) {
+		JFFS2_WARNING("jffs2_flash_read()=%d, req=%u, read=%u at %#08x\n",
+			      rc, sizeof(rx), readlen, ref_offset(xd->node));
+		return rc ? rc : -EIO;
+	}
+	crc = crc32(0, &rx, sizeof(rx) - 4);
+	if (crc != je32_to_cpu(rx.node_crc)) {
+		if (je32_to_cpu(rx.node_crc) != 0xffffffff)
+			JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
+				    ref_offset(xd->node), je32_to_cpu(rx.hdr_crc), crc);
+		return EIO;
+	}
+	totlen = PAD(sizeof(rx) + rx.name_len + 1 + je16_to_cpu(rx.value_len));
+	if (je16_to_cpu(rx.magic) != JFFS2_MAGIC_BITMASK
+	    || je16_to_cpu(rx.nodetype) != JFFS2_NODETYPE_XATTR
+	    || je32_to_cpu(rx.totlen) != totlen
+	    || je32_to_cpu(rx.xid) != xd->xid
+	    || je32_to_cpu(rx.version) != xd->version) {
+		JFFS2_ERROR("inconsistent xdatum at %#08x, magic=%#04x/%#04x, "
+			    "nodetype=%#04x/%#04x, totlen=%u/%u, xid=%u/%u, version=%u/%u\n",
+			    ref_offset(xd->node), je16_to_cpu(rx.magic), JFFS2_MAGIC_BITMASK,
+			    je16_to_cpu(rx.nodetype), JFFS2_NODETYPE_XATTR,
+			    je32_to_cpu(rx.totlen), totlen,
+			    je32_to_cpu(rx.xid), xd->xid,
+			    je32_to_cpu(rx.version), xd->version);
+		return EIO;
+	}
+	xd->xprefix = rx.xprefix;
+	xd->name_len = rx.name_len;
+	xd->value_len = je16_to_cpu(rx.value_len);
+	xd->data_crc = je32_to_cpu(rx.data_crc);
+
+	/* This JFFS2_NODETYPE_XATTR node is checked */
+	jeb = &c->blocks[ref_offset(xd->node) / c->sector_size];
+	totlen = PAD(je32_to_cpu(rx.totlen));
+
+	spin_lock(&c->erase_completion_lock);
+	c->unchecked_size -= totlen; c->used_size += totlen;
+	jeb->unchecked_size -= totlen; jeb->used_size += totlen;
+	xd->node->flash_offset = ref_offset(xd->node) | REF_PRISTINE;
+	spin_unlock(&c->erase_completion_lock);
+
+	/* unchecked xdatum is chained with c->xattr_unchecked */
+	list_del_init(&xd->xindex);
+
+	dbg_xattr("success on verfying xdatum (xid=%u, version=%u)\n",
+		  xd->xid, xd->version);
+
+	return 0;
+}
+
+static int do_load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* must be called under down_write(xattr_sem) */
+	char *data;
+	size_t readlen;
+	uint32_t crc, length;
+	int i, ret, retry = 0;
+
+	BUG_ON(!xd->node);
+	BUG_ON(ref_flags(xd->node) != REF_PRISTINE);
+	BUG_ON(!list_empty(&xd->xindex));
+ retry:
+	length = xd->name_len + 1 + xd->value_len;
+	data = kmalloc(length, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	ret = jffs2_flash_read(c, ref_offset(xd->node)+sizeof(struct jffs2_raw_xattr),
+			       length, &readlen, data);
+
+	if (ret || length!=readlen) {
+		JFFS2_WARNING("jffs2_flash_read() returned %d, request=%d, readlen=%d, at %#08x\n",
+			      ret, length, readlen, ref_offset(xd->node));
+		kfree(data);
+		return ret ? ret : -EIO;
+	}
+
+	data[xd->name_len] = '\0';
+	crc = crc32(0, data, length);
+	if (crc != xd->data_crc) {
+		JFFS2_WARNING("node CRC failed (JFFS2_NODETYPE_XREF)"
+			      " at %#08x, read: 0x%08x calculated: 0x%08x\n",
+			      ref_offset(xd->node), xd->data_crc, crc);
+		kfree(data);
+		return EIO;
+	}
+
+	xd->flags |= JFFS2_XFLAGS_HOT;
+	xd->xname = data;
+	xd->xvalue = data + xd->name_len+1;
+
+	c->xdatum_mem_usage += length;
+
+	xd->hashkey = xattr_datum_hashkey(xd->xprefix, xd->xname, xd->xvalue, xd->value_len);
+	i = xd->hashkey % XATTRINDEX_HASHSIZE;
+	list_add(&xd->xindex, &c->xattrindex[i]);
+	if (!retry) {
+		retry = 1;
+		reclaim_xattr_datum(c);
+		if (!xd->xname)
+			goto retry;
+	}
+
+	dbg_xattr("success on loading xdatum (xid=%u, xprefix=%u, xname='%s')\n",
+		  xd->xid, xd->xprefix, xd->xname);
+
+	return 0;
+}
+
+static int load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* must be called under down_write(xattr_sem);
+	 * rc < 0 : recoverable error, try again
+	 * rc = 0 : success
+	 * rc > 0 : Unrecoverable error, this node should be deleted.
+	 */
+	int rc = 0;
+	BUG_ON(xd->xname);
+	if (!xd->node)
+		return EIO;
+	if (unlikely(ref_flags(xd->node) != REF_PRISTINE)) {
+		rc = do_verify_xattr_datum(c, xd);
+		if (rc > 0) {
+			list_del_init(&xd->xindex);
+			delete_xattr_datum_node(c, xd);
+		}
+	}
+	if (!rc)
+		rc = do_load_xattr_datum(c, xd);
+	return rc;
+}
+
+static int save_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd, uint32_t phys_ofs)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_raw_xattr rx;
+	struct jffs2_raw_node_ref *raw;
+	struct kvec vecs[2];
+	uint32_t length;
+	int rc, totlen;
+
+	BUG_ON(!xd->xname);
+
+	vecs[0].iov_base = &rx;
+	vecs[0].iov_len = PAD(sizeof(rx));
+	vecs[1].iov_base = xd->xname;
+	vecs[1].iov_len = xd->name_len + 1 + xd->value_len;
+	totlen = vecs[0].iov_len + vecs[1].iov_len;
+
+	raw = jffs2_alloc_raw_node_ref();
+	if (!raw)
+		return -ENOMEM;
+	raw->flash_offset = phys_ofs;
+	raw->__totlen = PAD(totlen);
+	raw->next_phys = NULL;
+	raw->next_in_ino = (void *)xd;
+
+	/* Setup raw-xattr */
+	rx.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
+	rx.nodetype = cpu_to_je16(JFFS2_NODETYPE_XATTR);
+	rx.totlen = cpu_to_je32(PAD(totlen));
+	rx.hdr_crc = cpu_to_je32(crc32(0, &rx, sizeof(struct jffs2_unknown_node) - 4));
+
+	rx.xid = cpu_to_je32(xd->xid);
+	rx.version = cpu_to_je32(++xd->version);
+	rx.xprefix = xd->xprefix;
+	rx.name_len = xd->name_len;
+	rx.value_len = cpu_to_je16(xd->value_len);
+	rx.data_crc = cpu_to_je32(crc32(0, vecs[1].iov_base, vecs[1].iov_len));
+	rx.node_crc = cpu_to_je32(crc32(0, &rx, sizeof(struct jffs2_raw_xattr) - 4));
+
+	rc = jffs2_flash_writev(c, vecs, 2, phys_ofs, &length, 0);
+	if (rc || totlen != length) {
+		JFFS2_WARNING("jffs2_flash_writev()=%d, req=%u, wrote=%u, at %#08x\n",
+			      rc, totlen, length, phys_ofs);
+		rc = rc ? rc : -EIO;
+		if (length) {
+			raw->flash_offset |= REF_OBSOLETE;
+			raw->next_in_ino = NULL;
+			jffs2_add_physical_node_ref(c, raw);
+			jffs2_mark_node_obsolete(c, raw);
+		} else {
+			jffs2_free_raw_node_ref(raw);
+		}
+		return rc;
+	}
+	BUG_ON(raw->__totlen < sizeof(struct jffs2_raw_xattr));
+	/* success */
+	raw->flash_offset |= REF_PRISTINE;
+	jffs2_add_physical_node_ref(c, raw);
+	if (xd->node)
+		delete_xattr_datum_node(c, xd);
+	xd->node = raw;
+
+	dbg_xattr("success on saving xdatum (xid=%u, version=%u, xprefix=%u, xname='%s')\n",
+		  xd->xid, xd->version, xd->xprefix, xd->xname);
+
+	return 0;
+}
+
+static struct jffs2_xattr_datum *create_xattr_datum(struct jffs2_sb_info *c,
+						    int xprefix, const char *xname,
+						    const char *xvalue, int xsize,
+						    uint32_t phys_ofs)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_xattr_datum *xd;
+	uint32_t hashkey, name_len;
+	char *data;
+	int i, rc;
+
+	/* Search xattr_datum has same xname/xvalue by index */
+	hashkey = xattr_datum_hashkey(xprefix, xname, xvalue, xsize);
+	i = hashkey % XATTRINDEX_HASHSIZE;
+	list_for_each_entry(xd, &c->xattrindex[i], xindex) {
+		if (xd->hashkey==hashkey
+		    && xd->xprefix==xprefix
+		    && xd->value_len==xsize
+		    && !strcmp(xd->xname, xname)
+		    && !memcmp(xd->xvalue, xvalue, xsize)) {
+			xd->refcnt++;
+			return xd;
+		}
+	}
+
+	/* Not found, Create NEW XATTR-Cache */
+	name_len = strlen(xname);
+
+	xd = jffs2_alloc_xattr_datum();
+	if (!xd)
+		return ERR_PTR(-ENOMEM);
+
+	data = kmalloc(name_len + 1 + xsize, GFP_KERNEL);
+	if (!data) {
+		jffs2_free_xattr_datum(xd);
+		return ERR_PTR(-ENOMEM);
+	}
+	strcpy(data, xname);
+	memcpy(data + name_len + 1, xvalue, xsize);
+
+	xd->refcnt = 1;
+	xd->xid = ++c->highest_xid;
+	xd->flags |= JFFS2_XFLAGS_HOT;
+	xd->xprefix = xprefix;
+
+	xd->hashkey = hashkey;
+	xd->xname = data;
+	xd->xvalue = data + name_len + 1;
+	xd->name_len = name_len;
+	xd->value_len = xsize;
+	xd->data_crc = crc32(0, data, xd->name_len + 1 + xd->value_len);
+
+	rc = save_xattr_datum(c, xd, phys_ofs);
+	if (rc) {
+		kfree(xd->xname);
+		jffs2_free_xattr_datum(xd);
+		return ERR_PTR(rc);
+	}
+
+	/* Insert Hash Index */
+	i = hashkey % XATTRINDEX_HASHSIZE;
+	list_add(&xd->xindex, &c->xattrindex[i]);
+
+	c->xdatum_mem_usage += (xd->name_len + 1 + xd->value_len);
+	reclaim_xattr_datum(c);
+
+	return xd;
+}
+
+/* -------- xdatum related functions ----------------
+ * verify_xattr_ref(c, ref)
+ *   is used to load xref information from medium. Because summary data does not
+ *   contain xid/ino, it's necessary to verify once while mounting process.
+ * delete_xattr_ref_node(c, ref)
+ *   is used to delete a jffs2 node is dominated by xref. When EBS is enabled,
+ *   it overwrites the obsolete node by myself. 
+ * delete_xattr_ref(c, ref)
+ *   is used to delete jffs2_xattr_ref object. If the reference counter of xdatum
+ *   is refered by this xref become 0, delete_xattr_datum() is called later.
+ * save_xattr_ref(c, ref, phys_ofs)
+ *   is used to write xref to medium.
+ * create_xattr_ref(c, ic, xd, phys_ofs)
+ *   is used to create a new xref and write to medium.
+ * jffs2_xattr_delete_inode(c, ic)
+ *   is called to remove xrefs related to obsolete inode when inode is unlinked.
+ * jffs2_xattr_free_inode(c, ic)
+ *   is called to release xattr related objects when unmounting. 
+ * check_xattr_ref_ilist(c, ic)
+ *   is used to confirm inode does not have duplicate xattr name/value pair.
+ * -------------------------------------------------- */
+static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
+{
+	struct jffs2_eraseblock *jeb;
+	struct jffs2_raw_xref rr;
+	size_t readlen;
+	uint32_t crc, totlen;
+	int rc;
+
+	BUG_ON(ref_flags(ref->node) != REF_UNCHECKED);
+
+	rc = jffs2_flash_read(c, ref_offset(ref->node), sizeof(rr), &readlen, (char *)&rr);
+	if (rc || sizeof(rr) != readlen) {
+		JFFS2_WARNING("jffs2_flash_read()=%d, req=%u, read=%u, at %#08x\n",
+			      rc, sizeof(rr), readlen, ref_offset(ref->node));
+		return rc ? rc : -EIO;
+	}
+	/* obsolete node */
+	crc = crc32(0, &rr, sizeof(rr) - 4);
+	if (crc != je32_to_cpu(rr.node_crc)) {
+		if (je32_to_cpu(rr.node_crc) != 0xffffffff)
+			JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
+				    ref_offset(ref->node), je32_to_cpu(rr.node_crc), crc);
+		return EIO;
+	}
+	if (je16_to_cpu(rr.magic) != JFFS2_MAGIC_BITMASK
+	    || je16_to_cpu(rr.nodetype) != JFFS2_NODETYPE_XREF
+	    || je32_to_cpu(rr.totlen) != PAD(sizeof(rr))) {
+		JFFS2_ERROR("inconsistent xref at %#08x, magic=%#04x/%#04x, "
+			    "nodetype=%#04x/%#04x, totlen=%u/%u\n",
+			    ref_offset(ref->node), je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK,
+			    je16_to_cpu(rr.nodetype), JFFS2_NODETYPE_XREF,
+			    je32_to_cpu(rr.totlen), PAD(sizeof(rr)));
+		return EIO;
+	}
+	ref->ino = je32_to_cpu(rr.ino);
+	ref->xid = je32_to_cpu(rr.xid);
+
+	/* fixup superblock/eraseblock info */
+	jeb = &c->blocks[ref_offset(ref->node) / c->sector_size];
+	totlen = PAD(sizeof(rr));
+
+	spin_lock(&c->erase_completion_lock);
+	c->unchecked_size -= totlen; c->used_size += totlen;
+	jeb->unchecked_size -= totlen; jeb->used_size += totlen;
+	ref->node->flash_offset = ref_offset(ref->node) | REF_PRISTINE;
+	spin_unlock(&c->erase_completion_lock);
+
+	dbg_xattr("success on verifying xref (ino=%u, xid=%u) at %#08x\n",
+		  ref->ino, ref->xid, ref_offset(ref->node));
+	return 0;
+}
+
+static void delete_xattr_ref_node(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
+{
+	struct jffs2_raw_xref rr;
+	uint32_t length;
+	int rc;
+
+	if (jffs2_sum_active()) {
+		memset(&rr, 0xff, sizeof(rr));
+		rc = jffs2_flash_read(c, ref_offset(ref->node),
+				      sizeof(struct jffs2_unknown_node),
+				      &length, (char *)&rr);
+		if (rc || length != sizeof(struct jffs2_unknown_node)) {
+			JFFS2_ERROR("jffs2_flash_read()=%d, req=%u, read=%u at %#08x\n",
+				    rc, sizeof(struct jffs2_unknown_node),
+				    length, ref_offset(ref->node));
+		}
+		rc = jffs2_flash_write(c, ref_offset(ref->node), sizeof(rr),
+				       &length, (char *)&rr);
+		if (rc || length != sizeof(struct jffs2_raw_xref)) {
+			JFFS2_ERROR("jffs2_flash_write()=%d, req=%u, wrote=%u at %#08x\n",
+				    rc, sizeof(rr), length, ref_offset(ref->node));
+		}
+	}
+	spin_lock(&c->erase_completion_lock);
+	ref->node->next_in_ino = NULL;
+	spin_unlock(&c->erase_completion_lock);
+	jffs2_mark_node_obsolete(c, ref->node);
+	ref->node = NULL;
+}
+
+static void delete_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_xattr_datum *xd;
+
+	BUG_ON(!ref->node);
+	delete_xattr_ref_node(c, ref);
+
+	list_del(&ref->ilist);
+	xd = ref->xd;
+	xd->refcnt--;
+	if (!xd->refcnt)
+		delete_xattr_datum(c, xd);
+	jffs2_free_xattr_ref(ref);
+}
+
+static int save_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref, uint32_t phys_ofs)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_raw_node_ref *raw;
+	struct jffs2_raw_xref rr;
+	uint32_t length;
+	int ret;
+
+	raw = jffs2_alloc_raw_node_ref();
+	if (!raw)
+		return -ENOMEM;
+	raw->flash_offset = phys_ofs;
+	raw->__totlen = PAD(sizeof(rr));
+	raw->next_phys = NULL;
+	raw->next_in_ino = (void *)ref;
+
+	rr.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
+	rr.nodetype = cpu_to_je16(JFFS2_NODETYPE_XREF);
+	rr.totlen = cpu_to_je32(PAD(sizeof(rr)));
+	rr.hdr_crc = cpu_to_je32(crc32(0, &rr, sizeof(struct jffs2_unknown_node) - 4));
+
+	rr.ino = cpu_to_je32(ref->ic->ino);
+	rr.xid = cpu_to_je32(ref->xd->xid);
+	rr.node_crc = cpu_to_je32(crc32(0, &rr, sizeof(rr) - 4));
+
+	ret = jffs2_flash_write(c, phys_ofs, sizeof(rr), &length, (char *)&rr);
+	if (ret || sizeof(rr) != length) {
+		JFFS2_WARNING("jffs2_flash_write() returned %d, request=%u, retlen=%u, at %#08x\n",
+			      ret, sizeof(rr), length, phys_ofs);
+		ret = ret ? ret : -EIO;
+		if (length) {
+			raw->flash_offset |= REF_OBSOLETE;
+			raw->next_in_ino = NULL;
+			jffs2_add_physical_node_ref(c, raw);
+			jffs2_mark_node_obsolete(c, raw);
+		} else {
+			jffs2_free_raw_node_ref(raw);
+		}
+		return ret;
+	}
+	raw->flash_offset |= REF_PRISTINE;
+
+	jffs2_add_physical_node_ref(c, raw);
+	if (ref->node)
+		delete_xattr_ref_node(c, ref);
+	ref->node = raw;
+
+	dbg_xattr("success on saving xref (ino=%u, xid=%u)\n", ref->ic->ino, ref->xd->xid);
+
+	return 0;
+}
+
+static struct jffs2_xattr_ref *create_xattr_ref(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic,
+						struct jffs2_xattr_datum *xd, uint32_t phys_ofs)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_xattr_ref *ref;
+	int ret;
+
+	ref = jffs2_alloc_xattr_ref();
+	if (!ref)
+		return ERR_PTR(-ENOMEM);
+	ref->ic = ic;
+	ref->xd = xd;
+
+	ret = save_xattr_ref(c, ref, phys_ofs);
+	if (ret) {
+		jffs2_free_xattr_ref(ref);
+		return ERR_PTR(ret);
+	}
+
+	/* Chain to inode */
+	list_add(&ref->ilist, &ic->ilist);
+
+	return ref; /* success */
+}
+
+void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
+{
+	/* It's called from jffs2_clear_inode() on inode removing.
+	   When an inode with XATTR is removed, those XATTRs must be removed. */
+	struct jffs2_xattr_ref *ref, *_ref;
+
+	if (!ic || ic->nlink > 0)
+		return;
+
+	down_write(&c->xattr_sem);
+	list_for_each_entry_safe(ref, _ref, &ic->ilist, ilist)
+		delete_xattr_ref(c, ref);
+	up_write(&c->xattr_sem);
+}
+
+void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
+{
+	/* It's called from jffs2_free_ino_caches() until unmounting FS. */
+	struct jffs2_xattr_datum *xd;
+	struct jffs2_xattr_ref *ref, *_ref;
+
+	down_write(&c->xattr_sem);
+	list_for_each_entry_safe(ref, _ref, &ic->ilist, ilist) {
+		list_del(&ref->ilist);
+		xd = ref->xd;
+		xd->refcnt--;
+		if (!xd->refcnt) {
+			unload_xattr_datum(c, xd);
+			jffs2_free_xattr_datum(xd);
+		}
+		jffs2_free_xattr_ref(ref);
+	}
+	up_write(&c->xattr_sem);
+}
+
+static int check_xattr_ref_ilist(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
+{
+	/* success of check_xattr_ref_ilist() means taht inode (ic) dose not have
+	 * duplicate name/value pairs. If duplicate name/value pair would be found,
+	 * one will be removed.
+	 */
+	struct jffs2_xattr_ref *ref, *cmp;
+	int rc = 0;
+
+	if (likely(ic->flags & INO_FLAGS_XATTR_CHECKED))
+		return 0;
+	down_write(&c->xattr_sem);
+ retry:
+	rc = 0;
+	list_for_each_entry(ref, &ic->ilist, ilist) {
+		if (!ref->xd->xname) {
+			rc = load_xattr_datum(c, ref->xd);
+			if (unlikely(rc > 0)) {
+				delete_xattr_ref(c, ref);
+				goto retry;
+			} else if (unlikely(rc < 0))
+				goto out;
+		}
+		cmp = ref;
+		list_for_each_entry_continue(cmp, &ic->ilist, ilist) {
+			if (!cmp->xd->xname) {
+				ref->xd->flags |= JFFS2_XFLAGS_BIND;
+				rc = load_xattr_datum(c, cmp->xd);
+				ref->xd->flags &= ~JFFS2_XFLAGS_BIND;
+				if (unlikely(rc > 0)) {
+					delete_xattr_ref(c, cmp);
+					goto retry;
+				} else if (unlikely(rc < 0))
+					goto out;
+			}
+			if (ref->xd->xprefix == cmp->xd->xprefix
+			    && !strcmp(ref->xd->xname, cmp->xd->xname)) {
+				delete_xattr_ref(c, cmp);
+				goto retry;
+			}
+		}
+	}
+	ic->flags |= INO_FLAGS_XATTR_CHECKED;
+ out:
+	up_write(&c->xattr_sem);
+
+	return rc;
+}
+
+/* -------- xattr subsystem functions ---------------
+ * jffs2_init_xattr_subsystem(c)
+ *   is used to initialize semaphore and list_head, and some variables.
+ * jffs2_find_xattr_datum(c, xid)
+ *   is used to lookup xdatum while scanning process.
+ * jffs2_clear_xattr_subsystem(c)
+ *   is used to release any xattr related objects.
+ * jffs2_build_xattr_subsystem(c)
+ *   is used to associate xdatum and xref while super block building process.
+ * jffs2_setup_xattr_datum(c, xid, version)
+ *   is used to insert xdatum while scanning process.
+ * -------------------------------------------------- */
+void jffs2_init_xattr_subsystem(struct jffs2_sb_info *c)
+{
+	int i;
+
+	for (i=0; i < XATTRINDEX_HASHSIZE; i++)
+		INIT_LIST_HEAD(&c->xattrindex[i]);
+	INIT_LIST_HEAD(&c->xattr_temp);
+	INIT_LIST_HEAD(&c->xattr_unchecked);
+
+	init_rwsem(&c->xattr_sem);
+	c->xdatum_mem_usage = 0;
+	c->xdatum_mem_threshold = 32 * 1024;	/* Default 32KB */
+}
+
+static struct jffs2_xattr_datum *jffs2_find_xattr_datum(struct jffs2_sb_info *c, uint32_t xid)
+{
+	struct jffs2_xattr_datum *xd;
+	int i = xid % XATTRINDEX_HASHSIZE;
+
+	/* It's only used in scanning/building process. */
+	BUG_ON(!(c->flags & (JFFS2_SB_FLAG_SCANNING|JFFS2_SB_FLAG_BUILDING)));
+
+	list_for_each_entry(xd, &c->xattrindex[i], xindex) {
+		if (xd->xid==xid)
+			return xd;
+	}
+	return NULL;
+}
+
+void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c)
+{
+	struct jffs2_xattr_datum *xd, *_xd;
+	struct jffs2_xattr_ref *ref, *_ref;
+	int i;
+
+	list_for_each_entry_safe(ref, _ref, &c->xattr_temp, ilist)
+		jffs2_free_xattr_ref(ref);
+
+	for (i=0; i < XATTRINDEX_HASHSIZE; i++) {
+		list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) {
+			list_del(&xd->xindex);
+			if (xd->xname)
+				kfree(xd->xname);
+			jffs2_free_xattr_datum(xd);
+		}
+	}
+}
+
+void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c)
+{
+	struct jffs2_xattr_ref *ref, *_ref;
+	struct jffs2_xattr_datum *xd, *_xd;
+	struct jffs2_inode_cache *ic;
+	int i, xdatum_count =0, xdatum_unchecked_count = 0, xref_count = 0;
+
+	BUG_ON(!(c->flags & JFFS2_SB_FLAG_BUILDING));
+
+	/* Phase.1 */
+	list_for_each_entry_safe(ref, _ref, &c->xattr_temp, ilist) {
+		list_del_init(&ref->ilist);
+		/* checking REF_UNCHECKED nodes */
+		if (ref_flags(ref->node) != REF_PRISTINE) {
+			if (verify_xattr_ref(c, ref)) {
+				delete_xattr_ref_node(c, ref);
+				jffs2_free_xattr_ref(ref);
+				continue;
+			}
+		}
+		/* At this point, ref->xid and ref->ino contain XID and inode number.
+		   ref->xd and ref->ic are not valid yet. */
+		xd = jffs2_find_xattr_datum(c, ref->xid);
+		ic = jffs2_get_ino_cache(c, ref->ino);
+		if (!xd || !ic) {
+			if (ref_flags(ref->node) != REF_UNCHECKED)
+				JFFS2_WARNING("xref(ino=%u, xid=%u) is orphan. \n",
+					      ref->ino, ref->xid);
+			delete_xattr_ref_node(c, ref);
+			jffs2_free_xattr_ref(ref);
+			continue;
+		}
+		ref->xd = xd;
+		ref->ic = ic;
+		xd->refcnt++;
+		list_add_tail(&ref->ilist, &ic->ilist);
+		xref_count++;
+	}
+	/* After this, ref->xid/ino are NEVER used. */
+
+	/* Phase.2 */
+	for (i=0; i < XATTRINDEX_HASHSIZE; i++) {
+		list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) {
+			list_del_init(&xd->xindex);
+			if (!xd->refcnt) {
+				if (ref_flags(xd->node) != REF_UNCHECKED)
+					JFFS2_WARNING("orphan xdatum(xid=%u, version=%u) at %#08x\n",
+						      xd->xid, xd->version, ref_offset(xd->node));
+				delete_xattr_datum(c, xd);
+				continue;
+			}
+			if (ref_flags(xd->node) != REF_PRISTINE) {
+				dbg_xattr("unchecked xdatum(xid=%u) at %#08x\n",
+					  xd->xid, ref_offset(xd->node));
+				list_add(&xd->xindex, &c->xattr_unchecked);
+				xdatum_unchecked_count++;
+			}
+			xdatum_count++;
+		}
+	}
+	/* build complete */
+	JFFS2_NOTICE("complete building xattr subsystem, %u of xdatum (%u unchecked) and "
+		     "%u of xref found.\n", xdatum_count, xdatum_unchecked_count, xref_count);
+}
+
+struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c,
+						  uint32_t xid, uint32_t version)
+{
+	struct jffs2_xattr_datum *xd, *_xd;
+
+	_xd = jffs2_find_xattr_datum(c, xid);
+	if (_xd) {
+		dbg_xattr("duplicate xdatum (xid=%u, version=%u/%u) at %#08x\n",
+			  xid, version, _xd->version, ref_offset(_xd->node));
+		if (version < _xd->version)
+			return ERR_PTR(-EEXIST);
+	}
+	xd = jffs2_alloc_xattr_datum();
+	if (!xd)
+		return ERR_PTR(-ENOMEM);
+	xd->xid = xid;
+	xd->version = version;
+	if (xd->xid > c->highest_xid)
+		c->highest_xid = xd->xid;
+	list_add_tail(&xd->xindex, &c->xattrindex[xid % XATTRINDEX_HASHSIZE]);
+
+	if (_xd) {
+		list_del_init(&_xd->xindex);
+		delete_xattr_datum_node(c, _xd);
+		jffs2_free_xattr_datum(_xd);
+	}
+	return xd;
+}
+
+/* -------- xattr subsystem functions ---------------
+ * xprefix_to_handler(xprefix)
+ *   is used to translate xprefix into xattr_handler.
+ * jffs2_listxattr(dentry, buffer, size)
+ *   is an implementation of listxattr handler on jffs2.
+ * do_jffs2_getxattr(inode, xprefix, xname, buffer, size)
+ *   is an implementation of getxattr handler on jffs2.
+ * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags)
+ *   is an implementation of setxattr handler on jffs2.
+ * -------------------------------------------------- */
+struct xattr_handler *jffs2_xattr_handlers[] = {
+	&jffs2_user_xattr_handler,
+#ifdef CONFIG_JFFS2_FS_SECURITY
+	&jffs2_security_xattr_handler,
+#endif
+#ifdef CONFIG_JFFS2_FS_POSIX_ACL
+	&jffs2_acl_access_xattr_handler,
+	&jffs2_acl_default_xattr_handler,
+#endif
+	&jffs2_trusted_xattr_handler,
+	NULL
+};
+
+static struct xattr_handler *xprefix_to_handler(int xprefix) {
+	struct xattr_handler *ret;
+
+	switch (xprefix) {
+	case JFFS2_XPREFIX_USER:
+		ret = &jffs2_user_xattr_handler;
+		break;
+#ifdef CONFIG_JFFS2_FS_SECURITY
+	case JFFS2_XPREFIX_SECURITY:
+		ret = &jffs2_security_xattr_handler;
+		break;
+#endif
+#ifdef CONFIG_JFFS2_FS_POSIX_ACL
+	case JFFS2_XPREFIX_ACL_ACCESS:
+		ret = &jffs2_acl_access_xattr_handler;
+		break;
+	case JFFS2_XPREFIX_ACL_DEFAULT:
+		ret = &jffs2_acl_default_xattr_handler;
+		break;
+#endif
+	case JFFS2_XPREFIX_TRUSTED:
+		ret = &jffs2_trusted_xattr_handler;
+		break;
+	default:
+		ret = NULL;
+		break;
+	}
+	return ret;
+}
+
+ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+	struct inode *inode = dentry->d_inode;
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
+	struct jffs2_inode_cache *ic = f->inocache;
+	struct jffs2_xattr_ref *ref;
+	struct jffs2_xattr_datum *xd;
+	struct xattr_handler *xhandle;
+	ssize_t len, rc;
+	int retry = 0;
+
+	rc = check_xattr_ref_ilist(c, ic);
+	if (unlikely(rc))
+		return rc;
+
+	down_read(&c->xattr_sem);
+ retry:
+	len = 0;
+	list_for_each_entry(ref, &ic->ilist, ilist) {
+		BUG_ON(ref->ic != ic);
+		xd = ref->xd;
+		if (!xd->xname) {
+			/* xdatum is unchached */
+			if (!retry) {
+				retry = 1;
+				up_read(&c->xattr_sem);
+				down_write(&c->xattr_sem);
+				goto retry;
+			} else {
+				rc = load_xattr_datum(c, xd);
+				if (unlikely(rc > 0)) {
+					delete_xattr_ref(c, ref);
+					goto retry;
+				} else if (unlikely(rc < 0))
+					goto out;
+			}
+		}
+		xhandle = xprefix_to_handler(xd->xprefix);
+		if (!xhandle)
+			continue;
+		if (buffer) {
+			rc = xhandle->list(inode, buffer+len, size-len, xd->xname, xd->name_len);
+		} else {
+			rc = xhandle->list(inode, NULL, 0, xd->xname, xd->name_len);
+		}
+		if (rc < 0)
+			goto out;
+		len += rc;
+	}
+	rc = len;
+ out:
+	if (!retry) {
+		up_read(&c->xattr_sem);
+	} else {
+		up_write(&c->xattr_sem);
+	}
+	return rc;
+}
+
+int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname,
+		      char *buffer, size_t size)
+{
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
+	struct jffs2_inode_cache *ic = f->inocache;
+	struct jffs2_xattr_datum *xd;
+	struct jffs2_xattr_ref *ref;
+	int rc, retry = 0;
+
+	rc = check_xattr_ref_ilist(c, ic);
+	if (unlikely(rc))
+		return rc;
+
+	down_read(&c->xattr_sem);
+ retry:
+	list_for_each_entry(ref, &ic->ilist, ilist) {
+		BUG_ON(ref->ic!=ic);
+
+		xd = ref->xd;
+		if (xd->xprefix != xprefix)
+			continue;
+		if (!xd->xname) {
+			/* xdatum is unchached */
+			if (!retry) {
+				retry = 1;
+				up_read(&c->xattr_sem);
+				down_write(&c->xattr_sem);
+				goto retry;
+			} else {
+				rc = load_xattr_datum(c, xd);
+				if (unlikely(rc > 0)) {
+					delete_xattr_ref(c, ref);
+					goto retry;
+				} else if (unlikely(rc < 0)) {
+					goto out;
+				}
+			}
+		}
+		if (!strcmp(xname, xd->xname)) {
+			rc = xd->value_len;
+			if (buffer) {
+				if (size < rc) {
+					rc = -ERANGE;
+				} else {
+					memcpy(buffer, xd->xvalue, rc);
+				}
+			}
+			goto out;
+		}
+	}
+	rc = -ENODATA;
+ out:
+	if (!retry) {
+		up_read(&c->xattr_sem);
+	} else {
+		up_write(&c->xattr_sem);
+	}
+	return rc;
+}
+
+int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
+		      const char *buffer, size_t size, int flags)
+{
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
+	struct jffs2_inode_cache *ic = f->inocache;
+	struct jffs2_xattr_datum *xd;
+	struct jffs2_xattr_ref *ref, *newref;
+	uint32_t phys_ofs, length, request;
+	int rc;
+
+	rc = check_xattr_ref_ilist(c, ic);
+	if (unlikely(rc))
+		return rc;
+
+	request = PAD(sizeof(struct jffs2_raw_xattr) + strlen(xname) + 1 + size);
+	rc = jffs2_reserve_space(c, request, &phys_ofs, &length,
+				 ALLOC_NORMAL, JFFS2_SUMMARY_XATTR_SIZE);
+	if (rc) {
+		JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, request);
+		return rc;
+	}
+
+	/* Find existing xattr */
+	down_write(&c->xattr_sem);
+ retry:
+	list_for_each_entry(ref, &ic->ilist, ilist) {
+		xd = ref->xd;
+		if (xd->xprefix != xprefix)
+			continue;
+		if (!xd->xname) {
+			rc = load_xattr_datum(c, xd);
+			if (unlikely(rc > 0)) {
+				delete_xattr_ref(c, ref);
+				goto retry;
+			} else if (unlikely(rc < 0))
+				goto out;
+		}
+		if (!strcmp(xd->xname, xname)) {
+			if (flags & XATTR_CREATE) {
+				rc = -EEXIST;
+				goto out;
+			}
+			if (!buffer) {
+				delete_xattr_ref(c, ref);
+				rc = 0;
+				goto out;
+			}
+			goto found;
+		}
+	}
+	/* not found */
+	ref = NULL;
+	if (flags & XATTR_REPLACE) {
+		rc = -ENODATA;
+		goto out;
+	}
+	if (!buffer) {
+		rc = -EINVAL;
+		goto out;
+	}
+ found:
+	xd = create_xattr_datum(c, xprefix, xname, buffer, size, phys_ofs);
+	if (IS_ERR(xd)) {
+		rc = PTR_ERR(xd);
+		goto out;
+	}
+	up_write(&c->xattr_sem);
+	jffs2_complete_reservation(c);
+
+	/* create xattr_ref */
+	request = PAD(sizeof(struct jffs2_raw_xref));
+	rc = jffs2_reserve_space(c, request, &phys_ofs, &length,
+				 ALLOC_NORMAL, JFFS2_SUMMARY_XREF_SIZE);
+	if (rc) {
+		JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, request);
+		down_write(&c->xattr_sem);
+		xd->refcnt--;
+		if (!xd->refcnt)
+			delete_xattr_datum(c, xd);
+		up_write(&c->xattr_sem);
+		return rc;
+	}
+	down_write(&c->xattr_sem);
+	newref = create_xattr_ref(c, ic, xd, phys_ofs);
+	if (IS_ERR(newref)) {
+		rc = PTR_ERR(newref);
+		xd->refcnt--;
+		if (!xd->refcnt)
+			delete_xattr_datum(c, xd);
+	} else if (ref) {
+		/* If replaced xattr_ref exists */
+		delete_xattr_ref(c, ref);
+	}
+ out:
+	up_write(&c->xattr_sem);
+	jffs2_complete_reservation(c);
+	return rc;
+}
+
+/* -------- garbage collector functions -------------
+ * jffs2_garbage_collect_xattr_datum(c, xd)
+ *   is used to move xdatum into new node.
+ * jffs2_garbage_collect_xattr_ref(c, ref)
+ *   is used to move xref into new node.
+ * jffs2_garbage_collect_xattr(c, ic)
+ *   is used to call appropriate garbage collector function, if argument
+ *   pointer (ic) is the reference of xdatum/xref.
+ * jffs2_verify_xattr(c)
+ *   is used to call do_verify_xattr_datum() before garbage collecting.
+ * -------------------------------------------------- */
+static int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c,
+					     struct jffs2_xattr_datum *xd)
+{
+	/* must be called under down_write(xattr_sem), and called from GC thread */
+	uint32_t phys_ofs, totlen, length, old_ofs;
+	int rc;
+
+	BUG_ON(!xd->node);
+
+	old_ofs = ref_offset(xd->node);
+	totlen = ref_totlen(c, c->gcblock, xd->node);
+	if (totlen < sizeof(struct jffs2_raw_xattr))
+		return -EINVAL;
+
+	if (!xd->xname) {
+		rc = load_xattr_datum(c, xd);
+		if (unlikely(rc > 0)) {
+			delete_xattr_datum_node(c, xd);
+			return 0;
+		} else if (unlikely(rc < 0))
+			return -EINVAL;
+	}
+	rc = jffs2_reserve_space_gc(c, totlen, &phys_ofs, &length, JFFS2_SUMMARY_XATTR_SIZE);
+	if (rc || length < totlen) {
+		JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, totlen);
+		return rc ? rc : -EBADFD;
+	}
+	rc = save_xattr_datum(c, xd, phys_ofs);
+	if (!rc)
+		dbg_xattr("xdatum (xid=%u, version=%u) GC'ed from %#08x to %08x\n",
+			  xd->xid, xd->version, old_ofs, ref_offset(xd->node));
+	return rc;
+}
+
+
+static int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c,
+					   struct jffs2_xattr_ref *ref)
+{
+	/* must be called under down(alloc_sem) */
+	uint32_t phys_ofs, totlen, length, old_ofs;
+	int rc;
+
+	BUG_ON(!ref->node);
+
+	old_ofs = ref_offset(ref->node);
+	totlen = ref_totlen(c, c->gcblock, ref->node);
+	if (totlen != sizeof(struct jffs2_raw_xref))
+		return -EINVAL;
+	rc = jffs2_reserve_space_gc(c, totlen, &phys_ofs, &length, JFFS2_SUMMARY_XREF_SIZE);
+	if (rc || length < totlen) {
+		JFFS2_WARNING("%s: jffs2_reserve_space() = %d, request = %u\n",
+			      __FUNCTION__, rc, totlen);
+		return rc ? rc : -EBADFD;
+	}
+	rc = save_xattr_ref(c, ref, phys_ofs);
+	if (!rc)
+		dbg_xattr("xref (ino=%u, xid=%u) GC'ed from %#08x to %08x\n",
+			  ref->ic->ino, ref->xd->xid, old_ofs, ref_offset(ref->node));
+	return rc;
+}
+
+int jffs2_garbage_collect_xattr(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
+{
+	struct jffs2_xattr_datum *xd;
+	struct jffs2_xattr_ref *ref;
+	int ret;
+
+	switch (ic->class) {
+	case RAWNODE_CLASS_XATTR_DATUM:
+		spin_unlock(&c->erase_completion_lock);
+
+		down_write(&c->xattr_sem);
+		xd = (struct jffs2_xattr_datum *)ic;
+		ret = xd ? jffs2_garbage_collect_xattr_datum(c, xd) : 0;
+		up_write(&c->xattr_sem);
+		break;
+	case RAWNODE_CLASS_XATTR_REF:
+		spin_unlock(&c->erase_completion_lock);
+
+		down_write(&c->xattr_sem);
+		ref = (struct jffs2_xattr_ref *)ic;
+		ret = ref ? jffs2_garbage_collect_xattr_ref(c, ref) : 0;
+		up_write(&c->xattr_sem);
+		break;
+	default:
+		/* This node is not xattr_datum/xattr_ref */
+		ret = 1;
+		break;
+	}
+	return ret;
+}
+
+int jffs2_verify_xattr(struct jffs2_sb_info *c)
+{
+	struct jffs2_xattr_datum *xd, *_xd;
+	int rc;
+
+	down_write(&c->xattr_sem);
+	list_for_each_entry_safe(xd, _xd, &c->xattr_unchecked, xindex) {
+		rc = do_verify_xattr_datum(c, xd);
+		if (rc == 0) {
+			list_del_init(&xd->xindex);
+			break;
+		} else if (rc > 0) {
+			list_del_init(&xd->xindex);
+			delete_xattr_datum_node(c, xd);
+		}
+	}
+	up_write(&c->xattr_sem);
+
+	return list_empty(&c->xattr_unchecked) ? 1 : 0;
+}
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
new file mode 100644
index 00000000000..d157ad641ed
--- /dev/null
+++ b/fs/jffs2/xattr.h
@@ -0,0 +1,120 @@
+/*-------------------------------------------------------------------------*
+ *  File: fs/jffs2/xattr.c
+ *  XATTR support on JFFS2 FileSystem
+ *
+ *  Implemented by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *  Copyright (C) 2006 NEC Corporation
+ *
+ *  For licensing information, see the file 'LICENCE' in the jffs2 directory.
+ *-------------------------------------------------------------------------*/
+
+#ifndef _JFFS2_FS_XATTR_H_
+#define _JFFS2_FS_XATTR_H_
+
+#include <linux/xattr.h>
+
+#define JFFS2_XFLAGS_HOT	(0x01)	/* This datum is HOT */
+#define JFFS2_XFLAGS_BIND	(0x02)	/* This datum is not reclaimed */
+
+struct jffs2_xattr_datum
+{
+	void *always_null;
+	u8 class;
+	u8 flags;
+	u16 xprefix;			/* see JFFS2_XATTR_PREFIX_* */
+
+	struct jffs2_raw_node_ref *node;
+	struct list_head xindex;	/* chained from c->xattrindex[n] */
+	uint32_t refcnt;		/* # of xattr_ref refers this */
+	uint32_t xid;
+	uint32_t version;
+
+	uint32_t data_crc;
+	uint32_t hashkey;
+	char *xname;		/* XATTR name without prefix */
+	uint32_t name_len;	/* length of xname */
+	char *xvalue;		/* XATTR value */
+	uint32_t value_len;	/* length of xvalue */
+};
+
+struct jffs2_inode_cache;	/* forward refence */
+struct jffs2_xattr_ref
+{
+	void *always_null;
+	u8 class;
+	u8 flags;		/* Currently unused */
+	u16 unused;
+
+	struct jffs2_raw_node_ref *node;
+	union {
+		struct jffs2_inode_cache *ic;	/* reference to jffs2_inode_cache */
+		uint32_t ino;			/* only used in scanning/building  */
+	};
+	union {
+		struct jffs2_xattr_datum *xd;	/* reference to jffs2_xattr_datum */
+		uint32_t xid;			/* only used in sccanning/building */
+	};
+	struct list_head ilist;			/* chained from ic->ilist */
+};
+
+#ifdef CONFIG_JFFS2_FS_XATTR
+
+extern void jffs2_init_xattr_subsystem(struct jffs2_sb_info *c);
+extern void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c);
+extern void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c);
+
+extern struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c,
+                                                  uint32_t xid, uint32_t version);
+
+extern void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic);
+extern void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic);
+
+extern int jffs2_garbage_collect_xattr(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic);
+extern int jffs2_verify_xattr(struct jffs2_sb_info *c);
+
+extern int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname,
+			     char *buffer, size_t size);
+extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
+			     const char *buffer, size_t size, int flags);
+
+extern struct xattr_handler *jffs2_xattr_handlers[];
+extern struct xattr_handler jffs2_user_xattr_handler;
+extern struct xattr_handler jffs2_trusted_xattr_handler;
+
+extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
+#define jffs2_getxattr		generic_getxattr
+#define jffs2_setxattr		generic_setxattr
+#define jffs2_removexattr	generic_removexattr
+
+/*---- Any inline initialize functions ----*/
+#define init_xattr_inode_cache(x) INIT_LIST_HEAD(&((x)->ilist))
+
+#else
+
+#define jffs2_init_xattr_subsystem(c)
+#define jffs2_build_xattr_subsystem(c)
+#define jffs2_clear_xattr_subsystem(c)
+
+#define jffs2_xattr_delete_inode(c, ic)
+#define jffs2_xattr_free_inode(c, ic)
+#define jffs2_garbage_collect_xattr(c, ic)	(1)
+#define jffs2_verify_xattr(c)			(1)
+
+#define jffs2_xattr_handlers	NULL
+#define jffs2_listxattr		NULL
+#define jffs2_getxattr		NULL
+#define jffs2_setxattr		NULL
+#define jffs2_removexattr	NULL
+
+#define init_xattr_inode_cache(x)
+
+#endif /* CONFIG_JFFS2_FS_XATTR */
+
+#ifdef CONFIG_JFFS2_FS_SECURITY
+extern int jffs2_init_security(struct inode *inode, struct inode *dir);
+extern struct xattr_handler jffs2_security_xattr_handler;
+#else
+#define jffs2_init_security(inode,dir)	(0)
+#endif /* CONFIG_JFFS2_FS_SECURITY */
+
+#endif /* _JFFS2_FS_XATTR_H_ */
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
new file mode 100644
index 00000000000..a018c9c31a6
--- /dev/null
+++ b/fs/jffs2/xattr_trusted.c
@@ -0,0 +1,51 @@
+/*-------------------------------------------------------------------------*
+ *  File: fs/jffs2/xattr_trusted.c
+ *  XATTR support on JFFS2 FileSystem
+ *
+ *  Implemented by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *  Copyright (C) 2006 NEC Corporation
+ *
+ *  For licensing information, see the file 'LICENCE' in the jffs2 directory.
+ *-------------------------------------------------------------------------*/
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/jffs2.h>
+#include <linux/xattr.h>
+#include <linux/mtd/mtd.h>
+#include "nodelist.h"
+
+static int jffs2_trusted_getxattr(struct inode *inode, const char *name,
+				  void *buffer, size_t size)
+{
+	if (!strcmp(name, ""))
+		return -EINVAL;
+	return do_jffs2_getxattr(inode, JFFS2_XPREFIX_TRUSTED, name, buffer, size);
+}
+
+static int jffs2_trusted_setxattr(struct inode *inode, const char *name, const void *buffer,
+				  size_t size, int flags)
+{
+	if (!strcmp(name, ""))
+		return -EINVAL;
+	return do_jffs2_setxattr(inode, JFFS2_XPREFIX_TRUSTED, name, buffer, size, flags);
+}
+
+static size_t jffs2_trusted_listxattr(struct inode *inode, char *list, size_t list_size,
+				      const char *name, size_t name_len)
+{
+	size_t retlen = XATTR_TRUSTED_PREFIX_LEN + name_len + 1;
+
+	if (list && retlen<=list_size) {
+		strcpy(list, XATTR_TRUSTED_PREFIX);
+		strcpy(list + XATTR_TRUSTED_PREFIX_LEN, name);
+	}
+
+	return retlen;
+}
+
+struct xattr_handler jffs2_trusted_xattr_handler = {
+	.prefix = XATTR_TRUSTED_PREFIX,
+	.list = jffs2_trusted_listxattr,
+	.set = jffs2_trusted_setxattr,
+	.get = jffs2_trusted_getxattr
+};
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
new file mode 100644
index 00000000000..d8c13636ea4
--- /dev/null
+++ b/fs/jffs2/xattr_user.c
@@ -0,0 +1,51 @@
+/*-------------------------------------------------------------------------*
+ *  File: fs/jffs2/xattr_user.c
+ *  XATTR support on JFFS2 FileSystem
+ *
+ *  Implemented by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *  Copyright (C) 2006 NEC Corporation
+ *
+ *  For licensing information, see the file 'LICENCE' in the jffs2 directory.
+ *-------------------------------------------------------------------------*/
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/jffs2.h>
+#include <linux/xattr.h>
+#include <linux/mtd/mtd.h>
+#include "nodelist.h"
+
+static int jffs2_user_getxattr(struct inode *inode, const char *name,
+                               void *buffer, size_t size)
+{
+	if (!strcmp(name, ""))
+		return -EINVAL;
+	return do_jffs2_getxattr(inode, JFFS2_XPREFIX_USER, name, buffer, size);
+}
+
+static int jffs2_user_setxattr(struct inode *inode, const char *name, const void *buffer,
+                               size_t size, int flags)
+{
+	if (!strcmp(name, ""))
+		return -EINVAL;
+	return do_jffs2_setxattr(inode, JFFS2_XPREFIX_USER, name, buffer, size, flags);
+}
+
+static size_t jffs2_user_listxattr(struct inode *inode, char *list, size_t list_size,
+				   const char *name, size_t name_len)
+{
+	size_t retlen = XATTR_USER_PREFIX_LEN + name_len + 1;
+
+	if (list && retlen <= list_size) {
+		strcpy(list, XATTR_USER_PREFIX);
+		strcpy(list + XATTR_USER_PREFIX_LEN, name);
+	}
+
+	return retlen;
+}
+
+struct xattr_handler jffs2_user_xattr_handler = {
+	.prefix = XATTR_USER_PREFIX,
+	.list = jffs2_user_listxattr,
+	.set = jffs2_user_setxattr,
+	.get = jffs2_user_getxattr
+};
-- 
cgit v1.2.3


From de1f72fab35d2b6215017690c6dc27b8f4aa14bc Mon Sep 17 00:00:00 2001
From: KaiGai Kohei <kaigai@ak.jp.nec.com>
Date: Sat, 13 May 2006 15:13:27 +0900
Subject: [JFFS2][XATTR] remove typedef from posix_acl related definition.

jffs2_acl_header, jffs2_acl_entry and jffs2_acl_entry_short were redefined
with using 'struct' instead of 'typedef' in kernel implementation.

[1/10] jffs2-xattr-v5.1-01-remove_typedef_kernel.patch

Signed-off-by: KaiGai Kohei <kaigai@ak.jp.nec.com>
---
 fs/jffs2/acl.c | 52 ++++++++++++++++++++++++++--------------------------
 fs/jffs2/acl.h | 12 ++++++------
 2 files changed, 32 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 080bb51e4b6..de173df84a8 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -21,12 +21,12 @@
 static size_t jffs2_acl_size(int count)
 {
 	if (count <= 4) {
-		return sizeof(jffs2_acl_header)
-		       + count * sizeof(jffs2_acl_entry_short);
+		return sizeof(struct jffs2_acl_header)
+		       + count * sizeof(struct jffs2_acl_entry_short);
 	} else {
-		return sizeof(jffs2_acl_header)
-		       + 4 * sizeof(jffs2_acl_entry_short)
-		       + (count - 4) * sizeof(jffs2_acl_entry);
+		return sizeof(struct jffs2_acl_header)
+		       + 4 * sizeof(struct jffs2_acl_entry_short)
+		       + (count - 4) * sizeof(struct jffs2_acl_entry);
 	}
 }
 
@@ -34,16 +34,16 @@ static int jffs2_acl_count(size_t size)
 {
 	size_t s;
 
-	size -= sizeof(jffs2_acl_header);
-	s = size - 4 * sizeof(jffs2_acl_entry_short);
+	size -= sizeof(struct jffs2_acl_header);
+	s = size - 4 * sizeof(struct jffs2_acl_entry_short);
 	if (s < 0) {
-		if (size % sizeof(jffs2_acl_entry_short))
+		if (size % sizeof(struct jffs2_acl_entry_short))
 			return -1;
-		return size / sizeof(jffs2_acl_entry_short);
+		return size / sizeof(struct jffs2_acl_entry_short);
 	} else {
-		if (s % sizeof(jffs2_acl_entry))
+		if (s % sizeof(struct jffs2_acl_entry))
 			return -1;
-		return s / sizeof(jffs2_acl_entry) + 4;
+		return s / sizeof(struct jffs2_acl_entry) + 4;
 	}
 }
 
@@ -56,15 +56,15 @@ static struct posix_acl *jffs2_acl_from_medium(const void *value, size_t size)
 
 	if (!value)
 		return NULL;
-	if (size < sizeof(jffs2_acl_header))
+	if (size < sizeof(struct jffs2_acl_header))
 		return ERR_PTR(-EINVAL);
-	ver = je32_to_cpu(((jffs2_acl_header *)value)->a_version);
+	ver = je32_to_cpu(((struct jffs2_acl_header *)value)->a_version);
 	if (ver != JFFS2_ACL_VERSION) {
 		JFFS2_WARNING("Invalid ACL version. (=%u)\n", ver);
 		return ERR_PTR(-EINVAL);
 	}
 
-	value = (char *)value + sizeof(jffs2_acl_header);
+	value = (char *)value + sizeof(struct jffs2_acl_header);
 	count = jffs2_acl_count(size);
 	if (count < 0)
 		return ERR_PTR(-EINVAL);
@@ -76,8 +76,8 @@ static struct posix_acl *jffs2_acl_from_medium(const void *value, size_t size)
 		return ERR_PTR(-ENOMEM);
 
 	for (i=0; i < count; i++) {
-		jffs2_acl_entry *entry = (jffs2_acl_entry *)value;
-		if ((char *)value + sizeof(jffs2_acl_entry_short) > end)
+		struct jffs2_acl_entry *entry = (struct jffs2_acl_entry *)value;
+		if ((char *)value + sizeof(struct jffs2_acl_entry_short) > end)
 			goto fail;
 		acl->a_entries[i].e_tag = je16_to_cpu(entry->e_tag);
 		acl->a_entries[i].e_perm = je16_to_cpu(entry->e_perm);
@@ -86,13 +86,13 @@ static struct posix_acl *jffs2_acl_from_medium(const void *value, size_t size)
 			case ACL_GROUP_OBJ:
 			case ACL_MASK:
 			case ACL_OTHER:
-				value = (char *)value + sizeof(jffs2_acl_entry_short);
+				value = (char *)value + sizeof(struct jffs2_acl_entry_short);
 				acl->a_entries[i].e_id = ACL_UNDEFINED_ID;
 				break;
 
 			case ACL_USER:
 			case ACL_GROUP:
-				value = (char *)value + sizeof(jffs2_acl_entry);
+				value = (char *)value + sizeof(struct jffs2_acl_entry);
 				if ((char *)value > end)
 					goto fail;
 				acl->a_entries[i].e_id = je32_to_cpu(entry->e_id);
@@ -112,34 +112,34 @@ static struct posix_acl *jffs2_acl_from_medium(const void *value, size_t size)
 
 static void *jffs2_acl_to_medium(const struct posix_acl *acl, size_t *size)
 {
-	jffs2_acl_header *jffs2_acl;
+	struct jffs2_acl_header *jffs2_acl;
 	char *e;
 	size_t i;
 
 	*size = jffs2_acl_size(acl->a_count);
-	jffs2_acl = (jffs2_acl_header *)kmalloc(sizeof(jffs2_acl_header)
-						+ acl->a_count * sizeof(jffs2_acl_entry),
-						GFP_KERNEL);
+	jffs2_acl = kmalloc(sizeof(struct jffs2_acl_header)
+			     + acl->a_count * sizeof(struct jffs2_acl_entry),
+			    GFP_KERNEL);
 	if (!jffs2_acl)
 		return ERR_PTR(-ENOMEM);
 	jffs2_acl->a_version = cpu_to_je32(JFFS2_ACL_VERSION);
-	e = (char *)jffs2_acl + sizeof(jffs2_acl_header);
+	e = (char *)jffs2_acl + sizeof(struct jffs2_acl_header);
 	for (i=0; i < acl->a_count; i++) {
-		jffs2_acl_entry *entry = (jffs2_acl_entry *)e;
+		struct jffs2_acl_entry *entry = (struct jffs2_acl_entry *)e;
 		entry->e_tag = cpu_to_je16(acl->a_entries[i].e_tag);
 		entry->e_perm = cpu_to_je16(acl->a_entries[i].e_perm);
 		switch(acl->a_entries[i].e_tag) {
 			case ACL_USER:
 			case ACL_GROUP:
 				entry->e_id = cpu_to_je32(acl->a_entries[i].e_id);
-				e += sizeof(jffs2_acl_entry);
+				e += sizeof(struct jffs2_acl_entry);
 				break;
 
 			case ACL_USER_OBJ:
 			case ACL_GROUP_OBJ:
 			case ACL_MASK:
 			case ACL_OTHER:
-				e += sizeof(jffs2_acl_entry_short);
+				e += sizeof(struct jffs2_acl_entry_short);
 				break;
 
 			default:
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index c98610b4e81..3f41e09200b 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -7,20 +7,20 @@
  *
  *  For licensing information, see the file 'LICENCE' in the jffs2 directory.
  *-------------------------------------------------------------------------*/
-typedef struct {
+struct jffs2_acl_entry {
 	jint16_t	e_tag;
 	jint16_t	e_perm;
 	jint32_t	e_id;
-} jffs2_acl_entry;
+};
 
-typedef struct {
+struct jffs2_acl_entry_short {
 	jint16_t	e_tag;
 	jint16_t	e_perm;
-} jffs2_acl_entry_short;
+};
 
-typedef struct {
+struct jffs2_acl_header {
 	jint32_t	a_version;
-} jffs2_acl_header;
+};
 
 #ifdef __KERNEL__
 #ifdef CONFIG_JFFS2_FS_POSIX_ACL
-- 
cgit v1.2.3


From 8b0b339d46ca0105a9936e3caa3bac80b72de7a3 Mon Sep 17 00:00:00 2001
From: KaiGai Kohei <kaigai@ak.jp.nec.com>
Date: Sat, 13 May 2006 15:14:14 +0900
Subject: [JFFS2][XATTR] Add a description about c->xattr_sem

Add a description about the c->xattr_sem read/write semaphore
into README.Locking.

[3/10] jffs2-xattr-v5.1-03-append_README.Locking.patch

Signed-off-by: KaiGai Kohei <kaigai@ak.jp.nec.com>
---
 fs/jffs2/README.Locking | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'fs')

diff --git a/fs/jffs2/README.Locking b/fs/jffs2/README.Locking
index b7943439b6e..c8f0bd64e53 100644
--- a/fs/jffs2/README.Locking
+++ b/fs/jffs2/README.Locking
@@ -150,3 +150,24 @@ the buffer.
 
 Ordering constraints:
 	Lock wbuf_sem last, after the alloc_sem or and f->sem.
+
+
+	c->xattr_sem
+	------------
+
+This read/write semaphore protects against concurrent access to the
+xattr related objects which include stuff in superblock and ic->xref.
+In read-only path, write-semaphore is too much exclusion. It's enough
+by read-semaphore. But you must hold write-semaphore when updating,
+creating or deleting any xattr related object.
+
+Once xattr_sem released, there would be no assurance for the existence
+of those objects. Thus, a series of processes is often required to retry,
+when updating such a object is necessary under holding read semaphore.
+For example, do_jffs2_getxattr() holds read-semaphore to scan xref and
+xdatum at first. But it retries this process with holding write-semaphore
+after release read-semaphore, if it's necessary to load name/value pair
+from medium.
+
+Ordering constraints:
+	Lock xattr_sem last, after the alloc_sem.
-- 
cgit v1.2.3


From 8f2b6f49c656dd4597904f8c20661d6b73cdbbeb Mon Sep 17 00:00:00 2001
From: KaiGai Kohei <kaigai@ak.jp.nec.com>
Date: Sat, 13 May 2006 15:15:07 +0900
Subject: [JFFS2][XATTR] Remove 'struct list_head ilist' from
 jffs2_inode_cache.

This patch can reduce 4-byte of memory usage per inode_cache.

[4/10] jffs2-xattr-v5.1-04-remove_ilist_from_ic.patch

Signed-off-by: KaiGai Kohei <kaigai@ak.jp.nec.com>
---
 fs/jffs2/jffs2_fs_sb.h |  2 +-
 fs/jffs2/malloc.c      |  1 -
 fs/jffs2/nodelist.h    |  2 +-
 fs/jffs2/readinode.c   |  1 -
 fs/jffs2/scan.c        |  6 ++--
 fs/jffs2/summary.c     |  3 +-
 fs/jffs2/write.c       |  1 -
 fs/jffs2/xattr.c       | 75 +++++++++++++++++++++++++++++++-------------------
 fs/jffs2/xattr.h       |  7 +----
 9 files changed, 55 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 3b4e0edd6db..272fbea5519 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -119,8 +119,8 @@ struct jffs2_sb_info {
 #define XATTRINDEX_HASHSIZE	(57)
 	uint32_t highest_xid;
 	struct list_head xattrindex[XATTRINDEX_HASHSIZE];
-	struct list_head xattr_temp;
 	struct list_head xattr_unchecked;
+	struct jffs2_xattr_ref *xref_temp;
 	struct rw_semaphore xattr_sem;
 	uint32_t xdatum_mem_usage;
 	uint32_t xdatum_mem_threshold;
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index 3d5b7ecfbf8..f2473fa2fd1 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -259,7 +259,6 @@ struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void)
 
 	memset(ref, 0, sizeof(struct jffs2_xattr_ref));
 	ref->class = RAWNODE_CLASS_XATTR_REF;
-	INIT_LIST_HEAD(&ref->ilist);
 	return ref;
 }
 
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 6f6279cf490..351d947c937 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -117,7 +117,7 @@ struct jffs2_inode_cache {
 	uint32_t ino;
 	int nlink;
 #ifdef CONFIG_JFFS2_FS_XATTR
-	struct list_head ilist;
+	struct jffs2_xattr_ref *xref;
 #endif
 };
 
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 61ccdf4f104..e1acce8fb2b 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -902,7 +902,6 @@ int jffs2_do_read_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
 		f->inocache->ino = f->inocache->nlink = 1;
 		f->inocache->nodes = (struct jffs2_raw_node_ref *)f->inocache;
 		f->inocache->state = INO_STATE_READING;
-		init_xattr_inode_cache(f->inocache);
 		jffs2_add_ino_cache(c, f->inocache);
 	}
 	if (!f->inocache) {
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index f09689e320f..0a79fc921e9 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -408,14 +408,15 @@ static int jffs2_scan_xref_node(struct jffs2_sb_info *c, struct jffs2_eraseblock
 	 * ref->xid is used to store 32bit xid, xd is not used
 	 * ref->ino is used to store 32bit inode-number, ic is not used
 	 * Thoes variables are declared as union, thus using those
-	 * are exclusive. In a similar way, ref->ilist is temporarily
+	 * are exclusive. In a similar way, ref->next is temporarily
 	 * used to chain all xattr_ref object. It's re-chained to
 	 * jffs2_inode_cache in jffs2_build_xattr_subsystem() correctly.
 	 */
 	ref->node = raw;
 	ref->ino = je32_to_cpu(rr->ino);
 	ref->xid = je32_to_cpu(rr->xid);
-	list_add_tail(&ref->ilist, &c->xattr_temp);
+	ref->next = c->xref_temp;
+	c->xref_temp = ref;
 
 	raw->__totlen = PAD(je32_to_cpu(rr->totlen));
 	raw->flash_offset = ofs | REF_PRISTINE;
@@ -888,7 +889,6 @@ struct jffs2_inode_cache *jffs2_scan_make_ino_cache(struct jffs2_sb_info *c, uin
 
 	ic->ino = ino;
 	ic->nodes = (void *)ic;
-	init_xattr_inode_cache(ic);
 	jffs2_add_ino_cache(c, ic);
 	if (ino == 1)
 		ic->nlink = 1;
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 5d9ec8e3652..831a42c1305 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -556,7 +556,8 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 				ref->ino = 0xfffffffe;
 				ref->xid = 0xfffffffd;
 				ref->node = raw;
-				list_add_tail(&ref->ilist, &c->xattr_temp);
+				ref->next = c->xref_temp;
+				c->xref_temp = ref;
 
 				raw->__totlen = PAD(sizeof(struct jffs2_raw_xref));
 				raw->flash_offset = ofs | REF_UNCHECKED;
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index d5c78195f3b..ff2b00b604e 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -36,7 +36,6 @@ int jffs2_do_new_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, uint
 	f->inocache->nlink = 1;
 	f->inocache->nodes = (struct jffs2_raw_node_ref *)f->inocache;
 	f->inocache->state = INO_STATE_PRESENT;
-	init_xattr_inode_cache(f->inocache);
 
 	jffs2_add_ino_cache(c, f->inocache);
 	D1(printk(KERN_DEBUG "jffs2_do_new_inode(): Assigned ino# %d\n", f->inocache->ino));
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index c9a185c54ce..b16bc71c9cd 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -456,7 +456,7 @@ static struct jffs2_xattr_datum *create_xattr_datum(struct jffs2_sb_info *c,
  *   is called to remove xrefs related to obsolete inode when inode is unlinked.
  * jffs2_xattr_free_inode(c, ic)
  *   is called to release xattr related objects when unmounting. 
- * check_xattr_ref_ilist(c, ic)
+ * check_xattr_ref_inode(c, ic)
  *   is used to confirm inode does not have duplicate xattr name/value pair.
  * -------------------------------------------------- */
 static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
@@ -549,7 +549,6 @@ static void delete_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *re
 	BUG_ON(!ref->node);
 	delete_xattr_ref_node(c, ref);
 
-	list_del(&ref->ilist);
 	xd = ref->xd;
 	xd->refcnt--;
 	if (!xd->refcnt)
@@ -629,7 +628,8 @@ static struct jffs2_xattr_ref *create_xattr_ref(struct jffs2_sb_info *c, struct
 	}
 
 	/* Chain to inode */
-	list_add(&ref->ilist, &ic->ilist);
+	ref->next = ic->xref;
+	ic->xref = ref;
 
 	return ref; /* success */
 }
@@ -644,8 +644,11 @@ void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache
 		return;
 
 	down_write(&c->xattr_sem);
-	list_for_each_entry_safe(ref, _ref, &ic->ilist, ilist)
+	for (ref = ic->xref; ref; ref = _ref) {
+		_ref = ref->next;
 		delete_xattr_ref(c, ref);
+	}
+	ic->xref = NULL;
 	up_write(&c->xattr_sem);
 }
 
@@ -656,8 +659,8 @@ void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *i
 	struct jffs2_xattr_ref *ref, *_ref;
 
 	down_write(&c->xattr_sem);
-	list_for_each_entry_safe(ref, _ref, &ic->ilist, ilist) {
-		list_del(&ref->ilist);
+	for (ref = ic->xref; ref; ref = _ref) {
+		_ref = ref->next;
 		xd = ref->xd;
 		xd->refcnt--;
 		if (!xd->refcnt) {
@@ -666,16 +669,17 @@ void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *i
 		}
 		jffs2_free_xattr_ref(ref);
 	}
+	ic->xref = NULL;
 	up_write(&c->xattr_sem);
 }
 
-static int check_xattr_ref_ilist(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
+static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
 {
-	/* success of check_xattr_ref_ilist() means taht inode (ic) dose not have
+	/* success of check_xattr_ref_inode() means taht inode (ic) dose not have
 	 * duplicate name/value pairs. If duplicate name/value pair would be found,
 	 * one will be removed.
 	 */
-	struct jffs2_xattr_ref *ref, *cmp;
+	struct jffs2_xattr_ref *ref, *cmp, **pref;
 	int rc = 0;
 
 	if (likely(ic->flags & INO_FLAGS_XATTR_CHECKED))
@@ -683,22 +687,23 @@ static int check_xattr_ref_ilist(struct jffs2_sb_info *c, struct jffs2_inode_cac
 	down_write(&c->xattr_sem);
  retry:
 	rc = 0;
-	list_for_each_entry(ref, &ic->ilist, ilist) {
+	for (ref=ic->xref, pref=&ic->xref; ref; pref=&ref->next, ref=ref->next) {
 		if (!ref->xd->xname) {
 			rc = load_xattr_datum(c, ref->xd);
 			if (unlikely(rc > 0)) {
+				*pref = ref->next;
 				delete_xattr_ref(c, ref);
 				goto retry;
 			} else if (unlikely(rc < 0))
 				goto out;
 		}
-		cmp = ref;
-		list_for_each_entry_continue(cmp, &ic->ilist, ilist) {
+		for (cmp=ref->next, pref=&ref->next; cmp; pref=&cmp->next, cmp=cmp->next) {
 			if (!cmp->xd->xname) {
 				ref->xd->flags |= JFFS2_XFLAGS_BIND;
 				rc = load_xattr_datum(c, cmp->xd);
 				ref->xd->flags &= ~JFFS2_XFLAGS_BIND;
 				if (unlikely(rc > 0)) {
+					*pref = cmp->next;
 					delete_xattr_ref(c, cmp);
 					goto retry;
 				} else if (unlikely(rc < 0))
@@ -706,6 +711,7 @@ static int check_xattr_ref_ilist(struct jffs2_sb_info *c, struct jffs2_inode_cac
 			}
 			if (ref->xd->xprefix == cmp->xd->xprefix
 			    && !strcmp(ref->xd->xname, cmp->xd->xname)) {
+				*pref = cmp->next;
 				delete_xattr_ref(c, cmp);
 				goto retry;
 			}
@@ -736,8 +742,8 @@ void jffs2_init_xattr_subsystem(struct jffs2_sb_info *c)
 
 	for (i=0; i < XATTRINDEX_HASHSIZE; i++)
 		INIT_LIST_HEAD(&c->xattrindex[i]);
-	INIT_LIST_HEAD(&c->xattr_temp);
 	INIT_LIST_HEAD(&c->xattr_unchecked);
+	c->xref_temp = NULL;
 
 	init_rwsem(&c->xattr_sem);
 	c->xdatum_mem_usage = 0;
@@ -765,8 +771,11 @@ void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c)
 	struct jffs2_xattr_ref *ref, *_ref;
 	int i;
 
-	list_for_each_entry_safe(ref, _ref, &c->xattr_temp, ilist)
+	for (ref=c->xref_temp; ref; ref = _ref) {
+		_ref = ref->next;
 		jffs2_free_xattr_ref(ref);
+	}
+	c->xref_temp = NULL;
 
 	for (i=0; i < XATTRINDEX_HASHSIZE; i++) {
 		list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) {
@@ -788,8 +797,8 @@ void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c)
 	BUG_ON(!(c->flags & JFFS2_SB_FLAG_BUILDING));
 
 	/* Phase.1 */
-	list_for_each_entry_safe(ref, _ref, &c->xattr_temp, ilist) {
-		list_del_init(&ref->ilist);
+	for (ref=c->xref_temp; ref; ref=_ref) {
+		_ref = ref->next;
 		/* checking REF_UNCHECKED nodes */
 		if (ref_flags(ref->node) != REF_PRISTINE) {
 			if (verify_xattr_ref(c, ref)) {
@@ -813,9 +822,11 @@ void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c)
 		ref->xd = xd;
 		ref->ic = ic;
 		xd->refcnt++;
-		list_add_tail(&ref->ilist, &ic->ilist);
+		ref->next = ic->xref;
+		ic->xref = ref;
 		xref_count++;
 	}
+	c->xref_temp = NULL;
 	/* After this, ref->xid/ino are NEVER used. */
 
 	/* Phase.2 */
@@ -931,20 +942,20 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
 	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
 	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
 	struct jffs2_inode_cache *ic = f->inocache;
-	struct jffs2_xattr_ref *ref;
+	struct jffs2_xattr_ref *ref, **pref;
 	struct jffs2_xattr_datum *xd;
 	struct xattr_handler *xhandle;
 	ssize_t len, rc;
 	int retry = 0;
 
-	rc = check_xattr_ref_ilist(c, ic);
+	rc = check_xattr_ref_inode(c, ic);
 	if (unlikely(rc))
 		return rc;
 
 	down_read(&c->xattr_sem);
  retry:
 	len = 0;
-	list_for_each_entry(ref, &ic->ilist, ilist) {
+	for (ref=ic->xref, pref=&ic->xref; ref; pref=&ref->next, ref=ref->next) {
 		BUG_ON(ref->ic != ic);
 		xd = ref->xd;
 		if (!xd->xname) {
@@ -957,6 +968,7 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
 			} else {
 				rc = load_xattr_datum(c, xd);
 				if (unlikely(rc > 0)) {
+					*pref = ref->next;
 					delete_xattr_ref(c, ref);
 					goto retry;
 				} else if (unlikely(rc < 0))
@@ -992,16 +1004,16 @@ int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname,
 	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
 	struct jffs2_inode_cache *ic = f->inocache;
 	struct jffs2_xattr_datum *xd;
-	struct jffs2_xattr_ref *ref;
+	struct jffs2_xattr_ref *ref, **pref;
 	int rc, retry = 0;
 
-	rc = check_xattr_ref_ilist(c, ic);
+	rc = check_xattr_ref_inode(c, ic);
 	if (unlikely(rc))
 		return rc;
 
 	down_read(&c->xattr_sem);
  retry:
-	list_for_each_entry(ref, &ic->ilist, ilist) {
+	for (ref=ic->xref, pref=&ic->xref; ref; pref=&ref->next, ref=ref->next) {
 		BUG_ON(ref->ic!=ic);
 
 		xd = ref->xd;
@@ -1017,6 +1029,7 @@ int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname,
 			} else {
 				rc = load_xattr_datum(c, xd);
 				if (unlikely(rc > 0)) {
+					*pref = ref->next;
 					delete_xattr_ref(c, ref);
 					goto retry;
 				} else if (unlikely(rc < 0)) {
@@ -1053,11 +1066,11 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
 	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
 	struct jffs2_inode_cache *ic = f->inocache;
 	struct jffs2_xattr_datum *xd;
-	struct jffs2_xattr_ref *ref, *newref;
+	struct jffs2_xattr_ref *ref, *newref, **pref;
 	uint32_t phys_ofs, length, request;
 	int rc;
 
-	rc = check_xattr_ref_ilist(c, ic);
+	rc = check_xattr_ref_inode(c, ic);
 	if (unlikely(rc))
 		return rc;
 
@@ -1072,13 +1085,14 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
 	/* Find existing xattr */
 	down_write(&c->xattr_sem);
  retry:
-	list_for_each_entry(ref, &ic->ilist, ilist) {
+	for (ref=ic->xref, pref=&ic->xref; ref; pref=&ref->next, ref=ref->next) {
 		xd = ref->xd;
 		if (xd->xprefix != xprefix)
 			continue;
 		if (!xd->xname) {
 			rc = load_xattr_datum(c, xd);
 			if (unlikely(rc > 0)) {
+				*pref = ref->next;
 				delete_xattr_ref(c, ref);
 				goto retry;
 			} else if (unlikely(rc < 0))
@@ -1090,6 +1104,7 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
 				goto out;
 			}
 			if (!buffer) {
+				*pref = ref->next;
 				delete_xattr_ref(c, ref);
 				rc = 0;
 				goto out;
@@ -1098,7 +1113,6 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
 		}
 	}
 	/* not found */
-	ref = NULL;
 	if (flags & XATTR_REPLACE) {
 		rc = -ENODATA;
 		goto out;
@@ -1130,14 +1144,19 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
 		return rc;
 	}
 	down_write(&c->xattr_sem);
+	if (ref)
+		*pref = ref->next;
 	newref = create_xattr_ref(c, ic, xd, phys_ofs);
 	if (IS_ERR(newref)) {
+		if (ref) {
+			ref->next = ic->xref;
+			ic->xref = ref;
+		}
 		rc = PTR_ERR(newref);
 		xd->refcnt--;
 		if (!xd->refcnt)
 			delete_xattr_datum(c, xd);
 	} else if (ref) {
-		/* If replaced xattr_ref exists */
 		delete_xattr_ref(c, ref);
 	}
  out:
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index d157ad641ed..0360097e593 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -54,7 +54,7 @@ struct jffs2_xattr_ref
 		struct jffs2_xattr_datum *xd;	/* reference to jffs2_xattr_datum */
 		uint32_t xid;			/* only used in sccanning/building */
 	};
-	struct list_head ilist;			/* chained from ic->ilist */
+	struct jffs2_xattr_ref *next;		/* chained from ic->xref_list */
 };
 
 #ifdef CONFIG_JFFS2_FS_XATTR
@@ -86,9 +86,6 @@ extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
 #define jffs2_setxattr		generic_setxattr
 #define jffs2_removexattr	generic_removexattr
 
-/*---- Any inline initialize functions ----*/
-#define init_xattr_inode_cache(x) INIT_LIST_HEAD(&((x)->ilist))
-
 #else
 
 #define jffs2_init_xattr_subsystem(c)
@@ -106,8 +103,6 @@ extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
 #define jffs2_setxattr		NULL
 #define jffs2_removexattr	NULL
 
-#define init_xattr_inode_cache(x)
-
 #endif /* CONFIG_JFFS2_FS_XATTR */
 
 #ifdef CONFIG_JFFS2_FS_SECURITY
-- 
cgit v1.2.3


From 084702e00111eb9ffb6d8a5c1938b8e5423e40a8 Mon Sep 17 00:00:00 2001
From: KaiGai Kohei <kaigai@ak.jp.nec.com>
Date: Sat, 13 May 2006 15:16:13 +0900
Subject: [JFFS2][XATTR] Remove jffs2_garbage_collect_xattr(c, ic)

Remove jffs2_garbage_collect_xattr(c, ic).
jffs2_garbage_collect_xattr_datum/ref() are called from gc.c directly.

In original implementation, jffs2_garbage_collect_xattr(c, ic) returns
with holding a spinlock if 'ic' is inode_cache. But it returns after
releasing a spinlock if 'ic' is xattr_datum/ref.
It looks so confusable behavior. Thus, this patch makes caller manage
locking/unlocking.

[5/10] jffs2-xattr-v5.1-05-update_xattr_gc.patch

Signed-off-by: KaiGai Kohei <kaigai@ak.jp.nec.com>
---
 fs/jffs2/gc.c    | 21 +++++++++++------
 fs/jffs2/xattr.c | 68 +++++++++++++++++---------------------------------------
 fs/jffs2/xattr.h |  4 ++--
 3 files changed, 36 insertions(+), 57 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 4ea1b7f0ae7..a5ef9814f16 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -266,15 +266,22 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
 
 	ic = jffs2_raw_ref_to_ic(raw);
 
+#ifdef CONFIG_JFFS2_FS_XATTR
 	/* When 'ic' refers xattr_datum/xattr_ref, this node is GCed as xattr.
-	   We can decide whether this node is inode or xattr by ic->class.
-	   ret = 0 : ic is xattr_datum/xattr_ref, and GC was SUCCESSED.
-	   ret < 0 : ic is xattr_datum/xattr_ref, but GC was FAILED.
-	   ret > 0 : ic is NOT xattr_datum/xattr_ref.
-	*/
-	ret = jffs2_garbage_collect_xattr(c, ic);
-	if (ret <= 0)
+	 * We can decide whether this node is inode or xattr by ic->class.     */
+	if (ic->class == RAWNODE_CLASS_XATTR_DATUM
+	    || ic->class == RAWNODE_CLASS_XATTR_REF) {
+		BUG_ON(raw->next_in_ino != (void *)ic);
+		spin_unlock(&c->erase_completion_lock);
+
+		if (ic->class == RAWNODE_CLASS_XATTR_DATUM) {
+			ret = jffs2_garbage_collect_xattr_datum(c, (struct jffs2_xattr_datum *)ic);
+		} else {
+			ret = jffs2_garbage_collect_xattr_ref(c, (struct jffs2_xattr_ref *)ic);
+		}
 		goto release_sem;
+	}
+#endif
 
 	/* We need to hold the inocache. Either the erase_completion_lock or
 	   the inocache_lock are sufficient; we trade down since the inocache_lock
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index b16bc71c9cd..9c1f401d12d 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -1170,104 +1170,76 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
  *   is used to move xdatum into new node.
  * jffs2_garbage_collect_xattr_ref(c, ref)
  *   is used to move xref into new node.
- * jffs2_garbage_collect_xattr(c, ic)
- *   is used to call appropriate garbage collector function, if argument
- *   pointer (ic) is the reference of xdatum/xref.
  * jffs2_verify_xattr(c)
  *   is used to call do_verify_xattr_datum() before garbage collecting.
  * -------------------------------------------------- */
-static int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c,
-					     struct jffs2_xattr_datum *xd)
+int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
 {
-	/* must be called under down_write(xattr_sem), and called from GC thread */
 	uint32_t phys_ofs, totlen, length, old_ofs;
-	int rc;
+	int rc = -EINVAL;
 
+	down_write(&c->xattr_sem);
 	BUG_ON(!xd->node);
 
 	old_ofs = ref_offset(xd->node);
 	totlen = ref_totlen(c, c->gcblock, xd->node);
 	if (totlen < sizeof(struct jffs2_raw_xattr))
-		return -EINVAL;
+		goto out;
 
 	if (!xd->xname) {
 		rc = load_xattr_datum(c, xd);
 		if (unlikely(rc > 0)) {
 			delete_xattr_datum_node(c, xd);
-			return 0;
+			rc = 0;
+			goto out;
 		} else if (unlikely(rc < 0))
-			return -EINVAL;
+			goto out;
 	}
 	rc = jffs2_reserve_space_gc(c, totlen, &phys_ofs, &length, JFFS2_SUMMARY_XATTR_SIZE);
 	if (rc || length < totlen) {
 		JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, totlen);
-		return rc ? rc : -EBADFD;
+		rc = rc ? rc : -EBADFD;
+		goto out;
 	}
 	rc = save_xattr_datum(c, xd, phys_ofs);
 	if (!rc)
 		dbg_xattr("xdatum (xid=%u, version=%u) GC'ed from %#08x to %08x\n",
 			  xd->xid, xd->version, old_ofs, ref_offset(xd->node));
+ out:
+	up_write(&c->xattr_sem);
 	return rc;
 }
 
 
-static int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c,
-					   struct jffs2_xattr_ref *ref)
+int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
 {
-	/* must be called under down(alloc_sem) */
 	uint32_t phys_ofs, totlen, length, old_ofs;
-	int rc;
+	int rc = -EINVAL;
 
+	down_write(&c->xattr_sem);
 	BUG_ON(!ref->node);
 
 	old_ofs = ref_offset(ref->node);
 	totlen = ref_totlen(c, c->gcblock, ref->node);
 	if (totlen != sizeof(struct jffs2_raw_xref))
-		return -EINVAL;
+		goto out;
+
 	rc = jffs2_reserve_space_gc(c, totlen, &phys_ofs, &length, JFFS2_SUMMARY_XREF_SIZE);
 	if (rc || length < totlen) {
 		JFFS2_WARNING("%s: jffs2_reserve_space() = %d, request = %u\n",
 			      __FUNCTION__, rc, totlen);
-		return rc ? rc : -EBADFD;
+		rc = rc ? rc : -EBADFD;
+		goto out;
 	}
 	rc = save_xattr_ref(c, ref, phys_ofs);
 	if (!rc)
 		dbg_xattr("xref (ino=%u, xid=%u) GC'ed from %#08x to %08x\n",
 			  ref->ic->ino, ref->xd->xid, old_ofs, ref_offset(ref->node));
+ out:
+	up_write(&c->xattr_sem);
 	return rc;
 }
 
-int jffs2_garbage_collect_xattr(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
-{
-	struct jffs2_xattr_datum *xd;
-	struct jffs2_xattr_ref *ref;
-	int ret;
-
-	switch (ic->class) {
-	case RAWNODE_CLASS_XATTR_DATUM:
-		spin_unlock(&c->erase_completion_lock);
-
-		down_write(&c->xattr_sem);
-		xd = (struct jffs2_xattr_datum *)ic;
-		ret = xd ? jffs2_garbage_collect_xattr_datum(c, xd) : 0;
-		up_write(&c->xattr_sem);
-		break;
-	case RAWNODE_CLASS_XATTR_REF:
-		spin_unlock(&c->erase_completion_lock);
-
-		down_write(&c->xattr_sem);
-		ref = (struct jffs2_xattr_ref *)ic;
-		ret = ref ? jffs2_garbage_collect_xattr_ref(c, ref) : 0;
-		up_write(&c->xattr_sem);
-		break;
-	default:
-		/* This node is not xattr_datum/xattr_ref */
-		ret = 1;
-		break;
-	}
-	return ret;
-}
-
 int jffs2_verify_xattr(struct jffs2_sb_info *c)
 {
 	struct jffs2_xattr_datum *xd, *_xd;
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index 0360097e593..762814b7107 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -69,7 +69,8 @@ extern struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c
 extern void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic);
 extern void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic);
 
-extern int jffs2_garbage_collect_xattr(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic);
+extern int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd);
+extern int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref);
 extern int jffs2_verify_xattr(struct jffs2_sb_info *c);
 
 extern int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname,
@@ -94,7 +95,6 @@ extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
 
 #define jffs2_xattr_delete_inode(c, ic)
 #define jffs2_xattr_free_inode(c, ic)
-#define jffs2_garbage_collect_xattr(c, ic)	(1)
 #define jffs2_verify_xattr(c)			(1)
 
 #define jffs2_xattr_handlers	NULL
-- 
cgit v1.2.3


From 4470d0409bfe093abbf965dcc97e5c1450c80afb Mon Sep 17 00:00:00 2001
From: KaiGai Kohei <kaigai@ak.jp.nec.com>
Date: Sat, 13 May 2006 15:17:11 +0900
Subject: [JFFS2][XATTR] '#include <linux/list.h>' was added into xattr.h.

'#include <linux/list.h>' was added into xattr.h.
because 'struct list_head' is used in this header file.

[6/10] jffs2-xattr-v5.1-06-add_list.h.patch

Signed-off-by: KaiGai Kohei <kaigai@ak.jp.nec.com>
---
 fs/jffs2/xattr.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index 762814b7107..750d77ee688 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -12,6 +12,7 @@
 #define _JFFS2_FS_XATTR_H_
 
 #include <linux/xattr.h>
+#include <linux/list.h>
 
 #define JFFS2_XFLAGS_HOT	(0x01)	/* This datum is HOT */
 #define JFFS2_XFLAGS_BIND	(0x02)	/* This datum is not reclaimed */
-- 
cgit v1.2.3


From 652ecc20d1f5b4fd745c185c940e5b3afb2a0711 Mon Sep 17 00:00:00 2001
From: KaiGai Kohei <kaigai@ak.jp.nec.com>
Date: Sat, 13 May 2006 15:18:27 +0900
Subject: [JFFS2][XATTR] Unify each file header part with any jffs2 file.

Unify each file header part with any jffs2 file.

[7/10] jffs2-xattr-v5.1-07-unify_file_header.patch

Signed-off-by: KaiGai Kohei <kaigai@ak.jp.nec.com>
---
 fs/jffs2/acl.c           | 15 ++++++++-------
 fs/jffs2/acl.h           | 15 ++++++++-------
 fs/jffs2/security.c      | 16 ++++++++--------
 fs/jffs2/xattr.c         | 16 ++++++++--------
 fs/jffs2/xattr.h         | 16 ++++++++--------
 fs/jffs2/xattr_trusted.c | 15 ++++++++-------
 fs/jffs2/xattr_user.c    | 15 ++++++++-------
 7 files changed, 56 insertions(+), 52 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index de173df84a8..1682278d974 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -1,12 +1,13 @@
-/*-------------------------------------------------------------------------*
- *  File: fs/jffs2/acl.c
- *  POSIX ACL support on JFFS2 FileSystem
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
  *
- *  Implemented by KaiGai Kohei <kaigai@ak.jp.nec.com>
- *  Copyright (C) 2006 NEC Corporation
+ * Copyright (C) 2006  NEC Corporation
  *
- *  For licensing information, see the file 'LICENCE' in the jffs2 directory.
- *-------------------------------------------------------------------------*/
+ * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/fs.h>
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index 3f41e09200b..54761fa8f23 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -1,12 +1,13 @@
-/*-------------------------------------------------------------------------*
- *  File: fs/jffs2/acl.h
- *  POSIX ACL support on JFFS2 FileSystem
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
  *
- *  Implemented by KaiGai Kohei <kaigai@ak.jp.nec.com>
- *  Copyright (C) 2006 NEC Corporation
+ * Copyright (C) 2006  NEC Corporation
  *
- *  For licensing information, see the file 'LICENCE' in the jffs2 directory.
- *-------------------------------------------------------------------------*/
+ * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
 struct jffs2_acl_entry {
 	jint16_t	e_tag;
 	jint16_t	e_perm;
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index 4b6c3b22524..52a9894a636 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -1,13 +1,13 @@
-/*-------------------------------------------------------------------------*
- *  File: fs/jffs2/security.c
- *  Security Labels support on JFFS2 FileSystem
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
  *
- *  Implemented by KaiGai Kohei <kaigai@ak.jp.nec.com>
- *  Copyright (C) 2006 NEC Corporation
+ * Copyright (C) 2006  NEC Corporation
  *
- *  For licensing information, see the file 'LICENCE' in the jffs2 directory.
- *-------------------------------------------------------------------------*/
-
+ * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/fs.h>
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 9c1f401d12d..330e4491e73 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -1,13 +1,13 @@
-/* -------------------------------------------------------------------------
- *  File: fs/jffs2/xattr.c
- *  XATTR support on JFFS2 FileSystem
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
  *
- *  Implemented by KaiGai Kohei <kaigai@ak.jp.nec.com>
- *  Copyright (C) 2006 NEC Corporation
+ * Copyright (C) 2006  NEC Corporation
  *
- *  For licensing information, see the file 'LICENCE' in the jffs2 directory.
- * ------------------------------------------------------------------------- */
-
+ * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/fs.h>
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index 750d77ee688..07cc737479f 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -1,13 +1,13 @@
-/*-------------------------------------------------------------------------*
- *  File: fs/jffs2/xattr.c
- *  XATTR support on JFFS2 FileSystem
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
  *
- *  Implemented by KaiGai Kohei <kaigai@ak.jp.nec.com>
- *  Copyright (C) 2006 NEC Corporation
+ * Copyright (C) 2006  NEC Corporation
  *
- *  For licensing information, see the file 'LICENCE' in the jffs2 directory.
- *-------------------------------------------------------------------------*/
-
+ * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
 #ifndef _JFFS2_FS_XATTR_H_
 #define _JFFS2_FS_XATTR_H_
 
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index a018c9c31a6..ed046e19dbf 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -1,12 +1,13 @@
-/*-------------------------------------------------------------------------*
- *  File: fs/jffs2/xattr_trusted.c
- *  XATTR support on JFFS2 FileSystem
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
  *
- *  Implemented by KaiGai Kohei <kaigai@ak.jp.nec.com>
- *  Copyright (C) 2006 NEC Corporation
+ * Copyright (C) 2006  NEC Corporation
  *
- *  For licensing information, see the file 'LICENCE' in the jffs2 directory.
- *-------------------------------------------------------------------------*/
+ * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/jffs2.h>
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
index d8c13636ea4..2f8e9aa01ea 100644
--- a/fs/jffs2/xattr_user.c
+++ b/fs/jffs2/xattr_user.c
@@ -1,12 +1,13 @@
-/*-------------------------------------------------------------------------*
- *  File: fs/jffs2/xattr_user.c
- *  XATTR support on JFFS2 FileSystem
+/*
+ * JFFS2 -- Journalling Flash File System, Version 2.
  *
- *  Implemented by KaiGai Kohei <kaigai@ak.jp.nec.com>
- *  Copyright (C) 2006 NEC Corporation
+ * Copyright (C) 2006  NEC Corporation
  *
- *  For licensing information, see the file 'LICENCE' in the jffs2 directory.
- *-------------------------------------------------------------------------*/
+ * Created by KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * For licensing information, see the file 'LICENCE' in this directory.
+ *
+ */
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/jffs2.h>
-- 
cgit v1.2.3


From ee886b5df17f9791a72cf0afe7f6c0c079231ef8 Mon Sep 17 00:00:00 2001
From: KaiGai Kohei <kaigai@ak.jp.nec.com>
Date: Sat, 13 May 2006 15:19:03 +0900
Subject: [JFFS2][XATTR] remove senseless comment

remove senseless comment.

[8/10] jffs2-xattr-v5.1-08-remove_senseless_comment.patch

Signed-off-by: KaiGai Kohei <kaigai@ak.jp.nec.com>
---
 fs/jffs2/xattr.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index 07cc737479f..e2aa2394ab6 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -38,7 +38,7 @@ struct jffs2_xattr_datum
 	uint32_t value_len;	/* length of xvalue */
 };
 
-struct jffs2_inode_cache;	/* forward refence */
+struct jffs2_inode_cache;
 struct jffs2_xattr_ref
 {
 	void *always_null;
-- 
cgit v1.2.3


From 5a14959c0700cd389d9e7ba312e15c8e85255e1f Mon Sep 17 00:00:00 2001
From: KaiGai Kohei <kaigai@ak.jp.nec.com>
Date: Sat, 13 May 2006 15:19:36 +0900
Subject: [JFFS2][XATTR] remove '__KERNEL__' from acl.h

[9/10] jffs2-xattr-v5.1-09-remove__KERNEL__.patch

Signed-off-by: KaiGai Kohei <kaigai@ak.jp.nec.com>
---
 fs/jffs2/acl.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index 54761fa8f23..8893bd1a6ba 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -23,7 +23,6 @@ struct jffs2_acl_header {
 	jint32_t	a_version;
 };
 
-#ifdef __KERNEL__
 #ifdef CONFIG_JFFS2_FS_POSIX_ACL
 
 #define JFFS2_ACL_NOT_CACHED ((void *)-1)
@@ -44,4 +43,3 @@ extern struct xattr_handler jffs2_acl_default_xattr_handler;
 #define jffs2_clear_acl(inode)
 
 #endif	/* CONFIG_JFFS2_FS_POSIX_ACL */
-#endif	/* __KERNEL__ */
-- 
cgit v1.2.3


From dea80134dc4d54df52c0c59b0ba2bb5aa999bf30 Mon Sep 17 00:00:00 2001
From: KaiGai Kohei <kaigai@ak.jp.nec.com>
Date: Sat, 13 May 2006 15:20:24 +0900
Subject: [JFFS2][XATTR] remove redundant pointer cast in acl.c

remove redundant pointer cast in acl.c.

[10/10] jffs2-xattr-v5.1-10-remove_pointer_cast.patch

Signed-off-by: KaiGai Kohei <kaigai@ak.jp.nec.com>
---
 fs/jffs2/acl.c | 41 +++++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 1682278d974..320dd48b834 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -48,9 +48,11 @@ static int jffs2_acl_count(size_t size)
 	}
 }
 
-static struct posix_acl *jffs2_acl_from_medium(const void *value, size_t size)
+static struct posix_acl *jffs2_acl_from_medium(void *value, size_t size)
 {
-	const char *end = (char *)value + size;
+	void *end = value + size;
+	struct jffs2_acl_header *header = value;
+	struct jffs2_acl_entry *entry;
 	struct posix_acl *acl;
 	uint32_t ver;
 	int i, count;
@@ -59,13 +61,13 @@ static struct posix_acl *jffs2_acl_from_medium(const void *value, size_t size)
 		return NULL;
 	if (size < sizeof(struct jffs2_acl_header))
 		return ERR_PTR(-EINVAL);
-	ver = je32_to_cpu(((struct jffs2_acl_header *)value)->a_version);
+	ver = je32_to_cpu(header->a_version);
 	if (ver != JFFS2_ACL_VERSION) {
 		JFFS2_WARNING("Invalid ACL version. (=%u)\n", ver);
 		return ERR_PTR(-EINVAL);
 	}
 
-	value = (char *)value + sizeof(struct jffs2_acl_header);
+	value += sizeof(struct jffs2_acl_header);
 	count = jffs2_acl_count(size);
 	if (count < 0)
 		return ERR_PTR(-EINVAL);
@@ -77,8 +79,8 @@ static struct posix_acl *jffs2_acl_from_medium(const void *value, size_t size)
 		return ERR_PTR(-ENOMEM);
 
 	for (i=0; i < count; i++) {
-		struct jffs2_acl_entry *entry = (struct jffs2_acl_entry *)value;
-		if ((char *)value + sizeof(struct jffs2_acl_entry_short) > end)
+		entry = value;
+		if (value + sizeof(struct jffs2_acl_entry_short) > end)
 			goto fail;
 		acl->a_entries[i].e_tag = je16_to_cpu(entry->e_tag);
 		acl->a_entries[i].e_perm = je16_to_cpu(entry->e_perm);
@@ -87,14 +89,14 @@ static struct posix_acl *jffs2_acl_from_medium(const void *value, size_t size)
 			case ACL_GROUP_OBJ:
 			case ACL_MASK:
 			case ACL_OTHER:
-				value = (char *)value + sizeof(struct jffs2_acl_entry_short);
+				value += sizeof(struct jffs2_acl_entry_short);
 				acl->a_entries[i].e_id = ACL_UNDEFINED_ID;
 				break;
 
 			case ACL_USER:
 			case ACL_GROUP:
-				value = (char *)value + sizeof(struct jffs2_acl_entry);
-				if ((char *)value > end)
+				value += sizeof(struct jffs2_acl_entry);
+				if (value > end)
 					goto fail;
 				acl->a_entries[i].e_id = je32_to_cpu(entry->e_id);
 				break;
@@ -113,20 +115,19 @@ static struct posix_acl *jffs2_acl_from_medium(const void *value, size_t size)
 
 static void *jffs2_acl_to_medium(const struct posix_acl *acl, size_t *size)
 {
-	struct jffs2_acl_header *jffs2_acl;
-	char *e;
+	struct jffs2_acl_header *header;
+	struct jffs2_acl_entry *entry;
+	void *e;
 	size_t i;
 
 	*size = jffs2_acl_size(acl->a_count);
-	jffs2_acl = kmalloc(sizeof(struct jffs2_acl_header)
-			     + acl->a_count * sizeof(struct jffs2_acl_entry),
-			    GFP_KERNEL);
-	if (!jffs2_acl)
+	header = kmalloc(sizeof(*header) + acl->a_count * sizeof(*entry), GFP_KERNEL);
+	if (!header)
 		return ERR_PTR(-ENOMEM);
-	jffs2_acl->a_version = cpu_to_je32(JFFS2_ACL_VERSION);
-	e = (char *)jffs2_acl + sizeof(struct jffs2_acl_header);
+	header->a_version = cpu_to_je32(JFFS2_ACL_VERSION);
+	e = header + 1;
 	for (i=0; i < acl->a_count; i++) {
-		struct jffs2_acl_entry *entry = (struct jffs2_acl_entry *)e;
+		entry = e;
 		entry->e_tag = cpu_to_je16(acl->a_entries[i].e_tag);
 		entry->e_perm = cpu_to_je16(acl->a_entries[i].e_perm);
 		switch(acl->a_entries[i].e_tag) {
@@ -147,9 +148,9 @@ static void *jffs2_acl_to_medium(const struct posix_acl *acl, size_t *size)
 				goto fail;
 		}
 	}
-	return (char *)jffs2_acl;
+	return header;
  fail:
-	kfree(jffs2_acl);
+	kfree(header);
 	return ERR_PTR(-EINVAL);
 }
 
-- 
cgit v1.2.3


From c8708a9275928cc8e77bd443cd12565dda0a3ded Mon Sep 17 00:00:00 2001
From: KaiGai Kohei <kaigai@ak.jp.nec.com>
Date: Sat, 13 May 2006 15:21:38 +0900
Subject: [JFFS2][XATTR] Handling the duplicate JFFS2_NODETYPE_XATTR node
 cases.

When jffs2_sum_process_sum_data() found a JFFS2_NODETYPE_XATTR
which has duplicate xid and older version, an error was returned
without appropriate process.
In the result, mounting filesystem is failed.

This patch fix this problem. If jffs2_setup_xattr_datum() returned
-EEXIST, the caller marks this node as DIRTY_SPACE().

[1/2] jffs2-xattr-v5.2-01-fix-duplicate-xdatum.patch

Signed-off-by: KaiGai Kohei <kaigai@ak.jp.nec.com>
---
 fs/jffs2/summary.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 831a42c1305..9763d73c0da 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -508,8 +508,14 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 				xd = jffs2_setup_xattr_datum(c, je32_to_cpu(spx->xid),
 								je32_to_cpu(spx->version));
 				if (IS_ERR(xd)) {
-					JFFS2_NOTICE("allocation of xattr_datum failed\n");
 					jffs2_free_raw_node_ref(raw);
+					if (PTR_ERR(xd) == -EEXIST) {
+						/* a newer version of xd exists */
+						DIRTY_SPACE(je32_to_cpu(spx->totlen));
+						sp += JFFS2_SUMMARY_XATTR_SIZE;
+						break;
+					}
+					JFFS2_NOTICE("allocation of xattr_datum failed\n");
 					kfree(summary);
 					return PTR_ERR(xd);
 				}
-- 
cgit v1.2.3


From 21b9879bf2817aca343cdda11ade6a87f5373e74 Mon Sep 17 00:00:00 2001
From: KaiGai Kohei <kaigai@ak.jp.nec.com>
Date: Sat, 13 May 2006 15:22:29 +0900
Subject: [JFFS2][XATTR] Fix obvious typo

[2/2] jffs2-xattr-v5.2-02-fix_obvious_typo.patch

Signed-off-by: KaiGai Kohei <kaigai@ak.jp.nec.com>
---
 fs/jffs2/xattr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 330e4491e73..057bd4dcf66 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -438,7 +438,7 @@ static struct jffs2_xattr_datum *create_xattr_datum(struct jffs2_sb_info *c,
 	return xd;
 }
 
-/* -------- xdatum related functions ----------------
+/* -------- xref related functions ------------------
  * verify_xattr_ref(c, ref)
  *   is used to load xref information from medium. Because summary data does not
  *   contain xid/ino, it's necessary to verify once while mounting process.
-- 
cgit v1.2.3


From cf5eba53346fbfdf1b80e05ca3fd7fe2ec841077 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sun, 14 May 2006 04:06:24 +0100
Subject: [JFFS2] Reduce excessive node count for syslog files.

We currently get fairly poor behaviour with files which get many short
writes, such as system logs. This is because we end up with many tiny
data nodes, and the rbtree gets massive. None of these nodes are
actually obsolete, so they are counted as 'clean' space. Eraseblocks can
be entirely full of these nodes (which are REF_NORMAL instead of
REF_PRISTINE), and still they count entirely towards 'used_size' and the
eraseblocks can sit on the clean_list for a long time without being
picked for GC.

One way to alleviate this in the long term is to account REF_NORMAL
space separately from REF_PRISTINE space, rather than counting them both
towards used_size. Then these eraseblocks can be picked for GC and the
offending nodes will be garbage collected.

The short-term fix, though -- which probably makes sense even if we do
eventually implement the above -- is to merge these nodes as they're
written. When we write the last byte in a page, write the _whole_ page.
This obsoletes the earlier nodes in the page _immediately_ and we don't
even need to wait for the garbage collection to do it.

Original implementation from Ferenc Havasi <havasi@inf.u-szeged.hu>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/file.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 9f4171213e5..3349db0a786 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -215,12 +215,20 @@ static int jffs2_commit_write (struct file *filp, struct page *pg,
 	D1(printk(KERN_DEBUG "jffs2_commit_write(): ino #%lu, page at 0x%lx, range %d-%d, flags %lx\n",
 		  inode->i_ino, pg->index << PAGE_CACHE_SHIFT, start, end, pg->flags));
 
-	if (!start && end == PAGE_CACHE_SIZE) {
-		/* We need to avoid deadlock with page_cache_read() in
-		   jffs2_garbage_collect_pass(). So we have to mark the
-		   page up to date, to prevent page_cache_read() from
-		   trying to re-lock it. */
-		SetPageUptodate(pg);
+	if (end == PAGE_CACHE_SIZE) {
+		if (!start) {
+			/* We need to avoid deadlock with page_cache_read() in
+			   jffs2_garbage_collect_pass(). So we have to mark the
+			   page up to date, to prevent page_cache_read() from
+			   trying to re-lock it. */
+			SetPageUptodate(pg);
+		} else {
+			/* When writing out the end of a page, write out the 
+			   _whole_ page. This helps to reduce the number of
+			   nodes in files which have many short writes, like
+			   syslog files. */
+			start = aligned_start = 0;
+		}
 	}
 
 	ri = jffs2_alloc_raw_inode();
-- 
cgit v1.2.3


From 3e68fbb59b3d4e6b47b65e9928b5929e02179759 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Mon, 15 May 2006 00:49:43 +0100
Subject: [JFFS2] Don't pack on-medium structures, because GCC emits crappy
 code

If we use __attribute__((packed)), GCC will _also_ assume that the
structures aren't sensibly aligned, and it'll emit code to cope with
that instead of straight word load/save. This can be _very_ suboptimal
on architectures like ARM.

Ideally, we want an attribute which just tells GCC not to do any
padding, without the alignment side-effects. In the absense of that,
we'll just drop the 'packed' attribute and hope that everything stays as
it was (which to be fair is fairly much what we expect). And add some
paranoia checks in the initialisation code, which should be optimised
away completely in the normal case.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/super.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index ffd8e84b22c..5f73de58692 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -320,6 +320,18 @@ static int __init init_jffs2_fs(void)
 {
 	int ret;
 
+	/* Paranoia checks for on-medium structures. If we ask GCC
+	   to pack them with __attribute__((packed)) then it _also_
+	   assumes that they're not aligned -- so it emits crappy
+	   code on some architectures. Ideally we want an attribute
+	   which means just 'no padding', without the alignment
+	   thing. But GCC doesn't have that -- we have to just
+	   hope the structs are the right sizes, instead. */
+	BUG_ON(sizeof(struct jffs2_unknown_node) != 12);
+	BUG_ON(sizeof(struct jffs2_raw_dirent) != 40);
+	BUG_ON(sizeof(struct jffs2_raw_inode) != 68);
+	BUG_ON(sizeof(struct jffs2_raw_summary) != 32);
+
 	printk(KERN_INFO "JFFS2 version 2.2."
 #ifdef CONFIG_JFFS2_FS_WRITEBUFFER
 	       " (NAND)"
@@ -327,7 +339,7 @@ static int __init init_jffs2_fs(void)
 #ifdef CONFIG_JFFS2_SUMMARY
 	       " (SUMMARY) "
 #endif
-	       " (C) 2001-2003 Red Hat, Inc.\n");
+	       " (C) 2001-2006 Red Hat, Inc.\n");
 
 	jffs2_inode_cachep = kmem_cache_create("jffs2_i",
 					     sizeof(struct jffs2_inode_info),
-- 
cgit v1.2.3


From 184f565210c6c8a852c53ffc070f9add61e0f331 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 15 May 2006 13:45:58 +0100
Subject: [JFFS2] Fix printk format in some error messages.

fs/jffs2/nodelist.c: In function `check_node_data':
fs/jffs2/nodelist.c:441: warning: unsigned int format, different type arg (arg 4)
fs/jffs2/nodelist.c:464: warning: int format, different type arg (arg 5)

Modified from Andrew's original fix because while his terminal may indeed
only have eighty columns, mine only has _TWENTYFOUR_ lines. So the
cosmetic fluff is perfectly OK out past column 80 where it was -- the
casual reader doesn't _care_ about anything more than the fact that it
goes 'if (foo) JFFS2_WARNING...', and there's no point wasting a whole
line to display the tail end of the printk which nobody actually cares
about.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/nodelist.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index d4d0c41490c..7d71593d7fd 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -438,7 +438,7 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info
 	if (c->mtd->point) {
 		err = c->mtd->point(c->mtd, ofs, len, &retlen, &buffer);
 		if (!err && retlen < tn->csize) {
-			JFFS2_WARNING("MTD point returned len too short: %u instead of %u.\n", retlen, tn->csize);
+			JFFS2_WARNING("MTD point returned len too short: %zu instead of %u.\n", retlen, tn->csize);
 			c->mtd->unpoint(c->mtd, buffer, ofs, len);
 		} else if (err)
 			JFFS2_WARNING("MTD point failed: error code %d.\n", err);
@@ -461,7 +461,7 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info
 		}
 
 		if (retlen != len) {
-			JFFS2_ERROR("short read at %#08x: %d instead of %d.\n", ofs, retlen, len);
+			JFFS2_ERROR("short read at %#08x: %zd instead of %d.\n", ofs, retlen, len);
 			err = -EIO;
 			goto free_out;
 		}
-- 
cgit v1.2.3


From 5b5ffbc1e6d62d89747f3f59c09b2e488a7d7fce Mon Sep 17 00:00:00 2001
From: Florin Malita <fmalita@gmail.com>
Date: Mon, 15 May 2006 23:42:31 +0100
Subject: [PATCH] jffs2: memory leak in jffs2_scan_medium()

If jffs2_scan_eraseblock() fails and the exit path is taken, 's' is not
being deallocated.

Reported by Coverity, CID: 1258.

Signed-off-by: Florin Malita <fmalita@gmail.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/scan.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index cf55b221fc2..eca0996def6 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -222,9 +222,6 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
 		}
 	}
 
-	if (jffs2_sum_active() && s)
-		kfree(s);
-
 	/* Nextblock dirty is always seen as wasted, because we cannot recycle it now */
 	if (c->nextblock && (c->nextblock->dirty_size)) {
 		c->nextblock->wasted_size += c->nextblock->dirty_size;
@@ -266,6 +263,9 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
 	else
 		c->mtd->unpoint(c->mtd, flashbuf, 0, c->mtd->size);
 #endif
+	if (s)
+		kfree(s);
+
 	return ret;
 }
 
-- 
cgit v1.2.3


From 7d2beb135986477f53da77303356bd04329f8d0e Mon Sep 17 00:00:00 2001
From: David Brownell <dbrownell@users.sourceforge.net>
Date: Tue, 16 May 2006 16:08:10 +0100
Subject: [JFFS2] Fix section mismatch warnings in JFFS2.

Mark certain functions with __init and __exit appropriately.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/compr.c      | 4 ++--
 fs/jffs2/compr_zlib.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index e7944e665b9..5f45e01d71e 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -412,7 +412,7 @@ void jffs2_free_comprbuf(unsigned char *comprbuf, unsigned char *orig)
                 kfree(comprbuf);
 }
 
-int jffs2_compressors_init(void)
+int __init jffs2_compressors_init(void)
 {
 /* Registering compressors */
 #ifdef CONFIG_JFFS2_ZLIB
@@ -440,7 +440,7 @@ int jffs2_compressors_init(void)
         return 0;
 }
 
-int jffs2_compressors_exit(void)
+int __exit jffs2_compressors_exit(void)
 {
 /* Unregistering compressors */
 #ifdef CONFIG_JFFS2_RUBIN
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index 5c63e0cdcf4..d43cbac4fb9 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -60,7 +60,7 @@ static int __init alloc_workspaces(void)
 	return 0;
 }
 
-static void free_workspaces(void)
+static void __exit free_workspaces(void)
 {
 	vfree(def_strm.workspace);
 	vfree(inf_strm.workspace);
@@ -216,7 +216,7 @@ int __init jffs2_zlib_init(void)
     return ret;
 }
 
-void jffs2_zlib_exit(void)
+void __exit jffs2_zlib_exit(void)
 {
     jffs2_unregister_compressor(&jffs2_zlib_comp);
     free_workspaces();
-- 
cgit v1.2.3


From c41ff6e5f38b02ff927d0d510e28dc1392bb4690 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 16 May 2006 17:05:33 +0100
Subject: [JFFS2] Fix printk format in jffs2_sum_write_data() error message.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fs/jffs2/summary.c: In function ‘jffs2_sum_write_data’:
fs/jffs2/summary.c:658: warning: format ‘%zd’ expects type ‘signed size_t’, but argument 4 has type ‘uint32_t’

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/summary.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 7b0ed77a4c3..48293c197f1 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -655,7 +655,7 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
 
 
 	if (ret || (retlen != infosize)) {
-		JFFS2_WARNING("Write of %d bytes at 0x%08x failed. returned %d, retlen %zu\n",
+		JFFS2_WARNING("Write of %u bytes at 0x%08x failed. returned %d, retlen %zd\n",
 			infosize, jeb->offset + c->sector_size - jeb->free_size, ret, retlen);
 
 		c->summary->sum_size = JFFS2_SUMMARY_NOSUM_SIZE;
-- 
cgit v1.2.3


From aef9ab47841af45888d950baa6448072cc70bdd5 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Fri, 19 May 2006 00:28:49 +0100
Subject: [JFFS2] Support new device nodes

Device node major/minor numbers are just stored in the payload of a single
data node. Just extend that to 4 bytes and use new_encode_dev() for it.

We only use the 4-byte format if we _need_ to, if !old_valid_dev(foo).
This preserves backwards compatibility with older code as much as
possible. If we do make devices with major or minor numbers above 255, and
then mount the file system with the old code, it'll just read the first
two bytes and get the numbers wrong. If it comes to garbage-collect it,
it'll then write back those wrong numbers. But that's about the best we
can expect.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/dir.c      | 12 +++++-------
 fs/jffs2/fs.c       | 25 ++++++++++++++++++-------
 fs/jffs2/gc.c       |  7 ++-----
 fs/jffs2/nodelist.h | 11 +++++++++++
 fs/jffs2/os-linux.h |  4 +---
 5 files changed, 37 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 1c8e8c0f6ce..a6c11cef1b7 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -591,12 +591,12 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
 	struct jffs2_full_dnode *fn;
 	struct jffs2_full_dirent *fd;
 	int namelen;
-	jint16_t dev;
+	union jffs2_device_node dev;
 	int devlen = 0;
 	uint32_t alloclen, phys_ofs;
 	int ret;
 
-	if (!old_valid_dev(rdev))
+	if (!new_valid_dev(rdev))
 		return -EINVAL;
 
 	ri = jffs2_alloc_raw_inode();
@@ -605,17 +605,15 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
 
 	c = JFFS2_SB_INFO(dir_i->i_sb);
 
-	if (S_ISBLK(mode) || S_ISCHR(mode)) {
-		dev = cpu_to_je16(old_encode_dev(rdev));
-		devlen = sizeof(dev);
-	}
+	if (S_ISBLK(mode) || S_ISCHR(mode))
+		devlen = jffs2_encode_dev(&dev, rdev);
 
 	/* Try to reserve enough space for both node and dirent.
 	 * Just the node will do for now, though
 	 */
 	namelen = dentry->d_name.len;
 	ret = jffs2_reserve_space(c, sizeof(*ri) + devlen, &phys_ofs, &alloclen,
-				ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
+				  ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
 
 	if (ret) {
 		jffs2_free_raw_inode(ri);
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index ea1f37d4fc5..24cb4c688ef 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -33,7 +33,7 @@ static int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
 	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
 	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
 	struct jffs2_raw_inode *ri;
-	unsigned short dev;
+	union jffs2_device_node dev;
 	unsigned char *mdata = NULL;
 	int mdatalen = 0;
 	unsigned int ivalid;
@@ -51,9 +51,8 @@ static int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
 	   it out again with the appropriate data attached */
 	if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) {
 		/* For these, we don't actually need to read the old node */
-		dev = old_encode_dev(inode->i_rdev);
+		mdatalen = jffs2_encode_dev(&dev, inode->i_rdev);
 		mdata = (char *)&dev;
-		mdatalen = sizeof(dev);
 		D1(printk(KERN_DEBUG "jffs2_setattr(): Writing %d bytes of kdev_t\n", mdatalen));
 	} else if (S_ISLNK(inode->i_mode)) {
 		down(&f->sem);
@@ -232,6 +231,8 @@ void jffs2_read_inode (struct inode *inode)
 	struct jffs2_inode_info *f;
 	struct jffs2_sb_info *c;
 	struct jffs2_raw_inode latest_node;
+	union jffs2_device_node jdev;
+	dev_t rdev = 0;
 	int ret;
 
 	D1(printk(KERN_DEBUG "jffs2_read_inode(): inode->i_ino == %lu\n", inode->i_ino));
@@ -263,7 +264,6 @@ void jffs2_read_inode (struct inode *inode)
 	inode->i_blocks = (inode->i_size + 511) >> 9;
 
 	switch (inode->i_mode & S_IFMT) {
-		jint16_t rdev;
 
 	case S_IFLNK:
 		inode->i_op = &jffs2_symlink_inode_operations;
@@ -297,8 +297,16 @@ void jffs2_read_inode (struct inode *inode)
 	case S_IFBLK:
 	case S_IFCHR:
 		/* Read the device numbers from the media */
+		if (f->metadata->size != sizeof(jdev.old) &&
+		    f->metadata->size != sizeof(jdev.new)) {
+			printk(KERN_NOTICE "Device node has strange size %d\n", f->metadata->size);
+			up(&f->sem);
+			jffs2_do_clear_inode(c, f);
+			make_bad_inode(inode);
+			return;
+		}
 		D1(printk(KERN_DEBUG "Reading device numbers from flash\n"));
-		if (jffs2_read_dnode(c, f, f->metadata, (char *)&rdev, 0, sizeof(rdev)) < 0) {
+		if (jffs2_read_dnode(c, f, f->metadata, (char *)&jdev, 0, f->metadata->size) < 0) {
 			/* Eep */
 			printk(KERN_NOTICE "Read device numbers for inode %lu failed\n", (unsigned long)inode->i_ino);
 			up(&f->sem);
@@ -306,12 +314,15 @@ void jffs2_read_inode (struct inode *inode)
 			make_bad_inode(inode);
 			return;
 		}
+		if (f->metadata->size == sizeof(jdev.old))
+			rdev = old_decode_dev(je16_to_cpu(jdev.old));
+		else
+			rdev = new_decode_dev(je32_to_cpu(jdev.new));
 
 	case S_IFSOCK:
 	case S_IFIFO:
 		inode->i_op = &jffs2_file_inode_operations;
-		init_special_inode(inode, inode->i_mode,
-				   old_decode_dev((je16_to_cpu(rdev))));
+		init_special_inode(inode, inode->i_mode, rdev);
 		break;
 
 	default:
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 967fb2cf8e2..77d30707de5 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -679,7 +679,7 @@ static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_
 	struct jffs2_full_dnode *new_fn;
 	struct jffs2_raw_inode ri;
 	struct jffs2_node_frag *last_frag;
-	jint16_t dev;
+	union jffs2_device_node dev;
 	char *mdata = NULL, mdatalen = 0;
 	uint32_t alloclen, phys_ofs, ilen;
 	int ret;
@@ -687,11 +687,8 @@ static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_
 	if (S_ISBLK(JFFS2_F_I_MODE(f)) ||
 	    S_ISCHR(JFFS2_F_I_MODE(f)) ) {
 		/* For these, we don't actually need to read the old node */
-		/* FIXME: for minor or major > 255. */
-		dev = cpu_to_je16(((JFFS2_F_I_RDEV_MAJ(f) << 8) |
-			JFFS2_F_I_RDEV_MIN(f)));
+		mdatalen = jffs2_encode_dev(&dev, JFFS2_F_I_RDEV(f));
 		mdata = (char *)&dev;
-		mdatalen = sizeof(dev);
 		D1(printk(KERN_DEBUG "jffs2_garbage_collect_metadata(): Writing %d bytes of kdev_t\n", mdatalen));
 	} else if (S_ISLNK(JFFS2_F_I_MODE(f))) {
 		mdatalen = fn->size;
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index f6645afe88e..24e0f28a8ba 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -268,6 +268,17 @@ static inline uint32_t ref_totlen(struct jffs2_sb_info *c,
 
 #define PAD(x) (((x)+3)&~3)
 
+static inline int jffs2_encode_dev(union jffs2_device_node *jdev, dev_t rdev)
+{
+	if (old_valid_dev(rdev)) {
+		jdev->old = cpu_to_je16(old_encode_dev(rdev));
+		return sizeof(jdev->old);
+	} else {
+		jdev->new = cpu_to_je32(new_encode_dev(rdev));
+		return sizeof(jdev->new);
+	}
+}
+
 static inline struct jffs2_inode_cache *jffs2_raw_ref_to_ic(struct jffs2_raw_node_ref *raw)
 {
 	while(raw->next_in_ino) {
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index d307cf54862..a10eb03ac95 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -31,9 +31,7 @@ struct kvec;
 #define JFFS2_F_I_MODE(f) (OFNI_EDONI_2SFFJ(f)->i_mode)
 #define JFFS2_F_I_UID(f) (OFNI_EDONI_2SFFJ(f)->i_uid)
 #define JFFS2_F_I_GID(f) (OFNI_EDONI_2SFFJ(f)->i_gid)
-
-#define JFFS2_F_I_RDEV_MIN(f) (iminor(OFNI_EDONI_2SFFJ(f)))
-#define JFFS2_F_I_RDEV_MAJ(f) (imajor(OFNI_EDONI_2SFFJ(f)))
+#define JFFS2_F_I_RDEV(f) (OFNI_EDONI_2SFFJ(f)->i_rdev)
 
 #define ITIME(sec) ((struct timespec){sec, 0})
 #define I_SEC(tv) ((tv).tv_sec)
-- 
cgit v1.2.3


From 8e4482fba21d15da99f39a13396d3361e810d199 Mon Sep 17 00:00:00 2001
From: Ferenc Havasi <havasi@inf.u-szeged.hu>
Date: Fri, 19 May 2006 21:00:36 +0100
Subject: [JFFS2] Remove forgotten summary code

Remove forgotten lines from jffs2_scan_eraseblock() which
were unnecessary and may cause problem in some environments.

Thanks to Alexander Belyakov <alexander.belyakov@intel.com>.

Signed-off-by: Ferenc Havasi <havasi@inf.u-szeged.hu>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/scan.c | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index eca0996def6..352ada892f3 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -365,23 +365,12 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
 		}
 
 		kfree(sm);
-
-		ofs = jeb->offset;
-		prevofs = jeb->offset - 1;
 	}
 
 	buf_ofs = jeb->offset;
 
 	if (!buf_size) {
 		buf_len = c->sector_size;
-
-		if (jffs2_sum_active()) {
-			/* must reread because of summary test */
-			err = jffs2_fill_scan_buf(c, buf, buf_ofs, buf_len);
-			if (err)
-				return err;
-		}
-
 	} else {
 		buf_len = EMPTY_SCAN_SIZE(c->sector_size);
 		err = jffs2_fill_scan_buf(c, buf, buf_ofs, buf_len);
-- 
cgit v1.2.3


From 9641b784ff82cf0a48a6c70ef9867f5fd728de67 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sat, 20 May 2006 16:13:34 +0100
Subject: [JFFS2] Optimise reading of eraseblock summary nodes

This improves the time to mount 512MiB of NAND flash on my OLPC prototype
by about 4%. We used to read the last page of the eraseblock twice -- once
to find the offset of the summary node, and again to actually _read_ the
summary node. Now we read the last page only once, and read more only if
we need to.

We also don't allocate a new buffer just for the summary code -- we use
the buffer which was already allocated for the scan. Better still, if the
'buffer' for the scan is actually just a pointer directly into NOR flash,
we use that too, avoiding the memcpy() which we used to do.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/scan.c    | 74 +++++++++++++++++++++++++++++++++++++++++-------------
 fs/jffs2/summary.c | 36 +++++---------------------
 fs/jffs2/summary.h |  3 ++-
 3 files changed, 64 insertions(+), 49 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 352ada892f3..2a24b44662b 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -306,11 +306,12 @@ int jffs2_scan_classify_jeb(struct jffs2_sb_info *c, struct jffs2_eraseblock *je
 		return BLK_STATE_ALLDIRTY;
 }
 
+/* Called with 'buf_size == 0' if buf is in fact a pointer _directly_ into
+   the flash, XIP-style */
 static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
-				unsigned char *buf, uint32_t buf_size, struct jffs2_summary *s) {
+				  unsigned char *buf, uint32_t buf_size, struct jffs2_summary *s) {
 	struct jffs2_unknown_node *node;
 	struct jffs2_unknown_node crcnode;
-	struct jffs2_sum_marker *sm;
 	uint32_t ofs, prevofs;
 	uint32_t hdr_crc, buf_ofs, buf_len;
 	int err;
@@ -344,32 +345,69 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
 #endif
 
 	if (jffs2_sum_active()) {
-		sm = kmalloc(sizeof(struct jffs2_sum_marker), GFP_KERNEL);
-		if (!sm) {
-			return -ENOMEM;
-		}
+		struct jffs2_sum_marker *sm;
+		void *sumptr = NULL;
+		uint32_t sumlen;
+	      
+		if (!buf_size) {
+			/* XIP case. Just look, point at the summary if it's there */
+			sm = (void *)buf + jeb->offset - sizeof(*sm);
+			if (je32_to_cpu(sm->magic) == JFFS2_SUM_MAGIC) {
+				sumptr = buf + je32_to_cpu(sm->offset);
+				sumlen = c->sector_size - je32_to_cpu(sm->offset);
+			}
+		} else {
+			/* If NAND flash, read a whole page of it. Else just the end */
+			if (c->wbuf_pagesize)
+				buf_len = c->wbuf_pagesize;
+			else
+				buf_len = sizeof(*sm);
+
+			/* Read as much as we want into the _end_ of the preallocated buffer */
+			err = jffs2_fill_scan_buf(c, buf + buf_size - buf_len, 
+						  jeb->offset + c->sector_size - buf_len,
+						  buf_len);				
+			if (err)
+				return err;
+
+			sm = (void *)buf + buf_size - sizeof(*sm);
+			if (je32_to_cpu(sm->magic) == JFFS2_SUM_MAGIC) {
+				sumlen = c->sector_size - je32_to_cpu(sm->offset);
+				sumptr = buf + buf_size - sumlen;
+
+				/* Now, make sure the summary itself is available */
+				if (sumlen > buf_size) {
+					/* Need to kmalloc for this. */
+					sumptr = kmalloc(sumlen, GFP_KERNEL);
+					if (!sumptr)
+						return -ENOMEM;
+					memcpy(sumptr + sumlen - buf_len, buf + buf_size - buf_len, buf_len);
+				}
+				if (buf_len < sumlen) {
+					/* Need to read more so that the entire summary node is present */
+					err = jffs2_fill_scan_buf(c, sumptr, 
+								  jeb->offset + c->sector_size - sumlen,
+								  sumlen - buf_len);				
+					if (err)
+						return err;
+				}
+			}
 
-		err = jffs2_fill_scan_buf(c, (unsigned char *) sm, jeb->offset + c->sector_size -
-					sizeof(struct jffs2_sum_marker), sizeof(struct jffs2_sum_marker));
-		if (err) {
-			kfree(sm);
-			return err;
 		}
 
-		if (je32_to_cpu(sm->magic) == JFFS2_SUM_MAGIC ) {
-			err = jffs2_sum_scan_sumnode(c, jeb, je32_to_cpu(sm->offset), &pseudo_random);
-			if (err) {
-				kfree(sm);
+		if (sumptr) {
+			err = jffs2_sum_scan_sumnode(c, jeb, sumptr, sumlen, &pseudo_random);
+			if (err)
 				return err;
-			}
+			if (buf_size && sumlen > buf_size)
+				kfree(sumptr);
 		}
-
-		kfree(sm);
 	}
 
 	buf_ofs = jeb->offset;
 
 	if (!buf_size) {
+		/* This is the XIP case -- we're reading _directly_ from the flash chip */
 		buf_len = c->sector_size;
 	} else {
 		buf_len = EMPTY_SCAN_SIZE(c->sector_size);
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 48293c197f1..82a3706c54d 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -318,7 +318,6 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 				raw = jffs2_alloc_raw_node_ref();
 				if (!raw) {
 					JFFS2_NOTICE("allocation of node reference failed\n");
-					kfree(summary);
 					return -ENOMEM;
 				}
 
@@ -326,7 +325,6 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 				if (!ic) {
 					JFFS2_NOTICE("scan_make_ino_cache failed\n");
 					jffs2_free_raw_node_ref(raw);
-					kfree(summary);
 					return -ENOMEM;
 				}
 
@@ -358,10 +356,8 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 							jeb->offset + je32_to_cpu(spd->offset));
 
 				fd = jffs2_alloc_full_dirent(spd->nsize+1);
-				if (!fd) {
-					kfree(summary);
+				if (!fd)
 					return -ENOMEM;
-				}
 
 				memcpy(&fd->name, spd->name, spd->nsize);
 				fd->name[spd->nsize] = 0;
@@ -370,7 +366,6 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 				if (!raw) {
 					jffs2_free_full_dirent(fd);
 					JFFS2_NOTICE("allocation of node reference failed\n");
-					kfree(summary);
 					return -ENOMEM;
 				}
 
@@ -378,7 +373,6 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 				if (!ic) {
 					jffs2_free_full_dirent(fd);
 					jffs2_free_raw_node_ref(raw);
-					kfree(summary);
 					return -ENOMEM;
 				}
 
@@ -411,45 +405,28 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 
 			default : {
 				JFFS2_WARNING("Unsupported node type found in summary! Exiting...");
-				kfree(summary);
 				return -EIO;
 			}
 		}
 	}
 
-	kfree(summary);
 	return 0;
 }
 
 /* Process the summary node - called from jffs2_scan_eraseblock() */
-
 int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
-				uint32_t ofs, uint32_t *pseudo_random)
+			   struct jffs2_raw_summary *summary, uint32_t sumsize,
+			   uint32_t *pseudo_random)
 {
 	struct jffs2_unknown_node crcnode;
 	struct jffs2_raw_node_ref *cache_ref;
-	struct jffs2_raw_summary *summary;
-	int ret, sumsize;
+	int ret, ofs;
 	uint32_t crc;
 
-	sumsize = c->sector_size - ofs;
-	ofs += jeb->offset;
+	ofs = jeb->offset + c->sector_size - sumsize;
 
 	dbg_summary("summary found for 0x%08x at 0x%08x (0x%x bytes)\n",
-				jeb->offset, ofs, sumsize);
-
-	summary = kmalloc(sumsize, GFP_KERNEL);
-
-	if (!summary) {
-		return -ENOMEM;
-	}
-
-	ret = jffs2_fill_scan_buf(c, (unsigned char *)summary, ofs, sumsize);
-
-	if (ret) {
-		kfree(summary);
-		return ret;
-	}
+		    jeb->offset, ofs, sumsize);
 
 	/* OK, now check for node validity and CRC */
 	crcnode.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
@@ -499,7 +476,6 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 
 			if (!marker_ref) {
 				JFFS2_NOTICE("Failed to allocate node ref for clean marker\n");
-				kfree(summary);
 				return -ENOMEM;
 			}
 
diff --git a/fs/jffs2/summary.h b/fs/jffs2/summary.h
index b7a678be170..afff4bd551a 100644
--- a/fs/jffs2/summary.h
+++ b/fs/jffs2/summary.h
@@ -160,7 +160,8 @@ int jffs2_sum_add_padding_mem(struct jffs2_summary *s, uint32_t size);
 int jffs2_sum_add_inode_mem(struct jffs2_summary *s, struct jffs2_raw_inode *ri, uint32_t ofs);
 int jffs2_sum_add_dirent_mem(struct jffs2_summary *s, struct jffs2_raw_dirent *rd, uint32_t ofs);
 int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
-			uint32_t ofs, uint32_t *pseudo_random);
+			   struct jffs2_raw_summary *summary, uint32_t sumlen,
+			   uint32_t *pseudo_random);
 
 #else				/* SUMMARY DISABLED */
 
-- 
cgit v1.2.3


From 1417fc44ee923418df3adadeb4846c891bba1ba5 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sat, 20 May 2006 16:20:19 +0100
Subject: [JFFS2] Reduce calls to ref_totlen() in jffs2_mark_node_obsolete()

We were calling ref_totlen() 18 times. Even before that becomes a real
function rather than just a dereference, apparently some compilers still
suck anyway. It'll _certainly_ suck after ref_totlen() becomes more
complicated, so calculate it once and don't rely on CSE.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/nodemgmt.c | 39 +++++++++++++++++++++------------------
 1 file changed, 21 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 49127a1f045..0e1f58aa606 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -470,6 +470,7 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 	struct jffs2_unknown_node n;
 	int ret, addedsize;
 	size_t retlen;
+	uint32_t freed_len;
 
 	if(!ref) {
 		printk(KERN_NOTICE "EEEEEK. jffs2_mark_node_obsolete called with NULL node\n");
@@ -499,32 +500,34 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 
 	spin_lock(&c->erase_completion_lock);
 
+	freed_len = ref_totlen(c, jeb, ref);
+
 	if (ref_flags(ref) == REF_UNCHECKED) {
-		D1(if (unlikely(jeb->unchecked_size < ref_totlen(c, jeb, ref))) {
+		D1(if (unlikely(jeb->unchecked_size < freed_len)) {
 			printk(KERN_NOTICE "raw unchecked node of size 0x%08x freed from erase block %d at 0x%08x, but unchecked_size was already 0x%08x\n",
-			       ref_totlen(c, jeb, ref), blocknr, ref->flash_offset, jeb->used_size);
+			       freed_len, blocknr, ref->flash_offset, jeb->used_size);
 			BUG();
 		})
-		D1(printk(KERN_DEBUG "Obsoleting previously unchecked node at 0x%08x of len %x: ", ref_offset(ref), ref_totlen(c, jeb, ref)));
-		jeb->unchecked_size -= ref_totlen(c, jeb, ref);
-		c->unchecked_size -= ref_totlen(c, jeb, ref);
+		D1(printk(KERN_DEBUG "Obsoleting previously unchecked node at 0x%08x of len %x: ", ref_offset(ref), freed_len));
+		jeb->unchecked_size -= freed_len;
+		c->unchecked_size -= freed_len;
 	} else {
-		D1(if (unlikely(jeb->used_size < ref_totlen(c, jeb, ref))) {
+		D1(if (unlikely(jeb->used_size < freed_len)) {
 			printk(KERN_NOTICE "raw node of size 0x%08x freed from erase block %d at 0x%08x, but used_size was already 0x%08x\n",
-			       ref_totlen(c, jeb, ref), blocknr, ref->flash_offset, jeb->used_size);
+			       freed_len, blocknr, ref->flash_offset, jeb->used_size);
 			BUG();
 		})
-		D1(printk(KERN_DEBUG "Obsoleting node at 0x%08x of len %#x: ", ref_offset(ref), ref_totlen(c, jeb, ref)));
-		jeb->used_size -= ref_totlen(c, jeb, ref);
-		c->used_size -= ref_totlen(c, jeb, ref);
+		D1(printk(KERN_DEBUG "Obsoleting node at 0x%08x of len %#x: ", ref_offset(ref), freed_len));
+		jeb->used_size -= freed_len;
+		c->used_size -= freed_len;
 	}
 
 	// Take care, that wasted size is taken into concern
-	if ((jeb->dirty_size || ISDIRTY(jeb->wasted_size + ref_totlen(c, jeb, ref))) && jeb != c->nextblock) {
+	if ((jeb->dirty_size || ISDIRTY(jeb->wasted_size + freed_len)) && jeb != c->nextblock) {
 		D1(printk(KERN_DEBUG "Dirtying\n"));
-		addedsize = ref_totlen(c, jeb, ref);
-		jeb->dirty_size += ref_totlen(c, jeb, ref);
-		c->dirty_size += ref_totlen(c, jeb, ref);
+		addedsize = freed_len;
+		jeb->dirty_size += freed_len;
+		c->dirty_size += freed_len;
 
 		/* Convert wasted space to dirty, if not a bad block */
 		if (jeb->wasted_size) {
@@ -545,8 +548,8 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 	} else {
 		D1(printk(KERN_DEBUG "Wasting\n"));
 		addedsize = 0;
-		jeb->wasted_size += ref_totlen(c, jeb, ref);
-		c->wasted_size += ref_totlen(c, jeb, ref);
+		jeb->wasted_size += freed_len;
+		c->wasted_size += freed_len;
 	}
 	ref->flash_offset = ref_offset(ref) | REF_OBSOLETE;
 
@@ -634,8 +637,8 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 		printk(KERN_WARNING "Short read from obsoleted node at 0x%08x: %zd\n", ref_offset(ref), retlen);
 		goto out_erase_sem;
 	}
-	if (PAD(je32_to_cpu(n.totlen)) != PAD(ref_totlen(c, jeb, ref))) {
-		printk(KERN_WARNING "Node totlen on flash (0x%08x) != totlen from node ref (0x%08x)\n", je32_to_cpu(n.totlen), ref_totlen(c, jeb, ref));
+	if (PAD(je32_to_cpu(n.totlen)) != PAD(freed_len)) {
+		printk(KERN_WARNING "Node totlen on flash (0x%08x) != totlen from node ref (0x%08x)\n", je32_to_cpu(n.totlen), freed_len);
 		goto out_erase_sem;
 	}
 	if (!(je16_to_cpu(n.nodetype) & JFFS2_NODE_ACCURATE)) {
-- 
cgit v1.2.3


From f1f9671bd8f7d2ac6a918bad806ab5bdc0daaf4e Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sat, 20 May 2006 19:45:26 +0100
Subject: [JFFS2] Introduce jffs2_link_node_ref() function to reduce code
 duplication

The same sequence of code was repeated in many places, to add a new
struct jffs2_raw_node_ref to an eraseblock and adjust the space accounting
accordingly. Move it out-of-line.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/erase.c    | 18 ++++----------
 fs/jffs2/nodelist.c | 34 +++++++++++++++++++++++++
 fs/jffs2/nodelist.h |  2 ++
 fs/jffs2/nodemgmt.c | 16 +-----------
 fs/jffs2/scan.c     | 49 +++++++++---------------------------
 fs/jffs2/summary.c  | 71 +++++++++++------------------------------------------
 6 files changed, 68 insertions(+), 122 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index dad68fdffe9..fecf5584f83 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -373,12 +373,8 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb
 				goto filebad;
 		}
 
-		jeb->first_node = jeb->last_node = NULL;
+		/* Everything else got zeroed before the erase */
 		jeb->free_size = c->sector_size;
-		jeb->used_size = 0;
-		jeb->dirty_size = 0;
-		jeb->wasted_size = 0;
-
 	} else {
 
 		struct kvec vecs[1];
@@ -412,17 +408,13 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb
 			goto filebad;
 		}
 
+		/* Everything else got zeroed before the erase */
+		jeb->free_size = c->sector_size;
+
 		marker_ref->next_in_ino = NULL;
-		marker_ref->next_phys = NULL;
 		marker_ref->flash_offset = jeb->offset | REF_NORMAL;
-		marker_ref->__totlen = c->cleanmarker_size;
-
-		jeb->first_node = jeb->last_node = marker_ref;
 
-		jeb->free_size = c->sector_size - c->cleanmarker_size;
-		jeb->used_size = c->cleanmarker_size;
-		jeb->dirty_size = 0;
-		jeb->wasted_size = 0;
+		jffs2_link_node_ref(c, jeb, marker_ref, c->cleanmarker_size);
 	}
 
 	spin_lock(&c->erase_completion_lock);
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 4973cd648ba..1fc8aedb56f 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -1046,3 +1046,37 @@ void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c)
 		cond_resched();
 	}
 }
+
+void jffs2_link_node_ref(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+			 struct jffs2_raw_node_ref *ref, uint32_t len)
+{
+	if (!jeb->first_node)
+		jeb->first_node = ref;
+	if (jeb->last_node)
+		jeb->last_node->next_phys = ref;
+	jeb->last_node = ref;
+
+	switch(ref_flags(ref)) {
+	case REF_UNCHECKED:
+		c->unchecked_size += len;
+		jeb->unchecked_size += len;
+		break;
+
+	case REF_NORMAL:
+	case REF_PRISTINE:
+		c->used_size += len;
+		jeb->used_size += len;
+		break;
+
+	case REF_OBSOLETE:
+		c->dirty_size += len;
+		jeb->used_size += len;
+		break;
+	}
+	c->free_size -= len;
+	jeb->free_size -= len;
+
+	/* Set __totlen field... for now */
+	ref->__totlen = len;
+	ref->next_phys = NULL;
+}
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 8dda98ff556..bac4ec35bbd 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -348,6 +348,8 @@ void jffs2_obsolete_node_frag(struct jffs2_sb_info *c, struct jffs2_node_frag *t
 int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn);
 void jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uint32_t size);
 int jffs2_add_older_frag_to_fragtree(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_tmp_dnode_info *tn);
+void jffs2_link_node_ref(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+			 struct jffs2_raw_node_ref *ref, uint32_t len);
 
 /* nodemgmt.c */
 int jffs2_thread_should_wake(struct jffs2_sb_info *c);
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 0e1f58aa606..d6eab1b7ad5 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -403,21 +403,7 @@ int jffs2_add_physical_node_ref(struct jffs2_sb_info *c, struct jffs2_raw_node_r
 #endif
 	spin_lock(&c->erase_completion_lock);
 
-	if (!jeb->first_node)
-		jeb->first_node = new;
-	if (jeb->last_node)
-		jeb->last_node->next_phys = new;
-	jeb->last_node = new;
-
-	jeb->free_size -= len;
-	c->free_size -= len;
-	if (ref_obsolete(new)) {
-		jeb->dirty_size += len;
-		c->dirty_size += len;
-	} else {
-		jeb->used_size += len;
-		c->used_size += len;
-	}
+	jffs2_link_node_ref(c, jeb, new, len);
 
 	if (!jeb->free_size && !jeb->dirty_size && !ISDIRTY(jeb->wasted_size)) {
 		/* If it lives on the dirty_list, jffs2_reserve_space will put it there */
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 40d62d057aa..3cbe9f029e0 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -351,17 +351,11 @@ static int jffs2_scan_xattr_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
 	xd->data_crc = je32_to_cpu(rx->data_crc);
 	xd->node = raw;
 
-	raw->__totlen = totlen;
 	raw->flash_offset = ofs | REF_PRISTINE;
-	raw->next_phys = NULL;
 	raw->next_in_ino = (void *)xd;
-	if (!jeb->first_node)
-		jeb->first_node = raw;
-	if (jeb->last_node)
-		jeb->last_node->next_phys = raw;
-	jeb->last_node = raw;
 
-	USED_SPACE(PAD(je32_to_cpu(rx->totlen)));
+	jffs2_link_node_ref(c, jeb, raw, totlen);
+
 	if (jffs2_sum_active())
 		jffs2_sum_add_xattr_mem(s, rx, ofs - jeb->offset);
 	dbg_xattr("scaning xdatum at %#08x (xid=%u, version=%u)\n",
@@ -418,17 +412,11 @@ static int jffs2_scan_xref_node(struct jffs2_sb_info *c, struct jffs2_eraseblock
 	ref->next = c->xref_temp;
 	c->xref_temp = ref;
 
-	raw->__totlen = PAD(je32_to_cpu(rr->totlen));
 	raw->flash_offset = ofs | REF_PRISTINE;
-	raw->next_phys = NULL;
 	raw->next_in_ino = (void *)ref;
-	if (!jeb->first_node)
-		jeb->first_node = raw;
-	if (jeb->last_node)
-		jeb->last_node->next_phys = raw;
-	jeb->last_node = raw;
 
-	USED_SPACE(PAD(je32_to_cpu(rr->totlen)));	
+	jffs2_link_node_ref(c, jeb, raw, PAD(je32_to_cpu(rr->totlen)));
+
 	if (jffs2_sum_active())
 		jffs2_sum_add_xref_mem(s, rr, ofs - jeb->offset);
 	dbg_xattr("scan xref at %#08x (xid=%u, ino=%u)\n",
@@ -827,12 +815,10 @@ scan_more:
 					return -ENOMEM;
 				}
 				marker_ref->next_in_ino = NULL;
-				marker_ref->next_phys = NULL;
 				marker_ref->flash_offset = ofs | REF_NORMAL;
-				marker_ref->__totlen = c->cleanmarker_size;
-				jeb->first_node = jeb->last_node = marker_ref;
 
-				USED_SPACE(PAD(c->cleanmarker_size));
+				jffs2_link_node_ref(c, jeb, marker_ref, c->cleanmarker_size);
+
 				ofs += PAD(c->cleanmarker_size);
 			}
 			break;
@@ -971,16 +957,11 @@ static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
 	/* Wheee. It worked */
 
 	raw->flash_offset = ofs | REF_UNCHECKED;
-	raw->__totlen = PAD(je32_to_cpu(ri->totlen));
-	raw->next_phys = NULL;
-	raw->next_in_ino = ic->nodes;
 
+	raw->next_in_ino = ic->nodes;
 	ic->nodes = raw;
-	if (!jeb->first_node)
-		jeb->first_node = raw;
-	if (jeb->last_node)
-		jeb->last_node->next_phys = raw;
-	jeb->last_node = raw;
+	
+	jffs2_link_node_ref(c, jeb, raw, PAD(je32_to_cpu(ri->totlen)));
 
 	D1(printk(KERN_DEBUG "Node is ino #%u, version %d. Range 0x%x-0x%x\n",
 		  je32_to_cpu(ri->ino), je32_to_cpu(ri->version),
@@ -989,8 +970,6 @@ static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
 
 	pseudo_random += je32_to_cpu(ri->version);
 
-	UNCHECKED_SPACE(PAD(je32_to_cpu(ri->totlen)));
-
 	if (jffs2_sum_active()) {
 		jffs2_sum_add_inode_mem(s, ri, ofs - jeb->offset);
 	}
@@ -1053,16 +1032,11 @@ static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblo
 		return -ENOMEM;
 	}
 
-	raw->__totlen = PAD(je32_to_cpu(rd->totlen));
 	raw->flash_offset = ofs | REF_PRISTINE;
-	raw->next_phys = NULL;
 	raw->next_in_ino = ic->nodes;
 	ic->nodes = raw;
-	if (!jeb->first_node)
-		jeb->first_node = raw;
-	if (jeb->last_node)
-		jeb->last_node->next_phys = raw;
-	jeb->last_node = raw;
+
+	jffs2_link_node_ref(c, jeb, raw, PAD(je32_to_cpu(rd->totlen)));
 
 	fd->raw = raw;
 	fd->next = NULL;
@@ -1070,7 +1044,6 @@ static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblo
 	fd->ino = je32_to_cpu(rd->ino);
 	fd->nhash = full_name_hash(fd->name, rd->nsize);
 	fd->type = rd->type;
-	USED_SPACE(PAD(je32_to_cpu(rd->totlen)));
 	jffs2_add_fd_to_list(c, fd, &ic->scan_dents);
 
 	if (jffs2_sum_active()) {
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 5dbe87b67ab..91218976232 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -410,19 +410,13 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 				}
 
 				raw->flash_offset = (jeb->offset + je32_to_cpu(spi->offset)) | REF_UNCHECKED;
-				raw->__totlen = PAD(je32_to_cpu(spi->totlen));
-				raw->next_phys = NULL;
-				raw->next_in_ino = ic->nodes;
 
+				raw->next_in_ino = ic->nodes;
 				ic->nodes = raw;
-				if (!jeb->first_node)
-					jeb->first_node = raw;
-				if (jeb->last_node)
-					jeb->last_node->next_phys = raw;
-				jeb->last_node = raw;
-				*pseudo_random += je32_to_cpu(spi->version);
 
-				UNCHECKED_SPACE(PAD(je32_to_cpu(spi->totlen)));
+				jffs2_link_node_ref(c, jeb, raw, PAD(je32_to_cpu(spi->totlen)));
+
+				*pseudo_random += je32_to_cpu(spi->version);
 
 				sp += JFFS2_SUMMARY_INODE_SIZE;
 
@@ -457,16 +451,11 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 					return -ENOMEM;
 				}
 
-				raw->__totlen = PAD(je32_to_cpu(spd->totlen));
 				raw->flash_offset = (jeb->offset + je32_to_cpu(spd->offset)) | REF_PRISTINE;
-				raw->next_phys = NULL;
 				raw->next_in_ino = ic->nodes;
 				ic->nodes = raw;
-				if (!jeb->first_node)
-					jeb->first_node = raw;
-				if (jeb->last_node)
-					jeb->last_node->next_phys = raw;
-				jeb->last_node = raw;
+
+				jffs2_link_node_ref(c, jeb, raw, PAD(je32_to_cpu(spd->totlen)));
 
 				fd->raw = raw;
 				fd->next = NULL;
@@ -474,7 +463,7 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 				fd->ino = je32_to_cpu(spd->ino);
 				fd->nhash = full_name_hash(fd->name, spd->nsize);
 				fd->type = spd->type;
-				USED_SPACE(PAD(je32_to_cpu(spd->totlen)));
+
 				jffs2_add_fd_to_list(c, fd, &ic->scan_dents);
 
 				*pseudo_random += je32_to_cpu(spd->version);
@@ -516,17 +505,11 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 				xd->node = raw;
 
 				raw->flash_offset = ofs | REF_UNCHECKED;
-				raw->__totlen = PAD(je32_to_cpu(spx->totlen));
-				raw->next_phys = NULL;
 				raw->next_in_ino = (void *)xd;
-				if (!jeb->first_node)
-					jeb->first_node = raw;
-				if (jeb->last_node)
-					jeb->last_node->next_phys = raw;
-				jeb->last_node = raw;
+
+				jffs2_link_node_ref(c, jeb, raw, PAD(je32_to_cpu(spx->totlen)));
 
 				*pseudo_random += je32_to_cpu(spx->xid);
-				UNCHECKED_SPACE(je32_to_cpu(spx->totlen));
 				sp += JFFS2_SUMMARY_XATTR_SIZE;
 
 				break;
@@ -559,17 +542,11 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 				ref->next = c->xref_temp;
 				c->xref_temp = ref;
 
-				raw->__totlen = PAD(sizeof(struct jffs2_raw_xref));
 				raw->flash_offset = ofs | REF_UNCHECKED;
-				raw->next_phys = NULL;
 				raw->next_in_ino = (void *)ref;
-				if (!jeb->first_node)
-					jeb->first_node = raw;
-				if (jeb->last_node)
-					jeb->last_node->next_phys = raw;
-				jeb->last_node = raw;
 
-				UNCHECKED_SPACE(PAD(sizeof(struct jffs2_raw_xref)));
+				jffs2_link_node_ref(c, jeb, raw, PAD(sizeof(struct jffs2_raw_xref)));
+
 				*pseudo_random += ofs;
 				sp += JFFS2_SUMMARY_XREF_SIZE;
 
@@ -653,13 +630,10 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 				return -ENOMEM;
 			}
 
-			marker_ref->next_in_ino = NULL;
-			marker_ref->next_phys = NULL;
 			marker_ref->flash_offset = jeb->offset | REF_NORMAL;
-			marker_ref->__totlen = je32_to_cpu(summary->cln_mkr);
-			jeb->first_node = jeb->last_node = marker_ref;
+			marker_ref->next_in_ino = NULL;
 
-			USED_SPACE( PAD(je32_to_cpu(summary->cln_mkr)) );
+			jffs2_link_node_ref(c, jeb, marker_ref, je32_to_cpu(summary->cln_mkr));
 		}
 	}
 
@@ -682,15 +656,8 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 	cache_ref->next_in_ino = NULL;
 	cache_ref->next_phys = NULL;
 	cache_ref->flash_offset = ofs | REF_NORMAL;
-	cache_ref->__totlen = sumsize;
-
-	if (!jeb->first_node)
-		jeb->first_node = cache_ref;
-	if (jeb->last_node)
-		jeb->last_node->next_phys = cache_ref;
-	jeb->last_node = cache_ref;
 
-	USED_SPACE(sumsize);
+	jffs2_link_node_ref(c, jeb, cache_ref, sumsize);
 
 	jeb->wasted_size += jeb->free_size;
 	c->wasted_size += jeb->free_size;
@@ -888,17 +855,9 @@ int jffs2_sum_write_sumnode(struct jffs2_sb_info *c)
 	}
 
 	summary_ref->next_in_ino = NULL;
-	summary_ref->next_phys = NULL;
 	summary_ref->flash_offset = (jeb->offset + c->sector_size - jeb->free_size) | REF_NORMAL;
-	summary_ref->__totlen = infosize;
-
-	if (!jeb->first_node)
-		jeb->first_node = summary_ref;
-	if (jeb->last_node)
-		jeb->last_node->next_phys = summary_ref;
-	jeb->last_node = summary_ref;
 
-	USED_SPACE(infosize);
+	jffs2_link_node_ref(c, jeb, summary_ref, infosize);
 
 	return 0;
 }
-- 
cgit v1.2.3


From fb9fbbcc9389edabb172ac1b6419c01e32046787 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sat, 20 May 2006 20:08:42 +0100
Subject: [JFFS2] Correct accounting of erroneous cleanmarkers and failed
 summaries.

It should all be counted as dirty space, not wasted and _definitely_ not
unchecked.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/summary.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 91218976232..95b5bf8f4a9 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -617,11 +617,11 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 		if (je32_to_cpu(summary->cln_mkr) != c->cleanmarker_size) {
 			dbg_summary("CLEANMARKER node has totlen 0x%x != normal 0x%x\n",
 				je32_to_cpu(summary->cln_mkr), c->cleanmarker_size);
-			UNCHECKED_SPACE(PAD(je32_to_cpu(summary->cln_mkr)));
+			DIRTY_SPACE(PAD(je32_to_cpu(summary->cln_mkr)));
 		} else if (jeb->first_node) {
 			dbg_summary("CLEANMARKER node not first node in block "
 					"(0x%08x)\n", jeb->offset);
-			UNCHECKED_SPACE(PAD(je32_to_cpu(summary->cln_mkr)));
+			DIRTY_SPACE(PAD(je32_to_cpu(summary->cln_mkr)));
 		} else {
 			struct jffs2_raw_node_ref *marker_ref = jffs2_alloc_raw_node_ref();
 
@@ -800,7 +800,7 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
 			infosize, jeb->offset + c->sector_size - jeb->free_size, ret, retlen);
 
 		c->summary->sum_size = JFFS2_SUMMARY_NOSUM_SIZE;
-		WASTED_SPACE(infosize);
+		DIRTY_SPACE(infosize);
 
 		return 1;
 	}
-- 
cgit v1.2.3


From 6171586a7ae5198988774e8480631e8d15f65dfe Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sun, 21 May 2006 00:02:06 +0100
Subject: [JFFS2] Correct handling of JFFS2_FEATURE_RWCOMPAT_COPY nodes.

We should preserve these when we come to garbage collect them, not let
them get erased. Use jffs2_garbage_collect_pristine() for this, and make
sure the summary code copes -- just refrain from writing a summary for any
block which contains a node we don't understand.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/gc.c       | 49 +++++++++++++++++++++++++++++--------------------
 fs/jffs2/nodelist.h |  5 +++++
 fs/jffs2/scan.c     | 15 +++++++++++++--
 fs/jffs2/summary.c  |  9 ++++++++-
 4 files changed, 55 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 23587f8a221..b0a5c407b47 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -256,10 +256,14 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
 
 	if (!raw->next_in_ino) {
 		/* Inode-less node. Clean marker, snapshot or something like that */
-		/* FIXME: If it's something that needs to be copied, including something
-		   we don't grok that has JFFS2_NODETYPE_RWCOMPAT_COPY, we should do so */
 		spin_unlock(&c->erase_completion_lock);
-		jffs2_mark_node_obsolete(c, raw);
+		if (ref_flags(raw) == REF_PRISTINE) {
+			/* It's an unknown node with JFFS2_FEATURE_RWCOMPAT_COPY */
+			jffs2_garbage_collect_pristine(c, NULL, raw);
+		} else {
+			/* Just mark it obsolete */
+			jffs2_mark_node_obsolete(c, raw);
+		}
 		up(&c->alloc_sem);
 		goto eraseit_lock;
 	}
@@ -533,15 +537,16 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
 
 	D1(printk(KERN_DEBUG "Going to GC REF_PRISTINE node at 0x%08x\n", ref_offset(raw)));
 
-	rawlen = ref_totlen(c, c->gcblock, raw);
+	alloclen = rawlen = ref_totlen(c, c->gcblock, raw);
 
 	/* Ask for a small amount of space (or the totlen if smaller) because we
 	   don't want to force wastage of the end of a block if splitting would
 	   work. */
-	ret = jffs2_reserve_space_gc(c, min_t(uint32_t, sizeof(struct jffs2_raw_inode) +
-				JFFS2_MIN_DATA_LEN, rawlen), &phys_ofs, &alloclen, rawlen);
-				/* this is not the exact summary size of it,
-					it is only an upper estimation */
+	if (ic && alloclen > sizeof(struct jffs2_raw_inode) + JFFS2_MIN_DATA_LEN)
+		alloclen = sizeof(struct jffs2_raw_inode) + JFFS2_MIN_DATA_LEN;
+
+	ret = jffs2_reserve_space_gc(c, alloclen, &phys_ofs, &alloclen, rawlen);
+	/* 'rawlen' is not the exact summary size; it is only an upper estimation */
 
 	if (ret)
 		return ret;
@@ -605,9 +610,12 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
 		}
 		break;
 	default:
-		printk(KERN_WARNING "Unknown node type for REF_PRISTINE node at 0x%08x: 0x%04x\n",
-		       ref_offset(raw), je16_to_cpu(node->u.nodetype));
-		goto bail;
+		/* If it's inode-less, we don't _know_ what it is. Just copy it intact */
+		if (ic) {
+			printk(KERN_WARNING "Unknown node type for REF_PRISTINE node at 0x%08x: 0x%04x\n",
+			       ref_offset(raw), je16_to_cpu(node->u.nodetype));
+			goto bail;
+		}
 	}
 
 	nraw = jffs2_alloc_raw_node_ref();
@@ -674,15 +682,16 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
 	nraw->flash_offset |= REF_PRISTINE;
 	jffs2_add_physical_node_ref(c, nraw);
 
-	/* Link into per-inode list. This is safe because of the ic
-	   state being INO_STATE_GC. Note that if we're doing this
-	   for an inode which is in-core, the 'nraw' pointer is then
-	   going to be fetched from ic->nodes by our caller. */
-	spin_lock(&c->erase_completion_lock);
-        nraw->next_in_ino = ic->nodes;
-        ic->nodes = nraw;
-	spin_unlock(&c->erase_completion_lock);
-
+	if (ic) {
+		/* Link into per-inode list. This is safe because of the ic
+		   state being INO_STATE_GC. Note that if we're doing this
+		   for an inode which is in-core, the 'nraw' pointer is then
+		   going to be fetched from ic->nodes by our caller. */
+		spin_lock(&c->erase_completion_lock);
+		nraw->next_in_ino = ic->nodes;
+		ic->nodes = nraw;
+		spin_unlock(&c->erase_completion_lock);
+	}
 	jffs2_mark_node_obsolete(c, raw);
 	D1(printk(KERN_DEBUG "WHEEE! GC REF_PRISTINE node at 0x%08x succeeded\n", ref_offset(raw)));
 
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index bac4ec35bbd..1f5d5b0100a 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -97,6 +97,11 @@ struct jffs2_raw_node_ref
 #define ref_obsolete(ref)	(((ref)->flash_offset & 3) == REF_OBSOLETE)
 #define mark_ref_normal(ref)    do { (ref)->flash_offset = ref_offset(ref) | REF_NORMAL; } while(0)
 
+/* NB: REF_PRISTINE for an inode-less node (ref->next_in_ino == NULL) indicates
+   it is an unknown node of type JFFS2_NODETYPE_RWCOMPAT_COPY, so it'll get
+   copied. If you need to do anything different to GC inode-less nodes, then
+   you need to modify gc.c accordingly. */
+
 /* For each inode in the filesystem, we need to keep a record of
    nlink, because it would be a PITA to scan the whole directory tree
    at read_inode() time to calculate it, and to keep sufficient information
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 3cbe9f029e0..06637050749 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -851,11 +851,22 @@ scan_more:
 				ofs += PAD(je32_to_cpu(node->totlen));
 				break;
 
-			case JFFS2_FEATURE_RWCOMPAT_COPY:
+			case JFFS2_FEATURE_RWCOMPAT_COPY: {
+				struct jffs2_raw_node_ref *ref;
 				D1(printk(KERN_NOTICE "Unknown but compatible feature node (0x%04x) found at offset 0x%08x\n", je16_to_cpu(node->nodetype), ofs));
-				USED_SPACE(PAD(je32_to_cpu(node->totlen)));
+
+				ref = jffs2_alloc_raw_node_ref();
+				if (!ref)
+					return -ENOMEM;
+				ref->flash_offset = ofs | REF_PRISTINE;
+				ref->next_in_ino = 0;
+				jffs2_link_node_ref(c, jeb, ref, PAD(je32_to_cpu(node->totlen)));
+
+				/* We can't summarise nodes we don't grok */
+				jffs2_sum_disable_collecting(s);
 				ofs += PAD(je32_to_cpu(node->totlen));
 				break;
+				}
 			}
 		}
 	}
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 95b5bf8f4a9..53a84b468cf 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -760,7 +760,14 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
 			}
 #endif
 			default : {
-				BUG();	/* unknown node in summary information */
+				if ((je16_to_cpu(temp->u.nodetype) & JFFS2_COMPAT_MASK)
+				    == JFFS2_FEATURE_RWCOMPAT_COPY) {
+					dbg_summary("Writing unknown RWCOMPAT_COPY node type %x\n",
+						    je16_to_cpu(temp->u.nodetype));
+					jffs2_sum_disable_collecting(c->summary);
+				} else {
+					BUG();	/* unknown node in summary information */
+				}
 			}
 		}
 
-- 
cgit v1.2.3


From 3560160aa26ebced1944aaa2e7e436d2a1b1bf70 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sun, 21 May 2006 01:28:05 +0100
Subject: [JFFS2] Fix memory leak in scan code; improve comments.

If we had to allocate extra space for the summary node, we weren't
correctly freeing it when jffs2_sum_scan_sumnode() returned nonzero --
which is both the success and the failure case. Only when it returned
zero, which means fall through to the full scan, were we correctly freeing
the buffer.

Document the meaning of those return codes while we're at it.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/scan.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 06637050749..192b0bd2118 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -516,10 +516,15 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
 
 		if (sumptr) {
 			err = jffs2_sum_scan_sumnode(c, jeb, sumptr, sumlen, &pseudo_random);
-			if (err)
-				return err;
+
 			if (buf_size && sumlen > buf_size)
 				kfree(sumptr);
+			/* If it returns with a real error, bail. 
+			   If it returns positive, that's a block classification
+			   (i.e. BLK_STATE_xxx) so return that too.
+			   If it returns zero, fall through to full scan. */
+			if (err)
+				return err;
 		}
 	}
 
-- 
cgit v1.2.3


From 7807ef7ba2a41c05f6197381f572dd38baa6c1ce Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sun, 21 May 2006 03:45:27 +0100
Subject: [JFFS2] Fix summary handling of unknown but compatible nodes.

For RWCOMPAT and ROCOMPAT nodes, we should still allow the mount to
succeed. Just abandon the summary and fall through to the full scan.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/erase.c    |  3 +--
 fs/jffs2/nodelist.h |  1 +
 fs/jffs2/summary.c  | 24 ++++++++++++++++++++----
 3 files changed, 22 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index fecf5584f83..f677d6950fd 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -30,7 +30,6 @@ static void jffs2_erase_callback(struct erase_info *);
 #endif
 static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t bad_offset);
 static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
-static void jffs2_free_all_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
 static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
 
 static void jffs2_erase_block(struct jffs2_sb_info *c,
@@ -283,7 +282,7 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c,
 		jffs2_del_ino_cache(c, ic);
 }
 
-static void jffs2_free_all_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
+void jffs2_free_all_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
 {
 	struct jffs2_raw_node_ref *ref;
 	D1(printk(KERN_DEBUG "Freeing all node refs for eraseblock offset 0x%08x\n", jeb->offset));
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 1f5d5b0100a..194cff7c485 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -436,6 +436,7 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c);
 
 /* erase.c */
 void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count);
+void jffs2_free_all_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
 
 #ifdef CONFIG_JFFS2_FS_WRITEBUFFER
 /* wbuf.c */
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 53a84b468cf..9ced3aa9581 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -554,9 +554,21 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 			}
 #endif
 			default : {
-printk("nodetype = %#04x\n",je16_to_cpu(((struct jffs2_sum_unknown_flash *)sp)->nodetype));
-				JFFS2_WARNING("Unsupported node type found in summary! Exiting...");
-				return -EIO;
+				uint16_t nodetype = je16_to_cpu(((struct jffs2_sum_unknown_flash *)sp)->nodetype);
+				JFFS2_WARNING("Unsupported node type %x found in summary! Exiting...\n", nodetype);
+				if ((nodetype & JFFS2_COMPAT_MASK) == JFFS2_FEATURE_INCOMPAT)
+					return -EIO;
+
+				/* For compatible node types, just fall back to the full scan */
+				c->wasted_size -= jeb->wasted_size;
+				c->free_size += c->sector_size - jeb->free_size;
+				c->used_size -= jeb->used_size;
+				c->dirty_size -= jeb->dirty_size;
+				jeb->wasted_size = jeb->used_size = jeb->dirty_size = 0;
+				jeb->free_size = c->sector_size;
+
+				jffs2_free_all_node_refs(c, jeb);
+				return -ENOTRECOVERABLE;
 			}
 		}
 	}
@@ -642,8 +654,12 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 	}
 
 	ret = jffs2_sum_process_sum_data(c, jeb, summary, pseudo_random);
+	/* -ENOTRECOVERABLE isn't a fatal error -- it means we should do a full
+	   scan of this eraseblock. So return zero */
+	if (ret == -ENOTRECOVERABLE)
+		return 0;
 	if (ret)
-		return ret;
+		return ret;		/* real error */
 
 	/* for PARANOIA_CHECK */
 	cache_ref = jffs2_alloc_raw_node_ref();
-- 
cgit v1.2.3


From 68270995f29f1a82b3eaab01df63ea7e721e2fa6 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sun, 21 May 2006 03:46:05 +0100
Subject: [JFFS2] Introduce jffs2_scan_dirty_space() function.

To eliminate the __totlen field from struct jffs2_raw_node_ref, we need
to allocate nodes for dirty space instead of just tweaking the accounting
data. Introduce jffs2_scan_dirty_space() in preparation for that.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/nodelist.c | 11 ++++++++
 fs/jffs2/nodelist.h |  1 +
 fs/jffs2/scan.c     | 76 ++++++++++++++++++++++++++++++++++++-----------------
 fs/jffs2/summary.c  | 16 +++++++----
 fs/jffs2/summary.h  | 17 ------------
 5 files changed, 75 insertions(+), 46 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 1fc8aedb56f..00506857eab 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -1080,3 +1080,14 @@ void jffs2_link_node_ref(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
 	ref->__totlen = len;
 	ref->next_phys = NULL;
 }
+
+int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+			   uint32_t size)
+{
+	c->dirty_size += size;
+	c->free_size -= size;
+	jeb->dirty_size += size;
+	jeb->free_size -= size;
+
+	return 0;
+}
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 194cff7c485..ca15b3c731c 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -430,6 +430,7 @@ int jffs2_fill_scan_buf(struct jffs2_sb_info *c, void *buf,
 				uint32_t ofs, uint32_t len);
 struct jffs2_inode_cache *jffs2_scan_make_ino_cache(struct jffs2_sb_info *c, uint32_t ino);
 int jffs2_scan_classify_jeb(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
+int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t size);
 
 /* build.c */
 int jffs2_do_mount_fs(struct jffs2_sb_info *c);
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 192b0bd2118..b3fc9fd5b03 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -314,13 +314,15 @@ static int jffs2_scan_xattr_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
 	struct jffs2_xattr_datum *xd;
 	struct jffs2_raw_node_ref *raw;
 	uint32_t totlen, crc;
+	int err;
 
 	crc = crc32(0, rx, sizeof(struct jffs2_raw_xattr) - 4);
 	if (crc != je32_to_cpu(rx->node_crc)) {
 		if (je32_to_cpu(rx->node_crc) != 0xffffffff)
 			JFFS2_WARNING("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
 				      ofs, je32_to_cpu(rx->node_crc), crc);
-		DIRTY_SPACE(je32_to_cpu(rx->totlen));
+		if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(rx->totlen))))
+			return err;
 		return 0;
 	}
 
@@ -328,7 +330,8 @@ static int jffs2_scan_xattr_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
 	if (totlen != je32_to_cpu(rx->totlen)) {
 		JFFS2_WARNING("node length mismatch at %#08x, read=%u, calc=%u\n",
 			      ofs, je32_to_cpu(rx->totlen), totlen);
-		DIRTY_SPACE(je32_to_cpu(rx->totlen));
+		if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(rx->totlen))))
+			return err;
 		return 0;
 	}
 
@@ -340,7 +343,8 @@ static int jffs2_scan_xattr_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
 	if (IS_ERR(xd)) {
 		jffs2_free_raw_node_ref(raw);
 		if (PTR_ERR(xd) == -EEXIST) {
-			DIRTY_SPACE(PAD(je32_to_cpu(rx->totlen)));
+			if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rx->totlen)))))
+				return err;
 			return 0;
 		}
 		return PTR_ERR(xd);
@@ -370,13 +374,15 @@ static int jffs2_scan_xref_node(struct jffs2_sb_info *c, struct jffs2_eraseblock
 	struct jffs2_xattr_ref *ref;
 	struct jffs2_raw_node_ref *raw;
 	uint32_t crc;
+	int err;
 
 	crc = crc32(0, rr, sizeof(*rr) - 4);
 	if (crc != je32_to_cpu(rr->node_crc)) {
 		if (je32_to_cpu(rr->node_crc) != 0xffffffff)
 			JFFS2_WARNING("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
 				      ofs, je32_to_cpu(rr->node_crc), crc);
-		DIRTY_SPACE(PAD(je32_to_cpu(rr->totlen)));
+		if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rr->totlen)))))
+			return err;
 		return 0;
 	}
 
@@ -384,7 +390,8 @@ static int jffs2_scan_xref_node(struct jffs2_sb_info *c, struct jffs2_eraseblock
 		JFFS2_WARNING("node length mismatch at %#08x, read=%u, calc=%u\n",
 			      ofs, je32_to_cpu(rr->totlen),
 			      PAD(sizeof(struct jffs2_raw_xref)));
-		DIRTY_SPACE(je32_to_cpu(rr->totlen));
+		if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(rr->totlen))))
+			return err;
 		return 0;
 	}
 
@@ -569,7 +576,8 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
 	if (ofs) {
 		D1(printk(KERN_DEBUG "Free space at %08x ends at %08x\n", jeb->offset,
 			  jeb->offset + ofs));
-		DIRTY_SPACE(ofs);
+		if ((err = jffs2_scan_dirty_space(c, jeb, ofs)))
+			return err;
 	}
 
 	/* Now ofs is a complete physical flash offset as it always was... */
@@ -593,7 +601,8 @@ scan_more:
 		}
 		if (ofs == prevofs) {
 			printk(KERN_WARNING "ofs 0x%08x has already been seen. Skipping\n", ofs);
-			DIRTY_SPACE(4);
+			if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
+				return err;
 			ofs += 4;
 			continue;
 		}
@@ -602,7 +611,8 @@ scan_more:
 		if (jeb->offset + c->sector_size < ofs + sizeof(*node)) {
 			D1(printk(KERN_DEBUG "Fewer than %zd bytes left to end of block. (%x+%x<%x+%zx) Not reading\n", sizeof(struct jffs2_unknown_node),
 				  jeb->offset, c->sector_size, ofs, sizeof(*node)));
-			DIRTY_SPACE((jeb->offset + c->sector_size)-ofs);
+			if ((err = jffs2_scan_dirty_space(c, jeb, (jeb->offset + c->sector_size)-ofs)))
+				return err;
 			break;
 		}
 
@@ -632,7 +642,8 @@ scan_more:
 				if (*(uint32_t *)(&buf[inbuf_ofs]) != 0xffffffff) {
 					printk(KERN_WARNING "Empty flash at 0x%08x ends at 0x%08x\n",
 					       empty_start, ofs);
-					DIRTY_SPACE(ofs-empty_start);
+					if ((err = jffs2_scan_dirty_space(c, jeb, ofs-empty_start)))
+						return err;
 					goto scan_more;
 				}
 
@@ -669,20 +680,23 @@ scan_more:
 
 		if (ofs == jeb->offset && je16_to_cpu(node->magic) == KSAMTIB_CIGAM_2SFFJ) {
 			printk(KERN_WARNING "Magic bitmask is backwards at offset 0x%08x. Wrong endian filesystem?\n", ofs);
-			DIRTY_SPACE(4);
+			if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
+				return err;
 			ofs += 4;
 			continue;
 		}
 		if (je16_to_cpu(node->magic) == JFFS2_DIRTY_BITMASK) {
 			D1(printk(KERN_DEBUG "Dirty bitmask at 0x%08x\n", ofs));
-			DIRTY_SPACE(4);
+			if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
+				return err;
 			ofs += 4;
 			continue;
 		}
 		if (je16_to_cpu(node->magic) == JFFS2_OLD_MAGIC_BITMASK) {
 			printk(KERN_WARNING "Old JFFS2 bitmask found at 0x%08x\n", ofs);
 			printk(KERN_WARNING "You cannot use older JFFS2 filesystems with newer kernels\n");
-			DIRTY_SPACE(4);
+			if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
+				return err;
 			ofs += 4;
 			continue;
 		}
@@ -691,7 +705,8 @@ scan_more:
 			noisy_printk(&noise, "jffs2_scan_eraseblock(): Magic bitmask 0x%04x not found at 0x%08x: 0x%04x instead\n",
 				     JFFS2_MAGIC_BITMASK, ofs,
 				     je16_to_cpu(node->magic));
-			DIRTY_SPACE(4);
+			if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
+				return err;
 			ofs += 4;
 			continue;
 		}
@@ -708,7 +723,8 @@ scan_more:
 				     je32_to_cpu(node->totlen),
 				     je32_to_cpu(node->hdr_crc),
 				     hdr_crc);
-			DIRTY_SPACE(4);
+			if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
+				return err;
 			ofs += 4;
 			continue;
 		}
@@ -719,7 +735,8 @@ scan_more:
 			printk(KERN_WARNING "Node at 0x%08x with length 0x%08x would run over the end of the erase block\n",
 			       ofs, je32_to_cpu(node->totlen));
 			printk(KERN_WARNING "Perhaps the file system was created with the wrong erase size?\n");
-			DIRTY_SPACE(4);
+			if ((err = jffs2_scan_dirty_space(c, jeb, 4)))
+				return err;
 			ofs += 4;
 			continue;
 		}
@@ -727,7 +744,8 @@ scan_more:
 		if (!(je16_to_cpu(node->nodetype) & JFFS2_NODE_ACCURATE)) {
 			/* Wheee. This is an obsoleted node */
 			D2(printk(KERN_DEBUG "Node at 0x%08x is obsolete. Skipping\n", ofs));
-			DIRTY_SPACE(PAD(je32_to_cpu(node->totlen)));
+			if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen)))))
+				return err;
 			ofs += PAD(je32_to_cpu(node->totlen));
 			continue;
 		}
@@ -807,11 +825,13 @@ scan_more:
 			if (je32_to_cpu(node->totlen) != c->cleanmarker_size) {
 				printk(KERN_NOTICE "CLEANMARKER node found at 0x%08x has totlen 0x%x != normal 0x%x\n",
 				       ofs, je32_to_cpu(node->totlen), c->cleanmarker_size);
-				DIRTY_SPACE(PAD(sizeof(struct jffs2_unknown_node)));
+				if ((err = jffs2_scan_dirty_space(c, jeb, PAD(sizeof(struct jffs2_unknown_node)))))
+					return err;
 				ofs += PAD(sizeof(struct jffs2_unknown_node));
 			} else if (jeb->first_node) {
 				printk(KERN_NOTICE "CLEANMARKER node found at 0x%08x, not first node in block (0x%08x)\n", ofs, jeb->offset);
-				DIRTY_SPACE(PAD(sizeof(struct jffs2_unknown_node)));
+				if ((err = jffs2_scan_dirty_space(c, jeb, PAD(sizeof(struct jffs2_unknown_node)))))
+					return err;
 				ofs += PAD(sizeof(struct jffs2_unknown_node));
 			} else {
 				struct jffs2_raw_node_ref *marker_ref = jffs2_alloc_raw_node_ref();
@@ -831,7 +851,8 @@ scan_more:
 		case JFFS2_NODETYPE_PADDING:
 			if (jffs2_sum_active())
 				jffs2_sum_add_padding_mem(s, je32_to_cpu(node->totlen));
-			DIRTY_SPACE(PAD(je32_to_cpu(node->totlen)));
+			if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen)))))
+				return err;
 			ofs += PAD(je32_to_cpu(node->totlen));
 			break;
 
@@ -842,7 +863,8 @@ scan_more:
 			        c->flags |= JFFS2_SB_FLAG_RO;
 				if (!(jffs2_is_readonly(c)))
 					return -EROFS;
-				DIRTY_SPACE(PAD(je32_to_cpu(node->totlen)));
+				if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen)))))
+					return err;
 				ofs += PAD(je32_to_cpu(node->totlen));
 				break;
 
@@ -852,7 +874,8 @@ scan_more:
 
 			case JFFS2_FEATURE_RWCOMPAT_DELETE:
 				D1(printk(KERN_NOTICE "Unknown but compatible feature node (0x%04x) found at offset 0x%08x\n", je16_to_cpu(node->nodetype), ofs));
-				DIRTY_SPACE(PAD(je32_to_cpu(node->totlen)));
+				if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(node->totlen)))))
+					return err;
 				ofs += PAD(je32_to_cpu(node->totlen));
 				break;
 
@@ -930,6 +953,7 @@ static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
 	struct jffs2_raw_node_ref *raw;
 	struct jffs2_inode_cache *ic;
 	uint32_t ino = je32_to_cpu(ri->ino);
+	int err;
 
 	D1(printk(KERN_DEBUG "jffs2_scan_inode_node(): Node at 0x%08x\n", ofs));
 
@@ -959,7 +983,8 @@ static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
 			printk(KERN_NOTICE "jffs2_scan_inode_node(): CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
 			       ofs, je32_to_cpu(ri->node_crc), crc);
 			/* We believe totlen because the CRC on the node _header_ was OK, just the node itself failed. */
-			DIRTY_SPACE(PAD(je32_to_cpu(ri->totlen)));
+			if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(ri->totlen)))))
+				return err;
 			jffs2_free_raw_node_ref(raw);
 			return 0;
 		}
@@ -1000,6 +1025,7 @@ static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblo
 	struct jffs2_full_dirent *fd;
 	struct jffs2_inode_cache *ic;
 	uint32_t crc;
+	int err;
 
 	D1(printk(KERN_DEBUG "jffs2_scan_dirent_node(): Node at 0x%08x\n", ofs));
 
@@ -1011,7 +1037,8 @@ static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblo
 		printk(KERN_NOTICE "jffs2_scan_dirent_node(): Node CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n",
 		       ofs, je32_to_cpu(rd->node_crc), crc);
 		/* We believe totlen because the CRC on the node _header_ was OK, just the node itself failed. */
-		DIRTY_SPACE(PAD(je32_to_cpu(rd->totlen)));
+		if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rd->totlen)))))
+			return err;
 		return 0;
 	}
 
@@ -1032,7 +1059,8 @@ static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblo
 		jffs2_free_full_dirent(fd);
 		/* FIXME: Why do we believe totlen? */
 		/* We believe totlen because the CRC on the node _header_ was OK, just the name failed. */
-		DIRTY_SPACE(PAD(je32_to_cpu(rd->totlen)));
+		if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rd->totlen)))))
+			return err;
 		return 0;
 	}
 	raw = jffs2_alloc_raw_node_ref();
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 9ced3aa9581..11ea54c90f4 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -380,6 +380,7 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 	struct jffs2_full_dirent *fd;
 	void *sp;
 	int i, ino;
+	int err;
 
 	sp = summary->sum;
 
@@ -494,7 +495,8 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 					jffs2_free_raw_node_ref(raw);
 					if (PTR_ERR(xd) == -EEXIST) {
 						/* a newer version of xd exists */
-						DIRTY_SPACE(je32_to_cpu(spx->totlen));
+						if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(spx->totlen))))
+							return err;
 						sp += JFFS2_SUMMARY_XATTR_SIZE;
 						break;
 					}
@@ -585,6 +587,7 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 	struct jffs2_raw_node_ref *cache_ref;
 	int ret, ofs;
 	uint32_t crc;
+	int err;
 
 	ofs = jeb->offset + c->sector_size - sumsize;
 
@@ -629,11 +632,13 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 		if (je32_to_cpu(summary->cln_mkr) != c->cleanmarker_size) {
 			dbg_summary("CLEANMARKER node has totlen 0x%x != normal 0x%x\n",
 				je32_to_cpu(summary->cln_mkr), c->cleanmarker_size);
-			DIRTY_SPACE(PAD(je32_to_cpu(summary->cln_mkr)));
+			if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(summary->cln_mkr)))))
+				return err;
 		} else if (jeb->first_node) {
 			dbg_summary("CLEANMARKER node not first node in block "
 					"(0x%08x)\n", jeb->offset);
-			DIRTY_SPACE(PAD(je32_to_cpu(summary->cln_mkr)));
+			if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(summary->cln_mkr)))))
+				return err;
 		} else {
 			struct jffs2_raw_node_ref *marker_ref = jffs2_alloc_raw_node_ref();
 
@@ -650,7 +655,8 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 	}
 
 	if (je32_to_cpu(summary->padded)) {
-		DIRTY_SPACE(je32_to_cpu(summary->padded));
+		if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(summary->padded))))
+			return err;
 	}
 
 	ret = jffs2_sum_process_sum_data(c, jeb, summary, pseudo_random);
@@ -823,7 +829,7 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
 			infosize, jeb->offset + c->sector_size - jeb->free_size, ret, retlen);
 
 		c->summary->sum_size = JFFS2_SUMMARY_NOSUM_SIZE;
-		DIRTY_SPACE(infosize);
+		jffs2_scan_dirty_space(c, jeb, infosize);
 
 		return 1;
 	}
diff --git a/fs/jffs2/summary.h b/fs/jffs2/summary.h
index ce892d57ad5..e7eb0c5814f 100644
--- a/fs/jffs2/summary.h
+++ b/fs/jffs2/summary.h
@@ -18,23 +18,6 @@
 #include <linux/uio.h>
 #include <linux/jffs2.h>
 
-#define DIRTY_SPACE(x) do { typeof(x) _x = (x); \
-		c->free_size -= _x; c->dirty_size += _x; \
-		jeb->free_size -= _x ; jeb->dirty_size += _x; \
-		}while(0)
-#define USED_SPACE(x) do { typeof(x) _x = (x); \
-		c->free_size -= _x; c->used_size += _x; \
-		jeb->free_size -= _x ; jeb->used_size += _x; \
-		}while(0)
-#define WASTED_SPACE(x) do { typeof(x) _x = (x); \
-		c->free_size -= _x; c->wasted_size += _x; \
-		jeb->free_size -= _x ; jeb->wasted_size += _x; \
-		}while(0)
-#define UNCHECKED_SPACE(x) do { typeof(x) _x = (x); \
-		c->free_size -= _x; c->unchecked_size += _x; \
-		jeb->free_size -= _x ; jeb->unchecked_size += _x; \
-		}while(0)
-
 #define BLK_STATE_ALLFF		0
 #define BLK_STATE_CLEAN		1
 #define BLK_STATE_PARTDIRTY	2
-- 
cgit v1.2.3


From 25090a6b23906552cf3d204aa421f811327e1b15 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sun, 21 May 2006 03:57:56 +0100
Subject: [JFFS2] Discard remaining free space when filing a dirty block in
 scan.

The incoming ref_totlen() calculation is going to rely on the existence
of nodes which cover all dirty space. We can't just tweak the accounting
data any more; we have to call jffs2_scan_dirty_space() to do it.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/scan.c | 47 ++++++++++++++++++++++++++---------------------
 1 file changed, 26 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index b3fc9fd5b03..cffafec01e4 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -65,6 +65,25 @@ static inline uint32_t EMPTY_SCAN_SIZE(uint32_t sector_size) {
 		return DEFAULT_EMPTY_SCAN_SIZE;
 }
 
+static int file_dirty(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
+{
+	int ret = jffs2_scan_dirty_space(c, jeb, jeb->free_size);
+	if (ret)
+		return ret;
+	/* Turned wasted size into dirty, since we apparently 
+	   think it's recoverable now. */
+	jeb->dirty_size += jeb->wasted_size;
+	c->dirty_size += jeb->wasted_size;
+	c->wasted_size -= jeb->wasted_size;
+	jeb->wasted_size = 0;
+	if (VERYDIRTY(c, jeb->dirty_size)) {
+		list_add(&jeb->list, &c->very_dirty_list);
+	} else {
+		list_add(&jeb->list, &c->dirty_list);
+	}
+	return 0;
+}
+
 int jffs2_scan_medium(struct jffs2_sb_info *c)
 {
 	int i, ret;
@@ -170,34 +189,20 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
 					(!c->nextblock || c->nextblock->free_size < jeb->free_size)) {
 				/* Better candidate for the next writes to go to */
 				if (c->nextblock) {
-					c->nextblock->dirty_size += c->nextblock->free_size + c->nextblock->wasted_size;
-					c->dirty_size += c->nextblock->free_size + c->nextblock->wasted_size;
-					c->free_size -= c->nextblock->free_size;
-					c->wasted_size -= c->nextblock->wasted_size;
-					c->nextblock->free_size = c->nextblock->wasted_size = 0;
-					if (VERYDIRTY(c, c->nextblock->dirty_size)) {
-						list_add(&c->nextblock->list, &c->very_dirty_list);
-					} else {
-						list_add(&c->nextblock->list, &c->dirty_list);
-					}
+					ret = file_dirty(c, c->nextblock);
+					if (ret)
+						return ret;
 					/* deleting summary information of the old nextblock */
 					jffs2_sum_reset_collected(c->summary);
 				}
-				/* update collected summary infromation for the current nextblock */
+				/* update collected summary information for the current nextblock */
 				jffs2_sum_move_collected(c, s);
 				D1(printk(KERN_DEBUG "jffs2_scan_medium(): new nextblock = 0x%08x\n", jeb->offset));
 				c->nextblock = jeb;
 			} else {
-				jeb->dirty_size += jeb->free_size + jeb->wasted_size;
-				c->dirty_size += jeb->free_size + jeb->wasted_size;
-				c->free_size -= jeb->free_size;
-				c->wasted_size -= jeb->wasted_size;
-				jeb->free_size = jeb->wasted_size = 0;
-				if (VERYDIRTY(c, jeb->dirty_size)) {
-					list_add(&jeb->list, &c->very_dirty_list);
-				} else {
-					list_add(&jeb->list, &c->dirty_list);
-				}
+				ret = file_dirty(c, jeb);
+				if (ret)
+					return ret;
 			}
 			break;
 
-- 
cgit v1.2.3


From 49f11d40751b974f3b829f208eefa6f97a10cac8 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sun, 21 May 2006 04:00:01 +0100
Subject: [JFFS2] Mark gaps in summary list as dirty space

Make sure we allocate a ref for any dirty space which exists between nodes
which we find in an eraseblock summary.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/summary.c | 71 ++++++++++++++++++++++++++++++++----------------------
 1 file changed, 42 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 11ea54c90f4..e60289ada83 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -369,6 +369,23 @@ no_mem:
 	return -ENOMEM;
 }
 
+static struct jffs2_raw_node_ref *alloc_ref_at(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+					       uint32_t offset)
+{
+	struct jffs2_raw_node_ref *ref;
+	/* If there was a gap, mark it dirty */
+	if (offset > c->sector_size - jeb->free_size) {
+		int ret = jffs2_scan_dirty_space(c, jeb, offset - (c->sector_size - jeb->free_size));
+		if (ret)
+			return NULL;
+	}
+	ref = jffs2_alloc_raw_node_ref();
+	if (!ref)
+		return NULL;
+
+	ref->flash_offset = jeb->offset + offset;
+	return ref;
+}
 
 /* Process the stored summary information - helper function for jffs2_sum_scan_sumnode() */
 
@@ -397,7 +414,7 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 				dbg_summary("Inode at 0x%08x\n",
 							jeb->offset + je32_to_cpu(spi->offset));
 
-				raw = jffs2_alloc_raw_node_ref();
+				raw = alloc_ref_at(c, jeb, je32_to_cpu(spi->offset));
 				if (!raw) {
 					JFFS2_NOTICE("allocation of node reference failed\n");
 					return -ENOMEM;
@@ -410,7 +427,7 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 					return -ENOMEM;
 				}
 
-				raw->flash_offset = (jeb->offset + je32_to_cpu(spi->offset)) | REF_UNCHECKED;
+				raw->flash_offset |= REF_UNCHECKED;
 
 				raw->next_in_ino = ic->nodes;
 				ic->nodes = raw;
@@ -438,7 +455,7 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 				memcpy(&fd->name, spd->name, spd->nsize);
 				fd->name[spd->nsize] = 0;
 
-				raw = jffs2_alloc_raw_node_ref();
+				raw = alloc_ref_at(c, jeb, je32_to_cpu(spd->offset));
 				if (!raw) {
 					jffs2_free_full_dirent(fd);
 					JFFS2_NOTICE("allocation of node reference failed\n");
@@ -452,7 +469,7 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 					return -ENOMEM;
 				}
 
-				raw->flash_offset = (jeb->offset + je32_to_cpu(spd->offset)) | REF_PRISTINE;
+				raw->flash_offset |= REF_PRISTINE;
 				raw->next_in_ino = ic->nodes;
 				ic->nodes = raw;
 
@@ -477,13 +494,12 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 			case JFFS2_NODETYPE_XATTR: {
 				struct jffs2_xattr_datum *xd;
 				struct jffs2_sum_xattr_flash *spx;
-				uint32_t ofs;
 
 				spx = (struct jffs2_sum_xattr_flash *)sp;
-				ofs = jeb->offset + je32_to_cpu(spx->offset);
-				dbg_summary("xattr at %#08x (xid=%u, version=%u)\n", ofs,
+				dbg_summary("xattr at %#08x (xid=%u, version=%u)\n", 
+					    jeb->offset + je32_to_cpu(spx->offset),
 					    je32_to_cpu(spx->xid), je32_to_cpu(spx->version));
-				raw = jffs2_alloc_raw_node_ref();
+				raw = alloc_ref_at(c, jeb, je32_to_cpu(spx->offset));
 				if (!raw) {
 					JFFS2_NOTICE("allocation of node reference failed\n");
 					kfree(summary);
@@ -506,7 +522,7 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 				}
 				xd->node = raw;
 
-				raw->flash_offset = ofs | REF_UNCHECKED;
+				raw->flash_offset |= REF_UNCHECKED;
 				raw->next_in_ino = (void *)xd;
 
 				jffs2_link_node_ref(c, jeb, raw, PAD(je32_to_cpu(spx->totlen)));
@@ -519,13 +535,12 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 			case JFFS2_NODETYPE_XREF: {
 				struct jffs2_xattr_ref *ref;
 				struct jffs2_sum_xref_flash *spr;
-				uint32_t ofs;
 
 				spr = (struct jffs2_sum_xref_flash *)sp;
-				ofs = jeb->offset + je32_to_cpu(spr->offset);
-				dbg_summary("xref at %#08x (xid=%u, ino=%u)\n", ofs,
+				dbg_summary("xref at %#08x (xid=%u, ino=%u)\n",
+					    jeb->offset + je32_to_cpu(spr->offset),
 					    je32_to_cpu(spr->xid), je32_to_cpu(spr->ino));
-				raw = jffs2_alloc_raw_node_ref();
+				raw = alloc_ref_at(c, jeb, je32_to_cpu(spr->offset));
 				if (!raw) {
 					JFFS2_NOTICE("allocation of node reference failed\n");
 					kfree(summary);
@@ -544,12 +559,12 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 				ref->next = c->xref_temp;
 				c->xref_temp = ref;
 
-				raw->flash_offset = ofs | REF_UNCHECKED;
+				raw->flash_offset |= REF_UNCHECKED;
 				raw->next_in_ino = (void *)ref;
 
 				jffs2_link_node_ref(c, jeb, raw, PAD(sizeof(struct jffs2_raw_xref)));
 
-				*pseudo_random += ofs;
+				*pseudo_random += raw->flash_offset;
 				sp += JFFS2_SUMMARY_XREF_SIZE;
 
 				break;
@@ -589,10 +604,10 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 	uint32_t crc;
 	int err;
 
-	ofs = jeb->offset + c->sector_size - sumsize;
+	ofs = c->sector_size - sumsize;
 
 	dbg_summary("summary found for 0x%08x at 0x%08x (0x%x bytes)\n",
-		    jeb->offset, ofs, sumsize);
+		    jeb->offset, jeb->offset + ofs, sumsize);
 
 	/* OK, now check for node validity and CRC */
 	crcnode.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
@@ -654,11 +669,6 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 		}
 	}
 
-	if (je32_to_cpu(summary->padded)) {
-		if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(summary->padded))))
-			return err;
-	}
-
 	ret = jffs2_sum_process_sum_data(c, jeb, summary, pseudo_random);
 	/* -ENOTRECOVERABLE isn't a fatal error -- it means we should do a full
 	   scan of this eraseblock. So return zero */
@@ -668,7 +678,7 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 		return ret;		/* real error */
 
 	/* for PARANOIA_CHECK */
-	cache_ref = jffs2_alloc_raw_node_ref();
+	cache_ref = alloc_ref_at(c, jeb, ofs);
 
 	if (!cache_ref) {
 		JFFS2_NOTICE("Failed to allocate node ref for cache\n");
@@ -676,15 +686,18 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 	}
 
 	cache_ref->next_in_ino = NULL;
-	cache_ref->next_phys = NULL;
-	cache_ref->flash_offset = ofs | REF_NORMAL;
+	cache_ref->flash_offset |= REF_NORMAL;
 
 	jffs2_link_node_ref(c, jeb, cache_ref, sumsize);
 
-	jeb->wasted_size += jeb->free_size;
-	c->wasted_size += jeb->free_size;
-	c->free_size -= jeb->free_size;
-	jeb->free_size = 0;
+	if (unlikely(jeb->free_size)) {
+		JFFS2_WARNING("Free size 0x%x bytes in eraseblock @0x%08x with summary?\n",
+			      jeb->free_size, jeb->offset);
+		jeb->wasted_size += jeb->free_size;
+		c->wasted_size += jeb->free_size;
+		c->free_size -= jeb->free_size;
+		jeb->free_size = 0;
+	}
 
 	return jffs2_scan_classify_jeb(c, jeb);
 
-- 
cgit v1.2.3


From b64335f2b740d6f5dbf5d3b04af30d407bf599f5 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sun, 21 May 2006 04:36:45 +0100
Subject: [JFFS2] Add length argument to jffs2_add_physical_node_ref()

If __totlen is going away, we need to pass the length in separately.
Also stop callers from needlessly setting ref->next_phys to NULL,
since that's done for them... and since that'll also be going away soon.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/gc.c       |  6 ++----
 fs/jffs2/nodelist.h |  2 +-
 fs/jffs2/nodemgmt.c |  6 ++----
 fs/jffs2/wbuf.c     |  4 +---
 fs/jffs2/write.c    | 12 ++++--------
 fs/jffs2/xattr.c    | 14 +++++---------
 6 files changed, 15 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index b0a5c407b47..4773ba24304 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -627,8 +627,6 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
 	/* OK, all the CRCs are good; this node can just be copied as-is. */
  retry:
 	nraw->flash_offset = phys_ofs;
-	nraw->__totlen = rawlen;
-	nraw->next_phys = NULL;
 
 	ret = jffs2_flash_write(c, phys_ofs, rawlen, &retlen, (char *)node);
 
@@ -640,7 +638,7 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
 			nraw->next_in_ino = NULL;
 
 			nraw->flash_offset |= REF_OBSOLETE;
-			jffs2_add_physical_node_ref(c, nraw);
+			jffs2_add_physical_node_ref(c, nraw, rawlen);
 			jffs2_mark_node_obsolete(c, nraw);
 		} else {
 			printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", nraw->flash_offset);
@@ -680,7 +678,7 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
 		goto out_node;
 	}
 	nraw->flash_offset |= REF_PRISTINE;
-	jffs2_add_physical_node_ref(c, nraw);
+	jffs2_add_physical_node_ref(c, nraw, rawlen);
 
 	if (ic) {
 		/* Link into per-inode list. This is safe because of the ic
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index ca15b3c731c..94ef8878734 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -362,7 +362,7 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *ofs
 			uint32_t *len, int prio, uint32_t sumsize);
 int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *ofs,
 			uint32_t *len, uint32_t sumsize);
-int jffs2_add_physical_node_ref(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *new);
+int jffs2_add_physical_node_ref(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *new, uint32_t len);
 void jffs2_complete_reservation(struct jffs2_sb_info *c);
 void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *raw);
 
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index d6eab1b7ad5..4701556be49 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -374,7 +374,6 @@ static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uin
  *	@c: superblock info
  *	@new: new node reference to add
  *	@len: length of this physical node
- *	@dirty: dirty flag for new node
  *
  *	Should only be used to report nodes for which space has been allocated
  *	by jffs2_reserve_space.
@@ -382,13 +381,12 @@ static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uin
  *	Must be called with the alloc_sem held.
  */
 
-int jffs2_add_physical_node_ref(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *new)
+int jffs2_add_physical_node_ref(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *new, uint32_t len)
 {
 	struct jffs2_eraseblock *jeb;
-	uint32_t len;
 
 	jeb = &c->blocks[new->flash_offset / c->sector_size];
-	len = ref_totlen(c, jeb, new);
+	new->__totlen = len;
 
 	D1(printk(KERN_DEBUG "jffs2_add_physical_node_ref(): Node at 0x%x(%d), size 0x%x\n", ref_offset(new), ref_flags(new), len));
 #if 1
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 4cebf0e57c4..676b83410f8 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -312,11 +312,9 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
 					return;
 
 				raw2->flash_offset = ofs | REF_OBSOLETE;
-				raw2->__totlen = ref_totlen(c, jeb, *first_raw);
-				raw2->next_phys = NULL;
 				raw2->next_in_ino = NULL;
 
-				jffs2_add_physical_node_ref(c, raw2);
+				jffs2_add_physical_node_ref(c, raw2, ref_totlen(c, jeb, *first_raw));
 			}
 			return;
 		}
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index ff2b00b604e..4462541d11f 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -103,8 +103,6 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
 	fn->raw = raw;
 
 	raw->flash_offset = flash_ofs;
-	raw->__totlen = PAD(sizeof(*ri)+datalen);
-	raw->next_phys = NULL;
 
 	if ((alloc_mode!=ALLOC_GC) && (je32_to_cpu(ri->version) < f->highest_version)) {
 		BUG_ON(!retried);
@@ -133,7 +131,7 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
 			   any node we write before the original intended end of
 			   this node */
 			raw->flash_offset |= REF_OBSOLETE;
-			jffs2_add_physical_node_ref(c, raw);
+			jffs2_add_physical_node_ref(c, raw, PAD(sizeof(*ri)+datalen));
 			jffs2_mark_node_obsolete(c, raw);
 		} else {
 			printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", raw->flash_offset);
@@ -191,7 +189,7 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
 	} else {
 		raw->flash_offset |= REF_NORMAL;
 	}
-	jffs2_add_physical_node_ref(c, raw);
+	jffs2_add_physical_node_ref(c, raw, PAD(sizeof(*ri)+datalen));
 
 	/* Link into per-inode list */
 	spin_lock(&c->erase_completion_lock);
@@ -259,8 +257,6 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff
 	fd->raw = raw;
 
 	raw->flash_offset = flash_ofs;
-	raw->__totlen = PAD(sizeof(*rd)+namelen);
-	raw->next_phys = NULL;
 
 	if ((alloc_mode!=ALLOC_GC) && (je32_to_cpu(rd->version) < f->highest_version)) {
 		BUG_ON(!retried);
@@ -281,7 +277,7 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff
 		if (retlen) {
 			raw->next_in_ino = NULL;
 			raw->flash_offset |= REF_OBSOLETE;
-			jffs2_add_physical_node_ref(c, raw);
+			jffs2_add_physical_node_ref(c, raw, PAD(sizeof(*rd)+namelen));
 			jffs2_mark_node_obsolete(c, raw);
 		} else {
 			printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", raw->flash_offset);
@@ -327,7 +323,7 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff
 	}
 	/* Mark the space used */
 	raw->flash_offset |= REF_PRISTINE;
-	jffs2_add_physical_node_ref(c, raw);
+	jffs2_add_physical_node_ref(c, raw, PAD(sizeof(*rd)+namelen));
 
 	spin_lock(&c->erase_completion_lock);
 	raw->next_in_ino = f->inocache->nodes;
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 057bd4dcf66..e16f8460ff0 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -322,8 +322,6 @@ static int save_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x
 	if (!raw)
 		return -ENOMEM;
 	raw->flash_offset = phys_ofs;
-	raw->__totlen = PAD(totlen);
-	raw->next_phys = NULL;
 	raw->next_in_ino = (void *)xd;
 
 	/* Setup raw-xattr */
@@ -348,17 +346,17 @@ static int save_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x
 		if (length) {
 			raw->flash_offset |= REF_OBSOLETE;
 			raw->next_in_ino = NULL;
-			jffs2_add_physical_node_ref(c, raw);
+			jffs2_add_physical_node_ref(c, raw, PAD(totlen));
 			jffs2_mark_node_obsolete(c, raw);
 		} else {
 			jffs2_free_raw_node_ref(raw);
 		}
 		return rc;
 	}
-	BUG_ON(raw->__totlen < sizeof(struct jffs2_raw_xattr));
+
 	/* success */
 	raw->flash_offset |= REF_PRISTINE;
-	jffs2_add_physical_node_ref(c, raw);
+	jffs2_add_physical_node_ref(c, raw, PAD(totlen));
 	if (xd->node)
 		delete_xattr_datum_node(c, xd);
 	xd->node = raw;
@@ -568,8 +566,6 @@ static int save_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref,
 	if (!raw)
 		return -ENOMEM;
 	raw->flash_offset = phys_ofs;
-	raw->__totlen = PAD(sizeof(rr));
-	raw->next_phys = NULL;
 	raw->next_in_ino = (void *)ref;
 
 	rr.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
@@ -589,7 +585,7 @@ static int save_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref,
 		if (length) {
 			raw->flash_offset |= REF_OBSOLETE;
 			raw->next_in_ino = NULL;
-			jffs2_add_physical_node_ref(c, raw);
+			jffs2_add_physical_node_ref(c, raw, PAD(sizeof(rr)));
 			jffs2_mark_node_obsolete(c, raw);
 		} else {
 			jffs2_free_raw_node_ref(raw);
@@ -598,7 +594,7 @@ static int save_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref,
 	}
 	raw->flash_offset |= REF_PRISTINE;
 
-	jffs2_add_physical_node_ref(c, raw);
+	jffs2_add_physical_node_ref(c, raw, PAD(sizeof(rr)));
 	if (ref->node)
 		delete_xattr_ref_node(c, ref);
 	ref->node = raw;
-- 
cgit v1.2.3


From 0bcc099d6d1a7b9fa2adf7c19812e4e816915e10 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sun, 21 May 2006 13:00:54 +0100
Subject: [JFFS2] File node reference for wasted space when flushing wbuf

Next step in ongoing campaign to file a struct jffs2_raw_node_ref for every
piece of dirty space in the system, so that __totlen can be killed off....

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/wbuf.c | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 676b83410f8..404b547c6cf 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -481,11 +481,11 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
 		return ret;
 	}
 
-	spin_lock(&c->erase_completion_lock);
-
 	/* Adjust free size of the block if we padded. */
 	if (pad) {
 		struct jffs2_eraseblock *jeb;
+		struct jffs2_raw_node_ref *ref;
+		uint32_t waste = c->wbuf_pagesize - c->wbuf_len;
 
 		jeb = &c->blocks[c->wbuf_ofs / c->sector_size];
 
@@ -495,18 +495,29 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
 		/* wbuf_pagesize - wbuf_len is the amount of space that's to be
 		   padded. If there is less free space in the block than that,
 		   something screwed up */
-		if (jeb->free_size < (c->wbuf_pagesize - c->wbuf_len)) {
+		if (jeb->free_size < waste) {
 			printk(KERN_CRIT "jffs2_flush_wbuf(): Accounting error. wbuf at 0x%08x has 0x%03x bytes, 0x%03x left.\n",
-			       c->wbuf_ofs, c->wbuf_len, c->wbuf_pagesize-c->wbuf_len);
+			       c->wbuf_ofs, c->wbuf_len, waste);
 			printk(KERN_CRIT "jffs2_flush_wbuf(): But free_size for block at 0x%08x is only 0x%08x\n",
 			       jeb->offset, jeb->free_size);
 			BUG();
 		}
-		jeb->free_size -= (c->wbuf_pagesize - c->wbuf_len);
-		c->free_size -= (c->wbuf_pagesize - c->wbuf_len);
-		jeb->wasted_size += (c->wbuf_pagesize - c->wbuf_len);
-		c->wasted_size += (c->wbuf_pagesize - c->wbuf_len);
-	}
+		ref = jffs2_alloc_raw_node_ref();
+		if (!ref)
+			return -ENOMEM;
+		ref->flash_offset = c->wbuf_ofs + c->wbuf_len;
+		ref->flash_offset |= REF_OBSOLETE;
+
+		spin_lock(&c->erase_completion_lock);
+
+		jffs2_link_node_ref(c, jeb, ref, waste);
+		/* FIXME: that made it count as dirty. Convert to wasted */
+		jeb->dirty_size -= waste;
+		c->dirty_size -= waste;
+		jeb->wasted_size += waste;
+		c->wasted_size += waste;
+	} else
+		spin_lock(&c->erase_completion_lock);
 
 	/* Stick any now-obsoleted blocks on the erase_pending_list */
 	jffs2_refile_wbuf_blocks(c);
-- 
cgit v1.2.3


From 9167e0f811cbe28564c44a99c2f07b0ce5b368cf Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sun, 21 May 2006 13:13:45 +0100
Subject: [JFFS2] Remove stray kfree of summary info in XATTR code.

We don't allocate this locally any more -- it's given to us and owner by
our caller. Also improve the debug messages a little.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/summary.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index e60289ada83..5a59c618840 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -411,8 +411,9 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 
 				ino = je32_to_cpu(spi->inode);
 
-				dbg_summary("Inode at 0x%08x\n",
-							jeb->offset + je32_to_cpu(spi->offset));
+				dbg_summary("Inode at 0x%08x-0x%08x\n",
+					    jeb->offset + je32_to_cpu(spi->offset),
+					    jeb->offset + je32_to_cpu(spi->offset) + je32_to_cpu(spu->totlen));
 
 				raw = alloc_ref_at(c, jeb, je32_to_cpu(spi->offset));
 				if (!raw) {
@@ -446,7 +447,9 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 				spd = sp;
 
 				dbg_summary("Dirent at 0x%08x\n",
-							jeb->offset + je32_to_cpu(spd->offset));
+					    jeb->offset + je32_to_cpu(spd->offset),
+					    jeb->offset + je32_to_cpu(spd->offset) + je32_to_cpu(spd->totlen));
+
 
 				fd = jffs2_alloc_full_dirent(spd->nsize+1);
 				if (!fd)
@@ -496,13 +499,13 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 				struct jffs2_sum_xattr_flash *spx;
 
 				spx = (struct jffs2_sum_xattr_flash *)sp;
-				dbg_summary("xattr at %#08x (xid=%u, version=%u)\n", 
+				dbg_summary("xattr at %#08x-%#08x (xid=%u, version=%u)\n", 
 					    jeb->offset + je32_to_cpu(spx->offset),
+					    jeb->offset + je32_to_cpu(spx->offset) + je32_to_cpu(spx->totlen),
 					    je32_to_cpu(spx->xid), je32_to_cpu(spx->version));
 				raw = alloc_ref_at(c, jeb, je32_to_cpu(spx->offset));
 				if (!raw) {
 					JFFS2_NOTICE("allocation of node reference failed\n");
-					kfree(summary);
 					return -ENOMEM;
 				}
 				xd = jffs2_setup_xattr_datum(c, je32_to_cpu(spx->xid),
@@ -517,7 +520,6 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 						break;
 					}
 					JFFS2_NOTICE("allocation of xattr_datum failed\n");
-					kfree(summary);
 					return PTR_ERR(xd);
 				}
 				xd->node = raw;
@@ -537,20 +539,19 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 				struct jffs2_sum_xref_flash *spr;
 
 				spr = (struct jffs2_sum_xref_flash *)sp;
-				dbg_summary("xref at %#08x (xid=%u, ino=%u)\n",
+				dbg_summary("xref at %#08x-%#08x\n",
 					    jeb->offset + je32_to_cpu(spr->offset),
-					    je32_to_cpu(spr->xid), je32_to_cpu(spr->ino));
+					    jeb->offset + je32_to_cpu(spr->offset) + PAD(sizeof(struct jffs2_raw_xref)));
+
 				raw = alloc_ref_at(c, jeb, je32_to_cpu(spr->offset));
 				if (!raw) {
 					JFFS2_NOTICE("allocation of node reference failed\n");
-					kfree(summary);
 					return -ENOMEM;
 				}
 				ref = jffs2_alloc_xattr_ref();
 				if (!ref) {
 					JFFS2_NOTICE("allocation of xattr_datum failed\n");
 					jffs2_free_raw_node_ref(raw);
-					kfree(summary);
 					return -ENOMEM;
 				}
 				ref->ino = 0xfffffffe;
-- 
cgit v1.2.3


From 010b06d6d07d9fa5ea6070aa72bb3e0de1761ab7 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sun, 21 May 2006 13:15:59 +0100
Subject: [JFFS2] Locking issues in summary write code.

We can't use jffs2_scan_dirty_space() because it doesn't do any locking; it's
only for use at scan time -- hence the 'scan' in the name.

Also, don't allocate refs while we have c->erase_completion_lock held.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/summary.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 5a59c618840..1451732e1fa 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -835,19 +835,32 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
 	spin_unlock(&c->erase_completion_lock);
 	ret = jffs2_flash_writev(c, vecs, 2, jeb->offset + c->sector_size -
 				jeb->free_size, &retlen, 0);
-	spin_lock(&c->erase_completion_lock);
-
 
 	if (ret || (retlen != infosize)) {
+		struct jffs2_raw_node_ref *ref;
+
 		JFFS2_WARNING("Write of %u bytes at 0x%08x failed. returned %d, retlen %zd\n",
 			infosize, jeb->offset + c->sector_size - jeb->free_size, ret, retlen);
 
+		/* Waste remaining space */
+		ref = jffs2_alloc_raw_node_ref();
+		if (ref) {
+			spin_lock(&c->erase_completion_lock);
+
+			ref->flash_offset = jeb->offset + c->sector_size - jeb->free_size;
+			ref->flash_offset |= REF_OBSOLETE;
+			ref->next_in_ino = 0;
+
+			jffs2_link_node_ref(c, jeb, ref, c->sector_size - jeb->free_size);
+		}
+
 		c->summary->sum_size = JFFS2_SUMMARY_NOSUM_SIZE;
-		jffs2_scan_dirty_space(c, jeb, infosize);
 
 		return 1;
 	}
 
+	spin_lock(&c->erase_completion_lock);
+
 	return 0;
 }
 
@@ -890,7 +903,6 @@ int jffs2_sum_write_sumnode(struct jffs2_sb_info *c)
 	/* for ACCT_PARANOIA_CHECK */
 	spin_unlock(&c->erase_completion_lock);
 	summary_ref = jffs2_alloc_raw_node_ref();
-	spin_lock(&c->erase_completion_lock);
 
 	if (!summary_ref) {
 		JFFS2_NOTICE("Failed to allocate node ref for summary\n");
@@ -900,6 +912,7 @@ int jffs2_sum_write_sumnode(struct jffs2_sb_info *c)
 	summary_ref->next_in_ino = NULL;
 	summary_ref->flash_offset = (jeb->offset + c->sector_size - jeb->free_size) | REF_NORMAL;
 
+	spin_lock(&c->erase_completion_lock);
 	jffs2_link_node_ref(c, jeb, summary_ref, infosize);
 
 	return 0;
-- 
cgit v1.2.3


From ca89a517fa577e6f26621463d3aa4f3c3d530b1e Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sun, 21 May 2006 13:29:11 +0100
Subject: [JFFS2] Finally eliminate __totlen field from struct
 jffs2_raw_node_ref

Well, almost. We'll actually keep a 'TEST_TOTLEN' macro set for now, and keep
doing some paranoia checks to make sure it's all working correctly. But if
TEST_TOTLEN is unset, the size of struct jffs2_raw_node_ref drops from 16
bytes to 12 on 32-bit machines. That's a saving of about half a megabyte of
memory on the OLPC prototype board, with 125K or so nodes in its 512MiB of
flash.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/nodelist.c | 115 ++++++++++++++++++++++++++++++++++++++++++++++++----
 fs/jffs2/nodelist.h |  58 ++++----------------------
 fs/jffs2/nodemgmt.c |   6 +++
 3 files changed, 121 insertions(+), 58 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 00506857eab..9a6ced05f89 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -1052,8 +1052,17 @@ void jffs2_link_node_ref(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
 {
 	if (!jeb->first_node)
 		jeb->first_node = ref;
-	if (jeb->last_node)
+	if (jeb->last_node) {
 		jeb->last_node->next_phys = ref;
+#ifdef TEST_TOTLEN
+		if (ref_offset(jeb->last_node) + jeb->last_node->__totlen != ref_offset(ref)) {
+			printk(KERN_CRIT "Adding new ref %p at (0x%08x-0x%08x) not immediately after previous (0x%08x-0x%08x)\n",
+			       ref, ref_offset(ref), ref_offset(ref)+ref->__totlen,
+			       ref_offset(jeb->last_node), ref_offset(jeb->last_node)+jeb->last_node->__totlen);
+			WARN_ON(1);
+		}
+#endif
+	}
 	jeb->last_node = ref;
 
 	switch(ref_flags(ref)) {
@@ -1076,18 +1085,110 @@ void jffs2_link_node_ref(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
 	c->free_size -= len;
 	jeb->free_size -= len;
 
-	/* Set __totlen field... for now */
-	ref->__totlen = len;
 	ref->next_phys = NULL;
+#ifdef TEST_TOTLEN
+	/* Set (and test) __totlen field... for now */
+	ref->__totlen = len;
+	ref_totlen(c, jeb, ref);
+#endif
 }
 
+/* No locking. Do not use on a live file system */
 int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
 			   uint32_t size)
 {
-	c->dirty_size += size;
-	c->free_size -= size;
-	jeb->dirty_size += size;
-	jeb->free_size -= size;
+	if (!size)
+		return 0;
+	if (size > c->sector_size - jeb->used_size) {
+		printk(KERN_CRIT "Dirty space 0x%x larger then used_size 0x%x (wasted 0x%x)\n",
+		       size, jeb->used_size, jeb->wasted_size);
+		BUG();
+	}
+	if (jeb->last_node && ref_obsolete(jeb->last_node)) {
+#ifdef TEST_TOTLEN
+		jeb->last_node->__totlen += size;
+#endif
+		c->dirty_size += size;
+		c->free_size -= size;
+		jeb->dirty_size += size;
+		jeb->free_size -= size;
+	} else {
+		struct jffs2_raw_node_ref *ref;
+		ref = jffs2_alloc_raw_node_ref();
+		if (!ref)
+			return -ENOMEM;
+
+		ref->flash_offset = jeb->offset + c->sector_size - jeb->free_size;
+		ref->flash_offset |= REF_OBSOLETE;
+		ref->next_in_ino = 0;
+#ifdef TEST_TOTLEN
+		ref->__totlen = size;
+#endif
+
+		jffs2_link_node_ref(c, jeb, ref, size);
+	}
 
 	return 0;
 }
+
+/* Calculate totlen from surrounding nodes or eraseblock */
+static inline uint32_t __ref_totlen(struct jffs2_sb_info *c,
+				    struct jffs2_eraseblock *jeb,
+				    struct jffs2_raw_node_ref *ref)
+{
+	uint32_t ref_end;
+
+	if (ref->next_phys)
+		ref_end = ref_offset(ref->next_phys);
+	else {
+		if (!jeb)
+			jeb = &c->blocks[ref->flash_offset / c->sector_size];
+
+		/* Last node in block. Use free_space */
+		if (ref != jeb->last_node) {
+			printk(KERN_CRIT "ref %p @0x%08x is not jeb->last_node (%p @0x%08x)\n",
+			       ref, ref_offset(ref), jeb->last_node, jeb->last_node?ref_offset(jeb->last_node):0);
+			BUG();
+		}
+		ref_end = jeb->offset + c->sector_size - jeb->free_size;
+	}
+	return ref_end - ref_offset(ref);
+}
+
+uint32_t __jffs2_ref_totlen(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
+			    struct jffs2_raw_node_ref *ref)
+{
+	uint32_t ret;
+
+#if CONFIG_JFFS2_FS_DEBUG > 0
+	if (jeb && jeb != &c->blocks[ref->flash_offset / c->sector_size]) {
+		printk(KERN_CRIT "ref_totlen called with wrong block -- at 0x%08x instead of 0x%08x; ref 0x%08x\n",
+		       jeb->offset, c->blocks[ref->flash_offset / c->sector_size].offset, ref_offset(ref));
+		BUG();
+	}
+#endif
+
+	ret = __ref_totlen(c, jeb, ref);
+#ifdef TEST_TOTLEN
+	if (ret != ref->__totlen) {
+		printk(KERN_CRIT "Totlen for ref at %p (0x%08x-0x%08x) miscalculated as 0x%x instead of %x\n",
+		       ref, ref_offset(ref), ref_offset(ref)+ref->__totlen,
+		       ret, ref->__totlen);
+		if (ref->next_phys) {
+			printk(KERN_CRIT "next_phys %p (0x%08x-0x%08x)\n", ref->next_phys, ref_offset(ref->next_phys),
+			       ref_offset(ref->next_phys)+ref->__totlen);
+		} else 
+			printk(KERN_CRIT "No next_phys. jeb->last_node is %p\n", jeb->last_node);
+
+		printk(KERN_CRIT "jeb->wasted_size %x, dirty_size %x, used_size %x, free_size %x\n", jeb->wasted_size, jeb->dirty_size, jeb->used_size, jeb->free_size);
+		ret = ref->__totlen;
+		if (!jeb)
+			jeb = &c->blocks[ref->flash_offset / c->sector_size];
+#if defined(JFFS2_DBG_DUMPS) || defined(JFFS2_DBG_PARANOIA_CHECKS)
+		__jffs2_dbg_dump_node_refs_nolock(c, jeb);
+#endif
+		WARN_ON(1);
+	}
+#endif /* TEST_TOTLEN */
+	return ret;
+}
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 94ef8878734..80d1fda2212 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -82,7 +82,10 @@ struct jffs2_raw_node_ref
 		word so you know when you've got there :) */
 	struct jffs2_raw_node_ref *next_phys;
 	uint32_t flash_offset;
+#define TEST_TOTLEN
+#ifdef TEST_TOTLEN
 	uint32_t __totlen; /* This may die; use ref_totlen(c, jeb, ) below */
+#endif
 };
 
         /* flash_offset & 3 always has to be zero, because nodes are
@@ -221,57 +224,7 @@ static inline int jffs2_blocks_use_vmalloc(struct jffs2_sb_info *c)
 	return ((c->flash_size / c->sector_size) * sizeof (struct jffs2_eraseblock)) > (128 * 1024);
 }
 
-/* Calculate totlen from surrounding nodes or eraseblock */
-static inline uint32_t __ref_totlen(struct jffs2_sb_info *c,
-				    struct jffs2_eraseblock *jeb,
-				    struct jffs2_raw_node_ref *ref)
-{
-	uint32_t ref_end;
-
-	if (ref->next_phys)
-		ref_end = ref_offset(ref->next_phys);
-	else {
-		if (!jeb)
-			jeb = &c->blocks[ref->flash_offset / c->sector_size];
-
-		/* Last node in block. Use free_space */
-		BUG_ON(ref != jeb->last_node);
-		ref_end = jeb->offset + c->sector_size - jeb->free_size;
-	}
-	return ref_end - ref_offset(ref);
-}
-
-static inline uint32_t ref_totlen(struct jffs2_sb_info *c,
-				  struct jffs2_eraseblock *jeb,
-				  struct jffs2_raw_node_ref *ref)
-{
-	uint32_t ret;
-
-#if CONFIG_JFFS2_FS_DEBUG > 0
-	if (jeb && jeb != &c->blocks[ref->flash_offset / c->sector_size]) {
-		printk(KERN_CRIT "ref_totlen called with wrong block -- at 0x%08x instead of 0x%08x; ref 0x%08x\n",
-		       jeb->offset, c->blocks[ref->flash_offset / c->sector_size].offset, ref_offset(ref));
-		BUG();
-	}
-#endif
-
-#if 1
-	ret = ref->__totlen;
-#else
-	/* This doesn't actually work yet */
-	ret = __ref_totlen(c, jeb, ref);
-	if (ret != ref->__totlen) {
-		printk(KERN_CRIT "Totlen for ref at %p (0x%08x-0x%08x) miscalculated as 0x%x instead of %x\n",
-		       ref, ref_offset(ref), ref_offset(ref)+ref->__totlen,
-		       ret, ref->__totlen);
-		if (!jeb)
-			jeb = &c->blocks[ref->flash_offset / c->sector_size];
-		jffs2_dbg_dump_node_refs_nolock(c, jeb);
-		BUG();
-	}
-#endif
-	return ret;
-}
+#define ref_totlen(a, b, c) __jffs2_ref_totlen((a), (b), (c))
 
 #define ALLOC_NORMAL	0	/* Normal allocation */
 #define ALLOC_DELETION	1	/* Deletion node. Best to allow it */
@@ -355,6 +308,9 @@ void jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uin
 int jffs2_add_older_frag_to_fragtree(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_tmp_dnode_info *tn);
 void jffs2_link_node_ref(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
 			 struct jffs2_raw_node_ref *ref, uint32_t len);
+extern uint32_t __jffs2_ref_totlen(struct jffs2_sb_info *c,
+				   struct jffs2_eraseblock *jeb,
+				   struct jffs2_raw_node_ref *ref);
 
 /* nodemgmt.c */
 int jffs2_thread_should_wake(struct jffs2_sb_info *c);
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 4701556be49..9a0f312cfcd 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -386,7 +386,9 @@ int jffs2_add_physical_node_ref(struct jffs2_sb_info *c, struct jffs2_raw_node_r
 	struct jffs2_eraseblock *jeb;
 
 	jeb = &c->blocks[new->flash_offset / c->sector_size];
+#ifdef TEST_TOTLEN
 	new->__totlen = len;
+#endif
 
 	D1(printk(KERN_DEBUG "jffs2_add_physical_node_ref(): Node at 0x%x(%d), size 0x%x\n", ref_offset(new), ref_flags(new), len));
 #if 1
@@ -679,7 +681,9 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 
 		spin_lock(&c->erase_completion_lock);
 
+#ifdef TEST_TOTLEN
 		ref->__totlen += n->__totlen;
+#endif
 		ref->next_phys = n->next_phys;
                 if (jeb->last_node == n) jeb->last_node = ref;
 		if (jeb->gc_node == n) {
@@ -702,7 +706,9 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 			p = p->next_phys;
 
 		if (ref_obsolete(p) && !ref->next_in_ino) {
+#ifdef TEST_TOTLEN
 			p->__totlen += ref->__totlen;
+#endif
 			if (jeb->last_node == ref) {
 				jeb->last_node = p;
 			}
-- 
cgit v1.2.3


From 06c6764b5830798c39617b24497cade90451592f Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Mon, 22 May 2006 11:27:14 +0100
Subject: [JFFS2] Fix dummy jffs2_sum_scan_sumnode() macro for !SUMMARY case.

I added an argument to the real function...

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/summary.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs2/summary.h b/fs/jffs2/summary.h
index e7eb0c5814f..6bf1f6aa455 100644
--- a/fs/jffs2/summary.h
+++ b/fs/jffs2/summary.h
@@ -202,7 +202,7 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 #define jffs2_sum_add_dirent_mem(a,b,c)
 #define jffs2_sum_add_xattr_mem(a,b,c)
 #define jffs2_sum_add_xref_mem(a,b,c)
-#define jffs2_sum_scan_sumnode(a,b,c,d) (0)
+#define jffs2_sum_scan_sumnode(a,b,c,d,e) (0)
 
 #endif /* CONFIG_JFFS2_SUMMARY */
 
-- 
cgit v1.2.3


From 3b79673cfae93d0ed63eceb058bb26aba602a278 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Mon, 22 May 2006 12:15:47 +0100
Subject: [JFFS2] Fix accounting error in jffs2_link_node_ref()

When filing REF_OBSOLETE nodes, we'd add their size to the global
'dirty_size' count, but then to the eraseblock's 'used_size' count.
That's not clever.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/nodelist.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 9a6ced05f89..7d563f938b1 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -1079,7 +1079,7 @@ void jffs2_link_node_ref(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
 
 	case REF_OBSOLETE:
 		c->dirty_size += len;
-		jeb->used_size += len;
+		jeb->dirty_size += len;
 		break;
 	}
 	c->free_size -= len;
-- 
cgit v1.2.3


From a1b563d652b54647ffacb2d6edf7859d3e97a723 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Mon, 22 May 2006 13:55:46 +0100
Subject: [JFFS2] Initialise ref->next_in_ino when marking dirty space in wbuf
 flush

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/wbuf.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 404b547c6cf..45e3573cf10 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -507,6 +507,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
 			return -ENOMEM;
 		ref->flash_offset = c->wbuf_ofs + c->wbuf_len;
 		ref->flash_offset |= REF_OBSOLETE;
+		ref->next_in_ino = NULL;
 
 		spin_lock(&c->erase_completion_lock);
 
-- 
cgit v1.2.3


From fcb7578719529898aef9edce8e409e457a1c2d15 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Mon, 22 May 2006 15:23:10 +0100
Subject: [JFFS2] Extend jffs2_link_node_ref() to link into per-inode list too.

Let's avoid the potential for forgetting to set ref->next_in_ino, by doing
it within jffs2_link_node_ref() instead.

This highlights the ugliness of what we're currently doing with
xattr_datum and xattr_ref structures -- we should find a nicer way of
dealing with that.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/erase.c    |  3 +--
 fs/jffs2/gc.c       | 17 ++---------------
 fs/jffs2/nodelist.c | 13 ++++++++++---
 fs/jffs2/nodelist.h |  8 ++++++--
 fs/jffs2/nodemgmt.c |  5 +++--
 fs/jffs2/scan.c     | 24 ++++++++----------------
 fs/jffs2/summary.c  | 30 ++++++++++--------------------
 fs/jffs2/wbuf.c     |  6 ++----
 fs/jffs2/write.c    | 23 ++++-------------------
 fs/jffs2/xattr.c    | 15 +++++++--------
 10 files changed, 53 insertions(+), 91 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index f677d6950fd..0fc19a2fb5d 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -410,10 +410,9 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb
 		/* Everything else got zeroed before the erase */
 		jeb->free_size = c->sector_size;
 
-		marker_ref->next_in_ino = NULL;
 		marker_ref->flash_offset = jeb->offset | REF_NORMAL;
 
-		jffs2_link_node_ref(c, jeb, marker_ref, c->cleanmarker_size);
+		jffs2_link_node_ref(c, jeb, marker_ref, c->cleanmarker_size, NULL);
 	}
 
 	spin_lock(&c->erase_completion_lock);
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 4773ba24304..153755bc1d5 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -634,11 +634,8 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
 		printk(KERN_NOTICE "Write of %d bytes at 0x%08x failed. returned %d, retlen %zd\n",
                        rawlen, phys_ofs, ret, retlen);
 		if (retlen) {
-                        /* Doesn't belong to any inode */
-			nraw->next_in_ino = NULL;
-
 			nraw->flash_offset |= REF_OBSOLETE;
-			jffs2_add_physical_node_ref(c, nraw, rawlen);
+			jffs2_add_physical_node_ref(c, nraw, rawlen, NULL);
 			jffs2_mark_node_obsolete(c, nraw);
 		} else {
 			printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", nraw->flash_offset);
@@ -678,18 +675,8 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
 		goto out_node;
 	}
 	nraw->flash_offset |= REF_PRISTINE;
-	jffs2_add_physical_node_ref(c, nraw, rawlen);
+	jffs2_add_physical_node_ref(c, nraw, rawlen, ic);
 
-	if (ic) {
-		/* Link into per-inode list. This is safe because of the ic
-		   state being INO_STATE_GC. Note that if we're doing this
-		   for an inode which is in-core, the 'nraw' pointer is then
-		   going to be fetched from ic->nodes by our caller. */
-		spin_lock(&c->erase_completion_lock);
-		nraw->next_in_ino = ic->nodes;
-		ic->nodes = nraw;
-		spin_unlock(&c->erase_completion_lock);
-	}
 	jffs2_mark_node_obsolete(c, raw);
 	D1(printk(KERN_DEBUG "WHEEE! GC REF_PRISTINE node at 0x%08x succeeded\n", ref_offset(raw)));
 
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 7d563f938b1..d25d4919ca9 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -1048,7 +1048,8 @@ void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c)
 }
 
 void jffs2_link_node_ref(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
-			 struct jffs2_raw_node_ref *ref, uint32_t len)
+			 struct jffs2_raw_node_ref *ref, uint32_t len,
+			 struct jffs2_inode_cache *ic)
 {
 	if (!jeb->first_node)
 		jeb->first_node = ref;
@@ -1065,6 +1066,13 @@ void jffs2_link_node_ref(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
 	}
 	jeb->last_node = ref;
 
+	if (ic) {
+		ref->next_in_ino = ic->nodes;
+		ic->nodes = ref;
+	} else {
+		ref->next_in_ino = NULL;
+	}
+
 	switch(ref_flags(ref)) {
 	case REF_UNCHECKED:
 		c->unchecked_size += len;
@@ -1120,12 +1128,11 @@ int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 
 		ref->flash_offset = jeb->offset + c->sector_size - jeb->free_size;
 		ref->flash_offset |= REF_OBSOLETE;
-		ref->next_in_ino = 0;
 #ifdef TEST_TOTLEN
 		ref->__totlen = size;
 #endif
 
-		jffs2_link_node_ref(c, jeb, ref, size);
+		jffs2_link_node_ref(c, jeb, ref, size, NULL);
 	}
 
 	return 0;
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 80d1fda2212..ee5aedcffc1 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -307,7 +307,8 @@ int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_in
 void jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uint32_t size);
 int jffs2_add_older_frag_to_fragtree(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_tmp_dnode_info *tn);
 void jffs2_link_node_ref(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
-			 struct jffs2_raw_node_ref *ref, uint32_t len);
+			 struct jffs2_raw_node_ref *ref, uint32_t len,
+			 struct jffs2_inode_cache *ic);
 extern uint32_t __jffs2_ref_totlen(struct jffs2_sb_info *c,
 				   struct jffs2_eraseblock *jeb,
 				   struct jffs2_raw_node_ref *ref);
@@ -318,7 +319,10 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *ofs
 			uint32_t *len, int prio, uint32_t sumsize);
 int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *ofs,
 			uint32_t *len, uint32_t sumsize);
-int jffs2_add_physical_node_ref(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *new, uint32_t len);
+int jffs2_add_physical_node_ref(struct jffs2_sb_info *c, 
+				struct jffs2_raw_node_ref *new,
+				uint32_t len,
+				struct jffs2_inode_cache *ic);
 void jffs2_complete_reservation(struct jffs2_sb_info *c);
 void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *raw);
 
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 9a0f312cfcd..e10e58eab8e 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -381,7 +381,8 @@ static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uin
  *	Must be called with the alloc_sem held.
  */
 
-int jffs2_add_physical_node_ref(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *new, uint32_t len)
+int jffs2_add_physical_node_ref(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *new,
+				uint32_t len, struct jffs2_inode_cache *ic)
 {
 	struct jffs2_eraseblock *jeb;
 
@@ -403,7 +404,7 @@ int jffs2_add_physical_node_ref(struct jffs2_sb_info *c, struct jffs2_raw_node_r
 #endif
 	spin_lock(&c->erase_completion_lock);
 
-	jffs2_link_node_ref(c, jeb, new, len);
+	jffs2_link_node_ref(c, jeb, new, len, ic);
 
 	if (!jeb->free_size && !jeb->dirty_size && !ISDIRTY(jeb->wasted_size)) {
 		/* If it lives on the dirty_list, jffs2_reserve_space will put it there */
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index cffafec01e4..6fce703c054 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -361,9 +361,9 @@ static int jffs2_scan_xattr_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
 	xd->node = raw;
 
 	raw->flash_offset = ofs | REF_PRISTINE;
-	raw->next_in_ino = (void *)xd;
 
-	jffs2_link_node_ref(c, jeb, raw, totlen);
+	jffs2_link_node_ref(c, jeb, raw, totlen, NULL);
+	/* FIXME */ raw->next_in_ino = (void *)xd;
 
 	if (jffs2_sum_active())
 		jffs2_sum_add_xattr_mem(s, rx, ofs - jeb->offset);
@@ -425,9 +425,9 @@ static int jffs2_scan_xref_node(struct jffs2_sb_info *c, struct jffs2_eraseblock
 	c->xref_temp = ref;
 
 	raw->flash_offset = ofs | REF_PRISTINE;
-	raw->next_in_ino = (void *)ref;
 
-	jffs2_link_node_ref(c, jeb, raw, PAD(je32_to_cpu(rr->totlen)));
+	jffs2_link_node_ref(c, jeb, raw, PAD(je32_to_cpu(rr->totlen)), NULL);
+	/* FIXME */ raw->next_in_ino = (void *)ref;
 
 	if (jffs2_sum_active())
 		jffs2_sum_add_xref_mem(s, rr, ofs - jeb->offset);
@@ -844,10 +844,9 @@ scan_more:
 					printk(KERN_NOTICE "Failed to allocate node ref for clean marker\n");
 					return -ENOMEM;
 				}
-				marker_ref->next_in_ino = NULL;
 				marker_ref->flash_offset = ofs | REF_NORMAL;
 
-				jffs2_link_node_ref(c, jeb, marker_ref, c->cleanmarker_size);
+				jffs2_link_node_ref(c, jeb, marker_ref, c->cleanmarker_size, NULL);
 
 				ofs += PAD(c->cleanmarker_size);
 			}
@@ -892,8 +891,7 @@ scan_more:
 				if (!ref)
 					return -ENOMEM;
 				ref->flash_offset = ofs | REF_PRISTINE;
-				ref->next_in_ino = 0;
-				jffs2_link_node_ref(c, jeb, ref, PAD(je32_to_cpu(node->totlen)));
+				jffs2_link_node_ref(c, jeb, ref, PAD(je32_to_cpu(node->totlen)), NULL);
 
 				/* We can't summarise nodes we don't grok */
 				jffs2_sum_disable_collecting(s);
@@ -1004,10 +1002,7 @@ static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
 
 	raw->flash_offset = ofs | REF_UNCHECKED;
 
-	raw->next_in_ino = ic->nodes;
-	ic->nodes = raw;
-	
-	jffs2_link_node_ref(c, jeb, raw, PAD(je32_to_cpu(ri->totlen)));
+	jffs2_link_node_ref(c, jeb, raw, PAD(je32_to_cpu(ri->totlen)), ic);
 
 	D1(printk(KERN_DEBUG "Node is ino #%u, version %d. Range 0x%x-0x%x\n",
 		  je32_to_cpu(ri->ino), je32_to_cpu(ri->version),
@@ -1082,10 +1077,7 @@ static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblo
 	}
 
 	raw->flash_offset = ofs | REF_PRISTINE;
-	raw->next_in_ino = ic->nodes;
-	ic->nodes = raw;
-
-	jffs2_link_node_ref(c, jeb, raw, PAD(je32_to_cpu(rd->totlen)));
+	jffs2_link_node_ref(c, jeb, raw, PAD(je32_to_cpu(rd->totlen)), ic);
 
 	fd->raw = raw;
 	fd->next = NULL;
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 1451732e1fa..351ba9f8185 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -430,10 +430,7 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 
 				raw->flash_offset |= REF_UNCHECKED;
 
-				raw->next_in_ino = ic->nodes;
-				ic->nodes = raw;
-
-				jffs2_link_node_ref(c, jeb, raw, PAD(je32_to_cpu(spi->totlen)));
+				jffs2_link_node_ref(c, jeb, raw, PAD(je32_to_cpu(spi->totlen)), ic);
 
 				*pseudo_random += je32_to_cpu(spi->version);
 
@@ -473,10 +470,7 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 				}
 
 				raw->flash_offset |= REF_PRISTINE;
-				raw->next_in_ino = ic->nodes;
-				ic->nodes = raw;
-
-				jffs2_link_node_ref(c, jeb, raw, PAD(je32_to_cpu(spd->totlen)));
+				jffs2_link_node_ref(c, jeb, raw, PAD(je32_to_cpu(spd->totlen)), ic);
 
 				fd->raw = raw;
 				fd->next = NULL;
@@ -525,9 +519,9 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 				xd->node = raw;
 
 				raw->flash_offset |= REF_UNCHECKED;
-				raw->next_in_ino = (void *)xd;
 
-				jffs2_link_node_ref(c, jeb, raw, PAD(je32_to_cpu(spx->totlen)));
+				jffs2_link_node_ref(c, jeb, raw, PAD(je32_to_cpu(spx->totlen)), NULL);
+				/* FIXME */ raw->next_in_ino = (void *)xd;
 
 				*pseudo_random += je32_to_cpu(spx->xid);
 				sp += JFFS2_SUMMARY_XATTR_SIZE;
@@ -561,9 +555,9 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 				c->xref_temp = ref;
 
 				raw->flash_offset |= REF_UNCHECKED;
-				raw->next_in_ino = (void *)ref;
 
-				jffs2_link_node_ref(c, jeb, raw, PAD(sizeof(struct jffs2_raw_xref)));
+				jffs2_link_node_ref(c, jeb, raw, PAD(sizeof(struct jffs2_raw_xref)), NULL);
+				/* FIXME */ raw->next_in_ino = (void *)ref;
 
 				*pseudo_random += raw->flash_offset;
 				sp += JFFS2_SUMMARY_XREF_SIZE;
@@ -664,9 +658,8 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 			}
 
 			marker_ref->flash_offset = jeb->offset | REF_NORMAL;
-			marker_ref->next_in_ino = NULL;
 
-			jffs2_link_node_ref(c, jeb, marker_ref, je32_to_cpu(summary->cln_mkr));
+			jffs2_link_node_ref(c, jeb, marker_ref, je32_to_cpu(summary->cln_mkr), NULL);
 		}
 	}
 
@@ -686,10 +679,9 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 		return -ENOMEM;
 	}
 
-	cache_ref->next_in_ino = NULL;
 	cache_ref->flash_offset |= REF_NORMAL;
 
-	jffs2_link_node_ref(c, jeb, cache_ref, sumsize);
+	jffs2_link_node_ref(c, jeb, cache_ref, sumsize, NULL);
 
 	if (unlikely(jeb->free_size)) {
 		JFFS2_WARNING("Free size 0x%x bytes in eraseblock @0x%08x with summary?\n",
@@ -849,9 +841,8 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
 
 			ref->flash_offset = jeb->offset + c->sector_size - jeb->free_size;
 			ref->flash_offset |= REF_OBSOLETE;
-			ref->next_in_ino = 0;
 
-			jffs2_link_node_ref(c, jeb, ref, c->sector_size - jeb->free_size);
+			jffs2_link_node_ref(c, jeb, ref, c->sector_size - jeb->free_size, NULL);
 		}
 
 		c->summary->sum_size = JFFS2_SUMMARY_NOSUM_SIZE;
@@ -909,11 +900,10 @@ int jffs2_sum_write_sumnode(struct jffs2_sb_info *c)
 		return -ENOMEM;
 	}
 
-	summary_ref->next_in_ino = NULL;
 	summary_ref->flash_offset = (jeb->offset + c->sector_size - jeb->free_size) | REF_NORMAL;
 
 	spin_lock(&c->erase_completion_lock);
-	jffs2_link_node_ref(c, jeb, summary_ref, infosize);
+	jffs2_link_node_ref(c, jeb, summary_ref, infosize, NULL);
 
 	return 0;
 }
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 45e3573cf10..62f685faeba 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -312,9 +312,8 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
 					return;
 
 				raw2->flash_offset = ofs | REF_OBSOLETE;
-				raw2->next_in_ino = NULL;
 
-				jffs2_add_physical_node_ref(c, raw2, ref_totlen(c, jeb, *first_raw));
+				jffs2_add_physical_node_ref(c, raw2, ref_totlen(c, jeb, *first_raw), NULL);
 			}
 			return;
 		}
@@ -507,11 +506,10 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
 			return -ENOMEM;
 		ref->flash_offset = c->wbuf_ofs + c->wbuf_len;
 		ref->flash_offset |= REF_OBSOLETE;
-		ref->next_in_ino = NULL;
 
 		spin_lock(&c->erase_completion_lock);
 
-		jffs2_link_node_ref(c, jeb, ref, waste);
+		jffs2_link_node_ref(c, jeb, ref, waste, NULL);
 		/* FIXME: that made it count as dirty. Convert to wasted */
 		jeb->dirty_size -= waste;
 		c->dirty_size -= waste;
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index 4462541d11f..319a70f531f 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -122,16 +122,13 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
 
 		/* Mark the space as dirtied */
 		if (retlen) {
-			/* Doesn't belong to any inode */
-			raw->next_in_ino = NULL;
-
 			/* Don't change raw->size to match retlen. We may have
 			   written the node header already, and only the data will
 			   seem corrupted, in which case the scan would skip over
 			   any node we write before the original intended end of
 			   this node */
 			raw->flash_offset |= REF_OBSOLETE;
-			jffs2_add_physical_node_ref(c, raw, PAD(sizeof(*ri)+datalen));
+			jffs2_add_physical_node_ref(c, raw, PAD(sizeof(*ri)+datalen), NULL);
 			jffs2_mark_node_obsolete(c, raw);
 		} else {
 			printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", raw->flash_offset);
@@ -189,13 +186,7 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
 	} else {
 		raw->flash_offset |= REF_NORMAL;
 	}
-	jffs2_add_physical_node_ref(c, raw, PAD(sizeof(*ri)+datalen));
-
-	/* Link into per-inode list */
-	spin_lock(&c->erase_completion_lock);
-	raw->next_in_ino = f->inocache->nodes;
-	f->inocache->nodes = raw;
-	spin_unlock(&c->erase_completion_lock);
+	jffs2_add_physical_node_ref(c, raw, PAD(sizeof(*ri)+datalen), f->inocache);
 
 	D1(printk(KERN_DEBUG "jffs2_write_dnode wrote node at 0x%08x(%d) with dsize 0x%x, csize 0x%x, node_crc 0x%08x, data_crc 0x%08x, totlen 0x%08x\n",
 		  flash_ofs, ref_flags(raw), je32_to_cpu(ri->dsize),
@@ -275,9 +266,8 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff
 			       sizeof(*rd)+namelen, flash_ofs, ret, retlen);
 		/* Mark the space as dirtied */
 		if (retlen) {
-			raw->next_in_ino = NULL;
 			raw->flash_offset |= REF_OBSOLETE;
-			jffs2_add_physical_node_ref(c, raw, PAD(sizeof(*rd)+namelen));
+			jffs2_add_physical_node_ref(c, raw, PAD(sizeof(*rd)+namelen), NULL);
 			jffs2_mark_node_obsolete(c, raw);
 		} else {
 			printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", raw->flash_offset);
@@ -323,12 +313,7 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff
 	}
 	/* Mark the space used */
 	raw->flash_offset |= REF_PRISTINE;
-	jffs2_add_physical_node_ref(c, raw, PAD(sizeof(*rd)+namelen));
-
-	spin_lock(&c->erase_completion_lock);
-	raw->next_in_ino = f->inocache->nodes;
-	f->inocache->nodes = raw;
-	spin_unlock(&c->erase_completion_lock);
+	jffs2_add_physical_node_ref(c, raw, PAD(sizeof(*rd)+namelen), f->inocache);
 
 	if (retried) {
 		jffs2_dbg_acct_sanity_check(c,NULL);
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index e16f8460ff0..76d16614038 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -322,7 +322,6 @@ static int save_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x
 	if (!raw)
 		return -ENOMEM;
 	raw->flash_offset = phys_ofs;
-	raw->next_in_ino = (void *)xd;
 
 	/* Setup raw-xattr */
 	rx.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
@@ -345,8 +344,7 @@ static int save_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x
 		rc = rc ? rc : -EIO;
 		if (length) {
 			raw->flash_offset |= REF_OBSOLETE;
-			raw->next_in_ino = NULL;
-			jffs2_add_physical_node_ref(c, raw, PAD(totlen));
+			jffs2_add_physical_node_ref(c, raw, PAD(totlen), NULL);
 			jffs2_mark_node_obsolete(c, raw);
 		} else {
 			jffs2_free_raw_node_ref(raw);
@@ -356,7 +354,9 @@ static int save_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x
 
 	/* success */
 	raw->flash_offset |= REF_PRISTINE;
-	jffs2_add_physical_node_ref(c, raw, PAD(totlen));
+	jffs2_add_physical_node_ref(c, raw, PAD(totlen), NULL);
+	/* FIXME */ raw->next_in_ino = (void *)xd;
+
 	if (xd->node)
 		delete_xattr_datum_node(c, xd);
 	xd->node = raw;
@@ -566,7 +566,6 @@ static int save_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref,
 	if (!raw)
 		return -ENOMEM;
 	raw->flash_offset = phys_ofs;
-	raw->next_in_ino = (void *)ref;
 
 	rr.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
 	rr.nodetype = cpu_to_je16(JFFS2_NODETYPE_XREF);
@@ -584,8 +583,7 @@ static int save_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref,
 		ret = ret ? ret : -EIO;
 		if (length) {
 			raw->flash_offset |= REF_OBSOLETE;
-			raw->next_in_ino = NULL;
-			jffs2_add_physical_node_ref(c, raw, PAD(sizeof(rr)));
+			jffs2_add_physical_node_ref(c, raw, PAD(sizeof(rr)), NULL);
 			jffs2_mark_node_obsolete(c, raw);
 		} else {
 			jffs2_free_raw_node_ref(raw);
@@ -594,7 +592,8 @@ static int save_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref,
 	}
 	raw->flash_offset |= REF_PRISTINE;
 
-	jffs2_add_physical_node_ref(c, raw, PAD(sizeof(rr)));
+	jffs2_add_physical_node_ref(c, raw, PAD(sizeof(rr)), NULL);
+	/* FIXME */ raw->next_in_ino = (void *)ref;
 	if (ref->node)
 		delete_xattr_ref_node(c, ref);
 	ref->node = raw;
-- 
cgit v1.2.3


From 0eac940b8a087576c66ecf8e0f294f2ceb3b607b Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Mon, 22 May 2006 16:29:23 +0100
Subject: [JFFS2] Add some preemptive BUG checks for XATTR code

In a couple of places, we assume that what's at the end of the
->next_in_ino list is a struct jffs2_inode_cache. Let's check
for that, since we expect it to change soon.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/erase.c    | 1 +
 fs/jffs2/nodemgmt.c | 4 ++++
 2 files changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index 0fc19a2fb5d..4616fed7573 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -230,6 +230,7 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c,
 			   at the end of the linked list. Stash it and continue
 			   from the beginning of the list */
 			ic = (struct jffs2_inode_cache *)(*prev);
+			BUG_ON(ic->class != RAWNODE_CLASS_INODE_CACHE);
 			prev = &ic->nodes;
 			continue;
 		}
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index e10e58eab8e..34a452bdde0 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -661,6 +661,10 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 		spin_lock(&c->erase_completion_lock);
 
 		ic = jffs2_raw_ref_to_ic(ref);
+		/* It seems we should never call jffs2_mark_node_obsolete() for
+		   XATTR nodes.... yet. Make sure we notice if/when we change
+		   that :) */
+		BUG_ON(ic->class != RAWNODE_CLASS_INODE_CACHE);
 		for (p = &ic->nodes; (*p) != ref; p = &((*p)->next_in_ino))
 			;
 
-- 
cgit v1.2.3


From 987d47b71a85bd83dc40c870abee3b64f2002163 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Mon, 22 May 2006 16:32:05 +0100
Subject: [JFFS2] Put list of nodes in common part of ic/x_ref/x_datum
 structure

We'll be using a proper list of nodes in the jffs2_xattr_datum and
jffs2_xattr_ref structures, because the existing code to overwrite
them is just broken. Put it in the common part at the front of the
structure which is shared with the jffs2_inode_cache, so that the
jffs2_link_node_ref() function can do the right thing.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/nodelist.h | 44 +++++++++++++++++++++++++++-----------------
 fs/jffs2/xattr.h    | 14 +++++++-------
 2 files changed, 34 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index ee5aedcffc1..1e1c39da6da 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -77,9 +77,9 @@
 struct jffs2_raw_node_ref
 {
 	struct jffs2_raw_node_ref *next_in_ino; /* Points to the next raw_node_ref
-		for this inode. If this is the last, it points to the inode_cache
-		for this inode instead. The inode_cache will have NULL in the first
-		word so you know when you've got there :) */
+		for this object. If this _is_ the last, it points to the inode_cache,
+		xattr_ref or xattr_datum instead. The common part of those structures
+		has NULL in the first word. See jffs2_raw_ref_to_ic() below */
 	struct jffs2_raw_node_ref *next_phys;
 	uint32_t flash_offset;
 #define TEST_TOTLEN
@@ -88,6 +88,18 @@ struct jffs2_raw_node_ref
 #endif
 };
 
+static inline struct jffs2_inode_cache *jffs2_raw_ref_to_ic(struct jffs2_raw_node_ref *raw)
+{
+	while(raw->next_in_ino) {
+		raw = raw->next_in_ino;
+	}
+
+	/* NB. This can be a jffs2_xattr_datum or jffs2_xattr_ref and
+	   not actually a jffs2_inode_cache. Check ->class */
+	return ((struct jffs2_inode_cache *)raw);
+}
+
+
         /* flash_offset & 3 always has to be zero, because nodes are
 	   always aligned at 4 bytes. So we have a couple of extra bits
 	   to play with, which indicate the node's status; see below: */
@@ -113,20 +125,27 @@ struct jffs2_raw_node_ref
    a pointer to the first physical node which is part of this inode, too.
 */
 struct jffs2_inode_cache {
+	/* First part of structure is shared with other objects which
+	   can terminate the raw node refs' next_in_ino list -- which
+	   currently struct jffs2_xattr_datum and struct jffs2_xattr_ref. */
+
 	struct jffs2_full_dirent *scan_dents; /* Used during scan to hold
 		temporary lists of dirents, and later must be set to
 		NULL to mark the end of the raw_node_ref->next_in_ino
 		chain. */
-	u8 class;	/* It's used for identification */
-	u8 flags;
-	uint16_t state;
-	struct jffs2_inode_cache *next;
 	struct jffs2_raw_node_ref *nodes;
+	uint8_t class;	/* It's used for identification */
+
+	/* end of shared structure */
+
+	uint8_t flags;
+	uint16_t state;
 	uint32_t ino;
-	int nlink;
+	struct jffs2_inode_cache *next;
 #ifdef CONFIG_JFFS2_FS_XATTR
 	struct jffs2_xattr_ref *xref;
 #endif
+	int nlink;
 };
 
 /* Inode states for 'state' above. We need the 'GC' state to prevent
@@ -250,15 +269,6 @@ static inline int jffs2_encode_dev(union jffs2_device_node *jdev, dev_t rdev)
 	}
 }
 
-static inline struct jffs2_inode_cache *jffs2_raw_ref_to_ic(struct jffs2_raw_node_ref *raw)
-{
-	while(raw->next_in_ino) {
-		raw = raw->next_in_ino;
-	}
-
-	return ((struct jffs2_inode_cache *)raw);
-}
-
 static inline struct jffs2_node_frag *frag_first(struct rb_root *root)
 {
 	struct rb_node *node = root->rb_node;
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index e2aa2394ab6..2c199856c58 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -20,11 +20,11 @@
 struct jffs2_xattr_datum
 {
 	void *always_null;
-	u8 class;
-	u8 flags;
-	u16 xprefix;			/* see JFFS2_XATTR_PREFIX_* */
-
 	struct jffs2_raw_node_ref *node;
+	uint8_t class;
+	uint8_t flags;
+	uint16_t xprefix;			/* see JFFS2_XATTR_PREFIX_* */
+
 	struct list_head xindex;	/* chained from c->xattrindex[n] */
 	uint32_t refcnt;		/* # of xattr_ref refers this */
 	uint32_t xid;
@@ -42,11 +42,11 @@ struct jffs2_inode_cache;
 struct jffs2_xattr_ref
 {
 	void *always_null;
-	u8 class;
-	u8 flags;		/* Currently unused */
+	struct jffs2_raw_node_ref *node;
+	uint8_t class;
+	uint8_t flags;		/* Currently unused */
 	u16 unused;
 
-	struct jffs2_raw_node_ref *node;
 	union {
 		struct jffs2_inode_cache *ic;	/* reference to jffs2_inode_cache */
 		uint32_t ino;			/* only used in scanning/building  */
-- 
cgit v1.2.3


From 28318776a80bc3261f9af91ef79e6e38bb9f5bec Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@wh.fh-wedel.de>
Date: Mon, 22 May 2006 23:18:05 +0200
Subject: [MTD] Introduce writesize

At least two flashes exists that have the concept of a minimum write unit,
similar to NAND pages, but no other NAND characteristics.  Therefore, rename
the minimum write unit to "writesize" for all flashes, including NAND.

Signed-off-by: Joern Engel <joern@wh.fh-wedel.de>
---
 fs/jffs2/wbuf.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 62f685faeba..355226d8ce2 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1173,7 +1173,7 @@ int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
 
 	/* Initialise write buffer */
 	init_rwsem(&c->wbuf_sem);
-	c->wbuf_pagesize = c->mtd->oobblock;
+	c->wbuf_pagesize = c->mtd->writesize;
 	c->wbuf_ofs = 0xFFFFFFFF;
 
 	c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
@@ -1266,11 +1266,11 @@ void jffs2_nor_ecc_flash_cleanup(struct jffs2_sb_info *c) {
 
 int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) {
 	/* Cleanmarker currently occupies a whole programming region */
-	c->cleanmarker_size = MTD_PROGREGION_SIZE(c->mtd);
+	c->cleanmarker_size = c->mtd->writesize;
 
 	/* Initialize write buffer */
 	init_rwsem(&c->wbuf_sem);
-	c->wbuf_pagesize = MTD_PROGREGION_SIZE(c->mtd);
+	c->wbuf_pagesize = c->mtd->writesize;
 	c->wbuf_ofs = 0xFFFFFFFF;
 
 	c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
-- 
cgit v1.2.3


From c8b229de2b05c2b3e8d282ce260935a88ac030ca Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@wh.fh-wedel.de>
Date: Mon, 22 May 2006 23:18:12 +0200
Subject: [MTD] Merge STMicro NOR_ECC code with Intel Sibley code

In 2002, STMicro started producing NOR flashes with internal ECC protection
for small blocks (8 or 16 bytes).  Support for those flashes was added by me.
In 2005, Intel Sibley flashes copied this strategy and Nico added support for
those.  Merge the code for both.

Signed-off-by: Joern Engel <joern@wh.fh-wedel.de>
---
 fs/jffs2/fs.c       | 12 ------------
 fs/jffs2/os-linux.h | 10 +---------
 fs/jffs2/wbuf.c     | 36 +++---------------------------------
 3 files changed, 4 insertions(+), 54 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index a0f84673ce5..79f70251a4e 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -664,13 +664,6 @@ static int jffs2_flash_setup(struct jffs2_sb_info *c) {
 			return ret;
 	}
 
-	/* add setups for other bizarre flashes here... */
-	if (jffs2_nor_ecc(c)) {
-		ret = jffs2_nor_ecc_flash_setup(c);
-		if (ret)
-			return ret;
-	}
-
 	/* and Dataflash */
 	if (jffs2_dataflash(c)) {
 		ret = jffs2_dataflash_setup(c);
@@ -694,11 +687,6 @@ void jffs2_flash_cleanup(struct jffs2_sb_info *c) {
 		jffs2_nand_flash_cleanup(c);
 	}
 
-	/* add cleanups for other bizarre flashes here... */
-	if (jffs2_nor_ecc(c)) {
-		jffs2_nor_ecc_flash_cleanup(c);
-	}
-
 	/* and DataFlash */
 	if (jffs2_dataflash(c)) {
 		jffs2_dataflash_cleanup(c);
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index d2ad2a2081d..1a6eb955e0b 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -92,11 +92,7 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
 #define jffs2_flash_writev(a,b,c,d,e,f) jffs2_flash_direct_writev(a,b,c,d,e)
 #define jffs2_wbuf_timeout NULL
 #define jffs2_wbuf_process NULL
-#define jffs2_nor_ecc(c) (0)
 #define jffs2_dataflash(c) (0)
-#define jffs2_nor_wbuf_flash(c) (0)
-#define jffs2_nor_ecc_flash_setup(c) (0)
-#define jffs2_nor_ecc_flash_cleanup(c) do {} while (0)
 #define jffs2_dataflash_setup(c) (0)
 #define jffs2_dataflash_cleanup(c) do {} while (0)
 #define jffs2_nor_wbuf_flash_setup(c) (0)
@@ -110,7 +106,7 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
 #define jffs2_can_mark_obsolete(c) (0)
 #else
 #define jffs2_can_mark_obsolete(c) \
-  ((c->mtd->type == MTD_NORFLASH && !(c->mtd->flags & (MTD_ECC|MTD_PROGRAM_REGIONS))) || \
+  ((c->mtd->type == MTD_NORFLASH && !(c->mtd->flags & (MTD_PROGRAM_REGIONS))) || \
    c->mtd->type == MTD_RAM)
 #endif
 
@@ -135,10 +131,6 @@ int jffs2_flush_wbuf_pad(struct jffs2_sb_info *c);
 int jffs2_nand_flash_setup(struct jffs2_sb_info *c);
 void jffs2_nand_flash_cleanup(struct jffs2_sb_info *c);
 
-#define jffs2_nor_ecc(c) (c->mtd->type == MTD_NORFLASH && (c->mtd->flags & MTD_ECC))
-int jffs2_nor_ecc_flash_setup(struct jffs2_sb_info *c);
-void jffs2_nor_ecc_flash_cleanup(struct jffs2_sb_info *c);
-
 #define jffs2_dataflash(c) (c->mtd->type == MTD_DATAFLASH)
 int jffs2_dataflash_setup(struct jffs2_sb_info *c);
 void jffs2_dataflash_cleanup(struct jffs2_sb_info *c);
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 355226d8ce2..087c2e438a6 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -637,17 +637,6 @@ int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs, unsig
 		memset(c->wbuf,0xff,c->wbuf_pagesize);
 	}
 
-	/* Fixup the wbuf if we are moving to a new eraseblock.  The checks below
-	   fail for ECC'd NOR because cleanmarker == 16, so a block starts at
-	   xxx0010.  */
-	if (jffs2_nor_ecc(c)) {
-		if (((c->wbuf_ofs % c->sector_size) == 0) && !c->wbuf_len) {
-			c->wbuf_ofs = PAGE_DIV(to);
-			c->wbuf_len = PAGE_MOD(to);
-			memset(c->wbuf,0xff,c->wbuf_pagesize);
-		}
-	}
-
 	/* Sanity checks on target address.
 	   It's permitted to write at PAD(c->wbuf_len+c->wbuf_ofs),
 	   and it's permitted to write at the beginning of a new
@@ -1244,29 +1233,10 @@ void jffs2_dataflash_cleanup(struct jffs2_sb_info *c) {
 	kfree(c->wbuf);
 }
 
-int jffs2_nor_ecc_flash_setup(struct jffs2_sb_info *c) {
-	/* Cleanmarker is actually larger on the flashes */
-	c->cleanmarker_size = 16;
-
-	/* Initialize write buffer */
-	init_rwsem(&c->wbuf_sem);
-	c->wbuf_pagesize = c->mtd->eccsize;
-	c->wbuf_ofs = 0xFFFFFFFF;
-
-	c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
-	if (!c->wbuf)
-		return -ENOMEM;
-
-	return 0;
-}
-
-void jffs2_nor_ecc_flash_cleanup(struct jffs2_sb_info *c) {
-	kfree(c->wbuf);
-}
-
 int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) {
-	/* Cleanmarker currently occupies a whole programming region */
-	c->cleanmarker_size = c->mtd->writesize;
+	/* Cleanmarker currently occupies whole programming regions,
+	 * either one or 2 for 8Byte STMicro flashes. */
+	c->cleanmarker_size = max(16u, c->mtd->writesize);
 
 	/* Initialize write buffer */
 	init_rwsem(&c->wbuf_sem);
-- 
cgit v1.2.3


From 5fa433942ba4e399f7e28764c9db4ade89e91d40 Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@wh.fh-wedel.de>
Date: Mon, 22 May 2006 23:18:29 +0200
Subject: [MTD] Introduce MTD_BIT_WRITEABLE

o Add a flag MTD_BIT_WRITEABLE for devices that allow single bits to be
  cleared.
o Replace MTD_PROGRAM_REGIONS with a cleared MTD_BIT_WRITEABLE flag for
  STMicro and Intel Sibley flashes with internal ECC.  Those flashes
  disallow clearing of single bits, unlike regular NOR flashes, so the
  new flag models their behaviour better.
o Remove MTD_ECC.  After the STMicro/Sibley merge, this flag is only set
  and never checked.

Signed-off-by: Joern Engel <joern@wh.fh-wedel.de>
---
 fs/jffs2/os-linux.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 1a6eb955e0b..743c9e52152 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -105,9 +105,7 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
 #ifdef CONFIG_JFFS2_SUMMARY
 #define jffs2_can_mark_obsolete(c) (0)
 #else
-#define jffs2_can_mark_obsolete(c) \
-  ((c->mtd->type == MTD_NORFLASH && !(c->mtd->flags & (MTD_PROGRAM_REGIONS))) || \
-   c->mtd->type == MTD_RAM)
+#define jffs2_can_mark_obsolete(c) (c->mtd->flags & (MTD_BIT_WRITEABLE))
 #endif
 
 #define jffs2_cleanmarker_oob(c) (c->mtd->type == MTD_NANDFLASH)
@@ -135,7 +133,7 @@ void jffs2_nand_flash_cleanup(struct jffs2_sb_info *c);
 int jffs2_dataflash_setup(struct jffs2_sb_info *c);
 void jffs2_dataflash_cleanup(struct jffs2_sb_info *c);
 
-#define jffs2_nor_wbuf_flash(c) (c->mtd->type == MTD_NORFLASH && (c->mtd->flags & MTD_PROGRAM_REGIONS))
+#define jffs2_nor_wbuf_flash(c) (c->mtd->type == MTD_NORFLASH && ! (c->mtd->flags & MTD_BIT_WRITEABLE))
 int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c);
 void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c);
 
-- 
cgit v1.2.3


From 9fe4854cd1f60273f9a3ece053f4789605f58a5e Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 23 May 2006 00:38:06 +0100
Subject: [JFFS2] Remove flash offset argument from various functions.

We don't need the upper layers to deal with the physical offset. It's
_always_ c->nextblock->offset + c->sector_size - c->nextblock->free_size
so we might as well just let the actual write functions deal with that.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/dir.c      | 40 +++++++++++++++++-----------------
 fs/jffs2/file.c     |  8 +++----
 fs/jffs2/fs.c       |  8 +++----
 fs/jffs2/gc.c       | 34 ++++++++++++++---------------
 fs/jffs2/nodelist.h | 23 ++++++++++++++------
 fs/jffs2/nodemgmt.c | 26 +++++++++++-----------
 fs/jffs2/wbuf.c     |  4 +++-
 fs/jffs2/write.c    | 62 ++++++++++++++++++++++++++++++-----------------------
 fs/jffs2/xattr.c    | 45 +++++++++++++++++++-------------------
 9 files changed, 135 insertions(+), 115 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index ff1b7950dd4..edd8371fc6a 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -308,7 +308,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 	struct jffs2_full_dnode *fn;
 	struct jffs2_full_dirent *fd;
 	int namelen;
-	uint32_t alloclen, phys_ofs;
+	uint32_t alloclen;
 	int ret, targetlen = strlen(target);
 
 	/* FIXME: If you care. We'd need to use frags for the target
@@ -327,8 +327,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 	 * Just the node will do for now, though
 	 */
 	namelen = dentry->d_name.len;
-	ret = jffs2_reserve_space(c, sizeof(*ri) + targetlen, &phys_ofs, &alloclen,
-				ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
+	ret = jffs2_reserve_space(c, sizeof(*ri) + targetlen, &alloclen,
+				  ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
 
 	if (ret) {
 		jffs2_free_raw_inode(ri);
@@ -356,7 +356,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 	ri->data_crc = cpu_to_je32(crc32(0, target, targetlen));
 	ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8));
 
-	fn = jffs2_write_dnode(c, f, ri, target, targetlen, phys_ofs, ALLOC_NORMAL);
+	fn = jffs2_write_dnode(c, f, ri, target, targetlen, ALLOC_NORMAL);
 
 	jffs2_free_raw_inode(ri);
 
@@ -400,8 +400,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 		return ret;
 	}
 
-	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &phys_ofs, &alloclen,
-				ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
+	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
+				  ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
 	if (ret) {
 		/* Eep. */
 		jffs2_clear_inode(inode);
@@ -433,7 +433,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 	rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
 	rd->name_crc = cpu_to_je32(crc32(0, dentry->d_name.name, namelen));
 
-	fd = jffs2_write_dirent(c, dir_f, rd, dentry->d_name.name, namelen, phys_ofs, ALLOC_NORMAL);
+	fd = jffs2_write_dirent(c, dir_f, rd, dentry->d_name.name, namelen, ALLOC_NORMAL);
 
 	if (IS_ERR(fd)) {
 		/* dirent failed to write. Delete the inode normally
@@ -471,7 +471,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
 	struct jffs2_full_dnode *fn;
 	struct jffs2_full_dirent *fd;
 	int namelen;
-	uint32_t alloclen, phys_ofs;
+	uint32_t alloclen;
 	int ret;
 
 	mode |= S_IFDIR;
@@ -486,8 +486,8 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
 	 * Just the node will do for now, though
 	 */
 	namelen = dentry->d_name.len;
-	ret = jffs2_reserve_space(c, sizeof(*ri), &phys_ofs, &alloclen, ALLOC_NORMAL,
-				JFFS2_SUMMARY_INODE_SIZE);
+	ret = jffs2_reserve_space(c, sizeof(*ri), &alloclen, ALLOC_NORMAL,
+				  JFFS2_SUMMARY_INODE_SIZE);
 
 	if (ret) {
 		jffs2_free_raw_inode(ri);
@@ -512,7 +512,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
 	ri->data_crc = cpu_to_je32(0);
 	ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8));
 
-	fn = jffs2_write_dnode(c, f, ri, NULL, 0, phys_ofs, ALLOC_NORMAL);
+	fn = jffs2_write_dnode(c, f, ri, NULL, 0, ALLOC_NORMAL);
 
 	jffs2_free_raw_inode(ri);
 
@@ -542,8 +542,8 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
 		return ret;
 	}
 
-	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &phys_ofs, &alloclen,
-				ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
+	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
+				  ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
 	if (ret) {
 		/* Eep. */
 		jffs2_clear_inode(inode);
@@ -575,7 +575,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
 	rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
 	rd->name_crc = cpu_to_je32(crc32(0, dentry->d_name.name, namelen));
 
-	fd = jffs2_write_dirent(c, dir_f, rd, dentry->d_name.name, namelen, phys_ofs, ALLOC_NORMAL);
+	fd = jffs2_write_dirent(c, dir_f, rd, dentry->d_name.name, namelen, ALLOC_NORMAL);
 
 	if (IS_ERR(fd)) {
 		/* dirent failed to write. Delete the inode normally
@@ -631,7 +631,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
 	int namelen;
 	union jffs2_device_node dev;
 	int devlen = 0;
-	uint32_t alloclen, phys_ofs;
+	uint32_t alloclen;
 	int ret;
 
 	if (!new_valid_dev(rdev))
@@ -650,7 +650,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
 	 * Just the node will do for now, though
 	 */
 	namelen = dentry->d_name.len;
-	ret = jffs2_reserve_space(c, sizeof(*ri) + devlen, &phys_ofs, &alloclen,
+	ret = jffs2_reserve_space(c, sizeof(*ri) + devlen, &alloclen,
 				  ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
 
 	if (ret) {
@@ -678,7 +678,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
 	ri->data_crc = cpu_to_je32(crc32(0, &dev, devlen));
 	ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8));
 
-	fn = jffs2_write_dnode(c, f, ri, (char *)&dev, devlen, phys_ofs, ALLOC_NORMAL);
+	fn = jffs2_write_dnode(c, f, ri, (char *)&dev, devlen, ALLOC_NORMAL);
 
 	jffs2_free_raw_inode(ri);
 
@@ -708,8 +708,8 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
 		return ret;
 	}
 
-	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &phys_ofs, &alloclen,
-				ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
+	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
+				  ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
 	if (ret) {
 		/* Eep. */
 		jffs2_clear_inode(inode);
@@ -744,7 +744,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
 	rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
 	rd->name_crc = cpu_to_je32(crc32(0, dentry->d_name.name, namelen));
 
-	fd = jffs2_write_dirent(c, dir_f, rd, dentry->d_name.name, namelen, phys_ofs, ALLOC_NORMAL);
+	fd = jffs2_write_dirent(c, dir_f, rd, dentry->d_name.name, namelen, ALLOC_NORMAL);
 
 	if (IS_ERR(fd)) {
 		/* dirent failed to write. Delete the inode normally
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index e18c9437d58..bb8844f40e4 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -134,13 +134,13 @@ static int jffs2_prepare_write (struct file *filp, struct page *pg,
 		struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
 		struct jffs2_raw_inode ri;
 		struct jffs2_full_dnode *fn;
-		uint32_t phys_ofs, alloc_len;
+		uint32_t alloc_len;
 
 		D1(printk(KERN_DEBUG "Writing new hole frag 0x%x-0x%x between current EOF and new page\n",
 			  (unsigned int)inode->i_size, pageofs));
 
-		ret = jffs2_reserve_space(c, sizeof(ri), &phys_ofs, &alloc_len,
-					ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
+		ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len,
+					  ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
 		if (ret)
 			return ret;
 
@@ -166,7 +166,7 @@ static int jffs2_prepare_write (struct file *filp, struct page *pg,
 		ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8));
 		ri.data_crc = cpu_to_je32(0);
 
-		fn = jffs2_write_dnode(c, f, &ri, NULL, 0, phys_ofs, ALLOC_NORMAL);
+		fn = jffs2_write_dnode(c, f, &ri, NULL, 0, ALLOC_NORMAL);
 
 		if (IS_ERR(fn)) {
 			ret = PTR_ERR(fn);
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 79f70251a4e..7b6c24b14f8 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -37,7 +37,7 @@ static int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
 	unsigned char *mdata = NULL;
 	int mdatalen = 0;
 	unsigned int ivalid;
-	uint32_t phys_ofs, alloclen;
+	uint32_t alloclen;
 	int ret;
 	D1(printk(KERN_DEBUG "jffs2_setattr(): ino #%lu\n", inode->i_ino));
 	ret = inode_change_ok(inode, iattr);
@@ -79,8 +79,8 @@ static int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
 		return -ENOMEM;
 	}
 
-	ret = jffs2_reserve_space(c, sizeof(*ri) + mdatalen, &phys_ofs, &alloclen,
-				ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
+	ret = jffs2_reserve_space(c, sizeof(*ri) + mdatalen, &alloclen,
+				  ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
 	if (ret) {
 		jffs2_free_raw_inode(ri);
 		if (S_ISLNK(inode->i_mode & S_IFMT))
@@ -131,7 +131,7 @@ static int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
 	else
 		ri->data_crc = cpu_to_je32(0);
 
-	new_metadata = jffs2_write_dnode(c, f, ri, mdata, mdatalen, phys_ofs, ALLOC_NORMAL);
+	new_metadata = jffs2_write_dnode(c, f, ri, mdata, mdatalen, ALLOC_NORMAL);
 	if (S_ISLNK(inode->i_mode))
 		kfree(mdata);
 
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 153755bc1d5..f9e982a65ac 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -545,7 +545,7 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
 	if (ic && alloclen > sizeof(struct jffs2_raw_inode) + JFFS2_MIN_DATA_LEN)
 		alloclen = sizeof(struct jffs2_raw_inode) + JFFS2_MIN_DATA_LEN;
 
-	ret = jffs2_reserve_space_gc(c, alloclen, &phys_ofs, &alloclen, rawlen);
+	ret = jffs2_reserve_space_gc(c, alloclen, &alloclen, rawlen);
 	/* 'rawlen' is not the exact summary size; it is only an upper estimation */
 
 	if (ret)
@@ -626,13 +626,13 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
 
 	/* OK, all the CRCs are good; this node can just be copied as-is. */
  retry:
-	nraw->flash_offset = phys_ofs;
+	nraw->flash_offset = phys_ofs = write_ofs(c);
 
 	ret = jffs2_flash_write(c, phys_ofs, rawlen, &retlen, (char *)node);
 
 	if (ret || (retlen != rawlen)) {
 		printk(KERN_NOTICE "Write of %d bytes at 0x%08x failed. returned %d, retlen %zd\n",
-                       rawlen, phys_ofs, ret, retlen);
+                       rawlen, nraw->flash_offset, ret, retlen);
 		if (retlen) {
 			nraw->flash_offset |= REF_OBSOLETE;
 			jffs2_add_physical_node_ref(c, nraw, rawlen, NULL);
@@ -653,7 +653,7 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
 			jffs2_dbg_acct_sanity_check(c,jeb);
 			jffs2_dbg_acct_paranoia_check(c, jeb);
 
-			ret = jffs2_reserve_space_gc(c, rawlen, &phys_ofs, &dummy, rawlen);
+			ret = jffs2_reserve_space_gc(c, rawlen, &dummy, rawlen);
 						/* this is not the exact summary size of it,
 							it is only an upper estimation */
 
@@ -696,7 +696,7 @@ static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_
 	struct jffs2_node_frag *last_frag;
 	union jffs2_device_node dev;
 	char *mdata = NULL, mdatalen = 0;
-	uint32_t alloclen, phys_ofs, ilen;
+	uint32_t alloclen, ilen;
 	int ret;
 
 	if (S_ISBLK(JFFS2_F_I_MODE(f)) ||
@@ -722,7 +722,7 @@ static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_
 
 	}
 
-	ret = jffs2_reserve_space_gc(c, sizeof(ri) + mdatalen, &phys_ofs, &alloclen,
+	ret = jffs2_reserve_space_gc(c, sizeof(ri) + mdatalen, &alloclen,
 				JFFS2_SUMMARY_INODE_SIZE);
 	if (ret) {
 		printk(KERN_WARNING "jffs2_reserve_space_gc of %zd bytes for garbage_collect_metadata failed: %d\n",
@@ -760,7 +760,7 @@ static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_
 	ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8));
 	ri.data_crc = cpu_to_je32(crc32(0, mdata, mdatalen));
 
-	new_fn = jffs2_write_dnode(c, f, &ri, mdata, mdatalen, phys_ofs, ALLOC_GC);
+	new_fn = jffs2_write_dnode(c, f, &ri, mdata, mdatalen, ALLOC_GC);
 
 	if (IS_ERR(new_fn)) {
 		printk(KERN_WARNING "Error writing new dnode: %ld\n", PTR_ERR(new_fn));
@@ -781,7 +781,7 @@ static int jffs2_garbage_collect_dirent(struct jffs2_sb_info *c, struct jffs2_er
 {
 	struct jffs2_full_dirent *new_fd;
 	struct jffs2_raw_dirent rd;
-	uint32_t alloclen, phys_ofs;
+	uint32_t alloclen;
 	int ret;
 
 	rd.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
@@ -803,14 +803,14 @@ static int jffs2_garbage_collect_dirent(struct jffs2_sb_info *c, struct jffs2_er
 	rd.node_crc = cpu_to_je32(crc32(0, &rd, sizeof(rd)-8));
 	rd.name_crc = cpu_to_je32(crc32(0, fd->name, rd.nsize));
 
-	ret = jffs2_reserve_space_gc(c, sizeof(rd)+rd.nsize, &phys_ofs, &alloclen,
+	ret = jffs2_reserve_space_gc(c, sizeof(rd)+rd.nsize, &alloclen,
 				JFFS2_SUMMARY_DIRENT_SIZE(rd.nsize));
 	if (ret) {
 		printk(KERN_WARNING "jffs2_reserve_space_gc of %zd bytes for garbage_collect_dirent failed: %d\n",
 		       sizeof(rd)+rd.nsize, ret);
 		return ret;
 	}
-	new_fd = jffs2_write_dirent(c, f, &rd, fd->name, rd.nsize, phys_ofs, ALLOC_GC);
+	new_fd = jffs2_write_dirent(c, f, &rd, fd->name, rd.nsize, ALLOC_GC);
 
 	if (IS_ERR(new_fd)) {
 		printk(KERN_WARNING "jffs2_write_dirent in garbage_collect_dirent failed: %ld\n", PTR_ERR(new_fd));
@@ -938,7 +938,7 @@ static int jffs2_garbage_collect_hole(struct jffs2_sb_info *c, struct jffs2_eras
 	struct jffs2_raw_inode ri;
 	struct jffs2_node_frag *frag;
 	struct jffs2_full_dnode *new_fn;
-	uint32_t alloclen, phys_ofs, ilen;
+	uint32_t alloclen, ilen;
 	int ret;
 
 	D1(printk(KERN_DEBUG "Writing replacement hole node for ino #%u from offset 0x%x to 0x%x\n",
@@ -1017,14 +1017,14 @@ static int jffs2_garbage_collect_hole(struct jffs2_sb_info *c, struct jffs2_eras
 	ri.data_crc = cpu_to_je32(0);
 	ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8));
 
-	ret = jffs2_reserve_space_gc(c, sizeof(ri), &phys_ofs, &alloclen,
-				JFFS2_SUMMARY_INODE_SIZE);
+	ret = jffs2_reserve_space_gc(c, sizeof(ri), &alloclen,
+				     JFFS2_SUMMARY_INODE_SIZE);
 	if (ret) {
 		printk(KERN_WARNING "jffs2_reserve_space_gc of %zd bytes for garbage_collect_hole failed: %d\n",
 		       sizeof(ri), ret);
 		return ret;
 	}
-	new_fn = jffs2_write_dnode(c, f, &ri, NULL, 0, phys_ofs, ALLOC_GC);
+	new_fn = jffs2_write_dnode(c, f, &ri, NULL, 0, ALLOC_GC);
 
 	if (IS_ERR(new_fn)) {
 		printk(KERN_WARNING "Error writing new hole node: %ld\n", PTR_ERR(new_fn));
@@ -1086,7 +1086,7 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
 {
 	struct jffs2_full_dnode *new_fn;
 	struct jffs2_raw_inode ri;
-	uint32_t alloclen, phys_ofs, offset, orig_end, orig_start;
+	uint32_t alloclen, offset, orig_end, orig_start;
 	int ret = 0;
 	unsigned char *comprbuf = NULL, *writebuf;
 	unsigned long pg;
@@ -1243,7 +1243,7 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
 		uint32_t cdatalen;
 		uint16_t comprtype = JFFS2_COMPR_NONE;
 
-		ret = jffs2_reserve_space_gc(c, sizeof(ri) + JFFS2_MIN_DATA_LEN, &phys_ofs,
+		ret = jffs2_reserve_space_gc(c, sizeof(ri) + JFFS2_MIN_DATA_LEN,
 					&alloclen, JFFS2_SUMMARY_INODE_SIZE);
 
 		if (ret) {
@@ -1280,7 +1280,7 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
 		ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8));
 		ri.data_crc = cpu_to_je32(crc32(0, comprbuf, cdatalen));
 
-		new_fn = jffs2_write_dnode(c, f, &ri, comprbuf, cdatalen, phys_ofs, ALLOC_GC);
+		new_fn = jffs2_write_dnode(c, f, &ri, comprbuf, cdatalen, ALLOC_GC);
 
 		jffs2_free_comprbuf(comprbuf, writebuf);
 
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 1e1c39da6da..76f1b9419ee 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -167,6 +167,8 @@ struct jffs2_inode_cache {
 
 #define INOCACHE_HASHSIZE 128
 
+#define write_ofs(c) ((c)->nextblock->offset + (c)->sector_size - (c)->nextblock->free_size)
+
 /*
   Larger representation of a raw node, kept in-core only when the
   struct inode for this particular ino is instantiated.
@@ -325,9 +327,9 @@ extern uint32_t __jffs2_ref_totlen(struct jffs2_sb_info *c,
 
 /* nodemgmt.c */
 int jffs2_thread_should_wake(struct jffs2_sb_info *c);
-int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *ofs,
+int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
 			uint32_t *len, int prio, uint32_t sumsize);
-int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *ofs,
+int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize,
 			uint32_t *len, uint32_t sumsize);
 int jffs2_add_physical_node_ref(struct jffs2_sb_info *c, 
 				struct jffs2_raw_node_ref *new,
@@ -339,14 +341,21 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 /* write.c */
 int jffs2_do_new_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, uint32_t mode, struct jffs2_raw_inode *ri);
 
-struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_raw_inode *ri, const unsigned char *data, uint32_t datalen, uint32_t flash_ofs, int alloc_mode);
-struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_raw_dirent *rd, const unsigned char *name, uint32_t namelen, uint32_t flash_ofs, int alloc_mode);
+struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
+					   struct jffs2_raw_inode *ri, const unsigned char *data,
+					   uint32_t datalen, int alloc_mode);
+struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
+					     struct jffs2_raw_dirent *rd, const unsigned char *name,
+					     uint32_t namelen, int alloc_mode);
 int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
 			    struct jffs2_raw_inode *ri, unsigned char *buf,
 			    uint32_t offset, uint32_t writelen, uint32_t *retlen);
-int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, struct jffs2_inode_info *f, struct jffs2_raw_inode *ri, const char *name, int namelen);
-int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, const char *name, int namelen, struct jffs2_inode_info *dead_f, uint32_t time);
-int jffs2_do_link (struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint32_t ino, uint8_t type, const char *name, int namelen, uint32_t time);
+int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, struct jffs2_inode_info *f,
+		    struct jffs2_raw_inode *ri, const char *name, int namelen);
+int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, const char *name,
+		    int namelen, struct jffs2_inode_info *dead_f, uint32_t time);
+int jffs2_do_link(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint32_t ino,
+		   uint8_t type, const char *name, int namelen, uint32_t time);
 
 
 /* readinode.c */
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 34a452bdde0..8feb8749bc7 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -23,13 +23,12 @@
  *	jffs2_reserve_space - request physical space to write nodes to flash
  *	@c: superblock info
  *	@minsize: Minimum acceptable size of allocation
- *	@ofs: Returned value of node offset
  *	@len: Returned value of allocation length
  *	@prio: Allocation type - ALLOC_{NORMAL,DELETION}
  *
  *	Requests a block of physical space on the flash. Returns zero for success
- *	and puts 'ofs' and 'len' into the appriopriate place, or returns -ENOSPC
- *	or other error if appropriate.
+ *	and puts 'len' into the appropriate place, or returns -ENOSPC or other 
+ *	error if appropriate. Doesn't return len since that's 
  *
  *	If it returns zero, jffs2_reserve_space() also downs the per-filesystem
  *	allocation semaphore, to prevent more than one allocation from being
@@ -40,9 +39,9 @@
  */
 
 static int jffs2_do_reserve_space(struct jffs2_sb_info *c,  uint32_t minsize,
-					uint32_t *ofs, uint32_t *len, uint32_t sumsize);
+				  uint32_t *len, uint32_t sumsize);
 
-int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *ofs,
+int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
 			uint32_t *len, int prio, uint32_t sumsize)
 {
 	int ret = -EAGAIN;
@@ -132,7 +131,7 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *ofs
 			spin_lock(&c->erase_completion_lock);
 		}
 
-		ret = jffs2_do_reserve_space(c, minsize, ofs, len, sumsize);
+		ret = jffs2_do_reserve_space(c, minsize, len, sumsize);
 		if (ret) {
 			D1(printk(KERN_DEBUG "jffs2_reserve_space: ret is %d\n", ret));
 		}
@@ -143,8 +142,8 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *ofs
 	return ret;
 }
 
-int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *ofs,
-			uint32_t *len, uint32_t sumsize)
+int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize,
+			   uint32_t *len, uint32_t sumsize)
 {
 	int ret = -EAGAIN;
 	minsize = PAD(minsize);
@@ -153,7 +152,7 @@ int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *
 
 	spin_lock(&c->erase_completion_lock);
 	while(ret == -EAGAIN) {
-		ret = jffs2_do_reserve_space(c, minsize, ofs, len, sumsize);
+		ret = jffs2_do_reserve_space(c, minsize, len, sumsize);
 		if (ret) {
 		        D1(printk(KERN_DEBUG "jffs2_reserve_space_gc: looping, ret is %d\n", ret));
 		}
@@ -259,10 +258,11 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c)
 }
 
 /* Called with alloc sem _and_ erase_completion_lock */
-static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *ofs, uint32_t *len, uint32_t sumsize)
+static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
+				  uint32_t *len, uint32_t sumsize)
 {
 	struct jffs2_eraseblock *jeb = c->nextblock;
-	uint32_t reserved_size; 			/* for summary information at the end of the jeb */
+	uint32_t reserved_size;				/* for summary information at the end of the jeb */
 	int ret;
 
  restart:
@@ -349,7 +349,6 @@ static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uin
 	}
 	/* OK, jeb (==c->nextblock) is now pointing at a block which definitely has
 	   enough space */
-	*ofs = jeb->offset + (c->sector_size - jeb->free_size);
 	*len = jeb->free_size - reserved_size;
 
 	if (c->cleanmarker_size && jeb->used_size == c->cleanmarker_size &&
@@ -365,7 +364,8 @@ static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uin
 		spin_lock(&c->erase_completion_lock);
 	}
 
-	D1(printk(KERN_DEBUG "jffs2_do_reserve_space(): Giving 0x%x bytes at 0x%x\n", *len, *ofs));
+	D1(printk(KERN_DEBUG "jffs2_do_reserve_space(): Giving 0x%x bytes at 0x%x\n",
+		  *len, jeb->offset + (c->sector_size - jeb->free_size)));
 	return 0;
 }
 
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 087c2e438a6..c7e3040240b 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -265,12 +265,14 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
 
 
 	/* ... and get an allocation of space from a shiny new block instead */
-	ret = jffs2_reserve_space_gc(c, end-start, &ofs, &len, JFFS2_SUMMARY_NOSUM_SIZE);
+	ret = jffs2_reserve_space_gc(c, end-start, &len, JFFS2_SUMMARY_NOSUM_SIZE);
 	if (ret) {
 		printk(KERN_WARNING "Failed to allocate space for wbuf recovery. Data loss ensues.\n");
 		kfree(buf);
 		return;
 	}
+	ofs = write_ofs(c);
+
 	if (end-start >= c->wbuf_pagesize) {
 		/* Need to do another write immediately, but it's possible
 		   that this is just because the wbuf itself is completely
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index 319a70f531f..0e12b7561b7 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -56,12 +56,15 @@ int jffs2_do_new_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, uint
 /* jffs2_write_dnode - given a raw_inode, allocate a full_dnode for it,
    write it to the flash, link it into the existing inode/fragment list */
 
-struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_raw_inode *ri, const unsigned char *data, uint32_t datalen, uint32_t flash_ofs, int alloc_mode)
+struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
+					   struct jffs2_raw_inode *ri, const unsigned char *data,
+					   uint32_t datalen, int alloc_mode)
 
 {
 	struct jffs2_raw_node_ref *raw;
 	struct jffs2_full_dnode *fn;
 	size_t retlen;
+	uint32_t flash_ofs;
 	struct kvec vecs[2];
 	int ret;
 	int retried = 0;
@@ -77,8 +80,6 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
 	vecs[1].iov_base = (unsigned char *)data;
 	vecs[1].iov_len = datalen;
 
-	jffs2_dbg_prewrite_paranoia_check(c, flash_ofs, vecs[0].iov_len + vecs[1].iov_len);
-
 	if (je32_to_cpu(ri->totlen) != sizeof(*ri) + datalen) {
 		printk(KERN_WARNING "jffs2_write_dnode: ri->totlen (0x%08x) != sizeof(*ri) (0x%08zx) + datalen (0x%08x)\n", je32_to_cpu(ri->totlen), sizeof(*ri), datalen);
 	}
@@ -102,7 +103,9 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
  retry:
 	fn->raw = raw;
 
-	raw->flash_offset = flash_ofs;
+	raw->flash_offset = flash_ofs = write_ofs(c);
+
+	jffs2_dbg_prewrite_paranoia_check(c, flash_ofs, vecs[0].iov_len + vecs[1].iov_len);
 
 	if ((alloc_mode!=ALLOC_GC) && (je32_to_cpu(ri->version) < f->highest_version)) {
 		BUG_ON(!retried);
@@ -147,19 +150,20 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
 			jffs2_dbg_acct_paranoia_check(c, jeb);
 
 			if (alloc_mode == ALLOC_GC) {
-				ret = jffs2_reserve_space_gc(c, sizeof(*ri) + datalen, &flash_ofs,
-							&dummy, JFFS2_SUMMARY_INODE_SIZE);
+				ret = jffs2_reserve_space_gc(c, sizeof(*ri) + datalen, &dummy,
+							     JFFS2_SUMMARY_INODE_SIZE);
 			} else {
 				/* Locking pain */
 				up(&f->sem);
 				jffs2_complete_reservation(c);
 
-				ret = jffs2_reserve_space(c, sizeof(*ri) + datalen, &flash_ofs,
-							&dummy, alloc_mode, JFFS2_SUMMARY_INODE_SIZE);
+				ret = jffs2_reserve_space(c, sizeof(*ri) + datalen, &dummy,
+							  alloc_mode, JFFS2_SUMMARY_INODE_SIZE);
 				down(&f->sem);
 			}
 
 			if (!ret) {
+				flash_ofs = write_ofs(c);
 				D1(printk(KERN_DEBUG "Allocated space at 0x%08x to retry failed write.\n", flash_ofs));
 
 				jffs2_dbg_acct_sanity_check(c,jeb);
@@ -200,12 +204,15 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
 	return fn;
 }
 
-struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_raw_dirent *rd, const unsigned char *name, uint32_t namelen, uint32_t flash_ofs, int alloc_mode)
+struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
+					     struct jffs2_raw_dirent *rd, const unsigned char *name,
+					     uint32_t namelen, int alloc_mode)
 {
 	struct jffs2_raw_node_ref *raw;
 	struct jffs2_full_dirent *fd;
 	size_t retlen;
 	struct kvec vecs[2];
+	uint32_t flash_ofs = write_ofs(c);
 	int retried = 0;
 	int ret;
 
@@ -286,19 +293,20 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff
 			jffs2_dbg_acct_paranoia_check(c, jeb);
 
 			if (alloc_mode == ALLOC_GC) {
-				ret = jffs2_reserve_space_gc(c, sizeof(*rd) + namelen, &flash_ofs,
-							&dummy, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
+				ret = jffs2_reserve_space_gc(c, sizeof(*rd) + namelen, &dummy,
+							     JFFS2_SUMMARY_DIRENT_SIZE(namelen));
 			} else {
 				/* Locking pain */
 				up(&f->sem);
 				jffs2_complete_reservation(c);
 
-				ret = jffs2_reserve_space(c, sizeof(*rd) + namelen, &flash_ofs,
-							&dummy, alloc_mode, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
+				ret = jffs2_reserve_space(c, sizeof(*rd) + namelen, &dummy,
+							  alloc_mode, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
 				down(&f->sem);
 			}
 
 			if (!ret) {
+				flash_ofs = write_ofs(c);
 				D1(printk(KERN_DEBUG "Allocated space at 0x%08x to retry failed write.\n", flash_ofs));
 				jffs2_dbg_acct_sanity_check(c,jeb);
 				jffs2_dbg_acct_paranoia_check(c, jeb);
@@ -339,14 +347,14 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
 		struct jffs2_full_dnode *fn;
 		unsigned char *comprbuf = NULL;
 		uint16_t comprtype = JFFS2_COMPR_NONE;
-		uint32_t phys_ofs, alloclen;
+		uint32_t alloclen;
 		uint32_t datalen, cdatalen;
 		int retried = 0;
 
 	retry:
 		D2(printk(KERN_DEBUG "jffs2_commit_write() loop: 0x%x to write to 0x%x\n", writelen, offset));
 
-		ret = jffs2_reserve_space(c, sizeof(*ri) + JFFS2_MIN_DATA_LEN, &phys_ofs,
+		ret = jffs2_reserve_space(c, sizeof(*ri) + JFFS2_MIN_DATA_LEN,
 					&alloclen, ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
 		if (ret) {
 			D1(printk(KERN_DEBUG "jffs2_reserve_space returned %d\n", ret));
@@ -374,7 +382,7 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
 		ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8));
 		ri->data_crc = cpu_to_je32(crc32(0, comprbuf, cdatalen));
 
-		fn = jffs2_write_dnode(c, f, ri, comprbuf, cdatalen, phys_ofs, ALLOC_NORETRY);
+		fn = jffs2_write_dnode(c, f, ri, comprbuf, cdatalen, ALLOC_NORETRY);
 
 		jffs2_free_comprbuf(comprbuf, buf);
 
@@ -428,13 +436,13 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str
 	struct jffs2_raw_dirent *rd;
 	struct jffs2_full_dnode *fn;
 	struct jffs2_full_dirent *fd;
-	uint32_t alloclen, phys_ofs;
+	uint32_t alloclen;
 	int ret;
 
 	/* Try to reserve enough space for both node and dirent.
 	 * Just the node will do for now, though
 	 */
-	ret = jffs2_reserve_space(c, sizeof(*ri), &phys_ofs, &alloclen, ALLOC_NORMAL,
+	ret = jffs2_reserve_space(c, sizeof(*ri), &alloclen, ALLOC_NORMAL,
 				JFFS2_SUMMARY_INODE_SIZE);
 	D1(printk(KERN_DEBUG "jffs2_do_create(): reserved 0x%x bytes\n", alloclen));
 	if (ret) {
@@ -445,7 +453,7 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str
 	ri->data_crc = cpu_to_je32(0);
 	ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8));
 
-	fn = jffs2_write_dnode(c, f, ri, NULL, 0, phys_ofs, ALLOC_NORMAL);
+	fn = jffs2_write_dnode(c, f, ri, NULL, 0, ALLOC_NORMAL);
 
 	D1(printk(KERN_DEBUG "jffs2_do_create created file with mode 0x%x\n",
 		  jemode_to_cpu(ri->mode)));
@@ -464,7 +472,7 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str
 
 	up(&f->sem);
 	jffs2_complete_reservation(c);
-	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &phys_ofs, &alloclen,
+	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
 				ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
 
 	if (ret) {
@@ -496,7 +504,7 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str
 	rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
 	rd->name_crc = cpu_to_je32(crc32(0, name, namelen));
 
-	fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, phys_ofs, ALLOC_NORMAL);
+	fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, ALLOC_NORMAL);
 
 	jffs2_free_raw_dirent(rd);
 
@@ -525,7 +533,7 @@ int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f,
 {
 	struct jffs2_raw_dirent *rd;
 	struct jffs2_full_dirent *fd;
-	uint32_t alloclen, phys_ofs;
+	uint32_t alloclen;
 	int ret;
 
 	if (1 /* alternative branch needs testing */ ||
@@ -536,7 +544,7 @@ int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f,
 		if (!rd)
 			return -ENOMEM;
 
-		ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &phys_ofs, &alloclen,
+		ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
 					ALLOC_DELETION, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
 		if (ret) {
 			jffs2_free_raw_dirent(rd);
@@ -560,7 +568,7 @@ int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f,
 		rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
 		rd->name_crc = cpu_to_je32(crc32(0, name, namelen));
 
-		fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, phys_ofs, ALLOC_DELETION);
+		fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, ALLOC_DELETION);
 
 		jffs2_free_raw_dirent(rd);
 
@@ -639,14 +647,14 @@ int jffs2_do_link (struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint
 {
 	struct jffs2_raw_dirent *rd;
 	struct jffs2_full_dirent *fd;
-	uint32_t alloclen, phys_ofs;
+	uint32_t alloclen;
 	int ret;
 
 	rd = jffs2_alloc_raw_dirent();
 	if (!rd)
 		return -ENOMEM;
 
-	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &phys_ofs, &alloclen,
+	ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
 				ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
 	if (ret) {
 		jffs2_free_raw_dirent(rd);
@@ -672,7 +680,7 @@ int jffs2_do_link (struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint
 	rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
 	rd->name_crc = cpu_to_je32(crc32(0, name, namelen));
 
-	fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, phys_ofs, ALLOC_NORMAL);
+	fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, ALLOC_NORMAL);
 
 	jffs2_free_raw_dirent(rd);
 
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 76d16614038..008f91b1c17 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -49,9 +49,9 @@
  *   is used to be as a wrapper of do_verify_xattr_datum() and do_load_xattr_datum().
  *   If xd need to call do_verify_xattr_datum() at first, it's called before calling
  *   do_load_xattr_datum(). The meanings of return value is same as do_verify_xattr_datum().
- * save_xattr_datum(c, xd, phys_ofs)
+ * save_xattr_datum(c, xd)
  *   is used to write xdatum to medium. xd->version will be incremented.
- * create_xattr_datum(c, xprefix, xname, xvalue, xsize, phys_ofs)
+ * create_xattr_datum(c, xprefix, xname, xvalue, xsize)
  *   is used to create new xdatum and write to medium.
  * -------------------------------------------------- */
 
@@ -301,7 +301,7 @@ static int load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x
 	return rc;
 }
 
-static int save_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd, uint32_t phys_ofs)
+static int save_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
 {
 	/* must be called under down_write(xattr_sem) */
 	struct jffs2_raw_xattr rx;
@@ -309,6 +309,7 @@ static int save_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x
 	struct kvec vecs[2];
 	uint32_t length;
 	int rc, totlen;
+	uint32_t phys_ofs = write_ofs(c);
 
 	BUG_ON(!xd->xname);
 
@@ -369,8 +370,7 @@ static int save_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x
 
 static struct jffs2_xattr_datum *create_xattr_datum(struct jffs2_sb_info *c,
 						    int xprefix, const char *xname,
-						    const char *xvalue, int xsize,
-						    uint32_t phys_ofs)
+						    const char *xvalue, int xsize)
 {
 	/* must be called under down_write(xattr_sem) */
 	struct jffs2_xattr_datum *xd;
@@ -419,7 +419,7 @@ static struct jffs2_xattr_datum *create_xattr_datum(struct jffs2_sb_info *c,
 	xd->value_len = xsize;
 	xd->data_crc = crc32(0, data, xd->name_len + 1 + xd->value_len);
 
-	rc = save_xattr_datum(c, xd, phys_ofs);
+	rc = save_xattr_datum(c, xd);
 	if (rc) {
 		kfree(xd->xname);
 		jffs2_free_xattr_datum(xd);
@@ -446,9 +446,9 @@ static struct jffs2_xattr_datum *create_xattr_datum(struct jffs2_sb_info *c,
  * delete_xattr_ref(c, ref)
  *   is used to delete jffs2_xattr_ref object. If the reference counter of xdatum
  *   is refered by this xref become 0, delete_xattr_datum() is called later.
- * save_xattr_ref(c, ref, phys_ofs)
+ * save_xattr_ref(c, ref)
  *   is used to write xref to medium.
- * create_xattr_ref(c, ic, xd, phys_ofs)
+ * create_xattr_ref(c, ic, xd)
  *   is used to create a new xref and write to medium.
  * jffs2_xattr_delete_inode(c, ic)
  *   is called to remove xrefs related to obsolete inode when inode is unlinked.
@@ -554,12 +554,13 @@ static void delete_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *re
 	jffs2_free_xattr_ref(ref);
 }
 
-static int save_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref, uint32_t phys_ofs)
+static int save_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
 {
 	/* must be called under down_write(xattr_sem) */
 	struct jffs2_raw_node_ref *raw;
 	struct jffs2_raw_xref rr;
 	uint32_t length;
+	uint32_t phys_ofs = write_ofs(c);
 	int ret;
 
 	raw = jffs2_alloc_raw_node_ref();
@@ -604,7 +605,7 @@ static int save_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref,
 }
 
 static struct jffs2_xattr_ref *create_xattr_ref(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic,
-						struct jffs2_xattr_datum *xd, uint32_t phys_ofs)
+						struct jffs2_xattr_datum *xd)
 {
 	/* must be called under down_write(xattr_sem) */
 	struct jffs2_xattr_ref *ref;
@@ -616,7 +617,7 @@ static struct jffs2_xattr_ref *create_xattr_ref(struct jffs2_sb_info *c, struct
 	ref->ic = ic;
 	ref->xd = xd;
 
-	ret = save_xattr_ref(c, ref, phys_ofs);
+	ret = save_xattr_ref(c, ref);
 	if (ret) {
 		jffs2_free_xattr_ref(ref);
 		return ERR_PTR(ret);
@@ -1062,7 +1063,7 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
 	struct jffs2_inode_cache *ic = f->inocache;
 	struct jffs2_xattr_datum *xd;
 	struct jffs2_xattr_ref *ref, *newref, **pref;
-	uint32_t phys_ofs, length, request;
+	uint32_t length, request;
 	int rc;
 
 	rc = check_xattr_ref_inode(c, ic);
@@ -1070,7 +1071,7 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
 		return rc;
 
 	request = PAD(sizeof(struct jffs2_raw_xattr) + strlen(xname) + 1 + size);
-	rc = jffs2_reserve_space(c, request, &phys_ofs, &length,
+	rc = jffs2_reserve_space(c, request, &length,
 				 ALLOC_NORMAL, JFFS2_SUMMARY_XATTR_SIZE);
 	if (rc) {
 		JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, request);
@@ -1117,7 +1118,7 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
 		goto out;
 	}
  found:
-	xd = create_xattr_datum(c, xprefix, xname, buffer, size, phys_ofs);
+	xd = create_xattr_datum(c, xprefix, xname, buffer, size);
 	if (IS_ERR(xd)) {
 		rc = PTR_ERR(xd);
 		goto out;
@@ -1127,7 +1128,7 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
 
 	/* create xattr_ref */
 	request = PAD(sizeof(struct jffs2_raw_xref));
-	rc = jffs2_reserve_space(c, request, &phys_ofs, &length,
+	rc = jffs2_reserve_space(c, request, &length,
 				 ALLOC_NORMAL, JFFS2_SUMMARY_XREF_SIZE);
 	if (rc) {
 		JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, request);
@@ -1141,7 +1142,7 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
 	down_write(&c->xattr_sem);
 	if (ref)
 		*pref = ref->next;
-	newref = create_xattr_ref(c, ic, xd, phys_ofs);
+	newref = create_xattr_ref(c, ic, xd);
 	if (IS_ERR(newref)) {
 		if (ref) {
 			ref->next = ic->xref;
@@ -1170,7 +1171,7 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
  * -------------------------------------------------- */
 int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
 {
-	uint32_t phys_ofs, totlen, length, old_ofs;
+	uint32_t totlen, length, old_ofs;
 	int rc = -EINVAL;
 
 	down_write(&c->xattr_sem);
@@ -1190,13 +1191,13 @@ int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xatt
 		} else if (unlikely(rc < 0))
 			goto out;
 	}
-	rc = jffs2_reserve_space_gc(c, totlen, &phys_ofs, &length, JFFS2_SUMMARY_XATTR_SIZE);
+	rc = jffs2_reserve_space_gc(c, totlen, &length, JFFS2_SUMMARY_XATTR_SIZE);
 	if (rc || length < totlen) {
 		JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, totlen);
 		rc = rc ? rc : -EBADFD;
 		goto out;
 	}
-	rc = save_xattr_datum(c, xd, phys_ofs);
+	rc = save_xattr_datum(c, xd);
 	if (!rc)
 		dbg_xattr("xdatum (xid=%u, version=%u) GC'ed from %#08x to %08x\n",
 			  xd->xid, xd->version, old_ofs, ref_offset(xd->node));
@@ -1208,7 +1209,7 @@ int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xatt
 
 int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
 {
-	uint32_t phys_ofs, totlen, length, old_ofs;
+	uint32_t totlen, length, old_ofs;
 	int rc = -EINVAL;
 
 	down_write(&c->xattr_sem);
@@ -1219,14 +1220,14 @@ int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_
 	if (totlen != sizeof(struct jffs2_raw_xref))
 		goto out;
 
-	rc = jffs2_reserve_space_gc(c, totlen, &phys_ofs, &length, JFFS2_SUMMARY_XREF_SIZE);
+	rc = jffs2_reserve_space_gc(c, totlen, &length, JFFS2_SUMMARY_XREF_SIZE);
 	if (rc || length < totlen) {
 		JFFS2_WARNING("%s: jffs2_reserve_space() = %d, request = %u\n",
 			      __FUNCTION__, rc, totlen);
 		rc = rc ? rc : -EBADFD;
 		goto out;
 	}
-	rc = save_xattr_ref(c, ref, phys_ofs);
+	rc = save_xattr_ref(c, ref);
 	if (!rc)
 		dbg_xattr("xref (ino=%u, xid=%u) GC'ed from %#08x to %08x\n",
 			  ref->ic->ino, ref->xd->xid, old_ofs, ref_offset(ref->node));
-- 
cgit v1.2.3


From dcb0932884b801290efd80fbc37630297b98181f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@cruncher.tec.linutronix.de>
Date: Tue, 23 May 2006 11:49:14 +0200
Subject: [JFFS2] Simplify writebuffer handling

The writev based write buffer implementation was far to complex as
in most use cases the write buffer had to be handled anyway.
Simplify the write buffer handling and use mtd->write instead.

From extensive testing no performance impact has been noted.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/jffs2/wbuf.c | 272 +++++++++++++++++++++-----------------------------------
 1 file changed, 102 insertions(+), 170 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 45e3573cf10..0442a5753d3 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -613,20 +613,30 @@ int jffs2_flush_wbuf_pad(struct jffs2_sb_info *c)
 
 	return ret;
 }
-int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs, unsigned long count, loff_t to, size_t *retlen, uint32_t ino)
+
+static size_t jffs2_fill_wbuf(struct jffs2_sb_info *c, const uint8_t *buf,
+			      size_t len)
 {
-	struct kvec outvecs[3];
-	uint32_t totlen = 0;
-	uint32_t split_ofs = 0;
-	uint32_t old_totlen;
-	int ret, splitvec = -1;
-	int invec, outvec;
-	size_t wbuf_retlen;
-	unsigned char *wbuf_ptr;
-	size_t donelen = 0;
+	if (len && !c->wbuf_len && (len >= c->wbuf_pagesize))
+		return 0;
+
+	if (len > (c->wbuf_pagesize - c->wbuf_len))
+		len = c->wbuf_pagesize - c->wbuf_len;
+	memcpy(c->wbuf + c->wbuf_len, buf, len);
+	c->wbuf_len += (uint32_t) len;
+	return len;
+}
+
+int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs,
+		       unsigned long count, loff_t to, size_t *retlen,
+		       uint32_t ino)
+{
+	struct jffs2_eraseblock *jeb;
+	size_t wbuf_retlen, donelen = 0;
 	uint32_t outvec_to = to;
+	int ret, invec;
 
-	/* If not NAND flash, don't bother */
+	/* If not writebuffered flash, don't bother */
 	if (!jffs2_is_writebuffered(c))
 		return jffs2_flash_direct_writev(c, invecs, count, to, retlen);
 
@@ -639,9 +649,11 @@ int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs, unsig
 		memset(c->wbuf,0xff,c->wbuf_pagesize);
 	}
 
-	/* Fixup the wbuf if we are moving to a new eraseblock.  The checks below
-	   fail for ECC'd NOR because cleanmarker == 16, so a block starts at
-	   xxx0010.  */
+	/*
+	 * Fixup the wbuf if we are moving to a new eraseblock. The
+	 * checks below fail for ECC'd NOR because cleanmarker == 16,
+	 * so a block starts at xxx0010.
+	 */
 	if (jffs2_nor_ecc(c)) {
 		if (((c->wbuf_ofs % c->sector_size) == 0) && !c->wbuf_len) {
 			c->wbuf_ofs = PAGE_DIV(to);
@@ -650,23 +662,22 @@ int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs, unsig
 		}
 	}
 
-	/* Sanity checks on target address.
-	   It's permitted to write at PAD(c->wbuf_len+c->wbuf_ofs),
-	   and it's permitted to write at the beginning of a new
-	   erase block. Anything else, and you die.
-	   New block starts at xxx000c (0-b = block header)
-	*/
+	/*
+	 * Sanity checks on target address.  It's permitted to write
+	 * at PAD(c->wbuf_len+c->wbuf_ofs), and it's permitted to
+	 * write at the beginning of a new erase block. Anything else,
+	 * and you die.  New block starts at xxx000c (0-b = block
+	 * header)
+	 */
 	if (SECTOR_ADDR(to) != SECTOR_ADDR(c->wbuf_ofs)) {
 		/* It's a write to a new block */
 		if (c->wbuf_len) {
-			D1(printk(KERN_DEBUG "jffs2_flash_writev() to 0x%lx causes flush of wbuf at 0x%08x\n", (unsigned long)to, c->wbuf_ofs));
+			D1(printk(KERN_DEBUG "jffs2_flash_writev() to 0x%lx "
+				  "causes flush of wbuf at 0x%08x\n",
+				  (unsigned long)to, c->wbuf_ofs));
 			ret = __jffs2_flush_wbuf(c, PAD_NOACCOUNT);
-			if (ret) {
-				/* the underlying layer has to check wbuf_len to do the cleanup */
-				D1(printk(KERN_WARNING "jffs2_flush_wbuf() called from jffs2_flash_writev() failed %d\n", ret));
-				*retlen = 0;
-				goto exit;
-			}
+			if (ret)
+				goto outerr;
 		}
 		/* set pointer to new block */
 		c->wbuf_ofs = PAGE_DIV(to);
@@ -675,165 +686,70 @@ int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs, unsig
 
 	if (to != PAD(c->wbuf_ofs + c->wbuf_len)) {
 		/* We're not writing immediately after the writebuffer. Bad. */
-		printk(KERN_CRIT "jffs2_flash_writev(): Non-contiguous write to %08lx\n", (unsigned long)to);
+		printk(KERN_CRIT "jffs2_flash_writev(): Non-contiguous write "
+		       "to %08lx\n", (unsigned long)to);
 		if (c->wbuf_len)
 			printk(KERN_CRIT "wbuf was previously %08x-%08x\n",
-					  c->wbuf_ofs, c->wbuf_ofs+c->wbuf_len);
-		BUG();
-	}
-
-	/* Note outvecs[3] above. We know count is never greater than 2 */
-	if (count > 2) {
-		printk(KERN_CRIT "jffs2_flash_writev(): count is %ld\n", count);
+			       c->wbuf_ofs, c->wbuf_ofs+c->wbuf_len);
 		BUG();
 	}
 
-	invec = 0;
-	outvec = 0;
-
-	/* Fill writebuffer first, if already in use */
-	if (c->wbuf_len) {
-		uint32_t invec_ofs = 0;
-
-		/* adjust alignment offset */
-		if (c->wbuf_len != PAGE_MOD(to)) {
-			c->wbuf_len = PAGE_MOD(to);
-			/* take care of alignment to next page */
-			if (!c->wbuf_len)
-				c->wbuf_len = c->wbuf_pagesize;
-		}
-
-		while(c->wbuf_len < c->wbuf_pagesize) {
-			uint32_t thislen;
-
-			if (invec == count)
-				goto alldone;
-
-			thislen = c->wbuf_pagesize - c->wbuf_len;
-
-			if (thislen >= invecs[invec].iov_len)
-				thislen = invecs[invec].iov_len;
-
-			invec_ofs = thislen;
-
-			memcpy(c->wbuf + c->wbuf_len, invecs[invec].iov_base, thislen);
-			c->wbuf_len += thislen;
-			donelen += thislen;
-			/* Get next invec, if actual did not fill the buffer */
-			if (c->wbuf_len < c->wbuf_pagesize)
-				invec++;
-		}
-
-		/* write buffer is full, flush buffer */
-		ret = __jffs2_flush_wbuf(c, NOPAD);
-		if (ret) {
-			/* the underlying layer has to check wbuf_len to do the cleanup */
-			D1(printk(KERN_WARNING "jffs2_flush_wbuf() called from jffs2_flash_writev() failed %d\n", ret));
-			/* Retlen zero to make sure our caller doesn't mark the space dirty.
-			   We've already done everything that's necessary */
-			*retlen = 0;
-			goto exit;
-		}
-		outvec_to += donelen;
-		c->wbuf_ofs = outvec_to;
-
-		/* All invecs done ? */
-		if (invec == count)
-			goto alldone;
-
-		/* Set up the first outvec, containing the remainder of the
-		   invec we partially used */
-		if (invecs[invec].iov_len > invec_ofs) {
-			outvecs[0].iov_base = invecs[invec].iov_base+invec_ofs;
-			totlen = outvecs[0].iov_len = invecs[invec].iov_len-invec_ofs;
-			if (totlen > c->wbuf_pagesize) {
-				splitvec = outvec;
-				split_ofs = outvecs[0].iov_len - PAGE_MOD(totlen);
-			}
-			outvec++;
-		}
-		invec++;
-	}
-
-	/* OK, now we've flushed the wbuf and the start of the bits
-	   we have been asked to write, now to write the rest.... */
-
-	/* totlen holds the amount of data still to be written */
-	old_totlen = totlen;
-	for ( ; invec < count; invec++,outvec++ ) {
-		outvecs[outvec].iov_base = invecs[invec].iov_base;
-		totlen += outvecs[outvec].iov_len = invecs[invec].iov_len;
-		if (PAGE_DIV(totlen) != PAGE_DIV(old_totlen)) {
-			splitvec = outvec;
-			split_ofs = outvecs[outvec].iov_len - PAGE_MOD(totlen);
-			old_totlen = totlen;
+	/* adjust alignment offset */
+	if (c->wbuf_len != PAGE_MOD(to)) {
+		c->wbuf_len = PAGE_MOD(to);
+		/* take care of alignment to next page */
+		if (!c->wbuf_len) {
+			c->wbuf_len = c->wbuf_pagesize;
+			ret = __jffs2_flush_wbuf(c, NOPAD);
+			if (ret)
+				goto outerr;
 		}
 	}
 
-	/* Now the outvecs array holds all the remaining data to write */
-	/* Up to splitvec,split_ofs is to be written immediately. The rest
-	   goes into the (now-empty) wbuf */
-
-	if (splitvec != -1) {
-		uint32_t remainder;
-
-		remainder = outvecs[splitvec].iov_len - split_ofs;
-		outvecs[splitvec].iov_len = split_ofs;
-
-		/* We did cross a page boundary, so we write some now */
-		if (jffs2_cleanmarker_oob(c))
-			ret = c->mtd->writev_ecc(c->mtd, outvecs, splitvec+1, outvec_to, &wbuf_retlen, NULL, c->oobinfo);
-		else
-			ret = jffs2_flash_direct_writev(c, outvecs, splitvec+1, outvec_to, &wbuf_retlen);
-
-		if (ret < 0 || wbuf_retlen != PAGE_DIV(totlen)) {
-			/* At this point we have no problem,
-			   c->wbuf is empty. However refile nextblock to avoid
-			   writing again to same address.
-			*/
-			struct jffs2_eraseblock *jeb;
+	for (invec = 0; invec < count; invec++) {
+		int vlen = invecs[invec].iov_len;
+		uint8_t *v = invecs[invec].iov_base;
 
-			spin_lock(&c->erase_completion_lock);
+		wbuf_retlen = jffs2_fill_wbuf(c, v, vlen);
 
-			jeb = &c->blocks[outvec_to / c->sector_size];
-			jffs2_block_refile(c, jeb, REFILE_ANYWAY);
-
-			*retlen = 0;
-			spin_unlock(&c->erase_completion_lock);
-			goto exit;
+		if (c->wbuf_len == c->wbuf_pagesize) {
+			ret = __jffs2_flush_wbuf(c, NOPAD);
+			if (ret)
+				goto outerr;
 		}
-
+		vlen -= wbuf_retlen;
+		outvec_to += wbuf_retlen;
 		donelen += wbuf_retlen;
-		c->wbuf_ofs = PAGE_DIV(outvec_to) + PAGE_DIV(totlen);
-
-		if (remainder) {
-			outvecs[splitvec].iov_base += split_ofs;
-			outvecs[splitvec].iov_len = remainder;
-		} else {
-			splitvec++;
+		v += wbuf_retlen;
+
+		if (vlen >= c->wbuf_pagesize) {
+			ret = c->mtd->write(c->mtd, outvec_to, PAGE_DIV(vlen),
+					    &wbuf_retlen, v);
+			if (ret < 0 || wbuf_retlen != PAGE_DIV(vlen))
+				goto outfile;
+
+			vlen -= wbuf_retlen;
+			outvec_to += wbuf_retlen;
+			c->wbuf_ofs = outvec_to;
+			donelen += wbuf_retlen;
+			v += wbuf_retlen;
 		}
 
-	} else {
-		splitvec = 0;
-	}
-
-	/* Now splitvec points to the start of the bits we have to copy
-	   into the wbuf */
-	wbuf_ptr = c->wbuf;
+		wbuf_retlen = jffs2_fill_wbuf(c, v, vlen);
+		if (c->wbuf_len == c->wbuf_pagesize) {
+			ret = __jffs2_flush_wbuf(c, NOPAD);
+			if (ret)
+				goto outerr;
+		}
 
-	for ( ; splitvec < outvec; splitvec++) {
-		/* Don't copy the wbuf into itself */
-		if (outvecs[splitvec].iov_base == c->wbuf)
-			continue;
-		memcpy(wbuf_ptr, outvecs[splitvec].iov_base, outvecs[splitvec].iov_len);
-		wbuf_ptr += outvecs[splitvec].iov_len;
-		donelen += outvecs[splitvec].iov_len;
+		outvec_to += wbuf_retlen;
+		donelen += wbuf_retlen;
 	}
-	c->wbuf_len = wbuf_ptr - c->wbuf;
 
-	/* If there's a remainder in the wbuf and it's a non-GC write,
-	   remember that the wbuf affects this ino */
-alldone:
+	/*
+	 * If there's a remainder in the wbuf and it's a non-GC write,
+	 * remember that the wbuf affects this ino
+	 */
 	*retlen = donelen;
 
 	if (jffs2_sum_active()) {
@@ -846,8 +762,24 @@ alldone:
 		jffs2_wbuf_dirties_inode(c, ino);
 
 	ret = 0;
+	up_write(&c->wbuf_sem);
+	return ret;
 
-exit:
+outfile:
+	/*
+	 * At this point we have no problem, c->wbuf is empty. However
+	 * refile nextblock to avoid writing again to same address.
+	 */
+
+	spin_lock(&c->erase_completion_lock);
+
+	jeb = &c->blocks[outvec_to / c->sector_size];
+	jffs2_block_refile(c, jeb, REFILE_ANYWAY);
+
+	spin_unlock(&c->erase_completion_lock);
+
+outerr:
+	*retlen = 0;
 	up_write(&c->wbuf_sem);
 	return ret;
 }
-- 
cgit v1.2.3


From 9223a456da8ed357bf7e0b128c853e2c8bd54614 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@cruncher.tec.linutronix.de>
Date: Tue, 23 May 2006 17:21:03 +0200
Subject: [MTD] Remove read/write _ecc variants

MTD clients are agnostic of FLASH which needs ECC suppport.
Remove the functions and fixup the callers.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/jffs2/wbuf.c | 28 ++++++++--------------------
 1 file changed, 8 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 916c87d3393..76d4c361ef1 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -233,10 +233,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
 		}
 
 		/* Do the read... */
-		if (jffs2_cleanmarker_oob(c))
-			ret = c->mtd->read_ecc(c->mtd, start, c->wbuf_ofs - start, &retlen, buf, NULL, c->oobinfo);
-		else
-			ret = c->mtd->read(c->mtd, start, c->wbuf_ofs - start, &retlen, buf);
+		ret = c->mtd->read(c->mtd, start, c->wbuf_ofs - start, &retlen, buf);
 
 		if (ret == -EBADMSG && retlen == c->wbuf_ofs - start) {
 			/* ECC recovered */
@@ -290,16 +287,13 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
 		if (breakme++ == 20) {
 			printk(KERN_NOTICE "Faking write error at 0x%08x\n", ofs);
 			breakme = 0;
-			c->mtd->write_ecc(c->mtd, ofs, towrite, &retlen,
-					  brokenbuf, NULL, c->oobinfo);
+			c->mtd->write(c->mtd, ofs, towrite, &retlen,
+				      brokenbuf);
 			ret = -EIO;
 		} else
 #endif
-		if (jffs2_cleanmarker_oob(c))
-			ret = c->mtd->write_ecc(c->mtd, ofs, towrite, &retlen,
-						rewrite_buf, NULL, c->oobinfo);
-		else
-			ret = c->mtd->write(c->mtd, ofs, towrite, &retlen, rewrite_buf);
+			ret = c->mtd->write(c->mtd, ofs, towrite, &retlen,
+					    rewrite_buf);
 
 		if (ret || retlen != towrite) {
 			/* Argh. We tried. Really we did. */
@@ -457,15 +451,12 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
 	if (breakme++ == 20) {
 		printk(KERN_NOTICE "Faking write error at 0x%08x\n", c->wbuf_ofs);
 		breakme = 0;
-		c->mtd->write_ecc(c->mtd, c->wbuf_ofs, c->wbuf_pagesize,
-					&retlen, brokenbuf, NULL, c->oobinfo);
+		c->mtd->write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen,
+			      brokenbuf);
 		ret = -EIO;
 	} else
 #endif
 
-	if (jffs2_cleanmarker_oob(c))
-		ret = c->mtd->write_ecc(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen, c->wbuf, NULL, c->oobinfo);
-	else
 		ret = c->mtd->write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen, c->wbuf);
 
 	if (ret || retlen != c->wbuf_pagesize) {
@@ -800,10 +791,7 @@ int jffs2_flash_read(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *re
 
 	/* Read flash */
 	down_read(&c->wbuf_sem);
-	if (jffs2_cleanmarker_oob(c))
-		ret = c->mtd->read_ecc(c->mtd, ofs, len, retlen, buf, NULL, c->oobinfo);
-	else
-		ret = c->mtd->read(c->mtd, ofs, len, retlen, buf);
+	ret = c->mtd->read(c->mtd, ofs, len, retlen, buf);
 
 	if ( (ret == -EBADMSG) && (*retlen == len) ) {
 		printk(KERN_WARNING "mtd->read(0x%zx bytes from 0x%llx) returned ECC error\n",
-- 
cgit v1.2.3


From 2f785402f39b96a077b6e62bf26164bfb8e0c980 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Wed, 24 May 2006 02:04:45 +0100
Subject: [JFFS2] Reduce visibility of raw_node_ref to upper layers of JFFS2
 code.

As the first step towards eliminating the ref->next_phys member and saving
memory by using an _array_ of struct jffs2_raw_node_ref per eraseblock,
stop the write functions from allocating their own refs; have them just
_reserve_ the appropriate number instead. Then jffs2_link_node_ref() can
just fill them in.

Use a linked list of pre-allocated refs in the superblock, for now. Once
we switch to an array, it'll just be a case of extending that array.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/erase.c       |  16 ++---
 fs/jffs2/gc.c          |  25 ++------
 fs/jffs2/jffs2_fs_sb.h |   3 +
 fs/jffs2/malloc.c      |  26 +++++++-
 fs/jffs2/nodelist.c    |  51 ++++++++++------
 fs/jffs2/nodelist.h    |  19 +++---
 fs/jffs2/nodemgmt.c    |  39 ++++++------
 fs/jffs2/scan.c        |  77 +++++------------------
 fs/jffs2/summary.c     | 162 +++++++++++++++----------------------------------
 fs/jffs2/wbuf.c        |  26 +++-----
 fs/jffs2/write.c       |  71 +++++++---------------
 fs/jffs2/xattr.c       |  38 +++---------
 12 files changed, 204 insertions(+), 349 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index 4616fed7573..f939f908b94 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -296,7 +296,7 @@ void jffs2_free_all_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *
 			jffs2_remove_node_refs_from_ino_list(c, ref, jeb);
 		/* else it was a non-inode node or already removed, so don't bother */
 
-		jffs2_free_raw_node_ref(ref);
+		__jffs2_free_raw_node_ref(ref);
 	}
 	jeb->last_node = NULL;
 }
@@ -351,7 +351,6 @@ fail:
 
 static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
 {
-	struct jffs2_raw_node_ref *marker_ref = NULL;
 	size_t retlen;
 	int ret;
 	uint32_t bad_offset;
@@ -384,11 +383,7 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb
 			.totlen =	cpu_to_je32(c->cleanmarker_size)
 		};
 
-		marker_ref = jffs2_alloc_raw_node_ref();
-		if (!marker_ref) {
-			printk(KERN_WARNING "Failed to allocate raw node ref for clean marker. Refiling\n");
-			goto refile;
-		}
+		jffs2_prealloc_raw_node_refs(c, 1);
 
 		marker.hdr_crc = cpu_to_je32(crc32(0, &marker, sizeof(struct jffs2_unknown_node)-4));
 
@@ -404,16 +399,13 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb
 				printk(KERN_WARNING "Short write to newly-erased block at 0x%08x: Wanted %zd, got %zd\n",
 				       jeb->offset, sizeof(marker), retlen);
 
-			jffs2_free_raw_node_ref(marker_ref);
 			goto filebad;
 		}
 
 		/* Everything else got zeroed before the erase */
 		jeb->free_size = c->sector_size;
-
-		marker_ref->flash_offset = jeb->offset | REF_NORMAL;
-
-		jffs2_link_node_ref(c, jeb, marker_ref, c->cleanmarker_size, NULL);
+		/* FIXME Special case for cleanmarker in empty block */
+		jffs2_link_node_ref(c, jeb, jeb->offset | REF_NORMAL, c->cleanmarker_size, NULL);
 	}
 
 	spin_lock(&c->erase_completion_lock);
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index f9e982a65ac..a22ff5df7fc 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -528,7 +528,6 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
 					  struct jffs2_raw_node_ref *raw)
 {
 	union jffs2_node_union *node;
-	struct jffs2_raw_node_ref *nraw;
 	size_t retlen;
 	int ret;
 	uint32_t phys_ofs, alloclen;
@@ -618,30 +617,21 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
 		}
 	}
 
-	nraw = jffs2_alloc_raw_node_ref();
-	if (!nraw) {
-		ret = -ENOMEM;
-		goto out_node;
-	}
-
 	/* OK, all the CRCs are good; this node can just be copied as-is. */
  retry:
-	nraw->flash_offset = phys_ofs = write_ofs(c);
+	phys_ofs = write_ofs(c);
 
 	ret = jffs2_flash_write(c, phys_ofs, rawlen, &retlen, (char *)node);
 
 	if (ret || (retlen != rawlen)) {
 		printk(KERN_NOTICE "Write of %d bytes at 0x%08x failed. returned %d, retlen %zd\n",
-                       rawlen, nraw->flash_offset, ret, retlen);
+                       rawlen, phys_ofs, ret, retlen);
 		if (retlen) {
-			nraw->flash_offset |= REF_OBSOLETE;
-			jffs2_add_physical_node_ref(c, nraw, rawlen, NULL);
-			jffs2_mark_node_obsolete(c, nraw);
+			jffs2_add_physical_node_ref(c, phys_ofs | REF_OBSOLETE, rawlen, NULL);
 		} else {
-			printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", nraw->flash_offset);
-                        jffs2_free_raw_node_ref(nraw);
+			printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", phys_ofs);
 		}
-		if (!retried && (nraw = jffs2_alloc_raw_node_ref())) {
+		if (!retried) {
 			/* Try to reallocate space and retry */
 			uint32_t dummy;
 			struct jffs2_eraseblock *jeb = &c->blocks[phys_ofs / c->sector_size];
@@ -666,16 +656,13 @@ static int jffs2_garbage_collect_pristine(struct jffs2_sb_info *c,
 				goto retry;
 			}
 			D1(printk(KERN_DEBUG "Failed to allocate space to retry failed write: %d!\n", ret));
-			jffs2_free_raw_node_ref(nraw);
 		}
 
-		jffs2_free_raw_node_ref(nraw);
 		if (!ret)
 			ret = -EIO;
 		goto out_node;
 	}
-	nraw->flash_offset |= REF_PRISTINE;
-	jffs2_add_physical_node_ref(c, nraw, rawlen, ic);
+	jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, rawlen, ic);
 
 	jffs2_mark_node_obsolete(c, raw);
 	D1(printk(KERN_DEBUG "WHEEE! GC REF_PRISTINE node at 0x%08x succeeded\n", ref_offset(raw)));
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 272fbea5519..67529f0a44d 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -26,6 +26,9 @@ struct jffs2_inodirty;
 struct jffs2_sb_info {
 	struct mtd_info *mtd;
 
+	struct jffs2_raw_node_ref *refs;
+	int reserved_refs;
+
 	uint32_t highest_ino;
 	uint32_t checked_ino;
 
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index f2473fa2fd1..3df3250314a 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -190,7 +190,29 @@ void jffs2_free_tmp_dnode_info(struct jffs2_tmp_dnode_info *x)
 	kmem_cache_free(tmp_dnode_info_slab, x);
 }
 
-struct jffs2_raw_node_ref *jffs2_alloc_raw_node_ref(void)
+int jffs2_prealloc_raw_node_refs(struct jffs2_sb_info *c, int nr)
+{
+	struct jffs2_raw_node_ref *p = c->refs;
+
+	dbg_memalloc("%d\n", nr);
+
+	while (nr && p) {
+		p = p->next_in_ino;
+		nr--;
+	}
+	while (nr) {
+		p = __jffs2_alloc_raw_node_ref();
+		if (!p)
+			return -ENOMEM;
+		p->next_in_ino = c->refs;
+		c->refs = p;
+		nr--;
+	}
+	c->reserved_refs = nr;
+	return 0;
+}
+
+struct jffs2_raw_node_ref *__jffs2_alloc_raw_node_ref(void)
 {
 	struct jffs2_raw_node_ref *ret;
 	ret = kmem_cache_alloc(raw_node_ref_slab, GFP_KERNEL);
@@ -198,7 +220,7 @@ struct jffs2_raw_node_ref *jffs2_alloc_raw_node_ref(void)
 	return ret;
 }
 
-void jffs2_free_raw_node_ref(struct jffs2_raw_node_ref *x)
+void __jffs2_free_raw_node_ref(struct jffs2_raw_node_ref *x)
 {
 	dbg_memalloc("%p\n", x);
 	kmem_cache_free(raw_node_ref_slab, x);
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index d25d4919ca9..1e6eabd730f 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -953,13 +953,19 @@ void jffs2_free_raw_node_refs(struct jffs2_sb_info *c)
 
 	for (i=0; i<c->nr_blocks; i++) {
 		this = c->blocks[i].first_node;
-		while(this) {
+		while (this) {
 			next = this->next_phys;
-			jffs2_free_raw_node_ref(this);
+			__jffs2_free_raw_node_ref(this);
 			this = next;
 		}
 		c->blocks[i].first_node = c->blocks[i].last_node = NULL;
 	}
+	this = c->refs;
+	while (this) {
+		next = this->next_in_ino;
+		__jffs2_free_raw_node_ref(this);
+		this = next;
+	}
 }
 
 struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_t offset)
@@ -1047,10 +1053,27 @@ void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c)
 	}
 }
 
-void jffs2_link_node_ref(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
-			 struct jffs2_raw_node_ref *ref, uint32_t len,
-			 struct jffs2_inode_cache *ic)
+struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c,
+					       struct jffs2_eraseblock *jeb,
+					       uint32_t ofs, uint32_t len,
+					       struct jffs2_inode_cache *ic)
 {
+	struct jffs2_raw_node_ref *ref;
+
+	/* These will be preallocated _very_ shortly. */
+	ref = c->refs;
+	if (!c->refs) {
+		JFFS2_WARNING("Using non-preallocated refs!\n");
+		ref = __jffs2_alloc_raw_node_ref();
+		BUG_ON(!ref);
+		WARN_ON(1);
+	} else {
+		c->refs = ref->next_in_ino;
+	}
+
+	ref->next_phys = NULL;
+	ref->flash_offset = ofs;
+
 	if (!jeb->first_node)
 		jeb->first_node = ref;
 	if (jeb->last_node) {
@@ -1093,15 +1116,15 @@ void jffs2_link_node_ref(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
 	c->free_size -= len;
 	jeb->free_size -= len;
 
-	ref->next_phys = NULL;
 #ifdef TEST_TOTLEN
 	/* Set (and test) __totlen field... for now */
 	ref->__totlen = len;
 	ref_totlen(c, jeb, ref);
 #endif
+	return ref;
 }
 
-/* No locking. Do not use on a live file system */
+/* No locking, no reservation of 'ref'. Do not use on a live file system */
 int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
 			   uint32_t size)
 {
@@ -1121,18 +1144,10 @@ int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 		jeb->dirty_size += size;
 		jeb->free_size -= size;
 	} else {
-		struct jffs2_raw_node_ref *ref;
-		ref = jffs2_alloc_raw_node_ref();
-		if (!ref)
-			return -ENOMEM;
-
-		ref->flash_offset = jeb->offset + c->sector_size - jeb->free_size;
-		ref->flash_offset |= REF_OBSOLETE;
-#ifdef TEST_TOTLEN
-		ref->__totlen = size;
-#endif
+		uint32_t ofs = jeb->offset + c->sector_size - jeb->free_size;
+		ofs |= REF_OBSOLETE;
 
-		jffs2_link_node_ref(c, jeb, ref, size, NULL);
+		jffs2_link_node_ref(c, jeb, ofs, size, NULL);
 	}
 
 	return 0;
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 76f1b9419ee..7cc74d2ab4d 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -318,9 +318,10 @@ void jffs2_obsolete_node_frag(struct jffs2_sb_info *c, struct jffs2_node_frag *t
 int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn);
 void jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uint32_t size);
 int jffs2_add_older_frag_to_fragtree(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_tmp_dnode_info *tn);
-void jffs2_link_node_ref(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
-			 struct jffs2_raw_node_ref *ref, uint32_t len,
-			 struct jffs2_inode_cache *ic);
+struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c,
+					       struct jffs2_eraseblock *jeb,
+					       uint32_t ofs, uint32_t len,
+					       struct jffs2_inode_cache *ic);
 extern uint32_t __jffs2_ref_totlen(struct jffs2_sb_info *c,
 				   struct jffs2_eraseblock *jeb,
 				   struct jffs2_raw_node_ref *ref);
@@ -331,10 +332,9 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
 			uint32_t *len, int prio, uint32_t sumsize);
 int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize,
 			uint32_t *len, uint32_t sumsize);
-int jffs2_add_physical_node_ref(struct jffs2_sb_info *c, 
-				struct jffs2_raw_node_ref *new,
-				uint32_t len,
-				struct jffs2_inode_cache *ic);
+struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c, 
+						       uint32_t ofs, uint32_t len,
+						       struct jffs2_inode_cache *ic);
 void jffs2_complete_reservation(struct jffs2_sb_info *c);
 void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *raw);
 
@@ -378,8 +378,9 @@ struct jffs2_raw_inode *jffs2_alloc_raw_inode(void);
 void jffs2_free_raw_inode(struct jffs2_raw_inode *);
 struct jffs2_tmp_dnode_info *jffs2_alloc_tmp_dnode_info(void);
 void jffs2_free_tmp_dnode_info(struct jffs2_tmp_dnode_info *);
-struct jffs2_raw_node_ref *jffs2_alloc_raw_node_ref(void);
-void jffs2_free_raw_node_ref(struct jffs2_raw_node_ref *);
+int jffs2_prealloc_raw_node_refs(struct jffs2_sb_info *c, int nr);
+struct jffs2_raw_node_ref *__jffs2_alloc_raw_node_ref(void);
+void __jffs2_free_raw_node_ref(struct jffs2_raw_node_ref *);
 struct jffs2_node_frag *jffs2_alloc_node_frag(void);
 void jffs2_free_node_frag(struct jffs2_node_frag *);
 struct jffs2_inode_cache *jffs2_alloc_inode_cache(void);
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 8feb8749bc7..01bf2773fe4 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -137,6 +137,8 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
 		}
 	}
 	spin_unlock(&c->erase_completion_lock);
+	if (!ret)
+		ret = jffs2_prealloc_raw_node_refs(c, 1);
 	if (ret)
 		up(&c->alloc_sem);
 	return ret;
@@ -158,6 +160,9 @@ int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize,
 		}
 	}
 	spin_unlock(&c->erase_completion_lock);
+	if (!ret)
+		ret = jffs2_prealloc_raw_node_refs(c, 1);
+
 	return ret;
 }
 
@@ -381,30 +386,30 @@ static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
  *	Must be called with the alloc_sem held.
  */
 
-int jffs2_add_physical_node_ref(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *new,
-				uint32_t len, struct jffs2_inode_cache *ic)
+struct jffs2_raw_node_ref *jffs2_add_physical_node_ref(struct jffs2_sb_info *c,
+						       uint32_t ofs, uint32_t len,
+						       struct jffs2_inode_cache *ic)
 {
 	struct jffs2_eraseblock *jeb;
+	struct jffs2_raw_node_ref *new;
 
-	jeb = &c->blocks[new->flash_offset / c->sector_size];
-#ifdef TEST_TOTLEN
-	new->__totlen = len;
-#endif
+	jeb = &c->blocks[ofs / c->sector_size];
 
-	D1(printk(KERN_DEBUG "jffs2_add_physical_node_ref(): Node at 0x%x(%d), size 0x%x\n", ref_offset(new), ref_flags(new), len));
+	D1(printk(KERN_DEBUG "jffs2_add_physical_node_ref(): Node at 0x%x(%d), size 0x%x\n",
+		  ofs & ~3, ofs & 3, len));
 #if 1
-	/* we could get some obsolete nodes after nextblock was refiled
-	   in wbuf.c */
-	if ((c->nextblock || !ref_obsolete(new))
-	    &&(jeb != c->nextblock || ref_offset(new) != jeb->offset + (c->sector_size - jeb->free_size))) {
+	/* Allow non-obsolete nodes only to be added at the end of c->nextblock, 
+	   if c->nextblock is set. Note that wbuf.c will file obsolete nodes
+	   even after refiling c->nextblock */
+	if ((c->nextblock || ((ofs & 3) != REF_OBSOLETE))
+	    && (jeb != c->nextblock || (ofs & ~3) != jeb->offset + (c->sector_size - jeb->free_size))) {
 		printk(KERN_WARNING "argh. node added in wrong place\n");
-		jffs2_free_raw_node_ref(new);
-		return -EINVAL;
+		return ERR_PTR(-EINVAL);
 	}
 #endif
 	spin_lock(&c->erase_completion_lock);
 
-	jffs2_link_node_ref(c, jeb, new, len, ic);
+	new = jffs2_link_node_ref(c, jeb, ofs, len, ic);
 
 	if (!jeb->free_size && !jeb->dirty_size && !ISDIRTY(jeb->wasted_size)) {
 		/* If it lives on the dirty_list, jffs2_reserve_space will put it there */
@@ -425,7 +430,7 @@ int jffs2_add_physical_node_ref(struct jffs2_sb_info *c, struct jffs2_raw_node_r
 
 	spin_unlock(&c->erase_completion_lock);
 
-	return 0;
+	return new;
 }
 
 
@@ -697,7 +702,7 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 		}
 		spin_unlock(&c->erase_completion_lock);
 
-		jffs2_free_raw_node_ref(n);
+		__jffs2_free_raw_node_ref(n);
 	}
 
 	/* Also merge with the previous node in the list, if there is one
@@ -722,7 +727,7 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 				jeb->gc_node=p;
 			}
 			p->next_phys = ref->next_phys;
-			jffs2_free_raw_node_ref(ref);
+			__jffs2_free_raw_node_ref(ref);
 		}
 		spin_unlock(&c->erase_completion_lock);
 	}
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 6fce703c054..87b0a416b6a 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -317,7 +317,6 @@ static int jffs2_scan_xattr_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
 				 struct jffs2_summary *s)
 {
 	struct jffs2_xattr_datum *xd;
-	struct jffs2_raw_node_ref *raw;
 	uint32_t totlen, crc;
 	int err;
 
@@ -340,13 +339,8 @@ static int jffs2_scan_xattr_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
 		return 0;
 	}
 
-	raw =  jffs2_alloc_raw_node_ref();
-	if (!raw)
-		return -ENOMEM;
-
 	xd = jffs2_setup_xattr_datum(c, je32_to_cpu(rx->xid), je32_to_cpu(rx->version));
 	if (IS_ERR(xd)) {
-		jffs2_free_raw_node_ref(raw);
 		if (PTR_ERR(xd) == -EEXIST) {
 			if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rx->totlen)))))
 				return err;
@@ -358,12 +352,9 @@ static int jffs2_scan_xattr_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
 	xd->name_len = rx->name_len;
 	xd->value_len = je16_to_cpu(rx->value_len);
 	xd->data_crc = je32_to_cpu(rx->data_crc);
-	xd->node = raw;
-
-	raw->flash_offset = ofs | REF_PRISTINE;
 
-	jffs2_link_node_ref(c, jeb, raw, totlen, NULL);
-	/* FIXME */ raw->next_in_ino = (void *)xd;
+	xd->node = jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, totlen, NULL);
+	/* FIXME */ xd->node->next_in_ino = (void *)xd;
 
 	if (jffs2_sum_active())
 		jffs2_sum_add_xattr_mem(s, rx, ofs - jeb->offset);
@@ -377,7 +368,6 @@ static int jffs2_scan_xref_node(struct jffs2_sb_info *c, struct jffs2_eraseblock
 				struct jffs2_summary *s)
 {
 	struct jffs2_xattr_ref *ref;
-	struct jffs2_raw_node_ref *raw;
 	uint32_t crc;
 	int err;
 
@@ -404,12 +394,6 @@ static int jffs2_scan_xref_node(struct jffs2_sb_info *c, struct jffs2_eraseblock
 	if (!ref)
 		return -ENOMEM;
 
-	raw =  jffs2_alloc_raw_node_ref();
-	if (!raw) {
-		jffs2_free_xattr_ref(ref);
-		return -ENOMEM;
-	}
-
 	/* BEFORE jffs2_build_xattr_subsystem() called, 
 	 * ref->xid is used to store 32bit xid, xd is not used
 	 * ref->ino is used to store 32bit inode-number, ic is not used
@@ -418,16 +402,13 @@ static int jffs2_scan_xref_node(struct jffs2_sb_info *c, struct jffs2_eraseblock
 	 * used to chain all xattr_ref object. It's re-chained to
 	 * jffs2_inode_cache in jffs2_build_xattr_subsystem() correctly.
 	 */
-	ref->node = raw;
 	ref->ino = je32_to_cpu(rr->ino);
 	ref->xid = je32_to_cpu(rr->xid);
 	ref->next = c->xref_temp;
 	c->xref_temp = ref;
 
-	raw->flash_offset = ofs | REF_PRISTINE;
-
-	jffs2_link_node_ref(c, jeb, raw, PAD(je32_to_cpu(rr->totlen)), NULL);
-	/* FIXME */ raw->next_in_ino = (void *)ref;
+	ref->node = jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, PAD(je32_to_cpu(rr->totlen)), NULL);
+	/* FIXME */ ref->node->next_in_ino = (void *)ref;
 
 	if (jffs2_sum_active())
 		jffs2_sum_add_xref_mem(s, rr, ofs - jeb->offset);
@@ -597,6 +578,11 @@ scan_more:
 
 		jffs2_dbg_acct_paranoia_check_nolock(c, jeb);
 
+		/* Make sure there are node refs available for use */
+		err = jffs2_prealloc_raw_node_refs(c, 2);
+		if (err)
+			return err;
+
 		cond_resched();
 
 		if (ofs & 3) {
@@ -839,14 +825,7 @@ scan_more:
 					return err;
 				ofs += PAD(sizeof(struct jffs2_unknown_node));
 			} else {
-				struct jffs2_raw_node_ref *marker_ref = jffs2_alloc_raw_node_ref();
-				if (!marker_ref) {
-					printk(KERN_NOTICE "Failed to allocate node ref for clean marker\n");
-					return -ENOMEM;
-				}
-				marker_ref->flash_offset = ofs | REF_NORMAL;
-
-				jffs2_link_node_ref(c, jeb, marker_ref, c->cleanmarker_size, NULL);
+				jffs2_link_node_ref(c, jeb, ofs | REF_NORMAL, c->cleanmarker_size, NULL);
 
 				ofs += PAD(c->cleanmarker_size);
 			}
@@ -884,14 +863,9 @@ scan_more:
 				break;
 
 			case JFFS2_FEATURE_RWCOMPAT_COPY: {
-				struct jffs2_raw_node_ref *ref;
 				D1(printk(KERN_NOTICE "Unknown but compatible feature node (0x%04x) found at offset 0x%08x\n", je16_to_cpu(node->nodetype), ofs));
 
-				ref = jffs2_alloc_raw_node_ref();
-				if (!ref)
-					return -ENOMEM;
-				ref->flash_offset = ofs | REF_PRISTINE;
-				jffs2_link_node_ref(c, jeb, ref, PAD(je32_to_cpu(node->totlen)), NULL);
+				jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, PAD(je32_to_cpu(node->totlen)), NULL);
 
 				/* We can't summarise nodes we don't grok */
 				jffs2_sum_disable_collecting(s);
@@ -953,7 +927,6 @@ struct jffs2_inode_cache *jffs2_scan_make_ino_cache(struct jffs2_sb_info *c, uin
 static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
 				 struct jffs2_raw_inode *ri, uint32_t ofs, struct jffs2_summary *s)
 {
-	struct jffs2_raw_node_ref *raw;
 	struct jffs2_inode_cache *ic;
 	uint32_t ino = je32_to_cpu(ri->ino);
 	int err;
@@ -969,12 +942,6 @@ static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
 	   Which means that the _full_ amount of time to get to proper write mode with GC
 	   operational may actually be _longer_ than before. Sucks to be me. */
 
-	raw = jffs2_alloc_raw_node_ref();
-	if (!raw) {
-		printk(KERN_NOTICE "jffs2_scan_inode_node(): allocation of node reference failed\n");
-		return -ENOMEM;
-	}
-
 	ic = jffs2_get_ino_cache(c, ino);
 	if (!ic) {
 		/* Inocache get failed. Either we read a bogus ino# or it's just genuinely the
@@ -988,21 +955,15 @@ static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
 			/* We believe totlen because the CRC on the node _header_ was OK, just the node itself failed. */
 			if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(ri->totlen)))))
 				return err;
-			jffs2_free_raw_node_ref(raw);
 			return 0;
 		}
 		ic = jffs2_scan_make_ino_cache(c, ino);
-		if (!ic) {
-			jffs2_free_raw_node_ref(raw);
+		if (!ic)
 			return -ENOMEM;
-		}
 	}
 
 	/* Wheee. It worked */
-
-	raw->flash_offset = ofs | REF_UNCHECKED;
-
-	jffs2_link_node_ref(c, jeb, raw, PAD(je32_to_cpu(ri->totlen)), ic);
+	jffs2_link_node_ref(c, jeb, ofs | REF_UNCHECKED, PAD(je32_to_cpu(ri->totlen)), ic);
 
 	D1(printk(KERN_DEBUG "Node is ino #%u, version %d. Range 0x%x-0x%x\n",
 		  je32_to_cpu(ri->ino), je32_to_cpu(ri->version),
@@ -1021,7 +982,6 @@ static int jffs2_scan_inode_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
 static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
 				  struct jffs2_raw_dirent *rd, uint32_t ofs, struct jffs2_summary *s)
 {
-	struct jffs2_raw_node_ref *raw;
 	struct jffs2_full_dirent *fd;
 	struct jffs2_inode_cache *ic;
 	uint32_t crc;
@@ -1063,23 +1023,14 @@ static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblo
 			return err;
 		return 0;
 	}
-	raw = jffs2_alloc_raw_node_ref();
-	if (!raw) {
-		jffs2_free_full_dirent(fd);
-		printk(KERN_NOTICE "jffs2_scan_dirent_node(): allocation of node reference failed\n");
-		return -ENOMEM;
-	}
 	ic = jffs2_scan_make_ino_cache(c, je32_to_cpu(rd->pino));
 	if (!ic) {
 		jffs2_free_full_dirent(fd);
-		jffs2_free_raw_node_ref(raw);
 		return -ENOMEM;
 	}
 
-	raw->flash_offset = ofs | REF_PRISTINE;
-	jffs2_link_node_ref(c, jeb, raw, PAD(je32_to_cpu(rd->totlen)), ic);
+	fd->raw = jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, PAD(je32_to_cpu(rd->totlen)), ic);
 
-	fd->raw = raw;
 	fd->next = NULL;
 	fd->version = je32_to_cpu(rd->version);
 	fd->ino = je32_to_cpu(rd->ino);
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 351ba9f8185..ccb6803a6e4 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -369,22 +369,18 @@ no_mem:
 	return -ENOMEM;
 }
 
-static struct jffs2_raw_node_ref *alloc_ref_at(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
-					       uint32_t offset)
+static struct jffs2_raw_node_ref *sum_link_node_ref(struct jffs2_sb_info *c,
+						    struct jffs2_eraseblock *jeb,
+						    uint32_t ofs, uint32_t len,
+						    struct jffs2_inode_cache *ic)
 {
-	struct jffs2_raw_node_ref *ref;
 	/* If there was a gap, mark it dirty */
-	if (offset > c->sector_size - jeb->free_size) {
-		int ret = jffs2_scan_dirty_space(c, jeb, offset - (c->sector_size - jeb->free_size));
-		if (ret)
-			return NULL;
+	if ((ofs & ~3) > c->sector_size - jeb->free_size) {
+		/* Ew. Summary doesn't actually tell us explicitly about dirty space */
+		jffs2_scan_dirty_space(c, jeb, (ofs & ~3) - (c->sector_size - jeb->free_size));
 	}
-	ref = jffs2_alloc_raw_node_ref();
-	if (!ref)
-		return NULL;
 
-	ref->flash_offset = jeb->offset + offset;
-	return ref;
+	return jffs2_link_node_ref(c, jeb, jeb->offset + ofs, len, ic);
 }
 
 /* Process the stored summary information - helper function for jffs2_sum_scan_sumnode() */
@@ -392,7 +388,6 @@ static struct jffs2_raw_node_ref *alloc_ref_at(struct jffs2_sb_info *c, struct j
 static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
 				struct jffs2_raw_summary *summary, uint32_t *pseudo_random)
 {
-	struct jffs2_raw_node_ref *raw;
 	struct jffs2_inode_cache *ic;
 	struct jffs2_full_dirent *fd;
 	void *sp;
@@ -404,6 +399,11 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 	for (i=0; i<je32_to_cpu(summary->sum_num); i++) {
 		dbg_summary("processing summary index %d\n", i);
 
+		/* Make sure there's a spare ref for dirty space */
+		err = jffs2_prealloc_raw_node_refs(c, 2);
+		if (err)
+			return err;
+
 		switch (je16_to_cpu(((struct jffs2_sum_unknown_flash *)sp)->nodetype)) {
 			case JFFS2_NODETYPE_INODE: {
 				struct jffs2_sum_inode_flash *spi;
@@ -415,22 +415,14 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 					    jeb->offset + je32_to_cpu(spi->offset),
 					    jeb->offset + je32_to_cpu(spi->offset) + je32_to_cpu(spu->totlen));
 
-				raw = alloc_ref_at(c, jeb, je32_to_cpu(spi->offset));
-				if (!raw) {
-					JFFS2_NOTICE("allocation of node reference failed\n");
-					return -ENOMEM;
-				}
-
 				ic = jffs2_scan_make_ino_cache(c, ino);
 				if (!ic) {
 					JFFS2_NOTICE("scan_make_ino_cache failed\n");
-					jffs2_free_raw_node_ref(raw);
 					return -ENOMEM;
 				}
 
-				raw->flash_offset |= REF_UNCHECKED;
-
-				jffs2_link_node_ref(c, jeb, raw, PAD(je32_to_cpu(spi->totlen)), ic);
+				sum_link_node_ref(c, jeb, je32_to_cpu(spi->offset) | REF_UNCHECKED,
+						  PAD(je32_to_cpu(spi->totlen)), ic);
 
 				*pseudo_random += je32_to_cpu(spi->version);
 
@@ -455,24 +447,15 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 				memcpy(&fd->name, spd->name, spd->nsize);
 				fd->name[spd->nsize] = 0;
 
-				raw = alloc_ref_at(c, jeb, je32_to_cpu(spd->offset));
-				if (!raw) {
-					jffs2_free_full_dirent(fd);
-					JFFS2_NOTICE("allocation of node reference failed\n");
-					return -ENOMEM;
-				}
-
 				ic = jffs2_scan_make_ino_cache(c, je32_to_cpu(spd->pino));
 				if (!ic) {
 					jffs2_free_full_dirent(fd);
-					jffs2_free_raw_node_ref(raw);
 					return -ENOMEM;
 				}
 
-				raw->flash_offset |= REF_PRISTINE;
-				jffs2_link_node_ref(c, jeb, raw, PAD(je32_to_cpu(spd->totlen)), ic);
+				fd->raw = sum_link_node_ref(c, jeb,  je32_to_cpu(spd->offset) | REF_PRISTINE,
+							    PAD(je32_to_cpu(spd->totlen)), ic);
 
-				fd->raw = raw;
 				fd->next = NULL;
 				fd->version = je32_to_cpu(spd->version);
 				fd->ino = je32_to_cpu(spd->ino);
@@ -497,15 +480,10 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 					    jeb->offset + je32_to_cpu(spx->offset),
 					    jeb->offset + je32_to_cpu(spx->offset) + je32_to_cpu(spx->totlen),
 					    je32_to_cpu(spx->xid), je32_to_cpu(spx->version));
-				raw = alloc_ref_at(c, jeb, je32_to_cpu(spx->offset));
-				if (!raw) {
-					JFFS2_NOTICE("allocation of node reference failed\n");
-					return -ENOMEM;
-				}
+
 				xd = jffs2_setup_xattr_datum(c, je32_to_cpu(spx->xid),
 								je32_to_cpu(spx->version));
 				if (IS_ERR(xd)) {
-					jffs2_free_raw_node_ref(raw);
 					if (PTR_ERR(xd) == -EEXIST) {
 						/* a newer version of xd exists */
 						if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(spx->totlen))))
@@ -516,12 +494,10 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 					JFFS2_NOTICE("allocation of xattr_datum failed\n");
 					return PTR_ERR(xd);
 				}
-				xd->node = raw;
 
-				raw->flash_offset |= REF_UNCHECKED;
-
-				jffs2_link_node_ref(c, jeb, raw, PAD(je32_to_cpu(spx->totlen)), NULL);
-				/* FIXME */ raw->next_in_ino = (void *)xd;
+				xd->node = sum_link_node_ref(c, jeb, je32_to_cpu(spx->offset) | REF_UNCHECKED,
+							     PAD(je32_to_cpu(spx->totlen)), NULL);
+				/* FIXME */ xd->node->next_in_ino = (void *)xd;
 
 				*pseudo_random += je32_to_cpu(spx->xid);
 				sp += JFFS2_SUMMARY_XATTR_SIZE;
@@ -537,29 +513,21 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 					    jeb->offset + je32_to_cpu(spr->offset),
 					    jeb->offset + je32_to_cpu(spr->offset) + PAD(sizeof(struct jffs2_raw_xref)));
 
-				raw = alloc_ref_at(c, jeb, je32_to_cpu(spr->offset));
-				if (!raw) {
-					JFFS2_NOTICE("allocation of node reference failed\n");
-					return -ENOMEM;
-				}
 				ref = jffs2_alloc_xattr_ref();
 				if (!ref) {
 					JFFS2_NOTICE("allocation of xattr_datum failed\n");
-					jffs2_free_raw_node_ref(raw);
 					return -ENOMEM;
 				}
 				ref->ino = 0xfffffffe;
 				ref->xid = 0xfffffffd;
-				ref->node = raw;
 				ref->next = c->xref_temp;
 				c->xref_temp = ref;
 
-				raw->flash_offset |= REF_UNCHECKED;
+				ref->node = sum_link_node_ref(c, jeb, je32_to_cpu(spr->offset) | REF_UNCHECKED,
+							      PAD(sizeof(struct jffs2_raw_xref)), NULL);
+				/* FIXME */ ref->node->next_in_ino = (void *)ref;
 
-				jffs2_link_node_ref(c, jeb, raw, PAD(sizeof(struct jffs2_raw_xref)), NULL);
-				/* FIXME */ raw->next_in_ino = (void *)ref;
-
-				*pseudo_random += raw->flash_offset;
+				*pseudo_random += ref->node->flash_offset;
 				sp += JFFS2_SUMMARY_XREF_SIZE;
 
 				break;
@@ -584,7 +552,6 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 			}
 		}
 	}
-
 	return 0;
 }
 
@@ -594,7 +561,6 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 			   uint32_t *pseudo_random)
 {
 	struct jffs2_unknown_node crcnode;
-	struct jffs2_raw_node_ref *cache_ref;
 	int ret, ofs;
 	uint32_t crc;
 	int err;
@@ -650,16 +616,8 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 			if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(summary->cln_mkr)))))
 				return err;
 		} else {
-			struct jffs2_raw_node_ref *marker_ref = jffs2_alloc_raw_node_ref();
-
-			if (!marker_ref) {
-				JFFS2_NOTICE("Failed to allocate node ref for clean marker\n");
-				return -ENOMEM;
-			}
-
-			marker_ref->flash_offset = jeb->offset | REF_NORMAL;
-
-			jffs2_link_node_ref(c, jeb, marker_ref, je32_to_cpu(summary->cln_mkr), NULL);
+			jffs2_link_node_ref(c, jeb, jeb->offset | REF_NORMAL,
+					    je32_to_cpu(summary->cln_mkr), NULL);
 		}
 	}
 
@@ -672,16 +630,11 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 		return ret;		/* real error */
 
 	/* for PARANOIA_CHECK */
-	cache_ref = alloc_ref_at(c, jeb, ofs);
-
-	if (!cache_ref) {
-		JFFS2_NOTICE("Failed to allocate node ref for cache\n");
-		return -ENOMEM;
-	}
-
-	cache_ref->flash_offset |= REF_NORMAL;
+	ret = jffs2_prealloc_raw_node_refs(c, 1);
+	if (ret)
+		return ret;
 
-	jffs2_link_node_ref(c, jeb, cache_ref, sumsize, NULL);
+	jffs2_link_node_ref(c, jeb, (jeb->offset + ofs) | REF_NORMAL, sumsize, NULL);
 
 	if (unlikely(jeb->free_size)) {
 		JFFS2_WARNING("Free size 0x%x bytes in eraseblock @0x%08x with summary?\n",
@@ -709,6 +662,7 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
 	union jffs2_sum_mem *temp;
 	struct jffs2_sum_marker *sm;
 	struct kvec vecs[2];
+	uint32_t sum_ofs;
 	void *wpage;
 	int ret;
 	size_t retlen;
@@ -821,36 +775,31 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
 	vecs[1].iov_base = c->summary->sum_buf;
 	vecs[1].iov_len = datasize;
 
+	sum_ofs = jeb->offset + c->sector_size - jeb->free_size;
+
 	dbg_summary("JFFS2: writing out data to flash to pos : 0x%08x\n",
-			jeb->offset + c->sector_size - jeb->free_size);
+		    sum_ofs);
 
-	spin_unlock(&c->erase_completion_lock);
-	ret = jffs2_flash_writev(c, vecs, 2, jeb->offset + c->sector_size -
-				jeb->free_size, &retlen, 0);
+	ret = jffs2_flash_writev(c, vecs, 2, sum_ofs, &retlen, 0);
 
 	if (ret || (retlen != infosize)) {
-		struct jffs2_raw_node_ref *ref;
 
 		JFFS2_WARNING("Write of %u bytes at 0x%08x failed. returned %d, retlen %zd\n",
-			infosize, jeb->offset + c->sector_size - jeb->free_size, ret, retlen);
+			      infosize, sum_ofs, ret, retlen);
 
 		/* Waste remaining space */
-		ref = jffs2_alloc_raw_node_ref();
-		if (ref) {
-			spin_lock(&c->erase_completion_lock);
-
-			ref->flash_offset = jeb->offset + c->sector_size - jeb->free_size;
-			ref->flash_offset |= REF_OBSOLETE;
-
-			jffs2_link_node_ref(c, jeb, ref, c->sector_size - jeb->free_size, NULL);
-		}
+		spin_lock(&c->erase_completion_lock);
+		jffs2_link_node_ref(c, jeb, sum_ofs | REF_OBSOLETE, infosize, NULL);
+		spin_unlock(&c->erase_completion_lock);
 
 		c->summary->sum_size = JFFS2_SUMMARY_NOSUM_SIZE;
 
-		return 1;
+		return 0;
 	}
 
 	spin_lock(&c->erase_completion_lock);
+	jffs2_link_node_ref(c, jeb, sum_ofs | REF_NORMAL, infosize, NULL);
+	spin_unlock(&c->erase_completion_lock);
 
 	return 0;
 }
@@ -859,12 +808,15 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
 
 int jffs2_sum_write_sumnode(struct jffs2_sb_info *c)
 {
-	struct jffs2_raw_node_ref *summary_ref;
-	int datasize, infosize, padsize, ret;
+	int datasize, infosize, padsize;
 	struct jffs2_eraseblock *jeb;
+	int ret;
 
 	dbg_summary("called\n");
 
+	spin_unlock(&c->erase_completion_lock);
+	jffs2_prealloc_raw_node_refs(c, 1);
+
 	jeb = c->nextblock;
 
 	if (!c->summary->sum_num || !c->summary->sum_list_head) {
@@ -888,22 +840,6 @@ int jffs2_sum_write_sumnode(struct jffs2_sb_info *c)
 	}
 
 	ret = jffs2_sum_write_data(c, jeb, infosize, datasize, padsize);
-	if (ret)
-		return 0; /* can't write out summary, block is marked as NOSUM_SIZE */
-
-	/* for ACCT_PARANOIA_CHECK */
-	spin_unlock(&c->erase_completion_lock);
-	summary_ref = jffs2_alloc_raw_node_ref();
-
-	if (!summary_ref) {
-		JFFS2_NOTICE("Failed to allocate node ref for summary\n");
-		return -ENOMEM;
-	}
-
-	summary_ref->flash_offset = (jeb->offset + c->sector_size - jeb->free_size) | REF_NORMAL;
-
 	spin_lock(&c->erase_completion_lock);
-	jffs2_link_node_ref(c, jeb, summary_ref, infosize, NULL);
-
-	return 0;
+	return ret;
 }
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 916c87d3393..0d7abb26048 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -179,6 +179,9 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
 	unsigned char *buf;
 	uint32_t start, end, ofs, len;
 
+	if (jffs2_prealloc_raw_node_refs(c, c->reserved_refs + 1))
+		return;
+
 	spin_lock(&c->erase_completion_lock);
 
 	jeb = &c->blocks[c->wbuf_ofs / c->sector_size];
@@ -306,17 +309,9 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
 			printk(KERN_CRIT "Recovery of wbuf failed due to a second write error\n");
 			kfree(buf);
 
-			if (retlen) {
-				struct jffs2_raw_node_ref *raw2;
-
-				raw2 = jffs2_alloc_raw_node_ref();
-				if (!raw2)
-					return;
-
-				raw2->flash_offset = ofs | REF_OBSOLETE;
+			if (retlen)
+				jffs2_add_physical_node_ref(c, ofs | REF_OBSOLETE, ref_totlen(c, jeb, *first_raw), NULL);
 
-				jffs2_add_physical_node_ref(c, raw2, ref_totlen(c, jeb, *first_raw), NULL);
-			}
 			return;
 		}
 		printk(KERN_NOTICE "Recovery of wbuf succeeded to %08x\n", ofs);
@@ -428,6 +423,9 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
 	if (!c->wbuf_len)	/* already checked c->wbuf above */
 		return 0;
 
+	if (jffs2_prealloc_raw_node_refs(c, c->reserved_refs + 1))
+		return -ENOMEM;
+
 	/* claim remaining space on the page
 	   this happens, if we have a change to a new block,
 	   or if fsync forces us to flush the writebuffer.
@@ -485,7 +483,6 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
 	/* Adjust free size of the block if we padded. */
 	if (pad) {
 		struct jffs2_eraseblock *jeb;
-		struct jffs2_raw_node_ref *ref;
 		uint32_t waste = c->wbuf_pagesize - c->wbuf_len;
 
 		jeb = &c->blocks[c->wbuf_ofs / c->sector_size];
@@ -503,15 +500,10 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
 			       jeb->offset, jeb->free_size);
 			BUG();
 		}
-		ref = jffs2_alloc_raw_node_ref();
-		if (!ref)
-			return -ENOMEM;
-		ref->flash_offset = c->wbuf_ofs + c->wbuf_len;
-		ref->flash_offset |= REF_OBSOLETE;
 
 		spin_lock(&c->erase_completion_lock);
 
-		jffs2_link_node_ref(c, jeb, ref, waste, NULL);
+		jffs2_link_node_ref(c, jeb, (c->wbuf_ofs + c->wbuf_len) | REF_OBSOLETE, waste, NULL);
 		/* FIXME: that made it count as dirty. Convert to wasted */
 		jeb->dirty_size -= waste;
 		c->dirty_size -= waste;
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index 0e12b7561b7..67176792e13 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -61,7 +61,6 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
 					   uint32_t datalen, int alloc_mode)
 
 {
-	struct jffs2_raw_node_ref *raw;
 	struct jffs2_full_dnode *fn;
 	size_t retlen;
 	uint32_t flash_ofs;
@@ -83,27 +82,16 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
 	if (je32_to_cpu(ri->totlen) != sizeof(*ri) + datalen) {
 		printk(KERN_WARNING "jffs2_write_dnode: ri->totlen (0x%08x) != sizeof(*ri) (0x%08zx) + datalen (0x%08x)\n", je32_to_cpu(ri->totlen), sizeof(*ri), datalen);
 	}
-	raw = jffs2_alloc_raw_node_ref();
-	if (!raw)
-		return ERR_PTR(-ENOMEM);
 
 	fn = jffs2_alloc_full_dnode();
-	if (!fn) {
-		jffs2_free_raw_node_ref(raw);
+	if (!fn)
 		return ERR_PTR(-ENOMEM);
-	}
-
-	fn->ofs = je32_to_cpu(ri->offset);
-	fn->size = je32_to_cpu(ri->dsize);
-	fn->frags = 0;
 
 	/* check number of valid vecs */
 	if (!datalen || !data)
 		cnt = 1;
  retry:
-	fn->raw = raw;
-
-	raw->flash_offset = flash_ofs = write_ofs(c);
+	flash_ofs = write_ofs(c);
 
 	jffs2_dbg_prewrite_paranoia_check(c, flash_ofs, vecs[0].iov_len + vecs[1].iov_len);
 
@@ -130,14 +118,11 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
 			   seem corrupted, in which case the scan would skip over
 			   any node we write before the original intended end of
 			   this node */
-			raw->flash_offset |= REF_OBSOLETE;
-			jffs2_add_physical_node_ref(c, raw, PAD(sizeof(*ri)+datalen), NULL);
-			jffs2_mark_node_obsolete(c, raw);
+			jffs2_add_physical_node_ref(c, flash_ofs | REF_OBSOLETE, PAD(sizeof(*ri)+datalen), NULL);
 		} else {
-			printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", raw->flash_offset);
-			jffs2_free_raw_node_ref(raw);
+			printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", flash_ofs);
 		}
-		if (!retried && alloc_mode != ALLOC_NORETRY && (raw = jffs2_alloc_raw_node_ref())) {
+		if (!retried && alloc_mode != ALLOC_NORETRY) {
 			/* Try to reallocate space and retry */
 			uint32_t dummy;
 			struct jffs2_eraseblock *jeb = &c->blocks[flash_ofs / c->sector_size];
@@ -172,7 +157,6 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
 				goto retry;
 			}
 			D1(printk(KERN_DEBUG "Failed to allocate space to retry failed write: %d!\n", ret));
-			jffs2_free_raw_node_ref(raw);
 		}
 		/* Release the full_dnode which is now useless, and return */
 		jffs2_free_full_dnode(fn);
@@ -186,14 +170,17 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
 	if ((je32_to_cpu(ri->dsize) >= PAGE_CACHE_SIZE) ||
 	    ( ((je32_to_cpu(ri->offset)&(PAGE_CACHE_SIZE-1))==0) &&
 	      (je32_to_cpu(ri->dsize)+je32_to_cpu(ri->offset) ==  je32_to_cpu(ri->isize)))) {
-		raw->flash_offset |= REF_PRISTINE;
+		flash_ofs |= REF_PRISTINE;
 	} else {
-		raw->flash_offset |= REF_NORMAL;
+		flash_ofs |= REF_NORMAL;
 	}
-	jffs2_add_physical_node_ref(c, raw, PAD(sizeof(*ri)+datalen), f->inocache);
+	fn->raw = jffs2_add_physical_node_ref(c, flash_ofs, PAD(sizeof(*ri)+datalen), f->inocache);
+	fn->ofs = je32_to_cpu(ri->offset);
+	fn->size = je32_to_cpu(ri->dsize);
+	fn->frags = 0;
 
 	D1(printk(KERN_DEBUG "jffs2_write_dnode wrote node at 0x%08x(%d) with dsize 0x%x, csize 0x%x, node_crc 0x%08x, data_crc 0x%08x, totlen 0x%08x\n",
-		  flash_ofs, ref_flags(raw), je32_to_cpu(ri->dsize),
+		  flash_ofs & ~3, flash_ofs & 3, je32_to_cpu(ri->dsize),
 		  je32_to_cpu(ri->csize), je32_to_cpu(ri->node_crc),
 		  je32_to_cpu(ri->data_crc), je32_to_cpu(ri->totlen)));
 
@@ -208,11 +195,10 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff
 					     struct jffs2_raw_dirent *rd, const unsigned char *name,
 					     uint32_t namelen, int alloc_mode)
 {
-	struct jffs2_raw_node_ref *raw;
 	struct jffs2_full_dirent *fd;
 	size_t retlen;
 	struct kvec vecs[2];
-	uint32_t flash_ofs = write_ofs(c);
+	uint32_t flash_ofs;
 	int retried = 0;
 	int ret;
 
@@ -223,26 +209,16 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff
 	D1(if(je32_to_cpu(rd->hdr_crc) != crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)) {
 		printk(KERN_CRIT "Eep. CRC not correct in jffs2_write_dirent()\n");
 		BUG();
-	}
-	   );
+	   });
 
 	vecs[0].iov_base = rd;
 	vecs[0].iov_len = sizeof(*rd);
 	vecs[1].iov_base = (unsigned char *)name;
 	vecs[1].iov_len = namelen;
 
-	jffs2_dbg_prewrite_paranoia_check(c, flash_ofs, vecs[0].iov_len + vecs[1].iov_len);
-
-	raw = jffs2_alloc_raw_node_ref();
-
-	if (!raw)
-		return ERR_PTR(-ENOMEM);
-
 	fd = jffs2_alloc_full_dirent(namelen+1);
-	if (!fd) {
-		jffs2_free_raw_node_ref(raw);
+	if (!fd)
 		return ERR_PTR(-ENOMEM);
-	}
 
 	fd->version = je32_to_cpu(rd->version);
 	fd->ino = je32_to_cpu(rd->ino);
@@ -252,9 +228,9 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff
 	fd->name[namelen]=0;
 
  retry:
-	fd->raw = raw;
+	flash_ofs = write_ofs(c);
 
-	raw->flash_offset = flash_ofs;
+	jffs2_dbg_prewrite_paranoia_check(c, flash_ofs, vecs[0].iov_len + vecs[1].iov_len);
 
 	if ((alloc_mode!=ALLOC_GC) && (je32_to_cpu(rd->version) < f->highest_version)) {
 		BUG_ON(!retried);
@@ -273,14 +249,11 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff
 			       sizeof(*rd)+namelen, flash_ofs, ret, retlen);
 		/* Mark the space as dirtied */
 		if (retlen) {
-			raw->flash_offset |= REF_OBSOLETE;
-			jffs2_add_physical_node_ref(c, raw, PAD(sizeof(*rd)+namelen), NULL);
-			jffs2_mark_node_obsolete(c, raw);
+			jffs2_add_physical_node_ref(c, flash_ofs | REF_OBSOLETE, PAD(sizeof(*rd)+namelen), NULL);
 		} else {
-			printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", raw->flash_offset);
-			jffs2_free_raw_node_ref(raw);
+			printk(KERN_NOTICE "Not marking the space at 0x%08x as dirty because the flash driver returned retlen zero\n", flash_ofs);
 		}
-		if (!retried && (raw = jffs2_alloc_raw_node_ref())) {
+		if (!retried) {
 			/* Try to reallocate space and retry */
 			uint32_t dummy;
 			struct jffs2_eraseblock *jeb = &c->blocks[flash_ofs / c->sector_size];
@@ -313,15 +286,13 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff
 				goto retry;
 			}
 			D1(printk(KERN_DEBUG "Failed to allocate space to retry failed write: %d!\n", ret));
-			jffs2_free_raw_node_ref(raw);
 		}
 		/* Release the full_dnode which is now useless, and return */
 		jffs2_free_full_dirent(fd);
 		return ERR_PTR(ret?ret:-EIO);
 	}
 	/* Mark the space used */
-	raw->flash_offset |= REF_PRISTINE;
-	jffs2_add_physical_node_ref(c, raw, PAD(sizeof(*rd)+namelen), f->inocache);
+	fd->raw = jffs2_add_physical_node_ref(c, flash_ofs | REF_PRISTINE, PAD(sizeof(*rd)+namelen), f->inocache);
 
 	if (retried) {
 		jffs2_dbg_acct_sanity_check(c,NULL);
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 008f91b1c17..2255f1367bd 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -304,8 +304,8 @@ static int load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x
 static int save_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
 {
 	/* must be called under down_write(xattr_sem) */
-	struct jffs2_raw_xattr rx;
 	struct jffs2_raw_node_ref *raw;
+	struct jffs2_raw_xattr rx;
 	struct kvec vecs[2];
 	uint32_t length;
 	int rc, totlen;
@@ -319,11 +319,6 @@ static int save_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x
 	vecs[1].iov_len = xd->name_len + 1 + xd->value_len;
 	totlen = vecs[0].iov_len + vecs[1].iov_len;
 
-	raw = jffs2_alloc_raw_node_ref();
-	if (!raw)
-		return -ENOMEM;
-	raw->flash_offset = phys_ofs;
-
 	/* Setup raw-xattr */
 	rx.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
 	rx.nodetype = cpu_to_je16(JFFS2_NODETYPE_XATTR);
@@ -343,19 +338,14 @@ static int save_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x
 		JFFS2_WARNING("jffs2_flash_writev()=%d, req=%u, wrote=%u, at %#08x\n",
 			      rc, totlen, length, phys_ofs);
 		rc = rc ? rc : -EIO;
-		if (length) {
-			raw->flash_offset |= REF_OBSOLETE;
-			jffs2_add_physical_node_ref(c, raw, PAD(totlen), NULL);
-			jffs2_mark_node_obsolete(c, raw);
-		} else {
-			jffs2_free_raw_node_ref(raw);
-		}
+		if (length)
+			jffs2_add_physical_node_ref(c, phys_ofs | REF_OBSOLETE, PAD(totlen), NULL);
+
 		return rc;
 	}
 
 	/* success */
-	raw->flash_offset |= REF_PRISTINE;
-	jffs2_add_physical_node_ref(c, raw, PAD(totlen), NULL);
+	raw = jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, PAD(totlen), NULL);
 	/* FIXME */ raw->next_in_ino = (void *)xd;
 
 	if (xd->node)
@@ -563,11 +553,6 @@ static int save_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
 	uint32_t phys_ofs = write_ofs(c);
 	int ret;
 
-	raw = jffs2_alloc_raw_node_ref();
-	if (!raw)
-		return -ENOMEM;
-	raw->flash_offset = phys_ofs;
-
 	rr.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
 	rr.nodetype = cpu_to_je16(JFFS2_NODETYPE_XREF);
 	rr.totlen = cpu_to_je32(PAD(sizeof(rr)));
@@ -582,18 +567,13 @@ static int save_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
 		JFFS2_WARNING("jffs2_flash_write() returned %d, request=%u, retlen=%u, at %#08x\n",
 			      ret, sizeof(rr), length, phys_ofs);
 		ret = ret ? ret : -EIO;
-		if (length) {
-			raw->flash_offset |= REF_OBSOLETE;
-			jffs2_add_physical_node_ref(c, raw, PAD(sizeof(rr)), NULL);
-			jffs2_mark_node_obsolete(c, raw);
-		} else {
-			jffs2_free_raw_node_ref(raw);
-		}
+		if (length)
+			jffs2_add_physical_node_ref(c, phys_ofs | REF_OBSOLETE, PAD(sizeof(rr)), NULL);
+
 		return ret;
 	}
-	raw->flash_offset |= REF_PRISTINE;
 
-	jffs2_add_physical_node_ref(c, raw, PAD(sizeof(rr)), NULL);
+	raw = jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, PAD(sizeof(rr)), NULL);
 	/* FIXME */ raw->next_in_ino = (void *)ref;
 	if (ref->node)
 		delete_xattr_ref_node(c, ref);
-- 
cgit v1.2.3


From 99988f7bbd16b861590dda4631c4db6cb17b5091 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Wed, 24 May 2006 09:04:17 +0100
Subject: [JFFS2] Introduce ref_next() macro for finding next physical node

Another part of the preparation for switching to an array...

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/debug.c    | 14 +++++++-------
 fs/jffs2/gc.c       |  2 +-
 fs/jffs2/nodelist.c | 13 +++++++------
 fs/jffs2/nodelist.h |  6 +++---
 fs/jffs2/nodemgmt.c | 22 +++++++++++-----------
 fs/jffs2/scan.c     |  4 ++--
 6 files changed, 31 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c
index 1fe17de713e..72b4fc13a10 100644
--- a/fs/jffs2/debug.c
+++ b/fs/jffs2/debug.c
@@ -192,13 +192,13 @@ __jffs2_dbg_acct_paranoia_check_nolock(struct jffs2_sb_info *c,
 		else
 			my_dirty_size += totlen;
 
-		if ((!ref2->next_phys) != (ref2 == jeb->last_node)) {
-			JFFS2_ERROR("node_ref for node at %#08x (mem %p) has next_phys at %#08x (mem %p), last_node is at %#08x (mem %p).\n",
-				ref_offset(ref2), ref2, ref_offset(ref2->next_phys), ref2->next_phys,
-				ref_offset(jeb->last_node), jeb->last_node);
+		if ((!ref_next(ref2)) != (ref2 == jeb->last_node)) {
+			JFFS2_ERROR("node_ref for node at %#08x (mem %p) has next at %#08x (mem %p), last_node is at %#08x (mem %p).\n",
+				    ref_offset(ref2), ref2, ref_offset(ref_next(ref2)), ref_next(ref2),
+				    ref_offset(jeb->last_node), jeb->last_node);
 			goto error;
 		}
-		ref2 = ref2->next_phys;
+		ref2 = ref_next(ref2);
 	}
 
 	if (my_used_size != jeb->used_size) {
@@ -268,9 +268,9 @@ __jffs2_dbg_dump_node_refs_nolock(struct jffs2_sb_info *c,
 	}
 
 	printk(JFFS2_DBG);
-	for (ref = jeb->first_node; ; ref = ref->next_phys) {
+	for (ref = jeb->first_node; ; ref = ref_next(ref)) {
 		printk("%#08x(%#x)", ref_offset(ref), ref->__totlen);
-		if (ref->next_phys)
+		if (ref_next(ref))
 			printk("->");
 		else
 			break;
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index a22ff5df7fc..477c526d638 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -239,7 +239,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
 
 	while(ref_obsolete(raw)) {
 		D1(printk(KERN_DEBUG "Node at 0x%08x is obsolete... skipping\n", ref_offset(raw)));
-		raw = raw->next_phys;
+		raw = ref_next(raw);
 		if (unlikely(!raw)) {
 			printk(KERN_WARNING "eep. End of raw list while still supposedly nodes to GC\n");
 			printk(KERN_WARNING "erase block at 0x%08x. free_size 0x%08x, dirty_size 0x%08x, used_size 0x%08x\n",
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 1e6eabd730f..0e82979c741 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -1159,9 +1159,10 @@ static inline uint32_t __ref_totlen(struct jffs2_sb_info *c,
 				    struct jffs2_raw_node_ref *ref)
 {
 	uint32_t ref_end;
+	struct jffs2_raw_node_ref *next_ref = ref_next(ref);
 
-	if (ref->next_phys)
-		ref_end = ref_offset(ref->next_phys);
+	if (next_ref)
+		ref_end = ref_offset(next_ref);
 	else {
 		if (!jeb)
 			jeb = &c->blocks[ref->flash_offset / c->sector_size];
@@ -1196,11 +1197,11 @@ uint32_t __jffs2_ref_totlen(struct jffs2_sb_info *c, struct jffs2_eraseblock *je
 		printk(KERN_CRIT "Totlen for ref at %p (0x%08x-0x%08x) miscalculated as 0x%x instead of %x\n",
 		       ref, ref_offset(ref), ref_offset(ref)+ref->__totlen,
 		       ret, ref->__totlen);
-		if (ref->next_phys) {
-			printk(KERN_CRIT "next_phys %p (0x%08x-0x%08x)\n", ref->next_phys, ref_offset(ref->next_phys),
-			       ref_offset(ref->next_phys)+ref->__totlen);
+		if (ref_next(ref)) {
+			printk(KERN_CRIT "next %p (0x%08x-0x%08x)\n", ref_next(ref), ref_offset(ref_next(ref)),
+			       ref_offset(ref_next(ref))+ref->__totlen);
 		} else 
-			printk(KERN_CRIT "No next_phys. jeb->last_node is %p\n", jeb->last_node);
+			printk(KERN_CRIT "No next ref. jeb->last_node is %p\n", jeb->last_node);
 
 		printk(KERN_CRIT "jeb->wasted_size %x, dirty_size %x, used_size %x, free_size %x\n", jeb->wasted_size, jeb->dirty_size, jeb->used_size, jeb->free_size);
 		ret = ref->__totlen;
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 7cc74d2ab4d..94d152de95e 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -88,18 +88,18 @@ struct jffs2_raw_node_ref
 #endif
 };
 
+#define ref_next(r) ((r)->next_phys)
+
 static inline struct jffs2_inode_cache *jffs2_raw_ref_to_ic(struct jffs2_raw_node_ref *raw)
 {
-	while(raw->next_in_ino) {
+	while(raw->next_in_ino)
 		raw = raw->next_in_ino;
-	}
 
 	/* NB. This can be a jffs2_xattr_datum or jffs2_xattr_ref and
 	   not actually a jffs2_inode_cache. Check ->class */
 	return ((struct jffs2_inode_cache *)raw);
 }
 
-
         /* flash_offset & 3 always has to be zero, because nodes are
 	   always aligned at 4 bytes. So we have a couple of extra bits
 	   to play with, which indicate the node's status; see below: */
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 01bf2773fe4..f4649c275fb 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -458,6 +458,7 @@ static inline int on_list(struct list_head *obj, struct list_head *head)
 void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref)
 {
 	struct jffs2_eraseblock *jeb;
+	struct jffs2_raw_node_ref *next_ref;
 	int blocknr;
 	struct jffs2_unknown_node n;
 	int ret, addedsize;
@@ -685,24 +686,23 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 
 	/* Merge with the next node in the physical list, if there is one
 	   and if it's also obsolete and if it doesn't belong to any inode */
-	if (ref->next_phys && ref_obsolete(ref->next_phys) &&
-	    !ref->next_phys->next_in_ino) {
-		struct jffs2_raw_node_ref *n = ref->next_phys;
+	next_ref = ref_next(ref);
 
+	if (next_ref && ref_obsolete(next_ref) && !next_ref->next_in_ino) {
 		spin_lock(&c->erase_completion_lock);
 
 #ifdef TEST_TOTLEN
-		ref->__totlen += n->__totlen;
+		ref->__totlen += next_ref->__totlen;
 #endif
-		ref->next_phys = n->next_phys;
-                if (jeb->last_node == n) jeb->last_node = ref;
-		if (jeb->gc_node == n) {
+		ref->next_phys = ref_next(next_ref);
+                if (jeb->last_node == next_ref) jeb->last_node = ref;
+		if (jeb->gc_node == next_ref) {
 			/* gc will be happy continuing gc on this node */
 			jeb->gc_node=ref;
 		}
 		spin_unlock(&c->erase_completion_lock);
 
-		__jffs2_free_raw_node_ref(n);
+		__jffs2_free_raw_node_ref(next_ref);
 	}
 
 	/* Also merge with the previous node in the list, if there is one
@@ -712,8 +712,8 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 
 		spin_lock(&c->erase_completion_lock);
 
-		while (p->next_phys != ref)
-			p = p->next_phys;
+		while ((next_ref = ref_next(ref)) != ref)
+			p = next_ref;
 
 		if (ref_obsolete(p) && !ref->next_in_ino) {
 #ifdef TEST_TOTLEN
@@ -726,7 +726,7 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 				/* gc will be happy continuing gc on this node */
 				jeb->gc_node=p;
 			}
-			p->next_phys = ref->next_phys;
+			p->next_phys = ref_next(ref);
 			__jffs2_free_raw_node_ref(ref);
 		}
 		spin_unlock(&c->erase_completion_lock);
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 87b0a416b6a..3551c39d747 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -295,7 +295,7 @@ int jffs2_fill_scan_buf (struct jffs2_sb_info *c, void *buf,
 int jffs2_scan_classify_jeb(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
 {
 	if ((jeb->used_size + jeb->unchecked_size) == PAD(c->cleanmarker_size) && !jeb->dirty_size
-		&& (!jeb->first_node || !jeb->first_node->next_phys) )
+	    && (!jeb->first_node || !ref_next(jeb->first_node)) )
 		return BLK_STATE_CLEANMARKER;
 
 	/* move blocks with max 4 byte dirty space to cleanlist */
@@ -647,7 +647,7 @@ scan_more:
 			/* If we're only checking the beginning of a block with a cleanmarker,
 			   bail now */
 			if (buf_ofs == jeb->offset && jeb->used_size == PAD(c->cleanmarker_size) &&
-			    c->cleanmarker_size && !jeb->dirty_size && !jeb->first_node->next_phys) {
+			    c->cleanmarker_size && !jeb->dirty_size && !ref_next(jeb->first_node)) {
 				D1(printk(KERN_DEBUG "%d bytes at start of block seems clean... assuming all clean\n", EMPTY_SCAN_SIZE(c->sector_size)));
 				return BLK_STATE_CLEANMARKER;
 			}
-- 
cgit v1.2.3


From c7c16c8e7604a8974d48562fecf8d9c7e221fced Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Wed, 24 May 2006 14:24:02 +0100
Subject: [JFFS2] Revert Artem's Bunkage in debug messages.

Random unthinking 'cleanup' caused debug messages like this:
   Obsoleting node at 0x0006daf4 of len 0x3a4: <7>Dirtying

If messages are continuation of an existing line, they don't need
to be prefixed with KERN_DEBUG.

THINK. Or you will be replaced by a small shell script.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/nodemgmt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index f4649c275fb..8264eab2c89 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -517,7 +517,7 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 
 	// Take care, that wasted size is taken into concern
 	if ((jeb->dirty_size || ISDIRTY(jeb->wasted_size + freed_len)) && jeb != c->nextblock) {
-		D1(printk(KERN_DEBUG "Dirtying\n"));
+		D1(printk("Dirtying\n"));
 		addedsize = freed_len;
 		jeb->dirty_size += freed_len;
 		c->dirty_size += freed_len;
@@ -539,7 +539,7 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 			}
 		}
 	} else {
-		D1(printk(KERN_DEBUG "Wasting\n"));
+		D1(printk("Wasting\n"));
 		addedsize = 0;
 		jeb->wasted_size += freed_len;
 		c->wasted_size += freed_len;
-- 
cgit v1.2.3


From f560928baa605e8faaa3de6cc1b11ebb645e97db Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Thu, 25 May 2006 01:37:28 +0100
Subject: [JFFS2] Allocate node_ref for wasted space when skipping to page
 boundary

One more place where we were changing the accounting info without
actually allocating a ref for the lost space...

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/scan.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 3551c39d747..c25750aa85e 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -244,11 +244,8 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
 
 		D1(printk(KERN_DEBUG "jffs2_scan_medium(): Skipping %d bytes in nextblock to ensure page alignment\n",
 			  skip));
-		c->nextblock->wasted_size += skip;
-		c->wasted_size += skip;
-
-		c->nextblock->free_size -= skip;
-		c->free_size -= skip;
+		jffs2_prealloc_raw_node_refs(c, 1);
+		jffs2_scan_dirty_space(c, c->nextblock, skip);
 	}
 #endif
 	if (c->nr_erasing_blocks) {
-- 
cgit v1.2.3


From c38c1b613d742b5fa075071568f44dc8ec9f1cb8 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Thu, 25 May 2006 01:38:27 +0100
Subject: [JFFS2] jffs2_free_all_node_refs() doesn't free them all. Rename it.

... to jffs2_free_jeb_node_refs() since that's what it does.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/erase.c    | 4 ++--
 fs/jffs2/nodelist.h | 2 +-
 fs/jffs2/nodemgmt.c | 2 +-
 fs/jffs2/summary.c  | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index f939f908b94..aea58ee64d2 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -135,7 +135,7 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count)
 			c->used_size -= jeb->used_size;
 			c->dirty_size -= jeb->dirty_size;
 			jeb->wasted_size = jeb->used_size = jeb->dirty_size = jeb->free_size = 0;
-			jffs2_free_all_node_refs(c, jeb);
+			jffs2_free_jeb_node_refs(c, jeb);
 			list_add(&jeb->list, &c->erasing_list);
 			spin_unlock(&c->erase_completion_lock);
 
@@ -283,7 +283,7 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c,
 		jffs2_del_ino_cache(c, ic);
 }
 
-void jffs2_free_all_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
+void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
 {
 	struct jffs2_raw_node_ref *ref;
 	D1(printk(KERN_DEBUG "Freeing all node refs for eraseblock offset 0x%08x\n", jeb->offset));
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 94d152de95e..21b0b7b6c75 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -417,7 +417,7 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c);
 
 /* erase.c */
 void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count);
-void jffs2_free_all_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
+void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb);
 
 #ifdef CONFIG_JFFS2_FS_WRITEBUFFER
 /* wbuf.c */
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 8264eab2c89..bbc873e2080 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -618,7 +618,7 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 	/* The erase_free_sem is locked, and has been since before we marked the node obsolete
 	   and potentially put its eraseblock onto the erase_pending_list. Thus, we know that
 	   the block hasn't _already_ been erased, and that 'ref' itself hasn't been freed yet
-	   by jffs2_free_all_node_refs() in erase.c. Which is nice. */
+	   by jffs2_free_jeb_node_refs() in erase.c. Which is nice. */
 
 	D1(printk(KERN_DEBUG "obliterating obsoleted node at 0x%08x\n", ref_offset(ref)));
 	ret = jffs2_flash_read(c, ref_offset(ref), sizeof(n), &retlen, (char *)&n);
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index ccb6803a6e4..7bddd33f866 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -547,7 +547,7 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 				jeb->wasted_size = jeb->used_size = jeb->dirty_size = 0;
 				jeb->free_size = c->sector_size;
 
-				jffs2_free_all_node_refs(c, jeb);
+				jffs2_free_jeb_node_refs(c, jeb);
 				return -ENOTRECOVERABLE;
 			}
 		}
-- 
cgit v1.2.3


From f61579c33736476e41e296a16c0d4ead4b953187 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Thu, 25 May 2006 01:42:40 +0100
Subject: [JFFS2] Correctly handle wasted space before summary node.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/summary.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 7bddd33f866..0c66d16eaa2 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -630,11 +630,11 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 		return ret;		/* real error */
 
 	/* for PARANOIA_CHECK */
-	ret = jffs2_prealloc_raw_node_refs(c, 1);
+	ret = jffs2_prealloc_raw_node_refs(c, 2);
 	if (ret)
 		return ret;
 
-	jffs2_link_node_ref(c, jeb, (jeb->offset + ofs) | REF_NORMAL, sumsize, NULL);
+	sum_link_node_ref(c, jeb, ofs | REF_NORMAL, sumsize, NULL);
 
 	if (unlikely(jeb->free_size)) {
 		JFFS2_WARNING("Free size 0x%x bytes in eraseblock @0x%08x with summary?\n",
-- 
cgit v1.2.3


From 046b8b9808127d63326a33bc6298c90eaee90eeb Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Thu, 25 May 2006 01:50:35 +0100
Subject: [JFFS2] Add 'jeb' argument to jffs2_prealloc_raw_node_refs()

Preallocation of refs is shortly going to be a per-eraseblock thing,
rather than per-filesystem. Add the required argument to the function.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/erase.c    | 2 +-
 fs/jffs2/malloc.c   | 3 ++-
 fs/jffs2/nodelist.h | 3 ++-
 fs/jffs2/nodemgmt.c | 4 ++--
 fs/jffs2/scan.c     | 4 ++--
 fs/jffs2/summary.c  | 6 +++---
 fs/jffs2/wbuf.c     | 8 ++++----
 7 files changed, 16 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index aea58ee64d2..c8386b25683 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -383,7 +383,7 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb
 			.totlen =	cpu_to_je32(c->cleanmarker_size)
 		};
 
-		jffs2_prealloc_raw_node_refs(c, 1);
+		jffs2_prealloc_raw_node_refs(c, jeb, 1);
 
 		marker.hdr_crc = cpu_to_je32(crc32(0, &marker, sizeof(struct jffs2_unknown_node)-4));
 
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index 3df3250314a..171483ef0e4 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -190,7 +190,8 @@ void jffs2_free_tmp_dnode_info(struct jffs2_tmp_dnode_info *x)
 	kmem_cache_free(tmp_dnode_info_slab, x);
 }
 
-int jffs2_prealloc_raw_node_refs(struct jffs2_sb_info *c, int nr)
+int jffs2_prealloc_raw_node_refs(struct jffs2_sb_info *c,
+				 struct jffs2_eraseblock *jeb, int nr)
 {
 	struct jffs2_raw_node_ref *p = c->refs;
 
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 21b0b7b6c75..6c92dc46fe9 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -378,7 +378,8 @@ struct jffs2_raw_inode *jffs2_alloc_raw_inode(void);
 void jffs2_free_raw_inode(struct jffs2_raw_inode *);
 struct jffs2_tmp_dnode_info *jffs2_alloc_tmp_dnode_info(void);
 void jffs2_free_tmp_dnode_info(struct jffs2_tmp_dnode_info *);
-int jffs2_prealloc_raw_node_refs(struct jffs2_sb_info *c, int nr);
+int jffs2_prealloc_raw_node_refs(struct jffs2_sb_info *c, 
+				 struct jffs2_eraseblock *jeb, int nr);
 struct jffs2_raw_node_ref *__jffs2_alloc_raw_node_ref(void);
 void __jffs2_free_raw_node_ref(struct jffs2_raw_node_ref *);
 struct jffs2_node_frag *jffs2_alloc_node_frag(void);
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index bbc873e2080..73a06d01db4 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -138,7 +138,7 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
 	}
 	spin_unlock(&c->erase_completion_lock);
 	if (!ret)
-		ret = jffs2_prealloc_raw_node_refs(c, 1);
+		ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, 1);
 	if (ret)
 		up(&c->alloc_sem);
 	return ret;
@@ -161,7 +161,7 @@ int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize,
 	}
 	spin_unlock(&c->erase_completion_lock);
 	if (!ret)
-		ret = jffs2_prealloc_raw_node_refs(c, 1);
+		ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, 1);
 
 	return ret;
 }
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index c25750aa85e..5bbd4a36c75 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -244,7 +244,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
 
 		D1(printk(KERN_DEBUG "jffs2_scan_medium(): Skipping %d bytes in nextblock to ensure page alignment\n",
 			  skip));
-		jffs2_prealloc_raw_node_refs(c, 1);
+		jffs2_prealloc_raw_node_refs(c, c->nextblock, 1);
 		jffs2_scan_dirty_space(c, c->nextblock, skip);
 	}
 #endif
@@ -576,7 +576,7 @@ scan_more:
 		jffs2_dbg_acct_paranoia_check_nolock(c, jeb);
 
 		/* Make sure there are node refs available for use */
-		err = jffs2_prealloc_raw_node_refs(c, 2);
+		err = jffs2_prealloc_raw_node_refs(c, jeb, 2);
 		if (err)
 			return err;
 
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 0c66d16eaa2..a60bbcea5b7 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -400,7 +400,7 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 		dbg_summary("processing summary index %d\n", i);
 
 		/* Make sure there's a spare ref for dirty space */
-		err = jffs2_prealloc_raw_node_refs(c, 2);
+		err = jffs2_prealloc_raw_node_refs(c, jeb, 2);
 		if (err)
 			return err;
 
@@ -630,7 +630,7 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 		return ret;		/* real error */
 
 	/* for PARANOIA_CHECK */
-	ret = jffs2_prealloc_raw_node_refs(c, 2);
+	ret = jffs2_prealloc_raw_node_refs(c, jeb, 2);
 	if (ret)
 		return ret;
 
@@ -815,9 +815,9 @@ int jffs2_sum_write_sumnode(struct jffs2_sb_info *c)
 	dbg_summary("called\n");
 
 	spin_unlock(&c->erase_completion_lock);
-	jffs2_prealloc_raw_node_refs(c, 1);
 
 	jeb = c->nextblock;
+	jffs2_prealloc_raw_node_refs(c, jeb, 1);
 
 	if (!c->summary->sum_num || !c->summary->sum_list_head) {
 		JFFS2_WARNING("Empty summary info!!!\n");
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 1871140e1e7..e16e45ea047 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -179,13 +179,13 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
 	unsigned char *buf;
 	uint32_t start, end, ofs, len;
 
-	if (jffs2_prealloc_raw_node_refs(c, c->reserved_refs + 1))
+	jeb = &c->blocks[c->wbuf_ofs / c->sector_size];
+
+	if (jffs2_prealloc_raw_node_refs(c, jeb, c->reserved_refs + 1))
 		return;
 
 	spin_lock(&c->erase_completion_lock);
 
-	jeb = &c->blocks[c->wbuf_ofs / c->sector_size];
-
 	jffs2_block_refile(c, jeb, REFILE_NOTEMPTY);
 
 	/* Find the first node to be recovered, by skipping over every
@@ -417,7 +417,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
 	if (!c->wbuf_len)	/* already checked c->wbuf above */
 		return 0;
 
-	if (jffs2_prealloc_raw_node_refs(c, c->reserved_refs + 1))
+	if (jffs2_prealloc_raw_node_refs(c, c->nextblock, c->reserved_refs + 1))
 		return -ENOMEM;
 
 	/* claim remaining space on the page
-- 
cgit v1.2.3


From 8b9e9fe8c6ee354aa75dc5a33e1575b21aa52084 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Thu, 25 May 2006 01:53:09 +0100
Subject: [JFFS2] Fix and improve debugging output during scan.

Print wasted_size in scanned eraseblocks, print range correctly for
summary dirent and inode entries.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/scan.c    | 6 +++---
 fs/jffs2/summary.c | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 5bbd4a36c75..404ba6e4858 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -881,9 +881,9 @@ scan_more:
 		}
 	}
 
-	D1(printk(KERN_DEBUG "Block at 0x%08x: free 0x%08x, dirty 0x%08x, unchecked 0x%08x, used 0x%08x\n", jeb->offset,
-		  jeb->free_size, jeb->dirty_size, jeb->unchecked_size, jeb->used_size));
-
+	D1(printk(KERN_DEBUG "Block at 0x%08x: free 0x%08x, dirty 0x%08x, unchecked 0x%08x, used 0x%08x, wasted 0x%08x\n",
+		  jeb->offset,jeb->free_size, jeb->dirty_size, jeb->unchecked_size, jeb->used_size, jeb->wasted_size));
+	
 	/* mark_node_obsolete can add to wasted !! */
 	if (jeb->wasted_size) {
 		jeb->dirty_size += jeb->wasted_size;
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index a60bbcea5b7..523a8f330ef 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -413,7 +413,7 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 
 				dbg_summary("Inode at 0x%08x-0x%08x\n",
 					    jeb->offset + je32_to_cpu(spi->offset),
-					    jeb->offset + je32_to_cpu(spi->offset) + je32_to_cpu(spu->totlen));
+					    jeb->offset + je32_to_cpu(spi->offset) + je32_to_cpu(spi->totlen));
 
 				ic = jffs2_scan_make_ino_cache(c, ino);
 				if (!ic) {
@@ -435,7 +435,7 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 				struct jffs2_sum_dirent_flash *spd;
 				spd = sp;
 
-				dbg_summary("Dirent at 0x%08x\n",
+				dbg_summary("Dirent at 0x%08x-0x%08x\n",
 					    jeb->offset + je32_to_cpu(spd->offset),
 					    jeb->offset + je32_to_cpu(spd->offset) + je32_to_cpu(spd->totlen));
 
-- 
cgit v1.2.3


From 89291a9d5b70778e49e2563247c6c7e3efac9b14 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Thu, 25 May 2006 13:30:24 +0100
Subject: [JFFS2] Fix 64-bit size_t problems in XATTR code.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/scan.c  |  2 +-
 fs/jffs2/xattr.c | 28 ++++++++++++++--------------
 2 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 404ba6e4858..3fb0e7e82cf 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -379,7 +379,7 @@ static int jffs2_scan_xref_node(struct jffs2_sb_info *c, struct jffs2_eraseblock
 	}
 
 	if (PAD(sizeof(struct jffs2_raw_xref)) != je32_to_cpu(rr->totlen)) {
-		JFFS2_WARNING("node length mismatch at %#08x, read=%u, calc=%u\n",
+		JFFS2_WARNING("node length mismatch at %#08x, read=%u, calc=%zd\n",
 			      ofs, je32_to_cpu(rr->totlen),
 			      PAD(sizeof(struct jffs2_raw_xref)));
 		if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(rr->totlen))))
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 2255f1367bd..2d82e250be3 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -111,7 +111,7 @@ static void delete_xattr_datum_node(struct jffs2_sb_info *c, struct jffs2_xattr_
 {
 	/* must be called under down_write(xattr_sem) */
 	struct jffs2_raw_xattr rx;
-	uint32_t length;
+	size_t length;
 	int rc;
 
 	if (!xd->node) {
@@ -124,14 +124,14 @@ static void delete_xattr_datum_node(struct jffs2_sb_info *c, struct jffs2_xattr_
 				      sizeof(struct jffs2_unknown_node),
 				      &length, (char *)&rx);
 		if (rc || length != sizeof(struct jffs2_unknown_node)) {
-			JFFS2_ERROR("jffs2_flash_read()=%d, req=%u, read=%u at %#08x\n",
+			JFFS2_ERROR("jffs2_flash_read()=%d, req=%zu, read=%zu at %#08x\n",
 				    rc, sizeof(struct jffs2_unknown_node),
 				    length, ref_offset(xd->node));
 		}
 		rc = jffs2_flash_write(c, ref_offset(xd->node), sizeof(rx),
 				       &length, (char *)&rx);
 		if (rc || length != sizeof(struct jffs2_raw_xattr)) {
-			JFFS2_ERROR("jffs2_flash_write()=%d, req=%u, wrote=%u ar %#08x\n",
+			JFFS2_ERROR("jffs2_flash_write()=%d, req=%zu, wrote=%zu ar %#08x\n",
 				    rc, sizeof(rx), length, ref_offset(xd->node));
 		}
 	}
@@ -169,7 +169,7 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat
 
 	rc = jffs2_flash_read(c, ref_offset(xd->node), sizeof(rx), &readlen, (char *)&rx);
 	if (rc || readlen != sizeof(rx)) {
-		JFFS2_WARNING("jffs2_flash_read()=%d, req=%u, read=%u at %#08x\n",
+		JFFS2_WARNING("jffs2_flash_read()=%d, req=%zu, read=%zu at %#08x\n",
 			      rc, sizeof(rx), readlen, ref_offset(xd->node));
 		return rc ? rc : -EIO;
 	}
@@ -240,7 +240,7 @@ static int do_load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum
 			       length, &readlen, data);
 
 	if (ret || length!=readlen) {
-		JFFS2_WARNING("jffs2_flash_read() returned %d, request=%d, readlen=%d, at %#08x\n",
+		JFFS2_WARNING("jffs2_flash_read() returned %d, request=%d, readlen=%zu, at %#08x\n",
 			      ret, length, readlen, ref_offset(xd->node));
 		kfree(data);
 		return ret ? ret : -EIO;
@@ -307,7 +307,7 @@ static int save_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x
 	struct jffs2_raw_node_ref *raw;
 	struct jffs2_raw_xattr rx;
 	struct kvec vecs[2];
-	uint32_t length;
+	size_t length;
 	int rc, totlen;
 	uint32_t phys_ofs = write_ofs(c);
 
@@ -335,7 +335,7 @@ static int save_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x
 
 	rc = jffs2_flash_writev(c, vecs, 2, phys_ofs, &length, 0);
 	if (rc || totlen != length) {
-		JFFS2_WARNING("jffs2_flash_writev()=%d, req=%u, wrote=%u, at %#08x\n",
+		JFFS2_WARNING("jffs2_flash_writev()=%d, req=%u, wrote=%zu, at %#08x\n",
 			      rc, totlen, length, phys_ofs);
 		rc = rc ? rc : -EIO;
 		if (length)
@@ -459,7 +459,7 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref
 
 	rc = jffs2_flash_read(c, ref_offset(ref->node), sizeof(rr), &readlen, (char *)&rr);
 	if (rc || sizeof(rr) != readlen) {
-		JFFS2_WARNING("jffs2_flash_read()=%d, req=%u, read=%u, at %#08x\n",
+		JFFS2_WARNING("jffs2_flash_read()=%d, req=%zu, read=%zu, at %#08x\n",
 			      rc, sizeof(rr), readlen, ref_offset(ref->node));
 		return rc ? rc : -EIO;
 	}
@@ -475,7 +475,7 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref
 	    || je16_to_cpu(rr.nodetype) != JFFS2_NODETYPE_XREF
 	    || je32_to_cpu(rr.totlen) != PAD(sizeof(rr))) {
 		JFFS2_ERROR("inconsistent xref at %#08x, magic=%#04x/%#04x, "
-			    "nodetype=%#04x/%#04x, totlen=%u/%u\n",
+			    "nodetype=%#04x/%#04x, totlen=%u/%zu\n",
 			    ref_offset(ref->node), je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK,
 			    je16_to_cpu(rr.nodetype), JFFS2_NODETYPE_XREF,
 			    je32_to_cpu(rr.totlen), PAD(sizeof(rr)));
@@ -502,7 +502,7 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref
 static void delete_xattr_ref_node(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
 {
 	struct jffs2_raw_xref rr;
-	uint32_t length;
+	size_t length;
 	int rc;
 
 	if (jffs2_sum_active()) {
@@ -511,14 +511,14 @@ static void delete_xattr_ref_node(struct jffs2_sb_info *c, struct jffs2_xattr_re
 				      sizeof(struct jffs2_unknown_node),
 				      &length, (char *)&rr);
 		if (rc || length != sizeof(struct jffs2_unknown_node)) {
-			JFFS2_ERROR("jffs2_flash_read()=%d, req=%u, read=%u at %#08x\n",
+			JFFS2_ERROR("jffs2_flash_read()=%d, req=%zu, read=%zu at %#08x\n",
 				    rc, sizeof(struct jffs2_unknown_node),
 				    length, ref_offset(ref->node));
 		}
 		rc = jffs2_flash_write(c, ref_offset(ref->node), sizeof(rr),
 				       &length, (char *)&rr);
 		if (rc || length != sizeof(struct jffs2_raw_xref)) {
-			JFFS2_ERROR("jffs2_flash_write()=%d, req=%u, wrote=%u at %#08x\n",
+			JFFS2_ERROR("jffs2_flash_write()=%d, req=%zu, wrote=%zu at %#08x\n",
 				    rc, sizeof(rr), length, ref_offset(ref->node));
 		}
 	}
@@ -549,7 +549,7 @@ static int save_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
 	/* must be called under down_write(xattr_sem) */
 	struct jffs2_raw_node_ref *raw;
 	struct jffs2_raw_xref rr;
-	uint32_t length;
+	size_t length;
 	uint32_t phys_ofs = write_ofs(c);
 	int ret;
 
@@ -564,7 +564,7 @@ static int save_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
 
 	ret = jffs2_flash_write(c, phys_ofs, sizeof(rr), &length, (char *)&rr);
 	if (ret || sizeof(rr) != length) {
-		JFFS2_WARNING("jffs2_flash_write() returned %d, request=%u, retlen=%u, at %#08x\n",
+		JFFS2_WARNING("jffs2_flash_write() returned %d, request=%zu, retlen=%zu, at %#08x\n",
 			      ret, sizeof(rr), length, phys_ofs);
 		ret = ret ? ret : -EIO;
 		if (length)
-- 
cgit v1.2.3


From 9bfeb691e75b21fdaa80ffae719083200b190381 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Fri, 26 May 2006 21:19:05 +0100
Subject: [JFFS2] Switch to using an array of jffs2_raw_node_refs instead of a
 list.

This allows us to drop another pointer from the struct jffs2_raw_node_ref,
shrinking it to 8 bytes on 32-bit machines (if the TEST_TOTLEN) paranoia
check is turned off, which will be committed soon).

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/erase.c       |  21 ++--
 fs/jffs2/jffs2_fs_sb.h |   3 -
 fs/jffs2/malloc.c      |  75 ++++++++-----
 fs/jffs2/nodelist.c    |  93 ++++++++--------
 fs/jffs2/nodelist.h    |  31 +++++-
 fs/jffs2/nodemgmt.c    |  51 +--------
 fs/jffs2/os-linux.h    |   1 +
 fs/jffs2/summary.c     |  14 ++-
 fs/jffs2/wbuf.c        | 287 ++++++++++++++++++++++++++++++++++---------------
 9 files changed, 351 insertions(+), 225 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index c8386b25683..1862e8bc101 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -285,20 +285,25 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c,
 
 void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
 {
-	struct jffs2_raw_node_ref *ref;
+	struct jffs2_raw_node_ref *block, *ref;
 	D1(printk(KERN_DEBUG "Freeing all node refs for eraseblock offset 0x%08x\n", jeb->offset));
-	while(jeb->first_node) {
-		ref = jeb->first_node;
-		jeb->first_node = ref->next_phys;
 
-		/* Remove from the inode-list */
-		if (ref->next_in_ino)
+	block = ref = jeb->first_node;
+
+	while (ref) {
+		if (ref->flash_offset == REF_LINK_NODE) {
+			ref = ref->next_in_ino;
+			jffs2_free_refblock(block);
+			block = ref;
+			continue;
+		}
+		if (ref->flash_offset != REF_EMPTY_NODE && ref->next_in_ino)
 			jffs2_remove_node_refs_from_ino_list(c, ref, jeb);
 		/* else it was a non-inode node or already removed, so don't bother */
 
-		__jffs2_free_raw_node_ref(ref);
+		ref++;
 	}
-	jeb->last_node = NULL;
+	jeb->first_node = jeb->last_node = NULL;
 }
 
 static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t *bad_offset)
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 67529f0a44d..272fbea5519 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -26,9 +26,6 @@ struct jffs2_inodirty;
 struct jffs2_sb_info {
 	struct mtd_info *mtd;
 
-	struct jffs2_raw_node_ref *refs;
-	int reserved_refs;
-
 	uint32_t highest_ino;
 	uint32_t checked_ino;
 
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index 171483ef0e4..4889d0700c0 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -57,8 +57,8 @@ int __init jffs2_create_slab_caches(void)
 	if (!tmp_dnode_info_slab)
 		goto err;
 
-	raw_node_ref_slab = kmem_cache_create("jffs2_raw_node_ref",
-					      sizeof(struct jffs2_raw_node_ref),
+	raw_node_ref_slab = kmem_cache_create("jffs2_refblock",
+					      sizeof(struct jffs2_raw_node_ref) * (REFS_PER_BLOCK + 1),
 					      0, 0, NULL, NULL);
 	if (!raw_node_ref_slab)
 		goto err;
@@ -190,38 +190,65 @@ void jffs2_free_tmp_dnode_info(struct jffs2_tmp_dnode_info *x)
 	kmem_cache_free(tmp_dnode_info_slab, x);
 }
 
+struct jffs2_raw_node_ref *jffs2_alloc_refblock(void)
+{
+	struct jffs2_raw_node_ref *ret;
+
+	ret = kmem_cache_alloc(raw_node_ref_slab, GFP_KERNEL);
+	if (ret) {
+		int i = 0;
+		for (i=0; i < REFS_PER_BLOCK; i++) {
+			ret[i].flash_offset = REF_EMPTY_NODE;
+			ret[i].next_in_ino = NULL;
+		}
+		ret[i].flash_offset = REF_LINK_NODE;
+		ret[i].next_in_ino = NULL;
+	}
+	return ret;
+}
+
 int jffs2_prealloc_raw_node_refs(struct jffs2_sb_info *c,
 				 struct jffs2_eraseblock *jeb, int nr)
 {
-	struct jffs2_raw_node_ref *p = c->refs;
+	struct jffs2_raw_node_ref **p, *ref;
+	int i = nr;
 
 	dbg_memalloc("%d\n", nr);
 
-	while (nr && p) {
-		p = p->next_in_ino;
-		nr--;
-	}
-	while (nr) {
-		p = __jffs2_alloc_raw_node_ref();
-		if (!p)
-			return -ENOMEM;
-		p->next_in_ino = c->refs;
-		c->refs = p;
-		nr--;
+	p = &jeb->last_node;
+	ref = *p;
+
+	dbg_memalloc("Reserving %d refs for block @0x%08x\n", nr, jeb->offset);
+
+	/* If jeb->last_node is really a valid node then skip over it */
+	if (ref && ref->flash_offset != REF_EMPTY_NODE)
+		ref++;
+
+	while (i) {
+		if (!ref) {
+			dbg_memalloc("Allocating new refblock linked from %p\n", p);
+			ref = *p = jffs2_alloc_refblock();
+			if (!ref)
+				return -ENOMEM;
+		}
+		if (ref->flash_offset == REF_LINK_NODE) {
+			p = &ref->next_in_ino;
+			ref = *p;
+			continue;
+		}
+		i--;
+		ref++;
 	}
-	c->reserved_refs = nr;
-	return 0;
-}
+	jeb->allocated_refs = nr;
 
-struct jffs2_raw_node_ref *__jffs2_alloc_raw_node_ref(void)
-{
-	struct jffs2_raw_node_ref *ret;
-	ret = kmem_cache_alloc(raw_node_ref_slab, GFP_KERNEL);
-	dbg_memalloc("%p\n", ret);
-	return ret;
+	dbg_memalloc("Reserved %d refs for block @0x%08x, last_node is %p (%08x,%p)\n",
+		  nr, jeb->offset, jeb->last_node, jeb->last_node->flash_offset,
+		  jeb->last_node->next_in_ino);
+
+	return 0;
 }
 
-void __jffs2_free_raw_node_ref(struct jffs2_raw_node_ref *x)
+void jffs2_free_refblock(struct jffs2_raw_node_ref *x)
 {
 	dbg_memalloc("%p\n", x);
 	kmem_cache_free(raw_node_ref_slab, x);
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 0e82979c741..5d36e9b4d7c 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -954,18 +954,16 @@ void jffs2_free_raw_node_refs(struct jffs2_sb_info *c)
 	for (i=0; i<c->nr_blocks; i++) {
 		this = c->blocks[i].first_node;
 		while (this) {
-			next = this->next_phys;
-			__jffs2_free_raw_node_ref(this);
+			if (this[REFS_PER_BLOCK].flash_offset == REF_LINK_NODE)
+				next = this[REFS_PER_BLOCK].next_in_ino;
+			else
+				next = NULL;
+
+			jffs2_free_refblock(this);
 			this = next;
 		}
 		c->blocks[i].first_node = c->blocks[i].last_node = NULL;
 	}
-	this = c->refs;
-	while (this) {
-		next = this->next_in_ino;
-		__jffs2_free_raw_node_ref(this);
-		this = next;
-	}
 }
 
 struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_t offset)
@@ -1060,32 +1058,37 @@ struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c,
 {
 	struct jffs2_raw_node_ref *ref;
 
-	/* These will be preallocated _very_ shortly. */
-	ref = c->refs;
-	if (!c->refs) {
-		JFFS2_WARNING("Using non-preallocated refs!\n");
-		ref = __jffs2_alloc_raw_node_ref();
-		BUG_ON(!ref);
-		WARN_ON(1);
-	} else {
-		c->refs = ref->next_in_ino;
+	BUG_ON(!jeb->allocated_refs);
+	jeb->allocated_refs--;
+
+	ref = jeb->last_node;
+
+	dbg_noderef("Last node at %p is (%08x,%p)\n", ref, ref->flash_offset,
+		    ref->next_in_ino);
+
+	while (ref->flash_offset != REF_EMPTY_NODE) {
+		if (ref->flash_offset == REF_LINK_NODE)
+			ref = ref->next_in_ino;
+		else
+			ref++;
 	}
 
-	ref->next_phys = NULL;
+	dbg_noderef("New ref is %p (%08x becomes %08x,%p) len 0x%x\n", ref, 
+		    ref->flash_offset, ofs, ref->next_in_ino, len);
+
 	ref->flash_offset = ofs;
 
-	if (!jeb->first_node)
+	if (!jeb->first_node) {
 		jeb->first_node = ref;
-	if (jeb->last_node) {
-		jeb->last_node->next_phys = ref;
-#ifdef TEST_TOTLEN
-		if (ref_offset(jeb->last_node) + jeb->last_node->__totlen != ref_offset(ref)) {
-			printk(KERN_CRIT "Adding new ref %p at (0x%08x-0x%08x) not immediately after previous (0x%08x-0x%08x)\n",
-			       ref, ref_offset(ref), ref_offset(ref)+ref->__totlen,
-			       ref_offset(jeb->last_node), ref_offset(jeb->last_node)+jeb->last_node->__totlen);
-			WARN_ON(1);
-		}
-#endif
+		BUG_ON(ref_offset(ref) != jeb->offset);
+	} else if (unlikely(ref_offset(ref) != jeb->offset + c->sector_size - jeb->free_size)) {
+		uint32_t last_len = ref_totlen(c, jeb, jeb->last_node);
+
+		JFFS2_ERROR("Adding new ref %p at (0x%08x-0x%08x) not immediately after previous (0x%08x-0x%08x)\n",
+			    ref, ref_offset(ref), ref_offset(ref)+len,
+			    ref_offset(jeb->last_node), 
+			    ref_offset(jeb->last_node)+last_len);
+		BUG();
 	}
 	jeb->last_node = ref;
 
@@ -1130,12 +1133,13 @@ int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 {
 	if (!size)
 		return 0;
-	if (size > c->sector_size - jeb->used_size) {
-		printk(KERN_CRIT "Dirty space 0x%x larger then used_size 0x%x (wasted 0x%x)\n",
-		       size, jeb->used_size, jeb->wasted_size);
+	if (unlikely(size > jeb->free_size)) {
+		printk(KERN_CRIT "Dirty space 0x%x larger then free_size 0x%x (wasted 0x%x)\n",
+		       size, jeb->free_size, jeb->wasted_size);
 		BUG();
 	}
-	if (jeb->last_node && ref_obsolete(jeb->last_node)) {
+	/* REF_EMPTY_NODE is !obsolete, so that works OK */
+	if (ref_obsolete(jeb->last_node)) {
 #ifdef TEST_TOTLEN
 		jeb->last_node->__totlen += size;
 #endif
@@ -1168,7 +1172,7 @@ static inline uint32_t __ref_totlen(struct jffs2_sb_info *c,
 			jeb = &c->blocks[ref->flash_offset / c->sector_size];
 
 		/* Last node in block. Use free_space */
-		if (ref != jeb->last_node) {
+		if (unlikely(ref != jeb->last_node)) {
 			printk(KERN_CRIT "ref %p @0x%08x is not jeb->last_node (%p @0x%08x)\n",
 			       ref, ref_offset(ref), jeb->last_node, jeb->last_node?ref_offset(jeb->last_node):0);
 			BUG();
@@ -1183,17 +1187,13 @@ uint32_t __jffs2_ref_totlen(struct jffs2_sb_info *c, struct jffs2_eraseblock *je
 {
 	uint32_t ret;
 
-#if CONFIG_JFFS2_FS_DEBUG > 0
-	if (jeb && jeb != &c->blocks[ref->flash_offset / c->sector_size]) {
-		printk(KERN_CRIT "ref_totlen called with wrong block -- at 0x%08x instead of 0x%08x; ref 0x%08x\n",
-		       jeb->offset, c->blocks[ref->flash_offset / c->sector_size].offset, ref_offset(ref));
-		BUG();
-	}
-#endif
-
 	ret = __ref_totlen(c, jeb, ref);
+
 #ifdef TEST_TOTLEN
-	if (ret != ref->__totlen) {
+	if (unlikely(ret != ref->__totlen)) {
+		if (!jeb)
+			jeb = &c->blocks[ref->flash_offset / c->sector_size];
+
 		printk(KERN_CRIT "Totlen for ref at %p (0x%08x-0x%08x) miscalculated as 0x%x instead of %x\n",
 		       ref, ref_offset(ref), ref_offset(ref)+ref->__totlen,
 		       ret, ref->__totlen);
@@ -1204,13 +1204,14 @@ uint32_t __jffs2_ref_totlen(struct jffs2_sb_info *c, struct jffs2_eraseblock *je
 			printk(KERN_CRIT "No next ref. jeb->last_node is %p\n", jeb->last_node);
 
 		printk(KERN_CRIT "jeb->wasted_size %x, dirty_size %x, used_size %x, free_size %x\n", jeb->wasted_size, jeb->dirty_size, jeb->used_size, jeb->free_size);
-		ret = ref->__totlen;
-		if (!jeb)
-			jeb = &c->blocks[ref->flash_offset / c->sector_size];
+
 #if defined(JFFS2_DBG_DUMPS) || defined(JFFS2_DBG_PARANOIA_CHECKS)
 		__jffs2_dbg_dump_node_refs_nolock(c, jeb);
 #endif
+
 		WARN_ON(1);
+
+		ret = ref->__totlen;
 	}
 #endif /* TEST_TOTLEN */
 	return ret;
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 6c92dc46fe9..7ad8ee04388 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -80,7 +80,6 @@ struct jffs2_raw_node_ref
 		for this object. If this _is_ the last, it points to the inode_cache,
 		xattr_ref or xattr_datum instead. The common part of those structures
 		has NULL in the first word. See jffs2_raw_ref_to_ic() below */
-	struct jffs2_raw_node_ref *next_phys;
 	uint32_t flash_offset;
 #define TEST_TOTLEN
 #ifdef TEST_TOTLEN
@@ -88,7 +87,29 @@ struct jffs2_raw_node_ref
 #endif
 };
 
-#define ref_next(r) ((r)->next_phys)
+#define REF_LINK_NODE ((int32_t)-1)
+#define REF_EMPTY_NODE ((int32_t)-2)
+
+/* Use blocks of about 256 bytes */
+#define REFS_PER_BLOCK ((255/sizeof(struct jffs2_raw_node_ref))-1)
+
+static inline struct jffs2_raw_node_ref *ref_next(struct jffs2_raw_node_ref *ref)
+{
+	ref++;
+
+	/* Link to another block of refs */
+	if (ref->flash_offset == REF_LINK_NODE) {
+		ref = ref->next_in_ino;
+		if (!ref)
+			return ref;
+	}
+
+	/* End of chain */
+	if (ref->flash_offset == REF_EMPTY_NODE)
+		return NULL;
+
+	return ref;
+}
 
 static inline struct jffs2_inode_cache *jffs2_raw_ref_to_ic(struct jffs2_raw_node_ref *raw)
 {
@@ -234,6 +255,7 @@ struct jffs2_eraseblock
 	uint32_t wasted_size;
 	uint32_t free_size;	/* Note that sector_size - free_size
 				   is the address of the first free space */
+	uint32_t allocated_refs;
 	struct jffs2_raw_node_ref *first_node;
 	struct jffs2_raw_node_ref *last_node;
 
@@ -378,10 +400,9 @@ struct jffs2_raw_inode *jffs2_alloc_raw_inode(void);
 void jffs2_free_raw_inode(struct jffs2_raw_inode *);
 struct jffs2_tmp_dnode_info *jffs2_alloc_tmp_dnode_info(void);
 void jffs2_free_tmp_dnode_info(struct jffs2_tmp_dnode_info *);
-int jffs2_prealloc_raw_node_refs(struct jffs2_sb_info *c, 
+int jffs2_prealloc_raw_node_refs(struct jffs2_sb_info *c,
 				 struct jffs2_eraseblock *jeb, int nr);
-struct jffs2_raw_node_ref *__jffs2_alloc_raw_node_ref(void);
-void __jffs2_free_raw_node_ref(struct jffs2_raw_node_ref *);
+void jffs2_free_refblock(struct jffs2_raw_node_ref *);
 struct jffs2_node_frag *jffs2_alloc_node_frag(void);
 void jffs2_free_node_frag(struct jffs2_node_frag *);
 struct jffs2_inode_cache *jffs2_alloc_inode_cache(void);
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 73a06d01db4..71d1630609a 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -458,14 +458,13 @@ static inline int on_list(struct list_head *obj, struct list_head *head)
 void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref)
 {
 	struct jffs2_eraseblock *jeb;
-	struct jffs2_raw_node_ref *next_ref;
 	int blocknr;
 	struct jffs2_unknown_node n;
 	int ret, addedsize;
 	size_t retlen;
 	uint32_t freed_len;
 
-	if(!ref) {
+	if(unlikely(!ref)) {
 		printk(KERN_NOTICE "EEEEEK. jffs2_mark_node_obsolete called with NULL node\n");
 		return;
 	}
@@ -683,54 +682,6 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 		spin_unlock(&c->erase_completion_lock);
 	}
 
-
-	/* Merge with the next node in the physical list, if there is one
-	   and if it's also obsolete and if it doesn't belong to any inode */
-	next_ref = ref_next(ref);
-
-	if (next_ref && ref_obsolete(next_ref) && !next_ref->next_in_ino) {
-		spin_lock(&c->erase_completion_lock);
-
-#ifdef TEST_TOTLEN
-		ref->__totlen += next_ref->__totlen;
-#endif
-		ref->next_phys = ref_next(next_ref);
-                if (jeb->last_node == next_ref) jeb->last_node = ref;
-		if (jeb->gc_node == next_ref) {
-			/* gc will be happy continuing gc on this node */
-			jeb->gc_node=ref;
-		}
-		spin_unlock(&c->erase_completion_lock);
-
-		__jffs2_free_raw_node_ref(next_ref);
-	}
-
-	/* Also merge with the previous node in the list, if there is one
-	   and that one is obsolete */
-	if (ref != jeb->first_node ) {
-		struct jffs2_raw_node_ref *p = jeb->first_node;
-
-		spin_lock(&c->erase_completion_lock);
-
-		while ((next_ref = ref_next(ref)) != ref)
-			p = next_ref;
-
-		if (ref_obsolete(p) && !ref->next_in_ino) {
-#ifdef TEST_TOTLEN
-			p->__totlen += ref->__totlen;
-#endif
-			if (jeb->last_node == ref) {
-				jeb->last_node = p;
-			}
-			if (jeb->gc_node == ref) {
-				/* gc will be happy continuing gc on this node */
-				jeb->gc_node=p;
-			}
-			p->next_phys = ref_next(ref);
-			__jffs2_free_raw_node_ref(ref);
-		}
-		spin_unlock(&c->erase_completion_lock);
-	}
  out_erase_sem:
 	up(&c->erase_free_sem);
 }
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 743c9e52152..cd4021bcb94 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -95,6 +95,7 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
 #define jffs2_dataflash(c) (0)
 #define jffs2_dataflash_setup(c) (0)
 #define jffs2_dataflash_cleanup(c) do {} while (0)
+#define jffs2_nor_wbuf_flash(c) (0)
 #define jffs2_nor_wbuf_flash_setup(c) (0)
 #define jffs2_nor_wbuf_flash_cleanup(c) do {} while (0)
 
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 523a8f330ef..00e856e4fdb 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -511,7 +511,8 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 				spr = (struct jffs2_sum_xref_flash *)sp;
 				dbg_summary("xref at %#08x-%#08x\n",
 					    jeb->offset + je32_to_cpu(spr->offset),
-					    jeb->offset + je32_to_cpu(spr->offset) + PAD(sizeof(struct jffs2_raw_xref)));
+					    jeb->offset + je32_to_cpu(spr->offset) + 
+					    (uint32_t)PAD(sizeof(struct jffs2_raw_xref)));
 
 				ref = jffs2_alloc_xattr_ref();
 				if (!ref) {
@@ -787,10 +788,12 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
 		JFFS2_WARNING("Write of %u bytes at 0x%08x failed. returned %d, retlen %zd\n",
 			      infosize, sum_ofs, ret, retlen);
 
-		/* Waste remaining space */
-		spin_lock(&c->erase_completion_lock);
-		jffs2_link_node_ref(c, jeb, sum_ofs | REF_OBSOLETE, infosize, NULL);
-		spin_unlock(&c->erase_completion_lock);
+		if (retlen) {
+			/* Waste remaining space */
+			spin_lock(&c->erase_completion_lock);
+			jffs2_link_node_ref(c, jeb, sum_ofs | REF_OBSOLETE, infosize, NULL);
+			spin_unlock(&c->erase_completion_lock);
+		}
 
 		c->summary->sum_size = JFFS2_SUMMARY_NOSUM_SIZE;
 
@@ -836,6 +839,7 @@ int jffs2_sum_write_sumnode(struct jffs2_sb_info *c)
 		jffs2_sum_disable_collecting(c->summary);
 
 		JFFS2_WARNING("Not enough space for summary, padsize = %d\n", padsize);
+		spin_lock(&c->erase_completion_lock);
 		return 0;
 	}
 
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index e16e45ea047..2febece8906 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -156,72 +156,126 @@ static void jffs2_block_refile(struct jffs2_sb_info *c, struct jffs2_eraseblock
 		jffs2_erase_pending_trigger(c);
 	}
 
-	/* Adjust its size counts accordingly */
-	c->wasted_size += jeb->free_size;
-	c->free_size -= jeb->free_size;
-	jeb->wasted_size += jeb->free_size;
-	jeb->free_size = 0;
+	if (!jffs2_prealloc_raw_node_refs(c, jeb, 1)) {
+		uint32_t oldfree = jeb->free_size;
+
+		jffs2_link_node_ref(c, jeb, 
+				    (jeb->offset+c->sector_size-oldfree) | REF_OBSOLETE,
+				    oldfree, NULL);
+		/* convert to wasted */
+		c->wasted_size += oldfree;
+		jeb->wasted_size += oldfree;
+		c->dirty_size -= oldfree;
+		jeb->dirty_size -= oldfree;
+	}
 
 	jffs2_dbg_dump_block_lists_nolock(c);
 	jffs2_dbg_acct_sanity_check_nolock(c,jeb);
 	jffs2_dbg_acct_paranoia_check_nolock(c, jeb);
 }
 
+static struct jffs2_raw_node_ref **jffs2_incore_replace_raw(struct jffs2_sb_info *c,
+							    struct jffs2_inode_info *f,
+							    struct jffs2_raw_node_ref *raw,
+							    union jffs2_node_union *node)
+{
+	struct jffs2_node_frag *frag;
+	struct jffs2_full_dirent *fd;
+
+	dbg_noderef("incore_replace_raw: node at %p is {%04x,%04x}\n",
+		    node, je16_to_cpu(node->u.magic), je16_to_cpu(node->u.nodetype));
+
+	BUG_ON(je16_to_cpu(node->u.magic) != 0x1985 &&
+	       je16_to_cpu(node->u.magic) != 0);
+
+	switch (je16_to_cpu(node->u.nodetype)) {
+	case JFFS2_NODETYPE_INODE:
+		frag = jffs2_lookup_node_frag(&f->fragtree, je32_to_cpu(node->i.offset));
+		BUG_ON(!frag);
+		/* Find a frag which refers to the full_dnode we want to modify */
+		while (!frag->node || frag->node->raw != raw) {
+			frag = frag_next(frag);
+			BUG_ON(!frag);
+		}
+		dbg_noderef("Will replace ->raw in full_dnode at %p\n", frag->node);
+		return &frag->node->raw;
+		break;
+
+	case JFFS2_NODETYPE_DIRENT:
+		for (fd = f->dents; fd; fd = fd->next) {
+			if (fd->raw == raw) {
+				dbg_noderef("Will replace ->raw in full_dirent at %p\n", fd);
+				return &fd->raw;
+			}
+		}
+		BUG();
+	default:
+		dbg_noderef("Don't care about replacing raw for nodetype %x\n",
+			    je16_to_cpu(node->u.nodetype));
+		break;
+	}
+	return NULL;
+}
+
 /* Recover from failure to write wbuf. Recover the nodes up to the
  * wbuf, not the one which we were starting to try to write. */
 
 static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
 {
 	struct jffs2_eraseblock *jeb, *new_jeb;
-	struct jffs2_raw_node_ref **first_raw, **raw;
+	struct jffs2_raw_node_ref *raw, *next, *first_raw = NULL;
 	size_t retlen;
 	int ret;
+	int nr_refile = 0;
 	unsigned char *buf;
 	uint32_t start, end, ofs, len;
 
 	jeb = &c->blocks[c->wbuf_ofs / c->sector_size];
 
-	if (jffs2_prealloc_raw_node_refs(c, jeb, c->reserved_refs + 1))
-		return;
-
 	spin_lock(&c->erase_completion_lock);
-
 	jffs2_block_refile(c, jeb, REFILE_NOTEMPTY);
+	spin_unlock(&c->erase_completion_lock);
+
+	BUG_ON(!ref_obsolete(jeb->last_node));
 
 	/* Find the first node to be recovered, by skipping over every
 	   node which ends before the wbuf starts, or which is obsolete. */
-	first_raw = &jeb->first_node;
-	while (*first_raw &&
-	       (ref_obsolete(*first_raw) ||
-		(ref_offset(*first_raw)+ref_totlen(c, jeb, *first_raw)) < c->wbuf_ofs)) {
-		D1(printk(KERN_DEBUG "Skipping node at 0x%08x(%d)-0x%08x which is either before 0x%08x or obsolete\n",
-			  ref_offset(*first_raw), ref_flags(*first_raw),
-			  (ref_offset(*first_raw) + ref_totlen(c, jeb, *first_raw)),
-			  c->wbuf_ofs));
-		first_raw = &(*first_raw)->next_phys;
+	for (next = raw = jeb->first_node; next; raw = next) {
+		next = ref_next(raw);
+
+		if (ref_obsolete(raw) || 
+		    (next && ref_offset(next) <= c->wbuf_ofs)) {
+			dbg_noderef("Skipping node at 0x%08x(%d)-0x%08x which is either before 0x%08x or obsolete\n",
+				    ref_offset(raw), ref_flags(raw),
+				    (ref_offset(raw) + ref_totlen(c, jeb, raw)),
+				    c->wbuf_ofs);
+			continue;
+		}
+		dbg_noderef("First node to be recovered is at 0x%08x(%d)-0x%08x\n",
+			    ref_offset(raw), ref_flags(raw),
+			    (ref_offset(raw) + ref_totlen(c, jeb, raw)));
+
+		first_raw = raw;
+		break;
 	}
 
-	if (!*first_raw) {
+	if (!first_raw) {
 		/* All nodes were obsolete. Nothing to recover. */
 		D1(printk(KERN_DEBUG "No non-obsolete nodes to be recovered. Just filing block bad\n"));
-		spin_unlock(&c->erase_completion_lock);
+		c->wbuf_len = 0;
 		return;
 	}
 
-	start = ref_offset(*first_raw);
-	end = ref_offset(*first_raw) + ref_totlen(c, jeb, *first_raw);
+	start = ref_offset(first_raw);
+	end = ref_offset(jeb->last_node);
+	nr_refile = 1;
 
-	/* Find the last node to be recovered */
-	raw = first_raw;
-	while ((*raw)) {
-		if (!ref_obsolete(*raw))
-			end = ref_offset(*raw) + ref_totlen(c, jeb, *raw);
+	/* Count the number of refs which need to be copied */
+	while ((raw = ref_next(raw)) != jeb->last_node)
+		nr_refile++;
 
-		raw = &(*raw)->next_phys;
-	}
-	spin_unlock(&c->erase_completion_lock);
-
-	D1(printk(KERN_DEBUG "wbuf recover %08x-%08x\n", start, end));
+	dbg_noderef("wbuf recover %08x-%08x (%d bytes in %d nodes)\n",
+		    start, end, end - start, nr_refile);
 
 	buf = NULL;
 	if (start < c->wbuf_ofs) {
@@ -248,13 +302,24 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
 			kfree(buf);
 			buf = NULL;
 		read_failed:
-			first_raw = &(*first_raw)->next_phys;
+			first_raw = ref_next(first_raw);
+			nr_refile--;
+			while (first_raw && ref_obsolete(first_raw)) {
+				first_raw = ref_next(first_raw);
+				nr_refile--;
+			}
+
 			/* If this was the only node to be recovered, give up */
-			if (!(*first_raw))
+			if (!first_raw) {
+				c->wbuf_len = 0;
 				return;
+			}
 
 			/* It wasn't. Go on and try to recover nodes complete in the wbuf */
-			start = ref_offset(*first_raw);
+			start = ref_offset(first_raw);
+			dbg_noderef("wbuf now recover %08x-%08x (%d bytes in %d nodes)\n",
+				    start, end, end - start, nr_refile);
+
 		} else {
 			/* Read succeeded. Copy the remaining data from the wbuf */
 			memcpy(buf + (c->wbuf_ofs - start), c->wbuf, end - c->wbuf_ofs);
@@ -263,7 +328,6 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
 	/* OK... we're to rewrite (end-start) bytes of data from first_raw onwards.
 	   Either 'buf' contains the data, or we find it in the wbuf */
 
-
 	/* ... and get an allocation of space from a shiny new block instead */
 	ret = jffs2_reserve_space_gc(c, end-start, &len, JFFS2_SUMMARY_NOSUM_SIZE);
 	if (ret) {
@@ -271,6 +335,14 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
 		kfree(buf);
 		return;
 	}
+
+	ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, nr_refile);
+	if (ret) {
+		printk(KERN_WARNING "Failed to allocate node refs for wbuf recovery. Data loss ensues.\n");
+		kfree(buf);
+		return;
+	}
+
 	ofs = write_ofs(c);
 
 	if (end-start >= c->wbuf_pagesize) {
@@ -304,7 +376,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
 			kfree(buf);
 
 			if (retlen)
-				jffs2_add_physical_node_ref(c, ofs | REF_OBSOLETE, ref_totlen(c, jeb, *first_raw), NULL);
+				jffs2_add_physical_node_ref(c, ofs | REF_OBSOLETE, ref_totlen(c, jeb, first_raw), NULL);
 
 			return;
 		}
@@ -314,12 +386,10 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
 		c->wbuf_ofs = ofs + towrite;
 		memmove(c->wbuf, rewrite_buf + towrite, c->wbuf_len);
 		/* Don't muck about with c->wbuf_inodes. False positives are harmless. */
-		kfree(buf);
 	} else {
 		/* OK, now we're left with the dregs in whichever buffer we're using */
 		if (buf) {
 			memcpy(c->wbuf, buf, end-start);
-			kfree(buf);
 		} else {
 			memmove(c->wbuf, c->wbuf + (start - c->wbuf_ofs), end - start);
 		}
@@ -331,62 +401,111 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
 	new_jeb = &c->blocks[ofs / c->sector_size];
 
 	spin_lock(&c->erase_completion_lock);
-	if (new_jeb->first_node) {
-		/* Odd, but possible with ST flash later maybe */
-		new_jeb->last_node->next_phys = *first_raw;
-	} else {
-		new_jeb->first_node = *first_raw;
-	}
-
-	raw = first_raw;
-	while (*raw) {
-		uint32_t rawlen = ref_totlen(c, jeb, *raw);
+	for (raw = first_raw; raw != jeb->last_node; raw = ref_next(raw)) {
+		uint32_t rawlen = ref_totlen(c, jeb, raw);
+		struct jffs2_inode_cache *ic;
+		struct jffs2_raw_node_ref *new_ref;
+		struct jffs2_raw_node_ref **adjust_ref = NULL;
+		struct jffs2_inode_info *f = NULL;
 
 		D1(printk(KERN_DEBUG "Refiling block of %08x at %08x(%d) to %08x\n",
-			  rawlen, ref_offset(*raw), ref_flags(*raw), ofs));
+			  rawlen, ref_offset(raw), ref_flags(raw), ofs));
+
+		ic = jffs2_raw_ref_to_ic(raw);
+
+		/* Ick. This XATTR mess should be fixed shortly... */
+		if (ic && ic->class == RAWNODE_CLASS_XATTR_DATUM) {
+			struct jffs2_xattr_datum *xd = (void *)ic;
+			BUG_ON(xd->node != raw);
+			adjust_ref = &xd->node;
+			raw->next_in_ino = NULL;
+			ic = NULL;
+		} else if (ic && ic->class == RAWNODE_CLASS_XATTR_REF) {
+			struct jffs2_xattr_datum *xr = (void *)ic;
+			BUG_ON(xr->node != raw);
+			adjust_ref = &xr->node;
+			raw->next_in_ino = NULL;
+			ic = NULL;
+		} else if (ic && ic->class == RAWNODE_CLASS_INODE_CACHE) {
+			struct jffs2_raw_node_ref **p = &ic->nodes;
+
+			/* Remove the old node from the per-inode list */
+			while (*p && *p != (void *)ic) {
+				if (*p == raw) {
+					(*p) = (raw->next_in_ino);
+					raw->next_in_ino = NULL;
+					break;
+				}
+				p = &((*p)->next_in_ino);
+			}
 
-		if (ref_obsolete(*raw)) {
-			/* Shouldn't really happen much */
-			new_jeb->dirty_size += rawlen;
-			new_jeb->free_size -= rawlen;
-			c->dirty_size += rawlen;
-		} else {
-			new_jeb->used_size += rawlen;
-			new_jeb->free_size -= rawlen;
+			if (ic->state == INO_STATE_PRESENT && !ref_obsolete(raw)) {
+				/* If it's an in-core inode, then we have to adjust any
+				   full_dirent or full_dnode structure to point to the
+				   new version instead of the old */
+				f = jffs2_gc_fetch_inode(c, ic->ino, ic->nlink);
+				if (IS_ERR(f)) {
+					/* Should never happen; it _must_ be present */
+					JFFS2_ERROR("Failed to iget() ino #%u, err %ld\n",
+						    ic->ino, PTR_ERR(f));
+					BUG();
+				}
+				/* We don't lock f->sem. There's a number of ways we could
+				   end up in here with it already being locked, and nobody's
+				   going to modify it on us anyway because we hold the
+				   alloc_sem. We're only changing one ->raw pointer too,
+				   which we can get away with without upsetting readers. */
+				adjust_ref = jffs2_incore_replace_raw(c, f, raw,
+								      (void *)(buf?:c->wbuf) + (ref_offset(raw) - start));
+			} else if (unlikely(ic->state != INO_STATE_PRESENT &&
+					    ic->state != INO_STATE_CHECKEDABSENT &&
+					    ic->state != INO_STATE_GC)) {
+				JFFS2_ERROR("Inode #%u is in strange state %d!\n", ic->ino, ic->state);
+				BUG();
+			}
+		}
+
+		new_ref = jffs2_link_node_ref(c, new_jeb, ofs | ref_flags(raw), rawlen, ic);
+
+		if (adjust_ref) {
+			BUG_ON(*adjust_ref != raw);
+			*adjust_ref = new_ref;
+		}
+		if (f)
+			jffs2_gc_release_inode(c, f);
+
+		if (!ref_obsolete(raw)) {
 			jeb->dirty_size += rawlen;
 			jeb->used_size  -= rawlen;
 			c->dirty_size += rawlen;
+			c->used_size -= rawlen;
+			raw->flash_offset = ref_offset(raw) | REF_OBSOLETE;
+			BUG_ON(raw->next_in_ino);
 		}
-		c->free_size -= rawlen;
-		(*raw)->flash_offset = ofs | ref_flags(*raw);
 		ofs += rawlen;
-		new_jeb->last_node = *raw;
-
-		raw = &(*raw)->next_phys;
 	}
 
+	kfree(buf);
+
 	/* Fix up the original jeb now it's on the bad_list */
-	*first_raw = NULL;
-	if (first_raw == &jeb->first_node) {
-		jeb->last_node = NULL;
+	if (first_raw == jeb->first_node) {
 		D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset));
 		list_del(&jeb->list);
 		list_add(&jeb->list, &c->erase_pending_list);
 		c->nr_erasing_blocks++;
 		jffs2_erase_pending_trigger(c);
 	}
-	else
-		jeb->last_node = container_of(first_raw, struct jffs2_raw_node_ref, next_phys);
 
 	jffs2_dbg_acct_sanity_check_nolock(c, jeb);
-        jffs2_dbg_acct_paranoia_check_nolock(c, jeb);
+	jffs2_dbg_acct_paranoia_check_nolock(c, jeb);
 
 	jffs2_dbg_acct_sanity_check_nolock(c, new_jeb);
-        jffs2_dbg_acct_paranoia_check_nolock(c, new_jeb);
+	jffs2_dbg_acct_paranoia_check_nolock(c, new_jeb);
 
 	spin_unlock(&c->erase_completion_lock);
 
-	D1(printk(KERN_DEBUG "wbuf recovery completed OK\n"));
+	D1(printk(KERN_DEBUG "wbuf recovery completed OK. wbuf_ofs 0x%08x, len 0x%x\n", c->wbuf_ofs, c->wbuf_len));
+
 }
 
 /* Meaning of pad argument:
@@ -400,6 +519,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
 
 static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
 {
+	struct jffs2_eraseblock *wbuf_jeb;
 	int ret;
 	size_t retlen;
 
@@ -417,7 +537,8 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
 	if (!c->wbuf_len)	/* already checked c->wbuf above */
 		return 0;
 
-	if (jffs2_prealloc_raw_node_refs(c, c->nextblock, c->reserved_refs + 1))
+	wbuf_jeb = &c->blocks[c->wbuf_ofs / c->sector_size];
+	if (jffs2_prealloc_raw_node_refs(c, wbuf_jeb, c->nextblock->allocated_refs + 1))
 		return -ENOMEM;
 
 	/* claim remaining space on the page
@@ -473,32 +594,29 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
 
 	/* Adjust free size of the block if we padded. */
 	if (pad) {
-		struct jffs2_eraseblock *jeb;
 		uint32_t waste = c->wbuf_pagesize - c->wbuf_len;
 
-		jeb = &c->blocks[c->wbuf_ofs / c->sector_size];
-
 		D1(printk(KERN_DEBUG "jffs2_flush_wbuf() adjusting free_size of %sblock at %08x\n",
-			  (jeb==c->nextblock)?"next":"", jeb->offset));
+			  (wbuf_jeb==c->nextblock)?"next":"", wbuf_jeb->offset));
 
 		/* wbuf_pagesize - wbuf_len is the amount of space that's to be
 		   padded. If there is less free space in the block than that,
 		   something screwed up */
-		if (jeb->free_size < waste) {
+		if (wbuf_jeb->free_size < waste) {
 			printk(KERN_CRIT "jffs2_flush_wbuf(): Accounting error. wbuf at 0x%08x has 0x%03x bytes, 0x%03x left.\n",
 			       c->wbuf_ofs, c->wbuf_len, waste);
 			printk(KERN_CRIT "jffs2_flush_wbuf(): But free_size for block at 0x%08x is only 0x%08x\n",
-			       jeb->offset, jeb->free_size);
+			       wbuf_jeb->offset, wbuf_jeb->free_size);
 			BUG();
 		}
 
 		spin_lock(&c->erase_completion_lock);
 
-		jffs2_link_node_ref(c, jeb, (c->wbuf_ofs + c->wbuf_len) | REF_OBSOLETE, waste, NULL);
+		jffs2_link_node_ref(c, wbuf_jeb, (c->wbuf_ofs + c->wbuf_len) | REF_OBSOLETE, waste, NULL);
 		/* FIXME: that made it count as dirty. Convert to wasted */
-		jeb->dirty_size -= waste;
+		wbuf_jeb->dirty_size -= waste;
 		c->dirty_size -= waste;
-		jeb->wasted_size += waste;
+		wbuf_jeb->wasted_size += waste;
 		c->wasted_size += waste;
 	} else
 		spin_lock(&c->erase_completion_lock);
@@ -758,7 +876,8 @@ outerr:
  *	This is the entry for flash write.
  *	Check, if we work on NAND FLASH, if so build an kvec and write it via vritev
 */
-int jffs2_flash_write(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *retlen, const u_char *buf)
+int jffs2_flash_write(struct jffs2_sb_info *c, loff_t ofs, size_t len,
+		      size_t *retlen, const u_char *buf)
 {
 	struct kvec vecs[1];
 
@@ -953,7 +1072,7 @@ int jffs2_check_nand_cleanmarker (struct jffs2_sb_info *c, struct jffs2_eraseblo
 			}
 			D1(if (retval == 1) {
 				printk(KERN_WARNING "jffs2_check_nand_cleanmarker(): Cleanmarker node not detected in block at %08x\n", jeb->offset);
-				printk(KERN_WARNING "OOB at %08x was ", offset);
+				printk(KERN_WARNING "OOB at %08zx was ", offset);
 				for (i=0; i < oob_size; i++) {
 					printk("%02x ", buf[i]);
 				}
-- 
cgit v1.2.3


From ddc58bd65ebe58c243e9f609384825df9ffd04ad Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sat, 27 May 2006 13:15:16 +0100
Subject: [JFFS2] Fix wbuf recovery of f->metadata->raw node.

A data node might not be in the fraglist; it could be f->metadata.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/wbuf.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 2febece8906..717fa2f52ac 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -190,6 +190,10 @@ static struct jffs2_raw_node_ref **jffs2_incore_replace_raw(struct jffs2_sb_info
 
 	switch (je16_to_cpu(node->u.nodetype)) {
 	case JFFS2_NODETYPE_INODE:
+		if (f->metadata && f->metadata->raw == raw) {
+			dbg_noderef("Will replace ->raw in f->metadata at %p\n", f->metadata);
+			return &f->metadata->raw;
+		}
 		frag = jffs2_lookup_node_frag(&f->fragtree, je32_to_cpu(node->i.offset));
 		BUG_ON(!frag);
 		/* Find a frag which refers to the full_dnode we want to modify */
@@ -199,7 +203,6 @@ static struct jffs2_raw_node_ref **jffs2_incore_replace_raw(struct jffs2_sb_info
 		}
 		dbg_noderef("Will replace ->raw in full_dnode at %p\n", frag->node);
 		return &frag->node->raw;
-		break;
 
 	case JFFS2_NODETYPE_DIRENT:
 		for (fd = f->dents; fd; fd = fd->next) {
@@ -209,6 +212,7 @@ static struct jffs2_raw_node_ref **jffs2_incore_replace_raw(struct jffs2_sb_info
 			}
 		}
 		BUG();
+
 	default:
 		dbg_noderef("Don't care about replacing raw for nodetype %x\n",
 			    je16_to_cpu(node->u.nodetype));
-- 
cgit v1.2.3


From 2ebf09c2491433a499e0ae7723d04e9e810afa84 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sun, 28 May 2006 22:13:25 +0100
Subject: [JFFS2] Fix oops when marking space dirty in scan, but no previous
 node exists.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/nodelist.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 5d36e9b4d7c..927dfe42ba7 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -1139,7 +1139,7 @@ int jffs2_scan_dirty_space(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 		BUG();
 	}
 	/* REF_EMPTY_NODE is !obsolete, so that works OK */
-	if (ref_obsolete(jeb->last_node)) {
+	if (jeb->last_node && ref_obsolete(jeb->last_node)) {
 #ifdef TEST_TOTLEN
 		jeb->last_node->__totlen += size;
 #endif
-- 
cgit v1.2.3


From a6a8bef722875a95bb73e6de7da924a8d417b52c Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Mon, 29 May 2006 00:41:11 +0100
Subject: [JFFS2] Preallocate raw_node_refs in a couple of missing places in
 scan

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/scan.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 3fb0e7e82cf..42c1ff21d35 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -67,8 +67,11 @@ static inline uint32_t EMPTY_SCAN_SIZE(uint32_t sector_size) {
 
 static int file_dirty(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
 {
-	int ret = jffs2_scan_dirty_space(c, jeb, jeb->free_size);
-	if (ret)
+	int ret;
+
+	if ((ret = jffs2_prealloc_raw_node_refs(c, jeb, 1)))
+		return ret;
+	if ((ret = jffs2_scan_dirty_space(c, jeb, jeb->free_size)))
 		return ret;
 	/* Turned wasted size into dirty, since we apparently 
 	   think it's recoverable now. */
@@ -559,6 +562,8 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
 	if (ofs) {
 		D1(printk(KERN_DEBUG "Free space at %08x ends at %08x\n", jeb->offset,
 			  jeb->offset + ofs));
+		if ((err = jffs2_prealloc_raw_node_refs(c, jeb, 1)))
+			return err;
 		if ((err = jffs2_scan_dirty_space(c, jeb, ofs)))
 			return err;
 	}
-- 
cgit v1.2.3


From ff268fb8791cf18df536113355d7184007c269d9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@cruncher.tec.linutronix.de>
Date: Sat, 27 May 2006 20:36:12 +0200
Subject: [MTD] NAND Consolidate oobinfo handling

The info structure for out of band data was copied into
the mtd structure. Make it a pointer and remove the ability
to set it from userspace. The position of ecc bytes is
defined by the hardware and should not be changed by software.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/jffs2/wbuf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 717fa2f52ac..dc275cedfe4 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1151,7 +1151,7 @@ static struct nand_oobinfo jffs2_oobinfo_docecc = {
 
 static int jffs2_nand_set_oobinfo(struct jffs2_sb_info *c)
 {
-	struct nand_oobinfo *oinfo = &c->mtd->oobinfo;
+	struct nand_oobinfo *oinfo = c->mtd->oobinfo;
 
 	/* Do this only, if we have an oob buffer */
 	if (!c->mtd->oobsize)
-- 
cgit v1.2.3


From 5bd34c091a044d130601370c370f84b1c59f1627 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@cruncher.tec.linutronix.de>
Date: Sat, 27 May 2006 22:16:10 +0200
Subject: [MTD] NAND Replace oobinfo by ecclayout

The nand_oobinfo structure is not fitting the newer error correction
demands anymore. Replace it by struct nand_ecclayout and fixup the users
all over the place. Keep the nand_oobinfo based ioctl for user space
compability reasons.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/jffs2/jffs2_fs_sb.h |  2 +-
 fs/jffs2/wbuf.c        | 51 ++++++++++++++++----------------------------------
 2 files changed, 17 insertions(+), 36 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 272fbea5519..506690cc9a7 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -107,7 +107,7 @@ struct jffs2_sb_info {
 	struct rw_semaphore wbuf_sem;	/* Protects the write buffer */
 
 	/* Information about out-of-band area usage... */
-	struct nand_oobinfo *oobinfo;
+	struct nand_ecclayout *ecclayout;
 	uint32_t badblock_pos;
 	uint32_t fsdata_pos;
 	uint32_t fsdata_len;
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index dc275cedfe4..c6a62e16296 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1140,18 +1140,9 @@ int jffs2_write_nand_badblock(struct jffs2_sb_info *c, struct jffs2_eraseblock *
 	return 1;
 }
 
-#define NAND_JFFS2_OOB16_FSDALEN	8
-
-static struct nand_oobinfo jffs2_oobinfo_docecc = {
-	.useecc = MTD_NANDECC_PLACE,
-	.eccbytes = 6,
-	.eccpos = {0,1,2,3,4,5}
-};
-
-
 static int jffs2_nand_set_oobinfo(struct jffs2_sb_info *c)
 {
-	struct nand_oobinfo *oinfo = c->mtd->oobinfo;
+	struct nand_ecclayout *oinfo = c->mtd->ecclayout;
 
 	/* Do this only, if we have an oob buffer */
 	if (!c->mtd->oobsize)
@@ -1161,33 +1152,23 @@ static int jffs2_nand_set_oobinfo(struct jffs2_sb_info *c)
 	c->cleanmarker_size = 0;
 
 	/* Should we use autoplacement ? */
-	if (oinfo && oinfo->useecc == MTD_NANDECC_AUTOPLACE) {
-		D1(printk(KERN_DEBUG "JFFS2 using autoplace on NAND\n"));
-		/* Get the position of the free bytes */
-		if (!oinfo->oobfree[0][1]) {
-			printk (KERN_WARNING "jffs2_nand_set_oobinfo(): Eeep. Autoplacement selected and no empty space in oob\n");
-			return -ENOSPC;
-		}
-		c->fsdata_pos = oinfo->oobfree[0][0];
-		c->fsdata_len = oinfo->oobfree[0][1];
-		if (c->fsdata_len > 8)
-			c->fsdata_len = 8;
-	} else {
-		/* This is just a legacy fallback and should go away soon */
-		switch(c->mtd->ecctype) {
-		case MTD_ECC_RS_DiskOnChip:
-			printk(KERN_WARNING "JFFS2 using DiskOnChip hardware ECC without autoplacement. Fix it!\n");
-			c->oobinfo = &jffs2_oobinfo_docecc;
-			c->fsdata_pos = 6;
-			c->fsdata_len = NAND_JFFS2_OOB16_FSDALEN;
-			c->badblock_pos = 15;
-			break;
+	if (!oinfo) {
+		D1(printk(KERN_DEBUG "JFFS2 on NAND. No autoplacment info found\n"));
+		return -EINVAL;
+	}
 
-		default:
-			D1(printk(KERN_DEBUG "JFFS2 on NAND. No autoplacment info found\n"));
-			return -EINVAL;
-		}
+	D1(printk(KERN_DEBUG "JFFS2 using autoplace on NAND\n"));
+	/* Get the position of the free bytes */
+	if (!oinfo->oobfree[0].length) {
+		printk (KERN_WARNING "jffs2_nand_set_oobinfo(): Eeep."
+			" Autoplacement selected and no empty space in oob\n");
+		return -ENOSPC;
 	}
+	c->fsdata_pos = oinfo->oobfree[0].offset;
+	c->fsdata_len = oinfo->oobfree[0].length;
+	if (c->fsdata_len > 8)
+		c->fsdata_len = 8;
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From f4a43cfcecfcaeeaa40a9dbc1d1378298c22446e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@cruncher.tec.linutronix.de>
Date: Sun, 28 May 2006 11:01:53 +0200
Subject: [MTD] Remove silly MTD_WRITE/READ macros

Most of those macros are unused and the used ones just obfuscate
the code. Remove them and fixup all users.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/jffs/intrep.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs/intrep.c b/fs/jffs/intrep.c
index 0ef207dfaf6..5371a403130 100644
--- a/fs/jffs/intrep.c
+++ b/fs/jffs/intrep.c
@@ -247,7 +247,7 @@ flash_safe_read(struct mtd_info *mtd, loff_t from,
 	D3(printk(KERN_NOTICE "flash_safe_read(%p, %08x, %p, %08x)\n",
 		  mtd, (unsigned int) from, buf, count));
 
-	res = MTD_READ(mtd, from, count, &retlen, buf);
+	res = mtd->read(mtd, from, count, &retlen, buf);
 	if (retlen != count) {
 		panic("Didn't read all bytes in flash_safe_read(). Returned %d\n", res);
 	}
@@ -262,7 +262,7 @@ flash_read_u32(struct mtd_info *mtd, loff_t from)
 	__u32 ret;
 	int res;
 
-	res = MTD_READ(mtd, from, 4, &retlen, (unsigned char *)&ret);
+	res = mtd->read(mtd, from, 4, &retlen, (unsigned char *)&ret);
 	if (retlen != 4) {
 		printk("Didn't read all bytes in flash_read_u32(). Returned %d\n", res);
 		return 0;
@@ -282,7 +282,7 @@ flash_safe_write(struct mtd_info *mtd, loff_t to,
 	D3(printk(KERN_NOTICE "flash_safe_write(%p, %08x, %p, %08x)\n",
 		  mtd, (unsigned int) to, buf, count));
 
-	res = MTD_WRITE(mtd, to, count, &retlen, buf);
+	res = mtd->write(mtd, to, count, &retlen, buf);
 	if (retlen != count) {
 		printk("Didn't write all bytes in flash_safe_write(). Returned %d\n", res);
 	}
@@ -300,9 +300,9 @@ flash_safe_writev(struct mtd_info *mtd, const struct kvec *vecs,
 
 	D3(printk(KERN_NOTICE "flash_safe_writev(%p, %08x, %p)\n",
 		  mtd, (unsigned int) to, vecs));
-	
+
 	if (mtd->writev) {
-		res = MTD_WRITEV(mtd, vecs, iovec_cnt, to, &retlen);
+		res = mtd->writev(mtd, vecs, iovec_cnt, to, &retlen);
 		return res ? res : retlen;
 	}
 	/* Not implemented writev. Repeatedly use write - on the not so
@@ -312,7 +312,8 @@ flash_safe_writev(struct mtd_info *mtd, const struct kvec *vecs,
 	retlen=0;
 
 	for (i=0; !res && i<iovec_cnt; i++) {
-		res = MTD_WRITE(mtd, to, vecs[i].iov_len, &retlen_a, vecs[i].iov_base);
+		res = mtd->write(mtd, to, vecs[i].iov_len, &retlen_a,
+				 vecs[i].iov_base);
 		if (retlen_a != vecs[i].iov_len) {
 			printk("Didn't write all bytes in flash_safe_writev(). Returned %d\n", res);
 			if (i != iovec_cnt-1)
@@ -393,7 +394,7 @@ flash_erase_region(struct mtd_info *mtd, loff_t start,
 	set_current_state(TASK_UNINTERRUPTIBLE);
 	add_wait_queue(&wait_q, &wait);
 
-	if (MTD_ERASE(mtd, erase) < 0) {
+	if (mtd->erase(mtd, erase) < 0) {
 		set_current_state(TASK_RUNNING);
 		remove_wait_queue(&wait_q, &wait);
 		kfree(erase);
-- 
cgit v1.2.3


From 8593fbc68b0df1168995de76d1af38eb62fd6b62 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@cruncher.tec.linutronix.de>
Date: Mon, 29 May 2006 03:26:58 +0200
Subject: [MTD] Rework the out of band handling completely

Hopefully the last iteration on this!

The handling of out of band data on NAND was accompanied by tons of fruitless
discussions and halfarsed patches to make it work for a particular
problem. Sufficiently annoyed by I all those "I know it better" mails and the
resonable amount of discarded "it solves my problem" patches, I finally decided
to go for the big rework. After removing the _ecc variants of mtd read/write
functions the solution to satisfy the various requirements was to refactor the
read/write _oob functions in mtd.

The major change is that read/write_oob now takes a pointer to an operation
descriptor structure "struct mtd_oob_ops".instead of having a function with at
least seven arguments.

read/write_oob which should probably renamed to a more descriptive name, can do
the following tasks:

- read/write out of band data
- read/write data content and out of band data
- read/write raw data content and out of band data (ecc disabled)

struct mtd_oob_ops has a mode field, which determines the oob handling mode.

Aside of the MTD_OOB_RAW mode, which is intended to be especially for
diagnostic purposes and some internal functions e.g. bad block table creation,
the other two modes are for mtd clients:

MTD_OOB_PLACE puts/gets the given oob data exactly to/from the place which is
described by the ooboffs and ooblen fields of the mtd_oob_ops strcuture. It's
up to the caller to make sure that the byte positions are not used by the ECC
placement algorithms.

MTD_OOB_AUTO puts/gets the given oob data automaticaly to/from the places in
the out of band area which are described by the oobfree tuples in the ecclayout
data structre which is associated to the devicee.

The decision whether data plus oob or oob only handling is done depends on the
setting of the datbuf member of the data structure. When datbuf == NULL then
the internal read/write_oob functions are selected, otherwise the read/write
data routines are invoked.

Tested on a few platforms with all variants. Please be aware of possible
regressions for your particular device / application scenario

Disclaimer: Any whining will be ignored from those who just contributed "hot
air blurb" and never sat down to tackle the underlying problem of the mess in
the NAND driver grown over time and the big chunk of work to fix up the
existing users. The problem was not the holiness of the existing MTD
interfaces. The problems was the lack of time to go for the big overhaul. It's
easy to add more mess to the existing one, but it takes alot of effort to go
for a real solution.

Improvements and bugfixes are welcome!

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/jffs2/jffs2_fs_sb.h |   1 +
 fs/jffs2/wbuf.c        | 230 +++++++++++++++++++++++++------------------------
 2 files changed, 119 insertions(+), 112 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 506690cc9a7..935fec1b120 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -100,6 +100,7 @@ struct jffs2_sb_info {
 #ifdef CONFIG_JFFS2_FS_WRITEBUFFER
 	/* Write-behind buffer for NAND flash */
 	unsigned char *wbuf;
+	unsigned char *oobbuf;
 	uint32_t wbuf_ofs;
 	uint32_t wbuf_len;
 	struct jffs2_inodirty *wbuf_inodes;
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index c6a62e16296..1195d06d437 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -955,158 +955,159 @@ exit:
 	return ret;
 }
 
+#define NR_OOB_SCAN_PAGES	4
+
 /*
- *	Check, if the out of band area is empty
+ * Check, if the out of band area is empty
  */
-int jffs2_check_oob_empty( struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, int mode)
+int jffs2_check_oob_empty(struct jffs2_sb_info *c,
+			  struct jffs2_eraseblock *jeb, int mode)
 {
-	unsigned char *buf;
-	int 	ret = 0;
-	int	i,len,page;
-	size_t  retlen;
-	int	oob_size;
-
-	/* allocate a buffer for all oob data in this sector */
-	oob_size = c->mtd->oobsize;
-	len = 4 * oob_size;
-	buf = kmalloc(len, GFP_KERNEL);
-	if (!buf) {
-		printk(KERN_NOTICE "jffs2_check_oob_empty(): allocation of temporary data buffer for oob check failed\n");
-		return -ENOMEM;
-	}
-	/*
-	 * if mode = 0, we scan for a total empty oob area, else we have
-	 * to take care of the cleanmarker in the first page of the block
-	*/
-	ret = jffs2_flash_read_oob(c, jeb->offset, len , &retlen, buf);
+	int i, page, ret;
+	int oobsize = c->mtd->oobsize;
+	struct mtd_oob_ops ops;
+
+	ops.len = NR_OOB_SCAN_PAGES * oobsize;
+	ops.ooblen = oobsize;
+	ops.oobbuf = c->oobbuf;
+	ops.ooboffs = 0;
+	ops.datbuf = NULL;
+	ops.mode = MTD_OOB_PLACE;
+
+	ret = c->mtd->read_oob(c->mtd, jeb->offset, &ops);
 	if (ret) {
-		D1(printk(KERN_WARNING "jffs2_check_oob_empty(): Read OOB failed %d for block at %08x\n", ret, jeb->offset));
-		goto out;
+		D1(printk(KERN_WARNING "jffs2_check_oob_empty(): Read OOB "
+			  "failed %d for block at %08x\n", ret, jeb->offset));
+		return ret;
 	}
 
-	if (retlen < len) {
-		D1(printk(KERN_WARNING "jffs2_check_oob_empty(): Read OOB return short read "
-			  "(%zd bytes not %d) for block at %08x\n", retlen, len, jeb->offset));
-		ret = -EIO;
-		goto out;
+	if (ops.retlen < ops.len) {
+		D1(printk(KERN_WARNING "jffs2_check_oob_empty(): Read OOB "
+			  "returned short read (%zd bytes not %d) for block "
+			  "at %08x\n", ops.retlen, ops.len, jeb->offset));
+		return -EIO;
 	}
 
 	/* Special check for first page */
-	for(i = 0; i < oob_size ; i++) {
+	for(i = 0; i < oobsize ; i++) {
 		/* Yeah, we know about the cleanmarker. */
 		if (mode && i >= c->fsdata_pos &&
 		    i < c->fsdata_pos + c->fsdata_len)
 			continue;
 
-		if (buf[i] != 0xFF) {
-			D2(printk(KERN_DEBUG "Found %02x at %x in OOB for %08x\n",
-				  buf[i], i, jeb->offset));
-			ret = 1;
-			goto out;
+		if (ops.oobbuf[i] != 0xFF) {
+			D2(printk(KERN_DEBUG "Found %02x at %x in OOB for "
+				  "%08x\n", ops.oobbuf[i], i, jeb->offset));
+			return 1;
 		}
 	}
 
 	/* we know, we are aligned :) */
-	for (page = oob_size; page < len; page += sizeof(long)) {
-		unsigned long dat = *(unsigned long *)(&buf[page]);
-		if(dat != -1) {
-			ret = 1;
-			goto out;
-		}
+	for (page = oobsize; page < ops.len; page += sizeof(long)) {
+		long dat = *(long *)(&ops.oobbuf[page]);
+		if(dat != -1)
+			return 1;
 	}
-
-out:
-	kfree(buf);
-
-	return ret;
+	return 0;
 }
 
 /*
-*	Scan for a valid cleanmarker and for bad blocks
-*	For virtual blocks (concatenated physical blocks) check the cleanmarker
-*	only in the first page of the first physical block, but scan for bad blocks in all
-*	physical blocks
-*/
-int jffs2_check_nand_cleanmarker (struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
+ * Scan for a valid cleanmarker and for bad blocks
+ */
+int jffs2_check_nand_cleanmarker (struct jffs2_sb_info *c,
+				  struct jffs2_eraseblock *jeb)
 {
 	struct jffs2_unknown_node n;
-	unsigned char buf[2 * NAND_MAX_OOBSIZE];
-	unsigned char *p;
-	int ret, i, cnt, retval = 0;
-	size_t retlen, offset;
-	int oob_size;
-
-	offset = jeb->offset;
-	oob_size = c->mtd->oobsize;
-
-	/* Loop through the physical blocks */
-	for (cnt = 0; cnt < (c->sector_size / c->mtd->erasesize); cnt++) {
-		/* Check first if the block is bad. */
-		if (c->mtd->block_isbad (c->mtd, offset)) {
-			D1 (printk (KERN_WARNING "jffs2_check_nand_cleanmarker(): Bad block at %08x\n", jeb->offset));
-			return 2;
-		}
-		/*
-		   *    We read oob data from page 0 and 1 of the block.
-		   *    page 0 contains cleanmarker and badblock info
-		   *    page 1 contains failure count of this block
-		 */
-		ret = c->mtd->read_oob (c->mtd, offset, oob_size << 1, &retlen, buf);
+	struct mtd_oob_ops ops;
+	int oobsize = c->mtd->oobsize;
+	unsigned char *p,*b;
+	int i, ret;
+	size_t offset = jeb->offset;
+
+	/* Check first if the block is bad. */
+	if (c->mtd->block_isbad(c->mtd, offset)) {
+		D1 (printk(KERN_WARNING "jffs2_check_nand_cleanmarker()"
+			   ": Bad block at %08x\n", jeb->offset));
+		return 2;
+	}
 
-		if (ret) {
-			D1 (printk (KERN_WARNING "jffs2_check_nand_cleanmarker(): Read OOB failed %d for block at %08x\n", ret, jeb->offset));
-			return ret;
-		}
-		if (retlen < (oob_size << 1)) {
-			D1 (printk (KERN_WARNING "jffs2_check_nand_cleanmarker(): Read OOB return short read (%zd bytes not %d) for block at %08x\n", retlen, oob_size << 1, jeb->offset));
-			return -EIO;
-		}
+	ops.len = oobsize;
+	ops.ooblen = oobsize;
+	ops.oobbuf = c->oobbuf;
+	ops.ooboffs = 0;
+	ops.datbuf = NULL;
+	ops.mode = MTD_OOB_PLACE;
 
-		/* Check cleanmarker only on the first physical block */
-		if (!cnt) {
-			n.magic = cpu_to_je16 (JFFS2_MAGIC_BITMASK);
-			n.nodetype = cpu_to_je16 (JFFS2_NODETYPE_CLEANMARKER);
-			n.totlen = cpu_to_je32 (8);
-			p = (unsigned char *) &n;
+	ret = c->mtd->read_oob(c->mtd, offset, &ops);
+	if (ret) {
+		D1 (printk(KERN_WARNING "jffs2_check_nand_cleanmarker(): "
+			   "Read OOB failed %d for block at %08x\n",
+			   ret, jeb->offset));
+		return ret;
+	}
 
-			for (i = 0; i < c->fsdata_len; i++) {
-				if (buf[c->fsdata_pos + i] != p[i]) {
-					retval = 1;
-				}
-			}
-			D1(if (retval == 1) {
-				printk(KERN_WARNING "jffs2_check_nand_cleanmarker(): Cleanmarker node not detected in block at %08x\n", jeb->offset);
-				printk(KERN_WARNING "OOB at %08zx was ", offset);
-				for (i=0; i < oob_size; i++) {
-					printk("%02x ", buf[i]);
-				}
-				printk("\n");
-			})
-		}
-		offset += c->mtd->erasesize;
+	if (ops.retlen < ops.len) {
+		D1 (printk (KERN_WARNING "jffs2_check_nand_cleanmarker(): "
+			    "Read OOB return short read (%zd bytes not %d) "
+			    "for block at %08x\n", ops.retlen, ops.len,
+			    jeb->offset));
+		return -EIO;
 	}
-	return retval;
+
+	n.magic = cpu_to_je16 (JFFS2_MAGIC_BITMASK);
+	n.nodetype = cpu_to_je16 (JFFS2_NODETYPE_CLEANMARKER);
+	n.totlen = cpu_to_je32 (8);
+	p = (unsigned char *) &n;
+	b = c->oobbuf + c->fsdata_pos;
+
+	for (i = c->fsdata_len; i; i--) {
+		if (*b++ != *p++)
+			ret = 1;
+	}
+
+	D1(if (ret == 1) {
+		printk(KERN_WARNING "jffs2_check_nand_cleanmarker(): "
+		       "Cleanmarker node not detected in block at %08x\n",
+		       offset);
+		printk(KERN_WARNING "OOB at %08zx was ", offset);
+		for (i=0; i < oobsize; i++)
+			printk("%02x ", c->oobbuf[i]);
+		printk("\n");
+	});
+	return ret;
 }
 
-int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
+int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c,
+				 struct jffs2_eraseblock *jeb)
 {
-	struct 	jffs2_unknown_node n;
-	int 	ret;
-	size_t 	retlen;
+	struct jffs2_unknown_node n;
+	int	ret;
+	struct mtd_oob_ops ops;
 
 	n.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
 	n.nodetype = cpu_to_je16(JFFS2_NODETYPE_CLEANMARKER);
 	n.totlen = cpu_to_je32(8);
 
-	ret = jffs2_flash_write_oob(c, jeb->offset + c->fsdata_pos, c->fsdata_len, &retlen, (unsigned char *)&n);
+	ops.len = c->fsdata_len;
+	ops.ooblen = c->fsdata_len;;
+	ops.oobbuf = (uint8_t *)&n;
+	ops.ooboffs = c->fsdata_pos;
+	ops.datbuf = NULL;
+	ops.mode = MTD_OOB_PLACE;
+
+	ret = c->mtd->write_oob(c->mtd, jeb->offset, &ops);
 
 	if (ret) {
-		D1(printk(KERN_WARNING "jffs2_write_nand_cleanmarker(): Write failed for block at %08x: error %d\n", jeb->offset, ret));
+		D1(printk(KERN_WARNING "jffs2_write_nand_cleanmarker(): "
+			  "Write failed for block at %08x: error %d\n",
+			  jeb->offset, ret));
 		return ret;
 	}
-	if (retlen != c->fsdata_len) {
-		D1(printk(KERN_WARNING "jffs2_write_nand_cleanmarker(): Short write for block at %08x: %zd not %d\n", jeb->offset, retlen, c->fsdata_len));
-		return ret;
+	if (ops.retlen != ops.len) {
+		D1(printk(KERN_WARNING "jffs2_write_nand_cleanmarker(): "
+			  "Short write for block at %08x: %zd not %d\n",
+			  jeb->offset, ops.retlen, ops.len));
+		return -EIO;
 	}
 	return 0;
 }
@@ -1185,6 +1186,10 @@ int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
 	if (!c->wbuf)
 		return -ENOMEM;
 
+	c->oobbuf = kmalloc(NR_OOB_SCAN_PAGES * c->mtd->oobsize, GFP_KERNEL);
+	if (!c->oobbuf)
+		return -ENOMEM;
+
 	res = jffs2_nand_set_oobinfo(c);
 
 #ifdef BREAKME
@@ -1202,6 +1207,7 @@ int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
 void jffs2_nand_flash_cleanup(struct jffs2_sb_info *c)
 {
 	kfree(c->wbuf);
+	kfree(c->oobbuf);
 }
 
 int jffs2_dataflash_setup(struct jffs2_sb_info *c) {
-- 
cgit v1.2.3


From 9a1fcdfd4bee27c418424cac47abf7c049541297 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@cruncher.tec.linutronix.de>
Date: Mon, 29 May 2006 14:56:39 +0200
Subject: [MTD] NAND Signal that a bitflip was corrected by ECC

Return -EUCLEAN on read when a bitflip was detected and corrected, so the
clients can react and eventually copy the affected block to a spare one.
Make all in kernel users aware of the change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/jffs2/wbuf.c | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 1195d06d437..a7f153f79ec 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -296,10 +296,11 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
 		/* Do the read... */
 		ret = c->mtd->read(c->mtd, start, c->wbuf_ofs - start, &retlen, buf);
 
-		if (ret == -EBADMSG && retlen == c->wbuf_ofs - start) {
-			/* ECC recovered */
+		/* ECC recovered ? */
+		if ((ret == -EUCLEAN || ret == -EBADMSG) &&
+		    (retlen == c->wbuf_ofs - start))
 			ret = 0;
-		}
+
 		if (ret || retlen != c->wbuf_ofs - start) {
 			printk(KERN_CRIT "Old data are already lost in wbuf recovery. Data loss ensues.\n");
 
@@ -908,20 +909,21 @@ int jffs2_flash_read(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *re
 	down_read(&c->wbuf_sem);
 	ret = c->mtd->read(c->mtd, ofs, len, retlen, buf);
 
-	if ( (ret == -EBADMSG) && (*retlen == len) ) {
-		printk(KERN_WARNING "mtd->read(0x%zx bytes from 0x%llx) returned ECC error\n",
-		       len, ofs);
+	if ( (ret == -EBADMSG || ret == -EUCLEAN) && (*retlen == len) ) {
+		if (ret == -EBADMSG)
+			printk(KERN_WARNING "mtd->read(0x%zx bytes from 0x%llx)"
+			       " returned ECC error\n", len, ofs);
 		/*
-		 * We have the raw data without ECC correction in the buffer, maybe
-		 * we are lucky and all data or parts are correct. We check the node.
-		 * If data are corrupted node check will sort it out.
-		 * We keep this block, it will fail on write or erase and the we
-		 * mark it bad. Or should we do that now? But we should give him a chance.
-		 * Maybe we had a system crash or power loss before the ecc write or
-		 * a erase was completed.
+		 * We have the raw data without ECC correction in the buffer,
+		 * maybe we are lucky and all data or parts are correct. We
+		 * check the node.  If data are corrupted node check will sort
+		 * it out.  We keep this block, it will fail on write or erase
+		 * and the we mark it bad. Or should we do that now? But we
+		 * should give him a chance.  Maybe we had a system crash or
+		 * power loss before the ecc write or a erase was completed.
 		 * So we return success. :)
 		 */
-	 	ret = 0;
+		ret = 0;
 	}
 
 	/* if no writebuffer available or write buffer empty, return */
@@ -943,7 +945,7 @@ int jffs2_flash_read(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *re
 		orbf = (c->wbuf_ofs - ofs);	/* offset in read buffer */
 		if (orbf > len)			/* is write beyond write buffer ? */
 			goto exit;
-		lwbf = len - orbf; 		/* number of bytes to copy */
+		lwbf = len - orbf;		/* number of bytes to copy */
 		if (lwbf > c->wbuf_len)
 			lwbf = c->wbuf_len;
 	}
-- 
cgit v1.2.3


From 13ba42df4a385d7b77d7aac32b93bdcd73e6d6e1 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 30 May 2006 08:59:34 +0100
Subject: [JFFS2] Fix calculation of potential summary marker offset on NOR
 flash.

Helps if we look _inside_ the buffer, rather than adding jeb->offset to
it. Doh.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/scan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 42c1ff21d35..61618080b86 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -463,7 +463,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
 	      
 		if (!buf_size) {
 			/* XIP case. Just look, point at the summary if it's there */
-			sm = (void *)buf + jeb->offset - sizeof(*sm);
+			sm = (void *)buf + c->sector_size - sizeof(*sm);
 			if (je32_to_cpu(sm->magic) == JFFS2_SUM_MAGIC) {
 				sumptr = buf + je32_to_cpu(sm->offset);
 				sumlen = c->sector_size - je32_to_cpu(sm->offset);
-- 
cgit v1.2.3


From 098a19811b483f8e047d8292641c88e776452de0 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Tue, 30 May 2006 09:00:14 +0100
Subject: [JFFS2] Preallocate node refs for cleanmarker in summary scan

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/summary.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 00e856e4fdb..51bf1654ce3 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -564,7 +564,6 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 	struct jffs2_unknown_node crcnode;
 	int ret, ofs;
 	uint32_t crc;
-	int err;
 
 	ofs = c->sector_size - sumsize;
 
@@ -606,16 +605,20 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb
 
 		dbg_summary("Summary : CLEANMARKER node \n");
 
+		ret = jffs2_prealloc_raw_node_refs(c, jeb, 1);
+		if (ret)
+			return ret;
+
 		if (je32_to_cpu(summary->cln_mkr) != c->cleanmarker_size) {
 			dbg_summary("CLEANMARKER node has totlen 0x%x != normal 0x%x\n",
 				je32_to_cpu(summary->cln_mkr), c->cleanmarker_size);
-			if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(summary->cln_mkr)))))
-				return err;
+			if ((ret = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(summary->cln_mkr)))))
+				return ret;
 		} else if (jeb->first_node) {
 			dbg_summary("CLEANMARKER node not first node in block "
 					"(0x%08x)\n", jeb->offset);
-			if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(summary->cln_mkr)))))
-				return err;
+			if ((ret = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(summary->cln_mkr)))))
+				return ret;
 		} else {
 			jffs2_link_node_ref(c, jeb, jeb->offset | REF_NORMAL,
 					    je32_to_cpu(summary->cln_mkr), NULL);
-- 
cgit v1.2.3


From 26a21b980b1897b11fd7f9ba4bf6060c9e15df10 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Wed, 31 May 2006 18:05:34 +0000
Subject: [CIFS] Cleanup extra whitespace in dmesg logging. Update cifs change
 log

---
 fs/cifs/CHANGES     |  5 ++++-
 fs/cifs/asn1.c      | 10 +++++-----
 fs/cifs/cifssmb.c   |  2 +-
 fs/cifs/dir.c       |  2 +-
 fs/cifs/file.c      | 18 +++++++++---------
 fs/cifs/inode.c     | 24 ++++++++++++------------
 fs/cifs/link.c      |  7 +++----
 fs/cifs/readdir.c   |  2 +-
 fs/cifs/transport.c |  3 +--
 9 files changed, 37 insertions(+), 36 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 7271bb0257f..953d2f7f88b 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -3,7 +3,10 @@ Version 1.43
 POSIX locking to servers which support CIFS POSIX Extensions
 (disabled by default controlled by proc/fs/cifs/Experimental).
 Handle conversion of long share names (especially Asian languages)
-to Unicode during mount. 
+to Unicode during mount. Fix memory leak in sess struct on reconnect.
+Fix rare oops after acpi suspend.  Fix O_TRUNC opens to overwrite on
+cifs open which helps rare case when setpathinfo fails or server does
+not support it. 
 
 Version 1.42
 ------------
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index 086ae8f4a20..031cdf29325 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -467,7 +467,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
 	asn1_open(&ctx, security_blob, length);
 
 	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-		cFYI(1, ("Error decoding negTokenInit header "));
+		cFYI(1, ("Error decoding negTokenInit header"));
 		return 0;
 	} else if ((cls != ASN1_APL) || (con != ASN1_CON)
 		   || (tag != ASN1_EOC)) {
@@ -495,7 +495,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
 		}
 
 		if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-			cFYI(1, ("Error decoding negTokenInit "));
+			cFYI(1, ("Error decoding negTokenInit"));
 			return 0;
 		} else if ((cls != ASN1_CTX) || (con != ASN1_CON)
 			   || (tag != ASN1_EOC)) {
@@ -505,7 +505,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
 		}
 
 		if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-			cFYI(1, ("Error decoding negTokenInit "));
+			cFYI(1, ("Error decoding negTokenInit"));
 			return 0;
 		} else if ((cls != ASN1_UNI) || (con != ASN1_CON)
 			   || (tag != ASN1_SEQ)) {
@@ -515,7 +515,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
 		}
 
 		if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-			cFYI(1, ("Error decoding 2nd part of negTokenInit "));
+			cFYI(1, ("Error decoding 2nd part of negTokenInit"));
 			return 0;
 		} else if ((cls != ASN1_CTX) || (con != ASN1_CON)
 			   || (tag != ASN1_EOC)) {
@@ -527,7 +527,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
 
 		if (asn1_header_decode
 		    (&ctx, &sequence_end, &cls, &con, &tag) == 0) {
-			cFYI(1, ("Error decoding 2nd part of negTokenInit "));
+			cFYI(1, ("Error decoding 2nd part of negTokenInit"));
 			return 0;
 		} else if ((cls != ASN1_UNI) || (con != ASN1_CON)
 			   || (tag != ASN1_SEQ)) {
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 925881e00ff..f00369277c0 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -2239,7 +2239,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
 			}
 			symlinkinfo[buflen] = 0; /* just in case so the caller
 					does not go off the end of the buffer */
-			cFYI(1,("readlink result - %s ",symlinkinfo));
+			cFYI(1,("readlink result - %s",symlinkinfo));
 		}
 	}
 qreparse_out:
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 82315edc77d..57bdf7f734b 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -191,7 +191,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	} 
 	if (rc) {
-		cFYI(1, ("cifs_create returned 0x%x ", rc));
+		cFYI(1, ("cifs_create returned 0x%x", rc));
 	} else {
 		/* If Open reported that we actually created a file
 		then we now have to set the mode if possible */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e2b4ce1dad6..379369ecda9 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -201,7 +201,7 @@ int cifs_open(struct inode *inode, struct file *file)
 		} else {
 			if (file->f_flags & O_EXCL)
 				cERROR(1, ("could not find file instance for "
-					   "new file %p ", file));
+					   "new file %p", file));
 		}
 	}
 
@@ -272,7 +272,7 @@ int cifs_open(struct inode *inode, struct file *file)
 				& CIFS_MOUNT_MAP_SPECIAL_CHR);
 	}
 	if (rc) {
-		cFYI(1, ("cifs_open returned 0x%x ", rc));
+		cFYI(1, ("cifs_open returned 0x%x", rc));
 		goto out;
 	}
 	file->private_data =
@@ -409,8 +409,8 @@ static int cifs_reopen_file(struct inode *inode, struct file *file,
 				CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc) {
 		up(&pCifsFile->fh_sem);
-		cFYI(1, ("cifs_open returned 0x%x ", rc));
-		cFYI(1, ("oplock: %d ", oplock));
+		cFYI(1, ("cifs_open returned 0x%x", rc));
+		cFYI(1, ("oplock: %d", oplock));
 	} else {
 		pCifsFile->netfid = netfid;
 		pCifsFile->invalidHandle = FALSE;
@@ -531,7 +531,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
 	    (struct cifsFileInfo *)file->private_data;
 	char *ptmp;
 
-	cFYI(1, ("Closedir inode = 0x%p with ", inode));
+	cFYI(1, ("Closedir inode = 0x%p", inode));
 
 	xid = GetXid();
 
@@ -605,7 +605,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 	}
 	if (pfLock->fl_flags & FL_ACCESS)
 		cFYI(1, ("Process suspended by mandatory locking - "
-			 "not implemented yet "));
+			 "not implemented yet"));
 	if (pfLock->fl_flags & FL_LEASE)
 		cFYI(1, ("Lease on file - not implemented yet"));
 	if (pfLock->fl_flags & 
@@ -1377,7 +1377,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
 
 	xid = GetXid();
 
-	cFYI(1, ("Sync file - name: %s datasync: 0x%x ", 
+	cFYI(1, ("Sync file - name: %s datasync: 0x%x", 
 		dentry->d_name.name, datasync));
 	
 	rc = filemap_fdatawrite(inode->i_mapping);
@@ -1406,7 +1406,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
 /*	fill in rpages then 
 	result = cifs_pagein_inode(inode, index, rpages); */ /* BB finish */
 
-/*	cFYI(1, ("rpages is %d for sync page of Index %ld ", rpages, index));
+/*	cFYI(1, ("rpages is %d for sync page of Index %ld", rpages, index));
 
 #if 0
 	if (rc < 0)
@@ -1838,7 +1838,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
 	if (rc < 0)
 		goto io_error;
 	else
-		cFYI(1, ("Bytes read %d ",rc));
+		cFYI(1, ("Bytes read %d",rc));
                                                                                                                            
 	file->f_dentry->d_inode->i_atime =
 		current_fs_time(file->f_dentry->d_inode->i_sb);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 4093764ef46..77a9e2f912f 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -41,7 +41,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 	char *tmp_path;
 
 	pTcon = cifs_sb->tcon;
-	cFYI(1, ("Getting info on %s ", search_path));
+	cFYI(1, ("Getting info on %s", search_path));
 	/* could have done a find first instead but this returns more info */
 	rc = CIFSSMBUnixQPathInfo(xid, pTcon, search_path, &findData,
 				  cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
@@ -97,9 +97,9 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 		inode = *pinode;
 		cifsInfo = CIFS_I(inode);
 
-		cFYI(1, ("Old time %ld ", cifsInfo->time));
+		cFYI(1, ("Old time %ld", cifsInfo->time));
 		cifsInfo->time = jiffies;
-		cFYI(1, ("New time %ld ", cifsInfo->time));
+		cFYI(1, ("New time %ld", cifsInfo->time));
 		/* this is ok to set on every inode revalidate */
 		atomic_set(&cifsInfo->inUse,1);
 
@@ -421,23 +421,23 @@ int cifs_get_inode_info(struct inode **pinode,
 		inode = *pinode;
 		cifsInfo = CIFS_I(inode);
 		cifsInfo->cifsAttrs = attr;
-		cFYI(1, ("Old time %ld ", cifsInfo->time));
+		cFYI(1, ("Old time %ld", cifsInfo->time));
 		cifsInfo->time = jiffies;
-		cFYI(1, ("New time %ld ", cifsInfo->time));
+		cFYI(1, ("New time %ld", cifsInfo->time));
 
 		/* blksize needs to be multiple of two. So safer to default to
 		blksize and blkbits set in superblock so 2**blkbits and blksize
 		will match rather than setting to:
 		(pTcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) & 0xFFFFFE00;*/
 
-		/* Linux can not store file creation time unfortunately so we ignore it */
+		/* Linux can not store file creation time so ignore it */
 		inode->i_atime =
 		    cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime));
 		inode->i_mtime =
 		    cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime));
 		inode->i_ctime =
 		    cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime));
-		cFYI(0, ("Attributes came in as 0x%x ", attr));
+		cFYI(0, ("Attributes came in as 0x%x", attr));
 
 		/* set default mode. will override for dirs below */
 		if (atomic_read(&cifsInfo->inUse) == 0)
@@ -731,7 +731,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 	rc = CIFSSMBMkDir(xid, pTcon, full_path, cifs_sb->local_nls,
 			  cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc) {
-		cFYI(1, ("cifs_mkdir returned 0x%x ", rc));
+		cFYI(1, ("cifs_mkdir returned 0x%x", rc));
 		d_drop(direntry);
 	} else {
 		inode->i_nlink++;
@@ -798,7 +798,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 	char *full_path = NULL;
 	struct cifsInodeInfo *cifsInode;
 
-	cFYI(1, ("cifs_rmdir, inode = 0x%p with ", inode));
+	cFYI(1, ("cifs_rmdir, inode = 0x%p", inode));
 
 	xid = GetXid();
 
@@ -1121,7 +1121,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
 
 	xid = GetXid();
 
-	cFYI(1, ("In cifs_setattr, name = %s attrs->iavalid 0x%x ",
+	cFYI(1, ("In cifs_setattr, name = %s attrs->iavalid 0x%x",
 		 direntry->d_name.name, attrs->ia_valid));
 
 	cifs_sb = CIFS_SB(direntry->d_inode->i_sb);
@@ -1289,7 +1289,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
 		it may be useful to Windows - but we do
 		not want to set ctime unless some other
 		timestamp is changing */
-		cFYI(1, ("CIFS - CTIME changed "));
+		cFYI(1, ("CIFS - CTIME changed"));
 		time_buf.ChangeTime =
 		    cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime));
 	} else
@@ -1356,7 +1356,7 @@ cifs_setattr_exit:
 
 void cifs_delete_inode(struct inode *inode)
 {
-	cFYI(1, ("In cifs_delete_inode, inode = 0x%p ", inode));
+	cFYI(1, ("In cifs_delete_inode, inode = 0x%p", inode));
 	/* may have to add back in if and when safe distributed caching of
 	   directories added e.g. via FindNotify */
 }
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 2ec99f83314..a57f5d6e621 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -167,7 +167,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
 		return -ENOMEM;
 	}
 
-	cFYI(1, ("Full path: %s ", full_path));
+	cFYI(1, ("Full path: %s", full_path));
 	cFYI(1, ("symname is %s", symname));
 
 	/* BB what if DFS and this volume is on different share? BB */
@@ -186,8 +186,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
 						 inode->i_sb,xid);
 
 		if (rc != 0) {
-			cFYI(1,
-			     ("Create symlink worked but get_inode_info failed with rc = %d ",
+			cFYI(1, ("Create symlink ok, getinodeinfo fail rc = %d",
 			      rc));
 		} else {
 			if (pTcon->nocase)
@@ -289,7 +288,7 @@ cifs_readlink(struct dentry *direntry, char __user *pBuffer, int buflen)
 					else {
 						cFYI(1,("num referral: %d",num_referrals));
 						if(referrals) {
-							cFYI(1,("referral string: %s ",referrals));
+							cFYI(1,("referral string: %s",referrals));
 							strncpy(tmpbuffer, referrals, len-1);                            
 						}
 					}
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index b689c503512..6b36c43d38f 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -909,7 +909,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 	case 1:
 		if (filldir(direntry, "..", 2, file->f_pos,
 		     file->f_dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) {
-			cERROR(1, ("Filldir for parent dir failed "));
+			cERROR(1, ("Filldir for parent dir failed"));
 			rc = -ENOMEM;
 			break;
 		}
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 3da80409466..17ba329e2b3 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -654,8 +654,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 
 	if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
 		up(&ses->server->tcpSem);
-		cERROR(1,
-		       ("Illegal length, greater than maximum frame, %d ",
+		cERROR(1, ("Illegal length, greater than maximum frame, %d",
 			in_buf->smb_buf_length));
 		DeleteMidQEntry(midQ);
 		/* If not lock req, update # of requests on wire to server */
-- 
cgit v1.2.3


From 3979877e5606ecc58c5a31bd0078c6d80ba9cbe7 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Wed, 31 May 2006 22:40:51 +0000
Subject: [CIFS] Support for setting up SMB sessions to legacy lanman servers

---
 fs/Kconfig             |  40 +++-
 fs/cifs/CHANGES        |   5 +
 fs/cifs/Makefile       |   2 +-
 fs/cifs/cifs_debug.c   |  84 ++++++--
 fs/cifs/cifs_debug.h   |   4 +
 fs/cifs/cifs_unicode.c |   1 +
 fs/cifs/cifsencrypt.c  |   2 +
 fs/cifs/cifsfs.c       |   4 +-
 fs/cifs/cifsglob.h     |  51 +++--
 fs/cifs/cifspdu.h      |  37 +++-
 fs/cifs/cifsproto.h    |   2 +-
 fs/cifs/cifssmb.c      |  95 ++++++++-
 fs/cifs/connect.c      |  39 ++--
 fs/cifs/dir.c          |   2 +-
 fs/cifs/fcntl.c        |   4 +-
 fs/cifs/inode.c        |   3 +-
 fs/cifs/misc.c         |  10 +-
 fs/cifs/readdir.c      |  16 +-
 fs/cifs/sess.c         | 511 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/cifs/smbencrypt.c   |   1 +
 20 files changed, 837 insertions(+), 76 deletions(-)
 create mode 100644 fs/cifs/sess.c

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index f9b5842c8d2..c4eaacb8607 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1663,7 +1663,7 @@ config CIFS_STATS
 	  mounted by the cifs client to be displayed in /proc/fs/cifs/Stats
 
 config CIFS_STATS2
-	bool "CIFS extended statistics"
+	bool "Extended statistics"
 	depends on CIFS_STATS
 	help
 	  Enabling this option will allow more detailed statistics on SMB
@@ -1676,6 +1676,32 @@ config CIFS_STATS2
 	  Unless you are a developer or are doing network performance analysis
 	  or tuning, say N.
 
+config CIFS_WEAK_PW_HASH
+	bool "Support legacy servers which use weaker LANMAN security"
+	depends on CIFS
+	help
+	  Modern CIFS servers including Samba and most Windows versions
+	  (since 1997) support stronger NTLM (and even NTLMv2 and Kerberos)
+	  security mechanisms. These hash the password more securely
+	  than the mechanisms used in the older LANMAN version of the
+          SMB protocol needed to establish sessions with old SMB servers.
+
+	  Enabling this option allows the cifs module to mount to older
+	  LANMAN based servers such as OS/2 and Windows 95, but such
+	  mounts may be less secure than mounts using NTLM or more recent
+	  security mechanisms if you are on a public network.  Unless you
+	  have a need to access old SMB servers (and are on a private 
+	  network) you probably want to say N.  Even if this support
+	  is enabled in the kernel build, they will not be used
+	  automatically. At runtime LANMAN mounts are disabled but
+	  can be set to required (or optional) either in
+	  /proc/fs/cifs (see fs/cifs/README for more detail) or via an
+	  option on the mount command. This support is disabled by 
+	  default in order to reduce the possibility of a downgrade
+	  attack.
+ 
+	  If unsure, say N.
+
 config CIFS_XATTR
         bool "CIFS extended attributes"
         depends on CIFS
@@ -1704,6 +1730,16 @@ config CIFS_POSIX
 	  (such as Samba 3.10 and later) which can negotiate
 	  CIFS POSIX ACL support.  If unsure, say N.
 
+config CIFS_DEBUG2
+	bool "Enable additional CIFS debugging routines
+	help
+	   Enabling this option adds a few more debugging routines
+	   to the cifs code which slightly increases the size of
+	   the cifs module and can cause additional logging of debug
+	   messages in some error paths, slowing performance. This
+	   option can be turned off unless you are debugging
+	   cifs problems.  If unsure, say N.
+	   
 config CIFS_EXPERIMENTAL
 	  bool "CIFS Experimental Features (EXPERIMENTAL)"
 	  depends on CIFS && EXPERIMENTAL
@@ -1719,7 +1755,7 @@ config CIFS_EXPERIMENTAL
 	    If unsure, say N.
 
 config CIFS_UPCALL
-	  bool "CIFS Kerberos/SPNEGO advanced session setup (EXPERIMENTAL)"
+	  bool "Kerberos/SPNEGO advanced session setup (EXPERIMENTAL)"
 	  depends on CIFS_EXPERIMENTAL
 	  select CONNECTOR
 	  help
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 953d2f7f88b..b878dfcff0f 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,8 @@
+Version 1.44
+------------
+Rewritten sessionsetup support, including support for legacy SMB
+session setup needed for OS/2 and older servers such as Windows 95 and 98.
+
 Version 1.43
 ------------
 POSIX locking to servers which support CIFS POSIX Extensions
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 58c77254a23..a26f26ed5a1 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -3,4 +3,4 @@
 #
 obj-$(CONFIG_CIFS) += cifs.o
 
-cifs-objs := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o fcntl.o readdir.o ioctl.o ntlmssp.o
+cifs-objs := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o fcntl.o readdir.o ioctl.o sess.o
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index f4124a32bef..7f4013a8607 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -39,7 +39,7 @@ cifs_dump_mem(char *label, void *data, int length)
 	char *charptr = data;
 	char buf[10], line[80];
 
-	printk(KERN_DEBUG "%s: dump of %d bytes of data at 0x%p\n\n", 
+	printk(KERN_DEBUG "%s: dump of %d bytes of data at 0x%p\n", 
 		label, length, data);
 	for (i = 0; i < length; i += 16) {
 		line[0] = 0;
@@ -57,6 +57,57 @@ cifs_dump_mem(char *label, void *data, int length)
 	}
 }
 
+#ifdef CONFIG_CIFS_DEBUG2
+void cifs_dump_detail(struct smb_hdr * smb)
+{
+	cERROR(1,("Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d",
+		  smb->Command, smb->Status.CifsError,
+		  smb->Flags, smb->Flags2, smb->Mid, smb->Pid));
+	cERROR(1,("smb buf %p len %d", smb, smbCalcSize_LE(smb)));
+}
+
+
+void cifs_dump_mids(struct TCP_Server_Info * server)
+{
+	struct list_head *tmp;
+	struct mid_q_entry * mid_entry;
+
+	if(server == NULL)
+		return;
+
+	cERROR(1,("Dump pending requests:"));
+	spin_lock(&GlobalMid_Lock);
+	list_for_each(tmp, &server->pending_mid_q) {
+		mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
+		if(mid_entry) {
+			cERROR(1,("State: %d Cmd: %d Pid: %d Tsk: %p Mid %d",
+				mid_entry->midState,
+				(int)mid_entry->command,
+				mid_entry->pid,
+				mid_entry->tsk,
+				mid_entry->mid));
+#ifdef CONFIG_CIFS_STATS2
+			cERROR(1,("IsLarge: %d buf: %p time rcv: %ld now: %ld",
+				mid_entry->largeBuf,
+				mid_entry->resp_buf,
+				mid_entry->when_received,
+				jiffies));
+#endif /* STATS2 */
+			cERROR(1,("IsMult: %d IsEnd: %d", mid_entry->multiRsp,
+				  mid_entry->multiEnd));
+			if(mid_entry->resp_buf) {
+				cifs_dump_detail(mid_entry->resp_buf);
+				cifs_dump_mem("existing buf: ",
+					mid_entry->resp_buf,
+					62 /* fixme */);
+			}
+			
+		}
+	}
+	spin_unlock(&GlobalMid_Lock);
+}
+#endif /* CONFIG_CIFS_DEBUG2 */
+
 #ifdef CONFIG_PROC_FS
 static int
 cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset,
@@ -73,7 +124,6 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset,
 
 	*beginBuffer = buf + offset;
 
-	
 	length =
 	    sprintf(buf,
 		    "Display Internal CIFS Data Structures for Debugging\n"
@@ -397,10 +447,10 @@ static read_proc_t multiuser_mount_read;
 static write_proc_t multiuser_mount_write;
 static read_proc_t extended_security_read;
 static write_proc_t extended_security_write;
-static read_proc_t ntlmv2_enabled_read;
+/* static read_proc_t ntlmv2_enabled_read;
 static write_proc_t ntlmv2_enabled_write;
 static read_proc_t packet_signing_enabled_read;
-static write_proc_t packet_signing_enabled_write;
+static write_proc_t packet_signing_enabled_write;*/
 static read_proc_t experimEnabled_read;
 static write_proc_t experimEnabled_write;
 static read_proc_t linuxExtensionsEnabled_read;
@@ -469,7 +519,7 @@ cifs_proc_init(void)
 	if (pde)
 		pde->write_proc = lookupFlag_write;
 
-	pde =
+/*	pde =
 	    create_proc_read_entry("NTLMV2Enabled", 0, proc_fs_cifs,
 				ntlmv2_enabled_read, NULL);
 	if (pde)
@@ -479,7 +529,7 @@ cifs_proc_init(void)
 	    create_proc_read_entry("PacketSigningEnabled", 0, proc_fs_cifs,
 				packet_signing_enabled_read, NULL);
 	if (pde)
-		pde->write_proc = packet_signing_enabled_write;
+		pde->write_proc = packet_signing_enabled_write;*/
 }
 
 void
@@ -496,9 +546,9 @@ cifs_proc_clean(void)
 #endif
 	remove_proc_entry("MultiuserMount", proc_fs_cifs);
 	remove_proc_entry("OplockEnabled", proc_fs_cifs);
-	remove_proc_entry("NTLMV2Enabled",proc_fs_cifs);
+/*	remove_proc_entry("NTLMV2Enabled",proc_fs_cifs); */
 	remove_proc_entry("ExtendedSecurity",proc_fs_cifs);
-	remove_proc_entry("PacketSigningEnabled",proc_fs_cifs);
+/*	remove_proc_entry("PacketSigningEnabled",proc_fs_cifs); */
 	remove_proc_entry("LinuxExtensionsEnabled",proc_fs_cifs);
 	remove_proc_entry("Experimental",proc_fs_cifs);
 	remove_proc_entry("LookupCacheEnabled",proc_fs_cifs);
@@ -787,7 +837,7 @@ extended_security_read(char *page, char **start, off_t off,
 {
 	int len;
 
-	len = sprintf(page, "%d\n", extended_security);
+	len = sprintf(page, "0x%x\n", extended_security);
 
 	len -= off;
 	*start = page + off;
@@ -808,19 +858,25 @@ extended_security_write(struct file *file, const char __user *buffer,
 {
 	char c;
 	int rc;
+	cERROR(1,("size %ld",count)); /* BB removeme BB */
+	if((count < 2) || (count > 8))
+		return -EINVAL;
 
 	rc = get_user(c, buffer);
+
+/* BB fixme need to parse more characters in order to handle CIFSSEC flags */ 
+
 	if (rc)
 		return rc;
 	if (c == '0' || c == 'n' || c == 'N')
-		extended_security = 0;
+		extended_security = CIFSSEC_DEF; /* default */
 	else if (c == '1' || c == 'y' || c == 'Y')
-		extended_security = 1;
+		extended_security = CIFSSEC_MAX;
 
 	return count;
 }
 
-static int
+/* static int
 ntlmv2_enabled_read(char *page, char **start, off_t off,
 		       int count, int *eof, void *data)
 {
@@ -855,6 +911,8 @@ ntlmv2_enabled_write(struct file *file, const char __user *buffer,
 		ntlmv2_support = 0;
 	else if (c == '1' || c == 'y' || c == 'Y')
 		ntlmv2_support = 1;
+	else if (c == '2')
+		ntlmv2_support = 2;
 
 	return count;
 }
@@ -898,7 +956,7 @@ packet_signing_enabled_write(struct file *file, const char __user *buffer,
 		sign_CIFS_PDUs = 2;
 
 	return count;
-}
+} */
 
 
 #endif
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index 4304d9dcfb6..c26cd0d2c6d 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -24,6 +24,10 @@
 #define _H_CIFS_DEBUG
 
 void cifs_dump_mem(char *label, void *data, int length);
+#ifdef CONFIG_CIFS_DEBUG2
+void cifs_dump_detail(struct smb_hdr *);
+void cifs_dump_mids(struct TCP_Server_Info *);
+#endif
 extern int traceSMB;		/* flag which enables the function below */
 void dump_smb(struct smb_hdr *, int);
 #define CIFS_INFO	0x01
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index d2b12825594..d2a8b2941fc 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -22,6 +22,7 @@
 #include "cifs_unicode.h"
 #include "cifs_uniupr.h"
 #include "cifspdu.h"
+#include "cifsglob.h"
 #include "cifs_debug.h"
 
 /*
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index e7d63737e65..08781a4698b 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -225,6 +225,8 @@ int CalcNTLMv2_partial_mac_key(struct cifsSesInfo * ses, struct nls_table * nls_
 	user_name_len = strlen(ses->userName);
 	if(user_name_len > MAX_USERNAME_SIZE)
 		return -EINVAL;
+	if(ses->domainName == NULL)
+		return -EINVAL; /* BB should we use CIFS_LINUX_DOM */
 	dom_name_len = strlen(ses->domainName);
 	if(dom_name_len > MAX_USERNAME_SIZE)
 		return -EINVAL;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index c262d8874ce..700570522b2 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -56,8 +56,8 @@ unsigned int experimEnabled = 0;
 unsigned int linuxExtEnabled = 1;
 unsigned int lookupCacheEnabled = 1;
 unsigned int multiuser_mount = 0;
-unsigned int extended_security = 0;
-unsigned int ntlmv2_support = 0;
+unsigned int extended_security = CIFSSEC_DEF;
+/* unsigned int ntlmv2_support = 0; */
 unsigned int sign_CIFS_PDUs = 1;
 extern struct task_struct * oplockThread; /* remove sparse warning */
 struct task_struct * oplockThread = NULL;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 006eb33bff5..7a042041a16 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -88,7 +88,8 @@ enum statusEnum {
 };
 
 enum securityEnum {
-	NTLM = 0,		/* Legacy NTLM012 auth with NTLM hash */
+	LANMAN = 0,             /* Legacy LANMAN auth */
+	NTLM,			/* Legacy NTLM012 auth with NTLM hash */
 	NTLMv2,			/* Legacy NTLM auth with NTLMv2 hash */
 	RawNTLMSSP,		/* NTLMSSP without SPNEGO */
 	NTLMSSP,		/* NTLMSSP via SPNEGO */
@@ -179,7 +180,9 @@ struct cifsUidInfo {
 struct cifsSesInfo {
 	struct list_head cifsSessionList;
 	struct semaphore sesSem;
+#if 0
 	struct cifsUidInfo *uidInfo;	/* pointer to user info */
+#endif
 	struct TCP_Server_Info *server;	/* pointer to server info */
 	atomic_t inUse; /* # of mounts (tree connections) on this ses */
 	enum statusEnum status;
@@ -194,7 +197,7 @@ struct cifsSesInfo {
 	char serverName[SERVER_NAME_LEN_WITH_NULL * 2];	/* BB make bigger for 
 				TCP names - will ipv6 and sctp addresses fit? */
 	char userName[MAX_USERNAME_SIZE + 1];
-	char domainName[MAX_USERNAME_SIZE + 1];
+	char * domainName;
 	char * password;
 };
 /* session flags */
@@ -391,9 +394,9 @@ struct mid_q_entry {
 	struct smb_hdr *resp_buf;	/* response buffer */
 	int midState;	/* wish this were enum but can not pass to wait_event */
 	__u8 command;	/* smb command code */
-	unsigned multiPart:1;	/* multiple responses to one SMB request */
 	unsigned largeBuf:1;    /* if valid response, is pointer to large buf */
-	unsigned multiResp:1;   /* multiple trans2 responses for one request  */
+	unsigned multiRsp:1;   /* multiple trans2 responses for one request  */
+	unsigned multiEnd:1; /* both received */
 };
 
 struct oplock_q_entry {
@@ -430,15 +433,35 @@ struct dir_notify_req {
 #define   CIFS_LARGE_BUFFER     2
 #define   CIFS_IOVEC            4    /* array of response buffers */
 
-/* Type of session setup needed */
-#define   CIFS_PLAINTEXT	0
-#define   CIFS_LANMAN		1
-#define   CIFS_NTLM		2
-#define   CIFS_NTLMSSP_NEG	3
-#define   CIFS_NTLMSSP_AUTH	4
-#define   CIFS_SPNEGO_INIT	5
-#define   CIFS_SPNEGO_TARG	6
-
+/* Security Flags: indicate type of session setup needed */
+#define   CIFSSEC_MAY_SIGN	0x00001
+#define   CIFSSEC_MAY_NTLM	0x00002
+#define   CIFSSEC_MAY_NTLMV2	0x00004
+#define   CIFSSEC_MAY_KRB5	0x00008
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+#define   CIFSSEC_MAY_LANMAN	0x00010
+#define   CIFSSEC_MAY_PLNTXT	0x00020
+#endif /* weak passwords */
+#define   CIFSSEC_MAY_SEAL	0x00040 /* not supported yet */
+
+#define   CIFSSEC_MUST_SIGN	0x01001
+/* note that only one of the following can be set so the
+result of setting MUST flags more than once will be to
+require use of the stronger protocol */
+#define   CIFSSEC_MUST_NTLM	0x02002
+#define   CIFSSEC_MUST_NTLMV2	0x04004
+#define   CIFSSEC_MUST_KRB5	0x08008
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+#define   CIFSSEC_MUST_LANMAN	0x10010
+#define   CIFSSEC_MUST_PLNTXT	0x20020
+#define   CIFSSEC_MASK          0x37037 /* current flags supported if weak */
+#else	  
+#define	  CIFSSEC_MASK          0x07007 /* flags supported if no weak config */
+#endif /* WEAK_PW_HASH */
+#define   CIFSSEC_MUST_SEAL	0x40040 /* not supported yet */
+
+#define   CIFSSEC_DEF  CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2
+#define   CIFSSEC_MAX  CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2
 /*
  *****************************************************************
  * All constants go here
@@ -540,8 +563,8 @@ GLOBAL_EXTERN unsigned int experimEnabled;
 GLOBAL_EXTERN unsigned int lookupCacheEnabled;
 GLOBAL_EXTERN unsigned int extended_security;	/* if on, session setup sent 
 				with more secure ntlmssp2 challenge/resp */
-GLOBAL_EXTERN unsigned int ntlmv2_support;  /* better optional password hash */
 GLOBAL_EXTERN unsigned int sign_CIFS_PDUs;  /* enable smb packet signing */
+GLOBAL_EXTERN unsigned int secFlags;
 GLOBAL_EXTERN unsigned int linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
 GLOBAL_EXTERN unsigned int CIFSMaxBufSize;  /* max size not including hdr */
 GLOBAL_EXTERN unsigned int cifs_min_rcv;    /* min size of big ntwrk buf pool */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index b2233ac05bd..e0ff9b56cc4 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -16,7 +16,7 @@
  *
  *   You should have received a copy of the GNU Lesser General Public License
  *   along with this library; if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
 #ifndef _CIFSPDU_H
@@ -24,8 +24,14 @@
 
 #include <net/sock.h>
 
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+#define LANMAN_PROT 0
+#define CIFS_PROT   1
+#else
 #define CIFS_PROT   0
-#define BAD_PROT    CIFS_PROT+1
+#endif
+#define POSIX_PROT  CIFS_PROT+1
+#define BAD_PROT 0xFFFF
 
 /* SMB command codes */
 /* Some commands have minimal (wct=0,bcc=0), or uninteresting, responses
@@ -400,6 +406,25 @@ typedef struct negotiate_req {
 	unsigned char DialectsArray[1];
 } __attribute__((packed)) NEGOTIATE_REQ;
 
+/* Dialect index is 13 for LANMAN */
+
+typedef struct lanman_neg_rsp {
+	struct smb_hdr hdr;	/* wct = 13 */
+	__le16 DialectIndex;
+	__le16 SecurityMode;
+	__le16 MaxBufSize;
+	__le16 MaxMpxCount;
+	__le16 MaxNumberVcs;
+	__le16 RawMode;
+	__le32 SessionKey;
+	__le32 ServerTime;
+	__le16 ServerTimeZone;
+	__le16 EncryptionKeyLength;
+	__le16 Reserved;
+	__u16  ByteCount;
+	unsigned char EncryptionKey[1];
+} __attribute__((packed)) LANMAN_NEG_RSP;
+
 typedef struct negotiate_rsp {
 	struct smb_hdr hdr;	/* wct = 17 */
 	__le16 DialectIndex;
@@ -520,8 +545,8 @@ typedef union smb_com_session_setup_andx {
 		__le16 MaxMpxCount;
 		__le16 VcNumber;
 		__u32 SessionKey;
-		__le16 PassswordLength;
-		__u32 Reserved;
+		__le16 PasswordLength;
+		__u32 Reserved; /* encrypt key len and offset */
 		__le16 ByteCount;
 		unsigned char AccountPassword[1];	/* followed by */
 		/* STRING AccountName */
@@ -1844,13 +1869,13 @@ typedef struct {
 typedef struct {
 	__le32 DeviceType;
 	__le32 DeviceCharacteristics;
-} __attribute__((packed)) FILE_SYSTEM_DEVICE_INFO;	/* device info, level 0x104 */
+} __attribute__((packed)) FILE_SYSTEM_DEVICE_INFO; /* device info level 0x104 */
 
 typedef struct {
 	__le32 Attributes;
 	__le32 MaxPathNameComponentLength;
 	__le32 FileSystemNameLen;
-	char FileSystemName[52]; /* do not really need to save this - so potentially get only subset of name */
+	char FileSystemName[52]; /* do not have to save this - get subset? */
 } __attribute__((packed)) FILE_SYSTEM_ATTRIBUTE_INFO;
 
 /******************************************************************************/
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 310ea2f0e0b..ff78cf7b0d1 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -69,7 +69,7 @@ extern int small_smb_init_no_tc(const int smb_cmd, const int wct,
 				struct cifsSesInfo *ses,
 				void ** request_buf);
 extern int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
-			     const int stage, int * pNTLMv2_flg,
+			     const int stage, 
 			     const struct nls_table *nls_cp);
 #endif
 extern __u16 GetNextMid(struct TCP_Server_Info *server);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index f00369277c0..da3154fa9c8 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -44,8 +44,11 @@ static struct {
 	int index;
 	char *name;
 } protocols[] = {
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+	{LANMAN_PROT, "\2LM1.2X002"},
+#endif /* weak password hashing for legacy clients */
 	{CIFS_PROT, "\2NT LM 0.12"}, 
-	{CIFS_PROT, "\2POSIX 2"},
+	{POSIX_PROT, "\2POSIX 2"},
 	{BAD_PROT, "\2"}
 };
 #else
@@ -53,11 +56,29 @@ static struct {
 	int index;
 	char *name;
 } protocols[] = {
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+	{LANMAN_PROT, "\2LM1.2X002"},
+#endif /* weak password hashing for legacy clients */
 	{CIFS_PROT, "\2NT LM 0.12"}, 
 	{BAD_PROT, "\2"}
 };
 #endif
 
+/* define the number of elements in the cifs dialect array */
+#ifdef CONFIG_CIFS_POSIX
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+#define CIFS_NUM_PROT 3
+#else
+#define CIFS_NUM_PROT 2
+#endif /* CIFS_WEAK_PW_HASH */
+#else /* not posix */
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+#define CIFS_NUM_PROT 2
+#else
+#define CIFS_NUM_PROT 1
+#endif /* CONFIG_CIFS_WEAK_PW_HASH */
+#endif /* CIFS_POSIX */
+
 
 /* Mark as invalid, all open files on tree connections since they
    were closed when session to server was lost */
@@ -322,7 +343,8 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
     /* potential retries of smb operations it turns out we can determine */
     /* from the mid flags when the request buffer can be resent without  */
     /* having to use a second distinct buffer for the response */
-	*response_buf = *request_buf; 
+	if(response_buf)
+		*response_buf = *request_buf; 
 
 	header_assemble((struct smb_hdr *) *request_buf, smb_command, tcon,
 			wct /*wct */ );
@@ -373,6 +395,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	NEGOTIATE_RSP *pSMBr;
 	int rc = 0;
 	int bytes_returned;
+	int i;
 	struct TCP_Server_Info * server;
 	u16 count;
 
@@ -388,19 +411,71 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 		return rc;
 	pSMB->hdr.Mid = GetNextMid(server);
 	pSMB->hdr.Flags2 |= SMBFLG2_UNICODE;
-	if (extended_security)
-		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
-
-	count = strlen(protocols[0].name) + 1;
-	strncpy(pSMB->DialectsArray, protocols[0].name, 30);	
-    /* null guaranteed to be at end of source and target buffers anyway */
-
+/*	if (extended_security)
+		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;*/
+	
+	count = 0;
+	for(i=0;i<CIFS_NUM_PROT;i++) {
+		strncpy(pSMB->DialectsArray+count, protocols[i].name, 16);
+		count += strlen(protocols[i].name) + 1;
+		/* null at end of source and target buffers anyway */
+	}
 	pSMB->hdr.smb_buf_length += count;
 	pSMB->ByteCount = cpu_to_le16(count);
 
 	rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc == 0) {
+		cFYI(1,("Dialect: %d", pSMBr->DialectIndex));
+		/* Check wct = 1 error case */
+		if((pSMBr->hdr.WordCount < 13)  
+			|| (pSMBr->DialectIndex == BAD_PROT)) {
+			/* core returns wct = 1, but we do not ask for
+			core - otherwise it just comes when dialect
+			index is -1 indicating we could not negotiate
+			a common dialect */
+			rc = -EOPNOTSUPP;
+			goto neg_err_exit;
+		} else if((pSMBr->hdr.WordCount == 13) && 
+			(pSMBr->DialectIndex == LANMAN_PROT)) {
+			struct lanman_neg_rsp * rsp = 
+				(struct lanman_neg_rsp *)pSMBr;
+
+
+			/* BB Mark ses struct as negotiated lanman level BB */
+			server->secType = LANMAN;
+			server->secMode = (__u8)le16_to_cpu(rsp->SecurityMode);
+			server->maxReq = le16_to_cpu(rsp->MaxMpxCount);
+			server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize),
+				(__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
+
+			/* BB what do we do with raw mode? BB */
+			server->timeZone = le16_to_cpu(rsp->ServerTimeZone);
+			/* Do we have to set signing flags? no signing
+			was available LANMAN - default should be ok */
+
+			/* BB FIXME set default dummy capabilities since
+			they are not returned by the server in this dialect */
+
+			/* get server time for time conversions and add
+			code to use it and timezone since this is not UTC */	
+
+			if (rsp->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
+				memcpy(server->cryptKey, rsp->EncryptionKey,
+					CIFS_CRYPTO_KEY_SIZE);
+			} else {
+				rc = -EIO;
+				goto neg_err_exit;
+			}
+
+			cFYI(1,("LANMAN negotiated")); /* BB removeme BB */
+			goto neg_err_exit;
+		} else if(pSMBr->hdr.WordCount != 17) {
+			/* unknown wct */
+			rc = -EOPNOTSUPP;
+			goto neg_err_exit;
+		}
+
 		server->secMode = pSMBr->SecurityMode;
 		if((server->secMode & SECMODE_USER) == 0)
 			cFYI(1,("share mode security"));
@@ -479,7 +554,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 		}
 				
 	}
-	
+neg_err_exit:	
 	cifs_buf_release(pSMB);
 	return rc;
 }
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index bae1479318d..7ffb8f244f6 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -49,8 +49,6 @@
 
 static DECLARE_COMPLETION(cifsd_complete);
 
-extern void SMBencrypt(unsigned char *passwd, unsigned char *c8,
-		       unsigned char *p24);
 extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
 			 unsigned char *p24);
 
@@ -585,9 +583,11 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
 						/* merge response - fix up 1st*/
 						if(coalesce_t2(smb_buffer, 
 							mid_entry->resp_buf)) {
+							mid_entry->multiRsp = 1;
 							break;
 						} else {
 							/* all parts received */
+							mid_entry->multiEnd = 1;
 							goto multi_t2_fnd; 
 						}
 					} else {
@@ -632,9 +632,14 @@ multi_t2_fnd:
 			wake_up_process(task_to_wake);
 		} else if ((is_valid_oplock_break(smb_buffer, server) == FALSE)
 		    && (isMultiRsp == FALSE)) {                          
-			cERROR(1, ("No task to wake, unknown frame rcvd!"));
+			cERROR(1, ("No task to wake, unknown frame rcvd! NumMids %d", midCount.counter));
 			cifs_dump_mem("Received Data is: ",(char *)smb_buffer,
 				      sizeof(struct smb_hdr));
+#ifdef CONFIG_CIFS_DEBUG2
+			cifs_dump_detail(smb_buffer);
+			cifs_dump_mids(server);
+#endif /* CIFS_DEBUG2 */
+			
 		}
 	} /* end while !EXITING */
 
@@ -976,7 +981,7 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
 			}
 			/* BB are there cases in which a comma can be valid in
 			a domain name and need special handling? */
-			if (strnlen(value, 65) < 65) {
+			if (strnlen(value, 256) < 256) {
 				vol->domainname = value;
 				cFYI(1, ("Domain name set"));
 			} else {
@@ -1762,9 +1767,14 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			if (volume_info.username)
 				strncpy(pSesInfo->userName,
 					volume_info.username,MAX_USERNAME_SIZE);
-			if (volume_info.domainname)
-				strncpy(pSesInfo->domainName,
-					volume_info.domainname,MAX_USERNAME_SIZE);
+			if (volume_info.domainname) {
+				int len = strlen(volume_info.domainname);
+				pSesInfo->domainName = 
+					kmalloc(len + 1, GFP_KERNEL);
+				if(pSesInfo->domainName)
+					strcpy(pSesInfo->domainName,
+						volume_info.domainname);
+			}
 			pSesInfo->linux_uid = volume_info.linux_uid;
 			down(&pSesInfo->sesSem);
 			rc = cifs_setup_session(xid,pSesInfo, cifs_sb->local_nls);
@@ -2054,7 +2064,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 			bcc_ptr++;
 		}
 		if(user == NULL)
-			bytes_returned = 0; /* skill null user */
+			bytes_returned = 0; /* skip null user */
 	        else
 			bytes_returned =
 			        cifs_strtoUCS((__le16 *) bcc_ptr, user, 100,
@@ -2635,8 +2645,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
 	    /* NTLMSSP_NEGOTIATE_ALWAYS_SIGN | */ NTLMSSP_NEGOTIATE_128;
 	if(sign_CIFS_PDUs)
 		negotiate_flags |= NTLMSSP_NEGOTIATE_SIGN;
-	if(ntlmv2_support)
-		negotiate_flags |= NTLMSSP_NEGOTIATE_NTLMV2;
+/*	if(ntlmv2_support)
+		negotiate_flags |= NTLMSSP_NEGOTIATE_NTLMV2;*/
 	/* setup pointers to domain name and workstation name */
 	bcc_ptr += SecurityBlobLength;
 
@@ -3429,7 +3439,10 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 			}
 			/* else do not bother copying these informational fields */
 		}
-		tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport);
+		if(smb_buffer_response->WordCount == 3)
+			tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport);
+		else
+			tcon->Flags = 0;
 		cFYI(1, ("Tcon flags: 0x%x ", tcon->Flags));
 	} else if ((rc == 0) && tcon == NULL) {
         /* all we need to save for IPC$ connection */
@@ -3528,8 +3541,8 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
 			pSesInfo->server->timeZone));
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 		if(experimEnabled > 1)
-			rc = CIFS_SessSetup(xid, pSesInfo, CIFS_NTLM /* type */,
-					    &ntlmv2_flag, nls_info);	
+			rc = CIFS_SessSetup(xid, pSesInfo,
+					    first_time, nls_info);
 		else
 #endif
 		if (extended_security
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 57bdf7f734b..e6ed64e94b7 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -113,7 +113,7 @@ cifs_bp_rename_retry:
 	full_path[namelen+2] = 0;
 BB remove above eight lines BB */
 
-/* Inode operations in similar order to how they appear in the Linux file fs.h */
+/* Inode operations in similar order to how they appear in Linux file fs.h */
 
 int
 cifs_create(struct inode *inode, struct dentry *direntry, int mode,
diff --git a/fs/cifs/fcntl.c b/fs/cifs/fcntl.c
index 633a9381132..d91a3d44e9e 100644
--- a/fs/cifs/fcntl.c
+++ b/fs/cifs/fcntl.c
@@ -91,14 +91,14 @@ int cifs_dir_notify(struct file * file, unsigned long arg)
 	if(full_path == NULL) {
 		rc = -ENOMEM;
 	} else {
-		cERROR(1,("cifs dir notify on file %s with arg 0x%lx",full_path,arg)); /* BB removeme BB */
+		cFYI(1,("dir notify on file %s Arg 0x%lx",full_path,arg));
 		rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, 
 			GENERIC_READ | SYNCHRONIZE, 0 /* create options */,
 			&netfid, &oplock,NULL, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 		/* BB fixme - add this handle to a notify handle list */
 		if(rc) {
-			cERROR(1,("Could not open directory for notify"));  /* BB remove BB */
+			cFYI(1,("Could not open directory for notify"));
 		} else {
 			filter = convert_to_cifs_notify_flags(arg);
 			if(filter != 0) {
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 77a9e2f912f..a609d266803 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1121,7 +1121,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
 
 	xid = GetXid();
 
-	cFYI(1, ("In cifs_setattr, name = %s attrs->iavalid 0x%x",
+	cFYI(1, ("setattr on file %s attrs->iavalid 0x%x",
 		 direntry->d_name.name, attrs->ia_valid));
 
 	cifs_sb = CIFS_SB(direntry->d_inode->i_sb);
@@ -1157,6 +1157,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
 		   when the local oplock break takes longer to flush
 		   writebehind data than the SMB timeout for the SetPathInfo
 		   request would allow */
+
 		open_file = find_writable_file(cifsInode);
 		if (open_file) {
 			__u16 nfid = open_file->netfid;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index fafd056426e..22c937e5884 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -101,6 +101,7 @@ sesInfoFree(struct cifsSesInfo *buf_to_free)
 	kfree(buf_to_free->serverDomain);
 	kfree(buf_to_free->serverNOS);
 	kfree(buf_to_free->password);
+	kfree(buf_to_free->domainName);
 	kfree(buf_to_free);
 }
 
@@ -499,11 +500,12 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 		if(pSMBr->ByteCount > sizeof(struct file_notify_information)) {
 			data_offset = le32_to_cpu(pSMBr->DataOffset);
 
-			pnotify = (struct file_notify_information *)((char *)&pSMBr->hdr.Protocol
-				+ data_offset);
-			cFYI(1,("dnotify on %s with action: 0x%x",pnotify->FileName,
+			pnotify = (struct file_notify_information *)
+				((char *)&pSMBr->hdr.Protocol + data_offset);
+			cFYI(1,("dnotify on %s Action: 0x%x",pnotify->FileName,
 				pnotify->Action));  /* BB removeme BB */
-	             /*   cifs_dump_mem("Received notify Data is: ",buf,sizeof(struct smb_hdr)+60); */
+	             /*   cifs_dump_mem("Rcvd notify Data: ",buf,
+				sizeof(struct smb_hdr)+60); */
 			return TRUE;
 		}
 		if(pSMBr->hdr.Status.CifsError) {
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 6b36c43d38f..53903a27f78 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -31,8 +31,8 @@
 #include "cifs_fs_sb.h"
 #include "cifsfs.h"
 
-/* BB fixme - add debug wrappers around this function to disable it fixme BB */
-/* static void dump_cifs_file_struct(struct file *file, char *label)
+#ifdef CONFIG_CIFS_DEBUG2
+static void dump_cifs_file_struct(struct file *file, char *label)
 {
 	struct cifsFileInfo * cf;
 
@@ -53,7 +53,8 @@
 		}
 		
 	}
-} */
+}
+#endif /* DEBUG2 */
 
 /* Returns one if new inode created (which therefore needs to be hashed) */
 /* Might check in the future if inode number changed so we can rehash inode */
@@ -597,7 +598,9 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 	. and .. for the root of a drive and for those we need
 	to start two entries earlier */
 
-/*	dump_cifs_file_struct(file, "In fce ");*/
+#ifdef CONFIG_CIFS_DEBUG2
+	dump_cifs_file_struct(file, "In fce ");
+#endif
 	if(((index_to_find < cifsFile->srch_inf.index_of_last_entry) && 
 	     is_dir_changed(file)) || 
 	   (index_to_find < first_entry_in_buffer)) {
@@ -980,9 +983,10 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 			rc = cifs_filldir(current_entry, file, 
 					filldir, direntry,tmp_buf);
 			file->f_pos++;
-			if(file->f_pos == cifsFile->srch_inf.index_of_last_entry) {
+			if(file->f_pos == 
+				cifsFile->srch_inf.index_of_last_entry) {
 				cFYI(1,("last entry in buf at pos %lld %s",
-					file->f_pos,tmp_buf)); /* BB removeme BB */
+					file->f_pos,tmp_buf));
 				cifs_save_resume_key(current_entry,cifsFile);
 				break;
 			} else 
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
new file mode 100644
index 00000000000..34998416eb7
--- /dev/null
+++ b/fs/cifs/sess.c
@@ -0,0 +1,511 @@
+/*
+ *   fs/cifs/sess.c
+ *
+ *   SMB/CIFS session setup handling routines
+ *
+ *   Copyright (c) International Business Machines  Corp., 2006
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "cifspdu.h"
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifs_unicode.h"
+#include "cifs_debug.h"
+#include "ntlmssp.h"
+#include "nterr.h"
+#include <linux/ctype.h>
+
+extern void SMBencrypt(unsigned char *passwd, unsigned char *c8,
+                       unsigned char *p24);
+
+extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
+                         unsigned char *p24);
+
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+
+static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB)
+{
+	__u32 capabilities = 0;
+
+	/* init fields common to all four types of SessSetup */
+	/* note that header is initialized to zero in header_assemble */
+	pSMB->req.AndXCommand = 0xFF;
+	pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf);
+	pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
+
+	/* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */
+
+	/* BB verify whether signing required on neg or just on auth frame 
+	   (and NTLM case) */
+
+	capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS |
+			CAP_LARGE_WRITE_X | CAP_LARGE_READ_X;
+
+	if(ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+		pSMB->req.hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
+
+	if (ses->capabilities & CAP_UNICODE) {
+		pSMB->req.hdr.Flags2 |= SMBFLG2_UNICODE;
+		capabilities |= CAP_UNICODE;
+	}
+	if (ses->capabilities & CAP_STATUS32) {
+		pSMB->req.hdr.Flags2 |= SMBFLG2_ERR_STATUS;
+		capabilities |= CAP_STATUS32;
+	}
+	if (ses->capabilities & CAP_DFS) {
+		pSMB->req.hdr.Flags2 |= SMBFLG2_DFS;
+		capabilities |= CAP_DFS;
+	}
+	if (ses->capabilities & CAP_UNIX) {
+		capabilities |= CAP_UNIX;
+	}
+
+	/* BB check whether to init vcnum BB */
+	return capabilities;
+}
+
+void unicode_ssetup_strings(char ** pbcc_area, struct cifsSesInfo *ses,
+			    const struct nls_table * nls_cp)
+{
+	char * bcc_ptr = *pbcc_area;
+	int bytes_ret = 0;
+
+	/* BB FIXME add check that strings total less
+	than 335 or will need to send them as arrays */
+
+	/* align unicode strings, must be word aligned */
+	if ((long) bcc_ptr % 2)	{
+		*bcc_ptr = 0;
+		bcc_ptr++;
+	}
+	/* copy user */
+	if(ses->userName == NULL) {
+		/* BB what about null user mounts - check that we do this BB */
+	} else { /* 300 should be long enough for any conceivable user name */
+		bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->userName,
+					  300, nls_cp);
+	}
+	bcc_ptr += 2 * bytes_ret;
+	bcc_ptr += 2; /* account for null termination */
+	/* copy domain */
+	if(ses->domainName == NULL)
+		bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr,
+					  "CIFS_LINUX_DOM", 32, nls_cp);
+	else
+		bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->domainName, 
+					  256, nls_cp);
+	bcc_ptr += 2 * bytes_ret;
+	bcc_ptr += 2;  /* account for null terminator */
+
+	/* Copy OS version */
+	bytes_ret = cifs_strtoUCS((__le16 *)bcc_ptr, "Linux version ", 32,
+				  nls_cp);
+	bcc_ptr += 2 * bytes_ret;
+	bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, system_utsname.release,
+				  32, nls_cp);
+	bcc_ptr += 2 * bytes_ret;
+	bcc_ptr += 2; /* trailing null */
+
+	bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS,
+                                  32, nls_cp);
+	bcc_ptr += 2 * bytes_ret;
+	bcc_ptr += 2; /* trailing null */
+
+	*pbcc_area = bcc_ptr;
+}
+
+void ascii_ssetup_strings(char ** pbcc_area, struct cifsSesInfo *ses,
+			  const struct nls_table * nls_cp)
+{
+	char * bcc_ptr = *pbcc_area;
+
+	/* copy user */
+	/* BB what about null user mounts - check that we do this BB */
+        /* copy user */
+        if(ses->userName == NULL) {
+                /* BB what about null user mounts - check that we do this BB */
+        } else { /* 300 should be long enough for any conceivable user name */
+                strncpy(bcc_ptr, ses->userName, 300);
+        }
+	/* BB improve check for overflow */
+        bcc_ptr += strnlen(ses->userName, 200);
+	*bcc_ptr = 0;
+        bcc_ptr++; /* account for null termination */
+
+        /* copy domain */
+	
+        if(ses->domainName == NULL) {
+                strcpy(bcc_ptr, "CIFS_LINUX_DOM");
+		bcc_ptr += 14;  /* strlen(CIFS_LINUX_DOM) */
+ 	} else {
+                strncpy(bcc_ptr, ses->domainName, 256); 
+		bcc_ptr += strnlen(ses->domainName, 256);
+	}
+	*bcc_ptr = 0;
+	bcc_ptr++;
+
+	/* BB check for overflow here */
+
+	strcpy(bcc_ptr, "Linux version ");
+	bcc_ptr += strlen("Linux version ");
+	strcpy(bcc_ptr, system_utsname.release);
+	bcc_ptr += strlen(system_utsname.release) + 1;
+
+	strcpy(bcc_ptr, CIFS_NETWORK_OPSYS);
+	bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1;
+
+        *pbcc_area = bcc_ptr;
+}
+
+int decode_unicode_ssetup(char ** pbcc_area, int bleft, struct cifsSesInfo *ses,
+                            const struct nls_table * nls_cp)
+{
+	int rc = 0;
+	int words_left, len;
+	char * data = *pbcc_area;
+
+
+
+	cFYI(1,("bleft %d",bleft));
+
+
+	/* word align, if bytes remaining is not even */
+	if(bleft % 2) {
+		bleft--;
+		data++;
+	}
+	words_left = bleft / 2;
+
+	/* save off server operating system */
+	len = UniStrnlen((wchar_t *) data, words_left);
+
+/* We look for obvious messed up bcc or strings in response so we do not go off
+   the end since (at least) WIN2K and Windows XP have a major bug in not null
+   terminating last Unicode string in response  */
+	if(len >= words_left)
+		return rc;
+
+	if(ses->serverOS)
+		kfree(ses->serverOS);
+	/* UTF-8 string will not grow more than four times as big as UCS-16 */
+	ses->serverOS = kzalloc(4 * len, GFP_KERNEL);
+	if(ses->serverOS != NULL) {
+		cifs_strfromUCS_le(ses->serverOS, (__le16 *)data, len,
+				   nls_cp);
+	}
+	data += 2 * (len + 1);
+	words_left -= len + 1;
+
+	/* save off server network operating system */
+	len = UniStrnlen((wchar_t *) data, words_left);
+
+	if(len >= words_left)
+		return rc;
+
+	if(ses->serverNOS)
+		kfree(ses->serverNOS);
+	ses->serverNOS = kzalloc(4 * len, GFP_KERNEL); /* BB this is wrong length FIXME BB */
+	if(ses->serverNOS != NULL) {
+		cifs_strfromUCS_le(ses->serverNOS, (__le16 *)data, len,
+				   nls_cp);
+		if(strncmp(ses->serverNOS, "NT LAN Manager 4",16) == 0) {
+			cFYI(1,("NT4 server"));
+			ses->flags |= CIFS_SES_NT4;
+		}
+	}
+	data += 2 * (len + 1);
+	words_left -= len + 1;
+
+        /* save off server domain */
+        len = UniStrnlen((wchar_t *) data, words_left);
+
+        if(len > words_left)
+                return rc;
+
+        if(ses->serverDomain)
+                kfree(ses->serverDomain);
+        ses->serverDomain = kzalloc(2 * (len + 1), GFP_KERNEL); /* BB FIXME wrong length */
+        if(ses->serverDomain != NULL) {
+                cifs_strfromUCS_le(ses->serverDomain, (__le16 *)data, len,
+                                   nls_cp);
+                ses->serverDomain[2*len] = 0;
+                ses->serverDomain[(2*len) + 1] = 0;
+        }
+        data += 2 * (len + 1);
+        words_left -= len + 1;
+	
+	cFYI(1,("words left: %d",words_left));
+
+	return rc;
+}
+
+int decode_ascii_ssetup(char ** pbcc_area, int bleft, struct cifsSesInfo *ses,
+                            const struct nls_table * nls_cp)
+{
+	int rc = 0;
+	int len;
+	char * bcc_ptr = *pbcc_area;
+
+	cFYI(1,("decode sessetup ascii. bleft %d", bleft));
+	
+	len = strnlen(bcc_ptr, bleft);
+	if(len >= bleft)
+		return rc;
+	
+	if(ses->serverOS)
+		kfree(ses->serverOS);
+
+	ses->serverOS = kzalloc(len + 1, GFP_KERNEL);
+	if(ses->serverOS)
+		strncpy(ses->serverOS, bcc_ptr, len);
+
+	bcc_ptr += len + 1;
+	bleft -= len + 1;
+
+	len = strnlen(bcc_ptr, bleft);
+	if(len >= bleft)
+		return rc;
+
+	if(ses->serverNOS)
+		kfree(ses->serverNOS);
+
+	ses->serverNOS = kzalloc(len + 1, GFP_KERNEL);
+	if(ses->serverNOS)
+		strncpy(ses->serverNOS, bcc_ptr, len);
+
+	bcc_ptr += len + 1;
+	bleft -= len + 1;
+
+        len = strnlen(bcc_ptr, bleft);
+        if(len > bleft)
+                return rc;
+
+        if(ses->serverDomain)
+                kfree(ses->serverDomain);
+
+        ses->serverDomain = kzalloc(len + 1, GFP_KERNEL);
+        if(ses->serverOS)
+                strncpy(ses->serverOS, bcc_ptr, len);
+
+        bcc_ptr += len + 1;
+	bleft -= len + 1;
+
+	cFYI(1,("ascii: bytes left %d",bleft));
+
+	return rc;
+}
+
+int 
+CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
+		const struct nls_table *nls_cp)
+{
+	int rc = 0;
+	int wct;
+	int i;
+	struct smb_hdr *smb_buf;
+	char *bcc_ptr;
+	SESSION_SETUP_ANDX *pSMB;
+	__u32 capabilities;
+	int count;
+	int resp_buf_type = 0;
+	struct kvec iov[1];
+	enum securityEnum type;
+	__u16 action;
+	int bytes_remaining;
+	
+	if(ses == NULL)
+		return -EINVAL;
+
+	type = ses->server->secType;
+	if(type == LANMAN) {
+#ifndef CONFIG_CIFS_WEAK_PW_HASH
+		/* LANMAN and plaintext are less secure and off by default.
+		So we make this explicitly be turned on in kconfig (in the
+		build) and turned on at runtime (changed from the default)
+		in proc/fs/cifs or via mount parm.  Unfortunately this is
+		needed for old Win (e.g. Win95), some obscure NAS and OS/2 */
+		return -EOPNOTSUPP;
+#endif
+		wct = 10; /* lanman 2 style sessionsetup */
+	} else if(type == NTLM) /* NTLMv2 may retry NTLM */
+		wct = 13; /* old style NTLM sessionsetup */
+	else /* same size for negotiate or auth, NTLMSSP or extended security */
+		wct = 12;
+
+	rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses,
+			    (void **)&smb_buf);
+	if(rc)
+		return rc;
+
+	pSMB = (SESSION_SETUP_ANDX *)smb_buf;
+
+	capabilities = cifs_ssetup_hdr(ses, pSMB);
+	bcc_ptr = pByteArea(smb_buf);
+
+	if(type == LANMAN) {
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+		char lnm_session_key[CIFS_SESSION_KEY_SIZE];
+		char password_with_pad[CIFS_ENCPWD_SIZE];
+
+		/* no capabilities flags in old lanman negotiation */
+
+		pSMB->old_req.PasswordLength = CIFS_SESSION_KEY_SIZE; 
+		/* BB calculate hash with password */
+		/* and copy into bcc */
+
+		memset(password_with_pad, 0, CIFS_ENCPWD_SIZE);
+		strncpy(password_with_pad, ses->password, CIFS_ENCPWD_SIZE);
+
+		/* calculate old style session key */
+		/* toupper may be less broken then repeatedly calling
+		nls_toupper would be, but neither handles multibyte code pages
+		but the only alternative would be converting to UCS-16 (Unicode)
+		uppercasing and converting back which is only worth doing if
+		we knew it were utf8. utf8 code page needs its own
+		toupper and tolower and strnicmp functions */
+		
+		for(i = 0; i< CIFS_ENCPWD_SIZE; i++) {
+			password_with_pad[i] = toupper(password_with_pad[i]);
+		}
+
+		SMBencrypt(password_with_pad, ses->server->cryptKey,
+			   lnm_session_key);
+
+#ifdef CONFIG_CIFS_DEBUG2
+		cifs_dump_mem("cryptkey: ",ses->server->cryptKey,
+			CIFS_SESSION_KEY_SIZE);
+#endif
+		/* clear password before we return/free memory */
+		memset(password_with_pad, 0, CIFS_ENCPWD_SIZE);
+		memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_SESSION_KEY_SIZE);
+		bcc_ptr += CIFS_SESSION_KEY_SIZE;
+
+		/* can not sign if LANMAN negotiated so no need
+		to calculate signing key? but what if server
+		changed to do higher than lanman dialect and
+		we reconnected would we ever calc signing_key? */
+
+		cERROR(1,("Negotiating LANMAN setting up strings"));
+		/* Unicode not allowed for LANMAN dialects */
+		ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
+#endif    
+	} else if (type == NTLM) {
+		char ntlm_session_key[CIFS_SESSION_KEY_SIZE];
+
+		pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
+		pSMB->req_no_secext.CaseInsensitivePasswordLength =
+			cpu_to_le16(CIFS_SESSION_KEY_SIZE);
+		pSMB->req_no_secext.CaseSensitivePasswordLength =
+			cpu_to_le16(CIFS_SESSION_KEY_SIZE);
+	
+		/* calculate session key */
+		SMBNTencrypt(ses->password, ses->server->cryptKey,
+			     ntlm_session_key);
+
+		if(first_time) /* should this be moved into common code 
+				  with similar ntlmv2 path? */
+			cifs_calculate_mac_key(
+				ses->server->mac_signing_key,
+				ntlm_session_key, ses->password);
+		/* copy session key */
+
+		memcpy(bcc_ptr, (char *)ntlm_session_key,CIFS_SESSION_KEY_SIZE);
+		bcc_ptr += CIFS_SESSION_KEY_SIZE;
+		memcpy(bcc_ptr, (char *)ntlm_session_key,CIFS_SESSION_KEY_SIZE);
+		bcc_ptr += CIFS_SESSION_KEY_SIZE;
+		if(ses->capabilities & CAP_UNICODE)
+			unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
+		else
+			ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
+	} else /* NTLMSSP or SPNEGO */ {
+		pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
+		capabilities |= CAP_EXTENDED_SECURITY;
+		pSMB->req.Capabilities = cpu_to_le32(capabilities);
+		/* BB set password lengths */
+	}
+
+	count = (long) bcc_ptr - (long) pByteArea(smb_buf);
+	smb_buf->smb_buf_length += count;
+
+	/* if we switch to small buffers, count will need to be fewer
+	   than 383 (strings less than 335 bytes) */
+
+	BCC_LE(smb_buf) = cpu_to_le16(count);
+
+
+	/* BB FIXME check for other non ntlm code paths */
+
+	/* BB check is this too big for a small smb? */
+
+	iov[0].iov_base = (char *)pSMB;
+	iov[0].iov_len = smb_buf->smb_buf_length + 4;
+
+	rc = SendReceive2(xid, ses, iov, 1 /* num_iovecs */, &resp_buf_type, 0);
+	/* SMB request buf freed in SendReceive2 */
+
+	cFYI(1,("ssetup rc from sendrecv2 is %d",rc));
+	if(rc)
+		goto ssetup_exit;
+
+	pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
+	smb_buf = (struct smb_hdr *)iov[0].iov_base;
+
+	if((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) {
+		rc = -EIO;
+		cERROR(1,("bad word count %d", smb_buf->WordCount));
+		goto ssetup_exit;
+	}
+	action = le16_to_cpu(pSMB->resp.Action);
+	if (action & GUEST_LOGIN)
+		cFYI(1, (" Guest login")); /* BB mark SesInfo struct? */
+	ses->Suid = smb_buf->Uid;   /* UID left in wire format (le) */
+	cFYI(1, ("UID = %d ", ses->Suid));
+	/* response can have either 3 or 4 word count - Samba sends 3 */
+	/* and lanman response is 3 */
+	bytes_remaining = BCC(smb_buf);
+	bcc_ptr = pByteArea(smb_buf);
+
+	if(smb_buf->WordCount == 4) {
+		__u16 blob_len;
+		blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
+		bcc_ptr += blob_len;
+		if(blob_len > bytes_remaining) {
+			cERROR(1,("bad security blob length %d", blob_len));
+			rc = -EINVAL;
+			goto ssetup_exit;
+		}
+		bytes_remaining -= blob_len;
+	}	
+
+	/* BB check if Unicode and decode strings */
+	if(smb_buf->Flags2 & SMBFLG2_UNICODE)
+		rc = decode_unicode_ssetup(&bcc_ptr, bytes_remaining,
+						   ses, nls_cp);
+	else
+		rc = decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,nls_cp);
+	
+ssetup_exit:
+	if(resp_buf_type == CIFS_SMALL_BUFFER) {
+		cFYI(1,("ssetup freeing small buf %p", iov[0].iov_base));
+		cifs_small_buf_release(iov[0].iov_base);
+	} else if(resp_buf_type == CIFS_LARGE_BUFFER)
+		cifs_buf_release(iov[0].iov_base);
+
+	return rc;
+}
+#endif /* CONFIG_CIFS_EXPERIMENTAL */
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 6103bcdfb16..f518c5e4503 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -30,6 +30,7 @@
 #include <linux/random.h>
 #include "cifs_unicode.h"
 #include "cifspdu.h"
+#include "cifsglob.h"
 #include "md5.h"
 #include "cifs_debug.h"
 #include "cifsencrypt.h"
-- 
cgit v1.2.3


From 9c53588ec96d85f82e9bf3fb1af7cca31056e940 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 1 Jun 2006 05:09:10 +0000
Subject: [CIFS] Missing include shows up on some architectures

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.h  |   2 +-
 fs/cifs/ntlmssp.c | 143 ------------------------------------------------------
 fs/cifs/sess.c    |   1 +
 3 files changed, 2 insertions(+), 144 deletions(-)
 delete mode 100644 fs/cifs/ntlmssp.c

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index c98755dca86..2fc6d6527f1 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -99,5 +99,5 @@ extern ssize_t	cifs_getxattr(struct dentry *, const char *, void *, size_t);
 extern ssize_t	cifs_listxattr(struct dentry *, char *, size_t);
 extern int cifs_ioctl (struct inode * inode, struct file * filep,
 		       unsigned int command, unsigned long arg);
-#define CIFS_VERSION   "1.43"
+#define CIFS_VERSION   "1.44"
 #endif				/* _CIFSFS_H */
diff --git a/fs/cifs/ntlmssp.c b/fs/cifs/ntlmssp.c
deleted file mode 100644
index 115359cc7a3..00000000000
--- a/fs/cifs/ntlmssp.c
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- *   fs/cifs/ntlmssp.h
- *
- *   Copyright (c) International Business Machines  Corp., 2006
- *   Author(s): Steve French (sfrench@us.ibm.com)
- *
- *   This library is free software; you can redistribute it and/or modify
- *   it under the terms of the GNU Lesser General Public License as published
- *   by the Free Software Foundation; either version 2.1 of the License, or
- *   (at your option) any later version.
- *
- *   This library is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU Lesser General Public License for more details.
- *
- *   You should have received a copy of the GNU Lesser General Public License
- *   along with this library; if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include "cifspdu.h"
-#include "cifsglob.h"
-#include "cifsproto.h"
-#include "cifs_unicode.h"
-#include "cifs_debug.h"
-#include "ntlmssp.h"
-#include "nterr.h"
-
-#ifdef CONFIG_CIFS_EXPERIMENTAL
-static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB)
-{
-	__u32 capabilities = 0;
-
-	/* init fields common to all four types of SessSetup */
-	/* note that header is initialized to zero in header_assemble */
-	pSMB->req.AndXCommand = 0xFF;
-	pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf);
-	pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
-
-	/* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */
-
-	/* BB verify whether signing required on neg or just on auth frame 
-	   (and NTLM case) */
-
-	capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS |
-			CAP_LARGE_WRITE_X | CAP_LARGE_READ_X;
-
-	if(ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
-		pSMB->req.hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
-
-	if (ses->capabilities & CAP_UNICODE) {
-		pSMB->req.hdr.Flags2 |= SMBFLG2_UNICODE;
-		capabilities |= CAP_UNICODE;
-	}
-	if (ses->capabilities & CAP_STATUS32) {
-		pSMB->req.hdr.Flags2 |= SMBFLG2_ERR_STATUS;
-		capabilities |= CAP_STATUS32;
-	}
-	if (ses->capabilities & CAP_DFS) {
-		pSMB->req.hdr.Flags2 |= SMBFLG2_DFS;
-		capabilities |= CAP_DFS;
-	}
-
-	/* BB check whether to init vcnum BB */
-	return capabilities;
-}
-int 
-CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, const int type,
-		  int * pNTLMv2_flg, const struct nls_table *nls_cp)
-{
-	int rc = 0;
-	int wct;
-	struct smb_hdr *smb_buffer;
-	char *bcc_ptr;
-	SESSION_SETUP_ANDX *pSMB;
-	__u32 capabilities;
-
-	if(ses == NULL)
-		return -EINVAL;
-
-	cFYI(1,("SStp type: %d",type));
-	if(type < CIFS_NTLM) {
-#ifndef CONFIG_CIFS_WEAK_PW_HASH
-		/* LANMAN and plaintext are less secure and off by default.
-		So we make this explicitly be turned on in kconfig (in the
-		build) and turned on at runtime (changed from the default)
-		in proc/fs/cifs or via mount parm.  Unfortunately this is
-		needed for old Win (e.g. Win95), some obscure NAS and OS/2 */
-		return -EOPNOTSUPP;
-#endif
-		wct = 10; /* lanman 2 style sessionsetup */
-	} else if(type < CIFS_NTLMSSP_NEG)
-		wct = 13; /* old style NTLM sessionsetup */
-	else /* same size for negotiate or auth, NTLMSSP or extended security */
-		wct = 12;
-
-	rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses,
-			    (void **)&smb_buffer);
-	if(rc)
-		return rc;
-
-	pSMB = (SESSION_SETUP_ANDX *)smb_buffer;
-
-	capabilities = cifs_ssetup_hdr(ses, pSMB);
-	bcc_ptr = pByteArea(smb_buffer);
-	if(type > CIFS_NTLM) {
-		pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
-		capabilities |= CAP_EXTENDED_SECURITY;
-		pSMB->req.Capabilities = cpu_to_le32(capabilities);
-		/* BB set password lengths */
-	} else if(type < CIFS_NTLM) /* lanman */ {
-		/* no capabilities flags in old lanman negotiation */
-		/* pSMB->old_req.PasswordLength = */ /* BB fixme BB */
-	} else /* type CIFS_NTLM */ {
-		pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
-		pSMB->req_no_secext.CaseInsensitivePasswordLength =
-			cpu_to_le16(CIFS_SESSION_KEY_SIZE);
-		pSMB->req_no_secext.CaseSensitivePasswordLength =
-			cpu_to_le16(CIFS_SESSION_KEY_SIZE);
-	}
-
-
-	/* copy session key */
-
-	/* if Unicode, align strings to two byte boundary */
-
-	/* copy user name */ /* BB Do we need to special case null user name? */
-
-	/* copy domain name */
-
-	/* copy Linux version */
-
-	/* copy network operating system name */
-
-	/* update bcc and smb buffer length */
-
-/*	rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buf_type, 0); */
-	/* SMB request buf freed in SendReceive2 */
-
-	return rc;
-}
-#endif /* CONFIG_CIFS_EXPERIMENTAL */
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 34998416eb7..fdc248fffa0 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -29,6 +29,7 @@
 #include "ntlmssp.h"
 #include "nterr.h"
 #include <linux/ctype.h>
+#include <linux/utsname.h>
 
 extern void SMBencrypt(unsigned char *passwd, unsigned char *c8,
                        unsigned char *p24);
-- 
cgit v1.2.3


From 7c7b25bc8e392aea781324efa771bc191377b876 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 1 Jun 2006 19:20:10 +0000
Subject: [CIFS] Support for setting up SMB sessions to legacy lanman servers
 part 2

---
 fs/cifs/cifs_debug.c  |  4 +--
 fs/cifs/cifsencrypt.c | 40 +++++++++++++++++++--
 fs/cifs/cifsglob.h    |  2 +-
 fs/cifs/cifspdu.h     |  3 +-
 fs/cifs/cifsproto.h   |  3 ++
 fs/cifs/cifssmb.c     | 27 +++++++++++---
 fs/cifs/connect.c     | 47 +++++++++++++++----------
 fs/cifs/netmisc.c     |  2 +-
 fs/cifs/sess.c        | 97 ++++++++++++++++++++++++++++-----------------------
 9 files changed, 150 insertions(+), 75 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 7f4013a8607..4e10e21c54f 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -508,7 +508,7 @@ cifs_proc_init(void)
 		pde->write_proc = multiuser_mount_write;
 
 	pde =
-	    create_proc_read_entry("ExtendedSecurity", 0, proc_fs_cifs,
+	    create_proc_read_entry("SecurityFlags", 0, proc_fs_cifs,
 				extended_security_read, NULL);
 	if (pde)
 		pde->write_proc = extended_security_write;
@@ -547,7 +547,7 @@ cifs_proc_clean(void)
 	remove_proc_entry("MultiuserMount", proc_fs_cifs);
 	remove_proc_entry("OplockEnabled", proc_fs_cifs);
 /*	remove_proc_entry("NTLMV2Enabled",proc_fs_cifs); */
-	remove_proc_entry("ExtendedSecurity",proc_fs_cifs);
+	remove_proc_entry("SecurityFlags",proc_fs_cifs);
 /*	remove_proc_entry("PacketSigningEnabled",proc_fs_cifs); */
 	remove_proc_entry("LinuxExtensionsEnabled",proc_fs_cifs);
 	remove_proc_entry("Experimental",proc_fs_cifs);
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 08781a4698b..e11d8c6bb22 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -26,6 +26,7 @@
 #include "md5.h"
 #include "cifs_unicode.h"
 #include "cifsproto.h"
+#include <linux/ctype.h>
 
 /* Calculate and return the CIFS signature based on the mac key and the smb pdu */
 /* the 16 byte signature must be allocated by the caller  */
@@ -35,6 +36,8 @@
 
 extern void mdfour(unsigned char *out, unsigned char *in, int n);
 extern void E_md4hash(const unsigned char *passwd, unsigned char *p16);
+extern void SMBencrypt(unsigned char *passwd, unsigned char *c8,
+                       unsigned char *p24);
 	
 static int cifs_calculate_signature(const struct smb_hdr * cifs_pdu, 
 				    const char * key, char * signature)
@@ -45,7 +48,7 @@ static int cifs_calculate_signature(const struct smb_hdr * cifs_pdu,
 		return -EINVAL;
 
 	MD5Init(&context);
-	MD5Update(&context,key,CIFS_SESSION_KEY_SIZE+16);
+	MD5Update(&context,key,CIFS_SESS_KEY_SIZE+16);
 	MD5Update(&context,cifs_pdu->Protocol,cifs_pdu->smb_buf_length);
 	MD5Final(signature,&context);
 	return 0;
@@ -90,7 +93,7 @@ static int cifs_calc_signature2(const struct kvec * iov, int n_vec,
 		return -EINVAL;
 
 	MD5Init(&context);
-	MD5Update(&context,key,CIFS_SESSION_KEY_SIZE+16);
+	MD5Update(&context,key,CIFS_SESS_KEY_SIZE+16);
 	for(i=0;i<n_vec;i++) {
 		if(iov[i].iov_base == NULL) {
 			cERROR(1,("null iovec entry"));
@@ -204,7 +207,7 @@ int cifs_calculate_mac_key(char * key, const char * rn, const char * password)
 
 	E_md4hash(password, temp_key);
 	mdfour(key,temp_key,16);
-	memcpy(key+16,rn, CIFS_SESSION_KEY_SIZE);
+	memcpy(key+16,rn, CIFS_SESS_KEY_SIZE);
 	return 0;
 }
 
@@ -261,6 +264,37 @@ int CalcNTLMv2_partial_mac_key(struct cifsSesInfo * ses, struct nls_table * nls_
 	kfree(unicode_buf);
 	return 0;
 }
+
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+void calc_lanman_hash(struct cifsSesInfo * ses, char * lnm_session_key)
+{
+	int i;
+	char password_with_pad[CIFS_ENCPWD_SIZE];
+
+	memset(password_with_pad, 0, CIFS_ENCPWD_SIZE);
+	strncpy(password_with_pad, ses->password, CIFS_ENCPWD_SIZE);
+
+	/* calculate old style session key */
+	/* calling toupper is less broken than repeatedly
+	calling nls_toupper would be since that will never
+	work for UTF8, but neither handles multibyte code pages
+	but the only alternative would be converting to UCS-16 (Unicode)
+	(using a routine something like UniStrupr) then
+	uppercasing and then converting back from Unicode - which
+	would only worth doing it if we knew it were utf8. Basically
+	utf8 and other multibyte codepages each need their own strupper
+	function since a byte at a time will ont work. */
+
+	for(i = 0; i < CIFS_ENCPWD_SIZE; i++) {
+		password_with_pad[i] = toupper(password_with_pad[i]);
+	}
+
+	SMBencrypt(password_with_pad, ses->server->cryptKey, lnm_session_key);
+	/* clear password before we return/free memory */
+	memset(password_with_pad, 0, CIFS_ENCPWD_SIZE);
+}
+#endif /* CIFS_WEAK_PW_HASH */
+
 void CalcNTLMv2_response(const struct cifsSesInfo * ses,char * v2_session_response)
 {
 	struct HMACMD5Context context;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 7a042041a16..975e69a2e1c 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -158,7 +158,7 @@ struct TCP_Server_Info {
 	/* 16th byte of RFC1001 workstation name is always null */
 	char workstation_RFC1001_name[SERVER_NAME_LEN_WITH_NULL];
 	__u32 sequence_number; /* needed for CIFS PDU signature */
-	char mac_signing_key[CIFS_SESSION_KEY_SIZE + 16]; 
+	char mac_signing_key[CIFS_SESS_KEY_SIZE + 16]; 
 };
 
 /*
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index e0ff9b56cc4..135941738d9 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -116,7 +116,8 @@
 /*
  * Size of the session key (crypto key encrypted with the password
  */
-#define CIFS_SESSION_KEY_SIZE (24)
+#define CIFS_SESS_KEY_SIZE (24)
+#define V2_SESS_KEY_SIZE (86)
 
 /*
  * Maximum user name length
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index ff78cf7b0d1..59b037f1451 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -287,6 +287,9 @@ extern int cifs_verify_signature(struct smb_hdr *, const char * mac_key,
 extern int cifs_calculate_mac_key(char * key,const char * rn,const char * pass);
 extern int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *, struct nls_table *);
 extern void CalcNTLMv2_response(const struct cifsSesInfo *,char * );
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+extern void calc_lanman_hash(struct cifsSesInfo * ses, char * lnm_session_key);
+#endif /* CIFS_WEAK_PW_HASH */
 extern int CIFSSMBCopy(int xid,
 			struct cifsTconInfo *source_tcon,
 			const char *fromName,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index da3154fa9c8..6b5be6d59f0 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -438,12 +438,19 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			goto neg_err_exit;
 		} else if((pSMBr->hdr.WordCount == 13) && 
 			(pSMBr->DialectIndex == LANMAN_PROT)) {
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
 			struct lanman_neg_rsp * rsp = 
 				(struct lanman_neg_rsp *)pSMBr;
 
-
-			/* BB Mark ses struct as negotiated lanman level BB */
-			server->secType = LANMAN;
+			if((extended_security & CIFSSEC_MAY_LANMAN) || 
+				(extended_security & CIFSSEC_MAY_PLNTXT))
+				server->secType = LANMAN;
+			else {
+				cERROR(1, ("mount failed weak security disabled"
+					" in /proc/fs/cifs/SecurityFlags"));
+				rc = -EOPNOTSUPP;
+				goto neg_err_exit;
+			}	
 			server->secMode = (__u8)le16_to_cpu(rsp->SecurityMode);
 			server->maxReq = le16_to_cpu(rsp->MaxMpxCount);
 			server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize),
@@ -469,6 +476,11 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			}
 
 			cFYI(1,("LANMAN negotiated")); /* BB removeme BB */
+#else /* weak security disabled */
+			cERROR(1,("mount failed, cifs module not built with "
+				"CIFS_WEAK_PW_HASH support"));
+			rc = -EOPNOTSUPP;
+#endif /* WEAK_PW_HASH */
 			goto neg_err_exit;
 		} else if(pSMBr->hdr.WordCount != 17) {
 			/* unknown wct */
@@ -479,8 +491,13 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 		server->secMode = pSMBr->SecurityMode;
 		if((server->secMode & SECMODE_USER) == 0)
 			cFYI(1,("share mode security"));
-		server->secType = NTLM; /* BB override default for
-					   NTLMv2 or kerberos v5 */
+		
+		if(extended_security & CIFSSEC_MUST_NTLMV2)
+			server->secType = NTLMv2;
+		else
+			server->secType = NTLM;
+		/* else krb5 ... */
+
 		/* one byte - no need to convert this or EncryptionKeyLen
 		   from little endian */
 		server->maxReq = le16_to_cpu(pSMBr->MaxMpxCount);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 7ffb8f244f6..e6f3d2fff6c 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1990,7 +1990,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 
 static int
 CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
-	      char session_key[CIFS_SESSION_KEY_SIZE],
+	      char session_key[CIFS_SESS_KEY_SIZE],
 	      const struct nls_table *nls_codepage)
 {
 	struct smb_hdr *smb_buffer;
@@ -2048,15 +2048,15 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 	pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
 
 	pSMB->req_no_secext.CaseInsensitivePasswordLength = 
-		cpu_to_le16(CIFS_SESSION_KEY_SIZE);
+		cpu_to_le16(CIFS_SESS_KEY_SIZE);
 
 	pSMB->req_no_secext.CaseSensitivePasswordLength =
-	    cpu_to_le16(CIFS_SESSION_KEY_SIZE);
+	    cpu_to_le16(CIFS_SESS_KEY_SIZE);
 	bcc_ptr = pByteArea(smb_buffer);
-	memcpy(bcc_ptr, (char *) session_key, CIFS_SESSION_KEY_SIZE);
-	bcc_ptr += CIFS_SESSION_KEY_SIZE;
-	memcpy(bcc_ptr, (char *) session_key, CIFS_SESSION_KEY_SIZE);
-	bcc_ptr += CIFS_SESSION_KEY_SIZE;
+	memcpy(bcc_ptr, (char *) session_key, CIFS_SESS_KEY_SIZE);
+	bcc_ptr += CIFS_SESS_KEY_SIZE;
+	memcpy(bcc_ptr, (char *) session_key, CIFS_SESS_KEY_SIZE);
+	bcc_ptr += CIFS_SESS_KEY_SIZE;
 
 	if (ses->capabilities & CAP_UNICODE) {
 		if ((long) bcc_ptr % 2) { /* must be word aligned for Unicode */
@@ -3004,14 +3004,14 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 	SecurityBlob->LmChallengeResponse.Buffer = 0;
 
 	SecurityBlob->NtChallengeResponse.Length =
-	    cpu_to_le16(CIFS_SESSION_KEY_SIZE);
+	    cpu_to_le16(CIFS_SESS_KEY_SIZE);
 	SecurityBlob->NtChallengeResponse.MaximumLength =
-	    cpu_to_le16(CIFS_SESSION_KEY_SIZE);
-	memcpy(bcc_ptr, ntlm_session_key, CIFS_SESSION_KEY_SIZE);
+	    cpu_to_le16(CIFS_SESS_KEY_SIZE);
+	memcpy(bcc_ptr, ntlm_session_key, CIFS_SESS_KEY_SIZE);
 	SecurityBlob->NtChallengeResponse.Buffer =
 	    cpu_to_le32(SecurityBlobLength);
-	SecurityBlobLength += CIFS_SESSION_KEY_SIZE;
-	bcc_ptr += CIFS_SESSION_KEY_SIZE;
+	SecurityBlobLength += CIFS_SESS_KEY_SIZE;
+	bcc_ptr += CIFS_SESS_KEY_SIZE;
 
 	if (ses->capabilities & CAP_UNICODE) {
 		if (domain == NULL) {
@@ -3350,22 +3350,33 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 	bcc_ptr = &pSMB->Password[0];
 	if((ses->server->secMode) & SECMODE_USER) {
 		pSMB->PasswordLength = cpu_to_le16(1);	/* minimum */
+		*bcc_ptr = 0; /* password is null byte */
 		bcc_ptr++;              /* skip password */
+		/* already aligned so no need to do it below */
 	} else {
-		pSMB->PasswordLength = cpu_to_le16(CIFS_SESSION_KEY_SIZE);
+		pSMB->PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE);
 		/* BB FIXME add code to fail this if NTLMv2 or Kerberos
 		   specified as required (when that support is added to
 		   the vfs in the future) as only NTLM or the much
-		   weaker LANMAN (which we do not send) is accepted
+		   weaker LANMAN (which we do not send by default) is accepted
 		   by Samba (not sure whether other servers allow
 		   NTLMv2 password here) */
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+		if((extended_security & CIFSSEC_MAY_LANMAN) && 
+			(ses->server->secType == LANMAN))
+			calc_lanman_hash(ses, bcc_ptr);
+		else
+#endif /* CIFS_WEAK_PW_HASH */
 		SMBNTencrypt(ses->password,
 			     ses->server->cryptKey,
 			     bcc_ptr);
 
-		bcc_ptr += CIFS_SESSION_KEY_SIZE;
-		*bcc_ptr = 0;
-		bcc_ptr++; /* align */
+		bcc_ptr += CIFS_SESS_KEY_SIZE;
+		if(ses->capabilities & CAP_UNICODE) {
+			/* must align unicode strings */
+			*bcc_ptr = 0; /* null byte password */
+			bcc_ptr++;
+		}
 	}
 
 	if(ses->server->secMode & 
@@ -3507,7 +3518,7 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
 					   struct nls_table * nls_info)
 {
 	int rc = 0;
-	char ntlm_session_key[CIFS_SESSION_KEY_SIZE];
+	char ntlm_session_key[CIFS_SESS_KEY_SIZE];
 	int ntlmv2_flag = FALSE;
 	int first_time = 0;
 
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 5de74d216fd..a0bcdd6f4a6 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -84,7 +84,7 @@ static const struct smb_to_posix_error mapping_table_ERRDOS[] = {
 
 static const struct smb_to_posix_error mapping_table_ERRSRV[] = {
 	{ERRerror, -EIO},
-	{ERRbadpw, -EPERM},
+	{ERRbadpw, -EACCES},  /* was EPERM */
 	{ERRbadtype, -EREMOTE},
 	{ERRaccess, -EACCES},
 	{ERRinvtid, -ENXIO},
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index fdc248fffa0..a52aacb3fef 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -28,12 +28,8 @@
 #include "cifs_debug.h"
 #include "ntlmssp.h"
 #include "nterr.h"
-#include <linux/ctype.h>
 #include <linux/utsname.h>
 
-extern void SMBencrypt(unsigned char *passwd, unsigned char *c8,
-                       unsigned char *p24);
-
 extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
                          unsigned char *p24);
 
@@ -80,7 +76,7 @@ static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB)
 	return capabilities;
 }
 
-void unicode_ssetup_strings(char ** pbcc_area, struct cifsSesInfo *ses,
+static void unicode_ssetup_strings(char ** pbcc_area, struct cifsSesInfo *ses,
 			    const struct nls_table * nls_cp)
 {
 	char * bcc_ptr = *pbcc_area;
@@ -130,7 +126,7 @@ void unicode_ssetup_strings(char ** pbcc_area, struct cifsSesInfo *ses,
 	*pbcc_area = bcc_ptr;
 }
 
-void ascii_ssetup_strings(char ** pbcc_area, struct cifsSesInfo *ses,
+static void ascii_ssetup_strings(char ** pbcc_area, struct cifsSesInfo *ses,
 			  const struct nls_table * nls_cp)
 {
 	char * bcc_ptr = *pbcc_area;
@@ -173,7 +169,7 @@ void ascii_ssetup_strings(char ** pbcc_area, struct cifsSesInfo *ses,
         *pbcc_area = bcc_ptr;
 }
 
-int decode_unicode_ssetup(char ** pbcc_area, int bleft, struct cifsSesInfo *ses,
+static int decode_unicode_ssetup(char ** pbcc_area, int bleft, struct cifsSesInfo *ses,
                             const struct nls_table * nls_cp)
 {
 	int rc = 0;
@@ -255,7 +251,7 @@ int decode_unicode_ssetup(char ** pbcc_area, int bleft, struct cifsSesInfo *ses,
 	return rc;
 }
 
-int decode_ascii_ssetup(char ** pbcc_area, int bleft, struct cifsSesInfo *ses,
+static int decode_ascii_ssetup(char ** pbcc_area, int bleft, struct cifsSesInfo *ses,
                             const struct nls_table * nls_cp)
 {
 	int rc = 0;
@@ -317,7 +313,6 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 {
 	int rc = 0;
 	int wct;
-	int i;
 	struct smb_hdr *smb_buf;
 	char *bcc_ptr;
 	SESSION_SETUP_ANDX *pSMB;
@@ -343,7 +338,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 		return -EOPNOTSUPP;
 #endif
 		wct = 10; /* lanman 2 style sessionsetup */
-	} else if(type == NTLM) /* NTLMv2 may retry NTLM */
+	} else if((type == NTLM) || (type == NTLMv2)) /* NTLMv2 may retry NTLM */
 		wct = 13; /* old style NTLM sessionsetup */
 	else /* same size for negotiate or auth, NTLMSSP or extended security */
 		wct = 12;
@@ -360,41 +355,22 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 
 	if(type == LANMAN) {
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
-		char lnm_session_key[CIFS_SESSION_KEY_SIZE];
-		char password_with_pad[CIFS_ENCPWD_SIZE];
+		char lnm_session_key[CIFS_SESS_KEY_SIZE];
 
 		/* no capabilities flags in old lanman negotiation */
 
-		pSMB->old_req.PasswordLength = CIFS_SESSION_KEY_SIZE; 
+		pSMB->old_req.PasswordLength = CIFS_SESS_KEY_SIZE; 
 		/* BB calculate hash with password */
 		/* and copy into bcc */
 
-		memset(password_with_pad, 0, CIFS_ENCPWD_SIZE);
-		strncpy(password_with_pad, ses->password, CIFS_ENCPWD_SIZE);
-
-		/* calculate old style session key */
-		/* toupper may be less broken then repeatedly calling
-		nls_toupper would be, but neither handles multibyte code pages
-		but the only alternative would be converting to UCS-16 (Unicode)
-		uppercasing and converting back which is only worth doing if
-		we knew it were utf8. utf8 code page needs its own
-		toupper and tolower and strnicmp functions */
-		
-		for(i = 0; i< CIFS_ENCPWD_SIZE; i++) {
-			password_with_pad[i] = toupper(password_with_pad[i]);
-		}
-
-		SMBencrypt(password_with_pad, ses->server->cryptKey,
-			   lnm_session_key);
+		calc_lanman_hash(ses, lnm_session_key);
 
 #ifdef CONFIG_CIFS_DEBUG2
 		cifs_dump_mem("cryptkey: ",ses->server->cryptKey,
-			CIFS_SESSION_KEY_SIZE);
+			CIFS_SESS_KEY_SIZE);
 #endif
-		/* clear password before we return/free memory */
-		memset(password_with_pad, 0, CIFS_ENCPWD_SIZE);
-		memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_SESSION_KEY_SIZE);
-		bcc_ptr += CIFS_SESSION_KEY_SIZE;
+		memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_SESS_KEY_SIZE);
+		bcc_ptr += CIFS_SESS_KEY_SIZE;
 
 		/* can not sign if LANMAN negotiated so no need
 		to calculate signing key? but what if server
@@ -406,13 +382,13 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 		ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
 #endif    
 	} else if (type == NTLM) {
-		char ntlm_session_key[CIFS_SESSION_KEY_SIZE];
+		char ntlm_session_key[CIFS_SESS_KEY_SIZE];
 
 		pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
 		pSMB->req_no_secext.CaseInsensitivePasswordLength =
-			cpu_to_le16(CIFS_SESSION_KEY_SIZE);
+			cpu_to_le16(CIFS_SESS_KEY_SIZE);
 		pSMB->req_no_secext.CaseSensitivePasswordLength =
-			cpu_to_le16(CIFS_SESSION_KEY_SIZE);
+			cpu_to_le16(CIFS_SESS_KEY_SIZE);
 	
 		/* calculate session key */
 		SMBNTencrypt(ses->password, ses->server->cryptKey,
@@ -420,15 +396,48 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 
 		if(first_time) /* should this be moved into common code 
 				  with similar ntlmv2 path? */
-			cifs_calculate_mac_key(
-				ses->server->mac_signing_key,
+			cifs_calculate_mac_key( ses->server->mac_signing_key,
 				ntlm_session_key, ses->password);
 		/* copy session key */
 
-		memcpy(bcc_ptr, (char *)ntlm_session_key,CIFS_SESSION_KEY_SIZE);
-		bcc_ptr += CIFS_SESSION_KEY_SIZE;
-		memcpy(bcc_ptr, (char *)ntlm_session_key,CIFS_SESSION_KEY_SIZE);
-		bcc_ptr += CIFS_SESSION_KEY_SIZE;
+		memcpy(bcc_ptr, (char *)ntlm_session_key,CIFS_SESS_KEY_SIZE);
+		bcc_ptr += CIFS_SESS_KEY_SIZE;
+		memcpy(bcc_ptr, (char *)ntlm_session_key,CIFS_SESS_KEY_SIZE);
+		bcc_ptr += CIFS_SESS_KEY_SIZE;
+		if(ses->capabilities & CAP_UNICODE)
+			unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
+		else
+			ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
+	} else if (type == NTLMv2) {
+		char * v2_sess_key = kmalloc(V2_SESS_KEY_SIZE, GFP_KERNEL);
+
+		if(v2_sess_key == NULL) {
+			cifs_small_buf_release(smb_buf);
+			return -ENOMEM;
+		}
+
+		pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
+
+		/* LM2 password would be here if we supported it */
+		pSMB->req_no_secext.CaseInsensitivePasswordLength = 0;
+		/*	cpu_to_le16(LM2_SESS_KEY_SIZE); */
+
+		pSMB->req_no_secext.CaseSensitivePasswordLength =
+			cpu_to_le16(V2_SESS_KEY_SIZE);
+
+		/* calculate session key */
+		CalcNTLMv2_response(ses, v2_sess_key);
+		if(first_time) /* should this be moved into common code
+			          with similar ntlmv2 path? */
+		/*   cifs_calculate_ntlmv2_mac_key(ses->server->mac_signing_key,
+				response BB FIXME, v2_sess_key); */
+
+		/* copy session key */
+
+	/*	memcpy(bcc_ptr, (char *)ntlm_session_key,LM2_SESS_KEY_SIZE);
+		bcc_ptr += LM2_SESS_KEY_SIZE; */
+		memcpy(bcc_ptr, (char *)v2_sess_key, V2_SESS_KEY_SIZE);
+		bcc_ptr += V2_SESS_KEY_SIZE;
 		if(ses->capabilities & CAP_UNICODE)
 			unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
 		else
-- 
cgit v1.2.3


From 3856a9d443ee24248683c415e535f7a2b0fed0f3 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 1 Jun 2006 19:38:46 +0000
Subject: [CIFS] Fix minor build breaks due to cifs kconfig issues

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/Kconfig        |  2 +-
 fs/cifs/cifssmb.c | 11 ++++++-----
 2 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index c4eaacb8607..d49b2a8c0be 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1731,7 +1731,7 @@ config CIFS_POSIX
 	  CIFS POSIX ACL support.  If unsure, say N.
 
 config CIFS_DEBUG2
-	bool "Enable additional CIFS debugging routines
+	bool "Enable additional CIFS debugging routines"
 	help
 	   Enabling this option adds a few more debugging routines
 	   to the cifs code which slightly increases the size of
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 6b5be6d59f0..eea8967e598 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -436,9 +436,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			a common dialect */
 			rc = -EOPNOTSUPP;
 			goto neg_err_exit;
-		} else if((pSMBr->hdr.WordCount == 13) && 
-			(pSMBr->DialectIndex == LANMAN_PROT)) {
-#ifdef CONFIG_CIFS_WEAK_PW_HASH
+#ifdef CONFIG_CIFS_WEAK_PW_HASH 
+                } else if((pSMBr->hdr.WordCount == 13)
+				&& (pSMBr->DialectIndex == LANMAN_PROT)) {
 			struct lanman_neg_rsp * rsp = 
 				(struct lanman_neg_rsp *)pSMBr;
 
@@ -477,8 +477,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 
 			cFYI(1,("LANMAN negotiated")); /* BB removeme BB */
 #else /* weak security disabled */
-			cERROR(1,("mount failed, cifs module not built with "
-				"CIFS_WEAK_PW_HASH support"));
+		} else if(pSMBr->hdr.WordCount == 13)
+			cERROR(1,("mount failed, cifs module not built "
+				"with CIFS_WEAK_PW_HASH support"));
 			rc = -EOPNOTSUPP;
 #endif /* WEAK_PW_HASH */
 			goto neg_err_exit;
-- 
cgit v1.2.3


From 273d81d6ada951ba99f10b755d6f849dbb352730 Mon Sep 17 00:00:00 2001
From: Dave Kleikamp <shaggy@austin.ibm.com>
Date: Thu, 1 Jun 2006 19:41:23 +0000
Subject: [CIFS] Do not overwrite aops

cifs should not be overwriting an element of the aops structure, since the
structure is shared by all cifs inodes.  Instead define a separate aops
structure to suit each purpose.

I also took the liberty of replacing a hard-coded 4096 with PAGE_CACHE_SIZE

Signed-off-by: Dave Kleikamp <shaggy@austin.ibm.com>
Signed-off-by: Steven French <sfrench@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---
 fs/cifs/cifsfs.h  |  1 +
 fs/cifs/file.c    | 16 ++++++++++++++++
 fs/cifs/inode.c   | 14 ++++++++------
 fs/cifs/readdir.c | 16 ++++++++++------
 4 files changed, 35 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 2fc6d6527f1..3011df988bd 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -33,6 +33,7 @@
 #endif
 
 extern struct address_space_operations cifs_addr_ops;
+extern struct address_space_operations cifs_addr_ops_smallbuf;
 
 /* Functions related to super block operations */
 extern struct super_operations cifs_super_ops;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 379369ecda9..d62e29fe91f 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1959,3 +1959,19 @@ struct address_space_operations cifs_addr_ops = {
 	/* .sync_page = cifs_sync_page, */
 	/* .direct_IO = */
 };
+
+/*
+ * cifs_readpages requires the server to support a buffer large enough to
+ * contain the header plus one complete page of data.  Otherwise, we need
+ * to leave cifs_readpages out of the address space operations.
+ */
+struct address_space_operations cifs_addr_ops_smallbuf = {
+	.readpage = cifs_readpage,
+	.writepage = cifs_writepage,
+	.writepages = cifs_writepages,
+	.prepare_write = cifs_prepare_write,
+	.commit_write = cifs_commit_write,
+	.set_page_dirty = __set_page_dirty_nobuffers,
+	/* .sync_page = cifs_sync_page, */
+	/* .direct_IO = */
+};
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index a609d266803..b88147c1dc2 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -180,11 +180,12 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 			else /* not direct, send byte range locks */ 
 				inode->i_fop = &cifs_file_ops;
 
-			inode->i_data.a_ops = &cifs_addr_ops;
 			/* check if server can support readpages */
 			if(pTcon->ses->server->maxBuf < 
-			    4096 + MAX_CIFS_HDR_SIZE)
-				inode->i_data.a_ops->readpages = NULL;
+			    PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
+				inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
+			else
+				inode->i_data.a_ops = &cifs_addr_ops;
 		} else if (S_ISDIR(inode->i_mode)) {
 			cFYI(1, ("Directory inode"));
 			inode->i_op = &cifs_dir_inode_ops;
@@ -519,10 +520,11 @@ int cifs_get_inode_info(struct inode **pinode,
 			else /* not direct, send byte range locks */
 				inode->i_fop = &cifs_file_ops;
 
-			inode->i_data.a_ops = &cifs_addr_ops;
 			if(pTcon->ses->server->maxBuf < 
-			     4096 + MAX_CIFS_HDR_SIZE)
-				inode->i_data.a_ops->readpages = NULL;
+			     PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
+				inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
+			else
+				inode->i_data.a_ops = &cifs_addr_ops;
 		} else if (S_ISDIR(inode->i_mode)) {
 			cFYI(1, ("Directory inode"));
 			inode->i_op = &cifs_dir_inode_ops;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 53903a27f78..e3e762d774d 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -21,6 +21,7 @@
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #include <linux/fs.h>
+#include <linux/pagemap.h>
 #include <linux/stat.h>
 #include <linux/smp_lock.h>
 #include "cifspdu.h"
@@ -216,11 +217,13 @@ static void fill_in_inode(struct inode *tmp_inode,
 		else
 			tmp_inode->i_fop = &cifs_file_ops;
 
-		tmp_inode->i_data.a_ops = &cifs_addr_ops;
 		if((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
 		   (cifs_sb->tcon->ses->server->maxBuf <
-			4096 + MAX_CIFS_HDR_SIZE))
-			tmp_inode->i_data.a_ops->readpages = NULL;
+			PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE))
+			tmp_inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
+		else
+			tmp_inode->i_data.a_ops = &cifs_addr_ops;
+
 		if(isNewInode)
 			return; /* No sense invalidating pages for new inode
 				   since have not started caching readahead file
@@ -339,11 +342,12 @@ static void unix_fill_in_inode(struct inode *tmp_inode,
 		else
 			tmp_inode->i_fop = &cifs_file_ops;
 
-		tmp_inode->i_data.a_ops = &cifs_addr_ops;
 		if((cifs_sb->tcon) && (cifs_sb->tcon->ses) &&
 		   (cifs_sb->tcon->ses->server->maxBuf < 
-			4096 + MAX_CIFS_HDR_SIZE))
-			tmp_inode->i_data.a_ops->readpages = NULL;
+			PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE))
+			tmp_inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
+		else
+			tmp_inode->i_data.a_ops = &cifs_addr_ops;
 
 		if(isNewInode)
 			return; /* No sense invalidating pages for new inode since we
-- 
cgit v1.2.3


From 7a0d22317614ee8e2598feafd4812e33be17f421 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 1 Jun 2006 19:44:37 +0000
Subject: [CIFS] Missing brace

---
 fs/cifs/cifssmb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index eea8967e598..b8c236be4d8 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -477,7 +477,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 
 			cFYI(1,("LANMAN negotiated")); /* BB removeme BB */
 #else /* weak security disabled */
-		} else if(pSMBr->hdr.WordCount == 13)
+		} else if(pSMBr->hdr.WordCount == 13) {
 			cERROR(1,("mount failed, cifs module not built "
 				"with CIFS_WEAK_PW_HASH support"));
 			rc = -EOPNOTSUPP;
-- 
cgit v1.2.3


From 43411d699e8cf3293674da558d31a02e0012aa9d Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Fri, 2 Jun 2006 18:17:11 +0000
Subject: [CIFS] Fix mapping of old SMB return code Invalid Net Name so it is
 recognized on mount

the old mapping of this was to ENODEV (instead of ENXIO) - but
ENODEV is what mount returns when the cifs driver will not load
so change this to map to ENXIO (which was what the equivalent
condition returned for mapping errors from more modern servers)

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/netmisc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index a0bcdd6f4a6..b66eff5dc62 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -88,7 +88,7 @@ static const struct smb_to_posix_error mapping_table_ERRSRV[] = {
 	{ERRbadtype, -EREMOTE},
 	{ERRaccess, -EACCES},
 	{ERRinvtid, -ENXIO},
-	{ERRinvnetname, -ENODEV},
+	{ERRinvnetname, -ENXIO},
 	{ERRinvdevice, -ENXIO},
 	{ERRqfull, -ENOSPC},
 	{ERRqtoobig, -ENOSPC},
-- 
cgit v1.2.3


From bdc4bf6e8ac8cc29c61c2f0dc61d9776ef9a8ed4 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Fri, 2 Jun 2006 22:57:13 +0000
Subject: [CIFS] Support for older servers which require plaintext passwords

disabled by default, but can be enabled via proc for servers which
require such support.  Also includes support for setting security
flags for cifs.  See fs/cifs/README

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_debug.c  | 56 ++++++++++++++++++++++++++++++++++++---------------
 fs/cifs/cifsencrypt.c |  9 +++++++++
 fs/cifs/cifssmb.c     |  7 +++++++
 3 files changed, 56 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 4e10e21c54f..7c0015a9695 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -445,8 +445,8 @@ static read_proc_t traceSMB_read;
 static write_proc_t traceSMB_write;
 static read_proc_t multiuser_mount_read;
 static write_proc_t multiuser_mount_write;
-static read_proc_t extended_security_read;
-static write_proc_t extended_security_write;
+static read_proc_t security_flags_read;
+static write_proc_t security_flags_write;
 /* static read_proc_t ntlmv2_enabled_read;
 static write_proc_t ntlmv2_enabled_write;
 static read_proc_t packet_signing_enabled_read;
@@ -509,9 +509,9 @@ cifs_proc_init(void)
 
 	pde =
 	    create_proc_read_entry("SecurityFlags", 0, proc_fs_cifs,
-				extended_security_read, NULL);
+				security_flags_read, NULL);
 	if (pde)
-		pde->write_proc = extended_security_write;
+		pde->write_proc = security_flags_write;
 
 	pde =
 	create_proc_read_entry("LookupCacheEnabled", 0, proc_fs_cifs,
@@ -832,7 +832,7 @@ multiuser_mount_write(struct file *file, const char __user *buffer,
 }
 
 static int
-extended_security_read(char *page, char **start, off_t off,
+security_flags_read(char *page, char **start, off_t off,
 		       int count, int *eof, void *data)
 {
 	int len;
@@ -853,26 +853,50 @@ extended_security_read(char *page, char **start, off_t off,
 	return len;
 }
 static int
-extended_security_write(struct file *file, const char __user *buffer,
+security_flags_write(struct file *file, const char __user *buffer,
 			unsigned long count, void *data)
 {
+	unsigned int flags;
+	char flags_string[12];
 	char c;
-	int rc;
+
 	cERROR(1,("size %ld",count)); /* BB removeme BB */
-	if((count < 2) || (count > 8))
+
+	if((count < 1) || (count > 11))
 		return -EINVAL;
 
-	rc = get_user(c, buffer);
+	memset(flags_string, 0, 12);
 
-/* BB fixme need to parse more characters in order to handle CIFSSEC flags */ 
+	if(copy_from_user(flags_string, buffer, count))
+		return -EFAULT;
 
-	if (rc)
-		return rc;
-	if (c == '0' || c == 'n' || c == 'N')
-		extended_security = CIFSSEC_DEF; /* default */
-	else if (c == '1' || c == 'y' || c == 'Y')
-		extended_security = CIFSSEC_MAX;
+	if(count < 3) {
+		/* single char or single char followed by null */
+		c = flags_string[0];
+		if (c == '0' || c == 'n' || c == 'N')
+			extended_security = CIFSSEC_DEF; /* default */
+		else if (c == '1' || c == 'y' || c == 'Y')
+			extended_security = CIFSSEC_MAX;
+		return count;
+	}
+	/* else we have a number */
+
+	flags = simple_strtoul(flags_string, NULL, 0);
+
+	cERROR(1,("sec flags 0x%x", flags));  /* BB FIXME make cFYI */
+
+	if(flags <= 0)  {
+		cERROR(1,("invalid security flags %s",flags_string));
+		return -EINVAL;
+	}
 
+	if((flags & CIFSSEC_MASK) != CIFSSEC_MASK) {
+		cERROR(1,("attempt to set unsupported security flags 0x%d",
+			flags & ~CIFSSEC_MASK));
+		return -EINVAL;
+	}
+	/* flags look ok - update the global security flags for cifs module */
+	extended_security = flags;
 	return count;
 }
 
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index e11d8c6bb22..3ae964bbfdc 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -271,9 +271,18 @@ void calc_lanman_hash(struct cifsSesInfo * ses, char * lnm_session_key)
 	int i;
 	char password_with_pad[CIFS_ENCPWD_SIZE];
 
+	if(ses->server == NULL)
+		return;
+
 	memset(password_with_pad, 0, CIFS_ENCPWD_SIZE);
 	strncpy(password_with_pad, ses->password, CIFS_ENCPWD_SIZE);
 
+	if((ses->server->secMode & SECMODE_PW_ENCRYPT) == 0)
+		if(extended_security & CIFSSEC_MAY_PLNTXT) {
+			memcpy(lnm_session_key, password_with_pad, CIFS_ENCPWD_SIZE); 
+			return;
+		}
+
 	/* calculate old style session key */
 	/* calling toupper is less broken than repeatedly
 	calling nls_toupper would be since that will never
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index b8c236be4d8..77cca380946 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -492,6 +492,13 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 		server->secMode = pSMBr->SecurityMode;
 		if((server->secMode & SECMODE_USER) == 0)
 			cFYI(1,("share mode security"));
+
+		if((server->secMode & SECMODE_PW_ENCRYPT) == 0)
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+			if ((extended_security & CIFSSEC_MAY_PLNTXT) == 0)
+#endif /* CIFS_WEAK_PW_HASH */
+				cERROR(1,("Server requests plain text password"
+					" but client support disabled"));
 		
 		if(extended_security & CIFSSEC_MUST_NTLMV2)
 			server->secType = NTLMv2;
-- 
cgit v1.2.3


From 3bcc86f507f5a0b6f5bfa312f37ec33711558acb Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sat, 3 Jun 2006 00:25:50 +0100
Subject: [JFFS2] Remove stray __exit from jffs2_compressors_exit()

It's used from the initfunc in case of failure too. We could actually do
with an '__initexit' for this kind of thing -- when built in to the
kernel, it could do with being dropped with the init text. We _could_
actually just use __init for it, but that would break if/when we start
dropping init text from modules. So let's just leave it as it was for now,
and mutter a little more about random 'janitorial' fixes from people who
aren't paying attention to what they're doing.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/compr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index 5f45e01d71e..7001ba26c06 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -440,7 +440,7 @@ int __init jffs2_compressors_init(void)
         return 0;
 }
 
-int __exit jffs2_compressors_exit(void)
+int jffs2_compressors_exit(void)
 {
 /* Unregistering compressors */
 #ifdef CONFIG_JFFS2_RUBIN
-- 
cgit v1.2.3


From 254e55ed03e2e8d23089b4a468eec2fd2e1ead9b Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Sun, 4 Jun 2006 05:53:15 +0000
Subject: CIFS] Support for older servers which require plaintext passwords -
 part 2

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/README     |  32 ++++--
 fs/cifs/cifsglob.h |  17 ++--
 fs/cifs/cifspdu.h  |   4 +
 fs/cifs/cifssmb.c  | 279 +++++++++++++++++++++++++++--------------------------
 fs/cifs/sess.c     |   3 +-
 5 files changed, 184 insertions(+), 151 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/README b/fs/cifs/README
index 0355003f4f0..a68f8e3db2d 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -485,14 +485,34 @@ PacketSigningEnabled	If set to one, cifs packet signing is enabled
 			it.  If set to two, cifs packet signing is
 			required even if the server considers packet
 			signing optional. (default 1)
+SecurityFlags		Flags which control security negotiation and
+			also packet signing. Authentication (may/must)
+			flags (e.g. for NTLM and/or NTLMv2) may be combined with
+			the signing flags.  Specifying two different password
+			hashing mechanisms (as "must use") on the other hand 
+			does not make much sense. Default flags are 
+				0x07007 
+			(NTLM, NTLMv2 and packet signing allowed).  Maximum 
+			allowable flags if you want to allow mounts to servers
+			using weaker password hashes is 0x37037 (lanman,
+			plaintext, ntlm, ntlmv2, signing allowed):
+ 
+			may use packet signing 				0x00001
+			must use packet signing				0x01001
+			may use NTLM (most common password hash)	0x00002
+			must use NTLM					0x02002
+			may use NTLMv2					0x00004
+			must use NTLMv2					0x04004
+			may use Kerberos security (not implemented yet) 0x00008
+			must use Kerberos (not implemented yet)         0x08008
+			may use lanman (weak) password hash  		0x00010
+			must use lanman password hash			0x10010
+			may use plaintext passwords    			0x00020
+			must use plaintext passwords			0x20020
+			(reserved for future packet encryption)		0x00040
+
 cifsFYI			If set to one, additional debug information is
 			logged to the system error log. (default 0)
-ExtendedSecurity	If set to one, SPNEGO session establishment
-			is allowed which enables more advanced 
-			secure CIFS session establishment (default 0)
-NTLMV2Enabled		If set to one, more secure password hashes
-			are used when the server supports them and
-			when kerberos is not negotiated (default 0)
 traceSMB		If set to one, debug information is logged to the
 			system error log with the start of smb requests
 			and responses (default 0)
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 975e69a2e1c..87453a6bcaf 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -212,12 +212,12 @@ struct cifsTconInfo {
 	struct list_head openFileList;
 	struct semaphore tconSem;
 	struct cifsSesInfo *ses;	/* pointer to session associated with */
-	char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource (in ASCII not UTF) */
+	char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */
 	char *nativeFileSystem;
 	__u16 tid;		/* The 2 byte tree id */
 	__u16 Flags;		/* optional support bits */
 	enum statusEnum tidStatus;
-	atomic_t useCount;	/* how many mounts (explicit or implicit) to this share */
+	atomic_t useCount;	/* how many explicit/implicit mounts to share */
 #ifdef CONFIG_CIFS_STATS
 	atomic_t num_smbs_sent;
 	atomic_t num_writes;
@@ -257,7 +257,7 @@ struct cifsTconInfo {
 	spinlock_t stat_lock;
 #endif /* CONFIG_CIFS_STATS */
 	FILE_SYSTEM_DEVICE_INFO fsDevInfo;
-	FILE_SYSTEM_ATTRIBUTE_INFO fsAttrInfo;	/* ok if file system name truncated */
+	FILE_SYSTEM_ATTRIBUTE_INFO fsAttrInfo; /* ok if fs name truncated */
 	FILE_SYSTEM_UNIX_INFO fsUnixInfo;
 	unsigned retry:1;
 	unsigned nocase:1;
@@ -308,7 +308,6 @@ struct cifsFileInfo {
 	atomic_t wrtPending;   /* handle in use - defer close */
 	struct semaphore fh_sem; /* prevents reopen race after dead ses*/
 	char * search_resume_name; /* BB removeme BB */
-	unsigned int resume_name_length; /* BB removeme - field renamed and moved BB */
 	struct cifs_search_info srch_inf;
 };
 
@@ -523,16 +522,16 @@ GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;  /* protects list inserts on 3 above */
 GLOBAL_EXTERN struct list_head GlobalOplock_Q;
 
 GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; /* Outstanding dir notify requests */
-GLOBAL_EXTERN struct list_head GlobalDnotifyRsp_Q; /* Dir notify response queue */
+GLOBAL_EXTERN struct list_head GlobalDnotifyRsp_Q;/* DirNotify response queue */
 
 /*
  * Global transaction id (XID) information
  */
 GLOBAL_EXTERN unsigned int GlobalCurrentXid;	/* protected by GlobalMid_Sem */
-GLOBAL_EXTERN unsigned int GlobalTotalActiveXid;	/* prot by GlobalMid_Sem */
+GLOBAL_EXTERN unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */
 GLOBAL_EXTERN unsigned int GlobalMaxActiveXid;	/* prot by GlobalMid_Sem */
-GLOBAL_EXTERN spinlock_t GlobalMid_Lock;  /* protects above and list operations */
-					/* on midQ entries */
+GLOBAL_EXTERN spinlock_t GlobalMid_Lock;  /* protects above & list operations */
+					  /* on midQ entries */
 GLOBAL_EXTERN char Local_System_Name[15];
 
 /*
@@ -554,7 +553,7 @@ GLOBAL_EXTERN atomic_t smBufAllocCount;
 GLOBAL_EXTERN atomic_t midCount;
 
 /* Misc globals */
-GLOBAL_EXTERN unsigned int multiuser_mount;	/* if enabled allows new sessions
+GLOBAL_EXTERN unsigned int multiuser_mount; /* if enabled allows new sessions
 				to be established on existing mount if we
 				have the uid/password or Kerberos credential 
 				or equivalent for current user */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 135941738d9..e714803a52d 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -426,6 +426,10 @@ typedef struct lanman_neg_rsp {
 	unsigned char EncryptionKey[1];
 } __attribute__((packed)) LANMAN_NEG_RSP;
 
+#define READ_RAW_ENABLE 1
+#define WRITE_RAW_ENABLE 2
+#define RAW_ENABLE (READ_RAW_ENABLE | WRITE_RAW_ENABLE)
+
 typedef struct negotiate_rsp {
 	struct smb_hdr hdr;	/* wct = 17 */
 	__le16 DialectIndex;
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 77cca380946..0442c3b3679 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -411,8 +411,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 		return rc;
 	pSMB->hdr.Mid = GetNextMid(server);
 	pSMB->hdr.Flags2 |= SMBFLG2_UNICODE;
-/*	if (extended_security)
-		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;*/
+	if((extended_security & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
+		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
 	
 	count = 0;
 	for(i=0;i<CIFS_NUM_PROT;i++) {
@@ -425,162 +425,171 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 
 	rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
-	if (rc == 0) {
-		cFYI(1,("Dialect: %d", pSMBr->DialectIndex));
-		/* Check wct = 1 error case */
-		if((pSMBr->hdr.WordCount < 13)  
-			|| (pSMBr->DialectIndex == BAD_PROT)) {
-			/* core returns wct = 1, but we do not ask for
-			core - otherwise it just comes when dialect
-			index is -1 indicating we could not negotiate
-			a common dialect */
+	if (rc != 0) 
+		goto neg_err_exit;
+
+	cFYI(1,("Dialect: %d", pSMBr->DialectIndex));
+	/* Check wct = 1 error case */
+	if((pSMBr->hdr.WordCount < 13) || (pSMBr->DialectIndex == BAD_PROT)) {
+		/* core returns wct = 1, but we do not ask for core - otherwise
+		small wct just comes when dialect index is -1 indicating we 
+		could not negotiate a common dialect */
+		rc = -EOPNOTSUPP;
+		goto neg_err_exit;
+#ifdef CONFIG_CIFS_WEAK_PW_HASH 
+	} else if((pSMBr->hdr.WordCount == 13)
+			&& (pSMBr->DialectIndex == LANMAN_PROT)) {
+		struct lanman_neg_rsp * rsp = (struct lanman_neg_rsp *)pSMBr;
+
+		if((extended_security & CIFSSEC_MAY_LANMAN) || 
+			(extended_security & CIFSSEC_MAY_PLNTXT))
+			server->secType = LANMAN;
+		else {
+			cERROR(1, ("mount failed weak security disabled"
+				   " in /proc/fs/cifs/SecurityFlags"));
 			rc = -EOPNOTSUPP;
 			goto neg_err_exit;
-#ifdef CONFIG_CIFS_WEAK_PW_HASH 
-                } else if((pSMBr->hdr.WordCount == 13)
-				&& (pSMBr->DialectIndex == LANMAN_PROT)) {
-			struct lanman_neg_rsp * rsp = 
-				(struct lanman_neg_rsp *)pSMBr;
-
-			if((extended_security & CIFSSEC_MAY_LANMAN) || 
-				(extended_security & CIFSSEC_MAY_PLNTXT))
-				server->secType = LANMAN;
-			else {
-				cERROR(1, ("mount failed weak security disabled"
-					" in /proc/fs/cifs/SecurityFlags"));
-				rc = -EOPNOTSUPP;
-				goto neg_err_exit;
-			}	
-			server->secMode = (__u8)le16_to_cpu(rsp->SecurityMode);
-			server->maxReq = le16_to_cpu(rsp->MaxMpxCount);
-			server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize),
+		}	
+		server->secMode = (__u8)le16_to_cpu(rsp->SecurityMode);
+		server->maxReq = le16_to_cpu(rsp->MaxMpxCount);
+		server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize),
 				(__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
+		GETU32(server->sessid) = le32_to_cpu(rsp->SessionKey);
+		/* even though we do not use raw we might as well set this
+		accurately, in case we ever find a need for it */
+		if((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
+			server->maxRw = 0xFF00;
+			server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE;
+		} else {
+			server->maxRw = 0;/* we do not need to use raw anyway */
+			server->capabilities = CAP_MPX_MODE;
+		}
+		server->timeZone = le16_to_cpu(rsp->ServerTimeZone);
 
-			/* BB what do we do with raw mode? BB */
-			server->timeZone = le16_to_cpu(rsp->ServerTimeZone);
-			/* Do we have to set signing flags? no signing
-			was available LANMAN - default should be ok */
-
-			/* BB FIXME set default dummy capabilities since
-			they are not returned by the server in this dialect */
-
-			/* get server time for time conversions and add
-			code to use it and timezone since this is not UTC */	
+		/* BB get server time for time conversions and add
+		code to use it and timezone since this is not UTC */	
 
-			if (rsp->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
-				memcpy(server->cryptKey, rsp->EncryptionKey,
-					CIFS_CRYPTO_KEY_SIZE);
-			} else {
-				rc = -EIO;
-				goto neg_err_exit;
-			}
+		if (rsp->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
+			memcpy(server->cryptKey, rsp->EncryptionKey,
+				CIFS_CRYPTO_KEY_SIZE);
+		} else if (server->secMode & SECMODE_PW_ENCRYPT) {
+			rc = -EIO; /* need cryptkey unless plain text */
+			goto neg_err_exit;
+		}
 
-			cFYI(1,("LANMAN negotiated")); /* BB removeme BB */
+		cFYI(1,("LANMAN negotiated"));
+		/* we will not end up setting signing flags - as no signing
+		was in LANMAN and server did not return the flags on */
+		goto signing_check;
 #else /* weak security disabled */
-		} else if(pSMBr->hdr.WordCount == 13) {
-			cERROR(1,("mount failed, cifs module not built "
-				"with CIFS_WEAK_PW_HASH support"));
+	} else if(pSMBr->hdr.WordCount == 13) {
+		cERROR(1,("mount failed, cifs module not built "
+			  "with CIFS_WEAK_PW_HASH support"));
 			rc = -EOPNOTSUPP;
 #endif /* WEAK_PW_HASH */
-			goto neg_err_exit;
-		} else if(pSMBr->hdr.WordCount != 17) {
-			/* unknown wct */
-			rc = -EOPNOTSUPP;
-			goto neg_err_exit;
-		}
-
-		server->secMode = pSMBr->SecurityMode;
-		if((server->secMode & SECMODE_USER) == 0)
-			cFYI(1,("share mode security"));
+		goto neg_err_exit;
+	} else if(pSMBr->hdr.WordCount != 17) {
+		/* unknown wct */
+		rc = -EOPNOTSUPP;
+		goto neg_err_exit;
+	}
+	/* else wct == 17 NTLM */
+	server->secMode = pSMBr->SecurityMode;
+	if((server->secMode & SECMODE_USER) == 0)
+		cFYI(1,("share mode security"));
 
-		if((server->secMode & SECMODE_PW_ENCRYPT) == 0)
+	if((server->secMode & SECMODE_PW_ENCRYPT) == 0)
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
-			if ((extended_security & CIFSSEC_MAY_PLNTXT) == 0)
+		if ((extended_security & CIFSSEC_MAY_PLNTXT) == 0)
 #endif /* CIFS_WEAK_PW_HASH */
-				cERROR(1,("Server requests plain text password"
-					" but client support disabled"));
+			cERROR(1,("Server requests plain text password"
+				  " but client support disabled"));
 		
-		if(extended_security & CIFSSEC_MUST_NTLMV2)
-			server->secType = NTLMv2;
-		else
-			server->secType = NTLM;
-		/* else krb5 ... */
-
-		/* one byte - no need to convert this or EncryptionKeyLen
-		   from little endian */
-		server->maxReq = le16_to_cpu(pSMBr->MaxMpxCount);
-		/* probably no need to store and check maxvcs */
-		server->maxBuf =
-			min(le32_to_cpu(pSMBr->MaxBufferSize),
+	if(extended_security & CIFSSEC_MUST_NTLMV2)
+		server->secType = NTLMv2;
+	else
+		server->secType = NTLM;
+	/* else krb5 ... */
+
+	/* one byte, so no need to convert this or EncryptionKeyLen from
+	   little endian */
+	server->maxReq = le16_to_cpu(pSMBr->MaxMpxCount);
+	/* probably no need to store and check maxvcs */
+	server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize),
 			(__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
-		server->maxRw = le32_to_cpu(pSMBr->MaxRawSize);
-		cFYI(0, ("Max buf = %d", ses->server->maxBuf));
-		GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
-		server->capabilities = le32_to_cpu(pSMBr->Capabilities);
-		server->timeZone = le16_to_cpu(pSMBr->ServerTimeZone);	
-        /* BB with UTC do we ever need to be using srvr timezone? */
-		if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
-			memcpy(server->cryptKey, pSMBr->u.EncryptionKey,
-			       CIFS_CRYPTO_KEY_SIZE);
-		} else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC)
-			   && (pSMBr->EncryptionKeyLength == 0)) {
-			/* decode security blob */
-		} else
-			rc = -EIO;
+	server->maxRw = le32_to_cpu(pSMBr->MaxRawSize);
+	cFYI(0, ("Max buf = %d", ses->server->maxBuf));
+	GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
+	server->capabilities = le32_to_cpu(pSMBr->Capabilities);
+	server->timeZone = le16_to_cpu(pSMBr->ServerTimeZone);	
+	if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
+		memcpy(server->cryptKey, pSMBr->u.EncryptionKey,
+		       CIFS_CRYPTO_KEY_SIZE);
+	} else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC)
+			&& (pSMBr->EncryptionKeyLength == 0)) {
+		/* decode security blob */
+	} else if (server->secMode & SECMODE_PW_ENCRYPT) {
+		rc = -EIO; /* no crypt key only if plain text pwd */
+		goto neg_err_exit;
+	}
 
-		/* BB might be helpful to save off the domain of server here */
+	/* BB might be helpful to save off the domain of server here */
 
-		if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC) && 
-			(server->capabilities & CAP_EXTENDED_SECURITY)) {
-			count = pSMBr->ByteCount;
-			if (count < 16)
-				rc = -EIO;
-			else if (count == 16) {
-				server->secType = RawNTLMSSP;
-				if (server->socketUseCount.counter > 1) {
-					if (memcmp
-						(server->server_GUID,
-						pSMBr->u.extended_response.
-						GUID, 16) != 0) {
-						cFYI(1, ("server UID changed"));
-						memcpy(server->
-							server_GUID,
-							pSMBr->u.
-							extended_response.
-							GUID, 16);
-					}
-				} else
+	if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC) && 
+		(server->capabilities & CAP_EXTENDED_SECURITY)) {
+		count = pSMBr->ByteCount;
+		if (count < 16)
+			rc = -EIO;
+		else if (count == 16) {
+			server->secType = RawNTLMSSP;
+			if (server->socketUseCount.counter > 1) {
+				if (memcmp(server->server_GUID,
+					   pSMBr->u.extended_response.
+					   GUID, 16) != 0) {
+					cFYI(1, ("server UID changed"));
 					memcpy(server->server_GUID,
-					       pSMBr->u.extended_response.
-					       GUID, 16);
-			} else {
-				rc = decode_negTokenInit(pSMBr->u.
-							 extended_response.
-							 SecurityBlob,
-							 count - 16,
-							 &server->secType);
-				if(rc == 1) {
-				/* BB Need to fill struct for sessetup here */
-					rc = -EOPNOTSUPP;
-				} else {
-					rc = -EINVAL;
+						pSMBr->u.extended_response.GUID,
+						16);
 				}
+			} else
+				memcpy(server->server_GUID,
+				       pSMBr->u.extended_response.GUID, 16);
+		} else {
+			rc = decode_negTokenInit(pSMBr->u.extended_response.
+						 SecurityBlob,
+						 count - 16,
+						 &server->secType);
+			if(rc == 1) {
+			/* BB Need to fill struct for sessetup here */
+				rc = -EOPNOTSUPP;
+			} else {
+				rc = -EINVAL;
 			}
-		} else
-			server->capabilities &= ~CAP_EXTENDED_SECURITY;
-		if(sign_CIFS_PDUs == FALSE) {        
-			if(server->secMode & SECMODE_SIGN_REQUIRED)
-				cERROR(1,
-				 ("Server requires /proc/fs/cifs/PacketSigningEnabled"));
-			server->secMode &= ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
-		} else if(sign_CIFS_PDUs == 1) {
-			if((server->secMode & SECMODE_SIGN_REQUIRED) == 0)
-				server->secMode &= ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
 		}
-				
+	} else
+		server->capabilities &= ~CAP_EXTENDED_SECURITY;
+
+signing_check:
+	if(sign_CIFS_PDUs == FALSE) {        
+		if(server->secMode & SECMODE_SIGN_REQUIRED)
+			cERROR(1,("Server requires "
+				 "/proc/fs/cifs/PacketSigningEnabled to be on"));
+		server->secMode &= 
+			~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
+	} else if(sign_CIFS_PDUs == 1) {
+		if((server->secMode & SECMODE_SIGN_REQUIRED) == 0)
+			server->secMode &= 
+				~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
+	} else if(sign_CIFS_PDUs == 2) {
+		if((server->secMode & 
+			(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED)) == 0) {
+			cERROR(1,("signing required but server lacks support"));
+		}
 	}
 neg_err_exit:	
 	cifs_buf_release(pSMB);
+
+	cFYI(1,("negprot rc %d",rc));
 	return rc;
 }
 
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index a52aacb3fef..76a09f5f804 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -323,7 +323,8 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 	enum securityEnum type;
 	__u16 action;
 	int bytes_remaining;
-	
+
+	cFYI(1,("new sess setup"));
 	if(ses == NULL)
 		return -EINVAL;
 
-- 
cgit v1.2.3


From 9312f6754d4b2d3ce27c21b16fb92923ce92a411 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Sun, 4 Jun 2006 22:21:07 +0000
Subject: [CIFS] Fix mask so can set new cifs security flags properly

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_debug.c | 6 ++----
 fs/cifs/cifssmb.c    | 2 +-
 fs/cifs/sess.c       | 5 +++--
 3 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 7c0015a9695..2f55edf2eee 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -860,8 +860,6 @@ security_flags_write(struct file *file, const char __user *buffer,
 	char flags_string[12];
 	char c;
 
-	cERROR(1,("size %ld",count)); /* BB removeme BB */
-
 	if((count < 1) || (count > 11))
 		return -EINVAL;
 
@@ -883,14 +881,14 @@ security_flags_write(struct file *file, const char __user *buffer,
 
 	flags = simple_strtoul(flags_string, NULL, 0);
 
-	cERROR(1,("sec flags 0x%x", flags));  /* BB FIXME make cFYI */
+	cFYI(1,("sec flags 0x%x", flags));
 
 	if(flags <= 0)  {
 		cERROR(1,("invalid security flags %s",flags_string));
 		return -EINVAL;
 	}
 
-	if((flags & CIFSSEC_MASK) != CIFSSEC_MASK) {
+	if(flags & ~CIFSSEC_MASK) {
 		cERROR(1,("attempt to set unsupported security flags 0x%d",
 			flags & ~CIFSSEC_MASK));
 		return -EINVAL;
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 0442c3b3679..acae58313b0 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -504,7 +504,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 #endif /* CIFS_WEAK_PW_HASH */
 			cERROR(1,("Server requests plain text password"
 				  " but client support disabled"));
-		
+
 	if(extended_security & CIFSSEC_MUST_NTLMV2)
 		server->secType = NTLMv2;
 	else
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 76a09f5f804..1fe9461c6dc 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -339,9 +339,10 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 		return -EOPNOTSUPP;
 #endif
 		wct = 10; /* lanman 2 style sessionsetup */
-	} else if((type == NTLM) || (type == NTLMv2)) /* NTLMv2 may retry NTLM */
+	} else if((type == NTLM) || (type == NTLMv2)) { 
+		/* For NTLMv2 failures eventually may need to retry NTLM */
 		wct = 13; /* old style NTLM sessionsetup */
-	else /* same size for negotiate or auth, NTLMSSP or extended security */
+	} else /* same size for negotiate or auth, NTLMSSP or extended security */
 		wct = 12;
 
 	rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses,
-- 
cgit v1.2.3


From f64b23ae4aef9f69d71ea41529a188acd5ab4930 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 5 Jun 2006 05:27:37 +0000
Subject: [CIFS] NTLMv2 support part 2

Still need to fill in response structure and check that hash works

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsencrypt.c |  2 ++
 fs/cifs/cifspdu.h     | 27 ++++++++++++++++++++++++---
 fs/cifs/sess.c        | 13 +++++++++----
 3 files changed, 35 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 3ae964bbfdc..24ab770f60d 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -306,6 +306,8 @@ void calc_lanman_hash(struct cifsSesInfo * ses, char * lnm_session_key)
 
 void CalcNTLMv2_response(const struct cifsSesInfo * ses,char * v2_session_response)
 {
+	/* BB FIXME  -  update struct ntlmv2_response and change calling convention
+	   of this function */ 
 	struct HMACMD5Context context;
 	memcpy(v2_session_response + 8, ses->server->cryptKey,8);
 	/* gen_blob(v2_session_response + 16); */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index e714803a52d..503b7e32ede 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -117,7 +117,6 @@
  * Size of the session key (crypto key encrypted with the password
  */
 #define CIFS_SESS_KEY_SIZE (24)
-#define V2_SESS_KEY_SIZE (86)
 
 /*
  * Maximum user name length
@@ -539,7 +538,7 @@ typedef union smb_com_session_setup_andx {
 /*      unsigned char  * NativeOS;      */
 /*	unsigned char  * NativeLanMan;  */
 /*      unsigned char  * PrimaryDomain; */
-	} __attribute__((packed)) resp;			/* NTLM response format (with or without extended security */
+	} __attribute__((packed)) resp;	/* NTLM response with or without extended sec*/
 
 	struct {		/* request format */
 		struct smb_hdr hdr;	/* wct = 10 */
@@ -573,6 +572,26 @@ typedef union smb_com_session_setup_andx {
 	} __attribute__((packed)) old_resp; /* pre-NTLM (LANMAN2.1) response */
 } __attribute__((packed)) SESSION_SETUP_ANDX;
 
+/* format of NLTMv2 Response ie "case sensitive password" hash when NTLMv2 */
+
+struct ntlmssp2_name {
+	__le16 type;
+	__le16 length;
+/*	char   name[length]; */
+} __attribute__((packed));
+
+struct ntlmv2_resp {
+	char ntlmv2_hash[CIFS_ENCPWD_SIZE];
+	__le32 blob_sign;
+	__u32  reserved;
+	__le64  time;
+	__u64  client_chal; /* random */
+	__u32  reserved2;
+	struct ntlmssp2_name names[1];
+	/* array of name entries could follow ending in minimum 4 byte struct */
+} __attribute__((packed));
+
+
 #define CIFS_NETWORK_OPSYS "CIFS VFS Client for Linux"
 
 /* Capabilities bits (for NTLM SessSetup request) */
@@ -603,7 +622,9 @@ typedef struct smb_com_tconx_req {
 } __attribute__((packed)) TCONX_REQ;
 
 typedef struct smb_com_tconx_rsp {
-	struct smb_hdr hdr;	/* wct = 3 *//* note that Win2000 has sent wct=7 in some cases on responses. Four unspecified words followed OptionalSupport */
+	struct smb_hdr hdr;	/* wct = 3 note that Win2000 has sent wct = 7
+				 in some cases on responses. Four unspecified
+				 words followed OptionalSupport */
 	__u8 AndXCommand;
 	__u8 AndXReserved;
 	__le16 AndXOffset;
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 1fe9461c6dc..9ce628df29b 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -411,7 +411,11 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 		else
 			ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
 	} else if (type == NTLMv2) {
-		char * v2_sess_key = kmalloc(V2_SESS_KEY_SIZE, GFP_KERNEL);
+		char * v2_sess_key = kmalloc(sizeof(struct ntlmv2_resp),
+						GFP_KERNEL);
+
+		/* BB FIXME change all users of v2_sess_key to
+		   struct ntlmv2_resp */
 
 		if(v2_sess_key == NULL) {
 			cifs_small_buf_release(smb_buf);
@@ -425,7 +429,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 		/*	cpu_to_le16(LM2_SESS_KEY_SIZE); */
 
 		pSMB->req_no_secext.CaseSensitivePasswordLength =
-			cpu_to_le16(V2_SESS_KEY_SIZE);
+			cpu_to_le16(sizeof(struct ntlmv2_resp));
 
 		/* calculate session key */
 		CalcNTLMv2_response(ses, v2_sess_key);
@@ -438,8 +442,9 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 
 	/*	memcpy(bcc_ptr, (char *)ntlm_session_key,LM2_SESS_KEY_SIZE);
 		bcc_ptr += LM2_SESS_KEY_SIZE; */
-		memcpy(bcc_ptr, (char *)v2_sess_key, V2_SESS_KEY_SIZE);
-		bcc_ptr += V2_SESS_KEY_SIZE;
+		memcpy(bcc_ptr, (char *)v2_sess_key, sizeof(struct ntlmv2_resp));
+		bcc_ptr += sizeof(struct ntlmv2_resp);
+		kfree(v2_sess_key);
 		if(ses->capabilities & CAP_UNICODE)
 			unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
 		else
-- 
cgit v1.2.3


From 6d027cfdb19c26df3151a519ed55acfe2c4cb7c3 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 5 Jun 2006 16:26:05 +0000
Subject: [CIFS] NTLMv2 support part 3

Response struct filled in exacty for 16 byte hash which we need to check
more to make sure it works.

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsencrypt.c | 17 +++++++++++++++--
 fs/cifs/cifspdu.h     |  2 +-
 fs/cifs/cifsproto.h   |  1 +
 fs/cifs/sess.c        |  6 +++---
 4 files changed, 20 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 24ab770f60d..09f94617e53 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -27,6 +27,7 @@
 #include "cifs_unicode.h"
 #include "cifsproto.h"
 #include <linux/ctype.h>
+#include <linux/random.h>
 
 /* Calculate and return the CIFS signature based on the mac key and the smb pdu */
 /* the 16 byte signature must be allocated by the caller  */
@@ -304,10 +305,22 @@ void calc_lanman_hash(struct cifsSesInfo * ses, char * lnm_session_key)
 }
 #endif /* CIFS_WEAK_PW_HASH */
 
+void setup_ntlmv2_rsp(const struct cifsSesInfo * ses, char * resp_buf)
+{
+	struct ntlmv2_resp * buf = (struct ntlmv2_resp *)resp_buf;
+
+	buf->blob_signature = cpu_to_le32(0x00000101);
+	buf->reserved = 0;
+	buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
+	get_random_bytes(&buf->client_chal, sizeof(buf->client_chal)); 
+	buf->reserved2 = 0;
+	buf->names[0].type = 0;
+	buf->names[0].length = 0;
+	/* calculate buf->ntlmv2_hash */
+}
+
 void CalcNTLMv2_response(const struct cifsSesInfo * ses,char * v2_session_response)
 {
-	/* BB FIXME  -  update struct ntlmv2_response and change calling convention
-	   of this function */ 
 	struct HMACMD5Context context;
 	memcpy(v2_session_response + 8, ses->server->cryptKey,8);
 	/* gen_blob(v2_session_response + 16); */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 503b7e32ede..5250b93d309 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -582,7 +582,7 @@ struct ntlmssp2_name {
 
 struct ntlmv2_resp {
 	char ntlmv2_hash[CIFS_ENCPWD_SIZE];
-	__le32 blob_sign;
+	__le32 blob_signature;
 	__u32  reserved;
 	__le64  time;
 	__u64  client_chal; /* random */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 59b037f1451..824afb937a6 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -287,6 +287,7 @@ extern int cifs_verify_signature(struct smb_hdr *, const char * mac_key,
 extern int cifs_calculate_mac_key(char * key,const char * rn,const char * pass);
 extern int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *, struct nls_table *);
 extern void CalcNTLMv2_response(const struct cifsSesInfo *,char * );
+extern void setup_ntlmv2_rsp(const struct cifsSesInfo *, char *);
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 extern void calc_lanman_hash(struct cifsSesInfo * ses, char * lnm_session_key);
 #endif /* CIFS_WEAK_PW_HASH */
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 9ce628df29b..c6fd01f55e9 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -411,8 +411,8 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 		else
 			ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
 	} else if (type == NTLMv2) {
-		char * v2_sess_key = kmalloc(sizeof(struct ntlmv2_resp),
-						GFP_KERNEL);
+		char * v2_sess_key = 
+			kmalloc(sizeof(struct ntlmv2_resp), GFP_KERNEL);
 
 		/* BB FIXME change all users of v2_sess_key to
 		   struct ntlmv2_resp */
@@ -432,7 +432,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 			cpu_to_le16(sizeof(struct ntlmv2_resp));
 
 		/* calculate session key */
-		CalcNTLMv2_response(ses, v2_sess_key);
+		setup_ntlmv2_rsp(ses, v2_sess_key);
 		if(first_time) /* should this be moved into common code
 			          with similar ntlmv2 path? */
 		/*   cifs_calculate_ntlmv2_mac_key(ses->server->mac_signing_key,
-- 
cgit v1.2.3


From a8ee03441f66e0674e641c0cbe1a9534cdee968f Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 5 Jun 2006 23:34:19 +0000
Subject: [CIFS] NTLMv2 support part 4

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_debug.c  |  2 +-
 fs/cifs/cifsencrypt.c | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 2f55edf2eee..96abeb73897 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -889,7 +889,7 @@ security_flags_write(struct file *file, const char __user *buffer,
 	}
 
 	if(flags & ~CIFSSEC_MASK) {
-		cERROR(1,("attempt to set unsupported security flags 0x%d",
+		cERROR(1,("attempt to set unsupported security flags 0x%x",
 			flags & ~CIFSSEC_MASK));
 		return -EINVAL;
 	}
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 09f94617e53..8bcb1da3270 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -305,8 +305,44 @@ void calc_lanman_hash(struct cifsSesInfo * ses, char * lnm_session_key)
 }
 #endif /* CIFS_WEAK_PW_HASH */
 
+static int calc_ntlmv2_hash(const struct cifsSesInfo *ses, 
+			    char * ntv2_hash)
+{
+	int rc = 0;
+	int len;
+	char nt_hash[16];
+	struct HMACMD5Context * pctxt;
+
+	pctxt = kmalloc(sizeof(struct HMACMD5Context), GFP_KERNEL);
+
+	if(pctxt == NULL)
+		return -ENOMEM;
+
+	/* calculate md4 hash of password */
+	E_md4hash(ses->password, nt_hash);
+
+	/* convERT Domainname to unicode and uppercase */
+	hmac_md5_init_limK_to_64(nt_hash, 16, pctxt);
+
+	/* convert ses->userName to unicode and uppercase */
+
+	/* len = ... */  /* BB FIXME BB */
+
+	/* hmac_md5_update(user, len, pctxt); */
+
+	/* convert ses->domainName to unicode and uppercase */
+
+	/* len = ... */  /* BB FIXME BB */
+	/* hmac_md5_update(domain, len, pctxt); */
+
+	hmac_md5_final(ntv2_hash, pctxt);
+
+	return rc;
+}
+
 void setup_ntlmv2_rsp(const struct cifsSesInfo * ses, char * resp_buf)
 {
+	int rc;
 	struct ntlmv2_resp * buf = (struct ntlmv2_resp *)resp_buf;
 
 	buf->blob_signature = cpu_to_le32(0x00000101);
@@ -316,7 +352,11 @@ void setup_ntlmv2_rsp(const struct cifsSesInfo * ses, char * resp_buf)
 	buf->reserved2 = 0;
 	buf->names[0].type = 0;
 	buf->names[0].length = 0;
+
 	/* calculate buf->ntlmv2_hash */
+	rc = calc_ntlmv2_hash(ses,buf->ntlmv2_hash);
+	if(rc)
+		cERROR(1,("could not get v2 hash rc %d",rc));
 }
 
 void CalcNTLMv2_response(const struct cifsSesInfo * ses,char * v2_session_response)
-- 
cgit v1.2.3


From 5bafd76593f060540acbea3b61e3087e009aa269 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Wed, 7 Jun 2006 00:18:43 +0000
Subject: [CIFS] Add support for readdir to legacy servers

Fixes oops to OS/2 on ls and removes redundant NTCreateX calls to servers
which do not support NT SMBs.  Key operations to OS/2 work.

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES   |   4 ++
 fs/cifs/cifspdu.h |  29 +++++++++--
 fs/cifs/dir.c     |  11 +++-
 fs/cifs/file.c    |   9 +++-
 fs/cifs/readdir.c | 150 +++++++++++++++++++++++++++++++++++++++---------------
 5 files changed, 153 insertions(+), 50 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index b878dfcff0f..7e0058bc3dd 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -2,6 +2,10 @@ Version 1.44
 ------------
 Rewritten sessionsetup support, including support for legacy SMB
 session setup needed for OS/2 and older servers such as Windows 95 and 98.
+Fix oops on ls to OS/2 servers.  Add support for level 1 FindFirst
+so we can do search (ls etc.) to OS/2.  Do not send NTCreateX
+or recent levels of FindFirst unless server says it supports NT SMBs
+(instead use legacy equivalents from LANMAN dialect).
 
 Version 1.43
 ------------
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 5250b93d309..86239023545 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1374,6 +1374,9 @@ struct smb_t2_rsp {
 #define SMB_FILE_MAXIMUM_INFO           0x40d
 
 /* Find File infolevels */
+#define SMB_FIND_FILE_INFO_STANDARD       0x001
+#define SMB_FIND_FILE_QUERY_EA_SIZE       0x002
+#define SMB_FIND_FILE_QUERY_EAS_FROM_LIST 0x003
 #define SMB_FIND_FILE_DIRECTORY_INFO      0x101
 #define SMB_FIND_FILE_FULL_DIRECTORY_INFO 0x102
 #define SMB_FIND_FILE_NAMES_INFO          0x103
@@ -1998,7 +2001,8 @@ typedef struct {
 
 struct file_allocation_info {
 	__le64 AllocationSize; /* Note old Samba srvr rounds this up too much */
-} __attribute__((packed));	/* size used on disk, level 0x103 for set, 0x105 for query */
+} __attribute__((packed));	/* size used on disk, for level 0x103 for set,
+				   0x105 for query */
 
 struct file_end_of_file_info {
 	__le64 FileSize;		/* offset to end of file */
@@ -2105,7 +2109,7 @@ typedef struct {
 	__le32 ExtFileAttributes;
 	__le32 FileNameLength;
 	char FileName[1];
-} __attribute__((packed)) FILE_DIRECTORY_INFO;   /* level 0x101 FF response data area */
+} __attribute__((packed)) FILE_DIRECTORY_INFO;   /* level 0x101 FF resp data */
 
 typedef struct {
 	__le32 NextEntryOffset;
@@ -2120,7 +2124,7 @@ typedef struct {
 	__le32 FileNameLength;
 	__le32 EaSize; /* length of the xattrs */
 	char FileName[1];
-} __attribute__((packed)) FILE_FULL_DIRECTORY_INFO;   /* level 0x102 FF response data area */
+} __attribute__((packed)) FILE_FULL_DIRECTORY_INFO; /* level 0x102 rsp data */
 
 typedef struct {
 	__le32 NextEntryOffset;
@@ -2137,7 +2141,7 @@ typedef struct {
 	__le32 Reserved;
 	__u64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/
 	char FileName[1];
-} __attribute__((packed)) SEARCH_ID_FULL_DIR_INFO;   /* level 0x105 FF response data area */
+} __attribute__((packed)) SEARCH_ID_FULL_DIR_INFO; /* level 0x105 FF rsp data */
 
 typedef struct {
 	__le32 NextEntryOffset;
@@ -2155,7 +2159,22 @@ typedef struct {
 	__u8   Reserved;
 	__u8   ShortName[12];
 	char FileName[1];
-} __attribute__((packed)) FILE_BOTH_DIRECTORY_INFO;   /* level 0x104 FF response data area */
+} __attribute__((packed)) FILE_BOTH_DIRECTORY_INFO; /* level 0x104 FFrsp data */
+
+typedef struct {
+	__u32  ResumeKey;
+	__le16 CreationDate; /* SMB Date */
+	__le16 CreationTime; /* SMB Time */
+	__le16 LastAccessDate;
+	__le16 LastAccessTime;
+	__le16 LastWriteDate;
+	__le16 LastWriteTime;
+	__le32 DataSize; /* File Size (EOF) */
+	__le32 AllocationSize;
+	__le16 Attributes; /* verify not u32 */
+	__u8   FileNameLength;
+	char FileName[1];
+} __attribute__((packed)) FIND_FILE_STANDARD_INFO; /* level 0x1 FF resp data */
 
 
 struct win_dev {
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index e6ed64e94b7..ba4cbe9b068 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -178,11 +178,14 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 		FreeXid(xid);
 		return -ENOMEM;
 	}
-
-	rc = CIFSSMBOpen(xid, pTcon, full_path, disposition,
+	if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS) 
+		rc = CIFSSMBOpen(xid, pTcon, full_path, disposition,
 			 desiredAccess, CREATE_NOT_DIR,
 			 &fileHandle, &oplock, buf, cifs_sb->local_nls,
 			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	else
+		rc = -EIO; /* no NT SMB support fall into legacy open below */
+
 	if(rc == -EIO) {
 		/* old server, retry the open legacy style */
 		rc = SMBLegacyOpen(xid, pTcon, full_path, disposition,
@@ -369,6 +372,10 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
 					 cifs_sb->mnt_cifs_flags & 
 					    CIFS_MOUNT_MAP_SPECIAL_CHR);
 
+			/* BB FIXME - add handling for backlevel servers
+			   which need legacy open and check for all
+			   calls to SMBOpen for fallback to 
+			   SMBLeagcyOpen */
 			if(!rc) {
 				/* BB Do not bother to decode buf since no
 				   local inode yet to put timestamps in,
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index d62e29fe91f..fafdcdffba6 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -260,10 +260,15 @@ int cifs_open(struct inode *inode, struct file *file)
 		rc = -ENOMEM;
 		goto out;
 	}
-	rc = CIFSSMBOpen(xid, pTcon, full_path, disposition, desiredAccess,
-			 CREATE_NOT_DIR, &netfid, &oplock, buf,
+
+	if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS)
+		rc = CIFSSMBOpen(xid, pTcon, full_path, disposition, 
+			 desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
 			 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
 				 & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	else
+		rc = -EIO; /* no NT SMB support fall into legacy open below */
+
 	if (rc == -EIO) {
 		/* Old server, try legacy style OpenX */
 		rc = SMBLegacyOpen(xid, pTcon, full_path, disposition,
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index e3e762d774d..03bbcb37791 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -109,32 +109,52 @@ static int construct_dentry(struct qstr *qstring, struct file *file,
 	return rc;
 }
 
-static void fill_in_inode(struct inode *tmp_inode,
-	FILE_DIRECTORY_INFO *pfindData, int *pobject_type, int isNewInode)
+static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
+		char * buf, int *pobject_type, int isNewInode)
 {
 	loff_t local_size;
 	struct timespec local_mtime;
 
 	struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(tmp_inode->i_sb);
-	__u32 attr = le32_to_cpu(pfindData->ExtFileAttributes);
-	__u64 allocation_size = le64_to_cpu(pfindData->AllocationSize);
-	__u64 end_of_file = le64_to_cpu(pfindData->EndOfFile);
-
-	cifsInfo->cifsAttrs = attr;
-	cifsInfo->time = jiffies;
+	__u32 attr;
+	__u64 allocation_size;
+	__u64 end_of_file;
 
 	/* save mtime and size */
 	local_mtime = tmp_inode->i_mtime;
 	local_size  = tmp_inode->i_size;
 
+	if(new_buf_type) {
+		FILE_DIRECTORY_INFO *pfindData = (FILE_DIRECTORY_INFO *)buf;
+
+		attr = le32_to_cpu(pfindData->ExtFileAttributes);
+		allocation_size = le64_to_cpu(pfindData->AllocationSize);
+		end_of_file = le64_to_cpu(pfindData->EndOfFile);
+		tmp_inode->i_atime =
+		      cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime));
+		tmp_inode->i_mtime =
+		      cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime));
+		tmp_inode->i_ctime =
+		      cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime));
+	} else { /* legacy, OS2 and DOS style */
+		FIND_FILE_STANDARD_INFO * pfindData = 
+			(FIND_FILE_STANDARD_INFO *)buf;
+
+		attr = le16_to_cpu(pfindData->Attributes);
+		allocation_size = le32_to_cpu(pfindData->AllocationSize);
+		end_of_file = le32_to_cpu(pfindData->DataSize);
+		tmp_inode->i_atime = CURRENT_TIME;
+		/* tmp_inode->i_mtime =  BB FIXME - add dos time handling
+		tmp_inode->i_ctime = 0;   BB FIXME */
+
+	}
+
 	/* Linux can not store file creation time unfortunately so ignore it */
-	tmp_inode->i_atime =
-	    cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime));
-	tmp_inode->i_mtime =
-	    cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime));
-	tmp_inode->i_ctime =
-	    cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime));
+
+	cifsInfo->cifsAttrs = attr;
+	cifsInfo->time = jiffies;
+
 	/* treat dos attribute of read-only as read-only mode bit e.g. 555? */
 	/* 2767 perms - indicate mandatory locking */
 		/* BB fill in uid and gid here? with help from winbind? 
@@ -420,7 +440,10 @@ static int initiate_cifs_search(const int xid, struct file *file)
 ffirst_retry:
 	/* test for Unix extensions */
 	if (pTcon->ses->capabilities & CAP_UNIX) {
-		cifsFile->srch_inf.info_level = SMB_FIND_FILE_UNIX; 
+		cifsFile->srch_inf.info_level = SMB_FIND_FILE_UNIX;
+	} else if ((pTcon->ses->capabilities & 
+			(CAP_NT_SMBS | CAP_NT_FIND)) == 0) {
+		cifsFile->srch_inf.info_level = SMB_FIND_FILE_INFO_STANDARD;
 	} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
 		cifsFile->srch_inf.info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO;
 	} else /* not srvinos - BB fixme add check for backlevel? */ {
@@ -456,12 +479,19 @@ static int cifs_unicode_bytelen(char *str)
 	return len << 1;
 }
 
-static char *nxt_dir_entry(char *old_entry, char *end_of_smb)
+static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level)
 {
 	char * new_entry;
 	FILE_DIRECTORY_INFO * pDirInfo = (FILE_DIRECTORY_INFO *)old_entry;
 
-	new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset);
+	if(level == SMB_FIND_FILE_INFO_STANDARD) {
+		FIND_FILE_STANDARD_INFO * pfData;
+		pfData = (FIND_FILE_STANDARD_INFO *)pDirInfo;
+
+		new_entry = old_entry + sizeof(FIND_FILE_STANDARD_INFO) +
+				pfData->FileNameLength;
+	} else
+		new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset);
 	cFYI(1,("new entry %p old entry %p",new_entry,old_entry));
 	/* validate that new_entry is not past end of SMB */
 	if(new_entry >= end_of_smb) {
@@ -469,7 +499,10 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb)
 		      ("search entry %p began after end of SMB %p old entry %p",
 			new_entry, end_of_smb, old_entry)); 
 		return NULL;
-	} else if (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb) {
+	} else if(((level == SMB_FIND_FILE_INFO_STANDARD) &&
+		   (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb)) ||
+		  ((level != SMB_FIND_FILE_INFO_STANDARD) &&
+		   (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb)))  {
 		cERROR(1,("search entry %p extends after end of SMB %p",
 			new_entry, end_of_smb));
 		return NULL;
@@ -487,7 +520,7 @@ static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile)
 	char * filename = NULL;
 	int len = 0; 
 
-	if(cfile->srch_inf.info_level == 0x202) {
+	if(cfile->srch_inf.info_level == SMB_FIND_FILE_UNIX) {
 		FILE_UNIX_INFO * pFindData = (FILE_UNIX_INFO *)current_entry;
 		filename = &pFindData->FileName[0];
 		if(cfile->srch_inf.unicode) {
@@ -496,26 +529,34 @@ static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile)
 			/* BB should we make this strnlen of PATH_MAX? */
 			len = strnlen(filename, 5);
 		}
-	} else if(cfile->srch_inf.info_level == 0x101) {
+	} else if(cfile->srch_inf.info_level == SMB_FIND_FILE_DIRECTORY_INFO) {
 		FILE_DIRECTORY_INFO * pFindData = 
 			(FILE_DIRECTORY_INFO *)current_entry;
 		filename = &pFindData->FileName[0];
 		len = le32_to_cpu(pFindData->FileNameLength);
-	} else if(cfile->srch_inf.info_level == 0x102) {
+	} else if(cfile->srch_inf.info_level == 
+			SMB_FIND_FILE_FULL_DIRECTORY_INFO) {
 		FILE_FULL_DIRECTORY_INFO * pFindData = 
 			(FILE_FULL_DIRECTORY_INFO *)current_entry;
 		filename = &pFindData->FileName[0];
 		len = le32_to_cpu(pFindData->FileNameLength);
-	} else if(cfile->srch_inf.info_level == 0x105) {
+	} else if(cfile->srch_inf.info_level ==
+			SMB_FIND_FILE_ID_FULL_DIR_INFO) {
 		SEARCH_ID_FULL_DIR_INFO * pFindData = 
 			(SEARCH_ID_FULL_DIR_INFO *)current_entry;
 		filename = &pFindData->FileName[0];
 		len = le32_to_cpu(pFindData->FileNameLength);
-	} else if(cfile->srch_inf.info_level == 0x104) {
+	} else if(cfile->srch_inf.info_level == 
+			SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
 		FILE_BOTH_DIRECTORY_INFO * pFindData = 
 			(FILE_BOTH_DIRECTORY_INFO *)current_entry;
 		filename = &pFindData->FileName[0];
 		len = le32_to_cpu(pFindData->FileNameLength);
+	} else if(cfile->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) {
+		FIND_FILE_STANDARD_INFO * pFindData =
+			(FIND_FILE_STANDARD_INFO *)current_entry;
+		filename = &pFindData->FileName[0];
+		len = le32_to_cpu(pFindData->FileNameLength);
 	} else {
 		cFYI(1,("Unknown findfirst level %d",cfile->srch_inf.info_level));
 	}
@@ -651,10 +692,12 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 		first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry
 					- cifsFile->srch_inf.entries_in_buffer;
 		pos_in_buf = index_to_find - first_entry_in_buffer;
-		cFYI(1,("found entry - pos_in_buf %d",pos_in_buf)); 
+		cFYI(1,("found entry - pos_in_buf %d",pos_in_buf));
+
 		for(i=0;(i<(pos_in_buf)) && (current_entry != NULL);i++) {
 			/* go entry by entry figuring out which is first */
-			current_entry = nxt_dir_entry(current_entry,end_of_smb);
+			current_entry = nxt_dir_entry(current_entry,end_of_smb,
+						cifsFile->srch_inf.info_level);
 		}
 		if((current_entry == NULL) && (i < pos_in_buf)) {
 			/* BB fixme - check if we should flag this error */
@@ -681,7 +724,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 /* inode num, inode type and filename returned */
 static int cifs_get_name_from_search_buf(struct qstr *pqst,
 	char *current_entry, __u16 level, unsigned int unicode,
-	struct cifs_sb_info * cifs_sb, ino_t *pinum)
+	struct cifs_sb_info * cifs_sb, int max_len, ino_t *pinum)
 {
 	int rc = 0;
 	unsigned int len = 0;
@@ -725,10 +768,22 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
 			(FILE_BOTH_DIRECTORY_INFO *)current_entry;
 		filename = &pFindData->FileName[0];
 		len = le32_to_cpu(pFindData->FileNameLength);
+	} else if(level == SMB_FIND_FILE_INFO_STANDARD) {
+		FIND_FILE_STANDARD_INFO * pFindData =
+			(FIND_FILE_STANDARD_INFO *)current_entry;
+		filename = &pFindData->FileName[0];
+		/* one byte length, no name conversion */
+		len = (unsigned int)pFindData->FileNameLength;
 	} else {
 		cFYI(1,("Unknown findfirst level %d",level));
 		return -EINVAL;
 	}
+
+	if(len > max_len) {
+		cERROR(1,("bad search response length %d past smb end", len));
+		return -EINVAL;
+	}
+
 	if(unicode) {
 		/* BB fixme - test with long names */
 		/* Note converted filename can be longer than in unicode */
@@ -748,7 +803,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
 }
 
 static int cifs_filldir(char *pfindEntry, struct file *file,
-	filldir_t filldir, void *direntry, char *scratch_buf)
+	filldir_t filldir, void *direntry, char *scratch_buf, int max_len)
 {
 	int rc = 0;
 	struct qstr qstring;
@@ -784,6 +839,7 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
 	rc = cifs_get_name_from_search_buf(&qstring,pfindEntry,
 			pCifsF->srch_inf.info_level,
 			pCifsF->srch_inf.unicode,cifs_sb,
+			max_len,
 			&inum /* returned */);
 
 	if(rc)
@@ -805,13 +861,16 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
 	/* we pass in rc below, indicating whether it is a new inode,
 	   so we can figure out whether to invalidate the inode cached
 	   data if the file has changed */
-	if(pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX) {
+	if(pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX)
 		unix_fill_in_inode(tmp_inode,
-				   (FILE_UNIX_INFO *)pfindEntry,&obj_type, rc);
-	} else {
-		fill_in_inode(tmp_inode,
-			      (FILE_DIRECTORY_INFO *)pfindEntry,&obj_type, rc);
-	}
+				   (FILE_UNIX_INFO *)pfindEntry,
+				   &obj_type, rc);
+	else if(pCifsF->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD)
+		fill_in_inode(tmp_inode, 0 /* old level 1 buffer type */,
+				pfindEntry, &obj_type, rc);
+	else
+		fill_in_inode(tmp_inode, 1 /* NT */, pfindEntry, &obj_type, rc);
+	
 	
 	rc = filldir(direntry,qstring.name,qstring.len,file->f_pos,
 		     tmp_inode->i_ino,obj_type);
@@ -871,6 +930,12 @@ static int cifs_save_resume_key(const char *current_entry,
 		filename = &pFindData->FileName[0];
 		len = le32_to_cpu(pFindData->FileNameLength);
 		cifsFile->srch_inf.resume_key = pFindData->FileIndex;
+	} else if(level == SMB_FIND_FILE_INFO_STANDARD) {
+		FIND_FILE_STANDARD_INFO * pFindData =
+			(FIND_FILE_STANDARD_INFO *)current_entry;
+		filename = &pFindData->FileName[0];
+		/* one byte length, no name conversion */
+		len = (unsigned int)pFindData->FileNameLength;
 	} else {
 		cFYI(1,("Unknown findfirst level %d",level));
 		return -EINVAL;
@@ -891,6 +956,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 	int num_to_fill = 0;
 	char * tmp_buf = NULL;
 	char * end_of_smb;
+	int max_len;
 
 	xid = GetXid();
 
@@ -966,10 +1032,11 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 			goto rddir2_exit;
 		}
 		cFYI(1,("loop through %d times filling dir for net buf %p",
-			num_to_fill,cifsFile->srch_inf.ntwrk_buf_start)); 
-		end_of_smb = cifsFile->srch_inf.ntwrk_buf_start +
-			smbCalcSize((struct smb_hdr *)
-				    cifsFile->srch_inf.ntwrk_buf_start);
+			num_to_fill,cifsFile->srch_inf.ntwrk_buf_start));
+		max_len = smbCalcSize((struct smb_hdr *)
+				cifsFile->srch_inf.ntwrk_buf_start);
+		end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
+
 		/* To be safe - for UCS to UTF-8 with strings loaded
 		with the rare long characters alloc more to account for
 		such multibyte target UTF-8 characters. cifs_unicode.c,
@@ -984,8 +1051,8 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 			}
 			/* if buggy server returns . and .. late do
 			we want to check for that here? */
-			rc = cifs_filldir(current_entry, file, 
-					filldir, direntry,tmp_buf);
+			rc = cifs_filldir(current_entry, file,
+					filldir, direntry, tmp_buf, max_len);
 			file->f_pos++;
 			if(file->f_pos == 
 				cifsFile->srch_inf.index_of_last_entry) {
@@ -994,8 +1061,9 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 				cifs_save_resume_key(current_entry,cifsFile);
 				break;
 			} else 
-				current_entry = nxt_dir_entry(current_entry,
-							      end_of_smb);
+				current_entry = 
+					nxt_dir_entry(current_entry, end_of_smb,
+						cifsFile->srch_inf.info_level);
 		}
 		kfree(tmp_buf);
 		break;
-- 
cgit v1.2.3


From 1717ffc58850dfa9e08b4977f8d0323cb3336863 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 8 Jun 2006 05:41:32 +0000
Subject: [CIFS] NTLMv2 support part 5

NTLMv2 authentication (stronger authentication than default NTLM) which
many servers support now works.  There was a problem with the construction
of the security blob in the older code.  Currently requires
	/proc/fs/cifs/Experimental to be set to 2
and
	/proc/fs/cifs/SecurityFlags to be set to 0x4004 (to require using
	NTLMv2 instead of default of NTLM)

Next we will check signing to make sure optional NTLMv2 packet signing also
works.

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES       |  4 +++-
 fs/cifs/cifsencrypt.c | 60 +++++++++++++++++++++++++++++++++++----------------
 fs/cifs/cifsproto.h   |  8 ++++---
 fs/cifs/sess.c        |  2 +-
 4 files changed, 50 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 7e0058bc3dd..79a202b8f66 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -5,7 +5,9 @@ session setup needed for OS/2 and older servers such as Windows 95 and 98.
 Fix oops on ls to OS/2 servers.  Add support for level 1 FindFirst
 so we can do search (ls etc.) to OS/2.  Do not send NTCreateX
 or recent levels of FindFirst unless server says it supports NT SMBs
-(instead use legacy equivalents from LANMAN dialect).
+(instead use legacy equivalents from LANMAN dialect). Fix to allow
+NTLMv2 authentication support (now can use stronger password hashing
+on mount if corresponding /proc/fs/cifs/SecurityFlags is set (0x4004)
 
 Version 1.43
 ------------
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 8bcb1da3270..a89efaf78a2 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -212,7 +212,8 @@ int cifs_calculate_mac_key(char * key, const char * rn, const char * password)
 	return 0;
 }
 
-int CalcNTLMv2_partial_mac_key(struct cifsSesInfo * ses, struct nls_table * nls_info)
+int CalcNTLMv2_partial_mac_key(struct cifsSesInfo * ses, 
+				const struct nls_table * nls_info)
 {
 	char temp_hash[16];
 	struct HMACMD5Context ctx;
@@ -305,13 +306,15 @@ void calc_lanman_hash(struct cifsSesInfo * ses, char * lnm_session_key)
 }
 #endif /* CIFS_WEAK_PW_HASH */
 
-static int calc_ntlmv2_hash(const struct cifsSesInfo *ses, 
-			    char * ntv2_hash)
+static int calc_ntlmv2_hash(struct cifsSesInfo *ses, 
+			    const struct nls_table * nls_cp)
 {
 	int rc = 0;
 	int len;
 	char nt_hash[16];
 	struct HMACMD5Context * pctxt;
+	wchar_t * user;
+	wchar_t * domain;
 
 	pctxt = kmalloc(sizeof(struct HMACMD5Context), GFP_KERNEL);
 
@@ -321,26 +324,44 @@ static int calc_ntlmv2_hash(const struct cifsSesInfo *ses,
 	/* calculate md4 hash of password */
 	E_md4hash(ses->password, nt_hash);
 
-	/* convERT Domainname to unicode and uppercase */
+	/* convert Domainname to unicode and uppercase */
 	hmac_md5_init_limK_to_64(nt_hash, 16, pctxt);
 
 	/* convert ses->userName to unicode and uppercase */
-
-	/* len = ... */  /* BB FIXME BB */
-
-	/* hmac_md5_update(user, len, pctxt); */
+	len = strlen(ses->userName);
+	user = kmalloc(2 + (len * 2), GFP_KERNEL);
+	if(user == NULL)
+		goto calc_exit_2;
+	len = cifs_strtoUCS(user, ses->userName, len, nls_cp);
+	UniStrupr(user);
+	hmac_md5_update((char *)user, 2*len, pctxt);
 
 	/* convert ses->domainName to unicode and uppercase */
+	if(ses->domainName) {
+		len = strlen(ses->domainName);
 
-	/* len = ... */  /* BB FIXME BB */
-	/* hmac_md5_update(domain, len, pctxt); */
+        	domain = kmalloc(2 + (len * 2), GFP_KERNEL);
+		if(domain == NULL)
+			goto calc_exit_1;
+		len = cifs_strtoUCS(domain, ses->domainName, len, nls_cp);
+		UniStrupr(domain);
 
-	hmac_md5_final(ntv2_hash, pctxt);
+		hmac_md5_update((char *)domain, 2*len, pctxt);
+	
+		kfree(domain);
+	}
+calc_exit_1:
+	kfree(user);
+calc_exit_2:
+	/* BB FIXME what about bytes 24 through 40 of the signing key? 
+	   compare with the NTLM example */
+	hmac_md5_final(ses->server->mac_signing_key, pctxt);
 
 	return rc;
 }
 
-void setup_ntlmv2_rsp(const struct cifsSesInfo * ses, char * resp_buf)
+void setup_ntlmv2_rsp(struct cifsSesInfo * ses, char * resp_buf, 
+		      const struct nls_table * nls_cp)
 {
 	int rc;
 	struct ntlmv2_resp * buf = (struct ntlmv2_resp *)resp_buf;
@@ -348,27 +369,28 @@ void setup_ntlmv2_rsp(const struct cifsSesInfo * ses, char * resp_buf)
 	buf->blob_signature = cpu_to_le32(0x00000101);
 	buf->reserved = 0;
 	buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
-	get_random_bytes(&buf->client_chal, sizeof(buf->client_chal)); 
+	get_random_bytes(&buf->client_chal, sizeof(buf->client_chal));
 	buf->reserved2 = 0;
 	buf->names[0].type = 0;
 	buf->names[0].length = 0;
 
 	/* calculate buf->ntlmv2_hash */
-	rc = calc_ntlmv2_hash(ses,buf->ntlmv2_hash);
+	rc = calc_ntlmv2_hash(ses, nls_cp);
 	if(rc)
 		cERROR(1,("could not get v2 hash rc %d",rc));
+	CalcNTLMv2_response(ses, resp_buf);
 }
 
-void CalcNTLMv2_response(const struct cifsSesInfo * ses,char * v2_session_response)
+void CalcNTLMv2_response(const struct cifsSesInfo * ses, char * v2_session_response)
 {
 	struct HMACMD5Context context;
+	/* rest of v2 struct already generated */
 	memcpy(v2_session_response + 8, ses->server->cryptKey,8);
-	/* gen_blob(v2_session_response + 16); */
 	hmac_md5_init_limK_to_64(ses->server->mac_signing_key, 16, &context);
 
-	hmac_md5_update(ses->server->cryptKey,8,&context);
-/*	hmac_md5_update(v2_session_response+16)client thing,8,&context); */ /* BB fix */
+	hmac_md5_update(v2_session_response+8, 
+			sizeof(struct ntlmv2_resp) - 8, &context);
 
 	hmac_md5_final(v2_session_response,&context);
-	cifs_dump_mem("v2_sess_rsp: ", v2_session_response, 32); /* BB removeme BB */
+/*	cifs_dump_mem("v2_sess_rsp: ", v2_session_response, 32); */
 }
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 824afb937a6..7ffd5b0d63c 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -285,9 +285,11 @@ extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
 extern int cifs_verify_signature(struct smb_hdr *, const char * mac_key,
 	__u32 expected_sequence_number);
 extern int cifs_calculate_mac_key(char * key,const char * rn,const char * pass);
-extern int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *, struct nls_table *);
-extern void CalcNTLMv2_response(const struct cifsSesInfo *,char * );
-extern void setup_ntlmv2_rsp(const struct cifsSesInfo *, char *);
+extern int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *, 
+			const struct nls_table *);
+extern void CalcNTLMv2_response(const struct cifsSesInfo *, char * );
+extern void setup_ntlmv2_rsp(struct cifsSesInfo *, char *, 
+			     const struct nls_table *);
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 extern void calc_lanman_hash(struct cifsSesInfo * ses, char * lnm_session_key);
 #endif /* CIFS_WEAK_PW_HASH */
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index c6fd01f55e9..c039b54206a 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -432,7 +432,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 			cpu_to_le16(sizeof(struct ntlmv2_resp));
 
 		/* calculate session key */
-		setup_ntlmv2_rsp(ses, v2_sess_key);
+		setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp);
 		if(first_time) /* should this be moved into common code
 			          with similar ntlmv2 path? */
 		/*   cifs_calculate_ntlmv2_mac_key(ses->server->mac_signing_key,
-- 
cgit v1.2.3


From 3e57ecf640428c01ba1ed8c8fc538447ada1715b Mon Sep 17 00:00:00 2001
From: Olaf Weber <olaf@sgi.com>
Date: Fri, 9 Jun 2006 14:48:12 +1000
Subject: [XFS] Add parameters to xfs_bmapi() and xfs_bunmapi() to have them
 report the range spanned by modifications to the in-core extent map.  Add
 XFS_BUNMAPI() and XFS_SWAP_EXTENTS() macros that call xfs_bunmapi() and
 xfs_swap_extents() via the ioops vector. Change all calls that may modify the
 in-core extent map for the data fork to go through the ioops vector. This
 allows a cache of extent map data to be kept in sync.

SGI-PV: 947615
SGI-Modid: xfs-linux-melb:xfs-kern:209226a

Signed-off-by: Olaf Weber <olaf@sgi.com>
Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/linux-2.6/xfs_lrw.c |   4 +-
 fs/xfs/linux-2.6/xfs_lrw.h |   2 +-
 fs/xfs/quota/xfs_dquot.c   |   4 +-
 fs/xfs/quota/xfs_qm.c      |   2 +-
 fs/xfs/xfs_attr.c          |  12 +-
 fs/xfs/xfs_attr_leaf.c     |   2 +-
 fs/xfs/xfs_bmap.c          | 363 +++++++++++++++++++++++++++++++++++----------
 fs/xfs/xfs_bmap.h          |  22 ++-
 fs/xfs/xfs_da_btree.c      |   9 +-
 fs/xfs/xfs_dfrag.c         |  74 +++++----
 fs/xfs/xfs_dfrag.h         |   3 +
 fs/xfs/xfs_dir2.c          |   7 +-
 fs/xfs/xfs_dir2_leaf.c     |   2 +-
 fs/xfs/xfs_inode.c         |  11 +-
 fs/xfs/xfs_iocore.c        |   3 +
 fs/xfs/xfs_iomap.c         |  24 +--
 fs/xfs/xfs_mount.h         |  23 ++-
 fs/xfs/xfs_rtalloc.c       |   2 +-
 fs/xfs/xfs_vnodeops.c      |  34 ++---
 19 files changed, 437 insertions(+), 166 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 67efe330898..67b5e1c20de 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -458,7 +458,7 @@ xfs_zero_last_block(
 	last_fsb = XFS_B_TO_FSBT(mp, isize);
 	nimaps = 1;
 	error = XFS_BMAPI(mp, NULL, io, last_fsb, 1, 0, NULL, 0, &imap,
-			  &nimaps, NULL);
+			  &nimaps, NULL, NULL);
 	if (error) {
 		return error;
 	}
@@ -556,7 +556,7 @@ xfs_zero_eof(
 		nimaps = 1;
 		zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
 		error = XFS_BMAPI(mp, NULL, io, start_zero_fsb, zero_count_fsb,
-				  0, NULL, 0, &imap, &nimaps, NULL);
+				  0, NULL, 0, &imap, &nimaps, NULL, NULL);
 		if (error) {
 			ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
 			ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
index 8f453995235..dc49050688c 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.h
+++ b/fs/xfs/linux-2.6/xfs_lrw.h
@@ -49,7 +49,7 @@ struct xfs_iomap;
 #define	XFS_CTRUNC4		14
 #define	XFS_CTRUNC5		15
 #define	XFS_CTRUNC6		16
-#define	XFS_BUNMAPI		17
+#define	XFS_BUNMAP		17
 #define	XFS_INVAL_CACHED	18
 #define	XFS_DIORD_ENTER		19
 #define	XFS_DIOWR_ENTER		20
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 772ac48329e..26ee5df4f83 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -444,7 +444,7 @@ xfs_qm_dqalloc(
 			      XFS_BMAPI_METADATA | XFS_BMAPI_WRITE,
 			      &firstblock,
 			      XFS_QM_DQALLOC_SPACE_RES(mp),
-			      &map, &nmaps, &flist))) {
+			      &map, &nmaps, &flist, NULL))) {
 		goto error0;
 	}
 	ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
@@ -559,7 +559,7 @@ xfs_qm_dqtobp(
 		error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset,
 				  XFS_DQUOT_CLUSTER_SIZE_FSB,
 				  XFS_BMAPI_METADATA,
-				  NULL, 0, &map, &nmaps, NULL);
+				  NULL, 0, &map, &nmaps, NULL, NULL);
 
 		xfs_iunlock(quotip, XFS_ILOCK_SHARED);
 		if (error)
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 7fb5eca9bd5..492840b2a35 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -1603,7 +1603,7 @@ xfs_qm_dqiterate(
 				  maxlblkcnt - lblkno,
 				  XFS_BMAPI_METADATA,
 				  NULL,
-				  0, map, &nmaps, NULL);
+				  0, map, &nmaps, NULL, NULL);
 		xfs_iunlock(qip, XFS_ILOCK_SHARED);
 		if (error)
 			break;
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index b6e1e02bbb2..3ef41e75d08 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -1910,7 +1910,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
 		error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno,
 				  args->rmtblkcnt,
 				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
-				  NULL, 0, map, &nmap, NULL);
+				  NULL, 0, map, &nmap, NULL, NULL);
 		if (error)
 			return(error);
 		ASSERT(nmap >= 1);
@@ -1988,7 +1988,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA |
 							XFS_BMAPI_WRITE,
 				  args->firstblock, args->total, &map, &nmap,
-				  args->flist);
+				  args->flist, NULL);
 		if (!error) {
 			error = xfs_bmap_finish(&args->trans, args->flist,
 						*args->firstblock, &committed);
@@ -2039,7 +2039,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 		error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno,
 				  args->rmtblkcnt,
 				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
-				  args->firstblock, 0, &map, &nmap, NULL);
+				  args->firstblock, 0, &map, &nmap,
+				  NULL, NULL);
 		if (error) {
 			return(error);
 		}
@@ -2104,7 +2105,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 					args->rmtblkcnt,
 					XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
 					args->firstblock, 0, &map, &nmap,
-					args->flist);
+					args->flist, NULL);
 		if (error) {
 			return(error);
 		}
@@ -2142,7 +2143,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 		XFS_BMAP_INIT(args->flist, args->firstblock);
 		error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
 				    XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
-				    1, args->firstblock, args->flist, &done);
+				    1, args->firstblock, args->flist,
+				    NULL, &done);
 		if (!error) {
 			error = xfs_bmap_finish(&args->trans, args->flist,
 						*args->firstblock, &committed);
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 9462be86aa1..5c44343d4a3 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -2990,7 +2990,7 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
 		nmap = 1;
 		error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt,
 					XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
-					NULL, 0, &map, &nmap, NULL);
+					NULL, 0, &map, &nmap, NULL, NULL);
 		if (error) {
 			return(error);
 		}
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 26939d364bc..890ad352817 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -101,6 +101,7 @@ xfs_bmap_add_extent(
 	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
 	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
 	int			*logflagsp, /* inode logging flags */
+	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			whichfork, /* data or attr fork */
 	int			rsvd);	/* OK to allocate reserved blocks */
 
@@ -118,6 +119,7 @@ xfs_bmap_add_extent_delay_real(
 	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
 	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
 	int			*logflagsp, /* inode logging flags */
+	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			rsvd);	/* OK to allocate reserved blocks */
 
 /*
@@ -131,6 +133,7 @@ xfs_bmap_add_extent_hole_delay(
 	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp,/* inode logging flags */
+	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			rsvd);	/* OK to allocate reserved blocks */
 
 /*
@@ -144,6 +147,7 @@ xfs_bmap_add_extent_hole_real(
 	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp, /* inode logging flags */
+	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			whichfork); /* data or attr fork */
 
 /*
@@ -156,7 +160,8 @@ xfs_bmap_add_extent_unwritten_real(
 	xfs_extnum_t		idx,	/* extent number to update/insert */
 	xfs_btree_cur_t		**curp,	/* if *curp is null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
-	int			*logflagsp); /* inode logging flags */
+	int			*logflagsp, /* inode logging flags */
+	xfs_extdelta_t		*delta); /* Change made to incore extents */
 
 /*
  * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
@@ -203,6 +208,7 @@ xfs_bmap_del_extent(
 	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp,/* inode logging flags */
+	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			whichfork, /* data or attr fork */
 	int			rsvd);	 /* OK to allocate reserved blocks */
 
@@ -530,6 +536,7 @@ xfs_bmap_add_extent(
 	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
 	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
 	int			*logflagsp, /* inode logging flags */
+	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			whichfork, /* data or attr fork */
 	int			rsvd)	/* OK to use reserved data blocks */
 {
@@ -567,6 +574,15 @@ xfs_bmap_add_extent(
 			logflags = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
 		} else
 			logflags = 0;
+		/* DELTA: single new extent */
+		if (delta) {
+			if (delta->xed_startoff > new->br_startoff)
+				delta->xed_startoff = new->br_startoff;
+			if (delta->xed_blockcount <
+					new->br_startoff + new->br_blockcount)
+				delta->xed_blockcount = new->br_startoff +
+						new->br_blockcount;
+		}
 	}
 	/*
 	 * Any kind of new delayed allocation goes here.
@@ -576,7 +592,7 @@ xfs_bmap_add_extent(
 			ASSERT((cur->bc_private.b.flags &
 				XFS_BTCUR_BPRV_WASDEL) == 0);
 		if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, cur, new,
-				&logflags, rsvd)))
+				&logflags, delta, rsvd)))
 			goto done;
 	}
 	/*
@@ -587,7 +603,7 @@ xfs_bmap_add_extent(
 			ASSERT((cur->bc_private.b.flags &
 				XFS_BTCUR_BPRV_WASDEL) == 0);
 		if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new,
-				&logflags, whichfork)))
+				&logflags, delta, whichfork)))
 			goto done;
 	} else {
 		xfs_bmbt_irec_t	prev;	/* old extent at offset idx */
@@ -612,17 +628,17 @@ xfs_bmap_add_extent(
 						XFS_BTCUR_BPRV_WASDEL);
 				if ((error = xfs_bmap_add_extent_delay_real(ip,
 					idx, &cur, new, &da_new, first, flist,
-					&logflags, rsvd)))
+					&logflags, delta, rsvd)))
 					goto done;
 			} else if (new->br_state == XFS_EXT_NORM) {
 				ASSERT(new->br_state == XFS_EXT_NORM);
 				if ((error = xfs_bmap_add_extent_unwritten_real(
-					ip, idx, &cur, new, &logflags)))
+					ip, idx, &cur, new, &logflags, delta)))
 					goto done;
 			} else {
 				ASSERT(new->br_state == XFS_EXT_UNWRITTEN);
 				if ((error = xfs_bmap_add_extent_unwritten_real(
-					ip, idx, &cur, new, &logflags)))
+					ip, idx, &cur, new, &logflags, delta)))
 					goto done;
 			}
 			ASSERT(*curp == cur || *curp == NULL);
@@ -635,7 +651,7 @@ xfs_bmap_add_extent(
 				ASSERT((cur->bc_private.b.flags &
 					XFS_BTCUR_BPRV_WASDEL) == 0);
 			if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur,
-					new, &logflags, whichfork)))
+					new, &logflags, delta, whichfork)))
 				goto done;
 		}
 	}
@@ -700,6 +716,7 @@ xfs_bmap_add_extent_delay_real(
 	xfs_fsblock_t		*first,	/* pointer to firstblock variable */
 	xfs_bmap_free_t		*flist,	/* list of extents to be freed */
 	int			*logflagsp, /* inode logging flags */
+	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			rsvd)	/* OK to use reserved data block allocation */
 {
 	xfs_btree_cur_t		*cur;	/* btree cursor */
@@ -716,8 +733,8 @@ xfs_bmap_add_extent_delay_real(
 					/* left is 0, right is 1, prev is 2 */
 	int			rval=0;	/* return value (logging flags) */
 	int			state = 0;/* state bits, accessed thru macros */
-	xfs_filblks_t		temp;	/* value for dnew calculations */
-	xfs_filblks_t		temp2;	/* value for dnew calculations */
+	xfs_filblks_t		temp=0;	/* value for dnew calculations */
+	xfs_filblks_t		temp2=0;/* value for dnew calculations */
 	int			tmp_rval;	/* partial logging flags */
 	enum {				/* bit number definitions for state */
 		LEFT_CONTIG,	RIGHT_CONTIG,
@@ -839,6 +856,11 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 		}
 		*dnew = 0;
+		/* DELTA: Three in-core extents are replaced by one. */
+		temp = LEFT.br_startoff;
+		temp2 = LEFT.br_blockcount +
+			PREV.br_blockcount +
+			RIGHT.br_blockcount;
 		break;
 
 	case MASK3(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG):
@@ -872,6 +894,10 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 		}
 		*dnew = 0;
+		/* DELTA: Two in-core extents are replaced by one. */
+		temp = LEFT.br_startoff;
+		temp2 = LEFT.br_blockcount +
+			PREV.br_blockcount;
 		break;
 
 	case MASK3(LEFT_FILLING, RIGHT_FILLING, RIGHT_CONTIG):
@@ -906,6 +932,10 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 		}
 		*dnew = 0;
+		/* DELTA: Two in-core extents are replaced by one. */
+		temp = PREV.br_startoff;
+		temp2 = PREV.br_blockcount +
+			RIGHT.br_blockcount;
 		break;
 
 	case MASK2(LEFT_FILLING, RIGHT_FILLING):
@@ -936,6 +966,9 @@ xfs_bmap_add_extent_delay_real(
 			ASSERT(i == 1);
 		}
 		*dnew = 0;
+		/* DELTA: The in-core extent described by new changed type. */
+		temp = new->br_startoff;
+		temp2 = new->br_blockcount;
 		break;
 
 	case MASK2(LEFT_FILLING, LEFT_CONTIG):
@@ -978,6 +1011,10 @@ xfs_bmap_add_extent_delay_real(
 		xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx,
 			XFS_DATA_FORK);
 		*dnew = temp;
+		/* DELTA: The boundary between two in-core extents moved. */
+		temp = LEFT.br_startoff;
+		temp2 = LEFT.br_blockcount +
+			PREV.br_blockcount;
 		break;
 
 	case MASK(LEFT_FILLING):
@@ -1025,6 +1062,9 @@ xfs_bmap_add_extent_delay_real(
 		xfs_bmap_trace_post_update(fname, "LF", ip, idx + 1,
 			XFS_DATA_FORK);
 		*dnew = temp;
+		/* DELTA: One in-core extent is split in two. */
+		temp = PREV.br_startoff;
+		temp2 = PREV.br_blockcount;
 		break;
 
 	case MASK2(RIGHT_FILLING, RIGHT_CONTIG):
@@ -1067,6 +1107,10 @@ xfs_bmap_add_extent_delay_real(
 		xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx,
 			XFS_DATA_FORK);
 		*dnew = temp;
+		/* DELTA: The boundary between two in-core extents moved. */
+		temp = PREV.br_startoff;
+		temp2 = PREV.br_blockcount +
+			RIGHT.br_blockcount;
 		break;
 
 	case MASK(RIGHT_FILLING):
@@ -1112,6 +1156,9 @@ xfs_bmap_add_extent_delay_real(
 		xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
 		xfs_bmap_trace_post_update(fname, "RF", ip, idx, XFS_DATA_FORK);
 		*dnew = temp;
+		/* DELTA: One in-core extent is split in two. */
+		temp = PREV.br_startoff;
+		temp2 = PREV.br_blockcount;
 		break;
 
 	case 0:
@@ -1194,6 +1241,9 @@ xfs_bmap_add_extent_delay_real(
 		xfs_bmap_trace_post_update(fname, "0", ip, idx + 2,
 			XFS_DATA_FORK);
 		*dnew = temp + temp2;
+		/* DELTA: One in-core extent is split in three. */
+		temp = PREV.br_startoff;
+		temp2 = PREV.br_blockcount;
 		break;
 
 	case MASK3(LEFT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
@@ -1209,6 +1259,13 @@ xfs_bmap_add_extent_delay_real(
 		ASSERT(0);
 	}
 	*curp = cur;
+	if (delta) {
+		temp2 += temp;
+		if (delta->xed_startoff > temp)
+			delta->xed_startoff = temp;
+		if (delta->xed_blockcount < temp2)
+			delta->xed_blockcount = temp2;
+	}
 done:
 	*logflagsp = rval;
 	return error;
@@ -1235,7 +1292,8 @@ xfs_bmap_add_extent_unwritten_real(
 	xfs_extnum_t		idx,	/* extent number to update/insert */
 	xfs_btree_cur_t		**curp,	/* if *curp is null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
-	int			*logflagsp) /* inode logging flags */
+	int			*logflagsp, /* inode logging flags */
+	xfs_extdelta_t		*delta) /* Change made to incore extents */
 {
 	xfs_btree_cur_t		*cur;	/* btree cursor */
 	xfs_bmbt_rec_t		*ep;	/* extent entry for idx */
@@ -1252,6 +1310,8 @@ xfs_bmap_add_extent_unwritten_real(
 					/* left is 0, right is 1, prev is 2 */
 	int			rval=0;	/* return value (logging flags) */
 	int			state = 0;/* state bits, accessed thru macros */
+	xfs_filblks_t		temp=0;
+	xfs_filblks_t		temp2=0;
 	enum {				/* bit number definitions for state */
 		LEFT_CONTIG,	RIGHT_CONTIG,
 		LEFT_FILLING,	RIGHT_FILLING,
@@ -1380,6 +1440,11 @@ xfs_bmap_add_extent_unwritten_real(
 				RIGHT.br_blockcount, LEFT.br_state)))
 				goto done;
 		}
+		/* DELTA: Three in-core extents are replaced by one. */
+		temp = LEFT.br_startoff;
+		temp2 = LEFT.br_blockcount +
+			PREV.br_blockcount +
+			RIGHT.br_blockcount;
 		break;
 
 	case MASK3(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG):
@@ -1419,6 +1484,10 @@ xfs_bmap_add_extent_unwritten_real(
 				LEFT.br_state)))
 				goto done;
 		}
+		/* DELTA: Two in-core extents are replaced by one. */
+		temp = LEFT.br_startoff;
+		temp2 = LEFT.br_blockcount +
+			PREV.br_blockcount;
 		break;
 
 	case MASK3(LEFT_FILLING, RIGHT_FILLING, RIGHT_CONTIG):
@@ -1459,6 +1528,10 @@ xfs_bmap_add_extent_unwritten_real(
 				newext)))
 				goto done;
 		}
+		/* DELTA: Two in-core extents are replaced by one. */
+		temp = PREV.br_startoff;
+		temp2 = PREV.br_blockcount +
+			RIGHT.br_blockcount;
 		break;
 
 	case MASK2(LEFT_FILLING, RIGHT_FILLING):
@@ -1487,6 +1560,9 @@ xfs_bmap_add_extent_unwritten_real(
 				newext)))
 				goto done;
 		}
+		/* DELTA: The in-core extent described by new changed type. */
+		temp = new->br_startoff;
+		temp2 = new->br_blockcount;
 		break;
 
 	case MASK2(LEFT_FILLING, LEFT_CONTIG):
@@ -1534,6 +1610,10 @@ xfs_bmap_add_extent_unwritten_real(
 				LEFT.br_state))
 				goto done;
 		}
+		/* DELTA: The boundary between two in-core extents moved. */
+		temp = LEFT.br_startoff;
+		temp2 = LEFT.br_blockcount +
+			PREV.br_blockcount;
 		break;
 
 	case MASK(LEFT_FILLING):
@@ -1574,6 +1654,9 @@ xfs_bmap_add_extent_unwritten_real(
 				goto done;
 			ASSERT(i == 1);
 		}
+		/* DELTA: One in-core extent is split in two. */
+		temp = PREV.br_startoff;
+		temp2 = PREV.br_blockcount;
 		break;
 
 	case MASK2(RIGHT_FILLING, RIGHT_CONTIG):
@@ -1617,6 +1700,10 @@ xfs_bmap_add_extent_unwritten_real(
 				newext)))
 				goto done;
 		}
+		/* DELTA: The boundary between two in-core extents moved. */
+		temp = PREV.br_startoff;
+		temp2 = PREV.br_blockcount +
+			RIGHT.br_blockcount;
 		break;
 
 	case MASK(RIGHT_FILLING):
@@ -1657,6 +1744,9 @@ xfs_bmap_add_extent_unwritten_real(
 				goto done;
 			ASSERT(i == 1);
 		}
+		/* DELTA: One in-core extent is split in two. */
+		temp = PREV.br_startoff;
+		temp2 = PREV.br_blockcount;
 		break;
 
 	case 0:
@@ -1710,6 +1800,9 @@ xfs_bmap_add_extent_unwritten_real(
 				goto done;
 			ASSERT(i == 1);
 		}
+		/* DELTA: One in-core extent is split in three. */
+		temp = PREV.br_startoff;
+		temp2 = PREV.br_blockcount;
 		break;
 
 	case MASK3(LEFT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
@@ -1725,6 +1818,13 @@ xfs_bmap_add_extent_unwritten_real(
 		ASSERT(0);
 	}
 	*curp = cur;
+	if (delta) {
+		temp2 += temp;
+		if (delta->xed_startoff > temp)
+			delta->xed_startoff = temp;
+		if (delta->xed_blockcount < temp2)
+			delta->xed_blockcount = temp2;
+	}
 done:
 	*logflagsp = rval;
 	return error;
@@ -1753,6 +1853,7 @@ xfs_bmap_add_extent_hole_delay(
 	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp, /* inode logging flags */
+	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			rsvd)		/* OK to allocate reserved blocks */
 {
 	xfs_bmbt_rec_t		*ep;	/* extent record for idx */
@@ -1765,7 +1866,8 @@ xfs_bmap_add_extent_hole_delay(
 	xfs_filblks_t		oldlen=0;	/* old indirect size */
 	xfs_bmbt_irec_t		right;	/* right neighbor extent entry */
 	int			state;  /* state bits, accessed thru macros */
-	xfs_filblks_t		temp;	/* temp for indirect calculations */
+	xfs_filblks_t		temp=0;	/* temp for indirect calculations */
+	xfs_filblks_t		temp2=0;
 	enum {				/* bit number definitions for state */
 		LEFT_CONTIG,	RIGHT_CONTIG,
 		LEFT_DELAY,	RIGHT_DELAY,
@@ -1844,6 +1946,9 @@ xfs_bmap_add_extent_hole_delay(
 			XFS_DATA_FORK);
 		xfs_iext_remove(ifp, idx, 1);
 		ip->i_df.if_lastex = idx - 1;
+		/* DELTA: Two in-core extents were replaced by one. */
+		temp2 = temp;
+		temp = left.br_startoff;
 		break;
 
 	case MASK(LEFT_CONTIG):
@@ -1864,6 +1969,9 @@ xfs_bmap_add_extent_hole_delay(
 		xfs_bmap_trace_post_update(fname, "LC", ip, idx - 1,
 			XFS_DATA_FORK);
 		ip->i_df.if_lastex = idx - 1;
+		/* DELTA: One in-core extent grew into a hole. */
+		temp2 = temp;
+		temp = left.br_startoff;
 		break;
 
 	case MASK(RIGHT_CONTIG):
@@ -1881,6 +1989,9 @@ xfs_bmap_add_extent_hole_delay(
 			NULLSTARTBLOCK((int)newlen), temp, right.br_state);
 		xfs_bmap_trace_post_update(fname, "RC", ip, idx, XFS_DATA_FORK);
 		ip->i_df.if_lastex = idx;
+		/* DELTA: One in-core extent grew into a hole. */
+		temp2 = temp;
+		temp = new->br_startoff;
 		break;
 
 	case 0:
@@ -1894,6 +2005,9 @@ xfs_bmap_add_extent_hole_delay(
 			XFS_DATA_FORK);
 		xfs_iext_insert(ifp, idx, 1, new);
 		ip->i_df.if_lastex = idx;
+		/* DELTA: A new in-core extent was added in a hole. */
+		temp2 = new->br_blockcount;
+		temp = new->br_startoff;
 		break;
 	}
 	if (oldlen != newlen) {
@@ -1904,6 +2018,13 @@ xfs_bmap_add_extent_hole_delay(
 		 * Nothing to do for disk quota accounting here.
 		 */
 	}
+	if (delta) {
+		temp2 += temp;
+		if (delta->xed_startoff > temp)
+			delta->xed_startoff = temp;
+		if (delta->xed_blockcount < temp2)
+			delta->xed_blockcount = temp2;
+	}
 	*logflagsp = 0;
 	return 0;
 #undef	MASK
@@ -1925,6 +2046,7 @@ xfs_bmap_add_extent_hole_real(
 	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
 	int			*logflagsp, /* inode logging flags */
+	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			whichfork) /* data or attr fork */
 {
 	xfs_bmbt_rec_t		*ep;	/* pointer to extent entry ins. point */
@@ -1936,7 +2058,10 @@ xfs_bmap_add_extent_hole_real(
 	xfs_ifork_t		*ifp;	/* inode fork pointer */
 	xfs_bmbt_irec_t		left;	/* left neighbor extent entry */
 	xfs_bmbt_irec_t		right;	/* right neighbor extent entry */
+	int			rval=0;	/* return value (logging flags) */
 	int			state;	/* state bits, accessed thru macros */
+	xfs_filblks_t		temp=0;
+	xfs_filblks_t		temp2=0;
 	enum {				/* bit number definitions for state */
 		LEFT_CONTIG,	RIGHT_CONTIG,
 		LEFT_DELAY,	RIGHT_DELAY,
@@ -1993,6 +2118,7 @@ xfs_bmap_add_extent_hole_real(
 		 left.br_blockcount + new->br_blockcount +
 		     right.br_blockcount <= MAXEXTLEN));
 
+	error = 0;
 	/*
 	 * Select which case we're in here, and implement it.
 	 */
@@ -2018,25 +2144,35 @@ xfs_bmap_add_extent_hole_real(
 		XFS_IFORK_NEXT_SET(ip, whichfork,
 			XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
 		if (cur == NULL) {
-			*logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
-			return 0;
+			rval = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
+		} else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur,
+					right.br_startoff,
+					right.br_startblock,
+					right.br_blockcount, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_delete(cur, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur, left.br_startoff,
+					left.br_startblock,
+					left.br_blockcount +
+						new->br_blockcount +
+						right.br_blockcount,
+					left.br_state)))
+				goto done;
 		}
-		*logflagsp = XFS_ILOG_CORE;
-		if ((error = xfs_bmbt_lookup_eq(cur, right.br_startoff,
-				right.br_startblock, right.br_blockcount, &i)))
-			return error;
-		ASSERT(i == 1);
-		if ((error = xfs_bmbt_delete(cur, &i)))
-			return error;
-		ASSERT(i == 1);
-		if ((error = xfs_bmbt_decrement(cur, 0, &i)))
-			return error;
-		ASSERT(i == 1);
-		error = xfs_bmbt_update(cur, left.br_startoff,
-				left.br_startblock,
-				left.br_blockcount + new->br_blockcount +
-				right.br_blockcount, left.br_state);
-		return error;
+		/* DELTA: Two in-core extents were replaced by one. */
+		temp = left.br_startoff;
+		temp2 = left.br_blockcount +
+			new->br_blockcount +
+			right.br_blockcount;
+		break;
 
 	case MASK(LEFT_CONTIG):
 		/*
@@ -2050,19 +2186,27 @@ xfs_bmap_add_extent_hole_real(
 		xfs_bmap_trace_post_update(fname, "LC", ip, idx - 1, whichfork);
 		ifp->if_lastex = idx - 1;
 		if (cur == NULL) {
-			*logflagsp = XFS_ILOG_FEXT(whichfork);
-			return 0;
+			rval = XFS_ILOG_FEXT(whichfork);
+		} else {
+			rval = 0;
+			if ((error = xfs_bmbt_lookup_eq(cur,
+					left.br_startoff,
+					left.br_startblock,
+					left.br_blockcount, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur, left.br_startoff,
+					left.br_startblock,
+					left.br_blockcount +
+						new->br_blockcount,
+					left.br_state)))
+				goto done;
 		}
-		*logflagsp = 0;
-		if ((error = xfs_bmbt_lookup_eq(cur, left.br_startoff,
-				left.br_startblock, left.br_blockcount, &i)))
-			return error;
-		ASSERT(i == 1);
-		error = xfs_bmbt_update(cur, left.br_startoff,
-				left.br_startblock,
-				left.br_blockcount + new->br_blockcount,
-				left.br_state);
-		return error;
+		/* DELTA: One in-core extent grew. */
+		temp = left.br_startoff;
+		temp2 = left.br_blockcount +
+			new->br_blockcount;
+		break;
 
 	case MASK(RIGHT_CONTIG):
 		/*
@@ -2077,19 +2221,27 @@ xfs_bmap_add_extent_hole_real(
 		xfs_bmap_trace_post_update(fname, "RC", ip, idx, whichfork);
 		ifp->if_lastex = idx;
 		if (cur == NULL) {
-			*logflagsp = XFS_ILOG_FEXT(whichfork);
-			return 0;
+			rval = XFS_ILOG_FEXT(whichfork);
+		} else {
+			rval = 0;
+			if ((error = xfs_bmbt_lookup_eq(cur,
+					right.br_startoff,
+					right.br_startblock,
+					right.br_blockcount, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur, new->br_startoff,
+					new->br_startblock,
+					new->br_blockcount +
+						right.br_blockcount,
+					right.br_state)))
+				goto done;
 		}
-		*logflagsp = 0;
-		if ((error = xfs_bmbt_lookup_eq(cur, right.br_startoff,
-				right.br_startblock, right.br_blockcount, &i)))
-			return error;
-		ASSERT(i == 1);
-		error = xfs_bmbt_update(cur, new->br_startoff,
-				new->br_startblock,
-				new->br_blockcount + right.br_blockcount,
-				right.br_state);
-		return error;
+		/* DELTA: One in-core extent grew. */
+		temp = new->br_startoff;
+		temp2 = new->br_blockcount +
+			right.br_blockcount;
+		break;
 
 	case 0:
 		/*
@@ -2104,29 +2256,41 @@ xfs_bmap_add_extent_hole_real(
 		XFS_IFORK_NEXT_SET(ip, whichfork,
 			XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
 		if (cur == NULL) {
-			*logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
-			return 0;
+			rval = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
+		} else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur,
+					new->br_startoff,
+					new->br_startblock,
+					new->br_blockcount, &i)))
+				goto done;
+			ASSERT(i == 0);
+			cur->bc_rec.b.br_state = new->br_state;
+			if ((error = xfs_bmbt_insert(cur, &i)))
+				goto done;
+			ASSERT(i == 1);
 		}
-		*logflagsp = XFS_ILOG_CORE;
-		if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
-				new->br_startblock, new->br_blockcount, &i)))
-			return error;
-		ASSERT(i == 0);
-		cur->bc_rec.b.br_state = new->br_state;
-		if ((error = xfs_bmbt_insert(cur, &i)))
-			return error;
-		ASSERT(i == 1);
-		return 0;
+		/* DELTA: A new extent was added in a hole. */
+		temp = new->br_startoff;
+		temp2 = new->br_blockcount;
+		break;
 	}
+	if (delta) {
+		temp2 += temp;
+		if (delta->xed_startoff > temp)
+			delta->xed_startoff = temp;
+		if (delta->xed_blockcount < temp2)
+			delta->xed_blockcount = temp2;
+	}
+done:
+	*logflagsp = rval;
+	return error;
 #undef	MASK
 #undef	MASK2
 #undef	STATE_SET
 #undef	STATE_TEST
 #undef	STATE_SET_TEST
 #undef	SWITCH_STATE
-	/* NOTREACHED */
-	ASSERT(0);
-	return 0; /* keep gcc quite */
 }
 
 /*
@@ -2885,6 +3049,7 @@ xfs_bmap_del_extent(
 	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*del,	/* data to remove from extents */
 	int			*logflagsp, /* inode logging flags */
+	xfs_extdelta_t		*delta, /* Change made to incore extents */
 	int			whichfork, /* data or attr fork */
 	int			rsvd)	/* OK to allocate reserved blocks */
 {
@@ -3193,6 +3358,14 @@ xfs_bmap_del_extent(
 	if (da_old > da_new)
 		xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int)(da_old - da_new),
 			rsvd);
+	if (delta) {
+		/* DELTA: report the original extent. */
+		if (delta->xed_startoff > got.br_startoff)
+			delta->xed_startoff = got.br_startoff;
+		if (delta->xed_blockcount < got.br_startoff+got.br_blockcount)
+			delta->xed_blockcount = got.br_startoff +
+							got.br_blockcount;
+	}
 done:
 	*logflagsp = flags;
 	return error;
@@ -3753,7 +3926,7 @@ xfs_bunmap_trace(
 	if (ip->i_rwtrace == NULL)
 		return;
 	ktrace_enter(ip->i_rwtrace,
-		(void *)(__psint_t)XFS_BUNMAPI,
+		(void *)(__psint_t)XFS_BUNMAP,
 		(void *)ip,
 		(void *)(__psint_t)((ip->i_d.di_size >> 32) & 0xffffffff),
 		(void *)(__psint_t)(ip->i_d.di_size & 0xffffffff),
@@ -4538,7 +4711,8 @@ xfs_bmapi(
 	xfs_extlen_t	total,		/* total blocks needed */
 	xfs_bmbt_irec_t	*mval,		/* output: map values */
 	int		*nmap,		/* i/o: mval size/count */
-	xfs_bmap_free_t	*flist)		/* i/o: list extents to free */
+	xfs_bmap_free_t	*flist,		/* i/o: list extents to free */
+	xfs_extdelta_t	*delta)		/* o: change made to incore extents */
 {
 	xfs_fsblock_t	abno;		/* allocated block number */
 	xfs_extlen_t	alen;		/* allocated extent length */
@@ -4650,6 +4824,10 @@ xfs_bmapi(
 	end = bno + len;
 	obno = bno;
 	bma.ip = NULL;
+	if (delta) {
+		delta->xed_startoff = NULLFILEOFF;
+		delta->xed_blockcount = 0;
+	}
 	while (bno < end && n < *nmap) {
 		/*
 		 * Reading past eof, act as though there's a hole
@@ -4886,8 +5064,8 @@ xfs_bmapi(
 					got.br_state = XFS_EXT_UNWRITTEN;
 			}
 			error = xfs_bmap_add_extent(ip, lastx, &cur, &got,
-				firstblock, flist, &tmp_logflags, whichfork,
-				(flags & XFS_BMAPI_RSVBLOCKS));
+				firstblock, flist, &tmp_logflags, delta,
+				whichfork, (flags & XFS_BMAPI_RSVBLOCKS));
 			logflags |= tmp_logflags;
 			if (error)
 				goto error0;
@@ -4983,8 +5161,8 @@ xfs_bmapi(
 			}
 			mval->br_state = XFS_EXT_NORM;
 			error = xfs_bmap_add_extent(ip, lastx, &cur, mval,
-				firstblock, flist, &tmp_logflags, whichfork,
-				(flags & XFS_BMAPI_RSVBLOCKS));
+				firstblock, flist, &tmp_logflags, delta,
+				whichfork, (flags & XFS_BMAPI_RSVBLOCKS));
 			logflags |= tmp_logflags;
 			if (error)
 				goto error0;
@@ -5073,7 +5251,14 @@ xfs_bmapi(
 	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
 	       XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max);
 	error = 0;
-
+	if (delta && delta->xed_startoff != NULLFILEOFF) {
+		/* A change was actually made.
+		 * Note that delta->xed_blockount is an offset at this
+		 * point and needs to be converted to a block count.
+		 */
+		ASSERT(delta->xed_blockcount > delta->xed_startoff);
+		delta->xed_blockcount -= delta->xed_startoff;
+	}
 error0:
 	/*
 	 * Log everything.  Do this after conversion, there's no point in
@@ -5185,6 +5370,8 @@ xfs_bunmapi(
 	xfs_fsblock_t		*firstblock,	/* first allocated block
 						   controls a.g. for allocs */
 	xfs_bmap_free_t		*flist,		/* i/o: list extents to free */
+	xfs_extdelta_t		*delta,		/* o: change made to incore
+						   extents */
 	int			*done)		/* set if not done yet */
 {
 	xfs_btree_cur_t		*cur;		/* bmap btree cursor */
@@ -5242,6 +5429,10 @@ xfs_bunmapi(
 	bno = start + len - 1;
 	ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
 		&prev);
+	if (delta) {
+		delta->xed_startoff = NULLFILEOFF;
+		delta->xed_blockcount = 0;
+	}
 	/*
 	 * Check to see if the given block number is past the end of the
 	 * file, back up to the last block if so...
@@ -5340,7 +5531,8 @@ xfs_bunmapi(
 			}
 			del.br_state = XFS_EXT_UNWRITTEN;
 			error = xfs_bmap_add_extent(ip, lastx, &cur, &del,
-				firstblock, flist, &logflags, XFS_DATA_FORK, 0);
+				firstblock, flist, &logflags, delta,
+				XFS_DATA_FORK, 0);
 			if (error)
 				goto error0;
 			goto nodelete;
@@ -5394,7 +5586,7 @@ xfs_bunmapi(
 				prev.br_state = XFS_EXT_UNWRITTEN;
 				error = xfs_bmap_add_extent(ip, lastx - 1, &cur,
 					&prev, firstblock, flist, &logflags,
-					XFS_DATA_FORK, 0);
+					delta, XFS_DATA_FORK, 0);
 				if (error)
 					goto error0;
 				goto nodelete;
@@ -5403,7 +5595,7 @@ xfs_bunmapi(
 				del.br_state = XFS_EXT_UNWRITTEN;
 				error = xfs_bmap_add_extent(ip, lastx, &cur,
 					&del, firstblock, flist, &logflags,
-					XFS_DATA_FORK, 0);
+					delta, XFS_DATA_FORK, 0);
 				if (error)
 					goto error0;
 				goto nodelete;
@@ -5456,7 +5648,7 @@ xfs_bunmapi(
 			goto error0;
 		}
 		error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del,
-			&tmp_logflags, whichfork, rsvd);
+				&tmp_logflags, delta, whichfork, rsvd);
 		logflags |= tmp_logflags;
 		if (error)
 			goto error0;
@@ -5513,6 +5705,14 @@ nodelete:
 	ASSERT(ifp->if_ext_max ==
 	       XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
 	error = 0;
+	if (delta && delta->xed_startoff != NULLFILEOFF) {
+		/* A change was actually made.
+		 * Note that delta->xed_blockount is an offset at this
+		 * point and needs to be converted to a block count.
+		 */
+		ASSERT(delta->xed_blockcount > delta->xed_startoff);
+		delta->xed_blockcount -= delta->xed_startoff;
+	}
 error0:
 	/*
 	 * Log everything.  Do this after conversion, there's no point in
@@ -5689,7 +5889,8 @@ xfs_getbmap(
 		nmap = (nexleft > subnex) ? subnex : nexleft;
 		error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
 				  XFS_BB_TO_FSB(mp, bmv->bmv_length),
-				  bmapi_flags, NULL, 0, map, &nmap, NULL);
+				  bmapi_flags, NULL, 0, map, &nmap,
+				  NULL, NULL);
 		if (error)
 			goto unlock_and_return;
 		ASSERT(nmap <= subnex);
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 8e0d73d9ccc..80e93409b78 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -25,6 +25,20 @@ struct xfs_inode;
 struct xfs_mount;
 struct xfs_trans;
 
+/*
+ * DELTA: describe a change to the in-core extent list.
+ *
+ * Internally the use of xed_blockount is somewhat funky.
+ * xed_blockcount contains an offset much of the time because this
+ * makes merging changes easier.  (xfs_fileoff_t and xfs_filblks_t are
+ * the same underlying type).
+ */
+typedef struct xfs_extdelta
+{
+	xfs_fileoff_t		xed_startoff;	/* offset of range */
+	xfs_filblks_t		xed_blockcount;	/* blocks in range */
+} xfs_extdelta_t;
+
 /*
  * List of extents to be free "later".
  * The list is kept sorted on xbf_startblock.
@@ -275,7 +289,9 @@ xfs_bmapi(
 	xfs_extlen_t		total,		/* total blocks needed */
 	struct xfs_bmbt_irec	*mval,		/* output: map values */
 	int			*nmap,		/* i/o: mval size/count */
-	xfs_bmap_free_t		*flist);	/* i/o: list extents to free */
+	xfs_bmap_free_t		*flist,		/* i/o: list extents to free */
+	xfs_extdelta_t		*delta);	/* o: change made to incore
+						   extents */
 
 /*
  * Map file blocks to filesystem blocks, simple version.
@@ -309,6 +325,8 @@ xfs_bunmapi(
 	xfs_fsblock_t		*firstblock,	/* first allocated block
 						   controls a.g. for allocs */
 	xfs_bmap_free_t		*flist,		/* i/o: list extents to free */
+	xfs_extdelta_t		*delta,		/* o: change made to incore
+						   extents */
 	int			*done);		/* set if not done yet */
 
 /*
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 8988b905117..47b86f9f69f 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1655,7 +1655,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
 			XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|
 			XFS_BMAPI_CONTIG,
 			args->firstblock, args->total, &map, &nmap,
-			args->flist))) {
+			args->flist, NULL))) {
 		return error;
 	}
 	ASSERT(nmap <= 1);
@@ -1676,7 +1676,8 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
 					XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|
 					XFS_BMAPI_METADATA,
 					args->firstblock, args->total,
-					&mapp[mapi], &nmap, args->flist))) {
+					&mapp[mapi], &nmap, args->flist,
+					NULL))) {
 				kmem_free(mapp, sizeof(*mapp) * count);
 				return error;
 			}
@@ -1961,7 +1962,7 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
 		 */
 		if ((error = xfs_bunmapi(tp, dp, dead_blkno, count,
 				XFS_BMAPI_AFLAG(w)|XFS_BMAPI_METADATA,
-				0, args->firstblock, args->flist,
+				0, args->firstblock, args->flist, NULL,
 				&done)) == ENOSPC) {
 			if (w != XFS_DATA_FORK)
 				goto done;
@@ -2086,7 +2087,7 @@ xfs_da_do_buf(
 					nfsb,
 					XFS_BMAPI_METADATA |
 						XFS_BMAPI_AFLAG(whichfork),
-					NULL, 0, mapp, &nmap, NULL)))
+					NULL, 0, mapp, &nmap, NULL, NULL)))
 				goto exit0;
 		}
 	} else {
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 4968a6358e6..99daf8c0f90 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -54,24 +54,14 @@ xfs_swapext(
 	xfs_swapext_t	__user *sxu)
 {
 	xfs_swapext_t	*sxp;
-	xfs_inode_t     *ip=NULL, *tip=NULL, *ips[2];
-	xfs_trans_t     *tp;
+	xfs_inode_t     *ip=NULL, *tip=NULL;
 	xfs_mount_t     *mp;
-	xfs_bstat_t	*sbp;
 	struct file	*fp = NULL, *tfp = NULL;
 	vnode_t		*vp, *tvp;
-	static uint	lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL;
-	int		ilf_fields, tilf_fields;
 	int		error = 0;
-	xfs_ifork_t	*tempifp, *ifp, *tifp;
-	__uint64_t	tmp;
-	int		aforkblks = 0;
-	int		taforkblks = 0;
-	char		locked = 0;
 
 	sxp = kmem_alloc(sizeof(xfs_swapext_t), KM_MAYFAIL);
-	tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
-	if (!sxp || !tempifp) {
+	if (!sxp) {
 		error = XFS_ERROR(ENOMEM);
 		goto error0;
 	}
@@ -118,14 +108,56 @@ xfs_swapext(
 
 	mp = ip->i_mount;
 
-	sbp = &sxp->sx_stat;
-
 	if (XFS_FORCED_SHUTDOWN(mp)) {
 		error =  XFS_ERROR(EIO);
 		goto error0;
 	}
 
-	locked = 1;
+	error = XFS_SWAP_EXTENTS(mp, &ip->i_iocore, &tip->i_iocore, sxp);
+
+ error0:
+	if (fp != NULL)
+		fput(fp);
+	if (tfp != NULL)
+		fput(tfp);
+
+	if (sxp != NULL)
+		kmem_free(sxp, sizeof(xfs_swapext_t));
+
+	return error;
+}
+
+int
+xfs_swap_extents(
+	xfs_inode_t	*ip,
+	xfs_inode_t	*tip,
+	xfs_swapext_t	*sxp)
+{
+	xfs_mount_t	*mp;
+	xfs_inode_t	*ips[2];
+	xfs_trans_t	*tp;
+	xfs_bstat_t	*sbp = &sxp->sx_stat;
+	vnode_t		*vp, *tvp;
+	xfs_ifork_t	*tempifp, *ifp, *tifp;
+	int		ilf_fields, tilf_fields;
+	static uint	lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL;
+	int		error = 0;
+	int		aforkblks = 0;
+	int		taforkblks = 0;
+	__uint64_t	tmp;
+	char		locked = 0;
+
+	mp = ip->i_mount;
+
+	tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
+	if (!tempifp) {
+		error = XFS_ERROR(ENOMEM);
+		goto error0;
+	}
+
+	sbp = &sxp->sx_stat;
+	vp = XFS_ITOV(ip);
+	tvp = XFS_ITOV(tip);
 
 	/* Lock in i_ino order */
 	if (ip->i_ino < tip->i_ino) {
@@ -137,6 +169,7 @@ xfs_swapext(
 	}
 
 	xfs_lock_inodes(ips, 2, 0, lock_flags);
+	locked = 1;
 
 	/* Check permissions */
 	error = xfs_iaccess(ip, S_IWUSR, NULL);
@@ -360,16 +393,7 @@ xfs_swapext(
 		xfs_iunlock(ip,  lock_flags);
 		xfs_iunlock(tip, lock_flags);
 	}
-
-	if (fp != NULL)
-		fput(fp);
-	if (tfp != NULL)
-		fput(tfp);
-
-	if (sxp != NULL)
-		kmem_free(sxp, sizeof(xfs_swapext_t));
 	if (tempifp != NULL)
 		kmem_free(tempifp, sizeof(xfs_ifork_t));
-
 	return error;
 }
diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h
index f678559abc4..da178205be6 100644
--- a/fs/xfs/xfs_dfrag.h
+++ b/fs/xfs/xfs_dfrag.h
@@ -48,6 +48,9 @@ typedef struct xfs_swapext
  */
 int	xfs_swapext(struct xfs_swapext __user *sx);
 
+int	xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
+		struct xfs_swapext *sxp);
+
 #endif	/* __KERNEL__ */
 
 #endif	/* __XFS_DFRAG_H__ */
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 022c8398ab6..80238a2263f 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -549,7 +549,7 @@ xfs_dir2_grow_inode(
 	if ((error = xfs_bmapi(tp, dp, bno, count,
 			XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
 			args->firstblock, args->total, &map, &nmap,
-			args->flist))) {
+			args->flist, NULL))) {
 		return error;
 	}
 	ASSERT(nmap <= 1);
@@ -585,7 +585,8 @@ xfs_dir2_grow_inode(
 			if ((error = xfs_bmapi(tp, dp, b, c,
 					XFS_BMAPI_WRITE|XFS_BMAPI_METADATA,
 					args->firstblock, args->total,
-					&mapp[mapi], &nmap, args->flist))) {
+					&mapp[mapi], &nmap, args->flist,
+					NULL))) {
 				kmem_free(mapp, sizeof(*mapp) * count);
 				return error;
 			}
@@ -786,7 +787,7 @@ xfs_dir2_shrink_inode(
 	 */
 	if ((error = xfs_bunmapi(tp, dp, da, mp->m_dirblkfsbs,
 			XFS_BMAPI_METADATA, 0, args->firstblock, args->flist,
-			&done))) {
+			NULL, &done))) {
 		/*
 		 * ENOSPC actually can happen if we're in a removename with
 		 * no space reservation, and the resulting block removal
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 0f5e2f2ce6e..5fe88d546c7 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -884,7 +884,7 @@ xfs_dir2_leaf_getdents(
 					XFS_DIR2_BYTE_TO_DA(mp,
 						XFS_DIR2_LEAF_OFFSET) - map_off,
 					XFS_BMAPI_METADATA, NULL, 0,
-					&map[map_valid], &nmap, NULL);
+					&map[map_valid], &nmap, NULL, NULL);
 				/*
 				 * Don't know if we should ignore this or
 				 * try to return an error.
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 94b60dd0380..020de5637e0 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -1285,7 +1285,7 @@ xfs_isize_check(
 				       (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) -
 			  map_first),
 			 XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps,
-			 NULL))
+			 NULL, NULL))
 	    return;
 	ASSERT(nimaps == 1);
 	ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
@@ -1666,12 +1666,13 @@ xfs_itruncate_finish(
 		 * runs.
 		 */
 		XFS_BMAP_INIT(&free_list, &first_block);
-		error = xfs_bunmapi(ntp, ip, first_unmap_block,
-				    unmap_len,
+		error = XFS_BUNMAPI(mp, ntp, &ip->i_iocore,
+				    first_unmap_block, unmap_len,
 				    XFS_BMAPI_AFLAG(fork) |
 				      (sync ? 0 : XFS_BMAPI_ASYNC),
 				    XFS_ITRUNC_MAX_EXTENTS,
-				    &first_block, &free_list, &done);
+				    &first_block, &free_list,
+				    NULL, &done);
 		if (error) {
 			/*
 			 * If the bunmapi call encounters an error,
diff --git a/fs/xfs/xfs_iocore.c b/fs/xfs/xfs_iocore.c
index a07815661a8..d3da194f8b7 100644
--- a/fs/xfs/xfs_iocore.c
+++ b/fs/xfs/xfs_iocore.c
@@ -26,6 +26,7 @@
 #include "xfs_ag.h"
 #include "xfs_dir.h"
 #include "xfs_dir2.h"
+#include "xfs_dfrag.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
@@ -68,6 +69,7 @@ xfs_ioinit(
 xfs_ioops_t	xfs_iocore_xfs = {
 	.xfs_ioinit		= (xfs_ioinit_t) xfs_ioinit,
 	.xfs_bmapi_func		= (xfs_bmapi_t) xfs_bmapi,
+	.xfs_bunmapi_func	= (xfs_bunmapi_t) xfs_bunmapi,
 	.xfs_bmap_eof_func	= (xfs_bmap_eof_t) xfs_bmap_eof,
 	.xfs_iomap_write_direct =
 			(xfs_iomap_write_direct_t) xfs_iomap_write_direct,
@@ -84,6 +86,7 @@ xfs_ioops_t	xfs_iocore_xfs = {
 	.xfs_unlock		= (xfs_unlk_t) xfs_iunlock,
 	.xfs_size_func		= (xfs_size_t) xfs_size_fn,
 	.xfs_iodone		= (xfs_iodone_t) fs_noerr,
+	.xfs_swap_extents_func	= (xfs_swap_extents_t) xfs_swap_extents,
 };
 
 void
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index d5dfedcb892..d79055207fb 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -252,7 +252,7 @@ xfs_iomap(
 	error = XFS_BMAPI(mp, NULL, io, offset_fsb,
 			(xfs_filblks_t)(end_fsb - offset_fsb),
 			bmapi_flags,  NULL, 0, &imap,
-			&nimaps, NULL);
+			&nimaps, NULL, NULL);
 
 	if (error)
 		goto out;
@@ -519,8 +519,8 @@ xfs_iomap_write_direct(
 	 */
 	XFS_BMAP_INIT(&free_list, &firstfsb);
 	nimaps = 1;
-	error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
-		bmapi_flag, &firstfsb, 0, &imap, &nimaps, &free_list);
+	error = XFS_BMAPI(mp, tp, io, offset_fsb, count_fsb, bmapi_flag,
+		&firstfsb, 0, &imap, &nimaps, &free_list, NULL);
 	if (error)
 		goto error0;
 
@@ -610,8 +610,8 @@ xfs_iomap_eof_want_preallocate(
 	while (count_fsb > 0) {
 		imaps = nimaps;
 		firstblock = NULLFSBLOCK;
-		error = XFS_BMAPI(mp, NULL, io, start_fsb, count_fsb,
-				  0, &firstblock, 0, imap, &imaps, NULL);
+		error = XFS_BMAPI(mp, NULL, io, start_fsb, count_fsb, 0,
+				  &firstblock, 0, imap, &imaps, NULL, NULL);
 		if (error)
 			return error;
 		for (n = 0; n < imaps; n++) {
@@ -695,11 +695,11 @@ retry:
 
 	nimaps = XFS_WRITE_IMAPS;
 	firstblock = NULLFSBLOCK;
-	error = xfs_bmapi(NULL, ip, offset_fsb,
+	error = XFS_BMAPI(mp, NULL, io, offset_fsb,
 			  (xfs_filblks_t)(last_fsb - offset_fsb),
 			  XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
 			  XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
-			  &nimaps, NULL);
+			  &nimaps, NULL, NULL);
 	if (error && (error != ENOSPC))
 		return XFS_ERROR(error);
 
@@ -832,9 +832,9 @@ xfs_iomap_write_allocate(
 			}
 
 			/* Go get the actual blocks */
-			error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb,
+			error = XFS_BMAPI(mp, tp, io, map_start_fsb, count_fsb,
 					XFS_BMAPI_WRITE, &first_block, 1,
-					imap, &nimaps, &free_list);
+					imap, &nimaps, &free_list, NULL);
 			if (error)
 				goto trans_cancel;
 
@@ -955,9 +955,9 @@ xfs_iomap_write_unwritten(
 		 */
 		XFS_BMAP_INIT(&free_list, &firstfsb);
 		nimaps = 1;
-		error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
+		error = XFS_BMAPI(mp, tp, io, offset_fsb, count_fsb,
 				  XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb,
-				  1, &imap, &nimaps, &free_list);
+				  1, &imap, &nimaps, &free_list, NULL);
 		if (error)
 			goto error_on_bmapi_transaction;
 
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 668ad23fd37..a682eb55810 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -63,6 +63,8 @@ struct xfs_perag;
 struct xfs_iocore;
 struct xfs_bmbt_irec;
 struct xfs_bmap_free;
+struct xfs_extdelta;
+struct xfs_swapext;
 
 extern struct vfsops xfs_vfsops;
 extern struct vnodeops xfs_vnodeops;
@@ -194,7 +196,12 @@ typedef int		(*xfs_bmapi_t)(struct xfs_trans *, void *,
 				xfs_fileoff_t, xfs_filblks_t, int,
 				xfs_fsblock_t *, xfs_extlen_t,
 				struct xfs_bmbt_irec *, int *,
-				struct xfs_bmap_free *);
+				struct xfs_bmap_free *, struct xfs_extdelta *);
+typedef int		(*xfs_bunmapi_t)(struct xfs_trans *,
+				void *, xfs_fileoff_t,
+				xfs_filblks_t, int, xfs_extnum_t,
+				xfs_fsblock_t *, struct xfs_bmap_free *,
+				struct xfs_extdelta *, int *);
 typedef int		(*xfs_bmap_eof_t)(void *, xfs_fileoff_t, int, int *);
 typedef int		(*xfs_iomap_write_direct_t)(
 				void *, xfs_off_t, size_t, int,
@@ -214,10 +221,13 @@ typedef int		(*xfs_lock_nowait_t)(void *, uint);
 typedef void		(*xfs_unlk_t)(void *, unsigned int);
 typedef xfs_fsize_t	(*xfs_size_t)(void *);
 typedef xfs_fsize_t	(*xfs_iodone_t)(struct vfs *);
+typedef int		(*xfs_swap_extents_t)(void *, void *,
+				struct xfs_swapext*);
 
 typedef struct xfs_ioops {
 	xfs_ioinit_t			xfs_ioinit;
 	xfs_bmapi_t			xfs_bmapi_func;
+	xfs_bunmapi_t			xfs_bunmapi_func;
 	xfs_bmap_eof_t			xfs_bmap_eof_func;
 	xfs_iomap_write_direct_t	xfs_iomap_write_direct;
 	xfs_iomap_write_delay_t		xfs_iomap_write_delay;
@@ -230,13 +240,17 @@ typedef struct xfs_ioops {
 	xfs_unlk_t			xfs_unlock;
 	xfs_size_t			xfs_size_func;
 	xfs_iodone_t			xfs_iodone;
+	xfs_swap_extents_t		xfs_swap_extents_func;
 } xfs_ioops_t;
 
 #define XFS_IOINIT(vfsp, args, flags) \
 	(*(mp)->m_io_ops.xfs_ioinit)(vfsp, args, flags)
-#define XFS_BMAPI(mp, trans,io,bno,len,f,first,tot,mval,nmap,flist)	\
+#define XFS_BMAPI(mp, trans,io,bno,len,f,first,tot,mval,nmap,flist,delta) \
 	(*(mp)->m_io_ops.xfs_bmapi_func) \
-		(trans,(io)->io_obj,bno,len,f,first,tot,mval,nmap,flist)
+		(trans,(io)->io_obj,bno,len,f,first,tot,mval,nmap,flist,delta)
+#define XFS_BUNMAPI(mp, trans,io,bno,len,f,nexts,first,flist,delta,done) \
+	(*(mp)->m_io_ops.xfs_bunmapi_func) \
+		(trans,(io)->io_obj,bno,len,f,nexts,first,flist,delta,done)
 #define XFS_BMAP_EOF(mp, io, endoff, whichfork, eof) \
 	(*(mp)->m_io_ops.xfs_bmap_eof_func) \
 		((io)->io_obj, endoff, whichfork, eof)
@@ -266,6 +280,9 @@ typedef struct xfs_ioops {
 	(*(mp)->m_io_ops.xfs_size_func)((io)->io_obj)
 #define XFS_IODONE(vfsp) \
 	(*(mp)->m_io_ops.xfs_iodone)(vfsp)
+#define XFS_SWAP_EXTENTS(mp, io, tio, sxp) \
+	(*(mp)->m_io_ops.xfs_swap_extents_func) \
+		((io)->io_obj, (tio)->io_obj, sxp)
 
 #ifdef HAVE_PERCPU_SB
 
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 5b413946b1c..f5944c8b378 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -141,7 +141,7 @@ xfs_growfs_rt_alloc(
 		cancelflags |= XFS_TRANS_ABORT;
 		error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks,
 			XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock,
-			resblks, &map, &nmap, &flist);
+			resblks, &map, &nmap, &flist, NULL);
 		if (!error && nmap < 1)
 			error = XFS_ERROR(ENOSPC);
 		if (error)
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 7027ae68ee3..567d14d06d9 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -1000,7 +1000,7 @@ xfs_readlink(
 		nmaps = SYMLINK_MAPS;
 
 		error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen),
-				  0, NULL, 0, mval, &nmaps, NULL);
+				  0, NULL, 0, mval, &nmaps, NULL, NULL);
 
 		if (error) {
 			goto error_return;
@@ -1208,8 +1208,8 @@ xfs_inactive_free_eofblocks(
 
 	nimaps = 1;
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
-	error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0,
-			  NULL, 0, &imap, &nimaps, NULL);
+	error = XFS_BMAPI(mp, NULL, &ip->i_iocore, end_fsb, map_len, 0,
+			  NULL, 0, &imap, &nimaps, NULL, NULL);
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
 	if (!error && (nimaps != 0) &&
@@ -1338,7 +1338,7 @@ xfs_inactive_symlink_rmt(
 	nmaps = ARRAY_SIZE(mval);
 	if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
 			XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
-			&free_list)))
+			&free_list, NULL)))
 		goto error0;
 	/*
 	 * Invalidate the block(s).
@@ -1353,7 +1353,7 @@ xfs_inactive_symlink_rmt(
 	 * Unmap the dead block(s) to the free_list.
 	 */
 	if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
-			&first_block, &free_list, &done)))
+			&first_block, &free_list, NULL, &done)))
 		goto error1;
 	ASSERT(done);
 	/*
@@ -3457,7 +3457,7 @@ xfs_symlink(
 		error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
 				  XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
 				  &first_block, resblks, mval, &nmaps,
-				  &free_list);
+				  &free_list, NULL);
 		if (error) {
 			goto error1;
 		}
@@ -4116,10 +4116,10 @@ retry:
 		 * Issue the xfs_bmapi() call to allocate the blocks
 		 */
 		XFS_BMAP_INIT(&free_list, &firstfsb);
-		error = xfs_bmapi(tp, ip, startoffset_fsb,
+		error = XFS_BMAPI(mp, tp, &ip->i_iocore, startoffset_fsb,
 				  allocatesize_fsb, bmapi_flag,
 				  &firstfsb, 0, imapp, &nimaps,
-				  &free_list);
+				  &free_list, NULL);
 		if (error) {
 			goto error0;
 		}
@@ -4199,8 +4199,8 @@ xfs_zero_remaining_bytes(
 	for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
 		offset_fsb = XFS_B_TO_FSBT(mp, offset);
 		nimap = 1;
-		error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0, NULL, 0, &imap,
-			&nimap, NULL);
+		error = XFS_BMAPI(mp, NULL, &ip->i_iocore, offset_fsb, 1, 0,
+			NULL, 0, &imap, &nimap, NULL, NULL);
 		if (error || nimap < 1)
 			break;
 		ASSERT(imap.br_blockcount >= 1);
@@ -4338,8 +4338,8 @@ xfs_free_file_space(
 	 */
 	if (rt && !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
 		nimap = 1;
-		error = xfs_bmapi(NULL, ip, startoffset_fsb, 1, 0, NULL, 0,
-			&imap, &nimap, NULL);
+		error = XFS_BMAPI(mp, NULL, &ip->i_iocore, startoffset_fsb,
+			1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
 		if (error)
 			goto out_unlock_iolock;
 		ASSERT(nimap == 0 || nimap == 1);
@@ -4353,8 +4353,8 @@ xfs_free_file_space(
 				startoffset_fsb += mp->m_sb.sb_rextsize - mod;
 		}
 		nimap = 1;
-		error = xfs_bmapi(NULL, ip, endoffset_fsb - 1, 1, 0, NULL, 0,
-			&imap, &nimap, NULL);
+		error = XFS_BMAPI(mp, NULL, &ip->i_iocore, endoffset_fsb - 1,
+			1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
 		if (error)
 			goto out_unlock_iolock;
 		ASSERT(nimap == 0 || nimap == 1);
@@ -4426,9 +4426,9 @@ xfs_free_file_space(
 		 * issue the bunmapi() call to free the blocks
 		 */
 		XFS_BMAP_INIT(&free_list, &firstfsb);
-		error = xfs_bunmapi(tp, ip, startoffset_fsb,
+		error = XFS_BUNMAPI(mp, tp, &ip->i_iocore, startoffset_fsb,
 				  endoffset_fsb - startoffset_fsb,
-				  0, 2, &firstfsb, &free_list, &done);
+				  0, 2, &firstfsb, &free_list, NULL, &done);
 		if (error) {
 			goto error0;
 		}
-- 
cgit v1.2.3


From ff9901c1e7c7be06a99c59cfc3133d2316cbc957 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 9 Jun 2006 14:48:37 +1000
Subject: [XFS] endianess annotations for xfs_dir2_data_entry_t

SGI-PV: 943272
SGI-Modid: xfs-linux-melb:xfs-kern:25806a

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/xfs_dir2_block.c | 16 ++++++++--------
 fs/xfs/xfs_dir2_data.c  |  2 +-
 fs/xfs/xfs_dir2_data.h  |  8 ++++----
 fs/xfs/xfs_dir2_leaf.c  | 10 +++++-----
 fs/xfs/xfs_dir2_node.c  |  8 ++++----
 fs/xfs/xfs_dir2_sf.c    | 10 +++++-----
 6 files changed, 27 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 972ded59547..2621ff521db 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -400,7 +400,7 @@ xfs_dir2_block_addname(
 	/*
 	 * Create the new data entry.
 	 */
-	INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
+	dep->inumber = cpu_to_be64(args->inumber);
 	dep->namelen = args->namelen;
 	memcpy(dep->name, args->name, args->namelen);
 	tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
@@ -508,7 +508,7 @@ xfs_dir2_block_getdents(
 
 		p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
 						    ptr - (char *)block);
-		p.ino = INT_GET(dep->inumber, ARCH_CONVERT);
+		p.ino = be64_to_cpu(dep->inumber);
 #if XFS_BIG_INUMS
 		p.ino += mp->m_inoadd;
 #endif
@@ -626,7 +626,7 @@ xfs_dir2_block_lookup(
 	/*
 	 * Fill in inode number, release the block.
 	 */
-	args->inumber = INT_GET(dep->inumber, ARCH_CONVERT);
+	args->inumber = be64_to_cpu(dep->inumber);
 	xfs_da_brelse(args->trans, bp);
 	return XFS_ERROR(EEXIST);
 }
@@ -844,11 +844,11 @@ xfs_dir2_block_replace(
 	 */
 	dep = (xfs_dir2_data_entry_t *)
 	      ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, be32_to_cpu(blp[ent].address)));
-	ASSERT(INT_GET(dep->inumber, ARCH_CONVERT) != args->inumber);
+	ASSERT(be64_to_cpu(dep->inumber) != args->inumber);
 	/*
 	 * Change the inode number to the new value.
 	 */
-	INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
+	dep->inumber = cpu_to_be64(args->inumber);
 	xfs_dir2_data_log_entry(args->trans, bp, dep);
 	xfs_dir2_data_check(dp, bp);
 	xfs_da_buf_done(bp);
@@ -1130,7 +1130,7 @@ xfs_dir2_sf_to_block(
 	 */
 	dep = (xfs_dir2_data_entry_t *)
 	      ((char *)block + XFS_DIR2_DATA_DOT_OFFSET);
-	INT_SET(dep->inumber, ARCH_CONVERT, dp->i_ino);
+	dep->inumber = cpu_to_be64(dp->i_ino);
 	dep->namelen = 1;
 	dep->name[0] = '.';
 	tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
@@ -1144,7 +1144,7 @@ xfs_dir2_sf_to_block(
 	 */
 	dep = (xfs_dir2_data_entry_t *)
 		((char *)block + XFS_DIR2_DATA_DOTDOT_OFFSET);
-	INT_SET(dep->inumber, ARCH_CONVERT, XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent));
+	dep->inumber = cpu_to_be64(XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent));
 	dep->namelen = 2;
 	dep->name[0] = dep->name[1] = '.';
 	tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
@@ -1193,7 +1193,7 @@ xfs_dir2_sf_to_block(
 		 * Copy a real entry.
 		 */
 		dep = (xfs_dir2_data_entry_t *)((char *)block + newoffset);
-		INT_SET(dep->inumber, ARCH_CONVERT, XFS_DIR2_SF_GET_INUMBER(sfp,
+		dep->inumber = cpu_to_be64(XFS_DIR2_SF_GET_INUMBER(sfp,
 				XFS_DIR2_SF_INUMBERP(sfep)));
 		dep->namelen = sfep->namelen;
 		memcpy(dep->name, sfep->name, dep->namelen);
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index bb3d03ff002..7be37d38961 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -133,7 +133,7 @@ xfs_dir2_data_check(
 		 */
 		dep = (xfs_dir2_data_entry_t *)p;
 		ASSERT(dep->namelen != 0);
-		ASSERT(xfs_dir_ino_validate(mp, INT_GET(dep->inumber, ARCH_CONVERT)) == 0);
+		ASSERT(xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)) == 0);
 		ASSERT(be16_to_cpu(*XFS_DIR2_DATA_ENTRY_TAG_P(dep)) ==
 		       (char *)dep - (char *)d);
 		count++;
diff --git a/fs/xfs/xfs_dir2_data.h b/fs/xfs/xfs_dir2_data.h
index 0847cbb53e1..a6ae2d21c40 100644
--- a/fs/xfs/xfs_dir2_data.h
+++ b/fs/xfs/xfs_dir2_data.h
@@ -85,11 +85,11 @@ typedef struct xfs_dir2_data_hdr {
  * Tag appears as the last 2 bytes.
  */
 typedef struct xfs_dir2_data_entry {
-	xfs_ino_t		inumber;	/* inode number */
-	__uint8_t		namelen;	/* name length */
-	__uint8_t		name[1];	/* name bytes, no null */
+	__be64			inumber;	/* inode number */
+	__u8			namelen;	/* name length */
+	__u8			name[1];	/* name bytes, no null */
 						/* variable offset */
-	xfs_dir2_data_off_t	tag;		/* starting offset of us */
+	__be16			tag;		/* starting offset of us */
 } xfs_dir2_data_entry_t;
 
 /*
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 5fe88d546c7..74ef99f2ee5 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -407,7 +407,7 @@ xfs_dir2_leaf_addname(
 	 * Initialize our new entry (at last).
 	 */
 	dep = (xfs_dir2_data_entry_t *)dup;
-	INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
+	dep->inumber = cpu_to_be64(args->inumber);
 	dep->namelen = args->namelen;
 	memcpy(dep->name, args->name, dep->namelen);
 	tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
@@ -1098,7 +1098,7 @@ xfs_dir2_leaf_getdents(
 
 		p->cook = XFS_DIR2_BYTE_TO_DATAPTR(mp, curoff + length);
 
-		p->ino = INT_GET(dep->inumber, ARCH_CONVERT);
+		p->ino = be64_to_cpu(dep->inumber);
 #if XFS_BIG_INUMS
 		p->ino += mp->m_inoadd;
 #endif
@@ -1319,7 +1319,7 @@ xfs_dir2_leaf_lookup(
 	/*
 	 * Return the found inode number.
 	 */
-	args->inumber = INT_GET(dep->inumber, ARCH_CONVERT);
+	args->inumber = be64_to_cpu(dep->inumber);
 	xfs_da_brelse(tp, dbp);
 	xfs_da_brelse(tp, lbp);
 	return XFS_ERROR(EEXIST);
@@ -1606,11 +1606,11 @@ xfs_dir2_leaf_replace(
 	dep = (xfs_dir2_data_entry_t *)
 	      ((char *)dbp->data +
 	       XFS_DIR2_DATAPTR_TO_OFF(dp->i_mount, be32_to_cpu(lep->address)));
-	ASSERT(args->inumber != INT_GET(dep->inumber, ARCH_CONVERT));
+	ASSERT(args->inumber != be64_to_cpu(dep->inumber));
 	/*
 	 * Put the new inode number in, log it.
 	 */
-	INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
+	dep->inumber = cpu_to_be64(args->inumber);
 	tp = args->trans;
 	xfs_dir2_data_log_entry(tp, dbp, dep);
 	xfs_da_buf_done(dbp);
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index ac511ab9c52..c0e8fcf5e13 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -580,7 +580,7 @@ xfs_dir2_leafn_lookup_int(
 			if (dep->namelen == args->namelen &&
 			    dep->name[0] == args->name[0] &&
 			    memcmp(dep->name, args->name, args->namelen) == 0) {
-				args->inumber = INT_GET(dep->inumber, ARCH_CONVERT);
+				args->inumber = be64_to_cpu(dep->inumber);
 				*indexp = index;
 				state->extravalid = 1;
 				state->extrablk.bp = curbp;
@@ -1695,7 +1695,7 @@ xfs_dir2_node_addname_int(
 	 * Fill in the new entry and log it.
 	 */
 	dep = (xfs_dir2_data_entry_t *)dup;
-	INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
+	dep->inumber = cpu_to_be64(args->inumber);
 	dep->namelen = args->namelen;
 	memcpy(dep->name, args->name, dep->namelen);
 	tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
@@ -1905,11 +1905,11 @@ xfs_dir2_node_replace(
 		dep = (xfs_dir2_data_entry_t *)
 		      ((char *)data +
 		       XFS_DIR2_DATAPTR_TO_OFF(state->mp, be32_to_cpu(lep->address)));
-		ASSERT(inum != INT_GET(dep->inumber, ARCH_CONVERT));
+		ASSERT(inum != be64_to_cpu(dep->inumber));
 		/*
 		 * Fill in the new inode number and log the entry.
 		 */
-		INT_SET(dep->inumber, ARCH_CONVERT, inum);
+		dep->inumber = cpu_to_be64(inum);
 		xfs_dir2_data_log_entry(args->trans, state->extrablk.bp, dep);
 		rval = 0;
 	}
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index d98a41d1fe6..06afa1b324c 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -117,13 +117,13 @@ xfs_dir2_block_sfsize(
 			dep->name[0] == '.' && dep->name[1] == '.';
 #if XFS_BIG_INUMS
 		if (!isdot)
-			i8count += INT_GET(dep->inumber, ARCH_CONVERT) > XFS_DIR2_MAX_SHORT_INUM;
+			i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM;
 #endif
 		if (!isdot && !isdotdot) {
 			count++;
 			namelen += dep->namelen;
 		} else if (isdotdot)
-			parent = INT_GET(dep->inumber, ARCH_CONVERT);
+			parent = be64_to_cpu(dep->inumber);
 		/*
 		 * Calculate the new size, see if we should give up yet.
 		 */
@@ -229,13 +229,13 @@ xfs_dir2_block_to_sf(
 		 * Skip .
 		 */
 		if (dep->namelen == 1 && dep->name[0] == '.')
-			ASSERT(INT_GET(dep->inumber, ARCH_CONVERT) == dp->i_ino);
+			ASSERT(be64_to_cpu(dep->inumber) == dp->i_ino);
 		/*
 		 * Skip .., but make sure the inode number is right.
 		 */
 		else if (dep->namelen == 2 &&
 			 dep->name[0] == '.' && dep->name[1] == '.')
-			ASSERT(INT_GET(dep->inumber, ARCH_CONVERT) ==
+			ASSERT(be64_to_cpu(dep->inumber) ==
 			       XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent));
 		/*
 		 * Normal entry, copy it into shortform.
@@ -246,7 +246,7 @@ xfs_dir2_block_to_sf(
 				(xfs_dir2_data_aoff_t)
 				((char *)dep - (char *)block));
 			memcpy(sfep->name, dep->name, dep->namelen);
-			temp=INT_GET(dep->inumber, ARCH_CONVERT);
+			temp = be64_to_cpu(dep->inumber);
 			XFS_DIR2_SF_PUT_INUMBER(sfp, &temp,
 				XFS_DIR2_SF_INUMBERP(sfep));
 			sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep);
-- 
cgit v1.2.3


From 8034fff39bb9430d807375ec7a04097efba42cd2 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 9 Jun 2006 14:50:24 +1000
Subject: [XFS] endianess annotations for xfs_dir_leaf_hdr_t

SGI-PV: 943272
SGI-Modid: xfs-linux-melb:xfs-kern:25807a

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/xfs_da_btree.c |   2 +-
 fs/xfs/xfs_dir.c      |  11 +-
 fs/xfs/xfs_dir_leaf.c | 326 ++++++++++++++++++++++++++------------------------
 fs/xfs/xfs_dir_leaf.h |  14 +--
 4 files changed, 182 insertions(+), 171 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 47b86f9f69f..296aa525ac4 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1786,7 +1786,7 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
 		dead_leaf = (xfs_dir_leafblock_t *)dead_info;
 		dead_level = 0;
 		dead_hash =
-			INT_GET(dead_leaf->entries[INT_GET(dead_leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
+			INT_GET(dead_leaf->entries[be16_to_cpu(dead_leaf->hdr.count) - 1].hashval, ARCH_CONVERT);
 	} else if (be16_to_cpu(dead_info->magic) == XFS_DIR2_LEAFN_MAGIC) {
 		ASSERT(XFS_DIR_IS_V2(mp));
 		dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
diff --git a/fs/xfs/xfs_dir.c b/fs/xfs/xfs_dir.c
index 9cc702a839a..a3b0e97dbf9 100644
--- a/fs/xfs/xfs_dir.c
+++ b/fs/xfs/xfs_dir.c
@@ -638,8 +638,8 @@ xfs_dir_leaf_removename(xfs_da_args_t *args, int *count, int *totallen)
 	retval = xfs_dir_leaf_lookup_int(bp, args, &index);
 	if (retval == EEXIST) {
 		(void)xfs_dir_leaf_remove(args->trans, bp, index);
-		*count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
-		*totallen = INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
+		*count = be16_to_cpu(leaf->hdr.count);
+		*totallen = be16_to_cpu(leaf->hdr.namebytes);
 		retval = 0;
 	}
 	xfs_da_buf_done(bp);
@@ -925,7 +925,7 @@ xfs_dir_node_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio,
 			bp = NULL;
 		}
 		if (bp &&
-		    cookhash > INT_GET(leaf->entries[INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT)) {
+		    cookhash > INT_GET(leaf->entries[be16_to_cpu(leaf->hdr.count) - 1].hashval, ARCH_CONVERT)) {
 			xfs_dir_trace_g_dub("node: leaf hash too small",
 						   dp, uio, bno);
 			xfs_da_brelse(trans, bp);
@@ -1142,7 +1142,7 @@ void
 xfs_dir_trace_g_dul(char *where, xfs_inode_t *dp, uio_t *uio,
 			xfs_dir_leafblock_t *leaf)
 {
-	int	last = INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1;
+	int	last = be16_to_cpu(leaf->hdr.count) - 1;
 
 	xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUL, where,
 		     (void *)dp, (void *)dp->i_mount,
@@ -1150,8 +1150,7 @@ xfs_dir_trace_g_dul(char *where, xfs_inode_t *dp, uio_t *uio,
 		     (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
 		     (void *)(unsigned long)uio->uio_resid,
 		     (void *)(unsigned long)be32_to_cpu(leaf->hdr.info.forw),
-		     (void *)(unsigned long)
-			INT_GET(leaf->hdr.count, ARCH_CONVERT),
+		     (void *)(unsigned long)be16_to_cpu(leaf->hdr.count),
 		     (void *)(unsigned long)
 			INT_GET(leaf->entries[0].hashval, ARCH_CONVERT),
 		     (void *)(unsigned long)
diff --git a/fs/xfs/xfs_dir_leaf.c b/fs/xfs/xfs_dir_leaf.c
index 6d711869262..6a4d8caaba7 100644
--- a/fs/xfs/xfs_dir_leaf.c
+++ b/fs/xfs/xfs_dir_leaf.c
@@ -651,7 +651,7 @@ xfs_dir_leaf_to_shortform(xfs_da_args_t *iargs)
 	 */
 	hdr = &leaf->hdr;
 	entry = &leaf->entries[0];
-	for (i = INT_GET(hdr->count, ARCH_CONVERT)-1; i >= 0; entry++, i--) {
+	for (i = be16_to_cpu(hdr->count)-1; i >= 0; entry++, i--) {
 		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
 		if ((entry->namelen == 2) &&
 		    (namest->name[0] == '.') &&
@@ -681,7 +681,7 @@ xfs_dir_leaf_to_shortform(xfs_da_args_t *iargs)
 	args.trans = iargs->trans;
 	args.justcheck = 0;
 	args.addname = args.oknoent = 1;
-	for (i = 0; i < INT_GET(hdr->count, ARCH_CONVERT); entry++, i++) {
+	for (i = 0; i < be16_to_cpu(hdr->count); entry++, i++) {
 		if (!entry->nameidx)
 			continue;
 		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
@@ -744,7 +744,7 @@ xfs_dir_leaf_to_node(xfs_da_args_t *args)
 	ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
 	node->btree[0].hashval = cpu_to_be32(
 		INT_GET(leaf->entries[
-			INT_GET(leaf->hdr.count, ARCH_CONVERT)-1].hashval, ARCH_CONVERT));
+			be16_to_cpu(leaf->hdr.count)-1].hashval, ARCH_CONVERT));
 	xfs_da_buf_done(bp2);
 	node->btree[0].before = cpu_to_be32(blkno);
 	node->hdr.count = cpu_to_be16(1);
@@ -783,11 +783,12 @@ xfs_dir_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp)
 	memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount));
 	hdr = &leaf->hdr;
 	hdr->info.magic = cpu_to_be16(XFS_DIR_LEAF_MAGIC);
-	INT_SET(hdr->firstused, ARCH_CONVERT, XFS_LBSIZE(dp->i_mount));
+	hdr->firstused = cpu_to_be16(XFS_LBSIZE(dp->i_mount));
 	if (!hdr->firstused)
-		INT_SET(hdr->firstused, ARCH_CONVERT, XFS_LBSIZE(dp->i_mount) - 1);
-	INT_SET(hdr->freemap[0].base, ARCH_CONVERT, sizeof(xfs_dir_leaf_hdr_t));
-	INT_SET(hdr->freemap[0].size, ARCH_CONVERT, INT_GET(hdr->firstused, ARCH_CONVERT) - INT_GET(hdr->freemap[0].base, ARCH_CONVERT));
+		hdr->firstused = cpu_to_be16(XFS_LBSIZE(dp->i_mount) - 1);
+	hdr->freemap[0].base = cpu_to_be16(sizeof(xfs_dir_leaf_hdr_t));
+	hdr->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr->firstused) -
+					   be16_to_cpu(hdr->freemap[0].base));
 
 	xfs_da_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1);
 
@@ -862,7 +863,7 @@ xfs_dir_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args, int index)
 
 	leaf = bp->data;
 	ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-	ASSERT((index >= 0) && (index <= INT_GET(leaf->hdr.count, ARCH_CONVERT)));
+	ASSERT((index >= 0) && (index <= be16_to_cpu(leaf->hdr.count)));
 	hdr = &leaf->hdr;
 	entsize = XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen);
 
@@ -870,25 +871,25 @@ xfs_dir_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args, int index)
 	 * Search through freemap for first-fit on new name length.
 	 * (may need to figure in size of entry struct too)
 	 */
-	tablesize = (INT_GET(hdr->count, ARCH_CONVERT) + 1) * (uint)sizeof(xfs_dir_leaf_entry_t)
-			+ (uint)sizeof(xfs_dir_leaf_hdr_t);
+	tablesize = (be16_to_cpu(hdr->count) + 1) *
+		sizeof(xfs_dir_leaf_entry_t) + sizeof(xfs_dir_leaf_hdr_t);
 	map = &hdr->freemap[XFS_DIR_LEAF_MAPSIZE-1];
 	for (sum = 0, i = XFS_DIR_LEAF_MAPSIZE-1; i >= 0; map--, i--) {
-		if (tablesize > INT_GET(hdr->firstused, ARCH_CONVERT)) {
-			sum += INT_GET(map->size, ARCH_CONVERT);
+		if (tablesize > be16_to_cpu(hdr->firstused)) {
+			sum += be16_to_cpu(map->size);
 			continue;
 		}
 		if (!map->size)
 			continue;	/* no space in this map */
 		tmp = entsize;
-		if (INT_GET(map->base, ARCH_CONVERT) < INT_GET(hdr->firstused, ARCH_CONVERT))
+		if (be16_to_cpu(map->base) < be16_to_cpu(hdr->firstused))
 			tmp += (uint)sizeof(xfs_dir_leaf_entry_t);
-		if (INT_GET(map->size, ARCH_CONVERT) >= tmp) {
+		if (be16_to_cpu(map->size) >= tmp) {
 			if (!args->justcheck)
 				xfs_dir_leaf_add_work(bp, args, index, i);
 			return 0;
 		}
-		sum += INT_GET(map->size, ARCH_CONVERT);
+		sum += be16_to_cpu(map->size);
 	}
 
 	/*
@@ -915,7 +916,7 @@ xfs_dir_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args, int index)
 	 * After compaction, the block is guaranteed to have only one
 	 * free region, in freemap[0].  If it is not big enough, give up.
 	 */
-	if (INT_GET(hdr->freemap[0].size, ARCH_CONVERT) <
+	if (be16_to_cpu(hdr->freemap[0].size) <
 	    (entsize + (uint)sizeof(xfs_dir_leaf_entry_t)))
 		return XFS_ERROR(ENOSPC);
 
@@ -944,31 +945,32 @@ xfs_dir_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int index,
 	ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
 	hdr = &leaf->hdr;
 	ASSERT((mapindex >= 0) && (mapindex < XFS_DIR_LEAF_MAPSIZE));
-	ASSERT((index >= 0) && (index <= INT_GET(hdr->count, ARCH_CONVERT)));
+	ASSERT((index >= 0) && (index <= be16_to_cpu(hdr->count)));
 
 	/*
 	 * Force open some space in the entry array and fill it in.
 	 */
 	entry = &leaf->entries[index];
-	if (index < INT_GET(hdr->count, ARCH_CONVERT)) {
-		tmp  = INT_GET(hdr->count, ARCH_CONVERT) - index;
+	if (index < be16_to_cpu(hdr->count)) {
+		tmp  = be16_to_cpu(hdr->count) - index;
 		tmp *= (uint)sizeof(xfs_dir_leaf_entry_t);
 		memmove(entry + 1, entry, tmp);
 		xfs_da_log_buf(args->trans, bp,
 		    XFS_DA_LOGRANGE(leaf, entry, tmp + (uint)sizeof(*entry)));
 	}
-	INT_MOD(hdr->count, ARCH_CONVERT, +1);
+	be16_add(&hdr->count, 1);
 
 	/*
 	 * Allocate space for the new string (at the end of the run).
 	 */
 	map = &hdr->freemap[mapindex];
 	mp = args->trans->t_mountp;
-	ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp));
-	ASSERT(INT_GET(map->size, ARCH_CONVERT) >= XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen));
-	ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp));
-	INT_MOD(map->size, ARCH_CONVERT, -(XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen)));
-	INT_SET(entry->nameidx, ARCH_CONVERT, INT_GET(map->base, ARCH_CONVERT) + INT_GET(map->size, ARCH_CONVERT));
+	ASSERT(be16_to_cpu(map->base) < XFS_LBSIZE(mp));
+	ASSERT(be16_to_cpu(map->size) >= XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen));
+	ASSERT(be16_to_cpu(map->size) < XFS_LBSIZE(mp));
+
+	be16_add(&map->size, -(XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen)));
+	INT_SET(entry->nameidx, ARCH_CONVERT, be16_to_cpu(map->base) + be16_to_cpu(map->size));
 	INT_SET(entry->hashval, ARCH_CONVERT, args->hashval);
 	entry->namelen = args->namelen;
 	xfs_da_log_buf(args->trans, bp,
@@ -986,19 +988,21 @@ xfs_dir_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int index,
 	/*
 	 * Update the control info for this leaf node
 	 */
-	if (INT_GET(entry->nameidx, ARCH_CONVERT) < INT_GET(hdr->firstused, ARCH_CONVERT))
-		INT_COPY(hdr->firstused, entry->nameidx, ARCH_CONVERT);
-	ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT) >= ((INT_GET(hdr->count, ARCH_CONVERT)*sizeof(*entry))+sizeof(*hdr)));
-	tmp = (INT_GET(hdr->count, ARCH_CONVERT)-1) * (uint)sizeof(xfs_dir_leaf_entry_t)
+	if (INT_GET(entry->nameidx, ARCH_CONVERT) < be16_to_cpu(hdr->firstused))
+		hdr->firstused = entry->nameidx;
+	ASSERT(be16_to_cpu(hdr->firstused) >=
+	       ((be16_to_cpu(hdr->count)*sizeof(*entry))+sizeof(*hdr)));
+	tmp = (be16_to_cpu(hdr->count)-1) * (uint)sizeof(xfs_dir_leaf_entry_t)
 			+ (uint)sizeof(xfs_dir_leaf_hdr_t);
 	map = &hdr->freemap[0];
 	for (i = 0; i < XFS_DIR_LEAF_MAPSIZE; map++, i++) {
-		if (INT_GET(map->base, ARCH_CONVERT) == tmp) {
-			INT_MOD(map->base, ARCH_CONVERT, (uint)sizeof(xfs_dir_leaf_entry_t));
-			INT_MOD(map->size, ARCH_CONVERT, -((uint)sizeof(xfs_dir_leaf_entry_t)));
+		if (be16_to_cpu(map->base) == tmp) {
+			int entry_size = sizeof(xfs_dir_leaf_entry_t);
+			be16_add(&map->base, entry_size);
+			be16_add(&map->size, -entry_size);
 		}
 	}
-	INT_MOD(hdr->namebytes, ARCH_CONVERT, args->namelen);
+	be16_add(&hdr->namebytes, args->namelen);
 	xfs_da_log_buf(args->trans, bp,
 		XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
 }
@@ -1042,23 +1046,24 @@ xfs_dir_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp, int musthave,
 	hdr_s = &leaf_s->hdr;
 	hdr_d = &leaf_d->hdr;
 	hdr_d->info = hdr_s->info;	/* struct copy */
-	INT_SET(hdr_d->firstused, ARCH_CONVERT, lbsize);
+	hdr_d->firstused = cpu_to_be16(lbsize);
 	if (!hdr_d->firstused)
-		INT_SET(hdr_d->firstused, ARCH_CONVERT, lbsize - 1);
+		hdr_d->firstused = cpu_to_be16(lbsize - 1);
 	hdr_d->namebytes = 0;
 	hdr_d->count = 0;
 	hdr_d->holes = 0;
-	INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT, sizeof(xfs_dir_leaf_hdr_t));
-	INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT, INT_GET(hdr_d->firstused, ARCH_CONVERT) - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT));
+	hdr_d->freemap[0].base = cpu_to_be16(sizeof(xfs_dir_leaf_hdr_t));
+	hdr_d->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr_d->firstused) -
+			                     be16_to_cpu(hdr_d->freemap[0].base));
 
 	/*
 	 * Copy all entry's in the same (sorted) order,
 	 * but allocate filenames packed and in sequence.
 	 * This changes the source (leaf_s) as well.
 	 */
-	xfs_dir_leaf_moveents(leaf_s, 0, leaf_d, 0, (int)INT_GET(hdr_s->count, ARCH_CONVERT), mp);
+	xfs_dir_leaf_moveents(leaf_s, 0, leaf_d, 0, be16_to_cpu(hdr_s->count), mp);
 
-	if (musthave && INT_GET(hdr_d->freemap[0].size, ARCH_CONVERT) < musthave)
+	if (musthave && be16_to_cpu(hdr_d->freemap[0].size) < musthave)
 		rval = XFS_ERROR(ENOSPC);
 	else
 		rval = 0;
@@ -1129,20 +1134,20 @@ xfs_dir_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 	/*
 	 * Move any entries required from leaf to leaf:
 	 */
-	if (count < INT_GET(hdr1->count, ARCH_CONVERT)) {
+	if (count < be16_to_cpu(hdr1->count)) {
 		/*
 		 * Figure the total bytes to be added to the destination leaf.
 		 */
-		count = INT_GET(hdr1->count, ARCH_CONVERT) - count;	/* number entries being moved */
-		space  = INT_GET(hdr1->namebytes, ARCH_CONVERT) - totallen;
+		count = be16_to_cpu(hdr1->count) - count;	/* number entries being moved */
+		space = be16_to_cpu(hdr1->namebytes) - totallen;
 		space += count * ((uint)sizeof(xfs_dir_leaf_name_t)-1);
 		space += count * (uint)sizeof(xfs_dir_leaf_entry_t);
 
 		/*
 		 * leaf2 is the destination, compact it if it looks tight.
 		 */
-		max  = INT_GET(hdr2->firstused, ARCH_CONVERT) - (uint)sizeof(xfs_dir_leaf_hdr_t);
-		max -= INT_GET(hdr2->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t);
+		max  = be16_to_cpu(hdr2->firstused) - (uint)sizeof(xfs_dir_leaf_hdr_t);
+		max -= be16_to_cpu(hdr2->count) * (uint)sizeof(xfs_dir_leaf_entry_t);
 		if (space > max) {
 			xfs_dir_leaf_compact(state->args->trans, blk2->bp,
 								 0, 0);
@@ -1151,7 +1156,7 @@ xfs_dir_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 		/*
 		 * Move high entries from leaf1 to low end of leaf2.
 		 */
-		xfs_dir_leaf_moveents(leaf1, INT_GET(hdr1->count, ARCH_CONVERT) - count,
+		xfs_dir_leaf_moveents(leaf1, be16_to_cpu(hdr1->count) - count,
 					     leaf2, 0, count, state->mp);
 
 		xfs_da_log_buf(state->args->trans, blk1->bp, 0,
@@ -1159,20 +1164,20 @@ xfs_dir_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 		xfs_da_log_buf(state->args->trans, blk2->bp, 0,
 						   state->blocksize-1);
 
-	} else if (count > INT_GET(hdr1->count, ARCH_CONVERT)) {
+	} else if (count > be16_to_cpu(hdr1->count)) {
 		/*
 		 * Figure the total bytes to be added to the destination leaf.
 		 */
-		count -= INT_GET(hdr1->count, ARCH_CONVERT);		/* number entries being moved */
-		space  = totallen - INT_GET(hdr1->namebytes, ARCH_CONVERT);
+		count -= be16_to_cpu(hdr1->count);		/* number entries being moved */
+		space  = totallen - be16_to_cpu(hdr1->namebytes);
 		space += count * ((uint)sizeof(xfs_dir_leaf_name_t)-1);
 		space += count * (uint)sizeof(xfs_dir_leaf_entry_t);
 
 		/*
 		 * leaf1 is the destination, compact it if it looks tight.
 		 */
-		max  = INT_GET(hdr1->firstused, ARCH_CONVERT) - (uint)sizeof(xfs_dir_leaf_hdr_t);
-		max -= INT_GET(hdr1->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t);
+		max  = be16_to_cpu(hdr1->firstused) - (uint)sizeof(xfs_dir_leaf_hdr_t);
+		max -= be16_to_cpu(hdr1->count) * (uint)sizeof(xfs_dir_leaf_entry_t);
 		if (space > max) {
 			xfs_dir_leaf_compact(state->args->trans, blk1->bp,
 								 0, 0);
@@ -1181,7 +1186,7 @@ xfs_dir_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 		/*
 		 * Move low entries from leaf2 to high end of leaf1.
 		 */
-		xfs_dir_leaf_moveents(leaf2, 0, leaf1, (int)INT_GET(hdr1->count, ARCH_CONVERT),
+		xfs_dir_leaf_moveents(leaf2, 0, leaf1, be16_to_cpu(hdr1->count),
 					     count, state->mp);
 
 		xfs_da_log_buf(state->args->trans, blk1->bp, 0,
@@ -1193,15 +1198,15 @@ xfs_dir_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 	/*
 	 * Copy out last hashval in each block for B-tree code.
 	 */
-	blk1->hashval = INT_GET(leaf1->entries[ INT_GET(leaf1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
-	blk2->hashval = INT_GET(leaf2->entries[ INT_GET(leaf2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
+	blk1->hashval = INT_GET(leaf1->entries[be16_to_cpu(leaf1->hdr.count)-1].hashval, ARCH_CONVERT);
+	blk2->hashval = INT_GET(leaf2->entries[be16_to_cpu(leaf2->hdr.count)-1].hashval, ARCH_CONVERT);
 
 	/*
 	 * Adjust the expected index for insertion.
 	 * GROT: this doesn't work unless blk2 was originally empty.
 	 */
 	if (!state->inleaf) {
-		blk2->index = blk1->index - INT_GET(leaf1->hdr.count, ARCH_CONVERT);
+		blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count);
 	}
 }
 
@@ -1238,9 +1243,10 @@ xfs_dir_leaf_figure_balance(xfs_da_state_t *state,
 	 * Examine entries until we reduce the absolute difference in
 	 * byte usage between the two blocks to a minimum.
 	 */
-	max = INT_GET(hdr1->count, ARCH_CONVERT) + INT_GET(hdr2->count, ARCH_CONVERT);
+	max = be16_to_cpu(hdr1->count) + be16_to_cpu(hdr2->count);
 	half  = (max+1) * (uint)(sizeof(*entry)+sizeof(xfs_dir_leaf_entry_t)-1);
-	half += INT_GET(hdr1->namebytes, ARCH_CONVERT) + INT_GET(hdr2->namebytes, ARCH_CONVERT) + state->args->namelen;
+	half += be16_to_cpu(hdr1->namebytes) + be16_to_cpu(hdr2->namebytes) +
+		state->args->namelen;
 	half /= 2;
 	lastdelta = state->blocksize;
 	entry = &leaf1->entries[0];
@@ -1263,7 +1269,7 @@ xfs_dir_leaf_figure_balance(xfs_da_state_t *state,
 		/*
 		 * Wrap around into the second block if necessary.
 		 */
-		if (count == INT_GET(hdr1->count, ARCH_CONVERT)) {
+		if (count == be16_to_cpu(hdr1->count)) {
 			leaf1 = leaf2;
 			entry = &leaf1->entries[0];
 		}
@@ -1328,11 +1334,11 @@ xfs_dir_leaf_toosmall(xfs_da_state_t *state, int *action)
 	info = blk->bp->data;
 	ASSERT(be16_to_cpu(info->magic) == XFS_DIR_LEAF_MAGIC);
 	leaf = (xfs_dir_leafblock_t *)info;
-	count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+	count = be16_to_cpu(leaf->hdr.count);
 	bytes = (uint)sizeof(xfs_dir_leaf_hdr_t) +
 		count * (uint)sizeof(xfs_dir_leaf_entry_t) +
 		count * ((uint)sizeof(xfs_dir_leaf_name_t)-1) +
-		INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
+		be16_to_cpu(leaf->hdr.namebytes);
 	if (bytes > (state->blocksize >> 1)) {
 		*action = 0;	/* blk over 50%, don't try to join */
 		return 0;
@@ -1386,13 +1392,13 @@ xfs_dir_leaf_toosmall(xfs_da_state_t *state, int *action)
 		ASSERT(bp != NULL);
 
 		leaf = (xfs_dir_leafblock_t *)info;
-		count  = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+		count  = be16_to_cpu(leaf->hdr.count);
 		bytes  = state->blocksize - (state->blocksize>>2);
-		bytes -= INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
+		bytes -= be16_to_cpu(leaf->hdr.namebytes);
 		leaf = bp->data;
 		ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-		count += INT_GET(leaf->hdr.count, ARCH_CONVERT);
-		bytes -= INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
+		count += be16_to_cpu(leaf->hdr.count);
+		bytes -= be16_to_cpu(leaf->hdr.namebytes);
 		bytes -= count * ((uint)sizeof(xfs_dir_leaf_name_t) - 1);
 		bytes -= count * (uint)sizeof(xfs_dir_leaf_entry_t);
 		bytes -= (uint)sizeof(xfs_dir_leaf_hdr_t);
@@ -1451,11 +1457,12 @@ xfs_dir_leaf_remove(xfs_trans_t *trans, xfs_dabuf_t *bp, int index)
 	ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
 	hdr = &leaf->hdr;
 	mp = trans->t_mountp;
-	ASSERT((INT_GET(hdr->count, ARCH_CONVERT) > 0) && (INT_GET(hdr->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8)));
-	ASSERT((index >= 0) && (index < INT_GET(hdr->count, ARCH_CONVERT)));
-	ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT) >= ((INT_GET(hdr->count, ARCH_CONVERT)*sizeof(*entry))+sizeof(*hdr)));
+	ASSERT(hdr->count && (be16_to_cpu(hdr->count) < (XFS_LBSIZE(mp)/8)));
+	ASSERT((index >= 0) && (index < be16_to_cpu(hdr->count)));
+	ASSERT(be16_to_cpu(hdr->firstused) >=
+	       ((be16_to_cpu(hdr->count)*sizeof(*entry))+sizeof(*hdr)));
 	entry = &leaf->entries[index];
-	ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) >= INT_GET(hdr->firstused, ARCH_CONVERT));
+	ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) >= be16_to_cpu(hdr->firstused));
 	ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) < XFS_LBSIZE(mp));
 
 	/*
@@ -1464,27 +1471,28 @@ xfs_dir_leaf_remove(xfs_trans_t *trans, xfs_dabuf_t *bp, int index)
 	 *    find smallest free region in case we need to replace it,
 	 *    adjust any map that borders the entry table,
 	 */
-	tablesize = INT_GET(hdr->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t)
+	tablesize = be16_to_cpu(hdr->count) * (uint)sizeof(xfs_dir_leaf_entry_t)
 			+ (uint)sizeof(xfs_dir_leaf_hdr_t);
 	map = &hdr->freemap[0];
-	tmp = INT_GET(map->size, ARCH_CONVERT);
+	tmp = be16_to_cpu(map->size);
 	before = after = -1;
 	smallest = XFS_DIR_LEAF_MAPSIZE - 1;
 	entsize = XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry);
 	for (i = 0; i < XFS_DIR_LEAF_MAPSIZE; map++, i++) {
-		ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp));
-		ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp));
-		if (INT_GET(map->base, ARCH_CONVERT) == tablesize) {
-			INT_MOD(map->base, ARCH_CONVERT, -((uint)sizeof(xfs_dir_leaf_entry_t)));
-			INT_MOD(map->size, ARCH_CONVERT, (uint)sizeof(xfs_dir_leaf_entry_t));
+		ASSERT(be16_to_cpu(map->base) < XFS_LBSIZE(mp));
+		ASSERT(be16_to_cpu(map->size) < XFS_LBSIZE(mp));
+		if (be16_to_cpu(map->base) == tablesize) {
+			int entry_size = sizeof(xfs_dir_leaf_entry_t);
+			be16_add(&map->base, -entry_size);
+			be16_add(&map->size, entry_size);
 		}
 
-		if ((INT_GET(map->base, ARCH_CONVERT) + INT_GET(map->size, ARCH_CONVERT)) == INT_GET(entry->nameidx, ARCH_CONVERT)) {
+		if ((be16_to_cpu(map->base) + be16_to_cpu(map->size)) == INT_GET(entry->nameidx, ARCH_CONVERT)) {
 			before = i;
-		} else if (INT_GET(map->base, ARCH_CONVERT) == (INT_GET(entry->nameidx, ARCH_CONVERT) + entsize)) {
+		} else if (be16_to_cpu(map->base) == (INT_GET(entry->nameidx, ARCH_CONVERT) + entsize)) {
 			after = i;
-		} else if (INT_GET(map->size, ARCH_CONVERT) < tmp) {
-			tmp = INT_GET(map->size, ARCH_CONVERT);
+		} else if (be16_to_cpu(map->size) < tmp) {
+			tmp = be16_to_cpu(map->size);
 			smallest = i;
 		}
 	}
@@ -1496,33 +1504,33 @@ xfs_dir_leaf_remove(xfs_trans_t *trans, xfs_dabuf_t *bp, int index)
 	if ((before >= 0) || (after >= 0)) {
 		if ((before >= 0) && (after >= 0)) {
 			map = &hdr->freemap[before];
-			INT_MOD(map->size, ARCH_CONVERT, entsize);
-			INT_MOD(map->size, ARCH_CONVERT, INT_GET(hdr->freemap[after].size, ARCH_CONVERT));
+			be16_add(&map->size, entsize);
+			be16_add(&map->size, be16_to_cpu(hdr->freemap[after].size));
 			hdr->freemap[after].base = 0;
 			hdr->freemap[after].size = 0;
 		} else if (before >= 0) {
 			map = &hdr->freemap[before];
-			INT_MOD(map->size, ARCH_CONVERT, entsize);
+			be16_add(&map->size, entsize);
 		} else {
 			map = &hdr->freemap[after];
-			INT_COPY(map->base, entry->nameidx, ARCH_CONVERT);
-			INT_MOD(map->size, ARCH_CONVERT, entsize);
+			map->base = entry->nameidx;
+			be16_add(&map->size, entsize);
 		}
 	} else {
 		/*
 		 * Replace smallest region (if it is smaller than free'd entry)
 		 */
 		map = &hdr->freemap[smallest];
-		if (INT_GET(map->size, ARCH_CONVERT) < entsize) {
-			INT_COPY(map->base, entry->nameidx, ARCH_CONVERT);
-			INT_SET(map->size, ARCH_CONVERT, entsize);
+		if (be16_to_cpu(map->size) < entsize) {
+			map->base = entry->nameidx;
+			map->size = cpu_to_be16(entsize);
 		}
 	}
 
 	/*
 	 * Did we remove the first entry?
 	 */
-	if (INT_GET(entry->nameidx, ARCH_CONVERT) == INT_GET(hdr->firstused, ARCH_CONVERT))
+	if (INT_GET(entry->nameidx, ARCH_CONVERT) == be16_to_cpu(hdr->firstused))
 		smallest = 1;
 	else
 		smallest = 0;
@@ -1534,13 +1542,13 @@ xfs_dir_leaf_remove(xfs_trans_t *trans, xfs_dabuf_t *bp, int index)
 	memset((char *)namest, 0, entsize);
 	xfs_da_log_buf(trans, bp, XFS_DA_LOGRANGE(leaf, namest, entsize));
 
-	INT_MOD(hdr->namebytes, ARCH_CONVERT, -(entry->namelen));
-	tmp = (INT_GET(hdr->count, ARCH_CONVERT) - index) * (uint)sizeof(xfs_dir_leaf_entry_t);
+	be16_add(&hdr->namebytes, -(entry->namelen));
+	tmp = (be16_to_cpu(hdr->count) - index) * (uint)sizeof(xfs_dir_leaf_entry_t);
 	memmove(entry, entry + 1, tmp);
-	INT_MOD(hdr->count, ARCH_CONVERT, -1);
+	be16_add(&hdr->count, -1);
 	xfs_da_log_buf(trans, bp,
 	    XFS_DA_LOGRANGE(leaf, entry, tmp + (uint)sizeof(*entry)));
-	entry = &leaf->entries[INT_GET(hdr->count, ARCH_CONVERT)];
+	entry = &leaf->entries[be16_to_cpu(hdr->count)];
 	memset((char *)entry, 0, sizeof(xfs_dir_leaf_entry_t));
 
 	/*
@@ -1552,15 +1560,16 @@ xfs_dir_leaf_remove(xfs_trans_t *trans, xfs_dabuf_t *bp, int index)
 	if (smallest) {
 		tmp = XFS_LBSIZE(mp);
 		entry = &leaf->entries[0];
-		for (i = INT_GET(hdr->count, ARCH_CONVERT)-1; i >= 0; entry++, i--) {
-			ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) >= INT_GET(hdr->firstused, ARCH_CONVERT));
+		for (i = be16_to_cpu(hdr->count)-1; i >= 0; entry++, i--) {
+			ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) >=
+			       be16_to_cpu(hdr->firstused));
 			ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) < XFS_LBSIZE(mp));
 			if (INT_GET(entry->nameidx, ARCH_CONVERT) < tmp)
 				tmp = INT_GET(entry->nameidx, ARCH_CONVERT);
 		}
-		INT_SET(hdr->firstused, ARCH_CONVERT, tmp);
+		hdr->firstused = cpu_to_be16(tmp);
 		if (!hdr->firstused)
-			INT_SET(hdr->firstused, ARCH_CONVERT, tmp - 1);
+			hdr->firstused = cpu_to_be16(tmp - 1);
 	} else {
 		hdr->holes = 1;		/* mark as needing compaction */
 	}
@@ -1572,9 +1581,9 @@ xfs_dir_leaf_remove(xfs_trans_t *trans, xfs_dabuf_t *bp, int index)
 	 * "join" the leaf with a sibling if so.
 	 */
 	tmp  = (uint)sizeof(xfs_dir_leaf_hdr_t);
-	tmp += INT_GET(leaf->hdr.count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t);
-	tmp += INT_GET(leaf->hdr.count, ARCH_CONVERT) * ((uint)sizeof(xfs_dir_leaf_name_t) - 1);
-	tmp += INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
+	tmp += be16_to_cpu(leaf->hdr.count) * (uint)sizeof(xfs_dir_leaf_entry_t);
+	tmp += be16_to_cpu(leaf->hdr.count) * ((uint)sizeof(xfs_dir_leaf_name_t) - 1);
+	tmp += be16_to_cpu(leaf->hdr.namebytes);
 	if (tmp < mp->m_dir_magicpct)
 		return 1;			/* leaf is < 37% full */
 	return 0;
@@ -1608,7 +1617,7 @@ xfs_dir_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
 	/*
 	 * Save last hashval from dying block for later Btree fixup.
 	 */
-	drop_blk->hashval = INT_GET(drop_leaf->entries[ drop_leaf->hdr.count-1 ].hashval, ARCH_CONVERT);
+	drop_blk->hashval = INT_GET(drop_leaf->entries[be16_to_cpu(drop_leaf->hdr.count)-1].hashval, ARCH_CONVERT);
 
 	/*
 	 * Check if we need a temp buffer, or can we do it in place.
@@ -1622,11 +1631,11 @@ xfs_dir_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
 		 */
 		if (xfs_dir_leaf_order(save_blk->bp, drop_blk->bp)) {
 			xfs_dir_leaf_moveents(drop_leaf, 0, save_leaf, 0,
-						 (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
+					be16_to_cpu(drop_hdr->count), mp);
 		} else {
 			xfs_dir_leaf_moveents(drop_leaf, 0,
-					      save_leaf, INT_GET(save_hdr->count, ARCH_CONVERT),
-					      (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
+					save_leaf, be16_to_cpu(save_hdr->count),
+					be16_to_cpu(drop_hdr->count), mp);
 		}
 	} else {
 		/*
@@ -1640,22 +1649,22 @@ xfs_dir_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
 		tmp_hdr = &tmp_leaf->hdr;
 		tmp_hdr->info = save_hdr->info;	/* struct copy */
 		tmp_hdr->count = 0;
-		INT_SET(tmp_hdr->firstused, ARCH_CONVERT, state->blocksize);
+		tmp_hdr->firstused = cpu_to_be16(state->blocksize);
 		if (!tmp_hdr->firstused)
-			INT_SET(tmp_hdr->firstused, ARCH_CONVERT, state->blocksize - 1);
+			tmp_hdr->firstused = cpu_to_be16(state->blocksize - 1);
 		tmp_hdr->namebytes = 0;
 		if (xfs_dir_leaf_order(save_blk->bp, drop_blk->bp)) {
 			xfs_dir_leaf_moveents(drop_leaf, 0, tmp_leaf, 0,
-						 (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
+					be16_to_cpu(drop_hdr->count), mp);
 			xfs_dir_leaf_moveents(save_leaf, 0,
-					      tmp_leaf, INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT),
-					      (int)INT_GET(save_hdr->count, ARCH_CONVERT), mp);
+					tmp_leaf, be16_to_cpu(tmp_leaf->hdr.count),
+					be16_to_cpu(save_hdr->count), mp);
 		} else {
 			xfs_dir_leaf_moveents(save_leaf, 0, tmp_leaf, 0,
-						 (int)INT_GET(save_hdr->count, ARCH_CONVERT), mp);
+						 be16_to_cpu(save_hdr->count), mp);
 			xfs_dir_leaf_moveents(drop_leaf, 0,
-					      tmp_leaf, INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT),
-					      (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
+					      tmp_leaf, be16_to_cpu(tmp_leaf->hdr.count),
+					      be16_to_cpu(drop_hdr->count), mp);
 		}
 		memcpy(save_leaf, tmp_leaf, state->blocksize);
 		kmem_free(tmpbuffer, state->blocksize);
@@ -1667,7 +1676,7 @@ xfs_dir_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
 	/*
 	 * Copy out last hashval in each block for B-tree code.
 	 */
-	save_blk->hashval = INT_GET(save_leaf->entries[ INT_GET(save_leaf->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
+	save_blk->hashval = INT_GET(save_leaf->entries[be16_to_cpu(save_leaf->hdr.count)-1 ].hashval, ARCH_CONVERT);
 }
 
 /*========================================================================
@@ -1697,13 +1706,13 @@ xfs_dir_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args, int *index)
 
 	leaf = bp->data;
 	ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-	ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT) < (XFS_LBSIZE(args->dp->i_mount)/8));
+	ASSERT(be16_to_cpu(leaf->hdr.count) < (XFS_LBSIZE(args->dp->i_mount)/8));
 
 	/*
 	 * Binary search.  (note: small blocks will skip this loop)
 	 */
 	hashval = args->hashval;
-	probe = span = INT_GET(leaf->hdr.count, ARCH_CONVERT) / 2;
+	probe = span = be16_to_cpu(leaf->hdr.count) / 2;
 	for (entry = &leaf->entries[probe]; span > 4;
 		   entry = &leaf->entries[probe]) {
 		span /= 2;
@@ -1715,7 +1724,7 @@ xfs_dir_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args, int *index)
 			break;
 	}
 	ASSERT((probe >= 0) && \
-	       ((!leaf->hdr.count) || (probe < INT_GET(leaf->hdr.count, ARCH_CONVERT))));
+	       ((!leaf->hdr.count) || (probe < be16_to_cpu(leaf->hdr.count))));
 	ASSERT((span <= 4) || (INT_GET(entry->hashval, ARCH_CONVERT) == hashval));
 
 	/*
@@ -1726,11 +1735,11 @@ xfs_dir_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args, int *index)
 		entry--;
 		probe--;
 	}
-	while ((probe < INT_GET(leaf->hdr.count, ARCH_CONVERT)) && (INT_GET(entry->hashval, ARCH_CONVERT) < hashval)) {
+	while ((probe < be16_to_cpu(leaf->hdr.count)) && (INT_GET(entry->hashval, ARCH_CONVERT) < hashval)) {
 		entry++;
 		probe++;
 	}
-	if ((probe == INT_GET(leaf->hdr.count, ARCH_CONVERT)) || (INT_GET(entry->hashval, ARCH_CONVERT) != hashval)) {
+	if ((probe == be16_to_cpu(leaf->hdr.count)) || (INT_GET(entry->hashval, ARCH_CONVERT) != hashval)) {
 		*index = probe;
 		ASSERT(args->oknoent);
 		return XFS_ERROR(ENOENT);
@@ -1739,7 +1748,7 @@ xfs_dir_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args, int *index)
 	/*
 	 * Duplicate keys may be present, so search all of them for a match.
 	 */
-	while ((probe < INT_GET(leaf->hdr.count, ARCH_CONVERT)) && (INT_GET(entry->hashval, ARCH_CONVERT) == hashval)) {
+	while ((probe < be16_to_cpu(leaf->hdr.count)) && (INT_GET(entry->hashval, ARCH_CONVERT) == hashval)) {
 		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
 		if (entry->namelen == args->namelen &&
 		    namest->name[0] == args->name[0] &&
@@ -1752,7 +1761,7 @@ xfs_dir_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args, int *index)
 		probe++;
 	}
 	*index = probe;
-	ASSERT(probe == INT_GET(leaf->hdr.count, ARCH_CONVERT) || args->oknoent);
+	ASSERT(probe == be16_to_cpu(leaf->hdr.count) || args->oknoent);
 	return XFS_ERROR(ENOENT);
 }
 
@@ -1787,22 +1796,22 @@ xfs_dir_leaf_moveents(xfs_dir_leafblock_t *leaf_s, int start_s,
 	ASSERT(be16_to_cpu(leaf_d->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
 	hdr_s = &leaf_s->hdr;
 	hdr_d = &leaf_d->hdr;
-	ASSERT((INT_GET(hdr_s->count, ARCH_CONVERT) > 0) && (INT_GET(hdr_s->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8)));
-	ASSERT(INT_GET(hdr_s->firstused, ARCH_CONVERT) >=
-		((INT_GET(hdr_s->count, ARCH_CONVERT)*sizeof(*entry_s))+sizeof(*hdr_s)));
-	ASSERT(INT_GET(hdr_d->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8));
-	ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >=
-		((INT_GET(hdr_d->count, ARCH_CONVERT)*sizeof(*entry_d))+sizeof(*hdr_d)));
+	ASSERT(hdr_s->count && (be16_to_cpu(hdr_s->count) < (XFS_LBSIZE(mp)/8)));
+	ASSERT(be16_to_cpu(hdr_s->firstused) >=
+		((be16_to_cpu(hdr_s->count)*sizeof(*entry_s))+sizeof(*hdr_s)));
+	ASSERT(be16_to_cpu(hdr_d->count) < (XFS_LBSIZE(mp)/8));
+	ASSERT(be16_to_cpu(hdr_d->firstused) >=
+		((be16_to_cpu(hdr_d->count)*sizeof(*entry_d))+sizeof(*hdr_d)));
 
-	ASSERT(start_s < INT_GET(hdr_s->count, ARCH_CONVERT));
-	ASSERT(start_d <= INT_GET(hdr_d->count, ARCH_CONVERT));
-	ASSERT(count <= INT_GET(hdr_s->count, ARCH_CONVERT));
+	ASSERT(start_s < be16_to_cpu(hdr_s->count));
+	ASSERT(start_d <= be16_to_cpu(hdr_d->count));
+	ASSERT(count <= be16_to_cpu(hdr_s->count));
 
 	/*
 	 * Move the entries in the destination leaf up to make a hole?
 	 */
-	if (start_d < INT_GET(hdr_d->count, ARCH_CONVERT)) {
-		tmp  = INT_GET(hdr_d->count, ARCH_CONVERT) - start_d;
+	if (start_d < be16_to_cpu(hdr_d->count)) {
+		tmp  = be16_to_cpu(hdr_d->count) - start_d;
 		tmp *= (uint)sizeof(xfs_dir_leaf_entry_t);
 		entry_s = &leaf_d->entries[start_d];
 		entry_d = &leaf_d->entries[start_d + count];
@@ -1816,11 +1825,12 @@ xfs_dir_leaf_moveents(xfs_dir_leafblock_t *leaf_s, int start_s,
 	entry_s = &leaf_s->entries[start_s];
 	entry_d = &leaf_d->entries[start_d];
 	for (i = 0; i < count; entry_s++, entry_d++, i++) {
-		ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT) >= INT_GET(hdr_s->firstused, ARCH_CONVERT));
+		ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT) >=
+		       be16_to_cpu(hdr_s->firstused));
 		tmp = XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry_s);
-		INT_MOD(hdr_d->firstused, ARCH_CONVERT, -(tmp));
-		entry_d->hashval = entry_s->hashval; /* INT_: direct copy */
-		INT_COPY(entry_d->nameidx, hdr_d->firstused, ARCH_CONVERT);
+		be16_add(&hdr_d->firstused, -(tmp));
+		entry_d->hashval = entry_s->hashval;
+		entry_d->nameidx = hdr_d->firstused;
 		entry_d->namelen = entry_s->namelen;
 		ASSERT(INT_GET(entry_d->nameidx, ARCH_CONVERT) + tmp <= XFS_LBSIZE(mp));
 		memcpy(XFS_DIR_LEAF_NAMESTRUCT(leaf_d, INT_GET(entry_d->nameidx, ARCH_CONVERT)),
@@ -1828,20 +1838,20 @@ xfs_dir_leaf_moveents(xfs_dir_leafblock_t *leaf_s, int start_s,
 		ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT) + tmp <= XFS_LBSIZE(mp));
 		memset((char *)XFS_DIR_LEAF_NAMESTRUCT(leaf_s, INT_GET(entry_s->nameidx, ARCH_CONVERT)),
 		      0, tmp);
-		INT_MOD(hdr_s->namebytes, ARCH_CONVERT, -(entry_d->namelen));
-		INT_MOD(hdr_d->namebytes, ARCH_CONVERT, entry_d->namelen);
-		INT_MOD(hdr_s->count, ARCH_CONVERT, -1);
-		INT_MOD(hdr_d->count, ARCH_CONVERT, +1);
-		tmp  = INT_GET(hdr_d->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t)
+		be16_add(&hdr_s->namebytes, -(entry_d->namelen));
+		be16_add(&hdr_d->namebytes, entry_d->namelen);
+		be16_add(&hdr_s->count, -1);
+		be16_add(&hdr_d->count, +1);
+		tmp = be16_to_cpu(hdr_d->count) * (uint)sizeof(xfs_dir_leaf_entry_t)
 				+ (uint)sizeof(xfs_dir_leaf_hdr_t);
-		ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >= tmp);
+		ASSERT(be16_to_cpu(hdr_d->firstused) >= tmp);
 
 	}
 
 	/*
 	 * Zero out the entries we just copied.
 	 */
-	if (start_s == INT_GET(hdr_s->count, ARCH_CONVERT)) {
+	if (start_s == be16_to_cpu(hdr_s->count)) {
 		tmp = count * (uint)sizeof(xfs_dir_leaf_entry_t);
 		entry_s = &leaf_s->entries[start_s];
 		ASSERT((char *)entry_s + tmp <= (char *)leaf_s + XFS_LBSIZE(mp));
@@ -1851,14 +1861,14 @@ xfs_dir_leaf_moveents(xfs_dir_leafblock_t *leaf_s, int start_s,
 		 * Move the remaining entries down to fill the hole,
 		 * then zero the entries at the top.
 		 */
-		tmp  = INT_GET(hdr_s->count, ARCH_CONVERT) - count;
+		tmp  = be16_to_cpu(hdr_s->count) - count;
 		tmp *= (uint)sizeof(xfs_dir_leaf_entry_t);
 		entry_s = &leaf_s->entries[start_s + count];
 		entry_d = &leaf_s->entries[start_s];
 		memcpy(entry_d, entry_s, tmp);
 
 		tmp = count * (uint)sizeof(xfs_dir_leaf_entry_t);
-		entry_s = &leaf_s->entries[INT_GET(hdr_s->count, ARCH_CONVERT)];
+		entry_s = &leaf_s->entries[be16_to_cpu(hdr_s->count)];
 		ASSERT((char *)entry_s + tmp <= (char *)leaf_s + XFS_LBSIZE(mp));
 		memset((char *)entry_s, 0, tmp);
 	}
@@ -1866,11 +1876,14 @@ xfs_dir_leaf_moveents(xfs_dir_leafblock_t *leaf_s, int start_s,
 	/*
 	 * Fill in the freemap information
 	 */
-	INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT, (uint)sizeof(xfs_dir_leaf_hdr_t));
-	INT_MOD(hdr_d->freemap[0].base, ARCH_CONVERT, INT_GET(hdr_d->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t));
-	INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT, INT_GET(hdr_d->firstused, ARCH_CONVERT) - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT));
-	INT_SET(hdr_d->freemap[1].base, ARCH_CONVERT, (hdr_d->freemap[2].base = 0));
-	INT_SET(hdr_d->freemap[1].size, ARCH_CONVERT, (hdr_d->freemap[2].size = 0));
+	hdr_d->freemap[0].base = cpu_to_be16(sizeof(xfs_dir_leaf_hdr_t) +
+			be16_to_cpu(hdr_d->count) * sizeof(xfs_dir_leaf_entry_t));
+	hdr_d->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr_d->firstused) -
+			be16_to_cpu(hdr_d->freemap[0].base));
+	hdr_d->freemap[1].base = 0;
+	hdr_d->freemap[1].size = 0;
+	hdr_d->freemap[2].base = 0;
+	hdr_d->freemap[2].size = 0;
 	hdr_s->holes = 1;	/* leaf may not be compact */
 }
 
@@ -1886,11 +1899,11 @@ xfs_dir_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp)
 	leaf2 = leaf2_bp->data;
 	ASSERT((be16_to_cpu(leaf1->hdr.info.magic) == XFS_DIR_LEAF_MAGIC) &&
 	       (be16_to_cpu(leaf2->hdr.info.magic) == XFS_DIR_LEAF_MAGIC));
-	if ((INT_GET(leaf1->hdr.count, ARCH_CONVERT) > 0) && (INT_GET(leaf2->hdr.count, ARCH_CONVERT) > 0) &&
+	if (leaf1->hdr.count && leaf2->hdr.count &&
 	    ((INT_GET(leaf2->entries[ 0 ].hashval, ARCH_CONVERT) <
 	      INT_GET(leaf1->entries[ 0 ].hashval, ARCH_CONVERT)) ||
-	     (INT_GET(leaf2->entries[ INT_GET(leaf2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) <
-	      INT_GET(leaf1->entries[ INT_GET(leaf1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)))) {
+	     (INT_GET(leaf2->entries[be16_to_cpu(leaf2->hdr.count)-1].hashval, ARCH_CONVERT) <
+	      INT_GET(leaf1->entries[be16_to_cpu(leaf1->hdr.count)-1].hashval, ARCH_CONVERT)))) {
 		return 1;
 	}
 	return 0;
@@ -1907,10 +1920,10 @@ xfs_dir_leaf_lasthash(xfs_dabuf_t *bp, int *count)
 	leaf = bp->data;
 	ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
 	if (count)
-		*count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+		*count = be16_to_cpu(leaf->hdr.count);
 	if (!leaf->hdr.count)
 		return(0);
-	return(INT_GET(leaf->entries[ INT_GET(leaf->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT));
+	return(INT_GET(leaf->entries[be16_to_cpu(leaf->hdr.count)-1].hashval, ARCH_CONVERT));
 }
 
 /*
@@ -1956,8 +1969,7 @@ xfs_dir_leaf_getdents_int(
 	 * Re-find our place.
 	 */
 	for (i = entno = 0, entry = &leaf->entries[0];
-		     i < INT_GET(leaf->hdr.count, ARCH_CONVERT);
-			     entry++, i++) {
+		     i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
 
 		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf,
 				    INT_GET(entry->nameidx, ARCH_CONVERT));
@@ -1991,7 +2003,7 @@ xfs_dir_leaf_getdents_int(
 		}
 	}
 
-	if (i == INT_GET(leaf->hdr.count, ARCH_CONVERT)) {
+	if (i == be16_to_cpu(leaf->hdr.count)) {
 		xfs_dir_trace_g_du("leaf: hash not found", dp, uio);
 		if (!leaf->hdr.info.forw)
 			uio->uio_offset =
@@ -2012,7 +2024,7 @@ xfs_dir_leaf_getdents_int(
 	/*
 	 * We're synchronized, start copying entries out to the user.
 	 */
-	for (; entno >= 0 && i < INT_GET(leaf->hdr.count, ARCH_CONVERT);
+	for (; entno >= 0 && i < be16_to_cpu(leaf->hdr.count);
 			     entry++, i++, (entno = nextentno)) {
 		int lastresid=0, retval;
 		xfs_dircook_t lastoffset;
@@ -2037,7 +2049,7 @@ xfs_dir_leaf_getdents_int(
 		xfs_dir_trace_g_duc("leaf: middle cookie  ",
 						   dp, uio, p.cook.o);
 
-		if (i < (INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1)) {
+		if (i < (be16_to_cpu(leaf->hdr.count) - 1)) {
 			nexthash = INT_GET(entry[1].hashval, ARCH_CONVERT);
 
 			if (nexthash == INT_GET(entry->hashval, ARCH_CONVERT))
diff --git a/fs/xfs/xfs_dir_leaf.h b/fs/xfs/xfs_dir_leaf.h
index eb8cd9a4667..c6979ab829c 100644
--- a/fs/xfs/xfs_dir_leaf.h
+++ b/fs/xfs/xfs_dir_leaf.h
@@ -68,17 +68,17 @@ struct xfs_trans;
 #define XFS_DIR_LEAF_MAPSIZE	3	/* how many freespace slots */
 
 typedef struct xfs_dir_leaf_map {	/* RLE map of free bytes */
-	__uint16_t	base;	 	/* base of free region */
-	__uint16_t	size; 		/* run length of free region */
+	__be16		base;	 	/* base of free region */
+	__be16		size; 		/* run length of free region */
 } xfs_dir_leaf_map_t;
 
 typedef struct xfs_dir_leaf_hdr {	/* constant-structure header block */
 	xfs_da_blkinfo_t info;		/* block type, links, etc. */
-	__uint16_t	count;		/* count of active leaf_entry's */
-	__uint16_t	namebytes;	/* num bytes of name strings stored */
-	__uint16_t	firstused;	/* first used byte in name area */
-	__uint8_t	holes;		/* != 0 if blk needs compaction */
-	__uint8_t	pad1;
+	__be16		count;		/* count of active leaf_entry's */
+	__be16		namebytes;	/* num bytes of name strings stored */
+	__be16		firstused;	/* first used byte in name area */
+	__u8		holes;		/* != 0 if blk needs compaction */
+	__u8		pad1;
 	xfs_dir_leaf_map_t freemap[XFS_DIR_LEAF_MAPSIZE];
 } xfs_dir_leaf_hdr_t;
 
-- 
cgit v1.2.3


From 1d8daf06f67c8920a640eb61b30c3176ecc52405 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 9 Jun 2006 14:50:37 +1000
Subject: [XFS] endianess annotations for xfs_dir_leaf_entry_t

SGI-PV: 943272
SGI-Modid: xfs-linux-melb:xfs-kern:25808a

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/xfs_da_btree.c |   4 +-
 fs/xfs/xfs_dir.c      |  19 ++++----
 fs/xfs/xfs_dir_leaf.c | 127 ++++++++++++++++++++++++++------------------------
 fs/xfs/xfs_dir_leaf.h |   8 ++--
 4 files changed, 80 insertions(+), 78 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 296aa525ac4..260c3d770c0 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1785,8 +1785,8 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
 		ASSERT(XFS_DIR_IS_V1(mp));
 		dead_leaf = (xfs_dir_leafblock_t *)dead_info;
 		dead_level = 0;
-		dead_hash =
-			INT_GET(dead_leaf->entries[be16_to_cpu(dead_leaf->hdr.count) - 1].hashval, ARCH_CONVERT);
+		dead_hash = be32_to_cpu(dead_leaf->entries[
+				be16_to_cpu(dead_leaf->hdr.count) - 1].hashval);
 	} else if (be16_to_cpu(dead_info->magic) == XFS_DIR2_LEAFN_MAGIC) {
 		ASSERT(XFS_DIR_IS_V2(mp));
 		dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
diff --git a/fs/xfs/xfs_dir.c b/fs/xfs/xfs_dir.c
index a3b0e97dbf9..3cd8657a81f 100644
--- a/fs/xfs/xfs_dir.c
+++ b/fs/xfs/xfs_dir.c
@@ -710,7 +710,7 @@ xfs_dir_leaf_replace(xfs_da_args_t *args)
 	if (retval == EEXIST) {
 		leaf = bp->data;
 		entry = &leaf->entries[index];
-		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
+		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, be16_to_cpu(entry->nameidx));
 		/* XXX - replace assert? */
 		XFS_DIR_SF_PUT_DIRINO(&inum, &namest->inumber);
 		xfs_da_log_buf(args->trans, bp,
@@ -918,14 +918,14 @@ xfs_dir_node_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio,
 			xfs_da_brelse(trans, bp);
 			bp = NULL;
 		}
-		if (bp && INT_GET(leaf->entries[0].hashval, ARCH_CONVERT) > cookhash) {
+		if (bp && be32_to_cpu(leaf->entries[0].hashval) > cookhash) {
 			xfs_dir_trace_g_dub("node: leaf hash too large",
 						   dp, uio, bno);
 			xfs_da_brelse(trans, bp);
 			bp = NULL;
 		}
-		if (bp &&
-		    cookhash > INT_GET(leaf->entries[be16_to_cpu(leaf->hdr.count) - 1].hashval, ARCH_CONVERT)) {
+		if (bp && cookhash > be32_to_cpu(leaf->entries[
+					be16_to_cpu(leaf->hdr.count) - 1].hashval)) {
 			xfs_dir_trace_g_dub("node: leaf hash too small",
 						   dp, uio, bno);
 			xfs_da_brelse(trans, bp);
@@ -1059,7 +1059,7 @@ xfs_dir_node_replace(xfs_da_args_t *args)
 		bp = blk->bp;
 		leaf = bp->data;
 		entry = &leaf->entries[blk->index];
-		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
+		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, be16_to_cpu(entry->nameidx));
 		/* XXX - replace assert ? */
 		XFS_DIR_SF_PUT_DIRINO(&inum, &namest->inumber);
 		xfs_da_log_buf(args->trans, bp,
@@ -1151,10 +1151,8 @@ xfs_dir_trace_g_dul(char *where, xfs_inode_t *dp, uio_t *uio,
 		     (void *)(unsigned long)uio->uio_resid,
 		     (void *)(unsigned long)be32_to_cpu(leaf->hdr.info.forw),
 		     (void *)(unsigned long)be16_to_cpu(leaf->hdr.count),
-		     (void *)(unsigned long)
-			INT_GET(leaf->entries[0].hashval, ARCH_CONVERT),
-		     (void *)(unsigned long)
-			INT_GET(leaf->entries[last].hashval, ARCH_CONVERT),
+		     (void *)(unsigned long)be32_to_cpu(leaf->entries[0].hashval),
+		     (void *)(unsigned long)be32_to_cpu(leaf->entries[last].hashval),
 		     NULL, NULL, NULL);
 }
 
@@ -1170,8 +1168,7 @@ xfs_dir_trace_g_due(char *where, xfs_inode_t *dp, uio_t *uio,
 		     (void *)((unsigned long)(uio->uio_offset >> 32)),
 		     (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
 		     (void *)(unsigned long)uio->uio_resid,
-		     (void *)(unsigned long)
-			INT_GET(entry->hashval, ARCH_CONVERT),
+		     (void *)(unsigned long)be32_to_cpu(entry->hashval),
 		     NULL, NULL, NULL, NULL, NULL, NULL);
 }
 
diff --git a/fs/xfs/xfs_dir_leaf.c b/fs/xfs/xfs_dir_leaf.c
index 6a4d8caaba7..e7121040cd9 100644
--- a/fs/xfs/xfs_dir_leaf.c
+++ b/fs/xfs/xfs_dir_leaf.c
@@ -652,7 +652,7 @@ xfs_dir_leaf_to_shortform(xfs_da_args_t *iargs)
 	hdr = &leaf->hdr;
 	entry = &leaf->entries[0];
 	for (i = be16_to_cpu(hdr->count)-1; i >= 0; entry++, i--) {
-		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
+		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, be16_to_cpu(entry->nameidx));
 		if ((entry->namelen == 2) &&
 		    (namest->name[0] == '.') &&
 		    (namest->name[1] == '.')) {
@@ -684,10 +684,10 @@ xfs_dir_leaf_to_shortform(xfs_da_args_t *iargs)
 	for (i = 0; i < be16_to_cpu(hdr->count); entry++, i++) {
 		if (!entry->nameidx)
 			continue;
-		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
+		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, be16_to_cpu(entry->nameidx));
 		args.name = (char *)(namest->name);
 		args.namelen = entry->namelen;
-		args.hashval = INT_GET(entry->hashval, ARCH_CONVERT);
+		args.hashval = be32_to_cpu(entry->hashval);
 		XFS_DIR_SF_GET_DIRINO(&namest->inumber, &args.inumber);
 		xfs_dir_shortform_addname(&args);
 	}
@@ -742,9 +742,7 @@ xfs_dir_leaf_to_node(xfs_da_args_t *args)
 	node = bp1->data;
 	leaf = bp2->data;
 	ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-	node->btree[0].hashval = cpu_to_be32(
-		INT_GET(leaf->entries[
-			be16_to_cpu(leaf->hdr.count)-1].hashval, ARCH_CONVERT));
+	node->btree[0].hashval = leaf->entries[be16_to_cpu(leaf->hdr.count)-1].hashval;
 	xfs_da_buf_done(bp2);
 	node->btree[0].before = cpu_to_be32(blkno);
 	node->hdr.count = cpu_to_be16(1);
@@ -970,8 +968,9 @@ xfs_dir_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int index,
 	ASSERT(be16_to_cpu(map->size) < XFS_LBSIZE(mp));
 
 	be16_add(&map->size, -(XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen)));
-	INT_SET(entry->nameidx, ARCH_CONVERT, be16_to_cpu(map->base) + be16_to_cpu(map->size));
-	INT_SET(entry->hashval, ARCH_CONVERT, args->hashval);
+	entry->nameidx = cpu_to_be16(be16_to_cpu(map->base) +
+				     be16_to_cpu(map->size));
+	entry->hashval = cpu_to_be32(args->hashval);
 	entry->namelen = args->namelen;
 	xfs_da_log_buf(args->trans, bp,
 	    XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
@@ -979,7 +978,7 @@ xfs_dir_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int index,
 	/*
 	 * Copy the string and inode number into the new space.
 	 */
-	namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
+	namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, be16_to_cpu(entry->nameidx));
 	XFS_DIR_SF_PUT_DIRINO(&args->inumber, &namest->inumber);
 	memcpy(namest->name, args->name, args->namelen);
 	xfs_da_log_buf(args->trans, bp,
@@ -988,7 +987,7 @@ xfs_dir_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int index,
 	/*
 	 * Update the control info for this leaf node
 	 */
-	if (INT_GET(entry->nameidx, ARCH_CONVERT) < be16_to_cpu(hdr->firstused))
+	if (be16_to_cpu(entry->nameidx) < be16_to_cpu(hdr->firstused))
 		hdr->firstused = entry->nameidx;
 	ASSERT(be16_to_cpu(hdr->firstused) >=
 	       ((be16_to_cpu(hdr->count)*sizeof(*entry))+sizeof(*hdr)));
@@ -1198,8 +1197,10 @@ xfs_dir_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 	/*
 	 * Copy out last hashval in each block for B-tree code.
 	 */
-	blk1->hashval = INT_GET(leaf1->entries[be16_to_cpu(leaf1->hdr.count)-1].hashval, ARCH_CONVERT);
-	blk2->hashval = INT_GET(leaf2->entries[be16_to_cpu(leaf2->hdr.count)-1].hashval, ARCH_CONVERT);
+	blk1->hashval = be32_to_cpu(leaf1->entries[
+			be16_to_cpu(leaf1->hdr.count)-1].hashval);
+	blk2->hashval = be32_to_cpu(leaf2->entries[
+			be16_to_cpu(leaf2->hdr.count)-1].hashval);
 
 	/*
 	 * Adjust the expected index for insertion.
@@ -1462,8 +1463,8 @@ xfs_dir_leaf_remove(xfs_trans_t *trans, xfs_dabuf_t *bp, int index)
 	ASSERT(be16_to_cpu(hdr->firstused) >=
 	       ((be16_to_cpu(hdr->count)*sizeof(*entry))+sizeof(*hdr)));
 	entry = &leaf->entries[index];
-	ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) >= be16_to_cpu(hdr->firstused));
-	ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) < XFS_LBSIZE(mp));
+	ASSERT(be16_to_cpu(entry->nameidx) >= be16_to_cpu(hdr->firstused));
+	ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp));
 
 	/*
 	 * Scan through free region table:
@@ -1487,9 +1488,11 @@ xfs_dir_leaf_remove(xfs_trans_t *trans, xfs_dabuf_t *bp, int index)
 			be16_add(&map->size, entry_size);
 		}
 
-		if ((be16_to_cpu(map->base) + be16_to_cpu(map->size)) == INT_GET(entry->nameidx, ARCH_CONVERT)) {
+		if ((be16_to_cpu(map->base) + be16_to_cpu(map->size)) ==
+				be16_to_cpu(entry->nameidx)) {
 			before = i;
-		} else if (be16_to_cpu(map->base) == (INT_GET(entry->nameidx, ARCH_CONVERT) + entsize)) {
+		} else if (be16_to_cpu(map->base) ==
+				(be16_to_cpu(entry->nameidx) + entsize)) {
 			after = i;
 		} else if (be16_to_cpu(map->size) < tmp) {
 			tmp = be16_to_cpu(map->size);
@@ -1530,7 +1533,7 @@ xfs_dir_leaf_remove(xfs_trans_t *trans, xfs_dabuf_t *bp, int index)
 	/*
 	 * Did we remove the first entry?
 	 */
-	if (INT_GET(entry->nameidx, ARCH_CONVERT) == be16_to_cpu(hdr->firstused))
+	if (be16_to_cpu(entry->nameidx) == be16_to_cpu(hdr->firstused))
 		smallest = 1;
 	else
 		smallest = 0;
@@ -1538,7 +1541,7 @@ xfs_dir_leaf_remove(xfs_trans_t *trans, xfs_dabuf_t *bp, int index)
 	/*
 	 * Compress the remaining entries and zero out the removed stuff.
 	 */
-	namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
+	namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, be16_to_cpu(entry->nameidx));
 	memset((char *)namest, 0, entsize);
 	xfs_da_log_buf(trans, bp, XFS_DA_LOGRANGE(leaf, namest, entsize));
 
@@ -1561,11 +1564,11 @@ xfs_dir_leaf_remove(xfs_trans_t *trans, xfs_dabuf_t *bp, int index)
 		tmp = XFS_LBSIZE(mp);
 		entry = &leaf->entries[0];
 		for (i = be16_to_cpu(hdr->count)-1; i >= 0; entry++, i--) {
-			ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) >=
+			ASSERT(be16_to_cpu(entry->nameidx) >=
 			       be16_to_cpu(hdr->firstused));
-			ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) < XFS_LBSIZE(mp));
-			if (INT_GET(entry->nameidx, ARCH_CONVERT) < tmp)
-				tmp = INT_GET(entry->nameidx, ARCH_CONVERT);
+			ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp));
+			if (be16_to_cpu(entry->nameidx) < tmp)
+				tmp = be16_to_cpu(entry->nameidx);
 		}
 		hdr->firstused = cpu_to_be16(tmp);
 		if (!hdr->firstused)
@@ -1617,7 +1620,8 @@ xfs_dir_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
 	/*
 	 * Save last hashval from dying block for later Btree fixup.
 	 */
-	drop_blk->hashval = INT_GET(drop_leaf->entries[be16_to_cpu(drop_leaf->hdr.count)-1].hashval, ARCH_CONVERT);
+	drop_blk->hashval = be32_to_cpu(drop_leaf->entries[
+			be16_to_cpu(drop_leaf->hdr.count)-1].hashval);
 
 	/*
 	 * Check if we need a temp buffer, or can we do it in place.
@@ -1676,7 +1680,8 @@ xfs_dir_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
 	/*
 	 * Copy out last hashval in each block for B-tree code.
 	 */
-	save_blk->hashval = INT_GET(save_leaf->entries[be16_to_cpu(save_leaf->hdr.count)-1 ].hashval, ARCH_CONVERT);
+	save_blk->hashval = be32_to_cpu(save_leaf->entries[
+			be16_to_cpu(save_leaf->hdr.count)-1].hashval);
 }
 
 /*========================================================================
@@ -1716,30 +1721,32 @@ xfs_dir_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args, int *index)
 	for (entry = &leaf->entries[probe]; span > 4;
 		   entry = &leaf->entries[probe]) {
 		span /= 2;
-		if (INT_GET(entry->hashval, ARCH_CONVERT) < hashval)
+		if (be32_to_cpu(entry->hashval) < hashval)
 			probe += span;
-		else if (INT_GET(entry->hashval, ARCH_CONVERT) > hashval)
+		else if (be32_to_cpu(entry->hashval) > hashval)
 			probe -= span;
 		else
 			break;
 	}
 	ASSERT((probe >= 0) && \
 	       ((!leaf->hdr.count) || (probe < be16_to_cpu(leaf->hdr.count))));
-	ASSERT((span <= 4) || (INT_GET(entry->hashval, ARCH_CONVERT) == hashval));
+	ASSERT((span <= 4) || (be32_to_cpu(entry->hashval) == hashval));
 
 	/*
 	 * Since we may have duplicate hashval's, find the first matching
 	 * hashval in the leaf.
 	 */
-	while ((probe > 0) && (INT_GET(entry->hashval, ARCH_CONVERT) >= hashval)) {
+	while ((probe > 0) && (be32_to_cpu(entry->hashval) >= hashval)) {
 		entry--;
 		probe--;
 	}
-	while ((probe < be16_to_cpu(leaf->hdr.count)) && (INT_GET(entry->hashval, ARCH_CONVERT) < hashval)) {
+	while ((probe < be16_to_cpu(leaf->hdr.count)) &&
+	       (be32_to_cpu(entry->hashval) < hashval)) {
 		entry++;
 		probe++;
 	}
-	if ((probe == be16_to_cpu(leaf->hdr.count)) || (INT_GET(entry->hashval, ARCH_CONVERT) != hashval)) {
+	if ((probe == be16_to_cpu(leaf->hdr.count)) ||
+	    (be32_to_cpu(entry->hashval) != hashval)) {
 		*index = probe;
 		ASSERT(args->oknoent);
 		return XFS_ERROR(ENOENT);
@@ -1748,8 +1755,9 @@ xfs_dir_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args, int *index)
 	/*
 	 * Duplicate keys may be present, so search all of them for a match.
 	 */
-	while ((probe < be16_to_cpu(leaf->hdr.count)) && (INT_GET(entry->hashval, ARCH_CONVERT) == hashval)) {
-		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
+	while ((probe < be16_to_cpu(leaf->hdr.count)) &&
+	       (be32_to_cpu(entry->hashval) == hashval)) {
+		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, be16_to_cpu(entry->nameidx));
 		if (entry->namelen == args->namelen &&
 		    namest->name[0] == args->name[0] &&
 		    memcmp(args->name, namest->name, args->namelen) == 0) {
@@ -1825,19 +1833,19 @@ xfs_dir_leaf_moveents(xfs_dir_leafblock_t *leaf_s, int start_s,
 	entry_s = &leaf_s->entries[start_s];
 	entry_d = &leaf_d->entries[start_d];
 	for (i = 0; i < count; entry_s++, entry_d++, i++) {
-		ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT) >=
+		ASSERT(be16_to_cpu(entry_s->nameidx) >=
 		       be16_to_cpu(hdr_s->firstused));
 		tmp = XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry_s);
 		be16_add(&hdr_d->firstused, -(tmp));
 		entry_d->hashval = entry_s->hashval;
 		entry_d->nameidx = hdr_d->firstused;
 		entry_d->namelen = entry_s->namelen;
-		ASSERT(INT_GET(entry_d->nameidx, ARCH_CONVERT) + tmp <= XFS_LBSIZE(mp));
-		memcpy(XFS_DIR_LEAF_NAMESTRUCT(leaf_d, INT_GET(entry_d->nameidx, ARCH_CONVERT)),
-		       XFS_DIR_LEAF_NAMESTRUCT(leaf_s, INT_GET(entry_s->nameidx, ARCH_CONVERT)), tmp);
-		ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT) + tmp <= XFS_LBSIZE(mp));
-		memset((char *)XFS_DIR_LEAF_NAMESTRUCT(leaf_s, INT_GET(entry_s->nameidx, ARCH_CONVERT)),
-		      0, tmp);
+		ASSERT(be16_to_cpu(entry_d->nameidx) + tmp <= XFS_LBSIZE(mp));
+		memcpy(XFS_DIR_LEAF_NAMESTRUCT(leaf_d, be16_to_cpu(entry_d->nameidx)),
+		       XFS_DIR_LEAF_NAMESTRUCT(leaf_s, be16_to_cpu(entry_s->nameidx)), tmp);
+		ASSERT(be16_to_cpu(entry_s->nameidx) + tmp <= XFS_LBSIZE(mp));
+		memset((char *)XFS_DIR_LEAF_NAMESTRUCT(leaf_s,
+					be16_to_cpu(entry_s->nameidx)), 0, tmp);
 		be16_add(&hdr_s->namebytes, -(entry_d->namelen));
 		be16_add(&hdr_d->namebytes, entry_d->namelen);
 		be16_add(&hdr_s->count, -1);
@@ -1900,10 +1908,12 @@ xfs_dir_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp)
 	ASSERT((be16_to_cpu(leaf1->hdr.info.magic) == XFS_DIR_LEAF_MAGIC) &&
 	       (be16_to_cpu(leaf2->hdr.info.magic) == XFS_DIR_LEAF_MAGIC));
 	if (leaf1->hdr.count && leaf2->hdr.count &&
-	    ((INT_GET(leaf2->entries[ 0 ].hashval, ARCH_CONVERT) <
-	      INT_GET(leaf1->entries[ 0 ].hashval, ARCH_CONVERT)) ||
-	     (INT_GET(leaf2->entries[be16_to_cpu(leaf2->hdr.count)-1].hashval, ARCH_CONVERT) <
-	      INT_GET(leaf1->entries[be16_to_cpu(leaf1->hdr.count)-1].hashval, ARCH_CONVERT)))) {
+	    ((be32_to_cpu(leaf2->entries[0].hashval) <
+	      be32_to_cpu(leaf1->entries[0 ].hashval)) ||
+	     (be32_to_cpu(leaf2->entries[
+			  be16_to_cpu(leaf2->hdr.count)-1].hashval) <
+	      be32_to_cpu(leaf1->entries[
+			  be16_to_cpu(leaf1->hdr.count)-1].hashval)))) {
 		return 1;
 	}
 	return 0;
@@ -1923,7 +1933,7 @@ xfs_dir_leaf_lasthash(xfs_dabuf_t *bp, int *count)
 		*count = be16_to_cpu(leaf->hdr.count);
 	if (!leaf->hdr.count)
 		return(0);
-	return(INT_GET(leaf->entries[be16_to_cpu(leaf->hdr.count)-1].hashval, ARCH_CONVERT));
+	return be32_to_cpu(leaf->entries[be16_to_cpu(leaf->hdr.count)-1].hashval);
 }
 
 /*
@@ -1972,7 +1982,7 @@ xfs_dir_leaf_getdents_int(
 		     i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
 
 		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf,
-				    INT_GET(entry->nameidx, ARCH_CONVERT));
+				    be16_to_cpu(entry->nameidx));
 
 		if (unlikely(
 		    ((char *)namest < (char *)leaf) ||
@@ -1982,19 +1992,16 @@ xfs_dir_leaf_getdents_int(
 			xfs_dir_trace_g_du("leaf: corrupted", dp, uio);
 			return XFS_ERROR(EFSCORRUPTED);
 		}
-		if (INT_GET(entry->hashval, ARCH_CONVERT) >= cookhash) {
-			if (   entno < want_entno
-			    && INT_GET(entry->hashval, ARCH_CONVERT)
-							== cookhash) {
+		if (be32_to_cpu(entry->hashval) >= cookhash) {
+			if (entno < want_entno &&
+			    be32_to_cpu(entry->hashval) == cookhash) {
 				/*
 				 * Trying to get to a particular offset in a
 				 * run of equal-hashval entries.
 				 */
 				entno++;
-			} else if (   want_entno > 0
-				   && entno == want_entno
-				   && INT_GET(entry->hashval, ARCH_CONVERT)
-							== cookhash) {
+			} else if (want_entno > 0 && entno == want_entno &&
+				   be32_to_cpu(entry->hashval) == cookhash) {
 				break;
 			} else {
 				entno = 0;
@@ -2035,7 +2042,7 @@ xfs_dir_leaf_getdents_int(
 		 * the inode number from this entry.
 		 */
 		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf,
-				    INT_GET(entry->nameidx, ARCH_CONVERT));
+				    be16_to_cpu(entry->nameidx));
 
 		if (unlikely(
 		    ((char *)namest < (char *)leaf) ||
@@ -2050,9 +2057,9 @@ xfs_dir_leaf_getdents_int(
 						   dp, uio, p.cook.o);
 
 		if (i < (be16_to_cpu(leaf->hdr.count) - 1)) {
-			nexthash = INT_GET(entry[1].hashval, ARCH_CONVERT);
+			nexthash = be32_to_cpu(entry[1].hashval);
 
-			if (nexthash == INT_GET(entry->hashval, ARCH_CONVERT))
+			if (nexthash == be32_to_cpu(entry->hashval))
 				nextentno = entno + 1;
 			else
 				nextentno = 0;
@@ -2088,8 +2095,7 @@ xfs_dir_leaf_getdents_int(
 				return XFS_ERROR(EFSCORRUPTED);
 			}
 
-			nexthash = INT_GET(leaf2->entries[0].hashval,
-								ARCH_CONVERT);
+			nexthash = be32_to_cpu(leaf2->entries[0].hashval);
 			nextentno = -1;
 			XFS_PUT_COOKIE(p.cook, mp, thishash, 0, nexthash);
 			xfs_da_brelse(dp->i_transp, bp2);
@@ -2113,9 +2119,9 @@ xfs_dir_leaf_getdents_int(
 		 * that share the same hashval.  Hopefully the buffer
 		 * provided is big enough to handle it (see pv763517).
 		 */
+		thishash = be32_to_cpu(entry->hashval);
 #if (BITS_PER_LONG == 32)
-		if ((thishash = INT_GET(entry->hashval, ARCH_CONVERT))
-								!= lasthash) {
+		if (thishash != lasthash) {
 			XFS_PUT_COOKIE(lastoffset, mp, bno, entno, thishash);
 			lastresid = uio->uio_resid;
 			lasthash = thishash;
@@ -2124,7 +2130,6 @@ xfs_dir_leaf_getdents_int(
 						   dp, uio, p.cook.o);
 		}
 #else
-		thishash = INT_GET(entry->hashval, ARCH_CONVERT);
 		XFS_PUT_COOKIE(lastoffset, mp, bno, entno, thishash);
 		lastresid = uio->uio_resid;
 #endif /* BITS_PER_LONG == 32 */
diff --git a/fs/xfs/xfs_dir_leaf.h b/fs/xfs/xfs_dir_leaf.h
index c6979ab829c..7ca5845195d 100644
--- a/fs/xfs/xfs_dir_leaf.h
+++ b/fs/xfs/xfs_dir_leaf.h
@@ -83,10 +83,10 @@ typedef struct xfs_dir_leaf_hdr {	/* constant-structure header block */
 } xfs_dir_leaf_hdr_t;
 
 typedef struct xfs_dir_leaf_entry {	/* sorted on key, not name */
-	xfs_dahash_t	hashval;	/* hash value of name */
-	__uint16_t	nameidx;	/* index into buffer of name */
-	__uint8_t	namelen;	/* length of name string */
-	__uint8_t	pad2;
+	__be32		hashval;	/* hash value of name */
+	__be16		nameidx;	/* index into buffer of name */
+	__u8		namelen;	/* length of name string */
+	__u8		pad2;
 } xfs_dir_leaf_entry_t;
 
 typedef struct xfs_dir_leaf_name {
-- 
cgit v1.2.3


From 3f368a0d58cb8cadab298546286f94ca14220f65 Mon Sep 17 00:00:00 2001
From: Olaf Weber <olaf@sgi.com>
Date: Fri, 9 Jun 2006 14:51:11 +1000
Subject: [XFS] Originally the ATTR_DMI flag also had the functionality of the
 ATTR_NOLOCK flag, but this was split off some time ago, as ATTR_DMI needed to
 be used separately.	Two asserts were added to guard correctness of the
 code during the transition.  These are no longer required.

SGI-PV: 952145
SGI-Modid: xfs-linux-melb:xfs-kern:209633a

Signed-off-by: Olaf Weber <olaf@sgi.com>
Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/xfs_vnodeops.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 567d14d06d9..2b814f04176 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -347,7 +347,6 @@ xfs_setattr(
 	 */
 	tp = NULL;
 	lock_flags = XFS_ILOCK_EXCL;
-	ASSERT(flags & ATTR_NOLOCK ? flags & ATTR_DMI : 1);
 	if (flags & ATTR_NOLOCK)
 		need_iolock = 0;
 	if (!(mask & XFS_AT_SIZE)) {
@@ -4308,7 +4307,6 @@ xfs_free_file_space(
 			return error;
 	}
 
-	ASSERT(attr_flags & ATTR_NOLOCK ? attr_flags & ATTR_DMI : 1);
 	if (attr_flags & ATTR_NOLOCK)
 		need_iolock = 0;
 	if (need_iolock) {
-- 
cgit v1.2.3


From fe6c1e7240e3a7cb600030f9c909273365d52a9d Mon Sep 17 00:00:00 2001
From: Mandy Kirkconnell <alkirkco@sgi.com>
Date: Fri, 9 Jun 2006 14:51:25 +1000
Subject: [XFS] Fix size argument in kmem_free().

SGI-PV: 952291
SGI-Modid: xfs-linux-melb:xfs-kern:209807a

Signed-off-by: Mandy Kirkconnell <alkirkco@sgi.com>
Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/xfs_inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 020de5637e0..f5284453441 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -4181,7 +4181,7 @@ xfs_iext_direct_to_inline(
 	 */
 	memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
 		nextents * sizeof(xfs_bmbt_rec_t));
-	kmem_free(ifp->if_u1.if_extents, KM_SLEEP);
+	kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes);
 	ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
 	ifp->if_real_bytes = 0;
 }
-- 
cgit v1.2.3


From ba0b92d671c36cbebd66a306790c9b66a3224d83 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Fri, 9 Jun 2006 14:52:00 +1000
Subject: [XFS] Fix a comment typo, originally noticed by Ming Zhang.

SGI-PV: 907752
SGI-Modid: xfs-linux-melb:xfs-kern:25921a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/xfs_fs.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 14010f1fa82..df66f3a59fa 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -72,9 +72,9 @@ struct fsxattr {
 /*
  * Structure for XFS_IOC_GETBMAP.
  * On input, fill in bmv_offset and bmv_length of the first structure
- * to indicate the area of interest in the file, and bmv_entry with the
- * number of array elements given.  The first structure is updated on
- * return to give the offset and length for the next call.
+ * to indicate the area of interest in the file, and bmv_entries with
+ * the number of array elements given back.  The first structure is
+ * updated on return to give the offset and length for the next call.
  */
 #ifndef HAVE_GETBMAP
 struct getbmap {
-- 
cgit v1.2.3


From fbc1462bcb421620a04eb390fc79a2615c9d01d0 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Fri, 9 Jun 2006 14:52:13 +1000
Subject: [XFS] Fix a noatime regression related to updating inode atime field
 on mmap only.

SGI-PV: 952736
SGI-Modid: xfs-linux-melb:xfs-kern:25922a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/linux-2.6/xfs_file.c | 20 +++-----------------
 1 file changed, 3 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index c847416f6d1..7c9f7598807 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -362,15 +362,10 @@ xfs_vm_nopage(
 {
 	struct inode	*inode = area->vm_file->f_dentry->d_inode;
 	vnode_t		*vp = vn_from_inode(inode);
-	xfs_mount_t	*mp = XFS_VFSTOM(vp->v_vfsp);
-	int		error;
 
 	ASSERT_ALWAYS(vp->v_vfsp->vfs_flag & VFS_DMI);
-
-	error = XFS_SEND_MMAP(mp, area, 0);
-	if (error)
+	if (XFS_SEND_MMAP(XFS_VFSTOM(vp->v_vfsp), area, 0))
 		return NULL;
-
 	return filemap_nopage(area, address, type);
 }
 #endif /* CONFIG_XFS_DMAPI */
@@ -456,23 +451,14 @@ xfs_file_mmap(
 	struct file	*filp,
 	struct vm_area_struct *vma)
 {
-	struct inode	*ip = filp->f_dentry->d_inode;
-	vnode_t		*vp = vn_from_inode(ip);
-	vattr_t		vattr;
-	int		error;
-
 	vma->vm_ops = &xfs_file_vm_ops;
 
 #ifdef CONFIG_XFS_DMAPI
-	if (vp->v_vfsp->vfs_flag & VFS_DMI) {
+	if (vn_from_inode(filp->f_dentry->d_inode)->v_vfsp->vfs_flag & VFS_DMI)
 		vma->vm_ops = &xfs_dmapi_file_vm_ops;
-	}
 #endif /* CONFIG_XFS_DMAPI */
 
-	vattr.va_mask = XFS_AT_UPDATIME;
-	VOP_SETATTR(vp, &vattr, XFS_AT_UPDATIME, NULL, error);
-	if (likely(!error))
-		__vn_revalidate(vp, &vattr);	/* update flags */
+	file_accessed(filp);
 	return 0;
 }
 
-- 
cgit v1.2.3


From d3446eac3f50dade2f09ed212b112609ee78fb33 Mon Sep 17 00:00:00 2001
From: Barry Naujok <bnaujok@sgi.com>
Date: Fri, 9 Jun 2006 14:54:19 +1000
Subject: [XFS] Add degframentation exclusion support

SGI-PV: 953061
SGI-Modid: xfs-linux-melb:xfs-kern:25986a

Signed-off-by: Barry Naujok <bnaujok@sgi.com>
Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/linux-2.6/xfs_globals.c | 1 +
 fs/xfs/linux-2.6/xfs_linux.h   | 1 +
 fs/xfs/linux-2.6/xfs_sysctl.c  | 5 +++++
 fs/xfs/linux-2.6/xfs_sysctl.h  | 2 ++
 fs/xfs/xfs_dinode.h            | 4 +++-
 fs/xfs/xfs_fs.h                | 1 +
 fs/xfs/xfs_inode.c             | 5 +++++
 fs/xfs/xfs_vnodeops.c          | 2 ++
 8 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c
index 6e8085f3463..6c162c3dde7 100644
--- a/fs/xfs/linux-2.6/xfs_globals.c
+++ b/fs/xfs/linux-2.6/xfs_globals.c
@@ -45,6 +45,7 @@ xfs_param_t xfs_params = {
 	.xfs_buf_age	= {	1*100,		15*100,		7200*100},
 	.inherit_nosym	= {	0,		0,		1	},
 	.rotorstep	= {	1,		1,		255	},
+	.inherit_nodfrg	= {	0,		1,		1	},
 };
 
 /*
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index e9fe43d7476..7d15cb91027 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -134,6 +134,7 @@ BUFFER_FNS(PrivateStart, unwritten);
 #define xfs_buf_age_centisecs	xfs_params.xfs_buf_age.val
 #define xfs_inherit_nosymlinks	xfs_params.inherit_nosym.val
 #define xfs_rotorstep		xfs_params.rotorstep.val
+#define xfs_inherit_nodefrag	xfs_params.inherit_nodfrg.val
 
 #ifndef raw_smp_processor_id
 #define raw_smp_processor_id()	smp_processor_id()
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 7079cc83721..4af97682bec 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -120,6 +120,11 @@ STATIC ctl_table xfs_table[] = {
 	&sysctl_intvec, NULL,
 	&xfs_params.rotorstep.min, &xfs_params.rotorstep.max},
 
+	{XFS_INHERIT_NODFRG, "inherit_nodefrag", &xfs_params.inherit_nodfrg.val,
+	sizeof(int), 0644, NULL, &proc_dointvec_minmax,
+	&sysctl_intvec, NULL,
+	&xfs_params.inherit_nodfrg.min, &xfs_params.inherit_nodfrg.max},
+
 	/* please keep this the last entry */
 #ifdef CONFIG_PROC_FS
 	{XFS_STATS_CLEAR, "stats_clear", &xfs_params.stats_clear.val,
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/linux-2.6/xfs_sysctl.h
index bc8c11f1372..a631fb8cc5a 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.h
+++ b/fs/xfs/linux-2.6/xfs_sysctl.h
@@ -46,6 +46,7 @@ typedef struct xfs_param {
 	xfs_sysctl_val_t xfs_buf_age;	/* Metadata buffer age before flush. */
 	xfs_sysctl_val_t inherit_nosym;	/* Inherit the "nosymlinks" flag. */
 	xfs_sysctl_val_t rotorstep;	/* inode32 AG rotoring control knob */
+	xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */
 } xfs_param_t;
 
 /*
@@ -84,6 +85,7 @@ enum {
 	/* XFS_IO_BYPASS = 18 */
 	XFS_INHERIT_NOSYM = 19,
 	XFS_ROTORSTEP = 20,
+	XFS_INHERIT_NODFRG = 21,
 };
 
 extern xfs_param_t	xfs_params;
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index 79d0d9e1fba..77d53778258 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -257,6 +257,7 @@ typedef enum xfs_dinode_fmt
 #define XFS_DIFLAG_NOSYMLINKS_BIT   10	/* disallow symlink creation */
 #define XFS_DIFLAG_EXTSIZE_BIT      11	/* inode extent size allocator hint */
 #define XFS_DIFLAG_EXTSZINHERIT_BIT 12	/* inherit inode extent size */
+#define XFS_DIFLAG_NODEFRAG_BIT     13	/* do not reorganize/defragment */
 #define XFS_DIFLAG_REALTIME      (1 << XFS_DIFLAG_REALTIME_BIT)
 #define XFS_DIFLAG_PREALLOC      (1 << XFS_DIFLAG_PREALLOC_BIT)
 #define XFS_DIFLAG_NEWRTBM       (1 << XFS_DIFLAG_NEWRTBM_BIT)
@@ -270,12 +271,13 @@ typedef enum xfs_dinode_fmt
 #define XFS_DIFLAG_NOSYMLINKS    (1 << XFS_DIFLAG_NOSYMLINKS_BIT)
 #define XFS_DIFLAG_EXTSIZE       (1 << XFS_DIFLAG_EXTSIZE_BIT)
 #define XFS_DIFLAG_EXTSZINHERIT  (1 << XFS_DIFLAG_EXTSZINHERIT_BIT)
+#define XFS_DIFLAG_NODEFRAG      (1 << XFS_DIFLAG_NODEFRAG_BIT)
 
 #define XFS_DIFLAG_ANY \
 	(XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \
 	 XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \
 	 XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \
 	 XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \
-	 XFS_DIFLAG_EXTSZINHERIT)
+	 XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG)
 
 #endif	/* __XFS_DINODE_H__ */
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index df66f3a59fa..0f0ad153595 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -67,6 +67,7 @@ struct fsxattr {
 #define XFS_XFLAG_NOSYMLINKS	0x00000400	/* disallow symlink creation */
 #define XFS_XFLAG_EXTSIZE	0x00000800	/* extent size allocator hint */
 #define XFS_XFLAG_EXTSZINHERIT	0x00001000	/* inherit inode extent size */
+#define XFS_XFLAG_NODEFRAG	0x00002000  	/* do not defragment */
 #define XFS_XFLAG_HASATTR	0x80000000	/* no DIFLAG for this	*/
 
 /*
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index f5284453441..083fc0479e6 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -812,6 +812,8 @@ _xfs_dic2xflags(
 			flags |= XFS_XFLAG_EXTSIZE;
 		if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
 			flags |= XFS_XFLAG_EXTSZINHERIT;
+		if (di_flags & XFS_DIFLAG_NODEFRAG)
+			flags |= XFS_XFLAG_NODEFRAG;
 	}
 
 	return flags;
@@ -1221,6 +1223,9 @@ xfs_ialloc(
 				di_flags |= XFS_DIFLAG_NOSYMLINKS;
 			if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
 				di_flags |= XFS_DIFLAG_PROJINHERIT;
+			if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
+			    xfs_inherit_nodefrag)
+				di_flags |= XFS_DIFLAG_NODEFRAG;
 			ip->i_d.di_flags |= di_flags;
 		}
 		/* FALLTHROUGH */
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 2b814f04176..d1c6349e401 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -799,6 +799,8 @@ xfs_setattr(
 				di_flags |= XFS_DIFLAG_NODUMP;
 			if (vap->va_xflags & XFS_XFLAG_PROJINHERIT)
 				di_flags |= XFS_DIFLAG_PROJINHERIT;
+			if (vap->va_xflags & XFS_XFLAG_NODEFRAG)
+				di_flags |= XFS_DIFLAG_NODEFRAG;
 			if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
 				if (vap->va_xflags & XFS_XFLAG_RTINHERIT)
 					di_flags |= XFS_DIFLAG_RTINHERIT;
-- 
cgit v1.2.3


From d210a28cd851082cec9b282443f8cc0e6fc09830 Mon Sep 17 00:00:00 2001
From: Yingping Lu <yingping@sgi.com>
Date: Fri, 9 Jun 2006 14:55:18 +1000
Subject: [XFS] In actual allocation of file system blocks and freeing extents,
 the transaction within each such operation may involve multiple locking of
 AGF buffer. While the freeing extent function has sorted the extents based on
 AGF number before entering into transaction, however, when the file system
 space is very limited, the allocation of space would try every AGF to get
 space allocated, this could potentially cause out-of-order locking, thus
 deadlock could happen. This fix mitigates the scarce space for allocation by
 setting aside a few blocks without reservation, and avoid deadlock by
 maintaining ascending order of AGF locking.

SGI-PV: 947395
SGI-Modid: xfs-linux-melb:xfs-kern:210801a

Signed-off-by: Yingping Lu <yingping@sgi.com>
Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/xfs_alloc.c      | 29 +++++++++++++++++++++++------
 fs/xfs/xfs_alloc.h      |  2 ++
 fs/xfs/xfs_bmap.c       |  5 ++++-
 fs/xfs/xfs_bmap_btree.c | 10 ++++------
 fs/xfs/xfs_mount.c      | 24 ++++++++++++++++++++++--
 5 files changed, 55 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 8558226281c..22af489d3f3 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -1862,7 +1862,7 @@ xfs_alloc_fix_freelist(
 		(pag->pagf_longest - delta) :
 		(pag->pagf_flcount > 0 || pag->pagf_longest > 0);
 	if (args->minlen + args->alignment + args->minalignslop - 1 > longest ||
-	    (args->minleft &&
+	    (!(flags & XFS_ALLOC_FLAG_FREEING) &&
 	     (int)(pag->pagf_freeblks + pag->pagf_flcount -
 		   need - args->total) <
 	     (int)args->minleft)) {
@@ -1898,7 +1898,7 @@ xfs_alloc_fix_freelist(
 	longest = (longest > delta) ? (longest - delta) :
 		(be32_to_cpu(agf->agf_flcount) > 0 || longest > 0);
 	if (args->minlen + args->alignment + args->minalignslop - 1 > longest ||
-	     (args->minleft &&
+	     (!(flags & XFS_ALLOC_FLAG_FREEING) &&
 		(int)(be32_to_cpu(agf->agf_freeblks) +
 		   be32_to_cpu(agf->agf_flcount) - need - args->total) <
 	     (int)args->minleft)) {
@@ -1951,8 +1951,14 @@ xfs_alloc_fix_freelist(
 		 * the restrictions correctly.  Can happen for free calls
 		 * on a completely full ag.
 		 */
-		if (targs.agbno == NULLAGBLOCK)
+		if (targs.agbno == NULLAGBLOCK) {
+			if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
+				xfs_trans_brelse(tp, agflbp);
+				args->agbp = NULL;
+				return 0;
+			}
 			break;
+		}
 		/*
 		 * Put each allocated block on the list.
 		 */
@@ -2360,8 +2366,19 @@ xfs_alloc_vextent(
 			if (args->agno == sagno &&
 			    type == XFS_ALLOCTYPE_START_BNO)
 				args->type = XFS_ALLOCTYPE_THIS_AG;
-			if (++(args->agno) == mp->m_sb.sb_agcount)
-				args->agno = 0;
+			/*
+			* For the first allocation, we can try any AG to get
+			* space.  However, if we already have allocated a
+			* block, we don't want to try AGs whose number is below
+			* sagno. Otherwise, we may end up with out-of-order
+			* locking of AGF, which might cause deadlock.
+			*/
+			if (++(args->agno) == mp->m_sb.sb_agcount) {
+				if (args->firstblock != NULLFSBLOCK)
+					args->agno = sagno;
+				else
+					args->agno = 0;
+			}
 			/*
 			 * Reached the starting a.g., must either be done
 			 * or switch to non-trylock mode.
@@ -2443,7 +2460,7 @@ xfs_free_extent(
 	args.minlen = args.minleft = args.minalignslop = 0;
 	down_read(&args.mp->m_peraglock);
 	args.pag = &args.mp->m_perag[args.agno];
-	if ((error = xfs_alloc_fix_freelist(&args, 0)))
+	if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING)))
 		goto error0;
 #ifdef DEBUG
 	ASSERT(args.agbp != NULL);
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 2d1f8928b26..650591f999a 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -41,6 +41,7 @@ typedef enum xfs_alloctype
  * Flags for xfs_alloc_fix_freelist.
  */
 #define	XFS_ALLOC_FLAG_TRYLOCK	0x00000001  /* use trylock for buffer locking */
+#define	XFS_ALLOC_FLAG_FREEING	0x00000002  /* indicate caller is freeing extents*/
 
 /*
  * Argument structure for xfs_alloc routines.
@@ -70,6 +71,7 @@ typedef struct xfs_alloc_arg {
 	char		wasfromfl;	/* set if allocation is from freelist */
 	char		isfl;		/* set if is freelist blocks - !acctg */
 	char		userdata;	/* set if this is user data */
+	xfs_fsblock_t	firstblock;	/* io first block allocated */
 } xfs_alloc_arg_t;
 
 /*
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 890ad352817..ad595dbefe1 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2762,6 +2762,7 @@ xfs_bmap_btalloc(
 	args.mp = mp;
 	args.fsbno = ap->rval;
 	args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks);
+	args.firstblock = ap->firstblock;
 	blen = 0;
 	if (nullfb) {
 		args.type = XFS_ALLOCTYPE_START_BNO;
@@ -2821,7 +2822,7 @@ xfs_bmap_btalloc(
 		else
 			args.minlen = ap->alen;
 	} else if (ap->low) {
-		args.type = XFS_ALLOCTYPE_FIRST_AG;
+		args.type = XFS_ALLOCTYPE_START_BNO;
 		args.total = args.minlen = ap->minlen;
 	} else {
 		args.type = XFS_ALLOCTYPE_NEAR_BNO;
@@ -3452,6 +3453,7 @@ xfs_bmap_extents_to_btree(
 	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
 	args.tp = tp;
 	args.mp = mp;
+	args.firstblock = *firstblock;
 	if (*firstblock == NULLFSBLOCK) {
 		args.type = XFS_ALLOCTYPE_START_BNO;
 		args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
@@ -3587,6 +3589,7 @@ xfs_bmap_local_to_extents(
 
 		args.tp = tp;
 		args.mp = ip->i_mount;
+		args.firstblock = *firstblock;
 		ASSERT((ifp->if_flags &
 			(XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE);
 		/*
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index bea44709afb..3b6dfc9b53a 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -1569,12 +1569,11 @@ xfs_bmbt_split(
 	lbno = XFS_DADDR_TO_FSB(args.mp, XFS_BUF_ADDR(lbp));
 	left = XFS_BUF_TO_BMBT_BLOCK(lbp);
 	args.fsbno = cur->bc_private.b.firstblock;
+	args.firstblock = args.fsbno;
 	if (args.fsbno == NULLFSBLOCK) {
 		args.fsbno = lbno;
 		args.type = XFS_ALLOCTYPE_START_BNO;
-	} else if (cur->bc_private.b.flist->xbf_low)
-		args.type = XFS_ALLOCTYPE_FIRST_AG;
-	else
+	} else
 		args.type = XFS_ALLOCTYPE_NEAR_BNO;
 	args.mod = args.minleft = args.alignment = args.total = args.isfl =
 		args.userdata = args.minalignslop = 0;
@@ -2356,6 +2355,7 @@ xfs_bmbt_newroot(
 		args.userdata = args.minalignslop = 0;
 	args.minlen = args.maxlen = args.prod = 1;
 	args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
+	args.firstblock = args.fsbno;
 	if (args.fsbno == NULLFSBLOCK) {
 #ifdef DEBUG
 		if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), level))) {
@@ -2365,9 +2365,7 @@ xfs_bmbt_newroot(
 #endif
 		args.fsbno = INT_GET(*pp, ARCH_CONVERT);
 		args.type = XFS_ALLOCTYPE_START_BNO;
-	} else if (args.wasdel)
-		args.type = XFS_ALLOCTYPE_FIRST_AG;
-	else
+	} else
 		args.type = XFS_ALLOCTYPE_NEAR_BNO;
 	if ((error = xfs_alloc_vextent(&args))) {
 		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index c0b1c290688..4b7be49cc4d 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1254,6 +1254,26 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
 
 	xfs_trans_log_buf(tp, bp, first, last);
 }
+
+/*
+ * In order to avoid ENOSPC-related deadlock caused by
+ * out-of-order locking of AGF buffer (PV 947395), we place
+ * constraints on the relationship among actual allocations for
+ * data blocks, freelist blocks, and potential file data bmap
+ * btree blocks. However, these restrictions may result in no
+ * actual space allocated for a delayed extent, for example, a data
+ * block in a certain AG is allocated but there is no additional
+ * block for the additional bmap btree block due to a split of the
+ * bmap btree of the file. The result of this may lead to an
+ * infinite loop in xfssyncd when the file gets flushed to disk and
+ * all delayed extents need to be actually allocated. To get around
+ * this, we explicitly set aside a few blocks which will not be
+ * reserved in delayed allocation. Considering the minimum number of
+ * needed freelist blocks is 4 fsbs, a potential split of file's bmap
+ * btree requires 1 fsb, so we set the number of set-aside blocks to 8.
+*/
+#define SET_ASIDE_BLOCKS 8
+
 /*
  * xfs_mod_incore_sb_unlocked() is a utility routine common used to apply
  * a delta to a specified field in the in-core superblock.  Simply
@@ -1298,7 +1318,7 @@ xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field,
 		return 0;
 	case XFS_SBS_FDBLOCKS:
 
-		lcounter = (long long)mp->m_sb.sb_fdblocks;
+		lcounter = (long long)mp->m_sb.sb_fdblocks - SET_ASIDE_BLOCKS;
 		res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
 
 		if (delta > 0) {		/* Putting blocks back */
@@ -1332,7 +1352,7 @@ xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field,
 			}
 		}
 
-		mp->m_sb.sb_fdblocks = lcounter;
+		mp->m_sb.sb_fdblocks = lcounter + SET_ASIDE_BLOCKS;
 		return 0;
 	case XFS_SBS_FREXTENTS:
 		lcounter = (long long)mp->m_sb.sb_frextents;
-- 
cgit v1.2.3


From 6d192a9b82212abf1e0e89da6e3a952afba7e4d6 Mon Sep 17 00:00:00 2001
From: Tim Shimmin <tes@sgi.com>
Date: Fri, 9 Jun 2006 14:55:38 +1000
Subject: [XFS] inode items and EFI/EFDs have different ondisk format for 32bit
 and 64bit kernels allow recovery to handle both versions and do the necessary
 decoding

SGI-PV: 952214
SGI-Modid: xfs-linux-melb:xfs-kern:26011a

Signed-off-by: Tim Shimmin <tes@sgi.com>
Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/xfs_extfree_item.c | 56 ++++++++++++++++++++++++++++++
 fs/xfs/xfs_extfree_item.h | 53 ++++++++++++++++++++++++++++-
 fs/xfs/xfs_inode_item.c   | 49 ++++++++++++++++++++++++++
 fs/xfs/xfs_inode_item.h   | 61 +++++++++++++++++----------------
 fs/xfs/xfs_log_recover.c  | 87 ++++++++++++++++++++++++++++++-----------------
 5 files changed, 245 insertions(+), 61 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index f19282ec854..8b028f12854 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -293,6 +293,62 @@ xfs_efi_init(xfs_mount_t	*mp,
 	return (efip);
 }
 
+/*
+ * Copy an EFI format buffer from the given buf, and into the destination
+ * EFI format structure.
+ * The given buffer can be in 32 bit or 64 bit form (which has different padding),
+ * one of which will be the native format for this kernel.
+ * It will handle the conversion of formats if necessary.
+ */
+int
+xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
+{
+	xfs_efi_log_format_t *src_efi_fmt = (xfs_efi_log_format_t *)buf->i_addr;
+	uint i;
+	uint len = sizeof(xfs_efi_log_format_t) + 
+		(src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t);  
+	uint len32 = sizeof(xfs_efi_log_format_32_t) + 
+		(src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_32_t);  
+	uint len64 = sizeof(xfs_efi_log_format_64_t) + 
+		(src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_64_t);  
+
+	if (buf->i_len == len) {
+		memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len);
+		return 0;
+	} else if (buf->i_len == len32) {
+		xfs_efi_log_format_32_t *src_efi_fmt_32 =
+			(xfs_efi_log_format_32_t *)buf->i_addr;
+
+		dst_efi_fmt->efi_type     = src_efi_fmt_32->efi_type;
+		dst_efi_fmt->efi_size     = src_efi_fmt_32->efi_size;
+		dst_efi_fmt->efi_nextents = src_efi_fmt_32->efi_nextents;
+		dst_efi_fmt->efi_id       = src_efi_fmt_32->efi_id;
+		for (i = 0; i < dst_efi_fmt->efi_nextents; i++) {
+			dst_efi_fmt->efi_extents[i].ext_start =
+				src_efi_fmt_32->efi_extents[i].ext_start;
+			dst_efi_fmt->efi_extents[i].ext_len =
+				src_efi_fmt_32->efi_extents[i].ext_len;
+		}
+		return 0;
+	} else if (buf->i_len == len64) {
+		xfs_efi_log_format_64_t *src_efi_fmt_64 =
+			(xfs_efi_log_format_64_t *)buf->i_addr;
+
+		dst_efi_fmt->efi_type     = src_efi_fmt_64->efi_type;
+		dst_efi_fmt->efi_size     = src_efi_fmt_64->efi_size;
+		dst_efi_fmt->efi_nextents = src_efi_fmt_64->efi_nextents;
+		dst_efi_fmt->efi_id       = src_efi_fmt_64->efi_id;
+		for (i = 0; i < dst_efi_fmt->efi_nextents; i++) {
+			dst_efi_fmt->efi_extents[i].ext_start =
+				src_efi_fmt_64->efi_extents[i].ext_start;
+			dst_efi_fmt->efi_extents[i].ext_len =
+				src_efi_fmt_64->efi_extents[i].ext_len;
+		}
+		return 0;
+	}
+	return EFSCORRUPTED;
+}
+
 /*
  * This is called by the efd item code below to release references to
  * the given efi item.  Each efd calls this with the number of
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 5bf681708fe..0ea45edaab0 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -26,6 +26,24 @@ typedef struct xfs_extent {
 	xfs_extlen_t	ext_len;
 } xfs_extent_t;
 
+/*
+ * Since an xfs_extent_t has types (start:64, len: 32)
+ * there are different alignments on 32 bit and 64 bit kernels.
+ * So we provide the different variants for use by a
+ * conversion routine.
+ */
+
+typedef struct xfs_extent_32 {
+	xfs_dfsbno_t	ext_start;
+	xfs_extlen_t	ext_len;
+} __attribute__((packed)) xfs_extent_32_t;
+
+typedef struct xfs_extent_64 {
+	xfs_dfsbno_t	ext_start;
+	xfs_extlen_t	ext_len;
+	__uint32_t	ext_pad;
+} xfs_extent_64_t;
+
 /*
  * This is the structure used to lay out an efi log item in the
  * log.  The efi_extents field is a variable size array whose
@@ -39,6 +57,22 @@ typedef struct xfs_efi_log_format {
 	xfs_extent_t		efi_extents[1];	/* array of extents to free */
 } xfs_efi_log_format_t;
 
+typedef struct xfs_efi_log_format_32 {
+	unsigned short		efi_type;	/* efi log item type */
+	unsigned short		efi_size;	/* size of this item */
+	uint			efi_nextents;	/* # extents to free */
+	__uint64_t		efi_id;		/* efi identifier */
+	xfs_extent_32_t		efi_extents[1];	/* array of extents to free */
+} __attribute__((packed)) xfs_efi_log_format_32_t;
+
+typedef struct xfs_efi_log_format_64 {
+	unsigned short		efi_type;	/* efi log item type */
+	unsigned short		efi_size;	/* size of this item */
+	uint			efi_nextents;	/* # extents to free */
+	__uint64_t		efi_id;		/* efi identifier */
+	xfs_extent_64_t		efi_extents[1];	/* array of extents to free */
+} xfs_efi_log_format_64_t;
+
 /*
  * This is the structure used to lay out an efd log item in the
  * log.  The efd_extents array is a variable size array whose
@@ -52,6 +86,22 @@ typedef struct xfs_efd_log_format {
 	xfs_extent_t		efd_extents[1];	/* array of extents freed */
 } xfs_efd_log_format_t;
 
+typedef struct xfs_efd_log_format_32 {
+	unsigned short		efd_type;	/* efd log item type */
+	unsigned short		efd_size;	/* size of this item */
+	uint			efd_nextents;	/* # of extents freed */
+	__uint64_t		efd_efi_id;	/* id of corresponding efi */
+	xfs_extent_32_t		efd_extents[1];	/* array of extents freed */
+} __attribute__((packed)) xfs_efd_log_format_32_t;
+
+typedef struct xfs_efd_log_format_64 {
+	unsigned short		efd_type;	/* efd log item type */
+	unsigned short		efd_size;	/* size of this item */
+	uint			efd_nextents;	/* # of extents freed */
+	__uint64_t		efd_efi_id;	/* id of corresponding efi */
+	xfs_extent_64_t		efd_extents[1];	/* array of extents freed */
+} xfs_efd_log_format_64_t;
+
 
 #ifdef __KERNEL__
 
@@ -103,7 +153,8 @@ extern struct kmem_zone	*xfs_efd_zone;
 xfs_efi_log_item_t	*xfs_efi_init(struct xfs_mount *, uint);
 xfs_efd_log_item_t	*xfs_efd_init(struct xfs_mount *, xfs_efi_log_item_t *,
 				      uint);
-
+int			xfs_efi_copy_format(xfs_log_iovec_t *buf,
+					    xfs_efi_log_format_t *dst_efi_fmt);
 void			xfs_efi_item_free(xfs_efi_log_item_t *);
 
 #endif	/* __KERNEL__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 7497a481b2f..cd65a565b4f 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -1084,3 +1084,52 @@ xfs_istale_done(
 {
 	xfs_iflush_abort(iip->ili_inode);
 }
+
+/*
+ * convert an xfs_inode_log_format struct from either 32 or 64 bit versions
+ * (which can have different field alignments) to the native version
+ */
+int
+xfs_inode_item_format_convert(
+	xfs_log_iovec_t		*buf,
+	xfs_inode_log_format_t	*in_f)
+{
+	if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) {
+		xfs_inode_log_format_32_t *in_f32;
+
+		in_f32 = (xfs_inode_log_format_32_t *)buf->i_addr;
+		in_f->ilf_type = in_f32->ilf_type;
+		in_f->ilf_size = in_f32->ilf_size;
+		in_f->ilf_fields = in_f32->ilf_fields;
+		in_f->ilf_asize = in_f32->ilf_asize;
+		in_f->ilf_dsize = in_f32->ilf_dsize;
+		in_f->ilf_ino = in_f32->ilf_ino;
+		/* copy biggest field of ilf_u */
+		memcpy(in_f->ilf_u.ilfu_uuid.__u_bits,
+		       in_f32->ilf_u.ilfu_uuid.__u_bits,
+		       sizeof(uuid_t));
+		in_f->ilf_blkno = in_f32->ilf_blkno;
+		in_f->ilf_len = in_f32->ilf_len;
+		in_f->ilf_boffset = in_f32->ilf_boffset;
+		return 0;
+	} else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){
+		xfs_inode_log_format_64_t *in_f64;
+
+		in_f64 = (xfs_inode_log_format_64_t *)buf->i_addr;
+		in_f->ilf_type = in_f64->ilf_type;
+		in_f->ilf_size = in_f64->ilf_size;
+		in_f->ilf_fields = in_f64->ilf_fields;
+		in_f->ilf_asize = in_f64->ilf_asize;
+		in_f->ilf_dsize = in_f64->ilf_dsize;
+		in_f->ilf_ino = in_f64->ilf_ino;
+		/* copy biggest field of ilf_u */
+		memcpy(in_f->ilf_u.ilfu_uuid.__u_bits,
+		       in_f64->ilf_u.ilfu_uuid.__u_bits,
+		       sizeof(uuid_t));
+		in_f->ilf_blkno = in_f64->ilf_blkno;
+		in_f->ilf_len = in_f64->ilf_len;
+		in_f->ilf_boffset = in_f64->ilf_boffset;
+		return 0;
+	}
+	return EFSCORRUPTED;
+}
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index c5dbf93b666..5db6cd1b4cf 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -23,25 +23,6 @@
  * log.  The size of the inline data/extents/b-tree root to be logged
  * (if any) is indicated in the ilf_dsize field.  Changes to this structure
  * must be added on to the end.
- *
- * Convention for naming inode log item versions :  The current version
- * is always named XFS_LI_INODE.  When an inode log item gets superseded,
- * add the latest version of IRIX that will generate logs with that item
- * to the version name.
- *
- * -Version 1 of this structure (XFS_LI_5_3_INODE) included up to the first
- *	union (ilf_u) field.  This was released with IRIX 5.3-XFS.
- * -Version 2 of this structure (XFS_LI_6_1_INODE) is currently the entire
- *	structure.  This was released with IRIX 6.0.1-XFS and IRIX 6.1.
- * -Version 3 of this structure (XFS_LI_INODE) is the same as version 2
- *	so a new structure definition wasn't necessary.  However, we had
- *	to add a new type because the inode cluster size changed from 4K
- *	to 8K and the version number had to be rev'ved to keep older kernels
- *	from trying to recover logs with the 8K buffers in them.  The logging
- *	code can handle recovery on different-sized clusters now so hopefully
- *	this'll be the last time we need to change the inode log item just
- *	for a change in the inode cluster size.  This new version was
- *	released with IRIX 6.2.
  */
 typedef struct xfs_inode_log_format {
 	unsigned short		ilf_type;	/* inode log item type */
@@ -59,18 +40,38 @@ typedef struct xfs_inode_log_format {
 	int			ilf_boffset;	/* off of inode in buffer */
 } xfs_inode_log_format_t;
 
-/* Initial version shipped with IRIX 5.3-XFS */
-typedef struct xfs_inode_log_format_v1 {
-	unsigned short		ilf_type;	/* inode log item type */
-	unsigned short		ilf_size;	/* size of this item */
-	uint			ilf_fields;	/* flags for fields logged */
-	uint			ilf_dsize;	/* size of data/ext/root */
-	xfs_ino_t		ilf_ino;	/* inode number */
+typedef struct xfs_inode_log_format_32 {
+	unsigned short		ilf_type;	/* 16: inode log item type */
+	unsigned short		ilf_size;	/* 16: size of this item */
+	uint			ilf_fields;	/* 32: flags for fields logged */
+	ushort			ilf_asize;	/* 32: size of attr d/ext/root */
+	ushort			ilf_dsize;	/* 32: size of data/ext/root */
+	xfs_ino_t		ilf_ino;	/* 64: inode number */
 	union {
-		xfs_dev_t	ilfu_rdev;	/* rdev value for dev inode*/
-		uuid_t		ilfu_uuid;	/* mount point value */
+		xfs_dev_t	ilfu_rdev;	/* 32: rdev value for dev inode*/
+		uuid_t		ilfu_uuid;	/* 128: mount point value */
+	} ilf_u;
+	__int64_t		ilf_blkno;	/* 64: blkno of inode buffer */
+	int			ilf_len;	/* 32: len of inode buffer */
+	int			ilf_boffset;	/* 32: off of inode in buffer */
+} __attribute__((packed)) xfs_inode_log_format_32_t;
+
+typedef struct xfs_inode_log_format_64 {
+	unsigned short		ilf_type;	/* 16: inode log item type */
+	unsigned short		ilf_size;	/* 16: size of this item */
+	uint			ilf_fields;	/* 32: flags for fields logged */
+	ushort			ilf_asize;	/* 32: size of attr d/ext/root */
+	ushort			ilf_dsize;	/* 32: size of data/ext/root */
+	__uint32_t		ilf_pad;	/* 32: pad for 64 bit boundary */
+	xfs_ino_t		ilf_ino;	/* 64: inode number */
+	union {
+		xfs_dev_t	ilfu_rdev;	/* 32: rdev value for dev inode*/
+		uuid_t		ilfu_uuid;	/* 128: mount point value */
 	} ilf_u;
-} xfs_inode_log_format_t_v1;
+	__int64_t		ilf_blkno;	/* 64: blkno of inode buffer */
+	int			ilf_len;	/* 32: len of inode buffer */
+	int			ilf_boffset;	/* 32: off of inode in buffer */
+} xfs_inode_log_format_64_t;
 
 /*
  * Flags for xfs_trans_log_inode flags field.
@@ -172,6 +173,8 @@ extern void xfs_inode_item_destroy(struct xfs_inode *);
 extern void xfs_iflush_done(struct xfs_buf *, xfs_inode_log_item_t *);
 extern void xfs_istale_done(struct xfs_buf *, xfs_inode_log_item_t *);
 extern void xfs_iflush_abort(struct xfs_inode *);
+extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
+					 xfs_inode_log_format_t *);
 
 #endif	/* __KERNEL__ */
 
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 1f0016b0b4e..efffa75fd5c 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2292,12 +2292,22 @@ xlog_recover_do_inode_trans(
 	int			attr_index;
 	uint			fields;
 	xfs_dinode_core_t	*dicp;
+	int			need_free = 0;
 
 	if (pass == XLOG_RECOVER_PASS1) {
 		return 0;
 	}
 
-	in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr;
+	if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
+		in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr;
+	} else {
+		in_f = (xfs_inode_log_format_t *)kmem_alloc(
+			sizeof(xfs_inode_log_format_t), KM_SLEEP);
+		need_free = 1;
+		error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
+		if (error)
+			goto error;
+	}
 	ino = in_f->ilf_ino;
 	mp = log->l_mp;
 	if (ITEM_TYPE(item) == XFS_LI_INODE) {
@@ -2323,8 +2333,10 @@ xlog_recover_do_inode_trans(
 	 * Inode buffers can be freed, look out for it,
 	 * and do not replay the inode.
 	 */
-	if (xlog_check_buffer_cancelled(log, imap.im_blkno, imap.im_len, 0))
-		return 0;
+	if (xlog_check_buffer_cancelled(log, imap.im_blkno, imap.im_len, 0)) {
+		error = 0;
+		goto error;
+	}
 
 	bp = xfs_buf_read_flags(mp->m_ddev_targp, imap.im_blkno, imap.im_len,
 								XFS_BUF_LOCK);
@@ -2333,7 +2345,7 @@ xlog_recover_do_inode_trans(
 				  bp, imap.im_blkno);
 		error = XFS_BUF_GETERROR(bp);
 		xfs_buf_relse(bp);
-		return error;
+		goto error;
 	}
 	error = 0;
 	ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
@@ -2350,7 +2362,8 @@ xlog_recover_do_inode_trans(
 			dip, bp, ino);
 		XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)",
 				 XFS_ERRLEVEL_LOW, mp);
-		return XFS_ERROR(EFSCORRUPTED);
+		error = EFSCORRUPTED;
+		goto error;
 	}
 	dicp = (xfs_dinode_core_t*)(item->ri_buf[1].i_addr);
 	if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
@@ -2360,7 +2373,8 @@ xlog_recover_do_inode_trans(
 			item, ino);
 		XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)",
 				 XFS_ERRLEVEL_LOW, mp);
-		return XFS_ERROR(EFSCORRUPTED);
+		error = EFSCORRUPTED;
+		goto error;
 	}
 
 	/* Skip replay when the on disk inode is newer than the log one */
@@ -2376,7 +2390,8 @@ xlog_recover_do_inode_trans(
 			/* do nothing */
 		} else {
 			xfs_buf_relse(bp);
-			return 0;
+			error = 0;
+			goto error;
 		}
 	}
 	/* Take the opportunity to reset the flush iteration count */
@@ -2391,7 +2406,8 @@ xlog_recover_do_inode_trans(
 			xfs_fs_cmn_err(CE_ALERT, mp,
 				"xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
 				item, dip, bp, ino);
-			return XFS_ERROR(EFSCORRUPTED);
+			error = EFSCORRUPTED;
+			goto error;
 		}
 	} else if (unlikely((dicp->di_mode & S_IFMT) == S_IFDIR)) {
 		if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
@@ -2403,7 +2419,8 @@ xlog_recover_do_inode_trans(
 			xfs_fs_cmn_err(CE_ALERT, mp,
 				"xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
 				item, dip, bp, ino);
-			return XFS_ERROR(EFSCORRUPTED);
+			error = EFSCORRUPTED;
+			goto error;
 		}
 	}
 	if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
@@ -2415,7 +2432,8 @@ xlog_recover_do_inode_trans(
 			item, dip, bp, ino,
 			dicp->di_nextents + dicp->di_anextents,
 			dicp->di_nblocks);
-		return XFS_ERROR(EFSCORRUPTED);
+		error = EFSCORRUPTED;
+		goto error;
 	}
 	if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
 		XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)",
@@ -2424,7 +2442,8 @@ xlog_recover_do_inode_trans(
 		xfs_fs_cmn_err(CE_ALERT, mp,
 			"xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x",
 			item, dip, bp, ino, dicp->di_forkoff);
-		return XFS_ERROR(EFSCORRUPTED);
+		error = EFSCORRUPTED;
+		goto error;
 	}
 	if (unlikely(item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t))) {
 		XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)",
@@ -2433,7 +2452,8 @@ xlog_recover_do_inode_trans(
 		xfs_fs_cmn_err(CE_ALERT, mp,
 			"xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p",
 			item->ri_buf[1].i_len, item);
-		return XFS_ERROR(EFSCORRUPTED);
+		error = EFSCORRUPTED;
+		goto error;
 	}
 
 	/* The core is in in-core format */
@@ -2521,7 +2541,8 @@ xlog_recover_do_inode_trans(
 			xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag");
 			ASSERT(0);
 			xfs_buf_relse(bp);
-			return XFS_ERROR(EIO);
+			error = EIO;
+			goto error;
 		}
 	}
 
@@ -2537,7 +2558,10 @@ write_inode_buffer:
 		error = xfs_bwrite(mp, bp);
 	}
 
-	return (error);
+error:
+	if (need_free)
+		kmem_free(in_f, sizeof(*in_f));
+	return XFS_ERROR(error);
 }
 
 /*
@@ -2674,32 +2698,32 @@ xlog_recover_do_dquot_trans(
  * structure into it, and adds the efi to the AIL with the given
  * LSN.
  */
-STATIC void
+STATIC int
 xlog_recover_do_efi_trans(
 	xlog_t			*log,
 	xlog_recover_item_t	*item,
 	xfs_lsn_t		lsn,
 	int			pass)
 {
+	int			error;
 	xfs_mount_t		*mp;
 	xfs_efi_log_item_t	*efip;
 	xfs_efi_log_format_t	*efi_formatp;
 	SPLDECL(s);
 
 	if (pass == XLOG_RECOVER_PASS1) {
-		return;
+		return 0;
 	}
 
 	efi_formatp = (xfs_efi_log_format_t *)item->ri_buf[0].i_addr;
-	ASSERT(item->ri_buf[0].i_len ==
-	       (sizeof(xfs_efi_log_format_t) +
-		((efi_formatp->efi_nextents - 1) * sizeof(xfs_extent_t))));
 
 	mp = log->l_mp;
 	efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
-	memcpy((char *)&(efip->efi_format), (char *)efi_formatp,
-	      sizeof(xfs_efi_log_format_t) +
-	      ((efi_formatp->efi_nextents - 1) * sizeof(xfs_extent_t)));
+	if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
+					 &(efip->efi_format)))) {
+		xfs_efi_item_free(efip);
+		return error;
+	}
 	efip->efi_next_extent = efi_formatp->efi_nextents;
 	efip->efi_flags |= XFS_EFI_COMMITTED;
 
@@ -2708,6 +2732,7 @@ xlog_recover_do_efi_trans(
 	 * xfs_trans_update_ail() drops the AIL lock.
 	 */
 	xfs_trans_update_ail(mp, (xfs_log_item_t *)efip, lsn, s);
+	return 0;
 }
 
 
@@ -2738,9 +2763,10 @@ xlog_recover_do_efd_trans(
 	}
 
 	efd_formatp = (xfs_efd_log_format_t *)item->ri_buf[0].i_addr;
-	ASSERT(item->ri_buf[0].i_len ==
-	       (sizeof(xfs_efd_log_format_t) +
-		((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_t))));
+	ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
+		((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
+	       (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
+		((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t)))));
 	efi_id = efd_formatp->efd_efi_id;
 
 	/*
@@ -2810,15 +2836,14 @@ xlog_recover_do_trans(
 			if  ((error = xlog_recover_do_buffer_trans(log, item,
 								 pass)))
 				break;
-		} else if ((ITEM_TYPE(item) == XFS_LI_INODE) ||
-			   (ITEM_TYPE(item) == XFS_LI_6_1_INODE) ||
-			   (ITEM_TYPE(item) == XFS_LI_5_3_INODE)) {
+		} else if ((ITEM_TYPE(item) == XFS_LI_INODE)) {
 			if ((error = xlog_recover_do_inode_trans(log, item,
 								pass)))
 				break;
 		} else if (ITEM_TYPE(item) == XFS_LI_EFI) {
-			xlog_recover_do_efi_trans(log, item, trans->r_lsn,
-						  pass);
+			if ((error = xlog_recover_do_efi_trans(log, item, trans->r_lsn,
+						  pass)))
+				break;
 		} else if (ITEM_TYPE(item) == XFS_LI_EFD) {
 			xlog_recover_do_efd_trans(log, item, pass);
 		} else if (ITEM_TYPE(item) == XFS_LI_DQUOT) {
@@ -3798,7 +3823,7 @@ xlog_do_log_recovery(
 	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
 				      XLOG_RECOVER_PASS2);
 #ifdef DEBUG
-	{
+	if (!error) {
 		int	i;
 
 		for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
-- 
cgit v1.2.3


From 714250879ea61cdb1a39bb96fe9d934ee0c669a2 Mon Sep 17 00:00:00 2001
From: David Chinner <dgc@sgi.com>
Date: Fri, 9 Jun 2006 14:55:52 +1000
Subject: [XFS] Stop a BUG from occurring in generic_delete_inode by preventing
 transaction completion from marking the inode dirty while it is being cleaned
 up on it's way out of the system.

SGI-PV: 952967
SGI-Modid: xfs-linux-melb:xfs-kern:26040a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/xfs_inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 083fc0479e6..df695e96806 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2757,7 +2757,8 @@ xfs_iunpin(
 			if (vp) {
 				struct inode	*inode = vn_to_inode(vp);
 
-				if (!(inode->i_state & I_NEW))
+				if (!(inode->i_state &
+						(I_NEW|I_FREEING|I_CLEAR)))
 					mark_inode_dirty_sync(inode);
 			}
 		}
-- 
cgit v1.2.3


From 87c199c2a79220ac9e216e72d18a15148f84d9e0 Mon Sep 17 00:00:00 2001
From: Tim Shimmin <tes@sgi.com>
Date: Fri, 9 Jun 2006 14:56:16 +1000
Subject: [XFS] Over zealous with doing endian conversions. We endian converted
 the logged version of di_next_unlinked which is actually always stored in the
 correct ondisk format. This was pointed out to us by Shailendra Tripathi. And
 is evident in the xfs qa test of 121.

SGI-PV: 953263
SGI-Modid: xfs-linux-melb:xfs-kern:26044a

Signed-off-by: Tim Shimmin <tes@sgi.com>
Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/xfs_log_recover.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index efffa75fd5c..3abc98944c0 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -1889,7 +1889,7 @@ xlog_recover_do_inode_buffer(
 
 		buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
 					      next_unlinked_offset);
-		INT_SET(*buffer_nextp, ARCH_CONVERT, *logged_nextp);
+		*buffer_nextp = *logged_nextp;
 	}
 
 	return 0;
-- 
cgit v1.2.3


From 72c93bcc6348a385416603459c2fdb4cf6c43687 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 9 Jun 2006 14:57:01 +1000
Subject: [XFS] lock validator: lockdep: small xfs init_rwsem() cleanup
 init_rwsem() has no return value.  This is not a problem if init_rwsem() is a
 function, but it's a problem if it's a do { ...  } while (0) macro. (which
 lockdep introduces)

SGI-PV: 904196
SGI-Modid: xfs-linux-melb:xfs-kern:26082a

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/linux-2.6/mrlock.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/mrlock.h b/fs/xfs/linux-2.6/mrlock.h
index 1b262b790d9..32e1ce0f04c 100644
--- a/fs/xfs/linux-2.6/mrlock.h
+++ b/fs/xfs/linux-2.6/mrlock.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -28,7 +28,7 @@ typedef struct {
 } mrlock_t;
 
 #define mrinit(mrp, name)	\
-	( (mrp)->mr_writer = 0, init_rwsem(&(mrp)->mr_lock) )
+	do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0)
 #define mrlock_init(mrp, t,n,s)	mrinit(mrp, n)
 #define mrfree(mrp)		do { } while (0)
 #define mraccess(mrp)		mraccessf(mrp, 0)
-- 
cgit v1.2.3


From 3d80ede4799889ede2aa785c2511aef3e78d5bb1 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Fri, 9 Jun 2006 14:57:30 +1000
Subject: [XFS] Drop use of m_writeio_blocks when zeroing, its not meaningful
 anymore here.

SGI-PV: 904196
SGI-Modid: xfs-linux-melb:xfs-kern:26094a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/linux-2.6/xfs_lrw.c | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 67b5e1c20de..6fbdca3eaa7 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -510,7 +510,6 @@ xfs_zero_eof(
 	xfs_fileoff_t	end_zero_fsb;
 	xfs_fileoff_t	zero_count_fsb;
 	xfs_fileoff_t	last_fsb;
-	xfs_extlen_t	buf_len_fsb;
 	xfs_mount_t	*mp = io->io_mount;
 	int		nimaps;
 	int		error = 0;
@@ -579,16 +578,7 @@ xfs_zero_eof(
 		}
 
 		/*
-		 * There are blocks in the range requested.
-		 * Zero them a single write at a time.  We actually
-		 * don't zero the entire range returned if it is
-		 * too big and simply loop around to get the rest.
-		 * That is not the most efficient thing to do, but it
-		 * is simple and this path should not be exercised often.
-		 */
-		buf_len_fsb = XFS_FILBLKS_MIN(imap.br_blockcount,
-					      mp->m_writeio_blocks << 8);
-		/*
+		 * There are blocks we need to zero.
 		 * Drop the inode lock while we're doing the I/O.
 		 * We'll still have the iolock to protect us.
 		 */
@@ -596,14 +586,13 @@ xfs_zero_eof(
 
 		error = xfs_iozero(ip,
 				   XFS_FSB_TO_B(mp, start_zero_fsb),
-				   XFS_FSB_TO_B(mp, buf_len_fsb),
+				   XFS_FSB_TO_B(mp, imap.br_blockcount),
 				   end_size);
-
 		if (error) {
 			goto out_lock;
 		}
 
-		start_zero_fsb = imap.br_startoff + buf_len_fsb;
+		start_zero_fsb = imap.br_startoff + imap.br_blockcount;
 		ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
 
 		XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
-- 
cgit v1.2.3


From b76963fac4a17b661bad46e5a57b0f918c6f0cd1 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Fri, 9 Jun 2006 14:58:20 +1000
Subject: [XFS] getattr can return an error code, so propogate any from lower
 layers.

SGI-PV: 904196
SGI-Modid: xfs-linux-melb:xfs-kern:26095a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/linux-2.6/xfs_iops.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 2e2e275c786..0da1d6b081e 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -643,7 +643,7 @@ xfs_vn_getattr(
 		error = vn_revalidate(vp);
 	if (!error)
 		generic_fillattr(inode, stat);
-	return 0;
+	return -error;
 }
 
 STATIC int
-- 
cgit v1.2.3


From 7d04a335b6b2d79e3742ffd28bd651204574e794 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Fri, 9 Jun 2006 14:58:38 +1000
Subject: [XFS] Shutdown the filesystem if all device paths have gone.  Made
 shutdown vop flags consistent with sync vop flags declarations too.

SGI-PV: 939911
SGI-Modid: xfs-linux-melb:xfs-kern:26096a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/linux-2.6/xfs_aops.c  | 15 +++++++--------
 fs/xfs/linux-2.6/xfs_aops.h  |  2 +-
 fs/xfs/linux-2.6/xfs_vfs.h   |  7 +++++++
 fs/xfs/linux-2.6/xfs_vnode.c | 16 ++++++++++++++++
 fs/xfs/linux-2.6/xfs_vnode.h |  2 ++
 fs/xfs/quota/xfs_dquot.c     |  2 +-
 fs/xfs/xfs_bmap.c            |  4 ++--
 fs/xfs/xfs_buf_item.c        |  2 +-
 fs/xfs/xfs_fsops.c           |  7 ++++---
 fs/xfs/xfs_inode.c           |  4 ++--
 fs/xfs/xfs_log.c             | 10 +++++-----
 fs/xfs/xfs_log_recover.c     |  2 +-
 fs/xfs/xfs_mount.h           |  9 ---------
 fs/xfs/xfs_rw.c              | 36 ++++++++++++++++++++----------------
 fs/xfs/xfs_trans.c           |  4 ++--
 fs/xfs/xfs_trans_ail.c       |  5 +++--
 fs/xfs/xfs_trans_buf.c       |  6 +++---
 fs/xfs/xfs_vnodeops.c        |  2 +-
 18 files changed, 78 insertions(+), 57 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 4d191ef39b6..1fcdc0abda6 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -136,9 +136,10 @@ xfs_destroy_ioend(
 
 	for (bh = ioend->io_buffer_head; bh; bh = next) {
 		next = bh->b_private;
-		bh->b_end_io(bh, ioend->io_uptodate);
+		bh->b_end_io(bh, !ioend->io_error);
 	}
-
+	if (unlikely(ioend->io_error))
+		vn_ioerror(ioend->io_vnode, ioend->io_error, __FILE__,__LINE__);
 	vn_iowake(ioend->io_vnode);
 	mempool_free(ioend, xfs_ioend_pool);
 }
@@ -185,7 +186,7 @@ xfs_end_bio_unwritten(
 	size_t			size = ioend->io_size;
 	int			error;
 
-	if (ioend->io_uptodate)
+	if (likely(!ioend->io_error))
 		VOP_BMAP(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL, error);
 	xfs_destroy_ioend(ioend);
 }
@@ -211,7 +212,7 @@ xfs_alloc_ioend(
 	 * all the I/O from calling the completion routine too early.
 	 */
 	atomic_set(&ioend->io_remaining, 1);
-	ioend->io_uptodate = 1; /* cleared if any I/O fails */
+	ioend->io_error = 0;
 	ioend->io_list = NULL;
 	ioend->io_type = type;
 	ioend->io_vnode = vn_from_inode(inode);
@@ -271,16 +272,14 @@ xfs_end_bio(
 	if (bio->bi_size)
 		return 1;
 
-	ASSERT(ioend);
 	ASSERT(atomic_read(&bio->bi_cnt) >= 1);
+	ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;
 
 	/* Toss bio and pass work off to an xfsdatad thread */
-	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
-		ioend->io_uptodate = 0;
 	bio->bi_private = NULL;
 	bio->bi_end_io = NULL;
-
 	bio_put(bio);
+
 	xfs_finish_ioend(ioend);
 	return 0;
 }
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 60716543c68..41c7d2da638 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -30,7 +30,7 @@ typedef void (*xfs_ioend_func_t)(void *);
 typedef struct xfs_ioend {
 	struct xfs_ioend	*io_list;	/* next ioend in chain */
 	unsigned int		io_type;	/* delalloc / unwritten */
-	unsigned int		io_uptodate;	/* I/O status register */
+	int			io_error;	/* I/O error code */
 	atomic_t		io_remaining;	/* hold count */
 	struct vnode		*io_vnode;	/* file being written to */
 	struct buffer_head	*io_buffer_head;/* buffer linked list head */
diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h
index 841200c0309..2ca05043a0f 100644
--- a/fs/xfs/linux-2.6/xfs_vfs.h
+++ b/fs/xfs/linux-2.6/xfs_vfs.h
@@ -94,6 +94,13 @@ typedef enum {
 #define SYNC_REMOUNT		0x0080  /* remount readonly, no dummy LRs */
 #define SYNC_QUIESCE		0x0100  /* quiesce filesystem for a snapshot */
 
+#define SHUTDOWN_META_IO_ERROR	0x0001	/* write attempt to metadata failed */
+#define SHUTDOWN_LOG_IO_ERROR	0x0002	/* write attempt to the log failed */
+#define SHUTDOWN_FORCE_UMOUNT	0x0004	/* shutdown from a forced unmount */
+#define SHUTDOWN_CORRUPT_INCORE	0x0008	/* corrupt in-memory data structures */
+#define SHUTDOWN_REMOTE_REQ	0x0010	/* shutdown came from remote cell */
+#define SHUTDOWN_DEVICE_REQ	0x0020	/* failed all paths to the device */
+
 typedef int	(*vfs_mount_t)(bhv_desc_t *,
 				struct xfs_mount_args *, struct cred *);
 typedef int	(*vfs_parseargs_t)(bhv_desc_t *, char *,
diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c
index d27c25b27cc..f17e39cff23 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.c
+++ b/fs/xfs/linux-2.6/xfs_vnode.c
@@ -54,6 +54,22 @@ vn_iowake(
 		wake_up(vptosync(vp));
 }
 
+/*
+ * Volume managers supporting multiple paths can send back ENODEV when the
+ * final path disappears.  In this case continuing to fill the page cache
+ * with dirty data which cannot be written out is evil, so prevent that.
+ */
+void
+vn_ioerror(
+	struct vnode	*vp,
+	int		error,
+	char		*f,
+	int		l)
+{
+	if (unlikely(error == -ENODEV))
+		VFS_FORCE_SHUTDOWN(vp->v_vfsp, SHUTDOWN_DEVICE_REQ, f, l);
+}
+
 struct vnode *
 vn_initialize(
 	struct inode	*inode)
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index 2a8e16c2235..a64b7db6700 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -514,6 +514,8 @@ extern void	vn_revalidate_core(struct vnode *, vattr_t *);
 extern void	vn_iowait(struct vnode *vp);
 extern void	vn_iowake(struct vnode *vp);
 
+extern void	vn_ioerror(struct vnode *vp, int error, char *f, int l);
+
 static inline int vn_count(struct vnode *vp)
 {
 	return atomic_read(&vn_to_inode(vp)->i_count);
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 26ee5df4f83..46bec66bb4d 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -1261,7 +1261,7 @@ xfs_qm_dqflush(
 
 	if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id),
 			   0, XFS_QMOPT_DOWARN, "dqflush (incore copy)")) {
-		xfs_force_shutdown(dqp->q_mount, XFS_CORRUPT_INCORE);
+		xfs_force_shutdown(dqp->q_mount, SHUTDOWN_CORRUPT_INCORE);
 		return XFS_ERROR(EIO);
 	}
 
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index ad595dbefe1..b4529421823 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -4263,8 +4263,8 @@ xfs_bmap_finish(
 			if (!XFS_FORCED_SHUTDOWN(mp))
 				xfs_force_shutdown(mp,
 						   (error == EFSCORRUPTED) ?
-						   XFS_CORRUPT_INCORE :
-						   XFS_METADATA_IO_ERROR);
+						   SHUTDOWN_CORRUPT_INCORE :
+						   SHUTDOWN_META_IO_ERROR);
 			return error;
 		}
 		xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 5fed15682dd..59013e1f16e 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -1108,7 +1108,7 @@ xfs_buf_error_relse(
 	XFS_BUF_ERROR(bp,0);
 	xfs_buftrace("BUF_ERROR_RELSE", bp);
 	if (! XFS_FORCED_SHUTDOWN(mp))
-		xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
+		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
 	/*
 	 * We have to unpin the pinned buffers so do the
 	 * callbacks.
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index dfa3527b20a..96f4b3e8fa4 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -578,17 +578,18 @@ xfs_fs_goingdown(
 		struct super_block *sb = freeze_bdev(vfsp->vfs_super->s_bdev);
 
 		if (sb && !IS_ERR(sb)) {
-			xfs_force_shutdown(mp, XFS_FORCE_UMOUNT);
+			xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
 			thaw_bdev(sb->s_bdev, sb);
 		}
 	
 		break;
 	}
 	case XFS_FSOP_GOING_FLAGS_LOGFLUSH:
-		xfs_force_shutdown(mp, XFS_FORCE_UMOUNT);
+		xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
 		break;
 	case XFS_FSOP_GOING_FLAGS_NOLOGFLUSH:
-		xfs_force_shutdown(mp, XFS_FORCE_UMOUNT|XFS_LOG_IO_ERROR);
+		xfs_force_shutdown(mp,
+				SHUTDOWN_FORCE_UMOUNT | SHUTDOWN_LOG_IO_ERROR);
 		break;
 	default:
 		return XFS_ERROR(EINVAL);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index df695e96806..b30bffa99ed 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3206,7 +3206,7 @@ xfs_iflush(
 
 corrupt_out:
 	xfs_buf_relse(bp);
-	xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
+	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 	xfs_iflush_abort(ip);
 	/*
 	 * Unlocks the flush lock
@@ -3228,7 +3228,7 @@ cluster_corrupt_out:
 		xfs_buf_relse(bp);
 	}
 
-	xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
+	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 
 	if(!bufwasdelwri)  {
 		/*
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 32e841d2f26..7d882d6c7d4 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -402,7 +402,7 @@ xfs_log_release_iclog(xfs_mount_t *mp,
 	xlog_in_core_t	  *iclog = (xlog_in_core_t *)iclog_hndl;
 
 	if (xlog_state_release_iclog(log, iclog)) {
-		xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
+		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
 		return EIO;
 	}
 
@@ -726,7 +726,7 @@ xfs_log_write(xfs_mount_t *	mp,
 		return XFS_ERROR(EIO);
 
 	if ((error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0))) {
-		xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
+		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
 	}
 	return error;
 }	/* xfs_log_write */
@@ -956,7 +956,7 @@ xlog_iodone(xfs_buf_t *bp)
 			XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) {
 		xfs_ioerror_alert("xlog_iodone", l->l_mp, bp, XFS_BUF_ADDR(bp));
 		XFS_BUF_STALE(bp);
-		xfs_force_shutdown(l->l_mp, XFS_LOG_IO_ERROR);
+		xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR);
 		/*
 		 * This flag will be propagated to the trans-committed
 		 * callback routines to let them know that the log-commit
@@ -1261,7 +1261,7 @@ xlog_commit_record(xfs_mount_t  *mp,
 	ASSERT_ALWAYS(iclog);
 	if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp,
 			       iclog, XLOG_COMMIT_TRANS))) {
-		xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
+		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
 	}
 	return error;
 }	/* xlog_commit_record */
@@ -1790,7 +1790,7 @@ xlog_write(xfs_mount_t *	mp,
 	xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
 		"xfs_log_write: reservation ran out. Need to up reservation");
 	/* If we did not panic, shutdown the filesystem */
-	xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
+	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 #endif
     } else
 	ticket->t_curr_res -= len;
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 3abc98944c0..a17218e8153 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -282,7 +282,7 @@ xlog_recover_iodone(
 		mp = XFS_BUF_FSPRIVATE(bp, xfs_mount_t *);
 		xfs_ioerror_alert("xlog_recover_iodone",
 				  mp, bp, XFS_BUF_ADDR(bp));
-		xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
+		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
 	}
 	XFS_BUF_SET_FSPRIVATE(bp, NULL);
 	XFS_BUF_CLR_IODONE_FUNC(bp);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a682eb55810..2ca211214a9 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -513,15 +513,6 @@ xfs_preferred_iosize(xfs_mount_t *mp)
 #define xfs_force_shutdown(m,f)	\
 	VFS_FORCE_SHUTDOWN((XFS_MTOVFS(m)), f, __FILE__, __LINE__)
 
-/*
- * Flags sent to xfs_force_shutdown.
- */
-#define XFS_METADATA_IO_ERROR	0x1
-#define XFS_LOG_IO_ERROR	0x2
-#define XFS_FORCE_UMOUNT	0x4
-#define XFS_CORRUPT_INCORE	0x8	/* Corrupt in-memory data structures */
-#define XFS_SHUTDOWN_REMOTE_REQ 0x10	/* Shutdown came from remote cell */
-
 /*
  * Flags for xfs_mountfs
  */
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index a59c102cf21..d33e4f5808e 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -109,12 +109,12 @@ xfs_do_force_shutdown(
 	xfs_mount_t	*mp;
 
 	mp = XFS_BHVTOM(bdp);
-	logerror = flags & XFS_LOG_IO_ERROR;
+	logerror = flags & SHUTDOWN_LOG_IO_ERROR;
 
-	if (!(flags & XFS_FORCE_UMOUNT)) {
-		cmn_err(CE_NOTE,
-		"xfs_force_shutdown(%s,0x%x) called from line %d of file %s.  Return address = 0x%p",
-			mp->m_fsname,flags,lnnum,fname,__return_address);
+	if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
+		cmn_err(CE_NOTE, "xfs_force_shutdown(%s,0x%x) called from "
+				 "line %d of file %s.  Return address = 0x%p",
+			mp->m_fsname, flags, lnnum, fname, __return_address);
 	}
 	/*
 	 * No need to duplicate efforts.
@@ -125,33 +125,37 @@ xfs_do_force_shutdown(
 	/*
 	 * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't
 	 * queue up anybody new on the log reservations, and wakes up
-	 * everybody who's sleeping on log reservations and tells
-	 * them the bad news.
+	 * everybody who's sleeping on log reservations to tell them
+	 * the bad news.
 	 */
 	if (xfs_log_force_umount(mp, logerror))
 		return;
 
-	if (flags & XFS_CORRUPT_INCORE) {
+	if (flags & SHUTDOWN_CORRUPT_INCORE) {
 		xfs_cmn_err(XFS_PTAG_SHUTDOWN_CORRUPT, CE_ALERT, mp,
     "Corruption of in-memory data detected.  Shutting down filesystem: %s",
 			mp->m_fsname);
 		if (XFS_ERRLEVEL_HIGH <= xfs_error_level) {
 			xfs_stack_trace();
 		}
-	} else if (!(flags & XFS_FORCE_UMOUNT)) {
+	} else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
 		if (logerror) {
 			xfs_cmn_err(XFS_PTAG_SHUTDOWN_LOGERROR, CE_ALERT, mp,
-			"Log I/O Error Detected.  Shutting down filesystem: %s",
+		"Log I/O Error Detected.  Shutting down filesystem: %s",
 				mp->m_fsname);
-		} else if (!(flags & XFS_SHUTDOWN_REMOTE_REQ)) {
+		} else if (flags & SHUTDOWN_DEVICE_REQ) {
 			xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp,
-				"I/O Error Detected.  Shutting down filesystem: %s",
+		"All device paths lost.  Shutting down filesystem: %s",
+				mp->m_fsname);
+		} else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
+			xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp,
+		"I/O Error Detected.  Shutting down filesystem: %s",
 				mp->m_fsname);
 		}
 	}
-	if (!(flags & XFS_FORCE_UMOUNT)) {
-		cmn_err(CE_ALERT,
-		"Please umount the filesystem, and rectify the problem(s)");
+	if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
+		cmn_err(CE_ALERT, "Please umount the filesystem, "
+				  "and rectify the problem(s)");
 	}
 }
 
@@ -335,7 +339,7 @@ xfs_bwrite(
 		 * from bwrite and we could be tracing a buffer that has
 		 * been reused.
 		 */
-		xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
+		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
 	}
 	return (error);
 }
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 8d056cef5d1..c05da5871a5 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -846,7 +846,7 @@ shut_us_down:
 	 */
 	nvec = xfs_trans_count_vecs(tp);
 	if (nvec == 0) {
-		xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
+		xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
 		goto shut_us_down;
 	} else if (nvec <= XFS_TRANS_LOGVEC_COUNT) {
 		log_vector = log_vector_fast;
@@ -1148,7 +1148,7 @@ xfs_trans_cancel(
 	 */
 	if ((tp->t_flags & XFS_TRANS_DIRTY) && !XFS_FORCED_SHUTDOWN(mp)) {
 		XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp);
-		xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
+		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 	}
 #ifdef DEBUG
 	if (!(flags & XFS_TRANS_ABORT)) {
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 19ab24af1c1..e1ca7b6fde4 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -363,9 +363,10 @@ xfs_trans_delete_ail(
 			AIL_UNLOCK(mp, s);
 		else {
 			xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
-				"xfs_trans_delete_ail: attempting to delete a log item that is not in the AIL");
+		"%s: attempting to delete a log item that is not in the AIL",
+					__FUNCTION__);
 			AIL_UNLOCK(mp, s);
-			xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
+			xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 		}
 	}
 }
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index c74c31ebc81..42291364570 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -369,7 +369,7 @@ xfs_trans_read_buf(
 				 */
 				if (tp->t_flags & XFS_TRANS_DIRTY)
 					xfs_force_shutdown(tp->t_mountp,
-							   XFS_METADATA_IO_ERROR);
+							SHUTDOWN_META_IO_ERROR);
 				return error;
 			}
 		}
@@ -414,7 +414,7 @@ xfs_trans_read_buf(
 		xfs_ioerror_alert("xfs_trans_read_buf", mp,
 				  bp, blkno);
 		if (tp->t_flags & XFS_TRANS_DIRTY)
-			xfs_force_shutdown(tp->t_mountp, XFS_METADATA_IO_ERROR);
+			xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR);
 		xfs_buf_relse(bp);
 		return error;
 	}
@@ -423,7 +423,7 @@ xfs_trans_read_buf(
 		if (xfs_error_target == target) {
 			if (((xfs_req_num++) % xfs_error_mod) == 0) {
 				xfs_force_shutdown(tp->t_mountp,
-						   XFS_METADATA_IO_ERROR);
+						   SHUTDOWN_META_IO_ERROR);
 				xfs_buf_relse(bp);
 				printk("Returning error in trans!\n");
 				return XFS_ERROR(EIO);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index d1c6349e401..cb36a56392e 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1761,7 +1761,7 @@ xfs_inactive(
 			cmn_err(CE_NOTE,
 		"xfs_inactive:	xfs_ifree() returned an error = %d on %s",
 				error, mp->m_fsname);
-			xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
+			xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
 		}
 		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
 	} else {
-- 
cgit v1.2.3


From e109007461cddfc80a908f0b015f4eeb485e1d85 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Fri, 9 Jun 2006 14:58:48 +1000
Subject: [XFS] Fix a buffer refcount leak in dir2 code on a forced shutdown.

SGI-PV: 904196
SGI-Modid: xfs-linux-melb:xfs-kern:26097a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/xfs_dir2_node.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index c0e8fcf5e13..a8d483c0a84 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -505,7 +505,6 @@ xfs_dir2_leafn_lookup_int(
 							XFS_DATA_FORK))) {
 						return error;
 					}
-					curfdb = newfdb;
 					free = curbp->data;
 					ASSERT(be32_to_cpu(free->hdr.magic) ==
 					       XFS_DIR2_FREE_MAGIC);
@@ -527,8 +526,11 @@ xfs_dir2_leafn_lookup_int(
 				if (unlikely(be16_to_cpu(free->bests[fi]) == NULLDATAOFF)) {
 					XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int",
 							 XFS_ERRLEVEL_LOW, mp);
+					if (curfdb != newfdb)
+						xfs_da_brelse(tp, curbp);
 					return XFS_ERROR(EFSCORRUPTED);
 				}
+				curfdb = newfdb;
 				if (be16_to_cpu(free->bests[fi]) >= length) {
 					*indexp = index;
 					state->extravalid = 1;
-- 
cgit v1.2.3


From 59c1b082f5fff8269565039600a2ef18d48649b5 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Fri, 9 Jun 2006 14:59:13 +1000
Subject: [XFS] Make the pflags test/set wrappers more legible for us mere
 humans.

SGI-PV: 953338
SGI-Modid: xfs-linux-melb:xfs-kern:26099a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/linux-2.6/kmem.h      | 38 +-------------------------------------
 fs/xfs/linux-2.6/xfs_aops.c  |  4 ++--
 fs/xfs/linux-2.6/xfs_linux.h | 14 ++++++++++----
 fs/xfs/xfs_trans.c           | 25 +++++++++++--------------
 4 files changed, 24 insertions(+), 57 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index 2cfd33d4d8a..939bd84bc7e 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -22,42 +22,6 @@
 #include <linux/sched.h>
 #include <linux/mm.h>
 
-/*
- * Process flags handling
- */
-
-#define PFLAGS_TEST_NOIO()              (current->flags & PF_NOIO)
-#define PFLAGS_TEST_FSTRANS()           (current->flags & PF_FSTRANS)
-
-#define PFLAGS_SET_NOIO() do {		\
-	current->flags |= PF_NOIO;	\
-} while (0)
-
-#define PFLAGS_CLEAR_NOIO() do {	\
-	current->flags &= ~PF_NOIO;	\
-} while (0)
-
-/* these could be nested, so we save state */
-#define PFLAGS_SET_FSTRANS(STATEP) do {	\
-	*(STATEP) = current->flags;	\
-	current->flags |= PF_FSTRANS;	\
-} while (0)
-
-#define PFLAGS_CLEAR_FSTRANS(STATEP) do { \
-	*(STATEP) = current->flags;	\
-	current->flags &= ~PF_FSTRANS;	\
-} while (0)
-
-/* Restore the PF_FSTRANS state to what was saved in STATEP */
-#define PFLAGS_RESTORE_FSTRANS(STATEP) do {     		\
-	current->flags = ((current->flags & ~PF_FSTRANS) |	\
-			  (*(STATEP) & PF_FSTRANS));		\
-} while (0)
-
-#define PFLAGS_DUP(OSTATEP, NSTATEP) do { \
-	*(NSTATEP) = *(OSTATEP);	\
-} while (0)
-
 /*
  * General memory allocation interfaces
  */
@@ -83,7 +47,7 @@ kmem_flags_convert(unsigned int __nocast flags)
 		lflags = GFP_ATOMIC | __GFP_NOWARN;
 	} else {
 		lflags = GFP_KERNEL | __GFP_NOWARN;
-		if (PFLAGS_TEST_FSTRANS() || (flags & KM_NOFS))
+		if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
 			lflags &= ~__GFP_FS;
 	}
 	return lflags;
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 1fcdc0abda6..5835e699a7f 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1126,7 +1126,7 @@ xfs_vm_writepage(
 	 * then mark the page dirty again and leave the page
 	 * as is.
 	 */
-	if (PFLAGS_TEST_FSTRANS() && need_trans)
+	if (current_test_flags(PF_FSTRANS) && need_trans)
 		goto out_fail;
 
 	/*
@@ -1203,7 +1203,7 @@ xfs_vm_releasepage(
 	/* If we are already inside a transaction or the thread cannot
 	 * do I/O, we cannot release this page.
 	 */
-	if (PFLAGS_TEST_FSTRANS())
+	if (current_test_flags(PF_FSTRANS))
 		return 0;
 
 	/*
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 7d15cb91027..e9285395411 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -136,13 +136,19 @@ BUFFER_FNS(PrivateStart, unwritten);
 #define xfs_rotorstep		xfs_params.rotorstep.val
 #define xfs_inherit_nodefrag	xfs_params.inherit_nodfrg.val
 
-#ifndef raw_smp_processor_id
-#define raw_smp_processor_id()	smp_processor_id()
-#endif
-#define current_cpu()		raw_smp_processor_id()
+#define current_cpu()		(raw_smp_processor_id())
 #define current_pid()		(current->pid)
 #define current_fsuid(cred)	(current->fsuid)
 #define current_fsgid(cred)	(current->fsgid)
+#define current_set_flags(f)	(current->flags |= (f))
+#define current_test_flags(f)	(current->flags & (f))
+#define current_clear_flags(f)	(current->flags & ~(f))
+#define current_set_flags_nested(sp, f)		\
+		(*(sp) = current->flags, current->flags |= (f))
+#define current_clear_flags_nested(sp, f)	\
+		(*(sp) = current->flags, current->flags &= ~(f))
+#define current_restore_flags_nested(sp, f)	\
+		(current->flags = ((current->flags & ~(f)) | (*(sp) & (f))))
 
 #define NBPP		PAGE_SIZE
 #define DPPSHFT		(PAGE_SHIFT - 9)
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index c05da5871a5..7a99ed3b187 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -303,7 +303,7 @@ xfs_trans_dup(
 	tp->t_blk_res = tp->t_blk_res_used;
 	ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
 	tp->t_rtx_res = tp->t_rtx_res_used;
-	PFLAGS_DUP(&tp->t_pflags, &ntp->t_pflags);
+	ntp->t_pflags = tp->t_pflags;
 
 	XFS_TRANS_DUP_DQINFO(tp->t_mountp, tp, ntp);
 
@@ -335,14 +335,11 @@ xfs_trans_reserve(
 	uint		logcount)
 {
 	int		log_flags;
-	int		error;
-	int	rsvd;
-
-	error = 0;
-	rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+	int		error = 0;
+	int		rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
 
 	/* Mark this thread as being in a transaction */
-        PFLAGS_SET_FSTRANS(&tp->t_pflags);
+	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
 
 	/*
 	 * Attempt to reserve the needed disk blocks by decrementing
@@ -353,7 +350,7 @@ xfs_trans_reserve(
 		error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS,
 					  -blocks, rsvd);
 		if (error != 0) {
-                        PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
+			current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 			return (XFS_ERROR(ENOSPC));
 		}
 		tp->t_blk_res += blocks;
@@ -426,9 +423,9 @@ undo_blocks:
 		tp->t_blk_res = 0;
 	}
 
-        PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
+	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 
-	return (error);
+	return error;
 }
 
 
@@ -819,7 +816,7 @@ shut_us_down:
 			if (commit_lsn == -1 && !shutdown)
 				shutdown = XFS_ERROR(EIO);
 		}
-                PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
+		current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 		xfs_trans_free_items(tp, shutdown? XFS_TRANS_ABORT : 0);
 		xfs_trans_free_busy(tp);
 		xfs_trans_free(tp);
@@ -884,7 +881,7 @@ shut_us_down:
 	 * had pinned, clean up, free trans structure, and return error.
 	 */
 	if (error || commit_lsn == -1) {
-                PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
+		current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 		xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT);
 		return XFS_ERROR(EIO);
 	}
@@ -926,7 +923,7 @@ shut_us_down:
 	/*
 	 * Mark this thread as no longer being in a transaction
 	 */
-	PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
+	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 
 	/*
 	 * Once all the items of the transaction have been copied
@@ -1182,7 +1179,7 @@ xfs_trans_cancel(
 	}
 
 	/* mark this thread as no longer being in a transaction */
-        PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
+	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 
 	xfs_trans_free_items(tp, flags);
 	xfs_trans_free_busy(tp);
-- 
cgit v1.2.3


From 7d4fb40ad7efe4586d1341d4731377fb4530836f Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Fri, 9 Jun 2006 15:27:16 +1000
Subject: [XFS] Start writeout earlier (on last close) in the case where we
 have a truncate down followed by delayed allocation (buffered writes) - worst
 case scenario for the notorious NULL files problem.  This reduces the window
 where we are exposed to that problem significantly.

SGI-PV: 917976
SGI-Modid: xfs-linux-melb:xfs-kern:26100a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/linux-2.6/xfs_aops.c    | 13 +++++++
 fs/xfs/linux-2.6/xfs_file.c    | 15 +++++++
 fs/xfs/linux-2.6/xfs_fs_subr.c | 53 +++++--------------------
 fs/xfs/linux-2.6/xfs_vnode.h   | 39 ++++++++++++-------
 fs/xfs/xfs_vnodeops.c          | 88 +++++++++++++++++++++++++-----------------
 5 files changed, 114 insertions(+), 94 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 5835e699a7f..c0a90431685 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1157,6 +1157,18 @@ out_unlock:
 	return error;
 }
 
+STATIC int
+xfs_vm_writepages(
+	struct address_space	*mapping,
+	struct writeback_control *wbc)
+{
+	struct vnode		*vp = vn_from_inode(mapping->host);
+
+	if (VN_TRUNC(vp))
+		VUNTRUNCATE(vp);
+	return generic_writepages(mapping, wbc);
+}
+
 /*
  * Called to move a page into cleanable state - and from there
  * to be released. Possibly the page is already clean. We always
@@ -1451,6 +1463,7 @@ struct address_space_operations xfs_address_space_operations = {
 	.readpage		= xfs_vm_readpage,
 	.readpages		= xfs_vm_readpages,
 	.writepage		= xfs_vm_writepage,
+	.writepages		= xfs_vm_writepages,
 	.sync_page		= block_sync_page,
 	.releasepage		= xfs_vm_releasepage,
 	.invalidatepage		= xfs_vm_invalidatepage,
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 7c9f7598807..97615cc74ef 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -323,6 +323,17 @@ xfs_file_open(
 	return -error;
 }
 
+STATIC int
+xfs_file_close(
+	struct file	*filp)
+{
+	vnode_t		*vp = vn_from_inode(filp->f_dentry->d_inode);
+	int		error;
+
+	VOP_CLOSE(vp, 0, file_count(filp) > 1 ? L_FALSE : L_TRUE, NULL, error);
+	return -error;
+}
+
 STATIC int
 xfs_file_release(
 	struct inode	*inode,
@@ -349,6 +360,8 @@ xfs_file_fsync(
 
 	if (datasync)
 		flags |= FSYNC_DATA;
+	if (VN_TRUNC(vp))
+		VUNTRUNCATE(vp);
 	VOP_FSYNC(vp, flags, NULL, (xfs_off_t)0, (xfs_off_t)-1, error);
 	return -error;
 }
@@ -578,6 +591,7 @@ const struct file_operations xfs_file_operations = {
 #endif
 	.mmap		= xfs_file_mmap,
 	.open		= xfs_file_open,
+	.flush		= xfs_file_close,
 	.release	= xfs_file_release,
 	.fsync		= xfs_file_fsync,
 #ifdef HAVE_FOP_OPEN_EXEC
@@ -602,6 +616,7 @@ const struct file_operations xfs_invis_file_operations = {
 #endif
 	.mmap		= xfs_file_mmap,
 	.open		= xfs_file_open,
+	.flush		= xfs_file_close,
 	.release	= xfs_file_release,
 	.fsync		= xfs_file_fsync,
 };
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 575f2a790f3..f0c56daf4d6 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2002,2005-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -15,40 +15,12 @@
  * along with this program; if not, write the Free Software Foundation,
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
-
 #include "xfs.h"
 
-/*
- * Stub for no-op vnode operations that return error status.
- */
-int
-fs_noerr(void)
-{
-	return 0;
-}
+int  fs_noerr(void) { return 0; }
+int  fs_nosys(void) { return ENOSYS; }
+void fs_noval(void) { return; }
 
-/*
- * Operation unsupported under this file system.
- */
-int
-fs_nosys(void)
-{
-	return ENOSYS;
-}
-
-/*
- * Stub for inactive, strategy, and read/write lock/unlock.  Does nothing.
- */
-/* ARGSUSED */
-void
-fs_noval(void)
-{
-}
-
-/*
- * vnode pcache layer for vnode_tosspages.
- * 'last' parameter unused but left in for IRIX compatibility
- */
 void
 fs_tosspages(
 	bhv_desc_t	*bdp,
@@ -63,11 +35,6 @@ fs_tosspages(
 		truncate_inode_pages(ip->i_mapping, first);
 }
 
-
-/*
- * vnode pcache layer for vnode_flushinval_pages.
- * 'last' parameter unused but left in for IRIX compatibility
- */
 void
 fs_flushinval_pages(
 	bhv_desc_t	*bdp,
@@ -79,16 +46,13 @@ fs_flushinval_pages(
 	struct inode	*ip = vn_to_inode(vp);
 
 	if (VN_CACHED(vp)) {
+		if (VN_TRUNC(vp))
+			VUNTRUNCATE(vp);
 		filemap_write_and_wait(ip->i_mapping);
-
 		truncate_inode_pages(ip->i_mapping, first);
 	}
 }
 
-/*
- * vnode pcache layer for vnode_flush_pages.
- * 'last' parameter unused but left in for IRIX compatibility
- */
 int
 fs_flush_pages(
 	bhv_desc_t	*bdp,
@@ -100,12 +64,13 @@ fs_flush_pages(
 	vnode_t		*vp = BHV_TO_VNODE(bdp);
 	struct inode	*ip = vn_to_inode(vp);
 
-	if (VN_CACHED(vp)) {
+	if (VN_DIRTY(vp)) {
+		if (VN_TRUNC(vp))
+			VUNTRUNCATE(vp);
 		filemap_fdatawrite(ip->i_mapping);
 		if (flags & XFS_B_ASYNC)
 			return 0;
 		filemap_fdatawait(ip->i_mapping);
 	}
-
 	return 0;
 }
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index a64b7db6700..569a4e7b5cc 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -56,12 +56,18 @@ typedef xfs_ino_t vnumber_t;
 typedef struct dentry vname_t;
 typedef bhv_head_t vn_bhv_head_t;
 
+typedef enum vflags {
+	VMODIFIED	= 0x08,	/* XFS inode state possibly differs */
+				/* to the Linux inode state. */
+	VTRUNCATED	= 0x40,	/* truncated down so flush-on-close */
+} vflags_t;
+
 /*
  * MP locking protocols:
  *	v_flag, v_vfsp				VN_LOCK/VN_UNLOCK
  */
 typedef struct vnode {
-	__u32		v_flag;			/* vnode flags (see below) */
+	vflags_t	v_flag;			/* vnode flags (see above) */
 	struct vfs	*v_vfsp;		/* ptr to containing VFS */
 	vnumber_t	v_number;		/* in-core vnode number */
 	vn_bhv_head_t	v_bh;			/* behavior head */
@@ -125,12 +131,6 @@ static inline struct inode *vn_to_inode(struct vnode *vnode)
 	return &vnode->v_inode;
 }
 
-/*
- * Vnode flags.
- */
-#define VMODIFIED	       0x8	/* XFS inode state possibly differs */
-					/* to the Linux inode state.	*/
-
 /*
  * Values for the VOP_RWLOCK and VOP_RWUNLOCK flags parameter.
  */
@@ -162,8 +162,10 @@ typedef enum vchange {
 	VCHANGE_FLAGS_IOEXCL_COUNT	= 4
 } vchange_t;
 
+typedef enum { L_FALSE, L_TRUE } lastclose_t;
 
 typedef int	(*vop_open_t)(bhv_desc_t *, struct cred *);
+typedef int	(*vop_close_t)(bhv_desc_t *, int, lastclose_t, struct cred *);
 typedef ssize_t (*vop_read_t)(bhv_desc_t *, struct kiocb *,
 				const struct iovec *, unsigned int,
 				loff_t *, int, struct cred *);
@@ -234,6 +236,7 @@ typedef int	(*vop_iflush_t)(bhv_desc_t *, int);
 typedef struct vnodeops {
 	bhv_position_t  vn_position;    /* position within behavior chain */
 	vop_open_t		vop_open;
+	vop_close_t		vop_close;
 	vop_read_t		vop_read;
 	vop_write_t		vop_write;
 	vop_sendfile_t		vop_sendfile;
@@ -278,6 +281,10 @@ typedef struct vnodeops {
  */
 #define _VOP_(op, vp)	(*((vnodeops_t *)(vp)->v_fops)->op)
 
+#define VOP_OPEN(vp, cr, rv)						\
+	rv = _VOP_(vop_open, vp)((vp)->v_fbhv, cr)
+#define VOP_CLOSE(vp, f, last, cr, rv)					\
+	rv = _VOP_(vop_close, vp)((vp)->v_fbhv, f, last, cr)
 #define VOP_READ(vp,file,iov,segs,offset,ioflags,cr,rv)			\
 	rv = _VOP_(vop_read, vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr)
 #define VOP_WRITE(vp,file,iov,segs,offset,ioflags,cr,rv)		\
@@ -290,8 +297,6 @@ typedef struct vnodeops {
 	rv = _VOP_(vop_splice_write, vp)((vp)->v_fbhv,f,o,pipe,cnt,fl,iofl,cr)
 #define VOP_BMAP(vp,of,sz,rw,b,n,rv)					\
 	rv = _VOP_(vop_bmap, vp)((vp)->v_fbhv,of,sz,rw,b,n)
-#define VOP_OPEN(vp, cr, rv)						\
-	rv = _VOP_(vop_open, vp)((vp)->v_fbhv, cr)
 #define VOP_GETATTR(vp, vap, f, cr, rv)					\
 	rv = _VOP_(vop_getattr, vp)((vp)->v_fbhv, vap, f, cr)
 #define	VOP_SETATTR(vp, vap, f, cr, rv)					\
@@ -556,8 +561,6 @@ static inline struct vnode *vn_grab(struct vnode *vp)
  */
 #define VN_LOCK(vp)		mutex_spinlock(&(vp)->v_lock)
 #define VN_UNLOCK(vp, s)	mutex_spinunlock(&(vp)->v_lock, s)
-#define VN_FLAGSET(vp,b)	vn_flagset(vp,b)
-#define VN_FLAGCLR(vp,b)	vn_flagclr(vp,b)
 
 static __inline__ void vn_flagset(struct vnode *vp, uint flag)
 {
@@ -566,13 +569,22 @@ static __inline__ void vn_flagset(struct vnode *vp, uint flag)
 	spin_unlock(&vp->v_lock);
 }
 
-static __inline__ void vn_flagclr(struct vnode *vp, uint flag)
+static __inline__ uint vn_flagclr(struct vnode *vp, uint flag)
 {
+	uint	cleared;
+
 	spin_lock(&vp->v_lock);
+	cleared = (vp->v_flag & flag);
 	vp->v_flag &= ~flag;
 	spin_unlock(&vp->v_lock);
+	return cleared;
 }
 
+#define VMODIFY(vp)	vn_flagset(vp, VMODIFIED)
+#define VUNMODIFY(vp)	vn_flagclr(vp, VMODIFIED)
+#define VTRUNCATE(vp)	vn_flagset(vp, VTRUNCATED)
+#define VUNTRUNCATE(vp)	vn_flagclr(vp, VTRUNCATED)
+
 /*
  * Dealing with bad inodes
  */
@@ -612,8 +624,7 @@ static inline void vn_atime_to_time_t(struct vnode *vp, time_t *tt)
 #define VN_CACHED(vp)	(vn_to_inode(vp)->i_mapping->nrpages)
 #define VN_DIRTY(vp)	mapping_tagged(vn_to_inode(vp)->i_mapping, \
 					PAGECACHE_TAG_DIRTY)
-#define VMODIFY(vp)	VN_FLAGSET(vp, VMODIFIED)
-#define VUNMODIFY(vp)	VN_FLAGCLR(vp, VMODIFIED)
+#define VN_TRUNC(vp)	((vp)->v_flag & VTRUNCATED)
 
 /*
  * Flags to VOP_SETATTR/VOP_GETATTR.
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index cb36a56392e..35906bae92e 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -16,8 +16,6 @@
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
-#include <linux/capability.h>
-
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_types.h"
@@ -58,32 +56,14 @@
 #include "xfs_log_priv.h"
 #include "xfs_mac.h"
 
-
-/*
- * The maximum pathlen is 1024 bytes. Since the minimum file system
- * blocksize is 512 bytes, we can get a max of 2 extents back from
- * bmapi.
- */
-#define SYMLINK_MAPS 2
-
-/*
- * For xfs, we check that the file isn't too big to be opened by this kernel.
- * No other open action is required for regular files.  Devices are handled
- * through the specfs file system, pipes through fifofs.  Device and
- * fifo vnodes are "wrapped" by specfs and fifofs vnodes, respectively,
- * when a new vnode is first looked up or created.
- */
 STATIC int
 xfs_open(
 	bhv_desc_t	*bdp,
 	cred_t		*credp)
 {
 	int		mode;
-	vnode_t		*vp;
-	xfs_inode_t	*ip;
-
-	vp = BHV_TO_VNODE(bdp);
-	ip = XFS_BHVTOI(bdp);
+	vnode_t		*vp = BHV_TO_VNODE(bdp);
+	xfs_inode_t	*ip = XFS_BHVTOI(bdp);
 
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return XFS_ERROR(EIO);
@@ -101,6 +81,36 @@ xfs_open(
 	return 0;
 }
 
+STATIC int
+xfs_close(
+	bhv_desc_t	*bdp,
+	int		flags,
+	lastclose_t	lastclose,
+	cred_t		*credp)
+{
+	vnode_t		*vp = BHV_TO_VNODE(bdp);
+	xfs_inode_t	*ip = XFS_BHVTOI(bdp);
+	int		error = 0;
+
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		return XFS_ERROR(EIO);
+
+	if (lastclose != L_TRUE || !VN_ISREG(vp))
+		return 0;
+
+	/*
+	 * If we previously truncated this file and removed old data in
+	 * the process, we want to initiate "early" writeout on the last
+	 * close.  This is an attempt to combat the notorious NULL files
+	 * problem which is particularly noticable from a truncate down,
+	 * buffered (re-)write (delalloc), followed by a crash.  What we
+	 * are effectively doing here is significantly reducing the time
+	 * window where we'd otherwise be exposed to that problem.
+	 */
+	if (VUNTRUNCATE(vp) && VN_DIRTY(vp) && ip->i_delayed_blks > 0)
+		VOP_FLUSH_PAGES(vp, 0, -1, XFS_B_ASYNC, FI_NONE, error);
+	return error;
+}
 
 /*
  * xfs_getattr
@@ -665,9 +675,17 @@ xfs_setattr(
 					    ((ip->i_d.di_nlink != 0 ||
 					      !(mp->m_flags & XFS_MOUNT_WSYNC))
 					     ? 1 : 0));
-			if (code) {
+			if (code)
 				goto abort_return;
-			}
+			/*
+			 * Truncated "down", so we're removing references
+			 * to old data here - if we now delay flushing for
+			 * a long time, we expose ourselves unduly to the
+			 * notorious NULL files problem.  So, we mark this
+			 * vnode and flush it when the file is closed, and
+			 * do not wait the usual (long) time for writeout.
+			 */
+			VTRUNCATE(vp);
 		}
 		/*
 		 * Have to do this even if the file's size doesn't change.
@@ -936,6 +954,13 @@ xfs_access(
 }
 
 
+/*
+ * The maximum pathlen is 1024 bytes. Since the minimum file system
+ * blocksize is 512 bytes, we can get a max of 2 extents back from
+ * bmapi.
+ */
+#define SYMLINK_MAPS 2
+
 /*
  * xfs_readlink
  *
@@ -1470,9 +1495,6 @@ xfs_inactive_symlink_local(
 	return 0;
 }
 
-/*
- *
- */
 STATIC int
 xfs_inactive_attrs(
 	xfs_inode_t	*ip,
@@ -1531,10 +1553,10 @@ xfs_release(
 
 	vp = BHV_TO_VNODE(bdp);
 	ip = XFS_BHVTOI(bdp);
+	mp = ip->i_mount;
 
-	if (!VN_ISREG(vp) || (ip->i_d.di_mode == 0)) {
+	if (!VN_ISREG(vp) || (ip->i_d.di_mode == 0))
 		return 0;
-	}
 
 	/* If this is a read-only mount, don't do this (would generate I/O) */
 	if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
@@ -1546,8 +1568,6 @@ xfs_release(
 		return 0;
 #endif
 
-	mp = ip->i_mount;
-
 	if (ip->i_d.di_nlink != 0) {
 		if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
 		     ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0 ||
@@ -3745,7 +3765,6 @@ xfs_inode_flush(
 	return error;
 }
 
-
 int
 xfs_set_dmattrs (
 	bhv_desc_t	*bdp,
@@ -3786,10 +3805,6 @@ xfs_set_dmattrs (
 	return error;
 }
 
-
-/*
- * xfs_reclaim
- */
 STATIC int
 xfs_reclaim(
 	bhv_desc_t	*bdp)
@@ -4645,6 +4660,7 @@ xfs_change_file_space(
 vnodeops_t xfs_vnodeops = {
 	BHV_IDENTITY_INIT(VN_BHV_XFS,VNODE_POSITION_XFS),
 	.vop_open		= xfs_open,
+	.vop_close		= xfs_close,
 	.vop_read		= xfs_read,
 #ifdef HAVE_SENDFILE
 	.vop_sendfile		= xfs_sendfile,
-- 
cgit v1.2.3


From ad723875ac238137207754d2633dd9c261c71558 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Fri, 9 Jun 2006 15:29:12 +1000
Subject: [XFS] Fix a typo in a header file comment.

SGI-PV: 904196
SGI-Modid: xfs-linux-melb:xfs-kern:26101a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/linux-2.6/xfs_aops.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 41c7d2da638..62c1d49025f 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2005 Silicon Graphics, Inc.
+ * Copyright (c) 2005-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -43,4 +43,4 @@ typedef struct xfs_ioend {
 extern struct address_space_operations xfs_address_space_operations;
 extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
 
-#endif /* __XFS_IOPS_H__ */
+#endif /* __XFS_AOPS_H__ */
-- 
cgit v1.2.3


From 9c48876a05b6fbe41f1933fae3529c268d78cad0 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Fri, 9 Jun 2006 15:29:22 +1000
Subject: [XFS] Remove dead code from come bulkstat paths.

SGI-PV: 904196
SGI-Modid: xfs-linux-melb:xfs-kern:26102a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/quota/xfs_qm.c |  4 +---
 fs/xfs/xfs_itable.c   | 16 ----------------
 fs/xfs/xfs_itable.h   |  1 -
 3 files changed, 1 insertion(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 492840b2a35..00fb54d4899 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -1905,9 +1905,7 @@ xfs_qm_quotacheck(
 		 */
 		if ((error = xfs_bulkstat(mp, &lastino, &count,
 				     xfs_qm_dqusage_adjust, NULL,
-				     structsz, NULL,
-				     BULKSTAT_FG_IGET|BULKSTAT_FG_VFSLOCKED,
-				     &done)))
+				     structsz, NULL, BULKSTAT_FG_IGET, &done)))
 			break;
 
 	} while (! done);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 94068d014f2..864b43d6584 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -41,11 +41,6 @@
 #include "xfs_error.h"
 #include "xfs_btree.h"
 
-#ifndef HAVE_USERACC
-#define useracc(ubuffer, size, flags, foo) (0)
-#define unuseracc(ubuffer, size, flags)
-#endif
-
 STATIC int
 xfs_bulkstat_one_iget(
 	xfs_mount_t	*mp,		/* mount point for filesystem */
@@ -335,15 +330,6 @@ xfs_bulkstat(
 		(XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog);
 	nimask = ~(nicluster - 1);
 	nbcluster = nicluster >> mp->m_sb.sb_inopblog;
-	/*
-	 * Lock down the user's buffer. If a buffer was not sent, as in the case
-	 * disk quota code calls here, we skip this.
-	 */
-	if (ubuffer &&
-	    (error = useracc(ubuffer, ubcount * statstruct_size,
-			(B_READ|B_PHYS), NULL))) {
-		return error;
-	}
 	/*
 	 * Allocate a page-sized buffer for inode btree records.
 	 * We could try allocating something smaller, but for normal
@@ -650,8 +636,6 @@ xfs_bulkstat(
 	 * Done, we're either out of filesystem or space to put the data.
 	 */
 	kmem_free(irbuf, NBPC);
-	if (ubuffer)
-		unuseracc(ubuffer, ubcount * statstruct_size, (B_READ|B_PHYS));
 	*ubcountp = ubelem;
 	if (agno >= mp->m_sb.sb_agcount) {
 		/*
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 11eb4e1b18c..be5f12e07d2 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -45,7 +45,6 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount	*mp,
  */
 #define	BULKSTAT_FG_IGET	0x1	/* Go through the buffer cache */
 #define	BULKSTAT_FG_QUICK	0x2	/* No iget, walk the dinode cluster */
-#define BULKSTAT_FG_VFSLOCKED	0x4	/* Already have vfs lock */
 
 /*
  * Return stat information in bulk (by-inode) for the filesystem.
-- 
cgit v1.2.3


From b65745205fc00d8c7722ec74e9bd955f3861c7e2 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Fri, 9 Jun 2006 15:29:40 +1000
Subject: [XFS] Portability changes: remove prdev, stick to one diagnostic
 interface.

SGI-PV: 953338
SGI-Modid: xfs-linux-melb:xfs-kern:26103a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/support/debug.c   |  4 ++--
 fs/xfs/support/debug.h   |  3 ---
 fs/xfs/xfs_buf_item.c    |  6 ++---
 fs/xfs/xfs_inode.c       |  5 ++--
 fs/xfs/xfs_log_recover.c | 18 +++++++--------
 fs/xfs/xfs_mount.c       |  5 ++--
 fs/xfs/xfs_rtalloc.c     | 12 +++++-----
 fs/xfs/xfs_trans_buf.c   |  4 ++--
 fs/xfs/xfs_trans_item.c  |  2 +-
 fs/xfs/xfs_vfsops.c      | 59 +++++++++++++++++++++++++++++++-----------------
 10 files changed, 67 insertions(+), 51 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index b08b3d9345b..36fbeccdc72 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -47,7 +47,7 @@ cmn_err(register int level, char *fmt, ...)
 	va_start(ap, fmt);
 	if (*fmt == '!') fp++;
 	len = vsprintf(message, fp, ap);
-	if (message[len-1] != '\n')
+	if (level != CE_DEBUG && message[len-1] != '\n')
 		strcat(message, "\n");
 	printk("%s%s", err_level[level], message);
 	va_end(ap);
@@ -68,7 +68,7 @@ icmn_err(register int level, char *fmt, va_list ap)
 		level = XFS_MAX_ERR_LEVEL;
 	spin_lock_irqsave(&xfs_err_lock,flags);
 	len = vsprintf(message, fmt, ap);
-	if (message[len-1] != '\n')
+	if (level != CE_DEBUG && message[len-1] != '\n')
 		strcat(message, "\n");
 	spin_unlock_irqrestore(&xfs_err_lock,flags);
 	printk("%s%s", err_level[level], message);
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
index e3bf58112e7..4f54dca662a 100644
--- a/fs/xfs/support/debug.h
+++ b/fs/xfs/support/debug.h
@@ -33,9 +33,6 @@ extern void cmn_err(int, char *, ...)
 	__attribute__ ((format (printf, 2, 3)));
 extern void assfail(char *expr, char *f, int l);
 
-#define prdev(fmt,targ,args...) \
-	printk("Device %s - " fmt "\n", XFS_BUFTARG_NAME(targ), ## args)
-
 #define ASSERT_ALWAYS(expr)	\
 	(unlikely((expr) != 0) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
 
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 59013e1f16e..290912cbff6 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -1030,9 +1030,9 @@ xfs_buf_iodone_callbacks(
 		if ((XFS_BUF_TARGET(bp) != lasttarg) ||
 		    (time_after(jiffies, (lasttime + 5*HZ)))) {
 			lasttime = jiffies;
-			prdev("XFS write error in file system meta-data "
-			      "block 0x%llx in %s",
-			      XFS_BUF_TARGET(bp),
+			cmn_err(CE_ALERT, "Device %s, XFS metadata write error"
+					" block 0x%llx in %s",
+				XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
 			      (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname);
 		}
 		lasttarg = XFS_BUF_TARGET(bp);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index b30bffa99ed..50119a00885 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -352,8 +352,9 @@ xfs_itobp(
 		if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP,
 				 XFS_RANDOM_ITOBP_INOTOBP))) {
 #ifdef DEBUG
-			prdev("bad inode magic/vsn daddr %lld #%d (magic=%x)",
-				mp->m_ddev_targp,
+			cmn_err(CE_ALERT, "Device %s - bad inode magic/vsn "
+					  "daddr %lld #%d (magic=%x)",
+				XFS_BUFTARG_NAME(mp->m_ddev_targp),
 				(unsigned long long)imap.im_blkno, i,
 				INT_GET(dip->di_core.di_magic, ARCH_CONVERT));
 #endif
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index a17218e8153..f952f9dbf74 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -193,14 +193,14 @@ xlog_header_check_dump(
 {
 	int			b;
 
-	printk("%s:  SB : uuid = ", __FUNCTION__);
+	cmn_err(CE_DEBUG, "%s:  SB : uuid = ", __FUNCTION__);
 	for (b = 0; b < 16; b++)
-		printk("%02x",((unsigned char *)&mp->m_sb.sb_uuid)[b]);
-	printk(", fmt = %d\n", XLOG_FMT);
-	printk("    log : uuid = ");
+		cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]);
+	cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT);
+	cmn_err(CE_DEBUG, "    log : uuid = ");
 	for (b = 0; b < 16; b++)
-		printk("%02x",((unsigned char *)&head->h_fs_uuid)[b]);
-	printk(", fmt = %d\n", INT_GET(head->h_fmt, ARCH_CONVERT));
+		cmn_err(CE_DEBUG, "%02x",((uchar_t *)&head->h_fs_uuid)[b]);
+	cmn_err(CE_DEBUG, ", fmt = %d\n", INT_GET(head->h_fmt, ARCH_CONVERT));
 }
 #else
 #define xlog_header_check_dump(mp, head)
@@ -3444,13 +3444,13 @@ xlog_unpack_data_checksum(
 	    if (rhead->h_chksum ||
 		((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) {
 		    cmn_err(CE_DEBUG,
-			"XFS: LogR chksum mismatch: was (0x%x) is (0x%x)",
+			"XFS: LogR chksum mismatch: was (0x%x) is (0x%x)\n",
 			    INT_GET(rhead->h_chksum, ARCH_CONVERT), chksum);
 		    cmn_err(CE_DEBUG,
 "XFS: Disregard message if filesystem was created with non-DEBUG kernel");
 		    if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
 			    cmn_err(CE_DEBUG,
-				"XFS: LogR this is a LogV2 filesystem");
+				"XFS: LogR this is a LogV2 filesystem\n");
 		    }
 		    log->l_flags |= XLOG_CHKSUM_MISMATCH;
 	    }
@@ -3999,7 +3999,7 @@ xlog_recover_finish(
 		log->l_flags &= ~XLOG_RECOVERY_NEEDED;
 	} else {
 		cmn_err(CE_DEBUG,
-			"!Ending clean XFS mount for filesystem: %s",
+			"!Ending clean XFS mount for filesystem: %s\n",
 			log->l_mp->m_fsname);
 	}
 	return 0;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 4b7be49cc4d..9378408a69d 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1006,8 +1006,9 @@ xfs_mountfs(
 
 	if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) {
 		cmn_err(CE_WARN, "XFS: corrupted root inode");
-		prdev("Root inode %llu is not a directory",
-		      mp->m_ddev_targp, (unsigned long long)rip->i_ino);
+		cmn_err(CE_WARN, "Device %s - root %llu is not a directory",
+			XFS_BUFTARG_NAME(mp->m_ddev_targp),
+			(unsigned long long)rip->i_ino);
 		xfs_iunlock(rip, XFS_ILOCK_EXCL);
 		XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
 				 mp);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index f5944c8b378..af290cf37ac 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -2404,10 +2404,10 @@ xfs_rtprint_range(
 {
 	xfs_extlen_t	i;		/* block number in the extent */
 
-	printk("%Ld: ", (long long)start);
+	cmn_err(CE_DEBUG, "%Ld: ", (long long)start);
 	for (i = 0; i < len; i++)
-		printk("%d", xfs_rtcheck_bit(mp, tp, start + i, 1));
-	printk("\n");
+		cmn_err(CE_DEBUG, "%d", xfs_rtcheck_bit(mp, tp, start + i, 1));
+	cmn_err(CE_DEBUG, "\n");
 }
 
 /*
@@ -2431,17 +2431,17 @@ xfs_rtprint_summary(
 			(void)xfs_rtget_summary(mp, tp, l, i, &sumbp, &sb, &c);
 			if (c) {
 				if (!p) {
-					printk("%Ld-%Ld:", 1LL << l,
+					cmn_err(CE_DEBUG, "%Ld-%Ld:", 1LL << l,
 						XFS_RTMIN((1LL << l) +
 							  ((1LL << l) - 1LL),
 							 mp->m_sb.sb_rextents));
 					p = 1;
 				}
-				printk(" %Ld:%d", (long long)i, c);
+				cmn_err(CE_DEBUG, " %Ld:%d", (long long)i, c);
 			}
 		}
 		if (p)
-			printk("\n");
+			cmn_err(CE_DEBUG, "\n");
 	}
 	if (sumbp)
 		xfs_trans_brelse(tp, sumbp);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 42291364570..8cedd1583bc 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -320,7 +320,7 @@ xfs_trans_read_buf(
 			if (xfs_error_target == target) {
 				if (((xfs_req_num++) % xfs_error_mod) == 0) {
 					xfs_buf_relse(bp);
-					printk("Returning error!\n");
+					cmn_err(CE_DEBUG, "Returning error!\n");
 					return XFS_ERROR(EIO);
 				}
 			}
@@ -425,7 +425,7 @@ xfs_trans_read_buf(
 				xfs_force_shutdown(tp->t_mountp,
 						   SHUTDOWN_META_IO_ERROR);
 				xfs_buf_relse(bp);
-				printk("Returning error in trans!\n");
+				cmn_err(CE_DEBUG, "Returning trans error!\n");
 				return XFS_ERROR(EIO);
 			}
 		}
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index 1117d600d74..2912aac07c7 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -493,7 +493,7 @@ xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx)
 				break;
 			} else {
 				/* out-of-order vacancy */
-				printk("OOO vacancy lbcp 0x%p\n", lbcp);
+				cmn_err(CE_DEBUG, "OOO vacancy lbcp 0x%p\n", lbcp);
 				ASSERT(0);
 			}
 		}
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 36ea1b2094f..ec85a2e24b5 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -1708,42 +1708,48 @@ xfs_parseargs(
 
 		if (!strcmp(this_char, MNTOPT_LOGBUFS)) {
 			if (!value || !*value) {
-				printk("XFS: %s option requires an argument\n",
+				cmn_err(CE_WARN,
+					"XFS: %s option requires an argument",
 					this_char);
 				return EINVAL;
 			}
 			args->logbufs = simple_strtoul(value, &eov, 10);
 		} else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
 			if (!value || !*value) {
-				printk("XFS: %s option requires an argument\n",
+				cmn_err(CE_WARN,
+					"XFS: %s option requires an argument",
 					this_char);
 				return EINVAL;
 			}
 			args->logbufsize = suffix_strtoul(value, &eov, 10);
 		} else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
 			if (!value || !*value) {
-				printk("XFS: %s option requires an argument\n",
+				cmn_err(CE_WARN,
+					"XFS: %s option requires an argument",
 					this_char);
 				return EINVAL;
 			}
 			strncpy(args->logname, value, MAXNAMELEN);
 		} else if (!strcmp(this_char, MNTOPT_MTPT)) {
 			if (!value || !*value) {
-				printk("XFS: %s option requires an argument\n",
+				cmn_err(CE_WARN,
+					"XFS: %s option requires an argument",
 					this_char);
 				return EINVAL;
 			}
 			strncpy(args->mtpt, value, MAXNAMELEN);
 		} else if (!strcmp(this_char, MNTOPT_RTDEV)) {
 			if (!value || !*value) {
-				printk("XFS: %s option requires an argument\n",
+				cmn_err(CE_WARN,
+					"XFS: %s option requires an argument",
 					this_char);
 				return EINVAL;
 			}
 			strncpy(args->rtname, value, MAXNAMELEN);
 		} else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
 			if (!value || !*value) {
-				printk("XFS: %s option requires an argument\n",
+				cmn_err(CE_WARN,
+					"XFS: %s option requires an argument",
 					this_char);
 				return EINVAL;
 			}
@@ -1752,7 +1758,8 @@ xfs_parseargs(
 			args->iosizelog = (uint8_t) iosize;
 		} else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
 			if (!value || !*value) {
-				printk("XFS: %s option requires an argument\n",
+				cmn_err(CE_WARN,
+					"XFS: %s option requires an argument",
 					this_char);
 				return EINVAL;
 			}
@@ -1761,7 +1768,8 @@ xfs_parseargs(
 			args->iosizelog = ffs(iosize) - 1;
 		} else if (!strcmp(this_char, MNTOPT_IHASHSIZE)) {
 			if (!value || !*value) {
-				printk("XFS: %s option requires an argument\n",
+				cmn_err(CE_WARN,
+					"XFS: %s option requires an argument",
 					this_char);
 				return EINVAL;
 			}
@@ -1782,7 +1790,8 @@ xfs_parseargs(
 		} else if (!strcmp(this_char, MNTOPT_INO64)) {
 			args->flags |= XFSMNT_INO64;
 #if !XFS_BIG_INUMS
-			printk("XFS: %s option not allowed on this system\n",
+			cmn_err(CE_WARN,
+				"XFS: %s option not allowed on this system",
 				this_char);
 			return EINVAL;
 #endif
@@ -1792,14 +1801,16 @@ xfs_parseargs(
 			args->flags |= XFSMNT_SWALLOC;
 		} else if (!strcmp(this_char, MNTOPT_SUNIT)) {
 			if (!value || !*value) {
-				printk("XFS: %s option requires an argument\n",
+				cmn_err(CE_WARN,
+					"XFS: %s option requires an argument",
 					this_char);
 				return EINVAL;
 			}
 			dsunit = simple_strtoul(value, &eov, 10);
 		} else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
 			if (!value || !*value) {
-				printk("XFS: %s option requires an argument\n",
+				cmn_err(CE_WARN,
+					"XFS: %s option requires an argument",
 					this_char);
 				return EINVAL;
 			}
@@ -1807,7 +1818,8 @@ xfs_parseargs(
 		} else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
 			args->flags &= ~XFSMNT_32BITINODES;
 #if !XFS_BIG_INUMS
-			printk("XFS: %s option not allowed on this system\n",
+			cmn_err(CE_WARN,
+				"XFS: %s option not allowed on this system",
 				this_char);
 			return EINVAL;
 #endif
@@ -1831,36 +1843,41 @@ xfs_parseargs(
 			args->flags &= ~XFSMNT_ATTR2;
 		} else if (!strcmp(this_char, "osyncisdsync")) {
 			/* no-op, this is now the default */
-printk("XFS: osyncisdsync is now the default, option is deprecated.\n");
+			cmn_err(CE_WARN,
+	"XFS: osyncisdsync is now the default, option is deprecated.");
 		} else if (!strcmp(this_char, "irixsgid")) {
-printk("XFS: irixsgid is now a sysctl(2) variable, option is deprecated.\n");
+			cmn_err(CE_WARN,
+	"XFS: irixsgid is now a sysctl(2) variable, option is deprecated.");
 		} else {
-			printk("XFS: unknown mount option [%s].\n", this_char);
+			cmn_err(CE_WARN,
+				"XFS: unknown mount option [%s].", this_char);
 			return EINVAL;
 		}
 	}
 
 	if (args->flags & XFSMNT_NORECOVERY) {
 		if ((vfsp->vfs_flag & VFS_RDONLY) == 0) {
-			printk("XFS: no-recovery mounts must be read-only.\n");
+			cmn_err(CE_WARN,
+				"XFS: no-recovery mounts must be read-only.");
 			return EINVAL;
 		}
 	}
 
 	if ((args->flags & XFSMNT_NOALIGN) && (dsunit || dswidth)) {
-		printk(
-	"XFS: sunit and swidth options incompatible with the noalign option\n");
+		cmn_err(CE_WARN,
+	"XFS: sunit and swidth options incompatible with the noalign option");
 		return EINVAL;
 	}
 
 	if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
-		printk("XFS: sunit and swidth must be specified together\n");
+		cmn_err(CE_WARN,
+			"XFS: sunit and swidth must be specified together");
 		return EINVAL;
 	}
 
 	if (dsunit && (dswidth % dsunit != 0)) {
-		printk(
-	"XFS: stripe width (%d) must be a multiple of the stripe unit (%d)\n",
+		cmn_err(CE_WARN,
+	"XFS: stripe width (%d) must be a multiple of the stripe unit (%d)",
 			dswidth, dsunit);
 		return EINVAL;
 	}
-- 
cgit v1.2.3


From 932f2c323196c214e645d5a572a1d7b562c0f93f Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Fri, 9 Jun 2006 15:29:58 +1000
Subject: [XFS] statvfs component of directory/project quota support, code
 originally by Glen.

SGI-PV: 932952
SGI-Modid: xfs-linux-melb:xfs-kern:26105a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/quota/xfs_qm_bhv.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 57 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index 6838b36d95a..181dd90b29c 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -185,6 +185,61 @@ xfs_qm_mount(
 	return error;
 }
 
+/*
+ * Directory tree accounting is implemented using project quotas, where
+ * the project identifier is inherited from parent directories.
+ * A statvfs (df, etc.) of a directory that is using project quota should
+ * return a statvfs of the project, not the entire filesystem.
+ * This makes such trees appear as if they are filesystems in themselves.
+ */
+STATIC int
+xfs_qm_statvfs(
+	struct bhv_desc		*bhv,
+	xfs_statfs_t		*statp,
+	struct vnode		*vnode)
+{
+	xfs_mount_t		*mp;
+	xfs_inode_t		*ip;
+	xfs_dquot_t		*dqp;
+	xfs_disk_dquot_t	*dp;
+	__uint64_t		limit;
+	int			error;
+
+	error = PVFS_STATVFS(BHV_NEXT(bhv), statp, vnode);
+	if (error || !vnode)
+		return error;
+
+	mp = XFS_BHVTOM(bhv);
+	ip = xfs_vtoi(vnode);
+
+	if (!(ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT))
+		return 0;
+	if (!(mp->m_qflags & XFS_PQUOTA_ACCT))
+		return 0;
+	if (!(mp->m_qflags & XFS_OQUOTA_ENFD))
+		return 0;
+
+	if (xfs_qm_dqget(mp, NULL, ip->i_d.di_projid, XFS_DQ_PROJ, 0, &dqp))
+		return 0;
+	dp = &dqp->q_core;
+
+	limit = dp->d_blk_softlimit ? dp->d_blk_softlimit : dp->d_blk_hardlimit;
+	if (limit && statp->f_blocks > limit) {
+		statp->f_blocks = limit;
+		statp->f_bfree = (statp->f_blocks > dp->d_bcount) ?
+					(statp->f_blocks - dp->d_bcount) : 0;
+	}
+	limit = dp->d_ino_softlimit ? dp->d_ino_softlimit : dp->d_ino_hardlimit;
+	if (limit && statp->f_files > limit) {
+		statp->f_files = limit;
+		statp->f_ffree = (statp->f_files > dp->d_icount) ?
+					(statp->f_ffree - dp->d_icount) : 0;
+	}
+
+	xfs_qm_dqput(dqp);
+	return 0;
+}
+
 STATIC int
 xfs_qm_syncall(
 	struct bhv_desc		*bhv,
@@ -351,6 +406,7 @@ struct bhv_vfsops xfs_qmops = { {
 	.vfs_parseargs		= xfs_qm_parseargs,
 	.vfs_showargs		= xfs_qm_showargs,
 	.vfs_mount		= xfs_qm_mount,
+	.vfs_statvfs		= xfs_qm_statvfs,
 	.vfs_sync		= xfs_qm_syncall,
 	.vfs_quotactl		= xfs_qm_quotactl, },
 };
-- 
cgit v1.2.3


From b83bd1388133e914c38bd31d69bc90143e6ab10c Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Fri, 9 Jun 2006 16:48:30 +1000
Subject: [XFS] Resolve a namespace collision on vfs/vfsops for FreeBSD
 porters.

SGI-PV: 9533338
SGI-Modid: xfs-linux-melb:xfs-kern:26106a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/linux-2.6/xfs_export.c  |   6 +-
 fs/xfs/linux-2.6/xfs_super.c   | 141 ++++++++++++++++-------------------------
 fs/xfs/linux-2.6/xfs_vfs.c     |  28 ++++----
 fs/xfs/linux-2.6/xfs_vfs.h     | 126 ++++++++++++++++++------------------
 fs/xfs/linux-2.6/xfs_vnode.c   |   2 +-
 fs/xfs/linux-2.6/xfs_vnode.h   |   4 +-
 fs/xfs/quota/xfs_qm_bhv.c      |  23 +++----
 fs/xfs/quota/xfs_qm_syscalls.c |   2 +-
 fs/xfs/xfs_dmapi.h             |   2 +-
 fs/xfs/xfs_fsops.c             |   2 +-
 fs/xfs/xfs_iget.c              |   2 +-
 fs/xfs/xfs_inode.c             |   4 +-
 fs/xfs/xfs_inode.h             |   1 -
 fs/xfs/xfs_iocore.c            |   2 +-
 fs/xfs/xfs_log.c               |   7 +-
 fs/xfs/xfs_mount.c             |   8 +--
 fs/xfs/xfs_mount.h             |  22 +++----
 fs/xfs/xfs_quota.h             |   2 +-
 fs/xfs/xfs_vfsops.c            |  23 ++++---
 19 files changed, 183 insertions(+), 224 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index b768ea910bb..b5c579eeb8e 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -97,7 +97,7 @@ xfs_fs_encode_fh(
 	int			len;
 	int			is64 = 0;
 #if XFS_BIG_INUMS
-	vfs_t			*vfs = vfs_from_sb(inode->i_sb);
+	bhv_vfs_t		*vfs = vfs_from_sb(inode->i_sb);
 
 	if (!(vfs->vfs_flag & VFS_32BITINODES)) {
 		/* filesystem may contain 64bit inode numbers */
@@ -139,10 +139,10 @@ xfs_fs_get_dentry(
 	vnode_t			*vp;
 	struct inode		*inode;
 	struct dentry		*result;
-	vfs_t			*vfsp = vfs_from_sb(sb);
+	bhv_vfs_t		*vfsp = vfs_from_sb(sb);
 	int			error;
 
-	VFS_VGET(vfsp, &vp, (fid_t *)data, error);
+	error = bhv_vfs_vget(vfsp, &vp, (fid_t *)data);
 	if (error || vp == NULL)
 		return ERR_PTR(-ESTALE) ;
 
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 68f4793e8a1..97dbcb68b25 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -475,13 +475,13 @@ xfs_fs_clear_inode(
  */
 STATIC void
 xfs_syncd_queue_work(
-	struct vfs	*vfs,
+	struct bhv_vfs	*vfs,
 	void		*data,
-	void		(*syncer)(vfs_t *, void *))
+	void		(*syncer)(bhv_vfs_t *, void *))
 {
-	vfs_sync_work_t	*work;
+	struct bhv_vfs_sync_work *work;
 
-	work = kmem_alloc(sizeof(struct vfs_sync_work), KM_SLEEP);
+	work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP);
 	INIT_LIST_HEAD(&work->w_list);
 	work->w_syncer = syncer;
 	work->w_data = data;
@@ -500,7 +500,7 @@ xfs_syncd_queue_work(
  */
 STATIC void
 xfs_flush_inode_work(
-	vfs_t		*vfs,
+	bhv_vfs_t	*vfs,
 	void		*inode)
 {
 	filemap_flush(((struct inode *)inode)->i_mapping);
@@ -512,7 +512,7 @@ xfs_flush_inode(
 	xfs_inode_t	*ip)
 {
 	struct inode	*inode = vn_to_inode(XFS_ITOV(ip));
-	struct vfs	*vfs = XFS_MTOVFS(ip->i_mount);
+	struct bhv_vfs	*vfs = XFS_MTOVFS(ip->i_mount);
 
 	igrab(inode);
 	xfs_syncd_queue_work(vfs, inode, xfs_flush_inode_work);
@@ -525,7 +525,7 @@ xfs_flush_inode(
  */
 STATIC void
 xfs_flush_device_work(
-	vfs_t		*vfs,
+	bhv_vfs_t	*vfs,
 	void		*inode)
 {
 	sync_blockdev(vfs->vfs_super->s_bdev);
@@ -537,7 +537,7 @@ xfs_flush_device(
 	xfs_inode_t	*ip)
 {
 	struct inode	*inode = vn_to_inode(XFS_ITOV(ip));
-	struct vfs	*vfs = XFS_MTOVFS(ip->i_mount);
+	struct bhv_vfs	*vfs = XFS_MTOVFS(ip->i_mount);
 
 	igrab(inode);
 	xfs_syncd_queue_work(vfs, inode, xfs_flush_device_work);
@@ -545,16 +545,16 @@ xfs_flush_device(
 	xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
 }
 
-#define SYNCD_FLAGS	(SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR|SYNC_REFCACHE)
 STATIC void
 vfs_sync_worker(
-	vfs_t		*vfsp,
+	bhv_vfs_t	*vfsp,
 	void		*unused)
 {
 	int		error;
 
 	if (!(vfsp->vfs_flag & VFS_RDONLY))
-		VFS_SYNC(vfsp, SYNCD_FLAGS, NULL, error);
+		error = bhv_vfs_sync(vfsp, SYNC_FSDATA | SYNC_BDFLUSH | \
+					SYNC_ATTR | SYNC_REFCACHE, NULL);
 	vfsp->vfs_sync_seq++;
 	wmb();
 	wake_up(&vfsp->vfs_wait_single_sync_task);
@@ -565,8 +565,8 @@ xfssyncd(
 	void			*arg)
 {
 	long			timeleft;
-	vfs_t			*vfsp = (vfs_t *) arg;
-	struct vfs_sync_work	*work, *n;
+	bhv_vfs_t		*vfsp = (bhv_vfs_t *) arg;
+	bhv_vfs_sync_work_t	*work, *n;
 	LIST_HEAD		(tmp);
 
 	timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
@@ -600,7 +600,7 @@ xfssyncd(
 			list_del(&work->w_list);
 			if (work == &vfsp->vfs_sync_work)
 				continue;
-			kmem_free(work, sizeof(struct vfs_sync_work));
+			kmem_free(work, sizeof(struct bhv_vfs_sync_work));
 		}
 	}
 
@@ -609,7 +609,7 @@ xfssyncd(
 
 STATIC int
 xfs_fs_start_syncd(
-	vfs_t			*vfsp)
+	bhv_vfs_t		*vfsp)
 {
 	vfsp->vfs_sync_work.w_syncer = vfs_sync_worker;
 	vfsp->vfs_sync_work.w_vfs = vfsp;
@@ -621,7 +621,7 @@ xfs_fs_start_syncd(
 
 STATIC void
 xfs_fs_stop_syncd(
-	vfs_t			*vfsp)
+	bhv_vfs_t		*vfsp)
 {
 	kthread_stop(vfsp->vfs_sync_task);
 }
@@ -630,35 +630,26 @@ STATIC void
 xfs_fs_put_super(
 	struct super_block	*sb)
 {
-	vfs_t			*vfsp = vfs_from_sb(sb);
+	bhv_vfs_t		*vfsp = vfs_from_sb(sb);
 	int			error;
 
 	xfs_fs_stop_syncd(vfsp);
-	VFS_SYNC(vfsp, SYNC_ATTR|SYNC_DELWRI, NULL, error);
-	if (!error)
-		VFS_UNMOUNT(vfsp, 0, NULL, error);
+	bhv_vfs_sync(vfsp, SYNC_ATTR | SYNC_DELWRI, NULL);
+	error = bhv_vfs_unmount(vfsp, 0, NULL);
 	if (error) {
-		printk("XFS unmount got error %d\n", error);
-		printk("%s: vfsp/0x%p left dangling!\n", __FUNCTION__, vfsp);
-		return;
+		printk("XFS: unmount got error=%d\n", error);
+		printk("%s: vfs=0x%p left dangling!\n", __FUNCTION__, vfsp);
+	} else {
+		vfs_deallocate(vfsp);
 	}
-
-	vfs_deallocate(vfsp);
 }
 
 STATIC void
 xfs_fs_write_super(
 	struct super_block	*sb)
 {
-	vfs_t			*vfsp = vfs_from_sb(sb);
-	int			error;
-
-	if (sb->s_flags & MS_RDONLY) {
-		sb->s_dirt = 0; /* paranoia */
-		return;
-	}
-	/* Push the log and superblock a little */
-	VFS_SYNC(vfsp, SYNC_FSDATA, NULL, error);
+	if (!(sb->s_flags & MS_RDONLY))
+		bhv_vfs_sync(vfs_from_sb(sb), SYNC_FSDATA, NULL);
 	sb->s_dirt = 0;
 }
 
@@ -667,16 +658,16 @@ xfs_fs_sync_super(
 	struct super_block	*sb,
 	int			wait)
 {
-	vfs_t		*vfsp = vfs_from_sb(sb);
-	int		error;
-	int		flags = SYNC_FSDATA;
+	bhv_vfs_t		*vfsp = vfs_from_sb(sb);
+	int			error;
+	int			flags;
 
 	if (unlikely(sb->s_frozen == SB_FREEZE_WRITE))
 		flags = SYNC_QUIESCE;
 	else
 		flags = SYNC_FSDATA | (wait ? SYNC_WAIT : 0);
 
-	VFS_SYNC(vfsp, flags, NULL, error);
+	error = bhv_vfs_sync(vfsp, flags, NULL);
 	sb->s_dirt = 0;
 
 	if (unlikely(laptop_mode)) {
@@ -706,11 +697,7 @@ xfs_fs_statfs(
 	struct super_block	*sb,
 	struct kstatfs		*statp)
 {
-	vfs_t			*vfsp = vfs_from_sb(sb);
-	int			error;
-
-	VFS_STATVFS(vfsp, statp, NULL, error);
-	return -error;
+	return -bhv_vfs_statvfs(vfs_from_sb(sb), statp, NULL);
 }
 
 STATIC int
@@ -719,13 +706,13 @@ xfs_fs_remount(
 	int			*flags,
 	char			*options)
 {
-	vfs_t			*vfsp = vfs_from_sb(sb);
+	bhv_vfs_t		*vfsp = vfs_from_sb(sb);
 	struct xfs_mount_args	*args = xfs_args_allocate(sb, 0);
 	int			error;
 
-	VFS_PARSEARGS(vfsp, options, args, 1, error);
+	error = bhv_vfs_parseargs(vfsp, options, args, 1);
 	if (!error)
-		VFS_MNTUPDATE(vfsp, flags, args, error);
+		error = bhv_vfs_mntupdate(vfsp, flags, args);
 	kmem_free(args, sizeof(*args));
 	return -error;
 }
@@ -734,7 +721,7 @@ STATIC void
 xfs_fs_lockfs(
 	struct super_block	*sb)
 {
-	VFS_FREEZE(vfs_from_sb(sb));
+	bhv_vfs_freeze(vfs_from_sb(sb));
 }
 
 STATIC int
@@ -742,11 +729,7 @@ xfs_fs_show_options(
 	struct seq_file		*m,
 	struct vfsmount		*mnt)
 {
-	struct vfs		*vfsp = vfs_from_sb(mnt->mnt_sb);
-	int			error;
-
-	VFS_SHOWARGS(vfsp, m, error);
-	return error;
+	return -bhv_vfs_showargs(vfs_from_sb(mnt->mnt_sb), m);
 }
 
 STATIC int
@@ -754,11 +737,7 @@ xfs_fs_quotasync(
 	struct super_block	*sb,
 	int			type)
 {
-	struct vfs		*vfsp = vfs_from_sb(sb);
-	int			error;
-
-	VFS_QUOTACTL(vfsp, Q_XQUOTASYNC, 0, (caddr_t)NULL, error);
-	return -error;
+	return -bhv_vfs_quotactl(vfs_from_sb(sb), Q_XQUOTASYNC, 0, NULL);
 }
 
 STATIC int
@@ -766,11 +745,7 @@ xfs_fs_getxstate(
 	struct super_block	*sb,
 	struct fs_quota_stat	*fqs)
 {
-	struct vfs		*vfsp = vfs_from_sb(sb);
-	int			error;
-
-	VFS_QUOTACTL(vfsp, Q_XGETQSTAT, 0, (caddr_t)fqs, error);
-	return -error;
+	return -bhv_vfs_quotactl(vfs_from_sb(sb), Q_XGETQSTAT, 0, (caddr_t)fqs);
 }
 
 STATIC int
@@ -779,11 +754,7 @@ xfs_fs_setxstate(
 	unsigned int		flags,
 	int			op)
 {
-	struct vfs		*vfsp = vfs_from_sb(sb);
-	int			error;
-
-	VFS_QUOTACTL(vfsp, op, 0, (caddr_t)&flags, error);
-	return -error;
+	return -bhv_vfs_quotactl(vfs_from_sb(sb), op, 0, (caddr_t)&flags);
 }
 
 STATIC int
@@ -793,13 +764,10 @@ xfs_fs_getxquota(
 	qid_t			id,
 	struct fs_disk_quota	*fdq)
 {
-	struct vfs		*vfsp = vfs_from_sb(sb);
-	int			error, getmode;
-
-	getmode = (type == USRQUOTA) ? Q_XGETQUOTA :
-		 ((type == GRPQUOTA) ? Q_XGETGQUOTA : Q_XGETPQUOTA);
-	VFS_QUOTACTL(vfsp, getmode, id, (caddr_t)fdq, error);
-	return -error;
+	return -bhv_vfs_quotactl(vfs_from_sb(sb),
+				 (type == USRQUOTA) ? Q_XGETQUOTA :
+				  ((type == GRPQUOTA) ? Q_XGETGQUOTA :
+				   Q_XGETPQUOTA), id, (caddr_t)fdq);
 }
 
 STATIC int
@@ -809,13 +777,10 @@ xfs_fs_setxquota(
 	qid_t			id,
 	struct fs_disk_quota	*fdq)
 {
-	struct vfs		*vfsp = vfs_from_sb(sb);
-	int			error, setmode;
-
-	setmode = (type == USRQUOTA) ? Q_XSETQLIM :
-		 ((type == GRPQUOTA) ? Q_XSETGQLIM : Q_XSETPQLIM);
-	VFS_QUOTACTL(vfsp, setmode, id, (caddr_t)fdq, error);
-	return -error;
+	return -bhv_vfs_quotactl(vfs_from_sb(sb),
+				 (type == USRQUOTA) ? Q_XSETQLIM :
+				  ((type == GRPQUOTA) ? Q_XSETGQLIM :
+				   Q_XSETPQLIM), id, (caddr_t)fdq);
 }
 
 STATIC int
@@ -825,14 +790,14 @@ xfs_fs_fill_super(
 	int			silent)
 {
 	vnode_t			*rootvp;
-	struct vfs		*vfsp = vfs_allocate(sb);
+	struct bhv_vfs		*vfsp = vfs_allocate(sb);
 	struct xfs_mount_args	*args = xfs_args_allocate(sb, silent);
 	struct kstatfs		statvfs;
-	int			error, error2;
+	int			error;
 
 	bhv_insert_all_vfsops(vfsp);
 
-	VFS_PARSEARGS(vfsp, (char *)data, args, 0, error);
+	error = bhv_vfs_parseargs(vfsp, (char *)data, args, 0);
 	if (error) {
 		bhv_remove_all_vfsops(vfsp, 1);
 		goto fail_vfsop;
@@ -845,13 +810,13 @@ xfs_fs_fill_super(
 	sb->s_qcop = &xfs_quotactl_operations;
 	sb->s_op = &xfs_super_operations;
 
-	VFS_MOUNT(vfsp, args, NULL, error);
+	error = bhv_vfs_mount(vfsp, args, NULL);
 	if (error) {
 		bhv_remove_all_vfsops(vfsp, 1);
 		goto fail_vfsop;
 	}
 
-	VFS_STATVFS(vfsp, &statvfs, NULL, error);
+	error = bhv_vfs_statvfs(vfsp, &statvfs, NULL);
 	if (error)
 		goto fail_unmount;
 
@@ -863,7 +828,7 @@ xfs_fs_fill_super(
 	sb->s_time_gran = 1;
 	set_posix_acl_flag(sb);
 
-	VFS_ROOT(vfsp, &rootvp, error);
+	error = bhv_vfs_root(vfsp, &rootvp);
 	if (error)
 		goto fail_unmount;
 
@@ -892,7 +857,7 @@ fail_vnrele:
 	}
 
 fail_unmount:
-	VFS_UNMOUNT(vfsp, 0, NULL, error2);
+	bhv_vfs_unmount(vfsp, 0, NULL);
 
 fail_vfsop:
 	vfs_deallocate(vfsp);
diff --git a/fs/xfs/linux-2.6/xfs_vfs.c b/fs/xfs/linux-2.6/xfs_vfs.c
index 6f7c9f7a862..9f9bc894deb 100644
--- a/fs/xfs/linux-2.6/xfs_vfs.c
+++ b/fs/xfs/linux-2.6/xfs_vfs.c
@@ -226,13 +226,13 @@ vfs_freeze(
 	((*bhvtovfsops(next)->vfs_freeze)(next));
 }
 
-vfs_t *
+bhv_vfs_t *
 vfs_allocate(
 	struct super_block	*sb)
 {
-	struct vfs		*vfsp;
+	struct bhv_vfs		*vfsp;
 
-	vfsp = kmem_zalloc(sizeof(vfs_t), KM_SLEEP);
+	vfsp = kmem_zalloc(sizeof(bhv_vfs_t), KM_SLEEP);
 	bhv_head_init(VFS_BHVHEAD(vfsp), "vfs");
 	INIT_LIST_HEAD(&vfsp->vfs_sync_list);
 	spin_lock_init(&vfsp->vfs_sync_lock);
@@ -247,25 +247,25 @@ vfs_allocate(
 	return vfsp;
 }
 
-vfs_t *
+bhv_vfs_t *
 vfs_from_sb(
 	struct super_block	*sb)
 {
-	return (vfs_t *)sb->s_fs_info;
+	return (bhv_vfs_t *)sb->s_fs_info;
 }
 
 void
 vfs_deallocate(
-	struct vfs		*vfsp)
+	struct bhv_vfs		*vfsp)
 {
 	bhv_head_destroy(VFS_BHVHEAD(vfsp));
-	kmem_free(vfsp, sizeof(vfs_t));
+	kmem_free(vfsp, sizeof(bhv_vfs_t));
 }
 
 void
 vfs_insertops(
-	struct vfs		*vfsp,
-	struct bhv_vfsops	*vfsops)
+	struct bhv_vfs		*vfsp,
+	struct bhv_module_vfsops *vfsops)
 {
 	struct bhv_desc		*bdp;
 
@@ -276,9 +276,9 @@ vfs_insertops(
 
 void
 vfs_insertbhv(
-	struct vfs		*vfsp,
+	struct bhv_vfs		*vfsp,
 	struct bhv_desc		*bdp,
-	struct vfsops		*vfsops,
+	struct bhv_vfsops	*vfsops,
 	void			*mount)
 {
 	bhv_desc_init(bdp, mount, vfsp, vfsops);
@@ -287,7 +287,7 @@ vfs_insertbhv(
 
 void
 bhv_remove_vfsops(
-	struct vfs		*vfsp,
+	struct bhv_vfs		*vfsp,
 	int			pos)
 {
 	struct bhv_desc		*bhv;
@@ -301,7 +301,7 @@ bhv_remove_vfsops(
 
 void
 bhv_remove_all_vfsops(
-	struct vfs		*vfsp,
+	struct bhv_vfs		*vfsp,
 	int			freebase)
 {
 	struct xfs_mount	*mp;
@@ -317,7 +317,7 @@ bhv_remove_all_vfsops(
 
 void
 bhv_insert_all_vfsops(
-	struct vfs		*vfsp)
+	struct bhv_vfs		*vfsp)
 {
 	struct xfs_mount	*mp;
 
diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h
index 2ca05043a0f..55f0afd3e79 100644
--- a/fs/xfs/linux-2.6/xfs_vfs.h
+++ b/fs/xfs/linux-2.6/xfs_vfs.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -21,42 +21,42 @@
 #include <linux/vfs.h>
 #include "xfs_fs.h"
 
+struct bhv_vfs;
 struct fid;
-struct vfs;
 struct cred;
 struct vnode;
-struct kstatfs;
+struct statfs;
 struct seq_file;
 struct super_block;
 struct xfs_mount_args;
 
 typedef struct kstatfs xfs_statfs_t;
 
-typedef struct vfs_sync_work {
+typedef struct bhv_vfs_sync_work {
 	struct list_head	w_list;
-	struct vfs		*w_vfs;
+	struct bhv_vfs		*w_vfs;
 	void			*w_data;	/* syncer routine argument */
-	void			(*w_syncer)(struct vfs *, void *);
-} vfs_sync_work_t;
+	void			(*w_syncer)(struct bhv_vfs *, void *);
+} bhv_vfs_sync_work_t;
 
-typedef struct vfs {
+typedef struct bhv_vfs {
 	u_int			vfs_flag;	/* flags */
 	xfs_fsid_t		vfs_fsid;	/* file system ID */
 	xfs_fsid_t		*vfs_altfsid;	/* An ID fixed for life of FS */
 	bhv_head_t		vfs_bh;		/* head of vfs behavior chain */
 	struct super_block	*vfs_super;	/* generic superblock pointer */
 	struct task_struct	*vfs_sync_task;	/* generalised sync thread */
-	vfs_sync_work_t		vfs_sync_work;	/* work item for VFS_SYNC */
+	bhv_vfs_sync_work_t	vfs_sync_work;	/* work item for VFS_SYNC */
 	struct list_head	vfs_sync_list;	/* sync thread work item list */
 	spinlock_t		vfs_sync_lock;	/* work item list lock */
-	int 			vfs_sync_seq;	/* sync thread generation no. */
+	int			vfs_sync_seq;	/* sync thread generation no. */
 	wait_queue_head_t	vfs_wait_single_sync_task;
-} vfs_t;
+} bhv_vfs_t;
 
 #define vfs_fbhv		vfs_bh.bh_first	/* 1st on vfs behavior chain */
 
-#define bhvtovfs(bdp)		( (struct vfs *)BHV_VOBJ(bdp) )
-#define bhvtovfsops(bdp)	( (struct vfsops *)BHV_OPS(bdp) )
+#define bhvtovfs(bdp)		( (struct bhv_vfs *)BHV_VOBJ(bdp) )
+#define bhvtovfsops(bdp)	( (struct bhv_vfsops *)BHV_OPS(bdp) )
 #define VFS_BHVHEAD(vfs)	( &(vfs)->vfs_bh )
 #define VFS_REMOVEBHV(vfs, bdp)	( bhv_remove(VFS_BHVHEAD(vfs), bdp) )
 
@@ -71,7 +71,7 @@ typedef enum {
 	VFS_BHV_QM,		/* quota manager */
 	VFS_BHV_IO,		/* IO path */
 	VFS_BHV_END		/* housekeeping end-of-range */
-} vfs_bhv_t;
+} bhv_vfs_type_t;
 
 #define VFS_POSITION_XFS	(BHV_POSITION_BASE)
 #define VFS_POSITION_DM		(VFS_POSITION_BASE+10)
@@ -81,8 +81,9 @@ typedef enum {
 #define VFS_RDONLY		0x0001	/* read-only vfs */
 #define VFS_GRPID		0x0002	/* group-ID assigned from directory */
 #define VFS_DMI			0x0004	/* filesystem has the DMI enabled */
-#define VFS_32BITINODES		0x0008	/* do not use inums above 32 bits */
-#define VFS_END			0x0008	/* max flag */
+#define VFS_UMOUNT		0x0008	/* unmount in progress */
+#define VFS_32BITINODES		0x0010	/* do not use inums above 32 bits */
+#define VFS_END			0x0010	/* max flag */
 
 #define SYNC_ATTR		0x0001	/* sync attributes */
 #define SYNC_CLOSE		0x0002	/* close file system down */
@@ -92,7 +93,7 @@ typedef enum {
 #define SYNC_FSDATA		0x0020	/* flush fs data (e.g. superblocks) */
 #define SYNC_REFCACHE		0x0040  /* prune some of the nfs ref cache */
 #define SYNC_REMOUNT		0x0080  /* remount readonly, no dummy LRs */
-#define SYNC_QUIESCE		0x0100  /* quiesce filesystem for a snapshot */
+#define SYNC_QUIESCE		0x0100  /* quiesce fileystem for a snapshot */
 
 #define SHUTDOWN_META_IO_ERROR	0x0001	/* write attempt to metadata failed */
 #define SHUTDOWN_LOG_IO_ERROR	0x0002	/* write attempt to the log failed */
@@ -120,7 +121,7 @@ typedef void	(*vfs_init_vnode_t)(bhv_desc_t *,
 typedef void	(*vfs_force_shutdown_t)(bhv_desc_t *, int, char *, int);
 typedef void	(*vfs_freeze_t)(bhv_desc_t *);
 
-typedef struct vfsops {
+typedef struct bhv_vfsops {
 	bhv_position_t		vf_position;	/* behavior chain position */
 	vfs_mount_t		vfs_mount;	/* mount file system */
 	vfs_parseargs_t		vfs_parseargs;	/* parse mount options */
@@ -136,44 +137,44 @@ typedef struct vfsops {
 	vfs_init_vnode_t	vfs_init_vnode;	/* initialize a new vnode */
 	vfs_force_shutdown_t	vfs_force_shutdown;	/* crash and burn */
 	vfs_freeze_t		vfs_freeze;	/* freeze fs for snapshot */
-} vfsops_t;
+} bhv_vfsops_t;
 
 /*
  * VFS's.  Operates on vfs structure pointers (starts at bhv head).
  */
 #define VHEAD(v)			((v)->vfs_fbhv)
-#define VFS_MOUNT(v, ma,cr, rv)		((rv) = vfs_mount(VHEAD(v), ma,cr))
-#define VFS_PARSEARGS(v, o,ma,f, rv)	((rv) = vfs_parseargs(VHEAD(v), o,ma,f))
-#define VFS_SHOWARGS(v, m, rv)		((rv) = vfs_showargs(VHEAD(v), m))
-#define VFS_UNMOUNT(v, f, cr, rv)	((rv) = vfs_unmount(VHEAD(v), f,cr))
-#define VFS_MNTUPDATE(v, fl, args, rv)	((rv) = vfs_mntupdate(VHEAD(v), fl, args))
-#define VFS_ROOT(v, vpp, rv)		((rv) = vfs_root(VHEAD(v), vpp))
-#define VFS_STATVFS(v, sp,vp, rv)	((rv) = vfs_statvfs(VHEAD(v), sp,vp))
-#define VFS_SYNC(v, flag,cr, rv)	((rv) = vfs_sync(VHEAD(v), flag,cr))
-#define VFS_VGET(v, vpp,fidp, rv)	((rv) = vfs_vget(VHEAD(v), vpp,fidp))
-#define VFS_DMAPIOPS(v, p, rv)		((rv) = vfs_dmapiops(VHEAD(v), p))
-#define VFS_QUOTACTL(v, c,id,p, rv)	((rv) = vfs_quotactl(VHEAD(v), c,id,p))
-#define VFS_INIT_VNODE(v, vp,b,ul)	( vfs_init_vnode(VHEAD(v), vp,b,ul) )
-#define VFS_FORCE_SHUTDOWN(v, fl,f,l)	( vfs_force_shutdown(VHEAD(v), fl,f,l) )
-#define VFS_FREEZE(v)			( vfs_freeze(VHEAD(v)) )
+#define bhv_vfs_mount(v, ma,cr)		vfs_mount(VHEAD(v), ma,cr)
+#define bhv_vfs_parseargs(v, o,ma,f)	vfs_parseargs(VHEAD(v), o,ma,f)
+#define bhv_vfs_showargs(v, m)		vfs_showargs(VHEAD(v), m)
+#define bhv_vfs_unmount(v, f,cr)	vfs_unmount(VHEAD(v), f,cr)
+#define bhv_vfs_mntupdate(v, fl,args)	vfs_mntupdate(VHEAD(v), fl,args)
+#define bhv_vfs_root(v, vpp)		vfs_root(VHEAD(v), vpp)
+#define bhv_vfs_statvfs(v, sp,vp)	vfs_statvfs(VHEAD(v), sp,vp)
+#define bhv_vfs_sync(v, flag,cr)	vfs_sync(VHEAD(v), flag,cr)
+#define bhv_vfs_vget(v, vpp,fidp)	vfs_vget(VHEAD(v), vpp,fidp)
+#define bhv_vfs_dmapiops(v, p)		vfs_dmapiops(VHEAD(v), p)
+#define bhv_vfs_quotactl(v, c,id,p)	vfs_quotactl(VHEAD(v), c,id,p)
+#define bhv_vfs_init_vnode(v, vp,b,ul)	vfs_init_vnode(VHEAD(v), vp,b,ul)
+#define bhv_vfs_force_shutdown(v,u,f,l)	vfs_force_shutdown(VHEAD(v), u,f,l)
+#define bhv_vfs_freeze(v)		vfs_freeze(VHEAD(v))
 
 /*
  * PVFS's.  Operates on behavior descriptor pointers.
  */
-#define PVFS_MOUNT(b, ma,cr, rv)	((rv) = vfs_mount(b, ma,cr))
-#define PVFS_PARSEARGS(b, o,ma,f, rv)	((rv) = vfs_parseargs(b, o,ma,f))
-#define PVFS_SHOWARGS(b, m, rv)		((rv) = vfs_showargs(b, m))
-#define PVFS_UNMOUNT(b, f,cr, rv)	((rv) = vfs_unmount(b, f,cr))
-#define PVFS_MNTUPDATE(b, fl, args, rv)	((rv) = vfs_mntupdate(b, fl, args))
-#define PVFS_ROOT(b, vpp, rv)		((rv) = vfs_root(b, vpp))
-#define PVFS_STATVFS(b, sp,vp, rv)	((rv) = vfs_statvfs(b, sp,vp))
-#define PVFS_SYNC(b, flag,cr, rv)	((rv) = vfs_sync(b, flag,cr))
-#define PVFS_VGET(b, vpp,fidp, rv)	((rv) = vfs_vget(b, vpp,fidp))
-#define PVFS_DMAPIOPS(b, p, rv)		((rv) = vfs_dmapiops(b, p))
-#define PVFS_QUOTACTL(b, c,id,p, rv)	((rv) = vfs_quotactl(b, c,id,p))
-#define PVFS_INIT_VNODE(b, vp,b2,ul)	( vfs_init_vnode(b, vp,b2,ul) )
-#define PVFS_FORCE_SHUTDOWN(b, fl,f,l)	( vfs_force_shutdown(b, fl,f,l) )
-#define PVFS_FREEZE(b)			( vfs_freeze(b) )
+#define bhv_next_vfs_mount(b, ma,cr)		vfs_mount(b, ma,cr)
+#define bhv_next_vfs_parseargs(b, o,ma,f)	vfs_parseargs(b, o,ma,f)
+#define bhv_next_vfs_showargs(b, m)		vfs_showargs(b, m)
+#define bhv_next_vfs_unmount(b, f,cr)		vfs_unmount(b, f,cr)
+#define bhv_next_vfs_mntupdate(b, fl,args)	vfs_mntupdate(b, fl, args)
+#define bhv_next_vfs_root(b, vpp)		vfs_root(b, vpp)
+#define bhv_next_vfs_statvfs(b, sp,vp)		vfs_statvfs(b, sp,vp)
+#define bhv_next_vfs_sync(b, flag,cr)		vfs_sync(b, flag,cr)
+#define bhv_next_vfs_vget(b, vpp,fidp)		vfs_vget(b, vpp,fidp)
+#define bhv_next_vfs_dmapiops(b, p)		vfs_dmapiops(b, p)
+#define bhv_next_vfs_quotactl(b, c,id,p)	vfs_quotactl(b, c,id,p)
+#define bhv_next_vfs_init_vnode(b, vp,b2,ul)	vfs_init_vnode(b, vp,b2,ul)
+#define bhv_next_force_shutdown(b, fl,f,l)	vfs_force_shutdown(b, fl,f,l)
+#define bhv_next_vfs_freeze(b)			vfs_freeze(b)
 
 extern int vfs_mount(bhv_desc_t *, struct xfs_mount_args *, struct cred *);
 extern int vfs_parseargs(bhv_desc_t *, char *, struct xfs_mount_args *, int);
@@ -190,25 +191,26 @@ extern void vfs_init_vnode(bhv_desc_t *, struct vnode *, bhv_desc_t *, int);
 extern void vfs_force_shutdown(bhv_desc_t *, int, char *, int);
 extern void vfs_freeze(bhv_desc_t *);
 
-typedef struct bhv_vfsops {
-	struct vfsops		bhv_common;
+typedef struct bhv_module_vfsops {
+	struct bhv_vfsops	bhv_common;
 	void *			bhv_custom;
-} bhv_vfsops_t;
+} bhv_module_vfsops_t;
+
+#define vfs_bhv_lookup(v, id)	(bhv_lookup_range(&(v)->vfs_bh, (id), (id)))
+#define vfs_bhv_custom(b)	(((bhv_module_vfsops_t*)BHV_OPS(b))->bhv_custom)
+#define vfs_bhv_set_custom(b,o)	((b)->bhv_custom = (void *)(o))
+#define vfs_bhv_clr_custom(b)	((b)->bhv_custom = NULL)
 
-#define vfs_bhv_lookup(v, id)	( bhv_lookup_range(&(v)->vfs_bh, (id), (id)) )
-#define vfs_bhv_custom(b)	( ((bhv_vfsops_t *)BHV_OPS(b))->bhv_custom )
-#define vfs_bhv_set_custom(b,o)	( (b)->bhv_custom = (void *)(o))
-#define vfs_bhv_clr_custom(b)	( (b)->bhv_custom = NULL )
+extern bhv_vfs_t *vfs_allocate(struct super_block *);
+extern bhv_vfs_t *vfs_from_sb(struct super_block *);
+extern void vfs_deallocate(bhv_vfs_t *);
+extern void vfs_insertbhv(bhv_vfs_t *, bhv_desc_t *, bhv_vfsops_t *, void *);
 
-extern vfs_t *vfs_allocate(struct super_block *);
-extern vfs_t *vfs_from_sb(struct super_block *);
-extern void vfs_deallocate(vfs_t *);
-extern void vfs_insertops(vfs_t *, bhv_vfsops_t *);
-extern void vfs_insertbhv(vfs_t *, bhv_desc_t *, vfsops_t *, void *);
+extern void vfs_insertops(bhv_vfs_t *, bhv_module_vfsops_t *);
 
-extern void bhv_insert_all_vfsops(struct vfs *);
-extern void bhv_remove_all_vfsops(struct vfs *, int);
-extern void bhv_remove_vfsops(struct vfs *, int);
+extern void bhv_insert_all_vfsops(struct bhv_vfs *);
+extern void bhv_remove_all_vfsops(struct bhv_vfs *, int);
+extern void bhv_remove_vfsops(struct bhv_vfs *, int);
 
 #define fs_frozen(vfsp)		((vfsp)->vfs_super->s_frozen)
 #define fs_check_frozen(vfsp, level) \
diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c
index f17e39cff23..82777f1a70a 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.c
+++ b/fs/xfs/linux-2.6/xfs_vnode.c
@@ -67,7 +67,7 @@ vn_ioerror(
 	int		l)
 {
 	if (unlikely(error == -ENODEV))
-		VFS_FORCE_SHUTDOWN(vp->v_vfsp, SHUTDOWN_DEVICE_REQ, f, l);
+		bhv_vfs_force_shutdown(vp->v_vfsp, SHUTDOWN_DEVICE_REQ, f, l);
 }
 
 struct vnode *
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index 569a4e7b5cc..95343637ab6 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -68,7 +68,7 @@ typedef enum vflags {
  */
 typedef struct vnode {
 	vflags_t	v_flag;			/* vnode flags (see above) */
-	struct vfs	*v_vfsp;		/* ptr to containing VFS */
+	struct bhv_vfs	*v_vfsp;		/* ptr to containing VFS */
 	vnumber_t	v_number;		/* in-core vnode number */
 	vn_bhv_head_t	v_bh;			/* behavior head */
 	spinlock_t	v_lock;			/* VN_LOCK/VN_UNLOCK */
@@ -503,7 +503,7 @@ extern vnode_t	*vn_initialize(struct inode *);
  * vnode_map structures _must_ match vn_epoch and vnode structure sizes.
  */
 typedef struct vnode_map {
-	vfs_t		*v_vfsp;
+	bhv_vfs_t	*v_vfsp;
 	vnumber_t	v_number;		/* in-core vnode number */
 	xfs_ino_t	v_ino;			/* inode #	*/
 } vmap_t;
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index 181dd90b29c..78f5962e578 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -129,7 +129,7 @@ xfs_qm_parseargs(
 		return XFS_ERROR(EINVAL);
 	}
 
-	PVFS_PARSEARGS(BHV_NEXT(bhv), options, args, update, error);
+	error = bhv_next_vfs_parseargs(BHV_NEXT(bhv), options, args, update);
 	if (!error && !referenced)
 		bhv_remove_vfsops(bhvtovfs(bhv), VFS_POSITION_QM);
 	return error;
@@ -140,9 +140,8 @@ xfs_qm_showargs(
 	struct bhv_desc		*bhv,
 	struct seq_file		*m)
 {
-	struct vfs		*vfsp = bhvtovfs(bhv);
+	struct bhv_vfs		*vfsp = bhvtovfs(bhv);
 	struct xfs_mount	*mp = XFS_VFSTOM(vfsp);
-	int			error;
 
 	if (mp->m_qflags & XFS_UQUOTA_ACCT) {
 		(mp->m_qflags & XFS_UQUOTA_ENFD) ?
@@ -165,8 +164,7 @@ xfs_qm_showargs(
 	if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
 		seq_puts(m, "," MNTOPT_NOQUOTA);
 
-	PVFS_SHOWARGS(BHV_NEXT(bhv), m, error);
-	return error;
+	return bhv_next_vfs_showargs(BHV_NEXT(bhv), m);
 }
 
 STATIC int
@@ -175,14 +173,12 @@ xfs_qm_mount(
 	struct xfs_mount_args	*args,
 	struct cred		*cr)
 {
-	struct vfs		*vfsp = bhvtovfs(bhv);
+	struct bhv_vfs		*vfsp = bhvtovfs(bhv);
 	struct xfs_mount	*mp = XFS_VFSTOM(vfsp);
-	int			error;
 
 	if (args->flags & (XFSMNT_UQUOTA | XFSMNT_GQUOTA | XFSMNT_PQUOTA))
 		xfs_qm_mount_quotainit(mp, args->flags);
-	PVFS_MOUNT(BHV_NEXT(bhv), args, cr, error);
-	return error;
+	return bhv_next_vfs_mount(BHV_NEXT(bhv), args, cr);
 }
 
 /*
@@ -205,7 +201,7 @@ xfs_qm_statvfs(
 	__uint64_t		limit;
 	int			error;
 
-	error = PVFS_STATVFS(BHV_NEXT(bhv), statp, vnode);
+	error = bhv_next_vfs_statvfs(BHV_NEXT(bhv), statp, vnode);
 	if (error || !vnode)
 		return error;
 
@@ -246,7 +242,7 @@ xfs_qm_syncall(
 	int			flags,
 	cred_t			*credp)
 {
-	struct vfs		*vfsp = bhvtovfs(bhv);
+	struct bhv_vfs		*vfsp = bhvtovfs(bhv);
 	struct xfs_mount	*mp = XFS_VFSTOM(vfsp);
 	int			error;
 
@@ -265,8 +261,7 @@ xfs_qm_syncall(
 			}
 		}
 	}
-	PVFS_SYNC(BHV_NEXT(bhv), flags, credp, error);
-	return error;
+	return bhv_next_vfs_sync(BHV_NEXT(bhv), flags, credp);
 }
 
 STATIC int
@@ -401,7 +396,7 @@ STATIC struct xfs_qmops xfs_qmcore_xfs = {
 	.xfs_dqtrxops		= &xfs_trans_dquot_ops,
 };
 
-struct bhv_vfsops xfs_qmops = { {
+struct bhv_module_vfsops xfs_qmops = { {
 	BHV_IDENTITY_INIT(VFS_BHV_QM, VFS_POSITION_QM),
 	.vfs_parseargs		= xfs_qm_parseargs,
 	.vfs_showargs		= xfs_qm_showargs,
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index c55db463bbf..afc7c1c696b 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -91,8 +91,8 @@ xfs_qm_quotactl(
 	xfs_caddr_t	addr)
 {
 	xfs_mount_t	*mp;
+	bhv_vfs_t	*vfsp;
 	int		error;
-	struct vfs	*vfsp;
 
 	vfsp = bhvtovfs(bdp);
 	mp = XFS_VFSTOM(vfsp);
diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h
index 00b1540f810..4e7865ad6f0 100644
--- a/fs/xfs/xfs_dmapi.h
+++ b/fs/xfs/xfs_dmapi.h
@@ -189,6 +189,6 @@ typedef enum {
 #define AT_DELAY_FLAG(f) ((f&ATTR_NONBLOCK) ? DM_FLAGS_NDELAY : 0)
 
 
-extern struct bhv_vfsops xfs_dmops;
+extern struct bhv_module_vfsops xfs_dmops;
 
 #endif  /* __XFS_DMAPI_H__ */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 96f4b3e8fa4..502483c41f9 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -574,7 +574,7 @@ xfs_fs_goingdown(
 {
 	switch (inflags) {
 	case XFS_FSOP_GOING_FLAGS_DEFAULT: {
-		struct vfs *vfsp = XFS_MTOVFS(mp);
+		struct bhv_vfs *vfsp = XFS_MTOVFS(mp);
 		struct super_block *sb = freeze_bdev(vfsp->vfs_super->s_bdev);
 
 		if (sb && !IS_ERR(sb)) {
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index b5385432526..41f38eb60eb 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -468,7 +468,7 @@ finish_inode:
 	 * If we have a real type for an on-disk inode, we can set ops(&unlock)
 	 * now.	 If it's a new inode being created, xfs_ialloc will handle it.
 	 */
-	VFS_INIT_VNODE(XFS_MTOVFS(mp), vp, XFS_ITOBHV(ip), 1);
+	bhv_vfs_init_vnode(XFS_MTOVFS(mp), vp, XFS_ITOBHV(ip), 1);
 
 	return 0;
 }
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 50119a00885..0ed707ed664 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1250,8 +1250,8 @@ xfs_ialloc(
 	 */
 	xfs_trans_log_inode(tp, ip, flags);
 
-	/* now that we have an i_mode  we can set Linux inode ops (& unlock) */
-	VFS_INIT_VNODE(XFS_MTOVFS(tp->t_mountp), vp, XFS_ITOBHV(ip), 1);
+	/* now that we have an i_mode we can setup inode ops and unlock */
+	bhv_vfs_init_vnode(XFS_MTOVFS(tp->t_mountp), vp, XFS_ITOBHV(ip), 1);
 
 	*ipp = ip;
 	return 0;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 3b544db1790..233668b748f 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -509,7 +509,6 @@ extern struct kmem_zone	*xfs_chashlist_zone;
 extern struct kmem_zone	*xfs_ifork_zone;
 extern struct kmem_zone	*xfs_inode_zone;
 extern struct kmem_zone	*xfs_ili_zone;
-extern struct vnodeops	xfs_vnodeops;
 
 #endif	/* __KERNEL__ */
 
diff --git a/fs/xfs/xfs_iocore.c b/fs/xfs/xfs_iocore.c
index d3da194f8b7..89441e69cdd 100644
--- a/fs/xfs/xfs_iocore.c
+++ b/fs/xfs/xfs_iocore.c
@@ -59,7 +59,7 @@ xfs_size_fn(
 
 STATIC int
 xfs_ioinit(
-	struct vfs		*vfsp,
+	struct bhv_vfs		*vfsp,
 	struct xfs_mount_args	*mntargs,
 	int			flags)
 {
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 7d882d6c7d4..9f70ab3e098 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -498,9 +498,8 @@ xfs_log_mount(xfs_mount_t	*mp,
 	 * just worked.
 	 */
 	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
-		int	error;
-		vfs_t	*vfsp = XFS_MTOVFS(mp);
-		int	readonly = (vfsp->vfs_flag & VFS_RDONLY);
+		bhv_vfs_t	*vfsp = XFS_MTOVFS(mp);
+		int		error, readonly = (vfsp->vfs_flag & VFS_RDONLY);
 
 		if (readonly)
 			vfsp->vfs_flag &= ~VFS_RDONLY;
@@ -816,7 +815,7 @@ xfs_log_need_covered(xfs_mount_t *mp)
 	SPLDECL(s);
 	int		needed = 0, gen;
 	xlog_t		*log = mp->m_log;
-	vfs_t		*vfsp = XFS_MTOVFS(mp);
+	bhv_vfs_t	*vfsp = XFS_MTOVFS(mp);
 
 	if (fs_frozen(vfsp) || XFS_FORCED_SHUTDOWN(mp) ||
 	    (vfsp->vfs_flag & VFS_RDONLY))
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 9378408a69d..56c4b7e51f4 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -196,7 +196,7 @@ xfs_mount_free(
 		kmem_free(mp->m_logname, strlen(mp->m_logname) + 1);
 
 	if (remove_bhv) {
-		struct vfs	*vfsp = XFS_MTOVFS(mp);
+		struct bhv_vfs	*vfsp = XFS_MTOVFS(mp);
 
 		bhv_remove_all_vfsops(vfsp, 0);
 		VFS_REMOVEBHV(vfsp, &mp->m_bhv);
@@ -337,7 +337,7 @@ xfs_mount_validate_sb(
 
 xfs_agnumber_t
 xfs_initialize_perag(
-	struct vfs	*vfs,
+	bhv_vfs_t	*vfs,
 	xfs_mount_t	*mp,
 	xfs_agnumber_t	agcount)
 {
@@ -651,7 +651,7 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
  */
 int
 xfs_mountfs(
-	vfs_t		*vfsp,
+	bhv_vfs_t	*vfsp,
 	xfs_mount_t	*mp,
 	int		mfsi_flags)
 {
@@ -1095,7 +1095,7 @@ xfs_mountfs(
 int
 xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
 {
-	struct vfs	*vfsp = XFS_MTOVFS(mp);
+	struct bhv_vfs	*vfsp = XFS_MTOVFS(mp);
 #if defined(DEBUG) || defined(INDUCE_IO_ERROR)
 	int64_t		fsid;
 #endif
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 2ca211214a9..7e612255f53 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -53,7 +53,7 @@ typedef struct xfs_trans_reservations {
 #else
 struct cred;
 struct log;
-struct vfs;
+struct bhv_vfs;
 struct vnode;
 struct xfs_mount_args;
 struct xfs_ihash;
@@ -66,7 +66,7 @@ struct xfs_bmap_free;
 struct xfs_extdelta;
 struct xfs_swapext;
 
-extern struct vfsops xfs_vfsops;
+extern struct bhv_vfsops xfs_vfsops;
 extern struct vnodeops xfs_vnodeops;
 
 #define	AIL_LOCK_T		lock_t
@@ -84,11 +84,11 @@ typedef int	(*xfs_send_data_t)(int, struct vnode *,
 			xfs_off_t, size_t, int, vrwlock_t *);
 typedef int	(*xfs_send_mmap_t)(struct vm_area_struct *, uint);
 typedef int	(*xfs_send_destroy_t)(struct vnode *, dm_right_t);
-typedef int	(*xfs_send_namesp_t)(dm_eventtype_t, struct vfs *,
+typedef int	(*xfs_send_namesp_t)(dm_eventtype_t, struct bhv_vfs *,
 			struct vnode *,
 			dm_right_t, struct vnode *, dm_right_t,
 			char *, char *, mode_t, int, int);
-typedef void	(*xfs_send_unmount_t)(struct vfs *, struct vnode *,
+typedef void	(*xfs_send_unmount_t)(struct bhv_vfs *, struct vnode *,
 			dm_right_t, mode_t, int, int);
 
 typedef struct xfs_dmops {
@@ -190,7 +190,7 @@ typedef struct xfs_qmops {
  * Prototypes and functions for I/O core modularization.
  */
 
-typedef int		(*xfs_ioinit_t)(struct vfs *,
+typedef int		(*xfs_ioinit_t)(struct bhv_vfs *,
 				struct xfs_mount_args *, int);
 typedef int		(*xfs_bmapi_t)(struct xfs_trans *, void *,
 				xfs_fileoff_t, xfs_filblks_t, int,
@@ -220,7 +220,7 @@ typedef void		(*xfs_lock_demote_t)(void *, uint);
 typedef int		(*xfs_lock_nowait_t)(void *, uint);
 typedef void		(*xfs_unlk_t)(void *, unsigned int);
 typedef xfs_fsize_t	(*xfs_size_t)(void *);
-typedef xfs_fsize_t	(*xfs_iodone_t)(struct vfs *);
+typedef xfs_fsize_t	(*xfs_iodone_t)(struct bhv_vfs *);
 typedef int		(*xfs_swap_extents_t)(void *, void *,
 				struct xfs_swapext*);
 
@@ -511,7 +511,7 @@ xfs_preferred_iosize(xfs_mount_t *mp)
 
 #define XFS_FORCED_SHUTDOWN(mp)	((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN)
 #define xfs_force_shutdown(m,f)	\
-	VFS_FORCE_SHUTDOWN((XFS_MTOVFS(m)), f, __FILE__, __LINE__)
+	bhv_vfs_force_shutdown((XFS_MTOVFS(m)), f, __FILE__, __LINE__)
 
 /*
  * Flags for xfs_mountfs
@@ -529,7 +529,7 @@ xfs_preferred_iosize(xfs_mount_t *mp)
  * Macros for getting from mount to vfs and back.
  */
 #define	XFS_MTOVFS(mp)		xfs_mtovfs(mp)
-static inline struct vfs *xfs_mtovfs(xfs_mount_t *mp)
+static inline struct bhv_vfs *xfs_mtovfs(xfs_mount_t *mp)
 {
 	return bhvtovfs(&mp->m_bhv);
 }
@@ -541,7 +541,7 @@ static inline xfs_mount_t *xfs_bhvtom(bhv_desc_t *bdp)
 }
 
 #define XFS_VFSTOM(vfs) xfs_vfstom(vfs)
-static inline xfs_mount_t *xfs_vfstom(vfs_t *vfs)
+static inline xfs_mount_t *xfs_vfstom(bhv_vfs_t *vfs)
 {
 	return XFS_BHVTOM(bhv_lookup(VFS_BHVHEAD(vfs), &xfs_vfsops));
 }
@@ -579,7 +579,7 @@ typedef struct xfs_mod_sb {
 extern xfs_mount_t *xfs_mount_init(void);
 extern void	xfs_mod_sb(xfs_trans_t *, __int64_t);
 extern void	xfs_mount_free(xfs_mount_t *mp, int remove_bhv);
-extern int	xfs_mountfs(struct vfs *, xfs_mount_t *mp, int);
+extern int	xfs_mountfs(struct bhv_vfs *, xfs_mount_t *mp, int);
 extern void	xfs_mountfs_check_barriers(xfs_mount_t *mp);
 
 extern int	xfs_unmountfs(xfs_mount_t *, struct cred *);
@@ -597,7 +597,7 @@ extern void	xfs_freesb(xfs_mount_t *);
 extern void	xfs_do_force_shutdown(bhv_desc_t *, int, char *, int);
 extern int	xfs_syncsub(xfs_mount_t *, int, int, int *);
 extern int	xfs_sync_inodes(xfs_mount_t *, int, int, int *);
-extern xfs_agnumber_t	xfs_initialize_perag(struct vfs *, xfs_mount_t *,
+extern xfs_agnumber_t	xfs_initialize_perag(struct bhv_vfs *, xfs_mount_t *,
 						xfs_agnumber_t);
 extern void	xfs_xlatesb(void *, struct xfs_sb *, int, __int64_t);
 
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 7fbef974bce..acb853b33eb 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -365,7 +365,7 @@ typedef struct xfs_dqtrxops {
 extern int xfs_qm_dqcheck(xfs_disk_dquot_t *, xfs_dqid_t, uint, uint, char *);
 extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
 
-extern struct bhv_vfsops xfs_qmops;
+extern struct bhv_module_vfsops xfs_qmops;
 
 #endif	/* __KERNEL__ */
 
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index ec85a2e24b5..57ee2bebd24 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -212,7 +212,7 @@ xfs_cleanup(void)
  */
 STATIC int
 xfs_start_flags(
-	struct vfs		*vfs,
+	struct bhv_vfs		*vfs,
 	struct xfs_mount_args	*ap,
 	struct xfs_mount	*mp)
 {
@@ -337,7 +337,7 @@ xfs_start_flags(
  */
 STATIC int
 xfs_finish_flags(
-	struct vfs		*vfs,
+	struct bhv_vfs		*vfs,
 	struct xfs_mount_args	*ap,
 	struct xfs_mount	*mp)
 {
@@ -423,7 +423,7 @@ xfs_mount(
 	struct xfs_mount_args	*args,
 	cred_t			*credp)
 {
-	struct vfs		*vfsp = bhvtovfs(bhvp);
+	struct bhv_vfs		*vfsp = bhvtovfs(bhvp);
 	struct bhv_desc		*p;
 	struct xfs_mount	*mp = XFS_BHVTOM(bhvp);
 	struct block_device	*ddev, *logdev, *rtdev;
@@ -552,7 +552,7 @@ xfs_unmount(
 	int		flags,
 	cred_t		*credp)
 {
-	struct vfs	*vfsp = bhvtovfs(bdp);
+	bhv_vfs_t	*vfsp = bhvtovfs(bdp);
 	xfs_mount_t	*mp = XFS_BHVTOM(bdp);
 	xfs_inode_t	*rip;
 	vnode_t		*rvp;
@@ -665,9 +665,8 @@ xfs_mntupdate(
 	int				*flags,
 	struct xfs_mount_args		*args)
 {
-	struct vfs	*vfsp = bhvtovfs(bdp);
+	bhv_vfs_t	*vfsp = bhvtovfs(bdp);
 	xfs_mount_t	*mp = XFS_BHVTOM(bdp);
-	int		error;
 
 	if (!(*flags & MS_RDONLY)) {			/* rw/ro -> rw */
 		if (vfsp->vfs_flag & VFS_RDONLY)
@@ -679,7 +678,7 @@ xfs_mntupdate(
 			mp->m_flags &= ~XFS_MOUNT_BARRIER;
 		}
 	} else if (!(vfsp->vfs_flag & VFS_RDONLY)) {	/* rw -> ro */
-		VFS_SYNC(vfsp, SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR, NULL, error);
+		bhv_vfs_sync(vfsp, SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR, NULL);
 		xfs_quiesce_fs(mp);
 		xfs_log_unmount_write(mp);
 		xfs_unmountfs_writesb(mp);
@@ -900,7 +899,7 @@ xfs_sync(
 /*
  * xfs sync routine for internal use
  *
- * This routine supports all of the flags defined for the generic VFS_SYNC
+ * This routine supports all of the flags defined for the generic vfs_sync
  * interface as explained above under xfs_sync.  In the interests of not
  * changing interfaces within the 6.5 family, additional internally-
  * required functions are specified within a separate xflags parameter,
@@ -1421,7 +1420,7 @@ xfs_sync_inodes(
 /*
  * xfs sync routine for internal use
  *
- * This routine supports all of the flags defined for the generic VFS_SYNC
+ * This routine supports all of the flags defined for the generic vfs_sync
  * interface as explained above under xfs_sync.  In the interests of not
  * changing interfaces within the 6.5 family, additional internally-
  * required functions are specified within a separate xflags parameter,
@@ -1686,7 +1685,7 @@ xfs_parseargs(
 	struct xfs_mount_args	*args,
 	int			update)
 {
-	struct vfs		*vfsp = bhvtovfs(bhv);
+	bhv_vfs_t		*vfsp = bhvtovfs(bhv);
 	char			*this_char, *value, *eov;
 	int			dsunit, dswidth, vol_dsunit, vol_dswidth;
 	int			iosize;
@@ -1924,7 +1923,7 @@ xfs_showargs(
 	};
 	struct proc_xfs_info	*xfs_infop;
 	struct xfs_mount	*mp = XFS_BHVTOM(bhv);
-	struct vfs		*vfsp = XFS_MTOVFS(mp);
+	struct bhv_vfs		*vfsp = XFS_MTOVFS(mp);
 
 	for (xfs_infop = xfs_info; xfs_infop->flag; xfs_infop++) {
 		if (mp->m_flags & xfs_infop->flag)
@@ -1984,7 +1983,7 @@ xfs_freeze(
 }
 
 
-vfsops_t xfs_vfsops = {
+bhv_vfsops_t xfs_vfsops = {
 	BHV_IDENTITY_INIT(VFS_BHV_XFS,VFS_POSITION_XFS),
 	.vfs_parseargs		= xfs_parseargs,
 	.vfs_showargs		= xfs_showargs,
-- 
cgit v1.2.3


From 67fcaa73adafb19139a7cd8ab133592b6a0a0901 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Fri, 9 Jun 2006 17:00:52 +1000
Subject: [XFS] Resolve a namespace collision on vnode/vnodeops for FreeBSD
 porters.

SGI-PV: 953338
SGI-Modid: xfs-linux-melb:xfs-kern:26107a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/linux-2.6/xfs_aops.c    |  33 ++---
 fs/xfs/linux-2.6/xfs_aops.h    |   2 +-
 fs/xfs/linux-2.6/xfs_export.c  |   6 +-
 fs/xfs/linux-2.6/xfs_file.c    | 120 ++++++----------
 fs/xfs/linux-2.6/xfs_fs_subr.c |   6 +-
 fs/xfs/linux-2.6/xfs_ioctl.c   |  61 ++++-----
 fs/xfs/linux-2.6/xfs_ioctl32.c |   4 +-
 fs/xfs/linux-2.6/xfs_iops.c    |  99 ++++++-------
 fs/xfs/linux-2.6/xfs_lrw.c     |  10 +-
 fs/xfs/linux-2.6/xfs_lrw.h     |   4 +-
 fs/xfs/linux-2.6/xfs_super.c   |  42 +++---
 fs/xfs/linux-2.6/xfs_super.h   |   2 +-
 fs/xfs/linux-2.6/xfs_vfs.c     |   8 +-
 fs/xfs/linux-2.6/xfs_vfs.h     |  55 ++++----
 fs/xfs/linux-2.6/xfs_vnode.c   |  32 ++---
 fs/xfs/linux-2.6/xfs_vnode.h   | 305 ++++++++++++++++-------------------------
 fs/xfs/quota/xfs_qm_bhv.c      |   2 +-
 fs/xfs/quota/xfs_qm_syscalls.c |   2 +-
 fs/xfs/xfs_acl.c               |  59 ++++----
 fs/xfs/xfs_acl.h               |  16 +--
 fs/xfs/xfs_attr.c              |  52 ++++---
 fs/xfs/xfs_attr.h              |  14 +-
 fs/xfs/xfs_bmap.c              |   4 +-
 fs/xfs/xfs_cap.h               |  10 +-
 fs/xfs/xfs_dfrag.c             |  10 +-
 fs/xfs/xfs_iget.c              |  16 +--
 fs/xfs/xfs_inode.c             |  20 +--
 fs/xfs/xfs_inode.h             |   6 +-
 fs/xfs/xfs_itable.c            |   2 +-
 fs/xfs/xfs_mount.c             |   2 +-
 fs/xfs/xfs_mount.h             |  14 +-
 fs/xfs/xfs_rename.c            |   6 +-
 fs/xfs/xfs_utils.c             |   4 +-
 fs/xfs/xfs_utils.h             |   3 +-
 fs/xfs/xfs_vfsops.c            |  24 ++--
 fs/xfs/xfs_vnodeops.c          |  77 +++++------
 36 files changed, 496 insertions(+), 636 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index c0a90431685..c5ea26d73be 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -76,7 +76,7 @@ xfs_page_trace(
 	int		mask)
 {
 	xfs_inode_t	*ip;
-	vnode_t		*vp = vn_from_inode(inode);
+	bhv_vnode_t	*vp = vn_from_inode(inode);
 	loff_t		isize = i_size_read(inode);
 	loff_t		offset = page_offset(page);
 	int		delalloc = -1, unmapped = -1, unwritten = -1;
@@ -181,13 +181,12 @@ xfs_end_bio_unwritten(
 	void			*data)
 {
 	xfs_ioend_t		*ioend = data;
-	vnode_t			*vp = ioend->io_vnode;
+	bhv_vnode_t		*vp = ioend->io_vnode;
 	xfs_off_t		offset = ioend->io_offset;
 	size_t			size = ioend->io_size;
-	int			error;
 
 	if (likely(!ioend->io_error))
-		VOP_BMAP(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL, error);
+		bhv_vop_bmap(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL);
 	xfs_destroy_ioend(ioend);
 }
 
@@ -240,10 +239,10 @@ xfs_map_blocks(
 	xfs_iomap_t		*mapp,
 	int			flags)
 {
-	vnode_t			*vp = vn_from_inode(inode);
+	bhv_vnode_t		*vp = vn_from_inode(inode);
 	int			error, nmaps = 1;
 
-	VOP_BMAP(vp, offset, count, flags, mapp, &nmaps, error);
+	error = bhv_vop_bmap(vp, offset, count, flags, mapp, &nmaps);
 	if (!error && (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)))
 		VMODIFY(vp);
 	return -error;
@@ -1162,7 +1161,7 @@ xfs_vm_writepages(
 	struct address_space	*mapping,
 	struct writeback_control *wbc)
 {
-	struct vnode		*vp = vn_from_inode(mapping->host);
+	struct bhv_vnode	*vp = vn_from_inode(mapping->host);
 
 	if (VN_TRUNC(vp))
 		VUNTRUNCATE(vp);
@@ -1242,7 +1241,7 @@ __xfs_get_blocks(
 	int			direct,
 	bmapi_flags_t		flags)
 {
-	vnode_t			*vp = vn_from_inode(inode);
+	bhv_vnode_t		*vp = vn_from_inode(inode);
 	xfs_iomap_t		iomap;
 	xfs_off_t		offset;
 	ssize_t			size;
@@ -1252,8 +1251,8 @@ __xfs_get_blocks(
 	offset = (xfs_off_t)iblock << inode->i_blkbits;
 	ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
 	size = bh_result->b_size;
-	VOP_BMAP(vp, offset, size,
-		create ? flags : BMAPI_READ, &iomap, &niomap, error);
+	error = bhv_vop_bmap(vp, offset, size,
+			     create ? flags : BMAPI_READ, &iomap, &niomap);
 	if (error)
 		return -error;
 	if (niomap == 0)
@@ -1381,13 +1380,13 @@ xfs_vm_direct_IO(
 {
 	struct file	*file = iocb->ki_filp;
 	struct inode	*inode = file->f_mapping->host;
-	vnode_t		*vp = vn_from_inode(inode);
+	bhv_vnode_t	*vp = vn_from_inode(inode);
 	xfs_iomap_t	iomap;
 	int		maps = 1;
 	int		error;
 	ssize_t		ret;
 
-	VOP_BMAP(vp, offset, 0, BMAPI_DEVICE, &iomap, &maps, error);
+	error = bhv_vop_bmap(vp, offset, 0, BMAPI_DEVICE, &iomap, &maps);
 	if (error)
 		return -error;
 
@@ -1420,14 +1419,12 @@ xfs_vm_bmap(
 	sector_t		block)
 {
 	struct inode		*inode = (struct inode *)mapping->host;
-	vnode_t			*vp = vn_from_inode(inode);
-	int			error;
+	bhv_vnode_t		*vp = vn_from_inode(inode);
 
 	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
-
-	VOP_RWLOCK(vp, VRWLOCK_READ);
-	VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1, 0, FI_REMAPF, error);
-	VOP_RWUNLOCK(vp, VRWLOCK_READ);
+	bhv_vop_rwlock(vp, VRWLOCK_READ);
+	bhv_vop_flush_pages(vp, (xfs_off_t)0, -1, 0, FI_REMAPF);
+	bhv_vop_rwunlock(vp, VRWLOCK_READ);
 	return generic_block_bmap(mapping, block, xfs_get_blocks);
 }
 
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 62c1d49025f..706d8c781b8 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -32,7 +32,7 @@ typedef struct xfs_ioend {
 	unsigned int		io_type;	/* delalloc / unwritten */
 	int			io_error;	/* I/O error code */
 	atomic_t		io_remaining;	/* hold count */
-	struct vnode		*io_vnode;	/* file being written to */
+	struct bhv_vnode	*io_vnode;	/* file being written to */
 	struct buffer_head	*io_buffer_head;/* buffer linked list head */
 	struct buffer_head	*io_buffer_tail;/* buffer linked list tail */
 	size_t			io_size;	/* size of the extent */
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index b5c579eeb8e..d32a9edc43a 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -136,7 +136,7 @@ xfs_fs_get_dentry(
 	struct super_block	*sb,
 	void			*data)
 {
-	vnode_t			*vp;
+	bhv_vnode_t		*vp;
 	struct inode		*inode;
 	struct dentry		*result;
 	bhv_vfs_t		*vfsp = vfs_from_sb(sb);
@@ -160,12 +160,12 @@ xfs_fs_get_parent(
 	struct dentry		*child)
 {
 	int			error;
-	vnode_t			*vp, *cvp;
+	bhv_vnode_t		*vp, *cvp;
 	struct dentry		*parent;
 
 	cvp = NULL;
 	vp = vn_from_inode(child->d_inode);
-	VOP_LOOKUP(vp, &dotdot, &cvp, 0, NULL, NULL, error);
+	error = bhv_vop_lookup(vp, &dotdot, &cvp, 0, NULL, NULL);
 	if (unlikely(error))
 		return ERR_PTR(-error);
 
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 97615cc74ef..89b1a742135 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -58,15 +58,12 @@ __xfs_file_read(
 {
 	struct iovec		iov = {buf, count};
 	struct file		*file = iocb->ki_filp;
-	vnode_t			*vp = vn_from_inode(file->f_dentry->d_inode);
-	ssize_t			rval;
+	bhv_vnode_t		*vp = vn_from_inode(file->f_dentry->d_inode);
 
 	BUG_ON(iocb->ki_pos != pos);
-
 	if (unlikely(file->f_flags & O_DIRECT))
 		ioflags |= IO_ISDIRECT;
-	VOP_READ(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL, rval);
-	return rval;
+	return bhv_vop_read(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL);
 }
 
 STATIC ssize_t
@@ -100,15 +97,12 @@ __xfs_file_write(
 	struct iovec	iov = {(void __user *)buf, count};
 	struct file	*file = iocb->ki_filp;
 	struct inode	*inode = file->f_mapping->host;
-	vnode_t		*vp = vn_from_inode(inode);
-	ssize_t		rval;
+	bhv_vnode_t	*vp = vn_from_inode(inode);
 
 	BUG_ON(iocb->ki_pos != pos);
 	if (unlikely(file->f_flags & O_DIRECT))
 		ioflags |= IO_ISDIRECT;
-
-	VOP_WRITE(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL, rval);
-	return rval;
+	return bhv_vop_write(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL);
 }
 
 STATIC ssize_t
@@ -140,7 +134,7 @@ __xfs_file_readv(
 	loff_t			*ppos)
 {
 	struct inode	*inode = file->f_mapping->host;
-	vnode_t		*vp = vn_from_inode(inode);
+	bhv_vnode_t	*vp = vn_from_inode(inode);
 	struct kiocb	kiocb;
 	ssize_t		rval;
 
@@ -149,7 +143,8 @@ __xfs_file_readv(
 
 	if (unlikely(file->f_flags & O_DIRECT))
 		ioflags |= IO_ISDIRECT;
-	VOP_READ(vp, &kiocb, iov, nr_segs, &kiocb.ki_pos, ioflags, NULL, rval);
+	rval = bhv_vop_read(vp, &kiocb, iov, nr_segs,
+				&kiocb.ki_pos, ioflags, NULL);
 
 	*ppos = kiocb.ki_pos;
 	return rval;
@@ -184,7 +179,7 @@ __xfs_file_writev(
 	loff_t			*ppos)
 {
 	struct inode	*inode = file->f_mapping->host;
-	vnode_t		*vp = vn_from_inode(inode);
+	bhv_vnode_t	*vp = vn_from_inode(inode);
 	struct kiocb	kiocb;
 	ssize_t		rval;
 
@@ -193,7 +188,8 @@ __xfs_file_writev(
 	if (unlikely(file->f_flags & O_DIRECT))
 		ioflags |= IO_ISDIRECT;
 
-	VOP_WRITE(vp, &kiocb, iov, nr_segs, &kiocb.ki_pos, ioflags, NULL, rval);
+	rval = bhv_vop_write(vp, &kiocb, iov, nr_segs,
+				 &kiocb.ki_pos, ioflags, NULL);
 
 	*ppos = kiocb.ki_pos;
 	return rval;
@@ -227,11 +223,8 @@ xfs_file_sendfile(
 	read_actor_t		actor,
 	void			*target)
 {
-	vnode_t			*vp = vn_from_inode(filp->f_dentry->d_inode);
-	ssize_t			rval;
-
-	VOP_SENDFILE(vp, filp, pos, 0, count, actor, target, NULL, rval);
-	return rval;
+	return bhv_vop_sendfile(vn_from_inode(filp->f_dentry->d_inode),
+				filp, pos, 0, count, actor, target, NULL);
 }
 
 STATIC ssize_t
@@ -242,11 +235,8 @@ xfs_file_sendfile_invis(
 	read_actor_t		actor,
 	void			*target)
 {
-	vnode_t			*vp = vn_from_inode(filp->f_dentry->d_inode);
-	ssize_t			rval;
-
-	VOP_SENDFILE(vp, filp, pos, IO_INVIS, count, actor, target, NULL, rval);
-	return rval;
+	return bhv_vop_sendfile(vn_from_inode(filp->f_dentry->d_inode),
+				filp, pos, IO_INVIS, count, actor, target, NULL);
 }
 
 STATIC ssize_t
@@ -257,11 +247,8 @@ xfs_file_splice_read(
 	size_t			len,
 	unsigned int		flags)
 {
-	vnode_t			*vp = vn_from_inode(infilp->f_dentry->d_inode);
-	ssize_t			rval;
-
-	VOP_SPLICE_READ(vp, infilp, ppos, pipe, len, flags, 0, NULL, rval);
-	return rval;
+	return bhv_vop_splice_read(vn_from_inode(infilp->f_dentry->d_inode),
+				   infilp, ppos, pipe, len, flags, 0, NULL);
 }
 
 STATIC ssize_t
@@ -272,11 +259,9 @@ xfs_file_splice_read_invis(
 	size_t			len,
 	unsigned int		flags)
 {
-	vnode_t			*vp = vn_from_inode(infilp->f_dentry->d_inode);
-	ssize_t			rval;
-
-	VOP_SPLICE_READ(vp, infilp, ppos, pipe, len, flags, IO_INVIS, NULL, rval);
-	return rval;
+	return bhv_vop_splice_read(vn_from_inode(infilp->f_dentry->d_inode),
+				   infilp, ppos, pipe, len, flags, IO_INVIS,
+				   NULL);
 }
 
 STATIC ssize_t
@@ -287,11 +272,8 @@ xfs_file_splice_write(
 	size_t			len,
 	unsigned int		flags)
 {
-	vnode_t			*vp = vn_from_inode(outfilp->f_dentry->d_inode);
-	ssize_t			rval;
-
-	VOP_SPLICE_WRITE(vp, pipe, outfilp, ppos, len, flags, 0, NULL, rval);
-	return rval;
+	return bhv_vop_splice_write(vn_from_inode(outfilp->f_dentry->d_inode),
+				    pipe, outfilp, ppos, len, flags, 0, NULL);
 }
 
 STATIC ssize_t
@@ -302,11 +284,9 @@ xfs_file_splice_write_invis(
 	size_t			len,
 	unsigned int		flags)
 {
-	vnode_t			*vp = vn_from_inode(outfilp->f_dentry->d_inode);
-	ssize_t			rval;
-
-	VOP_SPLICE_WRITE(vp, pipe, outfilp, ppos, len, flags, IO_INVIS, NULL, rval);
-	return rval;
+	return bhv_vop_splice_write(vn_from_inode(outfilp->f_dentry->d_inode),
+				    pipe, outfilp, ppos, len, flags, IO_INVIS,
+				    NULL);
 }
 
 STATIC int
@@ -314,24 +294,17 @@ xfs_file_open(
 	struct inode	*inode,
 	struct file	*filp)
 {
-	vnode_t		*vp = vn_from_inode(inode);
-	int		error;
-
 	if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
 		return -EFBIG;
-	VOP_OPEN(vp, NULL, error);
-	return -error;
+	return -bhv_vop_open(vn_from_inode(inode), NULL);
 }
 
 STATIC int
 xfs_file_close(
 	struct file	*filp)
 {
-	vnode_t		*vp = vn_from_inode(filp->f_dentry->d_inode);
-	int		error;
-
-	VOP_CLOSE(vp, 0, file_count(filp) > 1 ? L_FALSE : L_TRUE, NULL, error);
-	return -error;
+	return -bhv_vop_close(vn_from_inode(filp->f_dentry->d_inode), 0,
+				file_count(filp) > 1 ? L_FALSE : L_TRUE, NULL);
 }
 
 STATIC int
@@ -339,12 +312,11 @@ xfs_file_release(
 	struct inode	*inode,
 	struct file	*filp)
 {
-	vnode_t		*vp = vn_from_inode(inode);
-	int		error = 0;
+	bhv_vnode_t	*vp = vn_from_inode(inode);
 
 	if (vp)
-		VOP_RELEASE(vp, error);
-	return -error;
+		return -bhv_vop_release(vp);
+	return 0;
 }
 
 STATIC int
@@ -353,17 +325,14 @@ xfs_file_fsync(
 	struct dentry	*dentry,
 	int		datasync)
 {
-	struct inode	*inode = dentry->d_inode;
-	vnode_t		*vp = vn_from_inode(inode);
-	int		error;
+	bhv_vnode_t	*vp = vn_from_inode(dentry->d_inode);
 	int		flags = FSYNC_WAIT;
 
 	if (datasync)
 		flags |= FSYNC_DATA;
 	if (VN_TRUNC(vp))
 		VUNTRUNCATE(vp);
-	VOP_FSYNC(vp, flags, NULL, (xfs_off_t)0, (xfs_off_t)-1, error);
-	return -error;
+	return -bhv_vop_fsync(vp, flags, NULL, (xfs_off_t)0, (xfs_off_t)-1);
 }
 
 #ifdef CONFIG_XFS_DMAPI
@@ -374,7 +343,7 @@ xfs_vm_nopage(
 	int			*type)
 {
 	struct inode	*inode = area->vm_file->f_dentry->d_inode;
-	vnode_t		*vp = vn_from_inode(inode);
+	bhv_vnode_t	*vp = vn_from_inode(inode);
 
 	ASSERT_ALWAYS(vp->v_vfsp->vfs_flag & VFS_DMI);
 	if (XFS_SEND_MMAP(XFS_VFSTOM(vp->v_vfsp), area, 0))
@@ -390,7 +359,7 @@ xfs_file_readdir(
 	filldir_t	filldir)
 {
 	int		error = 0;
-	vnode_t		*vp = vn_from_inode(filp->f_dentry->d_inode);
+	bhv_vnode_t	*vp = vn_from_inode(filp->f_dentry->d_inode);
 	uio_t		uio;
 	iovec_t		iov;
 	int		eof = 0;
@@ -425,7 +394,7 @@ xfs_file_readdir(
 
 		start_offset = uio.uio_offset;
 
-		VOP_READDIR(vp, &uio, NULL, &eof, error);
+		error = bhv_vop_readdir(vp, &uio, NULL, &eof);
 		if ((uio.uio_offset == start_offset) || error) {
 			size = 0;
 			break;
@@ -475,18 +444,17 @@ xfs_file_mmap(
 	return 0;
 }
 
-
 STATIC long
 xfs_file_ioctl(
 	struct file	*filp,
 	unsigned int	cmd,
-	unsigned long	arg)
+	unsigned long	p)
 {
 	int		error;
 	struct inode	*inode = filp->f_dentry->d_inode;
-	vnode_t		*vp = vn_from_inode(inode);
+	bhv_vnode_t	*vp = vn_from_inode(inode);
 
-	VOP_IOCTL(vp, inode, filp, 0, cmd, (void __user *)arg, error);
+	error = bhv_vop_ioctl(vp, inode, filp, 0, cmd, (void __user *)p);
 	VMODIFY(vp);
 
 	/* NOTE:  some of the ioctl's return positive #'s as a
@@ -502,13 +470,13 @@ STATIC long
 xfs_file_ioctl_invis(
 	struct file	*filp,
 	unsigned int	cmd,
-	unsigned long	arg)
+	unsigned long	p)
 {
-	struct inode	*inode = filp->f_dentry->d_inode;
-	vnode_t		*vp = vn_from_inode(inode);
 	int		error;
+	struct inode	*inode = filp->f_dentry->d_inode;
+	bhv_vnode_t	*vp = vn_from_inode(inode);
 
-	VOP_IOCTL(vp, inode, filp, IO_INVIS, cmd, (void __user *)arg, error);
+	error = bhv_vop_ioctl(vp, inode, filp, IO_INVIS, cmd, (void __user *)p);
 	VMODIFY(vp);
 
 	/* NOTE:  some of the ioctl's return positive #'s as a
@@ -527,7 +495,7 @@ xfs_vm_mprotect(
 	struct vm_area_struct *vma,
 	unsigned int	newflags)
 {
-	vnode_t		*vp = vn_from_inode(vma->vm_file->f_dentry->d_inode);
+	bhv_vnode_t	*vp = vn_from_inode(vma->vm_file->f_dentry->d_inode);
 	int		error = 0;
 
 	if (vp->v_vfsp->vfs_flag & VFS_DMI) {
@@ -553,7 +521,7 @@ STATIC int
 xfs_file_open_exec(
 	struct inode	*inode)
 {
-	vnode_t		*vp = vn_from_inode(inode);
+	bhv_vnode_t	*vp = vn_from_inode(inode);
 	xfs_mount_t	*mp = XFS_VFSTOM(vp->v_vfsp);
 	int		error = 0;
 	xfs_inode_t	*ip;
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index f0c56daf4d6..dc0562828e7 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -28,7 +28,7 @@ fs_tosspages(
 	xfs_off_t	last,
 	int		fiopt)
 {
-	vnode_t		*vp = BHV_TO_VNODE(bdp);
+	bhv_vnode_t	*vp = BHV_TO_VNODE(bdp);
 	struct inode	*ip = vn_to_inode(vp);
 
 	if (VN_CACHED(vp))
@@ -42,7 +42,7 @@ fs_flushinval_pages(
 	xfs_off_t	last,
 	int		fiopt)
 {
-	vnode_t		*vp = BHV_TO_VNODE(bdp);
+	bhv_vnode_t	*vp = BHV_TO_VNODE(bdp);
 	struct inode	*ip = vn_to_inode(vp);
 
 	if (VN_CACHED(vp)) {
@@ -61,7 +61,7 @@ fs_flush_pages(
 	uint64_t	flags,
 	int		fiopt)
 {
-	vnode_t		*vp = BHV_TO_VNODE(bdp);
+	bhv_vnode_t	*vp = BHV_TO_VNODE(bdp);
 	struct inode	*ip = vn_to_inode(vp);
 
 	if (VN_DIRTY(vp)) {
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 84478491609..73182503c16 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -78,7 +78,7 @@ xfs_find_handle(
 	xfs_handle_t		handle;
 	xfs_fsop_handlereq_t	hreq;
 	struct inode		*inode;
-	struct vnode		*vp;
+	bhv_vnode_t		*vp;
 
 	if (copy_from_user(&hreq, arg, sizeof(hreq)))
 		return -XFS_ERROR(EFAULT);
@@ -192,7 +192,7 @@ xfs_vget_fsop_handlereq(
 	xfs_mount_t		*mp,
 	struct inode		*parinode,	/* parent inode pointer    */
 	xfs_fsop_handlereq_t	*hreq,
-	vnode_t			**vp,
+	bhv_vnode_t		**vp,
 	struct inode		**inode)
 {
 	void			__user *hanp;
@@ -202,7 +202,7 @@ xfs_vget_fsop_handlereq(
 	xfs_handle_t		handle;
 	xfs_inode_t		*ip;
 	struct inode		*inodep;
-	vnode_t			*vpp;
+	bhv_vnode_t		*vpp;
 	xfs_ino_t		ino;
 	__u32			igen;
 	int			error;
@@ -277,7 +277,7 @@ xfs_open_by_handle(
 	struct file		*filp;
 	struct inode		*inode;
 	struct dentry		*dentry;
-	vnode_t			*vp;
+	bhv_vnode_t		*vp;
 	xfs_fsop_handlereq_t	hreq;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -362,7 +362,7 @@ xfs_readlink_by_handle(
 	struct uio		auio;
 	struct inode		*inode;
 	xfs_fsop_handlereq_t	hreq;
-	vnode_t			*vp;
+	bhv_vnode_t		*vp;
 	__u32			olen;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -393,9 +393,11 @@ xfs_readlink_by_handle(
 	auio.uio_segflg	= UIO_USERSPACE;
 	auio.uio_resid	= olen;
 
-	VOP_READLINK(vp, &auio, IO_INVIS, NULL, error);
-
+	error = bhv_vop_readlink(vp, &auio, IO_INVIS, NULL);
 	VN_RELE(vp);
+	if (error)
+		return -error;
+
 	return (olen - auio.uio_resid);
 }
 
@@ -411,7 +413,7 @@ xfs_fssetdm_by_handle(
 	xfs_fsop_setdm_handlereq_t dmhreq;
 	struct inode		*inode;
 	bhv_desc_t		*bdp;
-	vnode_t			*vp;
+	bhv_vnode_t		*vp;
 
 	if (!capable(CAP_MKNOD))
 		return -XFS_ERROR(EPERM);
@@ -452,7 +454,7 @@ xfs_attrlist_by_handle(
 	attrlist_cursor_kern_t	*cursor;
 	xfs_fsop_attrlist_handlereq_t al_hreq;
 	struct inode		*inode;
-	vnode_t			*vp;
+	bhv_vnode_t		*vp;
 	char			*kbuf;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -472,8 +474,8 @@ xfs_attrlist_by_handle(
 		goto out_vn_rele;
 
 	cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
-	VOP_ATTR_LIST(vp, kbuf, al_hreq.buflen, al_hreq.flags,
-			cursor, NULL, error);
+	error = bhv_vop_attr_list(vp, kbuf, al_hreq.buflen, al_hreq.flags,
+					cursor, NULL);
 	if (error)
 		goto out_kfree;
 
@@ -490,7 +492,7 @@ xfs_attrlist_by_handle(
 
 STATIC int
 xfs_attrmulti_attr_get(
-	struct vnode		*vp,
+	bhv_vnode_t		*vp,
 	char			*name,
 	char			__user *ubuf,
 	__uint32_t		*len,
@@ -505,7 +507,7 @@ xfs_attrmulti_attr_get(
 	if (!kbuf)
 		return ENOMEM;
 
-	VOP_ATTR_GET(vp, name, kbuf, len, flags, NULL, error);
+	error = bhv_vop_attr_get(vp, name, kbuf, len, flags, NULL);
 	if (error)
 		goto out_kfree;
 
@@ -519,7 +521,7 @@ xfs_attrmulti_attr_get(
 
 STATIC int
 xfs_attrmulti_attr_set(
-	struct vnode		*vp,
+	bhv_vnode_t		*vp,
 	char			*name,
 	const char		__user *ubuf,
 	__uint32_t		len,
@@ -542,7 +544,7 @@ xfs_attrmulti_attr_set(
 	if (copy_from_user(kbuf, ubuf, len))
 		goto out_kfree;
 			
-	VOP_ATTR_SET(vp, name, kbuf, len, flags, NULL, error);
+	error = bhv_vop_attr_set(vp, name, kbuf, len, flags, NULL);
 
  out_kfree:
 	kfree(kbuf);
@@ -551,20 +553,15 @@ xfs_attrmulti_attr_set(
 
 STATIC int
 xfs_attrmulti_attr_remove(
-	struct vnode		*vp,
+	bhv_vnode_t		*vp,
 	char			*name,
 	__uint32_t		flags)
 {
-	int			error;
-
-
 	if (IS_RDONLY(&vp->v_inode))
 		return -EROFS;
 	if (IS_IMMUTABLE(&vp->v_inode) || IS_APPEND(&vp->v_inode))
 		return EPERM;
-
-	VOP_ATTR_REMOVE(vp, name, flags, NULL, error);
-	return error;
+	return bhv_vop_attr_remove(vp, name, flags, NULL);
 }
 
 STATIC int
@@ -578,7 +575,7 @@ xfs_attrmulti_by_handle(
 	xfs_attr_multiop_t	*ops;
 	xfs_fsop_attrmulti_handlereq_t am_hreq;
 	struct inode		*inode;
-	vnode_t			*vp;
+	bhv_vnode_t		*vp;
 	unsigned int		i, size;
 	char			*attr_name;
 
@@ -658,7 +655,7 @@ xfs_attrmulti_by_handle(
 STATIC int
 xfs_ioc_space(
 	bhv_desc_t		*bdp,
-	vnode_t			*vp,
+	bhv_vnode_t		*vp,
 	struct file		*filp,
 	int			flags,
 	unsigned int		cmd,
@@ -682,7 +679,7 @@ xfs_ioc_fsgeometry(
 
 STATIC int
 xfs_ioc_xattr(
-	vnode_t			*vp,
+	bhv_vnode_t		*vp,
 	xfs_inode_t		*ip,
 	struct file		*filp,
 	unsigned int		cmd,
@@ -711,7 +708,7 @@ xfs_ioctl(
 	void			__user *arg)
 {
 	int			error;
-	vnode_t			*vp;
+	bhv_vnode_t		*vp;
 	xfs_inode_t		*ip;
 	xfs_mount_t		*mp;
 
@@ -962,7 +959,7 @@ xfs_ioctl(
 STATIC int
 xfs_ioc_space(
 	bhv_desc_t		*bdp,
-	vnode_t			*vp,
+	bhv_vnode_t		*vp,
 	struct file		*filp,
 	int			ioflags,
 	unsigned int		cmd,
@@ -1153,7 +1150,7 @@ xfs_di2lxflags(
 
 STATIC int
 xfs_ioc_xattr(
-	vnode_t			*vp,
+	bhv_vnode_t		*vp,
 	xfs_inode_t		*ip,
 	struct file		*filp,
 	unsigned int		cmd,
@@ -1173,7 +1170,7 @@ xfs_ioc_xattr(
 	case XFS_IOC_FSGETXATTR: {
 		vattr->va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | \
 				 XFS_AT_NEXTENTS | XFS_AT_PROJID;
-		VOP_GETATTR(vp, vattr, 0, NULL, error);
+		error = bhv_vop_getattr(vp, vattr, 0, NULL);
 		if (unlikely(error)) {
 			error = -error;
 			break;
@@ -1206,7 +1203,7 @@ xfs_ioc_xattr(
 		vattr->va_extsize = fa.fsx_extsize;
 		vattr->va_projid  = fa.fsx_projid;
 
-		VOP_SETATTR(vp, vattr, attr_flags, NULL, error);
+		error = bhv_vop_setattr(vp, vattr, attr_flags, NULL);
 		if (likely(!error))
 			__vn_revalidate(vp, vattr);	/* update flags */
 		error = -error;
@@ -1216,7 +1213,7 @@ xfs_ioc_xattr(
 	case XFS_IOC_FSGETXATTRA: {
 		vattr->va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | \
 				 XFS_AT_ANEXTENTS | XFS_AT_PROJID;
-		VOP_GETATTR(vp, vattr, 0, NULL, error);
+		error = bhv_vop_getattr(vp, vattr, 0, NULL);
 		if (unlikely(error)) {
 			error = -error;
 			break;
@@ -1262,7 +1259,7 @@ xfs_ioc_xattr(
 		vattr->va_xflags = xfs_merge_ioc_xflags(flags,
 							xfs_ip2xflags(ip));
 
-		VOP_SETATTR(vp, vattr, attr_flags, NULL, error);
+		error = bhv_vop_setattr(vp, vattr, attr_flags, NULL);
 		if (likely(!error))
 			__vn_revalidate(vp, vattr);	/* update flags */
 		error = -error;
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 251bfe451a3..601f01c92f7 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -114,7 +114,7 @@ xfs_compat_ioctl(
 	unsigned long	arg)
 {
 	struct inode	*inode = file->f_dentry->d_inode;
-	vnode_t		*vp = vn_from_inode(inode);
+	bhv_vnode_t	*vp = vn_from_inode(inode);
 	int		error;
 
 	switch (cmd) {
@@ -193,7 +193,7 @@ xfs_compat_ioctl(
 		return -ENOIOCTLCMD;
 	}
 
-	VOP_IOCTL(vp, inode, file, mode, cmd, (void __user *)arg, error);
+	error = bhv_vop_ioctl(vp, inode, file, mode, cmd, (void __user *)arg);
 	VMODIFY(vp);
 
 	return error;
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 0da1d6b081e..0857658882e 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -61,7 +61,7 @@
  */
 xfs_inode_t *
 xfs_vtoi(
-	struct vnode	*vp)
+	bhv_vnode_t	*vp)
 {
 	bhv_desc_t      *bdp;
 
@@ -80,7 +80,7 @@ void
 xfs_synchronize_atime(
 	xfs_inode_t	*ip)
 {
-	vnode_t		*vp;
+	bhv_vnode_t	*vp;
 
 	vp = XFS_ITOV_NULL(ip);
 	if (vp) {
@@ -202,12 +202,8 @@ xfs_validate_fields(
 	struct inode	*ip,
 	struct vattr	*vattr)
 {
-	vnode_t		*vp = vn_from_inode(ip);
-	int		error;
-
 	vattr->va_mask = XFS_AT_NLINK|XFS_AT_SIZE|XFS_AT_NBLOCKS;
-	VOP_GETATTR(vp, vattr, ATTR_LAZY, NULL, error);
-  	if (likely(!error)) {
+	if (!bhv_vop_getattr(vn_from_inode(ip), vattr, ATTR_LAZY, NULL)) {
 		ip->i_nlink = vattr->va_nlink;
 		ip->i_blocks = vattr->va_nblocks;
 
@@ -225,7 +221,7 @@ xfs_validate_fields(
  */
 STATIC int
 xfs_init_security(
-	struct vnode	*vp,
+	bhv_vnode_t	*vp,
 	struct inode	*dir)
 {
 	struct inode	*ip = vn_to_inode(vp);
@@ -241,7 +237,7 @@ xfs_init_security(
 		return -error;
 	}
 
-	VOP_ATTR_SET(vp, name, value, length, ATTR_SECURE, NULL, error);
+	error = bhv_vop_attr_set(vp, name, value, length, ATTR_SECURE, NULL);
 	if (!error)
 		VMODIFY(vp);
 
@@ -264,13 +260,12 @@ xfs_has_fs_struct(struct task_struct *task)
 
 STATIC inline void
 xfs_cleanup_inode(
-	vnode_t		*dvp,
-	vnode_t		*vp,
+	bhv_vnode_t	*dvp,
+	bhv_vnode_t	*vp,
 	struct dentry	*dentry,
 	int		mode)
 {
 	struct dentry   teardown = {};
-	int             error;
 
 	/* Oh, the horror.
 	 * If we can't add the ACL or we fail in
@@ -281,9 +276,9 @@ xfs_cleanup_inode(
 	teardown.d_name = dentry->d_name;
 
 	if (S_ISDIR(mode))
-	  	VOP_RMDIR(dvp, &teardown, NULL, error);
+	  	bhv_vop_rmdir(dvp, &teardown, NULL);
 	else
-		VOP_REMOVE(dvp, &teardown, NULL, error);
+		bhv_vop_remove(dvp, &teardown, NULL);
 	VN_RELE(vp);
 }
 
@@ -296,7 +291,7 @@ xfs_vn_mknod(
 {
 	struct inode	*ip;
 	vattr_t		vattr = { 0 };
-	vnode_t		*vp = NULL, *dvp = vn_from_inode(dir);
+	bhv_vnode_t	*vp = NULL, *dvp = vn_from_inode(dir);
 	xfs_acl_t	*default_acl = NULL;
 	attrexists_t	test_default_acl = _ACL_DEFAULT_EXISTS;
 	int		error;
@@ -330,10 +325,10 @@ xfs_vn_mknod(
 		vattr.va_mask |= XFS_AT_RDEV;
 		/*FALLTHROUGH*/
 	case S_IFREG:
-		VOP_CREATE(dvp, dentry, &vattr, &vp, NULL, error);
+		error = bhv_vop_create(dvp, dentry, &vattr, &vp, NULL);
 		break;
 	case S_IFDIR:
-		VOP_MKDIR(dvp, dentry, &vattr, &vp, NULL, error);
+		error = bhv_vop_mkdir(dvp, dentry, &vattr, &vp, NULL);
 		break;
 	default:
 		error = EINVAL;
@@ -396,14 +391,14 @@ xfs_vn_lookup(
 	struct dentry	*dentry,
 	struct nameidata *nd)
 {
-	struct vnode	*vp = vn_from_inode(dir), *cvp;
+	bhv_vnode_t	*vp = vn_from_inode(dir), *cvp;
 	int		error;
 
 	if (dentry->d_name.len >= MAXNAMELEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	VOP_LOOKUP(vp, dentry, &cvp, 0, NULL, NULL, error);
-	if (error) {
+	error = bhv_vop_lookup(vp, dentry, &cvp, 0, NULL, NULL);
+	if (unlikely(error)) {
 		if (unlikely(error != ENOENT))
 			return ERR_PTR(-error);
 		d_add(dentry, NULL);
@@ -420,8 +415,8 @@ xfs_vn_link(
 	struct dentry	*dentry)
 {
 	struct inode	*ip;	/* inode of guy being linked to */
-	vnode_t		*tdvp;	/* target directory for new name/link */
-	vnode_t		*vp;	/* vp of name being linked */
+	bhv_vnode_t	*tdvp;	/* target directory for new name/link */
+	bhv_vnode_t	*vp;	/* vp of name being linked */
 	vattr_t		vattr;
 	int		error;
 
@@ -432,7 +427,7 @@ xfs_vn_link(
 	tdvp = vn_from_inode(dir);
 	vp = vn_from_inode(ip);
 
-	VOP_LINK(tdvp, vp, dentry, NULL, error);
+	error = bhv_vop_link(tdvp, vp, dentry, NULL);
 	if (likely(!error)) {
 		VMODIFY(tdvp);
 		VN_HOLD(vp);
@@ -448,14 +443,14 @@ xfs_vn_unlink(
 	struct dentry	*dentry)
 {
 	struct inode	*inode;
-	vnode_t		*dvp;	/* directory containing name to remove */
+	bhv_vnode_t	*dvp;	/* directory containing name to remove */
 	vattr_t		vattr;
 	int		error;
 
 	inode = dentry->d_inode;
 	dvp = vn_from_inode(dir);
 
-	VOP_REMOVE(dvp, dentry, NULL, error);
+	error = bhv_vop_remove(dvp, dentry, NULL);
 	if (likely(!error)) {
 		xfs_validate_fields(dir, &vattr);	/* size needs update */
 		xfs_validate_fields(inode, &vattr);
@@ -470,27 +465,26 @@ xfs_vn_symlink(
 	const char	*symname)
 {
 	struct inode	*ip;
-	vattr_t		vattr = { 0 };
-	vnode_t		*dvp;	/* directory containing name of symlink */
-	vnode_t		*cvp;	/* used to lookup symlink to put in dentry */
+	vattr_t		va = { 0 };
+	bhv_vnode_t	*dvp;	/* directory containing name of symlink */
+	bhv_vnode_t	*cvp;	/* used to lookup symlink to put in dentry */
 	int		error;
 
 	dvp = vn_from_inode(dir);
 	cvp = NULL;
 
-	vattr.va_mode = S_IFLNK |
+	va.va_mode = S_IFLNK |
 		(irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO);
-	vattr.va_mask = XFS_AT_TYPE|XFS_AT_MODE;
+	va.va_mask = XFS_AT_TYPE|XFS_AT_MODE;
 
-	error = 0;
-	VOP_SYMLINK(dvp, dentry, &vattr, (char *)symname, &cvp, NULL, error);
+	error = bhv_vop_symlink(dvp, dentry, &va, (char *)symname, &cvp, NULL);
 	if (likely(!error && cvp)) {
 		error = xfs_init_security(cvp, dir);
 		if (likely(!error)) {
 			ip = vn_to_inode(cvp);
 			d_instantiate(dentry, ip);
-			xfs_validate_fields(dir, &vattr);
-			xfs_validate_fields(ip, &vattr);
+			xfs_validate_fields(dir, &va);
+			xfs_validate_fields(ip, &va);
 		} else {
 			xfs_cleanup_inode(dvp, cvp, dentry, 0);
 		}
@@ -504,11 +498,11 @@ xfs_vn_rmdir(
 	struct dentry	*dentry)
 {
 	struct inode	*inode = dentry->d_inode;
-	vnode_t		*dvp = vn_from_inode(dir);
+	bhv_vnode_t	*dvp = vn_from_inode(dir);
 	vattr_t		vattr;
 	int		error;
 
-	VOP_RMDIR(dvp, dentry, NULL, error);
+	error = bhv_vop_rmdir(dvp, dentry, NULL);
 	if (likely(!error)) {
 		xfs_validate_fields(inode, &vattr);
 		xfs_validate_fields(dir, &vattr);
@@ -524,15 +518,15 @@ xfs_vn_rename(
 	struct dentry	*ndentry)
 {
 	struct inode	*new_inode = ndentry->d_inode;
-	vnode_t		*fvp;	/* from directory */
-	vnode_t		*tvp;	/* target directory */
+	bhv_vnode_t	*fvp;	/* from directory */
+	bhv_vnode_t	*tvp;	/* target directory */
 	vattr_t		vattr;
 	int		error;
 
 	fvp = vn_from_inode(odir);
 	tvp = vn_from_inode(ndir);
 
-	VOP_RENAME(fvp, odentry, tvp, ndentry, NULL, error);
+	error = bhv_vop_rename(fvp, odentry, tvp, ndentry, NULL);
 	if (likely(!error)) {
 		if (new_inode)
 			xfs_validate_fields(new_inode, &vattr);
@@ -553,7 +547,7 @@ xfs_vn_follow_link(
 	struct dentry		*dentry,
 	struct nameidata	*nd)
 {
-	vnode_t			*vp;
+	bhv_vnode_t		*vp;
 	uio_t			*uio;
 	iovec_t			iov;
 	int			error;
@@ -586,8 +580,8 @@ xfs_vn_follow_link(
 	uio->uio_resid = MAXPATHLEN;
 	uio->uio_iovcnt = 1;
 
-	VOP_READLINK(vp, uio, 0, NULL, error);
-	if (error) {
+	error = bhv_vop_readlink(vp, uio, 0, NULL);
+	if (unlikely(error)) {
 		kfree(link);
 		link = ERR_PTR(-error);
 	} else {
@@ -618,12 +612,7 @@ xfs_vn_permission(
 	int		mode,
 	struct nameidata *nd)
 {
-	vnode_t		*vp = vn_from_inode(inode);
-	int		error;
-
-	mode <<= 6;		/* convert from linux to vnode access bits */
-	VOP_ACCESS(vp, mode, NULL, error);
-	return -error;
+	return -bhv_vop_access(vn_from_inode(inode), mode << 6, NULL);
 }
 #else
 #define xfs_vn_permission NULL
@@ -636,7 +625,7 @@ xfs_vn_getattr(
 	struct kstat	*stat)
 {
 	struct inode	*inode = dentry->d_inode;
-	vnode_t		*vp = vn_from_inode(inode);
+	bhv_vnode_t	*vp = vn_from_inode(inode);
 	int		error = 0;
 
 	if (unlikely(vp->v_flag & VMODIFIED))
@@ -653,7 +642,7 @@ xfs_vn_setattr(
 {
 	struct inode	*inode = dentry->d_inode;
 	unsigned int	ia_valid = attr->ia_valid;
-	vnode_t		*vp = vn_from_inode(inode);
+	bhv_vnode_t	*vp = vn_from_inode(inode);
 	vattr_t		vattr = { 0 };
 	int		flags = 0;
 	int		error;
@@ -697,7 +686,7 @@ xfs_vn_setattr(
 		flags |= ATTR_NONBLOCK;
 #endif
 
-	VOP_SETATTR(vp, &vattr, flags, NULL, error);
+	error = bhv_vop_setattr(vp, &vattr, flags, NULL);
 	if (likely(!error))
 		__vn_revalidate(vp, &vattr);
 	return -error;
@@ -718,7 +707,7 @@ xfs_vn_setxattr(
 	size_t		size,
 	int		flags)
 {
-	vnode_t		*vp = vn_from_inode(dentry->d_inode);
+	bhv_vnode_t	*vp = vn_from_inode(dentry->d_inode);
 	char		*attr = (char *)name;
 	attrnames_t	*namesp;
 	int		xflags = 0;
@@ -748,7 +737,7 @@ xfs_vn_getxattr(
 	void		*data,
 	size_t		size)
 {
-	vnode_t		*vp = vn_from_inode(dentry->d_inode);
+	bhv_vnode_t	*vp = vn_from_inode(dentry->d_inode);
 	char		*attr = (char *)name;
 	attrnames_t	*namesp;
 	int		xflags = 0;
@@ -777,7 +766,7 @@ xfs_vn_listxattr(
 	char			*data,
 	size_t			size)
 {
-	vnode_t			*vp = vn_from_inode(dentry->d_inode);
+	bhv_vnode_t		*vp = vn_from_inode(dentry->d_inode);
 	int			error, xflags = ATTR_KERNAMELS;
 	ssize_t			result;
 
@@ -796,7 +785,7 @@ xfs_vn_removexattr(
 	struct dentry	*dentry,
 	const char	*name)
 {
-	vnode_t		*vp = vn_from_inode(dentry->d_inode);
+	bhv_vnode_t	*vp = vn_from_inode(dentry->d_inode);
 	char		*attr = (char *)name;
 	attrnames_t	*namesp;
 	int		xflags = 0;
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 6fbdca3eaa7..a9b83018f0f 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -206,7 +206,7 @@ xfs_read(
 	xfs_fsize_t		n;
 	xfs_inode_t		*ip;
 	xfs_mount_t		*mp;
-	vnode_t			*vp;
+	bhv_vnode_t		*vp;
 	unsigned long		seg;
 
 	ip = XFS_BHVTOI(bdp);
@@ -271,7 +271,7 @@ xfs_read(
 	}
 
 	if (unlikely((ioflags & IO_ISDIRECT) && VN_CACHED(vp)))
-		VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(*offset)),
+		bhv_vop_flushinval_pages(vp, ctooff(offtoct(*offset)),
 						-1, FI_REMAPF_LOCKED);
 
 	xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore,
@@ -499,7 +499,7 @@ xfs_zero_last_block(
 
 int					/* error (positive) */
 xfs_zero_eof(
-	vnode_t		*vp,
+	bhv_vnode_t	*vp,
 	xfs_iocore_t	*io,
 	xfs_off_t	offset,		/* starting I/O offset */
 	xfs_fsize_t	isize,		/* current inode size */
@@ -626,7 +626,7 @@ xfs_write(
 	ssize_t			ret = 0, error = 0;
 	xfs_fsize_t		isize, new_size;
 	xfs_iocore_t		*io;
-	vnode_t			*vp;
+	bhv_vnode_t		*vp;
 	unsigned long		seg;
 	int			iolock;
 	int			eventsent = 0;
@@ -803,7 +803,7 @@ retry:
 		if (need_flush) {
 			xfs_inval_cached_trace(io, pos, -1,
 					ctooff(offtoct(pos)), -1);
-			VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(pos)),
+			bhv_vop_flushinval_pages(vp, ctooff(offtoct(pos)),
 					-1, FI_REMAPF_LOCKED);
 		}
 
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
index dc49050688c..c77e62efb74 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.h
+++ b/fs/xfs/linux-2.6/xfs_lrw.h
@@ -18,8 +18,8 @@
 #ifndef __XFS_LRW_H__
 #define __XFS_LRW_H__
 
-struct vnode;
 struct bhv_desc;
+struct bhv_vnode;
 struct xfs_mount;
 struct xfs_iocore;
 struct xfs_inode;
@@ -82,7 +82,7 @@ extern int xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
 extern int xfs_bdstrat_cb(struct xfs_buf *);
 extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
 
-extern int xfs_zero_eof(struct vnode *, struct xfs_iocore *, xfs_off_t,
+extern int xfs_zero_eof(struct bhv_vnode *, struct xfs_iocore *, xfs_off_t,
 				xfs_fsize_t, xfs_fsize_t);
 extern ssize_t xfs_read(struct bhv_desc *, struct kiocb *,
 				const struct iovec *, unsigned int,
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 97dbcb68b25..b7aad3cfdfe 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -151,7 +151,7 @@ xfs_set_inodeops(
 STATIC __inline__ void
 xfs_revalidate_inode(
 	xfs_mount_t		*mp,
-	vnode_t			*vp,
+	bhv_vnode_t		*vp,
 	xfs_inode_t		*ip)
 {
 	struct inode		*inode = vn_to_inode(vp);
@@ -206,7 +206,7 @@ xfs_revalidate_inode(
 void
 xfs_initialize_vnode(
 	bhv_desc_t		*bdp,
-	vnode_t			*vp,
+	bhv_vnode_t		*vp,
 	bhv_desc_t		*inode_bhv,
 	int			unlock)
 {
@@ -336,7 +336,7 @@ STATIC struct inode *
 xfs_fs_alloc_inode(
 	struct super_block	*sb)
 {
-	vnode_t			*vp;
+	bhv_vnode_t		*vp;
 
 	vp = kmem_zone_alloc(xfs_vnode_zone, KM_SLEEP);
 	if (unlikely(!vp))
@@ -359,13 +359,13 @@ xfs_fs_inode_init_once(
 {
 	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
 		      SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(vn_to_inode((vnode_t *)vnode));
+		inode_init_once(vn_to_inode((bhv_vnode_t *)vnode));
 }
 
 STATIC int
 xfs_init_zones(void)
 {
-	xfs_vnode_zone = kmem_zone_init_flags(sizeof(vnode_t), "xfs_vnode_t",
+	xfs_vnode_zone = kmem_zone_init_flags(sizeof(bhv_vnode_t), "xfs_vnode",
 					KM_ZONE_HWALIGN | KM_ZONE_RECLAIM |
 					KM_ZONE_SPREAD,
 					xfs_fs_inode_init_once);
@@ -409,22 +409,17 @@ xfs_fs_write_inode(
 	struct inode		*inode,
 	int			sync)
 {
-	vnode_t			*vp = vn_from_inode(inode);
+	bhv_vnode_t		*vp = vn_from_inode(inode);
 	int			error = 0, flags = FLUSH_INODE;
 
 	if (vp) {
 		vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
 		if (sync)
 			flags |= FLUSH_SYNC;
-		VOP_IFLUSH(vp, flags, error);
-		if (error == EAGAIN) {
-			if (sync)
-				VOP_IFLUSH(vp, flags | FLUSH_LOG, error);
-			else
-				error = 0;
-		}
+		error = bhv_vop_iflush(vp, flags);
+		if (error == EAGAIN)
+			error = sync? bhv_vop_iflush(vp, flags | FLUSH_LOG) : 0;
 	}
-
 	return -error;
 }
 
@@ -432,8 +427,7 @@ STATIC void
 xfs_fs_clear_inode(
 	struct inode		*inode)
 {
-	vnode_t			*vp = vn_from_inode(inode);
-	int			error, cache;
+	bhv_vnode_t		*vp = vn_from_inode(inode);
 
 	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
 
@@ -446,20 +440,18 @@ xfs_fs_clear_inode(
 	 * This can happen because xfs_iget_core calls xfs_idestroy if we
 	 * find an inode with di_mode == 0 but without IGET_CREATE set.
 	 */
-	if (vp->v_fbhv)
-		VOP_INACTIVE(vp, NULL, cache);
+	if (VNHEAD(vp))
+		bhv_vop_inactive(vp, NULL);
 
 	VN_LOCK(vp);
 	vp->v_flag &= ~VMODIFIED;
 	VN_UNLOCK(vp, 0);
 
-	if (vp->v_fbhv) {
-		VOP_RECLAIM(vp, error);
-		if (error)
-			panic("vn_purge: cannot reclaim");
-	}
+	if (VNHEAD(vp))
+		if (bhv_vop_reclaim(vp))
+			panic("%s: cannot reclaim 0x%p\n", __FUNCTION__, vp);
 
-	ASSERT(vp->v_fbhv == NULL);
+	ASSERT(VNHEAD(vp) == NULL);
 
 #ifdef XFS_VNODE_TRACE
 	ktrace_free(vp->v_trace);
@@ -789,7 +781,7 @@ xfs_fs_fill_super(
 	void			*data,
 	int			silent)
 {
-	vnode_t			*rootvp;
+	struct bhv_vnode	*rootvp;
 	struct bhv_vfs		*vfsp = vfs_allocate(sb);
 	struct xfs_mount_args	*args = xfs_args_allocate(sb, silent);
 	struct kstatfs		statvfs;
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 376b96cb513..33dd1ca1324 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -105,7 +105,7 @@ struct block_device;
 
 extern __uint64_t xfs_max_file_offset(unsigned int);
 
-extern void xfs_initialize_vnode(bhv_desc_t *, vnode_t *, bhv_desc_t *, int);
+extern void xfs_initialize_vnode(bhv_desc_t *, bhv_vnode_t *, bhv_desc_t *, int);
 
 extern void xfs_flush_inode(struct xfs_inode *);
 extern void xfs_flush_device(struct xfs_inode *);
diff --git a/fs/xfs/linux-2.6/xfs_vfs.c b/fs/xfs/linux-2.6/xfs_vfs.c
index 9f9bc894deb..a91ecfa9c8a 100644
--- a/fs/xfs/linux-2.6/xfs_vfs.c
+++ b/fs/xfs/linux-2.6/xfs_vfs.c
@@ -104,7 +104,7 @@ vfs_mntupdate(
 int
 vfs_root(
 	struct bhv_desc		*bdp,
-	struct vnode		**vpp)
+	struct bhv_vnode	**vpp)
 {
 	struct bhv_desc		*next = bdp;
 
@@ -118,7 +118,7 @@ int
 vfs_statvfs(
 	struct bhv_desc		*bdp,
 	xfs_statfs_t		*sp,
-	struct vnode		*vp)
+	struct bhv_vnode	*vp)
 {
 	struct bhv_desc		*next = bdp;
 
@@ -145,7 +145,7 @@ vfs_sync(
 int
 vfs_vget(
 	struct bhv_desc		*bdp,
-	struct vnode		**vpp,
+	struct bhv_vnode	**vpp,
 	struct fid		*fidp)
 {
 	struct bhv_desc		*next = bdp;
@@ -187,7 +187,7 @@ vfs_quotactl(
 void
 vfs_init_vnode(
 	struct bhv_desc		*bdp,
-	struct vnode		*vp,
+	struct bhv_vnode	*vp,
 	struct bhv_desc		*bp,
 	int			unlock)
 {
diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h
index 55f0afd3e79..1a3a2dd4b97 100644
--- a/fs/xfs/linux-2.6/xfs_vfs.h
+++ b/fs/xfs/linux-2.6/xfs_vfs.h
@@ -22,9 +22,9 @@
 #include "xfs_fs.h"
 
 struct bhv_vfs;
+struct bhv_vnode;
 struct fid;
 struct cred;
-struct vnode;
 struct statfs;
 struct seq_file;
 struct super_block;
@@ -53,8 +53,6 @@ typedef struct bhv_vfs {
 	wait_queue_head_t	vfs_wait_single_sync_task;
 } bhv_vfs_t;
 
-#define vfs_fbhv		vfs_bh.bh_first	/* 1st on vfs behavior chain */
-
 #define bhvtovfs(bdp)		( (struct bhv_vfs *)BHV_VOBJ(bdp) )
 #define bhvtovfsops(bdp)	( (struct bhv_vfsops *)BHV_OPS(bdp) )
 #define VFS_BHVHEAD(vfs)	( &(vfs)->vfs_bh )
@@ -110,14 +108,15 @@ typedef	int	(*vfs_showargs_t)(bhv_desc_t *, struct seq_file *);
 typedef int	(*vfs_unmount_t)(bhv_desc_t *, int, struct cred *);
 typedef int	(*vfs_mntupdate_t)(bhv_desc_t *, int *,
 				struct xfs_mount_args *);
-typedef int	(*vfs_root_t)(bhv_desc_t *, struct vnode **);
-typedef int	(*vfs_statvfs_t)(bhv_desc_t *, xfs_statfs_t *, struct vnode *);
+typedef int	(*vfs_root_t)(bhv_desc_t *, struct bhv_vnode **);
+typedef int	(*vfs_statvfs_t)(bhv_desc_t *, xfs_statfs_t *,
+				struct bhv_vnode *);
 typedef int	(*vfs_sync_t)(bhv_desc_t *, int, struct cred *);
-typedef int	(*vfs_vget_t)(bhv_desc_t *, struct vnode **, struct fid *);
+typedef int	(*vfs_vget_t)(bhv_desc_t *, struct bhv_vnode **, struct fid *);
 typedef int	(*vfs_dmapiops_t)(bhv_desc_t *, caddr_t);
 typedef int	(*vfs_quotactl_t)(bhv_desc_t *, int, int, caddr_t);
 typedef void	(*vfs_init_vnode_t)(bhv_desc_t *,
-				struct vnode *, bhv_desc_t *, int);
+				struct bhv_vnode *, bhv_desc_t *, int);
 typedef void	(*vfs_force_shutdown_t)(bhv_desc_t *, int, char *, int);
 typedef void	(*vfs_freeze_t)(bhv_desc_t *);
 
@@ -140,26 +139,26 @@ typedef struct bhv_vfsops {
 } bhv_vfsops_t;
 
 /*
- * VFS's.  Operates on vfs structure pointers (starts at bhv head).
+ * Virtual filesystem operations, operating from head bhv.
  */
-#define VHEAD(v)			((v)->vfs_fbhv)
-#define bhv_vfs_mount(v, ma,cr)		vfs_mount(VHEAD(v), ma,cr)
-#define bhv_vfs_parseargs(v, o,ma,f)	vfs_parseargs(VHEAD(v), o,ma,f)
-#define bhv_vfs_showargs(v, m)		vfs_showargs(VHEAD(v), m)
-#define bhv_vfs_unmount(v, f,cr)	vfs_unmount(VHEAD(v), f,cr)
-#define bhv_vfs_mntupdate(v, fl,args)	vfs_mntupdate(VHEAD(v), fl,args)
-#define bhv_vfs_root(v, vpp)		vfs_root(VHEAD(v), vpp)
-#define bhv_vfs_statvfs(v, sp,vp)	vfs_statvfs(VHEAD(v), sp,vp)
-#define bhv_vfs_sync(v, flag,cr)	vfs_sync(VHEAD(v), flag,cr)
-#define bhv_vfs_vget(v, vpp,fidp)	vfs_vget(VHEAD(v), vpp,fidp)
-#define bhv_vfs_dmapiops(v, p)		vfs_dmapiops(VHEAD(v), p)
-#define bhv_vfs_quotactl(v, c,id,p)	vfs_quotactl(VHEAD(v), c,id,p)
-#define bhv_vfs_init_vnode(v, vp,b,ul)	vfs_init_vnode(VHEAD(v), vp,b,ul)
-#define bhv_vfs_force_shutdown(v,u,f,l)	vfs_force_shutdown(VHEAD(v), u,f,l)
-#define bhv_vfs_freeze(v)		vfs_freeze(VHEAD(v))
+#define VFSHEAD(v)			((v)->vfs_bh.bh_first)
+#define bhv_vfs_mount(v, ma,cr)		vfs_mount(VFSHEAD(v), ma,cr)
+#define bhv_vfs_parseargs(v, o,ma,f)	vfs_parseargs(VFSHEAD(v), o,ma,f)
+#define bhv_vfs_showargs(v, m)		vfs_showargs(VFSHEAD(v), m)
+#define bhv_vfs_unmount(v, f,cr)	vfs_unmount(VFSHEAD(v), f,cr)
+#define bhv_vfs_mntupdate(v, fl,args)	vfs_mntupdate(VFSHEAD(v), fl,args)
+#define bhv_vfs_root(v, vpp)		vfs_root(VFSHEAD(v), vpp)
+#define bhv_vfs_statvfs(v, sp,vp)	vfs_statvfs(VFSHEAD(v), sp,vp)
+#define bhv_vfs_sync(v, flag,cr)	vfs_sync(VFSHEAD(v), flag,cr)
+#define bhv_vfs_vget(v, vpp,fidp)	vfs_vget(VFSHEAD(v), vpp,fidp)
+#define bhv_vfs_dmapiops(v, p)		vfs_dmapiops(VFSHEAD(v), p)
+#define bhv_vfs_quotactl(v, c,id,p)	vfs_quotactl(VFSHEAD(v), c,id,p)
+#define bhv_vfs_init_vnode(v, vp,b,ul)	vfs_init_vnode(VFSHEAD(v), vp,b,ul)
+#define bhv_vfs_force_shutdown(v,u,f,l)	vfs_force_shutdown(VFSHEAD(v), u,f,l)
+#define bhv_vfs_freeze(v)		vfs_freeze(VFSHEAD(v))
 
 /*
- * PVFS's.  Operates on behavior descriptor pointers.
+ * Virtual filesystem operations, operating from next bhv.
  */
 #define bhv_next_vfs_mount(b, ma,cr)		vfs_mount(b, ma,cr)
 #define bhv_next_vfs_parseargs(b, o,ma,f)	vfs_parseargs(b, o,ma,f)
@@ -181,13 +180,13 @@ extern int vfs_parseargs(bhv_desc_t *, char *, struct xfs_mount_args *, int);
 extern int vfs_showargs(bhv_desc_t *, struct seq_file *);
 extern int vfs_unmount(bhv_desc_t *, int, struct cred *);
 extern int vfs_mntupdate(bhv_desc_t *, int *, struct xfs_mount_args *);
-extern int vfs_root(bhv_desc_t *, struct vnode **);
-extern int vfs_statvfs(bhv_desc_t *, xfs_statfs_t *, struct vnode *);
+extern int vfs_root(bhv_desc_t *, struct bhv_vnode **);
+extern int vfs_statvfs(bhv_desc_t *, xfs_statfs_t *, struct bhv_vnode *);
 extern int vfs_sync(bhv_desc_t *, int, struct cred *);
-extern int vfs_vget(bhv_desc_t *, struct vnode **, struct fid *);
+extern int vfs_vget(bhv_desc_t *, struct bhv_vnode **, struct fid *);
 extern int vfs_dmapiops(bhv_desc_t *, caddr_t);
 extern int vfs_quotactl(bhv_desc_t *, int, int, caddr_t);
-extern void vfs_init_vnode(bhv_desc_t *, struct vnode *, bhv_desc_t *, int);
+extern void vfs_init_vnode(bhv_desc_t *, struct bhv_vnode *, bhv_desc_t *, int);
 extern void vfs_force_shutdown(bhv_desc_t *, int, char *, int);
 extern void vfs_freeze(bhv_desc_t *);
 
diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c
index 82777f1a70a..66e36e195f2 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.c
+++ b/fs/xfs/linux-2.6/xfs_vnode.c
@@ -39,7 +39,7 @@ vn_init(void)
 
 void
 vn_iowait(
-	struct vnode	*vp)
+	bhv_vnode_t	*vp)
 {
 	wait_queue_head_t *wq = vptosync(vp);
 
@@ -48,7 +48,7 @@ vn_iowait(
 
 void
 vn_iowake(
-	struct vnode	*vp)
+	bhv_vnode_t	*vp)
 {
 	if (atomic_dec_and_test(&vp->v_iocount))
 		wake_up(vptosync(vp));
@@ -61,7 +61,7 @@ vn_iowake(
  */
 void
 vn_ioerror(
-	struct vnode	*vp,
+	bhv_vnode_t	*vp,
 	int		error,
 	char		*f,
 	int		l)
@@ -70,11 +70,11 @@ vn_ioerror(
 		bhv_vfs_force_shutdown(vp->v_vfsp, SHUTDOWN_DEVICE_REQ, f, l);
 }
 
-struct vnode *
+bhv_vnode_t *
 vn_initialize(
 	struct inode	*inode)
 {
-	struct vnode	*vp = vn_from_inode(inode);
+	bhv_vnode_t	*vp = vn_from_inode(inode);
 
 	XFS_STATS_INC(vn_active);
 	XFS_STATS_INC(vn_alloc);
@@ -110,7 +110,7 @@ vn_initialize(
  */
 void
 vn_revalidate_core(
-	struct vnode	*vp,
+	bhv_vnode_t	*vp,
 	vattr_t		*vap)
 {
 	struct inode	*inode = vn_to_inode(vp);
@@ -146,14 +146,14 @@ vn_revalidate_core(
  */
 int
 __vn_revalidate(
-	struct vnode	*vp,
+	bhv_vnode_t	*vp,
 	struct vattr	*vattr)
 {
 	int		error;
 
 	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
 	vattr->va_mask = XFS_AT_STAT | XFS_AT_XFLAGS;
-	VOP_GETATTR(vp, vattr, 0, NULL, error);
+	error = bhv_vop_getattr(vp, vattr, 0, NULL);
 	if (likely(!error)) {
 		vn_revalidate_core(vp, vattr);
 		VUNMODIFY(vp);
@@ -163,7 +163,7 @@ __vn_revalidate(
 
 int
 vn_revalidate(
-	struct vnode	*vp)
+	bhv_vnode_t	*vp)
 {
 	vattr_t		vattr;
 
@@ -173,9 +173,9 @@ vn_revalidate(
 /*
  * Add a reference to a referenced vnode.
  */
-struct vnode *
+bhv_vnode_t *
 vn_hold(
-	struct vnode	*vp)
+	bhv_vnode_t	*vp)
 {
 	struct inode	*inode;
 
@@ -208,31 +208,31 @@ vn_hold(
  * Vnode tracing code.
  */
 void
-vn_trace_entry(vnode_t *vp, const char *func, inst_t *ra)
+vn_trace_entry(bhv_vnode_t *vp, const char *func, inst_t *ra)
 {
 	KTRACE_ENTER(vp, VNODE_KTRACE_ENTRY, func, 0, ra);
 }
 
 void
-vn_trace_exit(vnode_t *vp, const char *func, inst_t *ra)
+vn_trace_exit(bhv_vnode_t *vp, const char *func, inst_t *ra)
 {
 	KTRACE_ENTER(vp, VNODE_KTRACE_EXIT, func, 0, ra);
 }
 
 void
-vn_trace_hold(vnode_t *vp, char *file, int line, inst_t *ra)
+vn_trace_hold(bhv_vnode_t *vp, char *file, int line, inst_t *ra)
 {
 	KTRACE_ENTER(vp, VNODE_KTRACE_HOLD, file, line, ra);
 }
 
 void
-vn_trace_ref(vnode_t *vp, char *file, int line, inst_t *ra)
+vn_trace_ref(bhv_vnode_t *vp, char *file, int line, inst_t *ra)
 {
 	KTRACE_ENTER(vp, VNODE_KTRACE_REF, file, line, ra);
 }
 
 void
-vn_trace_rele(vnode_t *vp, char *file, int line, inst_t *ra)
+vn_trace_rele(bhv_vnode_t *vp, char *file, int line, inst_t *ra)
 {
 	KTRACE_ENTER(vp, VNODE_KTRACE_RELE, file, line, ra);
 }
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index 95343637ab6..cb16774aea7 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -14,33 +14,6 @@
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write the Free Software Foundation,
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
- * Portions Copyright (c) 1989, 1993
- *	The Regents of the University of California.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
  */
 #ifndef __XFS_VNODE_H__
 #define __XFS_VNODE_H__
@@ -51,7 +24,6 @@ struct vattr;
 struct xfs_iomap;
 struct attrlist_cursor_kern;
 
-
 typedef xfs_ino_t vnumber_t;
 typedef struct dentry vname_t;
 typedef bhv_head_t vn_bhv_head_t;
@@ -66,7 +38,7 @@ typedef enum vflags {
  * MP locking protocols:
  *	v_flag, v_vfsp				VN_LOCK/VN_UNLOCK
  */
-typedef struct vnode {
+typedef struct bhv_vnode {
 	vflags_t	v_flag;			/* vnode flags (see above) */
 	struct bhv_vfs	*v_vfsp;		/* ptr to containing VFS */
 	vnumber_t	v_number;		/* in-core vnode number */
@@ -78,7 +50,7 @@ typedef struct vnode {
 #endif
 	struct inode	v_inode;		/* Linux inode */
 	/* inode MUST be last */
-} vnode_t;
+} bhv_vnode_t;
 
 #define VN_ISLNK(vp)	S_ISLNK((vp)->v_inode.i_mode)
 #define VN_ISREG(vp)	S_ISREG((vp)->v_inode.i_mode)
@@ -86,9 +58,6 @@ typedef struct vnode {
 #define VN_ISCHR(vp)	S_ISCHR((vp)->v_inode.i_mode)
 #define VN_ISBLK(vp)	S_ISBLK((vp)->v_inode.i_mode)
 
-#define v_fbhv			v_bh.bh_first	       /* first behavior */
-#define v_fops			v_bh.bh_first->bd_ops  /* first behavior ops */
-
 #define VNODE_POSITION_BASE	BHV_POSITION_BASE	/* chain bottom */
 #define VNODE_POSITION_TOP	BHV_POSITION_TOP	/* chain top */
 #define VNODE_POSITION_INVALID	BHV_POSITION_INVALID	/* invalid pos. num */
@@ -110,8 +79,8 @@ typedef enum {
 /*
  * Macros for dealing with the behavior descriptor inside of the vnode.
  */
-#define BHV_TO_VNODE(bdp)	((vnode_t *)BHV_VOBJ(bdp))
-#define BHV_TO_VNODE_NULL(bdp)	((vnode_t *)BHV_VOBJNULL(bdp))
+#define BHV_TO_VNODE(bdp)	((bhv_vnode_t *)BHV_VOBJ(bdp))
+#define BHV_TO_VNODE_NULL(bdp)	((bhv_vnode_t *)BHV_VOBJNULL(bdp))
 
 #define VN_BHV_HEAD(vp)			((bhv_head_t *)(&((vp)->v_bh)))
 #define vn_bhv_head_init(bhp,name)	bhv_head_init(bhp,name)
@@ -122,17 +91,17 @@ typedef enum {
 /*
  * Vnode to Linux inode mapping.
  */
-static inline struct vnode *vn_from_inode(struct inode *inode)
+static inline struct bhv_vnode *vn_from_inode(struct inode *inode)
 {
-	return (vnode_t *)list_entry(inode, vnode_t, v_inode);
+	return (bhv_vnode_t *)list_entry(inode, bhv_vnode_t, v_inode);
 }
-static inline struct inode *vn_to_inode(struct vnode *vnode)
+static inline struct inode *vn_to_inode(struct bhv_vnode *vnode)
 {
 	return &vnode->v_inode;
 }
 
 /*
- * Values for the VOP_RWLOCK and VOP_RWUNLOCK flags parameter.
+ * Values for the vop_rwlock/rwunlock flags parameter.
  */
 typedef enum vrwlock {
 	VRWLOCK_NONE,
@@ -144,7 +113,7 @@ typedef enum vrwlock {
 } vrwlock_t;
 
 /*
- * Return values for VOP_INACTIVE.  A return value of
+ * Return values for bhv_vop_inactive.  A return value of
  * VN_INACTIVE_NOCACHE implies that the file system behavior
  * has disassociated its state and bhv_desc_t from the vnode.
  */
@@ -152,7 +121,7 @@ typedef enum vrwlock {
 #define	VN_INACTIVE_NOCACHE	1
 
 /*
- * Values for the cmd code given to VOP_VNODE_CHANGE.
+ * Values for the cmd code given to vop_vnode_change.
  */
 typedef enum vchange {
 	VCHANGE_FLAGS_FRLOCKS		= 0,
@@ -188,22 +157,22 @@ typedef int	(*vop_getattr_t)(bhv_desc_t *, struct vattr *, int,
 typedef int	(*vop_setattr_t)(bhv_desc_t *, struct vattr *, int,
 				struct cred *);
 typedef int	(*vop_access_t)(bhv_desc_t *, int, struct cred *);
-typedef int	(*vop_lookup_t)(bhv_desc_t *, vname_t *, vnode_t **,
-				int, vnode_t *, struct cred *);
+typedef int	(*vop_lookup_t)(bhv_desc_t *, vname_t *, bhv_vnode_t **,
+				int, bhv_vnode_t *, struct cred *);
 typedef int	(*vop_create_t)(bhv_desc_t *, vname_t *, struct vattr *,
-				vnode_t **, struct cred *);
+				bhv_vnode_t **, struct cred *);
 typedef int	(*vop_remove_t)(bhv_desc_t *, vname_t *, struct cred *);
-typedef int	(*vop_link_t)(bhv_desc_t *, vnode_t *, vname_t *,
-				struct cred *);
-typedef int	(*vop_rename_t)(bhv_desc_t *, vname_t *, vnode_t *, vname_t *,
+typedef int	(*vop_link_t)(bhv_desc_t *, bhv_vnode_t *, vname_t *,
 				struct cred *);
+typedef int	(*vop_rename_t)(bhv_desc_t *, vname_t *, bhv_vnode_t *,
+				vname_t *, struct cred *);
 typedef int	(*vop_mkdir_t)(bhv_desc_t *, vname_t *, struct vattr *,
-				vnode_t **, struct cred *);
+				bhv_vnode_t **, struct cred *);
 typedef int	(*vop_rmdir_t)(bhv_desc_t *, vname_t *, struct cred *);
 typedef int	(*vop_readdir_t)(bhv_desc_t *, struct uio *, struct cred *,
 				int *);
 typedef int	(*vop_symlink_t)(bhv_desc_t *, vname_t *, struct vattr *,
-				char *, vnode_t **, struct cred *);
+				char *, bhv_vnode_t **, struct cred *);
 typedef int	(*vop_readlink_t)(bhv_desc_t *, struct uio *, int,
 				struct cred *);
 typedef int	(*vop_fsync_t)(bhv_desc_t *, int, struct cred *,
@@ -224,7 +193,7 @@ typedef	int	(*vop_attr_remove_t)(bhv_desc_t *, const char *,
 				int, struct cred *);
 typedef	int	(*vop_attr_list_t)(bhv_desc_t *, char *, int, int,
 				struct attrlist_cursor_kern *, struct cred *);
-typedef void	(*vop_link_removed_t)(bhv_desc_t *, vnode_t *, int);
+typedef void	(*vop_link_removed_t)(bhv_desc_t *, bhv_vnode_t *, int);
 typedef void	(*vop_vnode_change_t)(bhv_desc_t *, vchange_t, __psint_t);
 typedef void	(*vop_ptossvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
 typedef void	(*vop_pflushinvalvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
@@ -233,7 +202,7 @@ typedef int	(*vop_pflushvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t,
 typedef int	(*vop_iflush_t)(bhv_desc_t *, int);
 
 
-typedef struct vnodeops {
+typedef struct bhv_vnodeops {
 	bhv_position_t  vn_position;    /* position within behavior chain */
 	vop_open_t		vop_open;
 	vop_close_t		vop_close;
@@ -274,105 +243,80 @@ typedef struct vnodeops {
 	vop_pflushvp_t		vop_flush_pages;
 	vop_release_t		vop_release;
 	vop_iflush_t		vop_iflush;
-} vnodeops_t;
+} bhv_vnodeops_t;
 
 /*
- * VOP's.
- */
-#define _VOP_(op, vp)	(*((vnodeops_t *)(vp)->v_fops)->op)
-
-#define VOP_OPEN(vp, cr, rv)						\
-	rv = _VOP_(vop_open, vp)((vp)->v_fbhv, cr)
-#define VOP_CLOSE(vp, f, last, cr, rv)					\
-	rv = _VOP_(vop_close, vp)((vp)->v_fbhv, f, last, cr)
-#define VOP_READ(vp,file,iov,segs,offset,ioflags,cr,rv)			\
-	rv = _VOP_(vop_read, vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr)
-#define VOP_WRITE(vp,file,iov,segs,offset,ioflags,cr,rv)		\
-	rv = _VOP_(vop_write, vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr)
-#define VOP_SENDFILE(vp,f,off,ioflags,cnt,act,targ,cr,rv)		\
-	rv = _VOP_(vop_sendfile, vp)((vp)->v_fbhv,f,off,ioflags,cnt,act,targ,cr)
-#define VOP_SPLICE_READ(vp,f,o,pipe,cnt,fl,iofl,cr,rv)			\
-	rv = _VOP_(vop_splice_read, vp)((vp)->v_fbhv,f,o,pipe,cnt,fl,iofl,cr)
-#define VOP_SPLICE_WRITE(vp,f,o,pipe,cnt,fl,iofl,cr,rv)			\
-	rv = _VOP_(vop_splice_write, vp)((vp)->v_fbhv,f,o,pipe,cnt,fl,iofl,cr)
-#define VOP_BMAP(vp,of,sz,rw,b,n,rv)					\
-	rv = _VOP_(vop_bmap, vp)((vp)->v_fbhv,of,sz,rw,b,n)
-#define VOP_GETATTR(vp, vap, f, cr, rv)					\
-	rv = _VOP_(vop_getattr, vp)((vp)->v_fbhv, vap, f, cr)
-#define	VOP_SETATTR(vp, vap, f, cr, rv)					\
-	rv = _VOP_(vop_setattr, vp)((vp)->v_fbhv, vap, f, cr)
-#define	VOP_ACCESS(vp, mode, cr, rv)					\
-	rv = _VOP_(vop_access, vp)((vp)->v_fbhv, mode, cr)
-#define	VOP_LOOKUP(vp,d,vpp,f,rdir,cr,rv)				\
-	rv = _VOP_(vop_lookup, vp)((vp)->v_fbhv,d,vpp,f,rdir,cr)
-#define VOP_CREATE(dvp,d,vap,vpp,cr,rv)					\
-	rv = _VOP_(vop_create, dvp)((dvp)->v_fbhv,d,vap,vpp,cr)
-#define VOP_REMOVE(dvp,d,cr,rv)						\
-	rv = _VOP_(vop_remove, dvp)((dvp)->v_fbhv,d,cr)
-#define	VOP_LINK(tdvp,fvp,d,cr,rv)					\
-	rv = _VOP_(vop_link, tdvp)((tdvp)->v_fbhv,fvp,d,cr)
-#define	VOP_RENAME(fvp,fnm,tdvp,tnm,cr,rv)				\
-	rv = _VOP_(vop_rename, fvp)((fvp)->v_fbhv,fnm,tdvp,tnm,cr)
-#define	VOP_MKDIR(dp,d,vap,vpp,cr,rv)					\
-	rv = _VOP_(vop_mkdir, dp)((dp)->v_fbhv,d,vap,vpp,cr)
-#define	VOP_RMDIR(dp,d,cr,rv)	 					\
-	rv = _VOP_(vop_rmdir, dp)((dp)->v_fbhv,d,cr)
-#define	VOP_READDIR(vp,uiop,cr,eofp,rv)					\
-	rv = _VOP_(vop_readdir, vp)((vp)->v_fbhv,uiop,cr,eofp)
-#define	VOP_SYMLINK(dvp,d,vap,tnm,vpp,cr,rv)				\
-	rv = _VOP_(vop_symlink, dvp) ((dvp)->v_fbhv,d,vap,tnm,vpp,cr)
-#define	VOP_READLINK(vp,uiop,fl,cr,rv)					\
-	rv = _VOP_(vop_readlink, vp)((vp)->v_fbhv,uiop,fl,cr)
-#define	VOP_FSYNC(vp,f,cr,b,e,rv)					\
-	rv = _VOP_(vop_fsync, vp)((vp)->v_fbhv,f,cr,b,e)
-#define VOP_INACTIVE(vp, cr, rv)					\
-	rv = _VOP_(vop_inactive, vp)((vp)->v_fbhv, cr)
-#define VOP_RELEASE(vp, rv)						\
-	rv = _VOP_(vop_release, vp)((vp)->v_fbhv)
-#define VOP_FID2(vp, fidp, rv)						\
-	rv = _VOP_(vop_fid2, vp)((vp)->v_fbhv, fidp)
-#define VOP_RWLOCK(vp,i)						\
-	(void)_VOP_(vop_rwlock, vp)((vp)->v_fbhv, i)
-#define VOP_RWLOCK_TRY(vp,i)						\
-	_VOP_(vop_rwlock, vp)((vp)->v_fbhv, i)
-#define VOP_RWUNLOCK(vp,i)						\
-	(void)_VOP_(vop_rwunlock, vp)((vp)->v_fbhv, i)
-#define VOP_FRLOCK(vp,c,fl,flags,offset,fr,rv)				\
-	rv = _VOP_(vop_frlock, vp)((vp)->v_fbhv,c,fl,flags,offset,fr)
-#define VOP_RECLAIM(vp, rv)						\
-	rv = _VOP_(vop_reclaim, vp)((vp)->v_fbhv)
-#define VOP_ATTR_GET(vp, name, val, vallenp, fl, cred, rv)		\
-	rv = _VOP_(vop_attr_get, vp)((vp)->v_fbhv,name,val,vallenp,fl,cred)
-#define	VOP_ATTR_SET(vp, name, val, vallen, fl, cred, rv)		\
-	rv = _VOP_(vop_attr_set, vp)((vp)->v_fbhv,name,val,vallen,fl,cred)
-#define	VOP_ATTR_REMOVE(vp, name, flags, cred, rv)			\
-	rv = _VOP_(vop_attr_remove, vp)((vp)->v_fbhv,name,flags,cred)
-#define	VOP_ATTR_LIST(vp, buf, buflen, fl, cursor, cred, rv)		\
-	rv = _VOP_(vop_attr_list, vp)((vp)->v_fbhv,buf,buflen,fl,cursor,cred)
-#define VOP_LINK_REMOVED(vp, dvp, linkzero)				\
-	(void)_VOP_(vop_link_removed, vp)((vp)->v_fbhv, dvp, linkzero)
-#define VOP_VNODE_CHANGE(vp, cmd, val)					\
-	(void)_VOP_(vop_vnode_change, vp)((vp)->v_fbhv,cmd,val)
-/*
- * These are page cache functions that now go thru VOPs.
- * 'last' parameter is unused and left in for IRIX compatibility
- */
-#define VOP_TOSS_PAGES(vp, first, last, fiopt)				\
-	_VOP_(vop_tosspages, vp)((vp)->v_fbhv,first, last, fiopt)
-/*
- * 'last' parameter is unused and left in for IRIX compatibility
- */
-#define VOP_FLUSHINVAL_PAGES(vp, first, last, fiopt)			\
-	_VOP_(vop_flushinval_pages, vp)((vp)->v_fbhv,first,last,fiopt)
-/*
- * 'last' parameter is unused and left in for IRIX compatibility
+ * Virtual node operations, operating from head bhv.
  */
-#define VOP_FLUSH_PAGES(vp, first, last, flags, fiopt, rv)		\
-	rv = _VOP_(vop_flush_pages, vp)((vp)->v_fbhv,first,last,flags,fiopt)
-#define VOP_IOCTL(vp, inode, filp, fl, cmd, arg, rv)			\
-	rv = _VOP_(vop_ioctl, vp)((vp)->v_fbhv,inode,filp,fl,cmd,arg)
-#define VOP_IFLUSH(vp, flags, rv)					\
-	rv = _VOP_(vop_iflush, vp)((vp)->v_fbhv, flags)
+#define VNHEAD(vp)	((vp)->v_bh.bh_first)
+#define VOP(op, vp)	(*((bhv_vnodeops_t *)VNHEAD(vp)->bd_ops)->op)
+#define bhv_vop_open(vp, cr)		VOP(vop_open, vp)(VNHEAD(vp),cr)
+#define bhv_vop_close(vp, f,last,cr)	VOP(vop_close, vp)(VNHEAD(vp),f,last,cr)
+#define bhv_vop_read(vp,file,iov,segs,offset,ioflags,cr)		\
+		VOP(vop_read, vp)(VNHEAD(vp),file,iov,segs,offset,ioflags,cr)
+#define bhv_vop_write(vp,file,iov,segs,offset,ioflags,cr)		\
+		VOP(vop_write, vp)(VNHEAD(vp),file,iov,segs,offset,ioflags,cr)
+#define bhv_vop_sendfile(vp,f,off,ioflags,cnt,act,targ,cr)		\
+		VOP(vop_sendfile, vp)(VNHEAD(vp),f,off,ioflags,cnt,act,targ,cr)
+#define bhv_vop_splice_read(vp,f,o,pipe,cnt,fl,iofl,cr)			\
+		VOP(vop_splice_read, vp)(VNHEAD(vp),f,o,pipe,cnt,fl,iofl,cr)
+#define bhv_vop_splice_write(vp,f,o,pipe,cnt,fl,iofl,cr)		\
+		VOP(vop_splice_write, vp)(VNHEAD(vp),f,o,pipe,cnt,fl,iofl,cr)
+#define bhv_vop_bmap(vp,of,sz,rw,b,n)					\
+		VOP(vop_bmap, vp)(VNHEAD(vp),of,sz,rw,b,n)
+#define bhv_vop_getattr(vp, vap,f,cr)					\
+		VOP(vop_getattr, vp)(VNHEAD(vp), vap,f,cr)
+#define	bhv_vop_setattr(vp, vap,f,cr)					\
+		VOP(vop_setattr, vp)(VNHEAD(vp), vap,f,cr)
+#define	bhv_vop_access(vp, mode,cr)	VOP(vop_access, vp)(VNHEAD(vp), mode,cr)
+#define	bhv_vop_lookup(vp,d,vpp,f,rdir,cr)				\
+		VOP(vop_lookup, vp)(VNHEAD(vp),d,vpp,f,rdir,cr)
+#define bhv_vop_create(dvp,d,vap,vpp,cr)				\
+		VOP(vop_create, dvp)(VNHEAD(dvp),d,vap,vpp,cr)
+#define bhv_vop_remove(dvp,d,cr)	VOP(vop_remove, dvp)(VNHEAD(dvp),d,cr)
+#define	bhv_vop_link(dvp,fvp,d,cr)	VOP(vop_link, dvp)(VNHEAD(dvp),fvp,d,cr)
+#define	bhv_vop_rename(fvp,fnm,tdvp,tnm,cr)				\
+		VOP(vop_rename, fvp)(VNHEAD(fvp),fnm,tdvp,tnm,cr)
+#define	bhv_vop_mkdir(dp,d,vap,vpp,cr)					\
+		VOP(vop_mkdir, dp)(VNHEAD(dp),d,vap,vpp,cr)
+#define	bhv_vop_rmdir(dp,d,cr)	 	VOP(vop_rmdir, dp)(VNHEAD(dp),d,cr)
+#define	bhv_vop_readdir(vp,uiop,cr,eofp)				\
+		VOP(vop_readdir, vp)(VNHEAD(vp),uiop,cr,eofp)
+#define	bhv_vop_symlink(dvp,d,vap,tnm,vpp,cr)				\
+		VOP(vop_symlink, dvp)(VNHEAD(dvp),d,vap,tnm,vpp,cr)
+#define	bhv_vop_readlink(vp,uiop,fl,cr)					\
+		VOP(vop_readlink, vp)(VNHEAD(vp),uiop,fl,cr)
+#define	bhv_vop_fsync(vp,f,cr,b,e)	VOP(vop_fsync, vp)(VNHEAD(vp),f,cr,b,e)
+#define bhv_vop_inactive(vp,cr)		VOP(vop_inactive, vp)(VNHEAD(vp),cr)
+#define bhv_vop_release(vp)		VOP(vop_release, vp)(VNHEAD(vp))
+#define bhv_vop_fid2(vp,fidp)		VOP(vop_fid2, vp)(VNHEAD(vp),fidp)
+#define bhv_vop_rwlock(vp,i)		VOP(vop_rwlock, vp)(VNHEAD(vp),i)
+#define bhv_vop_rwlock_try(vp,i)	VOP(vop_rwlock, vp)(VNHEAD(vp),i)
+#define bhv_vop_rwunlock(vp,i)		VOP(vop_rwunlock, vp)(VNHEAD(vp),i)
+#define bhv_vop_frlock(vp,c,fl,flags,offset,fr)				\
+		VOP(vop_frlock, vp)(VNHEAD(vp),c,fl,flags,offset,fr)
+#define bhv_vop_reclaim(vp)		VOP(vop_reclaim, vp)(VNHEAD(vp))
+#define bhv_vop_attr_get(vp, name, val, vallenp, fl, cred)		\
+		VOP(vop_attr_get, vp)(VNHEAD(vp),name,val,vallenp,fl,cred)
+#define	bhv_vop_attr_set(vp, name, val, vallen, fl, cred)		\
+		VOP(vop_attr_set, vp)(VNHEAD(vp),name,val,vallen,fl,cred)
+#define	bhv_vop_attr_remove(vp, name, flags, cred)			\
+		VOP(vop_attr_remove, vp)(VNHEAD(vp),name,flags,cred)
+#define	bhv_vop_attr_list(vp, buf, buflen, fl, cursor, cred)		\
+		VOP(vop_attr_list, vp)(VNHEAD(vp),buf,buflen,fl,cursor,cred)
+#define bhv_vop_link_removed(vp, dvp, linkzero)				\
+		VOP(vop_link_removed, vp)(VNHEAD(vp), dvp, linkzero)
+#define bhv_vop_vnode_change(vp, cmd, val)				\
+		VOP(vop_vnode_change, vp)(VNHEAD(vp), cmd, val)
+#define bhv_vop_toss_pages(vp, first, last, fiopt)			\
+		VOP(vop_tosspages, vp)(VNHEAD(vp), first, last, fiopt)
+#define bhv_vop_flushinval_pages(vp, first, last, fiopt)		\
+		VOP(vop_flushinval_pages, vp)(VNHEAD(vp),first,last,fiopt)
+#define bhv_vop_flush_pages(vp, first, last, flags, fiopt)		\
+		VOP(vop_flush_pages, vp)(VNHEAD(vp),first,last,flags,fiopt)
+#define bhv_vop_ioctl(vp, inode, filp, fl, cmd, arg)			\
+		VOP(vop_ioctl, vp)(VNHEAD(vp),inode,filp,fl,cmd,arg)
+#define bhv_vop_iflush(vp, flags)	VOP(vop_iflush, vp)(VNHEAD(vp), flags)
 
 /*
  * Flags for read/write calls - same values as IRIX
@@ -382,7 +326,7 @@ typedef struct vnodeops {
 #define IO_INVIS	0x00020		/* don't update inode timestamps */
 
 /*
- * Flags for VOP_IFLUSH call
+ * Flags for vop_iflush call
  */
 #define FLUSH_SYNC		1	/* wait for flush to complete	*/
 #define FLUSH_INODE		2	/* flush the inode itself	*/
@@ -390,8 +334,7 @@ typedef struct vnodeops {
 					 * this inode out to disk	*/
 
 /*
- * Flush/Invalidate options for VOP_TOSS_PAGES, VOP_FLUSHINVAL_PAGES and
- *	VOP_FLUSH_PAGES.
+ * Flush/Invalidate options for vop_toss/flush/flushinval_pages.
  */
 #define FI_NONE			0	/* none */
 #define FI_REMAPF		1	/* Do a remapf prior to the operation */
@@ -497,31 +440,17 @@ typedef struct vattr {
 	(VN_ISREG(vp) && ((mode) & (VSGID|(VEXEC>>3))) == VSGID)
 
 extern void	vn_init(void);
-extern vnode_t	*vn_initialize(struct inode *);
-
-/*
- * vnode_map structures _must_ match vn_epoch and vnode structure sizes.
- */
-typedef struct vnode_map {
-	bhv_vfs_t	*v_vfsp;
-	vnumber_t	v_number;		/* in-core vnode number */
-	xfs_ino_t	v_ino;			/* inode #	*/
-} vmap_t;
-
-#define VMAP(vp, vmap)	{(vmap).v_vfsp	 = (vp)->v_vfsp,	\
-			 (vmap).v_number = (vp)->v_number,	\
-			 (vmap).v_ino	 = (vp)->v_inode.i_ino; }
-
-extern int	vn_revalidate(struct vnode *);
-extern int	__vn_revalidate(struct vnode *, vattr_t *);
-extern void	vn_revalidate_core(struct vnode *, vattr_t *);
+extern bhv_vnode_t	*vn_initialize(struct inode *);
+extern int	vn_revalidate(struct bhv_vnode *);
+extern int	__vn_revalidate(struct bhv_vnode *, vattr_t *);
+extern void	vn_revalidate_core(struct bhv_vnode *, vattr_t *);
 
-extern void	vn_iowait(struct vnode *vp);
-extern void	vn_iowake(struct vnode *vp);
+extern void	vn_iowait(struct bhv_vnode *vp);
+extern void	vn_iowake(struct bhv_vnode *vp);
 
-extern void	vn_ioerror(struct vnode *vp, int error, char *f, int l);
+extern void	vn_ioerror(struct bhv_vnode *vp, int error, char *f, int l);
 
-static inline int vn_count(struct vnode *vp)
+static inline int vn_count(struct bhv_vnode *vp)
 {
 	return atomic_read(&vn_to_inode(vp)->i_count);
 }
@@ -529,7 +458,7 @@ static inline int vn_count(struct vnode *vp)
 /*
  * Vnode reference counting functions (and macros for compatibility).
  */
-extern vnode_t	*vn_hold(struct vnode *);
+extern bhv_vnode_t	*vn_hold(struct bhv_vnode *);
 
 #if defined(XFS_VNODE_TRACE)
 #define VN_HOLD(vp)		\
@@ -543,7 +472,7 @@ extern vnode_t	*vn_hold(struct vnode *);
 #define VN_RELE(vp)		(iput(vn_to_inode(vp)))
 #endif
 
-static inline struct vnode *vn_grab(struct vnode *vp)
+static inline struct bhv_vnode *vn_grab(struct bhv_vnode *vp)
 {
 	struct inode *inode = igrab(vn_to_inode(vp));
 	return inode ? vn_from_inode(inode) : NULL;
@@ -562,14 +491,14 @@ static inline struct vnode *vn_grab(struct vnode *vp)
 #define VN_LOCK(vp)		mutex_spinlock(&(vp)->v_lock)
 #define VN_UNLOCK(vp, s)	mutex_spinunlock(&(vp)->v_lock, s)
 
-static __inline__ void vn_flagset(struct vnode *vp, uint flag)
+static __inline__ void vn_flagset(struct bhv_vnode *vp, uint flag)
 {
 	spin_lock(&vp->v_lock);
 	vp->v_flag |= flag;
 	spin_unlock(&vp->v_lock);
 }
 
-static __inline__ uint vn_flagclr(struct vnode *vp, uint flag)
+static __inline__ uint vn_flagclr(struct bhv_vnode *vp, uint flag)
 {
 	uint	cleared;
 
@@ -588,12 +517,12 @@ static __inline__ uint vn_flagclr(struct vnode *vp, uint flag)
 /*
  * Dealing with bad inodes
  */
-static inline void vn_mark_bad(struct vnode *vp)
+static inline void vn_mark_bad(struct bhv_vnode *vp)
 {
 	make_bad_inode(vn_to_inode(vp));
 }
 
-static inline int VN_BAD(struct vnode *vp)
+static inline int VN_BAD(struct bhv_vnode *vp)
 {
 	return is_bad_inode(vn_to_inode(vp));
 }
@@ -601,18 +530,18 @@ static inline int VN_BAD(struct vnode *vp)
 /*
  * Extracting atime values in various formats
  */
-static inline void vn_atime_to_bstime(struct vnode *vp, xfs_bstime_t *bs_atime)
+static inline void vn_atime_to_bstime(bhv_vnode_t *vp, xfs_bstime_t *bs_atime)
 {
 	bs_atime->tv_sec = vp->v_inode.i_atime.tv_sec;
 	bs_atime->tv_nsec = vp->v_inode.i_atime.tv_nsec;
 }
 
-static inline void vn_atime_to_timespec(struct vnode *vp, struct timespec *ts)
+static inline void vn_atime_to_timespec(bhv_vnode_t *vp, struct timespec *ts)
 {
 	*ts = vp->v_inode.i_atime;
 }
 
-static inline void vn_atime_to_time_t(struct vnode *vp, time_t *tt)
+static inline void vn_atime_to_time_t(bhv_vnode_t *vp, time_t *tt)
 {
 	*tt = vp->v_inode.i_atime.tv_sec;
 }
@@ -627,7 +556,7 @@ static inline void vn_atime_to_time_t(struct vnode *vp, time_t *tt)
 #define VN_TRUNC(vp)	((vp)->v_flag & VTRUNCATED)
 
 /*
- * Flags to VOP_SETATTR/VOP_GETATTR.
+ * Flags to vop_setattr/getattr.
  */
 #define	ATTR_UTIME	0x01	/* non-default utime(2) request */
 #define	ATTR_DMI	0x08	/* invocation from a DMI function */
@@ -637,7 +566,7 @@ static inline void vn_atime_to_time_t(struct vnode *vp, time_t *tt)
 #define ATTR_NOSIZETOK	0x400	/* Don't get the SIZE token */
 
 /*
- * Flags to VOP_FSYNC and VOP_RECLAIM.
+ * Flags to vop_fsync/reclaim.
  */
 #define FSYNC_NOWAIT	0	/* asynchronous flush */
 #define FSYNC_WAIT	0x1	/* synchronous fsync or forced reclaim */
@@ -656,11 +585,11 @@ static inline void vn_atime_to_time_t(struct vnode *vp, time_t *tt)
 #define	VNODE_KTRACE_REF	4
 #define	VNODE_KTRACE_RELE	5
 
-extern void vn_trace_entry(struct vnode *, const char *, inst_t *);
-extern void vn_trace_exit(struct vnode *, const char *, inst_t *);
-extern void vn_trace_hold(struct vnode *, char *, int, inst_t *);
-extern void vn_trace_ref(struct vnode *, char *, int, inst_t *);
-extern void vn_trace_rele(struct vnode *, char *, int, inst_t *);
+extern void vn_trace_entry(struct bhv_vnode *, const char *, inst_t *);
+extern void vn_trace_exit(struct bhv_vnode *, const char *, inst_t *);
+extern void vn_trace_hold(struct bhv_vnode *, char *, int, inst_t *);
+extern void vn_trace_ref(struct bhv_vnode *, char *, int, inst_t *);
+extern void vn_trace_rele(struct bhv_vnode *, char *, int, inst_t *);
 
 #define	VN_TRACE(vp)		\
 	vn_trace_ref(vp, __FILE__, __LINE__, (inst_t *)__return_address)
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index 78f5962e578..ad9ab104946 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -192,7 +192,7 @@ STATIC int
 xfs_qm_statvfs(
 	struct bhv_desc		*bhv,
 	xfs_statfs_t		*statp,
-	struct vnode		*vnode)
+	struct bhv_vnode	*vnode)
 {
 	xfs_mount_t		*mp;
 	xfs_inode_t		*ip;
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index afc7c1c696b..c93072be679 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -1035,7 +1035,7 @@ xfs_qm_dqrele_all_inodes(
 {
 	xfs_inode_t	*ip, *topino;
 	uint		ireclaims;
-	vnode_t		*vp;
+	bhv_vnode_t	*vp;
 	boolean_t	vnode_refd;
 
 	ASSERT(mp->m_quotainfo);
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 2539af34eb6..8c300da4e7a 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -39,15 +39,15 @@
 #include <linux/capability.h>
 #include <linux/posix_acl_xattr.h>
 
-STATIC int	xfs_acl_setmode(vnode_t *, xfs_acl_t *, int *);
+STATIC int	xfs_acl_setmode(bhv_vnode_t *, xfs_acl_t *, int *);
 STATIC void     xfs_acl_filter_mode(mode_t, xfs_acl_t *);
 STATIC void	xfs_acl_get_endian(xfs_acl_t *);
 STATIC int	xfs_acl_access(uid_t, gid_t, xfs_acl_t *, mode_t, cred_t *);
 STATIC int	xfs_acl_invalid(xfs_acl_t *);
 STATIC void	xfs_acl_sync_mode(mode_t, xfs_acl_t *);
-STATIC void	xfs_acl_get_attr(vnode_t *, xfs_acl_t *, int, int, int *);
-STATIC void	xfs_acl_set_attr(vnode_t *, xfs_acl_t *, int, int *);
-STATIC int	xfs_acl_allow_set(vnode_t *, int);
+STATIC void	xfs_acl_get_attr(bhv_vnode_t *, xfs_acl_t *, int, int, int *);
+STATIC void	xfs_acl_set_attr(bhv_vnode_t *, xfs_acl_t *, int, int *);
+STATIC int	xfs_acl_allow_set(bhv_vnode_t *, int);
 
 kmem_zone_t *xfs_acl_zone;
 
@@ -57,7 +57,7 @@ kmem_zone_t *xfs_acl_zone;
  */
 int
 xfs_acl_vhasacl_access(
-	vnode_t		*vp)
+	bhv_vnode_t	*vp)
 {
 	int		error;
 
@@ -70,7 +70,7 @@ xfs_acl_vhasacl_access(
  */
 int
 xfs_acl_vhasacl_default(
-	vnode_t		*vp)
+	bhv_vnode_t	*vp)
 {
 	int		error;
 
@@ -209,7 +209,7 @@ posix_acl_xfs_to_xattr(
 
 int
 xfs_acl_vget(
-	vnode_t		*vp,
+	bhv_vnode_t	*vp,
 	void		*acl,
 	size_t		size,
 	int		kind)
@@ -244,7 +244,7 @@ xfs_acl_vget(
 			vattr_t	va;
 
 			va.va_mask = XFS_AT_MODE;
-			VOP_GETATTR(vp, &va, 0, sys_cred, error);
+			error = bhv_vop_getattr(vp, &va, 0, sys_cred);
 			if (error)
 				goto out;
 			xfs_acl_sync_mode(va.va_mode, xfs_acl);
@@ -260,7 +260,7 @@ out:
 
 int
 xfs_acl_vremove(
-	vnode_t		*vp,
+	bhv_vnode_t	*vp,
 	int		kind)
 {
 	int		error;
@@ -268,9 +268,9 @@ xfs_acl_vremove(
 	VN_HOLD(vp);
 	error = xfs_acl_allow_set(vp, kind);
 	if (!error) {
-		VOP_ATTR_REMOVE(vp, kind == _ACL_TYPE_DEFAULT?
-				SGI_ACL_DEFAULT: SGI_ACL_FILE,
-				ATTR_ROOT, sys_cred, error);
+		error = bhv_vop_attr_remove(vp, kind == _ACL_TYPE_DEFAULT?
+						SGI_ACL_DEFAULT: SGI_ACL_FILE,
+						ATTR_ROOT, sys_cred);
 		if (error == ENOATTR)
 			error = 0;	/* 'scool */
 	}
@@ -280,7 +280,7 @@ xfs_acl_vremove(
 
 int
 xfs_acl_vset(
-	vnode_t			*vp,
+	bhv_vnode_t		*vp,
 	void			*acl,
 	size_t			size,
 	int			kind)
@@ -370,7 +370,7 @@ xfs_acl_iaccess(
 
 STATIC int
 xfs_acl_allow_set(
-	vnode_t		*vp,
+	bhv_vnode_t	*vp,
 	int		kind)
 {
 	vattr_t		va;
@@ -383,7 +383,7 @@ xfs_acl_allow_set(
 	if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
 		return EROFS;
 	va.va_mask = XFS_AT_UID;
-	VOP_GETATTR(vp, &va, 0, NULL, error);
+	error = bhv_vop_getattr(vp, &va, 0, NULL);
 	if (error)
 		return error;
 	if (va.va_uid != current->fsuid && !capable(CAP_FOWNER))
@@ -606,7 +606,7 @@ xfs_acl_get_endian(
  */
 STATIC void
 xfs_acl_get_attr(
-	vnode_t		*vp,
+	bhv_vnode_t	*vp,
 	xfs_acl_t	*aclp,
 	int		kind,
 	int		flags,
@@ -616,9 +616,9 @@ xfs_acl_get_attr(
 
 	ASSERT((flags & ATTR_KERNOVAL) ? (aclp == NULL) : 1);
 	flags |= ATTR_ROOT;
-	VOP_ATTR_GET(vp,
-		kind == _ACL_TYPE_ACCESS ? SGI_ACL_FILE : SGI_ACL_DEFAULT,
-		(char *)aclp, &len, flags, sys_cred, *error);
+	*error = bhv_vop_attr_get(vp, kind == _ACL_TYPE_ACCESS ?
+					SGI_ACL_FILE : SGI_ACL_DEFAULT,
+					(char *)aclp, &len, flags, sys_cred);
 	if (*error || (flags & ATTR_KERNOVAL))
 		return;
 	xfs_acl_get_endian(aclp);
@@ -629,7 +629,7 @@ xfs_acl_get_attr(
  */
 STATIC void
 xfs_acl_set_attr(
-	vnode_t		*vp,
+	bhv_vnode_t	*vp,
 	xfs_acl_t	*aclp,
 	int		kind,
 	int		*error)
@@ -654,15 +654,15 @@ xfs_acl_set_attr(
 		INT_SET(newace->ae_perm, ARCH_CONVERT, ace->ae_perm);
 	}
 	INT_SET(newacl->acl_cnt, ARCH_CONVERT, aclp->acl_cnt);
-	VOP_ATTR_SET(vp,
-		kind == _ACL_TYPE_ACCESS ? SGI_ACL_FILE: SGI_ACL_DEFAULT,
-		(char *)newacl, len, ATTR_ROOT, sys_cred, *error);
+	*error = bhv_vop_attr_set(vp, kind == _ACL_TYPE_ACCESS ?
+				SGI_ACL_FILE: SGI_ACL_DEFAULT,
+				(char *)newacl, len, ATTR_ROOT, sys_cred);
 	_ACL_FREE(newacl);
 }
 
 int
 xfs_acl_vtoacl(
-	vnode_t		*vp,
+	bhv_vnode_t	*vp,
 	xfs_acl_t	*access_acl,
 	xfs_acl_t	*default_acl)
 {
@@ -678,7 +678,7 @@ xfs_acl_vtoacl(
 		if (!error) {
 			/* Got the ACL, need the mode... */
 			va.va_mask = XFS_AT_MODE;
-			VOP_GETATTR(vp, &va, 0, sys_cred, error);
+			error = bhv_vop_getattr(vp, &va, 0, sys_cred);
 		}
 
 		if (error)
@@ -701,7 +701,7 @@ xfs_acl_vtoacl(
  */
 int
 xfs_acl_inherit(
-	vnode_t		*vp,
+	bhv_vnode_t	*vp,
 	vattr_t		*vap,
 	xfs_acl_t	*pdaclp)
 {
@@ -757,7 +757,7 @@ xfs_acl_inherit(
  */
 STATIC int
 xfs_acl_setmode(
-	vnode_t		*vp,
+	bhv_vnode_t	*vp,
 	xfs_acl_t	*acl,
 	int		*basicperms)
 {
@@ -776,7 +776,7 @@ xfs_acl_setmode(
 	 * mode.  The m:: bits take precedence over the g:: bits.
 	 */
 	va.va_mask = XFS_AT_MODE;
-	VOP_GETATTR(vp, &va, 0, sys_cred, error);
+	error = bhv_vop_getattr(vp, &va, 0, sys_cred);
 	if (error)
 		return error;
 
@@ -810,8 +810,7 @@ xfs_acl_setmode(
 	if (gap && nomask)
 		va.va_mode |= gap->ae_perm << 3;
 
-	VOP_SETATTR(vp, &va, 0, sys_cred, error);
-	return error;
+	return bhv_vop_setattr(vp, &va, 0, sys_cred);
 }
 
 /*
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 538d0d65b04..a57ff4e6d39 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -50,7 +50,7 @@ typedef struct xfs_acl {
 #ifdef CONFIG_XFS_POSIX_ACL
 
 struct vattr;
-struct vnode;
+struct bhv_vnode;
 struct xfs_inode;
 
 extern struct kmem_zone *xfs_acl_zone;
@@ -58,14 +58,14 @@ extern struct kmem_zone *xfs_acl_zone;
 		(zone) = kmem_zone_init(sizeof(xfs_acl_t), (name))
 #define xfs_acl_zone_destroy(zone)	kmem_zone_destroy(zone)
 
-extern int xfs_acl_inherit(struct vnode *, struct vattr *, xfs_acl_t *);
+extern int xfs_acl_inherit(struct bhv_vnode *, struct vattr *, xfs_acl_t *);
 extern int xfs_acl_iaccess(struct xfs_inode *, mode_t, cred_t *);
-extern int xfs_acl_vtoacl(struct vnode *, xfs_acl_t *, xfs_acl_t *);
-extern int xfs_acl_vhasacl_access(struct vnode *);
-extern int xfs_acl_vhasacl_default(struct vnode *);
-extern int xfs_acl_vset(struct vnode *, void *, size_t, int);
-extern int xfs_acl_vget(struct vnode *, void *, size_t, int);
-extern int xfs_acl_vremove(struct vnode *vp, int);
+extern int xfs_acl_vtoacl(struct bhv_vnode *, xfs_acl_t *, xfs_acl_t *);
+extern int xfs_acl_vhasacl_access(struct bhv_vnode *);
+extern int xfs_acl_vhasacl_default(struct bhv_vnode *);
+extern int xfs_acl_vset(struct bhv_vnode *, void *, size_t, int);
+extern int xfs_acl_vget(struct bhv_vnode *, void *, size_t, int);
+extern int xfs_acl_vremove(struct bhv_vnode *, int);
 
 #define _ACL_TYPE_ACCESS	1
 #define _ACL_TYPE_DEFAULT	2
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 3ef41e75d08..4dcef8d1c32 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -2324,56 +2324,56 @@ xfs_attr_trace_enter(int type, char *where,
 
 STATIC int
 posix_acl_access_set(
-	vnode_t	*vp, char *name, void *data, size_t size, int xflags)
+	bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
 {
 	return xfs_acl_vset(vp, data, size, _ACL_TYPE_ACCESS);
 }
 
 STATIC int
 posix_acl_access_remove(
-	struct vnode *vp, char *name, int xflags)
+	bhv_vnode_t *vp, char *name, int xflags)
 {
 	return xfs_acl_vremove(vp, _ACL_TYPE_ACCESS);
 }
 
 STATIC int
 posix_acl_access_get(
-	vnode_t *vp, char *name, void *data, size_t size, int xflags)
+	bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
 {
 	return xfs_acl_vget(vp, data, size, _ACL_TYPE_ACCESS);
 }
 
 STATIC int
 posix_acl_access_exists(
-	vnode_t *vp)
+	bhv_vnode_t *vp)
 {
 	return xfs_acl_vhasacl_access(vp);
 }
 
 STATIC int
 posix_acl_default_set(
-	vnode_t	*vp, char *name, void *data, size_t size, int xflags)
+	bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
 {
 	return xfs_acl_vset(vp, data, size, _ACL_TYPE_DEFAULT);
 }
 
 STATIC int
 posix_acl_default_get(
-	vnode_t *vp, char *name, void *data, size_t size, int xflags)
+	bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
 {
 	return xfs_acl_vget(vp, data, size, _ACL_TYPE_DEFAULT);
 }
 
 STATIC int
 posix_acl_default_remove(
-	struct vnode *vp, char *name, int xflags)
+	bhv_vnode_t *vp, char *name, int xflags)
 {
 	return xfs_acl_vremove(vp, _ACL_TYPE_DEFAULT);
 }
 
 STATIC int
 posix_acl_default_exists(
-	vnode_t *vp)
+	bhv_vnode_t *vp)
 {
 	return xfs_acl_vhasacl_default(vp);
 }
@@ -2406,21 +2406,18 @@ STATIC struct attrnames *attr_system_names[] =
 
 STATIC int
 attr_generic_set(
-	struct vnode *vp, char *name, void *data, size_t size, int xflags)
+	bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
 {
-	int 	error;
-
-	VOP_ATTR_SET(vp, name, data, size, xflags, NULL, error);
-	return -error;
+	return -bhv_vop_attr_set(vp, name, data, size, xflags, NULL);
 }
 
 STATIC int
 attr_generic_get(
-	struct vnode *vp, char *name, void *data, size_t size, int xflags)
+	bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
 {
 	int	error, asize = size;
 
-	VOP_ATTR_GET(vp, name, data, &asize, xflags, NULL, error);
+	error = bhv_vop_attr_get(vp, name, data, &asize, xflags, NULL);
 	if (!error)
 		return asize;
 	return -error;
@@ -2428,12 +2425,9 @@ attr_generic_get(
 
 STATIC int
 attr_generic_remove(
-	struct vnode *vp, char *name, int xflags)
+	bhv_vnode_t *vp, char *name, int xflags)
 {
-	int	error;
-
-	VOP_ATTR_REMOVE(vp, name, xflags, NULL, error);
-	return -error;
+	return -bhv_vop_attr_remove(vp, name, xflags, NULL);
 }
 
 STATIC int
@@ -2461,7 +2455,7 @@ attr_generic_listadd(
 
 STATIC int
 attr_system_list(
-	struct vnode		*vp,
+	bhv_vnode_t		*vp,
 	void			*data,
 	size_t			size,
 	ssize_t			*result)
@@ -2483,12 +2477,12 @@ attr_system_list(
 
 int
 attr_generic_list(
-	struct vnode *vp, void *data, size_t size, int xflags, ssize_t *result)
+	bhv_vnode_t *vp, void *data, size_t size, int xflags, ssize_t *result)
 {
 	attrlist_cursor_kern_t	cursor = { 0 };
 	int			error;
 
-	VOP_ATTR_LIST(vp, data, size, xflags, &cursor, NULL, error);
+	error = bhv_vop_attr_list(vp, data, size, xflags, &cursor, NULL);
 	if (error > 0)
 		return -error;
 	*result = -error;
@@ -2516,7 +2510,7 @@ attr_lookup_namespace(
  */
 STATIC int
 attr_user_capable(
-	struct vnode	*vp,
+	bhv_vnode_t	*vp,
 	cred_t		*cred)
 {
 	struct inode	*inode = vn_to_inode(vp);
@@ -2534,7 +2528,7 @@ attr_user_capable(
 
 STATIC int
 attr_trusted_capable(
-	struct vnode	*vp,
+	bhv_vnode_t	*vp,
 	cred_t		*cred)
 {
 	struct inode	*inode = vn_to_inode(vp);
@@ -2548,7 +2542,7 @@ attr_trusted_capable(
 
 STATIC int
 attr_secure_capable(
-	struct vnode	*vp,
+	bhv_vnode_t	*vp,
 	cred_t		*cred)
 {
 	return -ENOSECURITY;
@@ -2556,7 +2550,7 @@ attr_secure_capable(
 
 STATIC int
 attr_system_set(
-	struct vnode *vp, char *name, void *data, size_t size, int xflags)
+	bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
 {
 	attrnames_t	*namesp;
 	int		error;
@@ -2575,7 +2569,7 @@ attr_system_set(
 
 STATIC int
 attr_system_get(
-	struct vnode *vp, char *name, void *data, size_t size, int xflags)
+	bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
 {
 	attrnames_t	*namesp;
 
@@ -2587,7 +2581,7 @@ attr_system_get(
 
 STATIC int
 attr_system_remove(
-	struct vnode *vp, char *name, int xflags)
+	bhv_vnode_t *vp, char *name, int xflags)
 {
 	attrnames_t	*namesp;
 
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index b2c7b9fcded..981633f6c07 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -36,13 +36,13 @@
  *========================================================================*/
 
 struct cred;
-struct vnode;
+struct bhv_vnode;
 
-typedef int (*attrset_t)(struct vnode *, char *, void *, size_t, int);
-typedef int (*attrget_t)(struct vnode *, char *, void *, size_t, int);
-typedef int (*attrremove_t)(struct vnode *, char *, int);
-typedef int (*attrexists_t)(struct vnode *);
-typedef int (*attrcapable_t)(struct vnode *, struct cred *);
+typedef int (*attrset_t)(struct bhv_vnode *, char *, void *, size_t, int);
+typedef int (*attrget_t)(struct bhv_vnode *, char *, void *, size_t, int);
+typedef int (*attrremove_t)(struct bhv_vnode *, char *, int);
+typedef int (*attrexists_t)(struct bhv_vnode *);
+typedef int (*attrcapable_t)(struct bhv_vnode *, struct cred *);
 
 typedef struct attrnames {
 	char *		attr_name;
@@ -63,7 +63,7 @@ extern struct attrnames attr_trusted;
 extern struct attrnames *attr_namespaces[ATTR_NAMECOUNT];
 
 extern attrnames_t *attr_lookup_namespace(char *, attrnames_t **, int);
-extern int attr_generic_list(struct vnode *, void *, size_t, int, ssize_t *);
+extern int attr_generic_list(struct bhv_vnode *, void *, size_t, int, ssize_t *);
 
 #define ATTR_DONTFOLLOW	0x0001	/* -- unused, from IRIX -- */
 #define ATTR_ROOT	0x0002	/* use attrs in root (trusted) namespace */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index b4529421823..3050b4c647c 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5759,7 +5759,7 @@ xfs_getbmap(
 	__int64_t		fixlen;		/* length for -1 case */
 	int			i;		/* extent number */
 	xfs_inode_t		*ip;		/* xfs incore inode pointer */
-	vnode_t			*vp;		/* corresponding vnode */
+	bhv_vnode_t		*vp;		/* corresponding vnode */
 	int			lock;		/* lock state */
 	xfs_bmbt_irec_t		*map;		/* buffer for user's data */
 	xfs_mount_t		*mp;		/* file system mount point */
@@ -5856,7 +5856,7 @@ xfs_getbmap(
 
 	if (whichfork == XFS_DATA_FORK && ip->i_delayed_blks) {
 		/* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */
-		VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1, 0, FI_REMAPF, error);
+		error = bhv_vop_flush_pages(vp, (xfs_off_t)0, -1, 0, FI_REMAPF);
 	}
 
 	ASSERT(whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0);
diff --git a/fs/xfs/xfs_cap.h b/fs/xfs/xfs_cap.h
index d0035c6e951..7a0e482dd43 100644
--- a/fs/xfs/xfs_cap.h
+++ b/fs/xfs/xfs_cap.h
@@ -49,12 +49,12 @@ typedef struct xfs_cap_set {
 
 #include <linux/posix_cap_xattr.h>
 
-struct vnode;
+struct bhv_vnode;
 
-extern int xfs_cap_vhascap(struct vnode *);
-extern int xfs_cap_vset(struct vnode *, void *, size_t);
-extern int xfs_cap_vget(struct vnode *, void *, size_t);
-extern int xfs_cap_vremove(struct vnode *vp);
+extern int xfs_cap_vhascap(struct bhv_vnode *);
+extern int xfs_cap_vset(struct bhv_vnode *, void *, size_t);
+extern int xfs_cap_vget(struct bhv_vnode *, void *, size_t);
+extern int xfs_cap_vremove(struct bhv_vnode *);
 
 #define _CAP_EXISTS		xfs_cap_vhascap
 
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 99daf8c0f90..29a6c866f2c 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -57,7 +57,7 @@ xfs_swapext(
 	xfs_inode_t     *ip=NULL, *tip=NULL;
 	xfs_mount_t     *mp;
 	struct file	*fp = NULL, *tfp = NULL;
-	vnode_t		*vp, *tvp;
+	bhv_vnode_t	*vp, *tvp;
 	int		error = 0;
 
 	sxp = kmem_alloc(sizeof(xfs_swapext_t), KM_MAYFAIL);
@@ -137,7 +137,7 @@ xfs_swap_extents(
 	xfs_inode_t	*ips[2];
 	xfs_trans_t	*tp;
 	xfs_bstat_t	*sbp = &sxp->sx_stat;
-	vnode_t		*vp, *tvp;
+	bhv_vnode_t	*vp, *tvp;
 	xfs_ifork_t	*tempifp, *ifp, *tifp;
 	int		ilf_fields, tilf_fields;
 	static uint	lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL;
@@ -202,7 +202,7 @@ xfs_swap_extents(
 
 	if (VN_CACHED(tvp) != 0) {
 		xfs_inval_cached_trace(&tip->i_iocore, 0, -1, 0, -1);
-		VOP_FLUSHINVAL_PAGES(tvp, 0, -1, FI_REMAPF_LOCKED);
+		bhv_vop_flushinval_pages(tvp, 0, -1, FI_REMAPF_LOCKED);
 	}
 
 	/* Verify O_DIRECT for ftmp */
@@ -247,7 +247,7 @@ xfs_swap_extents(
 	/* We need to fail if the file is memory mapped.  Once we have tossed
 	 * all existing pages, the page fault will have no option
 	 * but to go to the filesystem for pages. By making the page fault call
-	 * VOP_READ (or write in the case of autogrow) they block on the iolock
+	 * vop_read (or write in the case of autogrow) they block on the iolock
 	 * until we have switched the extents.
 	 */
 	if (VN_MAPPED(vp)) {
@@ -266,7 +266,7 @@ xfs_swap_extents(
 	 * fields change.
 	 */
 
-	VOP_TOSS_PAGES(vp, 0, -1, FI_REMAPF);
+	bhv_vop_toss_pages(vp, 0, -1, FI_REMAPF);
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
 	if ((error = xfs_trans_reserve(tp, 0,
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 41f38eb60eb..da3c94c230d 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -186,7 +186,7 @@ xfs_ihash_promote(
  */
 STATIC int
 xfs_iget_core(
-	vnode_t		*vp,
+	bhv_vnode_t	*vp,
 	xfs_mount_t	*mp,
 	xfs_trans_t	*tp,
 	xfs_ino_t	ino,
@@ -198,7 +198,7 @@ xfs_iget_core(
 	xfs_ihash_t	*ih;
 	xfs_inode_t	*ip;
 	xfs_inode_t	*iq;
-	vnode_t		*inode_vp;
+	bhv_vnode_t	*inode_vp;
 	ulong		version;
 	int		error;
 	/* REFERENCED */
@@ -489,7 +489,7 @@ xfs_iget(
 	xfs_daddr_t	bno)
 {
 	struct inode	*inode;
-	vnode_t		*vp = NULL;
+	bhv_vnode_t	*vp = NULL;
 	int		error;
 
 	XFS_STATS_INC(xs_ig_attempts);
@@ -543,7 +543,7 @@ retry:
 void
 xfs_inode_lock_init(
 	xfs_inode_t	*ip,
-	vnode_t		*vp)
+	bhv_vnode_t	*vp)
 {
 	mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
 		     "xfsino", (long)vp->v_number);
@@ -603,12 +603,10 @@ void
 xfs_iput(xfs_inode_t	*ip,
 	 uint		lock_flags)
 {
-	vnode_t	*vp = XFS_ITOV(ip);
+	bhv_vnode_t	*vp = XFS_ITOV(ip);
 
 	vn_trace_entry(vp, "xfs_iput", (inst_t *)__return_address);
-
 	xfs_iunlock(ip, lock_flags);
-
 	VN_RELE(vp);
 }
 
@@ -619,7 +617,7 @@ void
 xfs_iput_new(xfs_inode_t	*ip,
 	     uint		lock_flags)
 {
-	vnode_t		*vp = XFS_ITOV(ip);
+	bhv_vnode_t	*vp = XFS_ITOV(ip);
 	struct inode	*inode = vn_to_inode(vp);
 
 	vn_trace_entry(vp, "xfs_iput_new", (inst_t *)__return_address);
@@ -645,7 +643,7 @@ xfs_iput_new(xfs_inode_t	*ip,
 void
 xfs_ireclaim(xfs_inode_t *ip)
 {
-	vnode_t		*vp;
+	bhv_vnode_t	*vp;
 
 	/*
 	 * Remove from old hash list and mount list.
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 0ed707ed664..848783ba565 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1086,7 +1086,7 @@ xfs_ialloc(
 {
 	xfs_ino_t	ino;
 	xfs_inode_t	*ip;
-	vnode_t		*vp;
+	bhv_vnode_t	*vp;
 	uint		flags;
 	int		error;
 
@@ -1427,7 +1427,7 @@ xfs_itruncate_start(
 	xfs_fsize_t	last_byte;
 	xfs_off_t	toss_start;
 	xfs_mount_t	*mp;
-	vnode_t		*vp;
+	bhv_vnode_t	*vp;
 
 	ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0);
 	ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size));
@@ -1440,9 +1440,9 @@ xfs_itruncate_start(
 	vn_iowait(vp);  /* wait for the completion of any pending DIOs */
 	
 	/*
-	 * Call VOP_TOSS_PAGES() or VOP_FLUSHINVAL_PAGES() to get rid of pages and buffers
+	 * Call toss_pages or flushinval_pages to get rid of pages
 	 * overlapping the region being removed.  We have to use
-	 * the less efficient VOP_FLUSHINVAL_PAGES() in the case that the
+	 * the less efficient flushinval_pages in the case that the
 	 * caller may not be able to finish the truncate without
 	 * dropping the inode's I/O lock.  Make sure
 	 * to catch any pages brought in by buffers overlapping
@@ -1451,10 +1451,10 @@ xfs_itruncate_start(
 	 * so that we don't toss things on the same block as
 	 * new_size but before it.
 	 *
-	 * Before calling VOP_TOSS_PAGES() or VOP_FLUSHINVAL_PAGES(), make sure to
+	 * Before calling toss_page or flushinval_pages, make sure to
 	 * call remapf() over the same region if the file is mapped.
 	 * This frees up mapped file references to the pages in the
-	 * given range and for the VOP_FLUSHINVAL_PAGES() case it ensures
+	 * given range and for the flushinval_pages case it ensures
 	 * that we get the latest mapped changes flushed out.
 	 */
 	toss_start = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
@@ -1472,9 +1472,9 @@ xfs_itruncate_start(
 			 last_byte);
 	if (last_byte > toss_start) {
 		if (flags & XFS_ITRUNC_DEFINITE) {
-			VOP_TOSS_PAGES(vp, toss_start, -1, FI_REMAPF_LOCKED);
+			bhv_vop_toss_pages(vp, toss_start, -1, FI_REMAPF_LOCKED);
 		} else {
-			VOP_FLUSHINVAL_PAGES(vp, toss_start, -1, FI_REMAPF_LOCKED);
+			bhv_vop_flushinval_pages(vp, toss_start, -1, FI_REMAPF_LOCKED);
 		}
 	}
 
@@ -2752,7 +2752,7 @@ xfs_iunpin(
 		 * the inode to become unpinned.
 		 */
 		if (!(ip->i_flags & (XFS_IRECLAIM|XFS_IRECLAIMABLE))) {
-			vnode_t	*vp = XFS_ITOV_NULL(ip);
+			bhv_vnode_t	*vp = XFS_ITOV_NULL(ip);
 
 			/* make sync come back and flush this inode */
 			if (vp) {
@@ -3512,7 +3512,7 @@ xfs_iflush_all(
 	xfs_mount_t	*mp)
 {
 	xfs_inode_t	*ip;
-	vnode_t		*vp;
+	bhv_vnode_t	*vp;
 
  again:
 	XFS_MOUNT_ILOCK(mp);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 233668b748f..d10b76ed1e5 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -102,9 +102,9 @@ typedef struct xfs_ifork {
 
 #ifdef __KERNEL__
 struct bhv_desc;
+struct bhv_vnode;
 struct cred;
 struct ktrace;
-struct vnode;
 struct xfs_buf;
 struct xfs_bmap_free;
 struct xfs_bmbt_irec;
@@ -400,7 +400,7 @@ void		xfs_chash_init(struct xfs_mount *);
 void		xfs_chash_free(struct xfs_mount *);
 xfs_inode_t	*xfs_inode_incore(struct xfs_mount *, xfs_ino_t,
 				  struct xfs_trans *);
-void            xfs_inode_lock_init(xfs_inode_t *, struct vnode *);
+void            xfs_inode_lock_init(xfs_inode_t *, struct bhv_vnode *);
 int		xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
 			 uint, uint, xfs_inode_t **, xfs_daddr_t);
 void		xfs_iput(xfs_inode_t *, uint);
@@ -461,7 +461,7 @@ void		xfs_ichgtime(xfs_inode_t *, int);
 xfs_fsize_t	xfs_file_last_byte(xfs_inode_t *);
 void		xfs_lock_inodes(xfs_inode_t **, int, int, uint);
 
-xfs_inode_t	*xfs_vtoi(struct vnode *vp);
+xfs_inode_t	*xfs_vtoi(struct bhv_vnode *vp);
 
 void		xfs_synchronize_atime(xfs_inode_t *);
 
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 864b43d6584..a0182ced29c 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -51,7 +51,7 @@ xfs_bulkstat_one_iget(
 {
 	xfs_dinode_core_t *dic;		/* dinode core info pointer */
 	xfs_inode_t	*ip;		/* incore inode pointer */
-	vnode_t		*vp;
+	bhv_vnode_t	*vp;
 	int		error;
 
 	error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, bno);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 56c4b7e51f4..5ce6416fbd3 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -658,7 +658,7 @@ xfs_mountfs(
 	xfs_buf_t	*bp;
 	xfs_sb_t	*sbp = &(mp->m_sb);
 	xfs_inode_t	*rip;
-	vnode_t		*rvp = NULL;
+	bhv_vnode_t	*rvp = NULL;
 	int		readio_log, writeio_log;
 	xfs_daddr_t	d;
 	__uint64_t	ret64;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7e612255f53..a7099906369 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -54,7 +54,7 @@ typedef struct xfs_trans_reservations {
 struct cred;
 struct log;
 struct bhv_vfs;
-struct vnode;
+struct bhv_vnode;
 struct xfs_mount_args;
 struct xfs_ihash;
 struct xfs_chash;
@@ -67,7 +67,7 @@ struct xfs_extdelta;
 struct xfs_swapext;
 
 extern struct bhv_vfsops xfs_vfsops;
-extern struct vnodeops xfs_vnodeops;
+extern struct bhv_vnodeops xfs_vnodeops;
 
 #define	AIL_LOCK_T		lock_t
 #define	AIL_LOCKINIT(x,y)	spinlock_init(x,y)
@@ -80,15 +80,15 @@ extern struct vnodeops xfs_vnodeops;
  * Prototypes and functions for the Data Migration subsystem.
  */
 
-typedef int	(*xfs_send_data_t)(int, struct vnode *,
+typedef int	(*xfs_send_data_t)(int, struct bhv_vnode *,
 			xfs_off_t, size_t, int, vrwlock_t *);
 typedef int	(*xfs_send_mmap_t)(struct vm_area_struct *, uint);
-typedef int	(*xfs_send_destroy_t)(struct vnode *, dm_right_t);
+typedef int	(*xfs_send_destroy_t)(struct bhv_vnode *, dm_right_t);
 typedef int	(*xfs_send_namesp_t)(dm_eventtype_t, struct bhv_vfs *,
-			struct vnode *,
-			dm_right_t, struct vnode *, dm_right_t,
+			struct bhv_vnode *,
+			dm_right_t, struct bhv_vnode *, dm_right_t,
 			char *, char *, mode_t, int, int);
-typedef void	(*xfs_send_unmount_t)(struct bhv_vfs *, struct vnode *,
+typedef void	(*xfs_send_unmount_t)(struct bhv_vfs *, struct bhv_vnode *,
 			dm_right_t, mode_t, int, int);
 
 typedef struct xfs_dmops {
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 1f148762eb2..a20566b8912 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -226,7 +226,7 @@ int
 xfs_rename(
 	bhv_desc_t	*src_dir_bdp,
 	vname_t		*src_vname,
-	vnode_t		*target_dir_vp,
+	bhv_vnode_t	*target_dir_vp,
 	vname_t		*target_vname,
 	cred_t		*credp)
 {
@@ -242,7 +242,7 @@ xfs_rename(
 	int		committed;
 	xfs_inode_t	*inodes[4];
 	int		target_ip_dropped = 0;	/* dropped target_ip link? */
-	vnode_t		*src_dir_vp;
+	bhv_vnode_t	*src_dir_vp;
 	int		spaceres;
 	int		target_link_zero = 0;
 	int		num_inodes;
@@ -609,7 +609,7 @@ xfs_rename(
 	 * Let interposed file systems know about removed links.
 	 */
 	if (target_ip_dropped) {
-		VOP_LINK_REMOVED(XFS_ITOV(target_ip), target_dir_vp,
+		bhv_vop_link_removed(XFS_ITOV(target_ip), target_dir_vp,
 					target_link_zero);
 		IRELE(target_ip);
 	}
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 34654ec6ae1..6ecc0ca5868 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -54,7 +54,7 @@ xfs_get_dir_entry(
 	vname_t		*dentry,
 	xfs_inode_t	**ipp)
 {
-	vnode_t		*vp;
+	bhv_vnode_t	*vp;
 
 	vp = VNAME_TO_VNODE(dentry);
 
@@ -73,7 +73,7 @@ xfs_dir_lookup_int(
 	xfs_ino_t	*inum,
 	xfs_inode_t	**ipp)
 {
-	vnode_t		*dir_vp;
+	bhv_vnode_t	*dir_vp;
 	xfs_inode_t	*dp;
 	int		error;
 
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index 472661a3b6d..cecf1031059 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -23,7 +23,8 @@
 #define	ITRACE(ip)	vn_trace_ref(XFS_ITOV(ip), __FILE__, __LINE__, \
 				(inst_t *)__return_address)
 
-extern int xfs_rename (bhv_desc_t *, vname_t *, vnode_t *, vname_t *, cred_t *);
+extern int xfs_rename (bhv_desc_t *, vname_t *, bhv_vnode_t *,
+			vname_t *, cred_t *);
 extern int xfs_get_dir_entry (vname_t *, xfs_inode_t **);
 extern int xfs_dir_lookup_int (bhv_desc_t *, uint, vname_t *, xfs_ino_t *,
 				xfs_inode_t **);
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 57ee2bebd24..d3f270a62c9 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -555,7 +555,7 @@ xfs_unmount(
 	bhv_vfs_t	*vfsp = bhvtovfs(bdp);
 	xfs_mount_t	*mp = XFS_BHVTOM(bdp);
 	xfs_inode_t	*rip;
-	vnode_t		*rvp;
+	bhv_vnode_t	*rvp;
 	int		unmount_event_wanted = 0;
 	int		unmount_event_flags = 0;
 	int		xfs_unmountfs_needed = 0;
@@ -701,7 +701,7 @@ xfs_unmount_flush(
 	xfs_inode_t	*rip = mp->m_rootip;
 	xfs_inode_t	*rbmip;
 	xfs_inode_t	*rsumip = NULL;
-	vnode_t		*rvp = XFS_ITOV(rip);
+	bhv_vnode_t	*rvp = XFS_ITOV(rip);
 	int		error;
 
 	xfs_ilock(rip, XFS_ILOCK_EXCL);
@@ -780,9 +780,9 @@ fscorrupt_out2:
 STATIC int
 xfs_root(
 	bhv_desc_t	*bdp,
-	vnode_t		**vpp)
+	bhv_vnode_t	**vpp)
 {
-	vnode_t		*vp;
+	bhv_vnode_t	*vp;
 
 	vp = XFS_ITOV((XFS_BHVTOM(bdp))->m_rootip);
 	VN_HOLD(vp);
@@ -801,7 +801,7 @@ STATIC int
 xfs_statvfs(
 	bhv_desc_t	*bdp,
 	xfs_statfs_t	*statp,
-	vnode_t		*vp)
+	bhv_vnode_t	*vp)
 {
 	__uint64_t	fakeinos;
 	xfs_extlen_t	lsize;
@@ -916,7 +916,7 @@ xfs_sync_inodes(
 	xfs_inode_t	*ip = NULL;
 	xfs_inode_t	*ip_next;
 	xfs_buf_t	*bp;
-	vnode_t		*vp = NULL;
+	bhv_vnode_t	*vp = NULL;
 	int		error;
 	int		last_error;
 	uint64_t	fflag;
@@ -1155,9 +1155,9 @@ xfs_sync_inodes(
 			xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
 			if (XFS_FORCED_SHUTDOWN(mp)) {
-				VOP_TOSS_PAGES(vp, 0, -1, FI_REMAPF);
+				bhv_vop_toss_pages(vp, 0, -1, FI_REMAPF);
 			} else {
-				VOP_FLUSHINVAL_PAGES(vp, 0, -1, FI_REMAPF);
+				bhv_vop_flushinval_pages(vp, 0, -1, FI_REMAPF);
 			}
 
 			xfs_ilock(ip, XFS_ILOCK_SHARED);
@@ -1177,8 +1177,8 @@ xfs_sync_inodes(
 				 * across calls to the buffer cache.
 				 */
 				xfs_iunlock(ip, XFS_ILOCK_SHARED);
-				VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1,
-							fflag, FI_NONE, error);
+				error = bhv_vop_flush_pages(vp, (xfs_off_t)0,
+							-1, fflag, FI_NONE);
 				xfs_ilock(ip, XFS_ILOCK_SHARED);
 			}
 
@@ -1230,9 +1230,7 @@ xfs_sync_inodes(
 						 * marker and free it.
 						 */
 						XFS_MOUNT_ILOCK(mp);
-
 						IPOINTER_REMOVE(ip, mp);
-
 						XFS_MOUNT_IUNLOCK(mp);
 
 						ASSERT(!(lock_flags &
@@ -1573,7 +1571,7 @@ xfs_syncsub(
 STATIC int
 xfs_vget(
 	bhv_desc_t	*bdp,
-	vnode_t		**vpp,
+	bhv_vnode_t	**vpp,
 	fid_t		*fidp)
 {
 	xfs_mount_t	*mp = XFS_BHVTOM(bdp);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 35906bae92e..f3c2deeed0a 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -62,7 +62,7 @@ xfs_open(
 	cred_t		*credp)
 {
 	int		mode;
-	vnode_t		*vp = BHV_TO_VNODE(bdp);
+	bhv_vnode_t	*vp = BHV_TO_VNODE(bdp);
 	xfs_inode_t	*ip = XFS_BHVTOI(bdp);
 
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -88,9 +88,8 @@ xfs_close(
 	lastclose_t	lastclose,
 	cred_t		*credp)
 {
-	vnode_t		*vp = BHV_TO_VNODE(bdp);
+	bhv_vnode_t	*vp = BHV_TO_VNODE(bdp);
 	xfs_inode_t	*ip = XFS_BHVTOI(bdp);
-	int		error = 0;
 
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return XFS_ERROR(EIO);
@@ -108,8 +107,8 @@ xfs_close(
 	 * window where we'd otherwise be exposed to that problem.
 	 */
 	if (VUNTRUNCATE(vp) && VN_DIRTY(vp) && ip->i_delayed_blks > 0)
-		VOP_FLUSH_PAGES(vp, 0, -1, XFS_B_ASYNC, FI_NONE, error);
-	return error;
+		return bhv_vop_flush_pages(vp, 0, -1, XFS_B_ASYNC, FI_NONE);
+	return 0;
 }
 
 /*
@@ -124,7 +123,7 @@ xfs_getattr(
 {
 	xfs_inode_t	*ip;
 	xfs_mount_t	*mp;
-	vnode_t		*vp;
+	bhv_vnode_t	*vp;
 
 	vp  = BHV_TO_VNODE(bdp);
 	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
@@ -265,7 +264,7 @@ xfs_setattr(
 	uid_t			uid=0, iuid=0;
 	gid_t			gid=0, igid=0;
 	int			timeflags = 0;
-	vnode_t			*vp;
+	bhv_vnode_t		*vp;
 	xfs_prid_t		projid=0, iprojid=0;
 	int			mandlock_before, mandlock_after;
 	struct xfs_dquot	*udqp, *gdqp, *olddquot1, *olddquot2;
@@ -888,7 +887,7 @@ xfs_setattr(
 	 */
 	mandlock_after = MANDLOCK(vp, ip->i_d.di_mode);
 	if (mandlock_before != mandlock_after) {
-		VOP_VNODE_CHANGE(vp, VCHANGE_FLAGS_ENF_LOCKING,
+		bhv_vop_vnode_change(vp, VCHANGE_FLAGS_ENF_LOCKING,
 				 mandlock_after);
 	}
 
@@ -976,7 +975,7 @@ xfs_readlink(
 	int		count;
 	xfs_off_t	offset;
 	int		pathlen;
-	vnode_t		*vp;
+	bhv_vnode_t	*vp;
 	int		error = 0;
 	xfs_mount_t	*mp;
 	int             nmaps;
@@ -1547,7 +1546,7 @@ xfs_release(
 	bhv_desc_t	*bdp)
 {
 	xfs_inode_t	*ip;
-	vnode_t		*vp;
+	bhv_vnode_t	*vp;
 	xfs_mount_t	*mp;
 	int		error;
 
@@ -1600,8 +1599,8 @@ xfs_inactive(
 	cred_t		*credp)
 {
 	xfs_inode_t	*ip;
-	vnode_t		*vp;
-	xfs_bmap_free_t	free_list; 
+	bhv_vnode_t	*vp;
+	xfs_bmap_free_t	free_list;
 	xfs_fsblock_t	first_block;
 	int		committed;
 	xfs_trans_t	*tp;
@@ -1817,16 +1816,16 @@ STATIC int
 xfs_lookup(
 	bhv_desc_t		*dir_bdp,
 	vname_t			*dentry,
-	vnode_t			**vpp,
+	bhv_vnode_t		**vpp,
 	int			flags,
-	vnode_t			*rdir,
+	bhv_vnode_t		*rdir,
 	cred_t			*credp)
 {
 	xfs_inode_t		*dp, *ip;
 	xfs_ino_t		e_inum;
 	int			error;
 	uint			lock_mode;
-	vnode_t			*dir_vp;
+	bhv_vnode_t		*dir_vp;
 
 	dir_vp = BHV_TO_VNODE(dir_bdp);
 	vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
@@ -1855,13 +1854,13 @@ xfs_create(
 	bhv_desc_t		*dir_bdp,
 	vname_t			*dentry,
 	vattr_t			*vap,
-	vnode_t			**vpp,
+	bhv_vnode_t		**vpp,
 	cred_t			*credp)
 {
 	char			*name = VNAME(dentry);
-	vnode_t			*dir_vp;
+	bhv_vnode_t		*dir_vp;
 	xfs_inode_t		*dp, *ip;
-	vnode_t		        *vp=NULL;
+	bhv_vnode_t	        *vp = NULL;
 	xfs_trans_t		*tp;
 	xfs_mount_t	        *mp;
 	xfs_dev_t		rdev;
@@ -2047,7 +2046,7 @@ xfs_create(
 	 * Propagate the fact that the vnode changed after the
 	 * xfs_inode locks have been released.
 	 */
-	VOP_VNODE_CHANGE(vp, VCHANGE_FLAGS_TRUNCATED, 3);
+	bhv_vop_vnode_change(vp, VCHANGE_FLAGS_TRUNCATED, 3);
 
 	*vpp = vp;
 
@@ -2345,7 +2344,7 @@ xfs_remove(
 	vname_t			*dentry,
 	cred_t			*credp)
 {
-	vnode_t			*dir_vp;
+	bhv_vnode_t		*dir_vp;
 	char			*name = VNAME(dentry);
 	xfs_inode_t             *dp, *ip;
 	xfs_trans_t             *tp = NULL;
@@ -2532,7 +2531,7 @@ xfs_remove(
 	/*
 	 * Let interposed file systems know about removed links.
 	 */
-	VOP_LINK_REMOVED(XFS_ITOV(ip), dir_vp, link_zero);
+	bhv_vop_link_removed(XFS_ITOV(ip), dir_vp, link_zero);
 
 	IRELE(ip);
 
@@ -2585,7 +2584,7 @@ xfs_remove(
 STATIC int
 xfs_link(
 	bhv_desc_t		*target_dir_bdp,
-	vnode_t			*src_vp,
+	bhv_vnode_t		*src_vp,
 	vname_t			*dentry,
 	cred_t			*credp)
 {
@@ -2598,7 +2597,7 @@ xfs_link(
 	xfs_fsblock_t           first_block;
 	int			cancel_flags;
 	int			committed;
-	vnode_t			*target_dir_vp;
+	bhv_vnode_t		*target_dir_vp;
 	int			resblks;
 	char			*target_name = VNAME(dentry);
 	int			target_namelen;
@@ -2757,13 +2756,13 @@ xfs_mkdir(
 	bhv_desc_t		*dir_bdp,
 	vname_t			*dentry,
 	vattr_t			*vap,
-	vnode_t			**vpp,
+	bhv_vnode_t		**vpp,
 	cred_t			*credp)
 {
 	char			*dir_name = VNAME(dentry);
 	xfs_inode_t             *dp;
 	xfs_inode_t		*cdp;	/* inode of created dir */
-	vnode_t			*cvp;	/* vnode of created dir */
+	bhv_vnode_t		*cvp;	/* vnode of created dir */
 	xfs_trans_t		*tp;
 	xfs_mount_t		*mp;
 	int			cancel_flags;
@@ -2771,7 +2770,7 @@ xfs_mkdir(
 	int			committed;
 	xfs_bmap_free_t         free_list;
 	xfs_fsblock_t           first_block;
-	vnode_t			*dir_vp;
+	bhv_vnode_t		*dir_vp;
 	boolean_t		dp_joined_to_trans;
 	boolean_t		created = B_FALSE;
 	int			dm_event_sent = 0;
@@ -3003,7 +3002,7 @@ xfs_rmdir(
 	xfs_fsblock_t           first_block;
 	int			cancel_flags;
 	int			committed;
-	vnode_t			*dir_vp;
+	bhv_vnode_t		*dir_vp;
 	int			dm_di_mode = 0;
 	int			last_cdp_link;
 	int			namelen;
@@ -3202,7 +3201,7 @@ xfs_rmdir(
 	/*
 	 * Let interposed file systems know about removed links.
 	 */
-	VOP_LINK_REMOVED(XFS_ITOV(cdp), dir_vp, last_cdp_link);
+	bhv_vop_link_removed(XFS_ITOV(cdp), dir_vp, last_cdp_link);
 
 	IRELE(cdp);
 
@@ -3272,7 +3271,7 @@ xfs_symlink(
 	vname_t			*dentry,
 	vattr_t			*vap,
 	char			*target_path,
-	vnode_t			**vpp,
+	bhv_vnode_t		**vpp,
 	cred_t			*credp)
 {
 	xfs_trans_t		*tp;
@@ -3284,7 +3283,7 @@ xfs_symlink(
 	xfs_bmap_free_t		free_list;
 	xfs_fsblock_t		first_block;
 	boolean_t		dp_joined_to_trans;
-	vnode_t			*dir_vp;
+	bhv_vnode_t		*dir_vp;
 	uint			cancel_flags;
 	int			committed;
 	xfs_fileoff_t		first_fsb;
@@ -3562,7 +3561,7 @@ std_return:
 	}
 
 	if (!error) {
-		vnode_t *vp;
+		bhv_vnode_t *vp;
 
 		ASSERT(ip);
 		vp = XFS_ITOV(ip);
@@ -3630,7 +3629,7 @@ xfs_rwlock(
 	vrwlock_t	locktype)
 {
 	xfs_inode_t	*ip;
-	vnode_t		*vp;
+	bhv_vnode_t	*vp;
 
 	vp = BHV_TO_VNODE(bdp);
 	if (VN_ISDIR(vp))
@@ -3661,7 +3660,7 @@ xfs_rwunlock(
 	vrwlock_t	locktype)
 {
 	xfs_inode_t     *ip;
-	vnode_t		*vp;
+	bhv_vnode_t	*vp;
 
 	vp = BHV_TO_VNODE(bdp);
 	if (VN_ISDIR(vp))
@@ -3810,7 +3809,7 @@ xfs_reclaim(
 	bhv_desc_t	*bdp)
 {
 	xfs_inode_t	*ip;
-	vnode_t		*vp;
+	bhv_vnode_t	*vp;
 
 	vp = BHV_TO_VNODE(bdp);
 	ip = XFS_BHVTOI(bdp);
@@ -3865,7 +3864,7 @@ xfs_finish_reclaim(
 	int		sync_mode)
 {
 	xfs_ihash_t	*ih = ip->i_hash;
-	vnode_t		*vp = XFS_ITOV_NULL(ip);
+	bhv_vnode_t	*vp = XFS_ITOV_NULL(ip);
 	int		error;
 
 	if (vp && VN_BAD(vp))
@@ -4275,7 +4274,7 @@ xfs_free_file_space(
 	xfs_off_t		len,
 	int			attr_flags)
 {
-	vnode_t			*vp;
+	bhv_vnode_t		*vp;
 	int			committed;
 	int			done;
 	xfs_off_t		end_dmi_offset;
@@ -4341,7 +4340,7 @@ xfs_free_file_space(
 	if (VN_CACHED(vp) != 0) {
 		xfs_inval_cached_trace(&ip->i_iocore, ioffset, -1,
 				ctooff(offtoct(ioffset)), -1);
-		VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(ioffset)),
+		bhv_vop_flushinval_pages(vp, ctooff(offtoct(ioffset)),
 				-1, FI_REMAPF_LOCKED);
 	}
 
@@ -4504,7 +4503,7 @@ xfs_change_file_space(
 	xfs_off_t	llen;
 	xfs_trans_t	*tp;
 	vattr_t		va;
-	vnode_t		*vp;
+	bhv_vnode_t	*vp;
 
 	vp = BHV_TO_VNODE(bdp);
 	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
@@ -4657,7 +4656,7 @@ xfs_change_file_space(
 	return error;
 }
 
-vnodeops_t xfs_vnodeops = {
+bhv_vnodeops_t xfs_vnodeops = {
 	BHV_IDENTITY_INIT(VN_BHV_XFS,VNODE_POSITION_XFS),
 	.vop_open		= xfs_open,
 	.vop_close		= xfs_close,
-- 
cgit v1.2.3


From 8285fb58e75bfdb447c7a2c533ec9efdb238f966 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Fri, 9 Jun 2006 17:07:12 +1000
Subject: [XFS] Resolve a namespace collision on remaining vtypes for FreeBSD
 porters.

SGI-PV: 953338
SGI-Modid: xfs-linux-melb:xfs-kern:26108a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/linux-2.6/xfs_ioctl.c |  2 +-
 fs/xfs/linux-2.6/xfs_iops.c  | 16 +++++------
 fs/xfs/linux-2.6/xfs_lrw.c   | 10 +++----
 fs/xfs/linux-2.6/xfs_vfs.c   |  4 +--
 fs/xfs/linux-2.6/xfs_vfs.h   |  8 +++---
 fs/xfs/linux-2.6/xfs_vnode.c |  6 ++---
 fs/xfs/linux-2.6/xfs_vnode.h | 64 ++++++++++++++++++++++----------------------
 fs/xfs/quota/xfs_qm_bhv.c    |  2 +-
 fs/xfs/xfs_acl.c             | 10 +++----
 fs/xfs/xfs_acl.h             |  2 +-
 fs/xfs/xfs_mount.h           |  2 +-
 fs/xfs/xfs_rename.c          |  8 +++---
 fs/xfs/xfs_rw.h              |  7 ++---
 fs/xfs/xfs_utils.c           |  4 +--
 fs/xfs/xfs_utils.h           |  8 +++---
 fs/xfs/xfs_vfsops.c          |  2 +-
 fs/xfs/xfs_vnodeops.c        | 32 +++++++++++-----------
 17 files changed, 94 insertions(+), 93 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 73182503c16..a0e247c7132 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -1157,7 +1157,7 @@ xfs_ioc_xattr(
 	void			__user *arg)
 {
 	struct fsxattr		fa;
-	struct vattr		*vattr;
+	struct bhv_vattr	*vattr;
 	int			error = 0;
 	int			attr_flags;
 	unsigned int		flags;
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 0857658882e..484daef91d7 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -200,7 +200,7 @@ xfs_ichgtime_fast(
 STATIC void
 xfs_validate_fields(
 	struct inode	*ip,
-	struct vattr	*vattr)
+	bhv_vattr_t	*vattr)
 {
 	vattr->va_mask = XFS_AT_NLINK|XFS_AT_SIZE|XFS_AT_NBLOCKS;
 	if (!bhv_vop_getattr(vn_from_inode(ip), vattr, ATTR_LAZY, NULL)) {
@@ -290,7 +290,7 @@ xfs_vn_mknod(
 	dev_t		rdev)
 {
 	struct inode	*ip;
-	vattr_t		vattr = { 0 };
+	bhv_vattr_t	vattr = { 0 };
 	bhv_vnode_t	*vp = NULL, *dvp = vn_from_inode(dir);
 	xfs_acl_t	*default_acl = NULL;
 	attrexists_t	test_default_acl = _ACL_DEFAULT_EXISTS;
@@ -417,7 +417,7 @@ xfs_vn_link(
 	struct inode	*ip;	/* inode of guy being linked to */
 	bhv_vnode_t	*tdvp;	/* target directory for new name/link */
 	bhv_vnode_t	*vp;	/* vp of name being linked */
-	vattr_t		vattr;
+	bhv_vattr_t	vattr;
 	int		error;
 
 	ip = old_dentry->d_inode;	/* inode being linked to */
@@ -444,7 +444,7 @@ xfs_vn_unlink(
 {
 	struct inode	*inode;
 	bhv_vnode_t	*dvp;	/* directory containing name to remove */
-	vattr_t		vattr;
+	bhv_vattr_t	vattr;
 	int		error;
 
 	inode = dentry->d_inode;
@@ -465,7 +465,7 @@ xfs_vn_symlink(
 	const char	*symname)
 {
 	struct inode	*ip;
-	vattr_t		va = { 0 };
+	bhv_vattr_t	va = { 0 };
 	bhv_vnode_t	*dvp;	/* directory containing name of symlink */
 	bhv_vnode_t	*cvp;	/* used to lookup symlink to put in dentry */
 	int		error;
@@ -499,7 +499,7 @@ xfs_vn_rmdir(
 {
 	struct inode	*inode = dentry->d_inode;
 	bhv_vnode_t	*dvp = vn_from_inode(dir);
-	vattr_t		vattr;
+	bhv_vattr_t	vattr;
 	int		error;
 
 	error = bhv_vop_rmdir(dvp, dentry, NULL);
@@ -520,7 +520,7 @@ xfs_vn_rename(
 	struct inode	*new_inode = ndentry->d_inode;
 	bhv_vnode_t	*fvp;	/* from directory */
 	bhv_vnode_t	*tvp;	/* target directory */
-	vattr_t		vattr;
+	bhv_vattr_t	vattr;
 	int		error;
 
 	fvp = vn_from_inode(odir);
@@ -643,7 +643,7 @@ xfs_vn_setattr(
 	struct inode	*inode = dentry->d_inode;
 	unsigned int	ia_valid = attr->ia_valid;
 	bhv_vnode_t	*vp = vn_from_inode(inode);
-	vattr_t		vattr = { 0 };
+	bhv_vattr_t	vattr = { 0 };
 	int		flags = 0;
 	int		error;
 
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index a9b83018f0f..8696096f8d2 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -258,7 +258,7 @@ xfs_read(
 
 	if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) &&
 	    !(ioflags & IO_INVIS)) {
-		vrwlock_t locktype = VRWLOCK_READ;
+		bhv_vrwlock_t locktype = VRWLOCK_READ;
 		int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
 
 		ret = -XFS_SEND_DATA(mp, DM_EVENT_READ,
@@ -313,7 +313,7 @@ xfs_sendfile(
 
 	if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_READ) &&
 	    (!(ioflags & IO_INVIS))) {
-		vrwlock_t locktype = VRWLOCK_READ;
+		bhv_vrwlock_t locktype = VRWLOCK_READ;
 		int error;
 
 		error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp),
@@ -357,7 +357,7 @@ xfs_splice_read(
 
 	if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_READ) &&
 	    (!(ioflags & IO_INVIS))) {
-		vrwlock_t locktype = VRWLOCK_READ;
+		bhv_vrwlock_t locktype = VRWLOCK_READ;
 		int error;
 
 		error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp),
@@ -401,7 +401,7 @@ xfs_splice_write(
 
 	if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_WRITE) &&
 	    (!(ioflags & IO_INVIS))) {
-		vrwlock_t locktype = VRWLOCK_WRITE;
+		bhv_vrwlock_t locktype = VRWLOCK_WRITE;
 		int error;
 
 		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, BHV_TO_VNODE(bdp),
@@ -630,7 +630,7 @@ xfs_write(
 	unsigned long		seg;
 	int			iolock;
 	int			eventsent = 0;
-	vrwlock_t		locktype;
+	bhv_vrwlock_t		locktype;
 	size_t			ocount = 0, count;
 	loff_t			pos;
 	int			need_i_mutex = 1, need_flush = 0;
diff --git a/fs/xfs/linux-2.6/xfs_vfs.c b/fs/xfs/linux-2.6/xfs_vfs.c
index a91ecfa9c8a..4fc884bcb4f 100644
--- a/fs/xfs/linux-2.6/xfs_vfs.c
+++ b/fs/xfs/linux-2.6/xfs_vfs.c
@@ -117,7 +117,7 @@ vfs_root(
 int
 vfs_statvfs(
 	struct bhv_desc		*bdp,
-	xfs_statfs_t		*sp,
+	bhv_statvfs_t		*statp,
 	struct bhv_vnode	*vp)
 {
 	struct bhv_desc		*next = bdp;
@@ -125,7 +125,7 @@ vfs_statvfs(
 	ASSERT(next);
 	while (! (bhvtovfsops(next))->vfs_statvfs)
 		next = BHV_NEXT(next);
-	return ((*bhvtovfsops(next)->vfs_statvfs)(next, sp, vp));
+	return ((*bhvtovfsops(next)->vfs_statvfs)(next, statp, vp));
 }
 
 int
diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h
index 1a3a2dd4b97..7b88eeae0f2 100644
--- a/fs/xfs/linux-2.6/xfs_vfs.h
+++ b/fs/xfs/linux-2.6/xfs_vfs.h
@@ -23,14 +23,14 @@
 
 struct bhv_vfs;
 struct bhv_vnode;
+
 struct fid;
 struct cred;
-struct statfs;
 struct seq_file;
 struct super_block;
 struct xfs_mount_args;
 
-typedef struct kstatfs xfs_statfs_t;
+typedef struct kstatfs	bhv_statvfs_t;
 
 typedef struct bhv_vfs_sync_work {
 	struct list_head	w_list;
@@ -109,7 +109,7 @@ typedef int	(*vfs_unmount_t)(bhv_desc_t *, int, struct cred *);
 typedef int	(*vfs_mntupdate_t)(bhv_desc_t *, int *,
 				struct xfs_mount_args *);
 typedef int	(*vfs_root_t)(bhv_desc_t *, struct bhv_vnode **);
-typedef int	(*vfs_statvfs_t)(bhv_desc_t *, xfs_statfs_t *,
+typedef int	(*vfs_statvfs_t)(bhv_desc_t *, bhv_statvfs_t *,
 				struct bhv_vnode *);
 typedef int	(*vfs_sync_t)(bhv_desc_t *, int, struct cred *);
 typedef int	(*vfs_vget_t)(bhv_desc_t *, struct bhv_vnode **, struct fid *);
@@ -181,7 +181,7 @@ extern int vfs_showargs(bhv_desc_t *, struct seq_file *);
 extern int vfs_unmount(bhv_desc_t *, int, struct cred *);
 extern int vfs_mntupdate(bhv_desc_t *, int *, struct xfs_mount_args *);
 extern int vfs_root(bhv_desc_t *, struct bhv_vnode **);
-extern int vfs_statvfs(bhv_desc_t *, xfs_statfs_t *, struct bhv_vnode *);
+extern int vfs_statvfs(bhv_desc_t *, bhv_statvfs_t *, struct bhv_vnode *);
 extern int vfs_sync(bhv_desc_t *, int, struct cred *);
 extern int vfs_vget(bhv_desc_t *, struct bhv_vnode **, struct fid *);
 extern int vfs_dmapiops(bhv_desc_t *, caddr_t);
diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c
index 66e36e195f2..6628d96b6fd 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.c
+++ b/fs/xfs/linux-2.6/xfs_vnode.c
@@ -111,7 +111,7 @@ vn_initialize(
 void
 vn_revalidate_core(
 	bhv_vnode_t	*vp,
-	vattr_t		*vap)
+	bhv_vattr_t	*vap)
 {
 	struct inode	*inode = vn_to_inode(vp);
 
@@ -147,7 +147,7 @@ vn_revalidate_core(
 int
 __vn_revalidate(
 	bhv_vnode_t	*vp,
-	struct vattr	*vattr)
+	bhv_vattr_t	*vattr)
 {
 	int		error;
 
@@ -165,7 +165,7 @@ int
 vn_revalidate(
 	bhv_vnode_t	*vp)
 {
-	vattr_t		vattr;
+	bhv_vattr_t	vattr;
 
 	return __vn_revalidate(vp, &vattr);
 }
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index cb16774aea7..35c6a01963a 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -20,29 +20,29 @@
 
 struct uio;
 struct file;
-struct vattr;
+struct bhv_vfs;
+struct bhv_vattr;
 struct xfs_iomap;
 struct attrlist_cursor_kern;
 
-typedef xfs_ino_t vnumber_t;
-typedef struct dentry vname_t;
-typedef bhv_head_t vn_bhv_head_t;
+typedef struct dentry	bhv_vname_t;
+typedef __u64		bhv_vnumber_t;
 
-typedef enum vflags {
+typedef enum bhv_vflags {
 	VMODIFIED	= 0x08,	/* XFS inode state possibly differs */
 				/* to the Linux inode state. */
 	VTRUNCATED	= 0x40,	/* truncated down so flush-on-close */
-} vflags_t;
+} bhv_vflags_t;
 
 /*
  * MP locking protocols:
  *	v_flag, v_vfsp				VN_LOCK/VN_UNLOCK
  */
 typedef struct bhv_vnode {
-	vflags_t	v_flag;			/* vnode flags (see above) */
-	struct bhv_vfs	*v_vfsp;		/* ptr to containing VFS */
-	vnumber_t	v_number;		/* in-core vnode number */
-	vn_bhv_head_t	v_bh;			/* behavior head */
+	bhv_vflags_t	v_flag;			/* vnode flags (see above) */
+	bhv_vfs_t	*v_vfsp;		/* ptr to containing VFS */
+	bhv_vnumber_t	v_number;		/* in-core vnode number */
+	bhv_head_t	v_bh;			/* behavior head */
 	spinlock_t	v_lock;			/* VN_LOCK/VN_UNLOCK */
 	atomic_t	v_iocount;		/* outstanding I/O count */
 #ifdef XFS_VNODE_TRACE
@@ -103,14 +103,14 @@ static inline struct inode *vn_to_inode(struct bhv_vnode *vnode)
 /*
  * Values for the vop_rwlock/rwunlock flags parameter.
  */
-typedef enum vrwlock {
+typedef enum bhv_vrwlock {
 	VRWLOCK_NONE,
 	VRWLOCK_READ,
 	VRWLOCK_WRITE,
 	VRWLOCK_WRITE_DIRECT,
 	VRWLOCK_TRY_READ,
 	VRWLOCK_TRY_WRITE
-} vrwlock_t;
+} bhv_vrwlock_t;
 
 /*
  * Return values for bhv_vop_inactive.  A return value of
@@ -123,13 +123,13 @@ typedef enum vrwlock {
 /*
  * Values for the cmd code given to vop_vnode_change.
  */
-typedef enum vchange {
+typedef enum bhv_vchange {
 	VCHANGE_FLAGS_FRLOCKS		= 0,
 	VCHANGE_FLAGS_ENF_LOCKING	= 1,
 	VCHANGE_FLAGS_TRUNCATED		= 2,
 	VCHANGE_FLAGS_PAGE_DIRTY	= 3,
 	VCHANGE_FLAGS_IOEXCL_COUNT	= 4
-} vchange_t;
+} bhv_vchange_t;
 
 typedef enum { L_FALSE, L_TRUE } lastclose_t;
 
@@ -152,26 +152,26 @@ typedef ssize_t (*vop_splice_write_t)(bhv_desc_t *, struct pipe_inode_info *,
 				struct cred *);
 typedef int	(*vop_ioctl_t)(bhv_desc_t *, struct inode *, struct file *,
 				int, unsigned int, void __user *);
-typedef int	(*vop_getattr_t)(bhv_desc_t *, struct vattr *, int,
+typedef int	(*vop_getattr_t)(bhv_desc_t *, struct bhv_vattr *, int,
 				struct cred *);
-typedef int	(*vop_setattr_t)(bhv_desc_t *, struct vattr *, int,
+typedef int	(*vop_setattr_t)(bhv_desc_t *, struct bhv_vattr *, int,
 				struct cred *);
 typedef int	(*vop_access_t)(bhv_desc_t *, int, struct cred *);
-typedef int	(*vop_lookup_t)(bhv_desc_t *, vname_t *, bhv_vnode_t **,
+typedef int	(*vop_lookup_t)(bhv_desc_t *, bhv_vname_t *, bhv_vnode_t **,
 				int, bhv_vnode_t *, struct cred *);
-typedef int	(*vop_create_t)(bhv_desc_t *, vname_t *, struct vattr *,
+typedef int	(*vop_create_t)(bhv_desc_t *, bhv_vname_t *, struct bhv_vattr *,
 				bhv_vnode_t **, struct cred *);
-typedef int	(*vop_remove_t)(bhv_desc_t *, vname_t *, struct cred *);
-typedef int	(*vop_link_t)(bhv_desc_t *, bhv_vnode_t *, vname_t *,
+typedef int	(*vop_remove_t)(bhv_desc_t *, bhv_vname_t *, struct cred *);
+typedef int	(*vop_link_t)(bhv_desc_t *, bhv_vnode_t *, bhv_vname_t *,
 				struct cred *);
-typedef int	(*vop_rename_t)(bhv_desc_t *, vname_t *, bhv_vnode_t *,
-				vname_t *, struct cred *);
-typedef int	(*vop_mkdir_t)(bhv_desc_t *, vname_t *, struct vattr *,
+typedef int	(*vop_rename_t)(bhv_desc_t *, bhv_vname_t *, bhv_vnode_t *,
+				bhv_vname_t *, struct cred *);
+typedef int	(*vop_mkdir_t)(bhv_desc_t *, bhv_vname_t *, struct bhv_vattr *,
 				bhv_vnode_t **, struct cred *);
-typedef int	(*vop_rmdir_t)(bhv_desc_t *, vname_t *, struct cred *);
+typedef int	(*vop_rmdir_t)(bhv_desc_t *, bhv_vname_t *, struct cred *);
 typedef int	(*vop_readdir_t)(bhv_desc_t *, struct uio *, struct cred *,
 				int *);
-typedef int	(*vop_symlink_t)(bhv_desc_t *, vname_t *, struct vattr *,
+typedef int	(*vop_symlink_t)(bhv_desc_t *, bhv_vname_t *, struct bhv_vattr*,
 				char *, bhv_vnode_t **, struct cred *);
 typedef int	(*vop_readlink_t)(bhv_desc_t *, struct uio *, int,
 				struct cred *);
@@ -180,8 +180,8 @@ typedef int	(*vop_fsync_t)(bhv_desc_t *, int, struct cred *,
 typedef int	(*vop_inactive_t)(bhv_desc_t *, struct cred *);
 typedef int	(*vop_fid2_t)(bhv_desc_t *, struct fid *);
 typedef int	(*vop_release_t)(bhv_desc_t *);
-typedef int	(*vop_rwlock_t)(bhv_desc_t *, vrwlock_t);
-typedef void	(*vop_rwunlock_t)(bhv_desc_t *, vrwlock_t);
+typedef int	(*vop_rwlock_t)(bhv_desc_t *, bhv_vrwlock_t);
+typedef void	(*vop_rwunlock_t)(bhv_desc_t *, bhv_vrwlock_t);
 typedef int	(*vop_bmap_t)(bhv_desc_t *, xfs_off_t, ssize_t, int,
 				struct xfs_iomap *, int *);
 typedef int	(*vop_reclaim_t)(bhv_desc_t *);
@@ -194,7 +194,7 @@ typedef	int	(*vop_attr_remove_t)(bhv_desc_t *, const char *,
 typedef	int	(*vop_attr_list_t)(bhv_desc_t *, char *, int, int,
 				struct attrlist_cursor_kern *, struct cred *);
 typedef void	(*vop_link_removed_t)(bhv_desc_t *, bhv_vnode_t *, int);
-typedef void	(*vop_vnode_change_t)(bhv_desc_t *, vchange_t, __psint_t);
+typedef void	(*vop_vnode_change_t)(bhv_desc_t *, bhv_vchange_t, __psint_t);
 typedef void	(*vop_ptossvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
 typedef void	(*vop_pflushinvalvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
 typedef int	(*vop_pflushvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t,
@@ -346,7 +346,7 @@ typedef struct bhv_vnodeops {
  * Vnode attributes.  va_mask indicates those attributes the caller
  * wants to set or extract.
  */
-typedef struct vattr {
+typedef struct bhv_vattr {
 	int		va_mask;	/* bit-mask of attributes present */
 	mode_t		va_mode;	/* file access mode and type */
 	xfs_nlink_t	va_nlink;	/* number of references to file */
@@ -366,7 +366,7 @@ typedef struct vattr {
 	u_long		va_nextents;	/* number of extents in file */
 	u_long		va_anextents;	/* number of attr extents in file */
 	prid_t		va_projid;	/* project id */
-} vattr_t;
+} bhv_vattr_t;
 
 /*
  * setattr or getattr attributes
@@ -442,8 +442,8 @@ typedef struct vattr {
 extern void	vn_init(void);
 extern bhv_vnode_t	*vn_initialize(struct inode *);
 extern int	vn_revalidate(struct bhv_vnode *);
-extern int	__vn_revalidate(struct bhv_vnode *, vattr_t *);
-extern void	vn_revalidate_core(struct bhv_vnode *, vattr_t *);
+extern int	__vn_revalidate(struct bhv_vnode *, bhv_vattr_t *);
+extern void	vn_revalidate_core(struct bhv_vnode *, bhv_vattr_t *);
 
 extern void	vn_iowait(struct bhv_vnode *vp);
 extern void	vn_iowake(struct bhv_vnode *vp);
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index ad9ab104946..d93d3a1064e 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -191,7 +191,7 @@ xfs_qm_mount(
 STATIC int
 xfs_qm_statvfs(
 	struct bhv_desc		*bhv,
-	xfs_statfs_t		*statp,
+	bhv_statvfs_t		*statp,
 	struct bhv_vnode	*vnode)
 {
 	xfs_mount_t		*mp;
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 8c300da4e7a..e1074955386 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -241,7 +241,7 @@ xfs_acl_vget(
 			goto out;
 		}
 		if (kind == _ACL_TYPE_ACCESS) {
-			vattr_t	va;
+			bhv_vattr_t	va;
 
 			va.va_mask = XFS_AT_MODE;
 			error = bhv_vop_getattr(vp, &va, 0, sys_cred);
@@ -373,7 +373,7 @@ xfs_acl_allow_set(
 	bhv_vnode_t	*vp,
 	int		kind)
 {
-	vattr_t		va;
+	bhv_vattr_t	va;
 	int		error;
 
 	if (vp->v_inode.i_flags & (S_IMMUTABLE|S_APPEND))
@@ -666,7 +666,7 @@ xfs_acl_vtoacl(
 	xfs_acl_t	*access_acl,
 	xfs_acl_t	*default_acl)
 {
-	vattr_t		va;
+	bhv_vattr_t	va;
 	int		error = 0;
 
 	if (access_acl) {
@@ -702,7 +702,7 @@ xfs_acl_vtoacl(
 int
 xfs_acl_inherit(
 	bhv_vnode_t	*vp,
-	vattr_t		*vap,
+	bhv_vattr_t	*vap,
 	xfs_acl_t	*pdaclp)
 {
 	xfs_acl_t	*cacl;
@@ -761,7 +761,7 @@ xfs_acl_setmode(
 	xfs_acl_t	*acl,
 	int		*basicperms)
 {
-	vattr_t		va;
+	bhv_vattr_t	va;
 	xfs_acl_entry_t	*ap;
 	xfs_acl_entry_t	*gap = NULL;
 	int		i, error, nomask = 1;
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index a57ff4e6d39..f853cf1a627 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -58,7 +58,7 @@ extern struct kmem_zone *xfs_acl_zone;
 		(zone) = kmem_zone_init(sizeof(xfs_acl_t), (name))
 #define xfs_acl_zone_destroy(zone)	kmem_zone_destroy(zone)
 
-extern int xfs_acl_inherit(struct bhv_vnode *, struct vattr *, xfs_acl_t *);
+extern int xfs_acl_inherit(struct bhv_vnode *, struct bhv_vattr *, xfs_acl_t *);
 extern int xfs_acl_iaccess(struct xfs_inode *, mode_t, cred_t *);
 extern int xfs_acl_vtoacl(struct bhv_vnode *, xfs_acl_t *, xfs_acl_t *);
 extern int xfs_acl_vhasacl_access(struct bhv_vnode *);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a7099906369..761f42f989c 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -81,7 +81,7 @@ extern struct bhv_vnodeops xfs_vnodeops;
  */
 
 typedef int	(*xfs_send_data_t)(int, struct bhv_vnode *,
-			xfs_off_t, size_t, int, vrwlock_t *);
+			xfs_off_t, size_t, int, bhv_vrwlock_t *);
 typedef int	(*xfs_send_mmap_t)(struct vm_area_struct *, uint);
 typedef int	(*xfs_send_destroy_t)(struct bhv_vnode *, dm_right_t);
 typedef int	(*xfs_send_namesp_t)(dm_eventtype_t, struct bhv_vfs *,
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index a20566b8912..7d5f9b6ffdb 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -87,8 +87,8 @@ STATIC int
 xfs_lock_for_rename(
 	xfs_inode_t	*dp1,	/* old (source) directory inode */
 	xfs_inode_t	*dp2,	/* new (target) directory inode */
-	vname_t		*vname1,/* old entry name */
-	vname_t		*vname2,/* new entry name */
+	bhv_vname_t	*vname1,/* old entry name */
+	bhv_vname_t	*vname2,/* new entry name */
 	xfs_inode_t	**ipp1,	/* inode of old entry */
 	xfs_inode_t	**ipp2,	/* inode of new entry, if it
 				   already exists, NULL otherwise. */
@@ -225,9 +225,9 @@ xfs_lock_for_rename(
 int
 xfs_rename(
 	bhv_desc_t	*src_dir_bdp,
-	vname_t		*src_vname,
+	bhv_vname_t	*src_vname,
 	bhv_vnode_t	*target_dir_vp,
-	vname_t		*target_vname,
+	bhv_vname_t	*target_vname,
 	cred_t		*credp)
 {
 	xfs_trans_t	*tp;
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index e6379564447..a572b175dc8 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -87,9 +87,10 @@ extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp,
 /*
  * Prototypes for functions in xfs_vnodeops.c.
  */
-extern int xfs_rwlock(bhv_desc_t *bdp, vrwlock_t write_lock);
-extern void xfs_rwunlock(bhv_desc_t *bdp, vrwlock_t write_lock);
-extern int xfs_setattr(bhv_desc_t *bdp, vattr_t *vap, int flags, cred_t *credp);
+extern int xfs_rwlock(bhv_desc_t *bdp, bhv_vrwlock_t write_lock);
+extern void xfs_rwunlock(bhv_desc_t *bdp, bhv_vrwlock_t write_lock);
+extern int xfs_setattr(bhv_desc_t *, bhv_vattr_t *vap, int flags,
+		       cred_t *credp);
 extern int xfs_change_file_space(bhv_desc_t *bdp, int cmd, xfs_flock64_t *bf,
 				 xfs_off_t offset, cred_t *credp, int flags);
 extern int xfs_set_dmattrs(bhv_desc_t *bdp, u_int evmask, u_int16_t state,
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 6ecc0ca5868..37fdc2dc00b 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -51,7 +51,7 @@
  */
 int
 xfs_get_dir_entry(
-	vname_t		*dentry,
+	bhv_vname_t	*dentry,
 	xfs_inode_t	**ipp)
 {
 	bhv_vnode_t	*vp;
@@ -69,7 +69,7 @@ int
 xfs_dir_lookup_int(
 	bhv_desc_t	*dir_bdp,
 	uint		lock_mode,
-	vname_t		*dentry,
+	bhv_vname_t	*dentry,
 	xfs_ino_t	*inum,
 	xfs_inode_t	**ipp)
 {
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index cecf1031059..fe953e98afa 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -23,10 +23,10 @@
 #define	ITRACE(ip)	vn_trace_ref(XFS_ITOV(ip), __FILE__, __LINE__, \
 				(inst_t *)__return_address)
 
-extern int xfs_rename (bhv_desc_t *, vname_t *, bhv_vnode_t *,
-			vname_t *, cred_t *);
-extern int xfs_get_dir_entry (vname_t *, xfs_inode_t **);
-extern int xfs_dir_lookup_int (bhv_desc_t *, uint, vname_t *, xfs_ino_t *,
+extern int xfs_rename (bhv_desc_t *, bhv_vname_t *, bhv_vnode_t *,
+			bhv_vname_t *, cred_t *);
+extern int xfs_get_dir_entry (bhv_vname_t *, xfs_inode_t **);
+extern int xfs_dir_lookup_int (bhv_desc_t *, uint, bhv_vname_t *, xfs_ino_t *,
 				xfs_inode_t **);
 extern int xfs_truncate_file (xfs_mount_t *, xfs_inode_t *);
 extern int xfs_dir_ialloc (xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index d3f270a62c9..7550583bd09 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -800,7 +800,7 @@ xfs_root(
 STATIC int
 xfs_statvfs(
 	bhv_desc_t	*bdp,
-	xfs_statfs_t	*statp,
+	bhv_statvfs_t	*statp,
 	bhv_vnode_t	*vp)
 {
 	__uint64_t	fakeinos;
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index f3c2deeed0a..26d96d1b25c 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -117,7 +117,7 @@ xfs_close(
 STATIC int
 xfs_getattr(
 	bhv_desc_t	*bdp,
-	vattr_t		*vap,
+	bhv_vattr_t	*vap,
 	int		flags,
 	cred_t		*credp)
 {
@@ -250,7 +250,7 @@ xfs_getattr(
 int
 xfs_setattr(
 	bhv_desc_t		*bdp,
-	vattr_t			*vap,
+	bhv_vattr_t		*vap,
 	int			flags,
 	cred_t			*credp)
 {
@@ -1815,7 +1815,7 @@ xfs_inactive(
 STATIC int
 xfs_lookup(
 	bhv_desc_t		*dir_bdp,
-	vname_t			*dentry,
+	bhv_vname_t		*dentry,
 	bhv_vnode_t		**vpp,
 	int			flags,
 	bhv_vnode_t		*rdir,
@@ -1852,8 +1852,8 @@ xfs_lookup(
 STATIC int
 xfs_create(
 	bhv_desc_t		*dir_bdp,
-	vname_t			*dentry,
-	vattr_t			*vap,
+	bhv_vname_t		*dentry,
+	bhv_vattr_t		*vap,
 	bhv_vnode_t		**vpp,
 	cred_t			*credp)
 {
@@ -2127,7 +2127,7 @@ int xfs_rm_attempts;
 STATIC int
 xfs_lock_dir_and_entry(
 	xfs_inode_t	*dp,
-	vname_t		*dentry,
+	bhv_vname_t	*dentry,
 	xfs_inode_t	*ip)	/* inode of entry 'name' */
 {
 	int		attempts;
@@ -2341,7 +2341,7 @@ int remove_which_error_return = 0;
 STATIC int
 xfs_remove(
 	bhv_desc_t		*dir_bdp,
-	vname_t			*dentry,
+	bhv_vname_t		*dentry,
 	cred_t			*credp)
 {
 	bhv_vnode_t		*dir_vp;
@@ -2585,7 +2585,7 @@ STATIC int
 xfs_link(
 	bhv_desc_t		*target_dir_bdp,
 	bhv_vnode_t		*src_vp,
-	vname_t			*dentry,
+	bhv_vname_t		*dentry,
 	cred_t			*credp)
 {
 	xfs_inode_t		*tdp, *sip;
@@ -2754,8 +2754,8 @@ std_return:
 STATIC int
 xfs_mkdir(
 	bhv_desc_t		*dir_bdp,
-	vname_t			*dentry,
-	vattr_t			*vap,
+	bhv_vname_t		*dentry,
+	bhv_vattr_t		*vap,
 	bhv_vnode_t		**vpp,
 	cred_t			*credp)
 {
@@ -2989,7 +2989,7 @@ std_return:
 STATIC int
 xfs_rmdir(
 	bhv_desc_t		*dir_bdp,
-	vname_t			*dentry,
+	bhv_vname_t		*dentry,
 	cred_t			*credp)
 {
 	char			*name = VNAME(dentry);
@@ -3268,8 +3268,8 @@ xfs_readdir(
 STATIC int
 xfs_symlink(
 	bhv_desc_t		*dir_bdp,
-	vname_t			*dentry,
-	vattr_t			*vap,
+	bhv_vname_t		*dentry,
+	bhv_vattr_t		*vap,
 	char			*target_path,
 	bhv_vnode_t		**vpp,
 	cred_t			*credp)
@@ -3626,7 +3626,7 @@ xfs_fid2(
 int
 xfs_rwlock(
 	bhv_desc_t	*bdp,
-	vrwlock_t	locktype)
+	bhv_vrwlock_t	locktype)
 {
 	xfs_inode_t	*ip;
 	bhv_vnode_t	*vp;
@@ -3657,7 +3657,7 @@ xfs_rwlock(
 void
 xfs_rwunlock(
 	bhv_desc_t	*bdp,
-	vrwlock_t	locktype)
+	bhv_vrwlock_t	locktype)
 {
 	xfs_inode_t     *ip;
 	bhv_vnode_t	*vp;
@@ -4502,7 +4502,7 @@ xfs_change_file_space(
 	xfs_off_t	startoffset;
 	xfs_off_t	llen;
 	xfs_trans_t	*tp;
-	vattr_t		va;
+	bhv_vattr_t	va;
 	bhv_vnode_t	*vp;
 
 	vp = BHV_TO_VNODE(bdp);
-- 
cgit v1.2.3


From 34327e138481137a81a2e33060b8eb0944013801 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Fri, 9 Jun 2006 17:11:55 +1000
Subject: [XFS] Cleanup a missed porting conversion, and freezing.

SGI-PV: 953338
SGI-Modid: xfs-linux-melb:xfs-kern:26109a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/linux-2.6/xfs_lrw.c |  4 ++--
 fs/xfs/linux-2.6/xfs_vfs.h |  7 +++----
 fs/xfs/xfs_log.c           |  2 +-
 fs/xfs/xfs_trans.c         | 17 +++++------------
 4 files changed, 11 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 8696096f8d2..3e76c5c9dda 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -668,11 +668,11 @@ xfs_write(
 	io = &xip->i_iocore;
 	mp = io->io_mount;
 
+	vfs_wait_for_freeze(vp->v_vfsp, SB_FREEZE_WRITE);
+
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	fs_check_frozen(vp->v_vfsp, SB_FREEZE_WRITE);
-
 	if (ioflags & IO_ISDIRECT) {
 		xfs_buftarg_t	*target =
 			(xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h
index 7b88eeae0f2..91fc2c4b335 100644
--- a/fs/xfs/linux-2.6/xfs_vfs.h
+++ b/fs/xfs/linux-2.6/xfs_vfs.h
@@ -190,6 +190,9 @@ extern void vfs_init_vnode(bhv_desc_t *, struct bhv_vnode *, bhv_desc_t *, int);
 extern void vfs_force_shutdown(bhv_desc_t *, int, char *, int);
 extern void vfs_freeze(bhv_desc_t *);
 
+#define vfs_test_for_freeze(vfs)	((vfs)->vfs_super->s_frozen)
+#define vfs_wait_for_freeze(vfs,l)	vfs_check_frozen((vfs)->vfs_super, (l))
+ 
 typedef struct bhv_module_vfsops {
 	struct bhv_vfsops	bhv_common;
 	void *			bhv_custom;
@@ -211,8 +214,4 @@ extern void bhv_insert_all_vfsops(struct bhv_vfs *);
 extern void bhv_remove_all_vfsops(struct bhv_vfs *, int);
 extern void bhv_remove_vfsops(struct bhv_vfs *, int);
 
-#define fs_frozen(vfsp)		((vfsp)->vfs_super->s_frozen)
-#define fs_check_frozen(vfsp, level) \
-	vfs_check_frozen(vfsp->vfs_super, level);
-
 #endif	/* __XFS_VFS_H__ */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 9f70ab3e098..95d679cd4e7 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -817,7 +817,7 @@ xfs_log_need_covered(xfs_mount_t *mp)
 	xlog_t		*log = mp->m_log;
 	bhv_vfs_t	*vfsp = XFS_MTOVFS(mp);
 
-	if (fs_frozen(vfsp) || XFS_FORCED_SHUTDOWN(mp) ||
+	if (vfs_test_for_freeze(vfsp) || XFS_FORCED_SHUTDOWN(mp) ||
 	    (vfsp->vfs_flag & VFS_RDONLY))
 		return 0;
 
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 7a99ed3b187..39f0b1ed322 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -236,11 +236,8 @@ xfs_trans_alloc(
 	xfs_mount_t	*mp,
 	uint		type)
 {
-	fs_check_frozen(XFS_MTOVFS(mp), SB_FREEZE_TRANS);
-	atomic_inc(&mp->m_active_trans);
-
-	return (_xfs_trans_alloc(mp, type));
-
+	vfs_wait_for_freeze(XFS_MTOVFS(mp), SB_FREEZE_TRANS);
+	return _xfs_trans_alloc(mp, type);
 }
 
 xfs_trans_t *
@@ -250,12 +247,9 @@ _xfs_trans_alloc(
 {
 	xfs_trans_t	*tp;
 
-	ASSERT(xfs_trans_zone != NULL);
-	tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
+	atomic_inc(&mp->m_active_trans);
 
-	/*
-	 * Initialize the transaction structure.
-	 */
+	tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
 	tp->t_magic = XFS_TRANS_MAGIC;
 	tp->t_type = type;
 	tp->t_mountp = mp;
@@ -263,8 +257,7 @@ _xfs_trans_alloc(
 	tp->t_busy_free = XFS_LBC_NUM_SLOTS;
 	XFS_LIC_INIT(&(tp->t_items));
 	XFS_LBC_INIT(&(tp->t_busy));
-
-	return (tp);
+	return tp;
 }
 
 /*
-- 
cgit v1.2.3


From a916e2bd15b7572d9e791ae2a9333f74175470cd Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Fri, 9 Jun 2006 17:12:17 +1000
Subject: [XFS] Remove unused parameter from di2xflags routine.

SGI-PV: 904192
SGI-Modid: xfs-linux-melb:xfs-kern:26110a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/xfs_inode.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 848783ba565..926d372ae0f 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -783,7 +783,6 @@ xfs_xlate_dinode_core(
 
 STATIC uint
 _xfs_dic2xflags(
-	xfs_dinode_core_t	*dic,
 	__uint16_t		di_flags)
 {
 	uint			flags = 0;
@@ -826,16 +825,16 @@ xfs_ip2xflags(
 {
 	xfs_dinode_core_t	*dic = &ip->i_d;
 
-	return _xfs_dic2xflags(dic, dic->di_flags) |
-		(XFS_CFORK_Q(dic) ? XFS_XFLAG_HASATTR : 0);
+	return _xfs_dic2xflags(dic->di_flags) |
+				(XFS_CFORK_Q(dic) ? XFS_XFLAG_HASATTR : 0);
 }
 
 uint
 xfs_dic2xflags(
 	xfs_dinode_core_t	*dic)
 {
-	return _xfs_dic2xflags(dic, INT_GET(dic->di_flags, ARCH_CONVERT)) |
-		(XFS_CFORK_Q_DISK(dic) ? XFS_XFLAG_HASATTR : 0);
+	return _xfs_dic2xflags(INT_GET(dic->di_flags, ARCH_CONVERT)) |
+				(XFS_CFORK_Q_DISK(dic) ? XFS_XFLAG_HASATTR : 0);
 }
 
 /*
-- 
cgit v1.2.3


From 4d1a2ed3d8d6e306d20f5d99a5ae12ac4c8b787b Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Fri, 9 Jun 2006 17:12:28 +1000
Subject: [XFS] Fix up debug code so that bulkstat wont generate thousands of
 fsstress warnings.

SGI-PV: 904196
SGI-Modid: xfs-linux-melb:xfs-kern:26111a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/xfs_ialloc.c |  3 +++
 fs/xfs/xfs_inode.c  | 22 ++++++++++++----------
 2 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index deddbd03c16..7e5ccfec92b 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -1174,6 +1174,9 @@ xfs_dilocate(
 	if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks ||
 	    ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
 #ifdef DEBUG
+		/* no diagnostics for bulkstat, ino comes from userspace */
+		if (flags & XFS_IMAP_BULKSTAT)
+			return XFS_ERROR(EINVAL);
 		if (agno >= mp->m_sb.sb_agcount) {
 			xfs_fs_cmn_err(CE_ALERT, mp,
 					"xfs_dilocate: agno (%d) >= "
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 926d372ae0f..9af9e32a6a1 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -256,13 +256,11 @@ xfs_itobp(
 	xfs_daddr_t	bno,
 	uint		imap_flags)
 {
+	xfs_imap_t	imap;
 	xfs_buf_t	*bp;
 	int		error;
-	xfs_imap_t	imap;
-#ifdef __KERNEL__
 	int		i;
 	int		ni;
-#endif
 
 	if (ip->i_blkno == (xfs_daddr_t)0) {
 		/*
@@ -319,7 +317,6 @@ xfs_itobp(
 	 */
 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
 				   (int)imap.im_len, XFS_BUF_LOCK, &bp);
-
 	if (error) {
 #ifdef DEBUG
 		xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
@@ -330,17 +327,21 @@ xfs_itobp(
 #endif /* DEBUG */
 		return error;
 	}
-#ifdef __KERNEL__
+
 	/*
 	 * Validate the magic number and version of every inode in the buffer
 	 * (if DEBUG kernel) or the first inode in the buffer, otherwise.
+	 * No validation is done here in userspace (xfs_repair).
 	 */
-#ifdef DEBUG
+#if !defined(__KERNEL__)
+	ni = 0;
+#elif defined(DEBUG)
 	ni = (imap_flags & XFS_IMAP_BULKSTAT) ? 0 :
 		(BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog);
-#else
+#else	/* usual case */
 	ni = (imap_flags & XFS_IMAP_BULKSTAT) ? 0 : 1;
 #endif
+
 	for (i = 0; i < ni; i++) {
 		int		di_ok;
 		xfs_dinode_t	*dip;
@@ -352,8 +353,10 @@ xfs_itobp(
 		if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP,
 				 XFS_RANDOM_ITOBP_INOTOBP))) {
 #ifdef DEBUG
-			cmn_err(CE_ALERT, "Device %s - bad inode magic/vsn "
-					  "daddr %lld #%d (magic=%x)",
+			if (!(imap_flags & XFS_IMAP_BULKSTAT))
+				cmn_err(CE_ALERT,
+					"Device %s - bad inode magic/vsn "
+					"daddr %lld #%d (magic=%x)",
 				XFS_BUFTARG_NAME(mp->m_ddev_targp),
 				(unsigned long long)imap.im_blkno, i,
 				INT_GET(dip->di_core.di_magic, ARCH_CONVERT));
@@ -364,7 +367,6 @@ xfs_itobp(
 			return XFS_ERROR(EFSCORRUPTED);
 		}
 	}
-#endif	/* __KERNEL__ */
 
 	xfs_inobp_check(mp, bp);
 
-- 
cgit v1.2.3


From 421ad134583bff86c0ae068e2ddcb17f530957ab Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Fri, 9 Jun 2006 17:12:46 +1000
Subject: [XFS] Fix mismerge of the fs_writable cleanup patch causing a
 freeze/thaw test hang.

SGI-PV: 953563
SGI-Modid: xfs-linux-melb:xfs-kern:26182a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/xfs_fsops.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 502483c41f9..650d35f537b 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -542,14 +542,13 @@ xfs_reserve_blocks(
 }
 
 void
-xfs_fs_log_dummy(xfs_mount_t *mp)
+xfs_fs_log_dummy(
+	xfs_mount_t	*mp)
 {
-	xfs_trans_t *tp;
-	xfs_inode_t *ip;
-
+	xfs_trans_t	*tp;
+	xfs_inode_t	*ip;
 
 	tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
-	atomic_inc(&mp->m_active_trans);
 	if (xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0)) {
 		xfs_trans_cancel(tp, 0);
 		return;
-- 
cgit v1.2.3


From 477829ef2e9e831c56c98948cfef6dfcec305c3a Mon Sep 17 00:00:00 2001
From: Mandy Kirkconnell <alkirkco@sgi.com>
Date: Fri, 9 Jun 2006 17:13:04 +1000
Subject: [XFS] Fix nused counter.  It's currently getting set to -1 rather
 than getting decremented by 1.  Since nused never reaches 0, the "if
 (!free->hdr.nused)" check in xfs_dir2_leafn_remove() fails every time and
 xfs_dir2_shrink_inode() doesn't get called when it should.  This causes extra
 blocks to be left on an empty directory and the directory in unable to be
 converted back to inline extent mode.

SGI-PV: 951958
SGI-Modid: xfs-linux-melb:xfs-kern:211382a

Signed-off-by: Mandy Kirkconnell <alkirkco@sgi.com>
Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/xfs_dir2_node.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index a8d483c0a84..b1f85cc7795 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -972,7 +972,7 @@ xfs_dir2_leafn_remove(
 			/*
 			 * One less used entry in the free table.
 			 */
-			free->hdr.nused = cpu_to_be32(-1);
+			be32_add(&free->hdr.nused, -1);
 			xfs_dir2_free_log_header(tp, fbp);
 			/*
 			 * If this was the last entry in the table, we can
-- 
cgit v1.2.3


From b190f1138b0f30fbe837b3f09fb6ffdb2fc4da24 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Fri, 9 Jun 2006 17:13:15 +1000
Subject: [XFS] Fix broken const use inside local suffix_strtoul routine.

SGI-PV: 904196
SGI-Modid: xfs-linux-melb:xfs-kern:26201a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/xfs_vfsops.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 7550583bd09..918531b6478 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -1654,10 +1654,10 @@ xfs_vget(
 #define MNTOPT_NOATTR2	"noattr2"	/* do not use attr2 attribute format */
 
 STATIC unsigned long
-suffix_strtoul(const char *cp, char **endp, unsigned int base)
+suffix_strtoul(char *s, char **endp, unsigned int base)
 {
 	int	last, shift_left_factor = 0;
-	char	*value = (char *)cp;
+	char	*value = s;
 
 	last = strlen(value) - 1;
 	if (value[last] == 'K' || value[last] == 'k') {
@@ -1673,7 +1673,7 @@ suffix_strtoul(const char *cp, char **endp, unsigned int base)
 		value[last] = '\0';
 	}
 
-	return simple_strtoul(cp, endp, base) << shift_left_factor;
+	return simple_strtoul((const char *)s, endp, base) << shift_left_factor;
 }
 
 STATIC int
-- 
cgit v1.2.3


From 1de3fc12ea085690547a54b6efa01c7348f1cebd Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 25 May 2006 01:40:44 -0400
Subject: NFS: Clean up and fix page zeroing when we have short reads

The code that is supposed to zero the uninitialised partial pages when the
server returns a short read is currently broken: it looks at the nfs_page
wb_pgbase and wb_bytes fields instead of the equivalent nfs_read_data
values when deciding where to start truncating the page.

Also ensure that we are more careful about setting PG_uptodate
before retrying a short read: the retry will change the nfs_read_data
args.pgbase and args.count.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/read.c | 107 ++++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 75 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 624ca7146b6..4b5f58da565 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -104,6 +104,28 @@ int nfs_return_empty_page(struct page *page)
 	return 0;
 }
 
+static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
+{
+	unsigned int remainder = data->args.count - data->res.count;
+	unsigned int base = data->args.pgbase + data->res.count;
+	unsigned int pglen;
+	struct page **pages;
+
+	if (data->res.eof == 0 || remainder == 0)
+		return;
+	/*
+	 * Note: "remainder" can never be negative, since we check for
+	 * 	this in the XDR code.
+	 */
+	pages = &data->args.pages[base >> PAGE_CACHE_SHIFT];
+	base &= ~PAGE_CACHE_MASK;
+	pglen = PAGE_CACHE_SIZE - base;
+	if (pglen < remainder)
+		memclear_highpage_flush(*pages, base, pglen);
+	else
+		memclear_highpage_flush(*pages, base, remainder);
+}
+
 /*
  * Read a page synchronously.
  */
@@ -177,11 +199,9 @@ static int nfs_readpage_sync(struct nfs_open_context *ctx, struct inode *inode,
 	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
 	spin_unlock(&inode->i_lock);
 
-	if (count)
-		memclear_highpage_flush(page, rdata->args.pgbase, count);
-	SetPageUptodate(page);
-	if (PageError(page))
-		ClearPageError(page);
+	nfs_readpage_truncate_uninitialised_page(rdata);
+	if (rdata->res.eof || rdata->res.count == rdata->args.count)
+		SetPageUptodate(page);
 	result = 0;
 
 io_error:
@@ -436,20 +456,12 @@ static void nfs_readpage_result_partial(struct rpc_task *task, void *calldata)
 	struct nfs_page *req = data->req;
 	struct page *page = req->wb_page;
  
+	if (likely(task->tk_status >= 0))
+		nfs_readpage_truncate_uninitialised_page(data);
+	else
+		SetPageError(page);
 	if (nfs_readpage_result(task, data) != 0)
 		return;
-	if (task->tk_status >= 0) {
-		unsigned int request = data->args.count;
-		unsigned int result = data->res.count;
-
-		if (result < request) {
-			memclear_highpage_flush(page,
-						data->args.pgbase + result,
-						request - result);
-		}
-	} else
-		SetPageError(page);
-
 	if (atomic_dec_and_test(&req->wb_complete)) {
 		if (!PageError(page))
 			SetPageUptodate(page);
@@ -462,6 +474,40 @@ static const struct rpc_call_ops nfs_read_partial_ops = {
 	.rpc_release = nfs_readdata_release,
 };
 
+static void nfs_readpage_set_pages_uptodate(struct nfs_read_data *data)
+{
+	unsigned int count = data->res.count;
+	unsigned int base = data->args.pgbase;
+	struct page **pages;
+
+	if (unlikely(count == 0))
+		return;
+	pages = &data->args.pages[base >> PAGE_CACHE_SHIFT];
+	base &= ~PAGE_CACHE_MASK;
+	count += base;
+	for (;count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++)
+		SetPageUptodate(*pages);
+	/*
+	 * Was this an eof or a short read? If the latter, don't mark the page
+	 * as uptodate yet.
+	 */
+	if (count > 0 && (data->res.eof || data->args.count == data->res.count))
+		SetPageUptodate(*pages);
+}
+
+static void nfs_readpage_set_pages_error(struct nfs_read_data *data)
+{
+	unsigned int count = data->args.count;
+	unsigned int base = data->args.pgbase;
+	struct page **pages;
+
+	pages = &data->args.pages[base >> PAGE_CACHE_SHIFT];
+	base &= ~PAGE_CACHE_MASK;
+	count += base;
+	for (;count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++)
+		SetPageError(*pages);
+}
+
 /*
  * This is the callback from RPC telling us whether a reply was
  * received or some error occurred (timeout or socket shutdown).
@@ -469,27 +515,24 @@ static const struct rpc_call_ops nfs_read_partial_ops = {
 static void nfs_readpage_result_full(struct rpc_task *task, void *calldata)
 {
 	struct nfs_read_data *data = calldata;
-	unsigned int count = data->res.count;
 
+	/*
+	 * Note: nfs_readpage_result may change the values of
+	 * data->args. In the multi-page case, we therefore need
+	 * to ensure that we call the next nfs_readpage_set_page_uptodate()
+	 * first in the multi-page case.
+	 */
+	if (likely(task->tk_status >= 0)) {
+		nfs_readpage_truncate_uninitialised_page(data);
+		nfs_readpage_set_pages_uptodate(data);
+	} else
+		nfs_readpage_set_pages_error(data);
 	if (nfs_readpage_result(task, data) != 0)
 		return;
 	while (!list_empty(&data->pages)) {
 		struct nfs_page *req = nfs_list_entry(data->pages.next);
-		struct page *page = req->wb_page;
-		nfs_list_remove_request(req);
 
-		if (task->tk_status >= 0) {
-			if (count < PAGE_CACHE_SIZE) {
-				if (count < req->wb_bytes)
-					memclear_highpage_flush(page,
-							req->wb_pgbase + count,
-							req->wb_bytes - count);
-				count = 0;
-			} else
-				count -= PAGE_CACHE_SIZE;
-			SetPageUptodate(page);
-		} else
-			SetPageError(page);
+		nfs_list_remove_request(req);
 		nfs_readpage_release(req);
 	}
 }
-- 
cgit v1.2.3


From 9d1e9232223a7f065be7f956a7b749a4cbbbe16d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 25 May 2006 01:40:46 -0400
Subject: NFSv4: Some NFSv4 servers have broken behaviour for the change
 attribute

The Linux NFSv4 server violates RFC3530 in that the change attribute is not
guaranteed to be updated for every change to the inode. Our optimisation
for checking whether or not the inode metadata has changed or not is broken
too. Grr....

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index d0b991a9232..e870e4aae71 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1414,9 +1414,8 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 	/* Do atomic weak cache consistency updates */
 	nfs_wcc_update_inode(inode, fattr);
 
-	if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0) {
-		if (nfsi->change_attr == fattr->change_attr)
-			goto out;
+	if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
+			nfsi->change_attr != fattr->change_attr) {
 		nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
 		if (!data_unstable)
 			nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE;
@@ -1444,7 +1443,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 	if (inode->i_nlink != fattr->nlink)
 		nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
 
-out:
 	if (!timespec_equal(&inode->i_atime, &fattr->atime))
 		nfsi->cache_validity |= NFS_INO_INVALID_ATIME;
 
@@ -1612,15 +1610,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
  		inode->i_blksize = fattr->du.nfs2.blocksize;
  	}
 
-	if ((fattr->valid & NFS_ATTR_FATTR_V4)) {
-		if (nfsi->change_attr != fattr->change_attr) {
-			dprintk("NFS: change_attr change on server for file %s/%ld\n",
-					inode->i_sb->s_id, inode->i_ino);
-			nfsi->change_attr = fattr->change_attr;
-			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
-			nfsi->cache_change_attribute = jiffies;
-		} else
-			invalid &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA);
+	if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
+			nfsi->change_attr != fattr->change_attr) {
+		dprintk("NFS: change_attr change on server for file %s/%ld\n",
+				inode->i_sb->s_id, inode->i_ino);
+		nfsi->change_attr = fattr->change_attr;
+		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+		nfsi->cache_change_attribute = jiffies;
 	}
 
 	/* Update attrtimeo value if we're out of the unstable period */
-- 
cgit v1.2.3


From 73a3d07c1082145a3b78407bb5252df290470c4c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 25 May 2006 01:40:47 -0400
Subject: NFS: Clean up inode metadata updates

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c    | 12 ------------
 fs/nfs/nfs4proc.c |  2 +-
 2 files changed, 1 insertion(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index e870e4aae71..4f12c57456f 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1360,12 +1360,6 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 
-	if ((fattr->valid & NFS_ATTR_PRE_CHANGE) != 0
-			&& nfsi->change_attr == fattr->pre_change_attr) {
-		nfsi->change_attr = fattr->change_attr;
-		nfsi->cache_change_attribute = jiffies;
-	}
-
 	/* If we have atomic WCC data, we may update some attributes */
 	if ((fattr->valid & NFS_ATTR_WCC) != 0) {
 		if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) {
@@ -1399,9 +1393,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 	int data_unstable;
 
 
-	if ((fattr->valid & NFS_ATTR_FATTR) == 0)
-		return 0;
-
 	/* Has the inode gone and changed behind our back? */
 	if (nfsi->fileid != fattr->fileid
 			|| (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) {
@@ -1525,9 +1516,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 			__FUNCTION__, inode->i_sb->s_id, inode->i_ino,
 			atomic_read(&inode->i_count), fattr->valid);
 
-	if ((fattr->valid & NFS_ATTR_FATTR) == 0)
-		return 0;
-
 	if (nfsi->fileid != fattr->fileid)
 		goto out_fileid;
 
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d86c0db7b1e..e38a8487449 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2008,7 +2008,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
 	if (!status) {
 		update_changeattr(dir, &res.cinfo);
 		nfs_post_op_update_inode(dir, res.dir_attr);
-		nfs_refresh_inode(inode, res.fattr);
+		nfs_post_op_update_inode(inode, res.fattr);
 	}
 
 	return status;
-- 
cgit v1.2.3


From 0d0b5cb36faf7002a11736032313f06d6f3d881c Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@netapp.com>
Date: Thu, 25 May 2006 01:40:53 -0400
Subject: NFS: Optimize allocation of nfs_read/write_data structures

Clean up use of page_array, and fix an off-by-one error noticed by Tom
Talpey which causes kmalloc calls in cases where using the page_array
is sufficient.

Test plan:
Normal client functional testing with r/wsize=32768.

Signed-off-by: Chuck Lever <cel@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/read.c  | 11 ++++-------
 fs/nfs/write.c | 18 +++++++-----------
 2 files changed, 11 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 4b5f58da565..fd9018c692b 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -51,14 +51,11 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 	if (p) {
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->pages);
-		if (pagecount < NFS_PAGEVEC_SIZE)
-			p->pagevec = &p->page_array[0];
+		if (pagecount <= ARRAY_SIZE(p->page_array))
+			p->pagevec = p->page_array;
 		else {
-			size_t size = ++pagecount * sizeof(struct page *);
-			p->pagevec = kmalloc(size, GFP_NOFS);
-			if (p->pagevec) {
-				memset(p->pagevec, 0, size);
-			} else {
+			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
+			if (!p->pagevec) {
 				mempool_free(p, nfs_rdata_mempool);
 				p = NULL;
 			}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 4cfada2cc09..a515ec714bb 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -98,11 +98,10 @@ struct nfs_write_data *nfs_commit_alloc(unsigned int pagecount)
 	if (p) {
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->pages);
-		if (pagecount < NFS_PAGEVEC_SIZE)
-			p->pagevec = &p->page_array[0];
+		if (pagecount <= ARRAY_SIZE(p->page_array))
+			p->pagevec = p->page_array;
 		else {
-			size_t size = ++pagecount * sizeof(struct page *);
-			p->pagevec = kzalloc(size, GFP_NOFS);
+			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
 			if (!p->pagevec) {
 				mempool_free(p, nfs_commit_mempool);
 				p = NULL;
@@ -126,14 +125,11 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
 	if (p) {
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->pages);
-		if (pagecount < NFS_PAGEVEC_SIZE)
-			p->pagevec = &p->page_array[0];
+		if (pagecount <= ARRAY_SIZE(p->page_array))
+			p->pagevec = p->page_array;
 		else {
-			size_t size = ++pagecount * sizeof(struct page *);
-			p->pagevec = kmalloc(size, GFP_NOFS);
-			if (p->pagevec) {
-				memset(p->pagevec, 0, size);
-			} else {
+			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
+			if (!p->pagevec) {
 				mempool_free(p, nfs_wdata_mempool);
 				p = NULL;
 			}
-- 
cgit v1.2.3


From f1bb0b92ba2cdfffe6e437f7a7da53138cf08d52 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 25 May 2006 01:40:55 -0400
Subject: NFS: Fix page cache revalidation

Fix up a bug in the handling of NFS_INO_REVAL_PAGECACHE: make sure that
nfs_update_inode() clears it when we're sure we're not racing with other
updates.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 4f12c57456f..eddd0e982d2 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1406,18 +1406,12 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 	nfs_wcc_update_inode(inode, fattr);
 
 	if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
-			nfsi->change_attr != fattr->change_attr) {
-		nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
-		if (!data_unstable)
-			nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE;
-	}
+			nfsi->change_attr != fattr->change_attr)
+		nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
 
 	/* Verify a few of the more important attributes */
-	if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) {
-		nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
-		if (!data_unstable)
-			nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE;
-	}
+	if (!timespec_equal(&inode->i_mtime, &fattr->mtime))
+		nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
 
 	cur_size = i_size_read(inode);
  	new_isize = nfs_size_to_loff_t(fattr->size);
@@ -1459,7 +1453,6 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
 	if ((fattr->valid & NFS_ATTR_FATTR) == 0)
 		return 0;
 	spin_lock(&inode->i_lock);
-	nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE;
 	if (time_after(fattr->time_start, nfsi->last_updated))
 		status = nfs_update_inode(inode, fattr);
 	else
@@ -1484,7 +1477,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 
 	spin_lock(&inode->i_lock);
 	if (unlikely((fattr->valid & NFS_ATTR_FATTR) == 0)) {
-		nfsi->cache_validity |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS;
+		nfsi->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
 		goto out;
 	}
 	status = nfs_update_inode(inode, fattr);
@@ -1534,7 +1527,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	/* Are we racing with known updates of the metadata on the server? */
 	data_stable = nfs_verify_change_attribute(inode, fattr->time_start);
 	if (data_stable)
-		nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME);
+		nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_ATIME);
 
 	/* Do atomic weak cache consistency updates */
 	nfs_wcc_update_inode(inode, fattr);
-- 
cgit v1.2.3


From 38478b24e37587f1c4fedf8ac070ca54f052ed28 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 25 May 2006 01:40:57 -0400
Subject: NFS: More page cache revalidation fixups

Whenever the directory changes, we want to make sure that we always
invalidate its page cache. Fix up update_changeattr() and
nfs_mark_for_revalidate() so that they do so.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index e38a8487449..ef4c6cccf95 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -185,15 +185,15 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
 	spin_unlock(&clp->cl_lock);
 }
 
-static void update_changeattr(struct inode *inode, struct nfs4_change_info *cinfo)
+static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
 {
-	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_inode *nfsi = NFS_I(dir);
 
-	spin_lock(&inode->i_lock);
-	nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
+	spin_lock(&dir->i_lock);
+	nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA;
 	if (cinfo->before == nfsi->change_attr && cinfo->atomic)
 		nfsi->change_attr = cinfo->after;
-	spin_unlock(&inode->i_lock);
+	spin_unlock(&dir->i_lock);
 }
 
 struct nfs4_opendata {
-- 
cgit v1.2.3


From 44b11874ff583b6e766a05856b04f3c492c32b84 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 25 May 2006 01:40:59 -0400
Subject: NFS: Separate metadata and page cache revalidation mechanisms

Separate out the function of revalidating the inode metadata, and
revalidating the mapping. The former may be called by lookup(),
and only really needs to check that permissions, ctime, etc haven't changed
whereas the latter needs only done when we want to read data from the page
cache, and may need to sync and then invalidate the mapping.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c     |  2 +-
 fs/nfs/file.c    | 24 +++---------------------
 fs/nfs/inode.c   | 16 +++++++++++-----
 fs/nfs/symlink.c |  2 +-
 4 files changed, 16 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index cae74dd4c7f..1d3d8922a66 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -528,7 +528,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 	lock_kernel();
 
-	res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	res = nfs_revalidate_mapping(inode, filp->f_mapping);
 	if (res < 0) {
 		unlock_kernel();
 		return res;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index fade02c15e6..63154070145 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -126,23 +126,6 @@ nfs_file_release(struct inode *inode, struct file *filp)
 	return NFS_PROTO(inode)->file_release(inode, filp);
 }
 
-/**
- * nfs_revalidate_file - Revalidate the page cache & related metadata
- * @inode - pointer to inode struct
- * @file - pointer to file
- */
-static int nfs_revalidate_file(struct inode *inode, struct file *filp)
-{
-	struct nfs_inode *nfsi = NFS_I(inode);
-	int retval = 0;
-
-	if ((nfsi->cache_validity & (NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_ATTR))
-			|| nfs_attribute_timeout(inode))
-		retval = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
-	nfs_revalidate_mapping(inode, filp->f_mapping);
-	return 0;
-}
-
 /**
  * nfs_revalidate_size - Revalidate the file size
  * @inode - pointer to inode struct
@@ -228,7 +211,7 @@ nfs_file_read(struct kiocb *iocb, char __user * buf, size_t count, loff_t pos)
 		dentry->d_parent->d_name.name, dentry->d_name.name,
 		(unsigned long) count, (unsigned long) pos);
 
-	result = nfs_revalidate_file(inode, iocb->ki_filp);
+	result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
 	nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, count);
 	if (!result)
 		result = generic_file_aio_read(iocb, buf, count, pos);
@@ -247,7 +230,7 @@ nfs_file_sendfile(struct file *filp, loff_t *ppos, size_t count,
 		dentry->d_parent->d_name.name, dentry->d_name.name,
 		(unsigned long) count, (unsigned long long) *ppos);
 
-	res = nfs_revalidate_file(inode, filp);
+	res = nfs_revalidate_mapping(inode, filp->f_mapping);
 	if (!res)
 		res = generic_file_sendfile(filp, ppos, count, actor, target);
 	return res;
@@ -263,7 +246,7 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
 	dfprintk(VFS, "nfs: mmap(%s/%s)\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 
-	status = nfs_revalidate_file(inode, file);
+	status = nfs_revalidate_mapping(inode, file->f_mapping);
 	if (!status)
 		status = generic_file_mmap(file, vma);
 	return status;
@@ -373,7 +356,6 @@ nfs_file_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t
 		if (result)
 			goto out;
 	}
-	nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
 
 	result = count;
 	if (!count)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index eddd0e982d2..69036ef3986 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1220,7 +1220,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 		status = -ESTALE;
 		/* Do we trust the cached ESTALE? */
 		if (NFS_ATTRTIMEO(inode) != 0) {
-			if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ATIME)) {
+			if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME)) {
 				/* no */
 			} else
 				goto out;
@@ -1251,8 +1251,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 	}
 	spin_unlock(&inode->i_lock);
 
-	nfs_revalidate_mapping(inode, inode->i_mapping);
-
 	if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
 		nfs_zap_acl_cache(inode);
 
@@ -1287,7 +1285,7 @@ int nfs_attribute_timeout(struct inode *inode)
 int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 {
 	nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
-	if (!(NFS_I(inode)->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))
+	if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR)
 			&& !nfs_attribute_timeout(inode))
 		return NFS_STALE(inode) ? -ESTALE : 0;
 	return __nfs_revalidate_inode(server, inode);
@@ -1298,9 +1296,16 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
  * @inode - pointer to host inode
  * @mapping - pointer to mapping
  */
-void nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
+int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
+	int ret = 0;
+
+	if (NFS_STALE(inode))
+		ret = -ESTALE;
+	if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
+			|| nfs_attribute_timeout(inode))
+		ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
 
 	if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
 		nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
@@ -1321,6 +1326,7 @@ void nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
 				inode->i_sb->s_id,
 				(long long)NFS_FILEID(inode));
 	}
+	return ret;
 }
 
 /**
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 18dc95b0b64..636c479995b 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -52,7 +52,7 @@ static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *inode = dentry->d_inode;
 	struct page *page;
-	void *err = ERR_PTR(nfs_revalidate_inode(NFS_SERVER(inode), inode));
+	void *err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
 	if (err)
 		goto read_failed;
 	page = read_cache_page(&inode->i_data, 0,
-- 
cgit v1.2.3


From 1842bfb447cea8b344fd91af97fb6d604ecb11fa Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 25 May 2006 01:41:01 -0400
Subject: NFS: Fix up inode revalidation accounting

Currently, we are accounting for all calls to nfs_revalidate_inode(), but not
to nfs_revalidate_mapping(), or nfs_lookup_verify_inode(), etc...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 69036ef3986..3200358195b 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1207,6 +1207,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 	dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n",
 		inode->i_sb->s_id, (long long)NFS_FILEID(inode));
 
+	nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
 	lock_kernel();
 	if (!inode || is_bad_inode(inode))
  		goto out_nowait;
@@ -1284,7 +1285,6 @@ int nfs_attribute_timeout(struct inode *inode)
  */
 int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 {
-	nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
 	if (!(NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATTR)
 			&& !nfs_attribute_timeout(inode))
 		return NFS_STALE(inode) ? -ESTALE : 0;
-- 
cgit v1.2.3


From 4814f56d19137b3b9fa8e00e1d332b3683b950de Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@suse.de>
Date: Thu, 25 May 2006 01:41:03 -0400
Subject: NFSv3: Client-side nfsacl caching fix

Fix two errors in the client-side acl cache: First, when nfs3_proc_getacl
requests only the default acl of a file and the access acl is not cached
already, a NULL access acl entry is cached instead of ERR_PTR(-EAGAIN)
("not cached").

Second, update the cached acls in nfs3_proc_setacls: nfs_refresh_inode does
not always invalidate the cached acls, and when it does not, the cached acls
get out of sync.

Signed-off-by: Andreas Gruenbacher <agruen@suse.de>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs3acl.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 33287879bd2..7322da4d205 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -172,8 +172,10 @@ static void nfs3_cache_acls(struct inode *inode, struct posix_acl *acl,
 		inode->i_ino, acl, dfacl);
 	spin_lock(&inode->i_lock);
 	__nfs3_forget_cached_acls(NFS_I(inode));
-	nfsi->acl_access = posix_acl_dup(acl);
-	nfsi->acl_default = posix_acl_dup(dfacl);
+	if (!IS_ERR(acl))
+		nfsi->acl_access = posix_acl_dup(acl);
+	if (!IS_ERR(dfacl))
+		nfsi->acl_default = posix_acl_dup(dfacl);
 	spin_unlock(&inode->i_lock);
 }
 
@@ -254,7 +256,9 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
 			res.acl_access = NULL;
 		}
 	}
-	nfs3_cache_acls(inode, res.acl_access, res.acl_default);
+	nfs3_cache_acls(inode,
+		(res.mask & NFS_ACL)   ? res.acl_access  : ERR_PTR(-EINVAL),
+		(res.mask & NFS_DFACL) ? res.acl_default : ERR_PTR(-EINVAL));
 
 	switch(type) {
 		case ACL_TYPE_ACCESS:
@@ -329,6 +333,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
 	switch (status) {
 		case 0:
 			status = nfs_refresh_inode(inode, &fattr);
+			nfs3_cache_acls(inode, acl, dfacl);
 			break;
 		case -EPFNOSUPPORT:
 		case -EPROTONOSUPPORT:
-- 
cgit v1.2.3


From 3873bc50e2271504da45799257f69222774d9550 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 27 May 2006 03:31:12 +0400
Subject: NFSv4: really return status from decode_recall_args()

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/callback_xdr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 05c38cf40b6..c92991328d9 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -202,7 +202,7 @@ static unsigned decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xd
 	status = decode_fh(xdr, &args->fh);
 out:
 	dprintk("%s: exit with status = %d\n", __FUNCTION__, status);
-	return 0;
+	return status;
 }
 
 static unsigned encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
-- 
cgit v1.2.3


From c04871e6345e4c6dfda564e302d7fd8c66420fd5 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@fieldses.org>
Date: Tue, 30 May 2006 16:28:58 -0400
Subject: NFSv4: remove obviously bogus comparison from decode_getacl

We just set *acl_len to zero, and attrlen is unsigned, so this comparison
is clearly bogus.  I have no idea what I was thinking.

Fixes a bug that caused getacl to fail over krb5p.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 7c5d70efe72..7e9a840057f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -3350,8 +3350,7 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
 					attrlen, recvd);
 			return -EINVAL;
 		}
-		if (attrlen <= *acl_len)
-			xdr_read_pages(xdr, attrlen);
+		xdr_read_pages(xdr, attrlen);
 		*acl_len = attrlen;
 	} else
 		status = -EOPNOTSUPP;
-- 
cgit v1.2.3


From d2ccddf042c403b146159beea438c6bfc4a445e2 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 31 May 2006 01:13:38 -0400
Subject: NFS: Flesh out nfs_invalidate_page()

In the case of a call to truncate_inode_pages(), we should really try to
cancel any pending writes on the page.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/file.c     |  6 +++++-
 fs/nfs/pagelist.c | 47 ++++++++++++++++++++++++++++-------------------
 fs/nfs/write.c    | 27 ++++++++++++++++++++++++---
 3 files changed, 57 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 63154070145..106ef0dec04 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -303,7 +303,11 @@ static int nfs_commit_write(struct file *file, struct page *page, unsigned offse
 
 static void nfs_invalidate_page(struct page *page, unsigned long offset)
 {
-	/* FIXME: we really should cancel any unstarted writes on this page */
+	struct inode *inode = page->mapping->host;
+
+	/* Cancel any unstarted writes on this page */
+	if (offset == 0)
+		nfs_sync_inode_wait(inode, page->index, 1, FLUSH_INVALIDATE);
 }
 
 static int nfs_release_page(struct page *page, gfp_t gfp)
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 106aca388eb..656481c0daa 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -325,6 +325,7 @@ out:
 
 /**
  * nfs_scan_list - Scan a list for matching requests
+ * @nfsi: NFS inode
  * @head: One of the NFS inode request lists
  * @dst: Destination list
  * @idx_start: lower bound of page->index to scan
@@ -336,14 +337,15 @@ out:
  * The requests are *not* checked to ensure that they form a contiguous set.
  * You must be holding the inode's req_lock when calling this function
  */
-int
-nfs_scan_list(struct list_head *head, struct list_head *dst,
-	      unsigned long idx_start, unsigned int npages)
+int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *head,
+		struct list_head *dst, unsigned long idx_start,
+		unsigned int npages)
 {
-	struct list_head	*pos, *tmp;
-	struct nfs_page		*req;
-	unsigned long		idx_end;
-	int			res;
+	struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
+	struct nfs_page *req;
+	unsigned long idx_end;
+	int found, i;
+	int res;
 
 	res = 0;
 	if (npages == 0)
@@ -351,21 +353,28 @@ nfs_scan_list(struct list_head *head, struct list_head *dst,
 	else
 		idx_end = idx_start + npages - 1;
 
-	list_for_each_safe(pos, tmp, head) {
-
-		req = nfs_list_entry(pos);
-
-		if (req->wb_index < idx_start)
-			continue;
-		if (req->wb_index > idx_end)
+	for (;;) {
+		found = radix_tree_gang_lookup(&nfsi->nfs_page_tree,
+				(void **)&pgvec[0], idx_start,
+				NFS_SCAN_MAXENTRIES);
+		if (found <= 0)
 			break;
+		for (i = 0; i < found; i++) {
+			req = pgvec[i];
+			if (req->wb_index > idx_end)
+				goto out;
+			idx_start = req->wb_index + 1;
+			if (req->wb_list_head != head)
+				continue;
+			if (nfs_set_page_writeback_locked(req)) {
+				nfs_list_remove_request(req);
+				nfs_list_add_request(req, dst);
+				res++;
+			}
+		}
 
-		if (!nfs_set_page_writeback_locked(req))
-			continue;
-		nfs_list_remove_request(req);
-		nfs_list_add_request(req, dst);
-		res++;
 	}
+out:
 	return res;
 }
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index a515ec714bb..e03abbd8302 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -579,6 +579,17 @@ static int nfs_wait_on_requests(struct inode *inode, unsigned long idx_start, un
 	return ret;
 }
 
+static void nfs_cancel_requests(struct list_head *head)
+{
+	struct nfs_page *req;
+	while(!list_empty(head)) {
+		req = nfs_list_entry(head->next);
+		nfs_list_remove_request(req);
+		nfs_inode_remove_request(req);
+		nfs_clear_page_writeback(req);
+	}
+}
+
 /*
  * nfs_scan_dirty - Scan an inode for dirty requests
  * @inode: NFS inode to scan
@@ -623,7 +634,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, unsigned long idx_st
 	int res = 0;
 
 	if (nfsi->ncommit != 0) {
-		res = nfs_scan_list(&nfsi->commit, dst, idx_start, npages);
+		res = nfs_scan_list(nfsi, &nfsi->commit, dst, idx_start, npages);
 		nfsi->ncommit -= res;
 		if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit))
 			printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n");
@@ -1491,15 +1502,25 @@ int nfs_sync_inode_wait(struct inode *inode, unsigned long idx_start,
 		pages = nfs_scan_dirty(inode, &head, idx_start, npages);
 		if (pages != 0) {
 			spin_unlock(&nfsi->req_lock);
-			ret = nfs_flush_list(inode, &head, pages, how);
+			if (how & FLUSH_INVALIDATE)
+				nfs_cancel_requests(&head);
+			else
+				ret = nfs_flush_list(inode, &head, pages, how);
 			spin_lock(&nfsi->req_lock);
 			continue;
 		}
 		if (nocommit)
 			break;
-		pages = nfs_scan_commit(inode, &head, 0, 0);
+		pages = nfs_scan_commit(inode, &head, idx_start, npages);
 		if (pages == 0)
 			break;
+		if (how & FLUSH_INVALIDATE) {
+			spin_unlock(&nfsi->req_lock);
+			nfs_cancel_requests(&head);
+			spin_lock(&nfsi->req_lock);
+			continue;
+		}
+		pages += nfs_scan_commit(inode, &head, 0, 0);
 		spin_unlock(&nfsi->req_lock);
 		ret = nfs_commit_list(inode, &head, how);
 		spin_lock(&nfsi->req_lock);
-- 
cgit v1.2.3


From da6d503aa0a75ec44f17d985a2b500077e7f6a74 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 1 Jun 2006 17:26:35 -0400
Subject: NFS: Remove nfs_delete_inode()

Now that we have a real nfs_invalidate_page() to ensure that
truncate_inode_pages() does the right thing when there are pending dirty
pages, we can get rid of nfs_delete_inode().

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 26 ++++----------------------
 1 file changed, 4 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 3200358195b..9ff039f9a83 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -62,7 +62,6 @@ static int nfs_update_inode(struct inode *, struct nfs_fattr *);
 static struct inode *nfs_alloc_inode(struct super_block *sb);
 static void nfs_destroy_inode(struct inode *);
 static int nfs_write_inode(struct inode *,int);
-static void nfs_delete_inode(struct inode *);
 static void nfs_clear_inode(struct inode *);
 static void nfs_umount_begin(struct super_block *);
 static int  nfs_statfs(struct super_block *, struct kstatfs *);
@@ -76,7 +75,6 @@ static struct super_operations nfs_sops = {
 	.alloc_inode	= nfs_alloc_inode,
 	.destroy_inode	= nfs_destroy_inode,
 	.write_inode	= nfs_write_inode,
-	.delete_inode	= nfs_delete_inode,
 	.statfs		= nfs_statfs,
 	.clear_inode	= nfs_clear_inode,
 	.umount_begin	= nfs_umount_begin,
@@ -146,31 +144,16 @@ nfs_write_inode(struct inode *inode, int sync)
 	return 0;
 }
 
-static void
-nfs_delete_inode(struct inode * inode)
-{
-	dprintk("NFS: delete_inode(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino);
-
-	truncate_inode_pages(&inode->i_data, 0);
-
-	nfs_wb_all(inode);
-	/*
-	 * The following should never happen...
-	 */
-	if (nfs_have_writebacks(inode)) {
-		printk(KERN_ERR "nfs_delete_inode: inode %ld has pending RPC requests\n", inode->i_ino);
-	}
-
-	clear_inode(inode);
-}
-
 static void
 nfs_clear_inode(struct inode *inode)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct rpc_cred *cred;
 
-	nfs_wb_all(inode);
+	/*
+	 * The following should never happen...
+	 */
+	BUG_ON(nfs_have_writebacks(inode));
 	BUG_ON (!list_empty(&nfsi->open_files));
 	nfs_zap_acl_cache(inode);
 	cred = nfsi->cache_access.cred;
@@ -1821,7 +1804,6 @@ static struct super_operations nfs4_sops = {
 	.alloc_inode	= nfs_alloc_inode,
 	.destroy_inode	= nfs_destroy_inode,
 	.write_inode	= nfs_write_inode,
-	.delete_inode	= nfs_delete_inode,
 	.statfs		= nfs_statfs,
 	.clear_inode	= nfs4_clear_inode,
 	.umount_begin	= nfs_umount_begin,
-- 
cgit v1.2.3


From bb4a58bf46473e3e83d84054bbc110db3a0f85e4 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:34:15 -0400
Subject: VFS: Add GPL_EXPORTED function vfs_kern_mount()

do_kern_mount() does not allow the kernel to use private mount interfaces
without exposing the same interfaces to userland. The problem is that the
filesystem is referenced by name, thus meaning that it and its mount
interface must be registered in the global filesystem list.

vfs_kern_mount() passes the struct file_system_type as an explicit
parameter in order to overcome this limitation.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/super.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/super.c b/fs/super.c
index a66f66bb804..848be4fc67a 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -800,17 +800,13 @@ struct super_block *get_sb_single(struct file_system_type *fs_type,
 EXPORT_SYMBOL(get_sb_single);
 
 struct vfsmount *
-do_kern_mount(const char *fstype, int flags, const char *name, void *data)
+vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
 {
-	struct file_system_type *type = get_fs_type(fstype);
 	struct super_block *sb = ERR_PTR(-ENOMEM);
 	struct vfsmount *mnt;
 	int error;
 	char *secdata = NULL;
 
-	if (!type)
-		return ERR_PTR(-ENODEV);
-
 	mnt = alloc_vfsmnt(name);
 	if (!mnt)
 		goto out;
@@ -841,7 +837,6 @@ do_kern_mount(const char *fstype, int flags, const char *name, void *data)
 	mnt->mnt_parent = mnt;
 	up_write(&sb->s_umount);
 	free_secdata(secdata);
-	put_filesystem(type);
 	return mnt;
 out_sb:
 	up_write(&sb->s_umount);
@@ -852,10 +847,23 @@ out_free_secdata:
 out_mnt:
 	free_vfsmnt(mnt);
 out:
-	put_filesystem(type);
 	return (struct vfsmount *)sb;
 }
 
+EXPORT_SYMBOL_GPL(vfs_kern_mount);
+
+struct vfsmount *
+do_kern_mount(const char *fstype, int flags, const char *name, void *data)
+{
+	struct file_system_type *type = get_fs_type(fstype);
+	struct vfsmount *mnt;
+	if (!type)
+		return ERR_PTR(-ENODEV);
+	mnt = vfs_kern_mount(type, flags, name, data);
+	put_filesystem(type);
+	return mnt;
+}
+
 EXPORT_SYMBOL_GPL(do_kern_mount);
 
 struct vfsmount *kern_mount(struct file_system_type *type)
-- 
cgit v1.2.3


From 1f5ce9e93aa96a867f195ed45f6f77935175f12e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:34:16 -0400
Subject: VFS: Unexport do_kern_mount() and clean up simple_pin_fs()

Replace all module uses with the new vfs_kern_mount() interface, and fix up
simple_pin_fs().

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/afs/mntpt.c      | 2 +-
 fs/afs/super.c      | 2 +-
 fs/afs/super.h      | 2 ++
 fs/binfmt_misc.c    | 3 ++-
 fs/configfs/mount.c | 2 +-
 fs/debugfs/inode.c  | 2 +-
 fs/libfs.c          | 4 ++--
 fs/super.c          | 4 +---
 8 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 4e6eeb59b83..7b6dc03caf4 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -210,7 +210,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 
 	/* try and do the mount */
 	kdebug("--- attempting mount %s -o %s ---", devname, options);
-	mnt = do_kern_mount("afs", 0, devname, options);
+	mnt = vfs_kern_mount(&afs_fs_type, 0, devname, options);
 	kdebug("--- mount result %p ---", mnt);
 
 	free_page((unsigned long) devname);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 53c56e7231a..93a7821db0d 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -48,7 +48,7 @@ static void afs_put_super(struct super_block *sb);
 
 static void afs_destroy_inode(struct inode *inode);
 
-static struct file_system_type afs_fs_type = {
+struct file_system_type afs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "afs",
 	.get_sb		= afs_get_sb,
diff --git a/fs/afs/super.h b/fs/afs/super.h
index ac11362f4e9..32de8cc6fae 100644
--- a/fs/afs/super.h
+++ b/fs/afs/super.h
@@ -38,6 +38,8 @@ static inline struct afs_super_info *AFS_FS_S(struct super_block *sb)
 	return sb->s_fs_info;
 }
 
+extern struct file_system_type afs_fs_type;
+
 #endif /* __KERNEL__ */
 
 #endif /* _LINUX_AFS_SUPER_H */
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index d73d75591a3..c0a909e1d29 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -55,6 +55,7 @@ typedef struct {
 } Node;
 
 static DEFINE_RWLOCK(entries_lock);
+static struct file_system_type bm_fs_type;
 static struct vfsmount *bm_mnt;
 static int entry_count;
 
@@ -638,7 +639,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
 	if (!inode)
 		goto out2;
 
-	err = simple_pin_fs("binfmt_misc", &bm_mnt, &entry_count);
+	err = simple_pin_fs(&bm_fs_type, &bm_mnt, &entry_count);
 	if (err) {
 		iput(inode);
 		inode = NULL;
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index f920d30478e..be5d86ae56f 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -118,7 +118,7 @@ static struct file_system_type configfs_fs_type = {
 
 int configfs_pin_fs(void)
 {
-	return simple_pin_fs("configfs", &configfs_mount,
+	return simple_pin_fs(&configfs_fs_type, &configfs_mount,
 			     &configfs_mnt_count);
 }
 
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index b55b4ea9a67..90f9417181f 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -199,7 +199,7 @@ struct dentry *debugfs_create_file(const char *name, mode_t mode,
 
 	pr_debug("debugfs: creating file '%s'\n",name);
 
-	error = simple_pin_fs("debugfs", &debugfs_mount, &debugfs_mount_count);
+	error = simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count);
 	if (error)
 		goto exit;
 
diff --git a/fs/libfs.c b/fs/libfs.c
index 7145ba7a48d..4a3ec9ad8be 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -424,13 +424,13 @@ out:
 
 static DEFINE_SPINLOCK(pin_fs_lock);
 
-int simple_pin_fs(char *name, struct vfsmount **mount, int *count)
+int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count)
 {
 	struct vfsmount *mnt = NULL;
 	spin_lock(&pin_fs_lock);
 	if (unlikely(!*mount)) {
 		spin_unlock(&pin_fs_lock);
-		mnt = do_kern_mount(name, 0, name, NULL);
+		mnt = vfs_kern_mount(type, 0, type->name, NULL);
 		if (IS_ERR(mnt))
 			return PTR_ERR(mnt);
 		spin_lock(&pin_fs_lock);
diff --git a/fs/super.c b/fs/super.c
index 848be4fc67a..15f2afdbf82 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -864,11 +864,9 @@ do_kern_mount(const char *fstype, int flags, const char *name, void *data)
 	return mnt;
 }
 
-EXPORT_SYMBOL_GPL(do_kern_mount);
-
 struct vfsmount *kern_mount(struct file_system_type *type)
 {
-	return do_kern_mount(type->name, 0, type->name, NULL);
+	return vfs_kern_mount(type, 0, type->name, NULL);
 }
 
 EXPORT_SYMBOL(kern_mount);
-- 
cgit v1.2.3


From 5528f911b4c43a5de5da34bcbd7e3f2a62503617 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:34:17 -0400
Subject: VFS: Add shrink_submounts()

Allow a submount to be marked as being 'shrinkable' by means of the
vfsmount->mnt_flags, and then add a function 'shrink_submounts()' which
attempts to recursively unmount these submounts.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/namespace.c | 124 +++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 99 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index bf478addb85..b22e469ab56 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1162,6 +1162,40 @@ static void expire_mount(struct vfsmount *mnt, struct list_head *mounts,
 	}
 }
 
+/*
+ * go through the vfsmounts we've just consigned to the graveyard to
+ * - check that they're still dead
+ * - delete the vfsmount from the appropriate namespace under lock
+ * - dispose of the corpse
+ */
+static void expire_mount_list(struct list_head *graveyard, struct list_head *mounts)
+{
+	struct namespace *namespace;
+	struct vfsmount *mnt;
+
+	while (!list_empty(graveyard)) {
+		LIST_HEAD(umounts);
+		mnt = list_entry(graveyard->next, struct vfsmount, mnt_expire);
+		list_del_init(&mnt->mnt_expire);
+
+		/* don't do anything if the namespace is dead - all the
+		 * vfsmounts from it are going away anyway */
+		namespace = mnt->mnt_namespace;
+		if (!namespace || !namespace->root)
+			continue;
+		get_namespace(namespace);
+
+		spin_unlock(&vfsmount_lock);
+		down_write(&namespace_sem);
+		expire_mount(mnt, mounts, &umounts);
+		up_write(&namespace_sem);
+		release_mounts(&umounts);
+		mntput(mnt);
+		put_namespace(namespace);
+		spin_lock(&vfsmount_lock);
+	}
+}
+
 /*
  * process a list of expirable mountpoints with the intent of discarding any
  * mountpoints that aren't in use and haven't been touched since last we came
@@ -1169,7 +1203,6 @@ static void expire_mount(struct vfsmount *mnt, struct list_head *mounts,
  */
 void mark_mounts_for_expiry(struct list_head *mounts)
 {
-	struct namespace *namespace;
 	struct vfsmount *mnt, *next;
 	LIST_HEAD(graveyard);
 
@@ -1193,38 +1226,79 @@ void mark_mounts_for_expiry(struct list_head *mounts)
 		list_move(&mnt->mnt_expire, &graveyard);
 	}
 
-	/*
-	 * go through the vfsmounts we've just consigned to the graveyard to
-	 * - check that they're still dead
-	 * - delete the vfsmount from the appropriate namespace under lock
-	 * - dispose of the corpse
-	 */
-	while (!list_empty(&graveyard)) {
-		LIST_HEAD(umounts);
-		mnt = list_entry(graveyard.next, struct vfsmount, mnt_expire);
-		list_del_init(&mnt->mnt_expire);
+	expire_mount_list(&graveyard, mounts);
 
-		/* don't do anything if the namespace is dead - all the
-		 * vfsmounts from it are going away anyway */
-		namespace = mnt->mnt_namespace;
-		if (!namespace || !namespace->root)
+	spin_unlock(&vfsmount_lock);
+}
+
+EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
+
+/*
+ * Ripoff of 'select_parent()'
+ *
+ * search the list of submounts for a given mountpoint, and move any
+ * shrinkable submounts to the 'graveyard' list.
+ */
+static int select_submounts(struct vfsmount *parent, struct list_head *graveyard)
+{
+	struct vfsmount *this_parent = parent;
+	struct list_head *next;
+	int found = 0;
+
+repeat:
+	next = this_parent->mnt_mounts.next;
+resume:
+	while (next != &this_parent->mnt_mounts) {
+		struct list_head *tmp = next;
+		struct vfsmount *mnt = list_entry(tmp, struct vfsmount, mnt_child);
+
+		next = tmp->next;
+		if (!(mnt->mnt_flags & MNT_SHRINKABLE))
 			continue;
-		get_namespace(namespace);
+		/*
+		 * Descend a level if the d_mounts list is non-empty.
+		 */
+		if (!list_empty(&mnt->mnt_mounts)) {
+			this_parent = mnt;
+			goto repeat;
+		}
 
-		spin_unlock(&vfsmount_lock);
-		down_write(&namespace_sem);
-		expire_mount(mnt, mounts, &umounts);
-		up_write(&namespace_sem);
-		release_mounts(&umounts);
-		mntput(mnt);
-		put_namespace(namespace);
-		spin_lock(&vfsmount_lock);
+		if (!propagate_mount_busy(mnt, 1)) {
+			mntget(mnt);
+			list_move_tail(&mnt->mnt_expire, graveyard);
+			found++;
+		}
 	}
+	/*
+	 * All done at this level ... ascend and resume the search
+	 */
+	if (this_parent != parent) {
+		next = this_parent->mnt_child.next;
+		this_parent = this_parent->mnt_parent;
+		goto resume;
+	}
+	return found;
+}
+
+/*
+ * process a list of expirable mountpoints with the intent of discarding any
+ * submounts of a specific parent mountpoint
+ */
+void shrink_submounts(struct vfsmount *mountpoint, struct list_head *mounts)
+{
+	LIST_HEAD(graveyard);
+	int found;
+
+	spin_lock(&vfsmount_lock);
+
+	/* extract submounts of 'mountpoint' from the expiration list */
+	while ((found = select_submounts(mountpoint, &graveyard)) != 0)
+		expire_mount_list(&graveyard, mounts);
 
 	spin_unlock(&vfsmount_lock);
 }
 
-EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
+EXPORT_SYMBOL_GPL(shrink_submounts);
 
 /*
  * Some copy_from_user() implementations do not return the exact number of
-- 
cgit v1.2.3


From 8b512d9a88875affe584bb3d2a7a235f84343b9e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:34:18 -0400
Subject: VFS: Remove dependency of ->umount_begin() call on MNT_FORCE

Allow filesystems to decide to perform pre-umount processing whether or not
MNT_FORCE is set.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/9p/vfs_super.c |  7 ++++---
 fs/cifs/cifsfs.c  |  6 ++++--
 fs/fuse/inode.c   |  5 +++--
 fs/namespace.c    |  4 ++--
 fs/nfs/inode.c    | 14 +++++++++-----
 5 files changed, 22 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 61c599b4a1e..00c1f6baf87 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -253,11 +253,12 @@ static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt)
 }
 
 static void
-v9fs_umount_begin(struct super_block *sb)
+v9fs_umount_begin(struct vfsmount *vfsmnt, int flags)
 {
-	struct v9fs_session_info *v9ses = sb->s_fs_info;
+	struct v9fs_session_info *v9ses = vfsmnt->mnt_sb->s_fs_info;
 
-	v9fs_session_cancel(v9ses);
+	if (flags & MNT_FORCE)
+		v9fs_session_cancel(v9ses);
 }
 
 static struct super_operations v9fs_super_ops = {
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index c262d8874ce..3fdc2258f44 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -402,12 +402,14 @@ static struct quotactl_ops cifs_quotactl_ops = {
 #endif
 
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-static void cifs_umount_begin(struct super_block * sblock)
+static void cifs_umount_begin(struct vfsmount * vfsmnt, int flags)
 {
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo * tcon;
 
-	cifs_sb = CIFS_SB(sblock);
+	if (!(flags & MNT_FORCE))
+		return;
+	cifs_sb = CIFS_SB(vfsmnt->mnt_sb);
 	if(cifs_sb == NULL)
 		return;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 7627022446b..13ebe5780c9 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -195,9 +195,10 @@ struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid,
 	return inode;
 }
 
-static void fuse_umount_begin(struct super_block *sb)
+static void fuse_umount_begin(struct vfsmount *vfsmnt, int flags)
 {
-	fuse_abort_conn(get_fuse_conn_super(sb));
+	if (flags & MNT_FORCE)
+		fuse_abort_conn(get_fuse_conn_super(vfsmnt->mnt_sb));
 }
 
 static void fuse_put_super(struct super_block *sb)
diff --git a/fs/namespace.c b/fs/namespace.c
index b22e469ab56..6bb0b85293e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -576,8 +576,8 @@ static int do_umount(struct vfsmount *mnt, int flags)
 	 */
 
 	lock_kernel();
-	if ((flags & MNT_FORCE) && sb->s_op->umount_begin)
-		sb->s_op->umount_begin(sb);
+	if (sb->s_op->umount_begin)
+		sb->s_op->umount_begin(mnt, flags);
 	unlock_kernel();
 
 	/*
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 9ff039f9a83..fda2b496617 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -63,7 +63,7 @@ static struct inode *nfs_alloc_inode(struct super_block *sb);
 static void nfs_destroy_inode(struct inode *);
 static int nfs_write_inode(struct inode *,int);
 static void nfs_clear_inode(struct inode *);
-static void nfs_umount_begin(struct super_block *);
+static void nfs_umount_begin(struct vfsmount *, int);
 static int  nfs_statfs(struct super_block *, struct kstatfs *);
 static int  nfs_show_options(struct seq_file *, struct vfsmount *);
 static int  nfs_show_stats(struct seq_file *, struct vfsmount *);
@@ -162,15 +162,19 @@ nfs_clear_inode(struct inode *inode)
 	BUG_ON(atomic_read(&nfsi->data_updates) != 0);
 }
 
-void
-nfs_umount_begin(struct super_block *sb)
+static void nfs_umount_begin(struct vfsmount *vfsmnt, int flags)
 {
-	struct rpc_clnt	*rpc = NFS_SB(sb)->client;
+	struct nfs_server *server;
+	struct rpc_clnt	*rpc;
 
+	if (!(flags & MNT_FORCE))
+		return;
 	/* -EIO all pending I/O */
+	server = NFS_SB(vfsmnt->mnt_sb);
+	rpc = server->client;
 	if (!IS_ERR(rpc))
 		rpc_killall_tasks(rpc);
-	rpc = NFS_SB(sb)->client_acl;
+	rpc = server->client_acl;
 	if (!IS_ERR(rpc))
 		rpc_killall_tasks(rpc);
 }
-- 
cgit v1.2.3


From 8b4bdcf8995dd92b23d2ec22b32aee8fbbb50e1c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:34:19 -0400
Subject: NFS: Store the file system "fsid" value in the NFS super block.

This should enable us to detect if we are crossing a mountpoint in the
case where the server is exporting "nohide" mounts.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/idmap.c   | 1 -
 fs/nfs/inode.c   | 8 ++++++++
 fs/nfs/nfs2xdr.c | 3 ++-
 fs/nfs/nfs3xdr.c | 3 ++-
 fs/nfs/nfs4xdr.c | 4 ++--
 5 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 3fab5b0cfc5..b81e7ed3c90 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -47,7 +47,6 @@
 #include <linux/workqueue.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>
 
-#include <linux/nfs_fs_sb.h>
 #include <linux/nfs_fs.h>
 
 #include <linux/nfs_idmap.h>
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index fda2b496617..1a809f6f898 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -236,6 +236,7 @@ nfs_get_root(struct super_block *sb, struct nfs_fh *rootfh, struct nfs_fsinfo *f
 		return ERR_PTR(error);
 	}
 
+	server->fsid = fsinfo->fattr->fsid;
 	return nfs_fhget(sb, rootfh, fsinfo->fattr);
 }
 
@@ -1493,6 +1494,7 @@ out:
  */
 static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
+	struct nfs_server *server;
 	struct nfs_inode *nfsi = NFS_I(inode);
 	loff_t cur_isize, new_isize;
 	unsigned int	invalid = 0;
@@ -1511,6 +1513,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
 		goto out_changed;
 
+	server = NFS_SERVER(inode);
+	/* Update the fsid if and only if this is the root directory */
+	if (inode == inode->i_sb->s_root->d_inode
+			&& !nfs_fsid_equal(&server->fsid, &fattr->fsid))
+		server->fsid = fattr->fsid;
+
 	/*
 	 * Update the read time so we don't revalidate too often.
 	 */
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index f0015fa876e..a7ed88f97a1 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -131,7 +131,8 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
 	fattr->du.nfs2.blocksize = ntohl(*p++);
 	rdev = ntohl(*p++);
 	fattr->du.nfs2.blocks = ntohl(*p++);
-	fattr->fsid_u.nfs3 = ntohl(*p++);
+	fattr->fsid.major = ntohl(*p++);
+	fattr->fsid.minor = 0;
 	fattr->fileid = ntohl(*p++);
 	p = xdr_decode_time(p, &fattr->atime);
 	p = xdr_decode_time(p, &fattr->mtime);
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index ec233619687..f70eee2cac0 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -166,7 +166,8 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
 	if (MAJOR(fattr->rdev) != major || MINOR(fattr->rdev) != minor)
 		fattr->rdev = 0;
 
-	p = xdr_decode_hyper(p, &fattr->fsid_u.nfs3);
+	p = xdr_decode_hyper(p, &fattr->fsid.major);
+	fattr->fsid.minor = 0;
 	p = xdr_decode_hyper(p, &fattr->fileid);
 	p = xdr_decode_time3(p, &fattr->atime);
 	p = xdr_decode_time3(p, &fattr->mtime);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 7e9a840057f..0d579467594 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2217,7 +2217,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
 	return 0;
 }
 
-static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fsid *fsid)
+static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
 {
 	uint32_t *p;
 
@@ -2863,7 +2863,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
 		goto xdr_error;
 	if ((status = decode_attr_size(xdr, bitmap, &fattr->size)) != 0)
 		goto xdr_error;
-	if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid_u.nfs4)) != 0)
+	if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid)) != 0)
 		goto xdr_error;
 	if ((status = decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0)
 		goto xdr_error;
-- 
cgit v1.2.3


From 55a975937d40cac582e981ddc8ed783b3dcc043c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:34:19 -0400
Subject: NFS: Ensure the client submounts, when it crosses a server
 mountpoint.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/Makefile    |   3 +-
 fs/nfs/dir.c       |  16 +++
 fs/nfs/inode.c     | 303 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/nfs/namespace.c |  89 ++++++++++++++++
 fs/nfs/nfs4_fs.h   |   1 +
 fs/nfs/nfs4proc.c  |   2 +-
 6 files changed, 409 insertions(+), 5 deletions(-)
 create mode 100644 fs/nfs/namespace.c

(limited to 'fs')

diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index ec61fd56a1a..d9d494cee38 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -5,7 +5,8 @@
 obj-$(CONFIG_NFS_FS) += nfs.o
 
 nfs-y 			:= dir.o file.o inode.o nfs2xdr.o pagelist.o \
-			   proc.o read.o symlink.o unlink.o write.o
+			   proc.o read.o symlink.o unlink.o write.o \
+			   namespace.o
 nfs-$(CONFIG_ROOT_NFS)	+= nfsroot.o mount_clnt.o      
 nfs-$(CONFIG_NFS_V3)	+= nfs3proc.o nfs3xdr.o
 nfs-$(CONFIG_NFS_V3_ACL)	+= nfs3acl.o
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 1d3d8922a66..3ddda6f7ecc 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -868,6 +868,17 @@ int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd)
 	return (nd->intent.open.flags & O_EXCL) != 0;
 }
 
+static inline int nfs_reval_fsid(struct inode *dir,
+		struct nfs_fh *fh, struct nfs_fattr *fattr)
+{
+	struct nfs_server *server = NFS_SERVER(dir);
+
+	if (!nfs_fsid_equal(&server->fsid, &fattr->fsid))
+		/* Revalidate fsid on root dir */
+		return __nfs_revalidate_inode(server, dir->i_sb->s_root->d_inode);
+	return 0;
+}
+
 static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
 {
 	struct dentry *res;
@@ -900,6 +911,11 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
 		res = ERR_PTR(error);
 		goto out_unlock;
 	}
+	error = nfs_reval_fsid(dir, &fhandle, &fattr);
+	if (error < 0) {
+		res = ERR_PTR(error);
+		goto out_unlock;
+	}
 	inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr);
 	res = (struct dentry *)inode;
 	if (IS_ERR(res))
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 1a809f6f898..47167ab64f5 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -221,6 +221,14 @@ nfs_block_size(unsigned long bsize, unsigned char *nrbitsp)
 	return nfs_block_bits(bsize, nrbitsp);
 }
 
+static inline void
+nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
+{
+	sb->s_maxbytes = (loff_t)maxfilesize;
+	if (sb->s_maxbytes > MAX_LFS_FILESIZE || sb->s_maxbytes <= 0)
+		sb->s_maxbytes = MAX_LFS_FILESIZE;
+}
+
 /*
  * Obtain the root inode of the file system.
  */
@@ -331,9 +339,7 @@ nfs_sb_init(struct super_block *sb, rpc_authflavor_t authflavor)
 	}
 	server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
 
-	sb->s_maxbytes = fsinfo.maxfilesize;
-	if (sb->s_maxbytes > MAX_LFS_FILESIZE) 
-		sb->s_maxbytes = MAX_LFS_FILESIZE; 
+	nfs_super_set_maxbytes(sb, fsinfo.maxfilesize);
 
 	server->client->cl_intr = (server->flags & NFS_MOUNT_INTR) ? 1 : 0;
 	server->client->cl_softrtry = (server->flags & NFS_MOUNT_SOFT) ? 1 : 0;
@@ -877,6 +883,11 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 			if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)
 			    && fattr->size <= NFS_LIMIT_READDIRPLUS)
 				set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode));
+			/* Deal with crossing mountpoints */
+			if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
+				inode->i_op = &nfs_mountpoint_inode_operations;
+				inode->i_fop = NULL;
+			}
 		} else if (S_ISLNK(inode->i_mode))
 			inode->i_op = &nfs_symlink_inode_operations;
 		else
@@ -1650,6 +1661,141 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
  * File system information
  */
 
+/*
+ * nfs_path - reconstruct the path given an arbitrary dentry
+ * @base - arbitrary string to prepend to the path
+ * @dentry - pointer to dentry
+ * @buffer - result buffer
+ * @buflen - length of buffer
+ *
+ * Helper function for constructing the path from the
+ * root dentry to an arbitrary hashed dentry.
+ *
+ * This is mainly for use in figuring out the path on the
+ * server side when automounting on top of an existing partition.
+ */
+static char *nfs_path(const char *base, const struct dentry *dentry,
+		      char *buffer, ssize_t buflen)
+{
+	char *end = buffer+buflen;
+	int namelen;
+
+	*--end = '\0';
+	buflen--;
+	spin_lock(&dcache_lock);
+	while (!IS_ROOT(dentry)) {
+		namelen = dentry->d_name.len;
+		buflen -= namelen + 1;
+		if (buflen < 0)
+			goto Elong;
+		end -= namelen;
+		memcpy(end, dentry->d_name.name, namelen);
+		*--end = '/';
+		dentry = dentry->d_parent;
+	}
+	spin_unlock(&dcache_lock);
+	namelen = strlen(base);
+	/* Strip off excess slashes in base string */
+	while (namelen > 0 && base[namelen - 1] == '/')
+		namelen--;
+	buflen -= namelen;
+	if (buflen < 0)
+		goto Elong;
+	end -= namelen;
+	memcpy(end, base, namelen);
+	return end;
+Elong:
+	return ERR_PTR(-ENAMETOOLONG);
+}
+
+struct nfs_clone_mount {
+	const struct super_block *sb;
+	const struct dentry *dentry;
+	struct nfs_fh *fh;
+	struct nfs_fattr *fattr;
+};
+
+static struct super_block *nfs_clone_generic_sb(struct nfs_clone_mount *data,
+		struct super_block *(*clone_client)(struct nfs_server *, struct nfs_clone_mount *))
+{
+	struct nfs_server *server;
+	struct nfs_server *parent = NFS_SB(data->sb);
+	struct super_block *sb = ERR_PTR(-EINVAL);
+	void *err = ERR_PTR(-ENOMEM);
+	struct inode *root_inode;
+	struct nfs_fsinfo fsinfo;
+	int len;
+
+	server = kmalloc(sizeof(struct nfs_server), GFP_KERNEL);
+	if (server == NULL)
+		goto out_err;
+	memcpy(server, parent, sizeof(*server));
+	len = strlen(parent->hostname) + 1;
+	server->hostname = kmalloc(len, GFP_KERNEL);
+	if (server->hostname == NULL)
+		goto free_server;
+	memcpy(server->hostname, parent->hostname, len);
+	server->fsid = data->fattr->fsid;
+	nfs_copy_fh(&server->fh, data->fh);
+	if (rpciod_up() != 0)
+		goto free_hostname;
+
+	sb = clone_client(server, data);
+	if (IS_ERR((err = sb)) || sb->s_root)
+		goto kill_rpciod;
+
+	sb->s_op = data->sb->s_op;
+	sb->s_blocksize = data->sb->s_blocksize;
+	sb->s_blocksize_bits = data->sb->s_blocksize_bits;
+	sb->s_maxbytes = data->sb->s_maxbytes;
+
+	server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
+	err = ERR_PTR(-ENOMEM);
+	server->io_stats = nfs_alloc_iostats();
+	if (server->io_stats == NULL)
+		goto out_deactivate;
+
+	server->client = rpc_clone_client(parent->client);
+	if (IS_ERR((err = server->client)))
+		goto out_deactivate;
+	if (!IS_ERR(parent->client_sys)) {
+		server->client_sys = rpc_clone_client(parent->client_sys);
+		if (IS_ERR((err = server->client_sys)))
+			goto out_deactivate;
+	}
+	if (!IS_ERR(parent->client_acl)) {
+		server->client_acl = rpc_clone_client(parent->client_acl);
+		if (IS_ERR((err = server->client_acl)))
+			goto out_deactivate;
+	}
+	root_inode = nfs_fhget(sb, data->fh, data->fattr);
+	if (!root_inode)
+		goto out_deactivate;
+	sb->s_root = d_alloc_root(root_inode);
+	if (!sb->s_root)
+		goto out_put_root;
+	fsinfo.fattr = data->fattr;
+	if (NFS_PROTO(root_inode)->fsinfo(server, data->fh, &fsinfo) == 0)
+		nfs_super_set_maxbytes(sb, fsinfo.maxfilesize);
+	sb->s_root->d_op = server->rpc_ops->dentry_ops;
+	sb->s_flags |= MS_ACTIVE;
+	return sb;
+out_put_root:
+	iput(root_inode);
+out_deactivate:
+	up_write(&sb->s_umount);
+	deactivate_super(sb);
+	return (struct super_block *)err;
+kill_rpciod:
+	rpciod_down();
+free_hostname:
+	kfree(server->hostname);
+free_server:
+	kfree(server);
+out_err:
+	return (struct super_block *)err;
+}
+
 static int nfs_set_super(struct super_block *s, void *data)
 {
 	s->s_fs_info = data;
@@ -1807,6 +1953,31 @@ static struct file_system_type nfs_fs_type = {
 	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
 
+static struct super_block *nfs_clone_client(struct nfs_server *server, struct nfs_clone_mount *data)
+{
+	struct super_block *sb;
+
+	sb = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server);
+	if (!IS_ERR(sb) && sb->s_root == NULL && !(server->flags & NFS_MOUNT_NONLM))
+		lockd_up();
+	return sb;
+}
+
+static struct super_block *nfs_clone_nfs_sb(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *raw_data)
+{
+	struct nfs_clone_mount *data = raw_data;
+	return nfs_clone_generic_sb(data, nfs_clone_client);
+}
+
+static struct file_system_type clone_nfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "nfs",
+	.get_sb		= nfs_clone_nfs_sb,
+	.kill_sb	= nfs_kill_super,
+	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+};
+
 #ifdef CONFIG_NFS_V4
 
 static void nfs4_clear_inode(struct inode *);
@@ -2156,6 +2327,75 @@ static int param_set_idmap_timeout(const char *val, struct kernel_param *kp)
 module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int,
 		 &nfs_idmap_cache_timeout, 0644);
 
+/* Constructs the SERVER-side path */
+static inline char *nfs4_path(const struct dentry *dentry, char *buffer, ssize_t buflen)
+{
+	return nfs_path(NFS_SB(dentry->d_sb)->mnt_path, dentry, buffer, buflen);
+}
+
+static inline char *nfs4_dup_path(const struct dentry *dentry)
+{
+	char *page = (char *) __get_free_page(GFP_USER);
+	char *path;
+
+	path = nfs4_path(dentry, page, PAGE_SIZE);
+	if (!IS_ERR(path)) {
+		int len = PAGE_SIZE + page - path;
+		char *tmp = path;
+
+		path = kmalloc(len, GFP_KERNEL);
+		if (path)
+			memcpy(path, tmp, len);
+		else
+			path = ERR_PTR(-ENOMEM);
+	}
+	free_page((unsigned long)page);
+	return path;
+}
+
+static struct super_block *nfs4_clone_client(struct nfs_server *server, struct nfs_clone_mount *data)
+{
+	const struct dentry *dentry = data->dentry;
+	struct nfs4_client *clp = server->nfs4_state;
+	struct super_block *sb;
+
+	server->mnt_path = nfs4_dup_path(dentry);
+	if (IS_ERR(server->mnt_path)) {
+		sb = (struct super_block *)server->mnt_path;
+		goto err;
+	}
+	sb = sget(&nfs4_fs_type, nfs4_compare_super, nfs_set_super, server);
+	if (IS_ERR(sb) || sb->s_root)
+		goto free_path;
+	nfs4_server_capabilities(server, &server->fh);
+
+	down_write(&clp->cl_sem);
+	atomic_inc(&clp->cl_count);
+	list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
+	up_write(&clp->cl_sem);
+	return sb;
+free_path:
+	kfree(server->mnt_path);
+err:
+	server->mnt_path = NULL;
+	return sb;
+}
+
+static struct super_block *nfs_clone_nfs4_sb(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *raw_data)
+{
+	struct nfs_clone_mount *data = raw_data;
+	return nfs_clone_generic_sb(data, nfs4_clone_client);
+}
+
+static struct file_system_type clone_nfs4_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "nfs",
+	.get_sb		= nfs_clone_nfs4_sb,
+	.kill_sb	= nfs4_kill_super,
+	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+};
+
 #define nfs4_init_once(nfsi) \
 	do { \
 		INIT_LIST_HEAD(&(nfsi)->open_states); \
@@ -2183,12 +2423,69 @@ static inline void unregister_nfs4fs(void)
 	nfs_unregister_sysctl();
 }
 #else
+#define nfs4_clone_client(a,b) ERR_PTR(-EINVAL)
 #define nfs4_init_once(nfsi) \
 	do { } while (0)
 #define register_nfs4fs() (0)
 #define unregister_nfs4fs()
 #endif
 
+static inline char *nfs_devname(const struct vfsmount *mnt_parent,
+			 const struct dentry *dentry,
+			 char *buffer, ssize_t buflen)
+{
+	return nfs_path(mnt_parent->mnt_devname, dentry, buffer, buflen);
+}
+
+/**
+ * nfs_do_submount - set up mountpoint when crossing a filesystem boundary
+ * @mnt_parent - mountpoint of parent directory
+ * @dentry - parent directory
+ * @fh - filehandle for new root dentry
+ * @fattr - attributes for new root inode
+ *
+ */
+struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
+		const struct dentry *dentry, struct nfs_fh *fh,
+		struct nfs_fattr *fattr)
+{
+	struct nfs_clone_mount mountdata = {
+		.sb = mnt_parent->mnt_sb,
+		.dentry = dentry,
+		.fh = fh,
+		.fattr = fattr,
+	};
+	struct vfsmount *mnt = ERR_PTR(-ENOMEM);
+	char *page = (char *) __get_free_page(GFP_USER);
+	char *devname;
+
+	dprintk("%s: submounting on %s/%s\n", __FUNCTION__,
+			dentry->d_parent->d_name.name,
+			dentry->d_name.name);
+	if (page == NULL)
+		goto out;
+	devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE);
+	mnt = (struct vfsmount *)devname;
+	if (IS_ERR(devname))
+		goto free_page;
+	switch (NFS_SB(mnt_parent->mnt_sb)->rpc_ops->version) {
+		case 2:
+		case 3:
+			mnt = vfs_kern_mount(&clone_nfs_fs_type, 0, devname, &mountdata);
+			break;
+		case 4:
+			mnt = vfs_kern_mount(&clone_nfs4_fs_type, 0, devname, &mountdata);
+			break;
+		default:
+			BUG();
+	}
+free_page:
+	free_page((unsigned long)page);
+out:
+	dprintk("%s: done\n", __FUNCTION__);
+	return mnt;
+}
+
 extern int nfs_init_nfspagecache(void);
 extern void nfs_destroy_nfspagecache(void);
 extern int nfs_init_readpagecache(void);
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
new file mode 100644
index 00000000000..a155505c36f
--- /dev/null
+++ b/fs/nfs/namespace.c
@@ -0,0 +1,89 @@
+/*
+ * linux/fs/nfs/namespace.c
+ *
+ * Copyright (C) 2005 Trond Myklebust <Trond.Myklebust@netapp.com>
+ *
+ * NFS namespace
+ */
+
+#include <linux/config.h>
+
+#include <linux/dcache.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/nfs_fs.h>
+#include <linux/string.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/vfs.h>
+
+#define NFSDBG_FACILITY		NFSDBG_VFS
+
+/*
+ * nfs_follow_mountpoint - handle crossing a mountpoint on the server
+ * @dentry - dentry of mountpoint
+ * @nd - nameidata info
+ *
+ * When we encounter a mountpoint on the server, we want to set up
+ * a mountpoint on the client too, to prevent inode numbers from
+ * colliding, and to allow "df" to work properly.
+ * On NFSv4, we also want to allow for the fact that different
+ * filesystems may be migrated to different servers in a failover
+ * situation, and that different filesystems may want to use
+ * different security flavours.
+ */
+static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
+{
+	struct vfsmount *mnt;
+	struct nfs_server *server = NFS_SERVER(dentry->d_inode);
+	struct dentry *parent;
+	struct nfs_fh fh;
+	struct nfs_fattr fattr;
+	int err;
+
+	BUG_ON(IS_ROOT(dentry));
+	dprintk("%s: enter\n", __FUNCTION__);
+	dput(nd->dentry);
+	nd->dentry = dget(dentry);
+	if (d_mountpoint(nd->dentry))
+		goto out_follow;
+	/* Look it up again */
+	parent = dget_parent(nd->dentry);
+	err = server->rpc_ops->lookup(parent->d_inode, &nd->dentry->d_name, &fh, &fattr);
+	dput(parent);
+	if (err != 0)
+		goto out_err;
+
+	mnt = nfs_do_submount(nd->mnt, nd->dentry, &fh, &fattr);
+	err = PTR_ERR(mnt);
+	if (IS_ERR(mnt))
+		goto out_err;
+
+	mntget(mnt);
+	err = do_add_mount(mnt, nd, nd->mnt->mnt_flags, NULL);
+	if (err < 0) {
+		mntput(mnt);
+		if (err == -EBUSY)
+			goto out_follow;
+		goto out_err;
+	}
+	mntput(nd->mnt);
+	dput(nd->dentry);
+	nd->mnt = mnt;
+	nd->dentry = dget(mnt->mnt_root);
+out:
+	dprintk("%s: done, returned %d\n", __FUNCTION__, err);
+	return ERR_PTR(err);
+out_err:
+	path_release(nd);
+	goto out;
+out_follow:
+	while(d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
+		;
+	err = 0;
+	goto out;
+}
+
+struct inode_operations nfs_mountpoint_inode_operations = {
+	.follow_link	= nfs_follow_mountpoint,
+	.getattr	= nfs_getattr,
+};
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 0f5e4e7cdde..307832fd1a4 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -217,6 +217,7 @@ extern int nfs4_proc_renew(struct nfs4_client *, struct rpc_cred *);
 extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state);
 extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
 extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
+extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
 
 extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops;
 extern struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ef4c6cccf95..308407205e6 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1331,7 +1331,7 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
 	return status;
 }
 
-static int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
+int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
 {
 	struct nfs4_exception exception = { };
 	int err;
-- 
cgit v1.2.3


From 51d8fa6a109589d522c18a8e9bf3fb167a91b1bc Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:34:20 -0400
Subject: NFS: Add timeout to submounts

Make automounted partitions expire using the mark_mounts_for_expiry()
function. The timeout is controlled via a sysctl.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c     |  3 +++
 fs/nfs/namespace.c | 25 ++++++++++++++++++++++++-
 fs/nfs/sysctl.c    | 10 ++++++++++
 3 files changed, 37 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 47167ab64f5..3eea556d8f5 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -167,6 +167,7 @@ static void nfs_umount_begin(struct vfsmount *vfsmnt, int flags)
 	struct nfs_server *server;
 	struct rpc_clnt	*rpc;
 
+	shrink_submounts(vfsmnt, &nfs_automount_list);
 	if (!(flags & MNT_FORCE))
 		return;
 	/* -EIO all pending I/O */
@@ -1943,6 +1944,7 @@ static void nfs_kill_super(struct super_block *s)
 	nfs_free_iostats(server->io_stats);
 	kfree(server->hostname);
 	kfree(server);
+	nfs_release_automount_timer();
 }
 
 static struct file_system_type nfs_fs_type = {
@@ -2288,6 +2290,7 @@ static void nfs4_kill_super(struct super_block *sb)
 	nfs_free_iostats(server->io_stats);
 	kfree(server->hostname);
 	kfree(server);
+	nfs_release_automount_timer();
 }
 
 static struct file_system_type nfs4_fs_type = {
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index a155505c36f..e426516c111 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -18,6 +18,11 @@
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
 
+LIST_HEAD(nfs_automount_list);
+static void nfs_expire_automounts(void *list);
+static DECLARE_WORK(nfs_automount_task, nfs_expire_automounts, &nfs_automount_list);
+int nfs_mountpoint_expiry_timeout = 500 * HZ;
+
 /*
  * nfs_follow_mountpoint - handle crossing a mountpoint on the server
  * @dentry - dentry of mountpoint
@@ -59,7 +64,7 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 		goto out_err;
 
 	mntget(mnt);
-	err = do_add_mount(mnt, nd, nd->mnt->mnt_flags, NULL);
+	err = do_add_mount(mnt, nd, nd->mnt->mnt_flags|MNT_SHRINKABLE, &nfs_automount_list);
 	if (err < 0) {
 		mntput(mnt);
 		if (err == -EBUSY)
@@ -70,6 +75,7 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 	dput(nd->dentry);
 	nd->mnt = mnt;
 	nd->dentry = dget(mnt->mnt_root);
+	schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
 out:
 	dprintk("%s: done, returned %d\n", __FUNCTION__, err);
 	return ERR_PTR(err);
@@ -87,3 +93,20 @@ struct inode_operations nfs_mountpoint_inode_operations = {
 	.follow_link	= nfs_follow_mountpoint,
 	.getattr	= nfs_getattr,
 };
+
+static void nfs_expire_automounts(void *data)
+{
+	struct list_head *list = (struct list_head *)data;
+
+	mark_mounts_for_expiry(list);
+	if (!list_empty(list))
+		schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
+}
+
+void nfs_release_automount_timer(void)
+{
+	if (list_empty(&nfs_automount_list)) {
+		cancel_delayed_work(&nfs_automount_task);
+		flush_scheduled_work();
+	}
+}
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index 4c486eb867c..db61e51bb15 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -12,6 +12,7 @@
 #include <linux/module.h>
 #include <linux/nfs4.h>
 #include <linux/nfs_idmap.h>
+#include <linux/nfs_fs.h>
 
 #include "callback.h"
 
@@ -46,6 +47,15 @@ static ctl_table nfs_cb_sysctls[] = {
 		.strategy = &sysctl_jiffies,
 	},
 #endif
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "nfs_mountpoint_timeout",
+		.data		= &nfs_mountpoint_expiry_timeout,
+		.maxlen		= sizeof(nfs_mountpoint_expiry_timeout),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+		.strategy	= &sysctl_jiffies,
+	},
 	{ .ctl_name = 0 }
 };
 
-- 
cgit v1.2.3


From 683b57b435326eb512c7305892683b6205669448 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:34:22 -0400
Subject: NFSv4: Implement the fs_locations function call

NFSv4 allows for the fact that filesystems may be replicated across
several servers or that they may be migrated to a backup server in case of
failure of the primary server.
fs_locations is an NFSv4 operation for retrieving information about the
location of migrated and/or replicated filesystems.

Based on an initial implementation by Jiaying Zhang <jiayingz@citi.umich.edu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4_fs.h  |   2 +
 fs/nfs/nfs4proc.c |  29 ++++++++++++++
 fs/nfs/nfs4xdr.c  | 112 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 141 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 307832fd1a4..5b765117121 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -218,6 +218,8 @@ extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state);
 extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
 extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
 extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
+extern int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
+		struct nfs_fs_locations *fs_locations, struct page *page);
 
 extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops;
 extern struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 308407205e6..768514dc0c4 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3570,6 +3570,35 @@ ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen)
 	return len;
 }
 
+int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
+		struct nfs_fs_locations *fs_locations, struct page *page)
+{
+	struct nfs_server *server = NFS_SERVER(dir);
+	u32 bitmask[2] = {
+		[0] = server->attr_bitmask[0] | FATTR4_WORD0_FS_LOCATIONS,
+		[1] = server->attr_bitmask[1],
+	};
+	struct nfs4_fs_locations_arg args = {
+		.dir_fh = NFS_FH(dir),
+		.name = &dentry->d_name,
+		.page = page,
+		.bitmask = bitmask,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS],
+		.rpc_argp = &args,
+		.rpc_resp = &fs_locations,
+	};
+	int status;
+
+	dprintk("%s: start\n", __FUNCTION__);
+	fs_locations->fattr.valid = 0;
+	fs_locations->server = server;
+	status = rpc_call_sync(server->client, &msg, 0);
+	dprintk("%s: returned status = %d\n", __FUNCTION__, status);
+	return status;
+}
+
 struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops = {
 	.recover_open	= nfs4_open_reclaim,
 	.recover_lock	= nfs4_lock_reclaim,
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 0d579467594..7add3137b6b 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -411,6 +411,15 @@ static int nfs_stat_to_errno(int);
 #define NFS4_dec_setacl_sz	(compound_decode_hdr_maxsz + \
 				decode_putfh_maxsz + \
 				op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz)
+#define NFS4_enc_fs_locations_sz \
+				(compound_encode_hdr_maxsz + \
+				 encode_putfh_maxsz + \
+				 encode_getattr_maxsz)
+#define NFS4_dec_fs_locations_sz \
+				(compound_decode_hdr_maxsz + \
+				 decode_putfh_maxsz + \
+				 op_decode_hdr_maxsz + \
+				 nfs4_fattr_bitmap_maxsz)
 
 static struct {
 	unsigned int	mode;
@@ -2002,6 +2011,38 @@ out:
 	return status;
 }
 
+/*
+ * Encode FS_LOCATIONS request
+ */
+static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs_locations_arg *args)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr = {
+		.nops = 3,
+	};
+	struct rpc_auth *auth = req->rq_task->tk_auth;
+	int replen;
+	int status;
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	encode_compound_hdr(&xdr, &hdr);
+	if ((status = encode_putfh(&xdr, args->dir_fh)) != 0)
+		goto out;
+	if ((status = encode_lookup(&xdr, args->name)) != 0)
+		goto out;
+	if ((status = encode_getfattr(&xdr, args->bitmask)) != 0)
+		goto out;
+	/* set up reply
+	 *   toplevel_status + taglen + rescount + OP_PUTFH + status
+	 *   + OP_LOOKUP + status + OP_GETATTR + status = 7
+	 */
+	replen = (RPC_REPHDRSIZE + auth->au_rslack + 7) << 2;
+	xdr_inline_pages(&req->rq_rcv_buf, replen, &args->page,
+			0, PAGE_SIZE);
+out:
+	return status;
+}
+
 /*
  * START OF "GENERIC" DECODE ROUTINES.
  *   These may look a little ugly since they are imported from a "generic"
@@ -2036,7 +2077,7 @@ out:
 	} \
 } while (0)
 
-static int decode_opaque_inline(struct xdr_stream *xdr, uint32_t *len, char **string)
+static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string)
 {
 	uint32_t *p;
 
@@ -2087,7 +2128,7 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
 static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs4_client *clp)
 {
 	uint32_t *p;
-	uint32_t strlen;
+	unsigned int strlen;
 	char *str;
 
 	READ_BUF(12);
@@ -2336,6 +2377,45 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 	return status;
 }
 
+static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fs_locations *res)
+{
+	int n;
+	uint32_t *p;
+	int status = -EIO;
+
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_FS_LOCATIONS -1U)))
+		goto out;
+	status = 0;
+	if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS)))
+		goto out;
+	status = decode_opaque_inline(xdr, &res->fs_pathlen, &res->fs_path);
+	if (unlikely(status != 0))
+		goto out;
+	READ_BUF(4);
+	READ32(n);
+	if (n <= 0)
+		goto out_eio;
+	res->nlocations = 0;
+	while (res->nlocations < n) {
+		struct nfs_fs_location *loc = &res->locations[res->nlocations];
+
+		status = decode_opaque_inline(xdr, &loc->serverlen, &loc->server);
+		if (unlikely(status != 0))
+			goto out_eio;
+		status = decode_opaque_inline(xdr, &loc->rootpathlen, &loc->rootpath);
+		if (unlikely(status != 0))
+			goto out_eio;
+		if (res->nlocations < NFS_FS_LOCATIONS_MAXENTRIES)
+			res->nlocations++;
+	}
+out:
+	dprintk("%s: fs_locations done, error = %d\n", __FUNCTION__, status);
+	return status;
+out_eio:
+	status = -EIO;
+	goto out;
+}
+
 static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
 {
 	uint32_t *p;
@@ -2867,6 +2947,10 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
 		goto xdr_error;
 	if ((status = decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0)
 		goto xdr_error;
+	if ((status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr,
+						struct nfs_fs_locations,
+						fattr))) != 0)
+		goto xdr_error;
 	if ((status = decode_attr_mode(xdr, bitmap, &fattr->mode)) != 0)
 		goto xdr_error;
 	fattr->mode |= fmode;
@@ -4210,6 +4294,29 @@ out:
 	return status;
 }
 
+/*
+ * FS_LOCATIONS request
+ */
+static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs_fs_locations *res)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr;
+	int status;
+
+	xdr_init_decode(&xdr, &req->rq_rcv_buf, p);
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (status != 0)
+		goto out;
+	if ((status = decode_putfh(&xdr)) != 0)
+		goto out;
+	if ((status = decode_lookup(&xdr)) != 0)
+		goto out;
+	xdr_enter_page(&xdr, PAGE_SIZE);
+	status = decode_getfattr(&xdr, &res->fattr, res->server);
+out:
+	return status;
+}
+
 uint32_t *nfs4_decode_dirent(uint32_t *p, struct nfs_entry *entry, int plus)
 {
 	uint32_t bitmap[2] = {0};
@@ -4381,6 +4488,7 @@ struct rpc_procinfo	nfs4_procedures[] = {
   PROC(DELEGRETURN,	enc_delegreturn, dec_delegreturn),
   PROC(GETACL,		enc_getacl,	dec_getacl),
   PROC(SETACL,		enc_setacl,	dec_setacl),
+  PROC(FS_LOCATIONS,	enc_fs_locations, dec_fs_locations),
 };
 
 struct rpc_version		nfs_version4 = {
-- 
cgit v1.2.3


From 7aaa0b3bd4d215d9ce4d62b6c2043a63ba650f93 Mon Sep 17 00:00:00 2001
From: Manoj Naik <manoj@almaden.ibm.com>
Date: Fri, 9 Jun 2006 09:34:23 -0400
Subject: NFSv4: convert fs-locations-components to conform to RFC3530

Use component4-style formats for decoding list of servers and pathnames in
fs_locations.

Signed-off-by: Manoj Naik <manoj@almaden.ibm.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4_fs.h  |  2 +-
 fs/nfs/nfs4proc.c |  4 +--
 fs/nfs/nfs4xdr.c  | 80 ++++++++++++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 74 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 5b765117121..22a5f838ea5 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -219,7 +219,7 @@ extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct n
 extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *);
 extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
 extern int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
-		struct nfs_fs_locations *fs_locations, struct page *page);
+		struct nfs4_fs_locations *fs_locations, struct page *page);
 
 extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops;
 extern struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 768514dc0c4..043223a0eda 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3571,7 +3571,7 @@ ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen)
 }
 
 int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
-		struct nfs_fs_locations *fs_locations, struct page *page)
+		struct nfs4_fs_locations *fs_locations, struct page *page)
 {
 	struct nfs_server *server = NFS_SERVER(dir);
 	u32 bitmask[2] = {
@@ -3587,7 +3587,7 @@ int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS],
 		.rpc_argp = &args,
-		.rpc_resp = &fs_locations,
+		.rpc_resp = fs_locations,
 	};
 	int status;
 
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 7add3137b6b..f6a1ea7df37 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2377,7 +2377,43 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 	return status;
 }
 
-static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fs_locations *res)
+static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
+{
+	int n;
+	uint32_t *p;
+	int status = 0;
+
+	READ_BUF(4);
+	READ32(n);
+	if (n <= 0)
+		goto out_eio;
+	dprintk("path ");
+	path->ncomponents = 0;
+	while (path->ncomponents < n) {
+		struct nfs4_string *component = &path->components[path->ncomponents];
+		status = decode_opaque_inline(xdr, &component->len, &component->data);
+		if (unlikely(status != 0))
+			goto out_eio;
+		if (path->ncomponents != n)
+			dprintk("/");
+		dprintk("%s", component->data);
+		if (path->ncomponents < NFS4_PATHNAME_MAXCOMPONENTS)
+			path->ncomponents++;
+		else {
+			dprintk("cannot parse %d components in path\n", n);
+			goto out_eio;
+		}
+	}
+out:
+	dprintk("\n");
+	return status;
+out_eio:
+	dprintk(" status %d", status);
+	status = -EIO;
+	goto out;
+}
+
+static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res)
 {
 	int n;
 	uint32_t *p;
@@ -2388,7 +2424,8 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 	status = 0;
 	if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS)))
 		goto out;
-	status = decode_opaque_inline(xdr, &res->fs_pathlen, &res->fs_path);
+	dprintk("%s: fsroot ", __FUNCTION__);
+	status = decode_pathname(xdr, &res->fs_path);
 	if (unlikely(status != 0))
 		goto out;
 	READ_BUF(4);
@@ -2397,15 +2434,40 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
 		goto out_eio;
 	res->nlocations = 0;
 	while (res->nlocations < n) {
-		struct nfs_fs_location *loc = &res->locations[res->nlocations];
+		int m;
+		struct nfs4_fs_location *loc = &res->locations[res->nlocations];
 
-		status = decode_opaque_inline(xdr, &loc->serverlen, &loc->server);
-		if (unlikely(status != 0))
+		READ_BUF(4);
+		READ32(m);
+		if (m <= 0)
 			goto out_eio;
-		status = decode_opaque_inline(xdr, &loc->rootpathlen, &loc->rootpath);
+
+		loc->nservers = 0;
+		dprintk("%s: servers ", __FUNCTION__);
+		while (loc->nservers < m) {
+			struct nfs4_string *server = &loc->servers[loc->nservers];
+			status = decode_opaque_inline(xdr, &server->len, &server->data);
+			if (unlikely(status != 0))
+				goto out_eio;
+			dprintk("%s ", server->data);
+			if (loc->nservers < NFS4_FS_LOCATION_MAXSERVERS)
+				loc->nservers++;
+			else {
+				int i;
+				dprintk("%s: using first %d of %d servers returned for location %d\n", __FUNCTION__, NFS4_FS_LOCATION_MAXSERVERS, m, res->nlocations);
+				for (i = loc->nservers; i < m; i++) {
+					int len;
+					char *data;
+					status = decode_opaque_inline(xdr, &len, &data);
+					if (unlikely(status != 0))
+						goto out_eio;
+				}
+			}
+		}
+		status = decode_pathname(xdr, &loc->rootpath);
 		if (unlikely(status != 0))
 			goto out_eio;
-		if (res->nlocations < NFS_FS_LOCATIONS_MAXENTRIES)
+		if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES)
 			res->nlocations++;
 	}
 out:
@@ -2948,7 +3010,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
 	if ((status = decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0)
 		goto xdr_error;
 	if ((status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr,
-						struct nfs_fs_locations,
+						struct nfs4_fs_locations,
 						fattr))) != 0)
 		goto xdr_error;
 	if ((status = decode_attr_mode(xdr, bitmap, &fattr->mode)) != 0)
@@ -4297,7 +4359,7 @@ out:
 /*
  * FS_LOCATIONS request
  */
-static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs_fs_locations *res)
+static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs_locations *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
-- 
cgit v1.2.3


From 99baf625d3b9b8944920acc7c2d06079a37458c5 Mon Sep 17 00:00:00 2001
From: Manoj Naik <manoj@almaden.ibm.com>
Date: Fri, 9 Jun 2006 09:34:24 -0400
Subject: NFSv4: Decode mounted_on_fileid attribute in getattr.

It is ignored if fileid is also requested. This will be used on referrals
(fs_locations).

Signed-off-by: Manoj Naik <manoj@almaden.ibm.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index f6a1ea7df37..4b6613f6134 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2326,6 +2326,22 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
 	return 0;
 }
 
+static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
+{
+	uint32_t *p;
+
+	*fileid = 0;
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U)))
+		return -EIO;
+	if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) {
+		READ_BUF(8);
+		READ64(*fileid);
+		bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
+	}
+	dprintk("%s: fileid=%Lu\n", __FUNCTION__, (unsigned long long)*fileid);
+	return 0;
+}
+
 static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
 {
 	uint32_t *p;
@@ -2983,6 +2999,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
 		 bitmap[2] = {0},
 		 type;
 	int status, fmode = 0;
+	uint64_t fileid;
 
 	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
 		goto xdr_error;
@@ -3032,6 +3049,10 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
 		goto xdr_error;
 	if ((status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime)) != 0)
 		goto xdr_error;
+	if ((status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid)) != 0)
+		goto xdr_error;
+	if (fattr->fileid == 0 && fileid != 0)
+		fattr->fileid = fileid;
 	if ((status = verify_attr_len(xdr, savep, attrlen)) == 0)
 		fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4;
 xdr_error:
-- 
cgit v1.2.3


From 361e624f6d8bfbeac53769603d995d47535cfd46 Mon Sep 17 00:00:00 2001
From: Manoj Naik <manoj@almaden.ibm.com>
Date: Fri, 9 Jun 2006 09:34:24 -0400
Subject: NFSv4: GETATTR attributes on referral

Per referral draft, only fs_locations, fsid, and mounted_on_fileid can be
requested in a GETATTR on referrals.

Signed-off-by: Manoj Naik <manoj@almaden.ibm.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 043223a0eda..8640607d6a0 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3575,8 +3575,8 @@ int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
 {
 	struct nfs_server *server = NFS_SERVER(dir);
 	u32 bitmask[2] = {
-		[0] = server->attr_bitmask[0] | FATTR4_WORD0_FS_LOCATIONS,
-		[1] = server->attr_bitmask[1],
+		[0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
+		[1] = FATTR4_WORD1_MOUNTED_ON_FILEID,
 	};
 	struct nfs4_fs_locations_arg args = {
 		.dir_fh = NFS_FH(dir),
-- 
cgit v1.2.3


From 830b8e33fe1900b87c8eb7ec5c646117a9f298d6 Mon Sep 17 00:00:00 2001
From: Manoj Naik <manoj@almaden.ibm.com>
Date: Fri, 9 Jun 2006 09:34:25 -0400
Subject: NFSv4: Define an fs_locations bitmap

This is (similar to getattr bitmap) but includes fs_locations and
mounted_on_fileid attributes. Use this bitmap for encoding in fs_locations
requests.
Note: We can probably do better by requesting locations as part of fsinfo
itself.

Signed-off-by: Manoj Naik <manoj@almaden.ibm.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4_fs.h  |  1 +
 fs/nfs/nfs4proc.c | 20 ++++++++++++++++++++
 fs/nfs/nfs4xdr.c  | 11 +++++++++--
 3 files changed, 30 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 22a5f838ea5..9a102860df3 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -228,6 +228,7 @@ extern const u32 nfs4_fattr_bitmap[2];
 extern const u32 nfs4_statfs_bitmap[2];
 extern const u32 nfs4_pathconf_bitmap[2];
 extern const u32 nfs4_fsinfo_bitmap[2];
+extern const u32 nfs4_fs_locations_bitmap[2];
 
 /* nfs4renewd.c */
 extern void nfs4_schedule_state_renewal(struct nfs4_client *);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8640607d6a0..90ee21a07b3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -121,6 +121,25 @@ const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE
 			0
 };
 
+const u32 nfs4_fs_locations_bitmap[2] = {
+	FATTR4_WORD0_TYPE
+	| FATTR4_WORD0_CHANGE
+	| FATTR4_WORD0_SIZE
+	| FATTR4_WORD0_FSID
+	| FATTR4_WORD0_FILEID
+	| FATTR4_WORD0_FS_LOCATIONS,
+	FATTR4_WORD1_MODE
+	| FATTR4_WORD1_NUMLINKS
+	| FATTR4_WORD1_OWNER
+	| FATTR4_WORD1_OWNER_GROUP
+	| FATTR4_WORD1_RAWDEV
+	| FATTR4_WORD1_SPACE_USED
+	| FATTR4_WORD1_TIME_ACCESS
+	| FATTR4_WORD1_TIME_METADATA
+	| FATTR4_WORD1_TIME_MODIFY
+	| FATTR4_WORD1_MOUNTED_ON_FILEID
+};
+
 static void nfs4_setup_readdir(u64 cookie, u32 *verifier, struct dentry *dentry,
 		struct nfs4_readdir_arg *readdir)
 {
@@ -3594,6 +3613,7 @@ int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
 	dprintk("%s: start\n", __FUNCTION__);
 	fs_locations->fattr.valid = 0;
 	fs_locations->server = server;
+	fs_locations->nlocations = 0;
 	status = rpc_call_sync(server->client, &msg, 0);
 	dprintk("%s: returned status = %d\n", __FUNCTION__, status);
 	return status;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4b6613f6134..646f16da072 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -731,6 +731,13 @@ static int encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask)
 			bitmask[1] & nfs4_fsinfo_bitmap[1]);
 }
 
+static int encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask)
+{
+	return encode_getattr_two(xdr,
+				  bitmask[0] & nfs4_fs_locations_bitmap[0],
+				  bitmask[1] & nfs4_fs_locations_bitmap[1]);
+}
+
 static int encode_getfh(struct xdr_stream *xdr)
 {
 	uint32_t *p;
@@ -2030,10 +2037,10 @@ static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, uint32_t *p, struct n
 		goto out;
 	if ((status = encode_lookup(&xdr, args->name)) != 0)
 		goto out;
-	if ((status = encode_getfattr(&xdr, args->bitmask)) != 0)
+	if ((status = encode_fs_locations(&xdr, args->bitmask)) != 0)
 		goto out;
 	/* set up reply
-	 *   toplevel_status + taglen + rescount + OP_PUTFH + status
+	 *   toplevel_status + OP_PUTFH + status
 	 *   + OP_LOOKUP + status + OP_GETATTR + status = 7
 	 */
 	replen = (RPC_REPHDRSIZE + auth->au_rslack + 7) << 2;
-- 
cgit v1.2.3


From c818ba43f9ca2e8214412ab5f126b1f436c35098 Mon Sep 17 00:00:00 2001
From: Manoj Naik <manoj@almaden.ibm.com>
Date: Fri, 9 Jun 2006 09:34:26 -0400
Subject: NFSv4: Create NFSv4 transport and client

Move existing code into a separate function so that it can be also used by
referral code.

Signed-off-by: Manoj Naik <manoj@almaden.ibm.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 131 ++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 73 insertions(+), 58 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 3eea556d8f5..db62a5a7e4f 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -2027,62 +2027,24 @@ static void nfs4_clear_inode(struct inode *inode)
 }
 
 
-static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data, int silent)
+static struct rpc_clnt *nfs4_create_client(struct nfs_server *server,
+	struct rpc_timeout *timeparms, int proto, rpc_authflavor_t flavor)
 {
-	struct nfs_server *server;
-	struct nfs4_client *clp = NULL;
+	struct nfs4_client *clp;
 	struct rpc_xprt *xprt = NULL;
 	struct rpc_clnt *clnt = NULL;
-	struct rpc_timeout timeparms;
-	rpc_authflavor_t authflavour;
 	int err = -EIO;
 
-	sb->s_blocksize_bits = 0;
-	sb->s_blocksize = 0;
-	server = NFS_SB(sb);
-	if (data->rsize != 0)
-		server->rsize = nfs_block_size(data->rsize, NULL);
-	if (data->wsize != 0)
-		server->wsize = nfs_block_size(data->wsize, NULL);
-	server->flags = data->flags & NFS_MOUNT_FLAGMASK;
-	server->caps = NFS_CAP_ATOMIC_OPEN;
-
-	server->acregmin = data->acregmin*HZ;
-	server->acregmax = data->acregmax*HZ;
-	server->acdirmin = data->acdirmin*HZ;
-	server->acdirmax = data->acdirmax*HZ;
-
-	server->rpc_ops = &nfs_v4_clientops;
-
-	nfs_init_timeout_values(&timeparms, data->proto, data->timeo, data->retrans);
-
-	server->retrans_timeo = timeparms.to_initval;
-	server->retrans_count = timeparms.to_retries;
-
 	clp = nfs4_get_client(&server->addr.sin_addr);
 	if (!clp) {
 		dprintk("%s: failed to create NFS4 client.\n", __FUNCTION__);
-		return -EIO;
+		return ERR_PTR(err);
 	}
 
 	/* Now create transport and client */
-	authflavour = RPC_AUTH_UNIX;
-	if (data->auth_flavourlen != 0) {
-		if (data->auth_flavourlen != 1) {
-			dprintk("%s: Invalid number of RPC auth flavours %d.\n",
-					__FUNCTION__, data->auth_flavourlen);
-			err = -EINVAL;
-			goto out_fail;
-		}
-		if (copy_from_user(&authflavour, data->auth_flavours, sizeof(authflavour))) {
-			err = -EFAULT;
-			goto out_fail;
-		}
-	}
-
 	down_write(&clp->cl_sem);
 	if (IS_ERR(clp->cl_rpcclient)) {
-		xprt = xprt_create_proto(data->proto, &server->addr, &timeparms);
+		xprt = xprt_create_proto(proto, &server->addr, timeparms);
 		if (IS_ERR(xprt)) {
 			up_write(&clp->cl_sem);
 			err = PTR_ERR(xprt);
@@ -2091,7 +2053,7 @@ static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data,
 			goto out_fail;
 		}
 		clnt = rpc_create_client(xprt, server->hostname, &nfs_program,
-				server->rpc_ops->version, authflavour);
+				server->rpc_ops->version, flavor);
 		if (IS_ERR(clnt)) {
 			up_write(&clp->cl_sem);
 			err = PTR_ERR(clnt);
@@ -2108,43 +2070,96 @@ static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data,
 	list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
 	clnt = rpc_clone_client(clp->cl_rpcclient);
 	if (!IS_ERR(clnt))
-			server->nfs4_state = clp;
+		server->nfs4_state = clp;
 	up_write(&clp->cl_sem);
 	clp = NULL;
 
 	if (IS_ERR(clnt)) {
-		err = PTR_ERR(clnt);
 		dprintk("%s: cannot create RPC client. Error = %d\n",
 				__FUNCTION__, err);
-		return err;
+		return clnt;
 	}
 
-	server->client    = clnt;
-
 	if (server->nfs4_state->cl_idmap == NULL) {
 		dprintk("%s: failed to create idmapper.\n", __FUNCTION__);
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 	}
 
-	if (clnt->cl_auth->au_flavor != authflavour) {
+	if (clnt->cl_auth->au_flavor != flavor) {
 		struct rpc_auth *auth;
 
-		auth = rpcauth_create(authflavour, clnt);
+		auth = rpcauth_create(flavor, clnt);
 		if (IS_ERR(auth)) {
 			dprintk("%s: couldn't create credcache!\n", __FUNCTION__);
-			return PTR_ERR(auth);
+			return (struct rpc_clnt *)auth;
+		}
+	}
+	return clnt;
+
+ out_fail:
+	if (clp)
+		nfs4_put_client(clp);
+	return ERR_PTR(err);
+}
+
+static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data, int silent)
+{
+	struct nfs_server *server;
+	struct rpc_timeout timeparms;
+	rpc_authflavor_t authflavour;
+	int err = -EIO;
+
+	sb->s_blocksize_bits = 0;
+	sb->s_blocksize = 0;
+	server = NFS_SB(sb);
+	if (data->rsize != 0)
+		server->rsize = nfs_block_size(data->rsize, NULL);
+	if (data->wsize != 0)
+		server->wsize = nfs_block_size(data->wsize, NULL);
+	server->flags = data->flags & NFS_MOUNT_FLAGMASK;
+	server->caps = NFS_CAP_ATOMIC_OPEN;
+
+	server->acregmin = data->acregmin*HZ;
+	server->acregmax = data->acregmax*HZ;
+	server->acdirmin = data->acdirmin*HZ;
+	server->acdirmax = data->acdirmax*HZ;
+
+	server->rpc_ops = &nfs_v4_clientops;
+
+	nfs_init_timeout_values(&timeparms, data->proto, data->timeo, data->retrans);
+
+	server->retrans_timeo = timeparms.to_initval;
+	server->retrans_count = timeparms.to_retries;
+
+	/* Now create transport and client */
+	authflavour = RPC_AUTH_UNIX;
+	if (data->auth_flavourlen != 0) {
+		if (data->auth_flavourlen != 1) {
+			dprintk("%s: Invalid number of RPC auth flavours %d.\n",
+					__FUNCTION__, data->auth_flavourlen);
+			err = -EINVAL;
+			goto out_fail;
+		}
+		if (copy_from_user(&authflavour, data->auth_flavours, sizeof(authflavour))) {
+			err = -EFAULT;
+			goto out_fail;
 		}
 	}
 
+	server->client = nfs4_create_client(server, &timeparms, data->proto, authflavour);
+	if (IS_ERR(server->client)) {
+		err = PTR_ERR(server->client);
+			dprintk("%s: cannot create RPC client. Error = %d\n",
+					__FUNCTION__, err);
+			goto out_fail;
+	}
+
 	sb->s_time_gran = 1;
 
 	sb->s_op = &nfs4_sops;
 	err = nfs_sb_init(sb, authflavour);
-	if (err == 0)
-		return 0;
-out_fail:
-	if (clp)
-		nfs4_put_client(clp);
+
+ out_fail:
 	return err;
 }
 
-- 
cgit v1.2.3


From 61f5164cab1f6fdf06871ea9d60fe2f912184078 Mon Sep 17 00:00:00 2001
From: Manoj Naik <manoj@almaden.ibm.com>
Date: Fri, 9 Jun 2006 09:34:27 -0400
Subject: NFS: Expand clone mounts to include other servers

Signed-off-by: Manoj Naik <manoj@almaden.ibm.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 111 +++++++++++++++++++++++++++++++++------------------------
 1 file changed, 64 insertions(+), 47 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index db62a5a7e4f..ebdab885c47 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1717,14 +1717,13 @@ struct nfs_clone_mount {
 };
 
 static struct super_block *nfs_clone_generic_sb(struct nfs_clone_mount *data,
-		struct super_block *(*clone_client)(struct nfs_server *, struct nfs_clone_mount *))
+		struct super_block *(*fill_sb)(struct nfs_server *, struct nfs_clone_mount *),
+		struct nfs_server *(*fill_server)(struct super_block *, struct nfs_clone_mount *))
 {
 	struct nfs_server *server;
 	struct nfs_server *parent = NFS_SB(data->sb);
 	struct super_block *sb = ERR_PTR(-EINVAL);
 	void *err = ERR_PTR(-ENOMEM);
-	struct inode *root_inode;
-	struct nfs_fsinfo fsinfo;
 	int len;
 
 	server = kmalloc(sizeof(struct nfs_server), GFP_KERNEL);
@@ -1736,53 +1735,17 @@ static struct super_block *nfs_clone_generic_sb(struct nfs_clone_mount *data,
 	if (server->hostname == NULL)
 		goto free_server;
 	memcpy(server->hostname, parent->hostname, len);
-	server->fsid = data->fattr->fsid;
-	nfs_copy_fh(&server->fh, data->fh);
 	if (rpciod_up() != 0)
 		goto free_hostname;
 
-	sb = clone_client(server, data);
+	sb = fill_sb(server, data);
 	if (IS_ERR((err = sb)) || sb->s_root)
 		goto kill_rpciod;
 
-	sb->s_op = data->sb->s_op;
-	sb->s_blocksize = data->sb->s_blocksize;
-	sb->s_blocksize_bits = data->sb->s_blocksize_bits;
-	sb->s_maxbytes = data->sb->s_maxbytes;
-
-	server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
-	err = ERR_PTR(-ENOMEM);
-	server->io_stats = nfs_alloc_iostats();
-	if (server->io_stats == NULL)
+	server = fill_server(sb, data);
+	if (IS_ERR((err = server)))
 		goto out_deactivate;
-
-	server->client = rpc_clone_client(parent->client);
-	if (IS_ERR((err = server->client)))
-		goto out_deactivate;
-	if (!IS_ERR(parent->client_sys)) {
-		server->client_sys = rpc_clone_client(parent->client_sys);
-		if (IS_ERR((err = server->client_sys)))
-			goto out_deactivate;
-	}
-	if (!IS_ERR(parent->client_acl)) {
-		server->client_acl = rpc_clone_client(parent->client_acl);
-		if (IS_ERR((err = server->client_acl)))
-			goto out_deactivate;
-	}
-	root_inode = nfs_fhget(sb, data->fh, data->fattr);
-	if (!root_inode)
-		goto out_deactivate;
-	sb->s_root = d_alloc_root(root_inode);
-	if (!sb->s_root)
-		goto out_put_root;
-	fsinfo.fattr = data->fattr;
-	if (NFS_PROTO(root_inode)->fsinfo(server, data->fh, &fsinfo) == 0)
-		nfs_super_set_maxbytes(sb, fsinfo.maxfilesize);
-	sb->s_root->d_op = server->rpc_ops->dentry_ops;
-	sb->s_flags |= MS_ACTIVE;
 	return sb;
-out_put_root:
-	iput(root_inode);
 out_deactivate:
 	up_write(&sb->s_umount);
 	deactivate_super(sb);
@@ -1955,21 +1918,73 @@ static struct file_system_type nfs_fs_type = {
 	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
 
-static struct super_block *nfs_clone_client(struct nfs_server *server, struct nfs_clone_mount *data)
+static struct super_block *nfs_clone_sb(struct nfs_server *server, struct nfs_clone_mount *data)
 {
 	struct super_block *sb;
 
+	server->fsid = data->fattr->fsid;
+	nfs_copy_fh(&server->fh, data->fh);
 	sb = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server);
 	if (!IS_ERR(sb) && sb->s_root == NULL && !(server->flags & NFS_MOUNT_NONLM))
 		lockd_up();
 	return sb;
 }
 
+static struct nfs_server *nfs_clone_server(struct super_block *sb, struct nfs_clone_mount *data)
+{
+	struct nfs_server *server = NFS_SB(sb);
+	struct nfs_server *parent = NFS_SB(data->sb);
+	struct inode *root_inode;
+	struct nfs_fsinfo fsinfo;
+	void *err = ERR_PTR(-ENOMEM);
+
+	sb->s_op = data->sb->s_op;
+	sb->s_blocksize = data->sb->s_blocksize;
+	sb->s_blocksize_bits = data->sb->s_blocksize_bits;
+	sb->s_maxbytes = data->sb->s_maxbytes;
+
+	server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
+	server->io_stats = nfs_alloc_iostats();
+	if (server->io_stats == NULL)
+		goto out;
+
+	server->client = rpc_clone_client(parent->client);
+	if (IS_ERR((err = server->client)))
+		goto out;
+
+	if (!IS_ERR(parent->client_sys)) {
+		server->client_sys = rpc_clone_client(parent->client_sys);
+		if (IS_ERR((err = server->client_sys)))
+			goto out;
+	}
+	if (!IS_ERR(parent->client_acl)) {
+		server->client_acl = rpc_clone_client(parent->client_acl);
+		if (IS_ERR((err = server->client_acl)))
+			goto out;
+	}
+	root_inode = nfs_fhget(sb, data->fh, data->fattr);
+	if (!root_inode)
+		goto out;
+	sb->s_root = d_alloc_root(root_inode);
+	if (!sb->s_root)
+		goto out_put_root;
+	fsinfo.fattr = data->fattr;
+	if (NFS_PROTO(root_inode)->fsinfo(server, data->fh, &fsinfo) == 0)
+		nfs_super_set_maxbytes(sb, fsinfo.maxfilesize);
+	sb->s_root->d_op = server->rpc_ops->dentry_ops;
+	sb->s_flags |= MS_ACTIVE;
+	return server;
+out_put_root:
+	iput(root_inode);
+out:
+	return err;
+}
+
 static struct super_block *nfs_clone_nfs_sb(struct file_system_type *fs_type,
 		int flags, const char *dev_name, void *raw_data)
 {
 	struct nfs_clone_mount *data = raw_data;
-	return nfs_clone_generic_sb(data, nfs_clone_client);
+	return nfs_clone_generic_sb(data, nfs_clone_sb, nfs_clone_server);
 }
 
 static struct file_system_type clone_nfs_fs_type = {
@@ -2371,12 +2386,14 @@ static inline char *nfs4_dup_path(const struct dentry *dentry)
 	return path;
 }
 
-static struct super_block *nfs4_clone_client(struct nfs_server *server, struct nfs_clone_mount *data)
+static struct super_block *nfs4_clone_sb(struct nfs_server *server, struct nfs_clone_mount *data)
 {
 	const struct dentry *dentry = data->dentry;
 	struct nfs4_client *clp = server->nfs4_state;
 	struct super_block *sb;
 
+	server->fsid = data->fattr->fsid;
+	nfs_copy_fh(&server->fh, data->fh);
 	server->mnt_path = nfs4_dup_path(dentry);
 	if (IS_ERR(server->mnt_path)) {
 		sb = (struct super_block *)server->mnt_path;
@@ -2403,12 +2420,12 @@ static struct super_block *nfs_clone_nfs4_sb(struct file_system_type *fs_type,
 		int flags, const char *dev_name, void *raw_data)
 {
 	struct nfs_clone_mount *data = raw_data;
-	return nfs_clone_generic_sb(data, nfs4_clone_client);
+	return nfs_clone_generic_sb(data, nfs4_clone_sb, nfs_clone_server);
 }
 
 static struct file_system_type clone_nfs4_fs_type = {
 	.owner		= THIS_MODULE,
-	.name		= "nfs",
+	.name		= "nfs4",
 	.get_sb		= nfs_clone_nfs4_sb,
 	.kill_sb	= nfs4_kill_super,
 	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
-- 
cgit v1.2.3


From 9cdb3883c38f883436a84c2353a4cf964ff890a2 Mon Sep 17 00:00:00 2001
From: Manoj Naik <manoj@almaden.ibm.com>
Date: Fri, 9 Jun 2006 09:34:28 -0400
Subject: NFSv4: Ensure client submounts when following a referral

Set up mountpoint when hitting a referral on moved error by getting
fs_locations.

Signed-off-by: Manoj Naik <manoj@almaden.ibm.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 270 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 267 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index ebdab885c47..0d8302e59d6 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -36,6 +36,8 @@
 #include <linux/mount.h>
 #include <linux/nfs_idmap.h>
 #include <linux/vfs.h>
+#include <linux/inet.h>
+#include <linux/nfs_xdr.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -1714,6 +1716,10 @@ struct nfs_clone_mount {
 	const struct dentry *dentry;
 	struct nfs_fh *fh;
 	struct nfs_fattr *fattr;
+	char *hostname;
+	char *mnt_path;
+	struct sockaddr_in *addr;
+	rpc_authflavor_t authflavor;
 };
 
 static struct super_block *nfs_clone_generic_sb(struct nfs_clone_mount *data,
@@ -1724,17 +1730,19 @@ static struct super_block *nfs_clone_generic_sb(struct nfs_clone_mount *data,
 	struct nfs_server *parent = NFS_SB(data->sb);
 	struct super_block *sb = ERR_PTR(-EINVAL);
 	void *err = ERR_PTR(-ENOMEM);
+	char *hostname;
 	int len;
 
 	server = kmalloc(sizeof(struct nfs_server), GFP_KERNEL);
 	if (server == NULL)
 		goto out_err;
 	memcpy(server, parent, sizeof(*server));
-	len = strlen(parent->hostname) + 1;
+	hostname = (data->hostname != NULL) ? data->hostname : parent->hostname;
+	len = strlen(hostname) + 1;
 	server->hostname = kmalloc(len, GFP_KERNEL);
 	if (server->hostname == NULL)
 		goto free_server;
-	memcpy(server->hostname, parent->hostname, len);
+	memcpy(server->hostname, hostname, len);
 	if (rpciod_up() != 0)
 		goto free_hostname;
 
@@ -2458,7 +2466,8 @@ static inline void unregister_nfs4fs(void)
 	nfs_unregister_sysctl();
 }
 #else
-#define nfs4_clone_client(a,b) ERR_PTR(-EINVAL)
+#define nfs4_fill_sb(a,b)	ERR_PTR(-EINVAL)
+#define nfs4_fill_super(a,b)	ERR_PTR(-EINVAL)
 #define nfs4_init_once(nfsi) \
 	do { } while (0)
 #define register_nfs4fs() (0)
@@ -2521,6 +2530,261 @@ out:
 	return mnt;
 }
 
+/* Check if fs_root is valid */
+static inline char *nfs4_pathname_string(struct nfs4_pathname *pathname, char *buffer, ssize_t buflen)
+{
+	char *end = buffer + buflen;
+	int n;
+
+	*--end = '\0';
+	buflen--;
+
+	n = pathname->ncomponents;
+	while (--n >= 0) {
+		struct nfs4_string *component = &pathname->components[n];
+		buflen -= component->len + 1;
+		if (buflen < 0)
+			goto Elong;
+		end -= component->len;
+		memcpy(end, component->data, component->len);
+		*--end = '/';
+	}
+	return end;
+Elong:
+	return ERR_PTR(-ENAMETOOLONG);
+}
+
+/* Check if the string represents a "valid" IPv4 address */
+static inline int valid_ipaddr4(const char *buf)
+{
+	int rc, count, in[4];
+
+	rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]);
+	if (rc != 4)
+		return -EINVAL;
+	for (count = 0; count < 4; count++) {
+		if (in[count] > 255)
+			return -EINVAL;
+	}
+	return 0;
+}
+
+static struct super_block *nfs4_referral_sb(struct nfs_server *server, struct nfs_clone_mount *data)
+{
+	struct super_block *sb = ERR_PTR(-ENOMEM);
+	int len;
+
+	len = strlen(data->mnt_path) + 1;
+	server->mnt_path = kmalloc(len, GFP_KERNEL);
+	if (server->mnt_path == NULL)
+		goto err;
+	memcpy(server->mnt_path, data->mnt_path, len);
+	memcpy(&server->addr, data->addr, sizeof(struct sockaddr_in));
+
+	sb = sget(&nfs4_fs_type, nfs4_compare_super, nfs_set_super, server);
+	if (IS_ERR(sb) || sb->s_root)
+		goto free_path;
+	return sb;
+free_path:
+	kfree(server->mnt_path);
+err:
+	server->mnt_path = NULL;
+	return sb;
+}
+
+static struct nfs_server *nfs4_referral_server(struct super_block *sb, struct nfs_clone_mount *data)
+{
+	struct nfs_server *server = NFS_SB(sb);
+	struct rpc_timeout timeparms;
+	int proto, timeo, retrans;
+	void *err;
+
+	proto = IPPROTO_TCP;
+	/* Since we are following a referral and there may be alternatives,
+	   set the timeouts and retries to low values */
+	timeo = 2;
+	retrans = 1;
+	nfs_init_timeout_values(&timeparms, proto, timeo, retrans);
+
+	server->client = nfs4_create_client(server, &timeparms, proto, data->authflavor);
+	if (IS_ERR((err = server->client)))
+		goto out_err;
+
+	sb->s_time_gran = 1;
+	sb->s_op = &nfs4_sops;
+	err = ERR_PTR(nfs_sb_init(sb, data->authflavor));
+	if (!IS_ERR(err))
+		return server;
+out_err:
+	return (struct nfs_server *)err;
+}
+
+static struct super_block *nfs_referral_nfs4_sb(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *raw_data)
+{
+	struct nfs_clone_mount *data = raw_data;
+	return nfs_clone_generic_sb(data, nfs4_referral_sb, nfs4_referral_server);
+}
+
+static struct file_system_type nfs_referral_nfs4_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "nfs4",
+	.get_sb		= nfs_referral_nfs4_sb,
+	.kill_sb	= nfs4_kill_super,
+	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+};
+
+/**
+ * nfs_follow_referral - set up mountpoint when hitting a referral on moved error
+ * @mnt_parent - mountpoint of parent directory
+ * @dentry - parent directory
+ * @fspath - fs path returned in fs_locations
+ * @mntpath - mount path to new server
+ * @hostname - hostname of new server
+ * @addr - host addr of new server
+ *
+ */
+struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
+	     const struct dentry *dentry, struct nfs4_fs_locations *locations)
+{
+	struct vfsmount *mnt = ERR_PTR(-ENOENT);
+	struct nfs_clone_mount mountdata = {
+		.sb = mnt_parent->mnt_sb,
+		.dentry = dentry,
+		.authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor,
+	};
+	char *page, *page2;
+	char *path, *fs_path;
+	char *devname;
+	int loc, s;
+
+	if (locations == NULL || locations->nlocations <= 0)
+		goto out;
+
+	dprintk("%s: referral at %s/%s\n", __FUNCTION__,
+		dentry->d_parent->d_name.name, dentry->d_name.name);
+
+	/* Ensure fs path is a prefix of current dentry path */
+	page = (char *) __get_free_page(GFP_USER);
+	if (page == NULL)
+		goto out;
+	page2 = (char *) __get_free_page(GFP_USER);
+	if (page2 == NULL)
+		goto out;
+
+	path = nfs4_path(dentry, page, PAGE_SIZE);
+	if (IS_ERR(path))
+		goto out_free;
+
+	fs_path = nfs4_pathname_string(&locations->fs_path, page2, PAGE_SIZE);
+	if (IS_ERR(fs_path))
+		goto out_free;
+
+	if (strncmp(path, fs_path, strlen(fs_path)) != 0) {
+		dprintk("%s: path %s does not begin with fsroot %s\n", __FUNCTION__, path, fs_path);
+		goto out_free;
+	}
+
+	devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE);
+	if (IS_ERR(devname)) {
+		mnt = (struct vfsmount *)devname;
+		goto out_free;
+	}
+
+	loc = 0;
+	while (loc < locations->nlocations && IS_ERR(mnt)) {
+		struct nfs4_fs_location *location = &locations->locations[loc];
+		char *mnt_path;
+
+		if (location == NULL || location->nservers <= 0 ||
+		    location->rootpath.ncomponents == 0) {
+			loc++;
+			continue;
+		}
+
+		mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
+		if (IS_ERR(mnt_path)) {
+			loc++;
+			continue;
+		}
+		mountdata.mnt_path = mnt_path;
+
+		s = 0;
+		while (s < location->nservers) {
+			struct sockaddr_in addr = {};
+
+			if (location->servers[s].len <= 0 ||
+			    valid_ipaddr4(location->servers[s].data) < 0) {
+				s++;
+				continue;
+			}
+
+			mountdata.hostname = location->servers[s].data;
+			addr.sin_addr.s_addr = in_aton(mountdata.hostname);
+			addr.sin_family = AF_INET;
+			addr.sin_port = htons(NFS_PORT);
+			mountdata.addr = &addr;
+
+			mnt = vfs_kern_mount(&nfs_referral_nfs4_fs_type, 0, devname, &mountdata);
+			if (!IS_ERR(mnt)) {
+				break;
+			}
+			s++;
+		}
+		loc++;
+	}
+
+out_free:
+	free_page((unsigned long)page);
+	free_page((unsigned long)page2);
+out:
+	dprintk("%s: done\n", __FUNCTION__);
+	return mnt;
+}
+
+/*
+ * nfs_do_refmount - handle crossing a referral on server
+ * @dentry - dentry of referral
+ * @nd - nameidata info
+ *
+ */
+struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry)
+{
+	struct vfsmount *mnt = ERR_PTR(-ENOENT);
+	struct dentry *parent;
+	struct nfs4_fs_locations *fs_locations = NULL;
+	struct page *page;
+	int err;
+
+	/* BUG_ON(IS_ROOT(dentry)); */
+	dprintk("%s: enter\n", __FUNCTION__);
+
+	page = alloc_page(GFP_KERNEL);
+	if (page == NULL)
+		goto out;
+
+	fs_locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
+	if (fs_locations == NULL)
+		goto out_free;
+
+	/* Get locations */
+	parent = dget_parent(dentry);
+	dprintk("%s: getting locations for %s/%s\n", __FUNCTION__, parent->d_name.name, dentry->d_name.name);
+	err = nfs4_proc_fs_locations(parent->d_inode, dentry, fs_locations, page);
+	dput(parent);
+	if (err != 0 || fs_locations->nlocations <= 0 ||
+	    fs_locations->fs_path.ncomponents <= 0)
+		goto out_free;
+
+	mnt = nfs_follow_referral(mnt_parent, dentry, fs_locations);
+out_free:
+	__free_page(page);
+	kfree(fs_locations);
+out:
+	dprintk("%s: done\n", __FUNCTION__);
+	return mnt;
+}
+
 extern int nfs_init_nfspagecache(void);
 extern void nfs_destroy_nfspagecache(void);
 extern int nfs_init_readpagecache(void);
-- 
cgit v1.2.3


From 6b97fd3da1eab2cc490cfe884c7d4956522eaf8b Mon Sep 17 00:00:00 2001
From: Manoj Naik <manoj@almaden.ibm.com>
Date: Fri, 9 Jun 2006 09:34:29 -0400
Subject: NFSv4: Follow a referral

Respond to a moved error on NFS lookup by setting up the referral.
Note: We don't actually follow the referral during lookup/getattr, but
later when we detect fsid mismatch in inode revalidation (similar to the
processing done for cloning submounts). Referrals will have fake attributes
until they are actually followed or traversed.

Signed-off-by: Manoj Naik <manoj@almaden.ibm.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c     |  5 ++++-
 fs/nfs/namespace.c |  9 ++++++++-
 fs/nfs/nfs4proc.c  | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 58 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 0d8302e59d6..ee13cb01b56 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -888,7 +888,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 				set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode));
 			/* Deal with crossing mountpoints */
 			if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
-				inode->i_op = &nfs_mountpoint_inode_operations;
+				if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
+					inode->i_op = &nfs_referral_inode_operations;
+				else
+					inode->i_op = &nfs_mountpoint_inode_operations;
 				inode->i_fop = NULL;
 			}
 		} else if (S_ISLNK(inode->i_mode))
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index e426516c111..8ca44b7b25c 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -58,7 +58,10 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 	if (err != 0)
 		goto out_err;
 
-	mnt = nfs_do_submount(nd->mnt, nd->dentry, &fh, &fattr);
+	if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL)
+		mnt = nfs_do_refmount(nd->mnt, nd->dentry);
+	else
+		mnt = nfs_do_submount(nd->mnt, nd->dentry, &fh, &fattr);
 	err = PTR_ERR(mnt);
 	if (IS_ERR(mnt))
 		goto out_err;
@@ -94,6 +97,10 @@ struct inode_operations nfs_mountpoint_inode_operations = {
 	.getattr	= nfs_getattr,
 };
 
+struct inode_operations nfs_referral_inode_operations = {
+	.follow_link	= nfs_follow_mountpoint,
+};
+
 static void nfs_expire_automounts(void *data)
 {
 	struct list_head *list = (struct list_head *)data;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 90ee21a07b3..3300e35d74a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1462,6 +1462,50 @@ out:
 	return nfs4_map_errors(status);
 }
 
+/*
+ * Get locations and (maybe) other attributes of a referral.
+ * Note that we'll actually follow the referral later when
+ * we detect fsid mismatch in inode revalidation
+ */
+static int nfs4_get_referral(struct inode *dir, struct qstr *name, struct nfs_fattr *fattr, struct nfs_fh *fhandle)
+{
+	int status = -ENOMEM;
+	struct page *page = NULL;
+	struct nfs4_fs_locations *locations = NULL;
+	struct dentry dentry = {};
+
+	page = alloc_page(GFP_KERNEL);
+	if (page == NULL)
+		goto out;
+	locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
+	if (locations == NULL)
+		goto out;
+
+	dentry.d_name.name = name->name;
+	dentry.d_name.len = name->len;
+	status = nfs4_proc_fs_locations(dir, &dentry, locations, page);
+	if (status != 0)
+		goto out;
+	/* Make sure server returned a different fsid for the referral */
+	if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) {
+		dprintk("%s: server did not return a different fsid for a referral at %s\n", __FUNCTION__, name->name);
+		status = -EIO;
+		goto out;
+	}
+
+	memcpy(fattr, &locations->fattr, sizeof(struct nfs_fattr));
+	fattr->valid |= NFS_ATTR_FATTR_V4_REFERRAL;
+	if (!fattr->mode)
+		fattr->mode = S_IFDIR;
+	memset(fhandle, 0, sizeof(struct nfs_fh));
+out:
+	if (page)
+		__free_page(page);
+	if (locations)
+		kfree(locations);
+	return status;
+}
+
 static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
 {
 	struct nfs4_getattr_arg args = {
@@ -1566,6 +1610,8 @@ static int _nfs4_proc_lookup(struct inode *dir, struct qstr *name,
 	
 	dprintk("NFS call  lookup %s\n", name->name);
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+	if (status == -NFS4ERR_MOVED)
+		status = nfs4_get_referral(dir, name, fattr, fhandle);
 	dprintk("NFS reply lookup: %d\n", status);
 	return status;
 }
-- 
cgit v1.2.3


From 33a43f2802d8d7be3a9b541785c4ca9ad79e4310 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@citi.umich.edu>
Date: Fri, 9 Jun 2006 09:34:30 -0400
Subject: NFSv4: A root pathname is sent as a zero component4

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 646f16da072..1750d996f49 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2408,8 +2408,10 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
 
 	READ_BUF(4);
 	READ32(n);
-	if (n <= 0)
+	if (n < 0)
 		goto out_eio;
+	if (n == 0)
+		goto root_path;
 	dprintk("path ");
 	path->ncomponents = 0;
 	while (path->ncomponents < n) {
@@ -2430,6 +2432,13 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
 out:
 	dprintk("\n");
 	return status;
+root_path:
+/* a root pathname is sent as a zero component4 */
+	path->ncomponents = 1;
+	path->components[0].len=0;
+	path->components[0].data=NULL;
+	dprintk("path /\n");
+	goto out;
 out_eio:
 	dprintk(" status %d", status);
 	status = -EIO;
-- 
cgit v1.2.3


From 87e4ba1a62af8e05ee3e8f8aaca622714386ffb0 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:34:30 -0400
Subject: NFSv4: Ensure that referral mounts bind to a reserved port

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index ee13cb01b56..648f593de0f 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -2078,6 +2078,8 @@ static struct rpc_clnt *nfs4_create_client(struct nfs_server *server,
 					__FUNCTION__, err);
 			goto out_fail;
 		}
+		/* Bind to a reserved port! */
+		xprt->resvport = 1;
 		clnt = rpc_create_client(xprt, server->hostname, &nfs_program,
 				server->rpc_ops->version, flavor);
 		if (IS_ERR(clnt)) {
-- 
cgit v1.2.3


From 860de07139980afe9856cc31eb5efbf321bbcea4 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:34:31 -0400
Subject: NFS: Fix compile errors introduced by referrals patches

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 37 ++++++++++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 648f593de0f..550a84dd41a 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -2444,6 +2444,20 @@ static struct file_system_type clone_nfs4_fs_type = {
 	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
 };
 
+static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, char *devname, struct nfs_clone_mount *mountdata)
+{
+	struct vfsmount *mnt = NULL;
+	switch (server->rpc_ops->version) {
+		case 2:
+		case 3:
+			mnt = vfs_kern_mount(&clone_nfs_fs_type, 0, devname, &mountdata);
+			break;
+		case 4:
+			mnt = vfs_kern_mount(&clone_nfs4_fs_type, 0, devname, mountdata);
+	}
+	return mnt;
+}
+
 #define nfs4_init_once(nfsi) \
 	do { \
 		INIT_LIST_HEAD(&(nfsi)->open_states); \
@@ -2477,6 +2491,10 @@ static inline void unregister_nfs4fs(void)
 	do { } while (0)
 #define register_nfs4fs() (0)
 #define unregister_nfs4fs()
+static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, char *devname, struct nfs_clone_mount *mountdata)
+{
+	return vfs_kern_mount(&clone_nfs_fs_type, 0, devname, &mountdata);
+}
 #endif
 
 static inline char *nfs_devname(const struct vfsmount *mnt_parent,
@@ -2517,17 +2535,7 @@ struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
 	mnt = (struct vfsmount *)devname;
 	if (IS_ERR(devname))
 		goto free_page;
-	switch (NFS_SB(mnt_parent->mnt_sb)->rpc_ops->version) {
-		case 2:
-		case 3:
-			mnt = vfs_kern_mount(&clone_nfs_fs_type, 0, devname, &mountdata);
-			break;
-		case 4:
-			mnt = vfs_kern_mount(&clone_nfs4_fs_type, 0, devname, &mountdata);
-			break;
-		default:
-			BUG();
-	}
+	mnt = nfs_do_clone_mount(NFS_SB(mnt_parent->mnt_sb), devname, &mountdata);
 free_page:
 	free_page((unsigned long)page);
 out:
@@ -2535,6 +2543,7 @@ out:
 	return mnt;
 }
 
+#ifdef CONFIG_NFS_V4
 /* Check if fs_root is valid */
 static inline char *nfs4_pathname_string(struct nfs4_pathname *pathname, char *buffer, ssize_t buflen)
 {
@@ -2789,6 +2798,12 @@ out:
 	dprintk("%s: done\n", __FUNCTION__);
 	return mnt;
 }
+#else
+struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry)
+{
+	return ERR_PTR(-ENOENT);
+}
+#endif
 
 extern int nfs_init_nfspagecache(void);
 extern void nfs_destroy_nfspagecache(void);
-- 
cgit v1.2.3


From 4e5ccf60c5aa79d325c123f47d288a068166f389 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:34:32 -0400
Subject: NFS: Fix typo in nfs_do_clone_mount()

Doh!

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 550a84dd41a..7ab2b38a990 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -2450,7 +2450,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, char *devn
 	switch (server->rpc_ops->version) {
 		case 2:
 		case 3:
-			mnt = vfs_kern_mount(&clone_nfs_fs_type, 0, devname, &mountdata);
+			mnt = vfs_kern_mount(&clone_nfs_fs_type, 0, devname, mountdata);
 			break;
 		case 4:
 			mnt = vfs_kern_mount(&clone_nfs4_fs_type, 0, devname, mountdata);
@@ -2493,7 +2493,7 @@ static inline void unregister_nfs4fs(void)
 #define unregister_nfs4fs()
 static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, char *devname, struct nfs_clone_mount *mountdata)
 {
-	return vfs_kern_mount(&clone_nfs_fs_type, 0, devname, &mountdata);
+	return vfs_kern_mount(&clone_nfs_fs_type, 0, devname, mountdata);
 }
 #endif
 
-- 
cgit v1.2.3


From f7b422b17ee5ee4920e8ae24a6ad04bf3481ce72 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 9 Jun 2006 09:34:33 -0400
Subject: NFS: Split fs/nfs/inode.c

As fs/nfs/inode.c is rather large, heterogenous and unwieldy, the attached
patch splits it up into a number of files:

 (*) fs/nfs/inode.c

     Strictly inode specific functions.

 (*) fs/nfs/super.c

     Superblock management functions for NFS and NFS4, normal access, clones
     and referrals.  The NFS4 superblock functions _could_ move out into a
     separate conditionally compiled file, but it's probably not worth it as
     there're so many common bits.

 (*) fs/nfs/namespace.c

     Some namespace-specific functions have been moved here.

 (*) fs/nfs/nfs4namespace.c

     NFS4-specific namespace functions (this could be merged into the previous
     file).  This file is conditionally compiled.

 (*) fs/nfs/internal.h

     Inter-file declarations, plus a few simple utility functions moved from
     fs/nfs/inode.c.

     Additionally, all the in-.c-file externs have been moved here, and those
     files they were moved from now includes this file.

For the most part, the functions have not been changed, only some multiplexor
functions have changed significantly.

I've also:

 (*) Added some extra banner comments above some functions.

 (*) Rearranged the function order within the files to be more logical and
     better grouped (IMO), though someone may prefer a different order.

 (*) Reduced the number of #ifdefs in .c files.

 (*) Added missing __init and __exit directives.

Signed-Off-By: David Howells <dhowells@redhat.com>
---
 fs/nfs/Makefile        |    5 +-
 fs/nfs/callback.c      |    2 -
 fs/nfs/direct.c        |    6 +-
 fs/nfs/inode.c         | 1793 +-----------------------------------------------
 fs/nfs/internal.h      |  179 +++++
 fs/nfs/namespace.c     |  112 ++-
 fs/nfs/nfs2xdr.c       |    2 -
 fs/nfs/nfs3proc.c      |    5 +-
 fs/nfs/nfs3xdr.c       |    3 +-
 fs/nfs/nfs4namespace.c |  201 ++++++
 fs/nfs/nfs4proc.c      |    2 -
 fs/nfs/pagelist.c      |    4 +-
 fs/nfs/proc.c          |    5 +-
 fs/nfs/read.c          |    4 +-
 fs/nfs/super.c         | 1468 +++++++++++++++++++++++++++++++++++++++
 fs/nfs/write.c         |    4 +-
 16 files changed, 1993 insertions(+), 1802 deletions(-)
 create mode 100644 fs/nfs/internal.h
 create mode 100644 fs/nfs/nfs4namespace.c
 create mode 100644 fs/nfs/super.c

(limited to 'fs')

diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index d9d494cee38..0b572a0c196 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_NFS_FS) += nfs.o
 
-nfs-y 			:= dir.o file.o inode.o nfs2xdr.o pagelist.o \
+nfs-y 			:= dir.o file.o inode.o super.o nfs2xdr.o pagelist.o \
 			   proc.o read.o symlink.o unlink.o write.o \
 			   namespace.o
 nfs-$(CONFIG_ROOT_NFS)	+= nfsroot.o mount_clnt.o      
@@ -12,7 +12,8 @@ nfs-$(CONFIG_NFS_V3)	+= nfs3proc.o nfs3xdr.o
 nfs-$(CONFIG_NFS_V3_ACL)	+= nfs3acl.o
 nfs-$(CONFIG_NFS_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
 			   delegation.o idmap.o \
-			   callback.o callback_xdr.o callback_proc.o
+			   callback.o callback_xdr.o callback_proc.o \
+			   nfs4namespace.o
 nfs-$(CONFIG_NFS_DIRECTIO) += direct.o
 nfs-$(CONFIG_SYSCTL) += sysctl.o
 nfs-objs		:= $(nfs-y)
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 90c95adc8c1..d53f8c6a9ec 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -182,8 +182,6 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp)
 /*
  * Define NFS4 callback program
  */
-extern struct svc_version nfs4_callback_version1;
-
 static struct svc_version *nfs4_callback_version[] = {
 	[1] = &nfs4_callback_version1,
 };
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 3c72b0c0728..402005c35ab 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -892,7 +892,7 @@ out:
  * nfs_init_directcache - create a slab cache for nfs_direct_req structures
  *
  */
-int nfs_init_directcache(void)
+int __init nfs_init_directcache(void)
 {
 	nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
 						sizeof(struct nfs_direct_req),
@@ -906,10 +906,10 @@ int nfs_init_directcache(void)
 }
 
 /**
- * nfs_init_directcache - destroy the slab cache for nfs_direct_req structures
+ * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
  *
  */
-void nfs_destroy_directcache(void)
+void __exit nfs_destroy_directcache(void)
 {
 	if (kmem_cache_destroy(nfs_direct_cachep))
 		printk(KERN_INFO "nfs_direct_cache: not all structures were freed\n");
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 7ab2b38a990..24a7139d344 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -46,87 +46,17 @@
 #include "callback.h"
 #include "delegation.h"
 #include "iostat.h"
+#include "internal.h"
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
 #define NFS_PARANOIA 1
 
-/* Maximum number of readahead requests
- * FIXME: this should really be a sysctl so that users may tune it to suit
- *        their needs. People that do NFS over a slow network, might for
- *        instance want to reduce it to something closer to 1 for improved
- *        interactive response.
- */
-#define NFS_MAX_READAHEAD	(RPC_DEF_SLOT_TABLE - 1)
-
 static void nfs_invalidate_inode(struct inode *);
 static int nfs_update_inode(struct inode *, struct nfs_fattr *);
 
-static struct inode *nfs_alloc_inode(struct super_block *sb);
-static void nfs_destroy_inode(struct inode *);
-static int nfs_write_inode(struct inode *,int);
-static void nfs_clear_inode(struct inode *);
-static void nfs_umount_begin(struct vfsmount *, int);
-static int  nfs_statfs(struct super_block *, struct kstatfs *);
-static int  nfs_show_options(struct seq_file *, struct vfsmount *);
-static int  nfs_show_stats(struct seq_file *, struct vfsmount *);
 static void nfs_zap_acl_cache(struct inode *);
 
-static struct rpc_program	nfs_program;
-
-static struct super_operations nfs_sops = { 
-	.alloc_inode	= nfs_alloc_inode,
-	.destroy_inode	= nfs_destroy_inode,
-	.write_inode	= nfs_write_inode,
-	.statfs		= nfs_statfs,
-	.clear_inode	= nfs_clear_inode,
-	.umount_begin	= nfs_umount_begin,
-	.show_options	= nfs_show_options,
-	.show_stats	= nfs_show_stats,
-};
-
-/*
- * RPC cruft for NFS
- */
-static struct rpc_stat		nfs_rpcstat = {
-	.program		= &nfs_program
-};
-static struct rpc_version *	nfs_version[] = {
-	NULL,
-	NULL,
-	&nfs_version2,
-#if defined(CONFIG_NFS_V3)
-	&nfs_version3,
-#elif defined(CONFIG_NFS_V4)
-	NULL,
-#endif
-#if defined(CONFIG_NFS_V4)
-	&nfs_version4,
-#endif
-};
-
-static struct rpc_program	nfs_program = {
-	.name			= "nfs",
-	.number			= NFS_PROGRAM,
-	.nrvers			= ARRAY_SIZE(nfs_version),
-	.version		= nfs_version,
-	.stats			= &nfs_rpcstat,
-	.pipe_dir_name		= "/nfs",
-};
-
-#ifdef CONFIG_NFS_V3_ACL
-static struct rpc_stat		nfsacl_rpcstat = { &nfsacl_program };
-static struct rpc_version *	nfsacl_version[] = {
-	[3]			= &nfsacl_version3,
-};
-
-struct rpc_program		nfsacl_program = {
-	.name =			"nfsacl",
-	.number =		NFS_ACL_PROGRAM,
-	.nrvers =		ARRAY_SIZE(nfsacl_version),
-	.version =		nfsacl_version,
-	.stats =		&nfsacl_rpcstat,
-};
-#endif  /* CONFIG_NFS_V3_ACL */
+static kmem_cache_t * nfs_inode_cachep;
 
 static inline unsigned long
 nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
@@ -134,8 +64,7 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
 	return nfs_fileid_to_ino_t(fattr->fileid);
 }
 
-static int
-nfs_write_inode(struct inode *inode, int sync)
+int nfs_write_inode(struct inode *inode, int sync)
 {
 	int flags = sync ? FLUSH_SYNC : 0;
 	int ret;
@@ -146,8 +75,7 @@ nfs_write_inode(struct inode *inode, int sync)
 	return 0;
 }
 
-static void
-nfs_clear_inode(struct inode *inode)
+void nfs_clear_inode(struct inode *inode)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct rpc_cred *cred;
@@ -164,566 +92,6 @@ nfs_clear_inode(struct inode *inode)
 	BUG_ON(atomic_read(&nfsi->data_updates) != 0);
 }
 
-static void nfs_umount_begin(struct vfsmount *vfsmnt, int flags)
-{
-	struct nfs_server *server;
-	struct rpc_clnt	*rpc;
-
-	shrink_submounts(vfsmnt, &nfs_automount_list);
-	if (!(flags & MNT_FORCE))
-		return;
-	/* -EIO all pending I/O */
-	server = NFS_SB(vfsmnt->mnt_sb);
-	rpc = server->client;
-	if (!IS_ERR(rpc))
-		rpc_killall_tasks(rpc);
-	rpc = server->client_acl;
-	if (!IS_ERR(rpc))
-		rpc_killall_tasks(rpc);
-}
-
-
-static inline unsigned long
-nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp)
-{
-	/* make sure blocksize is a power of two */
-	if ((bsize & (bsize - 1)) || nrbitsp) {
-		unsigned char	nrbits;
-
-		for (nrbits = 31; nrbits && !(bsize & (1 << nrbits)); nrbits--)
-			;
-		bsize = 1 << nrbits;
-		if (nrbitsp)
-			*nrbitsp = nrbits;
-	}
-
-	return bsize;
-}
-
-/*
- * Calculate the number of 512byte blocks used.
- */
-static inline unsigned long
-nfs_calc_block_size(u64 tsize)
-{
-	loff_t used = (tsize + 511) >> 9;
-	return (used > ULONG_MAX) ? ULONG_MAX : used;
-}
-
-/*
- * Compute and set NFS server blocksize
- */
-static inline unsigned long
-nfs_block_size(unsigned long bsize, unsigned char *nrbitsp)
-{
-	if (bsize < NFS_MIN_FILE_IO_SIZE)
-		bsize = NFS_DEF_FILE_IO_SIZE;
-	else if (bsize >= NFS_MAX_FILE_IO_SIZE)
-		bsize = NFS_MAX_FILE_IO_SIZE;
-
-	return nfs_block_bits(bsize, nrbitsp);
-}
-
-static inline void
-nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
-{
-	sb->s_maxbytes = (loff_t)maxfilesize;
-	if (sb->s_maxbytes > MAX_LFS_FILESIZE || sb->s_maxbytes <= 0)
-		sb->s_maxbytes = MAX_LFS_FILESIZE;
-}
-
-/*
- * Obtain the root inode of the file system.
- */
-static struct inode *
-nfs_get_root(struct super_block *sb, struct nfs_fh *rootfh, struct nfs_fsinfo *fsinfo)
-{
-	struct nfs_server	*server = NFS_SB(sb);
-	int			error;
-
-	error = server->rpc_ops->getroot(server, rootfh, fsinfo);
-	if (error < 0) {
-		dprintk("nfs_get_root: getattr error = %d\n", -error);
-		return ERR_PTR(error);
-	}
-
-	server->fsid = fsinfo->fattr->fsid;
-	return nfs_fhget(sb, rootfh, fsinfo->fattr);
-}
-
-/*
- * Do NFS version-independent mount processing, and sanity checking
- */
-static int
-nfs_sb_init(struct super_block *sb, rpc_authflavor_t authflavor)
-{
-	struct nfs_server	*server;
-	struct inode		*root_inode;
-	struct nfs_fattr	fattr;
-	struct nfs_fsinfo	fsinfo = {
-					.fattr = &fattr,
-				};
-	struct nfs_pathconf pathinfo = {
-			.fattr = &fattr,
-	};
-	int no_root_error = 0;
-	unsigned long max_rpc_payload;
-
-	/* We probably want something more informative here */
-	snprintf(sb->s_id, sizeof(sb->s_id), "%x:%x", MAJOR(sb->s_dev), MINOR(sb->s_dev));
-
-	server = NFS_SB(sb);
-
-	sb->s_magic      = NFS_SUPER_MAGIC;
-
-	server->io_stats = nfs_alloc_iostats();
-	if (server->io_stats == NULL)
-		return -ENOMEM;
-
-	root_inode = nfs_get_root(sb, &server->fh, &fsinfo);
-	/* Did getting the root inode fail? */
-	if (IS_ERR(root_inode)) {
-		no_root_error = PTR_ERR(root_inode);
-		goto out_no_root;
-	}
-	sb->s_root = d_alloc_root(root_inode);
-	if (!sb->s_root) {
-		no_root_error = -ENOMEM;
-		goto out_no_root;
-	}
-	sb->s_root->d_op = server->rpc_ops->dentry_ops;
-
-	/* mount time stamp, in seconds */
-	server->mount_time = jiffies;
-
-	/* Get some general file system info */
-	if (server->namelen == 0 &&
-	    server->rpc_ops->pathconf(server, &server->fh, &pathinfo) >= 0)
-		server->namelen = pathinfo.max_namelen;
-	/* Work out a lot of parameters */
-	if (server->rsize == 0)
-		server->rsize = nfs_block_size(fsinfo.rtpref, NULL);
-	if (server->wsize == 0)
-		server->wsize = nfs_block_size(fsinfo.wtpref, NULL);
-
-	if (fsinfo.rtmax >= 512 && server->rsize > fsinfo.rtmax)
-		server->rsize = nfs_block_size(fsinfo.rtmax, NULL);
-	if (fsinfo.wtmax >= 512 && server->wsize > fsinfo.wtmax)
-		server->wsize = nfs_block_size(fsinfo.wtmax, NULL);
-
-	max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL);
-	if (server->rsize > max_rpc_payload)
-		server->rsize = max_rpc_payload;
-	if (server->rsize > NFS_MAX_FILE_IO_SIZE)
-		server->rsize = NFS_MAX_FILE_IO_SIZE;
-	server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-
-	if (server->wsize > max_rpc_payload)
-		server->wsize = max_rpc_payload;
-	if (server->wsize > NFS_MAX_FILE_IO_SIZE)
-		server->wsize = NFS_MAX_FILE_IO_SIZE;
-	server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-
-	if (sb->s_blocksize == 0)
-		sb->s_blocksize = nfs_block_bits(server->wsize,
-							 &sb->s_blocksize_bits);
-	server->wtmult = nfs_block_bits(fsinfo.wtmult, NULL);
-
-	server->dtsize = nfs_block_size(fsinfo.dtpref, NULL);
-	if (server->dtsize > PAGE_CACHE_SIZE)
-		server->dtsize = PAGE_CACHE_SIZE;
-	if (server->dtsize > server->rsize)
-		server->dtsize = server->rsize;
-
-	if (server->flags & NFS_MOUNT_NOAC) {
-		server->acregmin = server->acregmax = 0;
-		server->acdirmin = server->acdirmax = 0;
-		sb->s_flags |= MS_SYNCHRONOUS;
-	}
-	server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
-
-	nfs_super_set_maxbytes(sb, fsinfo.maxfilesize);
-
-	server->client->cl_intr = (server->flags & NFS_MOUNT_INTR) ? 1 : 0;
-	server->client->cl_softrtry = (server->flags & NFS_MOUNT_SOFT) ? 1 : 0;
-
-	/* We're airborne Set socket buffersize */
-	rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
-	return 0;
-	/* Yargs. It didn't work out. */
-out_no_root:
-	dprintk("nfs_sb_init: get root inode failed: errno %d\n", -no_root_error);
-	if (!IS_ERR(root_inode))
-		iput(root_inode);
-	return no_root_error;
-}
-
-static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, unsigned int timeo, unsigned int retrans)
-{
-	to->to_initval = timeo * HZ / 10;
-	to->to_retries = retrans;
-	if (!to->to_retries)
-		to->to_retries = 2;
-
-	switch (proto) {
-	case IPPROTO_TCP:
-		if (!to->to_initval)
-			to->to_initval = 60 * HZ;
-		if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
-			to->to_initval = NFS_MAX_TCP_TIMEOUT;
-		to->to_increment = to->to_initval;
-		to->to_maxval = to->to_initval + (to->to_increment * to->to_retries);
-		to->to_exponential = 0;
-		break;
-	case IPPROTO_UDP:
-	default:
-		if (!to->to_initval)
-			to->to_initval = 11 * HZ / 10;
-		if (to->to_initval > NFS_MAX_UDP_TIMEOUT)
-			to->to_initval = NFS_MAX_UDP_TIMEOUT;
-		to->to_maxval = NFS_MAX_UDP_TIMEOUT;
-		to->to_exponential = 1;
-		break;
-	}
-}
-
-/*
- * Create an RPC client handle.
- */
-static struct rpc_clnt *
-nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data)
-{
-	struct rpc_timeout	timeparms;
-	struct rpc_xprt		*xprt = NULL;
-	struct rpc_clnt		*clnt = NULL;
-	int			proto = (data->flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
-
-	nfs_init_timeout_values(&timeparms, proto, data->timeo, data->retrans);
-
-	server->retrans_timeo = timeparms.to_initval;
-	server->retrans_count = timeparms.to_retries;
-
-	/* create transport and client */
-	xprt = xprt_create_proto(proto, &server->addr, &timeparms);
-	if (IS_ERR(xprt)) {
-		dprintk("%s: cannot create RPC transport. Error = %ld\n",
-				__FUNCTION__, PTR_ERR(xprt));
-		return (struct rpc_clnt *)xprt;
-	}
-	clnt = rpc_create_client(xprt, server->hostname, &nfs_program,
-				 server->rpc_ops->version, data->pseudoflavor);
-	if (IS_ERR(clnt)) {
-		dprintk("%s: cannot create RPC client. Error = %ld\n",
-				__FUNCTION__, PTR_ERR(xprt));
-		goto out_fail;
-	}
-
-	clnt->cl_intr     = 1;
-	clnt->cl_softrtry = 1;
-
-	return clnt;
-
-out_fail:
-	return clnt;
-}
-
-/*
- * The way this works is that the mount process passes a structure
- * in the data argument which contains the server's IP address
- * and the root file handle obtained from the server's mount
- * daemon. We stash these away in the private superblock fields.
- */
-static int
-nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data, int silent)
-{
-	struct nfs_server	*server;
-	rpc_authflavor_t	authflavor;
-
-	server           = NFS_SB(sb);
-	sb->s_blocksize_bits = 0;
-	sb->s_blocksize = 0;
-	if (data->bsize)
-		sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits);
-	if (data->rsize)
-		server->rsize = nfs_block_size(data->rsize, NULL);
-	if (data->wsize)
-		server->wsize = nfs_block_size(data->wsize, NULL);
-	server->flags    = data->flags & NFS_MOUNT_FLAGMASK;
-
-	server->acregmin = data->acregmin*HZ;
-	server->acregmax = data->acregmax*HZ;
-	server->acdirmin = data->acdirmin*HZ;
-	server->acdirmax = data->acdirmax*HZ;
-
-	/* Start lockd here, before we might error out */
-	if (!(server->flags & NFS_MOUNT_NONLM))
-		lockd_up();
-
-	server->namelen  = data->namlen;
-	server->hostname = kmalloc(strlen(data->hostname) + 1, GFP_KERNEL);
-	if (!server->hostname)
-		return -ENOMEM;
-	strcpy(server->hostname, data->hostname);
-
-	/* Check NFS protocol revision and initialize RPC op vector
-	 * and file handle pool. */
-#ifdef CONFIG_NFS_V3
-	if (server->flags & NFS_MOUNT_VER3) {
-		server->rpc_ops = &nfs_v3_clientops;
-		server->caps |= NFS_CAP_READDIRPLUS;
-	} else {
-		server->rpc_ops = &nfs_v2_clientops;
-	}
-#else
-	server->rpc_ops = &nfs_v2_clientops;
-#endif
-
-	/* Fill in pseudoflavor for mount version < 5 */
-	if (!(data->flags & NFS_MOUNT_SECFLAVOUR))
-		data->pseudoflavor = RPC_AUTH_UNIX;
-	authflavor = data->pseudoflavor;	/* save for sb_init() */
-	/* XXX maybe we want to add a server->pseudoflavor field */
-
-	/* Create RPC client handles */
-	server->client = nfs_create_client(server, data);
-	if (IS_ERR(server->client))
-		return PTR_ERR(server->client);
-	/* RFC 2623, sec 2.3.2 */
-	if (authflavor != RPC_AUTH_UNIX) {
-		struct rpc_auth *auth;
-
-		server->client_sys = rpc_clone_client(server->client);
-		if (IS_ERR(server->client_sys))
-			return PTR_ERR(server->client_sys);
-		auth = rpcauth_create(RPC_AUTH_UNIX, server->client_sys);
-		if (IS_ERR(auth))
-			return PTR_ERR(auth);
-	} else {
-		atomic_inc(&server->client->cl_count);
-		server->client_sys = server->client;
-	}
-	if (server->flags & NFS_MOUNT_VER3) {
-#ifdef CONFIG_NFS_V3_ACL
-		if (!(server->flags & NFS_MOUNT_NOACL)) {
-			server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3);
-			/* No errors! Assume that Sun nfsacls are supported */
-			if (!IS_ERR(server->client_acl))
-				server->caps |= NFS_CAP_ACLS;
-		}
-#else
-		server->flags &= ~NFS_MOUNT_NOACL;
-#endif /* CONFIG_NFS_V3_ACL */
-		/*
-		 * The VFS shouldn't apply the umask to mode bits. We will
-		 * do so ourselves when necessary.
-		 */
-		sb->s_flags |= MS_POSIXACL;
-		if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN)
-			server->namelen = NFS3_MAXNAMLEN;
-		sb->s_time_gran = 1;
-	} else {
-		if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN)
-			server->namelen = NFS2_MAXNAMLEN;
-	}
-
-	sb->s_op = &nfs_sops;
-	return nfs_sb_init(sb, authflavor);
-}
-
-static int
-nfs_statfs(struct super_block *sb, struct kstatfs *buf)
-{
-	struct nfs_server *server = NFS_SB(sb);
-	unsigned char blockbits;
-	unsigned long blockres;
-	struct nfs_fh *rootfh = NFS_FH(sb->s_root->d_inode);
-	struct nfs_fattr fattr;
-	struct nfs_fsstat res = {
-			.fattr = &fattr,
-	};
-	int error;
-
-	lock_kernel();
-
-	error = server->rpc_ops->statfs(server, rootfh, &res);
-	buf->f_type = NFS_SUPER_MAGIC;
-	if (error < 0)
-		goto out_err;
-
-	/*
-	 * Current versions of glibc do not correctly handle the
-	 * case where f_frsize != f_bsize.  Eventually we want to
-	 * report the value of wtmult in this field.
-	 */
-	buf->f_frsize = sb->s_blocksize;
-
-	/*
-	 * On most *nix systems, f_blocks, f_bfree, and f_bavail
-	 * are reported in units of f_frsize.  Linux hasn't had
-	 * an f_frsize field in its statfs struct until recently,
-	 * thus historically Linux's sys_statfs reports these
-	 * fields in units of f_bsize.
-	 */
-	buf->f_bsize = sb->s_blocksize;
-	blockbits = sb->s_blocksize_bits;
-	blockres = (1 << blockbits) - 1;
-	buf->f_blocks = (res.tbytes + blockres) >> blockbits;
-	buf->f_bfree = (res.fbytes + blockres) >> blockbits;
-	buf->f_bavail = (res.abytes + blockres) >> blockbits;
-
-	buf->f_files = res.tfiles;
-	buf->f_ffree = res.afiles;
-
-	buf->f_namelen = server->namelen;
- out:
-	unlock_kernel();
-	return 0;
-
- out_err:
-	dprintk("%s: statfs error = %d\n", __FUNCTION__, -error);
-	buf->f_bsize = buf->f_blocks = buf->f_bfree = buf->f_bavail = -1;
-	goto out;
-
-}
-
-static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, int showdefaults)
-{
-	static struct proc_nfs_info {
-		int flag;
-		char *str;
-		char *nostr;
-	} nfs_info[] = {
-		{ NFS_MOUNT_SOFT, ",soft", ",hard" },
-		{ NFS_MOUNT_INTR, ",intr", "" },
-		{ NFS_MOUNT_NOCTO, ",nocto", "" },
-		{ NFS_MOUNT_NOAC, ",noac", "" },
-		{ NFS_MOUNT_NONLM, ",nolock", "" },
-		{ NFS_MOUNT_NOACL, ",noacl", "" },
-		{ 0, NULL, NULL }
-	};
-	struct proc_nfs_info *nfs_infop;
-	char buf[12];
-	char *proto;
-
-	seq_printf(m, ",vers=%d", nfss->rpc_ops->version);
-	seq_printf(m, ",rsize=%d", nfss->rsize);
-	seq_printf(m, ",wsize=%d", nfss->wsize);
-	if (nfss->acregmin != 3*HZ || showdefaults)
-		seq_printf(m, ",acregmin=%d", nfss->acregmin/HZ);
-	if (nfss->acregmax != 60*HZ || showdefaults)
-		seq_printf(m, ",acregmax=%d", nfss->acregmax/HZ);
-	if (nfss->acdirmin != 30*HZ || showdefaults)
-		seq_printf(m, ",acdirmin=%d", nfss->acdirmin/HZ);
-	if (nfss->acdirmax != 60*HZ || showdefaults)
-		seq_printf(m, ",acdirmax=%d", nfss->acdirmax/HZ);
-	for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) {
-		if (nfss->flags & nfs_infop->flag)
-			seq_puts(m, nfs_infop->str);
-		else
-			seq_puts(m, nfs_infop->nostr);
-	}
-	switch (nfss->client->cl_xprt->prot) {
-		case IPPROTO_TCP:
-			proto = "tcp";
-			break;
-		case IPPROTO_UDP:
-			proto = "udp";
-			break;
-		default:
-			snprintf(buf, sizeof(buf), "%u", nfss->client->cl_xprt->prot);
-			proto = buf;
-	}
-	seq_printf(m, ",proto=%s", proto);
-	seq_printf(m, ",timeo=%lu", 10U * nfss->retrans_timeo / HZ);
-	seq_printf(m, ",retrans=%u", nfss->retrans_count);
-}
-
-static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
-{
-	struct nfs_server *nfss = NFS_SB(mnt->mnt_sb);
-
-	nfs_show_mount_options(m, nfss, 0);
-
-	seq_puts(m, ",addr=");
-	seq_escape(m, nfss->hostname, " \t\n\\");
-
-	return 0;
-}
-
-static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
-{
-	int i, cpu;
-	struct nfs_server *nfss = NFS_SB(mnt->mnt_sb);
-	struct rpc_auth *auth = nfss->client->cl_auth;
-	struct nfs_iostats totals = { };
-
-	seq_printf(m, "statvers=%s", NFS_IOSTAT_VERS);
-
-	/*
-	 * Display all mount option settings
-	 */
-	seq_printf(m, "\n\topts:\t");
-	seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? "ro" : "rw");
-	seq_puts(m, mnt->mnt_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : "");
-	seq_puts(m, mnt->mnt_sb->s_flags & MS_NOATIME ? ",noatime" : "");
-	seq_puts(m, mnt->mnt_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : "");
-	nfs_show_mount_options(m, nfss, 1);
-
-	seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ);
-
-	seq_printf(m, "\n\tcaps:\t");
-	seq_printf(m, "caps=0x%x", nfss->caps);
-	seq_printf(m, ",wtmult=%d", nfss->wtmult);
-	seq_printf(m, ",dtsize=%d", nfss->dtsize);
-	seq_printf(m, ",bsize=%d", nfss->bsize);
-	seq_printf(m, ",namelen=%d", nfss->namelen);
-
-#ifdef CONFIG_NFS_V4
-	if (nfss->rpc_ops->version == 4) {
-		seq_printf(m, "\n\tnfsv4:\t");
-		seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
-		seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
-		seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
-	}
-#endif
-
-	/*
-	 * Display security flavor in effect for this mount
-	 */
-	seq_printf(m, "\n\tsec:\tflavor=%d", auth->au_ops->au_flavor);
-	if (auth->au_flavor)
-		seq_printf(m, ",pseudoflavor=%d", auth->au_flavor);
-
-	/*
-	 * Display superblock I/O counters
-	 */
-	for_each_possible_cpu(cpu) {
-		struct nfs_iostats *stats;
-
-		preempt_disable();
-		stats = per_cpu_ptr(nfss->io_stats, cpu);
-
-		for (i = 0; i < __NFSIOS_COUNTSMAX; i++)
-			totals.events[i] += stats->events[i];
-		for (i = 0; i < __NFSIOS_BYTESMAX; i++)
-			totals.bytes[i] += stats->bytes[i];
-
-		preempt_enable();
-	}
-
-	seq_printf(m, "\n\tevents:\t");
-	for (i = 0; i < __NFSIOS_COUNTSMAX; i++)
-		seq_printf(m, "%lu ", totals.events[i]);
-	seq_printf(m, "\n\tbytes:\t");
-	for (i = 0; i < __NFSIOS_BYTESMAX; i++)
-		seq_printf(m, "%Lu ", totals.bytes[i]);
-	seq_printf(m, "\n");
-
-	rpc_print_iostats(m, nfss->client);
-
-	return 0;
-}
-
 /**
  * nfs_sync_mapping - helper to flush all mmapped dirty data to disk
  */
@@ -1663,371 +1031,15 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 	goto out_err;
 }
 
-/*
- * File system information
- */
-
-/*
- * nfs_path - reconstruct the path given an arbitrary dentry
- * @base - arbitrary string to prepend to the path
- * @dentry - pointer to dentry
- * @buffer - result buffer
- * @buflen - length of buffer
- *
- * Helper function for constructing the path from the
- * root dentry to an arbitrary hashed dentry.
- *
- * This is mainly for use in figuring out the path on the
- * server side when automounting on top of an existing partition.
- */
-static char *nfs_path(const char *base, const struct dentry *dentry,
-		      char *buffer, ssize_t buflen)
-{
-	char *end = buffer+buflen;
-	int namelen;
-
-	*--end = '\0';
-	buflen--;
-	spin_lock(&dcache_lock);
-	while (!IS_ROOT(dentry)) {
-		namelen = dentry->d_name.len;
-		buflen -= namelen + 1;
-		if (buflen < 0)
-			goto Elong;
-		end -= namelen;
-		memcpy(end, dentry->d_name.name, namelen);
-		*--end = '/';
-		dentry = dentry->d_parent;
-	}
-	spin_unlock(&dcache_lock);
-	namelen = strlen(base);
-	/* Strip off excess slashes in base string */
-	while (namelen > 0 && base[namelen - 1] == '/')
-		namelen--;
-	buflen -= namelen;
-	if (buflen < 0)
-		goto Elong;
-	end -= namelen;
-	memcpy(end, base, namelen);
-	return end;
-Elong:
-	return ERR_PTR(-ENAMETOOLONG);
-}
-
-struct nfs_clone_mount {
-	const struct super_block *sb;
-	const struct dentry *dentry;
-	struct nfs_fh *fh;
-	struct nfs_fattr *fattr;
-	char *hostname;
-	char *mnt_path;
-	struct sockaddr_in *addr;
-	rpc_authflavor_t authflavor;
-};
-
-static struct super_block *nfs_clone_generic_sb(struct nfs_clone_mount *data,
-		struct super_block *(*fill_sb)(struct nfs_server *, struct nfs_clone_mount *),
-		struct nfs_server *(*fill_server)(struct super_block *, struct nfs_clone_mount *))
-{
-	struct nfs_server *server;
-	struct nfs_server *parent = NFS_SB(data->sb);
-	struct super_block *sb = ERR_PTR(-EINVAL);
-	void *err = ERR_PTR(-ENOMEM);
-	char *hostname;
-	int len;
-
-	server = kmalloc(sizeof(struct nfs_server), GFP_KERNEL);
-	if (server == NULL)
-		goto out_err;
-	memcpy(server, parent, sizeof(*server));
-	hostname = (data->hostname != NULL) ? data->hostname : parent->hostname;
-	len = strlen(hostname) + 1;
-	server->hostname = kmalloc(len, GFP_KERNEL);
-	if (server->hostname == NULL)
-		goto free_server;
-	memcpy(server->hostname, hostname, len);
-	if (rpciod_up() != 0)
-		goto free_hostname;
-
-	sb = fill_sb(server, data);
-	if (IS_ERR((err = sb)) || sb->s_root)
-		goto kill_rpciod;
-
-	server = fill_server(sb, data);
-	if (IS_ERR((err = server)))
-		goto out_deactivate;
-	return sb;
-out_deactivate:
-	up_write(&sb->s_umount);
-	deactivate_super(sb);
-	return (struct super_block *)err;
-kill_rpciod:
-	rpciod_down();
-free_hostname:
-	kfree(server->hostname);
-free_server:
-	kfree(server);
-out_err:
-	return (struct super_block *)err;
-}
-
-static int nfs_set_super(struct super_block *s, void *data)
-{
-	s->s_fs_info = data;
-	return set_anon_super(s, data);
-}
- 
-static int nfs_compare_super(struct super_block *sb, void *data)
-{
-	struct nfs_server *server = data;
-	struct nfs_server *old = NFS_SB(sb);
-
-	if (old->addr.sin_addr.s_addr != server->addr.sin_addr.s_addr)
-		return 0;
-	if (old->addr.sin_port != server->addr.sin_port)
-		return 0;
-	return !nfs_compare_fh(&old->fh, &server->fh);
-}
-
-static struct super_block *nfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data)
-{
-	int error;
-	struct nfs_server *server = NULL;
-	struct super_block *s;
-	struct nfs_fh *root;
-	struct nfs_mount_data *data = raw_data;
-
-	s = ERR_PTR(-EINVAL);
-	if (data == NULL) {
-		dprintk("%s: missing data argument\n", __FUNCTION__);
-		goto out_err;
-	}
-	if (data->version <= 0 || data->version > NFS_MOUNT_VERSION) {
-		dprintk("%s: bad mount version\n", __FUNCTION__);
-		goto out_err;
-	}
-	switch (data->version) {
-		case 1:
-			data->namlen = 0;
-		case 2:
-			data->bsize  = 0;
-		case 3:
-			if (data->flags & NFS_MOUNT_VER3) {
-				dprintk("%s: mount structure version %d does not support NFSv3\n",
-						__FUNCTION__,
-						data->version);
-				goto out_err;
-			}
-			data->root.size = NFS2_FHSIZE;
-			memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE);
-		case 4:
-			if (data->flags & NFS_MOUNT_SECFLAVOUR) {
-				dprintk("%s: mount structure version %d does not support strong security\n",
-						__FUNCTION__,
-						data->version);
-				goto out_err;
-			}
-		case 5:
-			memset(data->context, 0, sizeof(data->context));
-	}
-#ifndef CONFIG_NFS_V3
-	/* If NFSv3 is not compiled in, return -EPROTONOSUPPORT */
-	s = ERR_PTR(-EPROTONOSUPPORT);
-	if (data->flags & NFS_MOUNT_VER3) {
-		dprintk("%s: NFSv3 not compiled into kernel\n", __FUNCTION__);
-		goto out_err;
-	}
-#endif /* CONFIG_NFS_V3 */
-
-	s = ERR_PTR(-ENOMEM);
-	server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
-	if (!server)
-		goto out_err;
-	/* Zero out the NFS state stuff */
-	init_nfsv4_state(server);
-	server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
-
-	root = &server->fh;
-	if (data->flags & NFS_MOUNT_VER3)
-		root->size = data->root.size;
-	else
-		root->size = NFS2_FHSIZE;
-	s = ERR_PTR(-EINVAL);
-	if (root->size > sizeof(root->data)) {
-		dprintk("%s: invalid root filehandle\n", __FUNCTION__);
-		goto out_err;
-	}
-	memcpy(root->data, data->root.data, root->size);
-
-	/* We now require that the mount process passes the remote address */
-	memcpy(&server->addr, &data->addr, sizeof(server->addr));
-	if (server->addr.sin_addr.s_addr == INADDR_ANY) {
-		dprintk("%s: mount program didn't pass remote address!\n",
-				__FUNCTION__);
-		goto out_err;
-	}
-
-	/* Fire up rpciod if not yet running */
-	s = ERR_PTR(rpciod_up());
-	if (IS_ERR(s)) {
-		dprintk("%s: couldn't start rpciod! Error = %ld\n",
-				__FUNCTION__, PTR_ERR(s));
-		goto out_err;
-	}
-
-	s = sget(fs_type, nfs_compare_super, nfs_set_super, server);
-	if (IS_ERR(s) || s->s_root)
-		goto out_rpciod_down;
-
-	s->s_flags = flags;
-
-	error = nfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
-	if (error) {
-		up_write(&s->s_umount);
-		deactivate_super(s);
-		return ERR_PTR(error);
-	}
-	s->s_flags |= MS_ACTIVE;
-	return s;
-out_rpciod_down:
-	rpciod_down();
-out_err:
-	kfree(server);
-	return s;
-}
-
-static void nfs_kill_super(struct super_block *s)
-{
-	struct nfs_server *server = NFS_SB(s);
-
-	kill_anon_super(s);
-
-	if (!IS_ERR(server->client))
-		rpc_shutdown_client(server->client);
-	if (!IS_ERR(server->client_sys))
-		rpc_shutdown_client(server->client_sys);
-	if (!IS_ERR(server->client_acl))
-		rpc_shutdown_client(server->client_acl);
-
-	if (!(server->flags & NFS_MOUNT_NONLM))
-		lockd_down();	/* release rpc.lockd */
-
-	rpciod_down();		/* release rpciod */
-
-	nfs_free_iostats(server->io_stats);
-	kfree(server->hostname);
-	kfree(server);
-	nfs_release_automount_timer();
-}
-
-static struct file_system_type nfs_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "nfs",
-	.get_sb		= nfs_get_sb,
-	.kill_sb	= nfs_kill_super,
-	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
-};
-
-static struct super_block *nfs_clone_sb(struct nfs_server *server, struct nfs_clone_mount *data)
-{
-	struct super_block *sb;
-
-	server->fsid = data->fattr->fsid;
-	nfs_copy_fh(&server->fh, data->fh);
-	sb = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server);
-	if (!IS_ERR(sb) && sb->s_root == NULL && !(server->flags & NFS_MOUNT_NONLM))
-		lockd_up();
-	return sb;
-}
-
-static struct nfs_server *nfs_clone_server(struct super_block *sb, struct nfs_clone_mount *data)
-{
-	struct nfs_server *server = NFS_SB(sb);
-	struct nfs_server *parent = NFS_SB(data->sb);
-	struct inode *root_inode;
-	struct nfs_fsinfo fsinfo;
-	void *err = ERR_PTR(-ENOMEM);
-
-	sb->s_op = data->sb->s_op;
-	sb->s_blocksize = data->sb->s_blocksize;
-	sb->s_blocksize_bits = data->sb->s_blocksize_bits;
-	sb->s_maxbytes = data->sb->s_maxbytes;
-
-	server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
-	server->io_stats = nfs_alloc_iostats();
-	if (server->io_stats == NULL)
-		goto out;
-
-	server->client = rpc_clone_client(parent->client);
-	if (IS_ERR((err = server->client)))
-		goto out;
-
-	if (!IS_ERR(parent->client_sys)) {
-		server->client_sys = rpc_clone_client(parent->client_sys);
-		if (IS_ERR((err = server->client_sys)))
-			goto out;
-	}
-	if (!IS_ERR(parent->client_acl)) {
-		server->client_acl = rpc_clone_client(parent->client_acl);
-		if (IS_ERR((err = server->client_acl)))
-			goto out;
-	}
-	root_inode = nfs_fhget(sb, data->fh, data->fattr);
-	if (!root_inode)
-		goto out;
-	sb->s_root = d_alloc_root(root_inode);
-	if (!sb->s_root)
-		goto out_put_root;
-	fsinfo.fattr = data->fattr;
-	if (NFS_PROTO(root_inode)->fsinfo(server, data->fh, &fsinfo) == 0)
-		nfs_super_set_maxbytes(sb, fsinfo.maxfilesize);
-	sb->s_root->d_op = server->rpc_ops->dentry_ops;
-	sb->s_flags |= MS_ACTIVE;
-	return server;
-out_put_root:
-	iput(root_inode);
-out:
-	return err;
-}
-
-static struct super_block *nfs_clone_nfs_sb(struct file_system_type *fs_type,
-		int flags, const char *dev_name, void *raw_data)
-{
-	struct nfs_clone_mount *data = raw_data;
-	return nfs_clone_generic_sb(data, nfs_clone_sb, nfs_clone_server);
-}
-
-static struct file_system_type clone_nfs_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "nfs",
-	.get_sb		= nfs_clone_nfs_sb,
-	.kill_sb	= nfs_kill_super,
-	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
-};
 
 #ifdef CONFIG_NFS_V4
 
-static void nfs4_clear_inode(struct inode *);
-
-
-static struct super_operations nfs4_sops = { 
-	.alloc_inode	= nfs_alloc_inode,
-	.destroy_inode	= nfs_destroy_inode,
-	.write_inode	= nfs_write_inode,
-	.statfs		= nfs_statfs,
-	.clear_inode	= nfs4_clear_inode,
-	.umount_begin	= nfs_umount_begin,
-	.show_options	= nfs_show_options,
-	.show_stats	= nfs_show_stats,
-};
-
 /*
  * Clean out any remaining NFSv4 state that might be left over due
  * to open() calls that passed nfs_atomic_lookup, but failed to call
  * nfs_open().
  */
-static void nfs4_clear_inode(struct inode *inode)
+void nfs4_clear_inode(struct inode *inode)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 
@@ -2051,774 +1063,9 @@ static void nfs4_clear_inode(struct inode *inode)
 		nfs4_close_state(state, state->state);
 	}
 }
-
-
-static struct rpc_clnt *nfs4_create_client(struct nfs_server *server,
-	struct rpc_timeout *timeparms, int proto, rpc_authflavor_t flavor)
-{
-	struct nfs4_client *clp;
-	struct rpc_xprt *xprt = NULL;
-	struct rpc_clnt *clnt = NULL;
-	int err = -EIO;
-
-	clp = nfs4_get_client(&server->addr.sin_addr);
-	if (!clp) {
-		dprintk("%s: failed to create NFS4 client.\n", __FUNCTION__);
-		return ERR_PTR(err);
-	}
-
-	/* Now create transport and client */
-	down_write(&clp->cl_sem);
-	if (IS_ERR(clp->cl_rpcclient)) {
-		xprt = xprt_create_proto(proto, &server->addr, timeparms);
-		if (IS_ERR(xprt)) {
-			up_write(&clp->cl_sem);
-			err = PTR_ERR(xprt);
-			dprintk("%s: cannot create RPC transport. Error = %d\n",
-					__FUNCTION__, err);
-			goto out_fail;
-		}
-		/* Bind to a reserved port! */
-		xprt->resvport = 1;
-		clnt = rpc_create_client(xprt, server->hostname, &nfs_program,
-				server->rpc_ops->version, flavor);
-		if (IS_ERR(clnt)) {
-			up_write(&clp->cl_sem);
-			err = PTR_ERR(clnt);
-			dprintk("%s: cannot create RPC client. Error = %d\n",
-					__FUNCTION__, err);
-			goto out_fail;
-		}
-		clnt->cl_intr     = 1;
-		clnt->cl_softrtry = 1;
-		clp->cl_rpcclient = clnt;
-		memcpy(clp->cl_ipaddr, server->ip_addr, sizeof(clp->cl_ipaddr));
-		nfs_idmap_new(clp);
-	}
-	list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
-	clnt = rpc_clone_client(clp->cl_rpcclient);
-	if (!IS_ERR(clnt))
-		server->nfs4_state = clp;
-	up_write(&clp->cl_sem);
-	clp = NULL;
-
-	if (IS_ERR(clnt)) {
-		dprintk("%s: cannot create RPC client. Error = %d\n",
-				__FUNCTION__, err);
-		return clnt;
-	}
-
-	if (server->nfs4_state->cl_idmap == NULL) {
-		dprintk("%s: failed to create idmapper.\n", __FUNCTION__);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	if (clnt->cl_auth->au_flavor != flavor) {
-		struct rpc_auth *auth;
-
-		auth = rpcauth_create(flavor, clnt);
-		if (IS_ERR(auth)) {
-			dprintk("%s: couldn't create credcache!\n", __FUNCTION__);
-			return (struct rpc_clnt *)auth;
-		}
-	}
-	return clnt;
-
- out_fail:
-	if (clp)
-		nfs4_put_client(clp);
-	return ERR_PTR(err);
-}
-
-static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data, int silent)
-{
-	struct nfs_server *server;
-	struct rpc_timeout timeparms;
-	rpc_authflavor_t authflavour;
-	int err = -EIO;
-
-	sb->s_blocksize_bits = 0;
-	sb->s_blocksize = 0;
-	server = NFS_SB(sb);
-	if (data->rsize != 0)
-		server->rsize = nfs_block_size(data->rsize, NULL);
-	if (data->wsize != 0)
-		server->wsize = nfs_block_size(data->wsize, NULL);
-	server->flags = data->flags & NFS_MOUNT_FLAGMASK;
-	server->caps = NFS_CAP_ATOMIC_OPEN;
-
-	server->acregmin = data->acregmin*HZ;
-	server->acregmax = data->acregmax*HZ;
-	server->acdirmin = data->acdirmin*HZ;
-	server->acdirmax = data->acdirmax*HZ;
-
-	server->rpc_ops = &nfs_v4_clientops;
-
-	nfs_init_timeout_values(&timeparms, data->proto, data->timeo, data->retrans);
-
-	server->retrans_timeo = timeparms.to_initval;
-	server->retrans_count = timeparms.to_retries;
-
-	/* Now create transport and client */
-	authflavour = RPC_AUTH_UNIX;
-	if (data->auth_flavourlen != 0) {
-		if (data->auth_flavourlen != 1) {
-			dprintk("%s: Invalid number of RPC auth flavours %d.\n",
-					__FUNCTION__, data->auth_flavourlen);
-			err = -EINVAL;
-			goto out_fail;
-		}
-		if (copy_from_user(&authflavour, data->auth_flavours, sizeof(authflavour))) {
-			err = -EFAULT;
-			goto out_fail;
-		}
-	}
-
-	server->client = nfs4_create_client(server, &timeparms, data->proto, authflavour);
-	if (IS_ERR(server->client)) {
-		err = PTR_ERR(server->client);
-			dprintk("%s: cannot create RPC client. Error = %d\n",
-					__FUNCTION__, err);
-			goto out_fail;
-	}
-
-	sb->s_time_gran = 1;
-
-	sb->s_op = &nfs4_sops;
-	err = nfs_sb_init(sb, authflavour);
-
- out_fail:
-	return err;
-}
-
-static int nfs4_compare_super(struct super_block *sb, void *data)
-{
-	struct nfs_server *server = data;
-	struct nfs_server *old = NFS_SB(sb);
-
-	if (strcmp(server->hostname, old->hostname) != 0)
-		return 0;
-	if (strcmp(server->mnt_path, old->mnt_path) != 0)
-		return 0;
-	return 1;
-}
-
-static void *
-nfs_copy_user_string(char *dst, struct nfs_string *src, int maxlen)
-{
-	void *p = NULL;
-
-	if (!src->len)
-		return ERR_PTR(-EINVAL);
-	if (src->len < maxlen)
-		maxlen = src->len;
-	if (dst == NULL) {
-		p = dst = kmalloc(maxlen + 1, GFP_KERNEL);
-		if (p == NULL)
-			return ERR_PTR(-ENOMEM);
-	}
-	if (copy_from_user(dst, src->data, maxlen)) {
-		kfree(p);
-		return ERR_PTR(-EFAULT);
-	}
-	dst[maxlen] = '\0';
-	return dst;
-}
-
-static struct super_block *nfs4_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data)
-{
-	int error;
-	struct nfs_server *server;
-	struct super_block *s;
-	struct nfs4_mount_data *data = raw_data;
-	void *p;
-
-	if (data == NULL) {
-		dprintk("%s: missing data argument\n", __FUNCTION__);
-		return ERR_PTR(-EINVAL);
-	}
-	if (data->version <= 0 || data->version > NFS4_MOUNT_VERSION) {
-		dprintk("%s: bad mount version\n", __FUNCTION__);
-		return ERR_PTR(-EINVAL);
-	}
-
-	server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
-	if (!server)
-		return ERR_PTR(-ENOMEM);
-	/* Zero out the NFS state stuff */
-	init_nfsv4_state(server);
-	server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
-
-	p = nfs_copy_user_string(NULL, &data->hostname, 256);
-	if (IS_ERR(p))
-		goto out_err;
-	server->hostname = p;
-
-	p = nfs_copy_user_string(NULL, &data->mnt_path, 1024);
-	if (IS_ERR(p))
-		goto out_err;
-	server->mnt_path = p;
-
-	p = nfs_copy_user_string(server->ip_addr, &data->client_addr,
-			sizeof(server->ip_addr) - 1);
-	if (IS_ERR(p))
-		goto out_err;
-
-	/* We now require that the mount process passes the remote address */
-	if (data->host_addrlen != sizeof(server->addr)) {
-		s = ERR_PTR(-EINVAL);
-		goto out_free;
-	}
-	if (copy_from_user(&server->addr, data->host_addr, sizeof(server->addr))) {
-		s = ERR_PTR(-EFAULT);
-		goto out_free;
-	}
-	if (server->addr.sin_family != AF_INET ||
-	    server->addr.sin_addr.s_addr == INADDR_ANY) {
-		dprintk("%s: mount program didn't pass remote IP address!\n",
-				__FUNCTION__);
-		s = ERR_PTR(-EINVAL);
-		goto out_free;
-	}
-
-	/* Fire up rpciod if not yet running */
-	s = ERR_PTR(rpciod_up());
-	if (IS_ERR(s)) {
-		dprintk("%s: couldn't start rpciod! Error = %ld\n",
-				__FUNCTION__, PTR_ERR(s));
-		goto out_free;
-	}
-
-	s = sget(fs_type, nfs4_compare_super, nfs_set_super, server);
-
-	if (IS_ERR(s) || s->s_root)
-		goto out_free;
-
-	s->s_flags = flags;
-
-	error = nfs4_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
-	if (error) {
-		up_write(&s->s_umount);
-		deactivate_super(s);
-		return ERR_PTR(error);
-	}
-	s->s_flags |= MS_ACTIVE;
-	return s;
-out_err:
-	s = (struct super_block *)p;
-out_free:
-	kfree(server->mnt_path);
-	kfree(server->hostname);
-	kfree(server);
-	return s;
-}
-
-static void nfs4_kill_super(struct super_block *sb)
-{
-	struct nfs_server *server = NFS_SB(sb);
-
-	nfs_return_all_delegations(sb);
-	kill_anon_super(sb);
-
-	nfs4_renewd_prepare_shutdown(server);
-
-	if (server->client != NULL && !IS_ERR(server->client))
-		rpc_shutdown_client(server->client);
-
-	destroy_nfsv4_state(server);
-
-	rpciod_down();
-
-	nfs_free_iostats(server->io_stats);
-	kfree(server->hostname);
-	kfree(server);
-	nfs_release_automount_timer();
-}
-
-static struct file_system_type nfs4_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "nfs4",
-	.get_sb		= nfs4_get_sb,
-	.kill_sb	= nfs4_kill_super,
-	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
-};
-
-static const int nfs_set_port_min = 0;
-static const int nfs_set_port_max = 65535;
-static int param_set_port(const char *val, struct kernel_param *kp)
-{
-	char *endp;
-	int num = simple_strtol(val, &endp, 0);
-	if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max)
-		return -EINVAL;
-	*((int *)kp->arg) = num;
-	return 0;
-}
-
-module_param_call(callback_tcpport, param_set_port, param_get_int,
-		 &nfs_callback_set_tcpport, 0644);
-
-static int param_set_idmap_timeout(const char *val, struct kernel_param *kp)
-{
-	char *endp;
-	int num = simple_strtol(val, &endp, 0);
-	int jif = num * HZ;
-	if (endp == val || *endp || num < 0 || jif < num)
-		return -EINVAL;
-	*((int *)kp->arg) = jif;
-	return 0;
-}
-
-module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int,
-		 &nfs_idmap_cache_timeout, 0644);
-
-/* Constructs the SERVER-side path */
-static inline char *nfs4_path(const struct dentry *dentry, char *buffer, ssize_t buflen)
-{
-	return nfs_path(NFS_SB(dentry->d_sb)->mnt_path, dentry, buffer, buflen);
-}
-
-static inline char *nfs4_dup_path(const struct dentry *dentry)
-{
-	char *page = (char *) __get_free_page(GFP_USER);
-	char *path;
-
-	path = nfs4_path(dentry, page, PAGE_SIZE);
-	if (!IS_ERR(path)) {
-		int len = PAGE_SIZE + page - path;
-		char *tmp = path;
-
-		path = kmalloc(len, GFP_KERNEL);
-		if (path)
-			memcpy(path, tmp, len);
-		else
-			path = ERR_PTR(-ENOMEM);
-	}
-	free_page((unsigned long)page);
-	return path;
-}
-
-static struct super_block *nfs4_clone_sb(struct nfs_server *server, struct nfs_clone_mount *data)
-{
-	const struct dentry *dentry = data->dentry;
-	struct nfs4_client *clp = server->nfs4_state;
-	struct super_block *sb;
-
-	server->fsid = data->fattr->fsid;
-	nfs_copy_fh(&server->fh, data->fh);
-	server->mnt_path = nfs4_dup_path(dentry);
-	if (IS_ERR(server->mnt_path)) {
-		sb = (struct super_block *)server->mnt_path;
-		goto err;
-	}
-	sb = sget(&nfs4_fs_type, nfs4_compare_super, nfs_set_super, server);
-	if (IS_ERR(sb) || sb->s_root)
-		goto free_path;
-	nfs4_server_capabilities(server, &server->fh);
-
-	down_write(&clp->cl_sem);
-	atomic_inc(&clp->cl_count);
-	list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
-	up_write(&clp->cl_sem);
-	return sb;
-free_path:
-	kfree(server->mnt_path);
-err:
-	server->mnt_path = NULL;
-	return sb;
-}
-
-static struct super_block *nfs_clone_nfs4_sb(struct file_system_type *fs_type,
-		int flags, const char *dev_name, void *raw_data)
-{
-	struct nfs_clone_mount *data = raw_data;
-	return nfs_clone_generic_sb(data, nfs4_clone_sb, nfs_clone_server);
-}
-
-static struct file_system_type clone_nfs4_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "nfs4",
-	.get_sb		= nfs_clone_nfs4_sb,
-	.kill_sb	= nfs4_kill_super,
-	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
-};
-
-static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, char *devname, struct nfs_clone_mount *mountdata)
-{
-	struct vfsmount *mnt = NULL;
-	switch (server->rpc_ops->version) {
-		case 2:
-		case 3:
-			mnt = vfs_kern_mount(&clone_nfs_fs_type, 0, devname, mountdata);
-			break;
-		case 4:
-			mnt = vfs_kern_mount(&clone_nfs4_fs_type, 0, devname, mountdata);
-	}
-	return mnt;
-}
-
-#define nfs4_init_once(nfsi) \
-	do { \
-		INIT_LIST_HEAD(&(nfsi)->open_states); \
-		nfsi->delegation = NULL; \
-		nfsi->delegation_state = 0; \
-		init_rwsem(&nfsi->rwsem); \
-	} while(0)
-
-static inline int register_nfs4fs(void)
-{
-	int ret;
-
-	ret = nfs_register_sysctl();
-	if (ret != 0)
-		return ret;
-	ret = register_filesystem(&nfs4_fs_type);
-	if (ret != 0)
-		nfs_unregister_sysctl();
-	return ret;
-}
-
-static inline void unregister_nfs4fs(void)
-{
-	unregister_filesystem(&nfs4_fs_type);
-	nfs_unregister_sysctl();
-}
-#else
-#define nfs4_fill_sb(a,b)	ERR_PTR(-EINVAL)
-#define nfs4_fill_super(a,b)	ERR_PTR(-EINVAL)
-#define nfs4_init_once(nfsi) \
-	do { } while (0)
-#define register_nfs4fs() (0)
-#define unregister_nfs4fs()
-static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, char *devname, struct nfs_clone_mount *mountdata)
-{
-	return vfs_kern_mount(&clone_nfs_fs_type, 0, devname, mountdata);
-}
 #endif
 
-static inline char *nfs_devname(const struct vfsmount *mnt_parent,
-			 const struct dentry *dentry,
-			 char *buffer, ssize_t buflen)
-{
-	return nfs_path(mnt_parent->mnt_devname, dentry, buffer, buflen);
-}
-
-/**
- * nfs_do_submount - set up mountpoint when crossing a filesystem boundary
- * @mnt_parent - mountpoint of parent directory
- * @dentry - parent directory
- * @fh - filehandle for new root dentry
- * @fattr - attributes for new root inode
- *
- */
-struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
-		const struct dentry *dentry, struct nfs_fh *fh,
-		struct nfs_fattr *fattr)
-{
-	struct nfs_clone_mount mountdata = {
-		.sb = mnt_parent->mnt_sb,
-		.dentry = dentry,
-		.fh = fh,
-		.fattr = fattr,
-	};
-	struct vfsmount *mnt = ERR_PTR(-ENOMEM);
-	char *page = (char *) __get_free_page(GFP_USER);
-	char *devname;
-
-	dprintk("%s: submounting on %s/%s\n", __FUNCTION__,
-			dentry->d_parent->d_name.name,
-			dentry->d_name.name);
-	if (page == NULL)
-		goto out;
-	devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE);
-	mnt = (struct vfsmount *)devname;
-	if (IS_ERR(devname))
-		goto free_page;
-	mnt = nfs_do_clone_mount(NFS_SB(mnt_parent->mnt_sb), devname, &mountdata);
-free_page:
-	free_page((unsigned long)page);
-out:
-	dprintk("%s: done\n", __FUNCTION__);
-	return mnt;
-}
-
-#ifdef CONFIG_NFS_V4
-/* Check if fs_root is valid */
-static inline char *nfs4_pathname_string(struct nfs4_pathname *pathname, char *buffer, ssize_t buflen)
-{
-	char *end = buffer + buflen;
-	int n;
-
-	*--end = '\0';
-	buflen--;
-
-	n = pathname->ncomponents;
-	while (--n >= 0) {
-		struct nfs4_string *component = &pathname->components[n];
-		buflen -= component->len + 1;
-		if (buflen < 0)
-			goto Elong;
-		end -= component->len;
-		memcpy(end, component->data, component->len);
-		*--end = '/';
-	}
-	return end;
-Elong:
-	return ERR_PTR(-ENAMETOOLONG);
-}
-
-/* Check if the string represents a "valid" IPv4 address */
-static inline int valid_ipaddr4(const char *buf)
-{
-	int rc, count, in[4];
-
-	rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]);
-	if (rc != 4)
-		return -EINVAL;
-	for (count = 0; count < 4; count++) {
-		if (in[count] > 255)
-			return -EINVAL;
-	}
-	return 0;
-}
-
-static struct super_block *nfs4_referral_sb(struct nfs_server *server, struct nfs_clone_mount *data)
-{
-	struct super_block *sb = ERR_PTR(-ENOMEM);
-	int len;
-
-	len = strlen(data->mnt_path) + 1;
-	server->mnt_path = kmalloc(len, GFP_KERNEL);
-	if (server->mnt_path == NULL)
-		goto err;
-	memcpy(server->mnt_path, data->mnt_path, len);
-	memcpy(&server->addr, data->addr, sizeof(struct sockaddr_in));
-
-	sb = sget(&nfs4_fs_type, nfs4_compare_super, nfs_set_super, server);
-	if (IS_ERR(sb) || sb->s_root)
-		goto free_path;
-	return sb;
-free_path:
-	kfree(server->mnt_path);
-err:
-	server->mnt_path = NULL;
-	return sb;
-}
-
-static struct nfs_server *nfs4_referral_server(struct super_block *sb, struct nfs_clone_mount *data)
-{
-	struct nfs_server *server = NFS_SB(sb);
-	struct rpc_timeout timeparms;
-	int proto, timeo, retrans;
-	void *err;
-
-	proto = IPPROTO_TCP;
-	/* Since we are following a referral and there may be alternatives,
-	   set the timeouts and retries to low values */
-	timeo = 2;
-	retrans = 1;
-	nfs_init_timeout_values(&timeparms, proto, timeo, retrans);
-
-	server->client = nfs4_create_client(server, &timeparms, proto, data->authflavor);
-	if (IS_ERR((err = server->client)))
-		goto out_err;
-
-	sb->s_time_gran = 1;
-	sb->s_op = &nfs4_sops;
-	err = ERR_PTR(nfs_sb_init(sb, data->authflavor));
-	if (!IS_ERR(err))
-		return server;
-out_err:
-	return (struct nfs_server *)err;
-}
-
-static struct super_block *nfs_referral_nfs4_sb(struct file_system_type *fs_type,
-		int flags, const char *dev_name, void *raw_data)
-{
-	struct nfs_clone_mount *data = raw_data;
-	return nfs_clone_generic_sb(data, nfs4_referral_sb, nfs4_referral_server);
-}
-
-static struct file_system_type nfs_referral_nfs4_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "nfs4",
-	.get_sb		= nfs_referral_nfs4_sb,
-	.kill_sb	= nfs4_kill_super,
-	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
-};
-
-/**
- * nfs_follow_referral - set up mountpoint when hitting a referral on moved error
- * @mnt_parent - mountpoint of parent directory
- * @dentry - parent directory
- * @fspath - fs path returned in fs_locations
- * @mntpath - mount path to new server
- * @hostname - hostname of new server
- * @addr - host addr of new server
- *
- */
-struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
-	     const struct dentry *dentry, struct nfs4_fs_locations *locations)
-{
-	struct vfsmount *mnt = ERR_PTR(-ENOENT);
-	struct nfs_clone_mount mountdata = {
-		.sb = mnt_parent->mnt_sb,
-		.dentry = dentry,
-		.authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor,
-	};
-	char *page, *page2;
-	char *path, *fs_path;
-	char *devname;
-	int loc, s;
-
-	if (locations == NULL || locations->nlocations <= 0)
-		goto out;
-
-	dprintk("%s: referral at %s/%s\n", __FUNCTION__,
-		dentry->d_parent->d_name.name, dentry->d_name.name);
-
-	/* Ensure fs path is a prefix of current dentry path */
-	page = (char *) __get_free_page(GFP_USER);
-	if (page == NULL)
-		goto out;
-	page2 = (char *) __get_free_page(GFP_USER);
-	if (page2 == NULL)
-		goto out;
-
-	path = nfs4_path(dentry, page, PAGE_SIZE);
-	if (IS_ERR(path))
-		goto out_free;
-
-	fs_path = nfs4_pathname_string(&locations->fs_path, page2, PAGE_SIZE);
-	if (IS_ERR(fs_path))
-		goto out_free;
-
-	if (strncmp(path, fs_path, strlen(fs_path)) != 0) {
-		dprintk("%s: path %s does not begin with fsroot %s\n", __FUNCTION__, path, fs_path);
-		goto out_free;
-	}
-
-	devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE);
-	if (IS_ERR(devname)) {
-		mnt = (struct vfsmount *)devname;
-		goto out_free;
-	}
-
-	loc = 0;
-	while (loc < locations->nlocations && IS_ERR(mnt)) {
-		struct nfs4_fs_location *location = &locations->locations[loc];
-		char *mnt_path;
-
-		if (location == NULL || location->nservers <= 0 ||
-		    location->rootpath.ncomponents == 0) {
-			loc++;
-			continue;
-		}
-
-		mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
-		if (IS_ERR(mnt_path)) {
-			loc++;
-			continue;
-		}
-		mountdata.mnt_path = mnt_path;
-
-		s = 0;
-		while (s < location->nservers) {
-			struct sockaddr_in addr = {};
-
-			if (location->servers[s].len <= 0 ||
-			    valid_ipaddr4(location->servers[s].data) < 0) {
-				s++;
-				continue;
-			}
-
-			mountdata.hostname = location->servers[s].data;
-			addr.sin_addr.s_addr = in_aton(mountdata.hostname);
-			addr.sin_family = AF_INET;
-			addr.sin_port = htons(NFS_PORT);
-			mountdata.addr = &addr;
-
-			mnt = vfs_kern_mount(&nfs_referral_nfs4_fs_type, 0, devname, &mountdata);
-			if (!IS_ERR(mnt)) {
-				break;
-			}
-			s++;
-		}
-		loc++;
-	}
-
-out_free:
-	free_page((unsigned long)page);
-	free_page((unsigned long)page2);
-out:
-	dprintk("%s: done\n", __FUNCTION__);
-	return mnt;
-}
-
-/*
- * nfs_do_refmount - handle crossing a referral on server
- * @dentry - dentry of referral
- * @nd - nameidata info
- *
- */
-struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry)
-{
-	struct vfsmount *mnt = ERR_PTR(-ENOENT);
-	struct dentry *parent;
-	struct nfs4_fs_locations *fs_locations = NULL;
-	struct page *page;
-	int err;
-
-	/* BUG_ON(IS_ROOT(dentry)); */
-	dprintk("%s: enter\n", __FUNCTION__);
-
-	page = alloc_page(GFP_KERNEL);
-	if (page == NULL)
-		goto out;
-
-	fs_locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
-	if (fs_locations == NULL)
-		goto out_free;
-
-	/* Get locations */
-	parent = dget_parent(dentry);
-	dprintk("%s: getting locations for %s/%s\n", __FUNCTION__, parent->d_name.name, dentry->d_name.name);
-	err = nfs4_proc_fs_locations(parent->d_inode, dentry, fs_locations, page);
-	dput(parent);
-	if (err != 0 || fs_locations->nlocations <= 0 ||
-	    fs_locations->fs_path.ncomponents <= 0)
-		goto out_free;
-
-	mnt = nfs_follow_referral(mnt_parent, dentry, fs_locations);
-out_free:
-	__free_page(page);
-	kfree(fs_locations);
-out:
-	dprintk("%s: done\n", __FUNCTION__);
-	return mnt;
-}
-#else
-struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry)
-{
-	return ERR_PTR(-ENOENT);
-}
-#endif
-
-extern int nfs_init_nfspagecache(void);
-extern void nfs_destroy_nfspagecache(void);
-extern int nfs_init_readpagecache(void);
-extern void nfs_destroy_readpagecache(void);
-extern int nfs_init_writepagecache(void);
-extern void nfs_destroy_writepagecache(void);
-#ifdef CONFIG_NFS_DIRECTIO
-extern int nfs_init_directcache(void);
-extern void nfs_destroy_directcache(void);
-#endif
-
-static kmem_cache_t * nfs_inode_cachep;
-
-static struct inode *nfs_alloc_inode(struct super_block *sb)
+struct inode *nfs_alloc_inode(struct super_block *sb)
 {
 	struct nfs_inode *nfsi;
 	nfsi = (struct nfs_inode *)kmem_cache_alloc(nfs_inode_cachep, SLAB_KERNEL);
@@ -2837,11 +1084,19 @@ static struct inode *nfs_alloc_inode(struct super_block *sb)
 	return &nfsi->vfs_inode;
 }
 
-static void nfs_destroy_inode(struct inode *inode)
+void nfs_destroy_inode(struct inode *inode)
 {
 	kmem_cache_free(nfs_inode_cachep, NFS_I(inode));
 }
 
+#define nfs4_init_once(nfsi) \
+	do { \
+		INIT_LIST_HEAD(&(nfsi)->open_states); \
+		nfsi->delegation = NULL; \
+		nfsi->delegation_state = 0; \
+		init_rwsem(&nfsi->rwsem); \
+	} while(0)
+
 static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
 {
 	struct nfs_inode *nfsi = (struct nfs_inode *) foo;
@@ -2862,7 +1117,7 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
 	}
 }
  
-static int nfs_init_inodecache(void)
+static int __init nfs_init_inodecache(void)
 {
 	nfs_inode_cachep = kmem_cache_create("nfs_inode_cache",
 					     sizeof(struct nfs_inode),
@@ -2875,7 +1130,7 @@ static int nfs_init_inodecache(void)
 	return 0;
 }
 
-static void nfs_destroy_inodecache(void)
+static void __exit nfs_destroy_inodecache(void)
 {
 	if (kmem_cache_destroy(nfs_inode_cachep))
 		printk(KERN_INFO "nfs_inode_cache: not all structures were freed\n");
@@ -2904,29 +1159,22 @@ static int __init init_nfs_fs(void)
 	if (err)
 		goto out1;
 
-#ifdef CONFIG_NFS_DIRECTIO
 	err = nfs_init_directcache();
 	if (err)
 		goto out0;
-#endif
 
 #ifdef CONFIG_PROC_FS
 	rpc_proc_register(&nfs_rpcstat);
 #endif
-        err = register_filesystem(&nfs_fs_type);
-	if (err)
-		goto out;
-	if ((err = register_nfs4fs()) != 0)
+	if ((err = register_nfs_fs()) != 0)
 		goto out;
 	return 0;
 out:
 #ifdef CONFIG_PROC_FS
 	rpc_proc_unregister("nfs");
 #endif
-#ifdef CONFIG_NFS_DIRECTIO
 	nfs_destroy_directcache();
 out0:
-#endif
 	nfs_destroy_writepagecache();
 out1:
 	nfs_destroy_readpagecache();
@@ -2940,9 +1188,7 @@ out4:
 
 static void __exit exit_nfs_fs(void)
 {
-#ifdef CONFIG_NFS_DIRECTIO
 	nfs_destroy_directcache();
-#endif
 	nfs_destroy_writepagecache();
 	nfs_destroy_readpagecache();
 	nfs_destroy_inodecache();
@@ -2950,8 +1196,7 @@ static void __exit exit_nfs_fs(void)
 #ifdef CONFIG_PROC_FS
 	rpc_proc_unregister("nfs");
 #endif
-	unregister_filesystem(&nfs_fs_type);
-	unregister_nfs4fs();
+	unregister_nfs_fs();
 }
 
 /* Not quite true; I just maintain it */
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
new file mode 100644
index 00000000000..5e51c4535b6
--- /dev/null
+++ b/fs/nfs/internal.h
@@ -0,0 +1,179 @@
+/*
+ * NFS internal definitions
+ */
+
+#include <linux/mount.h>
+
+struct nfs_clone_mount {
+	const struct super_block *sb;
+	const struct dentry *dentry;
+	struct nfs_fh *fh;
+	struct nfs_fattr *fattr;
+	char *hostname;
+	char *mnt_path;
+	struct sockaddr_in *addr;
+	rpc_authflavor_t authflavor;
+};
+
+/* namespace-nfs4.c */
+#ifdef CONFIG_NFS_V4
+extern struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry);
+#else
+static inline
+struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry)
+{
+	return ERR_PTR(-ENOENT);
+}
+#endif
+
+/* callback_xdr.c */
+extern struct svc_version nfs4_callback_version1;
+
+/* pagelist.c */
+extern int __init nfs_init_nfspagecache(void);
+extern void __exit nfs_destroy_nfspagecache(void);
+extern int __init nfs_init_readpagecache(void);
+extern void __exit nfs_destroy_readpagecache(void);
+extern int __init nfs_init_writepagecache(void);
+extern void __exit nfs_destroy_writepagecache(void);
+
+#ifdef CONFIG_NFS_DIRECTIO
+extern int __init nfs_init_directcache(void);
+extern void __exit nfs_destroy_directcache(void);
+#else
+#define nfs_init_directcache() (0)
+#define nfs_destroy_directcache() do {} while(0)
+#endif
+
+/* nfs2xdr.c */
+extern struct rpc_procinfo nfs_procedures[];
+extern u32 * nfs_decode_dirent(u32 *, struct nfs_entry *, int);
+
+/* nfs3xdr.c */
+extern struct rpc_procinfo nfs3_procedures[];
+extern u32 *nfs3_decode_dirent(u32 *, struct nfs_entry *, int);
+
+/* nfs4xdr.c */
+extern int nfs_stat_to_errno(int);
+extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus);
+
+/* nfs4proc.c */
+extern struct rpc_procinfo nfs4_procedures[];
+
+extern int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
+				  struct nfs4_fs_locations *fs_locations,
+				  struct page *page);
+
+/* inode.c */
+extern struct inode *nfs_alloc_inode(struct super_block *sb);
+extern void nfs_destroy_inode(struct inode *);
+extern int nfs_write_inode(struct inode *,int);
+extern void nfs_clear_inode(struct inode *);
+#ifdef CONFIG_NFS_V4
+extern void nfs4_clear_inode(struct inode *);
+#endif
+
+/* super.c */
+extern struct file_system_type nfs_referral_nfs4_fs_type;
+extern struct file_system_type clone_nfs_fs_type;
+#ifdef CONFIG_NFS_V4
+extern struct file_system_type clone_nfs4_fs_type;
+#endif
+#ifdef CONFIG_PROC_FS
+extern struct rpc_stat nfs_rpcstat;
+#endif
+extern int __init register_nfs_fs(void);
+extern void __exit unregister_nfs_fs(void);
+
+/* namespace.c */
+extern char *nfs_path(const char *base, const struct dentry *dentry,
+		      char *buffer, ssize_t buflen);
+
+/*
+ * Determine the mount path as a string
+ */
+static inline char *nfs4_path(const struct dentry *dentry, char *buffer, ssize_t buflen)
+{
+	return nfs_path(NFS_SB(dentry->d_sb)->mnt_path, dentry, buffer, buflen);
+}
+
+/*
+ * Determine the device name as a string
+ */
+static inline char *nfs_devname(const struct vfsmount *mnt_parent,
+			 const struct dentry *dentry,
+			 char *buffer, ssize_t buflen)
+{
+	return nfs_path(mnt_parent->mnt_devname, dentry, buffer, buflen);
+}
+
+/*
+ * Determine the actual block size (and log2 thereof)
+ */
+static inline
+unsigned long nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp)
+{
+	/* make sure blocksize is a power of two */
+	if ((bsize & (bsize - 1)) || nrbitsp) {
+		unsigned char	nrbits;
+
+		for (nrbits = 31; nrbits && !(bsize & (1 << nrbits)); nrbits--)
+			;
+		bsize = 1 << nrbits;
+		if (nrbitsp)
+			*nrbitsp = nrbits;
+	}
+
+	return bsize;
+}
+
+/*
+ * Calculate the number of 512byte blocks used.
+ */
+static inline unsigned long nfs_calc_block_size(u64 tsize)
+{
+	loff_t used = (tsize + 511) >> 9;
+	return (used > ULONG_MAX) ? ULONG_MAX : used;
+}
+
+/*
+ * Compute and set NFS server blocksize
+ */
+static inline
+unsigned long nfs_block_size(unsigned long bsize, unsigned char *nrbitsp)
+{
+	if (bsize < NFS_MIN_FILE_IO_SIZE)
+		bsize = NFS_DEF_FILE_IO_SIZE;
+	else if (bsize >= NFS_MAX_FILE_IO_SIZE)
+		bsize = NFS_MAX_FILE_IO_SIZE;
+
+	return nfs_block_bits(bsize, nrbitsp);
+}
+
+/*
+ * Determine the maximum file size for a superblock
+ */
+static inline
+void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
+{
+	sb->s_maxbytes = (loff_t)maxfilesize;
+	if (sb->s_maxbytes > MAX_LFS_FILESIZE || sb->s_maxbytes <= 0)
+		sb->s_maxbytes = MAX_LFS_FILESIZE;
+}
+
+/*
+ * Check if the string represents a "valid" IPv4 address
+ */
+static inline int valid_ipaddr4(const char *buf)
+{
+	int rc, count, in[4];
+
+	rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]);
+	if (rc != 4)
+		return -EINVAL;
+	for (count = 0; count < 4; count++) {
+		if (in[count] > 255)
+			return -EINVAL;
+	}
+	return 0;
+}
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 8ca44b7b25c..19b98ca468e 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -15,14 +15,63 @@
 #include <linux/string.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/vfs.h>
+#include "internal.h"
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
 
-LIST_HEAD(nfs_automount_list);
 static void nfs_expire_automounts(void *list);
+
+LIST_HEAD(nfs_automount_list);
 static DECLARE_WORK(nfs_automount_task, nfs_expire_automounts, &nfs_automount_list);
 int nfs_mountpoint_expiry_timeout = 500 * HZ;
 
+/*
+ * nfs_path - reconstruct the path given an arbitrary dentry
+ * @base - arbitrary string to prepend to the path
+ * @dentry - pointer to dentry
+ * @buffer - result buffer
+ * @buflen - length of buffer
+ *
+ * Helper function for constructing the path from the
+ * root dentry to an arbitrary hashed dentry.
+ *
+ * This is mainly for use in figuring out the path on the
+ * server side when automounting on top of an existing partition.
+ */
+char *nfs_path(const char *base, const struct dentry *dentry,
+	       char *buffer, ssize_t buflen)
+{
+	char *end = buffer+buflen;
+	int namelen;
+
+	*--end = '\0';
+	buflen--;
+	spin_lock(&dcache_lock);
+	while (!IS_ROOT(dentry)) {
+		namelen = dentry->d_name.len;
+		buflen -= namelen + 1;
+		if (buflen < 0)
+			goto Elong;
+		end -= namelen;
+		memcpy(end, dentry->d_name.name, namelen);
+		*--end = '/';
+		dentry = dentry->d_parent;
+	}
+	spin_unlock(&dcache_lock);
+	namelen = strlen(base);
+	/* Strip off excess slashes in base string */
+	while (namelen > 0 && base[namelen - 1] == '/')
+		namelen--;
+	buflen -= namelen;
+	if (buflen < 0)
+		goto Elong;
+	end -= namelen;
+	memcpy(end, base, namelen);
+	return end;
+Elong:
+	return ERR_PTR(-ENAMETOOLONG);
+}
+
 /*
  * nfs_follow_mountpoint - handle crossing a mountpoint on the server
  * @dentry - dentry of mountpoint
@@ -117,3 +166,64 @@ void nfs_release_automount_timer(void)
 		flush_scheduled_work();
 	}
 }
+
+/*
+ * Clone a mountpoint of the appropriate type
+ */
+static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, char *devname,
+					   struct nfs_clone_mount *mountdata)
+{
+#ifdef CONFIG_NFS_V4
+	struct vfsmount *mnt = NULL;
+	switch (server->rpc_ops->version) {
+		case 2:
+		case 3:
+			mnt = vfs_kern_mount(&clone_nfs_fs_type, 0, devname, mountdata);
+			break;
+		case 4:
+			mnt = vfs_kern_mount(&clone_nfs4_fs_type, 0, devname, mountdata);
+	}
+	return mnt;
+#else
+	return vfs_kern_mount(&clone_nfs_fs_type, 0, devname, mountdata);
+#endif
+}
+
+/**
+ * nfs_do_submount - set up mountpoint when crossing a filesystem boundary
+ * @mnt_parent - mountpoint of parent directory
+ * @dentry - parent directory
+ * @fh - filehandle for new root dentry
+ * @fattr - attributes for new root inode
+ *
+ */
+struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
+		const struct dentry *dentry, struct nfs_fh *fh,
+		struct nfs_fattr *fattr)
+{
+	struct nfs_clone_mount mountdata = {
+		.sb = mnt_parent->mnt_sb,
+		.dentry = dentry,
+		.fh = fh,
+		.fattr = fattr,
+	};
+	struct vfsmount *mnt = ERR_PTR(-ENOMEM);
+	char *page = (char *) __get_free_page(GFP_USER);
+	char *devname;
+
+	dprintk("%s: submounting on %s/%s\n", __FUNCTION__,
+			dentry->d_parent->d_name.name,
+			dentry->d_name.name);
+	if (page == NULL)
+		goto out;
+	devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE);
+	mnt = (struct vfsmount *)devname;
+	if (IS_ERR(devname))
+		goto free_page;
+	mnt = nfs_do_clone_mount(NFS_SB(mnt_parent->mnt_sb), devname, &mountdata);
+free_page:
+	free_page((unsigned long)page);
+out:
+	dprintk("%s: done\n", __FUNCTION__);
+	return mnt;
+}
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index a7ed88f97a1..4a006f81666 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -27,8 +27,6 @@
 #define NFSDBG_FACILITY		NFSDBG_XDR
 /* #define NFS_PARANOIA 1 */
 
-extern int			nfs_stat_to_errno(int stat);
-
 /* Mapping from NFS error code to "errno" error code. */
 #define errno_NFSERR_IO		EIO
 
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index cf186f0d2b3..7143b1f82ce 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -20,11 +20,10 @@
 #include <linux/nfs_mount.h>
 
 #include "iostat.h"
+#include "internal.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PROC
 
-extern struct rpc_procinfo nfs3_procedures[];
-
 /* A wrapper to handle the EJUKEBOX error message */
 static int
 nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
@@ -809,8 +808,6 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
 	return status;
 }
 
-extern u32 *nfs3_decode_dirent(u32 *, struct nfs_entry *, int);
-
 static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data)
 {
 	if (nfs3_async_handle_jukebox(task, data->inode))
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index f70eee2cac0..0250269e975 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -22,14 +22,13 @@
 #include <linux/nfs3.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfsacl.h>
+#include "internal.h"
 
 #define NFSDBG_FACILITY		NFSDBG_XDR
 
 /* Mapping from NFS error code to "errno" error code. */
 #define errno_NFSERR_IO		EIO
 
-extern int			nfs_stat_to_errno(int);
-
 /*
  * Declare the space requirements for NFS arguments and replies as
  * number of 32bit-words
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
new file mode 100644
index 00000000000..ea38d27b74e
--- /dev/null
+++ b/fs/nfs/nfs4namespace.c
@@ -0,0 +1,201 @@
+/*
+ * linux/fs/nfs/nfs4namespace.c
+ *
+ * Copyright (C) 2005 Trond Myklebust <Trond.Myklebust@netapp.com>
+ *
+ * NFSv4 namespace
+ */
+
+#include <linux/config.h>
+
+#include <linux/dcache.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/nfs_fs.h>
+#include <linux/string.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/vfs.h>
+#include <linux/inet.h>
+#include "internal.h"
+
+#define NFSDBG_FACILITY		NFSDBG_VFS
+
+/*
+ * Check if fs_root is valid
+ */
+static inline char *nfs4_pathname_string(struct nfs4_pathname *pathname,
+					 char *buffer, ssize_t buflen)
+{
+	char *end = buffer + buflen;
+	int n;
+
+	*--end = '\0';
+	buflen--;
+
+	n = pathname->ncomponents;
+	while (--n >= 0) {
+		struct nfs4_string *component = &pathname->components[n];
+		buflen -= component->len + 1;
+		if (buflen < 0)
+			goto Elong;
+		end -= component->len;
+		memcpy(end, component->data, component->len);
+		*--end = '/';
+	}
+	return end;
+Elong:
+	return ERR_PTR(-ENAMETOOLONG);
+}
+
+
+/**
+ * nfs_follow_referral - set up mountpoint when hitting a referral on moved error
+ * @mnt_parent - mountpoint of parent directory
+ * @dentry - parent directory
+ * @fspath - fs path returned in fs_locations
+ * @mntpath - mount path to new server
+ * @hostname - hostname of new server
+ * @addr - host addr of new server
+ *
+ */
+static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
+					    const struct dentry *dentry,
+					    struct nfs4_fs_locations *locations)
+{
+	struct vfsmount *mnt = ERR_PTR(-ENOENT);
+	struct nfs_clone_mount mountdata = {
+		.sb = mnt_parent->mnt_sb,
+		.dentry = dentry,
+		.authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor,
+	};
+	char *page, *page2;
+	char *path, *fs_path;
+	char *devname;
+	int loc, s;
+
+	if (locations == NULL || locations->nlocations <= 0)
+		goto out;
+
+	dprintk("%s: referral at %s/%s\n", __FUNCTION__,
+		dentry->d_parent->d_name.name, dentry->d_name.name);
+
+	/* Ensure fs path is a prefix of current dentry path */
+	page = (char *) __get_free_page(GFP_USER);
+	if (page == NULL)
+		goto out;
+	page2 = (char *) __get_free_page(GFP_USER);
+	if (page2 == NULL)
+		goto out;
+
+	path = nfs4_path(dentry, page, PAGE_SIZE);
+	if (IS_ERR(path))
+		goto out_free;
+
+	fs_path = nfs4_pathname_string(&locations->fs_path, page2, PAGE_SIZE);
+	if (IS_ERR(fs_path))
+		goto out_free;
+
+	if (strncmp(path, fs_path, strlen(fs_path)) != 0) {
+		dprintk("%s: path %s does not begin with fsroot %s\n", __FUNCTION__, path, fs_path);
+		goto out_free;
+	}
+
+	devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE);
+	if (IS_ERR(devname)) {
+		mnt = (struct vfsmount *)devname;
+		goto out_free;
+	}
+
+	loc = 0;
+	while (loc < locations->nlocations && IS_ERR(mnt)) {
+		struct nfs4_fs_location *location = &locations->locations[loc];
+		char *mnt_path;
+
+		if (location == NULL || location->nservers <= 0 ||
+		    location->rootpath.ncomponents == 0) {
+			loc++;
+			continue;
+		}
+
+		mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
+		if (IS_ERR(mnt_path)) {
+			loc++;
+			continue;
+		}
+		mountdata.mnt_path = mnt_path;
+
+		s = 0;
+		while (s < location->nservers) {
+			struct sockaddr_in addr = {};
+
+			if (location->servers[s].len <= 0 ||
+			    valid_ipaddr4(location->servers[s].data) < 0) {
+				s++;
+				continue;
+			}
+
+			mountdata.hostname = location->servers[s].data;
+			addr.sin_addr.s_addr = in_aton(mountdata.hostname);
+			addr.sin_family = AF_INET;
+			addr.sin_port = htons(NFS_PORT);
+			mountdata.addr = &addr;
+
+			mnt = vfs_kern_mount(&nfs_referral_nfs4_fs_type, 0, devname, &mountdata);
+			if (!IS_ERR(mnt)) {
+				break;
+			}
+			s++;
+		}
+		loc++;
+	}
+
+out_free:
+	free_page((unsigned long)page);
+	free_page((unsigned long)page2);
+out:
+	dprintk("%s: done\n", __FUNCTION__);
+	return mnt;
+}
+
+/*
+ * nfs_do_refmount - handle crossing a referral on server
+ * @dentry - dentry of referral
+ * @nd - nameidata info
+ *
+ */
+struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry)
+{
+	struct vfsmount *mnt = ERR_PTR(-ENOENT);
+	struct dentry *parent;
+	struct nfs4_fs_locations *fs_locations = NULL;
+	struct page *page;
+	int err;
+
+	/* BUG_ON(IS_ROOT(dentry)); */
+	dprintk("%s: enter\n", __FUNCTION__);
+
+	page = alloc_page(GFP_KERNEL);
+	if (page == NULL)
+		goto out;
+
+	fs_locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
+	if (fs_locations == NULL)
+		goto out_free;
+
+	/* Get locations */
+	parent = dget_parent(dentry);
+	dprintk("%s: getting locations for %s/%s\n", __FUNCTION__, parent->d_name.name, dentry->d_name.name);
+	err = nfs4_proc_fs_locations(parent->d_inode, dentry, fs_locations, page);
+	dput(parent);
+	if (err != 0 || fs_locations->nlocations <= 0 ||
+	    fs_locations->fs_path.ncomponents <= 0)
+		goto out_free;
+
+	mnt = nfs_follow_referral(mnt_parent, dentry, fs_locations);
+out_free:
+	__free_page(page);
+	kfree(fs_locations);
+out:
+	dprintk("%s: done\n", __FUNCTION__);
+	return mnt;
+}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 3300e35d74a..b4916b09219 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -65,8 +65,6 @@ static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *)
 static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry);
 static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception);
 static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs4_client *clp);
-extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus);
-extern struct rpc_procinfo nfs4_procedures[];
 
 /* Prevent leaks of NFSv4 errors into userland */
 int nfs4_map_errors(int err)
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 656481c0daa..ef9429643eb 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -378,7 +378,7 @@ out:
 	return res;
 }
 
-int nfs_init_nfspagecache(void)
+int __init nfs_init_nfspagecache(void)
 {
 	nfs_page_cachep = kmem_cache_create("nfs_page",
 					    sizeof(struct nfs_page),
@@ -390,7 +390,7 @@ int nfs_init_nfspagecache(void)
 	return 0;
 }
 
-void nfs_destroy_nfspagecache(void)
+void __exit nfs_destroy_nfspagecache(void)
 {
 	if (kmem_cache_destroy(nfs_page_cachep))
 		printk(KERN_INFO "nfs_page: not all structures were freed\n");
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 9dd85cac2df..b3899ea3229 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -44,11 +44,10 @@
 #include <linux/nfs_page.h>
 #include <linux/lockd/bind.h>
 #include <linux/smp_lock.h>
+#include "internal.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PROC
 
-extern struct rpc_procinfo nfs_procedures[];
-
 /*
  * Bare-bones access to getattr: this is for nfs_read_super.
  */
@@ -611,8 +610,6 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
 	return 0;
 }
 
-extern u32 * nfs_decode_dirent(u32 *, struct nfs_entry *, int);
-
 static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data)
 {
 	if (task->tk_status >= 0) {
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index fd9018c692b..41c2ffee24f 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -694,7 +694,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
 	return ret;
 }
 
-int nfs_init_readpagecache(void)
+int __init nfs_init_readpagecache(void)
 {
 	nfs_rdata_cachep = kmem_cache_create("nfs_read_data",
 					     sizeof(struct nfs_read_data),
@@ -711,7 +711,7 @@ int nfs_init_readpagecache(void)
 	return 0;
 }
 
-void nfs_destroy_readpagecache(void)
+void __exit nfs_destroy_readpagecache(void)
 {
 	mempool_destroy(nfs_rdata_mempool);
 	if (kmem_cache_destroy(nfs_rdata_cachep))
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
new file mode 100644
index 00000000000..4acd3ee9642
--- /dev/null
+++ b/fs/nfs/super.c
@@ -0,0 +1,1468 @@
+/*
+ *  linux/fs/nfs/super.c
+ *
+ *  Copyright (C) 1992  Rick Sladkey
+ *
+ *  nfs superblock handling functions
+ *
+ *  Modularised by Alan Cox <Alan.Cox@linux.org>, while hacking some
+ *  experimental NFS changes. Modularisation taken straight from SYS5 fs.
+ *
+ *  Change to nfs_read_super() to permit NFS mounts to multi-homed hosts.
+ *  J.S.Peatfield@damtp.cam.ac.uk
+ *
+ *  Split from inode.c by David Howells <dhowells@redhat.com>
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/metrics.h>
+#include <linux/nfs_fs.h>
+#include <linux/nfs_mount.h>
+#include <linux/nfs4_mount.h>
+#include <linux/lockd/bind.h>
+#include <linux/smp_lock.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/nfs_idmap.h>
+#include <linux/vfs.h>
+#include <linux/inet.h>
+#include <linux/nfs_xdr.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include "nfs4_fs.h"
+#include "callback.h"
+#include "delegation.h"
+#include "iostat.h"
+#include "internal.h"
+
+#define NFSDBG_FACILITY		NFSDBG_VFS
+
+/* Maximum number of readahead requests
+ * FIXME: this should really be a sysctl so that users may tune it to suit
+ *        their needs. People that do NFS over a slow network, might for
+ *        instance want to reduce it to something closer to 1 for improved
+ *        interactive response.
+ */
+#define NFS_MAX_READAHEAD	(RPC_DEF_SLOT_TABLE - 1)
+
+/*
+ * RPC cruft for NFS
+ */
+static struct rpc_version * nfs_version[] = {
+	NULL,
+	NULL,
+	&nfs_version2,
+#if defined(CONFIG_NFS_V3)
+	&nfs_version3,
+#elif defined(CONFIG_NFS_V4)
+	NULL,
+#endif
+#if defined(CONFIG_NFS_V4)
+	&nfs_version4,
+#endif
+};
+
+static struct rpc_program nfs_program = {
+	.name			= "nfs",
+	.number			= NFS_PROGRAM,
+	.nrvers			= ARRAY_SIZE(nfs_version),
+	.version		= nfs_version,
+	.stats			= &nfs_rpcstat,
+	.pipe_dir_name		= "/nfs",
+};
+
+struct rpc_stat nfs_rpcstat = {
+	.program		= &nfs_program
+};
+
+
+#ifdef CONFIG_NFS_V3_ACL
+static struct rpc_stat		nfsacl_rpcstat = { &nfsacl_program };
+static struct rpc_version *	nfsacl_version[] = {
+	[3]			= &nfsacl_version3,
+};
+
+struct rpc_program		nfsacl_program = {
+	.name =			"nfsacl",
+	.number =		NFS_ACL_PROGRAM,
+	.nrvers =		ARRAY_SIZE(nfsacl_version),
+	.version =		nfsacl_version,
+	.stats =		&nfsacl_rpcstat,
+};
+#endif  /* CONFIG_NFS_V3_ACL */
+
+static void nfs_umount_begin(struct vfsmount *, int);
+static int  nfs_statfs(struct super_block *, struct kstatfs *);
+static int  nfs_show_options(struct seq_file *, struct vfsmount *);
+static int  nfs_show_stats(struct seq_file *, struct vfsmount *);
+static struct super_block *nfs_get_sb(struct file_system_type *, int, const char *, void *);
+static struct super_block *nfs_clone_nfs_sb(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *raw_data);
+static void nfs_kill_super(struct super_block *);
+
+static struct file_system_type nfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "nfs",
+	.get_sb		= nfs_get_sb,
+	.kill_sb	= nfs_kill_super,
+	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+};
+
+struct file_system_type clone_nfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "nfs",
+	.get_sb		= nfs_clone_nfs_sb,
+	.kill_sb	= nfs_kill_super,
+	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+};
+
+static struct super_operations nfs_sops = {
+	.alloc_inode	= nfs_alloc_inode,
+	.destroy_inode	= nfs_destroy_inode,
+	.write_inode	= nfs_write_inode,
+	.statfs		= nfs_statfs,
+	.clear_inode	= nfs_clear_inode,
+	.umount_begin	= nfs_umount_begin,
+	.show_options	= nfs_show_options,
+	.show_stats	= nfs_show_stats,
+};
+
+#ifdef CONFIG_NFS_V4
+static struct super_block *nfs4_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data);
+static struct super_block *nfs_clone_nfs4_sb(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *raw_data);
+static struct super_block *nfs_referral_nfs4_sb(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *raw_data);
+static void nfs4_kill_super(struct super_block *sb);
+
+static struct file_system_type nfs4_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "nfs4",
+	.get_sb		= nfs4_get_sb,
+	.kill_sb	= nfs4_kill_super,
+	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+};
+
+struct file_system_type clone_nfs4_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "nfs4",
+	.get_sb		= nfs_clone_nfs4_sb,
+	.kill_sb	= nfs4_kill_super,
+	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+};
+
+struct file_system_type nfs_referral_nfs4_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "nfs4",
+	.get_sb		= nfs_referral_nfs4_sb,
+	.kill_sb	= nfs4_kill_super,
+	.fs_flags	= FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+};
+
+static struct super_operations nfs4_sops = {
+	.alloc_inode	= nfs_alloc_inode,
+	.destroy_inode	= nfs_destroy_inode,
+	.write_inode	= nfs_write_inode,
+	.statfs		= nfs_statfs,
+	.clear_inode	= nfs4_clear_inode,
+	.umount_begin	= nfs_umount_begin,
+	.show_options	= nfs_show_options,
+	.show_stats	= nfs_show_stats,
+};
+#endif
+
+static const int nfs_set_port_min = 0;
+static const int nfs_set_port_max = 65535;
+
+static int param_set_port(const char *val, struct kernel_param *kp)
+{
+	char *endp;
+	int num = simple_strtol(val, &endp, 0);
+	if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max)
+		return -EINVAL;
+	*((int *)kp->arg) = num;
+	return 0;
+}
+
+module_param_call(callback_tcpport, param_set_port, param_get_int,
+		 &nfs_callback_set_tcpport, 0644);
+
+static int param_set_idmap_timeout(const char *val, struct kernel_param *kp)
+{
+	char *endp;
+	int num = simple_strtol(val, &endp, 0);
+	int jif = num * HZ;
+	if (endp == val || *endp || num < 0 || jif < num)
+		return -EINVAL;
+	*((int *)kp->arg) = jif;
+	return 0;
+}
+
+module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int,
+		 &nfs_idmap_cache_timeout, 0644);
+
+/*
+ * Register the NFS filesystems
+ */
+int __init register_nfs_fs(void)
+{
+	int ret;
+
+        ret = register_filesystem(&nfs_fs_type);
+	if (ret < 0)
+		goto error_0;
+
+#ifdef CONFIG_NFS_V4
+	ret = nfs_register_sysctl();
+	if (ret < 0)
+		goto error_1;
+	ret = register_filesystem(&nfs4_fs_type);
+	if (ret < 0)
+		goto error_2;
+#endif
+	return 0;
+
+#ifdef CONFIG_NFS_V4
+error_2:
+	nfs_unregister_sysctl();
+error_1:
+	unregister_filesystem(&nfs_fs_type);
+#endif
+error_0:
+	return ret;
+}
+
+/*
+ * Unregister the NFS filesystems
+ */
+void __exit unregister_nfs_fs(void)
+{
+#ifdef CONFIG_NFS_V4
+	unregister_filesystem(&nfs4_fs_type);
+	nfs_unregister_sysctl();
+#endif
+	unregister_filesystem(&nfs_fs_type);
+}
+
+/*
+ * Deliver file system statistics to userspace
+ */
+static int nfs_statfs(struct super_block *sb, struct kstatfs *buf)
+{
+	struct nfs_server *server = NFS_SB(sb);
+	unsigned char blockbits;
+	unsigned long blockres;
+	struct nfs_fh *rootfh = NFS_FH(sb->s_root->d_inode);
+	struct nfs_fattr fattr;
+	struct nfs_fsstat res = {
+			.fattr = &fattr,
+	};
+	int error;
+
+	lock_kernel();
+
+	error = server->rpc_ops->statfs(server, rootfh, &res);
+	buf->f_type = NFS_SUPER_MAGIC;
+	if (error < 0)
+		goto out_err;
+
+	/*
+	 * Current versions of glibc do not correctly handle the
+	 * case where f_frsize != f_bsize.  Eventually we want to
+	 * report the value of wtmult in this field.
+	 */
+	buf->f_frsize = sb->s_blocksize;
+
+	/*
+	 * On most *nix systems, f_blocks, f_bfree, and f_bavail
+	 * are reported in units of f_frsize.  Linux hasn't had
+	 * an f_frsize field in its statfs struct until recently,
+	 * thus historically Linux's sys_statfs reports these
+	 * fields in units of f_bsize.
+	 */
+	buf->f_bsize = sb->s_blocksize;
+	blockbits = sb->s_blocksize_bits;
+	blockres = (1 << blockbits) - 1;
+	buf->f_blocks = (res.tbytes + blockres) >> blockbits;
+	buf->f_bfree = (res.fbytes + blockres) >> blockbits;
+	buf->f_bavail = (res.abytes + blockres) >> blockbits;
+
+	buf->f_files = res.tfiles;
+	buf->f_ffree = res.afiles;
+
+	buf->f_namelen = server->namelen;
+ out:
+	unlock_kernel();
+	return 0;
+
+ out_err:
+	dprintk("%s: statfs error = %d\n", __FUNCTION__, -error);
+	buf->f_bsize = buf->f_blocks = buf->f_bfree = buf->f_bavail = -1;
+	goto out;
+
+}
+
+/*
+ * Describe the mount options in force on this server representation
+ */
+static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, int showdefaults)
+{
+	static struct proc_nfs_info {
+		int flag;
+		char *str;
+		char *nostr;
+	} nfs_info[] = {
+		{ NFS_MOUNT_SOFT, ",soft", ",hard" },
+		{ NFS_MOUNT_INTR, ",intr", "" },
+		{ NFS_MOUNT_NOCTO, ",nocto", "" },
+		{ NFS_MOUNT_NOAC, ",noac", "" },
+		{ NFS_MOUNT_NONLM, ",nolock", "" },
+		{ NFS_MOUNT_NOACL, ",noacl", "" },
+		{ 0, NULL, NULL }
+	};
+	struct proc_nfs_info *nfs_infop;
+	char buf[12];
+	char *proto;
+
+	seq_printf(m, ",vers=%d", nfss->rpc_ops->version);
+	seq_printf(m, ",rsize=%d", nfss->rsize);
+	seq_printf(m, ",wsize=%d", nfss->wsize);
+	if (nfss->acregmin != 3*HZ || showdefaults)
+		seq_printf(m, ",acregmin=%d", nfss->acregmin/HZ);
+	if (nfss->acregmax != 60*HZ || showdefaults)
+		seq_printf(m, ",acregmax=%d", nfss->acregmax/HZ);
+	if (nfss->acdirmin != 30*HZ || showdefaults)
+		seq_printf(m, ",acdirmin=%d", nfss->acdirmin/HZ);
+	if (nfss->acdirmax != 60*HZ || showdefaults)
+		seq_printf(m, ",acdirmax=%d", nfss->acdirmax/HZ);
+	for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) {
+		if (nfss->flags & nfs_infop->flag)
+			seq_puts(m, nfs_infop->str);
+		else
+			seq_puts(m, nfs_infop->nostr);
+	}
+	switch (nfss->client->cl_xprt->prot) {
+		case IPPROTO_TCP:
+			proto = "tcp";
+			break;
+		case IPPROTO_UDP:
+			proto = "udp";
+			break;
+		default:
+			snprintf(buf, sizeof(buf), "%u", nfss->client->cl_xprt->prot);
+			proto = buf;
+	}
+	seq_printf(m, ",proto=%s", proto);
+	seq_printf(m, ",timeo=%lu", 10U * nfss->retrans_timeo / HZ);
+	seq_printf(m, ",retrans=%u", nfss->retrans_count);
+}
+
+/*
+ * Describe the mount options on this VFS mountpoint
+ */
+static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
+{
+	struct nfs_server *nfss = NFS_SB(mnt->mnt_sb);
+
+	nfs_show_mount_options(m, nfss, 0);
+
+	seq_puts(m, ",addr=");
+	seq_escape(m, nfss->hostname, " \t\n\\");
+
+	return 0;
+}
+
+/*
+ * Present statistical information for this VFS mountpoint
+ */
+static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
+{
+	int i, cpu;
+	struct nfs_server *nfss = NFS_SB(mnt->mnt_sb);
+	struct rpc_auth *auth = nfss->client->cl_auth;
+	struct nfs_iostats totals = { };
+
+	seq_printf(m, "statvers=%s", NFS_IOSTAT_VERS);
+
+	/*
+	 * Display all mount option settings
+	 */
+	seq_printf(m, "\n\topts:\t");
+	seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? "ro" : "rw");
+	seq_puts(m, mnt->mnt_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : "");
+	seq_puts(m, mnt->mnt_sb->s_flags & MS_NOATIME ? ",noatime" : "");
+	seq_puts(m, mnt->mnt_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : "");
+	nfs_show_mount_options(m, nfss, 1);
+
+	seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ);
+
+	seq_printf(m, "\n\tcaps:\t");
+	seq_printf(m, "caps=0x%x", nfss->caps);
+	seq_printf(m, ",wtmult=%d", nfss->wtmult);
+	seq_printf(m, ",dtsize=%d", nfss->dtsize);
+	seq_printf(m, ",bsize=%d", nfss->bsize);
+	seq_printf(m, ",namelen=%d", nfss->namelen);
+
+#ifdef CONFIG_NFS_V4
+	if (nfss->rpc_ops->version == 4) {
+		seq_printf(m, "\n\tnfsv4:\t");
+		seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
+		seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
+		seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
+	}
+#endif
+
+	/*
+	 * Display security flavor in effect for this mount
+	 */
+	seq_printf(m, "\n\tsec:\tflavor=%d", auth->au_ops->au_flavor);
+	if (auth->au_flavor)
+		seq_printf(m, ",pseudoflavor=%d", auth->au_flavor);
+
+	/*
+	 * Display superblock I/O counters
+	 */
+	for_each_possible_cpu(cpu) {
+		struct nfs_iostats *stats;
+
+		preempt_disable();
+		stats = per_cpu_ptr(nfss->io_stats, cpu);
+
+		for (i = 0; i < __NFSIOS_COUNTSMAX; i++)
+			totals.events[i] += stats->events[i];
+		for (i = 0; i < __NFSIOS_BYTESMAX; i++)
+			totals.bytes[i] += stats->bytes[i];
+
+		preempt_enable();
+	}
+
+	seq_printf(m, "\n\tevents:\t");
+	for (i = 0; i < __NFSIOS_COUNTSMAX; i++)
+		seq_printf(m, "%lu ", totals.events[i]);
+	seq_printf(m, "\n\tbytes:\t");
+	for (i = 0; i < __NFSIOS_BYTESMAX; i++)
+		seq_printf(m, "%Lu ", totals.bytes[i]);
+	seq_printf(m, "\n");
+
+	rpc_print_iostats(m, nfss->client);
+
+	return 0;
+}
+
+/*
+ * Begin unmount by attempting to remove all automounted mountpoints we added
+ * in response to traversals
+ */
+static void nfs_umount_begin(struct vfsmount *vfsmnt, int flags)
+{
+	struct nfs_server *server;
+	struct rpc_clnt	*rpc;
+
+	shrink_submounts(vfsmnt, &nfs_automount_list);
+	if (!(flags & MNT_FORCE))
+		return;
+	/* -EIO all pending I/O */
+	server = NFS_SB(vfsmnt->mnt_sb);
+	rpc = server->client;
+	if (!IS_ERR(rpc))
+		rpc_killall_tasks(rpc);
+	rpc = server->client_acl;
+	if (!IS_ERR(rpc))
+		rpc_killall_tasks(rpc);
+}
+
+/*
+ * Obtain the root inode of the file system.
+ */
+static struct inode *
+nfs_get_root(struct super_block *sb, struct nfs_fh *rootfh, struct nfs_fsinfo *fsinfo)
+{
+	struct nfs_server	*server = NFS_SB(sb);
+	int			error;
+
+	error = server->rpc_ops->getroot(server, rootfh, fsinfo);
+	if (error < 0) {
+		dprintk("nfs_get_root: getattr error = %d\n", -error);
+		return ERR_PTR(error);
+	}
+
+	server->fsid = fsinfo->fattr->fsid;
+	return nfs_fhget(sb, rootfh, fsinfo->fattr);
+}
+
+/*
+ * Do NFS version-independent mount processing, and sanity checking
+ */
+static int
+nfs_sb_init(struct super_block *sb, rpc_authflavor_t authflavor)
+{
+	struct nfs_server	*server;
+	struct inode		*root_inode;
+	struct nfs_fattr	fattr;
+	struct nfs_fsinfo	fsinfo = {
+					.fattr = &fattr,
+				};
+	struct nfs_pathconf pathinfo = {
+			.fattr = &fattr,
+	};
+	int no_root_error = 0;
+	unsigned long max_rpc_payload;
+
+	/* We probably want something more informative here */
+	snprintf(sb->s_id, sizeof(sb->s_id), "%x:%x", MAJOR(sb->s_dev), MINOR(sb->s_dev));
+
+	server = NFS_SB(sb);
+
+	sb->s_magic      = NFS_SUPER_MAGIC;
+
+	server->io_stats = nfs_alloc_iostats();
+	if (server->io_stats == NULL)
+		return -ENOMEM;
+
+	root_inode = nfs_get_root(sb, &server->fh, &fsinfo);
+	/* Did getting the root inode fail? */
+	if (IS_ERR(root_inode)) {
+		no_root_error = PTR_ERR(root_inode);
+		goto out_no_root;
+	}
+	sb->s_root = d_alloc_root(root_inode);
+	if (!sb->s_root) {
+		no_root_error = -ENOMEM;
+		goto out_no_root;
+	}
+	sb->s_root->d_op = server->rpc_ops->dentry_ops;
+
+	/* mount time stamp, in seconds */
+	server->mount_time = jiffies;
+
+	/* Get some general file system info */
+	if (server->namelen == 0 &&
+	    server->rpc_ops->pathconf(server, &server->fh, &pathinfo) >= 0)
+		server->namelen = pathinfo.max_namelen;
+	/* Work out a lot of parameters */
+	if (server->rsize == 0)
+		server->rsize = nfs_block_size(fsinfo.rtpref, NULL);
+	if (server->wsize == 0)
+		server->wsize = nfs_block_size(fsinfo.wtpref, NULL);
+
+	if (fsinfo.rtmax >= 512 && server->rsize > fsinfo.rtmax)
+		server->rsize = nfs_block_size(fsinfo.rtmax, NULL);
+	if (fsinfo.wtmax >= 512 && server->wsize > fsinfo.wtmax)
+		server->wsize = nfs_block_size(fsinfo.wtmax, NULL);
+
+	max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL);
+	if (server->rsize > max_rpc_payload)
+		server->rsize = max_rpc_payload;
+	if (server->rsize > NFS_MAX_FILE_IO_SIZE)
+		server->rsize = NFS_MAX_FILE_IO_SIZE;
+	server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+	if (server->wsize > max_rpc_payload)
+		server->wsize = max_rpc_payload;
+	if (server->wsize > NFS_MAX_FILE_IO_SIZE)
+		server->wsize = NFS_MAX_FILE_IO_SIZE;
+	server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+	if (sb->s_blocksize == 0)
+		sb->s_blocksize = nfs_block_bits(server->wsize,
+							 &sb->s_blocksize_bits);
+	server->wtmult = nfs_block_bits(fsinfo.wtmult, NULL);
+
+	server->dtsize = nfs_block_size(fsinfo.dtpref, NULL);
+	if (server->dtsize > PAGE_CACHE_SIZE)
+		server->dtsize = PAGE_CACHE_SIZE;
+	if (server->dtsize > server->rsize)
+		server->dtsize = server->rsize;
+
+	if (server->flags & NFS_MOUNT_NOAC) {
+		server->acregmin = server->acregmax = 0;
+		server->acdirmin = server->acdirmax = 0;
+		sb->s_flags |= MS_SYNCHRONOUS;
+	}
+	server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
+
+	nfs_super_set_maxbytes(sb, fsinfo.maxfilesize);
+
+	server->client->cl_intr = (server->flags & NFS_MOUNT_INTR) ? 1 : 0;
+	server->client->cl_softrtry = (server->flags & NFS_MOUNT_SOFT) ? 1 : 0;
+
+	/* We're airborne Set socket buffersize */
+	rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
+	return 0;
+	/* Yargs. It didn't work out. */
+out_no_root:
+	dprintk("nfs_sb_init: get root inode failed: errno %d\n", -no_root_error);
+	if (!IS_ERR(root_inode))
+		iput(root_inode);
+	return no_root_error;
+}
+
+/*
+ * Initialise the timeout values for a connection
+ */
+static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, unsigned int timeo, unsigned int retrans)
+{
+	to->to_initval = timeo * HZ / 10;
+	to->to_retries = retrans;
+	if (!to->to_retries)
+		to->to_retries = 2;
+
+	switch (proto) {
+	case IPPROTO_TCP:
+		if (!to->to_initval)
+			to->to_initval = 60 * HZ;
+		if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
+			to->to_initval = NFS_MAX_TCP_TIMEOUT;
+		to->to_increment = to->to_initval;
+		to->to_maxval = to->to_initval + (to->to_increment * to->to_retries);
+		to->to_exponential = 0;
+		break;
+	case IPPROTO_UDP:
+	default:
+		if (!to->to_initval)
+			to->to_initval = 11 * HZ / 10;
+		if (to->to_initval > NFS_MAX_UDP_TIMEOUT)
+			to->to_initval = NFS_MAX_UDP_TIMEOUT;
+		to->to_maxval = NFS_MAX_UDP_TIMEOUT;
+		to->to_exponential = 1;
+		break;
+	}
+}
+
+/*
+ * Create an RPC client handle.
+ */
+static struct rpc_clnt *
+nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data)
+{
+	struct rpc_timeout	timeparms;
+	struct rpc_xprt		*xprt = NULL;
+	struct rpc_clnt		*clnt = NULL;
+	int			proto = (data->flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP;
+
+	nfs_init_timeout_values(&timeparms, proto, data->timeo, data->retrans);
+
+	server->retrans_timeo = timeparms.to_initval;
+	server->retrans_count = timeparms.to_retries;
+
+	/* create transport and client */
+	xprt = xprt_create_proto(proto, &server->addr, &timeparms);
+	if (IS_ERR(xprt)) {
+		dprintk("%s: cannot create RPC transport. Error = %ld\n",
+				__FUNCTION__, PTR_ERR(xprt));
+		return (struct rpc_clnt *)xprt;
+	}
+	clnt = rpc_create_client(xprt, server->hostname, &nfs_program,
+				 server->rpc_ops->version, data->pseudoflavor);
+	if (IS_ERR(clnt)) {
+		dprintk("%s: cannot create RPC client. Error = %ld\n",
+				__FUNCTION__, PTR_ERR(xprt));
+		goto out_fail;
+	}
+
+	clnt->cl_intr     = 1;
+	clnt->cl_softrtry = 1;
+
+	return clnt;
+
+out_fail:
+	return clnt;
+}
+
+/*
+ * Clone a server record
+ */
+static struct nfs_server *nfs_clone_server(struct super_block *sb, struct nfs_clone_mount *data)
+{
+	struct nfs_server *server = NFS_SB(sb);
+	struct nfs_server *parent = NFS_SB(data->sb);
+	struct inode *root_inode;
+	struct nfs_fsinfo fsinfo;
+	void *err = ERR_PTR(-ENOMEM);
+
+	sb->s_op = data->sb->s_op;
+	sb->s_blocksize = data->sb->s_blocksize;
+	sb->s_blocksize_bits = data->sb->s_blocksize_bits;
+	sb->s_maxbytes = data->sb->s_maxbytes;
+
+	server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
+	server->io_stats = nfs_alloc_iostats();
+	if (server->io_stats == NULL)
+		goto out;
+
+	server->client = rpc_clone_client(parent->client);
+	if (IS_ERR((err = server->client)))
+		goto out;
+
+	if (!IS_ERR(parent->client_sys)) {
+		server->client_sys = rpc_clone_client(parent->client_sys);
+		if (IS_ERR((err = server->client_sys)))
+			goto out;
+	}
+	if (!IS_ERR(parent->client_acl)) {
+		server->client_acl = rpc_clone_client(parent->client_acl);
+		if (IS_ERR((err = server->client_acl)))
+			goto out;
+	}
+	root_inode = nfs_fhget(sb, data->fh, data->fattr);
+	if (!root_inode)
+		goto out;
+	sb->s_root = d_alloc_root(root_inode);
+	if (!sb->s_root)
+		goto out_put_root;
+	fsinfo.fattr = data->fattr;
+	if (NFS_PROTO(root_inode)->fsinfo(server, data->fh, &fsinfo) == 0)
+		nfs_super_set_maxbytes(sb, fsinfo.maxfilesize);
+	sb->s_root->d_op = server->rpc_ops->dentry_ops;
+	sb->s_flags |= MS_ACTIVE;
+	return server;
+out_put_root:
+	iput(root_inode);
+out:
+	return err;
+}
+
+/*
+ * Copy an existing superblock and attach revised data
+ */
+static struct super_block *nfs_clone_generic_sb(struct nfs_clone_mount *data,
+		struct super_block *(*fill_sb)(struct nfs_server *, struct nfs_clone_mount *),
+		struct nfs_server *(*fill_server)(struct super_block *, struct nfs_clone_mount *))
+{
+	struct nfs_server *server;
+	struct nfs_server *parent = NFS_SB(data->sb);
+	struct super_block *sb = ERR_PTR(-EINVAL);
+	void *err = ERR_PTR(-ENOMEM);
+	char *hostname;
+	int len;
+
+	server = kmalloc(sizeof(struct nfs_server), GFP_KERNEL);
+	if (server == NULL)
+		goto out_err;
+	memcpy(server, parent, sizeof(*server));
+	hostname = (data->hostname != NULL) ? data->hostname : parent->hostname;
+	len = strlen(hostname) + 1;
+	server->hostname = kmalloc(len, GFP_KERNEL);
+	if (server->hostname == NULL)
+		goto free_server;
+	memcpy(server->hostname, hostname, len);
+	if (rpciod_up() != 0)
+		goto free_hostname;
+
+	sb = fill_sb(server, data);
+	if (IS_ERR((err = sb)) || sb->s_root)
+		goto kill_rpciod;
+
+	server = fill_server(sb, data);
+	if (IS_ERR((err = server)))
+		goto out_deactivate;
+	return sb;
+out_deactivate:
+	up_write(&sb->s_umount);
+	deactivate_super(sb);
+	return (struct super_block *)err;
+kill_rpciod:
+	rpciod_down();
+free_hostname:
+	kfree(server->hostname);
+free_server:
+	kfree(server);
+out_err:
+	return (struct super_block *)err;
+}
+
+/*
+ * Set up an NFS2/3 superblock
+ *
+ * The way this works is that the mount process passes a structure
+ * in the data argument which contains the server's IP address
+ * and the root file handle obtained from the server's mount
+ * daemon. We stash these away in the private superblock fields.
+ */
+static int
+nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data, int silent)
+{
+	struct nfs_server	*server;
+	rpc_authflavor_t	authflavor;
+
+	server           = NFS_SB(sb);
+	sb->s_blocksize_bits = 0;
+	sb->s_blocksize = 0;
+	if (data->bsize)
+		sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits);
+	if (data->rsize)
+		server->rsize = nfs_block_size(data->rsize, NULL);
+	if (data->wsize)
+		server->wsize = nfs_block_size(data->wsize, NULL);
+	server->flags    = data->flags & NFS_MOUNT_FLAGMASK;
+
+	server->acregmin = data->acregmin*HZ;
+	server->acregmax = data->acregmax*HZ;
+	server->acdirmin = data->acdirmin*HZ;
+	server->acdirmax = data->acdirmax*HZ;
+
+	/* Start lockd here, before we might error out */
+	if (!(server->flags & NFS_MOUNT_NONLM))
+		lockd_up();
+
+	server->namelen  = data->namlen;
+	server->hostname = kmalloc(strlen(data->hostname) + 1, GFP_KERNEL);
+	if (!server->hostname)
+		return -ENOMEM;
+	strcpy(server->hostname, data->hostname);
+
+	/* Check NFS protocol revision and initialize RPC op vector
+	 * and file handle pool. */
+#ifdef CONFIG_NFS_V3
+	if (server->flags & NFS_MOUNT_VER3) {
+		server->rpc_ops = &nfs_v3_clientops;
+		server->caps |= NFS_CAP_READDIRPLUS;
+	} else {
+		server->rpc_ops = &nfs_v2_clientops;
+	}
+#else
+	server->rpc_ops = &nfs_v2_clientops;
+#endif
+
+	/* Fill in pseudoflavor for mount version < 5 */
+	if (!(data->flags & NFS_MOUNT_SECFLAVOUR))
+		data->pseudoflavor = RPC_AUTH_UNIX;
+	authflavor = data->pseudoflavor;	/* save for sb_init() */
+	/* XXX maybe we want to add a server->pseudoflavor field */
+
+	/* Create RPC client handles */
+	server->client = nfs_create_client(server, data);
+	if (IS_ERR(server->client))
+		return PTR_ERR(server->client);
+	/* RFC 2623, sec 2.3.2 */
+	if (authflavor != RPC_AUTH_UNIX) {
+		struct rpc_auth *auth;
+
+		server->client_sys = rpc_clone_client(server->client);
+		if (IS_ERR(server->client_sys))
+			return PTR_ERR(server->client_sys);
+		auth = rpcauth_create(RPC_AUTH_UNIX, server->client_sys);
+		if (IS_ERR(auth))
+			return PTR_ERR(auth);
+	} else {
+		atomic_inc(&server->client->cl_count);
+		server->client_sys = server->client;
+	}
+	if (server->flags & NFS_MOUNT_VER3) {
+#ifdef CONFIG_NFS_V3_ACL
+		if (!(server->flags & NFS_MOUNT_NOACL)) {
+			server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3);
+			/* No errors! Assume that Sun nfsacls are supported */
+			if (!IS_ERR(server->client_acl))
+				server->caps |= NFS_CAP_ACLS;
+		}
+#else
+		server->flags &= ~NFS_MOUNT_NOACL;
+#endif /* CONFIG_NFS_V3_ACL */
+		/*
+		 * The VFS shouldn't apply the umask to mode bits. We will
+		 * do so ourselves when necessary.
+		 */
+		sb->s_flags |= MS_POSIXACL;
+		if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN)
+			server->namelen = NFS3_MAXNAMLEN;
+		sb->s_time_gran = 1;
+	} else {
+		if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN)
+			server->namelen = NFS2_MAXNAMLEN;
+	}
+
+	sb->s_op = &nfs_sops;
+	return nfs_sb_init(sb, authflavor);
+}
+
+static int nfs_set_super(struct super_block *s, void *data)
+{
+	s->s_fs_info = data;
+	return set_anon_super(s, data);
+}
+
+static int nfs_compare_super(struct super_block *sb, void *data)
+{
+	struct nfs_server *server = data;
+	struct nfs_server *old = NFS_SB(sb);
+
+	if (old->addr.sin_addr.s_addr != server->addr.sin_addr.s_addr)
+		return 0;
+	if (old->addr.sin_port != server->addr.sin_port)
+		return 0;
+	return !nfs_compare_fh(&old->fh, &server->fh);
+}
+
+static struct super_block *nfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data)
+{
+	int error;
+	struct nfs_server *server = NULL;
+	struct super_block *s;
+	struct nfs_fh *root;
+	struct nfs_mount_data *data = raw_data;
+
+	s = ERR_PTR(-EINVAL);
+	if (data == NULL) {
+		dprintk("%s: missing data argument\n", __FUNCTION__);
+		goto out_err;
+	}
+	if (data->version <= 0 || data->version > NFS_MOUNT_VERSION) {
+		dprintk("%s: bad mount version\n", __FUNCTION__);
+		goto out_err;
+	}
+	switch (data->version) {
+		case 1:
+			data->namlen = 0;
+		case 2:
+			data->bsize  = 0;
+		case 3:
+			if (data->flags & NFS_MOUNT_VER3) {
+				dprintk("%s: mount structure version %d does not support NFSv3\n",
+						__FUNCTION__,
+						data->version);
+				goto out_err;
+			}
+			data->root.size = NFS2_FHSIZE;
+			memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE);
+		case 4:
+			if (data->flags & NFS_MOUNT_SECFLAVOUR) {
+				dprintk("%s: mount structure version %d does not support strong security\n",
+						__FUNCTION__,
+						data->version);
+				goto out_err;
+			}
+		case 5:
+			memset(data->context, 0, sizeof(data->context));
+	}
+#ifndef CONFIG_NFS_V3
+	/* If NFSv3 is not compiled in, return -EPROTONOSUPPORT */
+	s = ERR_PTR(-EPROTONOSUPPORT);
+	if (data->flags & NFS_MOUNT_VER3) {
+		dprintk("%s: NFSv3 not compiled into kernel\n", __FUNCTION__);
+		goto out_err;
+	}
+#endif /* CONFIG_NFS_V3 */
+
+	s = ERR_PTR(-ENOMEM);
+	server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
+	if (!server)
+		goto out_err;
+	/* Zero out the NFS state stuff */
+	init_nfsv4_state(server);
+	server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
+
+	root = &server->fh;
+	if (data->flags & NFS_MOUNT_VER3)
+		root->size = data->root.size;
+	else
+		root->size = NFS2_FHSIZE;
+	s = ERR_PTR(-EINVAL);
+	if (root->size > sizeof(root->data)) {
+		dprintk("%s: invalid root filehandle\n", __FUNCTION__);
+		goto out_err;
+	}
+	memcpy(root->data, data->root.data, root->size);
+
+	/* We now require that the mount process passes the remote address */
+	memcpy(&server->addr, &data->addr, sizeof(server->addr));
+	if (server->addr.sin_addr.s_addr == INADDR_ANY) {
+		dprintk("%s: mount program didn't pass remote address!\n",
+				__FUNCTION__);
+		goto out_err;
+	}
+
+	/* Fire up rpciod if not yet running */
+	s = ERR_PTR(rpciod_up());
+	if (IS_ERR(s)) {
+		dprintk("%s: couldn't start rpciod! Error = %ld\n",
+				__FUNCTION__, PTR_ERR(s));
+		goto out_err;
+	}
+
+	s = sget(fs_type, nfs_compare_super, nfs_set_super, server);
+	if (IS_ERR(s) || s->s_root)
+		goto out_rpciod_down;
+
+	s->s_flags = flags;
+
+	error = nfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
+	if (error) {
+		up_write(&s->s_umount);
+		deactivate_super(s);
+		return ERR_PTR(error);
+	}
+	s->s_flags |= MS_ACTIVE;
+	return s;
+out_rpciod_down:
+	rpciod_down();
+out_err:
+	kfree(server);
+	return s;
+}
+
+static void nfs_kill_super(struct super_block *s)
+{
+	struct nfs_server *server = NFS_SB(s);
+
+	kill_anon_super(s);
+
+	if (!IS_ERR(server->client))
+		rpc_shutdown_client(server->client);
+	if (!IS_ERR(server->client_sys))
+		rpc_shutdown_client(server->client_sys);
+	if (!IS_ERR(server->client_acl))
+		rpc_shutdown_client(server->client_acl);
+
+	if (!(server->flags & NFS_MOUNT_NONLM))
+		lockd_down();	/* release rpc.lockd */
+
+	rpciod_down();		/* release rpciod */
+
+	nfs_free_iostats(server->io_stats);
+	kfree(server->hostname);
+	kfree(server);
+	nfs_release_automount_timer();
+}
+
+static struct super_block *nfs_clone_sb(struct nfs_server *server, struct nfs_clone_mount *data)
+{
+	struct super_block *sb;
+
+	server->fsid = data->fattr->fsid;
+	nfs_copy_fh(&server->fh, data->fh);
+	sb = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server);
+	if (!IS_ERR(sb) && sb->s_root == NULL && !(server->flags & NFS_MOUNT_NONLM))
+		lockd_up();
+	return sb;
+}
+
+static struct super_block *nfs_clone_nfs_sb(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *raw_data)
+{
+	struct nfs_clone_mount *data = raw_data;
+	return nfs_clone_generic_sb(data, nfs_clone_sb, nfs_clone_server);
+}
+
+#ifdef CONFIG_NFS_V4
+static struct rpc_clnt *nfs4_create_client(struct nfs_server *server,
+	struct rpc_timeout *timeparms, int proto, rpc_authflavor_t flavor)
+{
+	struct nfs4_client *clp;
+	struct rpc_xprt *xprt = NULL;
+	struct rpc_clnt *clnt = NULL;
+	int err = -EIO;
+
+	clp = nfs4_get_client(&server->addr.sin_addr);
+	if (!clp) {
+		dprintk("%s: failed to create NFS4 client.\n", __FUNCTION__);
+		return ERR_PTR(err);
+	}
+
+	/* Now create transport and client */
+	down_write(&clp->cl_sem);
+	if (IS_ERR(clp->cl_rpcclient)) {
+		xprt = xprt_create_proto(proto, &server->addr, timeparms);
+		if (IS_ERR(xprt)) {
+			up_write(&clp->cl_sem);
+			err = PTR_ERR(xprt);
+			dprintk("%s: cannot create RPC transport. Error = %d\n",
+					__FUNCTION__, err);
+			goto out_fail;
+		}
+		/* Bind to a reserved port! */
+		xprt->resvport = 1;
+		clnt = rpc_create_client(xprt, server->hostname, &nfs_program,
+				server->rpc_ops->version, flavor);
+		if (IS_ERR(clnt)) {
+			up_write(&clp->cl_sem);
+			err = PTR_ERR(clnt);
+			dprintk("%s: cannot create RPC client. Error = %d\n",
+					__FUNCTION__, err);
+			goto out_fail;
+		}
+		clnt->cl_intr     = 1;
+		clnt->cl_softrtry = 1;
+		clp->cl_rpcclient = clnt;
+		memcpy(clp->cl_ipaddr, server->ip_addr, sizeof(clp->cl_ipaddr));
+		nfs_idmap_new(clp);
+	}
+	list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
+	clnt = rpc_clone_client(clp->cl_rpcclient);
+	if (!IS_ERR(clnt))
+		server->nfs4_state = clp;
+	up_write(&clp->cl_sem);
+	clp = NULL;
+
+	if (IS_ERR(clnt)) {
+		dprintk("%s: cannot create RPC client. Error = %d\n",
+				__FUNCTION__, err);
+		return clnt;
+	}
+
+	if (server->nfs4_state->cl_idmap == NULL) {
+		dprintk("%s: failed to create idmapper.\n", __FUNCTION__);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	if (clnt->cl_auth->au_flavor != flavor) {
+		struct rpc_auth *auth;
+
+		auth = rpcauth_create(flavor, clnt);
+		if (IS_ERR(auth)) {
+			dprintk("%s: couldn't create credcache!\n", __FUNCTION__);
+			return (struct rpc_clnt *)auth;
+		}
+	}
+	return clnt;
+
+ out_fail:
+	if (clp)
+		nfs4_put_client(clp);
+	return ERR_PTR(err);
+}
+
+/*
+ * Set up an NFS4 superblock
+ */
+static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data, int silent)
+{
+	struct nfs_server *server;
+	struct rpc_timeout timeparms;
+	rpc_authflavor_t authflavour;
+	int err = -EIO;
+
+	sb->s_blocksize_bits = 0;
+	sb->s_blocksize = 0;
+	server = NFS_SB(sb);
+	if (data->rsize != 0)
+		server->rsize = nfs_block_size(data->rsize, NULL);
+	if (data->wsize != 0)
+		server->wsize = nfs_block_size(data->wsize, NULL);
+	server->flags = data->flags & NFS_MOUNT_FLAGMASK;
+	server->caps = NFS_CAP_ATOMIC_OPEN;
+
+	server->acregmin = data->acregmin*HZ;
+	server->acregmax = data->acregmax*HZ;
+	server->acdirmin = data->acdirmin*HZ;
+	server->acdirmax = data->acdirmax*HZ;
+
+	server->rpc_ops = &nfs_v4_clientops;
+
+	nfs_init_timeout_values(&timeparms, data->proto, data->timeo, data->retrans);
+
+	server->retrans_timeo = timeparms.to_initval;
+	server->retrans_count = timeparms.to_retries;
+
+	/* Now create transport and client */
+	authflavour = RPC_AUTH_UNIX;
+	if (data->auth_flavourlen != 0) {
+		if (data->auth_flavourlen != 1) {
+			dprintk("%s: Invalid number of RPC auth flavours %d.\n",
+					__FUNCTION__, data->auth_flavourlen);
+			err = -EINVAL;
+			goto out_fail;
+		}
+		if (copy_from_user(&authflavour, data->auth_flavours, sizeof(authflavour))) {
+			err = -EFAULT;
+			goto out_fail;
+		}
+	}
+
+	server->client = nfs4_create_client(server, &timeparms, data->proto, authflavour);
+	if (IS_ERR(server->client)) {
+		err = PTR_ERR(server->client);
+			dprintk("%s: cannot create RPC client. Error = %d\n",
+					__FUNCTION__, err);
+			goto out_fail;
+	}
+
+	sb->s_time_gran = 1;
+
+	sb->s_op = &nfs4_sops;
+	err = nfs_sb_init(sb, authflavour);
+
+ out_fail:
+	return err;
+}
+
+static int nfs4_compare_super(struct super_block *sb, void *data)
+{
+	struct nfs_server *server = data;
+	struct nfs_server *old = NFS_SB(sb);
+
+	if (strcmp(server->hostname, old->hostname) != 0)
+		return 0;
+	if (strcmp(server->mnt_path, old->mnt_path) != 0)
+		return 0;
+	return 1;
+}
+
+static void *
+nfs_copy_user_string(char *dst, struct nfs_string *src, int maxlen)
+{
+	void *p = NULL;
+
+	if (!src->len)
+		return ERR_PTR(-EINVAL);
+	if (src->len < maxlen)
+		maxlen = src->len;
+	if (dst == NULL) {
+		p = dst = kmalloc(maxlen + 1, GFP_KERNEL);
+		if (p == NULL)
+			return ERR_PTR(-ENOMEM);
+	}
+	if (copy_from_user(dst, src->data, maxlen)) {
+		kfree(p);
+		return ERR_PTR(-EFAULT);
+	}
+	dst[maxlen] = '\0';
+	return dst;
+}
+
+static struct super_block *nfs4_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data)
+{
+	int error;
+	struct nfs_server *server;
+	struct super_block *s;
+	struct nfs4_mount_data *data = raw_data;
+	void *p;
+
+	if (data == NULL) {
+		dprintk("%s: missing data argument\n", __FUNCTION__);
+		return ERR_PTR(-EINVAL);
+	}
+	if (data->version <= 0 || data->version > NFS4_MOUNT_VERSION) {
+		dprintk("%s: bad mount version\n", __FUNCTION__);
+		return ERR_PTR(-EINVAL);
+	}
+
+	server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
+	if (!server)
+		return ERR_PTR(-ENOMEM);
+	/* Zero out the NFS state stuff */
+	init_nfsv4_state(server);
+	server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
+
+	p = nfs_copy_user_string(NULL, &data->hostname, 256);
+	if (IS_ERR(p))
+		goto out_err;
+	server->hostname = p;
+
+	p = nfs_copy_user_string(NULL, &data->mnt_path, 1024);
+	if (IS_ERR(p))
+		goto out_err;
+	server->mnt_path = p;
+
+	p = nfs_copy_user_string(server->ip_addr, &data->client_addr,
+			sizeof(server->ip_addr) - 1);
+	if (IS_ERR(p))
+		goto out_err;
+
+	/* We now require that the mount process passes the remote address */
+	if (data->host_addrlen != sizeof(server->addr)) {
+		s = ERR_PTR(-EINVAL);
+		goto out_free;
+	}
+	if (copy_from_user(&server->addr, data->host_addr, sizeof(server->addr))) {
+		s = ERR_PTR(-EFAULT);
+		goto out_free;
+	}
+	if (server->addr.sin_family != AF_INET ||
+	    server->addr.sin_addr.s_addr == INADDR_ANY) {
+		dprintk("%s: mount program didn't pass remote IP address!\n",
+				__FUNCTION__);
+		s = ERR_PTR(-EINVAL);
+		goto out_free;
+	}
+
+	/* Fire up rpciod if not yet running */
+	s = ERR_PTR(rpciod_up());
+	if (IS_ERR(s)) {
+		dprintk("%s: couldn't start rpciod! Error = %ld\n",
+				__FUNCTION__, PTR_ERR(s));
+		goto out_free;
+	}
+
+	s = sget(fs_type, nfs4_compare_super, nfs_set_super, server);
+
+	if (IS_ERR(s) || s->s_root)
+		goto out_free;
+
+	s->s_flags = flags;
+
+	error = nfs4_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
+	if (error) {
+		up_write(&s->s_umount);
+		deactivate_super(s);
+		return ERR_PTR(error);
+	}
+	s->s_flags |= MS_ACTIVE;
+	return s;
+out_err:
+	s = (struct super_block *)p;
+out_free:
+	kfree(server->mnt_path);
+	kfree(server->hostname);
+	kfree(server);
+	return s;
+}
+
+static void nfs4_kill_super(struct super_block *sb)
+{
+	struct nfs_server *server = NFS_SB(sb);
+
+	nfs_return_all_delegations(sb);
+	kill_anon_super(sb);
+
+	nfs4_renewd_prepare_shutdown(server);
+
+	if (server->client != NULL && !IS_ERR(server->client))
+		rpc_shutdown_client(server->client);
+
+	destroy_nfsv4_state(server);
+
+	rpciod_down();
+
+	nfs_free_iostats(server->io_stats);
+	kfree(server->hostname);
+	kfree(server);
+	nfs_release_automount_timer();
+}
+
+/*
+ * Constructs the SERVER-side path
+ */
+static inline char *nfs4_dup_path(const struct dentry *dentry)
+{
+	char *page = (char *) __get_free_page(GFP_USER);
+	char *path;
+
+	path = nfs4_path(dentry, page, PAGE_SIZE);
+	if (!IS_ERR(path)) {
+		int len = PAGE_SIZE + page - path;
+		char *tmp = path;
+
+		path = kmalloc(len, GFP_KERNEL);
+		if (path)
+			memcpy(path, tmp, len);
+		else
+			path = ERR_PTR(-ENOMEM);
+	}
+	free_page((unsigned long)page);
+	return path;
+}
+
+static struct super_block *nfs4_clone_sb(struct nfs_server *server, struct nfs_clone_mount *data)
+{
+	const struct dentry *dentry = data->dentry;
+	struct nfs4_client *clp = server->nfs4_state;
+	struct super_block *sb;
+
+	server->fsid = data->fattr->fsid;
+	nfs_copy_fh(&server->fh, data->fh);
+	server->mnt_path = nfs4_dup_path(dentry);
+	if (IS_ERR(server->mnt_path)) {
+		sb = (struct super_block *)server->mnt_path;
+		goto err;
+	}
+	sb = sget(&nfs4_fs_type, nfs4_compare_super, nfs_set_super, server);
+	if (IS_ERR(sb) || sb->s_root)
+		goto free_path;
+	nfs4_server_capabilities(server, &server->fh);
+
+	down_write(&clp->cl_sem);
+	atomic_inc(&clp->cl_count);
+	list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks);
+	up_write(&clp->cl_sem);
+	return sb;
+free_path:
+	kfree(server->mnt_path);
+err:
+	server->mnt_path = NULL;
+	return sb;
+}
+
+static struct super_block *nfs_clone_nfs4_sb(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *raw_data)
+{
+	struct nfs_clone_mount *data = raw_data;
+	return nfs_clone_generic_sb(data, nfs4_clone_sb, nfs_clone_server);
+}
+
+static struct super_block *nfs4_referral_sb(struct nfs_server *server, struct nfs_clone_mount *data)
+{
+	struct super_block *sb = ERR_PTR(-ENOMEM);
+	int len;
+
+	len = strlen(data->mnt_path) + 1;
+	server->mnt_path = kmalloc(len, GFP_KERNEL);
+	if (server->mnt_path == NULL)
+		goto err;
+	memcpy(server->mnt_path, data->mnt_path, len);
+	memcpy(&server->addr, data->addr, sizeof(struct sockaddr_in));
+
+	sb = sget(&nfs4_fs_type, nfs4_compare_super, nfs_set_super, server);
+	if (IS_ERR(sb) || sb->s_root)
+		goto free_path;
+	return sb;
+free_path:
+	kfree(server->mnt_path);
+err:
+	server->mnt_path = NULL;
+	return sb;
+}
+
+static struct nfs_server *nfs4_referral_server(struct super_block *sb, struct nfs_clone_mount *data)
+{
+	struct nfs_server *server = NFS_SB(sb);
+	struct rpc_timeout timeparms;
+	int proto, timeo, retrans;
+	void *err;
+
+	proto = IPPROTO_TCP;
+	/* Since we are following a referral and there may be alternatives,
+	   set the timeouts and retries to low values */
+	timeo = 2;
+	retrans = 1;
+	nfs_init_timeout_values(&timeparms, proto, timeo, retrans);
+
+	server->client = nfs4_create_client(server, &timeparms, proto, data->authflavor);
+	if (IS_ERR((err = server->client)))
+		goto out_err;
+
+	sb->s_time_gran = 1;
+	sb->s_op = &nfs4_sops;
+	err = ERR_PTR(nfs_sb_init(sb, data->authflavor));
+	if (!IS_ERR(err))
+		return server;
+out_err:
+	return (struct nfs_server *)err;
+}
+
+static struct super_block *nfs_referral_nfs4_sb(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *raw_data)
+{
+	struct nfs_clone_mount *data = raw_data;
+	return nfs_clone_generic_sb(data, nfs4_referral_sb, nfs4_referral_server);
+}
+
+#endif
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index e03abbd8302..b383fdd3a15 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1529,7 +1529,7 @@ int nfs_sync_inode_wait(struct inode *inode, unsigned long idx_start,
 	return ret;
 }
 
-int nfs_init_writepagecache(void)
+int __init nfs_init_writepagecache(void)
 {
 	nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
 					     sizeof(struct nfs_write_data),
@@ -1551,7 +1551,7 @@ int nfs_init_writepagecache(void)
 	return 0;
 }
 
-void nfs_destroy_writepagecache(void)
+void __exit nfs_destroy_writepagecache(void)
 {
 	mempool_destroy(nfs_commit_mempool);
 	mempool_destroy(nfs_wdata_mempool);
-- 
cgit v1.2.3


From 81039f1f204a0fd2952112a240284e114f1a25e6 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:34:34 -0400
Subject: NFS: Display the chosen RPCSEC_GSS security flavour in /proc/mounts

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 4acd3ee9642..30f939bcb72 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -318,6 +318,34 @@ static int nfs_statfs(struct super_block *sb, struct kstatfs *buf)
 
 }
 
+static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour)
+{
+	static struct {
+		rpc_authflavor_t flavour;
+		const char *str;
+	} sec_flavours[] = {
+		{ RPC_AUTH_NULL, "null" },
+		{ RPC_AUTH_UNIX, "sys" },
+		{ RPC_AUTH_GSS_KRB5, "krb5" },
+		{ RPC_AUTH_GSS_KRB5I, "krb5i" },
+		{ RPC_AUTH_GSS_KRB5P, "krb5p" },
+		{ RPC_AUTH_GSS_LKEY, "lkey" },
+		{ RPC_AUTH_GSS_LKEYI, "lkeyi" },
+		{ RPC_AUTH_GSS_LKEYP, "lkeyp" },
+		{ RPC_AUTH_GSS_SPKM, "spkm" },
+		{ RPC_AUTH_GSS_SPKMI, "spkmi" },
+		{ RPC_AUTH_GSS_SPKMP, "spkmp" },
+		{ -1, "unknown" }
+	};
+	int i;
+
+	for (i=0; sec_flavours[i].flavour != -1; i++) {
+		if (sec_flavours[i].flavour == flavour)
+			break;
+	}
+	return sec_flavours[i].str;
+}
+
 /*
  * Describe the mount options in force on this server representation
  */
@@ -371,6 +399,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 	seq_printf(m, ",proto=%s", proto);
 	seq_printf(m, ",timeo=%lu", 10U * nfss->retrans_timeo / HZ);
 	seq_printf(m, ",retrans=%u", nfss->retrans_count);
+	seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor));
 }
 
 /*
-- 
cgit v1.2.3


From 5046791417dcac1ba126b77b8062af15a2f0b8e1 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:40:24 -0400
Subject: NLM: sem to mutex conversion

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/host.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 729ac427d35..5242743c940 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -112,7 +112,7 @@ nlm_lookup_host(int server, struct sockaddr_in *sin,
 	host->h_version    = version;
 	host->h_proto      = proto;
 	host->h_rpcclnt    = NULL;
-	init_MUTEX(&host->h_sema);
+	mutex_init(&host->h_mutex);
 	host->h_nextrebind = jiffies + NLM_HOST_REBIND;
 	host->h_expires    = jiffies + NLM_HOST_EXPIRE;
 	atomic_set(&host->h_count, 1);
@@ -172,7 +172,7 @@ nlm_bind_host(struct nlm_host *host)
 			(unsigned)ntohl(host->h_addr.sin_addr.s_addr));
 
 	/* Lock host handle */
-	down(&host->h_sema);
+	mutex_lock(&host->h_mutex);
 
 	/* If we've already created an RPC client, check whether
 	 * RPC rebind is required
@@ -204,12 +204,12 @@ nlm_bind_host(struct nlm_host *host)
 		host->h_rpcclnt = clnt;
 	}
 
-	up(&host->h_sema);
+	mutex_unlock(&host->h_mutex);
 	return clnt;
 
 forgetit:
 	printk("lockd: couldn't create RPC handle for %s\n", host->h_name);
-	up(&host->h_sema);
+	mutex_unlock(&host->h_mutex);
 	return NULL;
 }
 
-- 
cgit v1.2.3


From 28df955a2ad484d602314b30183ea8496a9aa34a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 9 Jun 2006 09:40:27 -0400
Subject: NLM: Fix reclaim races

Currently it is possible for a task to remove its locks at the same time as
the NLM recovery thread is trying to recover them. This quickly leads to an
Oops.
Protect the locks using an rw semaphore while they are being recovered.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/clntlock.c | 39 +++++++++++++++++++++++++--------------
 fs/lockd/clntproc.c | 14 +++++++++++++-
 fs/lockd/host.c     |  1 +
 3 files changed, 39 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index bce74446870..52774feab93 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -147,11 +147,10 @@ u32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock)
  * Someone has sent us an SM_NOTIFY. Ensure we bind to the new port number,
  * that we mark locks for reclaiming, and that we bump the pseudo NSM state.
  */
-static inline
-void nlmclnt_prepare_reclaim(struct nlm_host *host, u32 newstate)
+static void nlmclnt_prepare_reclaim(struct nlm_host *host)
 {
+	down_write(&host->h_rwsem);
 	host->h_monitored = 0;
-	host->h_nsmstate = newstate;
 	host->h_state++;
 	host->h_nextrebind = 0;
 	nlm_rebind_host(host);
@@ -164,6 +163,13 @@ void nlmclnt_prepare_reclaim(struct nlm_host *host, u32 newstate)
 	dprintk("NLM: reclaiming locks for host %s", host->h_name);
 }
 
+static void nlmclnt_finish_reclaim(struct nlm_host *host)
+{
+	host->h_reclaiming = 0;
+	up_write(&host->h_rwsem);
+	dprintk("NLM: done reclaiming locks for host %s", host->h_name);
+}
+
 /*
  * Reclaim all locks on server host. We do this by spawning a separate
  * reclaimer thread.
@@ -171,12 +177,10 @@ void nlmclnt_prepare_reclaim(struct nlm_host *host, u32 newstate)
 void
 nlmclnt_recovery(struct nlm_host *host, u32 newstate)
 {
-	if (host->h_reclaiming++) {
-		if (host->h_nsmstate == newstate)
-			return;
-		nlmclnt_prepare_reclaim(host, newstate);
-	} else {
-		nlmclnt_prepare_reclaim(host, newstate);
+	if (host->h_nsmstate == newstate)
+		return;
+	host->h_nsmstate = newstate;
+	if (!host->h_reclaiming++) {
 		nlm_get_host(host);
 		__module_get(THIS_MODULE);
 		if (kernel_thread(reclaimer, host, CLONE_KERNEL) < 0)
@@ -190,6 +194,7 @@ reclaimer(void *ptr)
 	struct nlm_host	  *host = (struct nlm_host *) ptr;
 	struct nlm_wait	  *block;
 	struct file_lock *fl, *next;
+	u32 nsmstate;
 
 	daemonize("%s-reclaim", host->h_name);
 	allow_signal(SIGKILL);
@@ -199,19 +204,25 @@ reclaimer(void *ptr)
 	lock_kernel();
 	lockd_up();
 
+	nlmclnt_prepare_reclaim(host);
 	/* First, reclaim all locks that have been marked. */
 restart:
+	nsmstate = host->h_nsmstate;
 	list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) {
 		list_del_init(&fl->fl_u.nfs_fl.list);
 
 		if (signalled())
 			continue;
-		if (nlmclnt_reclaim(host, fl) == 0)
-			list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted);
-		goto restart;
+		if (nlmclnt_reclaim(host, fl) != 0)
+			continue;
+		list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted);
+		if (host->h_nsmstate != nsmstate) {
+			/* Argh! The server rebooted again! */
+			list_splice_init(&host->h_granted, &host->h_reclaim);
+			goto restart;
+		}
 	}
-
-	host->h_reclaiming = 0;
+	nlmclnt_finish_reclaim(host);
 
 	/* Now, wake up all processes that sleep on a blocked lock */
 	list_for_each_entry(block, &nlm_blocked, b_list) {
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index f96e38155b5..4db62098d3f 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -508,7 +508,10 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 	}
 
 	block = nlmclnt_prepare_block(host, fl);
+again:
 	for(;;) {
+		/* Reboot protection */
+		fl->fl_u.nfs_fl.state = host->h_state;
 		status = nlmclnt_call(req, NLMPROC_LOCK);
 		if (status < 0)
 			goto out_unblock;
@@ -531,10 +534,16 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 	}
 
 	if (resp->status == NLM_LCK_GRANTED) {
-		fl->fl_u.nfs_fl.state = host->h_state;
+		down_read(&host->h_rwsem);
+		/* Check whether or not the server has rebooted */
+		if (fl->fl_u.nfs_fl.state != host->h_state) {
+			up_read(&host->h_rwsem);
+			goto again;
+		}
 		fl->fl_flags |= FL_SLEEP;
 		/* Ensure the resulting lock will get added to granted list */
 		do_vfs_lock(fl);
+		up_read(&host->h_rwsem);
 	}
 	status = nlm_stat_to_errno(resp->status);
 out_unblock:
@@ -596,6 +605,7 @@ nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl)
 static int
 nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
 {
+	struct nlm_host	*host = req->a_host;
 	struct nlm_res	*resp = &req->a_res;
 	int		status;
 
@@ -604,7 +614,9 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
 	 * request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either
 	 * case, we want to unlock.
 	 */
+	down_read(&host->h_rwsem);
 	do_vfs_lock(fl);
+	up_read(&host->h_rwsem);
 
 	if (req->a_flags & RPC_TASK_ASYNC)
 		return nlm_async_call(req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 5242743c940..38b0e8a1aec 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -117,6 +117,7 @@ nlm_lookup_host(int server, struct sockaddr_in *sin,
 	host->h_expires    = jiffies + NLM_HOST_EXPIRE;
 	atomic_set(&host->h_count, 1);
 	init_waitqueue_head(&host->h_gracewait);
+	init_rwsem(&host->h_rwsem);
 	host->h_state      = 0;			/* pseudo NSM state */
 	host->h_nsmstate   = 0;			/* real NSM state */
 	host->h_server	   = server;
-- 
cgit v1.2.3


From 4ed0156f774cf50252e7f51032d1cc857fe86879 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Fri, 9 Jun 2006 15:06:42 +0100
Subject: [JFFS2] Fix more breakage caused by janitorial meddling.

jffs2_zlib_exit() and free_workspaces() shouldn't be marked __exit because
they get called in the error case from the init functions.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/compr_zlib.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index d43cbac4fb9..5c63e0cdcf4 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -60,7 +60,7 @@ static int __init alloc_workspaces(void)
 	return 0;
 }
 
-static void __exit free_workspaces(void)
+static void free_workspaces(void)
 {
 	vfree(def_strm.workspace);
 	vfree(inf_strm.workspace);
@@ -216,7 +216,7 @@ int __init jffs2_zlib_init(void)
     return ret;
 }
 
-void __exit jffs2_zlib_exit(void)
+void jffs2_zlib_exit(void)
 {
     jffs2_unregister_compressor(&jffs2_zlib_comp);
     free_workspaces();
-- 
cgit v1.2.3


From 6344a423e5806d138923caa1d7699f3b7809fe43 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 12 Jun 2006 04:18:35 +0000
Subject: [CIFS] fix minor compile warning when config_cifs_weak_security is
 off

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifssmb.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index acae58313b0..271a037a386 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -569,7 +569,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	} else
 		server->capabilities &= ~CAP_EXTENDED_SECURITY;
 
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
 signing_check:
+#endif
 	if(sign_CIFS_PDUs == FALSE) {        
 		if(server->secMode & SECMODE_SIGN_REQUIRED)
 			cERROR(1,("Server requires "
-- 
cgit v1.2.3


From d7ede1aa5dfff53e76dbabac5b8087341686f662 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Tue, 13 Jun 2006 16:28:11 +1000
Subject: [XFS] Minor XFS documentation updates.

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/Kconfig | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index bac27d66151..236f9cf3714 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -65,18 +65,19 @@ config XFS_POSIX_ACL
 	  If you don't know what Access Control Lists are, say N.
 
 config XFS_RT
-	bool "XFS Realtime support (EXPERIMENTAL)"
-	depends on XFS_FS && EXPERIMENTAL
+	bool "XFS Realtime subvolume support"
+	depends on XFS_FS
 	help
 	  If you say Y here you will be able to mount and use XFS filesystems
-	  which contain a realtime subvolume. The realtime subvolume is a
-	  separate area of disk space where only file data is stored. The
-	  realtime subvolume is designed to provide very deterministic
-	  data rates suitable for media streaming applications.
-
-	  See the xfs man page in section 5 for a bit more information.
+	  which contain a realtime subvolume.  The realtime subvolume is a
+	  separate area of disk space where only file data is stored.  It was
+	  originally designed to provide deterministic data rates suitable
+	  for media streaming applications, but is also useful as a generic
+	  mechanism for ensuring data and metadata/log I/Os are completely
+	  separated.  Regular file I/Os are isolated to a separate device
+	  from all other requests, and this can be done quite transparently
+	  to applications via the inherit-realtime directory inode flag.
 
-	  This feature is unsupported at this time, is not yet fully
-	  functional, and may cause serious problems.
+	  See the xfs man page in section 5 for additional information.
 
 	  If unsure, say N.
-- 
cgit v1.2.3


From 0fd1ffe0633b4b039b343b753598e6df435e034d Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@suse.de>
Date: Tue, 13 Jun 2006 21:31:39 +0000
Subject: [CIFS] Fix suspend/resume problem which causes EIO on subsequent
 access to the mount.

Signed-off-by: Pavel Machek <pavel@suse.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c  |  2 +-
 fs/cifs/connect.c | 24 ++++++++++++------------
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 700570522b2..fb7c11c2c91 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -905,7 +905,7 @@ static int cifs_dnotify_thread(void * dummyarg)
 	struct cifsSesInfo *ses;
 
 	do {
-		if(try_to_freeze())
+		if (try_to_freeze())
 			continue;
 		set_current_state(TASK_INTERRUPTIBLE);
 		schedule_timeout(15*HZ);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index e6f3d2fff6c..faaf9eb15b9 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -367,21 +367,21 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
 			continue;
 		if (bigbuf == NULL) {
 			bigbuf = cifs_buf_get();
-			if(bigbuf == NULL) {
-				cERROR(1,("No memory for large SMB response"));
+			if (!bigbuf) {
+				cERROR(1, ("No memory for large SMB response"));
 				msleep(3000);
 				/* retry will check if exiting */
 				continue;
 			}
-		} else if(isLargeBuf) {
-			/* we are reusing a dirtry large buf, clear its start */
+		} else if (isLargeBuf) {
+			/* we are reusing a dirty large buf, clear its start */
 			memset(bigbuf, 0, sizeof (struct smb_hdr));
 		}
 
 		if (smallbuf == NULL) {
 			smallbuf = cifs_small_buf_get();
-			if(smallbuf == NULL) {
-				cERROR(1,("No memory for SMB response"));
+			if (!smallbuf) {
+				cERROR(1, ("No memory for SMB response"));
 				msleep(1000);
 				/* retry will check if exiting */
 				continue;
@@ -401,12 +401,12 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
 		    kernel_recvmsg(csocket, &smb_msg,
 				 &iov, 1, 4, 0 /* BB see socket.h flags */);
 
-		if(server->tcpStatus == CifsExiting) {
+		if (server->tcpStatus == CifsExiting) {
 			break;
 		} else if (server->tcpStatus == CifsNeedReconnect) {
-			cFYI(1,("Reconnect after server stopped responding"));
+			cFYI(1, ("Reconnect after server stopped responding"));
 			cifs_reconnect(server);
-			cFYI(1,("call to reconnect done"));
+			cFYI(1, ("call to reconnect done"));
 			csocket = server->ssocket;
 			continue;
 		} else if ((length == -ERESTARTSYS) || (length == -EAGAIN)) {
@@ -415,15 +415,15 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
 				tcpStatus CifsNeedReconnect if server hung */
 			continue;
 		} else if (length <= 0) {
-			if(server->tcpStatus == CifsNew) {
-				cFYI(1,("tcp session abend after SMBnegprot"));
+			if (server->tcpStatus == CifsNew) {
+				cFYI(1, ("tcp session abend after SMBnegprot"));
 				/* some servers kill the TCP session rather than
 				   returning an SMB negprot error, in which
 				   case reconnecting here is not going to help,
 				   and so simply return error to mount */
 				break;
 			}
-			if(length == -EINTR) { 
+			if (!try_to_freeze() && (length == -EINTR)) {
 				cFYI(1,("cifsd thread killed"));
 				break;
 			}
-- 
cgit v1.2.3


From 3877f0b6c9f54d43e55e532404a935b90393b635 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sun, 18 Jun 2006 00:05:26 +0100
Subject: [JFFS2] Don't trust node headers before the CRC is checked.

Especially when summary code is used, we can have in-memory data
structures referencing certain nodes without them actually being readable
on the flash. Discard the nodes gracefully in that case, rather than
triggering a BUG().

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/readinode.c | 62 ++++++++++++++++++++++++++++------------------------
 1 file changed, 34 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index e1acce8fb2b..5351b34d541 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -343,7 +343,7 @@ free_out:
  * Helper function for jffs2_get_inode_nodes().
  * It is called every time an unknown node is found.
  *
- * Returns: 0 on succes;
+ * Returns: 0 on success;
  * 	    1 if the node should be marked obsolete;
  * 	    negative error code on failure.
  */
@@ -354,37 +354,30 @@ static inline int read_unknown(struct jffs2_sb_info *c, struct jffs2_raw_node_re
 
 	un->nodetype = cpu_to_je16(JFFS2_NODE_ACCURATE | je16_to_cpu(un->nodetype));
 
-	if (crc32(0, un, sizeof(struct jffs2_unknown_node) - 4) != je32_to_cpu(un->hdr_crc)) {
-		/* Hmmm. This should have been caught at scan time. */
-		JFFS2_NOTICE("node header CRC failed at %#08x. But it must have been OK earlier.\n", ref_offset(ref));
-		jffs2_dbg_dump_node(c, ref_offset(ref));
-		return 1;
-	} else {
-		switch(je16_to_cpu(un->nodetype) & JFFS2_COMPAT_MASK) {
+	switch(je16_to_cpu(un->nodetype) & JFFS2_COMPAT_MASK) {
 
-		case JFFS2_FEATURE_INCOMPAT:
-			JFFS2_ERROR("unknown INCOMPAT nodetype %#04X at %#08x\n",
-				je16_to_cpu(un->nodetype), ref_offset(ref));
-			/* EEP */
-			BUG();
-			break;
+	case JFFS2_FEATURE_INCOMPAT:
+		JFFS2_ERROR("unknown INCOMPAT nodetype %#04X at %#08x\n",
+			    je16_to_cpu(un->nodetype), ref_offset(ref));
+		/* EEP */
+		BUG();
+		break;
 
-		case JFFS2_FEATURE_ROCOMPAT:
-			JFFS2_ERROR("unknown ROCOMPAT nodetype %#04X at %#08x\n",
-					je16_to_cpu(un->nodetype), ref_offset(ref));
-			BUG_ON(!(c->flags & JFFS2_SB_FLAG_RO));
-			break;
+	case JFFS2_FEATURE_ROCOMPAT:
+		JFFS2_ERROR("unknown ROCOMPAT nodetype %#04X at %#08x\n",
+			    je16_to_cpu(un->nodetype), ref_offset(ref));
+		BUG_ON(!(c->flags & JFFS2_SB_FLAG_RO));
+		break;
 
-		case JFFS2_FEATURE_RWCOMPAT_COPY:
-			JFFS2_NOTICE("unknown RWCOMPAT_COPY nodetype %#04X at %#08x\n",
-					je16_to_cpu(un->nodetype), ref_offset(ref));
-			break;
+	case JFFS2_FEATURE_RWCOMPAT_COPY:
+		JFFS2_NOTICE("unknown RWCOMPAT_COPY nodetype %#04X at %#08x\n",
+			     je16_to_cpu(un->nodetype), ref_offset(ref));
+		break;
 
-		case JFFS2_FEATURE_RWCOMPAT_DELETE:
-			JFFS2_NOTICE("unknown RWCOMPAT_DELETE nodetype %#04X at %#08x\n",
-					je16_to_cpu(un->nodetype), ref_offset(ref));
-			return 1;
-		}
+	case JFFS2_FEATURE_RWCOMPAT_DELETE:
+		JFFS2_NOTICE("unknown RWCOMPAT_DELETE nodetype %#04X at %#08x\n",
+			     je16_to_cpu(un->nodetype), ref_offset(ref));
+		return 1;
 	}
 
 	return 0;
@@ -549,6 +542,18 @@ static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_inf
 
 		node = (union jffs2_node_union *)bufstart;
 
+		/* No need to mask in the valid bit; it shouldn't be invalid */
+		if (je32_to_cpu(node->u.hdr_crc) != crc32(0, node, sizeof(node->u)-4)) {
+			JFFS2_NOTICE("Node header CRC failed at %#08x. {%04x,%04x,%08x,%08x}\n",
+				     ref_offset(ref), je16_to_cpu(node->u.magic),
+				     je16_to_cpu(node->u.nodetype),
+				     je32_to_cpu(node->u.totlen),
+				     je32_to_cpu(node->u.hdr_crc));
+			jffs2_dbg_dump_node(c, ref_offset(ref));
+			jffs2_mark_node_obsolete(c, ref);
+			goto cont;
+		}
+
 		switch (je16_to_cpu(node->u.nodetype)) {
 
 		case JFFS2_NODETYPE_DIRENT:
@@ -606,6 +611,7 @@ static int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_inf
 				goto free_out;
 
 		}
+	cont:
 		spin_lock(&c->erase_completion_lock);
 	}
 
-- 
cgit v1.2.3


From 2ba72cb754bb091bb24a44e9682f7105110f7f38 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sun, 18 Jun 2006 10:22:40 +0100
Subject: [JFFS2] Mark XATTR support as experimental, for now

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/Kconfig | 56 ++++++++++++++++++++++++++++----------------------------
 1 file changed, 28 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 2496ccbe260..572cc435a1b 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1075,9 +1075,35 @@ config JFFS2_FS_DEBUG
 	  If reporting bugs, please try to have available a full dump of the
 	  messages at debug level 1 while the misbehaviour was occurring.
 
-config JFFS2_FS_XATTR
-	bool "JFFS2 XATTR support"
+config JFFS2_FS_WRITEBUFFER
+	bool "JFFS2 write-buffering support"
 	depends on JFFS2_FS
+	default y
+	help
+	  This enables the write-buffering support in JFFS2.
+
+	  This functionality is required to support JFFS2 on the following
+	  types of flash devices:
+	    - NAND flash
+	    - NOR flash with transparent ECC
+	    - DataFlash
+
+config JFFS2_SUMMARY
+	bool "JFFS2 summary support (EXPERIMENTAL)"
+	depends on JFFS2_FS && EXPERIMENTAL
+	default n
+	help
+	  This feature makes it possible to use summary information
+	  for faster filesystem mount.
+
+	  The summary information can be inserted into a filesystem image
+	  by the utility 'sumtool'.
+
+	  If unsure, say 'N'.
+
+config JFFS2_FS_XATTR
+	bool "JFFS2 XATTR support (EXPERIMENTAL)"
+	depends on JFFS2_FS && EXPERIMENTAL && !JFFS2_FS_WRITEBUFFER
 	default n
 	help
 	  Extended attributes are name:value pairs associated with inodes by
@@ -1113,32 +1139,6 @@ config JFFS2_FS_SECURITY
 	  If you are not using a security module that requires using
 	  extended attributes for file security labels, say N.
 
-config JFFS2_FS_WRITEBUFFER
-	bool "JFFS2 write-buffering support"
-	depends on JFFS2_FS
-	default y
-	help
-	  This enables the write-buffering support in JFFS2.
-
-	  This functionality is required to support JFFS2 on the following
-	  types of flash devices:
-	    - NAND flash
-	    - NOR flash with transparent ECC
-	    - DataFlash
-
-config JFFS2_SUMMARY
-	bool "JFFS2 summary support (EXPERIMENTAL)"
-	depends on JFFS2_FS && EXPERIMENTAL
-	default n
-	help
-	  This feature makes it possible to use summary information
-	  for faster filesystem mount.
-
-	  The summary information can be inserted into a filesystem image
-	  by the utility 'sumtool'.
-
-	  If unsure, say 'N'.
-
 config JFFS2_COMPRESSION_OPTIONS
 	bool "Advanced compression options for JFFS2"
 	depends on JFFS2_FS
-- 
cgit v1.2.3


From fc6612f627c697b348a4ef64f16fb373d86dbd76 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sun, 18 Jun 2006 18:35:10 +0100
Subject: [JFFS2] When retiring nextblock, allocate a node_ref for the wasted
 space

Failing to do so makes the calculated length of the last node incorrect,
when we're not using eraseblock summaries.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/nodemgmt.c | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 71d1630609a..8bedfd2ff68 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -317,6 +317,8 @@ static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
 		}
 	} else {
 		if (jeb && minsize > jeb->free_size) {
+			uint32_t waste;
+
 			/* Skip the end of this block and file it as having some dirty space */
 			/* If there's a pending write to it, flush now */
 
@@ -329,10 +331,26 @@ static int jffs2_do_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
 				goto restart;
 			}
 
-			c->wasted_size += jeb->free_size;
-			c->free_size -= jeb->free_size;
-			jeb->wasted_size += jeb->free_size;
-			jeb->free_size = 0;
+			spin_unlock(&c->erase_completion_lock);
+
+			ret = jffs2_prealloc_raw_node_refs(c, jeb, 1);
+			if (ret)
+				return ret;
+			/* Just lock it again and continue. Nothing much can change because
+			   we hold c->alloc_sem anyway. In fact, it's not entirely clear why
+			   we hold c->erase_completion_lock in the majority of this function...
+			   but that's a question for another (more caffeine-rich) day. */
+			spin_lock(&c->erase_completion_lock);
+
+			waste = jeb->free_size;
+			jffs2_link_node_ref(c, jeb,
+					    (jeb->offset + c->sector_size - waste) | REF_OBSOLETE,
+					    waste, NULL);
+			/* FIXME: that made it count as dirty. Convert to wasted */
+			jeb->dirty_size -= waste;
+			c->dirty_size -= waste;
+			jeb->wasted_size += waste;
+			c->wasted_size += waste;
 
 			jffs2_close_nextblock(c, jeb);
 			jeb = NULL;
-- 
cgit v1.2.3


From 1046d88001e7b8819f60dece2eaf1b44bf4b4460 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sun, 18 Jun 2006 22:44:21 +0100
Subject: [JFFS2] Check CRC32 on dirent and data nodes each time they're read

Also, make sure dirents are marked REF_UNCHECKED when we 'discover' them
through eraseblock summary.

Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/readinode.c | 51 +++++++++++++++++++++++++++++++++++++--------------
 fs/jffs2/summary.c   |  2 +-
 2 files changed, 38 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 5351b34d541..5ea4faafa2d 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -116,19 +116,42 @@ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_r
 				uint32_t *latest_mctime, uint32_t *mctime_ver)
 {
 	struct jffs2_full_dirent *fd;
+	uint32_t crc;
 
-	/* The direntry nodes are checked during the flash scanning */
-	BUG_ON(ref_flags(ref) == REF_UNCHECKED);
 	/* Obsoleted. This cannot happen, surely? dwmw2 20020308 */
 	BUG_ON(ref_obsolete(ref));
 
-	/* Sanity check */
-	if (unlikely(PAD((rd->nsize + sizeof(*rd))) != PAD(je32_to_cpu(rd->totlen)))) {
-		JFFS2_ERROR("illegal nsize in node at %#08x: nsize %#02x, totlen %#04x\n",
-		       ref_offset(ref), rd->nsize, je32_to_cpu(rd->totlen));
+	crc = crc32(0, rd, sizeof(*rd) - 8);
+	if (unlikely(crc != je32_to_cpu(rd->node_crc))) {
+		JFFS2_NOTICE("header CRC failed on dirent node at %#08x: read %#08x, calculated %#08x\n",
+			     ref_offset(ref), je32_to_cpu(rd->node_crc), crc);
 		return 1;
 	}
 
+	/* If we've never checked the CRCs on this node, check them now */
+	if (ref_flags(ref) == REF_UNCHECKED) {
+		struct jffs2_eraseblock *jeb;
+		int len;
+
+		/* Sanity check */
+		if (unlikely(PAD((rd->nsize + sizeof(*rd))) != PAD(je32_to_cpu(rd->totlen)))) {
+			JFFS2_ERROR("illegal nsize in node at %#08x: nsize %#02x, totlen %#04x\n",
+				    ref_offset(ref), rd->nsize, je32_to_cpu(rd->totlen));
+			return 1;
+		}
+
+		jeb = &c->blocks[ref->flash_offset / c->sector_size];
+		len = ref_totlen(c, jeb, ref);
+
+		spin_lock(&c->erase_completion_lock);
+		jeb->used_size += len;
+		jeb->unchecked_size -= len;
+		c->used_size += len;
+		c->unchecked_size -= len;
+		ref->flash_offset = ref_offset(ref) | REF_PRISTINE;
+		spin_unlock(&c->erase_completion_lock);
+	}
+
 	fd = jffs2_alloc_full_dirent(rd->nsize + 1);
 	if (unlikely(!fd))
 		return -ENOMEM;
@@ -198,10 +221,18 @@ static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 	struct jffs2_tmp_dnode_info *tn;
 	uint32_t len, csize;
 	int ret = 1;
+	uint32_t crc;
 
 	/* Obsoleted. This cannot happen, surely? dwmw2 20020308 */
 	BUG_ON(ref_obsolete(ref));
 
+	crc = crc32(0, rd, sizeof(*rd) - 8);
+	if (unlikely(crc != je32_to_cpu(rd->node_crc))) {
+		JFFS2_NOTICE("node CRC failed on dnode at %#08x: read %#08x, calculated %#08x\n",
+			     ref_offset(ref), je32_to_cpu(rd->node_crc), crc);
+		return 1;
+	}
+
 	tn = jffs2_alloc_tmp_dnode_info();
 	if (!tn) {
 		JFFS2_ERROR("failed to allocate tn (%zu bytes).\n", sizeof(*tn));
@@ -213,14 +244,6 @@ static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 
 	/* If we've never checked the CRCs on this node, check them now */
 	if (ref_flags(ref) == REF_UNCHECKED) {
-		uint32_t crc;
-
-		crc = crc32(0, rd, sizeof(*rd) - 8);
-		if (unlikely(crc != je32_to_cpu(rd->node_crc))) {
-			JFFS2_NOTICE("header CRC failed on node at %#08x: read %#08x, calculated %#08x\n",
-					ref_offset(ref), je32_to_cpu(rd->node_crc), crc);
-			goto free_out;
-		}
 
 		/* Sanity checks */
 		if (unlikely(je32_to_cpu(rd->offset) > je32_to_cpu(rd->isize)) ||
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 51bf1654ce3..0b02fc79e4d 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -453,7 +453,7 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 					return -ENOMEM;
 				}
 
-				fd->raw = sum_link_node_ref(c, jeb,  je32_to_cpu(spd->offset) | REF_PRISTINE,
+				fd->raw = sum_link_node_ref(c, jeb,  je32_to_cpu(spd->offset) | REF_UNCHECKED,
 							    PAD(je32_to_cpu(spd->totlen)), ic);
 
 				fd->next = NULL;
-- 
cgit v1.2.3


From 1d47bec290a6f1f366192946840efef5076d9fc7 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Mon, 19 Jun 2006 08:39:16 +1000
Subject: [XFS] Remove unnecessary local from open_exec dmapi path.

SGI-PV: 904196
SGI-Modid: xfs-linux-melb:xfs-kern:26247a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/linux-2.6/xfs_file.c | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 89b1a742135..cf65a8364d5 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -522,23 +522,18 @@ xfs_file_open_exec(
 	struct inode	*inode)
 {
 	bhv_vnode_t	*vp = vn_from_inode(inode);
-	xfs_mount_t	*mp = XFS_VFSTOM(vp->v_vfsp);
-	int		error = 0;
-	xfs_inode_t	*ip;
 
-	if (vp->v_vfsp->vfs_flag & VFS_DMI) {
-		ip = xfs_vtoi(vp);
-		if (!ip) {
-			error = -EINVAL;
-			goto open_exec_out;
-		}
-		if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ)) {
-			error = -XFS_SEND_DATA(mp, DM_EVENT_READ, vp,
+	if (unlikely(vp->v_vfsp->vfs_flag & VFS_DMI)) {
+		xfs_mount_t	*mp = XFS_VFSTOM(vp->v_vfsp);
+		xfs_inode_t	*ip = xfs_vtoi(vp);
+
+		if (!ip)
+			return -EINVAL;
+		if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ))
+			return -XFS_SEND_DATA(mp, DM_EVENT_READ, vp,
 					       0, 0, 0, NULL);
-		}
 	}
-open_exec_out:
-	return error;
+	return 0;
 }
 #endif /* HAVE_FOP_OPEN_EXEC */
 
-- 
cgit v1.2.3


From 1e69dd0eb354d6f1a77098a3946b5ba57d4e3109 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Mon, 19 Jun 2006 08:39:53 +1000
Subject: [XFS] Push some common code out of write path into core XFS code for
 sharing.

SGI-PV: 904196
SGI-Modid: xfs-linux-melb:xfs-kern:26248a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/linux-2.6/xfs_lrw.c | 76 ++--------------------------------------
 fs/xfs/xfs_rw.c            | 86 +++++++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/xfs_rw.h            |  3 +-
 3 files changed, 90 insertions(+), 75 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 3e76c5c9dda..8e546870481 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -892,79 +892,9 @@ retry:
 
 	/* Handle various SYNC-type writes */
 	if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
-		/*
-		 * If we're treating this as O_DSYNC and we have not updated the
-		 * size, force the log.
-		 */
-		if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
-		    !(xip->i_update_size)) {
-			xfs_inode_log_item_t	*iip = xip->i_itemp;
-
-			/*
-			 * If an allocation transaction occurred
-			 * without extending the size, then we have to force
-			 * the log up the proper point to ensure that the
-			 * allocation is permanent.  We can't count on
-			 * the fact that buffered writes lock out direct I/O
-			 * writes - the direct I/O write could have extended
-			 * the size nontransactionally, then finished before
-			 * we started.  xfs_write_file will think that the file
-			 * didn't grow but the update isn't safe unless the
-			 * size change is logged.
-			 *
-			 * Force the log if we've committed a transaction
-			 * against the inode or if someone else has and
-			 * the commit record hasn't gone to disk (e.g.
-			 * the inode is pinned).  This guarantees that
-			 * all changes affecting the inode are permanent
-			 * when we return.
-			 */
-			if (iip && iip->ili_last_lsn) {
-				xfs_log_force(mp, iip->ili_last_lsn,
-						XFS_LOG_FORCE | XFS_LOG_SYNC);
-			} else if (xfs_ipincount(xip) > 0) {
-				xfs_log_force(mp, (xfs_lsn_t)0,
-						XFS_LOG_FORCE | XFS_LOG_SYNC);
-			}
-
-		} else {
-			xfs_trans_t	*tp;
-
-			/*
-			 * O_SYNC or O_DSYNC _with_ a size update are handled
-			 * the same way.
-			 *
-			 * If the write was synchronous then we need to make
-			 * sure that the inode modification time is permanent.
-			 * We'll have updated the timestamp above, so here
-			 * we use a synchronous transaction to log the inode.
-			 * It's not fast, but it's necessary.
-			 *
-			 * If this a dsync write and the size got changed
-			 * non-transactionally, then we need to ensure that
-			 * the size change gets logged in a synchronous
-			 * transaction.
-			 */
-
-			tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
-			if ((error = xfs_trans_reserve(tp, 0,
-						      XFS_SWRITE_LOG_RES(mp),
-						      0, 0, 0))) {
-				/* Transaction reserve failed */
-				xfs_trans_cancel(tp, 0);
-			} else {
-				/* Transaction reserve successful */
-				xfs_ilock(xip, XFS_ILOCK_EXCL);
-				xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL);
-				xfs_trans_ihold(tp, xip);
-				xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE);
-				xfs_trans_set_sync(tp);
-				error = xfs_trans_commit(tp, 0, NULL);
-				xfs_iunlock(xip, XFS_ILOCK_EXCL);
-			}
-			if (error)
-				goto out_unlock_internal;
-		}
+		error = xfs_write_sync_logforce(mp, xip);
+		if (error)
+			goto out_unlock_internal;
 
 		xfs_rwunlock(bdp, locktype);
 		if (need_i_mutex)
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index d33e4f5808e..c55e28189ab 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -91,6 +91,90 @@ xfs_write_clear_setuid(
 	return 0;
 }
 
+/*
+ * Handle logging requirements of various synchronous types of write.
+ */
+int
+xfs_write_sync_logforce(
+	xfs_mount_t	*mp,
+	xfs_inode_t	*ip)
+{
+	int		error = 0;
+
+	/*
+	 * If we're treating this as O_DSYNC and we have not updated the
+	 * size, force the log.
+	 */
+	if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
+	    !(ip->i_update_size)) {
+		xfs_inode_log_item_t	*iip = ip->i_itemp;
+
+		/*
+		 * If an allocation transaction occurred
+		 * without extending the size, then we have to force
+		 * the log up the proper point to ensure that the
+		 * allocation is permanent.  We can't count on
+		 * the fact that buffered writes lock out direct I/O
+		 * writes - the direct I/O write could have extended
+		 * the size nontransactionally, then finished before
+		 * we started.  xfs_write_file will think that the file
+		 * didn't grow but the update isn't safe unless the
+		 * size change is logged.
+		 *
+		 * Force the log if we've committed a transaction
+		 * against the inode or if someone else has and
+		 * the commit record hasn't gone to disk (e.g.
+		 * the inode is pinned).  This guarantees that
+		 * all changes affecting the inode are permanent
+		 * when we return.
+		 */
+		if (iip && iip->ili_last_lsn) {
+			xfs_log_force(mp, iip->ili_last_lsn,
+					XFS_LOG_FORCE | XFS_LOG_SYNC);
+		} else if (xfs_ipincount(ip) > 0) {
+			xfs_log_force(mp, (xfs_lsn_t)0,
+					XFS_LOG_FORCE | XFS_LOG_SYNC);
+		}
+
+	} else {
+		xfs_trans_t	*tp;
+
+		/*
+		 * O_SYNC or O_DSYNC _with_ a size update are handled
+		 * the same way.
+		 *
+		 * If the write was synchronous then we need to make
+		 * sure that the inode modification time is permanent.
+		 * We'll have updated the timestamp above, so here
+		 * we use a synchronous transaction to log the inode.
+		 * It's not fast, but it's necessary.
+		 *
+		 * If this a dsync write and the size got changed
+		 * non-transactionally, then we need to ensure that
+		 * the size change gets logged in a synchronous
+		 * transaction.
+		 */
+		tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
+		if ((error = xfs_trans_reserve(tp, 0,
+						XFS_SWRITE_LOG_RES(mp),
+						0, 0, 0))) {
+			/* Transaction reserve failed */
+			xfs_trans_cancel(tp, 0);
+		} else {
+			/* Transaction reserve successful */
+			xfs_ilock(ip, XFS_ILOCK_EXCL);
+			xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+			xfs_trans_ihold(tp, ip);
+			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+			xfs_trans_set_sync(tp);
+			error = xfs_trans_commit(tp, 0, NULL);
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		}
+	}
+
+	return error;
+}
+
 /*
  * Force a shutdown of the filesystem instantly while keeping
  * the filesystem consistent. We don't do an unmount here; just shutdown
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index a572b175dc8..188b296ff50 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -75,6 +75,7 @@ xfs_fsb_to_db_io(struct xfs_iocore *io, xfs_fsblock_t fsb)
  * Prototypes for functions in xfs_rw.c.
  */
 extern int xfs_write_clear_setuid(struct xfs_inode *ip);
+extern int xfs_write_sync_logforce(struct xfs_mount *mp, struct xfs_inode *ip);
 extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
 extern int xfs_bioerror(struct xfs_buf *bp);
 extern int xfs_bioerror_relse(struct xfs_buf *bp);
-- 
cgit v1.2.3


From 6fe90e6d1451a05db37b2a582410ddcb45af3606 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Mon, 19 Jun 2006 08:40:12 +1000
Subject: [XFS] Remove an incorrect use of unlikely() on a relatively likely
 code path.

SGI-PV: 904196
SGI-Modid: xfs-linux-melb:xfs-kern:26249a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/xfs_bmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 3050b4c647c..4d0ca14039a 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2834,7 +2834,7 @@ xfs_bmap_btalloc(
 		args.prod = ap->ip->i_d.di_extsize;
 		if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod)))
 			args.mod = (xfs_extlen_t)(args.prod - args.mod);
-	} else if (unlikely(mp->m_sb.sb_blocksize >= NBPP)) {
+	} else if (mp->m_sb.sb_blocksize >= NBPP) {
 		args.prod = 1;
 		args.mod = 0;
 	} else {
-- 
cgit v1.2.3


From a805bad5daae8d4f92ce46f467484d4867e996d4 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Mon, 19 Jun 2006 08:40:27 +1000
Subject: [XFS] Remove unneeded conditional code on NFS export interface
 related code paths.

SGI-PV: 904196
SGI-Modid: xfs-linux-melb:xfs-kern:26250a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/Kconfig               | 6 ------
 fs/xfs/linux-2.6/xfs_super.c | 4 +---
 2 files changed, 1 insertion(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 236f9cf3714..26b364c9d62 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -1,6 +1,5 @@
 config XFS_FS
 	tristate "XFS filesystem support"
-	select EXPORTFS if NFSD!=n
 	help
 	  XFS is a high performance journaling filesystem which originated
 	  on the SGI IRIX platform.  It is completely multi-threaded, can
@@ -18,11 +17,6 @@ config XFS_FS
 	  system of your root partition is compiled as a module, you'll need
 	  to use an initial ramdisk (initrd) to boot.
 
-config XFS_EXPORT
-	bool
-	depends on XFS_FS && EXPORTFS
-	default y
-
 config XFS_QUOTA
 	bool "XFS Quota support"
 	depends on XFS_FS
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index b7aad3cfdfe..7fae922d54d 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -796,9 +796,7 @@ xfs_fs_fill_super(
 	}
 
 	sb_min_blocksize(sb, BBSIZE);
-#ifdef CONFIG_XFS_EXPORT
 	sb->s_export_op = &xfs_export_operations;
-#endif
 	sb->s_qcop = &xfs_quotactl_operations;
 	sb->s_op = &xfs_super_operations;
 
-- 
cgit v1.2.3


From 0d8fee3270f8a5e4bf95fbed3e81e21b57f8a5a0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 19 Jun 2006 08:41:30 +1000
Subject: [XFS] Kill direct access to ->count in valusema(); all we ever use it
 for is check if semaphore is actually locked, which can be trivially done in
 portable way. Code gets more reabable, while we are at it...

SGI-PV: 953915
SGI-Modid: xfs-linux-melb:xfs-kern:26274a

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/linux-2.6/sema.h       | 19 ++++++++++---------
 fs/xfs/quota/xfs_dquot.h      |  4 ++--
 fs/xfs/quota/xfs_dquot_item.c |  4 ++--
 fs/xfs/xfs_iget.c             |  2 +-
 fs/xfs/xfs_inode.c            |  4 ++--
 fs/xfs/xfs_inode_item.c       |  6 +++---
 6 files changed, 20 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/sema.h b/fs/xfs/linux-2.6/sema.h
index 194a84490bd..b25090094cc 100644
--- a/fs/xfs/linux-2.6/sema.h
+++ b/fs/xfs/linux-2.6/sema.h
@@ -34,20 +34,21 @@ typedef struct semaphore sema_t;
 #define initnsema(sp, val, name)	sema_init(sp, val)
 #define psema(sp, b)			down(sp)
 #define vsema(sp)			up(sp)
-#define valusema(sp)			(atomic_read(&(sp)->count))
-#define freesema(sema)
+#define freesema(sema)			do { } while (0)
+
+static inline int issemalocked(sema_t *sp)
+{
+	return down_trylock(sp) || (up(sp), 0);
+}
 
 /*
  * Map cpsema (try to get the sema) to down_trylock. We need to switch
  * the return values since cpsema returns 1 (acquired) 0 (failed) and
  * down_trylock returns the reverse 0 (acquired) 1 (failed).
  */
-
-#define cpsema(sp)			(down_trylock(sp) ? 0 : 1)
-
-/*
- * Didn't do cvsema(sp). Not sure how to map this to up/down/...
- * It does a vsema if the values is < 0 other wise nothing.
- */
+static inline int cpsema(sema_t *sp)
+{
+	return down_trylock(sp) ? 0 : 1;
+}
 
 #endif /* __XFS_SUPPORT_SEMA_H__ */
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index c0c629663a5..78d3ab95c5f 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -119,7 +119,7 @@ XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp)
  */
 #define xfs_dqflock(dqp)	 { psema(&((dqp)->q_flock), PINOD | PRECALC);\
 				   (dqp)->dq_flags |= XFS_DQ_FLOCKED; }
-#define xfs_dqfunlock(dqp)	 { ASSERT(valusema(&((dqp)->q_flock)) <= 0); \
+#define xfs_dqfunlock(dqp)	 { ASSERT(issemalocked(&((dqp)->q_flock))); \
 				   vsema(&((dqp)->q_flock)); \
 				   (dqp)->dq_flags &= ~(XFS_DQ_FLOCKED); }
 
@@ -128,7 +128,7 @@ XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp)
 #define XFS_DQ_PINUNLOCK(dqp, s)   mutex_spinunlock( \
 				     &(XFS_DQ_TO_QINF(dqp)->qi_pinlock), s)
 
-#define XFS_DQ_IS_FLUSH_LOCKED(dqp) (valusema(&((dqp)->q_flock)) <= 0)
+#define XFS_DQ_IS_FLUSH_LOCKED(dqp) (issemalocked(&((dqp)->q_flock)))
 #define XFS_DQ_IS_ON_FREELIST(dqp)  ((dqp)->dq_flnext != (dqp))
 #define XFS_DQ_IS_DIRTY(dqp)	((dqp)->dq_flags & XFS_DQ_DIRTY)
 #define XFS_QM_ISUDQ(dqp)	((dqp)->dq_flags & XFS_DQ_USER)
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 546f48af882..21ad5a55e01 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -248,7 +248,7 @@ xfs_qm_dquot_logitem_pushbuf(
 	 * inode flush completed and the inode was taken off the AIL.
 	 * So, just get out.
 	 */
-	if ((valusema(&(dqp->q_flock)) > 0)  ||
+	if (!issemalocked(&(dqp->q_flock))  ||
 	    ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) {
 		qip->qli_pushbuf_flag = 0;
 		xfs_dqunlock(dqp);
@@ -261,7 +261,7 @@ xfs_qm_dquot_logitem_pushbuf(
 	if (bp != NULL) {
 		if (XFS_BUF_ISDELAYWRITE(bp)) {
 			dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
-				  (valusema(&(dqp->q_flock)) <= 0));
+				  issemalocked(&(dqp->q_flock)));
 			qip->qli_pushbuf_flag = 0;
 			xfs_dqunlock(dqp);
 
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index da3c94c230d..7894b72a717 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -1031,6 +1031,6 @@ xfs_iflock_nowait(xfs_inode_t *ip)
 void
 xfs_ifunlock(xfs_inode_t *ip)
 {
-	ASSERT(valusema(&(ip->i_flock)) <= 0);
+	ASSERT(issemalocked(&(ip->i_flock)));
 	vsema(&(ip->i_flock));
 }
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 9af9e32a6a1..e799f0d0087 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3015,7 +3015,7 @@ xfs_iflush(
 	XFS_STATS_INC(xs_iflush_count);
 
 	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS));
-	ASSERT(valusema(&ip->i_flock) <= 0);
+	ASSERT(issemalocked(&(ip->i_flock)));
 	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
 	       ip->i_d.di_nextents > ip->i_df.if_ext_max);
 
@@ -3273,7 +3273,7 @@ xfs_iflush_int(
 	SPLDECL(s);
 
 	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS));
-	ASSERT(valusema(&ip->i_flock) <= 0);
+	ASSERT(issemalocked(&(ip->i_flock)));
 	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
 	       ip->i_d.di_nextents > ip->i_df.if_ext_max);
 
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index cd65a565b4f..2caa91b8971 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -794,7 +794,7 @@ xfs_inode_item_pushbuf(
 	 * inode flush completed and the inode was taken off the AIL.
 	 * So, just get out.
 	 */
-	if ((valusema(&(ip->i_flock)) > 0)  ||
+	if (!issemalocked(&(ip->i_flock)) ||
 	    ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
 		iip->ili_pushbuf_flag = 0;
 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -816,7 +816,7 @@ xfs_inode_item_pushbuf(
 			 * If not, we can flush it async.
 			 */
 			dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) &&
-				  (valusema(&(ip->i_flock)) <= 0));
+				  issemalocked(&(ip->i_flock)));
 			iip->ili_pushbuf_flag = 0;
 			xfs_iunlock(ip, XFS_ILOCK_SHARED);
 			xfs_buftrace("INODE ITEM PUSH", bp);
@@ -864,7 +864,7 @@ xfs_inode_item_push(
 	ip = iip->ili_inode;
 
 	ASSERT(ismrlocked(&(ip->i_lock), MR_ACCESS));
-	ASSERT(valusema(&(ip->i_flock)) <= 0);
+	ASSERT(issemalocked(&(ip->i_flock)));
 	/*
 	 * Since we were able to lock the inode's flush lock and
 	 * we found it on the AIL, the inode must be dirty.  This
-- 
cgit v1.2.3


From da2f4d679c8070ba5b6a920281e495917b293aa0 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Tue, 20 Jun 2006 13:01:38 +1000
Subject: [XFS] Map EFSCORRUPTED to an actual error code, not just a made up
 one (990).	Turns out some ye-olde unices used EUCLEAN as
 Filesystem-needs-cleaning, so now we use that too.

SGI-PV: 953954
SGI-Modid: xfs-linux-melb:xfs-kern:26286a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/linux-2.6/xfs_linux.h | 20 ++------------------
 1 file changed, 2 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index e9285395411..aa26ab906c8 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -194,25 +194,9 @@ BUFFER_FNS(PrivateStart, unwritten);
 /* bytes to clicks */
 #define btoc(x)         (((__psunsigned_t)(x)+(NBPC-1))>>BPCSHIFT)
 
-#ifndef ENOATTR
 #define ENOATTR		ENODATA		/* Attribute not found */
-#endif
-
-/* Note: EWRONGFS never visible outside the kernel */
-#define	EWRONGFS	EINVAL		/* Mount with wrong filesystem type */
-
-/*
- * XXX EFSCORRUPTED needs a real value in errno.h. asm-i386/errno.h won't
- *     return codes out of its known range in errno.
- * XXX Also note: needs to be < 1000 and fairly unique on Linux (mustn't
- *     conflict with any code we use already or any code a driver may use)
- * XXX Some options (currently we do #2):
- *	1/ New error code ["Filesystem is corrupted", _after_ glibc updated]
- *	2/ 990 ["Unknown error 990"]
- *	3/ EUCLEAN ["Structure needs cleaning"]
- *	4/ Convert EFSCORRUPTED to EIO [just prior to return into userspace]
- */
-#define EFSCORRUPTED    990		/* Filesystem is corrupted */
+#define EWRONGFS	EINVAL		/* Mount with wrong filesystem type */
+#define EFSCORRUPTED	EUCLEAN		/* Filesystem is corrupted */
 
 #define SYNCHRONIZE()	barrier()
 #define __return_address __builtin_return_address(0)
-- 
cgit v1.2.3


From f6c2d1fa6310a71b1c2e05fc6d9ff9b91489fa0e Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Tue, 20 Jun 2006 13:04:51 +1000
Subject: [XFS] Remove version 1 directory code.	Never functioned on Linux,
 just pure bloat.

SGI-PV: 952969
SGI-Modid: xfs-linux-melb:xfs-kern:26251a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/linux-2.6/xfs_aops.c    |    2 -
 fs/xfs/linux-2.6/xfs_export.c  |    1 -
 fs/xfs/linux-2.6/xfs_file.c    |    2 -
 fs/xfs/linux-2.6/xfs_ioctl.c   |    2 -
 fs/xfs/linux-2.6/xfs_iops.c    |    2 -
 fs/xfs/linux-2.6/xfs_lrw.c     |    2 -
 fs/xfs/linux-2.6/xfs_super.c   |    2 -
 fs/xfs/linux-2.6/xfs_vfs.c     |    1 -
 fs/xfs/quota/xfs_dquot.c       |    2 -
 fs/xfs/quota/xfs_dquot_item.c  |    2 -
 fs/xfs/quota/xfs_qm.c          |    2 -
 fs/xfs/quota/xfs_qm_bhv.c      |    2 -
 fs/xfs/quota/xfs_qm_stats.c    |    2 -
 fs/xfs/quota/xfs_qm_syscalls.c |    2 -
 fs/xfs/quota/xfs_trans_dquot.c |    2 -
 fs/xfs/xfs_acl.c               |    2 -
 fs/xfs/xfs_alloc.c             |    2 -
 fs/xfs/xfs_alloc_btree.c       |    2 -
 fs/xfs/xfs_attr.c              |    2 -
 fs/xfs/xfs_attr_leaf.c         |    2 -
 fs/xfs/xfs_bmap.c              |    8 +-
 fs/xfs/xfs_bmap_btree.c        |    2 -
 fs/xfs/xfs_btree.c             |    2 -
 fs/xfs/xfs_buf_item.c          |    1 -
 fs/xfs/xfs_da_btree.c          |  185 +---
 fs/xfs/xfs_da_btree.h          |    4 -
 fs/xfs/xfs_dfrag.c             |    2 -
 fs/xfs/xfs_dinode.h            |    1 -
 fs/xfs/xfs_dir.c               | 1213 ----------------------
 fs/xfs/xfs_dir.h               |  142 ---
 fs/xfs/xfs_dir2.c              |  391 +++----
 fs/xfs/xfs_dir2.h              |   32 +-
 fs/xfs/xfs_dir2_block.c        |   15 +-
 fs/xfs/xfs_dir2_data.c         |    3 -
 fs/xfs/xfs_dir2_leaf.c         |    2 -
 fs/xfs/xfs_dir2_node.c         |    2 -
 fs/xfs/xfs_dir2_sf.c           |    3 -
 fs/xfs/xfs_dir2_trace.c        |    2 -
 fs/xfs/xfs_dir_leaf.c          | 2230 ----------------------------------------
 fs/xfs/xfs_dir_leaf.h          |  231 -----
 fs/xfs/xfs_dir_sf.h            |  155 ---
 fs/xfs/xfs_dmops.c             |    1 -
 fs/xfs/xfs_error.c             |    2 -
 fs/xfs/xfs_extfree_item.c      |    1 -
 fs/xfs/xfs_fsops.c             |    2 -
 fs/xfs/xfs_ialloc.c            |    2 -
 fs/xfs/xfs_ialloc_btree.c      |    2 -
 fs/xfs/xfs_iget.c              |    2 -
 fs/xfs/xfs_inode.c             |    9 -
 fs/xfs/xfs_inode_item.c        |    2 -
 fs/xfs/xfs_iocore.c            |    2 -
 fs/xfs/xfs_iomap.c             |    2 -
 fs/xfs/xfs_itable.c            |    2 -
 fs/xfs/xfs_log.c               |    2 -
 fs/xfs/xfs_log_recover.c       |    2 -
 fs/xfs/xfs_mount.c             |   15 +-
 fs/xfs/xfs_mount.h             |    2 -
 fs/xfs/xfs_qmops.c             |    1 -
 fs/xfs/xfs_rename.c            |   54 +-
 fs/xfs/xfs_rtalloc.c           |    2 -
 fs/xfs/xfs_rw.c                |    2 -
 fs/xfs/xfs_trans.c             |    2 -
 fs/xfs/xfs_trans.h             |    7 +-
 fs/xfs/xfs_trans_ail.c         |    1 -
 fs/xfs/xfs_trans_buf.c         |    2 -
 fs/xfs/xfs_trans_extfree.c     |    1 -
 fs/xfs/xfs_trans_inode.c       |    2 -
 fs/xfs/xfs_trans_space.h       |   11 +-
 fs/xfs/xfs_utils.c             |    5 +-
 fs/xfs/xfs_vfsops.c            |    8 -
 fs/xfs/xfs_vnodeops.c          |   68 +-
 71 files changed, 285 insertions(+), 4595 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index c5ea26d73be..3e807b828e2 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -21,7 +21,6 @@
 #include "xfs_inum.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_trans.h"
 #include "xfs_dmapi.h"
@@ -29,7 +28,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index d32a9edc43a..5fb75d9151f 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -21,7 +21,6 @@
 #include "xfs_log.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_mount.h"
 #include "xfs_export.h"
 
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index cf65a8364d5..70662371bb1 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -21,7 +21,6 @@
 #include "xfs_inum.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_trans.h"
 #include "xfs_dmapi.h"
@@ -32,7 +31,6 @@
 #include "xfs_alloc.h"
 #include "xfs_btree.h"
 #include "xfs_attr_sf.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index a0e247c7132..6e52a5dd38d 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -31,7 +30,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 484daef91d7..12810baeb5d 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -32,7 +31,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 8e546870481..5d9cfd91ad0 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -32,7 +31,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 7fae922d54d..f2a0778536f 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -32,7 +31,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/linux-2.6/xfs_vfs.c b/fs/xfs/linux-2.6/xfs_vfs.c
index 4fc884bcb4f..6145e8bd0be 100644
--- a/fs/xfs/linux-2.6/xfs_vfs.c
+++ b/fs/xfs/linux-2.6/xfs_vfs.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_imap.h"
 #include "xfs_alloc.h"
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 46bec66bb4d..3aa77153185 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -32,7 +31,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 21ad5a55e01..5b2dcc58b24 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -32,7 +31,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 00fb54d4899..e23e45535c4 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -24,7 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -33,7 +32,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index d93d3a1064e..e95e99f7168 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -24,7 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -33,7 +32,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
index 0570f773355..6f858fb81a3 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -32,7 +31,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index c93072be679..ed620c4d159 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -26,7 +26,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -35,7 +34,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 9168918db25..0242e9666e8 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -33,7 +32,6 @@
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
 #include "xfs_attr_sf.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index e1074955386..4b0cb474be4 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -21,12 +21,10 @@
 #include "xfs_bit.h"
 #include "xfs_inum.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 22af489d3f3..eef6763f3a6 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index a1d92da86cc..7446556e802 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 4dcef8d1c32..1a210104327 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -27,7 +27,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
@@ -35,7 +34,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 5c44343d4a3..9455051f012 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -24,7 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
@@ -34,7 +33,6 @@
 #include "xfs_ialloc_btree.h"
 #include "xfs_alloc.h"
 #include "xfs_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 4d0ca14039a..3a613753906 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -24,13 +24,11 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -40,13 +38,15 @@
 #include "xfs_mount.h"
 #include "xfs_ialloc.h"
 #include "xfs_itable.h"
+#include "xfs_dir2_data.h"
+#include "xfs_dir2_leaf.h"
+#include "xfs_dir2_block.h"
 #include "xfs_inode_item.h"
 #include "xfs_extfree_item.h"
 #include "xfs_alloc.h"
 #include "xfs_bmap.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
-#include "xfs_dir_leaf.h"
 #include "xfs_attr_leaf.h"
 #include "xfs_rw.h"
 #include "xfs_quota.h"
@@ -516,7 +516,7 @@ xfs_bmap_add_attrfork_local(
 		dargs.total = mp->m_dirblkfsbs;
 		dargs.whichfork = XFS_DATA_FORK;
 		dargs.trans = tp;
-		error = XFS_DIR_SHORTFORM_TO_SINGLE(mp, &dargs);
+		error = xfs_dir2_sf_to_block(&dargs);
 	} else
 		error = xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags,
 			XFS_DATA_FORK);
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 3b6dfc9b53a..18fb7385d71 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 52d5d095fc3..ee2255bd656 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 290912cbff6..a4aa53974f7 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -23,7 +23,6 @@
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_buf_item.h"
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 260c3d770c0..32ab61d17ac 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -24,7 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
@@ -32,7 +31,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -43,7 +41,6 @@
 #include "xfs_bmap.h"
 #include "xfs_attr.h"
 #include "xfs_attr_leaf.h"
-#include "xfs_dir_leaf.h"
 #include "xfs_dir2_data.h"
 #include "xfs_dir2_leaf.h"
 #include "xfs_dir2_block.h"
@@ -159,7 +156,7 @@ xfs_da_split(xfs_da_state_t *state)
 	max = state->path.active - 1;
 	ASSERT((max >= 0) && (max < XFS_DA_NODE_MAXDEPTH));
 	ASSERT(state->path.blk[max].magic == XFS_ATTR_LEAF_MAGIC ||
-	       state->path.blk[max].magic == XFS_DIRX_LEAF_MAGIC(state->mp));
+	       state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
 
 	addblk = &state->path.blk[max];		/* initial dummy value */
 	for (i = max; (i >= 0) && addblk; state->path.active--, i--) {
@@ -199,38 +196,7 @@ xfs_da_split(xfs_da_state_t *state)
 				return(error);	/* GROT: attr inconsistent */
 			addblk = newblk;
 			break;
-		case XFS_DIR_LEAF_MAGIC:
-			ASSERT(XFS_DIR_IS_V1(state->mp));
-			error = xfs_dir_leaf_split(state, oldblk, newblk);
-			if ((error != 0) && (error != ENOSPC)) {
-				return(error);	/* GROT: dir is inconsistent */
-			}
-			if (!error) {
-				addblk = newblk;
-				break;
-			}
-			/*
-			 * Entry wouldn't fit, split the leaf again.
-			 */
-			state->extravalid = 1;
-			if (state->inleaf) {
-				state->extraafter = 0;	/* before newblk */
-				error = xfs_dir_leaf_split(state, oldblk,
-							   &state->extrablk);
-				if (error)
-					return(error);	/* GROT: dir incon. */
-				addblk = newblk;
-			} else {
-				state->extraafter = 1;	/* after newblk */
-				error = xfs_dir_leaf_split(state, newblk,
-							   &state->extrablk);
-				if (error)
-					return(error);	/* GROT: dir incon. */
-				addblk = newblk;
-			}
-			break;
 		case XFS_DIR2_LEAFN_MAGIC:
-			ASSERT(XFS_DIR_IS_V2(state->mp));
 			error = xfs_dir2_leafn_split(state, oldblk, newblk);
 			if (error)
 				return error;
@@ -363,7 +329,6 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 		size = (int)((char *)&oldroot->btree[be16_to_cpu(oldroot->hdr.count)] -
 			     (char *)oldroot);
 	} else {
-		ASSERT(XFS_DIR_IS_V2(mp));
 		ASSERT(be16_to_cpu(oldroot->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC);
 		leaf = (xfs_dir2_leaf_t *)oldroot;
 		size = (int)((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] -
@@ -379,8 +344,7 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 	 * Set up the new root node.
 	 */
 	error = xfs_da_node_create(args,
-		args->whichfork == XFS_DATA_FORK &&
-		XFS_DIR_IS_V2(mp) ? mp->m_dirleafblk : 0,
+		(args->whichfork == XFS_DATA_FORK) ? mp->m_dirleafblk : 0,
 		be16_to_cpu(node->hdr.level) + 1, &bp, args->whichfork);
 	if (error)
 		return(error);
@@ -427,10 +391,9 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
 	ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC);
 
 	/*
-	 * With V2 the extra block is data or freespace.
+	 * With V2 dirs the extra block is data or freespace.
 	 */
-	useextra = state->extravalid && (XFS_DIR_IS_V1(state->mp) ||
-			state->args->whichfork == XFS_ATTR_FORK);
+	useextra = state->extravalid && state->args->whichfork == XFS_ATTR_FORK;
 	newcount = 1 + useextra;
 	/*
 	 * Do we have to split the node?
@@ -624,7 +587,7 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
 	ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC);
 	ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count)));
 	ASSERT(newblk->blkno != 0);
-	if (state->args->whichfork == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
+	if (state->args->whichfork == XFS_DATA_FORK)
 		ASSERT(newblk->blkno >= mp->m_dirleafblk &&
 		       newblk->blkno < mp->m_dirfreeblk);
 
@@ -670,7 +633,7 @@ xfs_da_join(xfs_da_state_t *state)
 	save_blk = &state->altpath.blk[ state->path.active-1 ];
 	ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC);
 	ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC ||
-	       drop_blk->magic == XFS_DIRX_LEAF_MAGIC(state->mp));
+	       drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
 
 	/*
 	 * Walk back up the tree joining/deallocating as necessary.
@@ -693,17 +656,7 @@ xfs_da_join(xfs_da_state_t *state)
 				return(0);
 			xfs_attr_leaf_unbalance(state, drop_blk, save_blk);
 			break;
-		case XFS_DIR_LEAF_MAGIC:
-			ASSERT(XFS_DIR_IS_V1(state->mp));
-			error = xfs_dir_leaf_toosmall(state, &action);
-			if (error)
-				return(error);
-			if (action == 0)
-				return(0);
-			xfs_dir_leaf_unbalance(state, drop_blk, save_blk);
-			break;
 		case XFS_DIR2_LEAFN_MAGIC:
-			ASSERT(XFS_DIR_IS_V2(state->mp));
 			error = xfs_dir2_leafn_toosmall(state, &action);
 			if (error)
 				return error;
@@ -790,7 +743,7 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
 	ASSERT(bp != NULL);
 	blkinfo = bp->data;
 	if (be16_to_cpu(oldroot->hdr.level) == 1) {
-		ASSERT(be16_to_cpu(blkinfo->magic) == XFS_DIRX_LEAF_MAGIC(state->mp) ||
+		ASSERT(be16_to_cpu(blkinfo->magic) == XFS_DIR2_LEAFN_MAGIC ||
 		       be16_to_cpu(blkinfo->magic) == XFS_ATTR_LEAF_MAGIC);
 	} else {
 		ASSERT(be16_to_cpu(blkinfo->magic) == XFS_DA_NODE_MAGIC);
@@ -951,14 +904,7 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
 		if (count == 0)
 			return;
 		break;
-	case XFS_DIR_LEAF_MAGIC:
-		ASSERT(XFS_DIR_IS_V1(state->mp));
-		lasthash = xfs_dir_leaf_lasthash(blk->bp, &count);
-		if (count == 0)
-			return;
-		break;
 	case XFS_DIR2_LEAFN_MAGIC:
-		ASSERT(XFS_DIR_IS_V2(state->mp));
 		lasthash = xfs_dir2_leafn_lasthash(blk->bp, &count);
 		if (count == 0)
 			return;
@@ -1117,10 +1063,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
 	 * Descend thru the B-tree searching each level for the right
 	 * node to use, until the right hashval is found.
 	 */
-	if (args->whichfork == XFS_DATA_FORK && XFS_DIR_IS_V2(state->mp))
-		blkno = state->mp->m_dirleafblk;
-	else
-		blkno = 0;
+	blkno = (args->whichfork == XFS_DATA_FORK)? state->mp->m_dirleafblk : 0;
 	for (blk = &state->path.blk[0], state->path.active = 1;
 			 state->path.active <= XFS_DA_NODE_MAXDEPTH;
 			 blk++, state->path.active++) {
@@ -1137,7 +1080,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
 		}
 		curr = blk->bp->data;
 		ASSERT(be16_to_cpu(curr->magic) == XFS_DA_NODE_MAGIC ||
-		       be16_to_cpu(curr->magic) == XFS_DIRX_LEAF_MAGIC(state->mp) ||
+		       be16_to_cpu(curr->magic) == XFS_DIR2_LEAFN_MAGIC ||
 		       be16_to_cpu(curr->magic) == XFS_ATTR_LEAF_MAGIC);
 
 		/*
@@ -1190,16 +1133,10 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
 				blk->index = probe;
 				blkno = be32_to_cpu(btree->before);
 			}
-		}
-		else if (be16_to_cpu(curr->magic) == XFS_ATTR_LEAF_MAGIC) {
+		} else if (be16_to_cpu(curr->magic) == XFS_ATTR_LEAF_MAGIC) {
 			blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
 			break;
-		}
-		else if (be16_to_cpu(curr->magic) == XFS_DIR_LEAF_MAGIC) {
-			blk->hashval = xfs_dir_leaf_lasthash(blk->bp, NULL);
-			break;
-		}
-		else if (be16_to_cpu(curr->magic) == XFS_DIR2_LEAFN_MAGIC) {
+		} else if (be16_to_cpu(curr->magic) == XFS_DIR2_LEAFN_MAGIC) {
 			blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL);
 			break;
 		}
@@ -1212,12 +1149,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
 	 * next leaf and keep searching.
 	 */
 	for (;;) {
-		if (blk->magic == XFS_DIR_LEAF_MAGIC) {
-			ASSERT(XFS_DIR_IS_V1(state->mp));
-			retval = xfs_dir_leaf_lookup_int(blk->bp, args,
-								  &blk->index);
-		} else if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
-			ASSERT(XFS_DIR_IS_V2(state->mp));
+		if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
 			retval = xfs_dir2_leafn_lookup_int(blk->bp, args,
 							&blk->index, state);
 		}
@@ -1270,7 +1202,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
 	old_info = old_blk->bp->data;
 	new_info = new_blk->bp->data;
 	ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC ||
-	       old_blk->magic == XFS_DIRX_LEAF_MAGIC(state->mp) ||
+	       old_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
 	       old_blk->magic == XFS_ATTR_LEAF_MAGIC);
 	ASSERT(old_blk->magic == be16_to_cpu(old_info->magic));
 	ASSERT(new_blk->magic == be16_to_cpu(new_info->magic));
@@ -1280,12 +1212,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
 	case XFS_ATTR_LEAF_MAGIC:
 		before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp);
 		break;
-	case XFS_DIR_LEAF_MAGIC:
-		ASSERT(XFS_DIR_IS_V1(state->mp));
-		before = xfs_dir_leaf_order(old_blk->bp, new_blk->bp);
-		break;
 	case XFS_DIR2_LEAFN_MAGIC:
-		ASSERT(XFS_DIR_IS_V2(state->mp));
 		before = xfs_dir2_leafn_order(old_blk->bp, new_blk->bp);
 		break;
 	case XFS_DA_NODE_MAGIC:
@@ -1404,7 +1331,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
 	save_info = save_blk->bp->data;
 	drop_info = drop_blk->bp->data;
 	ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC ||
-	       save_blk->magic == XFS_DIRX_LEAF_MAGIC(state->mp) ||
+	       save_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
 	       save_blk->magic == XFS_ATTR_LEAF_MAGIC);
 	ASSERT(save_blk->magic == be16_to_cpu(save_info->magic));
 	ASSERT(drop_blk->magic == be16_to_cpu(drop_info->magic));
@@ -1529,7 +1456,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
 		ASSERT(blk->bp != NULL);
 		info = blk->bp->data;
 		ASSERT(be16_to_cpu(info->magic) == XFS_DA_NODE_MAGIC ||
-		       be16_to_cpu(info->magic) == XFS_DIRX_LEAF_MAGIC(state->mp) ||
+		       be16_to_cpu(info->magic) == XFS_DIR2_LEAFN_MAGIC ||
 		       be16_to_cpu(info->magic) == XFS_ATTR_LEAF_MAGIC);
 		blk->magic = be16_to_cpu(info->magic);
 		if (blk->magic == XFS_DA_NODE_MAGIC) {
@@ -1548,20 +1475,13 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
 				blk->hashval = xfs_attr_leaf_lasthash(blk->bp,
 								      NULL);
 				break;
-			case XFS_DIR_LEAF_MAGIC:
-				ASSERT(XFS_DIR_IS_V1(state->mp));
-				blk->hashval = xfs_dir_leaf_lasthash(blk->bp,
-								     NULL);
-				break;
 			case XFS_DIR2_LEAFN_MAGIC:
-				ASSERT(XFS_DIR_IS_V2(state->mp));
 				blk->hashval = xfs_dir2_leafn_lasthash(blk->bp,
 								       NULL);
 				break;
 			default:
 				ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC ||
-				       blk->magic ==
-				       XFS_DIRX_LEAF_MAGIC(state->mp));
+				       blk->magic == XFS_DIR2_LEAFN_MAGIC);
 				break;
 			}
 		}
@@ -1620,7 +1540,6 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
 	xfs_bmbt_irec_t	*mapp;
 	xfs_inode_t *dp;
 	int nmap, error, w, count, c, got, i, mapi;
-	xfs_fsize_t size;
 	xfs_trans_t *tp;
 	xfs_mount_t *mp;
 
@@ -1631,7 +1550,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
 	/*
 	 * For new directories adjust the file offset and block count.
 	 */
-	if (w == XFS_DATA_FORK && XFS_DIR_IS_V2(mp)) {
+	if (w == XFS_DATA_FORK) {
 		bno = mp->m_dirleafblk;
 		count = mp->m_dirblkfsbs;
 	} else {
@@ -1641,10 +1560,9 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
 	/*
 	 * Find a spot in the file space to put the new block.
 	 */
-	if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, w))) {
+	if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, w)))
 		return error;
-	}
-	if (w == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
+	if (w == XFS_DATA_FORK)
 		ASSERT(bno >= mp->m_dirleafblk && bno < mp->m_dirfreeblk);
 	/*
 	 * Try mapping it in one filesystem block.
@@ -1706,19 +1624,6 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
 	if (mapp != &map)
 		kmem_free(mapp, sizeof(*mapp) * count);
 	*new_blkno = (xfs_dablk_t)bno;
-	/*
-	 * For version 1 directories, adjust the file size if it changed.
-	 */
-	if (w == XFS_DATA_FORK && XFS_DIR_IS_V1(mp)) {
-		ASSERT(mapi == 1);
-		if ((error = xfs_bmap_last_offset(tp, dp, &bno, w)))
-			return error;
-		size = XFS_FSB_TO_B(mp, bno);
-		if (size != dp->i_d.di_size) {
-			dp->i_d.di_size = size;
-			xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
-		}
-	}
 	return 0;
 }
 
@@ -1743,7 +1648,6 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
 	int error, w, entno, level, dead_level;
 	xfs_da_blkinfo_t *dead_info, *sib_info;
 	xfs_da_intnode_t *par_node, *dead_node;
-	xfs_dir_leafblock_t *dead_leaf;
 	xfs_dir2_leaf_t *dead_leaf2;
 	xfs_dahash_t dead_hash;
 
@@ -1754,11 +1658,8 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
 	w = args->whichfork;
 	ASSERT(w == XFS_DATA_FORK);
 	mp = ip->i_mount;
-	if (XFS_DIR_IS_V2(mp)) {
-		lastoff = mp->m_dirfreeblk;
-		error = xfs_bmap_last_before(tp, ip, &lastoff, w);
-	} else
-		error = xfs_bmap_last_offset(tp, ip, &lastoff, w);
+	lastoff = mp->m_dirfreeblk;
+	error = xfs_bmap_last_before(tp, ip, &lastoff, w);
 	if (error)
 		return error;
 	if (unlikely(lastoff == 0)) {
@@ -1781,14 +1682,7 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
 	/*
 	 * Get values from the moved block.
 	 */
-	if (be16_to_cpu(dead_info->magic) == XFS_DIR_LEAF_MAGIC) {
-		ASSERT(XFS_DIR_IS_V1(mp));
-		dead_leaf = (xfs_dir_leafblock_t *)dead_info;
-		dead_level = 0;
-		dead_hash = be32_to_cpu(dead_leaf->entries[
-				be16_to_cpu(dead_leaf->hdr.count) - 1].hashval);
-	} else if (be16_to_cpu(dead_info->magic) == XFS_DIR2_LEAFN_MAGIC) {
-		ASSERT(XFS_DIR_IS_V2(mp));
+	if (be16_to_cpu(dead_info->magic) == XFS_DIR2_LEAFN_MAGIC) {
 		dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
 		dead_level = 0;
 		dead_hash = be32_to_cpu(dead_leaf2->ents[be16_to_cpu(dead_leaf2->hdr.count) - 1].hashval);
@@ -1843,7 +1737,7 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
 		xfs_da_buf_done(sib_buf);
 		sib_buf = NULL;
 	}
-	par_blkno = XFS_DIR_IS_V1(mp) ? 0 : mp->m_dirleafblk;
+	par_blkno = mp->m_dirleafblk;
 	level = -1;
 	/*
 	 * Walk down the tree looking for the parent of the moved block.
@@ -1942,8 +1836,6 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
 {
 	xfs_inode_t *dp;
 	int done, error, w, count;
-	xfs_fileoff_t bno;
-	xfs_fsize_t size;
 	xfs_trans_t *tp;
 	xfs_mount_t *mp;
 
@@ -1951,7 +1843,7 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
 	w = args->whichfork;
 	tp = args->trans;
 	mp = dp->i_mount;
-	if (w == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
+	if (w == XFS_DATA_FORK)
 		count = mp->m_dirblkfsbs;
 	else
 		count = 1;
@@ -1965,31 +1857,14 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
 				0, args->firstblock, args->flist, NULL,
 				&done)) == ENOSPC) {
 			if (w != XFS_DATA_FORK)
-				goto done;
+				break;
 			if ((error = xfs_da_swap_lastblock(args, &dead_blkno,
 					&dead_buf)))
-				goto done;
-		} else if (error)
-			goto done;
-		else
+				break;
+		} else {
 			break;
-	}
-	ASSERT(done);
-	xfs_da_binval(tp, dead_buf);
-	/*
-	 * Adjust the directory size for version 1.
-	 */
-	if (w == XFS_DATA_FORK && XFS_DIR_IS_V1(mp)) {
-		if ((error = xfs_bmap_last_offset(tp, dp, &bno, w)))
-			return error;
-		size = XFS_FSB_TO_B(dp->i_mount, bno);
-		if (size != dp->i_d.di_size) {
-			dp->i_d.di_size = size;
-			xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 		}
 	}
-	return 0;
-done:
 	xfs_da_binval(tp, dead_buf);
 	return error;
 }
@@ -2050,10 +1925,7 @@ xfs_da_do_buf(
 	xfs_dabuf_t	*rbp;
 
 	mp = dp->i_mount;
-	if (whichfork == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
-		nfsb = mp->m_dirblkfsbs;
-	else
-		nfsb = 1;
+	nfsb = (whichfork == XFS_DATA_FORK) ? mp->m_dirblkfsbs : 1;
 	mappedbno = *mappedbnop;
 	/*
 	 * Caller doesn't have a mapping.  -2 means don't complain
@@ -2199,7 +2071,6 @@ xfs_da_do_buf(
 		magic1 = be32_to_cpu(data->hdr.magic);
 		if (unlikely(
 		    XFS_TEST_ERROR((magic != XFS_DA_NODE_MAGIC) &&
-				   (magic != XFS_DIR_LEAF_MAGIC) &&
 				   (magic != XFS_ATTR_LEAF_MAGIC) &&
 				   (magic != XFS_DIR2_LEAF1_MAGIC) &&
 				   (magic != XFS_DIR2_LEAFN_MAGIC) &&
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 243a730d5ec..4ab865ec8b8 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -36,14 +36,10 @@ struct zone;
  * level in the Btree, and to identify which type of block this is.
  */
 #define XFS_DA_NODE_MAGIC	0xfebe	/* magic number: non-leaf blocks */
-#define XFS_DIR_LEAF_MAGIC	0xfeeb	/* magic number: directory leaf blks */
 #define XFS_ATTR_LEAF_MAGIC	0xfbee	/* magic number: attribute leaf blks */
 #define	XFS_DIR2_LEAF1_MAGIC	0xd2f1	/* magic number: v2 dirlf single blks */
 #define	XFS_DIR2_LEAFN_MAGIC	0xd2ff	/* magic number: v2 dirlf multi blks */
 
-#define	XFS_DIRX_LEAF_MAGIC(mp)	\
-	(XFS_DIR_IS_V1(mp) ? XFS_DIR_LEAF_MAGIC : XFS_DIR2_LEAFN_MAGIC)
-
 typedef struct xfs_da_blkinfo {
 	__be32		forw;			/* previous block in list */
 	__be32		back;			/* following block in list */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 29a6c866f2c..80562b60fb9 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index 77d53778258..b33826961c4 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -85,7 +85,6 @@ typedef struct xfs_dinode
 	union {
 		xfs_bmdr_block_t di_bmbt;	/* btree root block */
 		xfs_bmbt_rec_32_t di_bmx[1];	/* extent list */
-		xfs_dir_shortform_t di_dirsf;	/* shortform directory */
 		xfs_dir2_sf_t	di_dir2sf;	/* shortform directory v2 */
 		char		di_c[1];	/* local contents */
 		xfs_dev_t	di_dev;		/* device for S_IFCHR/S_IFBLK */
diff --git a/fs/xfs/xfs_dir.c b/fs/xfs/xfs_dir.c
index 3cd8657a81f..e69de29bb2d 100644
--- a/fs/xfs/xfs_dir.c
+++ b/fs/xfs/xfs_dir.c
@@ -1,1213 +0,0 @@
-/*
- * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
-#include "xfs_mount.h"
-#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_bmap.h"
-#include "xfs_dir_leaf.h"
-#include "xfs_error.h"
-
-/*
- * xfs_dir.c
- *
- * Provide the external interfaces to manage directories.
- */
-
-/*========================================================================
- * Function prototypes for the kernel.
- *========================================================================*/
-
-/*
- * Functions for the dirops interfaces.
- */
-static void	xfs_dir_mount(struct xfs_mount *mp);
-
-static int	xfs_dir_isempty(struct xfs_inode *dp);
-
-static int	xfs_dir_init(struct xfs_trans *trans,
-			     struct xfs_inode *dir,
-			     struct xfs_inode *parent_dir);
-
-static int	xfs_dir_createname(struct xfs_trans *trans,
-				   struct xfs_inode *dp,
-				   char *name_string,
-				   int name_len,
-				   xfs_ino_t inode_number,
-				   xfs_fsblock_t *firstblock,
-				   xfs_bmap_free_t *flist,
-				   xfs_extlen_t total);
-
-static int	xfs_dir_lookup(struct xfs_trans *tp,
-			       struct xfs_inode *dp,
-			       char *name_string,
-			       int name_length,
-			       xfs_ino_t *inode_number);
-
-static int	xfs_dir_removename(struct xfs_trans *trans,
-				   struct xfs_inode *dp,
-				   char *name_string,
-				   int name_length,
-				   xfs_ino_t ino,
-				   xfs_fsblock_t *firstblock,
-				   xfs_bmap_free_t *flist,
-				   xfs_extlen_t total);
-
-static int	xfs_dir_getdents(struct xfs_trans *tp,
-				 struct xfs_inode *dp,
-				 struct uio *uiop,
-				 int *eofp);
-
-static int	xfs_dir_replace(struct xfs_trans *tp,
-				struct xfs_inode *dp,
-				char *name_string,
-				int name_length,
-				xfs_ino_t inode_number,
-				xfs_fsblock_t *firstblock,
-				xfs_bmap_free_t *flist,
-				xfs_extlen_t total);
-
-static int	xfs_dir_canenter(struct xfs_trans *tp,
-				 struct xfs_inode *dp,
-				 char *name_string,
-				 int name_length);
-
-static int	xfs_dir_shortform_validate_ondisk(xfs_mount_t *mp,
-						  xfs_dinode_t *dip);
-
-xfs_dirops_t xfsv1_dirops = {
-	.xd_mount			= xfs_dir_mount,
-	.xd_isempty			= xfs_dir_isempty,
-	.xd_init			= xfs_dir_init,
-	.xd_createname			= xfs_dir_createname,
-	.xd_lookup			= xfs_dir_lookup,
-	.xd_removename			= xfs_dir_removename,
-	.xd_getdents			= xfs_dir_getdents,
-	.xd_replace			= xfs_dir_replace,
-	.xd_canenter			= xfs_dir_canenter,
-	.xd_shortform_validate_ondisk	= xfs_dir_shortform_validate_ondisk,
-	.xd_shortform_to_single		= xfs_dir_shortform_to_leaf,
-};
-
-/*
- * Internal routines when dirsize == XFS_LBSIZE(mp).
- */
-STATIC int xfs_dir_leaf_lookup(xfs_da_args_t *args);
-STATIC int xfs_dir_leaf_removename(xfs_da_args_t *args, int *number_entries,
-						 int *total_namebytes);
-STATIC int xfs_dir_leaf_getdents(xfs_trans_t *trans, xfs_inode_t *dp,
-					     uio_t *uio, int *eofp,
-					     xfs_dirent_t *dbp,
-					     xfs_dir_put_t put);
-STATIC int xfs_dir_leaf_replace(xfs_da_args_t *args);
-
-/*
- * Internal routines when dirsize > XFS_LBSIZE(mp).
- */
-STATIC int xfs_dir_node_addname(xfs_da_args_t *args);
-STATIC int xfs_dir_node_lookup(xfs_da_args_t *args);
-STATIC int xfs_dir_node_removename(xfs_da_args_t *args);
-STATIC int xfs_dir_node_getdents(xfs_trans_t *trans, xfs_inode_t *dp,
-					     uio_t *uio, int *eofp,
-					     xfs_dirent_t *dbp,
-					     xfs_dir_put_t put);
-STATIC int xfs_dir_node_replace(xfs_da_args_t *args);
-
-#if defined(XFS_DIR_TRACE)
-ktrace_t *xfs_dir_trace_buf;
-#endif
-
-
-/*========================================================================
- * Overall external interface routines.
- *========================================================================*/
-
-xfs_dahash_t	xfs_dir_hash_dot, xfs_dir_hash_dotdot;
-
-/*
- * One-time startup routine called from xfs_init().
- */
-void
-xfs_dir_startup(void)
-{
-	xfs_dir_hash_dot = xfs_da_hashname(".", 1);
-	xfs_dir_hash_dotdot = xfs_da_hashname("..", 2);
-}
-
-/*
- * Initialize directory-related fields in the mount structure.
- */
-static void
-xfs_dir_mount(xfs_mount_t *mp)
-{
-	uint shortcount, leafcount, count;
-
-	mp->m_dirversion = 1;
-	if (!(mp->m_flags & XFS_MOUNT_ATTR2)) {
-		shortcount = (mp->m_attroffset -
-				(uint)sizeof(xfs_dir_sf_hdr_t)) /
-				 (uint)sizeof(xfs_dir_sf_entry_t);
-		leafcount = (XFS_LBSIZE(mp) -
-				(uint)sizeof(xfs_dir_leaf_hdr_t)) /
-				 ((uint)sizeof(xfs_dir_leaf_entry_t) +
-				  (uint)sizeof(xfs_dir_leaf_name_t));
-	} else {
-		shortcount = (XFS_BMDR_SPACE_CALC(MINABTPTRS) -
-			      (uint)sizeof(xfs_dir_sf_hdr_t)) /
-			       (uint)sizeof(xfs_dir_sf_entry_t);
-		leafcount = (XFS_LBSIZE(mp) -
-			    (uint)sizeof(xfs_dir_leaf_hdr_t)) /
-			     ((uint)sizeof(xfs_dir_leaf_entry_t) +
-			      (uint)sizeof(xfs_dir_leaf_name_t));
-	}
-	count = shortcount > leafcount ? shortcount : leafcount;
-	mp->m_dircook_elog = xfs_da_log2_roundup(count + 1);
-	ASSERT(mp->m_dircook_elog <= mp->m_sb.sb_blocklog);
-	mp->m_dir_node_ents = mp->m_attr_node_ents =
-		(XFS_LBSIZE(mp) - (uint)sizeof(xfs_da_node_hdr_t)) /
-		(uint)sizeof(xfs_da_node_entry_t);
-	mp->m_dir_magicpct = (XFS_LBSIZE(mp) * 37) / 100;
-	mp->m_dirblksize = mp->m_sb.sb_blocksize;
-	mp->m_dirblkfsbs = 1;
-}
-
-/*
- * Return 1 if directory contains only "." and "..".
- */
-static int
-xfs_dir_isempty(xfs_inode_t *dp)
-{
-	xfs_dir_sf_hdr_t *hdr;
-
-	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-	if (dp->i_d.di_size == 0)
-		return(1);
-	if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp))
-		return(0);
-	hdr = (xfs_dir_sf_hdr_t *)dp->i_df.if_u1.if_data;
-	return(hdr->count == 0);
-}
-
-/*
- * Initialize a directory with its "." and ".." entries.
- */
-static int
-xfs_dir_init(xfs_trans_t *trans, xfs_inode_t *dir, xfs_inode_t *parent_dir)
-{
-	xfs_da_args_t args;
-	int error;
-
-	memset((char *)&args, 0, sizeof(args));
-	args.dp = dir;
-	args.trans = trans;
-
-	ASSERT((dir->i_d.di_mode & S_IFMT) == S_IFDIR);
-	if ((error = xfs_dir_ino_validate(trans->t_mountp, parent_dir->i_ino)))
-		return error;
-
-	return(xfs_dir_shortform_create(&args, parent_dir->i_ino));
-}
-
-/*
- * Generic handler routine to add a name to a directory.
- * Transitions directory from shortform to Btree as necessary.
- */
-static int							/* error */
-xfs_dir_createname(xfs_trans_t *trans, xfs_inode_t *dp, char *name,
-		   int namelen, xfs_ino_t inum, xfs_fsblock_t *firstblock,
-		   xfs_bmap_free_t *flist, xfs_extlen_t total)
-{
-	xfs_da_args_t args;
-	int retval, newsize, done;
-
-	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-
-	if ((retval = xfs_dir_ino_validate(trans->t_mountp, inum)))
-		return (retval);
-
-	XFS_STATS_INC(xs_dir_create);
-	/*
-	 * Fill in the arg structure for this request.
-	 */
-	args.name = name;
-	args.namelen = namelen;
-	args.hashval = xfs_da_hashname(name, namelen);
-	args.inumber = inum;
-	args.dp = dp;
-	args.firstblock = firstblock;
-	args.flist = flist;
-	args.total = total;
-	args.whichfork = XFS_DATA_FORK;
-	args.trans = trans;
-	args.justcheck = 0;
-	args.addname = args.oknoent = 1;
-
-	/*
-	 * Decide on what work routines to call based on the inode size.
-	 */
-	done = 0;
-	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-		newsize = XFS_DIR_SF_ENTSIZE_BYNAME(args.namelen);
-		if ((dp->i_d.di_size + newsize) <= XFS_IFORK_DSIZE(dp)) {
-			retval = xfs_dir_shortform_addname(&args);
-			done = 1;
-		} else {
-			if (total == 0)
-				return XFS_ERROR(ENOSPC);
-			retval = xfs_dir_shortform_to_leaf(&args);
-			done = retval != 0;
-		}
-	}
-	if (!done && xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
-		retval = xfs_dir_leaf_addname(&args);
-		done = retval != ENOSPC;
-		if (!done) {
-			if (total == 0)
-				return XFS_ERROR(ENOSPC);
-			retval = xfs_dir_leaf_to_node(&args);
-			done = retval != 0;
-		}
-	}
-	if (!done) {
-		retval = xfs_dir_node_addname(&args);
-	}
-	return(retval);
-}
-
-/*
- * Generic handler routine to check if a name can be added to a directory,
- * without adding any blocks to the directory.
- */
-static int							/* error */
-xfs_dir_canenter(xfs_trans_t *trans, xfs_inode_t *dp, char *name, int namelen)
-{
-	xfs_da_args_t args;
-	int retval, newsize;
-
-	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-	/*
-	 * Fill in the arg structure for this request.
-	 */
-	args.name = name;
-	args.namelen = namelen;
-	args.hashval = xfs_da_hashname(name, namelen);
-	args.inumber = 0;
-	args.dp = dp;
-	args.firstblock = NULL;
-	args.flist = NULL;
-	args.total = 0;
-	args.whichfork = XFS_DATA_FORK;
-	args.trans = trans;
-	args.justcheck = args.addname = args.oknoent = 1;
-
-	/*
-	 * Decide on what work routines to call based on the inode size.
-	 */
-	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-		newsize = XFS_DIR_SF_ENTSIZE_BYNAME(args.namelen);
-		if ((dp->i_d.di_size + newsize) <= XFS_IFORK_DSIZE(dp))
-			retval = 0;
-		else
-			retval = XFS_ERROR(ENOSPC);
-	} else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
-		retval = xfs_dir_leaf_addname(&args);
-	} else {
-		retval = xfs_dir_node_addname(&args);
-	}
-	return(retval);
-}
-
-/*
- * Generic handler routine to remove a name from a directory.
- * Transitions directory from Btree to shortform as necessary.
- */
-static int							/* error */
-xfs_dir_removename(xfs_trans_t *trans, xfs_inode_t *dp, char *name,
-		   int namelen, xfs_ino_t ino, xfs_fsblock_t *firstblock,
-		   xfs_bmap_free_t *flist, xfs_extlen_t total)
-{
-	xfs_da_args_t args;
-	int count, totallen, newsize, retval;
-
-	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-	XFS_STATS_INC(xs_dir_remove);
-	/*
-	 * Fill in the arg structure for this request.
-	 */
-	args.name = name;
-	args.namelen = namelen;
-	args.hashval = xfs_da_hashname(name, namelen);
-	args.inumber = ino;
-	args.dp = dp;
-	args.firstblock = firstblock;
-	args.flist = flist;
-	args.total = total;
-	args.whichfork = XFS_DATA_FORK;
-	args.trans = trans;
-	args.justcheck = args.addname = args.oknoent = 0;
-
-	/*
-	 * Decide on what work routines to call based on the inode size.
-	 */
-	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-		retval = xfs_dir_shortform_removename(&args);
-	} else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
-		retval = xfs_dir_leaf_removename(&args, &count, &totallen);
-		if (retval == 0) {
-			newsize = XFS_DIR_SF_ALLFIT(count, totallen);
-			if (newsize <= XFS_IFORK_DSIZE(dp)) {
-				retval = xfs_dir_leaf_to_shortform(&args);
-			}
-		}
-	} else {
-		retval = xfs_dir_node_removename(&args);
-	}
-	return(retval);
-}
-
-static int							/* error */
-xfs_dir_lookup(xfs_trans_t *trans, xfs_inode_t *dp, char *name, int namelen,
-				   xfs_ino_t *inum)
-{
-	xfs_da_args_t args;
-	int retval;
-
-	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-
-	XFS_STATS_INC(xs_dir_lookup);
-	/*
-	 * Fill in the arg structure for this request.
-	 */
-	args.name = name;
-	args.namelen = namelen;
-	args.hashval = xfs_da_hashname(name, namelen);
-	args.inumber = 0;
-	args.dp = dp;
-	args.firstblock = NULL;
-	args.flist = NULL;
-	args.total = 0;
-	args.whichfork = XFS_DATA_FORK;
-	args.trans = trans;
-	args.justcheck = args.addname = 0;
-	args.oknoent = 1;
-
-	/*
-	 * Decide on what work routines to call based on the inode size.
-	 */
-	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-		retval = xfs_dir_shortform_lookup(&args);
-	} else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
-		retval = xfs_dir_leaf_lookup(&args);
-	} else {
-		retval = xfs_dir_node_lookup(&args);
-	}
-	if (retval == EEXIST)
-		retval = 0;
-	*inum = args.inumber;
-	return(retval);
-}
-
-/*
- * Implement readdir.
- */
-static int							/* error */
-xfs_dir_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio, int *eofp)
-{
-	xfs_dirent_t *dbp;
-	int  alignment, retval;
-	xfs_dir_put_t put;
-
-	XFS_STATS_INC(xs_dir_getdents);
-	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-
-	/*
-	 * If our caller has given us a single contiguous memory buffer,
-	 * just work directly within that buffer.  If it's in user memory,
-	 * lock it down first.
-	 */
-	alignment = sizeof(xfs_off_t) - 1;
-	if ((uio->uio_iovcnt == 1) &&
-	    (((__psint_t)uio->uio_iov[0].iov_base & alignment) == 0) &&
-	    ((uio->uio_iov[0].iov_len & alignment) == 0)) {
-		dbp = NULL;
-		put = xfs_dir_put_dirent64_direct;
-	} else {
-		dbp = kmem_alloc(sizeof(*dbp) + MAXNAMELEN, KM_SLEEP);
-		put = xfs_dir_put_dirent64_uio;
-	}
-
-	/*
-	 * Decide on what work routines to call based on the inode size.
-	 */
-	*eofp = 0;
-
-	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-		retval = xfs_dir_shortform_getdents(dp, uio, eofp, dbp, put);
-	} else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
-		retval = xfs_dir_leaf_getdents(trans, dp, uio, eofp, dbp, put);
-	} else {
-		retval = xfs_dir_node_getdents(trans, dp, uio, eofp, dbp, put);
-	}
-	if (dbp != NULL)
-		kmem_free(dbp, sizeof(*dbp) + MAXNAMELEN);
-
-	return(retval);
-}
-
-static int							/* error */
-xfs_dir_replace(xfs_trans_t *trans, xfs_inode_t *dp, char *name, int namelen,
-				    xfs_ino_t inum, xfs_fsblock_t *firstblock,
-				    xfs_bmap_free_t *flist, xfs_extlen_t total)
-{
-	xfs_da_args_t args;
-	int retval;
-
-	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-
-	if ((retval = xfs_dir_ino_validate(trans->t_mountp, inum)))
-		return retval;
-
-	/*
-	 * Fill in the arg structure for this request.
-	 */
-	args.name = name;
-	args.namelen = namelen;
-	args.hashval = xfs_da_hashname(name, namelen);
-	args.inumber = inum;
-	args.dp = dp;
-	args.firstblock = firstblock;
-	args.flist = flist;
-	args.total = total;
-	args.whichfork = XFS_DATA_FORK;
-	args.trans = trans;
-	args.justcheck = args.addname = args.oknoent = 0;
-
-	/*
-	 * Decide on what work routines to call based on the inode size.
-	 */
-	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
-		retval = xfs_dir_shortform_replace(&args);
-	} else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
-		retval = xfs_dir_leaf_replace(&args);
-	} else {
-		retval = xfs_dir_node_replace(&args);
-	}
-
-	return(retval);
-}
-
-static int
-xfs_dir_shortform_validate_ondisk(xfs_mount_t *mp, xfs_dinode_t *dp)
-{
-	xfs_ino_t		ino;
-	int			namelen_sum;
-	int			count;
-	xfs_dir_shortform_t	*sf;
-	xfs_dir_sf_entry_t	*sfe;
-	int			i;
-
-
-
-	if ((INT_GET(dp->di_core.di_mode, ARCH_CONVERT) & S_IFMT) != S_IFDIR) {
-		return 0;
-	}
-	if (INT_GET(dp->di_core.di_format, ARCH_CONVERT) != XFS_DINODE_FMT_LOCAL) {
-		return 0;
-	}
-	if (INT_GET(dp->di_core.di_size, ARCH_CONVERT) < sizeof(sf->hdr)) {
-		xfs_fs_cmn_err(CE_WARN, mp, "Invalid shortform size: dp 0x%p",
-			dp);
-		return 1;
-	}
-	sf = (xfs_dir_shortform_t *)(&dp->di_u.di_dirsf);
-	ino = XFS_GET_DIR_INO8(sf->hdr.parent);
-	if (xfs_dir_ino_validate(mp, ino))
-		return 1;
-
-	count =	sf->hdr.count;
-	if ((count < 0) || ((count * 10) > XFS_LITINO(mp))) {
-		xfs_fs_cmn_err(CE_WARN, mp,
-			"Invalid shortform count: dp 0x%p", dp);
-		return(1);
-	}
-
-	if (count == 0) {
-		return 0;
-	}
-
-	namelen_sum = 0;
-	sfe = &sf->list[0];
-	for (i = sf->hdr.count - 1; i >= 0; i--) {
-		ino = XFS_GET_DIR_INO8(sfe->inumber);
-		xfs_dir_ino_validate(mp, ino);
-		if (sfe->namelen >= XFS_LITINO(mp)) {
-			xfs_fs_cmn_err(CE_WARN, mp,
-				"Invalid shortform namelen: dp 0x%p", dp);
-			return 1;
-		}
-		namelen_sum += sfe->namelen;
-		sfe = XFS_DIR_SF_NEXTENTRY(sfe);
-	}
-	if (namelen_sum >= XFS_LITINO(mp)) {
-		xfs_fs_cmn_err(CE_WARN, mp,
-			"Invalid shortform namelen: dp 0x%p", dp);
-		return 1;
-	}
-
-	return 0;
-}
-
-/*========================================================================
- * External routines when dirsize == XFS_LBSIZE(dp->i_mount).
- *========================================================================*/
-
-/*
- * Add a name to the leaf directory structure
- * This is the external routine.
- */
-int
-xfs_dir_leaf_addname(xfs_da_args_t *args)
-{
-	int index, retval;
-	xfs_dabuf_t *bp;
-
-	retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
-					      XFS_DATA_FORK);
-	if (retval)
-		return(retval);
-	ASSERT(bp != NULL);
-
-	retval = xfs_dir_leaf_lookup_int(bp, args, &index);
-	if (retval == ENOENT)
-		retval = xfs_dir_leaf_add(bp, args, index);
-	xfs_da_buf_done(bp);
-	return(retval);
-}
-
-/*
- * Remove a name from the leaf directory structure
- * This is the external routine.
- */
-STATIC int
-xfs_dir_leaf_removename(xfs_da_args_t *args, int *count, int *totallen)
-{
-	xfs_dir_leafblock_t *leaf;
-	int index, retval;
-	xfs_dabuf_t *bp;
-
-	retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
-					      XFS_DATA_FORK);
-	if (retval)
-		return(retval);
-	ASSERT(bp != NULL);
-	leaf = bp->data;
-	ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-	retval = xfs_dir_leaf_lookup_int(bp, args, &index);
-	if (retval == EEXIST) {
-		(void)xfs_dir_leaf_remove(args->trans, bp, index);
-		*count = be16_to_cpu(leaf->hdr.count);
-		*totallen = be16_to_cpu(leaf->hdr.namebytes);
-		retval = 0;
-	}
-	xfs_da_buf_done(bp);
-	return(retval);
-}
-
-/*
- * Look up a name in a leaf directory structure.
- * This is the external routine.
- */
-STATIC int
-xfs_dir_leaf_lookup(xfs_da_args_t *args)
-{
-	int index, retval;
-	xfs_dabuf_t *bp;
-
-	retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
-					      XFS_DATA_FORK);
-	if (retval)
-		return(retval);
-	ASSERT(bp != NULL);
-	retval = xfs_dir_leaf_lookup_int(bp, args, &index);
-	xfs_da_brelse(args->trans, bp);
-	return(retval);
-}
-
-/*
- * Copy out directory entries for getdents(), for leaf directories.
- */
-STATIC int
-xfs_dir_leaf_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio,
-				  int *eofp, xfs_dirent_t *dbp, xfs_dir_put_t put)
-{
-	xfs_dabuf_t *bp;
-	int retval, eob;
-
-	retval = xfs_da_read_buf(dp->i_transp, dp, 0, -1, &bp, XFS_DATA_FORK);
-	if (retval)
-		return(retval);
-	ASSERT(bp != NULL);
-	retval = xfs_dir_leaf_getdents_int(bp, dp, 0, uio, &eob, dbp, put, -1);
-	xfs_da_brelse(trans, bp);
-	*eofp = (eob == 0);
-	return(retval);
-}
-
-/*
- * Look up a name in a leaf directory structure, replace the inode number.
- * This is the external routine.
- */
-STATIC int
-xfs_dir_leaf_replace(xfs_da_args_t *args)
-{
-	int index, retval;
-	xfs_dabuf_t *bp;
-	xfs_ino_t inum;
-	xfs_dir_leafblock_t *leaf;
-	xfs_dir_leaf_entry_t *entry;
-	xfs_dir_leaf_name_t *namest;
-
-	inum = args->inumber;
-	retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
-					      XFS_DATA_FORK);
-	if (retval)
-		return(retval);
-	ASSERT(bp != NULL);
-	retval = xfs_dir_leaf_lookup_int(bp, args, &index);
-	if (retval == EEXIST) {
-		leaf = bp->data;
-		entry = &leaf->entries[index];
-		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, be16_to_cpu(entry->nameidx));
-		/* XXX - replace assert? */
-		XFS_DIR_SF_PUT_DIRINO(&inum, &namest->inumber);
-		xfs_da_log_buf(args->trans, bp,
-		    XFS_DA_LOGRANGE(leaf, namest, sizeof(namest->inumber)));
-		xfs_da_buf_done(bp);
-		retval = 0;
-	} else
-		xfs_da_brelse(args->trans, bp);
-	return(retval);
-}
-
-
-/*========================================================================
- * External routines when dirsize > XFS_LBSIZE(mp).
- *========================================================================*/
-
-/*
- * Add a name to a Btree-format directory.
- *
- * This will involve walking down the Btree, and may involve splitting
- * leaf nodes and even splitting intermediate nodes up to and including
- * the root node (a special case of an intermediate node).
- */
-STATIC int
-xfs_dir_node_addname(xfs_da_args_t *args)
-{
-	xfs_da_state_t *state;
-	xfs_da_state_blk_t *blk;
-	int retval, error;
-
-	/*
-	 * Fill in bucket of arguments/results/context to carry around.
-	 */
-	state = xfs_da_state_alloc();
-	state->args = args;
-	state->mp = args->dp->i_mount;
-	state->blocksize = state->mp->m_sb.sb_blocksize;
-	state->node_ents = state->mp->m_dir_node_ents;
-
-	/*
-	 * Search to see if name already exists, and get back a pointer
-	 * to where it should go.
-	 */
-	error = xfs_da_node_lookup_int(state, &retval);
-	if (error)
-		retval = error;
-	if (retval != ENOENT)
-		goto error;
-	blk = &state->path.blk[ state->path.active-1 ];
-	ASSERT(blk->magic == XFS_DIR_LEAF_MAGIC);
-	retval = xfs_dir_leaf_add(blk->bp, args, blk->index);
-	if (retval == 0) {
-		/*
-		 * Addition succeeded, update Btree hashvals.
-		 */
-		if (!args->justcheck)
-			xfs_da_fixhashpath(state, &state->path);
-	} else {
-		/*
-		 * Addition failed, split as many Btree elements as required.
-		 */
-		if (args->total == 0) {
-			ASSERT(retval == ENOSPC);
-			goto error;
-		}
-		retval = xfs_da_split(state);
-	}
-error:
-	xfs_da_state_free(state);
-
-	return(retval);
-}
-
-/*
- * Remove a name from a B-tree directory.
- *
- * This will involve walking down the Btree, and may involve joining
- * leaf nodes and even joining intermediate nodes up to and including
- * the root node (a special case of an intermediate node).
- */
-STATIC int
-xfs_dir_node_removename(xfs_da_args_t *args)
-{
-	xfs_da_state_t *state;
-	xfs_da_state_blk_t *blk;
-	int retval, error;
-
-	state = xfs_da_state_alloc();
-	state->args = args;
-	state->mp = args->dp->i_mount;
-	state->blocksize = state->mp->m_sb.sb_blocksize;
-	state->node_ents = state->mp->m_dir_node_ents;
-
-	/*
-	 * Search to see if name exists, and get back a pointer to it.
-	 */
-	error = xfs_da_node_lookup_int(state, &retval);
-	if (error)
-		retval = error;
-	if (retval != EEXIST) {
-		xfs_da_state_free(state);
-		return(retval);
-	}
-
-	/*
-	 * Remove the name and update the hashvals in the tree.
-	 */
-	blk = &state->path.blk[ state->path.active-1 ];
-	ASSERT(blk->magic == XFS_DIR_LEAF_MAGIC);
-	retval = xfs_dir_leaf_remove(args->trans, blk->bp, blk->index);
-	xfs_da_fixhashpath(state, &state->path);
-
-	/*
-	 * Check to see if the tree needs to be collapsed.
-	 */
-	error = 0;
-	if (retval) {
-		error = xfs_da_join(state);
-	}
-
-	xfs_da_state_free(state);
-	if (error)
-		return(error);
-	return(0);
-}
-
-/*
- * Look up a filename in a int directory.
- * Use an internal routine to actually do all the work.
- */
-STATIC int
-xfs_dir_node_lookup(xfs_da_args_t *args)
-{
-	xfs_da_state_t *state;
-	int retval, error, i;
-
-	state = xfs_da_state_alloc();
-	state->args = args;
-	state->mp = args->dp->i_mount;
-	state->blocksize = state->mp->m_sb.sb_blocksize;
-	state->node_ents = state->mp->m_dir_node_ents;
-
-	/*
-	 * Search to see if name exists,
-	 * and get back a pointer to it.
-	 */
-	error = xfs_da_node_lookup_int(state, &retval);
-	if (error) {
-		retval = error;
-	}
-
-	/*
-	 * If not in a transaction, we have to release all the buffers.
-	 */
-	for (i = 0; i < state->path.active; i++) {
-		xfs_da_brelse(args->trans, state->path.blk[i].bp);
-		state->path.blk[i].bp = NULL;
-	}
-
-	xfs_da_state_free(state);
-	return(retval);
-}
-
-STATIC int
-xfs_dir_node_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio,
-				  int *eofp, xfs_dirent_t *dbp, xfs_dir_put_t put)
-{
-	xfs_da_intnode_t *node;
-	xfs_da_node_entry_t *btree;
-	xfs_dir_leafblock_t *leaf = NULL;
-	xfs_dablk_t bno, nextbno;
-	xfs_dahash_t cookhash;
-	xfs_mount_t *mp;
-	int error, eob, i;
-	xfs_dabuf_t *bp;
-	xfs_daddr_t nextda;
-
-	/*
-	 * Pick up our context.
-	 */
-	mp = dp->i_mount;
-	bp = NULL;
-	bno = XFS_DA_COOKIE_BNO(mp, uio->uio_offset);
-	cookhash = XFS_DA_COOKIE_HASH(mp, uio->uio_offset);
-
-	xfs_dir_trace_g_du("node: start", dp, uio);
-
-	/*
-	 * Re-find our place, even if we're confused about what our place is.
-	 *
-	 * First we check the block number from the magic cookie, it is a
-	 * cache of where we ended last time.  If we find a leaf block, and
-	 * the starting hashval in that block is less than our desired
-	 * hashval, then we run with it.
-	 */
-	if (bno > 0) {
-		error = xfs_da_read_buf(trans, dp, bno, -2, &bp, XFS_DATA_FORK);
-		if ((error != 0) && (error != EFSCORRUPTED))
-			return(error);
-		if (bp)
-			leaf = bp->data;
-		if (bp && be16_to_cpu(leaf->hdr.info.magic) != XFS_DIR_LEAF_MAGIC) {
-			xfs_dir_trace_g_dub("node: block not a leaf",
-						   dp, uio, bno);
-			xfs_da_brelse(trans, bp);
-			bp = NULL;
-		}
-		if (bp && be32_to_cpu(leaf->entries[0].hashval) > cookhash) {
-			xfs_dir_trace_g_dub("node: leaf hash too large",
-						   dp, uio, bno);
-			xfs_da_brelse(trans, bp);
-			bp = NULL;
-		}
-		if (bp && cookhash > be32_to_cpu(leaf->entries[
-					be16_to_cpu(leaf->hdr.count) - 1].hashval)) {
-			xfs_dir_trace_g_dub("node: leaf hash too small",
-						   dp, uio, bno);
-			xfs_da_brelse(trans, bp);
-			bp = NULL;
-		}
-	}
-
-	/*
-	 * If we did not find a leaf block from the blockno in the cookie,
-	 * or we there was no blockno in the cookie (eg: first time thru),
-	 * the we start at the top of the Btree and re-find our hashval.
-	 */
-	if (bp == NULL) {
-		xfs_dir_trace_g_du("node: start at root" , dp, uio);
-		bno = 0;
-		for (;;) {
-			error = xfs_da_read_buf(trans, dp, bno, -1, &bp,
-						       XFS_DATA_FORK);
-			if (error)
-				return(error);
-			if (bp == NULL)
-				return(XFS_ERROR(EFSCORRUPTED));
-			node = bp->data;
-			if (be16_to_cpu(node->hdr.info.magic) != XFS_DA_NODE_MAGIC)
-				break;
-			btree = &node->btree[0];
-			xfs_dir_trace_g_dun("node: node detail", dp, uio, node);
-			for (i = 0; i < be16_to_cpu(node->hdr.count); btree++, i++) {
-				if (be32_to_cpu(btree->hashval) >= cookhash) {
-					bno = be32_to_cpu(btree->before);
-					break;
-				}
-			}
-			if (i == be16_to_cpu(node->hdr.count)) {
-				xfs_da_brelse(trans, bp);
-				xfs_dir_trace_g_du("node: hash beyond EOF",
-							  dp, uio);
-				uio->uio_offset = XFS_DA_MAKE_COOKIE(mp, 0, 0,
-							     XFS_DA_MAXHASH);
-				*eofp = 1;
-				return(0);
-			}
-			xfs_dir_trace_g_dub("node: going to block",
-						   dp, uio, bno);
-			xfs_da_brelse(trans, bp);
-		}
-	}
-	ASSERT(cookhash != XFS_DA_MAXHASH);
-
-	/*
-	 * We've dropped down to the (first) leaf block that contains the
-	 * hashval we are interested in.  Continue rolling upward thru the
-	 * leaf blocks until we fill up our buffer.
-	 */
-	for (;;) {
-		leaf = bp->data;
-		if (unlikely(be16_to_cpu(leaf->hdr.info.magic) != XFS_DIR_LEAF_MAGIC)) {
-			xfs_dir_trace_g_dul("node: not a leaf", dp, uio, leaf);
-			xfs_da_brelse(trans, bp);
-			XFS_CORRUPTION_ERROR("xfs_dir_node_getdents(1)",
-					     XFS_ERRLEVEL_LOW, mp, leaf);
-			return XFS_ERROR(EFSCORRUPTED);
-		}
-		xfs_dir_trace_g_dul("node: leaf detail", dp, uio, leaf);
-		if ((nextbno = be32_to_cpu(leaf->hdr.info.forw))) {
-			nextda = xfs_da_reada_buf(trans, dp, nextbno,
-						  XFS_DATA_FORK);
-		} else
-			nextda = -1;
-		error = xfs_dir_leaf_getdents_int(bp, dp, bno, uio, &eob, dbp,
-						  put, nextda);
-		xfs_da_brelse(trans, bp);
-		bno = nextbno;
-		if (eob) {
-			xfs_dir_trace_g_dub("node: E-O-B", dp, uio, bno);
-			*eofp = 0;
-			return(error);
-		}
-		if (bno == 0)
-			break;
-		error = xfs_da_read_buf(trans, dp, bno, nextda, &bp,
-					XFS_DATA_FORK);
-		if (error)
-			return(error);
-		if (unlikely(bp == NULL)) {
-			XFS_ERROR_REPORT("xfs_dir_node_getdents(2)",
-					 XFS_ERRLEVEL_LOW, mp);
-			return(XFS_ERROR(EFSCORRUPTED));
-		}
-	}
-	*eofp = 1;
-	xfs_dir_trace_g_du("node: E-O-F", dp, uio);
-	return(0);
-}
-
-/*
- * Look up a filename in an int directory, replace the inode number.
- * Use an internal routine to actually do the lookup.
- */
-STATIC int
-xfs_dir_node_replace(xfs_da_args_t *args)
-{
-	xfs_da_state_t *state;
-	xfs_da_state_blk_t *blk;
-	xfs_dir_leafblock_t *leaf;
-	xfs_dir_leaf_entry_t *entry;
-	xfs_dir_leaf_name_t *namest;
-	xfs_ino_t inum;
-	int retval, error, i;
-	xfs_dabuf_t *bp;
-
-	state = xfs_da_state_alloc();
-	state->args = args;
-	state->mp = args->dp->i_mount;
-	state->blocksize = state->mp->m_sb.sb_blocksize;
-	state->node_ents = state->mp->m_dir_node_ents;
-	inum = args->inumber;
-
-	/*
-	 * Search to see if name exists,
-	 * and get back a pointer to it.
-	 */
-	error = xfs_da_node_lookup_int(state, &retval);
-	if (error) {
-		retval = error;
-	}
-
-	if (retval == EEXIST) {
-		blk = &state->path.blk[state->path.active - 1];
-		ASSERT(blk->magic == XFS_DIR_LEAF_MAGIC);
-		bp = blk->bp;
-		leaf = bp->data;
-		entry = &leaf->entries[blk->index];
-		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, be16_to_cpu(entry->nameidx));
-		/* XXX - replace assert ? */
-		XFS_DIR_SF_PUT_DIRINO(&inum, &namest->inumber);
-		xfs_da_log_buf(args->trans, bp,
-		    XFS_DA_LOGRANGE(leaf, namest, sizeof(namest->inumber)));
-		xfs_da_buf_done(bp);
-		blk->bp = NULL;
-		retval = 0;
-	} else {
-		i = state->path.active - 1;
-		xfs_da_brelse(args->trans, state->path.blk[i].bp);
-		state->path.blk[i].bp = NULL;
-	}
-	for (i = 0; i < state->path.active - 1; i++) {
-		xfs_da_brelse(args->trans, state->path.blk[i].bp);
-		state->path.blk[i].bp = NULL;
-	}
-
-	xfs_da_state_free(state);
-	return(retval);
-}
-
-#if defined(XFS_DIR_TRACE)
-/*
- * Add a trace buffer entry for an inode and a uio.
- */
-void
-xfs_dir_trace_g_du(char *where, xfs_inode_t *dp, uio_t *uio)
-{
-	xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DU, where,
-		     (void *)dp, (void *)dp->i_mount,
-		     (void *)((unsigned long)(uio->uio_offset >> 32)),
-		     (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
-		     (void *)(unsigned long)uio->uio_resid,
-		     NULL, NULL, NULL, NULL, NULL, NULL, NULL);
-}
-
-/*
- * Add a trace buffer entry for an inode and a uio.
- */
-void
-xfs_dir_trace_g_dub(char *where, xfs_inode_t *dp, uio_t *uio, xfs_dablk_t bno)
-{
-	xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUB, where,
-		     (void *)dp, (void *)dp->i_mount,
-		     (void *)((unsigned long)(uio->uio_offset >> 32)),
-		     (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
-		     (void *)(unsigned long)uio->uio_resid,
-		     (void *)(unsigned long)bno,
-		     NULL, NULL, NULL, NULL, NULL, NULL);
-}
-
-/*
- * Add a trace buffer entry for an inode and a uio.
- */
-void
-xfs_dir_trace_g_dun(char *where, xfs_inode_t *dp, uio_t *uio,
-			xfs_da_intnode_t *node)
-{
-	int	last = be16_to_cpu(node->hdr.count) - 1;
-
-	xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUN, where,
-		     (void *)dp, (void *)dp->i_mount,
-		     (void *)((unsigned long)(uio->uio_offset >> 32)),
-		     (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
-		     (void *)(unsigned long)uio->uio_resid,
-		     (void *)(unsigned long)be32_to_cpu(node->hdr.info.forw),
-		     (void *)(unsigned long)
-			be16_to_cpu(node->hdr.count),
-		     (void *)(unsigned long)
-			be32_to_cpu(node->btree[0].hashval),
-		     (void *)(unsigned long)
-			be32_to_cpu(node->btree[last].hashval),
-		     NULL, NULL, NULL);
-}
-
-/*
- * Add a trace buffer entry for an inode and a uio.
- */
-void
-xfs_dir_trace_g_dul(char *where, xfs_inode_t *dp, uio_t *uio,
-			xfs_dir_leafblock_t *leaf)
-{
-	int	last = be16_to_cpu(leaf->hdr.count) - 1;
-
-	xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUL, where,
-		     (void *)dp, (void *)dp->i_mount,
-		     (void *)((unsigned long)(uio->uio_offset >> 32)),
-		     (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
-		     (void *)(unsigned long)uio->uio_resid,
-		     (void *)(unsigned long)be32_to_cpu(leaf->hdr.info.forw),
-		     (void *)(unsigned long)be16_to_cpu(leaf->hdr.count),
-		     (void *)(unsigned long)be32_to_cpu(leaf->entries[0].hashval),
-		     (void *)(unsigned long)be32_to_cpu(leaf->entries[last].hashval),
-		     NULL, NULL, NULL);
-}
-
-/*
- * Add a trace buffer entry for an inode and a uio.
- */
-void
-xfs_dir_trace_g_due(char *where, xfs_inode_t *dp, uio_t *uio,
-			xfs_dir_leaf_entry_t *entry)
-{
-	xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUE, where,
-		     (void *)dp, (void *)dp->i_mount,
-		     (void *)((unsigned long)(uio->uio_offset >> 32)),
-		     (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
-		     (void *)(unsigned long)uio->uio_resid,
-		     (void *)(unsigned long)be32_to_cpu(entry->hashval),
-		     NULL, NULL, NULL, NULL, NULL, NULL);
-}
-
-/*
- * Add a trace buffer entry for an inode and a uio.
- */
-void
-xfs_dir_trace_g_duc(char *where, xfs_inode_t *dp, uio_t *uio, xfs_off_t cookie)
-{
-	xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUC, where,
-		     (void *)dp, (void *)dp->i_mount,
-		     (void *)((unsigned long)(uio->uio_offset >> 32)),
-		     (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
-		     (void *)(unsigned long)uio->uio_resid,
-		     (void *)((unsigned long)(cookie >> 32)),
-		     (void *)((unsigned long)(cookie & 0xFFFFFFFF)),
-		     NULL, NULL, NULL, NULL, NULL);
-}
-
-/*
- * Add a trace buffer entry for the arguments given to the routine,
- * generic form.
- */
-void
-xfs_dir_trace_enter(int type, char *where,
-			void * a0, void * a1,
-			void * a2, void * a3,
-			void * a4, void * a5,
-			void * a6, void * a7,
-			void * a8, void * a9,
-			void * a10, void * a11)
-{
-	ASSERT(xfs_dir_trace_buf);
-	ktrace_enter(xfs_dir_trace_buf, (void *)(unsigned long)type,
-					(void *)where,
-					(void *)a0, (void *)a1, (void *)a2,
-					(void *)a3, (void *)a4, (void *)a5,
-					(void *)a6, (void *)a7, (void *)a8,
-					(void *)a9, (void *)a10, (void *)a11,
-					NULL, NULL);
-}
-#endif	/* XFS_DIR_TRACE */
diff --git a/fs/xfs/xfs_dir.h b/fs/xfs/xfs_dir.h
index 8cc8afb9f6c..e69de29bb2d 100644
--- a/fs/xfs/xfs_dir.h
+++ b/fs/xfs/xfs_dir.h
@@ -1,142 +0,0 @@
-/*
- * Copyright (c) 2000,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_DIR_H__
-#define	__XFS_DIR_H__
-
-/*
- * Large directories are structured around Btrees where all the data
- * elements are in the leaf nodes.  Filenames are hashed into an int,
- * then that int is used as the index into the Btree.  Since the hashval
- * of a filename may not be unique, we may have duplicate keys.  The
- * internal links in the Btree are logical block offsets into the file.
- *
- * Small directories use a different format and are packed as tightly
- * as possible so as to fit into the literal area of the inode.
- */
-
-/*========================================================================
- * Function prototypes for the kernel.
- *========================================================================*/
-
-struct uio;
-struct xfs_bmap_free;
-struct xfs_da_args;
-struct xfs_dinode;
-struct xfs_inode;
-struct xfs_mount;
-struct xfs_trans;
-
-/*
- * Directory function types.
- * Put in structures (xfs_dirops_t) for v1 and v2 directories.
- */
-typedef void	(*xfs_dir_mount_t)(struct xfs_mount *mp);
-typedef int	(*xfs_dir_isempty_t)(struct xfs_inode *dp);
-typedef int	(*xfs_dir_init_t)(struct xfs_trans *tp,
-				  struct xfs_inode *dp,
-				  struct xfs_inode *pdp);
-typedef int	(*xfs_dir_createname_t)(struct xfs_trans *tp,
-					struct xfs_inode *dp,
-					char *name,
-					int namelen,
-					xfs_ino_t inum,
-					xfs_fsblock_t *first,
-					struct xfs_bmap_free *flist,
-					xfs_extlen_t total);
-typedef int	(*xfs_dir_lookup_t)(struct xfs_trans *tp,
-				    struct xfs_inode *dp,
-				    char *name,
-				    int namelen,
-				    xfs_ino_t *inum);
-typedef int	(*xfs_dir_removename_t)(struct xfs_trans *tp,
-					struct xfs_inode *dp,
-					char *name,
-					int namelen,
-					xfs_ino_t ino,
-					xfs_fsblock_t *first,
-					struct xfs_bmap_free *flist,
-					xfs_extlen_t total);
-typedef int	(*xfs_dir_getdents_t)(struct xfs_trans *tp,
-				      struct xfs_inode *dp,
-				      struct uio *uio,
-				      int *eofp);
-typedef int	(*xfs_dir_replace_t)(struct xfs_trans *tp,
-				     struct xfs_inode *dp,
-				     char *name,
-				     int namelen,
-				     xfs_ino_t inum,
-				     xfs_fsblock_t *first,
-				     struct xfs_bmap_free *flist,
-				     xfs_extlen_t total);
-typedef int	(*xfs_dir_canenter_t)(struct xfs_trans *tp,
-				      struct xfs_inode *dp,
-				      char *name,
-				      int namelen);
-typedef int	(*xfs_dir_shortform_validate_ondisk_t)(struct xfs_mount *mp,
-						       struct xfs_dinode *dip);
-typedef int	(*xfs_dir_shortform_to_single_t)(struct xfs_da_args *args);
-
-typedef struct xfs_dirops {
-	xfs_dir_mount_t				xd_mount;
-	xfs_dir_isempty_t			xd_isempty;
-	xfs_dir_init_t				xd_init;
-	xfs_dir_createname_t			xd_createname;
-	xfs_dir_lookup_t			xd_lookup;
-	xfs_dir_removename_t			xd_removename;
-	xfs_dir_getdents_t			xd_getdents;
-	xfs_dir_replace_t			xd_replace;
-	xfs_dir_canenter_t			xd_canenter;
-	xfs_dir_shortform_validate_ondisk_t	xd_shortform_validate_ondisk;
-	xfs_dir_shortform_to_single_t		xd_shortform_to_single;
-} xfs_dirops_t;
-
-/*
- * Overall external interface routines.
- */
-void	xfs_dir_startup(void);	/* called exactly once */
-
-#define	XFS_DIR_MOUNT(mp)	\
-	((mp)->m_dirops.xd_mount(mp))
-#define	XFS_DIR_ISEMPTY(mp,dp)	\
-	((mp)->m_dirops.xd_isempty(dp))
-#define	XFS_DIR_INIT(mp,tp,dp,pdp)	\
-	((mp)->m_dirops.xd_init(tp,dp,pdp))
-#define	XFS_DIR_CREATENAME(mp,tp,dp,name,namelen,inum,first,flist,total) \
-	((mp)->m_dirops.xd_createname(tp,dp,name,namelen,inum,first,flist,\
-				      total))
-#define	XFS_DIR_LOOKUP(mp,tp,dp,name,namelen,inum)	\
-	((mp)->m_dirops.xd_lookup(tp,dp,name,namelen,inum))
-#define	XFS_DIR_REMOVENAME(mp,tp,dp,name,namelen,ino,first,flist,total)	\
-	((mp)->m_dirops.xd_removename(tp,dp,name,namelen,ino,first,flist,total))
-#define	XFS_DIR_GETDENTS(mp,tp,dp,uio,eofp)	\
-	((mp)->m_dirops.xd_getdents(tp,dp,uio,eofp))
-#define	XFS_DIR_REPLACE(mp,tp,dp,name,namelen,inum,first,flist,total)	\
-	((mp)->m_dirops.xd_replace(tp,dp,name,namelen,inum,first,flist,total))
-#define	XFS_DIR_CANENTER(mp,tp,dp,name,namelen)	\
-	((mp)->m_dirops.xd_canenter(tp,dp,name,namelen))
-#define	XFS_DIR_SHORTFORM_VALIDATE_ONDISK(mp,dip)	\
-	((mp)->m_dirops.xd_shortform_validate_ondisk(mp,dip))
-#define	XFS_DIR_SHORTFORM_TO_SINGLE(mp,args)	\
-	((mp)->m_dirops.xd_shortform_to_single(args))
-
-#define	XFS_DIR_IS_V1(mp)	((mp)->m_dirversion == 1)
-#define	XFS_DIR_IS_V2(mp)	((mp)->m_dirversion == 2)
-extern xfs_dirops_t xfsv1_dirops;
-extern xfs_dirops_t xfsv2_dirops;
-
-#endif	/* __XFS_DIR_H__ */
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 80238a2263f..8edbe1adb95 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -24,21 +24,18 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
-#include "xfs_dir_leaf.h"
 #include "xfs_dir2_data.h"
 #include "xfs_dir2_leaf.h"
 #include "xfs_dir2_block.h"
@@ -46,69 +43,14 @@
 #include "xfs_dir2_trace.h"
 #include "xfs_error.h"
 
-/*
- * Declarations for interface routines.
- */
-static void	xfs_dir2_mount(xfs_mount_t *mp);
-static int	xfs_dir2_isempty(xfs_inode_t *dp);
-static int	xfs_dir2_init(xfs_trans_t *tp, xfs_inode_t *dp,
-			      xfs_inode_t *pdp);
-static int	xfs_dir2_createname(xfs_trans_t *tp, xfs_inode_t *dp,
-				    char *name, int namelen, xfs_ino_t inum,
-				    xfs_fsblock_t *first,
-				    xfs_bmap_free_t *flist, xfs_extlen_t total);
-static int	xfs_dir2_lookup(xfs_trans_t *tp, xfs_inode_t *dp, char *name,
-				int namelen, xfs_ino_t *inum);
-static int	xfs_dir2_removename(xfs_trans_t *tp, xfs_inode_t *dp,
-				    char *name, int namelen, xfs_ino_t ino,
-				    xfs_fsblock_t *first,
-				    xfs_bmap_free_t *flist, xfs_extlen_t total);
-static int	xfs_dir2_getdents(xfs_trans_t *tp, xfs_inode_t *dp, uio_t *uio,
-				  int *eofp);
-static int	xfs_dir2_replace(xfs_trans_t *tp, xfs_inode_t *dp, char *name,
-				 int namelen, xfs_ino_t inum,
-				 xfs_fsblock_t *first, xfs_bmap_free_t *flist,
-				 xfs_extlen_t total);
-static int	xfs_dir2_canenter(xfs_trans_t *tp, xfs_inode_t *dp, char *name,
-				  int namelen);
-static int	xfs_dir2_shortform_validate_ondisk(xfs_mount_t *mp,
-						   xfs_dinode_t *dip);
-
-/*
- * Utility routine declarations.
- */
 static int	xfs_dir2_put_dirent64_direct(xfs_dir2_put_args_t *pa);
 static int	xfs_dir2_put_dirent64_uio(xfs_dir2_put_args_t *pa);
 
-/*
- * Directory operations vector.
- */
-xfs_dirops_t	xfsv2_dirops = {
-	.xd_mount			= xfs_dir2_mount,
-	.xd_isempty			= xfs_dir2_isempty,
-	.xd_init			= xfs_dir2_init,
-	.xd_createname			= xfs_dir2_createname,
-	.xd_lookup			= xfs_dir2_lookup,
-	.xd_removename			= xfs_dir2_removename,
-	.xd_getdents			= xfs_dir2_getdents,
-	.xd_replace			= xfs_dir2_replace,
-	.xd_canenter			= xfs_dir2_canenter,
-	.xd_shortform_validate_ondisk	= xfs_dir2_shortform_validate_ondisk,
-	.xd_shortform_to_single		= xfs_dir2_sf_to_block,
-};
-
-/*
- * Interface routines.
- */
-
-/*
- * Initialize directory-related fields in the mount structure.
- */
-static void
-xfs_dir2_mount(
-	xfs_mount_t	*mp)		/* filesystem mount point */
+void
+xfs_dir_mount(
+	xfs_mount_t	*mp)
 {
-	mp->m_dirversion = 2;
+	ASSERT(XFS_SB_VERSION_HASDIRV2(&mp->m_sb));
 	ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <=
 	       XFS_MAX_BLOCKSIZE);
 	mp->m_dirblksize = 1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog);
@@ -128,73 +70,99 @@ xfs_dir2_mount(
 /*
  * Return 1 if directory contains only "." and "..".
  */
-static int				/* return code */
-xfs_dir2_isempty(
-	xfs_inode_t	*dp)		/* incore inode structure */
+int
+xfs_dir_isempty(
+	xfs_inode_t	*dp)
 {
-	xfs_dir2_sf_t	*sfp;		/* shortform directory structure */
+	xfs_dir2_sf_t	*sfp;
 
 	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-	/*
-	 * Might happen during shutdown.
-	 */
-	if (dp->i_d.di_size == 0) {
+	if (dp->i_d.di_size == 0)	/* might happen during shutdown. */
 		return 1;
-	}
 	if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp))
 		return 0;
 	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
 	return !sfp->hdr.count;
 }
 
+/*
+ * Validate a given inode number.
+ */
+int
+xfs_dir_ino_validate(
+	xfs_mount_t	*mp,
+	xfs_ino_t	ino)
+{
+	xfs_agblock_t	agblkno;
+	xfs_agino_t	agino;
+	xfs_agnumber_t	agno;
+	int		ino_ok;
+	int		ioff;
+
+	agno = XFS_INO_TO_AGNO(mp, ino);
+	agblkno = XFS_INO_TO_AGBNO(mp, ino);
+	ioff = XFS_INO_TO_OFFSET(mp, ino);
+	agino = XFS_OFFBNO_TO_AGINO(mp, agblkno, ioff);
+	ino_ok =
+		agno < mp->m_sb.sb_agcount &&
+		agblkno < mp->m_sb.sb_agblocks &&
+		agblkno != 0 &&
+		ioff < (1 << mp->m_sb.sb_inopblog) &&
+		XFS_AGINO_TO_INO(mp, agno, agino) == ino;
+	if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE,
+			XFS_RANDOM_DIR_INO_VALIDATE))) {
+		xfs_fs_cmn_err(CE_WARN, mp, "Invalid inode number 0x%Lx",
+				(unsigned long long) ino);
+		XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	return 0;
+}
+
 /*
  * Initialize a directory with its "." and ".." entries.
  */
-static int				/* error */
-xfs_dir2_init(
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_inode_t	*dp,		/* incore directory inode */
-	xfs_inode_t	*pdp)		/* incore parent directory inode */
+int
+xfs_dir_init(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*dp,
+	xfs_inode_t	*pdp)
 {
-	xfs_da_args_t	args;		/* operation arguments */
-	int		error;		/* error return value */
+	xfs_da_args_t	args;
+	int		error;
 
 	memset((char *)&args, 0, sizeof(args));
 	args.dp = dp;
 	args.trans = tp;
 	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-	if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino))) {
+	if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino)))
 		return error;
-	}
 	return xfs_dir2_sf_create(&args, pdp->i_ino);
 }
 
 /*
   Enter a name in a directory.
  */
-static int					/* error */
-xfs_dir2_createname(
-	xfs_trans_t		*tp,		/* transaction pointer */
-	xfs_inode_t		*dp,		/* incore directory inode */
-	char			*name,		/* new entry name */
-	int			namelen,	/* new entry name length */
+int
+xfs_dir_createname(
+	xfs_trans_t		*tp,
+	xfs_inode_t		*dp,
+	char			*name,
+	int			namelen,
 	xfs_ino_t		inum,		/* new entry inode number */
 	xfs_fsblock_t		*first,		/* bmap's firstblock */
 	xfs_bmap_free_t		*flist,		/* bmap's freeblock list */
 	xfs_extlen_t		total)		/* bmap's total block count */
 {
-	xfs_da_args_t		args;		/* operation arguments */
-	int			rval;		/* return value */
+	xfs_da_args_t		args;
+	int			rval;
 	int			v;		/* type-checking value */
 
 	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-	if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) {
+	if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum)))
 		return rval;
-	}
 	XFS_STATS_INC(xs_dir_create);
-	/*
-	 * Fill in the arg structure for this request.
-	 */
+
 	args.name = name;
 	args.namelen = namelen;
 	args.hashval = xfs_da_hashname(name, namelen);
@@ -207,18 +175,16 @@ xfs_dir2_createname(
 	args.trans = tp;
 	args.justcheck = 0;
 	args.addname = args.oknoent = 1;
-	/*
-	 * Decide on what work routines to call based on the inode size.
-	 */
+
 	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
 		rval = xfs_dir2_sf_addname(&args);
-	else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
+	else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
 		return rval;
-	} else if (v)
+	else if (v)
 		rval = xfs_dir2_block_addname(&args);
-	else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
+	else if ((rval = xfs_dir2_isleaf(tp, dp, &v)))
 		return rval;
-	} else if (v)
+	else if (v)
 		rval = xfs_dir2_leaf_addname(&args);
 	else
 		rval = xfs_dir2_node_addname(&args);
@@ -228,24 +194,21 @@ xfs_dir2_createname(
 /*
  * Lookup a name in a directory, give back the inode number.
  */
-static int				/* error */
-xfs_dir2_lookup(
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_inode_t	*dp,		/* incore directory inode */
-	char		*name,		/* lookup name */
-	int		namelen,	/* lookup name length */
+int
+xfs_dir_lookup(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*dp,
+	char		*name,
+	int		namelen,
 	xfs_ino_t	*inum)		/* out: inode number */
 {
-	xfs_da_args_t	args;		/* operation arguments */
-	int		rval;		/* return value */
+	xfs_da_args_t	args;
+	int		rval;
 	int		v;		/* type-checking value */
 
 	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
 	XFS_STATS_INC(xs_dir_lookup);
 
-	/*
-	 * Fill in the arg structure for this request.
-	 */
 	args.name = name;
 	args.namelen = namelen;
 	args.hashval = xfs_da_hashname(name, namelen);
@@ -258,18 +221,16 @@ xfs_dir2_lookup(
 	args.trans = tp;
 	args.justcheck = args.addname = 0;
 	args.oknoent = 1;
-	/*
-	 * Decide on what work routines to call based on the inode size.
-	 */
+
 	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
 		rval = xfs_dir2_sf_lookup(&args);
-	else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
+	else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
 		return rval;
-	} else if (v)
+	else if (v)
 		rval = xfs_dir2_block_lookup(&args);
-	else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
+	else if ((rval = xfs_dir2_isleaf(tp, dp, &v)))
 		return rval;
-	} else if (v)
+	else if (v)
 		rval = xfs_dir2_leaf_lookup(&args);
 	else
 		rval = xfs_dir2_node_lookup(&args);
@@ -283,26 +244,24 @@ xfs_dir2_lookup(
 /*
  * Remove an entry from a directory.
  */
-static int				/* error */
-xfs_dir2_removename(
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_inode_t	*dp,		/* incore directory inode */
-	char		*name,		/* name of entry to remove */
-	int		namelen,	/* name length of entry to remove */
-	xfs_ino_t	ino,		/* inode number of entry to remove */
+int
+xfs_dir_removename(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*dp,
+	char		*name,
+	int		namelen,
+	xfs_ino_t	ino,
 	xfs_fsblock_t	*first,		/* bmap's firstblock */
 	xfs_bmap_free_t	*flist,		/* bmap's freeblock list */
 	xfs_extlen_t	total)		/* bmap's total block count */
 {
-	xfs_da_args_t	args;		/* operation arguments */
-	int		rval;		/* return value */
+	xfs_da_args_t	args;
+	int		rval;
 	int		v;		/* type-checking value */
 
 	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
 	XFS_STATS_INC(xs_dir_remove);
-	/*
-	 * Fill in the arg structure for this request.
-	 */
+
 	args.name = name;
 	args.namelen = namelen;
 	args.hashval = xfs_da_hashname(name, namelen);
@@ -314,18 +273,16 @@ xfs_dir2_removename(
 	args.whichfork = XFS_DATA_FORK;
 	args.trans = tp;
 	args.justcheck = args.addname = args.oknoent = 0;
-	/*
-	 * Decide on what work routines to call based on the inode size.
-	 */
+
 	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
 		rval = xfs_dir2_sf_removename(&args);
-	else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
+	else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
 		return rval;
-	} else if (v)
+	else if (v)
 		rval = xfs_dir2_block_removename(&args);
-	else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
+	else if ((rval = xfs_dir2_isleaf(tp, dp, &v)))
 		return rval;
-	} else if (v)
+	else if (v)
 		rval = xfs_dir2_leaf_removename(&args);
 	else
 		rval = xfs_dir2_node_removename(&args);
@@ -335,10 +292,10 @@ xfs_dir2_removename(
 /*
  * Read a directory.
  */
-static int				/* error */
-xfs_dir2_getdents(
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_inode_t	*dp,		/* incore directory inode */
+int
+xfs_dir_getdents(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*dp,
 	uio_t		*uio,		/* caller's buffer control */
 	int		*eofp)		/* out: eof reached */
 {
@@ -367,14 +324,11 @@ xfs_dir2_getdents(
 	}
 
 	*eofp = 0;
-	/*
-	 * Decide on what work routines to call based on the inode size.
-	 */
 	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
 		rval = xfs_dir2_sf_getdents(dp, uio, eofp, dbp, put);
-	else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
+	else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
 		;
-	} else if (v)
+	else if (v)
 		rval = xfs_dir2_block_getdents(tp, dp, uio, eofp, dbp, put);
 	else
 		rval = xfs_dir2_leaf_getdents(tp, dp, uio, eofp, dbp, put);
@@ -386,29 +340,26 @@ xfs_dir2_getdents(
 /*
  * Replace the inode number of a directory entry.
  */
-static int				/* error */
-xfs_dir2_replace(
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_inode_t	*dp,		/* incore directory inode */
+int
+xfs_dir_replace(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*dp,
 	char		*name,		/* name of entry to replace */
-	int		namelen,	/* name length of entry to replace */
+	int		namelen,
 	xfs_ino_t	inum,		/* new inode number */
 	xfs_fsblock_t	*first,		/* bmap's firstblock */
 	xfs_bmap_free_t	*flist,		/* bmap's freeblock list */
 	xfs_extlen_t	total)		/* bmap's total block count */
 {
-	xfs_da_args_t	args;		/* operation arguments */
-	int		rval;		/* return value */
+	xfs_da_args_t	args;
+	int		rval;
 	int		v;		/* type-checking value */
 
 	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
 
-	if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) {
+	if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum)))
 		return rval;
-	}
-	/*
-	 * Fill in the arg structure for this request.
-	 */
+
 	args.name = name;
 	args.namelen = namelen;
 	args.hashval = xfs_da_hashname(name, namelen);
@@ -420,18 +371,16 @@ xfs_dir2_replace(
 	args.whichfork = XFS_DATA_FORK;
 	args.trans = tp;
 	args.justcheck = args.addname = args.oknoent = 0;
-	/*
-	 * Decide on what work routines to call based on the inode size.
-	 */
+
 	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
 		rval = xfs_dir2_sf_replace(&args);
-	else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
+	else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
 		return rval;
-	} else if (v)
+	else if (v)
 		rval = xfs_dir2_block_replace(&args);
-	else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
+	else if ((rval = xfs_dir2_isleaf(tp, dp, &v)))
 		return rval;
-	} else if (v)
+	else if (v)
 		rval = xfs_dir2_leaf_replace(&args);
 	else
 		rval = xfs_dir2_node_replace(&args);
@@ -441,21 +390,19 @@ xfs_dir2_replace(
 /*
  * See if this entry can be added to the directory without allocating space.
  */
-static int				/* error */
-xfs_dir2_canenter(
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_inode_t	*dp,		/* incore directory inode */
+int
+xfs_dir_canenter(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*dp,
 	char		*name,		/* name of entry to add */
-	int		namelen)	/* name length of entry to add */
+	int		namelen)
 {
-	xfs_da_args_t	args;		/* operation arguments */
-	int		rval;		/* return value */
+	xfs_da_args_t	args;
+	int		rval;
 	int		v;		/* type-checking value */
 
 	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
-	/*
-	 * Fill in the arg structure for this request.
-	 */
+
 	args.name = name;
 	args.namelen = namelen;
 	args.hashval = xfs_da_hashname(name, namelen);
@@ -467,37 +414,22 @@ xfs_dir2_canenter(
 	args.whichfork = XFS_DATA_FORK;
 	args.trans = tp;
 	args.justcheck = args.addname = args.oknoent = 1;
-	/*
-	 * Decide on what work routines to call based on the inode size.
-	 */
+
 	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
 		rval = xfs_dir2_sf_addname(&args);
-	else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
+	else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
 		return rval;
-	} else if (v)
+	else if (v)
 		rval = xfs_dir2_block_addname(&args);
-	else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
+	else if ((rval = xfs_dir2_isleaf(tp, dp, &v)))
 		return rval;
-	} else if (v)
+	else if (v)
 		rval = xfs_dir2_leaf_addname(&args);
 	else
 		rval = xfs_dir2_node_addname(&args);
 	return rval;
 }
 
-/*
- * Dummy routine for shortform inode validation.
- * Can't really do this.
- */
-/* ARGSUSED */
-static int				/* error */
-xfs_dir2_shortform_validate_ondisk(
-	xfs_mount_t	*mp,		/* filesystem mount point */
-	xfs_dinode_t	*dip)		/* ondisk inode */
-{
-	return 0;
-}
-
 /*
  * Utility routines.
  */
@@ -507,24 +439,24 @@ xfs_dir2_shortform_validate_ondisk(
  * This routine is for data and free blocks, not leaf/node blocks
  * which are handled by xfs_da_grow_inode.
  */
-int					/* error */
+int
 xfs_dir2_grow_inode(
-	xfs_da_args_t	*args,		/* operation arguments */
+	xfs_da_args_t	*args,
 	int		space,		/* v2 dir's space XFS_DIR2_xxx_SPACE */
 	xfs_dir2_db_t	*dbp)		/* out: block number added */
 {
 	xfs_fileoff_t	bno;		/* directory offset of new block */
 	int		count;		/* count of filesystem blocks */
 	xfs_inode_t	*dp;		/* incore directory inode */
-	int		error;		/* error return value */
+	int		error;
 	int		got;		/* blocks actually mapped */
-	int		i;		/* temp mapping index */
+	int		i;
 	xfs_bmbt_irec_t	map;		/* single structure for bmap */
 	int		mapi;		/* mapping index */
 	xfs_bmbt_irec_t	*mapp;		/* bmap mapping structure(s) */
-	xfs_mount_t	*mp;		/* filesystem mount point */
+	xfs_mount_t	*mp;
 	int		nmap;		/* number of bmap entries */
-	xfs_trans_t	*tp;		/* transaction pointer */
+	xfs_trans_t	*tp;
 
 	xfs_dir2_trace_args_s("grow_inode", args, space);
 	dp = args->dp;
@@ -538,9 +470,8 @@ xfs_dir2_grow_inode(
 	/*
 	 * Find the first hole for our block.
 	 */
-	if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, XFS_DATA_FORK))) {
+	if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, XFS_DATA_FORK)))
 		return error;
-	}
 	nmap = 1;
 	ASSERT(args->firstblock != NULL);
 	/*
@@ -549,13 +480,9 @@ xfs_dir2_grow_inode(
 	if ((error = xfs_bmapi(tp, dp, bno, count,
 			XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
 			args->firstblock, args->total, &map, &nmap,
-			args->flist, NULL))) {
+			args->flist, NULL)))
 		return error;
-	}
 	ASSERT(nmap <= 1);
-	/*
-	 * Got it in 1.
-	 */
 	if (nmap == 1) {
 		mapp = &map;
 		mapi = 1;
@@ -646,20 +573,19 @@ xfs_dir2_grow_inode(
 /*
  * See if the directory is a single-block form directory.
  */
-int					/* error */
+int
 xfs_dir2_isblock(
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_inode_t	*dp,		/* incore directory inode */
+	xfs_trans_t	*tp,
+	xfs_inode_t	*dp,
 	int		*vp)		/* out: 1 is block, 0 is not block */
 {
 	xfs_fileoff_t	last;		/* last file offset */
-	xfs_mount_t	*mp;		/* filesystem mount point */
-	int		rval;		/* return value */
+	xfs_mount_t	*mp;
+	int		rval;
 
 	mp = dp->i_mount;
-	if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) {
+	if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK)))
 		return rval;
-	}
 	rval = XFS_FSB_TO_B(mp, last) == mp->m_dirblksize;
 	ASSERT(rval == 0 || dp->i_d.di_size == mp->m_dirblksize);
 	*vp = rval;
@@ -669,20 +595,19 @@ xfs_dir2_isblock(
 /*
  * See if the directory is a single-leaf form directory.
  */
-int					/* error */
+int
 xfs_dir2_isleaf(
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_inode_t	*dp,		/* incore directory inode */
+	xfs_trans_t	*tp,
+	xfs_inode_t	*dp,
 	int		*vp)		/* out: 1 is leaf, 0 is not leaf */
 {
 	xfs_fileoff_t	last;		/* last file offset */
-	xfs_mount_t	*mp;		/* filesystem mount point */
-	int		rval;		/* return value */
+	xfs_mount_t	*mp;
+	int		rval;
 
 	mp = dp->i_mount;
-	if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) {
+	if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK)))
 		return rval;
-	}
 	*vp = last == mp->m_dirleafblk + (1 << mp->m_sb.sb_dirblklog);
 	return 0;
 }
@@ -690,9 +615,9 @@ xfs_dir2_isleaf(
 /*
  * Getdents put routine for 64-bit ABI, direct form.
  */
-static int					/* error */
+static int
 xfs_dir2_put_dirent64_direct(
-	xfs_dir2_put_args_t	*pa)		/* argument bundle */
+	xfs_dir2_put_args_t	*pa)
 {
 	xfs_dirent_t		*idbp;		/* dirent pointer */
 	iovec_t			*iovp;		/* io vector */
@@ -727,9 +652,9 @@ xfs_dir2_put_dirent64_direct(
 /*
  * Getdents put routine for 64-bit ABI, uio form.
  */
-static int					/* error */
+static int
 xfs_dir2_put_dirent64_uio(
-	xfs_dir2_put_args_t	*pa)		/* argument bundle */
+	xfs_dir2_put_args_t	*pa)
 {
 	xfs_dirent_t		*idbp;		/* dirent pointer */
 	int			namelen;	/* entry name length */
@@ -765,17 +690,17 @@ xfs_dir2_put_dirent64_uio(
  */
 int
 xfs_dir2_shrink_inode(
-	xfs_da_args_t	*args,		/* operation arguments */
-	xfs_dir2_db_t	db,		/* directory block number */
-	xfs_dabuf_t	*bp)		/* block's buffer */
+	xfs_da_args_t	*args,
+	xfs_dir2_db_t	db,
+	xfs_dabuf_t	*bp)
 {
 	xfs_fileoff_t	bno;		/* directory file offset */
 	xfs_dablk_t	da;		/* directory file offset */
 	int		done;		/* bunmap is finished */
-	xfs_inode_t	*dp;		/* incore directory inode */
-	int		error;		/* error return value */
-	xfs_mount_t	*mp;		/* filesystem mount point */
-	xfs_trans_t	*tp;		/* transaction pointer */
+	xfs_inode_t	*dp;
+	int		error;
+	xfs_mount_t	*mp;
+	xfs_trans_t	*tp;
 
 	xfs_dir2_trace_args_db("shrink_inode", args, db, bp);
 	dp = args->dp;
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index 7dd364b1e03..86560b6f794 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -22,7 +22,9 @@ struct uio;
 struct xfs_dabuf;
 struct xfs_da_args;
 struct xfs_dir2_put_args;
+struct xfs_bmap_free;
 struct xfs_inode;
+struct xfs_mount;
 struct xfs_trans;
 
 /*
@@ -73,7 +75,35 @@ typedef struct xfs_dir2_put_args {
 } xfs_dir2_put_args_t;
 
 /*
- * Other interfaces used by the rest of the dir v2 code.
+ * Generic directory interface routines
+ */
+extern void xfs_dir_startup(void);
+extern void xfs_dir_mount(struct xfs_mount *mp);
+extern int xfs_dir_isempty(struct xfs_inode *dp);
+extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp,
+				struct xfs_inode *pdp);
+extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp,
+				char *name, int namelen, xfs_ino_t inum,
+				xfs_fsblock_t *first,
+				struct xfs_bmap_free *flist, xfs_extlen_t tot);
+extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
+				char *name, int namelen, xfs_ino_t *inum);
+extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
+				char *name, int namelen, xfs_ino_t ino,
+				xfs_fsblock_t *first,
+				struct xfs_bmap_free *flist, xfs_extlen_t tot);
+extern int xfs_dir_getdents(struct xfs_trans *tp, struct xfs_inode *dp,
+				uio_t *uio, int *eofp);
+extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
+				char *name, int namelen, xfs_ino_t inum,
+				xfs_fsblock_t *first,
+				struct xfs_bmap_free *flist, xfs_extlen_t tot);
+extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
+				char *name, int namelen);
+extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
+
+/*
+ * Utility routines for v2 directories.
  */
 extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
 				xfs_dir2_db_t *dbp);
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 2621ff521db..9d7438bba30 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -22,19 +22,16 @@
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
-#include "xfs_dir_leaf.h"
 #include "xfs_dir2_data.h"
 #include "xfs_dir2_leaf.h"
 #include "xfs_dir2_block.h"
@@ -51,6 +48,18 @@ static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **bpp,
 				     int *entno);
 static int xfs_dir2_block_sort(const void *a, const void *b);
 
+static xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot;
+
+/*
+ * One-time startup routine called from xfs_init().
+ */
+void
+xfs_dir_startup(void)
+{
+	xfs_dir_hash_dot = xfs_da_hashname(".", 1);
+	xfs_dir_hash_dotdot = xfs_da_hashname("..", 2);
+}
+
 /*
  * Add an entry to a block directory.
  */
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 7be37d38961..f7c79921707 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -22,18 +22,15 @@
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
-#include "xfs_dir_leaf.h"
 #include "xfs_dir2_data.h"
 #include "xfs_dir2_leaf.h"
 #include "xfs_dir2_block.h"
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 74ef99f2ee5..b1cf1fbf423 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_attr_sf.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index b1f85cc7795..9ca71719b68 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -22,13 +22,11 @@
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 06afa1b324c..0cd77b17bf9 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -22,19 +22,16 @@
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
-#include "xfs_dir_leaf.h"
 #include "xfs_error.h"
 #include "xfs_dir2_data.h"
 #include "xfs_dir2_leaf.h"
diff --git a/fs/xfs/xfs_dir2_trace.c b/fs/xfs/xfs_dir2_trace.c
index c626943b411..f3fb2ffd6f5 100644
--- a/fs/xfs/xfs_dir2_trace.c
+++ b/fs/xfs/xfs_dir2_trace.c
@@ -19,11 +19,9 @@
 #include "xfs_fs.h"
 #include "xfs_types.h"
 #include "xfs_inum.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_dir_leaf.c b/fs/xfs/xfs_dir_leaf.c
index e7121040cd9..e69de29bb2d 100644
--- a/fs/xfs/xfs_dir_leaf.c
+++ b/fs/xfs/xfs_dir_leaf.c
@@ -1,2230 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_dir.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
-#include "xfs_mount.h"
-#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_alloc.h"
-#include "xfs_btree.h"
-#include "xfs_bmap.h"
-#include "xfs_dir_leaf.h"
-#include "xfs_error.h"
-
-/*
- * xfs_dir_leaf.c
- *
- * Routines to implement leaf blocks of directories as Btrees of hashed names.
- */
-
-/*========================================================================
- * Function prototypes for the kernel.
- *========================================================================*/
-
-/*
- * Routines used for growing the Btree.
- */
-STATIC void xfs_dir_leaf_add_work(xfs_dabuf_t *leaf_buffer, xfs_da_args_t *args,
-					      int insertion_index,
-					      int freemap_index);
-STATIC int xfs_dir_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *leaf_buffer,
-					    int musthave, int justcheck);
-STATIC void xfs_dir_leaf_rebalance(xfs_da_state_t *state,
-						  xfs_da_state_blk_t *blk1,
-						  xfs_da_state_blk_t *blk2);
-STATIC int xfs_dir_leaf_figure_balance(xfs_da_state_t *state,
-					  xfs_da_state_blk_t *leaf_blk_1,
-					  xfs_da_state_blk_t *leaf_blk_2,
-					  int *number_entries_in_blk1,
-					  int *number_namebytes_in_blk1);
-
-STATIC int xfs_dir_leaf_create(struct xfs_da_args *args,
-				xfs_dablk_t which_block,
-				struct xfs_dabuf **bpp);
-
-/*
- * Utility routines.
- */
-STATIC void xfs_dir_leaf_moveents(xfs_dir_leafblock_t *src_leaf,
-					      int src_start,
-					      xfs_dir_leafblock_t *dst_leaf,
-					      int dst_start, int move_count,
-					      xfs_mount_t *mp);
-
-
-/*========================================================================
- * External routines when dirsize < XFS_IFORK_DSIZE(dp).
- *========================================================================*/
-
-
-/*
- * Validate a given inode number.
- */
-int
-xfs_dir_ino_validate(xfs_mount_t *mp, xfs_ino_t ino)
-{
-	xfs_agblock_t	agblkno;
-	xfs_agino_t	agino;
-	xfs_agnumber_t	agno;
-	int		ino_ok;
-	int		ioff;
-
-	agno = XFS_INO_TO_AGNO(mp, ino);
-	agblkno = XFS_INO_TO_AGBNO(mp, ino);
-	ioff = XFS_INO_TO_OFFSET(mp, ino);
-	agino = XFS_OFFBNO_TO_AGINO(mp, agblkno, ioff);
-	ino_ok =
-		agno < mp->m_sb.sb_agcount &&
-		agblkno < mp->m_sb.sb_agblocks &&
-		agblkno != 0 &&
-		ioff < (1 << mp->m_sb.sb_inopblog) &&
-		XFS_AGINO_TO_INO(mp, agno, agino) == ino;
-	if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE,
-			XFS_RANDOM_DIR_INO_VALIDATE))) {
-		xfs_fs_cmn_err(CE_WARN, mp, "Invalid inode number 0x%Lx",
-				(unsigned long long) ino);
-		XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
-		return XFS_ERROR(EFSCORRUPTED);
-	}
-	return 0;
-}
-
-/*
- * Create the initial contents of a shortform directory.
- */
-int
-xfs_dir_shortform_create(xfs_da_args_t *args, xfs_ino_t parent)
-{
-	xfs_dir_sf_hdr_t *hdr;
-	xfs_inode_t *dp;
-
-	dp = args->dp;
-	ASSERT(dp != NULL);
-	ASSERT(dp->i_d.di_size == 0);
-	if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) {
-		dp->i_df.if_flags &= ~XFS_IFEXTENTS;	/* just in case */
-		dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
-		xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
-		dp->i_df.if_flags |= XFS_IFINLINE;
-	}
-	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
-	ASSERT(dp->i_df.if_bytes == 0);
-	xfs_idata_realloc(dp, sizeof(*hdr), XFS_DATA_FORK);
-	hdr = (xfs_dir_sf_hdr_t *)dp->i_df.if_u1.if_data;
-	XFS_DIR_SF_PUT_DIRINO(&parent, &hdr->parent);
-
-	hdr->count = 0;
-	dp->i_d.di_size = sizeof(*hdr);
-	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
-	return 0;
-}
-
-/*
- * Add a name to the shortform directory structure.
- * Overflow from the inode has already been checked for.
- */
-int
-xfs_dir_shortform_addname(xfs_da_args_t *args)
-{
-	xfs_dir_shortform_t *sf;
-	xfs_dir_sf_entry_t *sfe;
-	int i, offset, size;
-	xfs_inode_t *dp;
-
-	dp = args->dp;
-	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
-	/*
-	 * Catch the case where the conversion from shortform to leaf
-	 * failed part way through.
-	 */
-	if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
-		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
-		return XFS_ERROR(EIO);
-	}
-	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
-	ASSERT(dp->i_df.if_u1.if_data != NULL);
-	sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
-	sfe = &sf->list[0];
-	for (i = sf->hdr.count-1; i >= 0; i--) {
-		if (sfe->namelen == args->namelen &&
-		    args->name[0] == sfe->name[0] &&
-		    memcmp(args->name, sfe->name, args->namelen) == 0)
-			return XFS_ERROR(EEXIST);
-		sfe = XFS_DIR_SF_NEXTENTRY(sfe);
-	}
-
-	offset = (int)((char *)sfe - (char *)sf);
-	size = XFS_DIR_SF_ENTSIZE_BYNAME(args->namelen);
-	xfs_idata_realloc(dp, size, XFS_DATA_FORK);
-	sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
-	sfe = (xfs_dir_sf_entry_t *)((char *)sf + offset);
-
-	XFS_DIR_SF_PUT_DIRINO(&args->inumber, &sfe->inumber);
-	sfe->namelen = args->namelen;
-	memcpy(sfe->name, args->name, sfe->namelen);
-	sf->hdr.count++;
-
-	dp->i_d.di_size += size;
-	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
-
-	return 0;
-}
-
-/*
- * Remove a name from the shortform directory structure.
- */
-int
-xfs_dir_shortform_removename(xfs_da_args_t *args)
-{
-	xfs_dir_shortform_t *sf;
-	xfs_dir_sf_entry_t *sfe;
-	int base, size = 0, i;
-	xfs_inode_t *dp;
-
-	dp = args->dp;
-	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
-	/*
-	 * Catch the case where the conversion from shortform to leaf
-	 * failed part way through.
-	 */
-	if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
-		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
-		return XFS_ERROR(EIO);
-	}
-	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
-	ASSERT(dp->i_df.if_u1.if_data != NULL);
-	base = sizeof(xfs_dir_sf_hdr_t);
-	sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
-	sfe = &sf->list[0];
-	for (i = sf->hdr.count-1; i >= 0; i--) {
-		size = XFS_DIR_SF_ENTSIZE_BYENTRY(sfe);
-		if (sfe->namelen == args->namelen &&
-		    sfe->name[0] == args->name[0] &&
-		    memcmp(sfe->name, args->name, args->namelen) == 0)
-			break;
-		base += size;
-		sfe = XFS_DIR_SF_NEXTENTRY(sfe);
-	}
-	if (i < 0) {
-		ASSERT(args->oknoent);
-		return XFS_ERROR(ENOENT);
-	}
-
-	if ((base + size) != dp->i_d.di_size) {
-		memmove(&((char *)sf)[base], &((char *)sf)[base+size],
-					      dp->i_d.di_size - (base+size));
-	}
-	sf->hdr.count--;
-
-	xfs_idata_realloc(dp, -size, XFS_DATA_FORK);
-	dp->i_d.di_size -= size;
-	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
-
-	return 0;
-}
-
-/*
- * Look up a name in a shortform directory structure.
- */
-int
-xfs_dir_shortform_lookup(xfs_da_args_t *args)
-{
-	xfs_dir_shortform_t *sf;
-	xfs_dir_sf_entry_t *sfe;
-	int i;
-	xfs_inode_t *dp;
-
-	dp = args->dp;
-	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
-	/*
-	 * Catch the case where the conversion from shortform to leaf
-	 * failed part way through.
-	 */
-	if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
-		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
-		return XFS_ERROR(EIO);
-	}
-	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
-	ASSERT(dp->i_df.if_u1.if_data != NULL);
-	sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
-	if (args->namelen == 2 &&
-	    args->name[0] == '.' && args->name[1] == '.') {
-		XFS_DIR_SF_GET_DIRINO(&sf->hdr.parent, &args->inumber);
-		return(XFS_ERROR(EEXIST));
-	}
-	if (args->namelen == 1 && args->name[0] == '.') {
-		args->inumber = dp->i_ino;
-		return(XFS_ERROR(EEXIST));
-	}
-	sfe = &sf->list[0];
-	for (i = sf->hdr.count-1; i >= 0; i--) {
-		if (sfe->namelen == args->namelen &&
-		    sfe->name[0] == args->name[0] &&
-		    memcmp(args->name, sfe->name, args->namelen) == 0) {
-			XFS_DIR_SF_GET_DIRINO(&sfe->inumber, &args->inumber);
-			return(XFS_ERROR(EEXIST));
-		}
-		sfe = XFS_DIR_SF_NEXTENTRY(sfe);
-	}
-	ASSERT(args->oknoent);
-	return(XFS_ERROR(ENOENT));
-}
-
-/*
- * Convert from using the shortform to the leaf.
- */
-int
-xfs_dir_shortform_to_leaf(xfs_da_args_t *iargs)
-{
-	xfs_inode_t *dp;
-	xfs_dir_shortform_t *sf;
-	xfs_dir_sf_entry_t *sfe;
-	xfs_da_args_t args;
-	xfs_ino_t inumber;
-	char *tmpbuffer;
-	int retval, i, size;
-	xfs_dablk_t blkno;
-	xfs_dabuf_t *bp;
-
-	dp = iargs->dp;
-	/*
-	 * Catch the case where the conversion from shortform to leaf
-	 * failed part way through.
-	 */
-	if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
-		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
-		return XFS_ERROR(EIO);
-	}
-	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
-	ASSERT(dp->i_df.if_u1.if_data != NULL);
-	size = dp->i_df.if_bytes;
-	tmpbuffer = kmem_alloc(size, KM_SLEEP);
-	ASSERT(tmpbuffer != NULL);
-
-	memcpy(tmpbuffer, dp->i_df.if_u1.if_data, size);
-
-	sf = (xfs_dir_shortform_t *)tmpbuffer;
-	XFS_DIR_SF_GET_DIRINO(&sf->hdr.parent, &inumber);
-
-	xfs_idata_realloc(dp, -size, XFS_DATA_FORK);
-	dp->i_d.di_size = 0;
-	xfs_trans_log_inode(iargs->trans, dp, XFS_ILOG_CORE);
-	retval = xfs_da_grow_inode(iargs, &blkno);
-	if (retval)
-		goto out;
-
-	ASSERT(blkno == 0);
-	retval = xfs_dir_leaf_create(iargs, blkno, &bp);
-	if (retval)
-		goto out;
-	xfs_da_buf_done(bp);
-
-	args.name = ".";
-	args.namelen = 1;
-	args.hashval = xfs_dir_hash_dot;
-	args.inumber = dp->i_ino;
-	args.dp = dp;
-	args.firstblock = iargs->firstblock;
-	args.flist = iargs->flist;
-	args.total = iargs->total;
-	args.whichfork = XFS_DATA_FORK;
-	args.trans = iargs->trans;
-	args.justcheck = 0;
-	args.addname = args.oknoent = 1;
-	retval = xfs_dir_leaf_addname(&args);
-	if (retval)
-		goto out;
-
-	args.name = "..";
-	args.namelen = 2;
-	args.hashval = xfs_dir_hash_dotdot;
-	args.inumber = inumber;
-	retval = xfs_dir_leaf_addname(&args);
-	if (retval)
-		goto out;
-
-	sfe = &sf->list[0];
-	for (i = 0; i < sf->hdr.count; i++) {
-		args.name = (char *)(sfe->name);
-		args.namelen = sfe->namelen;
-		args.hashval = xfs_da_hashname((char *)(sfe->name),
-					       sfe->namelen);
-		XFS_DIR_SF_GET_DIRINO(&sfe->inumber, &args.inumber);
-		retval = xfs_dir_leaf_addname(&args);
-		if (retval)
-			goto out;
-		sfe = XFS_DIR_SF_NEXTENTRY(sfe);
-	}
-	retval = 0;
-
-out:
-	kmem_free(tmpbuffer, size);
-	return retval;
-}
-
-STATIC int
-xfs_dir_shortform_compare(const void *a, const void *b)
-{
-	xfs_dir_sf_sort_t *sa, *sb;
-
-	sa = (xfs_dir_sf_sort_t *)a;
-	sb = (xfs_dir_sf_sort_t *)b;
-	if (sa->hash < sb->hash)
-		return -1;
-	else if (sa->hash > sb->hash)
-		return 1;
-	else
-		return sa->entno - sb->entno;
-}
-
-/*
- * Copy out directory entries for getdents(), for shortform directories.
- */
-/*ARGSUSED*/
-int
-xfs_dir_shortform_getdents(xfs_inode_t *dp, uio_t *uio, int *eofp,
-				       xfs_dirent_t *dbp, xfs_dir_put_t put)
-{
-	xfs_dir_shortform_t *sf;
-	xfs_dir_sf_entry_t *sfe;
-	int retval, i, sbsize, nsbuf, lastresid=0, want_entno;
-	xfs_mount_t *mp;
-	xfs_dahash_t cookhash, hash;
-	xfs_dir_put_args_t p;
-	xfs_dir_sf_sort_t *sbuf, *sbp;
-
-	mp = dp->i_mount;
-	sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
-	cookhash = XFS_DA_COOKIE_HASH(mp, uio->uio_offset);
-	want_entno = XFS_DA_COOKIE_ENTRY(mp, uio->uio_offset);
-	nsbuf = sf->hdr.count + 2;
-	sbsize = (nsbuf + 1) * sizeof(*sbuf);
-	sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP);
-
-	xfs_dir_trace_g_du("sf: start", dp, uio);
-
-	/*
-	 * Collect all the entries into the buffer.
-	 * Entry 0 is .
-	 */
-	sbp->entno = 0;
-	sbp->seqno = 0;
-	sbp->hash = xfs_dir_hash_dot;
-	sbp->ino = dp->i_ino;
-	sbp->name = ".";
-	sbp->namelen = 1;
-	sbp++;
-
-	/*
-	 * Entry 1 is ..
-	 */
-	sbp->entno = 1;
-	sbp->seqno = 0;
-	sbp->hash = xfs_dir_hash_dotdot;
-	sbp->ino = XFS_GET_DIR_INO8(sf->hdr.parent);
-	sbp->name = "..";
-	sbp->namelen = 2;
-	sbp++;
-
-	/*
-	 * Scan the directory data for the rest of the entries.
-	 */
-	for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
-
-		if (unlikely(
-		    ((char *)sfe < (char *)sf) ||
-		    ((char *)sfe >= ((char *)sf + dp->i_df.if_bytes)))) {
-			xfs_dir_trace_g_du("sf: corrupted", dp, uio);
-			XFS_CORRUPTION_ERROR("xfs_dir_shortform_getdents",
-					     XFS_ERRLEVEL_LOW, mp, sfe);
-			kmem_free(sbuf, sbsize);
-			return XFS_ERROR(EFSCORRUPTED);
-		}
-
-		sbp->entno = i + 2;
-		sbp->seqno = 0;
-		sbp->hash = xfs_da_hashname((char *)sfe->name, sfe->namelen);
-		sbp->ino = XFS_GET_DIR_INO8(sfe->inumber);
-		sbp->name = (char *)sfe->name;
-		sbp->namelen = sfe->namelen;
-		sfe = XFS_DIR_SF_NEXTENTRY(sfe);
-		sbp++;
-	}
-
-	/*
-	 * Sort the entries on hash then entno.
-	 */
-	xfs_sort(sbuf, nsbuf, sizeof(*sbuf), xfs_dir_shortform_compare);
-	/*
-	 * Stuff in last entry.
-	 */
-	sbp->entno = nsbuf;
-	sbp->hash = XFS_DA_MAXHASH;
-	sbp->seqno = 0;
-	/*
-	 * Figure out the sequence numbers in case there's a hash duplicate.
-	 */
-	for (hash = sbuf->hash, sbp = sbuf + 1;
-				sbp < &sbuf[nsbuf + 1]; sbp++) {
-		if (sbp->hash == hash)
-			sbp->seqno = sbp[-1].seqno + 1;
-		else
-			hash = sbp->hash;
-	}
-
-	/*
-	 * Set up put routine.
-	 */
-	p.dbp = dbp;
-	p.put = put;
-	p.uio = uio;
-
-	/*
-	 * Find our place.
-	 */
-	for (sbp = sbuf; sbp < &sbuf[nsbuf + 1]; sbp++) {
-		if (sbp->hash > cookhash ||
-		    (sbp->hash == cookhash && sbp->seqno >= want_entno))
-			break;
-	}
-
-	/*
-	 * Did we fail to find anything?  We stop at the last entry,
-	 * the one we put maxhash into.
-	 */
-	if (sbp == &sbuf[nsbuf]) {
-		kmem_free(sbuf, sbsize);
-		xfs_dir_trace_g_du("sf: hash beyond end", dp, uio);
-		uio->uio_offset = XFS_DA_MAKE_COOKIE(mp, 0, 0, XFS_DA_MAXHASH);
-		*eofp = 1;
-		return 0;
-	}
-
-	/*
-	 * Loop putting entries into the user buffer.
-	 */
-	while (sbp < &sbuf[nsbuf]) {
-		/*
-		 * Save the first resid in a run of equal-hashval entries
-		 * so that we can back them out if they don't all fit.
-		 */
-		if (sbp->seqno == 0 || sbp == sbuf)
-			lastresid = uio->uio_resid;
-		XFS_PUT_COOKIE(p.cook, mp, 0, sbp[1].seqno, sbp[1].hash);
-		p.ino = sbp->ino;
-#if XFS_BIG_INUMS
-		p.ino += mp->m_inoadd;
-#endif
-		p.name = sbp->name;
-		p.namelen = sbp->namelen;
-		retval = p.put(&p);
-		if (!p.done) {
-			uio->uio_offset =
-				XFS_DA_MAKE_COOKIE(mp, 0, 0, sbp->hash);
-			kmem_free(sbuf, sbsize);
-			uio->uio_resid = lastresid;
-			xfs_dir_trace_g_du("sf: E-O-B", dp, uio);
-			return retval;
-		}
-		sbp++;
-	}
-	kmem_free(sbuf, sbsize);
-	uio->uio_offset = p.cook.o;
-	*eofp = 1;
-	xfs_dir_trace_g_du("sf: E-O-F", dp, uio);
-	return 0;
-}
-
-/*
- * Look up a name in a shortform directory structure, replace the inode number.
- */
-int
-xfs_dir_shortform_replace(xfs_da_args_t *args)
-{
-	xfs_dir_shortform_t *sf;
-	xfs_dir_sf_entry_t *sfe;
-	xfs_inode_t *dp;
-	int i;
-
-	dp = args->dp;
-	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
-	/*
-	 * Catch the case where the conversion from shortform to leaf
-	 * failed part way through.
-	 */
-	if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
-		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
-		return XFS_ERROR(EIO);
-	}
-	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
-	ASSERT(dp->i_df.if_u1.if_data != NULL);
-	sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
-	if (args->namelen == 2 &&
-	    args->name[0] == '.' && args->name[1] == '.') {
-		/* XXX - replace assert? */
-		XFS_DIR_SF_PUT_DIRINO(&args->inumber, &sf->hdr.parent);
-		xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
-		return 0;
-	}
-	ASSERT(args->namelen != 1 || args->name[0] != '.');
-	sfe = &sf->list[0];
-	for (i = sf->hdr.count-1; i >= 0; i--) {
-		if (sfe->namelen == args->namelen &&
-		    sfe->name[0] == args->name[0] &&
-		    memcmp(args->name, sfe->name, args->namelen) == 0) {
-			ASSERT(memcmp((char *)&args->inumber,
-				(char *)&sfe->inumber, sizeof(xfs_ino_t)));
-			XFS_DIR_SF_PUT_DIRINO(&args->inumber, &sfe->inumber);
-			xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
-			return 0;
-		}
-		sfe = XFS_DIR_SF_NEXTENTRY(sfe);
-	}
-	ASSERT(args->oknoent);
-	return XFS_ERROR(ENOENT);
-}
-
-/*
- * Convert a leaf directory to shortform structure
- */
-int
-xfs_dir_leaf_to_shortform(xfs_da_args_t *iargs)
-{
-	xfs_dir_leafblock_t *leaf;
-	xfs_dir_leaf_hdr_t *hdr;
-	xfs_dir_leaf_entry_t *entry;
-	xfs_dir_leaf_name_t *namest;
-	xfs_da_args_t args;
-	xfs_inode_t *dp;
-	xfs_ino_t parent = 0;
-	char *tmpbuffer;
-	int retval, i;
-	xfs_dabuf_t *bp;
-
-	dp = iargs->dp;
-	tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP);
-	ASSERT(tmpbuffer != NULL);
-
-	retval = xfs_da_read_buf(iargs->trans, iargs->dp, 0, -1, &bp,
-					       XFS_DATA_FORK);
-	if (retval)
-		goto out;
-	ASSERT(bp != NULL);
-	memcpy(tmpbuffer, bp->data, XFS_LBSIZE(dp->i_mount));
-	leaf = (xfs_dir_leafblock_t *)tmpbuffer;
-	ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-	memset(bp->data, 0, XFS_LBSIZE(dp->i_mount));
-
-	/*
-	 * Find and special case the parent inode number
-	 */
-	hdr = &leaf->hdr;
-	entry = &leaf->entries[0];
-	for (i = be16_to_cpu(hdr->count)-1; i >= 0; entry++, i--) {
-		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, be16_to_cpu(entry->nameidx));
-		if ((entry->namelen == 2) &&
-		    (namest->name[0] == '.') &&
-		    (namest->name[1] == '.')) {
-			XFS_DIR_SF_GET_DIRINO(&namest->inumber, &parent);
-			entry->nameidx = 0;
-		} else if ((entry->namelen == 1) && (namest->name[0] == '.')) {
-			entry->nameidx = 0;
-		}
-	}
-	retval = xfs_da_shrink_inode(iargs, 0, bp);
-	if (retval)
-		goto out;
-	retval = xfs_dir_shortform_create(iargs, parent);
-	if (retval)
-		goto out;
-
-	/*
-	 * Copy the rest of the filenames
-	 */
-	entry = &leaf->entries[0];
-	args.dp = dp;
-	args.firstblock = iargs->firstblock;
-	args.flist = iargs->flist;
-	args.total = iargs->total;
-	args.whichfork = XFS_DATA_FORK;
-	args.trans = iargs->trans;
-	args.justcheck = 0;
-	args.addname = args.oknoent = 1;
-	for (i = 0; i < be16_to_cpu(hdr->count); entry++, i++) {
-		if (!entry->nameidx)
-			continue;
-		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, be16_to_cpu(entry->nameidx));
-		args.name = (char *)(namest->name);
-		args.namelen = entry->namelen;
-		args.hashval = be32_to_cpu(entry->hashval);
-		XFS_DIR_SF_GET_DIRINO(&namest->inumber, &args.inumber);
-		xfs_dir_shortform_addname(&args);
-	}
-
-out:
-	kmem_free(tmpbuffer, XFS_LBSIZE(dp->i_mount));
-	return retval;
-}
-
-/*
- * Convert from using a single leaf to a root node and a leaf.
- */
-int
-xfs_dir_leaf_to_node(xfs_da_args_t *args)
-{
-	xfs_dir_leafblock_t *leaf;
-	xfs_da_intnode_t *node;
-	xfs_inode_t *dp;
-	xfs_dabuf_t *bp1, *bp2;
-	xfs_dablk_t blkno;
-	int retval;
-
-	dp = args->dp;
-	retval = xfs_da_grow_inode(args, &blkno);
-	ASSERT(blkno == 1);
-	if (retval)
-		return retval;
-	retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1,
-					      XFS_DATA_FORK);
-	if (retval)
-		return retval;
-	ASSERT(bp1 != NULL);
-	retval = xfs_da_get_buf(args->trans, args->dp, 1, -1, &bp2,
-					     XFS_DATA_FORK);
-	if (retval) {
-		xfs_da_buf_done(bp1);
-		return retval;
-	}
-	ASSERT(bp2 != NULL);
-	memcpy(bp2->data, bp1->data, XFS_LBSIZE(dp->i_mount));
-	xfs_da_buf_done(bp1);
-	xfs_da_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1);
-
-	/*
-	 * Set up the new root node.
-	 */
-	retval = xfs_da_node_create(args, 0, 1, &bp1, XFS_DATA_FORK);
-	if (retval) {
-		xfs_da_buf_done(bp2);
-		return retval;
-	}
-	node = bp1->data;
-	leaf = bp2->data;
-	ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-	node->btree[0].hashval = leaf->entries[be16_to_cpu(leaf->hdr.count)-1].hashval;
-	xfs_da_buf_done(bp2);
-	node->btree[0].before = cpu_to_be32(blkno);
-	node->hdr.count = cpu_to_be16(1);
-	xfs_da_log_buf(args->trans, bp1,
-		XFS_DA_LOGRANGE(node, &node->btree[0], sizeof(node->btree[0])));
-	xfs_da_buf_done(bp1);
-
-	return retval;
-}
-
-
-/*========================================================================
- * Routines used for growing the Btree.
- *========================================================================*/
-
-/*
- * Create the initial contents of a leaf directory
- * or a leaf in a node directory.
- */
-STATIC int
-xfs_dir_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp)
-{
-	xfs_dir_leafblock_t *leaf;
-	xfs_dir_leaf_hdr_t *hdr;
-	xfs_inode_t *dp;
-	xfs_dabuf_t *bp;
-	int retval;
-
-	dp = args->dp;
-	ASSERT(dp != NULL);
-	retval = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp, XFS_DATA_FORK);
-	if (retval)
-		return retval;
-	ASSERT(bp != NULL);
-	leaf = bp->data;
-	memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount));
-	hdr = &leaf->hdr;
-	hdr->info.magic = cpu_to_be16(XFS_DIR_LEAF_MAGIC);
-	hdr->firstused = cpu_to_be16(XFS_LBSIZE(dp->i_mount));
-	if (!hdr->firstused)
-		hdr->firstused = cpu_to_be16(XFS_LBSIZE(dp->i_mount) - 1);
-	hdr->freemap[0].base = cpu_to_be16(sizeof(xfs_dir_leaf_hdr_t));
-	hdr->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr->firstused) -
-					   be16_to_cpu(hdr->freemap[0].base));
-
-	xfs_da_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1);
-
-	*bpp = bp;
-	return 0;
-}
-
-/*
- * Split the leaf node, rebalance, then add the new entry.
- */
-int
-xfs_dir_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
-				  xfs_da_state_blk_t *newblk)
-{
-	xfs_dablk_t blkno;
-	xfs_da_args_t *args;
-	int error;
-
-	/*
-	 * Allocate space for a new leaf node.
-	 */
-	args = state->args;
-	ASSERT(args != NULL);
-	ASSERT(oldblk->magic == XFS_DIR_LEAF_MAGIC);
-	error = xfs_da_grow_inode(args, &blkno);
-	if (error)
-		return error;
-	error = xfs_dir_leaf_create(args, blkno, &newblk->bp);
-	if (error)
-		return error;
-	newblk->blkno = blkno;
-	newblk->magic = XFS_DIR_LEAF_MAGIC;
-
-	/*
-	 * Rebalance the entries across the two leaves.
-	 */
-	xfs_dir_leaf_rebalance(state, oldblk, newblk);
-	error = xfs_da_blk_link(state, oldblk, newblk);
-	if (error)
-		return error;
-
-	/*
-	 * Insert the new entry in the correct block.
-	 */
-	if (state->inleaf) {
-		error = xfs_dir_leaf_add(oldblk->bp, args, oldblk->index);
-	} else {
-		error = xfs_dir_leaf_add(newblk->bp, args, newblk->index);
-	}
-
-	/*
-	 * Update last hashval in each block since we added the name.
-	 */
-	oldblk->hashval = xfs_dir_leaf_lasthash(oldblk->bp, NULL);
-	newblk->hashval = xfs_dir_leaf_lasthash(newblk->bp, NULL);
-	return error;
-}
-
-/*
- * Add a name to the leaf directory structure.
- *
- * Must take into account fragmented leaves and leaves where spacemap has
- * lost some freespace information (ie: holes).
- */
-int
-xfs_dir_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args, int index)
-{
-	xfs_dir_leafblock_t *leaf;
-	xfs_dir_leaf_hdr_t *hdr;
-	xfs_dir_leaf_map_t *map;
-	int tablesize, entsize, sum, i, tmp, error;
-
-	leaf = bp->data;
-	ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-	ASSERT((index >= 0) && (index <= be16_to_cpu(leaf->hdr.count)));
-	hdr = &leaf->hdr;
-	entsize = XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen);
-
-	/*
-	 * Search through freemap for first-fit on new name length.
-	 * (may need to figure in size of entry struct too)
-	 */
-	tablesize = (be16_to_cpu(hdr->count) + 1) *
-		sizeof(xfs_dir_leaf_entry_t) + sizeof(xfs_dir_leaf_hdr_t);
-	map = &hdr->freemap[XFS_DIR_LEAF_MAPSIZE-1];
-	for (sum = 0, i = XFS_DIR_LEAF_MAPSIZE-1; i >= 0; map--, i--) {
-		if (tablesize > be16_to_cpu(hdr->firstused)) {
-			sum += be16_to_cpu(map->size);
-			continue;
-		}
-		if (!map->size)
-			continue;	/* no space in this map */
-		tmp = entsize;
-		if (be16_to_cpu(map->base) < be16_to_cpu(hdr->firstused))
-			tmp += (uint)sizeof(xfs_dir_leaf_entry_t);
-		if (be16_to_cpu(map->size) >= tmp) {
-			if (!args->justcheck)
-				xfs_dir_leaf_add_work(bp, args, index, i);
-			return 0;
-		}
-		sum += be16_to_cpu(map->size);
-	}
-
-	/*
-	 * If there are no holes in the address space of the block,
-	 * and we don't have enough freespace, then compaction will do us
-	 * no good and we should just give up.
-	 */
-	if (!hdr->holes && (sum < entsize))
-		return XFS_ERROR(ENOSPC);
-
-	/*
-	 * Compact the entries to coalesce free space.
-	 * Pass the justcheck flag so the checking pass can return
-	 * an error, without changing anything, if it won't fit.
-	 */
-	error = xfs_dir_leaf_compact(args->trans, bp,
-			args->total == 0 ?
-				entsize +
-				(uint)sizeof(xfs_dir_leaf_entry_t) : 0,
-			args->justcheck);
-	if (error)
-		return error;
-	/*
-	 * After compaction, the block is guaranteed to have only one
-	 * free region, in freemap[0].  If it is not big enough, give up.
-	 */
-	if (be16_to_cpu(hdr->freemap[0].size) <
-	    (entsize + (uint)sizeof(xfs_dir_leaf_entry_t)))
-		return XFS_ERROR(ENOSPC);
-
-	if (!args->justcheck)
-		xfs_dir_leaf_add_work(bp, args, index, 0);
-	return 0;
-}
-
-/*
- * Add a name to a leaf directory structure.
- */
-STATIC void
-xfs_dir_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int index,
-		      int mapindex)
-{
-	xfs_dir_leafblock_t *leaf;
-	xfs_dir_leaf_hdr_t *hdr;
-	xfs_dir_leaf_entry_t *entry;
-	xfs_dir_leaf_name_t *namest;
-	xfs_dir_leaf_map_t *map;
-	/* REFERENCED */
-	xfs_mount_t *mp;
-	int tmp, i;
-
-	leaf = bp->data;
-	ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-	hdr = &leaf->hdr;
-	ASSERT((mapindex >= 0) && (mapindex < XFS_DIR_LEAF_MAPSIZE));
-	ASSERT((index >= 0) && (index <= be16_to_cpu(hdr->count)));
-
-	/*
-	 * Force open some space in the entry array and fill it in.
-	 */
-	entry = &leaf->entries[index];
-	if (index < be16_to_cpu(hdr->count)) {
-		tmp  = be16_to_cpu(hdr->count) - index;
-		tmp *= (uint)sizeof(xfs_dir_leaf_entry_t);
-		memmove(entry + 1, entry, tmp);
-		xfs_da_log_buf(args->trans, bp,
-		    XFS_DA_LOGRANGE(leaf, entry, tmp + (uint)sizeof(*entry)));
-	}
-	be16_add(&hdr->count, 1);
-
-	/*
-	 * Allocate space for the new string (at the end of the run).
-	 */
-	map = &hdr->freemap[mapindex];
-	mp = args->trans->t_mountp;
-	ASSERT(be16_to_cpu(map->base) < XFS_LBSIZE(mp));
-	ASSERT(be16_to_cpu(map->size) >= XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen));
-	ASSERT(be16_to_cpu(map->size) < XFS_LBSIZE(mp));
-
-	be16_add(&map->size, -(XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen)));
-	entry->nameidx = cpu_to_be16(be16_to_cpu(map->base) +
-				     be16_to_cpu(map->size));
-	entry->hashval = cpu_to_be32(args->hashval);
-	entry->namelen = args->namelen;
-	xfs_da_log_buf(args->trans, bp,
-	    XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
-
-	/*
-	 * Copy the string and inode number into the new space.
-	 */
-	namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, be16_to_cpu(entry->nameidx));
-	XFS_DIR_SF_PUT_DIRINO(&args->inumber, &namest->inumber);
-	memcpy(namest->name, args->name, args->namelen);
-	xfs_da_log_buf(args->trans, bp,
-	    XFS_DA_LOGRANGE(leaf, namest, XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry)));
-
-	/*
-	 * Update the control info for this leaf node
-	 */
-	if (be16_to_cpu(entry->nameidx) < be16_to_cpu(hdr->firstused))
-		hdr->firstused = entry->nameidx;
-	ASSERT(be16_to_cpu(hdr->firstused) >=
-	       ((be16_to_cpu(hdr->count)*sizeof(*entry))+sizeof(*hdr)));
-	tmp = (be16_to_cpu(hdr->count)-1) * (uint)sizeof(xfs_dir_leaf_entry_t)
-			+ (uint)sizeof(xfs_dir_leaf_hdr_t);
-	map = &hdr->freemap[0];
-	for (i = 0; i < XFS_DIR_LEAF_MAPSIZE; map++, i++) {
-		if (be16_to_cpu(map->base) == tmp) {
-			int entry_size = sizeof(xfs_dir_leaf_entry_t);
-			be16_add(&map->base, entry_size);
-			be16_add(&map->size, -entry_size);
-		}
-	}
-	be16_add(&hdr->namebytes, args->namelen);
-	xfs_da_log_buf(args->trans, bp,
-		XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
-}
-
-/*
- * Garbage collect a leaf directory block by copying it to a new buffer.
- */
-STATIC int
-xfs_dir_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp, int musthave,
-		     int justcheck)
-{
-	xfs_dir_leafblock_t *leaf_s, *leaf_d;
-	xfs_dir_leaf_hdr_t *hdr_s, *hdr_d;
-	xfs_mount_t *mp;
-	char *tmpbuffer;
-	char *tmpbuffer2=NULL;
-	int rval;
-	int lbsize;
-
-	mp = trans->t_mountp;
-	lbsize = XFS_LBSIZE(mp);
-	tmpbuffer = kmem_alloc(lbsize, KM_SLEEP);
-	ASSERT(tmpbuffer != NULL);
-	memcpy(tmpbuffer, bp->data, lbsize);
-
-	/*
-	 * Make a second copy in case xfs_dir_leaf_moveents()
-	 * below destroys the original.
-	 */
-	if (musthave || justcheck) {
-		tmpbuffer2 = kmem_alloc(lbsize, KM_SLEEP);
-		memcpy(tmpbuffer2, bp->data, lbsize);
-	}
-	memset(bp->data, 0, lbsize);
-
-	/*
-	 * Copy basic information
-	 */
-	leaf_s = (xfs_dir_leafblock_t *)tmpbuffer;
-	leaf_d = bp->data;
-	hdr_s = &leaf_s->hdr;
-	hdr_d = &leaf_d->hdr;
-	hdr_d->info = hdr_s->info;	/* struct copy */
-	hdr_d->firstused = cpu_to_be16(lbsize);
-	if (!hdr_d->firstused)
-		hdr_d->firstused = cpu_to_be16(lbsize - 1);
-	hdr_d->namebytes = 0;
-	hdr_d->count = 0;
-	hdr_d->holes = 0;
-	hdr_d->freemap[0].base = cpu_to_be16(sizeof(xfs_dir_leaf_hdr_t));
-	hdr_d->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr_d->firstused) -
-			                     be16_to_cpu(hdr_d->freemap[0].base));
-
-	/*
-	 * Copy all entry's in the same (sorted) order,
-	 * but allocate filenames packed and in sequence.
-	 * This changes the source (leaf_s) as well.
-	 */
-	xfs_dir_leaf_moveents(leaf_s, 0, leaf_d, 0, be16_to_cpu(hdr_s->count), mp);
-
-	if (musthave && be16_to_cpu(hdr_d->freemap[0].size) < musthave)
-		rval = XFS_ERROR(ENOSPC);
-	else
-		rval = 0;
-
-	if (justcheck || rval == ENOSPC) {
-		ASSERT(tmpbuffer2);
-		memcpy(bp->data, tmpbuffer2, lbsize);
-	} else {
-		xfs_da_log_buf(trans, bp, 0, lbsize - 1);
-	}
-
-	kmem_free(tmpbuffer, lbsize);
-	if (musthave || justcheck)
-		kmem_free(tmpbuffer2, lbsize);
-	return rval;
-}
-
-/*
- * Redistribute the directory entries between two leaf nodes,
- * taking into account the size of the new entry.
- *
- * NOTE: if new block is empty, then it will get the upper half of old block.
- */
-STATIC void
-xfs_dir_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
-				      xfs_da_state_blk_t *blk2)
-{
-	xfs_da_state_blk_t *tmp_blk;
-	xfs_dir_leafblock_t *leaf1, *leaf2;
-	xfs_dir_leaf_hdr_t *hdr1, *hdr2;
-	int count, totallen, max, space, swap;
-
-	/*
-	 * Set up environment.
-	 */
-	ASSERT(blk1->magic == XFS_DIR_LEAF_MAGIC);
-	ASSERT(blk2->magic == XFS_DIR_LEAF_MAGIC);
-	leaf1 = blk1->bp->data;
-	leaf2 = blk2->bp->data;
-	ASSERT(be16_to_cpu(leaf1->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-	ASSERT(be16_to_cpu(leaf2->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-
-	/*
-	 * Check ordering of blocks, reverse if it makes things simpler.
-	 */
-	swap = 0;
-	if (xfs_dir_leaf_order(blk1->bp, blk2->bp)) {
-		tmp_blk = blk1;
-		blk1 = blk2;
-		blk2 = tmp_blk;
-		leaf1 = blk1->bp->data;
-		leaf2 = blk2->bp->data;
-		swap = 1;
-	}
-	hdr1 = &leaf1->hdr;
-	hdr2 = &leaf2->hdr;
-
-	/*
-	 * Examine entries until we reduce the absolute difference in
-	 * byte usage between the two blocks to a minimum.  Then get
-	 * the direction to copy and the number of elements to move.
-	 */
-	state->inleaf = xfs_dir_leaf_figure_balance(state, blk1, blk2,
-							   &count, &totallen);
-	if (swap)
-		state->inleaf = !state->inleaf;
-
-	/*
-	 * Move any entries required from leaf to leaf:
-	 */
-	if (count < be16_to_cpu(hdr1->count)) {
-		/*
-		 * Figure the total bytes to be added to the destination leaf.
-		 */
-		count = be16_to_cpu(hdr1->count) - count;	/* number entries being moved */
-		space = be16_to_cpu(hdr1->namebytes) - totallen;
-		space += count * ((uint)sizeof(xfs_dir_leaf_name_t)-1);
-		space += count * (uint)sizeof(xfs_dir_leaf_entry_t);
-
-		/*
-		 * leaf2 is the destination, compact it if it looks tight.
-		 */
-		max  = be16_to_cpu(hdr2->firstused) - (uint)sizeof(xfs_dir_leaf_hdr_t);
-		max -= be16_to_cpu(hdr2->count) * (uint)sizeof(xfs_dir_leaf_entry_t);
-		if (space > max) {
-			xfs_dir_leaf_compact(state->args->trans, blk2->bp,
-								 0, 0);
-		}
-
-		/*
-		 * Move high entries from leaf1 to low end of leaf2.
-		 */
-		xfs_dir_leaf_moveents(leaf1, be16_to_cpu(hdr1->count) - count,
-					     leaf2, 0, count, state->mp);
-
-		xfs_da_log_buf(state->args->trans, blk1->bp, 0,
-						   state->blocksize-1);
-		xfs_da_log_buf(state->args->trans, blk2->bp, 0,
-						   state->blocksize-1);
-
-	} else if (count > be16_to_cpu(hdr1->count)) {
-		/*
-		 * Figure the total bytes to be added to the destination leaf.
-		 */
-		count -= be16_to_cpu(hdr1->count);		/* number entries being moved */
-		space  = totallen - be16_to_cpu(hdr1->namebytes);
-		space += count * ((uint)sizeof(xfs_dir_leaf_name_t)-1);
-		space += count * (uint)sizeof(xfs_dir_leaf_entry_t);
-
-		/*
-		 * leaf1 is the destination, compact it if it looks tight.
-		 */
-		max  = be16_to_cpu(hdr1->firstused) - (uint)sizeof(xfs_dir_leaf_hdr_t);
-		max -= be16_to_cpu(hdr1->count) * (uint)sizeof(xfs_dir_leaf_entry_t);
-		if (space > max) {
-			xfs_dir_leaf_compact(state->args->trans, blk1->bp,
-								 0, 0);
-		}
-
-		/*
-		 * Move low entries from leaf2 to high end of leaf1.
-		 */
-		xfs_dir_leaf_moveents(leaf2, 0, leaf1, be16_to_cpu(hdr1->count),
-					     count, state->mp);
-
-		xfs_da_log_buf(state->args->trans, blk1->bp, 0,
-						   state->blocksize-1);
-		xfs_da_log_buf(state->args->trans, blk2->bp, 0,
-						   state->blocksize-1);
-	}
-
-	/*
-	 * Copy out last hashval in each block for B-tree code.
-	 */
-	blk1->hashval = be32_to_cpu(leaf1->entries[
-			be16_to_cpu(leaf1->hdr.count)-1].hashval);
-	blk2->hashval = be32_to_cpu(leaf2->entries[
-			be16_to_cpu(leaf2->hdr.count)-1].hashval);
-
-	/*
-	 * Adjust the expected index for insertion.
-	 * GROT: this doesn't work unless blk2 was originally empty.
-	 */
-	if (!state->inleaf) {
-		blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count);
-	}
-}
-
-/*
- * Examine entries until we reduce the absolute difference in
- * byte usage between the two blocks to a minimum.
- * GROT: Is this really necessary?  With other than a 512 byte blocksize,
- * GROT: there will always be enough room in either block for a new entry.
- * GROT: Do a double-split for this case?
- */
-STATIC int
-xfs_dir_leaf_figure_balance(xfs_da_state_t *state,
-					   xfs_da_state_blk_t *blk1,
-					   xfs_da_state_blk_t *blk2,
-					   int *countarg, int *namebytesarg)
-{
-	xfs_dir_leafblock_t *leaf1, *leaf2;
-	xfs_dir_leaf_hdr_t *hdr1, *hdr2;
-	xfs_dir_leaf_entry_t *entry;
-	int count, max, totallen, half;
-	int lastdelta, foundit, tmp;
-
-	/*
-	 * Set up environment.
-	 */
-	leaf1 = blk1->bp->data;
-	leaf2 = blk2->bp->data;
-	hdr1 = &leaf1->hdr;
-	hdr2 = &leaf2->hdr;
-	foundit = 0;
-	totallen = 0;
-
-	/*
-	 * Examine entries until we reduce the absolute difference in
-	 * byte usage between the two blocks to a minimum.
-	 */
-	max = be16_to_cpu(hdr1->count) + be16_to_cpu(hdr2->count);
-	half  = (max+1) * (uint)(sizeof(*entry)+sizeof(xfs_dir_leaf_entry_t)-1);
-	half += be16_to_cpu(hdr1->namebytes) + be16_to_cpu(hdr2->namebytes) +
-		state->args->namelen;
-	half /= 2;
-	lastdelta = state->blocksize;
-	entry = &leaf1->entries[0];
-	for (count = 0; count < max; entry++, count++) {
-
-#define XFS_DIR_ABS(A)	(((A) < 0) ? -(A) : (A))
-		/*
-		 * The new entry is in the first block, account for it.
-		 */
-		if (count == blk1->index) {
-			tmp = totallen + (uint)sizeof(*entry)
-				+ XFS_DIR_LEAF_ENTSIZE_BYNAME(state->args->namelen);
-			if (XFS_DIR_ABS(half - tmp) > lastdelta)
-				break;
-			lastdelta = XFS_DIR_ABS(half - tmp);
-			totallen = tmp;
-			foundit = 1;
-		}
-
-		/*
-		 * Wrap around into the second block if necessary.
-		 */
-		if (count == be16_to_cpu(hdr1->count)) {
-			leaf1 = leaf2;
-			entry = &leaf1->entries[0];
-		}
-
-		/*
-		 * Figure out if next leaf entry would be too much.
-		 */
-		tmp = totallen + (uint)sizeof(*entry)
-				+ XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry);
-		if (XFS_DIR_ABS(half - tmp) > lastdelta)
-			break;
-		lastdelta = XFS_DIR_ABS(half - tmp);
-		totallen = tmp;
-#undef XFS_DIR_ABS
-	}
-
-	/*
-	 * Calculate the number of namebytes that will end up in lower block.
-	 * If new entry not in lower block, fix up the count.
-	 */
-	totallen -=
-		count * (uint)(sizeof(*entry)+sizeof(xfs_dir_leaf_entry_t)-1);
-	if (foundit) {
-		totallen -= (sizeof(*entry)+sizeof(xfs_dir_leaf_entry_t)-1) +
-			    state->args->namelen;
-	}
-
-	*countarg = count;
-	*namebytesarg = totallen;
-	return foundit;
-}
-
-/*========================================================================
- * Routines used for shrinking the Btree.
- *========================================================================*/
-
-/*
- * Check a leaf block and its neighbors to see if the block should be
- * collapsed into one or the other neighbor.  Always keep the block
- * with the smaller block number.
- * If the current block is over 50% full, don't try to join it, return 0.
- * If the block is empty, fill in the state structure and return 2.
- * If it can be collapsed, fill in the state structure and return 1.
- * If nothing can be done, return 0.
- */
-int
-xfs_dir_leaf_toosmall(xfs_da_state_t *state, int *action)
-{
-	xfs_dir_leafblock_t *leaf;
-	xfs_da_state_blk_t *blk;
-	xfs_da_blkinfo_t *info;
-	int count, bytes, forward, error, retval, i;
-	xfs_dablk_t blkno;
-	xfs_dabuf_t *bp;
-
-	/*
-	 * Check for the degenerate case of the block being over 50% full.
-	 * If so, it's not worth even looking to see if we might be able
-	 * to coalesce with a sibling.
-	 */
-	blk = &state->path.blk[ state->path.active-1 ];
-	info = blk->bp->data;
-	ASSERT(be16_to_cpu(info->magic) == XFS_DIR_LEAF_MAGIC);
-	leaf = (xfs_dir_leafblock_t *)info;
-	count = be16_to_cpu(leaf->hdr.count);
-	bytes = (uint)sizeof(xfs_dir_leaf_hdr_t) +
-		count * (uint)sizeof(xfs_dir_leaf_entry_t) +
-		count * ((uint)sizeof(xfs_dir_leaf_name_t)-1) +
-		be16_to_cpu(leaf->hdr.namebytes);
-	if (bytes > (state->blocksize >> 1)) {
-		*action = 0;	/* blk over 50%, don't try to join */
-		return 0;
-	}
-
-	/*
-	 * Check for the degenerate case of the block being empty.
-	 * If the block is empty, we'll simply delete it, no need to
-	 * coalesce it with a sibling block.  We choose (arbitrarily)
-	 * to merge with the forward block unless it is NULL.
-	 */
-	if (count == 0) {
-		/*
-		 * Make altpath point to the block we want to keep and
-		 * path point to the block we want to drop (this one).
-		 */
-		forward = (info->forw != 0);
-		memcpy(&state->altpath, &state->path, sizeof(state->path));
-		error = xfs_da_path_shift(state, &state->altpath, forward,
-						 0, &retval);
-		if (error)
-			return error;
-		if (retval) {
-			*action = 0;
-		} else {
-			*action = 2;
-		}
-		return 0;
-	}
-
-	/*
-	 * Examine each sibling block to see if we can coalesce with
-	 * at least 25% free space to spare.  We need to figure out
-	 * whether to merge with the forward or the backward block.
-	 * We prefer coalescing with the lower numbered sibling so as
-	 * to shrink a directory over time.
-	 */
-	forward = (be32_to_cpu(info->forw) < be32_to_cpu(info->back));	/* start with smaller blk num */
-	for (i = 0; i < 2; forward = !forward, i++) {
-		if (forward)
-			blkno = be32_to_cpu(info->forw);
-		else
-			blkno = be32_to_cpu(info->back);
-		if (blkno == 0)
-			continue;
-		error = xfs_da_read_buf(state->args->trans, state->args->dp,
-							    blkno, -1, &bp,
-							    XFS_DATA_FORK);
-		if (error)
-			return error;
-		ASSERT(bp != NULL);
-
-		leaf = (xfs_dir_leafblock_t *)info;
-		count  = be16_to_cpu(leaf->hdr.count);
-		bytes  = state->blocksize - (state->blocksize>>2);
-		bytes -= be16_to_cpu(leaf->hdr.namebytes);
-		leaf = bp->data;
-		ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-		count += be16_to_cpu(leaf->hdr.count);
-		bytes -= be16_to_cpu(leaf->hdr.namebytes);
-		bytes -= count * ((uint)sizeof(xfs_dir_leaf_name_t) - 1);
-		bytes -= count * (uint)sizeof(xfs_dir_leaf_entry_t);
-		bytes -= (uint)sizeof(xfs_dir_leaf_hdr_t);
-		if (bytes >= 0)
-			break;	/* fits with at least 25% to spare */
-
-		xfs_da_brelse(state->args->trans, bp);
-	}
-	if (i >= 2) {
-		*action = 0;
-		return 0;
-	}
-	xfs_da_buf_done(bp);
-
-	/*
-	 * Make altpath point to the block we want to keep (the lower
-	 * numbered block) and path point to the block we want to drop.
-	 */
-	memcpy(&state->altpath, &state->path, sizeof(state->path));
-	if (blkno < blk->blkno) {
-		error = xfs_da_path_shift(state, &state->altpath, forward,
-						 0, &retval);
-	} else {
-		error = xfs_da_path_shift(state, &state->path, forward,
-						 0, &retval);
-	}
-	if (error)
-		return error;
-	if (retval) {
-		*action = 0;
-	} else {
-		*action = 1;
-	}
-	return 0;
-}
-
-/*
- * Remove a name from the leaf directory structure.
- *
- * Return 1 if leaf is less than 37% full, 0 if >= 37% full.
- * If two leaves are 37% full, when combined they will leave 25% free.
- */
-int
-xfs_dir_leaf_remove(xfs_trans_t *trans, xfs_dabuf_t *bp, int index)
-{
-	xfs_dir_leafblock_t *leaf;
-	xfs_dir_leaf_hdr_t *hdr;
-	xfs_dir_leaf_map_t *map;
-	xfs_dir_leaf_entry_t *entry;
-	xfs_dir_leaf_name_t *namest;
-	int before, after, smallest, entsize;
-	int tablesize, tmp, i;
-	xfs_mount_t *mp;
-
-	leaf = bp->data;
-	ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-	hdr = &leaf->hdr;
-	mp = trans->t_mountp;
-	ASSERT(hdr->count && (be16_to_cpu(hdr->count) < (XFS_LBSIZE(mp)/8)));
-	ASSERT((index >= 0) && (index < be16_to_cpu(hdr->count)));
-	ASSERT(be16_to_cpu(hdr->firstused) >=
-	       ((be16_to_cpu(hdr->count)*sizeof(*entry))+sizeof(*hdr)));
-	entry = &leaf->entries[index];
-	ASSERT(be16_to_cpu(entry->nameidx) >= be16_to_cpu(hdr->firstused));
-	ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp));
-
-	/*
-	 * Scan through free region table:
-	 *    check for adjacency of free'd entry with an existing one,
-	 *    find smallest free region in case we need to replace it,
-	 *    adjust any map that borders the entry table,
-	 */
-	tablesize = be16_to_cpu(hdr->count) * (uint)sizeof(xfs_dir_leaf_entry_t)
-			+ (uint)sizeof(xfs_dir_leaf_hdr_t);
-	map = &hdr->freemap[0];
-	tmp = be16_to_cpu(map->size);
-	before = after = -1;
-	smallest = XFS_DIR_LEAF_MAPSIZE - 1;
-	entsize = XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry);
-	for (i = 0; i < XFS_DIR_LEAF_MAPSIZE; map++, i++) {
-		ASSERT(be16_to_cpu(map->base) < XFS_LBSIZE(mp));
-		ASSERT(be16_to_cpu(map->size) < XFS_LBSIZE(mp));
-		if (be16_to_cpu(map->base) == tablesize) {
-			int entry_size = sizeof(xfs_dir_leaf_entry_t);
-			be16_add(&map->base, -entry_size);
-			be16_add(&map->size, entry_size);
-		}
-
-		if ((be16_to_cpu(map->base) + be16_to_cpu(map->size)) ==
-				be16_to_cpu(entry->nameidx)) {
-			before = i;
-		} else if (be16_to_cpu(map->base) ==
-				(be16_to_cpu(entry->nameidx) + entsize)) {
-			after = i;
-		} else if (be16_to_cpu(map->size) < tmp) {
-			tmp = be16_to_cpu(map->size);
-			smallest = i;
-		}
-	}
-
-	/*
-	 * Coalesce adjacent freemap regions,
-	 * or replace the smallest region.
-	 */
-	if ((before >= 0) || (after >= 0)) {
-		if ((before >= 0) && (after >= 0)) {
-			map = &hdr->freemap[before];
-			be16_add(&map->size, entsize);
-			be16_add(&map->size, be16_to_cpu(hdr->freemap[after].size));
-			hdr->freemap[after].base = 0;
-			hdr->freemap[after].size = 0;
-		} else if (before >= 0) {
-			map = &hdr->freemap[before];
-			be16_add(&map->size, entsize);
-		} else {
-			map = &hdr->freemap[after];
-			map->base = entry->nameidx;
-			be16_add(&map->size, entsize);
-		}
-	} else {
-		/*
-		 * Replace smallest region (if it is smaller than free'd entry)
-		 */
-		map = &hdr->freemap[smallest];
-		if (be16_to_cpu(map->size) < entsize) {
-			map->base = entry->nameidx;
-			map->size = cpu_to_be16(entsize);
-		}
-	}
-
-	/*
-	 * Did we remove the first entry?
-	 */
-	if (be16_to_cpu(entry->nameidx) == be16_to_cpu(hdr->firstused))
-		smallest = 1;
-	else
-		smallest = 0;
-
-	/*
-	 * Compress the remaining entries and zero out the removed stuff.
-	 */
-	namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, be16_to_cpu(entry->nameidx));
-	memset((char *)namest, 0, entsize);
-	xfs_da_log_buf(trans, bp, XFS_DA_LOGRANGE(leaf, namest, entsize));
-
-	be16_add(&hdr->namebytes, -(entry->namelen));
-	tmp = (be16_to_cpu(hdr->count) - index) * (uint)sizeof(xfs_dir_leaf_entry_t);
-	memmove(entry, entry + 1, tmp);
-	be16_add(&hdr->count, -1);
-	xfs_da_log_buf(trans, bp,
-	    XFS_DA_LOGRANGE(leaf, entry, tmp + (uint)sizeof(*entry)));
-	entry = &leaf->entries[be16_to_cpu(hdr->count)];
-	memset((char *)entry, 0, sizeof(xfs_dir_leaf_entry_t));
-
-	/*
-	 * If we removed the first entry, re-find the first used byte
-	 * in the name area.  Note that if the entry was the "firstused",
-	 * then we don't have a "hole" in our block resulting from
-	 * removing the name.
-	 */
-	if (smallest) {
-		tmp = XFS_LBSIZE(mp);
-		entry = &leaf->entries[0];
-		for (i = be16_to_cpu(hdr->count)-1; i >= 0; entry++, i--) {
-			ASSERT(be16_to_cpu(entry->nameidx) >=
-			       be16_to_cpu(hdr->firstused));
-			ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp));
-			if (be16_to_cpu(entry->nameidx) < tmp)
-				tmp = be16_to_cpu(entry->nameidx);
-		}
-		hdr->firstused = cpu_to_be16(tmp);
-		if (!hdr->firstused)
-			hdr->firstused = cpu_to_be16(tmp - 1);
-	} else {
-		hdr->holes = 1;		/* mark as needing compaction */
-	}
-
-	xfs_da_log_buf(trans, bp, XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
-
-	/*
-	 * Check if leaf is less than 50% full, caller may want to
-	 * "join" the leaf with a sibling if so.
-	 */
-	tmp  = (uint)sizeof(xfs_dir_leaf_hdr_t);
-	tmp += be16_to_cpu(leaf->hdr.count) * (uint)sizeof(xfs_dir_leaf_entry_t);
-	tmp += be16_to_cpu(leaf->hdr.count) * ((uint)sizeof(xfs_dir_leaf_name_t) - 1);
-	tmp += be16_to_cpu(leaf->hdr.namebytes);
-	if (tmp < mp->m_dir_magicpct)
-		return 1;			/* leaf is < 37% full */
-	return 0;
-}
-
-/*
- * Move all the directory entries from drop_leaf into save_leaf.
- */
-void
-xfs_dir_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
-				      xfs_da_state_blk_t *save_blk)
-{
-	xfs_dir_leafblock_t *drop_leaf, *save_leaf, *tmp_leaf;
-	xfs_dir_leaf_hdr_t *drop_hdr, *save_hdr, *tmp_hdr;
-	xfs_mount_t *mp;
-	char *tmpbuffer;
-
-	/*
-	 * Set up environment.
-	 */
-	mp = state->mp;
-	ASSERT(drop_blk->magic == XFS_DIR_LEAF_MAGIC);
-	ASSERT(save_blk->magic == XFS_DIR_LEAF_MAGIC);
-	drop_leaf = drop_blk->bp->data;
-	save_leaf = save_blk->bp->data;
-	ASSERT(be16_to_cpu(drop_leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-	ASSERT(be16_to_cpu(save_leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-	drop_hdr = &drop_leaf->hdr;
-	save_hdr = &save_leaf->hdr;
-
-	/*
-	 * Save last hashval from dying block for later Btree fixup.
-	 */
-	drop_blk->hashval = be32_to_cpu(drop_leaf->entries[
-			be16_to_cpu(drop_leaf->hdr.count)-1].hashval);
-
-	/*
-	 * Check if we need a temp buffer, or can we do it in place.
-	 * Note that we don't check "leaf" for holes because we will
-	 * always be dropping it, toosmall() decided that for us already.
-	 */
-	if (save_hdr->holes == 0) {
-		/*
-		 * dest leaf has no holes, so we add there.  May need
-		 * to make some room in the entry array.
-		 */
-		if (xfs_dir_leaf_order(save_blk->bp, drop_blk->bp)) {
-			xfs_dir_leaf_moveents(drop_leaf, 0, save_leaf, 0,
-					be16_to_cpu(drop_hdr->count), mp);
-		} else {
-			xfs_dir_leaf_moveents(drop_leaf, 0,
-					save_leaf, be16_to_cpu(save_hdr->count),
-					be16_to_cpu(drop_hdr->count), mp);
-		}
-	} else {
-		/*
-		 * Destination has holes, so we make a temporary copy
-		 * of the leaf and add them both to that.
-		 */
-		tmpbuffer = kmem_alloc(state->blocksize, KM_SLEEP);
-		ASSERT(tmpbuffer != NULL);
-		memset(tmpbuffer, 0, state->blocksize);
-		tmp_leaf = (xfs_dir_leafblock_t *)tmpbuffer;
-		tmp_hdr = &tmp_leaf->hdr;
-		tmp_hdr->info = save_hdr->info;	/* struct copy */
-		tmp_hdr->count = 0;
-		tmp_hdr->firstused = cpu_to_be16(state->blocksize);
-		if (!tmp_hdr->firstused)
-			tmp_hdr->firstused = cpu_to_be16(state->blocksize - 1);
-		tmp_hdr->namebytes = 0;
-		if (xfs_dir_leaf_order(save_blk->bp, drop_blk->bp)) {
-			xfs_dir_leaf_moveents(drop_leaf, 0, tmp_leaf, 0,
-					be16_to_cpu(drop_hdr->count), mp);
-			xfs_dir_leaf_moveents(save_leaf, 0,
-					tmp_leaf, be16_to_cpu(tmp_leaf->hdr.count),
-					be16_to_cpu(save_hdr->count), mp);
-		} else {
-			xfs_dir_leaf_moveents(save_leaf, 0, tmp_leaf, 0,
-						 be16_to_cpu(save_hdr->count), mp);
-			xfs_dir_leaf_moveents(drop_leaf, 0,
-					      tmp_leaf, be16_to_cpu(tmp_leaf->hdr.count),
-					      be16_to_cpu(drop_hdr->count), mp);
-		}
-		memcpy(save_leaf, tmp_leaf, state->blocksize);
-		kmem_free(tmpbuffer, state->blocksize);
-	}
-
-	xfs_da_log_buf(state->args->trans, save_blk->bp, 0,
-					   state->blocksize - 1);
-
-	/*
-	 * Copy out last hashval in each block for B-tree code.
-	 */
-	save_blk->hashval = be32_to_cpu(save_leaf->entries[
-			be16_to_cpu(save_leaf->hdr.count)-1].hashval);
-}
-
-/*========================================================================
- * Routines used for finding things in the Btree.
- *========================================================================*/
-
-/*
- * Look up a name in a leaf directory structure.
- * This is the internal routine, it uses the caller's buffer.
- *
- * Note that duplicate keys are allowed, but only check within the
- * current leaf node.  The Btree code must check in adjacent leaf nodes.
- *
- * Return in *index the index into the entry[] array of either the found
- * entry, or where the entry should have been (insert before that entry).
- *
- * Don't change the args->inumber unless we find the filename.
- */
-int
-xfs_dir_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args, int *index)
-{
-	xfs_dir_leafblock_t *leaf;
-	xfs_dir_leaf_entry_t *entry;
-	xfs_dir_leaf_name_t *namest;
-	int probe, span;
-	xfs_dahash_t hashval;
-
-	leaf = bp->data;
-	ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-	ASSERT(be16_to_cpu(leaf->hdr.count) < (XFS_LBSIZE(args->dp->i_mount)/8));
-
-	/*
-	 * Binary search.  (note: small blocks will skip this loop)
-	 */
-	hashval = args->hashval;
-	probe = span = be16_to_cpu(leaf->hdr.count) / 2;
-	for (entry = &leaf->entries[probe]; span > 4;
-		   entry = &leaf->entries[probe]) {
-		span /= 2;
-		if (be32_to_cpu(entry->hashval) < hashval)
-			probe += span;
-		else if (be32_to_cpu(entry->hashval) > hashval)
-			probe -= span;
-		else
-			break;
-	}
-	ASSERT((probe >= 0) && \
-	       ((!leaf->hdr.count) || (probe < be16_to_cpu(leaf->hdr.count))));
-	ASSERT((span <= 4) || (be32_to_cpu(entry->hashval) == hashval));
-
-	/*
-	 * Since we may have duplicate hashval's, find the first matching
-	 * hashval in the leaf.
-	 */
-	while ((probe > 0) && (be32_to_cpu(entry->hashval) >= hashval)) {
-		entry--;
-		probe--;
-	}
-	while ((probe < be16_to_cpu(leaf->hdr.count)) &&
-	       (be32_to_cpu(entry->hashval) < hashval)) {
-		entry++;
-		probe++;
-	}
-	if ((probe == be16_to_cpu(leaf->hdr.count)) ||
-	    (be32_to_cpu(entry->hashval) != hashval)) {
-		*index = probe;
-		ASSERT(args->oknoent);
-		return XFS_ERROR(ENOENT);
-	}
-
-	/*
-	 * Duplicate keys may be present, so search all of them for a match.
-	 */
-	while ((probe < be16_to_cpu(leaf->hdr.count)) &&
-	       (be32_to_cpu(entry->hashval) == hashval)) {
-		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, be16_to_cpu(entry->nameidx));
-		if (entry->namelen == args->namelen &&
-		    namest->name[0] == args->name[0] &&
-		    memcmp(args->name, namest->name, args->namelen) == 0) {
-			XFS_DIR_SF_GET_DIRINO(&namest->inumber, &args->inumber);
-			*index = probe;
-			return XFS_ERROR(EEXIST);
-		}
-		entry++;
-		probe++;
-	}
-	*index = probe;
-	ASSERT(probe == be16_to_cpu(leaf->hdr.count) || args->oknoent);
-	return XFS_ERROR(ENOENT);
-}
-
-/*========================================================================
- * Utility routines.
- *========================================================================*/
-
-/*
- * Move the indicated entries from one leaf to another.
- * NOTE: this routine modifies both source and destination leaves.
- */
-/* ARGSUSED */
-STATIC void
-xfs_dir_leaf_moveents(xfs_dir_leafblock_t *leaf_s, int start_s,
-		      xfs_dir_leafblock_t *leaf_d, int start_d,
-		      int count, xfs_mount_t *mp)
-{
-	xfs_dir_leaf_hdr_t *hdr_s, *hdr_d;
-	xfs_dir_leaf_entry_t *entry_s, *entry_d;
-	int tmp, i;
-
-	/*
-	 * Check for nothing to do.
-	 */
-	if (count == 0)
-		return;
-
-	/*
-	 * Set up environment.
-	 */
-	ASSERT(be16_to_cpu(leaf_s->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-	ASSERT(be16_to_cpu(leaf_d->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-	hdr_s = &leaf_s->hdr;
-	hdr_d = &leaf_d->hdr;
-	ASSERT(hdr_s->count && (be16_to_cpu(hdr_s->count) < (XFS_LBSIZE(mp)/8)));
-	ASSERT(be16_to_cpu(hdr_s->firstused) >=
-		((be16_to_cpu(hdr_s->count)*sizeof(*entry_s))+sizeof(*hdr_s)));
-	ASSERT(be16_to_cpu(hdr_d->count) < (XFS_LBSIZE(mp)/8));
-	ASSERT(be16_to_cpu(hdr_d->firstused) >=
-		((be16_to_cpu(hdr_d->count)*sizeof(*entry_d))+sizeof(*hdr_d)));
-
-	ASSERT(start_s < be16_to_cpu(hdr_s->count));
-	ASSERT(start_d <= be16_to_cpu(hdr_d->count));
-	ASSERT(count <= be16_to_cpu(hdr_s->count));
-
-	/*
-	 * Move the entries in the destination leaf up to make a hole?
-	 */
-	if (start_d < be16_to_cpu(hdr_d->count)) {
-		tmp  = be16_to_cpu(hdr_d->count) - start_d;
-		tmp *= (uint)sizeof(xfs_dir_leaf_entry_t);
-		entry_s = &leaf_d->entries[start_d];
-		entry_d = &leaf_d->entries[start_d + count];
-		memcpy(entry_d, entry_s, tmp);
-	}
-
-	/*
-	 * Copy all entry's in the same (sorted) order,
-	 * but allocate filenames packed and in sequence.
-	 */
-	entry_s = &leaf_s->entries[start_s];
-	entry_d = &leaf_d->entries[start_d];
-	for (i = 0; i < count; entry_s++, entry_d++, i++) {
-		ASSERT(be16_to_cpu(entry_s->nameidx) >=
-		       be16_to_cpu(hdr_s->firstused));
-		tmp = XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry_s);
-		be16_add(&hdr_d->firstused, -(tmp));
-		entry_d->hashval = entry_s->hashval;
-		entry_d->nameidx = hdr_d->firstused;
-		entry_d->namelen = entry_s->namelen;
-		ASSERT(be16_to_cpu(entry_d->nameidx) + tmp <= XFS_LBSIZE(mp));
-		memcpy(XFS_DIR_LEAF_NAMESTRUCT(leaf_d, be16_to_cpu(entry_d->nameidx)),
-		       XFS_DIR_LEAF_NAMESTRUCT(leaf_s, be16_to_cpu(entry_s->nameidx)), tmp);
-		ASSERT(be16_to_cpu(entry_s->nameidx) + tmp <= XFS_LBSIZE(mp));
-		memset((char *)XFS_DIR_LEAF_NAMESTRUCT(leaf_s,
-					be16_to_cpu(entry_s->nameidx)), 0, tmp);
-		be16_add(&hdr_s->namebytes, -(entry_d->namelen));
-		be16_add(&hdr_d->namebytes, entry_d->namelen);
-		be16_add(&hdr_s->count, -1);
-		be16_add(&hdr_d->count, +1);
-		tmp = be16_to_cpu(hdr_d->count) * (uint)sizeof(xfs_dir_leaf_entry_t)
-				+ (uint)sizeof(xfs_dir_leaf_hdr_t);
-		ASSERT(be16_to_cpu(hdr_d->firstused) >= tmp);
-
-	}
-
-	/*
-	 * Zero out the entries we just copied.
-	 */
-	if (start_s == be16_to_cpu(hdr_s->count)) {
-		tmp = count * (uint)sizeof(xfs_dir_leaf_entry_t);
-		entry_s = &leaf_s->entries[start_s];
-		ASSERT((char *)entry_s + tmp <= (char *)leaf_s + XFS_LBSIZE(mp));
-		memset((char *)entry_s, 0, tmp);
-	} else {
-		/*
-		 * Move the remaining entries down to fill the hole,
-		 * then zero the entries at the top.
-		 */
-		tmp  = be16_to_cpu(hdr_s->count) - count;
-		tmp *= (uint)sizeof(xfs_dir_leaf_entry_t);
-		entry_s = &leaf_s->entries[start_s + count];
-		entry_d = &leaf_s->entries[start_s];
-		memcpy(entry_d, entry_s, tmp);
-
-		tmp = count * (uint)sizeof(xfs_dir_leaf_entry_t);
-		entry_s = &leaf_s->entries[be16_to_cpu(hdr_s->count)];
-		ASSERT((char *)entry_s + tmp <= (char *)leaf_s + XFS_LBSIZE(mp));
-		memset((char *)entry_s, 0, tmp);
-	}
-
-	/*
-	 * Fill in the freemap information
-	 */
-	hdr_d->freemap[0].base = cpu_to_be16(sizeof(xfs_dir_leaf_hdr_t) +
-			be16_to_cpu(hdr_d->count) * sizeof(xfs_dir_leaf_entry_t));
-	hdr_d->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr_d->firstused) -
-			be16_to_cpu(hdr_d->freemap[0].base));
-	hdr_d->freemap[1].base = 0;
-	hdr_d->freemap[1].size = 0;
-	hdr_d->freemap[2].base = 0;
-	hdr_d->freemap[2].size = 0;
-	hdr_s->holes = 1;	/* leaf may not be compact */
-}
-
-/*
- * Compare two leaf blocks "order".
- */
-int
-xfs_dir_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp)
-{
-	xfs_dir_leafblock_t *leaf1, *leaf2;
-
-	leaf1 = leaf1_bp->data;
-	leaf2 = leaf2_bp->data;
-	ASSERT((be16_to_cpu(leaf1->hdr.info.magic) == XFS_DIR_LEAF_MAGIC) &&
-	       (be16_to_cpu(leaf2->hdr.info.magic) == XFS_DIR_LEAF_MAGIC));
-	if (leaf1->hdr.count && leaf2->hdr.count &&
-	    ((be32_to_cpu(leaf2->entries[0].hashval) <
-	      be32_to_cpu(leaf1->entries[0 ].hashval)) ||
-	     (be32_to_cpu(leaf2->entries[
-			  be16_to_cpu(leaf2->hdr.count)-1].hashval) <
-	      be32_to_cpu(leaf1->entries[
-			  be16_to_cpu(leaf1->hdr.count)-1].hashval)))) {
-		return 1;
-	}
-	return 0;
-}
-
-/*
- * Pick up the last hashvalue from a leaf block.
- */
-xfs_dahash_t
-xfs_dir_leaf_lasthash(xfs_dabuf_t *bp, int *count)
-{
-	xfs_dir_leafblock_t *leaf;
-
-	leaf = bp->data;
-	ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR_LEAF_MAGIC);
-	if (count)
-		*count = be16_to_cpu(leaf->hdr.count);
-	if (!leaf->hdr.count)
-		return(0);
-	return be32_to_cpu(leaf->entries[be16_to_cpu(leaf->hdr.count)-1].hashval);
-}
-
-/*
- * Copy out directory entries for getdents(), for leaf directories.
- */
-int
-xfs_dir_leaf_getdents_int(
-	xfs_dabuf_t	*bp,
-	xfs_inode_t	*dp,
-	xfs_dablk_t	bno,
-	uio_t		*uio,
-	int		*eobp,
-	xfs_dirent_t	*dbp,
-	xfs_dir_put_t	put,
-	xfs_daddr_t		nextda)
-{
-	xfs_dir_leafblock_t	*leaf;
-	xfs_dir_leaf_entry_t	*entry;
-	xfs_dir_leaf_name_t	*namest;
-	int			entno, want_entno, i, nextentno;
-	xfs_mount_t		*mp;
-	xfs_dahash_t		cookhash;
-	xfs_dahash_t		nexthash = 0;
-#if (BITS_PER_LONG == 32)
-	xfs_dahash_t		lasthash = XFS_DA_MAXHASH;
-#endif
-	xfs_dir_put_args_t	p;
-
-	mp = dp->i_mount;
-	leaf = bp->data;
-	if (be16_to_cpu(leaf->hdr.info.magic) != XFS_DIR_LEAF_MAGIC) {
-		*eobp = 1;
-		return XFS_ERROR(ENOENT);	/* XXX wrong code */
-	}
-
-	want_entno = XFS_DA_COOKIE_ENTRY(mp, uio->uio_offset);
-
-	cookhash = XFS_DA_COOKIE_HASH(mp, uio->uio_offset);
-
-	xfs_dir_trace_g_dul("leaf: start", dp, uio, leaf);
-
-	/*
-	 * Re-find our place.
-	 */
-	for (i = entno = 0, entry = &leaf->entries[0];
-		     i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
-
-		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf,
-				    be16_to_cpu(entry->nameidx));
-
-		if (unlikely(
-		    ((char *)namest < (char *)leaf) ||
-		    ((char *)namest >= (char *)leaf + XFS_LBSIZE(mp)))) {
-			XFS_CORRUPTION_ERROR("xfs_dir_leaf_getdents_int(1)",
-					     XFS_ERRLEVEL_LOW, mp, leaf);
-			xfs_dir_trace_g_du("leaf: corrupted", dp, uio);
-			return XFS_ERROR(EFSCORRUPTED);
-		}
-		if (be32_to_cpu(entry->hashval) >= cookhash) {
-			if (entno < want_entno &&
-			    be32_to_cpu(entry->hashval) == cookhash) {
-				/*
-				 * Trying to get to a particular offset in a
-				 * run of equal-hashval entries.
-				 */
-				entno++;
-			} else if (want_entno > 0 && entno == want_entno &&
-				   be32_to_cpu(entry->hashval) == cookhash) {
-				break;
-			} else {
-				entno = 0;
-				break;
-			}
-		}
-	}
-
-	if (i == be16_to_cpu(leaf->hdr.count)) {
-		xfs_dir_trace_g_du("leaf: hash not found", dp, uio);
-		if (!leaf->hdr.info.forw)
-			uio->uio_offset =
-				XFS_DA_MAKE_COOKIE(mp, 0, 0, XFS_DA_MAXHASH);
-		/*
-		 * Don't set uio_offset if there's another block:
-		 * the node code will be setting uio_offset anyway.
-		 */
-		*eobp = 0;
-		return 0;
-	}
-	xfs_dir_trace_g_due("leaf: hash found", dp, uio, entry);
-
-	p.dbp = dbp;
-	p.put = put;
-	p.uio = uio;
-
-	/*
-	 * We're synchronized, start copying entries out to the user.
-	 */
-	for (; entno >= 0 && i < be16_to_cpu(leaf->hdr.count);
-			     entry++, i++, (entno = nextentno)) {
-		int lastresid=0, retval;
-		xfs_dircook_t lastoffset;
-		xfs_dahash_t thishash;
-
-		/*
-		 * Check for a damaged directory leaf block and pick up
-		 * the inode number from this entry.
-		 */
-		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf,
-				    be16_to_cpu(entry->nameidx));
-
-		if (unlikely(
-		    ((char *)namest < (char *)leaf) ||
-		    ((char *)namest >= (char *)leaf + XFS_LBSIZE(mp)))) {
-			XFS_CORRUPTION_ERROR("xfs_dir_leaf_getdents_int(2)",
-					     XFS_ERRLEVEL_LOW, mp, leaf);
-			xfs_dir_trace_g_du("leaf: corrupted", dp, uio);
-			return XFS_ERROR(EFSCORRUPTED);
-		}
-
-		xfs_dir_trace_g_duc("leaf: middle cookie  ",
-						   dp, uio, p.cook.o);
-
-		if (i < (be16_to_cpu(leaf->hdr.count) - 1)) {
-			nexthash = be32_to_cpu(entry[1].hashval);
-
-			if (nexthash == be32_to_cpu(entry->hashval))
-				nextentno = entno + 1;
-			else
-				nextentno = 0;
-			XFS_PUT_COOKIE(p.cook, mp, bno, nextentno, nexthash);
-			xfs_dir_trace_g_duc("leaf: middle cookie  ",
-						   dp, uio, p.cook.o);
-
-		} else if ((thishash = be32_to_cpu(leaf->hdr.info.forw))) {
-			xfs_dabuf_t *bp2;
-			xfs_dir_leafblock_t *leaf2;
-
-			ASSERT(nextda != -1);
-
-			retval = xfs_da_read_buf(dp->i_transp, dp, thishash,
-						 nextda, &bp2, XFS_DATA_FORK);
-			if (retval)
-				return retval;
-
-			ASSERT(bp2 != NULL);
-
-			leaf2 = bp2->data;
-
-			if (unlikely(
-			       (be16_to_cpu(leaf2->hdr.info.magic)
-						!= XFS_DIR_LEAF_MAGIC)
-			    || (be32_to_cpu(leaf2->hdr.info.back)
-						!= bno))) {	/* GROT */
-				XFS_CORRUPTION_ERROR("xfs_dir_leaf_getdents_int(3)",
-						     XFS_ERRLEVEL_LOW, mp,
-						     leaf2);
-				xfs_da_brelse(dp->i_transp, bp2);
-
-				return XFS_ERROR(EFSCORRUPTED);
-			}
-
-			nexthash = be32_to_cpu(leaf2->entries[0].hashval);
-			nextentno = -1;
-			XFS_PUT_COOKIE(p.cook, mp, thishash, 0, nexthash);
-			xfs_da_brelse(dp->i_transp, bp2);
-			xfs_dir_trace_g_duc("leaf: next blk cookie",
-						   dp, uio, p.cook.o);
-		} else {
-			nextentno = -1;
-			XFS_PUT_COOKIE(p.cook, mp, 0, 0, XFS_DA_MAXHASH);
-		}
-
-		/*
-		 * Save off the cookie so we can fall back should the
-		 * 'put' into the outgoing buffer fails.  To handle a run
-		 * of equal-hashvals, the off_t structure on 64bit
-		 * builds has entno built into the cookie to ID the
-		 * entry.  On 32bit builds, we only have space for the
-		 * hashval so we can't ID specific entries within a group
-		 * of same hashval entries.   For this, lastoffset is set
-		 * to the first in the run of equal hashvals so we don't
-		 * include any entries unless we can include all entries
-		 * that share the same hashval.  Hopefully the buffer
-		 * provided is big enough to handle it (see pv763517).
-		 */
-		thishash = be32_to_cpu(entry->hashval);
-#if (BITS_PER_LONG == 32)
-		if (thishash != lasthash) {
-			XFS_PUT_COOKIE(lastoffset, mp, bno, entno, thishash);
-			lastresid = uio->uio_resid;
-			lasthash = thishash;
-		} else {
-			xfs_dir_trace_g_duc("leaf: DUP COOKIES, skipped",
-						   dp, uio, p.cook.o);
-		}
-#else
-		XFS_PUT_COOKIE(lastoffset, mp, bno, entno, thishash);
-		lastresid = uio->uio_resid;
-#endif /* BITS_PER_LONG == 32 */
-
-		/*
-		 * Put the current entry into the outgoing buffer.  If we fail
-		 * then restore the UIO to the first entry in the current
-		 * run of equal-hashval entries (probably one 1 entry long).
-		 */
-		p.ino = XFS_GET_DIR_INO8(namest->inumber);
-#if XFS_BIG_INUMS
-		p.ino += mp->m_inoadd;
-#endif
-		p.name = (char *)namest->name;
-		p.namelen = entry->namelen;
-
-		retval = p.put(&p);
-
-		if (!p.done) {
-			uio->uio_offset = lastoffset.o;
-			uio->uio_resid = lastresid;
-
-			*eobp = 1;
-
-			xfs_dir_trace_g_du("leaf: E-O-B", dp, uio);
-
-			return retval;
-		}
-	}
-
-	uio->uio_offset = p.cook.o;
-
-	*eobp = 0;
-
-	xfs_dir_trace_g_du("leaf: E-O-F", dp, uio);
-
-	return 0;
-}
-
-/*
- * Format a dirent64 structure and copy it out the the user's buffer.
- */
-int
-xfs_dir_put_dirent64_direct(xfs_dir_put_args_t *pa)
-{
-	iovec_t *iovp;
-	int reclen, namelen;
-	xfs_dirent_t *idbp;
-	uio_t *uio;
-
-	namelen = pa->namelen;
-	reclen = DIRENTSIZE(namelen);
-	uio = pa->uio;
-	if (reclen > uio->uio_resid) {
-		pa->done = 0;
-		return 0;
-	}
-	iovp = uio->uio_iov;
-	idbp = (xfs_dirent_t *)iovp->iov_base;
-	iovp->iov_base = (char *)idbp + reclen;
-	iovp->iov_len -= reclen;
-	uio->uio_resid -= reclen;
-	idbp->d_reclen = reclen;
-	idbp->d_ino = pa->ino;
-	idbp->d_off = pa->cook.o;
-	idbp->d_name[namelen] = '\0';
-	pa->done = 1;
-	memcpy(idbp->d_name, pa->name, namelen);
-	return 0;
-}
-
-/*
- * Format a dirent64 structure and copy it out the the user's buffer.
- */
-int
-xfs_dir_put_dirent64_uio(xfs_dir_put_args_t *pa)
-{
-	int		retval, reclen, namelen;
-	xfs_dirent_t	*idbp;
-	uio_t		*uio;
-
-	namelen = pa->namelen;
-	reclen = DIRENTSIZE(namelen);
-	uio = pa->uio;
-	if (reclen > uio->uio_resid) {
-		pa->done = 0;
-		return 0;
-	}
-	idbp = pa->dbp;
-	idbp->d_reclen = reclen;
-	idbp->d_ino = pa->ino;
-	idbp->d_off = pa->cook.o;
-	idbp->d_name[namelen] = '\0';
-	memcpy(idbp->d_name, pa->name, namelen);
-	retval = uio_read((caddr_t)idbp, reclen, uio);
-	pa->done = (retval == 0);
-	return retval;
-}
diff --git a/fs/xfs/xfs_dir_leaf.h b/fs/xfs/xfs_dir_leaf.h
index 7ca5845195d..e69de29bb2d 100644
--- a/fs/xfs/xfs_dir_leaf.h
+++ b/fs/xfs/xfs_dir_leaf.h
@@ -1,231 +0,0 @@
-/*
- * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_DIR_LEAF_H__
-#define	__XFS_DIR_LEAF_H__
-
-/*
- * Directory layout, internal structure, access macros, etc.
- *
- * Large directories are structured around Btrees where all the data
- * elements are in the leaf nodes.  Filenames are hashed into an int,
- * then that int is used as the index into the Btree.  Since the hashval
- * of a filename may not be unique, we may have duplicate keys.  The
- * internal links in the Btree are logical block offsets into the file.
- */
-
-struct uio;
-struct xfs_bmap_free;
-struct xfs_dabuf;
-struct xfs_da_args;
-struct xfs_da_state;
-struct xfs_da_state_blk;
-struct xfs_dir_put_args;
-struct xfs_inode;
-struct xfs_mount;
-struct xfs_trans;
-
-/*========================================================================
- * Directory Structure when equal to XFS_LBSIZE(mp) bytes.
- *========================================================================*/
-
-/*
- * This is the structure of the leaf nodes in the Btree.
- *
- * Struct leaf_entry's are packed from the top.  Names grow from the bottom
- * but are not packed.  The freemap contains run-length-encoded entries
- * for the free bytes after the leaf_entry's, but only the N largest such,
- * smaller runs are dropped.  When the freemap doesn't show enough space
- * for an allocation, we compact the namelist area and try again.  If we
- * still don't have enough space, then we have to split the block.
- *
- * Since we have duplicate hash keys, for each key that matches, compare
- * the actual string.  The root and intermediate node search always takes
- * the first-in-the-block key match found, so we should only have to work
- * "forw"ard.  If none matches, continue with the "forw"ard leaf nodes
- * until the hash key changes or the filename is found.
- *
- * The parent directory and the self-pointer are explicitly represented
- * (ie: there are entries for "." and "..").
- *
- * Note that the count being a __uint16_t limits us to something like a
- * blocksize of 1.3MB in the face of worst case (short) filenames.
- */
-#define XFS_DIR_LEAF_MAPSIZE	3	/* how many freespace slots */
-
-typedef struct xfs_dir_leaf_map {	/* RLE map of free bytes */
-	__be16		base;	 	/* base of free region */
-	__be16		size; 		/* run length of free region */
-} xfs_dir_leaf_map_t;
-
-typedef struct xfs_dir_leaf_hdr {	/* constant-structure header block */
-	xfs_da_blkinfo_t info;		/* block type, links, etc. */
-	__be16		count;		/* count of active leaf_entry's */
-	__be16		namebytes;	/* num bytes of name strings stored */
-	__be16		firstused;	/* first used byte in name area */
-	__u8		holes;		/* != 0 if blk needs compaction */
-	__u8		pad1;
-	xfs_dir_leaf_map_t freemap[XFS_DIR_LEAF_MAPSIZE];
-} xfs_dir_leaf_hdr_t;
-
-typedef struct xfs_dir_leaf_entry {	/* sorted on key, not name */
-	__be32		hashval;	/* hash value of name */
-	__be16		nameidx;	/* index into buffer of name */
-	__u8		namelen;	/* length of name string */
-	__u8		pad2;
-} xfs_dir_leaf_entry_t;
-
-typedef struct xfs_dir_leaf_name {
-	xfs_dir_ino_t	inumber;	/* inode number for this key */
-	__uint8_t	name[1];	/* name string itself */
-} xfs_dir_leaf_name_t;
-
-typedef struct xfs_dir_leafblock {
-	xfs_dir_leaf_hdr_t	hdr;	/* constant-structure header block */
-	xfs_dir_leaf_entry_t	entries[1];	/* var sized array */
-	xfs_dir_leaf_name_t	namelist[1];	/* grows from bottom of buf */
-} xfs_dir_leafblock_t;
-
-/*
- * Length of name for which a 512-byte block filesystem
- * can get a double split.
- */
-#define	XFS_DIR_LEAF_CAN_DOUBLE_SPLIT_LEN	\
-	(512 - (uint)sizeof(xfs_dir_leaf_hdr_t) - \
-	 (uint)sizeof(xfs_dir_leaf_entry_t) * 2 - \
-	 (uint)sizeof(xfs_dir_leaf_name_t) * 2 - (MAXNAMELEN - 2) + 1 + 1)
-
-typedef int (*xfs_dir_put_t)(struct xfs_dir_put_args *pa);
-
-typedef union {
-	xfs_off_t		o;		/* offset (cookie) */
-	/*
-	 * Watch the order here (endian-ness dependent).
-	 */
-	struct {
-#ifndef XFS_NATIVE_HOST
-		xfs_dahash_t	h;	/* hash value */
-		__uint32_t	be;	/* block and entry */
-#else
-		__uint32_t	be;	/* block and entry */
-		xfs_dahash_t	h;	/* hash value */
-#endif /* XFS_NATIVE_HOST */
-	} s;
-} xfs_dircook_t;
-
-#define	XFS_PUT_COOKIE(c,mp,bno,entry,hash)	\
-	((c).s.be = XFS_DA_MAKE_BNOENTRY(mp, bno, entry), (c).s.h = (hash))
-
-typedef struct xfs_dir_put_args {
-	xfs_dircook_t	cook;		/* cookie of (next) entry */
-	xfs_intino_t	ino;		/* inode number */
-	struct xfs_dirent *dbp;		/* buffer pointer */
-	char		*name;		/* directory entry name */
-	int		namelen;	/* length of name */
-	int		done;		/* output: set if value was stored */
-	xfs_dir_put_t	put;		/* put function ptr (i/o) */
-	struct uio	*uio;		/* uio control structure */
-} xfs_dir_put_args_t;
-
-#define XFS_DIR_LEAF_ENTSIZE_BYNAME(len)	\
-	xfs_dir_leaf_entsize_byname(len)
-static inline int xfs_dir_leaf_entsize_byname(int len)
-{
-	return (uint)sizeof(xfs_dir_leaf_name_t)-1 + len;
-}
-
-#define XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry)	\
-	xfs_dir_leaf_entsize_byentry(entry)
-static inline int xfs_dir_leaf_entsize_byentry(xfs_dir_leaf_entry_t *entry)
-{
-	return (uint)sizeof(xfs_dir_leaf_name_t)-1 + (entry)->namelen;
-}
-
-#define XFS_DIR_LEAF_NAMESTRUCT(leafp,offset)	\
-	xfs_dir_leaf_namestruct(leafp,offset)
-static inline xfs_dir_leaf_name_t *
-xfs_dir_leaf_namestruct(xfs_dir_leafblock_t *leafp, int offset)
-{
-	return (xfs_dir_leaf_name_t *)&((char *)(leafp))[offset];
-}
-
-/*========================================================================
- * Function prototypes for the kernel.
- *========================================================================*/
-
-/*
- * Internal routines when dirsize < XFS_LITINO(mp).
- */
-int xfs_dir_shortform_create(struct xfs_da_args *args, xfs_ino_t parent);
-int xfs_dir_shortform_addname(struct xfs_da_args *args);
-int xfs_dir_shortform_lookup(struct xfs_da_args *args);
-int xfs_dir_shortform_to_leaf(struct xfs_da_args *args);
-int xfs_dir_shortform_removename(struct xfs_da_args *args);
-int xfs_dir_shortform_getdents(struct xfs_inode *dp, struct uio *uio, int *eofp,
-			       struct xfs_dirent *dbp, xfs_dir_put_t put);
-int xfs_dir_shortform_replace(struct xfs_da_args *args);
-
-/*
- * Internal routines when dirsize == XFS_LBSIZE(mp).
- */
-int xfs_dir_leaf_to_node(struct xfs_da_args *args);
-int xfs_dir_leaf_to_shortform(struct xfs_da_args *args);
-
-/*
- * Routines used for growing the Btree.
- */
-int	xfs_dir_leaf_split(struct xfs_da_state *state,
-				  struct xfs_da_state_blk *oldblk,
-				  struct xfs_da_state_blk *newblk);
-int	xfs_dir_leaf_add(struct xfs_dabuf *leaf_buffer,
-				struct xfs_da_args *args, int insertion_index);
-int	xfs_dir_leaf_addname(struct xfs_da_args *args);
-int	xfs_dir_leaf_lookup_int(struct xfs_dabuf *leaf_buffer,
-				       struct xfs_da_args *args,
-				       int *index_found_at);
-int	xfs_dir_leaf_remove(struct xfs_trans *trans,
-				   struct xfs_dabuf *leaf_buffer,
-				   int index_to_remove);
-int	xfs_dir_leaf_getdents_int(struct xfs_dabuf *bp, struct xfs_inode *dp,
-					 xfs_dablk_t bno, struct uio *uio,
-					 int *eobp, struct xfs_dirent *dbp,
-					 xfs_dir_put_t put, xfs_daddr_t nextda);
-
-/*
- * Routines used for shrinking the Btree.
- */
-int	xfs_dir_leaf_toosmall(struct xfs_da_state *state, int *retval);
-void	xfs_dir_leaf_unbalance(struct xfs_da_state *state,
-					     struct xfs_da_state_blk *drop_blk,
-					     struct xfs_da_state_blk *save_blk);
-
-/*
- * Utility routines.
- */
-uint	xfs_dir_leaf_lasthash(struct xfs_dabuf *bp, int *count);
-int	xfs_dir_leaf_order(struct xfs_dabuf *leaf1_bp,
-				  struct xfs_dabuf *leaf2_bp);
-int	xfs_dir_put_dirent64_direct(xfs_dir_put_args_t *pa);
-int	xfs_dir_put_dirent64_uio(xfs_dir_put_args_t *pa);
-int	xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
-
-/*
- * Global data.
- */
-extern xfs_dahash_t	xfs_dir_hash_dot, xfs_dir_hash_dotdot;
-
-#endif /* __XFS_DIR_LEAF_H__ */
diff --git a/fs/xfs/xfs_dir_sf.h b/fs/xfs/xfs_dir_sf.h
index 5b20b4d3f57..e69de29bb2d 100644
--- a/fs/xfs/xfs_dir_sf.h
+++ b/fs/xfs/xfs_dir_sf.h
@@ -1,155 +0,0 @@
-/*
- * Copyright (c) 2000,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_DIR_SF_H__
-#define	__XFS_DIR_SF_H__
-
-/*
- * Directory layout when stored internal to an inode.
- *
- * Small directories are packed as tightly as possible so as to
- * fit into the literal area of the inode.
- */
-
-typedef struct { __uint8_t i[sizeof(xfs_ino_t)]; } xfs_dir_ino_t;
-
-/*
- * The parent directory has a dedicated field, and the self-pointer must
- * be calculated on the fly.
- *
- * Entries are packed toward the top as tight as possible.  The header
- * and the elements much be memcpy'd out into a work area to get correct
- * alignment for the inode number fields.
- */
-typedef struct xfs_dir_sf_hdr {		/* constant-structure header block */
-	xfs_dir_ino_t	parent;		/* parent dir inode number */
-	__uint8_t	count;		/* count of active entries */
-} xfs_dir_sf_hdr_t;
-
-typedef struct xfs_dir_sf_entry {
-	xfs_dir_ino_t	inumber;	/* referenced inode number */
-	__uint8_t	namelen;	/* actual length of name (no NULL) */
-	__uint8_t	name[1];	/* name */
-} xfs_dir_sf_entry_t;
-
-typedef struct xfs_dir_shortform {
-	xfs_dir_sf_hdr_t	hdr;
-	xfs_dir_sf_entry_t	list[1];	/* variable sized array */
-} xfs_dir_shortform_t;
-
-/*
- * We generate this then sort it, so that readdirs are returned in
- * hash-order.  Else seekdir won't work.
- */
-typedef struct xfs_dir_sf_sort {
-	__uint8_t	entno;		/* .=0, ..=1, else entry# + 2 */
-	__uint8_t	seqno;		/* sequence # with same hash value */
-	__uint8_t	namelen;	/* length of name value (no null) */
-	xfs_dahash_t	hash;		/* this entry's hash value */
-	xfs_intino_t	ino;		/* this entry's inode number */
-	char		*name;		/* name value, pointer into buffer */
-} xfs_dir_sf_sort_t;
-
-#define	XFS_DIR_SF_GET_DIRINO(from,to)	xfs_dir_sf_get_dirino(from, to)
-static inline void xfs_dir_sf_get_dirino(xfs_dir_ino_t *from, xfs_ino_t *to)
-{
-	*(to) = XFS_GET_DIR_INO8(*from);
-}
-
-#define	XFS_DIR_SF_PUT_DIRINO(from,to)	xfs_dir_sf_put_dirino(from, to)
-static inline void xfs_dir_sf_put_dirino(xfs_ino_t *from, xfs_dir_ino_t *to)
-{
-	XFS_PUT_DIR_INO8(*(from), *(to));
-}
-
-#define XFS_DIR_SF_ENTSIZE_BYNAME(len)	xfs_dir_sf_entsize_byname(len)
-static inline int xfs_dir_sf_entsize_byname(int len)
-{
-	return (uint)sizeof(xfs_dir_sf_entry_t)-1 + (len);
-}
-
-#define XFS_DIR_SF_ENTSIZE_BYENTRY(sfep)	xfs_dir_sf_entsize_byentry(sfep)
-static inline int xfs_dir_sf_entsize_byentry(xfs_dir_sf_entry_t *sfep)
-{
-	return (uint)sizeof(xfs_dir_sf_entry_t)-1 + (sfep)->namelen;
-}
-
-#define XFS_DIR_SF_NEXTENTRY(sfep)		xfs_dir_sf_nextentry(sfep)
-static inline xfs_dir_sf_entry_t *xfs_dir_sf_nextentry(xfs_dir_sf_entry_t *sfep)
-{
-	return (xfs_dir_sf_entry_t *) \
-		((char *)(sfep) + XFS_DIR_SF_ENTSIZE_BYENTRY(sfep));
-}
-
-#define XFS_DIR_SF_ALLFIT(count,totallen)	\
-	xfs_dir_sf_allfit(count,totallen)
-static inline int xfs_dir_sf_allfit(int count, int totallen)
-{
-	return ((uint)sizeof(xfs_dir_sf_hdr_t) + \
-	       ((uint)sizeof(xfs_dir_sf_entry_t)-1)*(count) + (totallen));
-}
-
-#if defined(XFS_DIR_TRACE)
-
-/*
- * Kernel tracing support for directories.
- */
-struct uio;
-struct xfs_inode;
-struct xfs_da_intnode;
-struct xfs_dinode;
-struct xfs_dir_leafblock;
-struct xfs_dir_leaf_entry;
-
-#define	XFS_DIR_TRACE_SIZE	4096	/* size of global trace buffer */
-extern ktrace_t	*xfs_dir_trace_buf;
-
-/*
- * Trace record types.
- */
-#define	XFS_DIR_KTRACE_G_DU	1	/* dp, uio */
-#define	XFS_DIR_KTRACE_G_DUB	2	/* dp, uio, bno */
-#define	XFS_DIR_KTRACE_G_DUN	3	/* dp, uio, node */
-#define	XFS_DIR_KTRACE_G_DUL	4	/* dp, uio, leaf */
-#define	XFS_DIR_KTRACE_G_DUE	5	/* dp, uio, leaf entry */
-#define	XFS_DIR_KTRACE_G_DUC	6	/* dp, uio, cookie */
-
-void xfs_dir_trace_g_du(char *where, struct xfs_inode *dp, struct uio *uio);
-void xfs_dir_trace_g_dub(char *where, struct xfs_inode *dp, struct uio *uio,
-			      xfs_dablk_t bno);
-void xfs_dir_trace_g_dun(char *where, struct xfs_inode *dp, struct uio *uio,
-			      struct xfs_da_intnode *node);
-void xfs_dir_trace_g_dul(char *where, struct xfs_inode *dp, struct uio *uio,
-			      struct xfs_dir_leafblock *leaf);
-void xfs_dir_trace_g_due(char *where, struct xfs_inode *dp, struct uio *uio,
-			      struct xfs_dir_leaf_entry *entry);
-void xfs_dir_trace_g_duc(char *where, struct xfs_inode *dp, struct uio *uio,
-			      xfs_off_t cookie);
-void xfs_dir_trace_enter(int type, char *where,
-			     void *a0, void *a1, void *a2, void *a3,
-			     void *a4, void *a5, void *a6, void *a7,
-			     void *a8, void *a9, void *a10, void *a11);
-#else
-#define	xfs_dir_trace_g_du(w,d,u)
-#define	xfs_dir_trace_g_dub(w,d,u,b)
-#define	xfs_dir_trace_g_dun(w,d,u,n)
-#define	xfs_dir_trace_g_dul(w,d,u,l)
-#define	xfs_dir_trace_g_due(w,d,u,e)
-#define	xfs_dir_trace_g_duc(w,d,u,c)
-#endif /* DEBUG */
-
-#endif	/* __XFS_DIR_SF_H__ */
diff --git a/fs/xfs/xfs_dmops.c b/fs/xfs/xfs_dmops.c
index 629795b3b3d..1e4a35ddf7f 100644
--- a/fs/xfs/xfs_dmops.c
+++ b/fs/xfs/xfs_dmops.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 2a21c502401..b95681b03d8 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -22,12 +22,10 @@
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 8b028f12854..6cf6d8769b9 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_trans_priv.h"
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 650d35f537b..077629bab53 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 7e5ccfec92b..33164a85aa9 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 60c65683462..616eeeb6953 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 7894b72a717..0724df7fabb 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index e799f0d0087..5fa0adb7e17 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -26,14 +26,12 @@
 #include "xfs_trans_priv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -2925,13 +2923,6 @@ xfs_iflush_fork(
 			ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
 			memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
 		}
-		if (whichfork == XFS_DATA_FORK) {
-			if (unlikely(XFS_DIR_SHORTFORM_VALIDATE_ONDISK(mp, dip))) {
-				XFS_ERROR_REPORT("xfs_iflush_fork",
-						 XFS_ERRLEVEL_LOW, mp);
-				return XFS_ERROR(EFSCORRUPTED);
-			}
-		}
 		break;
 
 	case XFS_DINODE_FMT_EXTENTS:
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 2caa91b8971..f8e80d8e723 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -25,7 +25,6 @@
 #include "xfs_buf_item.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
@@ -33,7 +32,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_iocore.c b/fs/xfs/xfs_iocore.c
index 89441e69cdd..06d710c9ce4 100644
--- a/fs/xfs/xfs_iocore.c
+++ b/fs/xfs/xfs_iocore.c
@@ -24,7 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dfrag.h"
 #include "xfs_dmapi.h"
@@ -32,7 +31,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index d79055207fb..f1949c16df1 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_alloc.h"
 #include "xfs_dmapi.h"
@@ -32,7 +31,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index a0182ced29c..46249e4d1fe 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 95d679cd4e7..d8f5d4cbe8b 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -24,7 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
@@ -36,7 +35,6 @@
 #include "xfs_ialloc_btree.h"
 #include "xfs_log_recover.h"
 #include "xfs_trans_priv.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index f952f9dbf74..55b4237c215 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -24,7 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
@@ -32,7 +31,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 5ce6416fbd3..10dbf203c62 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -934,18 +932,7 @@ xfs_mountfs(
 	vfsp->vfs_altfsid = (xfs_fsid_t *)mp->m_fixedfsid;
 	mp->m_dmevmask = 0;	/* not persistent; set after each mount */
 
-	/*
-	 * Select the right directory manager.
-	 */
-	mp->m_dirops =
-		XFS_SB_VERSION_HASDIRV2(&mp->m_sb) ?
-			xfsv2_dirops :
-			xfsv1_dirops;
-
-	/*
-	 * Initialize directory manager's entries.
-	 */
-	XFS_DIR_MOUNT(mp);
+	xfs_dir_mount(mp);
 
 	/*
 	 * Initialize the attribute manager's entries.
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 761f42f989c..b2bd4be4200 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -403,8 +403,6 @@ typedef struct xfs_mount {
 	__uint8_t		m_inode_quiesce;/* call quiesce on new inodes.
 						   field governed by m_ilock */
 	__uint8_t		m_sectbb_log;	/* sectlog - BBSHIFT */
-	__uint8_t		m_dirversion;	/* 1 or 2 */
-	xfs_dirops_t		m_dirops;	/* table of dir funcs */
 	int			m_dirblksize;	/* directory block sz--bytes */
 	int			m_dirblkfsbs;	/* directory block sz--fsbs */
 	xfs_dablk_t		m_dirdatablk;	/* blockno of dir data v2 */
diff --git a/fs/xfs/xfs_qmops.c b/fs/xfs/xfs_qmops.c
index 1408a32eef8..320d63ff9ca 100644
--- a/fs/xfs/xfs_qmops.c
+++ b/fs/xfs/xfs_qmops.c
@@ -23,7 +23,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 7d5f9b6ffdb..d98171deaa1 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -22,13 +22,11 @@
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -40,7 +38,6 @@
 #include "xfs_refcache.h"
 #include "xfs_utils.h"
 #include "xfs_trans_space.h"
-#include "xfs_dir_leaf.h"
 
 
 /*
@@ -398,34 +395,29 @@ xfs_rename(
 		 * fit before actually inserting it.
 		 */
 		if (spaceres == 0 &&
-		    (error = XFS_DIR_CANENTER(mp, tp, target_dp, target_name,
-				target_namelen))) {
+		    (error = xfs_dir_canenter(tp, target_dp, target_name,
+						target_namelen)))
 			goto error_return;
-		}
 		/*
 		 * If target does not exist and the rename crosses
 		 * directories, adjust the target directory link count
 		 * to account for the ".." reference from the new entry.
 		 */
-		error = XFS_DIR_CREATENAME(mp, tp, target_dp, target_name,
+		error = xfs_dir_createname(tp, target_dp, target_name,
 					   target_namelen, src_ip->i_ino,
 					   &first_block, &free_list, spaceres);
-		if (error == ENOSPC) {
+		if (error == ENOSPC)
 			goto error_return;
-		}
-		if (error) {
+		if (error)
 			goto abort_return;
-		}
 		xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 
 		if (new_parent && src_is_directory) {
 			error = xfs_bumplink(tp, target_dp);
-			if (error) {
+			if (error)
 				goto abort_return;
-			}
 		}
 	} else { /* target_ip != NULL */
-
 		/*
 		 * If target exists and it's a directory, check that both
 		 * target and source are directories and that target can be
@@ -435,7 +427,7 @@ xfs_rename(
 			/*
 			 * Make sure target dir is empty.
 			 */
-			if (!(XFS_DIR_ISEMPTY(target_ip->i_mount, target_ip)) ||
+			if (!(xfs_dir_isempty(target_ip)) ||
 			    (target_ip->i_d.di_nlink > 2)) {
 				error = XFS_ERROR(EEXIST);
 				goto error_return;
@@ -451,12 +443,11 @@ xfs_rename(
 		 * In case there is already an entry with the same
 		 * name at the destination directory, remove it first.
 		 */
-		error = XFS_DIR_REPLACE(mp, tp, target_dp, target_name,
-			target_namelen, src_ip->i_ino, &first_block,
-			&free_list, spaceres);
-		if (error) {
+		error = xfs_dir_replace(tp, target_dp, target_name,
+					target_namelen, src_ip->i_ino,
+					&first_block, &free_list, spaceres);
+		if (error)
 			goto abort_return;
-		}
 		xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 
 		/*
@@ -464,9 +455,8 @@ xfs_rename(
 		 * dir no longer points to it.
 		 */
 		error = xfs_droplink(tp, target_ip);
-		if (error) {
+		if (error)
 			goto abort_return;
-		}
 		target_ip_dropped = 1;
 
 		if (src_is_directory) {
@@ -474,9 +464,8 @@ xfs_rename(
 			 * Drop the link from the old "." entry.
 			 */
 			error = xfs_droplink(tp, target_ip);
-			if (error) {
+			if (error)
 				goto abort_return;
-			}
 		}
 
 		/* Do this test while we still hold the locks */
@@ -488,18 +477,15 @@ xfs_rename(
 	 * Remove the source.
 	 */
 	if (new_parent && src_is_directory) {
-
 		/*
 		 * Rewrite the ".." entry to point to the new
 		 * directory.
 		 */
-		error = XFS_DIR_REPLACE(mp, tp, src_ip, "..", 2,
-					target_dp->i_ino, &first_block,
-					&free_list, spaceres);
+		error = xfs_dir_replace(tp, src_ip, "..", 2, target_dp->i_ino,
+					&first_block, &free_list, spaceres);
 		ASSERT(error != EEXIST);
-		if (error) {
+		if (error)
 			goto abort_return;
-		}
 		xfs_ichgtime(src_ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 
 	} else {
@@ -527,16 +513,14 @@ xfs_rename(
 		 * entry that's moved no longer points to it.
 		 */
 		error = xfs_droplink(tp, src_dp);
-		if (error) {
+		if (error)
 			goto abort_return;
-		}
 	}
 
-	error = XFS_DIR_REMOVENAME(mp, tp, src_dp, src_name, src_namelen,
+	error = xfs_dir_removename(tp, src_dp, src_name, src_namelen,
 			src_ip->i_ino, &first_block, &free_list, spaceres);
-	if (error) {
+	if (error)
 		goto abort_return;
-	}
 	xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 
 	/*
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index af290cf37ac..0c1e42b037e 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index c55e28189ab..defb2febaaf 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 39f0b1ed322..ee2721e0de4 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -24,7 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
@@ -33,7 +32,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 100d9a4b38e..cb65c3a603f 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -805,12 +805,9 @@ typedef struct xfs_trans {
 	((mp)->m_sb.sb_inodesize + \
 	 (mp)->m_sb.sb_sectsize * 2 + \
 	 (mp)->m_dirblksize + \
-	 (XFS_DIR_IS_V1(mp) ? 0 : \
-	    XFS_FSB_TO_B(mp, (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1))) + \
+	 XFS_FSB_TO_B(mp, (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1)) + \
 	 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
-	 (128 * (4 + \
-		 (XFS_DIR_IS_V1(mp) ? 0 : \
-			 XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + \
+	 (128 * (4 + (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + \
 		 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
 
 #define	XFS_ADDAFORK_LOG_RES(mp)	((mp)->m_reservations.tr_addafork)
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index e1ca7b6fde4..558c87ff0c4 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -22,7 +22,6 @@
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_trans_priv.h"
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 8cedd1583bc..60b6b898022 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index 7d7d627f25d..b290270dd4a 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -22,7 +22,6 @@
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
-#include "xfs_dir.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_trans_priv.h"
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 7c5894d59f8..b8db1d5cde5 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -24,14 +24,12 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h
index 7fe3792b18d..4ea2e5074bd 100644
--- a/fs/xfs/xfs_trans_space.h
+++ b/fs/xfs/xfs_trans_space.h
@@ -30,8 +30,7 @@
 	  XFS_EXTENTADD_SPACE_RES(mp,w))
 #define	XFS_DAENTER_1B(mp,w)	((w) == XFS_DATA_FORK ? (mp)->m_dirblkfsbs : 1)
 #define	XFS_DAENTER_DBS(mp,w)	\
-	(XFS_DA_NODE_MAXDEPTH + \
-	 ((XFS_DIR_IS_V2(mp) && (w) == XFS_DATA_FORK) ? 2 : 0))
+	(XFS_DA_NODE_MAXDEPTH + (((w) == XFS_DATA_FORK) ? 2 : 0))
 #define	XFS_DAENTER_BLOCKS(mp,w)	\
 	(XFS_DAENTER_1B(mp,w) * XFS_DAENTER_DBS(mp,w))
 #define	XFS_DAENTER_BMAP1B(mp,w)	\
@@ -41,10 +40,7 @@
 #define	XFS_DAENTER_SPACE_RES(mp,w)	\
 	(XFS_DAENTER_BLOCKS(mp,w) + XFS_DAENTER_BMAPS(mp,w))
 #define	XFS_DAREMOVE_SPACE_RES(mp,w)	XFS_DAENTER_BMAPS(mp,w)
-#define	XFS_DIRENTER_MAX_SPLIT(mp,nl)	\
-	(((mp)->m_sb.sb_blocksize == 512 && \
-	  XFS_DIR_IS_V1(mp) && \
-	  (nl) >= XFS_DIR_LEAF_CAN_DOUBLE_SPLIT_LEN) ? 2 : 1)
+#define	XFS_DIRENTER_MAX_SPLIT(mp,nl)	1
 #define	XFS_DIRENTER_SPACE_RES(mp,nl)	\
 	(XFS_DAENTER_SPACE_RES(mp, XFS_DATA_FORK) * \
 	 XFS_DIRENTER_MAX_SPLIT(mp,nl))
@@ -57,8 +53,7 @@
  * Space reservation values for various transactions.
  */
 #define	XFS_ADDAFORK_SPACE_RES(mp)	\
-	((mp)->m_dirblkfsbs + \
-	 (XFS_DIR_IS_V1(mp) ? 0 : XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK)))
+	((mp)->m_dirblkfsbs + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK))
 #define	XFS_ATTRRM_SPACE_RES(mp)	\
 	XFS_DAREMOVE_SPACE_RES(mp, XFS_ATTR_FORK)
 /* This macro is not used - see inline code in xfs_attr_set */
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 37fdc2dc00b..9014d7e4448 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -24,12 +24,10 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -82,8 +80,7 @@ xfs_dir_lookup_int(
 
 	dp = XFS_BHVTOI(dir_bdp);
 
-	error = XFS_DIR_LOOKUP(dp->i_mount, NULL, dp,
-				VNAME(dentry), VNAMELEN(dentry), inum);
+	error = xfs_dir_lookup(NULL, dp, VNAME(dentry), VNAMELEN(dentry), inum);
 	if (!error) {
 		/*
 		 * Unlock the directory. We do this because we can't
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 918531b6478..6c96391f3f1 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -24,7 +24,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
@@ -32,7 +31,6 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_ialloc_btree.h"
 #include "xfs_alloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
@@ -131,9 +129,6 @@ xfs_init(void)
 #ifdef XFS_BMBT_TRACE
 	xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_SLEEP);
 #endif
-#ifdef XFS_DIR_TRACE
-	xfs_dir_trace_buf = ktrace_alloc(XFS_DIR_TRACE_SIZE, KM_SLEEP);
-#endif
 #ifdef XFS_ATTR_TRACE
 	xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_SLEEP);
 #endif
@@ -177,9 +172,6 @@ xfs_cleanup(void)
 #ifdef XFS_ATTR_TRACE
 	ktrace_free(xfs_attr_trace_buf);
 #endif
-#ifdef XFS_DIR_TRACE
-	ktrace_free(xfs_dir_trace_buf);
-#endif
 #ifdef XFS_BMBT_TRACE
 	ktrace_free(xfs_bmbt_trace_buf);
 #endif
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 26d96d1b25c..00a6b7dc24a 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -25,7 +25,6 @@
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
-#include "xfs_dir.h"
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
@@ -33,13 +32,11 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_alloc_btree.h"
 #include "xfs_ialloc_btree.h"
-#include "xfs_dir_sf.h"
 #include "xfs_dir2_sf.h"
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
-#include "xfs_dir_leaf.h"
 #include "xfs_itable.h"
 #include "xfs_btree.h"
 #include "xfs_ialloc.h"
@@ -1958,8 +1955,7 @@ xfs_create(
 	if (error)
 		goto error_return;
 
-	if (resblks == 0 &&
-	    (error = XFS_DIR_CANENTER(mp, tp, dp, name, namelen)))
+	if (resblks == 0 && (error = xfs_dir_canenter(tp, dp, name, namelen)))
 		goto error_return;
 	rdev = (vap->va_mask & XFS_AT_RDEV) ? vap->va_rdev : 0;
 	error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 1,
@@ -1990,9 +1986,9 @@ xfs_create(
 	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
 	dp_joined_to_trans = B_TRUE;
 
-	error = XFS_DIR_CREATENAME(mp, tp, dp, name, namelen, ip->i_ino,
-		&first_block, &free_list,
-		resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
+	error = xfs_dir_createname(tp, dp, name, namelen, ip->i_ino,
+					&first_block, &free_list, resblks ?
+					resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
 	if (error) {
 		ASSERT(error != ENOSPC);
 		goto abort_return;
@@ -2468,8 +2464,8 @@ xfs_remove(
 	 * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
 	 */
 	XFS_BMAP_INIT(&free_list, &first_block);
-	error = XFS_DIR_REMOVENAME(mp, tp, dp, name, namelen, ip->i_ino,
-		&first_block, &free_list, 0);
+	error = xfs_dir_removename(tp, dp, name, namelen, ip->i_ino,
+					&first_block, &free_list, 0);
 	if (error) {
 		ASSERT(error != ENOENT);
 		REMOVE_DEBUG_TRACE(__LINE__);
@@ -2688,13 +2684,12 @@ xfs_link(
 	}
 
 	if (resblks == 0 &&
-	    (error = XFS_DIR_CANENTER(mp, tp, tdp, target_name,
-			target_namelen)))
+	    (error = xfs_dir_canenter(tp, tdp, target_name, target_namelen)))
 		goto error_return;
 
 	XFS_BMAP_INIT(&free_list, &first_block);
 
-	error = XFS_DIR_CREATENAME(mp, tp, tdp, target_name, target_namelen,
+	error = xfs_dir_createname(tp, tdp, target_name, target_namelen,
 				   sip->i_ino, &first_block, &free_list,
 				   resblks);
 	if (error)
@@ -2860,7 +2855,7 @@ xfs_mkdir(
 		goto error_return;
 
 	if (resblks == 0 &&
-	    (error = XFS_DIR_CANENTER(mp, tp, dp, dir_name, dir_namelen)))
+	    (error = xfs_dir_canenter(tp, dp, dir_name, dir_namelen)))
 		goto error_return;
 	/*
 	 * create the directory inode.
@@ -2887,9 +2882,9 @@ xfs_mkdir(
 
 	XFS_BMAP_INIT(&free_list, &first_block);
 
-	error = XFS_DIR_CREATENAME(mp, tp, dp, dir_name, dir_namelen,
-			cdp->i_ino, &first_block, &free_list,
-			resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
+	error = xfs_dir_createname(tp, dp, dir_name, dir_namelen, cdp->i_ino,
+				   &first_block, &free_list, resblks ?
+				   resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
 	if (error) {
 		ASSERT(error != ENOSPC);
 		goto error1;
@@ -2903,16 +2898,14 @@ xfs_mkdir(
 	 */
 	dp->i_gen++;
 
-	error = XFS_DIR_INIT(mp, tp, cdp, dp);
-	if (error) {
+	error = xfs_dir_init(tp, cdp, dp);
+	if (error)
 		goto error2;
-	}
 
 	cdp->i_gen = 1;
 	error = xfs_bumplink(tp, dp);
-	if (error) {
+	if (error)
 		goto error2;
-	}
 
 	cvp = XFS_ITOV(cdp);
 
@@ -3121,16 +3114,15 @@ xfs_rmdir(
 		error = XFS_ERROR(ENOTEMPTY);
 		goto error_return;
 	}
-	if (!XFS_DIR_ISEMPTY(mp, cdp)) {
+	if (!xfs_dir_isempty(cdp)) {
 		error = XFS_ERROR(ENOTEMPTY);
 		goto error_return;
 	}
 
-	error = XFS_DIR_REMOVENAME(mp, tp, dp, name, namelen, cdp->i_ino,
-		&first_block, &free_list, resblks);
-	if (error) {
+	error = xfs_dir_removename(tp, dp, name, namelen, cdp->i_ino,
+					&first_block, &free_list, resblks);
+	if (error)
 		goto error1;
-	}
 
 	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 
@@ -3229,8 +3221,6 @@ xfs_rmdir(
 
 
 /*
- * xfs_readdir
- *
  * Read dp's entries starting at uiop->uio_offset and translate them into
  * bufsize bytes worth of struct dirents starting at bufbase.
  */
@@ -3250,21 +3240,16 @@ xfs_readdir(
 					       (inst_t *)__return_address);
 	dp = XFS_BHVTOI(dir_bdp);
 
-	if (XFS_FORCED_SHUTDOWN(dp->i_mount)) {
+	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
 		return XFS_ERROR(EIO);
-	}
 
 	lock_mode = xfs_ilock_map_shared(dp);
-	error = XFS_DIR_GETDENTS(dp->i_mount, tp, dp, uiop, eofp);
+	error = xfs_dir_getdents(tp, dp, uiop, eofp);
 	xfs_iunlock_map_shared(dp, lock_mode);
 	return error;
 }
 
 
-/*
- * xfs_symlink
- *
- */
 STATIC int
 xfs_symlink(
 	bhv_desc_t		*dir_bdp,
@@ -3328,7 +3313,7 @@ xfs_symlink(
 		int len, total;
 		char *path;
 
-		for(total = 0, path = target_path; total < pathlen;) {
+		for (total = 0, path = target_path; total < pathlen;) {
 			/*
 			 * Skip any slashes.
 			 */
@@ -3422,7 +3407,7 @@ xfs_symlink(
 	 * Check for ability to enter directory entry, if no space reserved.
 	 */
 	if (resblks == 0 &&
-	    (error = XFS_DIR_CANENTER(mp, tp, dp, link_name, link_namelen)))
+	    (error = xfs_dir_canenter(tp, dp, link_name, link_namelen)))
 		goto error_return;
 	/*
 	 * Initialize the bmap freelist prior to calling either
@@ -3509,11 +3494,10 @@ xfs_symlink(
 	/*
 	 * Create the directory entry for the symlink.
 	 */
-	error = XFS_DIR_CREATENAME(mp, tp, dp, link_name, link_namelen,
-			ip->i_ino, &first_block, &free_list, resblks);
-	if (error) {
+	error = xfs_dir_createname(tp, dp, link_name, link_namelen, ip->i_ino,
+				   &first_block, &free_list, resblks);
+	if (error)
 		goto error1;
-	}
 	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
 
-- 
cgit v1.2.3


From d7b849da47a59d2be6d6aea1effb0efa91c30424 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Tue, 20 Jun 2006 14:01:29 +1000
Subject: [XFS] Fix a Makefile issue related to exports.o handling.

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/Makefile-linux-2.6 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/Makefile-linux-2.6 b/fs/xfs/Makefile-linux-2.6
index 5d73eaa1971..6f3a5558493 100644
--- a/fs/xfs/Makefile-linux-2.6
+++ b/fs/xfs/Makefile-linux-2.6
@@ -59,7 +59,6 @@ xfs-$(CONFIG_XFS_POSIX_ACL)	+= xfs_acl.o
 xfs-$(CONFIG_PROC_FS)		+= $(XFS_LINUX)/xfs_stats.o
 xfs-$(CONFIG_SYSCTL)		+= $(XFS_LINUX)/xfs_sysctl.o
 xfs-$(CONFIG_COMPAT)		+= $(XFS_LINUX)/xfs_ioctl32.o
-xfs-$(CONFIG_XFS_EXPORT)	+= $(XFS_LINUX)/xfs_export.o
 
 
 xfs-y				+= xfs_alloc.o \
@@ -117,6 +116,7 @@ xfs-y				+= $(addprefix $(XFS_LINUX)/, \
 				   kmem.o \
 				   xfs_aops.o \
 				   xfs_buf.o \
+				   xfs_export.o \
 				   xfs_file.o \
 				   xfs_fs_subr.o \
 				   xfs_globals.o \
-- 
cgit v1.2.3


From d8ce75324135ea7100124c1fff4ec5090a350607 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Tue, 20 Jun 2006 14:53:51 +1000
Subject: [XFS] Remove files from the build that are now unused.

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/Makefile-linux-2.6 | 2 --
 fs/xfs/xfs_dir.c          | 0
 fs/xfs/xfs_dir.h          | 0
 fs/xfs/xfs_dir_leaf.c     | 0
 fs/xfs/xfs_dir_leaf.h     | 0
 fs/xfs/xfs_dir_sf.h       | 0
 6 files changed, 2 deletions(-)
 delete mode 100644 fs/xfs/xfs_dir.c
 delete mode 100644 fs/xfs/xfs_dir.h
 delete mode 100644 fs/xfs/xfs_dir_leaf.c
 delete mode 100644 fs/xfs/xfs_dir_leaf.h
 delete mode 100644 fs/xfs/xfs_dir_sf.h

(limited to 'fs')

diff --git a/fs/xfs/Makefile-linux-2.6 b/fs/xfs/Makefile-linux-2.6
index 6f3a5558493..9e7f85986d0 100644
--- a/fs/xfs/Makefile-linux-2.6
+++ b/fs/xfs/Makefile-linux-2.6
@@ -72,14 +72,12 @@ xfs-y				+= xfs_alloc.o \
 				   xfs_btree.o \
 				   xfs_buf_item.o \
 				   xfs_da_btree.o \
-				   xfs_dir.o \
 				   xfs_dir2.o \
 				   xfs_dir2_block.o \
 				   xfs_dir2_data.o \
 				   xfs_dir2_leaf.o \
 				   xfs_dir2_node.o \
 				   xfs_dir2_sf.o \
-				   xfs_dir_leaf.o \
 				   xfs_error.o \
 				   xfs_extfree_item.o \
 				   xfs_fsops.o \
diff --git a/fs/xfs/xfs_dir.c b/fs/xfs/xfs_dir.c
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/fs/xfs/xfs_dir.h b/fs/xfs/xfs_dir.h
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/fs/xfs/xfs_dir_leaf.c b/fs/xfs/xfs_dir_leaf.c
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/fs/xfs/xfs_dir_leaf.h b/fs/xfs/xfs_dir_leaf.h
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/fs/xfs/xfs_dir_sf.h b/fs/xfs/xfs_dir_sf.h
deleted file mode 100644
index e69de29bb2d..00000000000
-- 
cgit v1.2.3


From 2d9048e201bfb67ba21f05e647b1286b8a4a5667 Mon Sep 17 00:00:00 2001
From: Amy Griffis <amy.griffis@hp.com>
Date: Thu, 1 Jun 2006 13:10:59 -0700
Subject: [PATCH] inotify (1/5): split kernel API from userspace support

The following series of patches introduces a kernel API for inotify,
making it possible for kernel modules to benefit from inotify's
mechanism for watching inodes.  With these patches, inotify will
maintain for each caller a list of watches (via an embedded struct
inotify_watch), where each inotify_watch is associated with a
corresponding struct inode.  The caller registers an event handler and
specifies for which filesystem events their event handler should be
called per inotify_watch.

Signed-off-by: Amy Griffis <amy.griffis@hp.com>
Acked-by: Robert Love <rml@novell.com>
Acked-by: John McCutchan <john@johnmccutchan.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/Kconfig        |  24 +-
 fs/Makefile       |   1 +
 fs/inotify.c      | 941 +++++++++++++-----------------------------------------
 fs/inotify_user.c | 717 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 966 insertions(+), 717 deletions(-)
 create mode 100644 fs/inotify_user.c

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index f9b5842c8d2..74f11a23622 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -393,18 +393,30 @@ config INOTIFY
 	bool "Inotify file change notification support"
 	default y
 	---help---
-	  Say Y here to enable inotify support and the associated system
-	  calls.  Inotify is a file change notification system and a
-	  replacement for dnotify.  Inotify fixes numerous shortcomings in
-	  dnotify and introduces several new features.  It allows monitoring
-	  of both files and directories via a single open fd.  Other features
-	  include multiple file events, one-shot support, and unmount
+	  Say Y here to enable inotify support.  Inotify is a file change
+	  notification system and a replacement for dnotify.  Inotify fixes
+	  numerous shortcomings in dnotify and introduces several new features
+	  including multiple file events, one-shot support, and unmount
 	  notification.
 
 	  For more information, see Documentation/filesystems/inotify.txt
 
 	  If unsure, say Y.
 
+config INOTIFY_USER
+	bool "Inotify support for userspace"
+	depends on INOTIFY
+	default y
+	---help---
+	  Say Y here to enable inotify support for userspace, including the
+	  associated system calls.  Inotify allows monitoring of both files and
+	  directories via a single open fd.  Events are read from the file
+	  descriptor, which is also select()- and poll()-able.
+
+	  For more information, see Documentation/filesystems/inotify.txt
+
+	  If unsure, say Y.
+
 config QUOTA
 	bool "Quota support"
 	help
diff --git a/fs/Makefile b/fs/Makefile
index 078d3d1191a..d0ea6bfccf2 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -13,6 +13,7 @@ obj-y :=	open.o read_write.o file_table.o buffer.o  bio.o super.o \
 		ioprio.o pnode.o drop_caches.o splice.o sync.o
 
 obj-$(CONFIG_INOTIFY)		+= inotify.o
+obj-$(CONFIG_INOTIFY_USER)	+= inotify_user.o
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
 obj-$(CONFIG_COMPAT)		+= compat.o compat_ioctl.o
 
diff --git a/fs/inotify.c b/fs/inotify.c
index 732ec4bd577..a1bedf3975c 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -5,7 +5,10 @@
  *	John McCutchan	<ttb@tentacle.dhs.org>
  *	Robert Love	<rml@novell.com>
  *
+ * Kernel API added by: Amy Griffis <amy.griffis@hp.com>
+ *
  * Copyright (C) 2005 John McCutchan
+ * Copyright 2006 Hewlett-Packard Development Company, L.P.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the
@@ -20,35 +23,17 @@
 
 #include <linux/module.h>
 #include <linux/kernel.h>
-#include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/idr.h>
 #include <linux/slab.h>
 #include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/mount.h>
-#include <linux/namei.h>
-#include <linux/poll.h>
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/writeback.h>
 #include <linux/inotify.h>
-#include <linux/syscalls.h>
-
-#include <asm/ioctls.h>
 
 static atomic_t inotify_cookie;
 
-static kmem_cache_t *watch_cachep __read_mostly;
-static kmem_cache_t *event_cachep __read_mostly;
-
-static struct vfsmount *inotify_mnt __read_mostly;
-
-/* these are configurable via /proc/sys/fs/inotify/ */
-int inotify_max_user_instances __read_mostly;
-int inotify_max_user_watches __read_mostly;
-int inotify_max_queued_events __read_mostly;
-
 /*
  * Lock ordering:
  *
@@ -56,327 +41,108 @@ int inotify_max_queued_events __read_mostly;
  * iprune_mutex (synchronize shrink_icache_memory())
  * 	inode_lock (protects the super_block->s_inodes list)
  * 	inode->inotify_mutex (protects inode->inotify_watches and watches->i_list)
- * 		inotify_dev->mutex (protects inotify_device and watches->d_list)
+ * 		inotify_handle->mutex (protects inotify_handle and watches->h_list)
+ *
+ * The inode->inotify_mutex and inotify_handle->mutex and held during execution
+ * of a caller's event handler.  Thus, the caller must not hold any locks
+ * taken in their event handler while calling any of the published inotify
+ * interfaces.
  */
 
 /*
- * Lifetimes of the three main data structures--inotify_device, inode, and
+ * Lifetimes of the three main data structures--inotify_handle, inode, and
  * inotify_watch--are managed by reference count.
  *
- * inotify_device: Lifetime is from inotify_init() until release.  Additional
- * references can bump the count via get_inotify_dev() and drop the count via
- * put_inotify_dev().
+ * inotify_handle: Lifetime is from inotify_init() to inotify_destroy().
+ * Additional references can bump the count via get_inotify_handle() and drop
+ * the count via put_inotify_handle().
  *
- * inotify_watch: Lifetime is from create_watch() to destory_watch().
- * Additional references can bump the count via get_inotify_watch() and drop
- * the count via put_inotify_watch().
+ * inotify_watch: for inotify's purposes, lifetime is from inotify_add_watch()
+ * to remove_watch_no_event().  Additional references can bump the count via
+ * get_inotify_watch() and drop the count via put_inotify_watch().  The caller
+ * is reponsible for the final put after receiving IN_IGNORED, or when using
+ * IN_ONESHOT after receiving the first event.  Inotify does the final put if
+ * inotify_destroy() is called.
  *
  * inode: Pinned so long as the inode is associated with a watch, from
- * create_watch() to put_inotify_watch().
+ * inotify_add_watch() to the final put_inotify_watch().
  */
 
 /*
- * struct inotify_device - represents an inotify instance
+ * struct inotify_handle - represents an inotify instance
  *
  * This structure is protected by the mutex 'mutex'.
  */
-struct inotify_device {
-	wait_queue_head_t 	wq;		/* wait queue for i/o */
+struct inotify_handle {
 	struct idr		idr;		/* idr mapping wd -> watch */
 	struct mutex		mutex;		/* protects this bad boy */
-	struct list_head 	events;		/* list of queued events */
 	struct list_head	watches;	/* list of watches */
 	atomic_t		count;		/* reference count */
-	struct user_struct	*user;		/* user who opened this dev */
-	unsigned int		queue_size;	/* size of the queue (bytes) */
-	unsigned int		event_count;	/* number of pending events */
-	unsigned int		max_events;	/* maximum number of events */
 	u32			last_wd;	/* the last wd allocated */
+	const struct inotify_operations *in_ops; /* inotify caller operations */
 };
 
-/*
- * struct inotify_kernel_event - An inotify event, originating from a watch and
- * queued for user-space.  A list of these is attached to each instance of the
- * device.  In read(), this list is walked and all events that can fit in the
- * buffer are returned.
- *
- * Protected by dev->mutex of the device in which we are queued.
- */
-struct inotify_kernel_event {
-	struct inotify_event	event;	/* the user-space event */
-	struct list_head        list;	/* entry in inotify_device's list */
-	char			*name;	/* filename, if any */
-};
-
-/*
- * struct inotify_watch - represents a watch request on a specific inode
- *
- * d_list is protected by dev->mutex of the associated watch->dev.
- * i_list and mask are protected by inode->inotify_mutex of the associated inode.
- * dev, inode, and wd are never written to once the watch is created.
- */
-struct inotify_watch {
-	struct list_head	d_list;	/* entry in inotify_device's list */
-	struct list_head	i_list;	/* entry in inode's list */
-	atomic_t		count;	/* reference count */
-	struct inotify_device	*dev;	/* associated device */
-	struct inode		*inode;	/* associated inode */
-	s32 			wd;	/* watch descriptor */
-	u32			mask;	/* event mask for this watch */
-};
-
-#ifdef CONFIG_SYSCTL
-
-#include <linux/sysctl.h>
-
-static int zero;
-
-ctl_table inotify_table[] = {
-	{
-		.ctl_name	= INOTIFY_MAX_USER_INSTANCES,
-		.procname	= "max_user_instances",
-		.data		= &inotify_max_user_instances,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
-		.extra1		= &zero,
-	},
-	{
-		.ctl_name	= INOTIFY_MAX_USER_WATCHES,
-		.procname	= "max_user_watches",
-		.data		= &inotify_max_user_watches,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
-		.extra1		= &zero, 
-	},
-	{
-		.ctl_name	= INOTIFY_MAX_QUEUED_EVENTS,
-		.procname	= "max_queued_events",
-		.data		= &inotify_max_queued_events,
-		.maxlen		= sizeof(int),
-		.mode		= 0644, 
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec, 
-		.extra1		= &zero
-	},
-	{ .ctl_name = 0 }
-};
-#endif /* CONFIG_SYSCTL */
-
-static inline void get_inotify_dev(struct inotify_device *dev)
+static inline void get_inotify_handle(struct inotify_handle *ih)
 {
-	atomic_inc(&dev->count);
+	atomic_inc(&ih->count);
 }
 
-static inline void put_inotify_dev(struct inotify_device *dev)
+static inline void put_inotify_handle(struct inotify_handle *ih)
 {
-	if (atomic_dec_and_test(&dev->count)) {
-		atomic_dec(&dev->user->inotify_devs);
-		free_uid(dev->user);
-		idr_destroy(&dev->idr);
-		kfree(dev);
+	if (atomic_dec_and_test(&ih->count)) {
+		idr_destroy(&ih->idr);
+		kfree(ih);
 	}
 }
 
-static inline void get_inotify_watch(struct inotify_watch *watch)
+/**
+ * get_inotify_watch - grab a reference to an inotify_watch
+ * @watch: watch to grab
+ */
+void get_inotify_watch(struct inotify_watch *watch)
 {
 	atomic_inc(&watch->count);
 }
+EXPORT_SYMBOL_GPL(get_inotify_watch);
 
-/*
+/**
  * put_inotify_watch - decrements the ref count on a given watch.  cleans up
- * the watch and its references if the count reaches zero.
+ * watch references if the count reaches zero.  inotify_watch is freed by
+ * inotify callers via the destroy_watch() op.
+ * @watch: watch to release
  */
-static inline void put_inotify_watch(struct inotify_watch *watch)
+void put_inotify_watch(struct inotify_watch *watch)
 {
 	if (atomic_dec_and_test(&watch->count)) {
-		put_inotify_dev(watch->dev);
-		iput(watch->inode);
-		kmem_cache_free(watch_cachep, watch);
-	}
-}
-
-/*
- * kernel_event - create a new kernel event with the given parameters
- *
- * This function can sleep.
- */
-static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie,
-						  const char *name)
-{
-	struct inotify_kernel_event *kevent;
-
-	kevent = kmem_cache_alloc(event_cachep, GFP_KERNEL);
-	if (unlikely(!kevent))
-		return NULL;
-
-	/* we hand this out to user-space, so zero it just in case */
-	memset(&kevent->event, 0, sizeof(struct inotify_event));
-
-	kevent->event.wd = wd;
-	kevent->event.mask = mask;
-	kevent->event.cookie = cookie;
-
-	INIT_LIST_HEAD(&kevent->list);
-
-	if (name) {
-		size_t len, rem, event_size = sizeof(struct inotify_event);
-
-		/*
-		 * We need to pad the filename so as to properly align an
-		 * array of inotify_event structures.  Because the structure is
-		 * small and the common case is a small filename, we just round
-		 * up to the next multiple of the structure's sizeof.  This is
-		 * simple and safe for all architectures.
-		 */
-		len = strlen(name) + 1;
-		rem = event_size - len;
-		if (len > event_size) {
-			rem = event_size - (len % event_size);
-			if (len % event_size == 0)
-				rem = 0;
-		}
-
-		kevent->name = kmalloc(len + rem, GFP_KERNEL);
-		if (unlikely(!kevent->name)) {
-			kmem_cache_free(event_cachep, kevent);
-			return NULL;
-		}
-		memcpy(kevent->name, name, len);
-		if (rem)
-			memset(kevent->name + len, 0, rem);		
-		kevent->event.len = len + rem;
-	} else {
-		kevent->event.len = 0;
-		kevent->name = NULL;
-	}
-
-	return kevent;
-}
-
-/*
- * inotify_dev_get_event - return the next event in the given dev's queue
- *
- * Caller must hold dev->mutex.
- */
-static inline struct inotify_kernel_event *
-inotify_dev_get_event(struct inotify_device *dev)
-{
-	return list_entry(dev->events.next, struct inotify_kernel_event, list);
-}
-
-/*
- * inotify_dev_queue_event - add a new event to the given device
- *
- * Caller must hold dev->mutex.  Can sleep (calls kernel_event()).
- */
-static void inotify_dev_queue_event(struct inotify_device *dev,
-				    struct inotify_watch *watch, u32 mask,
-				    u32 cookie, const char *name)
-{
-	struct inotify_kernel_event *kevent, *last;
-
-	/* coalescing: drop this event if it is a dupe of the previous */
-	last = inotify_dev_get_event(dev);
-	if (last && last->event.mask == mask && last->event.wd == watch->wd &&
-			last->event.cookie == cookie) {
-		const char *lastname = last->name;
-
-		if (!name && !lastname)
-			return;
-		if (name && lastname && !strcmp(lastname, name))
-			return;
-	}
-
-	/* the queue overflowed and we already sent the Q_OVERFLOW event */
-	if (unlikely(dev->event_count > dev->max_events))
-		return;
-
-	/* if the queue overflows, we need to notify user space */
-	if (unlikely(dev->event_count == dev->max_events))
-		kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL);
-	else
-		kevent = kernel_event(watch->wd, mask, cookie, name);
-
-	if (unlikely(!kevent))
-		return;
-
-	/* queue the event and wake up anyone waiting */
-	dev->event_count++;
-	dev->queue_size += sizeof(struct inotify_event) + kevent->event.len;
-	list_add_tail(&kevent->list, &dev->events);
-	wake_up_interruptible(&dev->wq);
-}
-
-/*
- * remove_kevent - cleans up and ultimately frees the given kevent
- *
- * Caller must hold dev->mutex.
- */
-static void remove_kevent(struct inotify_device *dev,
-			  struct inotify_kernel_event *kevent)
-{
-	list_del(&kevent->list);
+		struct inotify_handle *ih = watch->ih;
 
-	dev->event_count--;
-	dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len;
-
-	kfree(kevent->name);
-	kmem_cache_free(event_cachep, kevent);
-}
-
-/*
- * inotify_dev_event_dequeue - destroy an event on the given device
- *
- * Caller must hold dev->mutex.
- */
-static void inotify_dev_event_dequeue(struct inotify_device *dev)
-{
-	if (!list_empty(&dev->events)) {
-		struct inotify_kernel_event *kevent;
-		kevent = inotify_dev_get_event(dev);
-		remove_kevent(dev, kevent);
+		iput(watch->inode);
+		ih->in_ops->destroy_watch(watch);
+		put_inotify_handle(ih);
 	}
 }
+EXPORT_SYMBOL_GPL(put_inotify_watch);
 
 /*
- * inotify_dev_get_wd - returns the next WD for use by the given dev
+ * inotify_handle_get_wd - returns the next WD for use by the given handle
  *
- * Callers must hold dev->mutex.  This function can sleep.
+ * Callers must hold ih->mutex.  This function can sleep.
  */
-static int inotify_dev_get_wd(struct inotify_device *dev,
-			      struct inotify_watch *watch)
+static int inotify_handle_get_wd(struct inotify_handle *ih,
+				 struct inotify_watch *watch)
 {
 	int ret;
 
 	do {
-		if (unlikely(!idr_pre_get(&dev->idr, GFP_KERNEL)))
+		if (unlikely(!idr_pre_get(&ih->idr, GFP_KERNEL)))
 			return -ENOSPC;
-		ret = idr_get_new_above(&dev->idr, watch, dev->last_wd+1, &watch->wd);
+		ret = idr_get_new_above(&ih->idr, watch, ih->last_wd+1, &watch->wd);
 	} while (ret == -EAGAIN);
 
-	return ret;
-}
+	if (likely(!ret))
+		ih->last_wd = watch->wd;
 
-/*
- * find_inode - resolve a user-given path to a specific inode and return a nd
- */
-static int find_inode(const char __user *dirname, struct nameidata *nd,
-		      unsigned flags)
-{
-	int error;
-
-	error = __user_walk(dirname, flags, nd);
-	if (error)
-		return error;
-	/* you can only watch an inode if you have read permissions on it */
-	error = vfs_permission(nd, MAY_READ);
-	if (error) 
-		path_release(nd);
-	return error;
+	return ret;
 }
 
 /*
@@ -422,67 +188,18 @@ static void set_dentry_child_flags(struct inode *inode, int watched)
 }
 
 /*
- * create_watch - creates a watch on the given device.
- *
- * Callers must hold dev->mutex.  Calls inotify_dev_get_wd() so may sleep.
- * Both 'dev' and 'inode' (by way of nameidata) need to be pinned.
- */
-static struct inotify_watch *create_watch(struct inotify_device *dev,
-					  u32 mask, struct inode *inode)
-{
-	struct inotify_watch *watch;
-	int ret;
-
-	if (atomic_read(&dev->user->inotify_watches) >=
-			inotify_max_user_watches)
-		return ERR_PTR(-ENOSPC);
-
-	watch = kmem_cache_alloc(watch_cachep, GFP_KERNEL);
-	if (unlikely(!watch))
-		return ERR_PTR(-ENOMEM);
-
-	ret = inotify_dev_get_wd(dev, watch);
-	if (unlikely(ret)) {
-		kmem_cache_free(watch_cachep, watch);
-		return ERR_PTR(ret);
-	}
-
-	dev->last_wd = watch->wd;
-	watch->mask = mask;
-	atomic_set(&watch->count, 0);
-	INIT_LIST_HEAD(&watch->d_list);
-	INIT_LIST_HEAD(&watch->i_list);
-
-	/* save a reference to device and bump the count to make it official */
-	get_inotify_dev(dev);
-	watch->dev = dev;
-
-	/*
-	 * Save a reference to the inode and bump the ref count to make it
-	 * official.  We hold a reference to nameidata, which makes this safe.
-	 */
-	watch->inode = igrab(inode);
-
-	/* bump our own count, corresponding to our entry in dev->watches */
-	get_inotify_watch(watch);
-
-	atomic_inc(&dev->user->inotify_watches);
-
-	return watch;
-}
-
-/*
- * inotify_find_dev - find the watch associated with the given inode and dev
+ * inotify_find_handle - find the watch associated with the given inode and
+ * handle
  *
  * Callers must hold inode->inotify_mutex.
  */
-static struct inotify_watch *inode_find_dev(struct inode *inode,
-					    struct inotify_device *dev)
+static struct inotify_watch *inode_find_handle(struct inode *inode,
+					       struct inotify_handle *ih)
 {
 	struct inotify_watch *watch;
 
 	list_for_each_entry(watch, &inode->inotify_watches, i_list) {
-		if (watch->dev == dev)
+		if (watch->ih == ih)
 			return watch;
 	}
 
@@ -491,39 +208,34 @@ static struct inotify_watch *inode_find_dev(struct inode *inode,
 
 /*
  * remove_watch_no_event - remove_watch() without the IN_IGNORED event.
+ *
+ * Callers must hold both inode->inotify_mutex and ih->mutex.
  */
 static void remove_watch_no_event(struct inotify_watch *watch,
-				  struct inotify_device *dev)
+				  struct inotify_handle *ih)
 {
 	list_del(&watch->i_list);
-	list_del(&watch->d_list);
+	list_del(&watch->h_list);
 
 	if (!inotify_inode_watched(watch->inode))
 		set_dentry_child_flags(watch->inode, 0);
 
-	atomic_dec(&dev->user->inotify_watches);
-	idr_remove(&dev->idr, watch->wd);
-	put_inotify_watch(watch);
+	idr_remove(&ih->idr, watch->wd);
 }
 
 /*
- * remove_watch - Remove a watch from both the device and the inode.  Sends
- * the IN_IGNORED event to the given device signifying that the inode is no
- * longer watched.
- *
- * Callers must hold both inode->inotify_mutex and dev->mutex.  We drop a
- * reference to the inode before returning.
+ * remove_watch - Remove a watch from both the handle and the inode.  Sends
+ * the IN_IGNORED event signifying that the inode is no longer watched.
  *
- * The inode is not iput() so as to remain atomic.  If the inode needs to be
- * iput(), the call returns one.  Otherwise, it returns zero.
+ * Callers must hold both inode->inotify_mutex and ih->mutex.
  */
-static void remove_watch(struct inotify_watch *watch,struct inotify_device *dev)
+static void remove_watch(struct inotify_watch *watch, struct inotify_handle *ih)
 {
-	inotify_dev_queue_event(dev, watch, IN_IGNORED, 0, NULL);
-	remove_watch_no_event(watch, dev);
+	remove_watch_no_event(watch, ih);
+	ih->in_ops->handle_event(watch, watch->wd, IN_IGNORED, 0, NULL);
 }
 
-/* Kernel API */
+/* Kernel API for producing events */
 
 /*
  * inotify_d_instantiate - instantiate dcache entry for inode
@@ -576,14 +288,12 @@ void inotify_inode_queue_event(struct inode *inode, u32 mask, u32 cookie,
 	list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
 		u32 watch_mask = watch->mask;
 		if (watch_mask & mask) {
-			struct inotify_device *dev = watch->dev;
-			get_inotify_watch(watch);
-			mutex_lock(&dev->mutex);
-			inotify_dev_queue_event(dev, watch, mask, cookie, name);
+			struct inotify_handle *ih= watch->ih;
+			mutex_lock(&ih->mutex);
 			if (watch_mask & IN_ONESHOT)
-				remove_watch_no_event(watch, dev);
-			mutex_unlock(&dev->mutex);
-			put_inotify_watch(watch);
+				remove_watch_no_event(watch, ih);
+			ih->in_ops->handle_event(watch, watch->wd, mask, cookie, name);
+			mutex_unlock(&ih->mutex);
 		}
 	}
 	mutex_unlock(&inode->inotify_mutex);
@@ -694,11 +404,12 @@ void inotify_unmount_inodes(struct list_head *list)
 		mutex_lock(&inode->inotify_mutex);
 		watches = &inode->inotify_watches;
 		list_for_each_entry_safe(watch, next_w, watches, i_list) {
-			struct inotify_device *dev = watch->dev;
-			mutex_lock(&dev->mutex);
-			inotify_dev_queue_event(dev, watch, IN_UNMOUNT,0,NULL);
-			remove_watch(watch, dev);
-			mutex_unlock(&dev->mutex);
+			struct inotify_handle *ih= watch->ih;
+			mutex_lock(&ih->mutex);
+			ih->in_ops->handle_event(watch, watch->wd, IN_UNMOUNT, 0,
+						 NULL);
+			remove_watch(watch, ih);
+			mutex_unlock(&ih->mutex);
 		}
 		mutex_unlock(&inode->inotify_mutex);
 		iput(inode);		
@@ -718,432 +429,240 @@ void inotify_inode_is_dead(struct inode *inode)
 
 	mutex_lock(&inode->inotify_mutex);
 	list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
-		struct inotify_device *dev = watch->dev;
-		mutex_lock(&dev->mutex);
-		remove_watch(watch, dev);
-		mutex_unlock(&dev->mutex);
+		struct inotify_handle *ih = watch->ih;
+		mutex_lock(&ih->mutex);
+		remove_watch(watch, ih);
+		mutex_unlock(&ih->mutex);
 	}
 	mutex_unlock(&inode->inotify_mutex);
 }
 EXPORT_SYMBOL_GPL(inotify_inode_is_dead);
 
-/* Device Interface */
-
-static unsigned int inotify_poll(struct file *file, poll_table *wait)
-{
-	struct inotify_device *dev = file->private_data;
-	int ret = 0;
-
-	poll_wait(file, &dev->wq, wait);
-	mutex_lock(&dev->mutex);
-	if (!list_empty(&dev->events))
-		ret = POLLIN | POLLRDNORM;
-	mutex_unlock(&dev->mutex);
-
-	return ret;
-}
+/* Kernel Consumer API */
 
-static ssize_t inotify_read(struct file *file, char __user *buf,
-			    size_t count, loff_t *pos)
+/**
+ * inotify_init - allocate and initialize an inotify instance
+ * @ops: caller's inotify operations
+ */
+struct inotify_handle *inotify_init(const struct inotify_operations *ops)
 {
-	size_t event_size = sizeof (struct inotify_event);
-	struct inotify_device *dev;
-	char __user *start;
-	int ret;
-	DEFINE_WAIT(wait);
-
-	start = buf;
-	dev = file->private_data;
-
-	while (1) {
-		int events;
-
-		prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
+	struct inotify_handle *ih;
 
-		mutex_lock(&dev->mutex);
-		events = !list_empty(&dev->events);
-		mutex_unlock(&dev->mutex);
-		if (events) {
-			ret = 0;
-			break;
-		}
-
-		if (file->f_flags & O_NONBLOCK) {
-			ret = -EAGAIN;
-			break;
-		}
-
-		if (signal_pending(current)) {
-			ret = -EINTR;
-			break;
-		}
-
-		schedule();
-	}
-
-	finish_wait(&dev->wq, &wait);
-	if (ret)
-		return ret;
-
-	mutex_lock(&dev->mutex);
-	while (1) {
-		struct inotify_kernel_event *kevent;
-
-		ret = buf - start;
-		if (list_empty(&dev->events))
-			break;
-
-		kevent = inotify_dev_get_event(dev);
-		if (event_size + kevent->event.len > count)
-			break;
-
-		if (copy_to_user(buf, &kevent->event, event_size)) {
-			ret = -EFAULT;
-			break;
-		}
-		buf += event_size;
-		count -= event_size;
-
-		if (kevent->name) {
-			if (copy_to_user(buf, kevent->name, kevent->event.len)){
-				ret = -EFAULT;
-				break;
-			}
-			buf += kevent->event.len;
-			count -= kevent->event.len;
-		}
+	ih = kmalloc(sizeof(struct inotify_handle), GFP_KERNEL);
+	if (unlikely(!ih))
+		return ERR_PTR(-ENOMEM);
 
-		remove_kevent(dev, kevent);
-	}
-	mutex_unlock(&dev->mutex);
+	idr_init(&ih->idr);
+	INIT_LIST_HEAD(&ih->watches);
+	mutex_init(&ih->mutex);
+	ih->last_wd = 0;
+	ih->in_ops = ops;
+	atomic_set(&ih->count, 0);
+	get_inotify_handle(ih);
 
-	return ret;
+	return ih;
 }
+EXPORT_SYMBOL_GPL(inotify_init);
 
-static int inotify_release(struct inode *ignored, struct file *file)
+/**
+ * inotify_destroy - clean up and destroy an inotify instance
+ * @ih: inotify handle
+ */
+void inotify_destroy(struct inotify_handle *ih)
 {
-	struct inotify_device *dev = file->private_data;
-
 	/*
-	 * Destroy all of the watches on this device.  Unfortunately, not very
+	 * Destroy all of the watches for this handle. Unfortunately, not very
 	 * pretty.  We cannot do a simple iteration over the list, because we
 	 * do not know the inode until we iterate to the watch.  But we need to
-	 * hold inode->inotify_mutex before dev->mutex.  The following works.
+	 * hold inode->inotify_mutex before ih->mutex.  The following works.
 	 */
 	while (1) {
 		struct inotify_watch *watch;
 		struct list_head *watches;
 		struct inode *inode;
 
-		mutex_lock(&dev->mutex);
-		watches = &dev->watches;
+		mutex_lock(&ih->mutex);
+		watches = &ih->watches;
 		if (list_empty(watches)) {
-			mutex_unlock(&dev->mutex);
+			mutex_unlock(&ih->mutex);
 			break;
 		}
-		watch = list_entry(watches->next, struct inotify_watch, d_list);
+		watch = list_entry(watches->next, struct inotify_watch, h_list);
 		get_inotify_watch(watch);
-		mutex_unlock(&dev->mutex);
+		mutex_unlock(&ih->mutex);
 
 		inode = watch->inode;
 		mutex_lock(&inode->inotify_mutex);
-		mutex_lock(&dev->mutex);
+		mutex_lock(&ih->mutex);
 
 		/* make sure we didn't race with another list removal */
-		if (likely(idr_find(&dev->idr, watch->wd)))
-			remove_watch_no_event(watch, dev);
+		if (likely(idr_find(&ih->idr, watch->wd))) {
+			remove_watch_no_event(watch, ih);
+			put_inotify_watch(watch);
+		}
 
-		mutex_unlock(&dev->mutex);
+		mutex_unlock(&ih->mutex);
 		mutex_unlock(&inode->inotify_mutex);
 		put_inotify_watch(watch);
 	}
 
-	/* destroy all of the events on this device */
-	mutex_lock(&dev->mutex);
-	while (!list_empty(&dev->events))
-		inotify_dev_event_dequeue(dev);
-	mutex_unlock(&dev->mutex);
-
-	/* free this device: the put matching the get in inotify_init() */
-	put_inotify_dev(dev);
-
-	return 0;
+	/* free this handle: the put matching the get in inotify_init() */
+	put_inotify_handle(ih);
 }
+EXPORT_SYMBOL_GPL(inotify_destroy);
 
-/*
- * inotify_ignore - remove a given wd from this inotify instance.
+/**
+ * inotify_find_update_watch - find and update the mask of an existing watch
+ * @ih: inotify handle
+ * @inode: inode's watch to update
+ * @mask: mask of events to watch
  *
- * Can sleep.
+ * Caller must pin given inode (via nameidata).
  */
-static int inotify_ignore(struct inotify_device *dev, s32 wd)
+s32 inotify_find_update_watch(struct inotify_handle *ih, struct inode *inode,
+			      u32 mask)
 {
-	struct inotify_watch *watch;
-	struct inode *inode;
+	struct inotify_watch *old;
+	int mask_add = 0;
+	int ret;
 
-	mutex_lock(&dev->mutex);
-	watch = idr_find(&dev->idr, wd);
-	if (unlikely(!watch)) {
-		mutex_unlock(&dev->mutex);
+	if (mask & IN_MASK_ADD)
+		mask_add = 1;
+
+	/* don't allow invalid bits: we don't want flags set */
+	mask &= IN_ALL_EVENTS | IN_ONESHOT;
+	if (unlikely(!mask))
 		return -EINVAL;
-	}
-	get_inotify_watch(watch);
-	inode = watch->inode;
-	mutex_unlock(&dev->mutex);
 
 	mutex_lock(&inode->inotify_mutex);
-	mutex_lock(&dev->mutex);
-
-	/* make sure that we did not race */
-	if (likely(idr_find(&dev->idr, wd) == watch))
-		remove_watch(watch, dev);
-
-	mutex_unlock(&dev->mutex);
-	mutex_unlock(&inode->inotify_mutex);
-	put_inotify_watch(watch);
-
-	return 0;
-}
-
-static long inotify_ioctl(struct file *file, unsigned int cmd,
-			  unsigned long arg)
-{
-	struct inotify_device *dev;
-	void __user *p;
-	int ret = -ENOTTY;
+	mutex_lock(&ih->mutex);
 
-	dev = file->private_data;
-	p = (void __user *) arg;
-
-	switch (cmd) {
-	case FIONREAD:
-		ret = put_user(dev->queue_size, (int __user *) p);
-		break;
-	}
-
-	return ret;
-}
-
-static const struct file_operations inotify_fops = {
-	.poll           = inotify_poll,
-	.read           = inotify_read,
-	.release        = inotify_release,
-	.unlocked_ioctl = inotify_ioctl,
-	.compat_ioctl	= inotify_ioctl,
-};
-
-asmlinkage long sys_inotify_init(void)
-{
-	struct inotify_device *dev;
-	struct user_struct *user;
-	struct file *filp;	
-	int fd, ret;
-
-	fd = get_unused_fd();
-	if (fd < 0)
-		return fd;
-
-	filp = get_empty_filp();
-	if (!filp) {
-		ret = -ENFILE;
-		goto out_put_fd;
-	}
-
-	user = get_uid(current->user);
-	if (unlikely(atomic_read(&user->inotify_devs) >=
-			inotify_max_user_instances)) {
-		ret = -EMFILE;
-		goto out_free_uid;
-	}
-
-	dev = kmalloc(sizeof(struct inotify_device), GFP_KERNEL);
-	if (unlikely(!dev)) {
-		ret = -ENOMEM;
-		goto out_free_uid;
+	/*
+	 * Handle the case of re-adding a watch on an (inode,ih) pair that we
+	 * are already watching.  We just update the mask and return its wd.
+	 */
+	old = inode_find_handle(inode, ih);
+	if (unlikely(!old)) {
+		ret = -ENOENT;
+		goto out;
 	}
 
-	filp->f_op = &inotify_fops;
-	filp->f_vfsmnt = mntget(inotify_mnt);
-	filp->f_dentry = dget(inotify_mnt->mnt_root);
-	filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
-	filp->f_mode = FMODE_READ;
-	filp->f_flags = O_RDONLY;
-	filp->private_data = dev;
-
-	idr_init(&dev->idr);
-	INIT_LIST_HEAD(&dev->events);
-	INIT_LIST_HEAD(&dev->watches);
-	init_waitqueue_head(&dev->wq);
-	mutex_init(&dev->mutex);
-	dev->event_count = 0;
-	dev->queue_size = 0;
-	dev->max_events = inotify_max_queued_events;
-	dev->user = user;
-	dev->last_wd = 0;
-	atomic_set(&dev->count, 0);
-
-	get_inotify_dev(dev);
-	atomic_inc(&user->inotify_devs);
-	fd_install(fd, filp);
-
-	return fd;
-out_free_uid:
-	free_uid(user);
-	put_filp(filp);
-out_put_fd:
-	put_unused_fd(fd);
+	if (mask_add)
+		old->mask |= mask;
+	else
+		old->mask = mask;
+	ret = old->wd;
+out:
+	mutex_unlock(&ih->mutex);
+	mutex_unlock(&inode->inotify_mutex);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(inotify_find_update_watch);
 
-asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
+/**
+ * inotify_add_watch - add a watch to an inotify instance
+ * @ih: inotify handle
+ * @watch: caller allocated watch structure
+ * @inode: inode to watch
+ * @mask: mask of events to watch
+ *
+ * Caller must pin given inode (via nameidata).
+ * Caller must ensure it only calls inotify_add_watch() once per watch.
+ * Calls inotify_handle_get_wd() so may sleep.
+ */
+s32 inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch,
+		      struct inode *inode, u32 mask)
 {
-	struct inotify_watch *watch, *old;
-	struct inode *inode;
-	struct inotify_device *dev;
-	struct nameidata nd;
-	struct file *filp;
-	int ret, fput_needed;
-	int mask_add = 0;
-	unsigned flags = 0;
-
-	filp = fget_light(fd, &fput_needed);
-	if (unlikely(!filp))
-		return -EBADF;
+	int ret = 0;
 
-	/* verify that this is indeed an inotify instance */
-	if (unlikely(filp->f_op != &inotify_fops)) {
-		ret = -EINVAL;
-		goto fput_and_out;
-	}
+	/* don't allow invalid bits: we don't want flags set */
+	mask &= IN_ALL_EVENTS | IN_ONESHOT;
+	if (unlikely(!mask))
+		return -EINVAL;
+	watch->mask = mask;
 
-	if (!(mask & IN_DONT_FOLLOW))
-		flags |= LOOKUP_FOLLOW;
-	if (mask & IN_ONLYDIR)
-		flags |= LOOKUP_DIRECTORY;
+	mutex_lock(&inode->inotify_mutex);
+	mutex_lock(&ih->mutex);
 
-	ret = find_inode(path, &nd, flags);
+	/* Initialize a new watch */
+	ret = inotify_handle_get_wd(ih, watch);
 	if (unlikely(ret))
-		goto fput_and_out;
-
-	/* inode held in place by reference to nd; dev by fget on fd */
-	inode = nd.dentry->d_inode;
-	dev = filp->private_data;
-
-	mutex_lock(&inode->inotify_mutex);
-	mutex_lock(&dev->mutex);
+		goto out;
+	ret = watch->wd;
 
-	if (mask & IN_MASK_ADD)
-		mask_add = 1;
+	atomic_set(&watch->count, 0);
+	INIT_LIST_HEAD(&watch->h_list);
+	INIT_LIST_HEAD(&watch->i_list);
 
-	/* don't let user-space set invalid bits: we don't want flags set */
-	mask &= IN_ALL_EVENTS | IN_ONESHOT;
-	if (unlikely(!mask)) {
-		ret = -EINVAL;
-		goto out;
-	}
+	/* save a reference to handle and bump the count to make it official */
+	get_inotify_handle(ih);
+	watch->ih = ih;
 
 	/*
-	 * Handle the case of re-adding a watch on an (inode,dev) pair that we
-	 * are already watching.  We just update the mask and return its wd.
+	 * Save a reference to the inode and bump the ref count to make it
+	 * official.  We hold a reference to nameidata, which makes this safe.
 	 */
-	old = inode_find_dev(inode, dev);
-	if (unlikely(old)) {
-		if (mask_add)
-			old->mask |= mask;
-		else
-			old->mask = mask;
-		ret = old->wd;
-		goto out;
-	}
+	watch->inode = igrab(inode);
 
-	watch = create_watch(dev, mask, inode);
-	if (unlikely(IS_ERR(watch))) {
-		ret = PTR_ERR(watch);
-		goto out;
-	}
+	get_inotify_watch(watch); /* initial get */
 
 	if (!inotify_inode_watched(inode))
 		set_dentry_child_flags(inode, 1);
 
-	/* Add the watch to the device's and the inode's list */
-	list_add(&watch->d_list, &dev->watches);
+	/* Add the watch to the handle's and the inode's list */
+	list_add(&watch->h_list, &ih->watches);
 	list_add(&watch->i_list, &inode->inotify_watches);
-	ret = watch->wd;
 out:
-	mutex_unlock(&dev->mutex);
+	mutex_unlock(&ih->mutex);
 	mutex_unlock(&inode->inotify_mutex);
-	path_release(&nd);
-fput_and_out:
-	fput_light(filp, fput_needed);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(inotify_add_watch);
 
-asmlinkage long sys_inotify_rm_watch(int fd, u32 wd)
+/**
+ * inotify_rm_wd - remove a watch from an inotify instance
+ * @ih: inotify handle
+ * @wd: watch descriptor to remove
+ *
+ * Can sleep.
+ */
+int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
 {
-	struct file *filp;
-	struct inotify_device *dev;
-	int ret, fput_needed;
-
-	filp = fget_light(fd, &fput_needed);
-	if (unlikely(!filp))
-		return -EBADF;
+	struct inotify_watch *watch;
+	struct inode *inode;
 
-	/* verify that this is indeed an inotify instance */
-	if (unlikely(filp->f_op != &inotify_fops)) {
-		ret = -EINVAL;
-		goto out;
+	mutex_lock(&ih->mutex);
+	watch = idr_find(&ih->idr, wd);
+	if (unlikely(!watch)) {
+		mutex_unlock(&ih->mutex);
+		return -EINVAL;
 	}
+	get_inotify_watch(watch);
+	inode = watch->inode;
+	mutex_unlock(&ih->mutex);
 
-	dev = filp->private_data;
-	ret = inotify_ignore(dev, wd);
+	mutex_lock(&inode->inotify_mutex);
+	mutex_lock(&ih->mutex);
 
-out:
-	fput_light(filp, fput_needed);
-	return ret;
-}
+	/* make sure that we did not race */
+	if (likely(idr_find(&ih->idr, wd) == watch))
+		remove_watch(watch, ih);
 
-static struct super_block *
-inotify_get_sb(struct file_system_type *fs_type, int flags,
-	       const char *dev_name, void *data)
-{
-    return get_sb_pseudo(fs_type, "inotify", NULL, 0xBAD1DEA);
-}
+	mutex_unlock(&ih->mutex);
+	mutex_unlock(&inode->inotify_mutex);
+	put_inotify_watch(watch);
 
-static struct file_system_type inotify_fs_type = {
-    .name           = "inotifyfs",
-    .get_sb         = inotify_get_sb,
-    .kill_sb        = kill_anon_super,
-};
+	return 0;
+}
+EXPORT_SYMBOL_GPL(inotify_rm_wd);
 
 /*
- * inotify_setup - Our initialization function.  Note that we cannnot return
- * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
- * must result in panic().
+ * inotify_setup - core initialization function
  */
 static int __init inotify_setup(void)
 {
-	int ret;
-
-	ret = register_filesystem(&inotify_fs_type);
-	if (unlikely(ret))
-		panic("inotify: register_filesystem returned %d!\n", ret);
-
-	inotify_mnt = kern_mount(&inotify_fs_type);
-	if (IS_ERR(inotify_mnt))
-		panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt));
-
-	inotify_max_queued_events = 16384;
-	inotify_max_user_instances = 128;
-	inotify_max_user_watches = 8192;
-
 	atomic_set(&inotify_cookie, 0);
 
-	watch_cachep = kmem_cache_create("inotify_watch_cache",
-					 sizeof(struct inotify_watch),
-					 0, SLAB_PANIC, NULL, NULL);
-	event_cachep = kmem_cache_create("inotify_event_cache",
-					 sizeof(struct inotify_kernel_event),
-					 0, SLAB_PANIC, NULL, NULL);
-
 	return 0;
 }
 
diff --git a/fs/inotify_user.c b/fs/inotify_user.c
new file mode 100644
index 00000000000..845dc79a4e9
--- /dev/null
+++ b/fs/inotify_user.c
@@ -0,0 +1,717 @@
+/*
+ * fs/inotify_user.c - inotify support for userspace
+ *
+ * Authors:
+ *	John McCutchan	<ttb@tentacle.dhs.org>
+ *	Robert Love	<rml@novell.com>
+ *
+ * Copyright (C) 2005 John McCutchan
+ * Copyright 2006 Hewlett-Packard Development Company, L.P.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/inotify.h>
+#include <linux/syscalls.h>
+
+#include <asm/ioctls.h>
+
+static kmem_cache_t *watch_cachep __read_mostly;
+static kmem_cache_t *event_cachep __read_mostly;
+
+static struct vfsmount *inotify_mnt __read_mostly;
+
+/* these are configurable via /proc/sys/fs/inotify/ */
+int inotify_max_user_instances __read_mostly;
+int inotify_max_user_watches __read_mostly;
+int inotify_max_queued_events __read_mostly;
+
+/*
+ * Lock ordering:
+ *
+ * inotify_dev->up_mutex (ensures we don't re-add the same watch)
+ * 	inode->inotify_mutex (protects inode's watch list)
+ * 		inotify_handle->mutex (protects inotify_handle's watch list)
+ * 			inotify_dev->ev_mutex (protects device's event queue)
+ */
+
+/*
+ * Lifetimes of the main data structures:
+ *
+ * inotify_device: Lifetime is managed by reference count, from
+ * sys_inotify_init() until release.  Additional references can bump the count
+ * via get_inotify_dev() and drop the count via put_inotify_dev().
+ *
+ * inotify_user_watch: Lifetime is from create_watch() to the receipt of an
+ * IN_IGNORED event from inotify, or when using IN_ONESHOT, to receipt of the
+ * first event, or to inotify_destroy().
+ */
+
+/*
+ * struct inotify_device - represents an inotify instance
+ *
+ * This structure is protected by the mutex 'mutex'.
+ */
+struct inotify_device {
+	wait_queue_head_t 	wq;		/* wait queue for i/o */
+	struct mutex		ev_mutex;	/* protects event queue */
+	struct mutex		up_mutex;	/* synchronizes watch updates */
+	struct list_head 	events;		/* list of queued events */
+	atomic_t		count;		/* reference count */
+	struct user_struct	*user;		/* user who opened this dev */
+	struct inotify_handle	*ih;		/* inotify handle */
+	unsigned int		queue_size;	/* size of the queue (bytes) */
+	unsigned int		event_count;	/* number of pending events */
+	unsigned int		max_events;	/* maximum number of events */
+};
+
+/*
+ * struct inotify_kernel_event - An inotify event, originating from a watch and
+ * queued for user-space.  A list of these is attached to each instance of the
+ * device.  In read(), this list is walked and all events that can fit in the
+ * buffer are returned.
+ *
+ * Protected by dev->ev_mutex of the device in which we are queued.
+ */
+struct inotify_kernel_event {
+	struct inotify_event	event;	/* the user-space event */
+	struct list_head        list;	/* entry in inotify_device's list */
+	char			*name;	/* filename, if any */
+};
+
+/*
+ * struct inotify_user_watch - our version of an inotify_watch, we add
+ * a reference to the associated inotify_device.
+ */
+struct inotify_user_watch {
+	struct inotify_device	*dev;	/* associated device */
+	struct inotify_watch	wdata;	/* inotify watch data */
+};
+
+#ifdef CONFIG_SYSCTL
+
+#include <linux/sysctl.h>
+
+static int zero;
+
+ctl_table inotify_table[] = {
+	{
+		.ctl_name	= INOTIFY_MAX_USER_INSTANCES,
+		.procname	= "max_user_instances",
+		.data		= &inotify_max_user_instances,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
+	{
+		.ctl_name	= INOTIFY_MAX_USER_WATCHES,
+		.procname	= "max_user_watches",
+		.data		= &inotify_max_user_watches,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
+	{
+		.ctl_name	= INOTIFY_MAX_QUEUED_EVENTS,
+		.procname	= "max_queued_events",
+		.data		= &inotify_max_queued_events,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero
+	},
+	{ .ctl_name = 0 }
+};
+#endif /* CONFIG_SYSCTL */
+
+static inline void get_inotify_dev(struct inotify_device *dev)
+{
+	atomic_inc(&dev->count);
+}
+
+static inline void put_inotify_dev(struct inotify_device *dev)
+{
+	if (atomic_dec_and_test(&dev->count)) {
+		atomic_dec(&dev->user->inotify_devs);
+		free_uid(dev->user);
+		kfree(dev);
+	}
+}
+
+/*
+ * free_inotify_user_watch - cleans up the watch and its references
+ */
+static void free_inotify_user_watch(struct inotify_watch *w)
+{
+	struct inotify_user_watch *watch;
+	struct inotify_device *dev;
+
+	watch = container_of(w, struct inotify_user_watch, wdata);
+	dev = watch->dev;
+
+	atomic_dec(&dev->user->inotify_watches);
+	put_inotify_dev(dev);
+	kmem_cache_free(watch_cachep, watch);
+}
+
+/*
+ * kernel_event - create a new kernel event with the given parameters
+ *
+ * This function can sleep.
+ */
+static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie,
+						  const char *name)
+{
+	struct inotify_kernel_event *kevent;
+
+	kevent = kmem_cache_alloc(event_cachep, GFP_KERNEL);
+	if (unlikely(!kevent))
+		return NULL;
+
+	/* we hand this out to user-space, so zero it just in case */
+	memset(&kevent->event, 0, sizeof(struct inotify_event));
+
+	kevent->event.wd = wd;
+	kevent->event.mask = mask;
+	kevent->event.cookie = cookie;
+
+	INIT_LIST_HEAD(&kevent->list);
+
+	if (name) {
+		size_t len, rem, event_size = sizeof(struct inotify_event);
+
+		/*
+		 * We need to pad the filename so as to properly align an
+		 * array of inotify_event structures.  Because the structure is
+		 * small and the common case is a small filename, we just round
+		 * up to the next multiple of the structure's sizeof.  This is
+		 * simple and safe for all architectures.
+		 */
+		len = strlen(name) + 1;
+		rem = event_size - len;
+		if (len > event_size) {
+			rem = event_size - (len % event_size);
+			if (len % event_size == 0)
+				rem = 0;
+		}
+
+		kevent->name = kmalloc(len + rem, GFP_KERNEL);
+		if (unlikely(!kevent->name)) {
+			kmem_cache_free(event_cachep, kevent);
+			return NULL;
+		}
+		memcpy(kevent->name, name, len);
+		if (rem)
+			memset(kevent->name + len, 0, rem);
+		kevent->event.len = len + rem;
+	} else {
+		kevent->event.len = 0;
+		kevent->name = NULL;
+	}
+
+	return kevent;
+}
+
+/*
+ * inotify_dev_get_event - return the next event in the given dev's queue
+ *
+ * Caller must hold dev->ev_mutex.
+ */
+static inline struct inotify_kernel_event *
+inotify_dev_get_event(struct inotify_device *dev)
+{
+	return list_entry(dev->events.next, struct inotify_kernel_event, list);
+}
+
+/*
+ * inotify_dev_queue_event - event handler registered with core inotify, adds
+ * a new event to the given device
+ *
+ * Can sleep (calls kernel_event()).
+ */
+static void inotify_dev_queue_event(struct inotify_watch *w, u32 wd, u32 mask,
+				    u32 cookie, const char *name)
+{
+	struct inotify_user_watch *watch;
+	struct inotify_device *dev;
+	struct inotify_kernel_event *kevent, *last;
+
+	watch = container_of(w, struct inotify_user_watch, wdata);
+	dev = watch->dev;
+
+	mutex_lock(&dev->ev_mutex);
+
+	/* we can safely put the watch as we don't reference it while
+	 * generating the event
+	 */
+	if (mask & IN_IGNORED || mask & IN_ONESHOT)
+		put_inotify_watch(w); /* final put */
+
+	/* coalescing: drop this event if it is a dupe of the previous */
+	last = inotify_dev_get_event(dev);
+	if (last && last->event.mask == mask && last->event.wd == wd &&
+			last->event.cookie == cookie) {
+		const char *lastname = last->name;
+
+		if (!name && !lastname)
+			goto out;
+		if (name && lastname && !strcmp(lastname, name))
+			goto out;
+	}
+
+	/* the queue overflowed and we already sent the Q_OVERFLOW event */
+	if (unlikely(dev->event_count > dev->max_events))
+		goto out;
+
+	/* if the queue overflows, we need to notify user space */
+	if (unlikely(dev->event_count == dev->max_events))
+		kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL);
+	else
+		kevent = kernel_event(wd, mask, cookie, name);
+
+	if (unlikely(!kevent))
+		goto out;
+
+	/* queue the event and wake up anyone waiting */
+	dev->event_count++;
+	dev->queue_size += sizeof(struct inotify_event) + kevent->event.len;
+	list_add_tail(&kevent->list, &dev->events);
+	wake_up_interruptible(&dev->wq);
+
+out:
+	mutex_unlock(&dev->ev_mutex);
+}
+
+/*
+ * remove_kevent - cleans up and ultimately frees the given kevent
+ *
+ * Caller must hold dev->ev_mutex.
+ */
+static void remove_kevent(struct inotify_device *dev,
+			  struct inotify_kernel_event *kevent)
+{
+	list_del(&kevent->list);
+
+	dev->event_count--;
+	dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len;
+
+	kfree(kevent->name);
+	kmem_cache_free(event_cachep, kevent);
+}
+
+/*
+ * inotify_dev_event_dequeue - destroy an event on the given device
+ *
+ * Caller must hold dev->ev_mutex.
+ */
+static void inotify_dev_event_dequeue(struct inotify_device *dev)
+{
+	if (!list_empty(&dev->events)) {
+		struct inotify_kernel_event *kevent;
+		kevent = inotify_dev_get_event(dev);
+		remove_kevent(dev, kevent);
+	}
+}
+
+/*
+ * find_inode - resolve a user-given path to a specific inode and return a nd
+ */
+static int find_inode(const char __user *dirname, struct nameidata *nd,
+		      unsigned flags)
+{
+	int error;
+
+	error = __user_walk(dirname, flags, nd);
+	if (error)
+		return error;
+	/* you can only watch an inode if you have read permissions on it */
+	error = vfs_permission(nd, MAY_READ);
+	if (error)
+		path_release(nd);
+	return error;
+}
+
+/*
+ * create_watch - creates a watch on the given device.
+ *
+ * Callers must hold dev->up_mutex.
+ */
+static int create_watch(struct inotify_device *dev, struct inode *inode,
+			u32 mask)
+{
+	struct inotify_user_watch *watch;
+	int ret;
+
+	if (atomic_read(&dev->user->inotify_watches) >=
+			inotify_max_user_watches)
+		return -ENOSPC;
+
+	watch = kmem_cache_alloc(watch_cachep, GFP_KERNEL);
+	if (unlikely(!watch))
+		return -ENOMEM;
+
+	/* save a reference to device and bump the count to make it official */
+	get_inotify_dev(dev);
+	watch->dev = dev;
+
+	atomic_inc(&dev->user->inotify_watches);
+
+	ret = inotify_add_watch(dev->ih, &watch->wdata, inode, mask);
+	if (ret < 0)
+		free_inotify_user_watch(&watch->wdata);
+
+	return ret;
+}
+
+/* Device Interface */
+
+static unsigned int inotify_poll(struct file *file, poll_table *wait)
+{
+	struct inotify_device *dev = file->private_data;
+	int ret = 0;
+
+	poll_wait(file, &dev->wq, wait);
+	mutex_lock(&dev->ev_mutex);
+	if (!list_empty(&dev->events))
+		ret = POLLIN | POLLRDNORM;
+	mutex_unlock(&dev->ev_mutex);
+
+	return ret;
+}
+
+static ssize_t inotify_read(struct file *file, char __user *buf,
+			    size_t count, loff_t *pos)
+{
+	size_t event_size = sizeof (struct inotify_event);
+	struct inotify_device *dev;
+	char __user *start;
+	int ret;
+	DEFINE_WAIT(wait);
+
+	start = buf;
+	dev = file->private_data;
+
+	while (1) {
+		int events;
+
+		prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
+
+		mutex_lock(&dev->ev_mutex);
+		events = !list_empty(&dev->events);
+		mutex_unlock(&dev->ev_mutex);
+		if (events) {
+			ret = 0;
+			break;
+		}
+
+		if (file->f_flags & O_NONBLOCK) {
+			ret = -EAGAIN;
+			break;
+		}
+
+		if (signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
+
+		schedule();
+	}
+
+	finish_wait(&dev->wq, &wait);
+	if (ret)
+		return ret;
+
+	mutex_lock(&dev->ev_mutex);
+	while (1) {
+		struct inotify_kernel_event *kevent;
+
+		ret = buf - start;
+		if (list_empty(&dev->events))
+			break;
+
+		kevent = inotify_dev_get_event(dev);
+		if (event_size + kevent->event.len > count)
+			break;
+
+		if (copy_to_user(buf, &kevent->event, event_size)) {
+			ret = -EFAULT;
+			break;
+		}
+		buf += event_size;
+		count -= event_size;
+
+		if (kevent->name) {
+			if (copy_to_user(buf, kevent->name, kevent->event.len)){
+				ret = -EFAULT;
+				break;
+			}
+			buf += kevent->event.len;
+			count -= kevent->event.len;
+		}
+
+		remove_kevent(dev, kevent);
+	}
+	mutex_unlock(&dev->ev_mutex);
+
+	return ret;
+}
+
+static int inotify_release(struct inode *ignored, struct file *file)
+{
+	struct inotify_device *dev = file->private_data;
+
+	inotify_destroy(dev->ih);
+
+	/* destroy all of the events on this device */
+	mutex_lock(&dev->ev_mutex);
+	while (!list_empty(&dev->events))
+		inotify_dev_event_dequeue(dev);
+	mutex_unlock(&dev->ev_mutex);
+
+	/* free this device: the put matching the get in inotify_init() */
+	put_inotify_dev(dev);
+
+	return 0;
+}
+
+static long inotify_ioctl(struct file *file, unsigned int cmd,
+			  unsigned long arg)
+{
+	struct inotify_device *dev;
+	void __user *p;
+	int ret = -ENOTTY;
+
+	dev = file->private_data;
+	p = (void __user *) arg;
+
+	switch (cmd) {
+	case FIONREAD:
+		ret = put_user(dev->queue_size, (int __user *) p);
+		break;
+	}
+
+	return ret;
+}
+
+static const struct file_operations inotify_fops = {
+	.poll           = inotify_poll,
+	.read           = inotify_read,
+	.release        = inotify_release,
+	.unlocked_ioctl = inotify_ioctl,
+	.compat_ioctl	= inotify_ioctl,
+};
+
+static const struct inotify_operations inotify_user_ops = {
+	.handle_event	= inotify_dev_queue_event,
+	.destroy_watch	= free_inotify_user_watch,
+};
+
+asmlinkage long sys_inotify_init(void)
+{
+	struct inotify_device *dev;
+	struct inotify_handle *ih;
+	struct user_struct *user;
+	struct file *filp;
+	int fd, ret;
+
+	fd = get_unused_fd();
+	if (fd < 0)
+		return fd;
+
+	filp = get_empty_filp();
+	if (!filp) {
+		ret = -ENFILE;
+		goto out_put_fd;
+	}
+
+	user = get_uid(current->user);
+	if (unlikely(atomic_read(&user->inotify_devs) >=
+			inotify_max_user_instances)) {
+		ret = -EMFILE;
+		goto out_free_uid;
+	}
+
+	dev = kmalloc(sizeof(struct inotify_device), GFP_KERNEL);
+	if (unlikely(!dev)) {
+		ret = -ENOMEM;
+		goto out_free_uid;
+	}
+
+	ih = inotify_init(&inotify_user_ops);
+	if (unlikely(IS_ERR(ih))) {
+		ret = PTR_ERR(ih);
+		goto out_free_dev;
+	}
+	dev->ih = ih;
+
+	filp->f_op = &inotify_fops;
+	filp->f_vfsmnt = mntget(inotify_mnt);
+	filp->f_dentry = dget(inotify_mnt->mnt_root);
+	filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
+	filp->f_mode = FMODE_READ;
+	filp->f_flags = O_RDONLY;
+	filp->private_data = dev;
+
+	INIT_LIST_HEAD(&dev->events);
+	init_waitqueue_head(&dev->wq);
+	mutex_init(&dev->ev_mutex);
+	mutex_init(&dev->up_mutex);
+	dev->event_count = 0;
+	dev->queue_size = 0;
+	dev->max_events = inotify_max_queued_events;
+	dev->user = user;
+	atomic_set(&dev->count, 0);
+
+	get_inotify_dev(dev);
+	atomic_inc(&user->inotify_devs);
+	fd_install(fd, filp);
+
+	return fd;
+out_free_dev:
+	kfree(dev);
+out_free_uid:
+	free_uid(user);
+	put_filp(filp);
+out_put_fd:
+	put_unused_fd(fd);
+	return ret;
+}
+
+asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
+{
+	struct inode *inode;
+	struct inotify_device *dev;
+	struct nameidata nd;
+	struct file *filp;
+	int ret, fput_needed;
+	unsigned flags = 0;
+
+	filp = fget_light(fd, &fput_needed);
+	if (unlikely(!filp))
+		return -EBADF;
+
+	/* verify that this is indeed an inotify instance */
+	if (unlikely(filp->f_op != &inotify_fops)) {
+		ret = -EINVAL;
+		goto fput_and_out;
+	}
+
+	if (!(mask & IN_DONT_FOLLOW))
+		flags |= LOOKUP_FOLLOW;
+	if (mask & IN_ONLYDIR)
+		flags |= LOOKUP_DIRECTORY;
+
+	ret = find_inode(path, &nd, flags);
+	if (unlikely(ret))
+		goto fput_and_out;
+
+	/* inode held in place by reference to nd; dev by fget on fd */
+	inode = nd.dentry->d_inode;
+	dev = filp->private_data;
+
+	mutex_lock(&dev->up_mutex);
+	ret = inotify_find_update_watch(dev->ih, inode, mask);
+	if (ret == -ENOENT)
+		ret = create_watch(dev, inode, mask);
+	mutex_unlock(&dev->up_mutex);
+
+	path_release(&nd);
+fput_and_out:
+	fput_light(filp, fput_needed);
+	return ret;
+}
+
+asmlinkage long sys_inotify_rm_watch(int fd, u32 wd)
+{
+	struct file *filp;
+	struct inotify_device *dev;
+	int ret, fput_needed;
+
+	filp = fget_light(fd, &fput_needed);
+	if (unlikely(!filp))
+		return -EBADF;
+
+	/* verify that this is indeed an inotify instance */
+	if (unlikely(filp->f_op != &inotify_fops)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	dev = filp->private_data;
+
+	/* we free our watch data when we get IN_IGNORED */
+	ret = inotify_rm_wd(dev->ih, wd);
+
+out:
+	fput_light(filp, fput_needed);
+	return ret;
+}
+
+static struct super_block *
+inotify_get_sb(struct file_system_type *fs_type, int flags,
+	       const char *dev_name, void *data)
+{
+    return get_sb_pseudo(fs_type, "inotify", NULL, 0xBAD1DEA);
+}
+
+static struct file_system_type inotify_fs_type = {
+    .name           = "inotifyfs",
+    .get_sb         = inotify_get_sb,
+    .kill_sb        = kill_anon_super,
+};
+
+/*
+ * inotify_user_setup - Our initialization function.  Note that we cannnot return
+ * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
+ * must result in panic().
+ */
+static int __init inotify_user_setup(void)
+{
+	int ret;
+
+	ret = register_filesystem(&inotify_fs_type);
+	if (unlikely(ret))
+		panic("inotify: register_filesystem returned %d!\n", ret);
+
+	inotify_mnt = kern_mount(&inotify_fs_type);
+	if (IS_ERR(inotify_mnt))
+		panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt));
+
+	inotify_max_queued_events = 16384;
+	inotify_max_user_instances = 128;
+	inotify_max_user_watches = 8192;
+
+	watch_cachep = kmem_cache_create("inotify_watch_cache",
+					 sizeof(struct inotify_user_watch),
+					 0, SLAB_PANIC, NULL, NULL);
+	event_cachep = kmem_cache_create("inotify_event_cache",
+					 sizeof(struct inotify_kernel_event),
+					 0, SLAB_PANIC, NULL, NULL);
+
+	return 0;
+}
+
+module_init(inotify_user_setup);
-- 
cgit v1.2.3


From 7c29772288b7026504cfe75bfd90d40fbd1574bf Mon Sep 17 00:00:00 2001
From: Amy Griffis <amy.griffis@hp.com>
Date: Thu, 1 Jun 2006 13:11:01 -0700
Subject: [PATCH] inotify (2/5): add name's inode to event handler

When an inotify event includes a dentry name, also include the inode
associated with that name.

Signed-off-by: Amy Griffis <amy.griffis@hp.com>
Acked-by: Robert Love <rml@novell.com>
Acked-by: John McCutchan <john@johnmccutchan.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inotify.c      | 13 ++++++++-----
 fs/inotify_user.c |  3 ++-
 2 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/inotify.c b/fs/inotify.c
index a1bedf3975c..f25c21801fd 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -232,7 +232,7 @@ static void remove_watch_no_event(struct inotify_watch *watch,
 static void remove_watch(struct inotify_watch *watch, struct inotify_handle *ih)
 {
 	remove_watch_no_event(watch, ih);
-	ih->in_ops->handle_event(watch, watch->wd, IN_IGNORED, 0, NULL);
+	ih->in_ops->handle_event(watch, watch->wd, IN_IGNORED, 0, NULL, NULL);
 }
 
 /* Kernel API for producing events */
@@ -275,9 +275,10 @@ void inotify_d_move(struct dentry *entry)
  * @mask: event mask describing this event
  * @cookie: cookie for synchronization, or zero
  * @name: filename, if any
+ * @n_inode: inode associated with name
  */
 void inotify_inode_queue_event(struct inode *inode, u32 mask, u32 cookie,
-			       const char *name)
+			       const char *name, struct inode *n_inode)
 {
 	struct inotify_watch *watch, *next;
 
@@ -292,7 +293,8 @@ void inotify_inode_queue_event(struct inode *inode, u32 mask, u32 cookie,
 			mutex_lock(&ih->mutex);
 			if (watch_mask & IN_ONESHOT)
 				remove_watch_no_event(watch, ih);
-			ih->in_ops->handle_event(watch, watch->wd, mask, cookie, name);
+			ih->in_ops->handle_event(watch, watch->wd, mask, cookie,
+						 name, n_inode);
 			mutex_unlock(&ih->mutex);
 		}
 	}
@@ -323,7 +325,8 @@ void inotify_dentry_parent_queue_event(struct dentry *dentry, u32 mask,
 	if (inotify_inode_watched(inode)) {
 		dget(parent);
 		spin_unlock(&dentry->d_lock);
-		inotify_inode_queue_event(inode, mask, cookie, name);
+		inotify_inode_queue_event(inode, mask, cookie, name,
+					  dentry->d_inode);
 		dput(parent);
 	} else
 		spin_unlock(&dentry->d_lock);
@@ -407,7 +410,7 @@ void inotify_unmount_inodes(struct list_head *list)
 			struct inotify_handle *ih= watch->ih;
 			mutex_lock(&ih->mutex);
 			ih->in_ops->handle_event(watch, watch->wd, IN_UNMOUNT, 0,
-						 NULL);
+						 NULL, NULL);
 			remove_watch(watch, ih);
 			mutex_unlock(&ih->mutex);
 		}
diff --git a/fs/inotify_user.c b/fs/inotify_user.c
index 845dc79a4e9..8b83c719006 100644
--- a/fs/inotify_user.c
+++ b/fs/inotify_user.c
@@ -253,7 +253,8 @@ inotify_dev_get_event(struct inotify_device *dev)
  * Can sleep (calls kernel_event()).
  */
 static void inotify_dev_queue_event(struct inotify_watch *w, u32 wd, u32 mask,
-				    u32 cookie, const char *name)
+				    u32 cookie, const char *name,
+				    struct inode *ignored)
 {
 	struct inotify_user_watch *watch;
 	struct inotify_device *dev;
-- 
cgit v1.2.3


From a9dc971d3fdb857a2bcd6d53238125a2cd31d5f4 Mon Sep 17 00:00:00 2001
From: Amy Griffis <amy.griffis@hp.com>
Date: Thu, 1 Jun 2006 13:11:03 -0700
Subject: [PATCH] inotify (3/5): add interfaces to kernel API

Add inotify_init_watch() so caller can use inotify_watch refcounts
before calling inotify_add_watch().

Add inotify_find_watch() to find an existing watch for an (ih,inode)
pair.  This is similar to inotify_find_update_watch(), but does not
update the watch's mask if one is found.

Add inotify_rm_watch() to remove a watch via the watch pointer instead
of the watch descriptor.

Signed-off-by: Amy Griffis <amy.griffis@hp.com>
Acked-by: Robert Love <rml@novell.com>
Acked-by: John McCutchan <john@johnmccutchan.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inotify.c      | 64 +++++++++++++++++++++++++++++++++++++++++++++++++------
 fs/inotify_user.c |  1 +
 2 files changed, 59 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/inotify.c b/fs/inotify.c
index f25c21801fd..8477c4fbecb 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -467,6 +467,19 @@ struct inotify_handle *inotify_init(const struct inotify_operations *ops)
 }
 EXPORT_SYMBOL_GPL(inotify_init);
 
+/**
+ * inotify_init_watch - initialize an inotify watch
+ * @watch: watch to initialize
+ */
+void inotify_init_watch(struct inotify_watch *watch)
+{
+	INIT_LIST_HEAD(&watch->h_list);
+	INIT_LIST_HEAD(&watch->i_list);
+	atomic_set(&watch->count, 0);
+	get_inotify_watch(watch); /* initial get */
+}
+EXPORT_SYMBOL_GPL(inotify_init_watch);
+
 /**
  * inotify_destroy - clean up and destroy an inotify instance
  * @ih: inotify handle
@@ -514,6 +527,37 @@ void inotify_destroy(struct inotify_handle *ih)
 }
 EXPORT_SYMBOL_GPL(inotify_destroy);
 
+/**
+ * inotify_find_watch - find an existing watch for an (ih,inode) pair
+ * @ih: inotify handle
+ * @inode: inode to watch
+ * @watchp: pointer to existing inotify_watch
+ *
+ * Caller must pin given inode (via nameidata).
+ */
+s32 inotify_find_watch(struct inotify_handle *ih, struct inode *inode,
+		       struct inotify_watch **watchp)
+{
+	struct inotify_watch *old;
+	int ret = -ENOENT;
+
+	mutex_lock(&inode->inotify_mutex);
+	mutex_lock(&ih->mutex);
+
+	old = inode_find_handle(inode, ih);
+	if (unlikely(old)) {
+		get_inotify_watch(old); /* caller must put watch */
+		*watchp = old;
+		ret = old->wd;
+	}
+
+	mutex_unlock(&ih->mutex);
+	mutex_unlock(&inode->inotify_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(inotify_find_watch);
+
 /**
  * inotify_find_update_watch - find and update the mask of an existing watch
  * @ih: inotify handle
@@ -593,10 +637,6 @@ s32 inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch,
 		goto out;
 	ret = watch->wd;
 
-	atomic_set(&watch->count, 0);
-	INIT_LIST_HEAD(&watch->h_list);
-	INIT_LIST_HEAD(&watch->i_list);
-
 	/* save a reference to handle and bump the count to make it official */
 	get_inotify_handle(ih);
 	watch->ih = ih;
@@ -607,8 +647,6 @@ s32 inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch,
 	 */
 	watch->inode = igrab(inode);
 
-	get_inotify_watch(watch); /* initial get */
-
 	if (!inotify_inode_watched(inode))
 		set_dentry_child_flags(inode, 1);
 
@@ -659,6 +697,20 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
 }
 EXPORT_SYMBOL_GPL(inotify_rm_wd);
 
+/**
+ * inotify_rm_watch - remove a watch from an inotify instance
+ * @ih: inotify handle
+ * @watch: watch to remove
+ *
+ * Can sleep.
+ */
+int inotify_rm_watch(struct inotify_handle *ih,
+		     struct inotify_watch *watch)
+{
+	return inotify_rm_wd(ih, watch->wd);
+}
+EXPORT_SYMBOL_GPL(inotify_rm_watch);
+
 /*
  * inotify_setup - core initialization function
  */
diff --git a/fs/inotify_user.c b/fs/inotify_user.c
index 8b83c719006..9e9931e2bad 100644
--- a/fs/inotify_user.c
+++ b/fs/inotify_user.c
@@ -380,6 +380,7 @@ static int create_watch(struct inotify_device *dev, struct inode *inode,
 
 	atomic_inc(&dev->user->inotify_watches);
 
+	inotify_init_watch(&watch->wdata);
 	ret = inotify_add_watch(dev->ih, &watch->wdata, inode, mask);
 	if (ret < 0)
 		free_inotify_user_watch(&watch->wdata);
-- 
cgit v1.2.3


From 3ca10067f7f4bfa62a1b0edc84f590261fa02d75 Mon Sep 17 00:00:00 2001
From: Amy Griffis <amy.griffis@hp.com>
Date: Thu, 1 Jun 2006 13:11:05 -0700
Subject: [PATCH] inotify (4/5): allow watch removal from event handler

Allow callers to remove watches from their event handler via
inotify_remove_watch_locked().  This functionality can be used to
achieve IN_ONESHOT-like functionality for a subset of events in the
mask.

Signed-off-by: Amy Griffis <amy.griffis@hp.com>
Acked-by: Robert Love <rml@novell.com>
Acked-by: John McCutchan <john@johnmccutchan.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inotify.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/inotify.c b/fs/inotify.c
index 8477c4fbecb..723836a1f71 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -207,7 +207,7 @@ static struct inotify_watch *inode_find_handle(struct inode *inode,
 }
 
 /*
- * remove_watch_no_event - remove_watch() without the IN_IGNORED event.
+ * remove_watch_no_event - remove watch without the IN_IGNORED event.
  *
  * Callers must hold both inode->inotify_mutex and ih->mutex.
  */
@@ -223,17 +223,22 @@ static void remove_watch_no_event(struct inotify_watch *watch,
 	idr_remove(&ih->idr, watch->wd);
 }
 
-/*
- * remove_watch - Remove a watch from both the handle and the inode.  Sends
- * the IN_IGNORED event signifying that the inode is no longer watched.
+/**
+ * inotify_remove_watch_locked - Remove a watch from both the handle and the
+ * inode.  Sends the IN_IGNORED event signifying that the inode is no longer
+ * watched.  May be invoked from a caller's event handler.
+ * @ih: inotify handle associated with watch
+ * @watch: watch to remove
  *
  * Callers must hold both inode->inotify_mutex and ih->mutex.
  */
-static void remove_watch(struct inotify_watch *watch, struct inotify_handle *ih)
+void inotify_remove_watch_locked(struct inotify_handle *ih,
+				 struct inotify_watch *watch)
 {
 	remove_watch_no_event(watch, ih);
 	ih->in_ops->handle_event(watch, watch->wd, IN_IGNORED, 0, NULL, NULL);
 }
+EXPORT_SYMBOL_GPL(inotify_remove_watch_locked);
 
 /* Kernel API for producing events */
 
@@ -378,7 +383,7 @@ void inotify_unmount_inodes(struct list_head *list)
 
 		need_iput_tmp = need_iput;
 		need_iput = NULL;
-		/* In case the remove_watch() drops a reference. */
+		/* In case inotify_remove_watch_locked() drops a reference. */
 		if (inode != need_iput_tmp)
 			__iget(inode);
 		else
@@ -411,7 +416,7 @@ void inotify_unmount_inodes(struct list_head *list)
 			mutex_lock(&ih->mutex);
 			ih->in_ops->handle_event(watch, watch->wd, IN_UNMOUNT, 0,
 						 NULL, NULL);
-			remove_watch(watch, ih);
+			inotify_remove_watch_locked(ih, watch);
 			mutex_unlock(&ih->mutex);
 		}
 		mutex_unlock(&inode->inotify_mutex);
@@ -434,7 +439,7 @@ void inotify_inode_is_dead(struct inode *inode)
 	list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
 		struct inotify_handle *ih = watch->ih;
 		mutex_lock(&ih->mutex);
-		remove_watch(watch, ih);
+		inotify_remove_watch_locked(ih, watch);
 		mutex_unlock(&ih->mutex);
 	}
 	mutex_unlock(&inode->inotify_mutex);
@@ -687,7 +692,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
 
 	/* make sure that we did not race */
 	if (likely(idr_find(&ih->idr, wd) == watch))
-		remove_watch(watch, ih);
+		inotify_remove_watch_locked(ih, watch);
 
 	mutex_unlock(&ih->mutex);
 	mutex_unlock(&inode->inotify_mutex);
-- 
cgit v1.2.3


From 473ae30bc7b1dda5c5791c773f95e9424ddfead9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 26 Apr 2006 14:04:08 -0400
Subject: [PATCH] execve argument logging

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/exec.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 3a79d97ac23..d07858c0b7c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -49,6 +49,7 @@
 #include <linux/rmap.h>
 #include <linux/acct.h>
 #include <linux/cn_proc.h>
+#include <linux/audit.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -1085,6 +1086,11 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 	/* kernel module loader fixup */
 	/* so we don't try to load run modprobe in kernel space. */
 	set_fs(USER_DS);
+
+	retval = audit_bprm(bprm);
+	if (retval)
+		return retval;
+
 	retval = -ENOENT;
 	for (try=0; try<2; try++) {
 		read_lock(&binfmt_lock);
-- 
cgit v1.2.3


From e0182909297da8d38a5d473ae7bee3d0324632a1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 18 May 2006 08:28:02 -0400
Subject: [PATCH] proc_loginuid_write() uses simple_strtoul() on non-terminated
 array

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/base.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6cc77dc3f3f..6afff725a8c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1019,8 +1019,8 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
 	if (current != task)
 		return -EPERM;
 
-	if (count > PAGE_SIZE)
-		count = PAGE_SIZE;
+	if (count >= PAGE_SIZE)
+		count = PAGE_SIZE - 1;
 
 	if (*ppos != 0) {
 		/* No partial writes. */
@@ -1033,6 +1033,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
 	if (copy_from_user(page, buf, count))
 		goto out_free_page;
 
+	page[count] = '\0';
 	loginuid = simple_strtoul(page, &tmp, 10);
 	if (tmp == page) {
 		length = -EINVAL;
-- 
cgit v1.2.3


From 9c937dcc71021f2dbf78f904f03d962dd9bcc130 Mon Sep 17 00:00:00 2001
From: Amy Griffis <amy.griffis@hp.com>
Date: Thu, 8 Jun 2006 23:19:31 -0400
Subject: [PATCH] log more info for directory entry change events

When an audit event involves changes to a directory entry, include
a PATH record for the directory itself.  A few other notable changes:

    - fixed audit_inode_child() hooks in fsnotify_move()
    - removed unused flags arg from audit_inode()
    - added audit log routines for logging a portion of a string

Here's some sample output.

before patch:
type=SYSCALL msg=audit(1149821605.320:26): arch=40000003 syscall=39 success=yes exit=0 a0=bf8d3c7c a1=1ff a2=804e1b8 a3=bf8d3c7c items=1 ppid=739 pid=800 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=ttyS0 comm="mkdir" exe="/bin/mkdir" subj=root:system_r:unconfined_t:s0-s0:c0.c255
type=CWD msg=audit(1149821605.320:26):  cwd="/root"
type=PATH msg=audit(1149821605.320:26): item=0 name="foo" parent=164068 inode=164010 dev=03:00 mode=040755 ouid=0 ogid=0 rdev=00:00 obj=root:object_r:user_home_t:s0

after patch:
type=SYSCALL msg=audit(1149822032.332:24): arch=40000003 syscall=39 success=yes exit=0 a0=bfdd9c7c a1=1ff a2=804e1b8 a3=bfdd9c7c items=2 ppid=714 pid=777 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=ttyS0 comm="mkdir" exe="/bin/mkdir" subj=root:system_r:unconfined_t:s0-s0:c0.c255
type=CWD msg=audit(1149822032.332:24):  cwd="/root"
type=PATH msg=audit(1149822032.332:24): item=0 name="/root" inode=164068 dev=03:00 mode=040750 ouid=0 ogid=0 rdev=00:00 obj=root:object_r:user_home_dir_t:s0
type=PATH msg=audit(1149822032.332:24): item=1 name="foo" inode=164010 dev=03:00 mode=040755 ouid=0 ogid=0 rdev=00:00 obj=root:object_r:user_home_t:s0

Signed-off-by: Amy Griffis <amy.griffis@hp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 2 +-
 fs/open.c  | 4 ++--
 fs/xattr.c | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index d6e2ee25173..184fe4acf82 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1127,7 +1127,7 @@ out:
 	if (likely(retval == 0)) {
 		if (unlikely(current->audit_context && nd && nd->dentry &&
 				nd->dentry->d_inode))
-		audit_inode(name, nd->dentry->d_inode, flags);
+		audit_inode(name, nd->dentry->d_inode);
 	}
 out_fail:
 	return retval;
diff --git a/fs/open.c b/fs/open.c
index 317b7c7f38a..4f178acd4c0 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -633,7 +633,7 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
 	dentry = file->f_dentry;
 	inode = dentry->d_inode;
 
-	audit_inode(NULL, inode, 0);
+	audit_inode(NULL, inode);
 
 	err = -EROFS;
 	if (IS_RDONLY(inode))
@@ -786,7 +786,7 @@ asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group)
 	if (file) {
 		struct dentry * dentry;
 		dentry = file->f_dentry;
-		audit_inode(NULL, dentry->d_inode, 0);
+		audit_inode(NULL, dentry->d_inode);
 		error = chown_common(dentry, user, group);
 		fput(file);
 	}
diff --git a/fs/xattr.c b/fs/xattr.c
index e416190f5e9..c32f15b5f60 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -242,7 +242,7 @@ sys_fsetxattr(int fd, char __user *name, void __user *value,
 	if (!f)
 		return error;
 	dentry = f->f_dentry;
-	audit_inode(NULL, dentry->d_inode, 0);
+	audit_inode(NULL, dentry->d_inode);
 	error = setxattr(dentry, name, value, size, flags);
 	fput(f);
 	return error;
@@ -469,7 +469,7 @@ sys_fremovexattr(int fd, char __user *name)
 	if (!f)
 		return error;
 	dentry = f->f_dentry;
-	audit_inode(NULL, dentry->d_inode, 0);
+	audit_inode(NULL, dentry->d_inode);
 	error = removexattr(dentry, name);
 	fput(f);
 	return error;
-- 
cgit v1.2.3


From b9d9c82b4d081feb464f62dfc786c8621d09ecd2 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@suse.de>
Date: Thu, 15 Jun 2006 15:31:56 +0200
Subject: [PATCH] Driver core: add generic "subsystem" link to all devices

Like the SUBSYTEM= key we find in the environment of the uevent, this
creates a generic "subsystem" link in sysfs for every device. Userspace
usually doesn't care at all if its a "class" or a "bus" device. This
provides an unified way to determine the subsytem of a device, regardless
of the way the driver core has created it.

Signed-off-by: Kay Sievers <kay.sievers@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/partitions/check.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 7ef1f094de9..8851b81e7c5 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -329,6 +329,7 @@ void delete_partition(struct gendisk *disk, int part)
 	p->ios[0] = p->ios[1] = 0;
 	p->sectors[0] = p->sectors[1] = 0;
 	devfs_remove("%s/part%d", disk->devfs_name, part);
+	sysfs_remove_link(&p->kobj, "subsystem");
 	if (p->holder_dir)
 		kobject_unregister(p->holder_dir);
 	kobject_uevent(&p->kobj, KOBJ_REMOVE);
@@ -363,6 +364,7 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len)
 	kobject_add(&p->kobj);
 	if (!disk->part_uevent_suppress)
 		kobject_uevent(&p->kobj, KOBJ_ADD);
+	sysfs_create_link(&p->kobj, &block_subsys.kset.kobj, "subsystem");
 	partition_sysfs_add_subdir(p);
 	disk->part[part-1] = p;
 }
@@ -398,6 +400,7 @@ static void disk_sysfs_symlinks(struct gendisk *disk)
 			kfree(disk_name);
 		}
 	}
+	sysfs_create_link(&disk->kobj, &block_subsys.kset.kobj, "subsystem");
 }
 
 /* Not exported, helper to add_disk(). */
@@ -548,5 +551,6 @@ void del_gendisk(struct gendisk *disk)
 		put_device(disk->driverfs_dev);
 		disk->driverfs_dev = NULL;
 	}
+	sysfs_remove_link(&disk->kobj, "subsystem");
 	kobject_del(&disk->kobj);
 }
-- 
cgit v1.2.3


From f893afbe1262e27e91234506f72e17716190dd2f Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cam.ac.uk>
Date: Thu, 22 Jun 2006 14:47:15 -0700
Subject: [PATCH] NTFS: Critical bug fix (affects MIPS and possibly others)

Many thanks to Pauline Ng for the detailed bug report and analysis!

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ntfs/file.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index c63a83e8da9..36e1e136bb0 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1484,14 +1484,15 @@ static inline void ntfs_flush_dcache_pages(struct page **pages,
 		unsigned nr_pages)
 {
 	BUG_ON(!nr_pages);
+	/*
+	 * Warning: Do not do the decrement at the same time as the call to
+	 * flush_dcache_page() because it is a NULL macro on i386 and hence the
+	 * decrement never happens so the loop never terminates.
+	 */
 	do {
-		/*
-		 * Warning: Do not do the decrement at the same time as the
-		 * call because flush_dcache_page() is a NULL macro on i386
-		 * and hence the decrement never happens.
-		 */
+		--nr_pages;
 		flush_dcache_page(pages[nr_pages]);
-	} while (--nr_pages > 0);
+	} while (nr_pages > 0);
 }
 
 /**
-- 
cgit v1.2.3


From 09d967c6f32b35eab15b45862ae16e4f06259d8e Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Thu, 22 Jun 2006 14:47:21 -0700
Subject: [PATCH] Fix a race condition between ->i_mapping and iput()

This race became a cause of oops, and can reproduce by the following.

    while true; do
	dd if=/dev/zero of=/dev/.static/dev/hdg1 bs=512 count=1000 & sync
    done

This race condition was between __sync_single_inode() and iput().

          cpu0 (fs's inode)                 cpu1 (bdev's inode)
          -----------------                 -------------------
                                       close("/dev/hda2")
                                       [...]
__sync_single_inode()
   /* copy the bdev's ->i_mapping */
   mapping = inode->i_mapping;

                                       generic_forget_inode()
                                          bdev_clear_inode()
					     /* restre the fs's ->i_mapping */
				             inode->i_mapping = &inode->i_data;
				          /* bdev's inode was freed */
                                          destroy_inode(inode);

   if (wait) {
      /* dereference a freed bdev's mapping->host */
      filemap_fdatawait(mapping);  /* Oops */

Since __sync_single_inode() is only taking a ref-count of fs's inode, the
another process can be close() and freeing the bdev's inode while writing
fs's inode.  So, __sync_signle_inode() accesses the freed ->i_mapping,
oops.

This patch takes a ref-count on the bdev's inode for the fs's inode before
setting a ->i_mapping, and the clear_inode() of the fs's inode does iput() on
the bdev's inode.  So if the fs's inode is still living, bdev's inode
shouldn't be freed.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/block_dev.c | 32 +++++++++++++++++++++++++-------
 1 file changed, 25 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index f5958f413bd..44aaba202f7 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -414,21 +414,31 @@ EXPORT_SYMBOL(bdput);
 static struct block_device *bd_acquire(struct inode *inode)
 {
 	struct block_device *bdev;
+
 	spin_lock(&bdev_lock);
 	bdev = inode->i_bdev;
-	if (bdev && igrab(bdev->bd_inode)) {
+	if (bdev) {
+		atomic_inc(&bdev->bd_inode->i_count);
 		spin_unlock(&bdev_lock);
 		return bdev;
 	}
 	spin_unlock(&bdev_lock);
+
 	bdev = bdget(inode->i_rdev);
 	if (bdev) {
 		spin_lock(&bdev_lock);
-		if (inode->i_bdev)
-			__bd_forget(inode);
-		inode->i_bdev = bdev;
-		inode->i_mapping = bdev->bd_inode->i_mapping;
-		list_add(&inode->i_devices, &bdev->bd_inodes);
+		if (!inode->i_bdev) {
+			/*
+			 * We take an additional bd_inode->i_count for inode,
+			 * and it's released in clear_inode() of inode.
+			 * So, we can access it via ->i_mapping always
+			 * without igrab().
+			 */
+			atomic_inc(&bdev->bd_inode->i_count);
+			inode->i_bdev = bdev;
+			inode->i_mapping = bdev->bd_inode->i_mapping;
+			list_add(&inode->i_devices, &bdev->bd_inodes);
+		}
 		spin_unlock(&bdev_lock);
 	}
 	return bdev;
@@ -438,10 +448,18 @@ static struct block_device *bd_acquire(struct inode *inode)
 
 void bd_forget(struct inode *inode)
 {
+	struct block_device *bdev = NULL;
+
 	spin_lock(&bdev_lock);
-	if (inode->i_bdev)
+	if (inode->i_bdev) {
+		if (inode->i_sb != blockdev_superblock)
+			bdev = inode->i_bdev;
 		__bd_forget(inode);
+	}
 	spin_unlock(&bdev_lock);
+
+	if (bdev)
+		iput(bdev->bd_inode);
 }
 
 int bd_claim(struct block_device *bdev, void *holder)
-- 
cgit v1.2.3


From c89681ed7d0e4a61d35bdc12c06c6733b718b2cb Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Thu, 22 Jun 2006 14:47:22 -0700
Subject: [PATCH] remove steal_locks()

This patch removes the steal_locks() function.

steal_locks() doesn't work correctly with any filesystem that does it's own
lock management, including NFS, CIFS, etc.

In addition it has weird semantics on local filesystems in case tasks
sharing file-descriptor tables are doing POSIX locking operations in
parallel to execve().

The steal_locks() function has an effect on applications doing:

clone(CLONE_FILES)
  /* in child */
  lock
  execve
  lock

POSIX locks acquired before execve (by "child", "parent" or any further
task sharing files_struct) will after the execve be owned exclusively by
"child".

According to Chris Wright some LSB/LTP kind of suite triggers without the
stealing behavior, but there's no known real-world application that would
also fail.

Apps using NPTL are not affected, since all other threads are killed before
execve.

Apps using LinuxThreads are only affected if they

  - have multiple threads during exec (LinuxThreads doesn't kill other
    threads, the app may do it with pthread_kill_other_threads_np())
  - rely on POSIX locks being inherited across exec

Both conditions are documented, but not their interaction.

Apps using clone() natively are affected if they

  - use clone(CLONE_FILES)
  - rely on POSIX locks being inherited across exec

The above scenarios are unlikely, but possible.

If the patch is vetoed, there's a plan B, that involves mostly keeping the
weird stealing semantics, but changing the way lock ownership is handled so
that network and local filesystems work consistently.

That would add more complexity though, so this solution seems to be
preferred by most people.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Matthew Wilcox <willy@debian.org>
Cc: Chris Wright <chrisw@sous-sol.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Steven French <sfrench@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/binfmt_elf.c  |  1 -
 fs/binfmt_misc.c |  1 -
 fs/exec.c        |  1 -
 fs/locks.c       | 57 --------------------------------------------------------
 4 files changed, 60 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 537893a1601..8a04216e8b4 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -759,7 +759,6 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 
 	/* Discard our unneeded old files struct */
 	if (files) {
-		steal_locks(files);
 		put_files_struct(files);
 		files = NULL;
 	}
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index d73d75591a3..599f36fd0f6 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -203,7 +203,6 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 		goto _error;
 
 	if (files) {
-		steal_locks(files);
 		put_files_struct(files);
 		files = NULL;
 	}
diff --git a/fs/exec.c b/fs/exec.c
index d07858c0b7c..0b88bf64614 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -866,7 +866,6 @@ int flush_old_exec(struct linux_binprm * bprm)
 	bprm->mm = NULL;		/* We're using it now */
 
 	/* This is the point of no return */
-	steal_locks(files);
 	put_files_struct(files);
 
 	current->sas_ss_sp = current->sas_ss_size = 0;
diff --git a/fs/locks.c b/fs/locks.c
index ab61a8b5482..69435c68c1e 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2206,63 +2206,6 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
 
 EXPORT_SYMBOL(lock_may_write);
 
-static inline void __steal_locks(struct file *file, fl_owner_t from)
-{
-	struct inode *inode = file->f_dentry->d_inode;
-	struct file_lock *fl = inode->i_flock;
-
-	while (fl) {
-		if (fl->fl_file == file && fl->fl_owner == from)
-			fl->fl_owner = current->files;
-		fl = fl->fl_next;
-	}
-}
-
-/* When getting ready for executing a binary, we make sure that current
- * has a files_struct on its own. Before dropping the old files_struct,
- * we take over ownership of all locks for all file descriptors we own.
- * Note that we may accidentally steal a lock for a file that a sibling
- * has created since the unshare_files() call.
- */
-void steal_locks(fl_owner_t from)
-{
-	struct files_struct *files = current->files;
-	int i, j;
-	struct fdtable *fdt;
-
-	if (from == files)
-		return;
-
-	lock_kernel();
-	j = 0;
-
-	/*
-	 * We are not taking a ref to the file structures, so
-	 * we need to acquire ->file_lock.
-	 */
-	spin_lock(&files->file_lock);
-	fdt = files_fdtable(files);
-	for (;;) {
-		unsigned long set;
-		i = j * __NFDBITS;
-		if (i >= fdt->max_fdset || i >= fdt->max_fds)
-			break;
-		set = fdt->open_fds->fds_bits[j++];
-		while (set) {
-			if (set & 1) {
-				struct file *file = fdt->fd[i];
-				if (file)
-					__steal_locks(file, from);
-			}
-			i++;
-			set >>= 1;
-		}
-	}
-	spin_unlock(&files->file_lock);
-	unlock_kernel();
-}
-EXPORT_SYMBOL(steal_locks);
-
 static int __init filelock_init(void)
 {
 	filelock_cache = kmem_cache_create("file_lock_cache",
-- 
cgit v1.2.3


From 0feae5c47aabdde59cbbec32d150e17102de37f0 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 22 Jun 2006 14:47:28 -0700
Subject: [PATCH] Fix dcache race during umount

The race is that the shrink_dcache_memory shrinker could get called while a
filesystem is being unmounted, and could try to prune a dentry belonging to
that filesystem.

If it does, then it will call in to iput on the inode while the dentry is
no longer able to be found by the umounting process.  If iput takes a
while, generic_shutdown_super could get all the way though
shrink_dcache_parent and shrink_dcache_anon and invalidate_inodes without
ever waiting on this particular inode.

Eventually the superblock gets freed anyway and if the iput tried to touch
it (which some filesystems certainly do), it will lose.  The promised
"Self-destruct in 5 seconds" doesn't lead to a nice day.

The race is closed by holding s_umount while calling prune_one_dentry on
someone else's dentry.  As a down_read_trylock is used,
shrink_dcache_memory will no longer try to prune the dentry of a filesystem
that is being unmounted, and unmount will not be able to start until any
such active prune_one_dentry completes.

This requires that prune_dcache *knows* which filesystem (if any) it is
doing the prune on behalf of so that it can be careful of other
filesystems.  shrink_dcache_memory isn't called it on behalf of any
filesystem, and so is careful of everything.

shrink_dcache_anon is now passed a super_block rather than the s_anon list
out of the superblock, so it can get the s_anon list itself, and can pass
the superblock down to prune_dcache.

If prune_dcache finds a dentry that it cannot free, it leaves it where it
is (at the tail of the list) and exits, on the assumption that some other
thread will be removing that dentry soon.  To try to make sure that some
work gets done, a limited number of dnetries which are untouchable are
skipped over while choosing the dentry to work on.

I believe this race was first found by Kirill Korotaev.

Cc: Jan Blunck <jblunck@suse.de>
Acked-by: Kirill Korotaev <dev@openvz.org>
Cc: Olaf Hering <olh@suse.de>
Acked-by: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Balbir Singh <balbir@in.ibm.com>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/dcache.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++++------
 fs/super.c  |  2 +-
 2 files changed, 61 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 940d188e5d1..385f5dbc4b0 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -382,6 +382,8 @@ static inline void prune_one_dentry(struct dentry * dentry)
 /**
  * prune_dcache - shrink the dcache
  * @count: number of entries to try and free
+ * @sb: if given, ignore dentries for other superblocks
+ *         which are being unmounted.
  *
  * Shrink the dcache. This is done when we need
  * more memory, or simply when we need to unmount
@@ -392,16 +394,29 @@ static inline void prune_one_dentry(struct dentry * dentry)
  * all the dentries are in use.
  */
  
-static void prune_dcache(int count)
+static void prune_dcache(int count, struct super_block *sb)
 {
 	spin_lock(&dcache_lock);
 	for (; count ; count--) {
 		struct dentry *dentry;
 		struct list_head *tmp;
+		struct rw_semaphore *s_umount;
 
 		cond_resched_lock(&dcache_lock);
 
 		tmp = dentry_unused.prev;
+		if (unlikely(sb)) {
+			/* Try to find a dentry for this sb, but don't try
+			 * too hard, if they aren't near the tail they will
+			 * be moved down again soon
+			 */
+			int skip = count;
+			while (skip && tmp != &dentry_unused &&
+			    list_entry(tmp, struct dentry, d_lru)->d_sb != sb) {
+				skip--;
+				tmp = tmp->prev;
+			}
+		}
 		if (tmp == &dentry_unused)
 			break;
 		list_del_init(tmp);
@@ -427,7 +442,45 @@ static void prune_dcache(int count)
  			spin_unlock(&dentry->d_lock);
 			continue;
 		}
-		prune_one_dentry(dentry);
+		/*
+		 * If the dentry is not DCACHED_REFERENCED, it is time
+		 * to remove it from the dcache, provided the super block is
+		 * NULL (which means we are trying to reclaim memory)
+		 * or this dentry belongs to the same super block that
+		 * we want to shrink.
+		 */
+		/*
+		 * If this dentry is for "my" filesystem, then I can prune it
+		 * without taking the s_umount lock (I already hold it).
+		 */
+		if (sb && dentry->d_sb == sb) {
+			prune_one_dentry(dentry);
+			continue;
+		}
+		/*
+		 * ...otherwise we need to be sure this filesystem isn't being
+		 * unmounted, otherwise we could race with
+		 * generic_shutdown_super(), and end up holding a reference to
+		 * an inode while the filesystem is unmounted.
+		 * So we try to get s_umount, and make sure s_root isn't NULL.
+		 * (Take a local copy of s_umount to avoid a use-after-free of
+		 * `dentry').
+		 */
+		s_umount = &dentry->d_sb->s_umount;
+		if (down_read_trylock(s_umount)) {
+			if (dentry->d_sb->s_root != NULL) {
+				prune_one_dentry(dentry);
+				up_read(s_umount);
+				continue;
+			}
+			up_read(s_umount);
+		}
+		spin_unlock(&dentry->d_lock);
+		/* Cannot remove the first dentry, and it isn't appropriate
+		 * to move it to the head of the list, so give up, and try
+		 * later
+		 */
+		break;
 	}
 	spin_unlock(&dcache_lock);
 }
@@ -630,7 +683,7 @@ void shrink_dcache_parent(struct dentry * parent)
 	int found;
 
 	while ((found = select_parent(parent)) != 0)
-		prune_dcache(found);
+		prune_dcache(found, parent->d_sb);
 }
 
 /**
@@ -643,9 +696,10 @@ void shrink_dcache_parent(struct dentry * parent)
  * done under dcache_lock.
  *
  */
-void shrink_dcache_anon(struct hlist_head *head)
+void shrink_dcache_anon(struct super_block *sb)
 {
 	struct hlist_node *lp;
+	struct hlist_head *head = &sb->s_anon;
 	int found;
 	do {
 		found = 0;
@@ -668,7 +722,7 @@ void shrink_dcache_anon(struct hlist_head *head)
 			}
 		}
 		spin_unlock(&dcache_lock);
-		prune_dcache(found);
+		prune_dcache(found, sb);
 	} while(found);
 }
 
@@ -689,7 +743,7 @@ static int shrink_dcache_memory(int nr, gfp_t gfp_mask)
 	if (nr) {
 		if (!(gfp_mask & __GFP_FS))
 			return -1;
-		prune_dcache(nr);
+		prune_dcache(nr, NULL);
 	}
 	return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
 }
diff --git a/fs/super.c b/fs/super.c
index a66f66bb804..9d5c2add722 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -231,7 +231,7 @@ void generic_shutdown_super(struct super_block *sb)
 	if (root) {
 		sb->s_root = NULL;
 		shrink_dcache_parent(root);
-		shrink_dcache_anon(&sb->s_anon);
+		shrink_dcache_anon(sb);
 		dput(root);
 		fsync_super(sb);
 		lock_super(sb);
-- 
cgit v1.2.3


From d702ccb342e49f7591df5a87c3857c698183b0fa Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Thu, 22 Jun 2006 14:47:31 -0700
Subject: [PATCH] prune_one_dentry() tweaks

- Add description of d_lock handling to comments over prune_one_dentry().

- It has three callsites - uninline it, saving 200 bytes of text.

Cc: Jan Blunck <jblunck@suse.de>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: Olaf Hering <olh@suse.de>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/dcache.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 385f5dbc4b0..59dbc92c207 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -359,12 +359,13 @@ restart:
 }
 
 /*
- * Throw away a dentry - free the inode, dput the parent.
- * This requires that the LRU list has already been
- * removed.
+ * Throw away a dentry - free the inode, dput the parent.  This requires that
+ * the LRU list has already been removed.
+ *
  * Called with dcache_lock, drops it and then regains.
+ * Called with dentry->d_lock held, drops it.
  */
-static inline void prune_one_dentry(struct dentry * dentry)
+static void prune_one_dentry(struct dentry * dentry)
 {
 	struct dentry * parent;
 
-- 
cgit v1.2.3


From 189acaaef81b1d71aedd0d28810de24160c2e781 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Fri, 23 Jun 2006 02:33:48 +0000
Subject: [CIFS] Enable sec flags on mount for cifs (part one)

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/README      |   2 +
 fs/cifs/cifsproto.h |   2 +-
 fs/cifs/connect.c   | 338 ++++------------------------------------------------
 fs/cifs/sess.c      |   7 +-
 4 files changed, 28 insertions(+), 321 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/README b/fs/cifs/README
index a68f8e3db2d..46c2cfa5acf 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -453,6 +453,8 @@ sec		Security mode.  Allowed values are:
 				server requires signing also can be the default) 
 			ntlmv2  Use NTLMv2 password hashing      
 			ntlmv2i Use NTLMv2 password hashing with packet signing
+			lanman  (if configured in kernel config) use older
+				lanman hash
 
 The mount.cifs mount helper also accepts a few mount options before -o
 including:
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 7ffd5b0d63c..7ed06bfa192 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -68,10 +68,10 @@ extern void header_assemble(struct smb_hdr *, char /* command */ ,
 extern int small_smb_init_no_tc(const int smb_cmd, const int wct,
 				struct cifsSesInfo *ses,
 				void ** request_buf);
+#endif
 extern int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
 			     const int stage, 
 			     const struct nls_table *nls_cp);
-#endif
 extern __u16 GetNextMid(struct TCP_Server_Info *server);
 extern struct oplock_q_entry * AllocOplockQEntry(struct inode *, u16, 
 						 struct cifsTconInfo *);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index faaf9eb15b9..01608bb4d67 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -68,6 +68,7 @@ struct smb_vol {
 	gid_t linux_gid;
 	mode_t file_mode;
 	mode_t dir_mode;
+	unsigned secFlg;
 	unsigned rw:1;
 	unsigned retry:1;
 	unsigned intr:1;
@@ -81,12 +82,7 @@ struct smb_vol {
 	unsigned remap:1;   /* set to remap seven reserved chars in filenames */
 	unsigned posix_paths:1;   /* unset to not ask for posix pathnames. */
 	unsigned sfu_emul:1;
-	unsigned krb5:1;
-	unsigned ntlm:1;
-	unsigned ntlmv2:1;
 	unsigned nullauth:1; /* attempt to authenticate with null user */
-	unsigned sign:1;
-	unsigned seal:1;     /* encrypt */
 	unsigned nocase;     /* request case insensitive filenames */
 	unsigned nobrl;      /* disable sending byte range locks to srv */
 	unsigned int rsize;
@@ -789,7 +785,6 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
 
 	/* vol->retry default is 0 (i.e. "soft" limited retry not hard retry) */
 	vol->rw = TRUE;
-	vol->ntlm = TRUE;
 	/* default is always to request posix paths. */
 	vol->posix_paths = 1;
 
@@ -920,30 +915,35 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
 				cERROR(1,("no security value specified"));
                                 continue;
                         } else if (strnicmp(value, "krb5i", 5) == 0) {
-				vol->sign = 1;
-				vol->krb5 = 1;
+				vol->secFlg = CIFSSEC_MAY_KRB5 | 
+					CIFSSEC_MUST_SIGN;
 			} else if (strnicmp(value, "krb5p", 5) == 0) {
-				/* vol->seal = 1; 
-				   vol->krb5 = 1; */
+				/* vol->secFlg = CIFSSEC_MUST_SEAL | 
+					CIFSSEC_MAY_KRB5; */ 
 				cERROR(1,("Krb5 cifs privacy not supported"));
 				return 1;
 			} else if (strnicmp(value, "krb5", 4) == 0) {
-				vol->krb5 = 1;
+				vol->secFlg = CIFSSEC_MAY_KRB5;
 			} else if (strnicmp(value, "ntlmv2i", 7) == 0) {
-				vol->ntlmv2 = 1;
-				vol->sign = 1;
+				vol->secFlg = CIFSSEC_MAY_NTLMV2 |
+					CIFSSEC_MUST_SIGN;
 			} else if (strnicmp(value, "ntlmv2", 6) == 0) {
-				vol->ntlmv2 = 1;
+				vol->secFlg = CIFSSEC_MAY_NTLMV2;
 			} else if (strnicmp(value, "ntlmi", 5) == 0) {
-				vol->ntlm = 1;
-				vol->sign = 1;
+				vol->secFlg = CIFSSEC_MAY_NTLM |
+					CIFSSEC_MUST_SIGN;
 			} else if (strnicmp(value, "ntlm", 4) == 0) {
 				/* ntlm is default so can be turned off too */
-				vol->ntlm = 1;
+				vol->secFlg = CIFSSEC_MAY_NTLM;
 			} else if (strnicmp(value, "nontlm", 6) == 0) {
-				vol->ntlm = 0;
+				/* BB is there a better way to do this? */
+				vol->secFlg = CIFSSEC_MAY_NTLMV2;
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+			} else if (strnicmp(value, "lanman", 6) == 0) {
+                                vol->secFlg = CIFSSEC_MAY_LANMAN;
+#endif
 			} else if (strnicmp(value, "none", 4) == 0) {
-				vol->nullauth = 1; 
+				vol->nullauth = 1;
                         } else {
                                 cERROR(1,("bad security option: %s", value));
                                 return 1;
@@ -1777,6 +1777,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 			}
 			pSesInfo->linux_uid = volume_info.linux_uid;
 			down(&pSesInfo->sesSem);
+			/* BB FIXME need to pass vol->secFlgs BB */
 			rc = cifs_setup_session(xid,pSesInfo, cifs_sb->local_nls);
 			up(&pSesInfo->sesSem);
 			if(!rc)
@@ -2283,292 +2284,6 @@ sesssetup_nomem:	/* do not return an error on nomem for the info strings,
 	return rc;
 }
 
-static int
-CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
-		char *SecurityBlob,int SecurityBlobLength,
-		const struct nls_table *nls_codepage)
-{
-	struct smb_hdr *smb_buffer;
-	struct smb_hdr *smb_buffer_response;
-	SESSION_SETUP_ANDX *pSMB;
-	SESSION_SETUP_ANDX *pSMBr;
-	char *bcc_ptr;
-	char *user;
-	char *domain;
-	int rc = 0;
-	int remaining_words = 0;
-	int bytes_returned = 0;
-	int len;
-	__u32 capabilities;
-	__u16 count;
-
-	cFYI(1, ("In spnego sesssetup "));
-	if(ses == NULL)
-		return -EINVAL;
-	user = ses->userName;
-	domain = ses->domainName;
-
-	smb_buffer = cifs_buf_get();
-	if (smb_buffer == NULL) {
-		return -ENOMEM;
-	}
-	smb_buffer_response = smb_buffer;
-	pSMBr = pSMB = (SESSION_SETUP_ANDX *) smb_buffer;
-
-	/* send SMBsessionSetup here */
-	header_assemble(smb_buffer, SMB_COM_SESSION_SETUP_ANDX,
-			NULL /* no tCon exists yet */ , 12 /* wct */ );
-
-	smb_buffer->Mid = GetNextMid(ses->server);
-	pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
-	pSMB->req.AndXCommand = 0xFF;
-	if(ses->server->maxBuf > 64*1024)
-		ses->server->maxBuf = (64*1023);
-	pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf);
-	pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
-
-	if(ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
-		smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
-
-	capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS |
-	    CAP_EXTENDED_SECURITY;
-	if (ses->capabilities & CAP_UNICODE) {
-		smb_buffer->Flags2 |= SMBFLG2_UNICODE;
-		capabilities |= CAP_UNICODE;
-	}
-	if (ses->capabilities & CAP_STATUS32) {
-		smb_buffer->Flags2 |= SMBFLG2_ERR_STATUS;
-		capabilities |= CAP_STATUS32;
-	}
-	if (ses->capabilities & CAP_DFS) {
-		smb_buffer->Flags2 |= SMBFLG2_DFS;
-		capabilities |= CAP_DFS;
-	}
-	pSMB->req.Capabilities = cpu_to_le32(capabilities);
-
-	pSMB->req.SecurityBlobLength = cpu_to_le16(SecurityBlobLength);
-	bcc_ptr = pByteArea(smb_buffer);
-	memcpy(bcc_ptr, SecurityBlob, SecurityBlobLength);
-	bcc_ptr += SecurityBlobLength;
-
-	if (ses->capabilities & CAP_UNICODE) {
-		if ((long) bcc_ptr % 2) {	/* must be word aligned for Unicode strings */
-			*bcc_ptr = 0;
-			bcc_ptr++;
-		}
-		bytes_returned =
-		    cifs_strtoUCS((__le16 *) bcc_ptr, user, 100, nls_codepage);
-		bcc_ptr += 2 * bytes_returned;	/* convert num of 16 bit words to bytes */
-		bcc_ptr += 2;	/* trailing null */
-		if (domain == NULL)
-			bytes_returned =
-			    cifs_strtoUCS((__le16 *) bcc_ptr,
-					  "CIFS_LINUX_DOM", 32, nls_codepage);
-		else
-			bytes_returned =
-			    cifs_strtoUCS((__le16 *) bcc_ptr, domain, 64,
-					  nls_codepage);
-		bcc_ptr += 2 * bytes_returned;
-		bcc_ptr += 2;
-		bytes_returned =
-		    cifs_strtoUCS((__le16 *) bcc_ptr, "Linux version ",
-				  32, nls_codepage);
-		bcc_ptr += 2 * bytes_returned;
-		bytes_returned =
-		    cifs_strtoUCS((__le16 *) bcc_ptr, system_utsname.release, 32,
-				  nls_codepage);
-		bcc_ptr += 2 * bytes_returned;
-		bcc_ptr += 2;
-		bytes_returned =
-		    cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS,
-				  64, nls_codepage);
-		bcc_ptr += 2 * bytes_returned;
-		bcc_ptr += 2;
-	} else {
-		strncpy(bcc_ptr, user, 200);
-		bcc_ptr += strnlen(user, 200);
-		*bcc_ptr = 0;
-		bcc_ptr++;
-		if (domain == NULL) {
-			strcpy(bcc_ptr, "CIFS_LINUX_DOM");
-			bcc_ptr += strlen("CIFS_LINUX_DOM") + 1;
-		} else {
-			strncpy(bcc_ptr, domain, 64);
-			bcc_ptr += strnlen(domain, 64);
-			*bcc_ptr = 0;
-			bcc_ptr++;
-		}
-		strcpy(bcc_ptr, "Linux version ");
-		bcc_ptr += strlen("Linux version ");
-		strcpy(bcc_ptr, system_utsname.release);
-		bcc_ptr += strlen(system_utsname.release) + 1;
-		strcpy(bcc_ptr, CIFS_NETWORK_OPSYS);
-		bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1;
-	}
-	count = (long) bcc_ptr - (long) pByteArea(smb_buffer);
-	smb_buffer->smb_buf_length += count;
-	pSMB->req.ByteCount = cpu_to_le16(count);
-
-	rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response,
-			 &bytes_returned, 1);
-	if (rc) {
-/*    rc = map_smb_to_linux_error(smb_buffer_response);  *//* done in SendReceive now */
-	} else if ((smb_buffer_response->WordCount == 3)
-		   || (smb_buffer_response->WordCount == 4)) {
-		__u16 action = le16_to_cpu(pSMBr->resp.Action);
-		__u16 blob_len =
-		    le16_to_cpu(pSMBr->resp.SecurityBlobLength);
-		if (action & GUEST_LOGIN)
-			cFYI(1, (" Guest login"));	/* BB do we want to set anything in SesInfo struct ? */
-		if (ses) {
-			ses->Suid = smb_buffer_response->Uid;	/* UID left in wire format (le) */
-			cFYI(1, ("UID = %d ", ses->Suid));
-			bcc_ptr = pByteArea(smb_buffer_response);	/* response can have either 3 or 4 word count - Samba sends 3 */
-
-			/* BB Fix below to make endian neutral !! */
-
-			if ((pSMBr->resp.hdr.WordCount == 3)
-			    || ((pSMBr->resp.hdr.WordCount == 4)
-				&& (blob_len <
-				    pSMBr->resp.ByteCount))) {
-				if (pSMBr->resp.hdr.WordCount == 4) {
-					bcc_ptr +=
-					    blob_len;
-					cFYI(1,
-					     ("Security Blob Length %d ",
-					      blob_len));
-				}
-
-				if (smb_buffer->Flags2 & SMBFLG2_UNICODE) {
-					if ((long) (bcc_ptr) % 2) {
-						remaining_words =
-						    (BCC(smb_buffer_response)
-						     - 1) / 2;
-						bcc_ptr++;	/* Unicode strings must be word aligned */
-					} else {
-						remaining_words =
-						    BCC
-						    (smb_buffer_response) / 2;
-					}
-					len =
-					    UniStrnlen((wchar_t *) bcc_ptr,
-						       remaining_words - 1);
-/* We look for obvious messed up bcc or strings in response so we do not go off
-   the end since (at least) WIN2K and Windows XP have a major bug in not null
-   terminating last Unicode string in response  */
-					if(ses->serverOS)
-						kfree(ses->serverOS);
-					ses->serverOS =
-					    kzalloc(2 * (len + 1), GFP_KERNEL);
-					cifs_strfromUCS_le(ses->serverOS,
-							   (__le16 *)
-							   bcc_ptr, len,
-							   nls_codepage);
-					bcc_ptr += 2 * (len + 1);
-					remaining_words -= len + 1;
-					ses->serverOS[2 * len] = 0;
-					ses->serverOS[1 + (2 * len)] = 0;
-					if (remaining_words > 0) {
-						len = UniStrnlen((wchar_t *)bcc_ptr,
-								 remaining_words
-								 - 1);
-						if(ses->serverNOS)
-							kfree(ses->serverNOS);
-						ses->serverNOS =
-						    kzalloc(2 * (len + 1),
-							    GFP_KERNEL);
-						cifs_strfromUCS_le(ses->serverNOS,
-								   (__le16 *)bcc_ptr,
-								   len,
-								   nls_codepage);
-						bcc_ptr += 2 * (len + 1);
-						ses->serverNOS[2 * len] = 0;
-						ses->serverNOS[1 + (2 * len)] = 0;
-						remaining_words -= len + 1;
-						if (remaining_words > 0) {
-							len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words);	
-                     /* last string not null terminated (e.g.Windows XP/2000) */
-							if(ses->serverDomain)
-								kfree(ses->serverDomain);
-							ses->serverDomain = kzalloc(2*(len+1),GFP_KERNEL);
-							cifs_strfromUCS_le(ses->serverDomain,
-							     (__le16 *)bcc_ptr, 
-							     len, nls_codepage);
-							bcc_ptr += 2*(len+1);
-							ses->serverDomain[2*len] = 0;
-							ses->serverDomain[1+(2*len)] = 0;
-						} /* else no more room so create dummy domain string */
-						else {
-							if(ses->serverDomain)
-								kfree(ses->serverDomain);
-							ses->serverDomain =
-							    kzalloc(2,GFP_KERNEL);
-						}
-					} else {/* no room use dummy domain&NOS */
-						if(ses->serverDomain)
-							kfree(ses->serverDomain);
-						ses->serverDomain = kzalloc(2, GFP_KERNEL);
-						if(ses->serverNOS)
-							kfree(ses->serverNOS);
-						ses->serverNOS = kzalloc(2, GFP_KERNEL);
-					}
-				} else {	/* ASCII */
-
-					len = strnlen(bcc_ptr, 1024);
-					if (((long) bcc_ptr + len) - (long)
-					    pByteArea(smb_buffer_response)
-					    <= BCC(smb_buffer_response)) {
-						if(ses->serverOS)
-							kfree(ses->serverOS);
-						ses->serverOS = kzalloc(len + 1, GFP_KERNEL);
-						strncpy(ses->serverOS, bcc_ptr, len);
-
-						bcc_ptr += len;
-						bcc_ptr[0] = 0;	/* null terminate the string */
-						bcc_ptr++;
-
-						len = strnlen(bcc_ptr, 1024);
-						if(ses->serverNOS)
-							kfree(ses->serverNOS);
-						ses->serverNOS = kzalloc(len + 1,GFP_KERNEL);
-						strncpy(ses->serverNOS, bcc_ptr, len);
-						bcc_ptr += len;
-						bcc_ptr[0] = 0;
-						bcc_ptr++;
-
-						len = strnlen(bcc_ptr, 1024);
-						if(ses->serverDomain)
-							kfree(ses->serverDomain);
-						ses->serverDomain = kzalloc(len + 1, GFP_KERNEL);
-						strncpy(ses->serverDomain, bcc_ptr, len);
-						bcc_ptr += len;
-						bcc_ptr[0] = 0;
-						bcc_ptr++;
-					} else
-						cFYI(1,
-						     ("Variable field of length %d extends beyond end of smb ",
-						      len));
-				}
-			} else {
-				cERROR(1,
-				       (" Security Blob Length extends beyond end of SMB"));
-			}
-		} else {
-			cERROR(1, ("No session structure passed in."));
-		}
-	} else {
-		cERROR(1,
-		       (" Invalid Word count %d: ",
-			smb_buffer_response->WordCount));
-		rc = -EIO;
-	}
-
-	if (smb_buffer)
-		cifs_buf_release(smb_buffer);
-
-	return rc;
-}
-
 static int
 CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
 			      struct cifsSesInfo *ses, int * pNTLMv2_flag,
@@ -3550,20 +3265,13 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
 			pSesInfo->server->secMode,
 			pSesInfo->server->capabilities,
 			pSesInfo->server->timeZone));
-#ifdef CONFIG_CIFS_EXPERIMENTAL
-		if(experimEnabled > 1)
+		if(experimEnabled < 2)
 			rc = CIFS_SessSetup(xid, pSesInfo,
 					    first_time, nls_info);
-		else
-#endif
-		if (extended_security
+		else if (extended_security
 				&& (pSesInfo->capabilities & CAP_EXTENDED_SECURITY)
 				&& (pSesInfo->server->secType == NTLMSSP)) {
-			cFYI(1, ("New style sesssetup"));
-			rc = CIFSSpnegoSessSetup(xid, pSesInfo,
-				NULL /* security blob */, 
-				0 /* blob length */,
-				nls_info);
+			rc = -EOPNOTSUPP;
 		} else if (extended_security
 			   && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY)
 			   && (pSesInfo->server->secType == RawNTLMSSP)) {
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index c039b54206a..70e32a81c21 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -33,8 +33,6 @@
 extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
                          unsigned char *p24);
 
-#ifdef CONFIG_CIFS_EXPERIMENTAL
-
 static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB)
 {
 	__u32 capabilities = 0;
@@ -319,7 +317,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 	__u32 capabilities;
 	int count;
 	int resp_buf_type = 0;
-	struct kvec iov[1];
+	struct kvec iov[2];  /* BB split variable length info into 2nd iovec */
 	enum securityEnum type;
 	__u16 action;
 	int bytes_remaining;
@@ -489,7 +487,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 	}
 	action = le16_to_cpu(pSMB->resp.Action);
 	if (action & GUEST_LOGIN)
-		cFYI(1, (" Guest login")); /* BB mark SesInfo struct? */
+		cFYI(1, ("Guest login")); /* BB mark SesInfo struct? */
 	ses->Suid = smb_buf->Uid;   /* UID left in wire format (le) */
 	cFYI(1, ("UID = %d ", ses->Suid));
 	/* response can have either 3 or 4 word count - Samba sends 3 */
@@ -525,4 +523,3 @@ ssetup_exit:
 
 	return rc;
 }
-#endif /* CONFIG_CIFS_EXPERIMENTAL */
-- 
cgit v1.2.3


From 454e2398be9b9fa30433fccc548db34d19aa9958 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 23 Jun 2006 02:02:57 -0700
Subject: [PATCH] VFS: Permit filesystem to override root dentry on mount

Extend the get_sb() filesystem operation to take an extra argument that
permits the VFS to pass in the target vfsmount that defines the mountpoint.

The filesystem is then required to manually set the superblock and root dentry
pointers.  For most filesystems, this should be done with simple_set_mnt()
which will set the superblock pointer and then set the root dentry to the
superblock's s_root (as per the old default behaviour).

The get_sb() op now returns an integer as there's now no need to return the
superblock pointer.

This patch permits a superblock to be implicitly shared amongst several mount
points, such as can be done with NFS to avoid potential inode aliasing.  In
such a case, simple_set_mnt() would not be called, and instead the mnt_root
and mnt_sb would be set directly.

The patch also makes the following changes:

 (*) the get_sb_*() convenience functions in the core kernel now take a vfsmount
     pointer argument and return an integer, so most filesystems have to change
     very little.

 (*) If one of the convenience function is not used, then get_sb() should
     normally call simple_set_mnt() to instantiate the vfsmount. This will
     always return 0, and so can be tail-called from get_sb().

 (*) generic_shutdown_super() now calls shrink_dcache_sb() to clean up the
     dcache upon superblock destruction rather than shrink_dcache_anon().

     This is required because the superblock may now have multiple trees that
     aren't actually bound to s_root, but that still need to be cleaned up. The
     currently called functions assume that the whole tree is rooted at s_root,
     and that anonymous dentries are not the roots of trees which results in
     dentries being left unculled.

     However, with the way NFS superblock sharing are currently set to be
     implemented, these assumptions are violated: the root of the filesystem is
     simply a dummy dentry and inode (the real inode for '/' may well be
     inaccessible), and all the vfsmounts are rooted on anonymous[*] dentries
     with child trees.

     [*] Anonymous until discovered from another tree.

 (*) The documentation has been adjusted, including the additional bit of
     changing ext2_* into foo_* in the documentation.

[akpm@osdl.org: convert ipath_fs, do other stuff]
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Nathan Scott <nathans@sgi.com>
Cc: Roland Dreier <rolandd@cisco.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/vfs_super.c            |  21 +++++----
 fs/adfs/super.c              |   7 +--
 fs/affs/super.c              |   7 +--
 fs/afs/super.c               |  24 +++++-----
 fs/autofs/init.c             |   6 +--
 fs/autofs4/init.c            |   6 +--
 fs/befs/linuxvfs.c           |   7 +--
 fs/bfs/inode.c               |   6 +--
 fs/binfmt_misc.c             |   6 +--
 fs/block_dev.c               |   6 +--
 fs/cifs/cifsfs.c             |  10 ++--
 fs/coda/inode.c              |   6 +--
 fs/configfs/mount.c          |   6 +--
 fs/cramfs/inode.c            |   7 +--
 fs/dcache.c                  |  40 ----------------
 fs/debugfs/inode.c           |   8 ++--
 fs/devfs/base.c              |   8 ++--
 fs/devpts/inode.c            |   6 +--
 fs/efs/super.c               |   6 +--
 fs/eventpoll.c               |  13 +++---
 fs/ext2/super.c              |   6 +--
 fs/ext3/super.c              |   6 +--
 fs/freevxfs/vxfs_super.c     |   7 +--
 fs/fuse/inode.c              |   8 ++--
 fs/hfs/super.c               |   7 +--
 fs/hfsplus/super.c           |   8 ++--
 fs/hostfs/hostfs_kern.c      |   8 ++--
 fs/hpfs/super.c              |   7 +--
 fs/hppfs/hppfs_kern.c        |   8 ++--
 fs/hugetlbfs/inode.c         |   6 +--
 fs/inotify_user.c            |   6 +--
 fs/isofs/inode.c             |   7 +--
 fs/jffs/inode-v23.c          |   7 +--
 fs/jffs2/super.c             |  49 ++++++++++---------
 fs/jfs/super.c               |   7 +--
 fs/libfs.c                   |  12 ++---
 fs/minix/inode.c             |   7 +--
 fs/msdos/namei.c             |   9 ++--
 fs/namespace.c               |   9 ++++
 fs/ncpfs/inode.c             |   6 +--
 fs/nfs/inode.c               |  96 ++++++++++++++++++++++---------------
 fs/nfsd/nfsctl.c             |   6 +--
 fs/ntfs/super.c              |   7 +--
 fs/ocfs2/dlm/dlmfs.c         |   6 +--
 fs/ocfs2/super.c             |  12 +++--
 fs/openpromfs/inode.c        |   6 +--
 fs/pipe.c                    |   9 ++--
 fs/proc/root.c               |   6 +--
 fs/qnx4/inode.c              |   7 +--
 fs/ramfs/inode.c             |  13 +++---
 fs/reiserfs/super.c          |   9 ++--
 fs/romfs/inode.c             |   7 +--
 fs/smbfs/inode.c             |   6 +--
 fs/super.c                   | 109 ++++++++++++++++++++++++-------------------
 fs/sysfs/mount.c             |   6 +--
 fs/sysv/super.c              |  13 +++---
 fs/udf/super.c               |   6 +--
 fs/ufs/super.c               |   6 +--
 fs/vfat/namei.c              |   9 ++--
 fs/xfs/linux-2.6/xfs_super.c |   8 ++--
 60 files changed, 391 insertions(+), 352 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 61c599b4a1e..872943004e5 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -99,12 +99,13 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
  * @flags: mount flags
  * @dev_name: device name that was mounted
  * @data: mount options
+ * @mnt: mountpoint record to be instantiated
  *
  */
 
-static struct super_block *v9fs_get_sb(struct file_system_type
-				       *fs_type, int flags,
-				       const char *dev_name, void *data)
+static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
+		       const char *dev_name, void *data,
+		       struct vfsmount *mnt)
 {
 	struct super_block *sb = NULL;
 	struct v9fs_fcall *fcall = NULL;
@@ -123,17 +124,19 @@ static struct super_block *v9fs_get_sb(struct file_system_type
 
 	v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
 	if (!v9ses)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 
 	if ((newfid = v9fs_session_init(v9ses, dev_name, data)) < 0) {
 		dprintk(DEBUG_ERROR, "problem initiating session\n");
-		sb = ERR_PTR(newfid);
+		retval = newfid;
 		goto out_free_session;
 	}
 
 	sb = sget(fs_type, NULL, v9fs_set_super, v9ses);
-	if (IS_ERR(sb))
+	if (IS_ERR(sb)) {
+		retval = PTR_ERR(sb);
 		goto out_close_session;
+	}
 	v9fs_fill_super(sb, v9ses, flags);
 
 	inode = v9fs_get_inode(sb, S_IFDIR | mode);
@@ -184,19 +187,19 @@ static struct super_block *v9fs_get_sb(struct file_system_type
 		goto put_back_sb;
 	}
 
-	return sb;
+	return simple_set_mnt(mnt, sb);
 
 out_close_session:
 	v9fs_session_close(v9ses);
 out_free_session:
 	kfree(v9ses);
-	return sb;
+	return retval;
 
 put_back_sb:
 	/* deactivate_super calls v9fs_kill_super which will frees the rest */
 	up_write(&sb->s_umount);
 	deactivate_super(sb);
-	return ERR_PTR(retval);
+	return retval;
 }
 
 /**
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 252abda0d20..1b58a9b7f6a 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -470,10 +470,11 @@ error:
 	return -EINVAL;
 }
 
-static struct super_block *adfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int adfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, adfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, adfs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type adfs_fs_type = {
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 4d7e5b19e5c..6a52e787540 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -524,10 +524,11 @@ affs_statfs(struct super_block *sb, struct kstatfs *buf)
 	return 0;
 }
 
-static struct super_block *affs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int affs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, affs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, affs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type affs_fs_type = {
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 53c56e7231a..82468df0ba5 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -38,9 +38,9 @@ struct afs_mount_params {
 static void afs_i_init_once(void *foo, kmem_cache_t *cachep,
 			    unsigned long flags);
 
-static struct super_block *afs_get_sb(struct file_system_type *fs_type,
-				      int flags, const char *dev_name,
-				      void *data);
+static int afs_get_sb(struct file_system_type *fs_type,
+		      int flags, const char *dev_name,
+		      void *data, struct vfsmount *mnt);
 
 static struct inode *afs_alloc_inode(struct super_block *sb);
 
@@ -294,10 +294,11 @@ static int afs_fill_super(struct super_block *sb, void *data, int silent)
  * get an AFS superblock
  * - TODO: don't use get_sb_nodev(), but rather call sget() directly
  */
-static struct super_block *afs_get_sb(struct file_system_type *fs_type,
-				      int flags,
-				      const char *dev_name,
-				      void *options)
+static int afs_get_sb(struct file_system_type *fs_type,
+		      int flags,
+		      const char *dev_name,
+		      void *options,
+		      struct vfsmount *mnt)
 {
 	struct afs_mount_params params;
 	struct super_block *sb;
@@ -311,7 +312,7 @@ static struct super_block *afs_get_sb(struct file_system_type *fs_type,
 	ret = afscm_start();
 	if (ret < 0) {
 		_leave(" = %d", ret);
-		return ERR_PTR(ret);
+		return ret;
 	}
 
 	/* parse the options */
@@ -348,18 +349,19 @@ static struct super_block *afs_get_sb(struct file_system_type *fs_type,
 		goto error;
 	}
 	sb->s_flags |= MS_ACTIVE;
+	simple_set_mnt(mnt, sb);
 
 	afs_put_volume(params.volume);
 	afs_put_cell(params.default_cell);
-	_leave(" = %p", sb);
-	return sb;
+	_leave(" = 0 [%p]", 0, sb);
+	return 0;
 
  error:
 	afs_put_volume(params.volume);
 	afs_put_cell(params.default_cell);
 	afscm_stop();
 	_leave(" = %d", ret);
-	return ERR_PTR(ret);
+	return ret;
 } /* end afs_get_sb() */
 
 /*****************************************************************************/
diff --git a/fs/autofs/init.c b/fs/autofs/init.c
index b977ece69f0..aca12375240 100644
--- a/fs/autofs/init.c
+++ b/fs/autofs/init.c
@@ -14,10 +14,10 @@
 #include <linux/init.h>
 #include "autofs_i.h"
 
-static struct super_block *autofs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int autofs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, autofs_fill_super);
+	return get_sb_nodev(fs_type, flags, data, autofs_fill_super, mnt);
 }
 
 static struct file_system_type autofs_fs_type = {
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index acecec8578c..5d9193332be 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -14,10 +14,10 @@
 #include <linux/init.h>
 #include "autofs_i.h"
 
-static struct super_block *autofs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int autofs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, autofs4_fill_super);
+	return get_sb_nodev(fs_type, flags, data, autofs4_fill_super, mnt);
 }
 
 static struct file_system_type autofs_fs_type = {
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 68ebd10f345..6ed07a5f10c 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -899,11 +899,12 @@ befs_statfs(struct super_block *sb, struct kstatfs *buf)
 	return 0;
 }
 
-static struct super_block *
+static int
 befs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name,
-	    void *data)
+	    void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, befs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, befs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type befs_fs_type = {
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 55a7a78332f..e7da03f63a5 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -410,10 +410,10 @@ out:
 	return -EINVAL;
 }
 
-static struct super_block *bfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int bfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, bfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, bfs_fill_super, mnt);
 }
 
 static struct file_system_type bfs_fs_type = {
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 599f36fd0f6..07a4996cca3 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -739,10 +739,10 @@ static int bm_fill_super(struct super_block * sb, void * data, int silent)
 	return err;
 }
 
-static struct super_block *bm_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int bm_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, bm_fill_super);
+	return get_sb_single(fs_type, flags, data, bm_fill_super, mnt);
 }
 
 static struct linux_binfmt misc_format = {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 44aaba202f7..028d9fb9c2d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -300,10 +300,10 @@ static struct super_operations bdev_sops = {
 	.clear_inode = bdev_clear_inode,
 };
 
-static struct super_block *bd_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int bd_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576);
+	return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt);
 }
 
 static struct file_system_type bd_type = {
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index c262d8874ce..08b35801dfe 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -460,9 +460,9 @@ struct super_operations cifs_super_ops = {
 	.remount_fs = cifs_remount,
 };
 
-static struct super_block *
+static int
 cifs_get_sb(struct file_system_type *fs_type,
-	    int flags, const char *dev_name, void *data)
+	    int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
 	int rc;
 	struct super_block *sb = sget(fs_type, NULL, set_anon_super, NULL);
@@ -470,7 +470,7 @@ cifs_get_sb(struct file_system_type *fs_type,
 	cFYI(1, ("Devname: %s flags: %d ", dev_name, flags));
 
 	if (IS_ERR(sb))
-		return sb;
+		return PTR_ERR(sb);
 
 	sb->s_flags = flags;
 
@@ -478,10 +478,10 @@ cifs_get_sb(struct file_system_type *fs_type,
 	if (rc) {
 		up_write(&sb->s_umount);
 		deactivate_super(sb);
-		return ERR_PTR(rc);
+		return rc;
 	}
 	sb->s_flags |= MS_ACTIVE;
-	return sb;
+	return simple_set_mnt(mnt, sb);
 }
 
 static ssize_t cifs_file_writev(struct file *file, const struct iovec *iov,
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index ada1a81df6b..cba70201567 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -307,10 +307,10 @@ static int coda_statfs(struct super_block *sb, struct kstatfs *buf)
 
 /* init_coda: used by filesystems.c to register coda */
 
-static struct super_block *coda_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int coda_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, coda_fill_super);
+	return get_sb_nodev(fs_type, flags, data, coda_fill_super, mnt);
 }
 
 struct file_system_type coda_fs_type = {
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index f920d30478e..94dab7bdd85 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -103,10 +103,10 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
-static struct super_block *configfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int configfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, configfs_fill_super);
+	return get_sb_single(fs_type, flags, data, configfs_fill_super, mnt);
 }
 
 static struct file_system_type configfs_fs_type = {
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 9efcc3a164e..37a91a153aa 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -528,10 +528,11 @@ static struct super_operations cramfs_ops = {
 	.statfs		= cramfs_statfs,
 };
 
-static struct super_block *cramfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int cramfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, cramfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, cramfs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type cramfs_fs_type = {
diff --git a/fs/dcache.c b/fs/dcache.c
index 59dbc92c207..313b54b2b8f 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -687,46 +687,6 @@ void shrink_dcache_parent(struct dentry * parent)
 		prune_dcache(found, parent->d_sb);
 }
 
-/**
- * shrink_dcache_anon - further prune the cache
- * @head: head of d_hash list of dentries to prune
- *
- * Prune the dentries that are anonymous
- *
- * parsing d_hash list does not hlist_for_each_entry_rcu() as it
- * done under dcache_lock.
- *
- */
-void shrink_dcache_anon(struct super_block *sb)
-{
-	struct hlist_node *lp;
-	struct hlist_head *head = &sb->s_anon;
-	int found;
-	do {
-		found = 0;
-		spin_lock(&dcache_lock);
-		hlist_for_each(lp, head) {
-			struct dentry *this = hlist_entry(lp, struct dentry, d_hash);
-			if (!list_empty(&this->d_lru)) {
-				dentry_stat.nr_unused--;
-				list_del_init(&this->d_lru);
-			}
-
-			/* 
-			 * move only zero ref count dentries to the end 
-			 * of the unused list for prune_dcache
-			 */
-			if (!atomic_read(&this->d_count)) {
-				list_add_tail(&this->d_lru, &dentry_unused);
-				dentry_stat.nr_unused++;
-				found++;
-			}
-		}
-		spin_unlock(&dcache_lock);
-		prune_dcache(found, sb);
-	} while(found);
-}
-
 /*
  * Scan `nr' dentries and return the number which remain.
  *
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index b55b4ea9a67..440128ebef3 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -111,11 +111,11 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent)
 	return simple_fill_super(sb, DEBUGFS_MAGIC, debug_files);
 }
 
-static struct super_block *debug_get_sb(struct file_system_type *fs_type,
-				        int flags, const char *dev_name,
-					void *data)
+static int debug_get_sb(struct file_system_type *fs_type,
+			int flags, const char *dev_name,
+			void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, debug_fill_super);
+	return get_sb_single(fs_type, flags, data, debug_fill_super, mnt);
 }
 
 static struct file_system_type debug_fs_type = {
diff --git a/fs/devfs/base.c b/fs/devfs/base.c
index 52f5059c4f3..51a97f13274 100644
--- a/fs/devfs/base.c
+++ b/fs/devfs/base.c
@@ -2549,11 +2549,11 @@ static int devfs_fill_super(struct super_block *sb, void *data, int silent)
 	return -EINVAL;
 }				/*  End Function devfs_fill_super  */
 
-static struct super_block *devfs_get_sb(struct file_system_type *fs_type,
-					int flags, const char *dev_name,
-					void *data)
+static int devfs_get_sb(struct file_system_type *fs_type,
+			int flags, const char *dev_name,
+			void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, devfs_fill_super);
+	return get_sb_single(fs_type, flags, data, devfs_fill_super, mnt);
 }
 
 static struct file_system_type devfs_fs_type = {
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 14c5620b5ca..f7aef5bb584 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -130,10 +130,10 @@ fail:
 	return -ENOMEM;
 }
 
-static struct super_block *devpts_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int devpts_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, devpts_fill_super);
+	return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt);
 }
 
 static struct file_system_type devpts_fs_type = {
diff --git a/fs/efs/super.c b/fs/efs/super.c
index dff623e3ddb..1ba5e14f879 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -18,10 +18,10 @@
 static int efs_statfs(struct super_block *s, struct kstatfs *buf);
 static int efs_fill_super(struct super_block *s, void *d, int silent);
 
-static struct super_block *efs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int efs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, efs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, efs_fill_super, mnt);
 }
 
 static struct file_system_type efs_fs_type = {
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 2695337d4d6..08e7e6a555c 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -268,9 +268,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		   int maxevents, long timeout);
 static int eventpollfs_delete_dentry(struct dentry *dentry);
 static struct inode *ep_eventpoll_inode(void);
-static struct super_block *eventpollfs_get_sb(struct file_system_type *fs_type,
-					      int flags, const char *dev_name,
-					      void *data);
+static int eventpollfs_get_sb(struct file_system_type *fs_type,
+			      int flags, const char *dev_name,
+			      void *data, struct vfsmount *mnt);
 
 /*
  * This semaphore is used to serialize ep_free() and eventpoll_release_file().
@@ -1595,11 +1595,12 @@ eexit_1:
 }
 
 
-static struct super_block *
+static int
 eventpollfs_get_sb(struct file_system_type *fs_type, int flags,
-		   const char *dev_name, void *data)
+		   const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_pseudo(fs_type, "eventpoll:", NULL, EVENTPOLLFS_MAGIC);
+	return get_sb_pseudo(fs_type, "eventpoll:", NULL, EVENTPOLLFS_MAGIC,
+			     mnt);
 }
 
 
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 7e30bae174e..a4dfffac596 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1087,10 +1087,10 @@ static int ext2_statfs (struct super_block * sb, struct kstatfs * buf)
 	return 0;
 }
 
-static struct super_block *ext2_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int ext2_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super, mnt);
 }
 
 #ifdef CONFIG_QUOTA
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index f8a5266ea1f..657f8e73b62 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2646,10 +2646,10 @@ out:
 
 #endif
 
-static struct super_block *ext3_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int ext3_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super, mnt);
 }
 
 static struct file_system_type ext3_fs_type = {
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index b44c916d24a..d76eeaafbde 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -241,10 +241,11 @@ out:
 /*
  * The usual module blurb.
  */
-static struct super_block *vxfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int vxfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, vxfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, vxfs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type vxfs_fs_type = {
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 7627022446b..c91f0a50aad 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -569,11 +569,11 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	return err;
 }
 
-static struct super_block *fuse_get_sb(struct file_system_type *fs_type,
-				       int flags, const char *dev_name,
-				       void *raw_data)
+static int fuse_get_sb(struct file_system_type *fs_type,
+		       int flags, const char *dev_name,
+		       void *raw_data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super);
+	return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super, mnt);
 }
 
 static struct file_system_type fuse_fs_type = {
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 1181d116117..ee5b80a409e 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -413,10 +413,11 @@ bail:
 	return res;
 }
 
-static struct super_block *hfs_get_sb(struct file_system_type *fs_type,
-				      int flags, const char *dev_name, void *data)
+static int hfs_get_sb(struct file_system_type *fs_type,
+		      int flags, const char *dev_name, void *data,
+		      struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, hfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, hfs_fill_super, mnt);
 }
 
 static struct file_system_type hfs_fs_type = {
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 7843f792a4b..0ed8b7e8e87 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -450,10 +450,12 @@ static void hfsplus_destroy_inode(struct inode *inode)
 
 #define HFSPLUS_INODE_SIZE	sizeof(struct hfsplus_inode_info)
 
-static struct super_block *hfsplus_get_sb(struct file_system_type *fs_type,
-					  int flags, const char *dev_name, void *data)
+static int hfsplus_get_sb(struct file_system_type *fs_type,
+			  int flags, const char *dev_name, void *data,
+			  struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super,
+			   mnt);
 }
 
 static struct file_system_type hfsplus_fs_type = {
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index bf0f8e16e43..04035e08f5c 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -993,11 +993,11 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
 	return(err);
 }
 
-static struct super_block *hostfs_read_sb(struct file_system_type *type,
-					     int flags, const char *dev_name,
-					     void *data)
+static int hostfs_read_sb(struct file_system_type *type,
+			  int flags, const char *dev_name,
+			  void *data, struct vfsmount *mnt)
 {
-	return(get_sb_nodev(type, flags, data, hostfs_fill_sb_common));
+	return get_sb_nodev(type, flags, data, hostfs_fill_sb_common, mnt);
 }
 
 static struct file_system_type hostfs_type = {
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index d72d8c87c99..3b25cf3e2e6 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -662,10 +662,11 @@ bail0:
 	return -EINVAL;
 }
 
-static struct super_block *hpfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int hpfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, hpfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, hpfs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type hpfs_fs_type = {
diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c
index 5e6363be246..ec43c22bc9c 100644
--- a/fs/hppfs/hppfs_kern.c
+++ b/fs/hppfs/hppfs_kern.c
@@ -769,11 +769,11 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
 	return(err);
 }
 
-static struct super_block *hppfs_read_super(struct file_system_type *type,
-					     int flags, const char *dev_name,
-					     void *data)
+static int hppfs_read_super(struct file_system_type *type,
+			    int flags, const char *dev_name,
+			    void *data, struct vfsmount *mnt)
 {
-	return(get_sb_nodev(type, flags, data, hppfs_fill_super));
+	return get_sb_nodev(type, flags, data, hppfs_fill_super, mnt);
 }
 
 static struct file_system_type hppfs_type = {
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 3a5b4e92345..4665c26171f 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -723,10 +723,10 @@ void hugetlb_put_quota(struct address_space *mapping)
 	}
 }
 
-static struct super_block *hugetlbfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int hugetlbfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, hugetlbfs_fill_super);
+	return get_sb_nodev(fs_type, flags, data, hugetlbfs_fill_super, mnt);
 }
 
 static struct file_system_type hugetlbfs_fs_type = {
diff --git a/fs/inotify_user.c b/fs/inotify_user.c
index 9e9931e2bad..f2386442ade 100644
--- a/fs/inotify_user.c
+++ b/fs/inotify_user.c
@@ -672,11 +672,11 @@ out:
 	return ret;
 }
 
-static struct super_block *
+static int
 inotify_get_sb(struct file_system_type *fs_type, int flags,
-	       const char *dev_name, void *data)
+	       const char *dev_name, void *data, struct vfsmount *mnt)
 {
-    return get_sb_pseudo(fs_type, "inotify", NULL, 0xBAD1DEA);
+	return get_sb_pseudo(fs_type, "inotify", NULL, 0xBAD1DEA, mnt);
 }
 
 static struct file_system_type inotify_fs_type = {
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 70adbb98bad..17268da63a4 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1399,10 +1399,11 @@ struct inode *isofs_iget(struct super_block *sb,
 	return inode;
 }
 
-static struct super_block *isofs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int isofs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, isofs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, isofs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type iso9660_fs_type = {
diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c
index 020cc097c53..dd93a091ad6 100644
--- a/fs/jffs/inode-v23.c
+++ b/fs/jffs/inode-v23.c
@@ -1785,10 +1785,11 @@ static struct super_operations jffs_ops =
 	.remount_fs	= jffs_remount,
 };
 
-static struct super_block *jffs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int jffs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, jffs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, jffs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type jffs_fs_type = {
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 9d0521451f5..2378a662c25 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -111,9 +111,10 @@ static int jffs2_sb_set(struct super_block *sb, void *data)
 	return 0;
 }
 
-static struct super_block *jffs2_get_sb_mtd(struct file_system_type *fs_type,
-					      int flags, const char *dev_name,
-					      void *data, struct mtd_info *mtd)
+static int jffs2_get_sb_mtd(struct file_system_type *fs_type,
+			    int flags, const char *dev_name,
+			    void *data, struct mtd_info *mtd,
+			    struct vfsmount *mnt)
 {
 	struct super_block *sb;
 	struct jffs2_sb_info *c;
@@ -121,19 +122,20 @@ static struct super_block *jffs2_get_sb_mtd(struct file_system_type *fs_type,
 
 	c = kmalloc(sizeof(*c), GFP_KERNEL);
 	if (!c)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 	memset(c, 0, sizeof(*c));
 	c->mtd = mtd;
 
 	sb = sget(fs_type, jffs2_sb_compare, jffs2_sb_set, c);
 
 	if (IS_ERR(sb))
-		goto out_put;
+		goto out_error;
 
 	if (sb->s_root) {
 		/* New mountpoint for JFFS2 which is already mounted */
 		D1(printk(KERN_DEBUG "jffs2_get_sb_mtd(): Device %d (\"%s\") is already mounted\n",
 			  mtd->index, mtd->name));
+		ret = simple_set_mnt(mnt, sb);
 		goto out_put;
 	}
 
@@ -161,44 +163,47 @@ static struct super_block *jffs2_get_sb_mtd(struct file_system_type *fs_type,
 		/* Failure case... */
 		up_write(&sb->s_umount);
 		deactivate_super(sb);
-		return ERR_PTR(ret);
+		return ret;
 	}
 
 	sb->s_flags |= MS_ACTIVE;
-	return sb;
+	return simple_set_mnt(mnt, sb);
 
+out_error:
+	ret = PTR_ERR(sb);
  out_put:
 	kfree(c);
 	put_mtd_device(mtd);
 
-	return sb;
+	return ret;
 }
 
-static struct super_block *jffs2_get_sb_mtdnr(struct file_system_type *fs_type,
-					      int flags, const char *dev_name,
-					      void *data, int mtdnr)
+static int jffs2_get_sb_mtdnr(struct file_system_type *fs_type,
+			      int flags, const char *dev_name,
+			      void *data, int mtdnr,
+			      struct vfsmount *mnt)
 {
 	struct mtd_info *mtd;
 
 	mtd = get_mtd_device(NULL, mtdnr);
 	if (!mtd) {
 		D1(printk(KERN_DEBUG "jffs2: MTD device #%u doesn't appear to exist\n", mtdnr));
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
 
-	return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd);
+	return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd, mnt);
 }
 
-static struct super_block *jffs2_get_sb(struct file_system_type *fs_type,
-					int flags, const char *dev_name,
-					void *data)
+static int jffs2_get_sb(struct file_system_type *fs_type,
+			int flags, const char *dev_name,
+			void *data, struct vfsmount *mnt)
 {
 	int err;
 	struct nameidata nd;
 	int mtdnr;
 
 	if (!dev_name)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	D1(printk(KERN_DEBUG "jffs2_get_sb(): dev_name \"%s\"\n", dev_name));
 
@@ -220,7 +225,7 @@ static struct super_block *jffs2_get_sb(struct file_system_type *fs_type,
 				mtd = get_mtd_device(NULL, mtdnr);
 				if (mtd) {
 					if (!strcmp(mtd->name, dev_name+4))
-						return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd);
+						return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd, mnt);
 					put_mtd_device(mtd);
 				}
 			}
@@ -233,7 +238,7 @@ static struct super_block *jffs2_get_sb(struct file_system_type *fs_type,
 			if (!*endptr) {
 				/* It was a valid number */
 				D1(printk(KERN_DEBUG "jffs2_get_sb(): mtd%%d, mtdnr %d\n", mtdnr));
-				return jffs2_get_sb_mtdnr(fs_type, flags, dev_name, data, mtdnr);
+				return jffs2_get_sb_mtdnr(fs_type, flags, dev_name, data, mtdnr, mnt);
 			}
 		}
 	}
@@ -247,7 +252,7 @@ static struct super_block *jffs2_get_sb(struct file_system_type *fs_type,
 		  err, nd.dentry->d_inode));
 
 	if (err)
-		return ERR_PTR(err);
+		return err;
 
 	err = -EINVAL;
 
@@ -269,11 +274,11 @@ static struct super_block *jffs2_get_sb(struct file_system_type *fs_type,
 	mtdnr = iminor(nd.dentry->d_inode);
 	path_release(&nd);
 
-	return jffs2_get_sb_mtdnr(fs_type, flags, dev_name, data, mtdnr);
+	return jffs2_get_sb_mtdnr(fs_type, flags, dev_name, data, mtdnr, mnt);
 
 out:
 	path_release(&nd);
-	return ERR_PTR(err);
+	return err;
 }
 
 static void jffs2_put_super (struct super_block *sb)
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index db6f41d6dd6..18a28137b90 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -565,10 +565,11 @@ static void jfs_unlockfs(struct super_block *sb)
 	}
 }
 
-static struct super_block *jfs_get_sb(struct file_system_type *fs_type, 
-	int flags, const char *dev_name, void *data)
+static int jfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, jfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, jfs_fill_super,
+			   mnt);
 }
 
 static int jfs_sync_fs(struct super_block *sb, int wait)
diff --git a/fs/libfs.c b/fs/libfs.c
index 7145ba7a48d..7d70efa46da 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -196,9 +196,9 @@ struct inode_operations simple_dir_inode_operations = {
  * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
  * will never be mountable)
  */
-struct super_block *
-get_sb_pseudo(struct file_system_type *fs_type, char *name,
-	struct super_operations *ops, unsigned long magic)
+int get_sb_pseudo(struct file_system_type *fs_type, char *name,
+	struct super_operations *ops, unsigned long magic,
+	struct vfsmount *mnt)
 {
 	struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
 	static struct super_operations default_ops = {.statfs = simple_statfs};
@@ -207,7 +207,7 @@ get_sb_pseudo(struct file_system_type *fs_type, char *name,
 	struct qstr d_name = {.name = name, .len = strlen(name)};
 
 	if (IS_ERR(s))
-		return s;
+		return PTR_ERR(s);
 
 	s->s_flags = MS_NOUSER;
 	s->s_maxbytes = ~0ULL;
@@ -232,12 +232,12 @@ get_sb_pseudo(struct file_system_type *fs_type, char *name,
 	d_instantiate(dentry, root);
 	s->s_root = dentry;
 	s->s_flags |= MS_ACTIVE;
-	return s;
+	return simple_set_mnt(mnt, s);
 
 Enomem:
 	up_write(&s->s_umount);
 	deactivate_super(s);
-	return ERR_PTR(-ENOMEM);
+	return -ENOMEM;
 }
 
 int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 2dcccf1d1b7..14f24dfbfe3 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -559,10 +559,11 @@ void minix_truncate(struct inode * inode)
 		V2_minix_truncate(inode);
 }
 
-static struct super_block *minix_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int minix_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, minix_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, minix_fill_super,
+			   mnt);
 }
 
 static struct file_system_type minix_fs_type = {
diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c
index 5b76ccd19e3..9e44158a754 100644
--- a/fs/msdos/namei.c
+++ b/fs/msdos/namei.c
@@ -661,11 +661,12 @@ static int msdos_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
-static struct super_block *msdos_get_sb(struct file_system_type *fs_type,
-					int flags, const char *dev_name,
-					void *data)
+static int msdos_get_sb(struct file_system_type *fs_type,
+			int flags, const char *dev_name,
+			void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, msdos_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, msdos_fill_super,
+			   mnt);
 }
 
 static struct file_system_type msdos_fs_type = {
diff --git a/fs/namespace.c b/fs/namespace.c
index bf478addb85..c13072a5f1e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -86,6 +86,15 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 	return mnt;
 }
 
+int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
+{
+	mnt->mnt_sb = sb;
+	mnt->mnt_root = dget(sb->s_root);
+	return 0;
+}
+
+EXPORT_SYMBOL(simple_set_mnt);
+
 void free_vfsmnt(struct vfsmount *mnt)
 {
 	kfree(mnt->mnt_devname);
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index a1f3e972c6e..8db033fab3f 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -957,10 +957,10 @@ out:
 	return result;
 }
 
-static struct super_block *ncp_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int ncp_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, ncp_fill_super);
+	return get_sb_nodev(fs_type, flags, data, ncp_fill_super, mnt);
 }
 
 static struct file_system_type ncp_fs_type = {
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index d0b991a9232..ff645a961bc 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1690,8 +1690,8 @@ static int nfs_compare_super(struct super_block *sb, void *data)
 	return !nfs_compare_fh(&old->fh, &server->fh);
 }
 
-static struct super_block *nfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data)
+static int nfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
 {
 	int error;
 	struct nfs_server *server = NULL;
@@ -1699,14 +1699,14 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type,
 	struct nfs_fh *root;
 	struct nfs_mount_data *data = raw_data;
 
-	s = ERR_PTR(-EINVAL);
+	error = -EINVAL;
 	if (data == NULL) {
 		dprintk("%s: missing data argument\n", __FUNCTION__);
-		goto out_err;
+		goto out_err_noserver;
 	}
 	if (data->version <= 0 || data->version > NFS_MOUNT_VERSION) {
 		dprintk("%s: bad mount version\n", __FUNCTION__);
-		goto out_err;
+		goto out_err_noserver;
 	}
 	switch (data->version) {
 		case 1:
@@ -1718,7 +1718,7 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type,
 				dprintk("%s: mount structure version %d does not support NFSv3\n",
 						__FUNCTION__,
 						data->version);
-				goto out_err;
+				goto out_err_noserver;
 			}
 			data->root.size = NFS2_FHSIZE;
 			memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE);
@@ -1727,24 +1727,24 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type,
 				dprintk("%s: mount structure version %d does not support strong security\n",
 						__FUNCTION__,
 						data->version);
-				goto out_err;
+				goto out_err_noserver;
 			}
 		case 5:
 			memset(data->context, 0, sizeof(data->context));
 	}
 #ifndef CONFIG_NFS_V3
 	/* If NFSv3 is not compiled in, return -EPROTONOSUPPORT */
-	s = ERR_PTR(-EPROTONOSUPPORT);
+	error = -EPROTONOSUPPORT;
 	if (data->flags & NFS_MOUNT_VER3) {
 		dprintk("%s: NFSv3 not compiled into kernel\n", __FUNCTION__);
-		goto out_err;
+		goto out_err_noserver;
 	}
 #endif /* CONFIG_NFS_V3 */
 
-	s = ERR_PTR(-ENOMEM);
+	error = -ENOMEM;
 	server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
 	if (!server)
-		goto out_err;
+		goto out_err_noserver;
 	/* Zero out the NFS state stuff */
 	init_nfsv4_state(server);
 	server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
@@ -1754,7 +1754,7 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type,
 		root->size = data->root.size;
 	else
 		root->size = NFS2_FHSIZE;
-	s = ERR_PTR(-EINVAL);
+	error = -EINVAL;
 	if (root->size > sizeof(root->data)) {
 		dprintk("%s: invalid root filehandle\n", __FUNCTION__);
 		goto out_err;
@@ -1770,15 +1770,20 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type,
 	}
 
 	/* Fire up rpciod if not yet running */
-	s = ERR_PTR(rpciod_up());
-	if (IS_ERR(s)) {
-		dprintk("%s: couldn't start rpciod! Error = %ld\n",
-				__FUNCTION__, PTR_ERR(s));
+	error = rpciod_up();
+	if (error < 0) {
+		dprintk("%s: couldn't start rpciod! Error = %d\n",
+				__FUNCTION__, error);
 		goto out_err;
 	}
 
 	s = sget(fs_type, nfs_compare_super, nfs_set_super, server);
-	if (IS_ERR(s) || s->s_root)
+	if (IS_ERR(s)) {
+		error = PTR_ERR(s);
+		goto out_err_rpciod;
+	}
+
+	if (s->s_root)
 		goto out_rpciod_down;
 
 	s->s_flags = flags;
@@ -1787,15 +1792,22 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type,
 	if (error) {
 		up_write(&s->s_umount);
 		deactivate_super(s);
-		return ERR_PTR(error);
+		return error;
 	}
 	s->s_flags |= MS_ACTIVE;
-	return s;
+	return simple_set_mnt(mnt, s);
+
 out_rpciod_down:
 	rpciod_down();
+	kfree(server);
+	return simple_set_mnt(mnt, s);
+
+out_err_rpciod:
+	rpciod_down();
 out_err:
 	kfree(server);
-	return s;
+out_err_noserver:
+	return error;
 }
 
 static void nfs_kill_super(struct super_block *s)
@@ -2032,8 +2044,8 @@ nfs_copy_user_string(char *dst, struct nfs_string *src, int maxlen)
 	return dst;
 }
 
-static struct super_block *nfs4_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *raw_data)
+static int nfs4_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
 {
 	int error;
 	struct nfs_server *server;
@@ -2043,16 +2055,16 @@ static struct super_block *nfs4_get_sb(struct file_system_type *fs_type,
 
 	if (data == NULL) {
 		dprintk("%s: missing data argument\n", __FUNCTION__);
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
 	if (data->version <= 0 || data->version > NFS4_MOUNT_VERSION) {
 		dprintk("%s: bad mount version\n", __FUNCTION__);
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
 
 	server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL);
 	if (!server)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 	/* Zero out the NFS state stuff */
 	init_nfsv4_state(server);
 	server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
@@ -2074,33 +2086,41 @@ static struct super_block *nfs4_get_sb(struct file_system_type *fs_type,
 
 	/* We now require that the mount process passes the remote address */
 	if (data->host_addrlen != sizeof(server->addr)) {
-		s = ERR_PTR(-EINVAL);
+		error = -EINVAL;
 		goto out_free;
 	}
 	if (copy_from_user(&server->addr, data->host_addr, sizeof(server->addr))) {
-		s = ERR_PTR(-EFAULT);
+		error = -EFAULT;
 		goto out_free;
 	}
 	if (server->addr.sin_family != AF_INET ||
 	    server->addr.sin_addr.s_addr == INADDR_ANY) {
 		dprintk("%s: mount program didn't pass remote IP address!\n",
 				__FUNCTION__);
-		s = ERR_PTR(-EINVAL);
+		error = -EINVAL;
 		goto out_free;
 	}
 
 	/* Fire up rpciod if not yet running */
-	s = ERR_PTR(rpciod_up());
-	if (IS_ERR(s)) {
-		dprintk("%s: couldn't start rpciod! Error = %ld\n",
-				__FUNCTION__, PTR_ERR(s));
+	error = rpciod_up();
+	if (error < 0) {
+		dprintk("%s: couldn't start rpciod! Error = %d\n",
+				__FUNCTION__, error);
 		goto out_free;
 	}
 
 	s = sget(fs_type, nfs4_compare_super, nfs_set_super, server);
-
-	if (IS_ERR(s) || s->s_root)
+	if (IS_ERR(s)) {
+		error = PTR_ERR(s);
 		goto out_free;
+	}
+
+	if (s->s_root) {
+		kfree(server->mnt_path);
+		kfree(server->hostname);
+		kfree(server);
+		return simple_set_mnt(mnt, s);
+	}
 
 	s->s_flags = flags;
 
@@ -2108,17 +2128,17 @@ static struct super_block *nfs4_get_sb(struct file_system_type *fs_type,
 	if (error) {
 		up_write(&s->s_umount);
 		deactivate_super(s);
-		return ERR_PTR(error);
+		return error;
 	}
 	s->s_flags |= MS_ACTIVE;
-	return s;
+	return simple_set_mnt(mnt, s);
 out_err:
-	s = (struct super_block *)p;
+	error = PTR_ERR(p);
 out_free:
 	kfree(server->mnt_path);
 	kfree(server->hostname);
 	kfree(server);
-	return s;
+	return error;
 }
 
 static void nfs4_kill_super(struct super_block *sb)
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 3ef017b3b5b..a1810e6a93e 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -494,10 +494,10 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 	return simple_fill_super(sb, 0x6e667364, nfsd_files);
 }
 
-static struct super_block *nfsd_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int nfsd_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, nfsd_fill_super);
+	return get_sb_single(fs_type, flags, data, nfsd_fill_super, mnt);
 }
 
 static struct file_system_type nfsd_fs_type = {
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 27833f6df49..d5d5e969294 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -3093,10 +3093,11 @@ struct kmem_cache *ntfs_index_ctx_cache;
 /* Driver wide mutex. */
 DEFINE_MUTEX(ntfs_lock);
 
-static struct super_block *ntfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int ntfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ntfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, ntfs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type ntfs_fs_type = {
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 7e88e24b347..7273d9fa6ba 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -574,10 +574,10 @@ static struct inode_operations dlmfs_file_inode_operations = {
 	.getattr	= simple_getattr,
 };
 
-static struct super_block *dlmfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int dlmfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super);
+	return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super, mnt);
 }
 
 static struct file_system_type dlmfs_fs_type = {
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 949b3dac30f..788b8b50dc4 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -672,12 +672,14 @@ read_super_error:
 	return status;
 }
 
-static struct super_block *ocfs2_get_sb(struct file_system_type *fs_type,
-					int flags,
-					const char *dev_name,
-					void *data)
+static int ocfs2_get_sb(struct file_system_type *fs_type,
+			int flags,
+			const char *dev_name,
+			void *data,
+			struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super,
+			   mnt);
 }
 
 static struct file_system_type ocfs2_fs_type = {
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 0f14276a2e5..464e2bce020 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -1054,10 +1054,10 @@ out_no_root:
 	return -ENOMEM;
 }
 
-static struct super_block *openprom_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int openprom_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, openprom_fill_super);
+	return get_sb_single(fs_type, flags, data, openprom_fill_super, mnt);
 }
 
 static struct file_system_type openprom_fs_type = {
diff --git a/fs/pipe.c b/fs/pipe.c
index 5acd8954aaa..20352573e02 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -979,12 +979,11 @@ no_files:
  * any operations on the root directory. However, we need a non-trivial
  * d_name - pipe: will go nicely and kill the special-casing in procfs.
  */
-
-static struct super_block *
-pipefs_get_sb(struct file_system_type *fs_type, int flags,
-	      const char *dev_name, void *data)
+static int pipefs_get_sb(struct file_system_type *fs_type,
+			 int flags, const char *dev_name, void *data,
+			 struct vfsmount *mnt)
 {
-	return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC);
+	return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC, mnt);
 }
 
 static struct file_system_type pipe_fs_type = {
diff --git a/fs/proc/root.c b/fs/proc/root.c
index c3fd3611112..9995356ce73 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -26,10 +26,10 @@ struct proc_dir_entry *proc_net, *proc_net_stat, *proc_bus, *proc_root_fs, *proc
 struct proc_dir_entry *proc_sys_root;
 #endif
 
-static struct super_block *proc_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int proc_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, proc_fill_super);
+	return get_sb_single(fs_type, flags, data, proc_fill_super, mnt);
 }
 
 static struct file_system_type proc_fs_type = {
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 2ecd46f85e9..e6cca5cd4b4 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -561,10 +561,11 @@ static void destroy_inodecache(void)
 		       "qnx4_inode_cache: not all structures were freed\n");
 }
 
-static struct super_block *qnx4_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int qnx4_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, qnx4_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, qnx4_fill_super,
+			   mnt);
 }
 
 static struct file_system_type qnx4_fs_type = {
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 14bd2246fb6..b9677335cc8 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -185,16 +185,17 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
 	return 0;
 }
 
-struct super_block *ramfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+int ramfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, ramfs_fill_super);
+	return get_sb_nodev(fs_type, flags, data, ramfs_fill_super, mnt);
 }
 
-static struct super_block *rootfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int rootfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super);
+	return get_sb_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super,
+			    mnt);
 }
 
 static struct file_system_type ramfs_fs_type = {
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index cae2abbc0c7..f3ff41d3398 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2249,11 +2249,12 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
 
 #endif
 
-static struct super_block *get_super_block(struct file_system_type *fs_type,
-					   int flags, const char *dev_name,
-					   void *data)
+static int get_super_block(struct file_system_type *fs_type,
+			   int flags, const char *dev_name,
+			   void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super,
+			   mnt);
 }
 
 static int __init init_reiserfs_fs(void)
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index 9b9eda7b335..4d6cd662106 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -607,10 +607,11 @@ static struct super_operations romfs_ops = {
 	.remount_fs	= romfs_remount,
 };
 
-static struct super_block *romfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int romfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, romfs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, romfs_fill_super,
+			   mnt);
 }
 
 static struct file_system_type romfs_fs_type = {
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index fdeabc0a34f..4a37c2bbfa3 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -782,10 +782,10 @@ out:
 	return error;
 }
 
-static struct super_block *smb_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int smb_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_nodev(fs_type, flags, data, smb_fill_super);
+	return get_sb_nodev(fs_type, flags, data, smb_fill_super, mnt);
 }
 
 static struct file_system_type smb_fs_type = {
diff --git a/fs/super.c b/fs/super.c
index 9d5c2add722..324c2d232f5 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -231,7 +231,7 @@ void generic_shutdown_super(struct super_block *sb)
 	if (root) {
 		sb->s_root = NULL;
 		shrink_dcache_parent(root);
-		shrink_dcache_anon(sb);
+		shrink_dcache_sb(sb);
 		dput(root);
 		fsync_super(sb);
 		lock_super(sb);
@@ -676,9 +676,10 @@ static void bdev_uevent(struct block_device *bdev, enum kobject_action action)
 	}
 }
 
-struct super_block *get_sb_bdev(struct file_system_type *fs_type,
+int get_sb_bdev(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data,
-	int (*fill_super)(struct super_block *, void *, int))
+	int (*fill_super)(struct super_block *, void *, int),
+	struct vfsmount *mnt)
 {
 	struct block_device *bdev;
 	struct super_block *s;
@@ -686,7 +687,7 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type,
 
 	bdev = open_bdev_excl(dev_name, flags, fs_type);
 	if (IS_ERR(bdev))
-		return (struct super_block *)bdev;
+		return PTR_ERR(bdev);
 
 	/*
 	 * once the super is inserted into the list by sget, s_umount
@@ -697,15 +698,17 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type,
 	s = sget(fs_type, test_bdev_super, set_bdev_super, bdev);
 	mutex_unlock(&bdev->bd_mount_mutex);
 	if (IS_ERR(s))
-		goto out;
+		goto error_s;
 
 	if (s->s_root) {
 		if ((flags ^ s->s_flags) & MS_RDONLY) {
 			up_write(&s->s_umount);
 			deactivate_super(s);
-			s = ERR_PTR(-EBUSY);
+			error = -EBUSY;
+			goto error_bdev;
 		}
-		goto out;
+
+		close_bdev_excl(bdev);
 	} else {
 		char b[BDEVNAME_SIZE];
 
@@ -716,18 +719,21 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type,
 		if (error) {
 			up_write(&s->s_umount);
 			deactivate_super(s);
-			s = ERR_PTR(error);
-		} else {
-			s->s_flags |= MS_ACTIVE;
-			bdev_uevent(bdev, KOBJ_MOUNT);
+			goto error;
 		}
+
+		s->s_flags |= MS_ACTIVE;
+		bdev_uevent(bdev, KOBJ_MOUNT);
 	}
 
-	return s;
+	return simple_set_mnt(mnt, s);
 
-out:
+error_s:
+	error = PTR_ERR(s);
+error_bdev:
 	close_bdev_excl(bdev);
-	return s;
+error:
+	return error;
 }
 
 EXPORT_SYMBOL(get_sb_bdev);
@@ -744,15 +750,16 @@ void kill_block_super(struct super_block *sb)
 
 EXPORT_SYMBOL(kill_block_super);
 
-struct super_block *get_sb_nodev(struct file_system_type *fs_type,
+int get_sb_nodev(struct file_system_type *fs_type,
 	int flags, void *data,
-	int (*fill_super)(struct super_block *, void *, int))
+	int (*fill_super)(struct super_block *, void *, int),
+	struct vfsmount *mnt)
 {
 	int error;
 	struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL);
 
 	if (IS_ERR(s))
-		return s;
+		return PTR_ERR(s);
 
 	s->s_flags = flags;
 
@@ -760,10 +767,10 @@ struct super_block *get_sb_nodev(struct file_system_type *fs_type,
 	if (error) {
 		up_write(&s->s_umount);
 		deactivate_super(s);
-		return ERR_PTR(error);
+		return error;
 	}
 	s->s_flags |= MS_ACTIVE;
-	return s;
+	return simple_set_mnt(mnt, s);
 }
 
 EXPORT_SYMBOL(get_sb_nodev);
@@ -773,94 +780,102 @@ static int compare_single(struct super_block *s, void *p)
 	return 1;
 }
 
-struct super_block *get_sb_single(struct file_system_type *fs_type,
+int get_sb_single(struct file_system_type *fs_type,
 	int flags, void *data,
-	int (*fill_super)(struct super_block *, void *, int))
+	int (*fill_super)(struct super_block *, void *, int),
+	struct vfsmount *mnt)
 {
 	struct super_block *s;
 	int error;
 
 	s = sget(fs_type, compare_single, set_anon_super, NULL);
 	if (IS_ERR(s))
-		return s;
+		return PTR_ERR(s);
 	if (!s->s_root) {
 		s->s_flags = flags;
 		error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
 		if (error) {
 			up_write(&s->s_umount);
 			deactivate_super(s);
-			return ERR_PTR(error);
+			return error;
 		}
 		s->s_flags |= MS_ACTIVE;
 	}
 	do_remount_sb(s, flags, data, 0);
-	return s;
+	return simple_set_mnt(mnt, s);
 }
 
 EXPORT_SYMBOL(get_sb_single);
 
 struct vfsmount *
-do_kern_mount(const char *fstype, int flags, const char *name, void *data)
+vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
 {
-	struct file_system_type *type = get_fs_type(fstype);
-	struct super_block *sb = ERR_PTR(-ENOMEM);
 	struct vfsmount *mnt;
-	int error;
 	char *secdata = NULL;
+	int error;
 
 	if (!type)
 		return ERR_PTR(-ENODEV);
 
+	error = -ENOMEM;
 	mnt = alloc_vfsmnt(name);
 	if (!mnt)
 		goto out;
 
 	if (data) {
 		secdata = alloc_secdata();
-		if (!secdata) {
-			sb = ERR_PTR(-ENOMEM);
+		if (!secdata)
 			goto out_mnt;
-		}
 
 		error = security_sb_copy_data(type, data, secdata);
-		if (error) {
-			sb = ERR_PTR(error);
+		if (error)
 			goto out_free_secdata;
-		}
 	}
 
-	sb = type->get_sb(type, flags, name, data);
-	if (IS_ERR(sb))
+	error = type->get_sb(type, flags, name, data, mnt);
+	if (error < 0)
 		goto out_free_secdata;
- 	error = security_sb_kern_mount(sb, secdata);
+
+ 	error = security_sb_kern_mount(mnt->mnt_sb, secdata);
  	if (error)
  		goto out_sb;
-	mnt->mnt_sb = sb;
-	mnt->mnt_root = dget(sb->s_root);
-	mnt->mnt_mountpoint = sb->s_root;
+
+	mnt->mnt_mountpoint = mnt->mnt_root;
 	mnt->mnt_parent = mnt;
-	up_write(&sb->s_umount);
+	up_write(&mnt->mnt_sb->s_umount);
 	free_secdata(secdata);
-	put_filesystem(type);
 	return mnt;
 out_sb:
-	up_write(&sb->s_umount);
-	deactivate_super(sb);
-	sb = ERR_PTR(error);
+	dput(mnt->mnt_root);
+	up_write(&mnt->mnt_sb->s_umount);
+	deactivate_super(mnt->mnt_sb);
 out_free_secdata:
 	free_secdata(secdata);
 out_mnt:
 	free_vfsmnt(mnt);
 out:
+	return ERR_PTR(error);
+}
+
+EXPORT_SYMBOL_GPL(vfs_kern_mount);
+
+struct vfsmount *
+do_kern_mount(const char *fstype, int flags, const char *name, void *data)
+{
+	struct file_system_type *type = get_fs_type(fstype);
+	struct vfsmount *mnt;
+	if (!type)
+		return ERR_PTR(-ENODEV);
+	mnt = vfs_kern_mount(type, flags, name, data);
 	put_filesystem(type);
-	return (struct vfsmount *)sb;
+	return mnt;
 }
 
 EXPORT_SYMBOL_GPL(do_kern_mount);
 
 struct vfsmount *kern_mount(struct file_system_type *type)
 {
-	return do_kern_mount(type->name, 0, type->name, NULL);
+	return vfs_kern_mount(type, 0, type->name, NULL);
 }
 
 EXPORT_SYMBOL(kern_mount);
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index f1117e885bd..40190c48927 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -66,10 +66,10 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
-static struct super_block *sysfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int sysfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, sysfs_fill_super);
+	return get_sb_single(fs_type, flags, data, sysfs_fill_super, mnt);
 }
 
 static struct file_system_type sysfs_fs_type = {
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index e92b991e6dd..876639b9332 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -506,16 +506,17 @@ failed:
 
 /* Every kernel module contains stuff like this. */
 
-static struct super_block *sysv_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int sysv_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, sysv_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, sysv_fill_super,
+			   mnt);
 }
 
-static struct super_block *v7_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int v7_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, v7_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, v7_fill_super, mnt);
 }
 
 static struct file_system_type sysv_fs_type = {
diff --git a/fs/udf/super.c b/fs/udf/super.c
index e45789fe38e..2250774a831 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -94,10 +94,10 @@ static unsigned int udf_count_free(struct super_block *);
 static int udf_statfs(struct super_block *, struct kstatfs *);
 
 /* UDF filesystem type */
-static struct super_block *udf_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int udf_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, udf_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, udf_fill_super, mnt);
 }
 
 static struct file_system_type udf_fstype = {
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index db98a4c71e6..768fb8d9e67 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1311,10 +1311,10 @@ out:
 
 #endif
 
-static struct super_block *ufs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+static int ufs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ufs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, ufs_fill_super, mnt);
 }
 
 static struct file_system_type ufs_fs_type = {
diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c
index a56cec3be5f..9a8f48bae95 100644
--- a/fs/vfat/namei.c
+++ b/fs/vfat/namei.c
@@ -1023,11 +1023,12 @@ static int vfat_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;
 }
 
-static struct super_block *vfat_get_sb(struct file_system_type *fs_type,
-				       int flags, const char *dev_name,
-				       void *data)
+static int vfat_get_sb(struct file_system_type *fs_type,
+		       int flags, const char *dev_name,
+		       void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, vfat_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, vfat_fill_super,
+			   mnt);
 }
 
 static struct file_system_type vfat_fs_type = {
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index f2a0778536f..d03c89a3665 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -853,14 +853,16 @@ fail_vfsop:
 	return -error;
 }
 
-STATIC struct super_block *
+STATIC int
 xfs_fs_get_sb(
 	struct file_system_type	*fs_type,
 	int			flags,
 	const char		*dev_name,
-	void			*data)
+	void			*data,
+	struct vfsmount		*mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super,
+			   mnt);
 }
 
 STATIC struct super_operations xfs_super_operations = {
-- 
cgit v1.2.3


From 726c334223180e3c0197cc980a432681370d4baf Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 23 Jun 2006 02:02:58 -0700
Subject: [PATCH] VFS: Permit filesystem to perform statfs with a known root
 dentry

Give the statfs superblock operation a dentry pointer rather than a superblock
pointer.

This complements the get_sb() patch.  That reduced the significance of
sb->s_root, allowing NFS to place a fake root there.  However, NFS does
require a dentry to use as a target for the statfs operation.  This permits
the root in the vfsmount to be used instead.

linux/mount.h has been added where necessary to make allyesconfig build
successfully.

Interest has also been expressed for use with the FUSE and XFS filesystems.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Nathan Scott <nathans@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/adfs/super.c              |  8 ++++----
 fs/affs/super.c              |  5 +++--
 fs/befs/linuxvfs.c           |  5 +++--
 fs/bfs/inode.c               |  3 ++-
 fs/cifs/cifsfs.c             |  3 ++-
 fs/coda/inode.c              |  6 +++---
 fs/coda/upcall.c             |  4 ++--
 fs/compat.c                  |  8 ++++----
 fs/cramfs/inode.c            |  4 +++-
 fs/efs/super.c               |  6 +++---
 fs/ext2/super.c              |  5 +++--
 fs/ext3/super.c              |  5 +++--
 fs/fat/inode.c               |  8 ++++----
 fs/freevxfs/vxfs_super.c     | 13 +++++++------
 fs/fuse/inode.c              |  3 ++-
 fs/hfs/super.c               |  4 +++-
 fs/hfsplus/super.c           |  4 +++-
 fs/hostfs/hostfs_kern.c      |  4 ++--
 fs/hpfs/super.c              |  3 ++-
 fs/hppfs/hppfs_kern.c        |  2 +-
 fs/hugetlbfs/inode.c         |  4 ++--
 fs/isofs/inode.c             |  6 ++++--
 fs/jffs/inode-v23.c          |  4 ++--
 fs/jffs2/fs.c                |  4 ++--
 fs/jffs2/os-linux.h          |  2 +-
 fs/jfs/super.c               |  4 ++--
 fs/libfs.c                   |  4 ++--
 fs/minix/inode.c             | 10 +++++-----
 fs/ncpfs/inode.c             |  5 +++--
 fs/nfs/inode.c               |  5 +++--
 fs/nfsd/nfs4xdr.c            |  2 +-
 fs/nfsd/vfs.c                |  2 +-
 fs/ntfs/super.c              |  7 ++++---
 fs/ocfs2/super.c             | 10 +++++-----
 fs/open.c                    | 26 +++++++++++++-------------
 fs/qnx4/inode.c              |  6 ++++--
 fs/reiserfs/super.c          |  8 ++++----
 fs/romfs/inode.c             |  4 ++--
 fs/smbfs/inode.c             |  6 +++---
 fs/smbfs/proc.c              |  4 ++--
 fs/smbfs/proto.h             |  2 +-
 fs/super.c                   |  2 +-
 fs/sysv/inode.c              |  3 ++-
 fs/udf/super.c               |  6 ++++--
 fs/ufs/super.c               |  3 ++-
 fs/xfs/linux-2.6/xfs_super.c |  4 ++--
 46 files changed, 136 insertions(+), 110 deletions(-)

(limited to 'fs')

diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 1b58a9b7f6a..ba1c88af49f 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -196,17 +196,17 @@ static int adfs_remount(struct super_block *sb, int *flags, char *data)
 	return parse_options(sb, data);
 }
 
-static int adfs_statfs(struct super_block *sb, struct kstatfs *buf)
+static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct adfs_sb_info *asb = ADFS_SB(sb);
+	struct adfs_sb_info *asb = ADFS_SB(dentry->d_sb);
 
 	buf->f_type    = ADFS_SUPER_MAGIC;
 	buf->f_namelen = asb->s_namelen;
-	buf->f_bsize   = sb->s_blocksize;
+	buf->f_bsize   = dentry->d_sb->s_blocksize;
 	buf->f_blocks  = asb->s_size;
 	buf->f_files   = asb->s_ids_per_zone * asb->s_map_size;
 	buf->f_bavail  =
-	buf->f_bfree   = adfs_map_free(sb);
+	buf->f_bfree   = adfs_map_free(dentry->d_sb);
 	buf->f_ffree   = (long)(buf->f_bfree * buf->f_files) / (long)buf->f_blocks;
 
 	return 0;
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 6a52e787540..8765cba35bb 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -18,7 +18,7 @@
 
 extern struct timezone sys_tz;
 
-static int affs_statfs(struct super_block *sb, struct kstatfs *buf);
+static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int affs_remount (struct super_block *sb, int *flags, char *data);
 
 static void
@@ -508,8 +508,9 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 }
 
 static int
-affs_statfs(struct super_block *sb, struct kstatfs *buf)
+affs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
 	int		 free;
 
 	pr_debug("AFFS: statfs() partsize=%d, reserved=%d\n",AFFS_SB(sb)->s_partition_size,
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 6ed07a5f10c..08201fab26c 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -49,7 +49,7 @@ static int befs_nls2utf(struct super_block *sb, const char *in, int in_len,
 			char **out, int *out_len);
 static void befs_put_super(struct super_block *);
 static int befs_remount(struct super_block *, int *, char *);
-static int befs_statfs(struct super_block *, struct kstatfs *);
+static int befs_statfs(struct dentry *, struct kstatfs *);
 static int parse_options(char *, befs_mount_options *);
 
 static const struct super_operations befs_sops = {
@@ -880,8 +880,9 @@ befs_remount(struct super_block *sb, int *flags, char *data)
 }
 
 static int
-befs_statfs(struct super_block *sb, struct kstatfs *buf)
+befs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
 
 	befs_debug(sb, "---> befs_statfs()");
 
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index e7da03f63a5..cf74f3d4d96 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -203,8 +203,9 @@ static void bfs_put_super(struct super_block *s)
 	s->s_fs_info = NULL;
 }
 
-static int bfs_statfs(struct super_block *s, struct kstatfs *buf)
+static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *s = dentry->d_sb;
 	struct bfs_sb_info *info = BFS_SB(s);
 	u64 id = huge_encode_dev(s->s_bdev->bd_dev);
 	buf->f_type = BFS_MAGIC;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 08b35801dfe..7520f468715 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -166,8 +166,9 @@ cifs_put_super(struct super_block *sb)
 }
 
 static int
-cifs_statfs(struct super_block *sb, struct kstatfs *buf)
+cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
 	int xid; 
 	int rc = -EOPNOTSUPP;
 	struct cifs_sb_info *cifs_sb;
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index cba70201567..87f1dc8aa24 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -36,7 +36,7 @@
 /* VFS super_block ops */
 static void coda_clear_inode(struct inode *);
 static void coda_put_super(struct super_block *);
-static int coda_statfs(struct super_block *sb, struct kstatfs *buf);
+static int coda_statfs(struct dentry *dentry, struct kstatfs *buf);
 
 static kmem_cache_t * coda_inode_cachep;
 
@@ -278,13 +278,13 @@ struct inode_operations coda_file_inode_operations = {
 	.setattr	= coda_setattr,
 };
 
-static int coda_statfs(struct super_block *sb, struct kstatfs *buf)
+static int coda_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	int error;
 	
 	lock_kernel();
 
-	error = venus_statfs(sb, buf);
+	error = venus_statfs(dentry, buf);
 
 	unlock_kernel();
 
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 1bae99650a9..b040eba13a7 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -611,7 +611,7 @@ int venus_pioctl(struct super_block *sb, struct CodaFid *fid,
 	return error;
 }
 
-int venus_statfs(struct super_block *sb, struct kstatfs *sfs) 
+int venus_statfs(struct dentry *dentry, struct kstatfs *sfs)
 { 
         union inputArgs *inp;
         union outputArgs *outp;
@@ -620,7 +620,7 @@ int venus_statfs(struct super_block *sb, struct kstatfs *sfs)
 	insize = max_t(unsigned int, INSIZE(statfs), OUTSIZE(statfs));
 	UPARG(CODA_STATFS);
 
-        error = coda_upcall(coda_sbp(sb), insize, &outsize, inp);
+        error = coda_upcall(coda_sbp(dentry->d_sb), insize, &outsize, inp);
 	
         if (!error) {
 		sfs->f_blocks = outp->coda_statfs.stat.f_blocks;
diff --git a/fs/compat.c b/fs/compat.c
index b1f64786a61..7e7e5bc4f3c 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -197,7 +197,7 @@ asmlinkage long compat_sys_statfs(const char __user *path, struct compat_statfs
 	error = user_path_walk(path, &nd);
 	if (!error) {
 		struct kstatfs tmp;
-		error = vfs_statfs(nd.dentry->d_inode->i_sb, &tmp);
+		error = vfs_statfs(nd.dentry, &tmp);
 		if (!error)
 			error = put_compat_statfs(buf, &tmp);
 		path_release(&nd);
@@ -215,7 +215,7 @@ asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user
 	file = fget(fd);
 	if (!file)
 		goto out;
-	error = vfs_statfs(file->f_dentry->d_inode->i_sb, &tmp);
+	error = vfs_statfs(file->f_dentry, &tmp);
 	if (!error)
 		error = put_compat_statfs(buf, &tmp);
 	fput(file);
@@ -265,7 +265,7 @@ asmlinkage long compat_sys_statfs64(const char __user *path, compat_size_t sz, s
 	error = user_path_walk(path, &nd);
 	if (!error) {
 		struct kstatfs tmp;
-		error = vfs_statfs(nd.dentry->d_inode->i_sb, &tmp);
+		error = vfs_statfs(nd.dentry, &tmp);
 		if (!error)
 			error = put_compat_statfs64(buf, &tmp);
 		path_release(&nd);
@@ -286,7 +286,7 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c
 	file = fget(fd);
 	if (!file)
 		goto out;
-	error = vfs_statfs(file->f_dentry->d_inode->i_sb, &tmp);
+	error = vfs_statfs(file->f_dentry, &tmp);
 	if (!error)
 		error = put_compat_statfs64(buf, &tmp);
 	fput(file);
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 37a91a153aa..8a9d5d3b326 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -322,8 +322,10 @@ out:
 	return -EINVAL;
 }
 
-static int cramfs_statfs(struct super_block *sb, struct kstatfs *buf)
+static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
+
 	buf->f_type = CRAMFS_MAGIC;
 	buf->f_bsize = PAGE_CACHE_SIZE;
 	buf->f_blocks = CRAMFS_SB(sb)->blocks;
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 1ba5e14f879..8ac2462ae5d 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -15,7 +15,7 @@
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
 
-static int efs_statfs(struct super_block *s, struct kstatfs *buf);
+static int efs_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int efs_fill_super(struct super_block *s, void *d, int silent);
 
 static int efs_get_sb(struct file_system_type *fs_type,
@@ -322,8 +322,8 @@ out_no_fs:
 	return -EINVAL;
 }
 
-static int efs_statfs(struct super_block *s, struct kstatfs *buf) {
-	struct efs_sb_info *sb = SUPER_INFO(s);
+static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) {
+	struct efs_sb_info *sb = SUPER_INFO(dentry->d_sb);
 
 	buf->f_type    = EFS_SUPER_MAGIC;	/* efs magic number */
 	buf->f_bsize   = EFS_BLOCKSIZE;		/* blocksize */
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index a4dfffac596..a6c4d6e0232 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -39,7 +39,7 @@
 static void ext2_sync_super(struct super_block *sb,
 			    struct ext2_super_block *es);
 static int ext2_remount (struct super_block * sb, int * flags, char * data);
-static int ext2_statfs (struct super_block * sb, struct kstatfs * buf);
+static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
 
 void ext2_error (struct super_block * sb, const char * function,
 		 const char * fmt, ...)
@@ -1038,8 +1038,9 @@ restore_opts:
 	return err;
 }
 
-static int ext2_statfs (struct super_block * sb, struct kstatfs * buf)
+static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
 {
+	struct super_block *sb = dentry->d_sb;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
 	unsigned long overhead;
 	int i;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 657f8e73b62..e08b6439617 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -58,7 +58,7 @@ static int ext3_sync_fs(struct super_block *sb, int wait);
 static const char *ext3_decode_error(struct super_block * sb, int errno,
 				     char nbuf[16]);
 static int ext3_remount (struct super_block * sb, int * flags, char * data);
-static int ext3_statfs (struct super_block * sb, struct kstatfs * buf);
+static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf);
 static void ext3_unlockfs(struct super_block *sb);
 static void ext3_write_super (struct super_block * sb);
 static void ext3_write_super_lockfs(struct super_block *sb);
@@ -2318,8 +2318,9 @@ restore_opts:
 	return err;
 }
 
-static int ext3_statfs (struct super_block * sb, struct kstatfs * buf)
+static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
 {
+	struct super_block *sb = dentry->d_sb;
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
 	struct ext3_super_block *es = sbi->s_es;
 	unsigned long overhead;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index c1ce284f8a9..7c35d582ec1 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -539,18 +539,18 @@ static int fat_remount(struct super_block *sb, int *flags, char *data)
 	return 0;
 }
 
-static int fat_statfs(struct super_block *sb, struct kstatfs *buf)
+static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb);
 
 	/* If the count of free cluster is still unknown, counts it here. */
 	if (sbi->free_clusters == -1) {
-		int err = fat_count_free_clusters(sb);
+		int err = fat_count_free_clusters(dentry->d_sb);
 		if (err)
 			return err;
 	}
 
-	buf->f_type = sb->s_magic;
+	buf->f_type = dentry->d_sb->s_magic;
 	buf->f_bsize = sbi->cluster_size;
 	buf->f_blocks = sbi->max_cluster - FAT_START_ENT;
 	buf->f_bfree = sbi->free_clusters;
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index d76eeaafbde..b74b791fc23 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -40,6 +40,7 @@
 #include <linux/slab.h>
 #include <linux/stat.h>
 #include <linux/vfs.h>
+#include <linux/mount.h>
 
 #include "vxfs.h"
 #include "vxfs_extern.h"
@@ -55,7 +56,7 @@ MODULE_ALIAS("vxfs"); /* makes mount -t vxfs autoload the module */
 
 
 static void		vxfs_put_super(struct super_block *);
-static int		vxfs_statfs(struct super_block *, struct kstatfs *);
+static int		vxfs_statfs(struct dentry *, struct kstatfs *);
 static int		vxfs_remount(struct super_block *, int *, char *);
 
 static struct super_operations vxfs_super_ops = {
@@ -90,12 +91,12 @@ vxfs_put_super(struct super_block *sbp)
 
 /**
  * vxfs_statfs - get filesystem information
- * @sbp:	VFS superblock
+ * @dentry:	VFS dentry to locate superblock
  * @bufp:	output buffer
  *
  * Description:
  *   vxfs_statfs fills the statfs buffer @bufp with information
- *   about the filesystem described by @sbp.
+ *   about the filesystem described by @dentry.
  *
  * Returns:
  *   Zero.
@@ -107,12 +108,12 @@ vxfs_put_super(struct super_block *sbp)
  *   This is everything but complete...
  */
 static int
-vxfs_statfs(struct super_block *sbp, struct kstatfs *bufp)
+vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp)
 {
-	struct vxfs_sb_info		*infp = VXFS_SBI(sbp);
+	struct vxfs_sb_info		*infp = VXFS_SBI(dentry->d_sb);
 
 	bufp->f_type = VXFS_SUPER_MAGIC;
-	bufp->f_bsize = sbp->s_blocksize;
+	bufp->f_bsize = dentry->d_sb->s_blocksize;
 	bufp->f_blocks = infp->vsi_raw->vs_dsize;
 	bufp->f_bfree = infp->vsi_raw->vs_free;
 	bufp->f_bavail = 0;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index c91f0a50aad..a13c0f52905 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -236,8 +236,9 @@ static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr
 	/* fsid is left zero */
 }
 
-static int fuse_statfs(struct super_block *sb, struct kstatfs *buf)
+static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
 	struct fuse_conn *fc = get_fuse_conn_super(sb);
 	struct fuse_req *req;
 	struct fuse_statfs_out outarg;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index ee5b80a409e..d9227bf14e8 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -80,8 +80,10 @@ static void hfs_put_super(struct super_block *sb)
  *
  * changed f_files/f_ffree to reflect the fs_ablock/free_ablocks.
  */
-static int hfs_statfs(struct super_block *sb, struct kstatfs *buf)
+static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
+
 	buf->f_type = HFS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = (u32)HFS_SB(sb)->fs_ablocks * HFS_SB(sb)->fs_div;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 0ed8b7e8e87..0a92fa2336a 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -212,8 +212,10 @@ static void hfsplus_put_super(struct super_block *sb)
 	sb->s_fs_info = NULL;
 }
 
-static int hfsplus_statfs(struct super_block *sb, struct kstatfs *buf)
+static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
+
 	buf->f_type = HFSPLUS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = HFSPLUS_SB(sb).total_blocks << HFSPLUS_SB(sb).fs_shift;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 04035e08f5c..8e0d37743e7 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -239,7 +239,7 @@ static int read_inode(struct inode *ino)
 	return(err);
 }
 
-int hostfs_statfs(struct super_block *sb, struct kstatfs *sf)
+int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf)
 {
 	/* do_statfs uses struct statfs64 internally, but the linux kernel
 	 * struct statfs still has 32-bit versions for most of these fields,
@@ -252,7 +252,7 @@ int hostfs_statfs(struct super_block *sb, struct kstatfs *sf)
 	long long f_files;
 	long long f_ffree;
 
-	err = do_statfs(HOSTFS_I(sb->s_root->d_inode)->host_filename,
+	err = do_statfs(HOSTFS_I(dentry->d_sb->s_root->d_inode)->host_filename,
 			&sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files,
 			&f_ffree, &sf->f_fsid, sizeof(sf->f_fsid),
 			&sf->f_namelen, sf->f_spare);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 3b25cf3e2e6..f798480a363 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -135,8 +135,9 @@ static unsigned count_bitmaps(struct super_block *s)
 	return count;
 }
 
-static int hpfs_statfs(struct super_block *s, struct kstatfs *buf)
+static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *s = dentry->d_sb;
 	struct hpfs_sb_info *sbi = hpfs_sb(s);
 	lock_kernel();
 
diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c
index ec43c22bc9c..3a9bdf58166 100644
--- a/fs/hppfs/hppfs_kern.c
+++ b/fs/hppfs/hppfs_kern.c
@@ -616,7 +616,7 @@ static const struct file_operations hppfs_dir_fops = {
 	.fsync		= hppfs_fsync,
 };
 
-static int hppfs_statfs(struct super_block *sb, struct kstatfs *sf)
+static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf)
 {
 	sf->f_blocks = 0;
 	sf->f_bfree = 0;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 4665c26171f..678fc72c364 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -467,9 +467,9 @@ static int hugetlbfs_set_page_dirty(struct page *page)
 	return 0;
 }
 
-static int hugetlbfs_statfs(struct super_block *sb, struct kstatfs *buf)
+static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
+	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
 
 	buf->f_type = HUGETLBFS_MAGIC;
 	buf->f_bsize = HPAGE_SIZE;
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 17268da63a4..3f9c8ba1fa1 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -56,7 +56,7 @@ static void isofs_put_super(struct super_block *sb)
 }
 
 static void isofs_read_inode(struct inode *);
-static int isofs_statfs (struct super_block *, struct kstatfs *);
+static int isofs_statfs (struct dentry *, struct kstatfs *);
 
 static kmem_cache_t *isofs_inode_cachep;
 
@@ -901,8 +901,10 @@ out_freesbi:
 	return -EINVAL;
 }
 
-static int isofs_statfs (struct super_block *sb, struct kstatfs *buf)
+static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
+
 	buf->f_type = ISOFS_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = (ISOFS_SB(sb)->s_nzones
diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c
index dd93a091ad6..9e46ea6da75 100644
--- a/fs/jffs/inode-v23.c
+++ b/fs/jffs/inode-v23.c
@@ -377,9 +377,9 @@ jffs_new_inode(const struct inode * dir, struct jffs_raw_inode *raw_inode,
 
 /* Get statistics of the file system.  */
 static int
-jffs_statfs(struct super_block *sb, struct kstatfs *buf)
+jffs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct jffs_control *c = (struct jffs_control *) sb->s_fs_info;
+	struct jffs_control *c = (struct jffs_control *) dentry->d_sb->s_fs_info;
 	struct jffs_fmcontrol *fmc;
 
 	lock_kernel();
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 7b6c24b14f8..2900ec3ec3a 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -192,9 +192,9 @@ int jffs2_setattr(struct dentry *dentry, struct iattr *iattr)
 	return rc;
 }
 
-int jffs2_statfs(struct super_block *sb, struct kstatfs *buf)
+int jffs2_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(dentry->d_sb);
 	unsigned long avail;
 
 	buf->f_type = JFFS2_SUPER_MAGIC;
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index cd4021bcb94..6b522356540 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -175,7 +175,7 @@ void jffs2_clear_inode (struct inode *);
 void jffs2_dirty_inode(struct inode *inode);
 struct inode *jffs2_new_inode (struct inode *dir_i, int mode,
 			       struct jffs2_raw_inode *ri);
-int jffs2_statfs (struct super_block *, struct kstatfs *);
+int jffs2_statfs (struct dentry *, struct kstatfs *);
 void jffs2_write_super (struct super_block *);
 int jffs2_remount_fs (struct super_block *, int *, char *);
 int jffs2_do_fill_super(struct super_block *sb, void *data, int silent);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 18a28137b90..73d2aba084c 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -139,9 +139,9 @@ static void jfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(jfs_inode_cachep, ji);
 }
 
-static int jfs_statfs(struct super_block *sb, struct kstatfs *buf)
+static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct jfs_sb_info *sbi = JFS_SBI(sb);
+	struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb);
 	s64 maxinodes;
 	struct inomap *imap = JFS_IP(sbi->ipimap)->i_imap;
 
diff --git a/fs/libfs.c b/fs/libfs.c
index 7d70efa46da..1b115638178 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -20,9 +20,9 @@ int simple_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	return 0;
 }
 
-int simple_statfs(struct super_block *sb, struct kstatfs *buf)
+int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	buf->f_type = sb->s_magic;
+	buf->f_type = dentry->d_sb->s_magic;
 	buf->f_bsize = PAGE_CACHE_SIZE;
 	buf->f_namelen = NAME_MAX;
 	return 0;
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 14f24dfbfe3..a6fb509b734 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -19,7 +19,7 @@
 
 static void minix_read_inode(struct inode * inode);
 static int minix_write_inode(struct inode * inode, int wait);
-static int minix_statfs(struct super_block *sb, struct kstatfs *buf);
+static int minix_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int minix_remount (struct super_block * sb, int * flags, char * data);
 
 static void minix_delete_inode(struct inode *inode)
@@ -296,11 +296,11 @@ out_bad_sb:
 	return -EINVAL;
 }
 
-static int minix_statfs(struct super_block *sb, struct kstatfs *buf)
+static int minix_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct minix_sb_info *sbi = minix_sb(sb);
-	buf->f_type = sb->s_magic;
-	buf->f_bsize = sb->s_blocksize;
+	struct minix_sb_info *sbi = minix_sb(dentry->d_sb);
+	buf->f_type = dentry->d_sb->s_magic;
+	buf->f_bsize = dentry->d_sb->s_blocksize;
 	buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size;
 	buf->f_bfree = minix_count_free_blocks(sbi);
 	buf->f_bavail = buf->f_bfree;
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 8db033fab3f..90d2ea28f33 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -39,7 +39,7 @@
 
 static void ncp_delete_inode(struct inode *);
 static void ncp_put_super(struct super_block *);
-static int  ncp_statfs(struct super_block *, struct kstatfs *);
+static int  ncp_statfs(struct dentry *, struct kstatfs *);
 
 static kmem_cache_t * ncp_inode_cachep;
 
@@ -724,13 +724,14 @@ static void ncp_put_super(struct super_block *sb)
 	kfree(server);
 }
 
-static int ncp_statfs(struct super_block *sb, struct kstatfs *buf)
+static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct dentry* d;
 	struct inode* i;
 	struct ncp_inode_info* ni;
 	struct ncp_server* s;
 	struct ncp_volume_info vi;
+	struct super_block *sb = dentry->d_sb;
 	int err;
 	__u8 dh;
 	
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index ff645a961bc..937fbfc381b 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -65,7 +65,7 @@ static int nfs_write_inode(struct inode *,int);
 static void nfs_delete_inode(struct inode *);
 static void nfs_clear_inode(struct inode *);
 static void nfs_umount_begin(struct super_block *);
-static int  nfs_statfs(struct super_block *, struct kstatfs *);
+static int  nfs_statfs(struct dentry *, struct kstatfs *);
 static int  nfs_show_options(struct seq_file *, struct vfsmount *);
 static int  nfs_show_stats(struct seq_file *, struct vfsmount *);
 static void nfs_zap_acl_cache(struct inode *);
@@ -534,8 +534,9 @@ nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data, int silent)
 }
 
 static int
-nfs_statfs(struct super_block *sb, struct kstatfs *buf)
+nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
 	struct nfs_server *server = NFS_SB(sb);
 	unsigned char blockbits;
 	unsigned long blockres;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index de3998f15f1..5446a0861d1 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1310,7 +1310,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 	if ((bmval0 & (FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL)) ||
 	    (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
 		       FATTR4_WORD1_SPACE_TOTAL))) {
-		status = vfs_statfs(dentry->d_inode->i_sb, &statfs);
+		status = vfs_statfs(dentry, &statfs);
 		if (status)
 			goto out_nfserr;
 	}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 1d65f13f458..245eaa1fb59 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1737,7 +1737,7 @@ int
 nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat)
 {
 	int err = fh_verify(rqstp, fhp, 0, MAY_NOP);
-	if (!err && vfs_statfs(fhp->fh_dentry->d_inode->i_sb,stat))
+	if (!err && vfs_statfs(fhp->fh_dentry,stat))
 		err = nfserr_io;
 	return err;
 }
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index d5d5e969294..0e14acea3f8 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2601,10 +2601,10 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
 
 /**
  * ntfs_statfs - return information about mounted NTFS volume
- * @sb:		super block of mounted volume
+ * @dentry:	dentry from mounted volume
  * @sfs:	statfs structure in which to return the information
  *
- * Return information about the mounted NTFS volume @sb in the statfs structure
+ * Return information about the mounted NTFS volume @dentry in the statfs structure
  * pointed to by @sfs (this is initialized with zeros before ntfs_statfs is
  * called). We interpret the values to be correct of the moment in time at
  * which we are called. Most values are variable otherwise and this isn't just
@@ -2617,8 +2617,9 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
  *
  * Return 0 on success or -errno on error.
  */
-static int ntfs_statfs(struct super_block *sb, struct kstatfs *sfs)
+static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs)
 {
+	struct super_block *sb = dentry->d_sb;
 	s64 size;
 	ntfs_volume *vol = NTFS_SB(sb);
 	ntfs_inode *mft_ni = NTFS_I(vol->mft_ino);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 788b8b50dc4..cdf73393f09 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -100,7 +100,7 @@ static int ocfs2_initialize_mem_caches(void);
 static void ocfs2_free_mem_caches(void);
 static void ocfs2_delete_osb(struct ocfs2_super *osb);
 
-static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf);
+static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf);
 
 static int ocfs2_sync_fs(struct super_block *sb, int wait);
 
@@ -857,7 +857,7 @@ static void ocfs2_put_super(struct super_block *sb)
 	mlog_exit_void();
 }
 
-static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf)
+static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct ocfs2_super *osb;
 	u32 numbits, freebits;
@@ -866,9 +866,9 @@ static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf)
 	struct buffer_head *bh = NULL;
 	struct inode *inode = NULL;
 
-	mlog_entry("(%p, %p)\n", sb, buf);
+	mlog_entry("(%p, %p)\n", dentry->d_sb, buf);
 
-	osb = OCFS2_SB(sb);
+	osb = OCFS2_SB(dentry->d_sb);
 
 	inode = ocfs2_get_system_file_inode(osb,
 					    GLOBAL_BITMAP_SYSTEM_INODE,
@@ -891,7 +891,7 @@ static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf)
 	freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used);
 
 	buf->f_type = OCFS2_SUPER_MAGIC;
-	buf->f_bsize = sb->s_blocksize;
+	buf->f_bsize = dentry->d_sb->s_blocksize;
 	buf->f_namelen = OCFS2_MAX_FILENAME_LEN;
 	buf->f_blocks = ((sector_t) numbits) *
 			(osb->s_clustersize >> osb->sb->s_blocksize_bits);
diff --git a/fs/open.c b/fs/open.c
index 4f178acd4c0..a37ff861108 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -31,18 +31,18 @@
 
 #include <asm/unistd.h>
 
-int vfs_statfs(struct super_block *sb, struct kstatfs *buf)
+int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	int retval = -ENODEV;
 
-	if (sb) {
+	if (dentry) {
 		retval = -ENOSYS;
-		if (sb->s_op->statfs) {
+		if (dentry->d_sb->s_op->statfs) {
 			memset(buf, 0, sizeof(*buf));
-			retval = security_sb_statfs(sb);
+			retval = security_sb_statfs(dentry);
 			if (retval)
 				return retval;
-			retval = sb->s_op->statfs(sb, buf);
+			retval = dentry->d_sb->s_op->statfs(dentry, buf);
 			if (retval == 0 && buf->f_frsize == 0)
 				buf->f_frsize = buf->f_bsize;
 		}
@@ -52,12 +52,12 @@ int vfs_statfs(struct super_block *sb, struct kstatfs *buf)
 
 EXPORT_SYMBOL(vfs_statfs);
 
-static int vfs_statfs_native(struct super_block *sb, struct statfs *buf)
+static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
 {
 	struct kstatfs st;
 	int retval;
 
-	retval = vfs_statfs(sb, &st);
+	retval = vfs_statfs(dentry, &st);
 	if (retval)
 		return retval;
 
@@ -95,12 +95,12 @@ static int vfs_statfs_native(struct super_block *sb, struct statfs *buf)
 	return 0;
 }
 
-static int vfs_statfs64(struct super_block *sb, struct statfs64 *buf)
+static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
 {
 	struct kstatfs st;
 	int retval;
 
-	retval = vfs_statfs(sb, &st);
+	retval = vfs_statfs(dentry, &st);
 	if (retval)
 		return retval;
 
@@ -130,7 +130,7 @@ asmlinkage long sys_statfs(const char __user * path, struct statfs __user * buf)
 	error = user_path_walk(path, &nd);
 	if (!error) {
 		struct statfs tmp;
-		error = vfs_statfs_native(nd.dentry->d_inode->i_sb, &tmp);
+		error = vfs_statfs_native(nd.dentry, &tmp);
 		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 			error = -EFAULT;
 		path_release(&nd);
@@ -149,7 +149,7 @@ asmlinkage long sys_statfs64(const char __user *path, size_t sz, struct statfs64
 	error = user_path_walk(path, &nd);
 	if (!error) {
 		struct statfs64 tmp;
-		error = vfs_statfs64(nd.dentry->d_inode->i_sb, &tmp);
+		error = vfs_statfs64(nd.dentry, &tmp);
 		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 			error = -EFAULT;
 		path_release(&nd);
@@ -168,7 +168,7 @@ asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user * buf)
 	file = fget(fd);
 	if (!file)
 		goto out;
-	error = vfs_statfs_native(file->f_dentry->d_inode->i_sb, &tmp);
+	error = vfs_statfs_native(file->f_dentry, &tmp);
 	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 		error = -EFAULT;
 	fput(file);
@@ -189,7 +189,7 @@ asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz, struct statfs64 __user
 	file = fget(fd);
 	if (!file)
 		goto out;
-	error = vfs_statfs64(file->f_dentry->d_inode->i_sb, &tmp);
+	error = vfs_statfs64(file->f_dentry, &tmp);
 	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
 		error = -EFAULT;
 	fput(file);
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index e6cca5cd4b4..2f24c46f72a 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -128,7 +128,7 @@ static struct inode *qnx4_alloc_inode(struct super_block *sb);
 static void qnx4_destroy_inode(struct inode *inode);
 static void qnx4_read_inode(struct inode *);
 static int qnx4_remount(struct super_block *sb, int *flags, char *data);
-static int qnx4_statfs(struct super_block *, struct kstatfs *);
+static int qnx4_statfs(struct dentry *, struct kstatfs *);
 
 static struct super_operations qnx4_sops =
 {
@@ -282,8 +282,10 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock )
 	return block;
 }
 
-static int qnx4_statfs(struct super_block *sb, struct kstatfs *buf)
+static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
+
 	lock_kernel();
 
 	buf->f_type    = sb->s_magic;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index f3ff41d3398..00f1321e920 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -60,7 +60,7 @@ static int is_any_reiserfs_magic_string(struct reiserfs_super_block *rs)
 }
 
 static int reiserfs_remount(struct super_block *s, int *flags, char *data);
-static int reiserfs_statfs(struct super_block *s, struct kstatfs *buf);
+static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf);
 
 static int reiserfs_sync_fs(struct super_block *s, int wait)
 {
@@ -1938,15 +1938,15 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	return errval;
 }
 
-static int reiserfs_statfs(struct super_block *s, struct kstatfs *buf)
+static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s);
+	struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(dentry->d_sb);
 
 	buf->f_namelen = (REISERFS_MAX_NAME(s->s_blocksize));
 	buf->f_bfree = sb_free_blocks(rs);
 	buf->f_bavail = buf->f_bfree;
 	buf->f_blocks = sb_block_count(rs) - sb_bmap_nr(rs) - 1;
-	buf->f_bsize = s->s_blocksize;
+	buf->f_bsize = dentry->d_sb->s_blocksize;
 	/* changed to accommodate gcc folks. */
 	buf->f_type = REISERFS_SUPER_MAGIC;
 	return 0;
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index 4d6cd662106..283fbc6b8ee 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -179,12 +179,12 @@ outnobh:
 /* That's simple too. */
 
 static int
-romfs_statfs(struct super_block *sb, struct kstatfs *buf)
+romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	buf->f_type = ROMFS_MAGIC;
 	buf->f_bsize = ROMBSIZE;
 	buf->f_bfree = buf->f_bavail = buf->f_ffree;
-	buf->f_blocks = (romfs_maxsize(sb)+ROMBSIZE-1)>>ROMBSBITS;
+	buf->f_blocks = (romfs_maxsize(dentry->d_sb)+ROMBSIZE-1)>>ROMBSBITS;
 	buf->f_namelen = ROMFS_MAXFN;
 	return 0;
 }
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 4a37c2bbfa3..506ff87c1d4 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -48,7 +48,7 @@
 
 static void smb_delete_inode(struct inode *);
 static void smb_put_super(struct super_block *);
-static int  smb_statfs(struct super_block *, struct kstatfs *);
+static int  smb_statfs(struct dentry *, struct kstatfs *);
 static int  smb_show_options(struct seq_file *, struct vfsmount *);
 
 static kmem_cache_t *smb_inode_cachep;
@@ -641,13 +641,13 @@ out_no_server:
 }
 
 static int
-smb_statfs(struct super_block *sb, struct kstatfs *buf)
+smb_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	int result;
 	
 	lock_kernel();
 
-	result = smb_proc_dskattr(sb, buf);
+	result = smb_proc_dskattr(dentry, buf);
 
 	unlock_kernel();
 
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
index b1b878b8173..c3495059889 100644
--- a/fs/smbfs/proc.c
+++ b/fs/smbfs/proc.c
@@ -3226,9 +3226,9 @@ smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr)
 }
 
 int
-smb_proc_dskattr(struct super_block *sb, struct kstatfs *attr)
+smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr)
 {
-	struct smb_sb_info *server = SMB_SB(sb);
+	struct smb_sb_info *server = SMB_SB(dentry->d_sb);
 	int result;
 	char *p;
 	long unit;
diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h
index 47664597e6b..972ed7dad38 100644
--- a/fs/smbfs/proto.h
+++ b/fs/smbfs/proto.h
@@ -29,7 +29,7 @@ extern int smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr);
 extern int smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr);
 extern int smb_proc_setattr_unix(struct dentry *d, struct iattr *attr, unsigned int major, unsigned int minor);
 extern int smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr);
-extern int smb_proc_dskattr(struct super_block *sb, struct kstatfs *attr);
+extern int smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr);
 extern int smb_proc_read_link(struct smb_sb_info *server, struct dentry *d, char *buffer, int len);
 extern int smb_proc_symlink(struct smb_sb_info *server, struct dentry *d, const char *oldpath);
 extern int smb_proc_link(struct smb_sb_info *server, struct dentry *dentry, struct dentry *new_dentry);
diff --git a/fs/super.c b/fs/super.c
index 324c2d232f5..057b5325b7e 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -486,7 +486,7 @@ asmlinkage long sys_ustat(unsigned dev, struct ustat __user * ubuf)
         s = user_get_super(new_decode_dev(dev));
         if (s == NULL)
                 goto out;
-	err = vfs_statfs(s, &sbuf);
+	err = vfs_statfs(s->s_root, &sbuf);
 	drop_super(s);
 	if (err)
 		goto out;
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 3ff89cc5833..58b2d22142b 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -85,8 +85,9 @@ static void sysv_put_super(struct super_block *sb)
 	kfree(sbi);
 }
 
-static int sysv_statfs(struct super_block *sb, struct kstatfs *buf)
+static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
 	struct sysv_sb_info *sbi = SYSV_SB(sb);
 
 	buf->f_type = sb->s_magic;
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 2250774a831..44fe2cb0bbb 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -91,7 +91,7 @@ static void udf_load_partdesc(struct super_block *, struct buffer_head *);
 static void udf_open_lvid(struct super_block *);
 static void udf_close_lvid(struct super_block *);
 static unsigned int udf_count_free(struct super_block *);
-static int udf_statfs(struct super_block *, struct kstatfs *);
+static int udf_statfs(struct dentry *, struct kstatfs *);
 
 /* UDF filesystem type */
 static int udf_get_sb(struct file_system_type *fs_type,
@@ -1779,8 +1779,10 @@ udf_put_super(struct super_block *sb)
  *	Written, tested, and released.
  */
 static int
-udf_statfs(struct super_block *sb, struct kstatfs *buf)
+udf_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
+
 	buf->f_type = UDF_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = UDF_SB_PARTLEN(sb, UDF_SB_PARTITION(sb));
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 768fb8d9e67..fe5ab2aa289 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1113,8 +1113,9 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	return 0;
 }
 
-static int ufs_statfs (struct super_block *sb, struct kstatfs *buf)
+static int ufs_statfs (struct dentry *dentry, struct kstatfs *buf)
 {
+	struct super_block *sb = dentry->d_sb;
 	struct ufs_sb_private_info * uspi;
 	struct ufs_super_block_first * usb1;
 	struct ufs_super_block * usb;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index d03c89a3665..4fb0fc65af3 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -684,10 +684,10 @@ xfs_fs_sync_super(
 
 STATIC int
 xfs_fs_statfs(
-	struct super_block	*sb,
+	struct dentry		*dentry,
 	struct kstatfs		*statp)
 {
-	return -bhv_vfs_statvfs(vfs_from_sb(sb), statp, NULL);
+	return -bhv_vfs_statvfs(vfs_from_sb(dentry->d_sb), statp, NULL);
 }
 
 STATIC int
-- 
cgit v1.2.3


From d6938d1b2768fbf94d1091207d334ef7df786e5a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 23 Jun 2006 02:02:59 -0700
Subject: [PATCH] XFS: Use the dentry passed to statfs() to limit the scope of
 the results

Enable XFS to limit the statfs() results to the project quota covering the
dentry used as a base for call.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Nathan Scott <nathans@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/xfs/linux-2.6/xfs_super.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 4fb0fc65af3..9bdef9d5190 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -687,7 +687,8 @@ xfs_fs_statfs(
 	struct dentry		*dentry,
 	struct kstatfs		*statp)
 {
-	return -bhv_vfs_statvfs(vfs_from_sb(dentry->d_sb), statp, NULL);
+	return -bhv_vfs_statvfs(vfs_from_sb(dentry->d_sb), statp,
+				vn_from_inode(dentry->d_inode));
 }
 
 STATIC int
-- 
cgit v1.2.3


From 6f0419e06a3b151ab616a31accdabef41dc2d1b0 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Fri, 23 Jun 2006 02:03:00 -0700
Subject: [PATCH] for_each_possible_cpu: xfs

for_each_cpu() actually iterates across all possible CPUs.  We've had mistakes
in the past where people were using for_each_cpu() where they should have been
iterating across only online or present CPUs.  This is inefficient and
possibly buggy.

We're renaming for_each_cpu() to for_each_possible_cpu() to avoid this in the
future.

This patch replaces for_each_cpu with for_each_possible_cpu.
in xfs.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Nathan Scott <nathans@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/xfs/linux-2.6/xfs_stats.c  | 4 ++--
 fs/xfs/linux-2.6/xfs_sysctl.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c
index 1f0589a05ec..e480b610205 100644
--- a/fs/xfs/linux-2.6/xfs_stats.c
+++ b/fs/xfs/linux-2.6/xfs_stats.c
@@ -62,7 +62,7 @@ xfs_read_xfsstats(
 		while (j < xstats[i].endpoint) {
 			val = 0;
 			/* sum over all cpus */
-			for_each_cpu(c)
+			for_each_possible_cpu(c)
 				val += *(((__u32*)&per_cpu(xfsstats, c) + j));
 			len += sprintf(buffer + len, " %u", val);
 			j++;
@@ -70,7 +70,7 @@ xfs_read_xfsstats(
 		buffer[len++] = '\n';
 	}
 	/* extra precision counters */
-	for_each_cpu(i) {
+	for_each_possible_cpu(i) {
 		xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes;
 		xs_write_bytes += per_cpu(xfsstats, i).xs_write_bytes;
 		xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes;
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 4af97682bec..af246532fbf 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -38,7 +38,7 @@ xfs_stats_clear_proc_handler(
 
 	if (!ret && write && *valp) {
 		printk("XFS Clearing xfsstats\n");
-		for_each_cpu(c) {
+		for_each_possible_cpu(c) {
 			preempt_disable();
 			/* save vn_active, it's a universal truth! */
 			vn_active = per_cpu(xfsstats, c).vn_active;
-- 
cgit v1.2.3


From a43a8c39bbb493c9e93f6764b350de2e33e18e92 Mon Sep 17 00:00:00 2001
From: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Date: Fri, 23 Jun 2006 02:03:15 -0700
Subject: [PATCH] tightening hugetlb strict accounting

Current hugetlb strict accounting for shared mapping always assume mapping
starts at zero file offset and reserves pages between zero and size of the
file.  This assumption often reserves (or lock down) a lot more pages then
necessary if application maps at none zero file offset.  libhugetlbfs is
one example that requires proper reservation on shared mapping starts at
none zero offset.

This patch extends the reservation and hugetlb strict accounting to support
any arbitrary pair of (offset, len), resulting a much more robust and
accurate scheme.  More importantly, it won't lock down any hugetlb pages
outside file mapping.

Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Acked-by: Adam Litke <agl@us.ibm.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/hugetlbfs/inode.c | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 678fc72c364..e6410d8edd0 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -59,7 +59,6 @@ static void huge_pagevec_release(struct pagevec *pvec)
 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *inode = file->f_dentry->d_inode;
-	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
 	loff_t len, vma_len;
 	int ret;
 
@@ -87,9 +86,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
 		goto out;
 
-	if (vma->vm_flags & VM_MAYSHARE)
-		if (hugetlb_extend_reservation(info, len >> HPAGE_SHIFT) != 0)
-			goto out;
+	if (vma->vm_flags & VM_MAYSHARE &&
+	    hugetlb_reserve_pages(inode, vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT),
+				  len >> HPAGE_SHIFT))
+		goto out;
 
 	ret = 0;
 	hugetlb_prefault_arch_hook(vma->vm_mm);
@@ -195,12 +195,8 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
 	const pgoff_t start = lstart >> HPAGE_SHIFT;
 	struct pagevec pvec;
 	pgoff_t next;
-	int i;
+	int i, freed = 0;
 
-	hugetlb_truncate_reservation(HUGETLBFS_I(inode),
-				     lstart >> HPAGE_SHIFT);
-	if (!mapping->nrpages)
-		return;
 	pagevec_init(&pvec, 0);
 	next = start;
 	while (1) {
@@ -221,10 +217,12 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
 			truncate_huge_page(page);
 			unlock_page(page);
 			hugetlb_put_quota(mapping);
+			freed++;
 		}
 		huge_pagevec_release(&pvec);
 	}
 	BUG_ON(!lstart && mapping->nrpages);
+	hugetlb_unreserve_pages(inode, start, freed);
 }
 
 static void hugetlbfs_delete_inode(struct inode *inode)
@@ -366,6 +364,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
 		inode->i_mapping->a_ops = &hugetlbfs_aops;
 		inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		INIT_LIST_HEAD(&inode->i_mapping->private_list);
 		info = HUGETLBFS_I(inode);
 		mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, NULL);
 		switch (mode & S_IFMT) {
@@ -538,7 +537,6 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
 		hugetlbfs_inc_free_inodes(sbinfo);
 		return NULL;
 	}
-	p->prereserved_hpages = 0;
 	return &p->vfs_inode;
 }
 
@@ -781,8 +779,7 @@ struct file *hugetlb_zero_setup(size_t size)
 		goto out_file;
 
 	error = -ENOMEM;
-	if (hugetlb_extend_reservation(HUGETLBFS_I(inode),
-				       size >> HPAGE_SHIFT) != 0)
+	if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT))
 		goto out_inode;
 
 	d_instantiate(dentry, inode);
-- 
cgit v1.2.3


From 111ebb6e6f7bd7de6d722c5848e95621f43700d9 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Fri, 23 Jun 2006 02:03:26 -0700
Subject: [PATCH] writeback: fix range handling

When a writeback_control's `start' and `end' fields are used to
indicate a one-byte-range starting at file offset zero, the required
values of .start=0,.end=0 mean that the ->writepages() implementation
has no way of telling that it is being asked to perform a range
request.  Because we're currently overloading (start == 0 && end == 0)
to mean "this is not a write-a-range request".

To make all this sane, the patch changes range of writeback_control.

So caller does: If it is calling ->writepages() to write pages, it
sets range (range_start/end or range_cyclic) always.

And if range_cyclic is true, ->writepages() thinks the range is
cyclic, otherwise it just uses range_start and range_end.

This patch does,

    - Add LLONG_MAX, LLONG_MIN, ULLONG_MAX to include/linux/kernel.h
      -1 is usually ok for range_end (type is long long). But, if someone did,

		range_end += val;		range_end is "val - 1"
		u64val = range_end >> bits;	u64val is "~(0ULL)"

      or something, they are wrong. So, this adds LLONG_MAX to avoid nasty
      things, and uses LLONG_MAX for range_end.

    - All callers of ->writepages() sets range_start/end or range_cyclic.

    - Fix updates of ->writeback_index. It seems already bit strange.
      If it starts at 0 and ended by check of nr_to_write, this last
      index may reduce chance to scan end of file.  So, this updates
      ->writeback_index only if range_cyclic is true or whole-file is
      scanned.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Nathan Scott <nathans@sgi.com>
Cc: Anton Altaparmakov <aia21@cantab.net>
Cc: Steven French <sfrench@us.ibm.com>
Cc: "Vladimir V. Saveliev" <vs@namesys.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/cifs/file.c    | 24 +++++++++++-------------
 fs/fs-writeback.c |  4 ++++
 fs/mpage.c        | 22 ++++++++++------------
 fs/sync.c         |  2 +-
 4 files changed, 26 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e2b4ce1dad6..487ea8b3baa 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1079,9 +1079,9 @@ static int cifs_writepages(struct address_space *mapping,
 	unsigned int bytes_written;
 	struct cifs_sb_info *cifs_sb;
 	int done = 0;
-	pgoff_t end = -1;
+	pgoff_t end;
 	pgoff_t index;
-	int is_range = 0;
+ 	int range_whole = 0;
 	struct kvec iov[32];
 	int len;
 	int n_iov = 0;
@@ -1122,16 +1122,14 @@ static int cifs_writepages(struct address_space *mapping,
 	xid = GetXid();
 
 	pagevec_init(&pvec, 0);
-	if (wbc->sync_mode == WB_SYNC_NONE)
+	if (wbc->range_cyclic) {
 		index = mapping->writeback_index; /* Start from prev offset */
-	else {
-		index = 0;
-		scanned = 1;
-	}
-	if (wbc->start || wbc->end) {
-		index = wbc->start >> PAGE_CACHE_SHIFT;
-		end = wbc->end >> PAGE_CACHE_SHIFT;
-		is_range = 1;
+		end = -1;
+	} else {
+		index = wbc->range_start >> PAGE_CACHE_SHIFT;
+		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+			range_whole = 1;
 		scanned = 1;
 	}
 retry:
@@ -1167,7 +1165,7 @@ retry:
 				break;
 			}
 
-			if (unlikely(is_range) && (page->index > end)) {
+			if (!wbc->range_cyclic && page->index > end) {
 				done = 1;
 				unlock_page(page);
 				break;
@@ -1271,7 +1269,7 @@ retry:
 		index = 0;
 		goto retry;
 	}
-	if (!is_range)
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
 		mapping->writeback_index = index;
 
 	FreeXid(xid);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f3fbe2d030f..6db95cf3aaa 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -461,6 +461,8 @@ void sync_inodes_sb(struct super_block *sb, int wait)
 {
 	struct writeback_control wbc = {
 		.sync_mode	= wait ? WB_SYNC_ALL : WB_SYNC_HOLD,
+		.range_start	= 0,
+		.range_end	= LLONG_MAX,
 	};
 	unsigned long nr_dirty = read_page_state(nr_dirty);
 	unsigned long nr_unstable = read_page_state(nr_unstable);
@@ -559,6 +561,8 @@ int write_inode_now(struct inode *inode, int sync)
 	struct writeback_control wbc = {
 		.nr_to_write = LONG_MAX,
 		.sync_mode = WB_SYNC_ALL,
+		.range_start = 0,
+		.range_end = LLONG_MAX,
 	};
 
 	if (!mapping_cap_writeback_dirty(inode->i_mapping))
diff --git a/fs/mpage.c b/fs/mpage.c
index 9bf2eb30e6f..1e4598247d0 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -707,9 +707,9 @@ mpage_writepages(struct address_space *mapping,
 	struct pagevec pvec;
 	int nr_pages;
 	pgoff_t index;
-	pgoff_t end = -1;		/* Inclusive */
+	pgoff_t end;		/* Inclusive */
 	int scanned = 0;
-	int is_range = 0;
+	int range_whole = 0;
 
 	if (wbc->nonblocking && bdi_write_congested(bdi)) {
 		wbc->encountered_congestion = 1;
@@ -721,16 +721,14 @@ mpage_writepages(struct address_space *mapping,
 		writepage = mapping->a_ops->writepage;
 
 	pagevec_init(&pvec, 0);
-	if (wbc->sync_mode == WB_SYNC_NONE) {
+	if (wbc->range_cyclic) {
 		index = mapping->writeback_index; /* Start from prev offset */
+		end = -1;
 	} else {
-		index = 0;			  /* whole-file sweep */
-		scanned = 1;
-	}
-	if (wbc->start || wbc->end) {
-		index = wbc->start >> PAGE_CACHE_SHIFT;
-		end = wbc->end >> PAGE_CACHE_SHIFT;
-		is_range = 1;
+		index = wbc->range_start >> PAGE_CACHE_SHIFT;
+		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+			range_whole = 1;
 		scanned = 1;
 	}
 retry:
@@ -759,7 +757,7 @@ retry:
 				continue;
 			}
 
-			if (unlikely(is_range) && page->index > end) {
+			if (!wbc->range_cyclic && page->index > end) {
 				done = 1;
 				unlock_page(page);
 				continue;
@@ -810,7 +808,7 @@ retry:
 		index = 0;
 		goto retry;
 	}
-	if (!is_range)
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
 		mapping->writeback_index = index;
 	if (bio)
 		mpage_bio_submit(WRITE, bio);
diff --git a/fs/sync.c b/fs/sync.c
index aab5ffe77e9..955aef04da2 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -100,7 +100,7 @@ asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
 	}
 
 	if (nbytes == 0)
-		endbyte = -1;
+		endbyte = LLONG_MAX;
 	else
 		endbyte--;		/* inclusive */
 
-- 
cgit v1.2.3


From 03e68060636e05989ea94bcb671ab633948f328c Mon Sep 17 00:00:00 2001
From: James Morris <jmorris@namei.org>
Date: Fri, 23 Jun 2006 02:03:58 -0700
Subject: [PATCH] lsm: add task_setioprio hook

Implement an LSM hook for setting a task's IO priority, similar to the hook
for setting a tasks's nice value.

A previous version of this LSM hook was included in an older version of
multiadm by Jan Engelhardt, although I don't recall it being submitted
upstream.

Also included is the corresponding SELinux hook, which re-uses the setsched
permission in the proccess class.

Signed-off-by: James Morris <jmorris@namei.org>
Acked-by:  Stephen Smalley <sds@tycho.nsa.gov>
Cc: Jan Engelhardt <jengelh@linux01.gwdg.de>
Cc: Chris Wright <chrisw@sous-sol.org>
Cc: Jens Axboe <axboe@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ioprio.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/ioprio.c b/fs/ioprio.c
index ca77008146c..7fa76ed53c1 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -24,15 +24,21 @@
 #include <linux/blkdev.h>
 #include <linux/capability.h>
 #include <linux/syscalls.h>
+#include <linux/security.h>
 
 static int set_task_ioprio(struct task_struct *task, int ioprio)
 {
+	int err;
 	struct io_context *ioc;
 
 	if (task->uid != current->euid &&
 	    task->uid != current->uid && !capable(CAP_SYS_NICE))
 		return -EPERM;
 
+	err = security_task_setioprio(task, ioprio);
+	if (err)
+		return err;
+
 	task_lock(task);
 
 	task->ioprio = ioprio;
-- 
cgit v1.2.3


From 530018bf3d93e04533eec9c6516e3ce8f5310e13 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 23 Jun 2006 02:04:05 -0700
Subject: [PATCH] frv: binfmt_elf_fdpic __user annotations

Add __user annotations to binfmt_elf_fdpic.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/binfmt_elf_fdpic.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index a2e48c999c2..eba4e23b9ca 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -435,9 +435,10 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
 				   struct elf_fdpic_params *interp_params)
 {
 	unsigned long sp, csp, nitems;
-	elf_caddr_t *argv, *envp;
+	elf_caddr_t __user *argv, *envp;
 	size_t platform_len = 0, len;
-	char *k_platform, *u_platform, *p;
+	char *k_platform;
+	char __user *u_platform, *p;
 	long hwcap;
 	int loop;
 
@@ -462,12 +463,11 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
 	if (k_platform) {
 		platform_len = strlen(k_platform) + 1;
 		sp -= platform_len;
+		u_platform = (char __user *) sp;
 		if (__copy_to_user(u_platform, k_platform, platform_len) != 0)
 			return -EFAULT;
 	}
 
-	u_platform = (char *) sp;
-
 #if defined(__i386__) && defined(CONFIG_SMP)
 	/* in some cases (e.g. Hyper-Threading), we want to avoid L1 evictions
 	 * by the processes running on the same package. One thing we can do
@@ -490,7 +490,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
 	sp = (sp - len) & ~7UL;
 	exec_params->map_addr = sp;
 
-	if (copy_to_user((void *) sp, exec_params->loadmap, len) != 0)
+	if (copy_to_user((void __user *) sp, exec_params->loadmap, len) != 0)
 		return -EFAULT;
 
 	current->mm->context.exec_fdpic_loadmap = (unsigned long) sp;
@@ -501,7 +501,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
 		sp = (sp - len) & ~7UL;
 		interp_params->map_addr = sp;
 
-		if (copy_to_user((void *) sp, interp_params->loadmap, len) != 0)
+		if (copy_to_user((void __user *) sp, interp_params->loadmap, len) != 0)
 			return -EFAULT;
 
 		current->mm->context.interp_fdpic_loadmap = (unsigned long) sp;
@@ -527,7 +527,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
 	/* put the ELF interpreter info on the stack */
 #define NEW_AUX_ENT(nr, id, val)						\
 	do {									\
-		struct { unsigned long _id, _val; } *ent = (void *) csp;	\
+		struct { unsigned long _id, _val; } __user *ent = (void __user *) csp;	\
 		__put_user((id), &ent[nr]._id);					\
 		__put_user((val), &ent[nr]._val);				\
 	} while (0)
@@ -564,13 +564,13 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
 
 	/* allocate room for argv[] and envv[] */
 	csp -= (bprm->envc + 1) * sizeof(elf_caddr_t);
-	envp = (elf_caddr_t *) csp;
+	envp = (elf_caddr_t __user *) csp;
 	csp -= (bprm->argc + 1) * sizeof(elf_caddr_t);
-	argv = (elf_caddr_t *) csp;
+	argv = (elf_caddr_t __user *) csp;
 
 	/* stack argc */
 	csp -= sizeof(unsigned long);
-	__put_user(bprm->argc, (unsigned long *) csp);
+	__put_user(bprm->argc, (unsigned long __user *) csp);
 
 	BUG_ON(csp != sp);
 
@@ -581,7 +581,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
 	current->mm->arg_start = current->mm->start_stack - (MAX_ARG_PAGES * PAGE_SIZE - bprm->p);
 #endif
 
-	p = (char *) current->mm->arg_start;
+	p = (char __user *) current->mm->arg_start;
 	for (loop = bprm->argc; loop > 0; loop--) {
 		__put_user((elf_caddr_t) p, argv++);
 		len = strnlen_user(p, PAGE_SIZE * MAX_ARG_PAGES);
@@ -1025,7 +1025,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 		/* clear the bit between beginning of mapping and beginning of PT_LOAD */
 		if (prot & PROT_WRITE && disp > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp);
-			clear_user((void *) maddr, disp);
+			clear_user((void __user *) maddr, disp);
 			maddr += disp;
 		}
 
@@ -1059,7 +1059,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 		if (prot & PROT_WRITE && excess1 > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx",
 			       loop, maddr + phdr->p_filesz, excess1);
-			clear_user((void *) maddr + phdr->p_filesz, excess1);
+			clear_user((void __user *) maddr + phdr->p_filesz, excess1);
 		}
 
 #else
-- 
cgit v1.2.3


From 0c426f26cce71626ed39cd7b9496ea7f460ea9d0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 23 Jun 2006 02:04:08 -0700
Subject: [PATCH] ext2 XIP won't build without MMU

Disable Ext2 XIP if the kernel is configured in no-MMU mode as the former
won't build.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 20f9b557732..2aa4624cc01 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -53,7 +53,7 @@ config EXT2_FS_SECURITY
 
 config EXT2_FS_XIP
 	bool "Ext2 execute in place support"
-	depends on EXT2_FS
+	depends on EXT2_FS && MMU
 	help
 	  Execute in place can be used on memory-backed block devices. If you
 	  enable this option, you can select to mount block devices which are
-- 
cgit v1.2.3


From 090d2b185d8680fc26a2eaf4245d4171dcf4baf1 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Fri, 23 Jun 2006 02:05:08 -0700
Subject: [PATCH] read_mapping_page for address space

Add read_mapping_page() which is used for callers that pass
mapping->a_ops->readpage as the filler for read_cache_page.  This removes
some duplication from filesystem code.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/afs/dir.c            |  4 +---
 fs/afs/mntpt.c          | 11 ++---------
 fs/cramfs/inode.c       |  4 +---
 fs/ext2/dir.c           |  3 +--
 fs/freevxfs/vxfs_subr.c |  3 +--
 fs/hfs/bnode.c          |  2 +-
 fs/hfs/btree.c          |  2 +-
 fs/hfsplus/bitmap.c     | 15 +++++++--------
 fs/hfsplus/bnode.c      |  2 +-
 fs/hfsplus/btree.c      |  2 +-
 fs/jfs/jfs_metapage.c   |  5 ++---
 fs/minix/dir.c          |  3 +--
 fs/namei.c              |  3 +--
 fs/ntfs/aops.h          |  3 +--
 fs/ntfs/attrib.c        |  6 ++----
 fs/ntfs/file.c          |  3 +--
 fs/ocfs2/symlink.c      |  3 +--
 fs/partitions/check.c   |  4 ++--
 fs/reiserfs/xattr.c     |  3 +--
 fs/sysv/dir.c           |  3 +--
 20 files changed, 30 insertions(+), 54 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index a6dff6a4f20..2fc99877cb0 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -185,9 +185,7 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index)
 
 	_enter("{%lu},%lu", dir->i_ino, index);
 
-	page = read_cache_page(dir->i_mapping,index,
-			       (filler_t *) dir->i_mapping->a_ops->readpage,
-			       NULL);
+	page = read_mapping_page(dir->i_mapping, index, NULL);
 	if (!IS_ERR(page)) {
 		wait_on_page_locked(page);
 		kmap(page);
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 4e6eeb59b83..b5cf9e1205a 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -63,7 +63,6 @@ unsigned long afs_mntpt_expiry_timeout = 20;
 int afs_mntpt_check_symlink(struct afs_vnode *vnode)
 {
 	struct page *page;
-	filler_t *filler;
 	size_t size;
 	char *buf;
 	int ret;
@@ -71,10 +70,7 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode)
 	_enter("{%u,%u}", vnode->fid.vnode, vnode->fid.unique);
 
 	/* read the contents of the symlink into the pagecache */
-	filler = (filler_t *) AFS_VNODE_TO_I(vnode)->i_mapping->a_ops->readpage;
-
-	page = read_cache_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0,
-			       filler, NULL);
+	page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, NULL);
 	if (IS_ERR(page)) {
 		ret = PTR_ERR(page);
 		goto out;
@@ -160,7 +156,6 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 	struct page *page = NULL;
 	size_t size;
 	char *buf, *devname = NULL, *options = NULL;
-	filler_t *filler;
 	int ret;
 
 	kenter("{%s}", mntpt->d_name.name);
@@ -182,9 +177,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 		goto error;
 
 	/* read the contents of the AFS special symlink */
-	filler = (filler_t *)mntpt->d_inode->i_mapping->a_ops->readpage;
-
-	page = read_cache_page(mntpt->d_inode->i_mapping, 0, filler, NULL);
+	page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL);
 	if (IS_ERR(page)) {
 		ret = PTR_ERR(page);
 		goto error;
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 8a9d5d3b326..c45d7386080 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -181,9 +181,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
 		struct page *page = NULL;
 
 		if (blocknr + i < devsize) {
-			page = read_cache_page(mapping, blocknr + i,
-				(filler_t *)mapping->a_ops->readpage,
-				NULL);
+			page = read_mapping_page(mapping, blocknr + i, NULL);
 			/* synchronous error? */
 			if (IS_ERR(page))
 				page = NULL;
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index d672aa9f406..3c1c9aaaca6 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -159,8 +159,7 @@ fail:
 static struct page * ext2_get_page(struct inode *dir, unsigned long n)
 {
 	struct address_space *mapping = dir->i_mapping;
-	struct page *page = read_cache_page(mapping, n,
-				(filler_t*)mapping->a_ops->readpage, NULL);
+	struct page *page = read_mapping_page(mapping, n, NULL);
 	if (!IS_ERR(page)) {
 		wait_on_page_locked(page);
 		kmap(page);
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index 50aae77651b..c1be118fc06 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -71,8 +71,7 @@ vxfs_get_page(struct address_space *mapping, u_long n)
 {
 	struct page *			pp;
 
-	pp = read_cache_page(mapping, n,
-			(filler_t*)mapping->a_ops->readpage, NULL);
+	pp = read_mapping_page(mapping, n, NULL);
 
 	if (!IS_ERR(pp)) {
 		wait_on_page_locked(pp);
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index 1e44dcfe49c..13231dd5ce6 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -280,7 +280,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
 	block = off >> PAGE_CACHE_SHIFT;
 	node->page_offset = off & ~PAGE_CACHE_MASK;
 	for (i = 0; i < tree->pages_per_bnode; i++) {
-		page = read_cache_page(mapping, block++, (filler_t *)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, block++, NULL);
 		if (IS_ERR(page))
 			goto fail;
 		if (PageError(page)) {
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index d20131ce4b9..40035799431 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -59,7 +59,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
 	unlock_new_inode(tree->inode);
 
 	mapping = tree->inode->i_mapping;
-	page = read_cache_page(mapping, 0, (filler_t *)mapping->a_ops->readpage, NULL);
+	page = read_mapping_page(mapping, 0, NULL);
 	if (IS_ERR(page))
 		goto free_tree;
 
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index 9fb51632303..d128a25b74d 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -31,8 +31,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
 	dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len);
 	mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
 	mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
-	page = read_cache_page(mapping, offset / PAGE_CACHE_BITS,
-			       (filler_t *)mapping->a_ops->readpage, NULL);
+	page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL);
 	pptr = kmap(page);
 	curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32;
 	i = offset % 32;
@@ -72,8 +71,8 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
 		offset += PAGE_CACHE_BITS;
 		if (offset >= size)
 			break;
-		page = read_cache_page(mapping, offset / PAGE_CACHE_BITS,
-				       (filler_t *)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS,
+					 NULL);
 		curr = pptr = kmap(page);
 		if ((size ^ offset) / PAGE_CACHE_BITS)
 			end = pptr + PAGE_CACHE_BITS / 32;
@@ -119,8 +118,8 @@ found:
 		set_page_dirty(page);
 		kunmap(page);
 		offset += PAGE_CACHE_BITS;
-		page = read_cache_page(mapping, offset / PAGE_CACHE_BITS,
-				       (filler_t *)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS,
+					 NULL);
 		pptr = kmap(page);
 		curr = pptr;
 		end = pptr + PAGE_CACHE_BITS / 32;
@@ -167,7 +166,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
 	mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex);
 	mapping = HFSPLUS_SB(sb).alloc_file->i_mapping;
 	pnr = offset / PAGE_CACHE_BITS;
-	page = read_cache_page(mapping, pnr, (filler_t *)mapping->a_ops->readpage, NULL);
+	page = read_mapping_page(mapping, pnr, NULL);
 	pptr = kmap(page);
 	curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32;
 	end = pptr + PAGE_CACHE_BITS / 32;
@@ -199,7 +198,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
 			break;
 		set_page_dirty(page);
 		kunmap(page);
-		page = read_cache_page(mapping, ++pnr, (filler_t *)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, ++pnr, NULL);
 		pptr = kmap(page);
 		curr = pptr;
 		end = pptr + PAGE_CACHE_BITS / 32;
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 746abc9ecf7..77bf434da67 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -440,7 +440,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
 	block = off >> PAGE_CACHE_SHIFT;
 	node->page_offset = off & ~PAGE_CACHE_MASK;
 	for (i = 0; i < tree->pages_per_bnode; block++, i++) {
-		page = read_cache_page(mapping, block, (filler_t *)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, block, NULL);
 		if (IS_ERR(page))
 			goto fail;
 		if (PageError(page)) {
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index effa8991999..cfc852fdd1b 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -38,7 +38,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 		goto free_tree;
 
 	mapping = tree->inode->i_mapping;
-	page = read_cache_page(mapping, 0, (filler_t *)mapping->a_ops->readpage, NULL);
+	page = read_mapping_page(mapping, 0, NULL);
 	if (IS_ERR(page))
 		goto free_tree;
 
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 2b220dd6b4e..7f6e8803970 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -632,10 +632,9 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
 		}
 		SetPageUptodate(page);
 	} else {
-		page = read_cache_page(mapping, page_index,
-			    (filler_t *)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, page_index, NULL);
 		if (IS_ERR(page) || !PageUptodate(page)) {
-			jfs_err("read_cache_page failed!");
+			jfs_err("read_mapping_page failed!");
 			return NULL;
 		}
 		lock_page(page);
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 69224d1fe04..2b0a389d198 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -60,8 +60,7 @@ static int dir_commit_chunk(struct page *page, unsigned from, unsigned to)
 static struct page * dir_get_page(struct inode *dir, unsigned long n)
 {
 	struct address_space *mapping = dir->i_mapping;
-	struct page *page = read_cache_page(mapping, n,
-				(filler_t*)mapping->a_ops->readpage, NULL);
+	struct page *page = read_mapping_page(mapping, n, NULL);
 	if (!IS_ERR(page)) {
 		wait_on_page_locked(page);
 		kmap(page);
diff --git a/fs/namei.c b/fs/namei.c
index 184fe4acf82..bb4a3e40e43 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2577,8 +2577,7 @@ static char *page_getlink(struct dentry * dentry, struct page **ppage)
 {
 	struct page * page;
 	struct address_space *mapping = dentry->d_inode->i_mapping;
-	page = read_cache_page(mapping, 0, (filler_t *)mapping->a_ops->readpage,
-				NULL);
+	page = read_mapping_page(mapping, 0, NULL);
 	if (IS_ERR(page))
 		goto sync_fail;
 	wait_on_page_locked(page);
diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h
index 3b74e66ca2f..325ce261a10 100644
--- a/fs/ntfs/aops.h
+++ b/fs/ntfs/aops.h
@@ -86,8 +86,7 @@ static inline void ntfs_unmap_page(struct page *page)
 static inline struct page *ntfs_map_page(struct address_space *mapping,
 		unsigned long index)
 {
-	struct page *page = read_cache_page(mapping, index,
-			(filler_t*)mapping->a_ops->readpage, NULL);
+	struct page *page = read_mapping_page(mapping, index, NULL);
 
 	if (!IS_ERR(page)) {
 		wait_on_page_locked(page);
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 1663f5c3c6a..6708e1d68a9 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -2529,8 +2529,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
 	end >>= PAGE_CACHE_SHIFT;
 	/* If there is a first partial page, need to do it the slow way. */
 	if (start_ofs) {
-		page = read_cache_page(mapping, idx,
-				(filler_t*)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, idx, NULL);
 		if (IS_ERR(page)) {
 			ntfs_error(vol->sb, "Failed to read first partial "
 					"page (sync error, index 0x%lx).", idx);
@@ -2600,8 +2599,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
 	}
 	/* If there is a last partial page, need to do it the slow way. */
 	if (end_ofs) {
-		page = read_cache_page(mapping, idx,
-				(filler_t*)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, idx, NULL);
 		if (IS_ERR(page)) {
 			ntfs_error(vol->sb, "Failed to read last partial page "
 					"(sync error, index 0x%lx).", idx);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 36e1e136bb0..88292f9e4b9 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -231,8 +231,7 @@ do_non_resident_extend:
 		 * Read the page.  If the page is not present, this will zero
 		 * the uninitialized regions for us.
 		 */
-		page = read_cache_page(mapping, index,
-				(filler_t*)mapping->a_ops->readpage, NULL);
+		page = read_mapping_page(mapping, index, NULL);
 		if (IS_ERR(page)) {
 			err = PTR_ERR(page);
 			goto init_err_out;
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index f6986bd79e7..0c8a1294ec9 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -64,8 +64,7 @@ static char *ocfs2_page_getlink(struct dentry * dentry,
 {
 	struct page * page;
 	struct address_space *mapping = dentry->d_inode->i_mapping;
-	page = read_cache_page(mapping, 0,
-			       (filler_t *)mapping->a_ops->readpage, NULL);
+	page = read_mapping_page(mapping, 0, NULL);
 	if (IS_ERR(page))
 		goto sync_fail;
 	wait_on_page_locked(page);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 8851b81e7c5..cd885b23cb5 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -499,8 +499,8 @@ unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p)
 	struct address_space *mapping = bdev->bd_inode->i_mapping;
 	struct page *page;
 
-	page = read_cache_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)),
-			(filler_t *)mapping->a_ops->readpage, NULL);
+	page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)),
+				 NULL);
 	if (!IS_ERR(page)) {
 		wait_on_page_locked(page);
 		if (!PageUptodate(page))
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index ffb79c48c5b..39fedaa88a0 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -452,8 +452,7 @@ static struct page *reiserfs_get_page(struct inode *dir, unsigned long n)
 	/* We can deadlock if we try to free dentries,
 	   and an unlink/rmdir has just occured - GFP_NOFS avoids this */
 	mapping_set_gfp_mask(mapping, GFP_NOFS);
-	page = read_cache_page(mapping, n,
-			       (filler_t *) mapping->a_ops->readpage, NULL);
+	page = read_mapping_page(mapping, n, NULL);
 	if (!IS_ERR(page)) {
 		wait_on_page_locked(page);
 		kmap(page);
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index d7074341ee8..f2bef962d30 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -53,8 +53,7 @@ static int dir_commit_chunk(struct page *page, unsigned from, unsigned to)
 static struct page * dir_get_page(struct inode *dir, unsigned long n)
 {
 	struct address_space *mapping = dir->i_mapping;
-	struct page *page = read_cache_page(mapping, n,
-				(filler_t*)mapping->a_ops->readpage, NULL);
+	struct page *page = read_mapping_page(mapping, n, NULL);
 	if (!IS_ERR(page)) {
 		wait_on_page_locked(page);
 		kmap(page);
-- 
cgit v1.2.3


From 0d9a490abe1f69fda220f7866f6f23af41daa128 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 23 Jun 2006 02:05:09 -0700
Subject: [PATCH] locks: don't unnecessarily fail posix lock operations

posix_lock_file() was too cautious, failing operations on OOM, even if they
didn't actually require an allocation.

This has the disadvantage, that a failing unlock on process exit could lead to
a memory leak.  There are two possibilites for this:

- filesystem implements .lock() and calls back to posix_lock_file().  On
cleanup of files_struct locks_remove_posix() is called which should remove all
locks belonging to files_struct.  However if filesystem calls
posix_lock_file() which fails, then those locks will never be freed.

- if a file is closed while a lock is blocked, then after acquiring
fcntl_setlk() will undo the lock.  But this unlock itself might fail on OOM,
again possibly leaking the lock.

The solution is to move the checking of the allocations until after it is sure
that they will be needed.  This will solve the above problem since unlock will
always succeed unless it splits an existing region.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/locks.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/locks.c b/fs/locks.c
index 69435c68c1e..c5ac6b40e76 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -834,14 +834,7 @@ static int __posix_lock_file_conf(struct inode *inode, struct file_lock *request
 	if (request->fl_flags & FL_ACCESS)
 		goto out;
 
-	error = -ENOLCK; /* "no luck" */
-	if (!(new_fl && new_fl2))
-		goto out;
-
 	/*
-	 * We've allocated the new locks in advance, so there are no
-	 * errors possible (and no blocking operations) from here on.
-	 * 
 	 * Find the first old lock with the same owner as the new lock.
 	 */
 	
@@ -938,10 +931,25 @@ static int __posix_lock_file_conf(struct inode *inode, struct file_lock *request
 		before = &fl->fl_next;
 	}
 
+	/*
+	 * The above code only modifies existing locks in case of
+	 * merging or replacing.  If new lock(s) need to be inserted
+	 * all modifications are done bellow this, so it's safe yet to
+	 * bail out.
+	 */
+	error = -ENOLCK; /* "no luck" */
+	if (right && left == right && !new_fl2)
+		goto out;
+
 	error = 0;
 	if (!added) {
 		if (request->fl_type == F_UNLCK)
 			goto out;
+
+		if (!new_fl) {
+			error = -ENOLCK;
+			goto out;
+		}
 		locks_copy_lock(new_fl, request);
 		locks_insert_lock(before, new_fl);
 		new_fl = NULL;
-- 
cgit v1.2.3


From 39005d022ad221b76dc2de0ac62ef475a796433b Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 23 Jun 2006 02:05:10 -0700
Subject: [PATCH] locks: don't do unnecessary allocations

posix_lock_file() always allocates new locks in advance, even if it's easy to
determine that no allocations will be needed.

Optimize these cases:

 - FL_ACCESS flag is set

 - Unlocking the whole range

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/locks.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/locks.c b/fs/locks.c
index c5ac6b40e76..2344f241c68 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -794,7 +794,8 @@ out:
 static int __posix_lock_file_conf(struct inode *inode, struct file_lock *request, struct file_lock *conflock)
 {
 	struct file_lock *fl;
-	struct file_lock *new_fl, *new_fl2;
+	struct file_lock *new_fl = NULL;
+	struct file_lock *new_fl2 = NULL;
 	struct file_lock *left = NULL;
 	struct file_lock *right = NULL;
 	struct file_lock **before;
@@ -803,9 +804,15 @@ static int __posix_lock_file_conf(struct inode *inode, struct file_lock *request
 	/*
 	 * We may need two file_lock structures for this operation,
 	 * so we get them in advance to avoid races.
+	 *
+	 * In some cases we can be sure, that no new locks will be needed
 	 */
-	new_fl = locks_alloc_lock();
-	new_fl2 = locks_alloc_lock();
+	if (!(request->fl_flags & FL_ACCESS) &&
+	    (request->fl_type != F_UNLCK ||
+	     request->fl_start != 0 || request->fl_end != OFFSET_MAX)) {
+		new_fl = locks_alloc_lock();
+		new_fl2 = locks_alloc_lock();
+	}
 
 	lock_kernel();
 	if (request->fl_type != F_UNLCK) {
-- 
cgit v1.2.3


From ff7b86b82083f24b8637dff1528c7101c18c7f39 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 23 Jun 2006 02:05:11 -0700
Subject: [PATCH] locks: clean up locks_remove_posix()

locks_remove_posix() can use posix_lock_file() instead of doing the lock
removal by hand.  posix_lock_file() now does exacly the same.

The comment about pids no longer applies, posix_lock_file() takes only the
owner into account.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/locks.c | 25 +++++--------------------
 1 file changed, 5 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/locks.c b/fs/locks.c
index 2344f241c68..e588e1c265f 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1896,15 +1896,14 @@ out:
  */
 void locks_remove_posix(struct file *filp, fl_owner_t owner)
 {
-	struct file_lock lock, **before;
+	struct file_lock lock;
 
 	/*
 	 * If there are no locks held on this file, we don't need to call
 	 * posix_lock_file().  Another process could be setting a lock on this
 	 * file at the same time, but we wouldn't remove that lock anyway.
 	 */
-	before = &filp->f_dentry->d_inode->i_flock;
-	if (*before == NULL)
+	if (!filp->f_dentry->d_inode->i_flock)
 		return;
 
 	lock.fl_type = F_UNLCK;
@@ -1917,25 +1916,11 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
 	lock.fl_ops = NULL;
 	lock.fl_lmops = NULL;
 
-	if (filp->f_op && filp->f_op->lock != NULL) {
+	if (filp->f_op && filp->f_op->lock != NULL)
 		filp->f_op->lock(filp, F_SETLK, &lock);
-		goto out;
-	}
+	else
+		posix_lock_file(filp, &lock);
 
-	/* Can't use posix_lock_file here; we need to remove it no matter
-	 * which pid we have.
-	 */
-	lock_kernel();
-	while (*before != NULL) {
-		struct file_lock *fl = *before;
-		if (IS_POSIX(fl) && posix_same_owner(fl, &lock)) {
-			locks_delete_lock(before);
-			continue;
-		}
-		before = &fl->fl_next;
-	}
-	unlock_kernel();
-out:
 	if (lock.fl_ops && lock.fl_ops->fl_release_private)
 		lock.fl_ops->fl_release_private(&lock);
 }
-- 
cgit v1.2.3


From 75e1fcc0b18df0a65ab113198e9dc0e98999a08c Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Fri, 23 Jun 2006 02:05:12 -0700
Subject: [PATCH] vfs: add lock owner argument to flush operation

Pass the POSIX lock owner ID to the flush operation.

This is useful for filesystems which don't want to store any locking state
in inode->i_flock but want to handle locking/unlocking POSIX locks
internally.  FUSE is one such filesystem but I think it possible that some
network filesystems would need this also.

Also add a flag to indicate that a POSIX locking request was generated by
close(), so filesystems using the above feature won't send an extra locking
request in this case.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/cifs/cifsfs.h | 2 +-
 fs/cifs/file.c   | 2 +-
 fs/coda/file.c   | 2 +-
 fs/fuse/file.c   | 2 +-
 fs/locks.c       | 2 +-
 fs/nfs/file.c    | 4 ++--
 fs/open.c        | 2 +-
 7 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index c98755dca86..d56c0577c71 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -74,7 +74,7 @@ extern ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 			 size_t write_size, loff_t * poffset);
 extern int cifs_lock(struct file *, int, struct file_lock *);
 extern int cifs_fsync(struct file *, struct dentry *, int);
-extern int cifs_flush(struct file *);
+extern int cifs_flush(struct file *, fl_owner_t id);
 extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
 extern const struct file_operations cifs_dir_ops;
 extern int cifs_dir_open(struct inode *inode, struct file *file);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 487ea8b3baa..b4a18c1cab0 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1417,7 +1417,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
  * As file closes, flush all cached write data for this inode checking
  * for write behind errors.
  */
-int cifs_flush(struct file *file)
+int cifs_flush(struct file *file, fl_owner_t id)
 {
 	struct inode * inode = file->f_dentry->d_inode;
 	int rc = 0;
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 7c2642431fa..cc66c681bd1 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -164,7 +164,7 @@ int coda_open(struct inode *coda_inode, struct file *coda_file)
 	return 0;
 }
 
-int coda_flush(struct file *coda_file)
+int coda_flush(struct file *coda_file, fl_owner_t id)
 {
 	unsigned short flags = coda_file->f_flags & ~O_EXCL;
 	unsigned short coda_flags = coda_flags_to_cflags(flags);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index fc342cf7c2c..087f3b734f4 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -169,7 +169,7 @@ static int fuse_release(struct inode *inode, struct file *file)
 	return fuse_release_common(inode, file, 0);
 }
 
-static int fuse_flush(struct file *file)
+static int fuse_flush(struct file *file, fl_owner_t id)
 {
 	struct inode *inode = file->f_dentry->d_inode;
 	struct fuse_conn *fc = get_fuse_conn(inode);
diff --git a/fs/locks.c b/fs/locks.c
index e588e1c265f..f8a634ac112 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1907,7 +1907,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
 		return;
 
 	lock.fl_type = F_UNLCK;
-	lock.fl_flags = FL_POSIX;
+	lock.fl_flags = FL_POSIX | FL_CLOSE;
 	lock.fl_start = 0;
 	lock.fl_end = OFFSET_MAX;
 	lock.fl_owner = owner;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index fade02c15e6..fa05c027ea1 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -43,7 +43,7 @@ static int  nfs_file_mmap(struct file *, struct vm_area_struct *);
 static ssize_t nfs_file_sendfile(struct file *, loff_t *, size_t, read_actor_t, void *);
 static ssize_t nfs_file_read(struct kiocb *, char __user *, size_t, loff_t);
 static ssize_t nfs_file_write(struct kiocb *, const char __user *, size_t, loff_t);
-static int  nfs_file_flush(struct file *);
+static int  nfs_file_flush(struct file *, fl_owner_t id);
 static int  nfs_fsync(struct file *, struct dentry *dentry, int datasync);
 static int nfs_check_flags(int flags);
 static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
@@ -188,7 +188,7 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
  *
  */
 static int
-nfs_file_flush(struct file *file)
+nfs_file_flush(struct file *file, fl_owner_t id)
 {
 	struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data;
 	struct inode	*inode = file->f_dentry->d_inode;
diff --git a/fs/open.c b/fs/open.c
index a37ff861108..5fb16e5267d 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1152,7 +1152,7 @@ int filp_close(struct file *filp, fl_owner_t id)
 	}
 
 	if (filp->f_op && filp->f_op->flush)
-		retval = filp->f_op->flush(filp);
+		retval = filp->f_op->flush(filp, id);
 
 	dnotify_flush(filp, id);
 	locks_remove_posix(filp, id);
-- 
cgit v1.2.3


From b0904e147f7cbe4be3b4dae49ddccd627bb66f16 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Fri, 23 Jun 2006 02:05:13 -0700
Subject: [PATCH] fs/locks.c: make posix_locks_deadlock() static

We can now make posix_locks_deadlock() static.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/locks.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/locks.c b/fs/locks.c
index f8a634ac112..1ad29c9b625 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -703,7 +703,7 @@ EXPORT_SYMBOL(posix_test_lock);
  * from a broken NFS client. But broken NFS clients have a lot more to
  * worry about than proper deadlock detection anyway... --okir
  */
-int posix_locks_deadlock(struct file_lock *caller_fl,
+static int posix_locks_deadlock(struct file_lock *caller_fl,
 				struct file_lock *block_fl)
 {
 	struct list_head *tmp;
@@ -722,8 +722,6 @@ next_task:
 	return 0;
 }
 
-EXPORT_SYMBOL(posix_locks_deadlock);
-
 /* Try to create a FLOCK lock on filp. We always insert new FLOCK locks
  * at the head of the list, but that's secret knowledge known only to
  * flock_lock_file and posix_lock_file.
-- 
cgit v1.2.3


From 2da132646358c853d5caf296d079aefc69358d46 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Fri, 23 Jun 2006 02:05:15 -0700
Subject: [PATCH] fs/fat/misc.c: unexport fat_sync_bhs

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fat/misc.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 944652e9dde..308f2b6b502 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -210,4 +210,3 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
 	return err;
 }
 
-EXPORT_SYMBOL_GPL(fat_sync_bhs);
-- 
cgit v1.2.3


From 4a4b69f79ba7286794765a856349e380f984a6cb Mon Sep 17 00:00:00 2001
From: Vadim Lobanov <vlobanov@speakeasy.net>
Date: Fri, 23 Jun 2006 02:05:16 -0700
Subject: [PATCH] Poll cleanups/microoptimizations

The "count" and "pt" variables are declared and modified by do_poll(), as
well as accessed and written indirectly in the do_pollfd() subroutine.

This patch pulls all handling of these variables into the do_poll()
function, thereby eliminating the odd use of indirection in do_pollfd().
This is done by pulling the "struct pollfd" traversal loop from do_pollfd()
into its only caller do_poll().  As an added bonus, the patch saves a few
clock cycles, and also adds comments to make the code easier to follow.

Signed-off-by: Vadim Lobanov <vlobanov@speakeasy.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/select.c | 83 +++++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 51 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/select.c b/fs/select.c
index a8109baa5e4..9c4f0f2604f 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -546,37 +546,38 @@ struct poll_list {
 
 #define POLLFD_PER_PAGE  ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))
 
-static void do_pollfd(unsigned int num, struct pollfd * fdpage,
-	poll_table ** pwait, int *count)
+/*
+ * Fish for pollable events on the pollfd->fd file descriptor. We're only
+ * interested in events matching the pollfd->events mask, and the result
+ * matching that mask is both recorded in pollfd->revents and returned. The
+ * pwait poll_table will be used by the fd-provided poll handler for waiting,
+ * if non-NULL.
+ */
+static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
 {
-	int i;
-
-	for (i = 0; i < num; i++) {
-		int fd;
-		unsigned int mask;
-		struct pollfd *fdp;
-
-		mask = 0;
-		fdp = fdpage+i;
-		fd = fdp->fd;
-		if (fd >= 0) {
-			int fput_needed;
-			struct file * file = fget_light(fd, &fput_needed);
-			mask = POLLNVAL;
-			if (file != NULL) {
-				mask = DEFAULT_POLLMASK;
-				if (file->f_op && file->f_op->poll)
-					mask = file->f_op->poll(file, *pwait);
-				mask &= fdp->events | POLLERR | POLLHUP;
-				fput_light(file, fput_needed);
-			}
-			if (mask) {
-				*pwait = NULL;
-				(*count)++;
-			}
+	unsigned int mask;
+	int fd;
+
+	mask = 0;
+	fd = pollfd->fd;
+	if (fd >= 0) {
+		int fput_needed;
+		struct file * file;
+
+		file = fget_light(fd, &fput_needed);
+		mask = POLLNVAL;
+		if (file != NULL) {
+			mask = DEFAULT_POLLMASK;
+			if (file->f_op && file->f_op->poll)
+				mask = file->f_op->poll(file, pwait);
+			/* Mask out unneeded events. */
+			mask &= pollfd->events | POLLERR | POLLHUP;
+			fput_light(file, fput_needed);
 		}
-		fdp->revents = mask;
 	}
+	pollfd->revents = mask;
+
+	return mask;
 }
 
 static int do_poll(unsigned int nfds,  struct poll_list *list,
@@ -594,11 +595,29 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 		long __timeout;
 
 		set_current_state(TASK_INTERRUPTIBLE);
-		walk = list;
-		while(walk != NULL) {
-			do_pollfd( walk->len, walk->entries, &pt, &count);
-			walk = walk->next;
+		for (walk = list; walk != NULL; walk = walk->next) {
+			struct pollfd * pfd, * pfd_end;
+
+			pfd = walk->entries;
+			pfd_end = pfd + walk->len;
+			for (; pfd != pfd_end; pfd++) {
+				/*
+				 * Fish for events. If we found one, record it
+				 * and kill the poll_table, so we don't
+				 * needlessly register any other waiters after
+				 * this. They'll get immediately deregistered
+				 * when we break out and return.
+				 */
+				if (do_pollfd(pfd, pt)) {
+					count++;
+					pt = NULL;
+				}
+			}
 		}
+		/*
+		 * All waiters have already been registered, so don't provide
+		 * a poll_table to them on the next loop iteration.
+		 */
 		pt = NULL;
 		if (count || !*timeout || signal_pending(current))
 			break;
-- 
cgit v1.2.3


From 9ada7340987aa24395809570840c7c6847044f52 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 23 Jun 2006 02:05:25 -0700
Subject: [PATCH] jbd: fix BUG in journal_commit_transaction()

Fix possible assertion failure in journal_commit_transaction() on
jh->b_next_transaction == NULL (when we are processing BJ_Forget list and
buffer is not jbddirty).

!jbddirty buffers can be placed on BJ_Forget list for example by
journal_forget() or by __dispose_buffer() - generally such buffer means
that it has been freed by this transaction.

Freed buffers should not be reallocated until the transaction has committed
(that's why we have the assertion there) but they *can* be reallocated when
the transaction has already been committed to disk and we are just
processing the BJ_Forget list (as soon as we remove b_committed_data from
the bitmap bh, ext3 will be able to reallocate buffers freed by the
committing transaction).  So we have to also count with the case that the
buffer has been reallocated and b_next_transaction has been already set.

And one more subtle point: it can happen that we manage to reallocate the
buffer and also mark it jbddirty.  Then we also add the freed buffer to the
checkpoint list of the committing trasaction.  But that should do no harm.

Non-jbddirty buffers should be filed to BJ_Reserved and not BJ_Metadata
list.  It can actually happen that we refile such buffers during the commit
phase when we reallocate in the running transaction blocks deleted in
committing transaction (and that can happen if the committing transaction
already wrote all the data and is just cleaning up BJ_Forget list).

Signed-off-by: Jan Kara <jack@suse.cz>
Acked-by: "Stephen C. Tweedie" <sct@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/jbd/commit.c      | 21 ++++++++++++++++-----
 fs/jbd/transaction.c |  3 ++-
 2 files changed, 18 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 002ad2bbc76..0971814c38b 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -790,11 +790,22 @@ restart_loop:
 			jbd_unlock_bh_state(bh);
 		} else {
 			J_ASSERT_BH(bh, !buffer_dirty(bh));
-			J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-			__journal_unfile_buffer(jh);
-			jbd_unlock_bh_state(bh);
-			journal_remove_journal_head(bh);  /* needs a brelse */
-			release_buffer_page(bh);
+			/* The buffer on BJ_Forget list and not jbddirty means
+			 * it has been freed by this transaction and hence it
+			 * could not have been reallocated until this
+			 * transaction has committed. *BUT* it could be
+			 * reallocated once we have written all the data to
+			 * disk and before we process the buffer on BJ_Forget
+			 * list. */
+			JBUFFER_TRACE(jh, "refile or unfile freed buffer");
+			__journal_refile_buffer(jh);
+			if (!jh->b_transaction) {
+				jbd_unlock_bh_state(bh);
+				 /* needs a brelse */
+				journal_remove_journal_head(bh);
+				release_buffer_page(bh);
+			} else
+				jbd_unlock_bh_state(bh);
 		}
 		cond_resched_lock(&journal->j_list_lock);
 	}
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index c609f5034fc..ff75afe9b18 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -2038,7 +2038,8 @@ void __journal_refile_buffer(struct journal_head *jh)
 	__journal_temp_unlink_buffer(jh);
 	jh->b_transaction = jh->b_next_transaction;
 	jh->b_next_transaction = NULL;
-	__journal_file_buffer(jh, jh->b_transaction, BJ_Metadata);
+	__journal_file_buffer(jh, jh->b_transaction,
+				was_dirty ? BJ_Metadata : BJ_Reserved);
 	J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
 
 	if (was_dirty)
-- 
cgit v1.2.3


From 304c4c841a31c780a45d65e389b07706babf5d36 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 23 Jun 2006 02:05:31 -0700
Subject: [PATCH] jbd: avoid kfree(NULL)

There are a couple of places where JBD has to check to see whether an unneeded
memory allocation was performed.  Usually it _was_ needed, so we end up
calling kfree(NULL).  We can micro-optimise that by checking the pointer
before calling kfree().

Thanks to Steven Rostedt <rostedt@goodmis.org> for identifying this.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/jbd/transaction.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index ff75afe9b18..508b2ea91f4 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -227,7 +227,8 @@ repeat_locked:
 	spin_unlock(&transaction->t_handle_lock);
 	spin_unlock(&journal->j_state_lock);
 out:
-	kfree(new_transaction);
+	if (unlikely(new_transaction))		/* It's usually NULL */
+		kfree(new_transaction);
 	return ret;
 }
 
@@ -724,7 +725,8 @@ done:
 	journal_cancel_revoke(handle, jh);
 
 out:
-	kfree(frozen_buffer);
+	if (unlikely(frozen_buffer))	/* It's usually NULL */
+		kfree(frozen_buffer);
 
 	JBUFFER_TRACE(jh, "exit");
 	return error;
@@ -903,7 +905,8 @@ repeat:
 	jbd_unlock_bh_state(bh);
 out:
 	journal_put_journal_head(jh);
-	kfree(committed_data);
+	if (unlikely(committed_data))
+		kfree(committed_data);
 	return err;
 }
 
-- 
cgit v1.2.3


From e6022603b9aa7d61d20b392e69edcdbbc1789969 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 23 Jun 2006 02:05:32 -0700
Subject: [PATCH] ext3_clear_inode(): avoid kfree(NULL)

Steven Rostedt <rostedt@goodmis.org> points out that `rsv' here is usually
NULL, so we should avoid calling kfree().

Also, fix up some nearby whitespace damage.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext3/super.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index e08b6439617..1a198b3985c 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -499,20 +499,21 @@ static void ext3_clear_inode(struct inode *inode)
 {
 	struct ext3_block_alloc_info *rsv = EXT3_I(inode)->i_block_alloc_info;
 #ifdef CONFIG_EXT3_FS_POSIX_ACL
-       if (EXT3_I(inode)->i_acl &&
-           EXT3_I(inode)->i_acl != EXT3_ACL_NOT_CACHED) {
-               posix_acl_release(EXT3_I(inode)->i_acl);
-               EXT3_I(inode)->i_acl = EXT3_ACL_NOT_CACHED;
-       }
-       if (EXT3_I(inode)->i_default_acl &&
-           EXT3_I(inode)->i_default_acl != EXT3_ACL_NOT_CACHED) {
-               posix_acl_release(EXT3_I(inode)->i_default_acl);
-               EXT3_I(inode)->i_default_acl = EXT3_ACL_NOT_CACHED;
-       }
+	if (EXT3_I(inode)->i_acl &&
+			EXT3_I(inode)->i_acl != EXT3_ACL_NOT_CACHED) {
+		posix_acl_release(EXT3_I(inode)->i_acl);
+		EXT3_I(inode)->i_acl = EXT3_ACL_NOT_CACHED;
+	}
+	if (EXT3_I(inode)->i_default_acl &&
+			EXT3_I(inode)->i_default_acl != EXT3_ACL_NOT_CACHED) {
+		posix_acl_release(EXT3_I(inode)->i_default_acl);
+		EXT3_I(inode)->i_default_acl = EXT3_ACL_NOT_CACHED;
+	}
 #endif
 	ext3_discard_reservation(inode);
 	EXT3_I(inode)->i_block_alloc_info = NULL;
-	kfree(rsv);
+	if (unlikely(rsv))
+		kfree(rsv);
 }
 
 static inline void ext3_show_quota_options(struct seq_file *seq, struct super_block *sb)
-- 
cgit v1.2.3


From f4e5cc2c44bf760c02875cf48c886c50ec7d2734 Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jesper.juhl@gmail.com>
Date: Fri, 23 Jun 2006 02:05:35 -0700
Subject: [PATCH] binfmt_elf: CodingStyle cleanup and remove some pointless
 casts

Do a CodingStyle cleanup of fs/binfmt_elf.c and also remove some pointless
casts of kmalloc() return values in the same file.

Signed-off-by: Jesper Juhl <jesper.juhl@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/binfmt_elf.c | 344 ++++++++++++++++++++++++++++++--------------------------
 1 file changed, 183 insertions(+), 161 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 8a04216e8b4..451c04fecb4 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -38,15 +38,13 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/random.h>
-
+#include <linux/elf.h>
 #include <asm/uaccess.h>
 #include <asm/param.h>
 #include <asm/page.h>
 
-#include <linux/elf.h>
-
-static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs);
-static int load_elf_library(struct file*);
+static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs);
+static int load_elf_library(struct file *);
 static unsigned long elf_map (struct file *, unsigned long, struct elf_phdr *, int, int);
 extern int dump_fpu (struct pt_regs *, elf_fpregset_t *);
 
@@ -59,15 +57,15 @@ extern int dump_fpu (struct pt_regs *, elf_fpregset_t *);
  * don't even try.
  */
 #if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
-static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file);
+static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file);
 #else
 #define elf_core_dump	NULL
 #endif
 
 #if ELF_EXEC_PAGESIZE > PAGE_SIZE
-# define ELF_MIN_ALIGN	ELF_EXEC_PAGESIZE
+#define ELF_MIN_ALIGN	ELF_EXEC_PAGESIZE
 #else
-# define ELF_MIN_ALIGN	PAGE_SIZE
+#define ELF_MIN_ALIGN	PAGE_SIZE
 #endif
 
 #ifndef ELF_CORE_EFLAGS
@@ -86,7 +84,7 @@ static struct linux_binfmt elf_format = {
 		.min_coredump	= ELF_EXEC_PAGESIZE
 };
 
-#define BAD_ADDR(x)	((unsigned long)(x) > TASK_SIZE)
+#define BAD_ADDR(x) ((unsigned long)(x) > TASK_SIZE)
 
 static int set_brk(unsigned long start, unsigned long end)
 {
@@ -104,13 +102,11 @@ static int set_brk(unsigned long start, unsigned long end)
 	return 0;
 }
 
-
 /* We need to explicitly zero any fractional pages
    after the data section (i.e. bss).  This would
    contain the junk from the file that should not
-   be in memory */
-
-
+   be in memory
+ */
 static int padzero(unsigned long elf_bss)
 {
 	unsigned long nbyte;
@@ -129,7 +125,9 @@ static int padzero(unsigned long elf_bss)
 #define STACK_ADD(sp, items) ((elf_addr_t __user *)(sp) + (items))
 #define STACK_ROUND(sp, items) \
 	((15 + (unsigned long) ((sp) + (items))) &~ 15UL)
-#define STACK_ALLOC(sp, len) ({ elf_addr_t __user *old_sp = (elf_addr_t __user *)sp; sp += len; old_sp; })
+#define STACK_ALLOC(sp, len) ({ \
+	elf_addr_t __user *old_sp = (elf_addr_t __user *)sp; sp += len; \
+	old_sp; })
 #else
 #define STACK_ADD(sp, items) ((elf_addr_t __user *)(sp) - (items))
 #define STACK_ROUND(sp, items) \
@@ -138,7 +136,7 @@ static int padzero(unsigned long elf_bss)
 #endif
 
 static int
-create_elf_tables(struct linux_binprm *bprm, struct elfhdr * exec,
+create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 		int interp_aout, unsigned long load_addr,
 		unsigned long interp_load_addr)
 {
@@ -161,7 +159,6 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr * exec,
 	 * for userspace to get any other way, in others (i386) it is
 	 * merely difficult.
 	 */
-
 	u_platform = NULL;
 	if (k_platform) {
 		size_t len = strlen(k_platform) + 1;
@@ -171,7 +168,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr * exec,
 		 * evictions by the processes running on the same package. One
 		 * thing we can do is to shuffle the initial stack for them.
 		 */
-	 
+
 		p = arch_align_stack(p);
 
 		u_platform = (elf_addr_t __user *)STACK_ALLOC(p, len);
@@ -182,7 +179,9 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr * exec,
 	/* Create the ELF interpreter info */
 	elf_info = (elf_addr_t *) current->mm->saved_auxv;
 #define NEW_AUX_ENT(id, val) \
-	do { elf_info[ei_index++] = id; elf_info[ei_index++] = val; } while (0)
+	do { \
+		elf_info[ei_index++] = id; elf_info[ei_index++] = val; \
+	} while (0)
 
 #ifdef ARCH_DLINFO
 	/* 
@@ -195,21 +194,22 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr * exec,
 	NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE);
 	NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC);
 	NEW_AUX_ENT(AT_PHDR, load_addr + exec->e_phoff);
-	NEW_AUX_ENT(AT_PHENT, sizeof (struct elf_phdr));
+	NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr));
 	NEW_AUX_ENT(AT_PHNUM, exec->e_phnum);
 	NEW_AUX_ENT(AT_BASE, interp_load_addr);
 	NEW_AUX_ENT(AT_FLAGS, 0);
 	NEW_AUX_ENT(AT_ENTRY, exec->e_entry);
-	NEW_AUX_ENT(AT_UID, (elf_addr_t) tsk->uid);
-	NEW_AUX_ENT(AT_EUID, (elf_addr_t) tsk->euid);
-	NEW_AUX_ENT(AT_GID, (elf_addr_t) tsk->gid);
-	NEW_AUX_ENT(AT_EGID, (elf_addr_t) tsk->egid);
- 	NEW_AUX_ENT(AT_SECURE, (elf_addr_t) security_bprm_secureexec(bprm));
+	NEW_AUX_ENT(AT_UID, (elf_addr_t)tsk->uid);
+	NEW_AUX_ENT(AT_EUID, (elf_addr_t)tsk->euid);
+	NEW_AUX_ENT(AT_GID, (elf_addr_t)tsk->gid);
+	NEW_AUX_ENT(AT_EGID, (elf_addr_t)tsk->egid);
+ 	NEW_AUX_ENT(AT_SECURE, (elf_addr_t)security_bprm_secureexec(bprm));
 	if (k_platform) {
-		NEW_AUX_ENT(AT_PLATFORM, (elf_addr_t)(unsigned long)u_platform);
+		NEW_AUX_ENT(AT_PLATFORM,
+			(elf_addr_t)(unsigned long)u_platform);
 	}
 	if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) {
-		NEW_AUX_ENT(AT_EXECFD, (elf_addr_t) bprm->interp_data);
+		NEW_AUX_ENT(AT_EXECFD, (elf_addr_t)bprm->interp_data);
 	}
 #undef NEW_AUX_ENT
 	/* AT_NULL is zero; clear the rest too */
@@ -232,7 +232,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr * exec,
 	/* Point sp at the lowest address on the stack */
 #ifdef CONFIG_STACK_GROWSUP
 	sp = (elf_addr_t __user *)bprm->p - items - ei_index;
-	bprm->exec = (unsigned long) sp; /* XXX: PARISC HACK */
+	bprm->exec = (unsigned long)sp; /* XXX: PARISC HACK */
 #else
 	sp = (elf_addr_t __user *)bprm->p;
 #endif
@@ -285,7 +285,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr * exec,
 #ifndef elf_map
 
 static unsigned long elf_map(struct file *filep, unsigned long addr,
-			struct elf_phdr *eppnt, int prot, int type)
+		struct elf_phdr *eppnt, int prot, int type)
 {
 	unsigned long map_addr;
 	unsigned long pageoffset = ELF_PAGEOFFSET(eppnt->p_vaddr);
@@ -310,9 +310,8 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
    is only provided so that we can read a.out libraries that have
    an ELF header */
 
-static unsigned long load_elf_interp(struct elfhdr * interp_elf_ex,
-				     struct file * interpreter,
-				     unsigned long *interp_load_addr)
+static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
+		struct file *interpreter, unsigned long *interp_load_addr)
 {
 	struct elf_phdr *elf_phdata;
 	struct elf_phdr *eppnt;
@@ -342,15 +341,15 @@ static unsigned long load_elf_interp(struct elfhdr * interp_elf_ex,
 		goto out;
 
 	/* Now read in all of the header information */
-
 	size = sizeof(struct elf_phdr) * interp_elf_ex->e_phnum;
 	if (size > ELF_MIN_ALIGN)
 		goto out;
-	elf_phdata = (struct elf_phdr *) kmalloc(size, GFP_KERNEL);
+	elf_phdata = kmalloc(size, GFP_KERNEL);
 	if (!elf_phdata)
 		goto out;
 
-	retval = kernel_read(interpreter,interp_elf_ex->e_phoff,(char *)elf_phdata,size);
+	retval = kernel_read(interpreter, interp_elf_ex->e_phoff,
+			     (char *)elf_phdata,size);
 	error = -EIO;
 	if (retval != size) {
 		if (retval < 0)
@@ -359,58 +358,65 @@ static unsigned long load_elf_interp(struct elfhdr * interp_elf_ex,
 	}
 
 	eppnt = elf_phdata;
-	for (i=0; i<interp_elf_ex->e_phnum; i++, eppnt++) {
-	  if (eppnt->p_type == PT_LOAD) {
-	    int elf_type = MAP_PRIVATE | MAP_DENYWRITE;
-	    int elf_prot = 0;
-	    unsigned long vaddr = 0;
-	    unsigned long k, map_addr;
-
-	    if (eppnt->p_flags & PF_R) elf_prot =  PROT_READ;
-	    if (eppnt->p_flags & PF_W) elf_prot |= PROT_WRITE;
-	    if (eppnt->p_flags & PF_X) elf_prot |= PROT_EXEC;
-	    vaddr = eppnt->p_vaddr;
-	    if (interp_elf_ex->e_type == ET_EXEC || load_addr_set)
-	    	elf_type |= MAP_FIXED;
-
-	    map_addr = elf_map(interpreter, load_addr + vaddr, eppnt, elf_prot, elf_type);
-	    error = map_addr;
-	    if (BAD_ADDR(map_addr))
-	    	goto out_close;
-
-	    if (!load_addr_set && interp_elf_ex->e_type == ET_DYN) {
-		load_addr = map_addr - ELF_PAGESTART(vaddr);
-		load_addr_set = 1;
-	    }
-
-	    /*
-	     * Check to see if the section's size will overflow the
-	     * allowed task size. Note that p_filesz must always be
-	     * <= p_memsize so it is only necessary to check p_memsz.
-	     */
-	    k = load_addr + eppnt->p_vaddr;
-	    if (k > TASK_SIZE || eppnt->p_filesz > eppnt->p_memsz ||
-		eppnt->p_memsz > TASK_SIZE || TASK_SIZE - eppnt->p_memsz < k) {
-	        error = -ENOMEM;
-		goto out_close;
-	    }
-
-	    /*
-	     * Find the end of the file mapping for this phdr, and keep
-	     * track of the largest address we see for this.
-	     */
-	    k = load_addr + eppnt->p_vaddr + eppnt->p_filesz;
-	    if (k > elf_bss)
-		elf_bss = k;
-
-	    /*
-	     * Do the same thing for the memory mapping - between
-	     * elf_bss and last_bss is the bss section.
-	     */
-	    k = load_addr + eppnt->p_memsz + eppnt->p_vaddr;
-	    if (k > last_bss)
-		last_bss = k;
-	  }
+	for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
+		if (eppnt->p_type == PT_LOAD) {
+			int elf_type = MAP_PRIVATE | MAP_DENYWRITE;
+			int elf_prot = 0;
+			unsigned long vaddr = 0;
+			unsigned long k, map_addr;
+
+			if (eppnt->p_flags & PF_R)
+		    		elf_prot = PROT_READ;
+			if (eppnt->p_flags & PF_W)
+				elf_prot |= PROT_WRITE;
+			if (eppnt->p_flags & PF_X)
+				elf_prot |= PROT_EXEC;
+			vaddr = eppnt->p_vaddr;
+			if (interp_elf_ex->e_type == ET_EXEC || load_addr_set)
+				elf_type |= MAP_FIXED;
+
+			map_addr = elf_map(interpreter, load_addr + vaddr,
+					   eppnt, elf_prot, elf_type);
+			error = map_addr;
+			if (BAD_ADDR(map_addr))
+				goto out_close;
+
+			if (!load_addr_set &&
+			    interp_elf_ex->e_type == ET_DYN) {
+				load_addr = map_addr - ELF_PAGESTART(vaddr);
+				load_addr_set = 1;
+			}
+
+			/*
+			 * Check to see if the section's size will overflow the
+			 * allowed task size. Note that p_filesz must always be
+			 * <= p_memsize so it's only necessary to check p_memsz.
+			 */
+			k = load_addr + eppnt->p_vaddr;
+			if (k > TASK_SIZE ||
+			    eppnt->p_filesz > eppnt->p_memsz ||
+			    eppnt->p_memsz > TASK_SIZE ||
+			    TASK_SIZE - eppnt->p_memsz < k) {
+				error = -ENOMEM;
+				goto out_close;
+			}
+
+			/*
+			 * Find the end of the file mapping for this phdr, and
+			 * keep track of the largest address we see for this.
+			 */
+			k = load_addr + eppnt->p_vaddr + eppnt->p_filesz;
+			if (k > elf_bss)
+				elf_bss = k;
+
+			/*
+			 * Do the same thing for the memory mapping - between
+			 * elf_bss and last_bss is the bss section.
+			 */
+			k = load_addr + eppnt->p_memsz + eppnt->p_vaddr;
+			if (k > last_bss)
+				last_bss = k;
+		}
 	}
 
 	/*
@@ -424,7 +430,8 @@ static unsigned long load_elf_interp(struct elfhdr * interp_elf_ex,
 		goto out_close;
 	}
 
-	elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);	/* What we have mapped so far */
+	/* What we have mapped so far */
+	elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
 
 	/* Map the last of the bss segment */
 	if (last_bss > elf_bss) {
@@ -436,7 +443,7 @@ static unsigned long load_elf_interp(struct elfhdr * interp_elf_ex,
 	}
 
 	*interp_load_addr = load_addr;
-	error = ((unsigned long) interp_elf_ex->e_entry) + load_addr;
+	error = ((unsigned long)interp_elf_ex->e_entry) + load_addr;
 
 out_close:
 	kfree(elf_phdata);
@@ -444,8 +451,8 @@ out:
 	return error;
 }
 
-static unsigned long load_aout_interp(struct exec * interp_ex,
-			     struct file * interpreter)
+static unsigned long load_aout_interp(struct exec *interp_ex,
+		struct file *interpreter)
 {
 	unsigned long text_data, elf_entry = ~0UL;
 	char __user * addr;
@@ -464,7 +471,7 @@ static unsigned long load_aout_interp(struct exec * interp_ex,
 	case ZMAGIC:
 	case QMAGIC:
 		offset = N_TXTOFF(*interp_ex);
-		addr = (char __user *) N_TXTADDR(*interp_ex);
+		addr = (char __user *)N_TXTADDR(*interp_ex);
 		break;
 	default:
 		goto out;
@@ -480,7 +487,6 @@ static unsigned long load_aout_interp(struct exec * interp_ex,
 	flush_icache_range((unsigned long)addr,
 	                   (unsigned long)addr + text_data);
 
-
 	down_write(&current->mm->mmap_sem);	
 	do_brk(ELF_PAGESTART(text_data + ELF_MIN_ALIGN - 1),
 		interp_ex->a_bss);
@@ -519,7 +525,7 @@ static unsigned long randomize_stack_top(unsigned long stack_top)
 #endif
 }
 
-static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
+static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 {
 	struct file *interpreter = NULL; /* to shut gcc up */
  	unsigned long load_addr = 0, load_bias = 0;
@@ -528,7 +534,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	unsigned int interpreter_type = INTERPRETER_NONE;
 	unsigned char ibcs2_interpreter = 0;
 	unsigned long error;
-	struct elf_phdr * elf_ppnt, *elf_phdata;
+	struct elf_phdr *elf_ppnt, *elf_phdata;
 	unsigned long elf_bss, elf_brk;
 	int elf_exec_fileno;
 	int retval, i;
@@ -553,7 +559,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	}
 	
 	/* Get the exec-header */
-	loc->elf_ex = *((struct elfhdr *) bprm->buf);
+	loc->elf_ex = *((struct elfhdr *)bprm->buf);
 
 	retval = -ENOEXEC;
 	/* First of all, some simple consistency checks */
@@ -568,7 +574,6 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 		goto out;
 
 	/* Now read in all of the header information */
-
 	if (loc->elf_ex.e_phentsize != sizeof(struct elf_phdr))
 		goto out;
 	if (loc->elf_ex.e_phnum < 1 ||
@@ -576,18 +581,19 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 		goto out;
 	size = loc->elf_ex.e_phnum * sizeof(struct elf_phdr);
 	retval = -ENOMEM;
-	elf_phdata = (struct elf_phdr *) kmalloc(size, GFP_KERNEL);
+	elf_phdata = kmalloc(size, GFP_KERNEL);
 	if (!elf_phdata)
 		goto out;
 
-	retval = kernel_read(bprm->file, loc->elf_ex.e_phoff, (char *) elf_phdata, size);
+	retval = kernel_read(bprm->file, loc->elf_ex.e_phoff,
+			     (char *)elf_phdata, size);
 	if (retval != size) {
 		if (retval >= 0)
 			retval = -EIO;
 		goto out_free_ph;
 	}
 
-	files = current->files;		/* Refcounted so ok */
+	files = current->files;	/* Refcounted so ok */
 	retval = unshare_files();
 	if (retval < 0)
 		goto out_free_ph;
@@ -598,7 +604,6 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 
 	/* exec will make our files private anyway, but for the a.out
 	   loader stuff we need to do it earlier */
-
 	retval = get_unused_fd();
 	if (retval < 0)
 		goto out_free_fh;
@@ -620,7 +625,6 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 			 * shared libraries - for now assume that this
 			 * is an a.out format binary
 			 */
-
 			retval = -ENOEXEC;
 			if (elf_ppnt->p_filesz > PATH_MAX || 
 			    elf_ppnt->p_filesz < 2)
@@ -628,13 +632,13 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 
 			retval = -ENOMEM;
 			elf_interpreter = kmalloc(elf_ppnt->p_filesz,
-							   GFP_KERNEL);
+						  GFP_KERNEL);
 			if (!elf_interpreter)
 				goto out_free_file;
 
 			retval = kernel_read(bprm->file, elf_ppnt->p_offset,
-					   elf_interpreter,
-					   elf_ppnt->p_filesz);
+					     elf_interpreter,
+					     elf_ppnt->p_filesz);
 			if (retval != elf_ppnt->p_filesz) {
 				if (retval >= 0)
 					retval = -EIO;
@@ -678,7 +682,8 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 			retval = PTR_ERR(interpreter);
 			if (IS_ERR(interpreter))
 				goto out_free_interp;
-			retval = kernel_read(interpreter, 0, bprm->buf, BINPRM_BUF_SIZE);
+			retval = kernel_read(interpreter, 0, bprm->buf,
+					     BINPRM_BUF_SIZE);
 			if (retval != BINPRM_BUF_SIZE) {
 				if (retval >= 0)
 					retval = -EIO;
@@ -686,8 +691,8 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 			}
 
 			/* Get the exec headers */
-			loc->interp_ex = *((struct exec *) bprm->buf);
-			loc->interp_elf_ex = *((struct elfhdr *) bprm->buf);
+			loc->interp_ex = *((struct exec *)bprm->buf);
+			loc->interp_elf_ex = *((struct elfhdr *)bprm->buf);
 			break;
 		}
 		elf_ppnt++;
@@ -739,7 +744,6 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 
 	/* OK, we are done with that, now set up the arg stuff,
 	   and then start this sucker up */
-
 	if ((!bprm->sh_bang) && (interpreter_type == INTERPRETER_AOUT)) {
 		char *passed_p = passed_fileno;
 		sprintf(passed_fileno, "%d", elf_exec_fileno);
@@ -777,7 +781,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	if (elf_read_implies_exec(loc->elf_ex, executable_stack))
 		current->personality |= READ_IMPLIES_EXEC;
 
-	if ( !(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
+	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
 		current->flags |= PF_RANDOMIZE;
 	arch_pick_mmap_layout(current->mm);
 
@@ -798,8 +802,8 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	   the correct location in memory.  At this point, we assume that
 	   the image should be loaded at fixed address, not at a variable
 	   address. */
-
-	for(i = 0, elf_ppnt = elf_phdata; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
+	for(i = 0, elf_ppnt = elf_phdata;
+	    i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
 		int elf_prot = 0, elf_flags;
 		unsigned long k, vaddr;
 
@@ -827,30 +831,35 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 							load_bias, nbyte)) {
 					/*
 					 * This bss-zeroing can fail if the ELF
-					 * file specifies odd protections.  So
+					 * file specifies odd protections. So
 					 * we don't check the return value
 					 */
 				}
 			}
 		}
 
-		if (elf_ppnt->p_flags & PF_R) elf_prot |= PROT_READ;
-		if (elf_ppnt->p_flags & PF_W) elf_prot |= PROT_WRITE;
-		if (elf_ppnt->p_flags & PF_X) elf_prot |= PROT_EXEC;
+		if (elf_ppnt->p_flags & PF_R)
+			elf_prot |= PROT_READ;
+		if (elf_ppnt->p_flags & PF_W)
+			elf_prot |= PROT_WRITE;
+		if (elf_ppnt->p_flags & PF_X)
+			elf_prot |= PROT_EXEC;
 
-		elf_flags = MAP_PRIVATE|MAP_DENYWRITE|MAP_EXECUTABLE;
+		elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE;
 
 		vaddr = elf_ppnt->p_vaddr;
 		if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) {
 			elf_flags |= MAP_FIXED;
 		} else if (loc->elf_ex.e_type == ET_DYN) {
-			/* Try and get dynamic programs out of the way of the default mmap
-			   base, as well as whatever program they might try to exec.  This
-			   is because the brk will follow the loader, and is not movable.  */
+			/* Try and get dynamic programs out of the way of the
+			 * default mmap base, as well as whatever program they
+			 * might try to exec.  This is because the brk will
+			 * follow the loader, and is not movable.  */
 			load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
 		}
 
-		error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, elf_prot, elf_flags);
+		error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
+				elf_prot, elf_flags);
 		if (BAD_ADDR(error)) {
 			send_sig(SIGKILL, current, 0);
 			goto out_free_dentry;
@@ -867,8 +876,10 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 			}
 		}
 		k = elf_ppnt->p_vaddr;
-		if (k < start_code) start_code = k;
-		if (start_data < k) start_data = k;
+		if (k < start_code)
+			start_code = k;
+		if (start_data < k)
+			start_data = k;
 
 		/*
 		 * Check to see if the section's size will overflow the
@@ -878,7 +889,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 		if (k > TASK_SIZE || elf_ppnt->p_filesz > elf_ppnt->p_memsz ||
 		    elf_ppnt->p_memsz > TASK_SIZE ||
 		    TASK_SIZE - elf_ppnt->p_memsz < k) {
-			/* set_brk can never work.  Avoid overflows.  */
+			/* set_brk can never work. Avoid overflows. */
 			send_sig(SIGKILL, current, 0);
 			goto out_free_dentry;
 		}
@@ -966,8 +977,9 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 
 	compute_creds(bprm);
 	current->flags &= ~PF_FORKNOEXEC;
-	create_elf_tables(bprm, &loc->elf_ex, (interpreter_type == INTERPRETER_AOUT),
-			load_addr, interp_load_addr);
+	create_elf_tables(bprm, &loc->elf_ex,
+			  (interpreter_type == INTERPRETER_AOUT),
+			  load_addr, interp_load_addr);
 	/* N.B. passed_fileno might not be initialized? */
 	if (interpreter_type == INTERPRETER_AOUT)
 		current->mm->arg_start += strlen(passed_fileno) + 1;
@@ -981,7 +993,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 		/* Why this, you ask???  Well SVr4 maps page 0 as read-only,
 		   and some applications "depend" upon this behavior.
 		   Since we do not have the power to recompile these, we
-		   emulate the SVr4 behavior.  Sigh.  */
+		   emulate the SVr4 behavior. Sigh. */
 		down_write(&current->mm->mmap_sem);
 		error = do_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC,
 				MAP_FIXED | MAP_PRIVATE, 0);
@@ -1036,7 +1048,6 @@ out_free_ph:
 
 /* This is really simpleminded and specialized - we are loading an
    a.out library that is given an ELF header. */
-
 static int load_elf_library(struct file *file)
 {
 	struct elf_phdr *elf_phdata;
@@ -1046,7 +1057,7 @@ static int load_elf_library(struct file *file)
 	struct elfhdr elf_ex;
 
 	error = -ENOEXEC;
-	retval = kernel_read(file, 0, (char *) &elf_ex, sizeof(elf_ex));
+	retval = kernel_read(file, 0, (char *)&elf_ex, sizeof(elf_ex));
 	if (retval != sizeof(elf_ex))
 		goto out;
 
@@ -1055,7 +1066,7 @@ static int load_elf_library(struct file *file)
 
 	/* First of all, some simple consistency checks */
 	if (elf_ex.e_type != ET_EXEC || elf_ex.e_phnum > 2 ||
-	   !elf_check_arch(&elf_ex) || !file->f_op || !file->f_op->mmap)
+	    !elf_check_arch(&elf_ex) || !file->f_op || !file->f_op->mmap)
 		goto out;
 
 	/* Now read in all of the header information */
@@ -1103,7 +1114,8 @@ static int load_elf_library(struct file *file)
 		goto out_free_ph;
 	}
 
-	len = ELF_PAGESTART(eppnt->p_filesz + eppnt->p_vaddr + ELF_MIN_ALIGN - 1);
+	len = ELF_PAGESTART(eppnt->p_filesz + eppnt->p_vaddr +
+			    ELF_MIN_ALIGN - 1);
 	bss = eppnt->p_memsz + eppnt->p_vaddr;
 	if (bss > len) {
 		down_write(&current->mm->mmap_sem);
@@ -1162,7 +1174,7 @@ static int maydump(struct vm_area_struct *vma)
 	if (vma->vm_flags & (VM_IO | VM_RESERVED))
 		return 0;
 
-	/* Dump shared memory only if mapped from an anonymous file.  */
+	/* Dump shared memory only if mapped from an anonymous file. */
 	if (vma->vm_flags & VM_SHARED)
 		return vma->vm_file->f_dentry->d_inode->i_nlink == 0;
 
@@ -1173,7 +1185,7 @@ static int maydump(struct vm_area_struct *vma)
 	return 1;
 }
 
-#define roundup(x, y)  ((((x)+((y)-1))/(y))*(y))
+#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
 
 /* An ELF note in memory */
 struct memelfnote
@@ -1276,11 +1288,11 @@ static void fill_note(struct memelfnote *note, const char *name, int type,
 }
 
 /*
- * fill up all the fields in prstatus from the given task struct, except registers
- * which need to be filled up separately.
+ * fill up all the fields in prstatus from the given task struct, except
+ * registers which need to be filled up separately.
  */
 static void fill_prstatus(struct elf_prstatus *prstatus,
-			struct task_struct *p, long signr) 
+		struct task_struct *p, long signr)
 {
 	prstatus->pr_info.si_signo = prstatus->pr_cursig = signr;
 	prstatus->pr_sigpend = p->pending.signal.sig[0];
@@ -1365,8 +1377,8 @@ struct elf_thread_status
 
 /*
  * In order to add the specific thread information for the elf file format,
- * we need to keep a linked list of every threads pr_status and then
- * create a single section for them in the final core file.
+ * we need to keep a linked list of every threads pr_status and then create
+ * a single section for them in the final core file.
  */
 static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
 {
@@ -1377,19 +1389,23 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
 	fill_prstatus(&t->prstatus, p, signr);
 	elf_core_copy_task_regs(p, &t->prstatus.pr_reg);	
 	
-	fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), &(t->prstatus));
+	fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus),
+		  &(t->prstatus));
 	t->num_notes++;
 	sz += notesize(&t->notes[0]);
 
-	if ((t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, NULL, &t->fpu))) {
-		fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu), &(t->fpu));
+	if ((t->prstatus.pr_fpvalid = elf_core_copy_task_fpregs(p, NULL,
+								&t->fpu))) {
+		fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu),
+			  &(t->fpu));
 		t->num_notes++;
 		sz += notesize(&t->notes[1]);
 	}
 
 #ifdef ELF_CORE_COPY_XFPREGS
 	if (elf_core_copy_task_xfpregs(p, &t->xfpu)) {
-		fill_note(&t->notes[2], "LINUX", NT_PRXFPREG, sizeof(t->xfpu), &t->xfpu);
+		fill_note(&t->notes[2], "LINUX", NT_PRXFPREG, sizeof(t->xfpu),
+			  &t->xfpu);
 		t->num_notes++;
 		sz += notesize(&t->notes[2]);
 	}
@@ -1404,7 +1420,7 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t)
  * and then they are actually written out.  If we run out of core limit
  * we just truncate.
  */
-static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file)
+static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
 {
 #define	NUM_NOTES	6
 	int has_dumped = 0;
@@ -1433,12 +1449,12 @@ static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file)
 	/*
 	 * We no longer stop all VM operations.
 	 * 
-	 * This is because those proceses that could possibly change map_count or
-	 * the mmap / vma pages are now blocked in do_exit on current finishing
-	 * this core dump.
+	 * This is because those proceses that could possibly change map_count
+	 * or the mmap / vma pages are now blocked in do_exit on current
+	 * finishing this core dump.
 	 *
 	 * Only ptrace can touch these memory addresses, but it doesn't change
-	 * the map_count or the pages allocated.  So no possibility of crashing
+	 * the map_count or the pages allocated. So no possibility of crashing
 	 * exists while dumping the mm->vm_next areas to the core file.
 	 */
   
@@ -1500,7 +1516,7 @@ static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file)
 #endif
 
 	/* Set up header */
-	fill_elf_header(elf, segs+1);	/* including notes section */
+	fill_elf_header(elf, segs + 1);	/* including notes section */
 
 	has_dumped = 1;
 	current->flags |= PF_DUMPCORE;
@@ -1510,24 +1526,24 @@ static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file)
 	 * with info from their /proc.
 	 */
 
-	fill_note(notes +0, "CORE", NT_PRSTATUS, sizeof(*prstatus), prstatus);
-	
+	fill_note(notes + 0, "CORE", NT_PRSTATUS, sizeof(*prstatus), prstatus);
 	fill_psinfo(psinfo, current->group_leader, current->mm);
-	fill_note(notes +1, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
+	fill_note(notes + 1, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
 	
 	numnote = 2;
 
-	auxv = (elf_addr_t *) current->mm->saved_auxv;
+	auxv = (elf_addr_t *)current->mm->saved_auxv;
 
 	i = 0;
 	do
 		i += 2;
 	while (auxv[i - 2] != AT_NULL);
 	fill_note(&notes[numnote++], "CORE", NT_AUXV,
-		  i * sizeof (elf_addr_t), auxv);
+		  i * sizeof(elf_addr_t), auxv);
 
   	/* Try to dump the FPU. */
-	if ((prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs, fpu)))
+	if ((prstatus->pr_fpvalid =
+	     elf_core_copy_task_fpregs(current, regs, fpu)))
 		fill_note(notes + numnote++,
 			  "CORE", NT_PRFPREG, sizeof(*fpu), fpu);
 #ifdef ELF_CORE_COPY_XFPREGS
@@ -1576,8 +1592,10 @@ static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file)
 		phdr.p_memsz = sz;
 		offset += phdr.p_filesz;
 		phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0;
-		if (vma->vm_flags & VM_WRITE) phdr.p_flags |= PF_W;
-		if (vma->vm_flags & VM_EXEC) phdr.p_flags |= PF_X;
+		if (vma->vm_flags & VM_WRITE)
+			phdr.p_flags |= PF_W;
+		if (vma->vm_flags & VM_EXEC)
+			phdr.p_flags |= PF_X;
 		phdr.p_align = ELF_EXEC_PAGESIZE;
 
 		DUMP_WRITE(&phdr, sizeof(phdr));
@@ -1594,7 +1612,9 @@ static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file)
 
 	/* write out the thread status notes section */
 	list_for_each(t, &thread_list) {
-		struct elf_thread_status *tmp = list_entry(t, struct elf_thread_status, list);
+		struct elf_thread_status *tmp =
+				list_entry(t, struct elf_thread_status, list);
+
 		for (i = 0; i < tmp->num_notes; i++)
 			if (!writenote(&tmp->notes[i], file))
 				goto end_coredump;
@@ -1611,18 +1631,19 @@ static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file)
 		for (addr = vma->vm_start;
 		     addr < vma->vm_end;
 		     addr += PAGE_SIZE) {
-			struct page* page;
+			struct page *page;
 			struct vm_area_struct *vma;
 
 			if (get_user_pages(current, current->mm, addr, 1, 0, 1,
 						&page, &vma) <= 0) {
-				DUMP_SEEK (file->f_pos + PAGE_SIZE);
+				DUMP_SEEK(file->f_pos + PAGE_SIZE);
 			} else {
 				if (page == ZERO_PAGE(addr)) {
-					DUMP_SEEK (file->f_pos + PAGE_SIZE);
+					DUMP_SEEK(file->f_pos + PAGE_SIZE);
 				} else {
 					void *kaddr;
-					flush_cache_page(vma, addr, page_to_pfn(page));
+					flush_cache_page(vma, addr,
+							 page_to_pfn(page));
 					kaddr = kmap(page);
 					if ((size += PAGE_SIZE) > limit ||
 					    !dump_write(file, kaddr,
@@ -1644,7 +1665,8 @@ static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file)
 
 	if ((off_t)file->f_pos != offset) {
 		/* Sanity check */
-		printk(KERN_WARNING "elf_core_dump: file->f_pos (%ld) != offset (%ld)\n",
+		printk(KERN_WARNING
+		       "elf_core_dump: file->f_pos (%ld) != offset (%ld)\n",
 		       (off_t)file->f_pos, offset);
 	}
 
-- 
cgit v1.2.3


From 785d55708c24c28d7646f3d1fe6c9f82fb714311 Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jesper.juhl@gmail.com>
Date: Fri, 23 Jun 2006 02:05:35 -0700
Subject: [PATCH] binflt_elf: remove more casts

Remove redundant casts from NEW_AUX_ENT() arguments in fs/binfmt_elf.c

Signed-off-by: Jesper Juhl <jesper.juhl@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/binfmt_elf.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 451c04fecb4..d0434406eae 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -177,10 +177,11 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 	}
 
 	/* Create the ELF interpreter info */
-	elf_info = (elf_addr_t *) current->mm->saved_auxv;
+	elf_info = (elf_addr_t *)current->mm->saved_auxv;
 #define NEW_AUX_ENT(id, val) \
 	do { \
-		elf_info[ei_index++] = id; elf_info[ei_index++] = val; \
+		elf_info[ei_index++] = id; \
+		elf_info[ei_index++] = val; \
 	} while (0)
 
 #ifdef ARCH_DLINFO
@@ -199,17 +200,17 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 	NEW_AUX_ENT(AT_BASE, interp_load_addr);
 	NEW_AUX_ENT(AT_FLAGS, 0);
 	NEW_AUX_ENT(AT_ENTRY, exec->e_entry);
-	NEW_AUX_ENT(AT_UID, (elf_addr_t)tsk->uid);
-	NEW_AUX_ENT(AT_EUID, (elf_addr_t)tsk->euid);
-	NEW_AUX_ENT(AT_GID, (elf_addr_t)tsk->gid);
-	NEW_AUX_ENT(AT_EGID, (elf_addr_t)tsk->egid);
- 	NEW_AUX_ENT(AT_SECURE, (elf_addr_t)security_bprm_secureexec(bprm));
+	NEW_AUX_ENT(AT_UID, tsk->uid);
+	NEW_AUX_ENT(AT_EUID, tsk->euid);
+	NEW_AUX_ENT(AT_GID, tsk->gid);
+	NEW_AUX_ENT(AT_EGID, tsk->egid);
+ 	NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
 	if (k_platform) {
 		NEW_AUX_ENT(AT_PLATFORM,
-			(elf_addr_t)(unsigned long)u_platform);
+			    (elf_addr_t)(unsigned long)u_platform);
 	}
 	if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) {
-		NEW_AUX_ENT(AT_EXECFD, (elf_addr_t)bprm->interp_data);
+		NEW_AUX_ENT(AT_EXECFD, bprm->interp_data);
 	}
 #undef NEW_AUX_ENT
 	/* AT_NULL is zero; clear the rest too */
-- 
cgit v1.2.3


From 0216bfcffe424a5473daa4da47440881b36c1f41 Mon Sep 17 00:00:00 2001
From: Mingming Cao <cmm@us.ibm.com>
Date: Fri, 23 Jun 2006 02:05:41 -0700
Subject: [PATCH] percpu counter data type changes to suppport more than 2**31
 ext3 free blocks counter

The percpu counter data type are changed in this set of patches to support
more users like ext3 who need more than 32 bit to store the free blocks
total in the filesystem.

- Generic perpcu counters data type changes.  The size of the global counter
  and local counter were explictly specified using s64 and s32.  The global
  counter is changed from long to s64, while the local counter is changed from
  long to s32, so we could avoid doing 64 bit update in most cases.

- Users of the percpu counters are updated to make use of the new
  percpu_counter_init() routine now taking an additional parameter to allow
  users to pass the initial value of the global counter.

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext2/super.c | 25 +++++++++++++------------
 fs/ext3/super.c | 36 +++++++++++++++++++-----------------
 fs/file_table.c |  2 +-
 3 files changed, 33 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index a6c4d6e0232..ee4ba759581 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -834,9 +834,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 		printk ("EXT2-fs: not enough memory\n");
 		goto failed_mount;
 	}
-	percpu_counter_init(&sbi->s_freeblocks_counter);
-	percpu_counter_init(&sbi->s_freeinodes_counter);
-	percpu_counter_init(&sbi->s_dirs_counter);
 	bgl_lock_init(&sbi->s_blockgroup_lock);
 	sbi->s_debts = kmalloc(sbi->s_groups_count * sizeof(*sbi->s_debts),
 			       GFP_KERNEL);
@@ -863,6 +860,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_gdb_count = db_count;
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
+
+	percpu_counter_init(&sbi->s_freeblocks_counter,
+				ext2_count_free_blocks(sb));
+	percpu_counter_init(&sbi->s_freeinodes_counter,
+				ext2_count_free_inodes(sb));
+	percpu_counter_init(&sbi->s_dirs_counter,
+				ext2_count_dirs(sb));
 	/*
 	 * set up enough so that it can read an inode
 	 */
@@ -874,24 +878,18 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	if (!sb->s_root) {
 		iput(root);
 		printk(KERN_ERR "EXT2-fs: get root inode failed\n");
-		goto failed_mount2;
+		goto failed_mount3;
 	}
 	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
 		dput(sb->s_root);
 		sb->s_root = NULL;
 		printk(KERN_ERR "EXT2-fs: corrupt root inode, run e2fsck\n");
-		goto failed_mount2;
+		goto failed_mount3;
 	}
 	if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL))
 		ext2_warning(sb, __FUNCTION__,
 			"mounting ext3 filesystem as ext2");
 	ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY);
-	percpu_counter_mod(&sbi->s_freeblocks_counter,
-				ext2_count_free_blocks(sb));
-	percpu_counter_mod(&sbi->s_freeinodes_counter,
-				ext2_count_free_inodes(sb));
-	percpu_counter_mod(&sbi->s_dirs_counter,
-				ext2_count_dirs(sb));
 	return 0;
 
 cantfind_ext2:
@@ -899,7 +897,10 @@ cantfind_ext2:
 		printk("VFS: Can't find an ext2 filesystem on dev %s.\n",
 		       sb->s_id);
 	goto failed_mount;
-
+failed_mount3:
+	percpu_counter_destroy(&sbi->s_freeblocks_counter);
+	percpu_counter_destroy(&sbi->s_freeinodes_counter);
+	percpu_counter_destroy(&sbi->s_dirs_counter);
 failed_mount2:
 	for (i = 0; i < db_count; i++)
 		brelse(sbi->s_group_desc[i]);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 1a198b3985c..a60cc6ec130 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1580,9 +1580,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
-	percpu_counter_init(&sbi->s_freeblocks_counter);
-	percpu_counter_init(&sbi->s_freeinodes_counter);
-	percpu_counter_init(&sbi->s_dirs_counter);
 	bgl_lock_init(&sbi->s_blockgroup_lock);
 
 	for (i = 0; i < db_count; i++) {
@@ -1602,6 +1599,14 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	sbi->s_gdb_count = db_count;
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
+
+	percpu_counter_init(&sbi->s_freeblocks_counter,
+		ext3_count_free_blocks(sb));
+	percpu_counter_init(&sbi->s_freeinodes_counter,
+		ext3_count_free_inodes(sb));
+	percpu_counter_init(&sbi->s_dirs_counter,
+		ext3_count_dirs(sb));
+
 	/* per fileystem reservation list head & lock */
 	spin_lock_init(&sbi->s_rsv_window_lock);
 	sbi->s_rsv_window_root = RB_ROOT;
@@ -1640,16 +1645,16 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	if (!test_opt(sb, NOLOAD) &&
 	    EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
 		if (ext3_load_journal(sb, es, journal_devnum))
-			goto failed_mount2;
+			goto failed_mount3;
 	} else if (journal_inum) {
 		if (ext3_create_journal(sb, es, journal_inum))
-			goto failed_mount2;
+			goto failed_mount3;
 	} else {
 		if (!silent)
 			printk (KERN_ERR
 				"ext3: No journal on filesystem on %s\n",
 				sb->s_id);
-		goto failed_mount2;
+		goto failed_mount3;
 	}
 
 	/* We have now updated the journal if required, so we can
@@ -1672,7 +1677,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		    (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) {
 			printk(KERN_ERR "EXT3-fs: Journal does not support "
 			       "requested data journaling mode\n");
-			goto failed_mount3;
+			goto failed_mount4;
 		}
 	default:
 		break;
@@ -1695,13 +1700,13 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	if (!sb->s_root) {
 		printk(KERN_ERR "EXT3-fs: get root inode failed\n");
 		iput(root);
-		goto failed_mount3;
+		goto failed_mount4;
 	}
 	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
 		dput(sb->s_root);
 		sb->s_root = NULL;
 		printk(KERN_ERR "EXT3-fs: corrupt root inode, run e2fsck\n");
-		goto failed_mount3;
+		goto failed_mount4;
 	}
 
 	ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
@@ -1724,13 +1729,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
 		"writeback");
 
-	percpu_counter_mod(&sbi->s_freeblocks_counter,
-		ext3_count_free_blocks(sb));
-	percpu_counter_mod(&sbi->s_freeinodes_counter,
-		ext3_count_free_inodes(sb));
-	percpu_counter_mod(&sbi->s_dirs_counter,
-		ext3_count_dirs(sb));
-
 	lock_kernel();
 	return 0;
 
@@ -1740,8 +1738,12 @@ cantfind_ext3:
 		       sb->s_id);
 	goto failed_mount;
 
-failed_mount3:
+failed_mount4:
 	journal_destroy(sbi->s_journal);
+failed_mount3:
+	percpu_counter_destroy(&sbi->s_freeblocks_counter);
+	percpu_counter_destroy(&sbi->s_freeinodes_counter);
+	percpu_counter_destroy(&sbi->s_dirs_counter);
 failed_mount2:
 	for (i = 0; i < db_count; i++)
 		brelse(sbi->s_group_desc[i]);
diff --git a/fs/file_table.c b/fs/file_table.c
index bcea1998b4d..506d5307108 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -300,5 +300,5 @@ void __init files_init(unsigned long mempages)
 	if (files_stat.max_files < NR_FILE)
 		files_stat.max_files = NR_FILE;
 	files_defer_init();
-	percpu_counter_init(&nr_files);
+	percpu_counter_init(&nr_files, 0);
 } 
-- 
cgit v1.2.3


From 626ab0e69d376fa07599af669af8ba92d58e87c1 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 23 Jun 2006 02:05:55 -0700
Subject: [PATCH] list: use list_replace_init() instead of list_splice_init()

list_splice_init(list, head) does unneeded job if it is known that
list_empty(head) == 1.  We can use list_replace_init() instead.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/aio.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index e41e932ba48..8c34a62df7d 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -777,11 +777,11 @@ out:
 static int __aio_run_iocbs(struct kioctx *ctx)
 {
 	struct kiocb *iocb;
-	LIST_HEAD(run_list);
+	struct list_head run_list;
 
 	assert_spin_locked(&ctx->ctx_lock);
 
-	list_splice_init(&ctx->run_list, &run_list);
+	list_replace_init(&ctx->run_list, &run_list);
 	while (!list_empty(&run_list)) {
 		iocb = list_entry(run_list.next, struct kiocb,
 			ki_run_list);
-- 
cgit v1.2.3


From 78ce89c92bc6eaf5933b5664bff64253a7103bd7 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 23 Jun 2006 02:06:05 -0700
Subject: [PATCH] JBD: split checkpoint lists

Split the checkpoint list of the transaction into two lists.  In the first
list we keep the buffers that need to be submitted for IO.  In the second
list are kept buffers that were already submitted and we just have to wait
for the IO to complete.  This should simplify a handling of checkpoint
lists a bit and can eventually be also a performance gain.

Signed-off-by: Jan Kara <jack@suse.cz>
Cc: Mark Fasheh <mark.fasheh@oracle.com>
Cc: "Stephen C. Tweedie" <sct@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/jbd/checkpoint.c | 419 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 239 insertions(+), 180 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 3f5102b069d..47678a26c13 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -24,29 +24,67 @@
 #include <linux/slab.h>
 
 /*
- * Unlink a buffer from a transaction.
+ * Unlink a buffer from a transaction checkpoint list.
  *
  * Called with j_list_lock held.
  */
-
-static inline void __buffer_unlink(struct journal_head *jh)
+static inline void __buffer_unlink_first(struct journal_head *jh)
 {
-	transaction_t *transaction;
-
-	transaction = jh->b_cp_transaction;
-	jh->b_cp_transaction = NULL;
+	transaction_t *transaction = jh->b_cp_transaction;
 
 	jh->b_cpnext->b_cpprev = jh->b_cpprev;
 	jh->b_cpprev->b_cpnext = jh->b_cpnext;
-	if (transaction->t_checkpoint_list == jh)
+	if (transaction->t_checkpoint_list == jh) {
 		transaction->t_checkpoint_list = jh->b_cpnext;
-	if (transaction->t_checkpoint_list == jh)
-		transaction->t_checkpoint_list = NULL;
+		if (transaction->t_checkpoint_list == jh)
+			transaction->t_checkpoint_list = NULL;
+	}
+}
+
+/*
+ * Unlink a buffer from a transaction checkpoint(io) list.
+ *
+ * Called with j_list_lock held.
+ */
+static inline void __buffer_unlink(struct journal_head *jh)
+{
+	transaction_t *transaction = jh->b_cp_transaction;
+
+	__buffer_unlink_first(jh);
+	if (transaction->t_checkpoint_io_list == jh) {
+		transaction->t_checkpoint_io_list = jh->b_cpnext;
+		if (transaction->t_checkpoint_io_list == jh)
+			transaction->t_checkpoint_io_list = NULL;
+	}
+}
+
+/*
+ * Move a buffer from the checkpoint list to the checkpoint io list
+ *
+ * Called with j_list_lock held
+ */
+static inline void __buffer_relink_io(struct journal_head *jh)
+{
+	transaction_t *transaction = jh->b_cp_transaction;
+
+	__buffer_unlink_first(jh);
+
+	if (!transaction->t_checkpoint_io_list) {
+		jh->b_cpnext = jh->b_cpprev = jh;
+	} else {
+		jh->b_cpnext = transaction->t_checkpoint_io_list;
+		jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
+		jh->b_cpprev->b_cpnext = jh;
+		jh->b_cpnext->b_cpprev = jh;
+	}
+	transaction->t_checkpoint_io_list = jh;
 }
 
 /*
  * Try to release a checkpointed buffer from its transaction.
- * Returns 1 if we released it.
+ * Returns 1 if we released it and 2 if we also released the
+ * whole transaction.
+ *
  * Requires j_list_lock
  * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
  */
@@ -57,12 +95,11 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
 
 	if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
 		JBUFFER_TRACE(jh, "remove from checkpoint list");
-		__journal_remove_checkpoint(jh);
+		ret = __journal_remove_checkpoint(jh) + 1;
 		jbd_unlock_bh_state(bh);
 		journal_remove_journal_head(bh);
 		BUFFER_TRACE(bh, "release");
 		__brelse(bh);
-		ret = 1;
 	} else {
 		jbd_unlock_bh_state(bh);
 	}
@@ -117,83 +154,54 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
 }
 
 /*
- * Clean up a transaction's checkpoint list.
- *
- * We wait for any pending IO to complete and make sure any clean
- * buffers are removed from the transaction.
- *
- * Return 1 if we performed any actions which might have destroyed the
- * checkpoint.  (journal_remove_checkpoint() deletes the transaction when
- * the last checkpoint buffer is cleansed)
+ * Clean up transaction's list of buffers submitted for io.
+ * We wait for any pending IO to complete and remove any clean
+ * buffers. Note that we take the buffers in the opposite ordering
+ * from the one in which they were submitted for IO.
  *
  * Called with j_list_lock held.
  */
-static int __cleanup_transaction(journal_t *journal, transaction_t *transaction)
+static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
 {
-	struct journal_head *jh, *next_jh, *last_jh;
+	struct journal_head *jh;
 	struct buffer_head *bh;
-	int ret = 0;
-
-	assert_spin_locked(&journal->j_list_lock);
-	jh = transaction->t_checkpoint_list;
-	if (!jh)
-		return 0;
-
-	last_jh = jh->b_cpprev;
-	next_jh = jh;
-	do {
-		jh = next_jh;
+	tid_t this_tid;
+	int released = 0;
+
+	this_tid = transaction->t_tid;
+restart:
+	/* Did somebody clean up the transaction in the meanwhile? */
+	if (journal->j_checkpoint_transactions != transaction ||
+			transaction->t_tid != this_tid)
+		return;
+	while (!released && transaction->t_checkpoint_io_list) {
+		jh = transaction->t_checkpoint_io_list;
 		bh = jh2bh(jh);
+		if (!jbd_trylock_bh_state(bh)) {
+			jbd_sync_bh(journal, bh);
+			spin_lock(&journal->j_list_lock);
+			goto restart;
+		}
 		if (buffer_locked(bh)) {
 			atomic_inc(&bh->b_count);
 			spin_unlock(&journal->j_list_lock);
+			jbd_unlock_bh_state(bh);
 			wait_on_buffer(bh);
 			/* the journal_head may have gone by now */
 			BUFFER_TRACE(bh, "brelse");
 			__brelse(bh);
-			goto out_return_1;
+			spin_lock(&journal->j_list_lock);
+			goto restart;
 		}
-
 		/*
-		 * This is foul
+		 * Now in whatever state the buffer currently is, we know that
+		 * it has been written out and so we can drop it from the list
 		 */
-		if (!jbd_trylock_bh_state(bh)) {
-			jbd_sync_bh(journal, bh);
-			goto out_return_1;
-		}
-
-		if (jh->b_transaction != NULL) {
-			transaction_t *t = jh->b_transaction;
-			tid_t tid = t->t_tid;
-
-			spin_unlock(&journal->j_list_lock);
-			jbd_unlock_bh_state(bh);
-			log_start_commit(journal, tid);
-			log_wait_commit(journal, tid);
-			goto out_return_1;
-		}
-
-		/*
-		 * AKPM: I think the buffer_jbddirty test is redundant - it
-		 * shouldn't have NULL b_transaction?
-		 */
-		next_jh = jh->b_cpnext;
-		if (!buffer_dirty(bh) && !buffer_jbddirty(bh)) {
-			BUFFER_TRACE(bh, "remove from checkpoint");
-			__journal_remove_checkpoint(jh);
-			jbd_unlock_bh_state(bh);
-			journal_remove_journal_head(bh);
-			__brelse(bh);
-			ret = 1;
-		} else {
-			jbd_unlock_bh_state(bh);
-		}
-	} while (jh != last_jh);
-
-	return ret;
-out_return_1:
-	spin_lock(&journal->j_list_lock);
-	return 1;
+		released = __journal_remove_checkpoint(jh);
+		jbd_unlock_bh_state(bh);
+		journal_remove_journal_head(bh);
+		__brelse(bh);
+	}
 }
 
 #define NR_BATCH	64
@@ -203,9 +211,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
 {
 	int i;
 
-	spin_unlock(&journal->j_list_lock);
 	ll_rw_block(SWRITE, *batch_count, bhs);
-	spin_lock(&journal->j_list_lock);
 	for (i = 0; i < *batch_count; i++) {
 		struct buffer_head *bh = bhs[i];
 		clear_buffer_jwrite(bh);
@@ -221,19 +227,43 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
  * Return 1 if something happened which requires us to abort the current
  * scan of the checkpoint list.  
  *
- * Called with j_list_lock held.
+ * Called with j_list_lock held and drops it if 1 is returned
  * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
  */
-static int __flush_buffer(journal_t *journal, struct journal_head *jh,
-			struct buffer_head **bhs, int *batch_count,
-			int *drop_count)
+static int __process_buffer(journal_t *journal, struct journal_head *jh,
+			struct buffer_head **bhs, int *batch_count)
 {
 	struct buffer_head *bh = jh2bh(jh);
 	int ret = 0;
 
-	if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) {
-		J_ASSERT_JH(jh, jh->b_transaction == NULL);
+	if (buffer_locked(bh)) {
+		atomic_inc(&bh->b_count);
+		spin_unlock(&journal->j_list_lock);
+		jbd_unlock_bh_state(bh);
+		wait_on_buffer(bh);
+		/* the journal_head may have gone by now */
+		BUFFER_TRACE(bh, "brelse");
+		__brelse(bh);
+		ret = 1;
+	} else if (jh->b_transaction != NULL) {
+		transaction_t *t = jh->b_transaction;
+		tid_t tid = t->t_tid;
 
+		spin_unlock(&journal->j_list_lock);
+		jbd_unlock_bh_state(bh);
+		log_start_commit(journal, tid);
+		log_wait_commit(journal, tid);
+		ret = 1;
+	} else if (!buffer_dirty(bh)) {
+		J_ASSERT_JH(jh, !buffer_jbddirty(bh));
+		BUFFER_TRACE(bh, "remove from checkpoint");
+		__journal_remove_checkpoint(jh);
+		spin_unlock(&journal->j_list_lock);
+		jbd_unlock_bh_state(bh);
+		journal_remove_journal_head(bh);
+		__brelse(bh);
+		ret = 1;
+	} else {
 		/*
 		 * Important: we are about to write the buffer, and
 		 * possibly block, while still holding the journal lock.
@@ -246,45 +276,30 @@ static int __flush_buffer(journal_t *journal, struct journal_head *jh,
 		J_ASSERT_BH(bh, !buffer_jwrite(bh));
 		set_buffer_jwrite(bh);
 		bhs[*batch_count] = bh;
+		__buffer_relink_io(jh);
 		jbd_unlock_bh_state(bh);
 		(*batch_count)++;
 		if (*batch_count == NR_BATCH) {
+			spin_unlock(&journal->j_list_lock);
 			__flush_batch(journal, bhs, batch_count);
 			ret = 1;
 		}
-	} else {
-		int last_buffer = 0;
-		if (jh->b_cpnext == jh) {
-			/* We may be about to drop the transaction.  Tell the
-			 * caller that the lists have changed.
-			 */
-			last_buffer = 1;
-		}
-		if (__try_to_free_cp_buf(jh)) {
-			(*drop_count)++;
-			ret = last_buffer;
-		}
 	}
 	return ret;
 }
 
 /*
- * Perform an actual checkpoint.  We don't write out only enough to
- * satisfy the current blocked requests: rather we submit a reasonably
- * sized chunk of the outstanding data to disk at once for
- * efficiency.  __log_wait_for_space() will retry if we didn't free enough.
+ * Perform an actual checkpoint. We take the first transaction on the
+ * list of transactions to be checkpointed and send all its buffers
+ * to disk. We submit larger chunks of data at once.
  * 
- * However, we _do_ take into account the amount requested so that once
- * the IO has been queued, we can return as soon as enough of it has
- * completed to disk.
- *
  * The journal should be locked before calling this function.
  */
 int log_do_checkpoint(journal_t *journal)
 {
+	transaction_t *transaction;
+	tid_t this_tid;
 	int result;
-	int batch_count = 0;
-	struct buffer_head *bhs[NR_BATCH];
 
 	jbd_debug(1, "Start checkpoint\n");
 
@@ -299,79 +314,68 @@ int log_do_checkpoint(journal_t *journal)
 		return result;
 
 	/*
-	 * OK, we need to start writing disk blocks.  Try to free up a
-	 * quarter of the log in a single checkpoint if we can.
+	 * OK, we need to start writing disk blocks.  Take one transaction
+	 * and write it.
 	 */
+	spin_lock(&journal->j_list_lock);
+	if (!journal->j_checkpoint_transactions)
+		goto out;
+	transaction = journal->j_checkpoint_transactions;
+	this_tid = transaction->t_tid;
+restart:
 	/*
-	 * AKPM: check this code.  I had a feeling a while back that it
-	 * degenerates into a busy loop at unmount time.
+	 * If someone cleaned up this transaction while we slept, we're
+	 * done (maybe it's a new transaction, but it fell at the same
+	 * address).
 	 */
-	spin_lock(&journal->j_list_lock);
-	while (journal->j_checkpoint_transactions) {
-		transaction_t *transaction;
-		struct journal_head *jh, *last_jh, *next_jh;
-		int drop_count = 0;
-		int cleanup_ret, retry = 0;
-		tid_t this_tid;
-
-		transaction = journal->j_checkpoint_transactions;
-		this_tid = transaction->t_tid;
-		jh = transaction->t_checkpoint_list;
-		last_jh = jh->b_cpprev;
-		next_jh = jh;
-		do {
+	if (journal->j_checkpoint_transactions == transaction &&
+			transaction->t_tid == this_tid) {
+		int batch_count = 0;
+		struct buffer_head *bhs[NR_BATCH];
+		struct journal_head *jh;
+		int retry = 0;
+
+		while (!retry && transaction->t_checkpoint_list) {
 			struct buffer_head *bh;
 
-			jh = next_jh;
-			next_jh = jh->b_cpnext;
+			jh = transaction->t_checkpoint_list;
 			bh = jh2bh(jh);
 			if (!jbd_trylock_bh_state(bh)) {
 				jbd_sync_bh(journal, bh);
-				spin_lock(&journal->j_list_lock);
 				retry = 1;
 				break;
 			}
-			retry = __flush_buffer(journal, jh, bhs, &batch_count, &drop_count);
-			if (cond_resched_lock(&journal->j_list_lock)) {
+			retry = __process_buffer(journal, jh, bhs,&batch_count);
+			if (!retry && lock_need_resched(&journal->j_list_lock)){
+				spin_unlock(&journal->j_list_lock);
 				retry = 1;
 				break;
 			}
-		} while (jh != last_jh && !retry);
+		}
 
 		if (batch_count) {
+			if (!retry) {
+				spin_unlock(&journal->j_list_lock);
+				retry = 1;
+			}
 			__flush_batch(journal, bhs, &batch_count);
-			retry = 1;
 		}
 
+		if (retry) {
+			spin_lock(&journal->j_list_lock);
+			goto restart;
+		}
 		/*
-		 * If someone cleaned up this transaction while we slept, we're
-		 * done
-		 */
-		if (journal->j_checkpoint_transactions != transaction)
-			break;
-		if (retry)
-			continue;
-		/*
-		 * Maybe it's a new transaction, but it fell at the same
-		 * address
-		 */
-		if (transaction->t_tid != this_tid)
-			continue;
-		/*
-		 * We have walked the whole transaction list without
-		 * finding anything to write to disk.  We had better be
-		 * able to make some progress or we are in trouble.
+		 * Now we have cleaned up the first transaction's checkpoint
+		 * list. Let's clean up the second one
 		 */
-		cleanup_ret = __cleanup_transaction(journal, transaction);
-		J_ASSERT(drop_count != 0 || cleanup_ret != 0);
-		if (journal->j_checkpoint_transactions != transaction)
-			break;
+		__wait_cp_io(journal, transaction);
 	}
+out:
 	spin_unlock(&journal->j_list_lock);
 	result = cleanup_journal_tail(journal);
 	if (result < 0)
 		return result;
-
 	return 0;
 }
 
@@ -455,6 +459,54 @@ int cleanup_journal_tail(journal_t *journal)
 
 /* Checkpoint list management */
 
+/*
+ * journal_clean_one_cp_list
+ *
+ * Find all the written-back checkpoint buffers in the given list and release them.
+ *
+ * Called with the journal locked.
+ * Called with j_list_lock held.
+ * Returns number of bufers reaped (for debug)
+ */
+
+static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
+{
+	struct journal_head *last_jh;
+	struct journal_head *next_jh = jh;
+	int ret, freed = 0;
+
+	*released = 0;
+	if (!jh)
+		return 0;
+
+ 	last_jh = jh->b_cpprev;
+	do {
+		jh = next_jh;
+		next_jh = jh->b_cpnext;
+		/* Use trylock because of the ranking */
+		if (jbd_trylock_bh_state(jh2bh(jh))) {
+			ret = __try_to_free_cp_buf(jh);
+			if (ret) {
+				freed++;
+				if (ret == 2) {
+					*released = 1;
+					return freed;
+				}
+			}
+		}
+		/*
+		 * This function only frees up some memory
+		 * if possible so we dont have an obligation
+		 * to finish processing. Bail out if preemption
+		 * requested:
+		 */
+		if (need_resched())
+			return freed;
+	} while (jh != last_jh);
+
+	return freed;
+}
+
 /*
  * journal_clean_checkpoint_list
  *
@@ -462,46 +514,44 @@ int cleanup_journal_tail(journal_t *journal)
  *
  * Called with the journal locked.
  * Called with j_list_lock held.
- * Returns number of bufers reaped (for debug)
+ * Returns number of buffers reaped (for debug)
  */
 
 int __journal_clean_checkpoint_list(journal_t *journal)
 {
 	transaction_t *transaction, *last_transaction, *next_transaction;
 	int ret = 0;
+	int released;
 
 	transaction = journal->j_checkpoint_transactions;
-	if (transaction == 0)
+	if (!transaction)
 		goto out;
 
 	last_transaction = transaction->t_cpprev;
 	next_transaction = transaction;
 	do {
-		struct journal_head *jh;
-
 		transaction = next_transaction;
 		next_transaction = transaction->t_cpnext;
-		jh = transaction->t_checkpoint_list;
-		if (jh) {
-			struct journal_head *last_jh = jh->b_cpprev;
-			struct journal_head *next_jh = jh;
-
-			do {
-				jh = next_jh;
-				next_jh = jh->b_cpnext;
-				/* Use trylock because of the ranknig */
-				if (jbd_trylock_bh_state(jh2bh(jh)))
-					ret += __try_to_free_cp_buf(jh);
-				/*
-				 * This function only frees up some memory
-				 * if possible so we dont have an obligation
-				 * to finish processing. Bail out if preemption
-				 * requested:
-				 */
-				if (need_resched())
-					goto out;
-			} while (jh != last_jh);
-		}
+		ret += journal_clean_one_cp_list(transaction->
+				t_checkpoint_list, &released);
+		/*
+		 * This function only frees up some memory if possible so we
+		 * dont have an obligation to finish processing. Bail out if
+		 * preemption requested:
+		 */
+		if (need_resched())
+			goto out;
+		if (released)
+			continue;
+		/*
+		 * It is essential that we are as careful as in the case of
+		 * t_checkpoint_list with removing the buffer from the list as
+		 * we can possibly see not yet submitted buffers on io_list
+		 */
+		ret += journal_clean_one_cp_list(transaction->
+				t_checkpoint_io_list, &released);
+		if (need_resched())
+			goto out;
 	} while (transaction != last_transaction);
 out:
 	return ret;
@@ -516,18 +566,22 @@ out:
  * buffer updates committed in that transaction have safely been stored
  * elsewhere on disk.  To achieve this, all of the buffers in a
  * transaction need to be maintained on the transaction's checkpoint
- * list until they have been rewritten, at which point this function is
+ * lists until they have been rewritten, at which point this function is
  * called to remove the buffer from the existing transaction's
- * checkpoint list.
+ * checkpoint lists.
+ *
+ * The function returns 1 if it frees the transaction, 0 otherwise.
  *
  * This function is called with the journal locked.
  * This function is called with j_list_lock held.
+ * This function is called with jbd_lock_bh_state(jh2bh(jh))
  */
 
-void __journal_remove_checkpoint(struct journal_head *jh)
+int __journal_remove_checkpoint(struct journal_head *jh)
 {
 	transaction_t *transaction;
 	journal_t *journal;
+	int ret = 0;
 
 	JBUFFER_TRACE(jh, "entry");
 
@@ -538,8 +592,10 @@ void __journal_remove_checkpoint(struct journal_head *jh)
 	journal = transaction->t_journal;
 
 	__buffer_unlink(jh);
+	jh->b_cp_transaction = NULL;
 
-	if (transaction->t_checkpoint_list != NULL)
+	if (transaction->t_checkpoint_list != NULL ||
+	    transaction->t_checkpoint_io_list != NULL)
 		goto out;
 	JBUFFER_TRACE(jh, "transaction has no more buffers");
 
@@ -565,8 +621,10 @@ void __journal_remove_checkpoint(struct journal_head *jh)
 	/* Just in case anybody was waiting for more transactions to be
            checkpointed... */
 	wake_up(&journal->j_wait_logspace);
+	ret = 1;
 out:
 	JBUFFER_TRACE(jh, "exit");
+	return ret;
 }
 
 /*
@@ -628,6 +686,7 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
 	J_ASSERT(transaction->t_shadow_list == NULL);
 	J_ASSERT(transaction->t_log_list == NULL);
 	J_ASSERT(transaction->t_checkpoint_list == NULL);
+	J_ASSERT(transaction->t_checkpoint_io_list == NULL);
 	J_ASSERT(transaction->t_updates == 0);
 	J_ASSERT(journal->j_committing_transaction != transaction);
 	J_ASSERT(journal->j_running_transaction != transaction);
-- 
cgit v1.2.3


From 98bd34eaf1a7d1f2ed9c4e5d3a9664d3dcdd2159 Mon Sep 17 00:00:00 2001
From: Mike Miller <mike.miller@hp.com>
Date: Fri, 23 Jun 2006 02:06:07 -0700
Subject: [PATCH] make kernel warn about incorrectly sized partitions

Sometimes partitions claim to be larger than the reported capacity of a
disk device.  This patch makes the kernel warn about those partitions.

We still permit these patitions to be used.  Quoting Andries Brouwer
<Andries.Brouwer@cwi.nl>:

 Case 1: The kernel is mistaken about the size of the disk.  (There are
 commands to clip a disk to a certain capacity, there are jumpers to tell a
 disk that it should report a certain capacity etc.  Usually this is because
 of BIOS bugs.  In bad cases the machine will crash in the BIOS and hence fail
 to boot if the disk reports full capacity.) In such cases actually accessing
 the blocks of the partition may work fine, or may work fine after running an
 unclip utility.  I wrote "setmax" some years ago precisely for this reason.

 Case 2: There was a messy partition table (maybe just a rounding error) but
 the actual filesystem on the partition is contained in the physical disk.
 Now using the filesystem goes without problem.

 Case 3: Both partition and filesystem extend beyond the end of the disk.  In
 forensic or debugging situations one often uses a copy of the start of a
 disk.  Now access beyond the end gives an expected I/O error.

Signed-off-by: Mike Miller <mike.miller@hp.com>
Signed-off-by: Stephen Cameron <steve.cameron@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/partitions/check.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index cd885b23cb5..2ef313a96b6 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -484,6 +484,10 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 		sector_t from = state->parts[p].from;
 		if (!size)
 			continue;
+		if (from + size > get_capacity(disk)) {
+			printk(" %s: p%d exceeds device capacity\n",
+				disk->disk_name, p);
+		}
 		add_partition(disk, p, from, size);
 #ifdef CONFIG_BLK_DEV_MD
 		if (state->parts[p].flags)
-- 
cgit v1.2.3


From b31dc66a54ad986b6b73bdc49c8efc17cbad1833 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Tue, 13 Jun 2006 08:26:10 +0200
Subject: [PATCH] Kill PF_SYNCWRITE flag

A process flag to indicate whether we are doing sync io is incredibly
ugly. It also causes performance problems when one does a lot of async
io and then proceeds to sync it. Part of the io will go out as async,
and the other part as sync. This causes a disconnect between the
previously submitted io and the synced io. For io schedulers such as CFQ,
this will cause us lost merges and suboptimal behaviour in scheduling.

Remove PF_SYNCWRITE completely from the fsync/msync paths, and let
the O_DIRECT path just directly indicate that the writes are sync
by using WRITE_SYNC instead.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/buffer.c       |  2 --
 fs/direct-io.c    | 18 ++++++++----------
 fs/fs-writeback.c |  2 --
 3 files changed, 8 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 23f1f3a6807..373bb6292bd 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -331,7 +331,6 @@ long do_fsync(struct file *file, int datasync)
 		goto out;
 	}
 
-	current->flags |= PF_SYNCWRITE;
 	ret = filemap_fdatawrite(mapping);
 
 	/*
@@ -346,7 +345,6 @@ long do_fsync(struct file *file, int datasync)
 	err = filemap_fdatawait(mapping);
 	if (!ret)
 		ret = err;
-	current->flags &= ~PF_SYNCWRITE;
 out:
 	return ret;
 }
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b05d1b21877..538fb0418fb 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -162,7 +162,7 @@ static int dio_refill_pages(struct dio *dio)
 		NULL);				/* vmas */
 	up_read(&current->mm->mmap_sem);
 
-	if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) {
+	if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) {
 		struct page *page = ZERO_PAGE(dio->curr_user_address);
 		/*
 		 * A memory fault, but the filesystem has some outstanding
@@ -535,7 +535,7 @@ static int get_more_blocks(struct dio *dio)
 		map_bh->b_state = 0;
 		map_bh->b_size = fs_count << dio->inode->i_blkbits;
 
-		create = dio->rw == WRITE;
+		create = dio->rw & WRITE;
 		if (dio->lock_type == DIO_LOCKING) {
 			if (dio->block_in_file < (i_size_read(dio->inode) >>
 							dio->blkbits))
@@ -867,7 +867,7 @@ do_holes:
 				loff_t i_size_aligned;
 
 				/* AKPM: eargh, -ENOTBLK is a hack */
-				if (dio->rw == WRITE) {
+				if (dio->rw & WRITE) {
 					page_cache_release(page);
 					return -ENOTBLK;
 				}
@@ -1045,7 +1045,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 		}
 	} /* end iovec loop */
 
-	if (ret == -ENOTBLK && rw == WRITE) {
+	if (ret == -ENOTBLK && (rw & WRITE)) {
 		/*
 		 * The remaining part of the request will be
 		 * be handled by buffered I/O when we return
@@ -1089,7 +1089,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	if (dio->is_async) {
 		int should_wait = 0;
 
-		if (dio->result < dio->size && rw == WRITE) {
+		if (dio->result < dio->size && (rw & WRITE)) {
 			dio->waiter = current;
 			should_wait = 1;
 		}
@@ -1142,7 +1142,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 			ret = transferred;
 
 		/* We could have also come here on an AIO file extend */
-		if (!is_sync_kiocb(iocb) && rw == WRITE &&
+		if (!is_sync_kiocb(iocb) && (rw & WRITE) &&
 		    ret >= 0 && dio->result == dio->size)
 			/*
 			 * For AIO writes where we have completed the
@@ -1194,7 +1194,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	int acquire_i_mutex = 0;
 
 	if (rw & WRITE)
-		current->flags |= PF_SYNCWRITE;
+		rw = WRITE_SYNC;
 
 	if (bdev)
 		bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
@@ -1270,7 +1270,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	 * even for AIO, we need to wait for i/o to complete before
 	 * returning in this case.
 	 */
-	dio->is_async = !is_sync_kiocb(iocb) && !((rw == WRITE) &&
+	dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
 		(end > i_size_read(inode)));
 
 	retval = direct_io_worker(rw, iocb, inode, iov, offset,
@@ -1284,8 +1284,6 @@ out:
 		mutex_unlock(&inode->i_mutex);
 	else if (acquire_i_mutex)
 		mutex_lock(&inode->i_mutex);
-	if (rw & WRITE)
-		current->flags &= ~PF_SYNCWRITE;
 	return retval;
 }
 EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 6db95cf3aaa..031b27a4bc9 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -623,7 +623,6 @@ int generic_osync_inode(struct inode *inode, struct address_space *mapping, int
 	int need_write_inode_now = 0;
 	int err2;
 
-	current->flags |= PF_SYNCWRITE;
 	if (what & OSYNC_DATA)
 		err = filemap_fdatawrite(mapping);
 	if (what & (OSYNC_METADATA|OSYNC_DATA)) {
@@ -636,7 +635,6 @@ int generic_osync_inode(struct inode *inode, struct address_space *mapping, int
 		if (!err)
 			err = err2;
 	}
-	current->flags &= ~PF_SYNCWRITE;
 
 	spin_lock(&inode_lock);
 	if ((inode->i_state & I_DIRTY) &&
-- 
cgit v1.2.3


From 9e94cd4fd1812bab45237f998b3c6fa1b24023fd Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Tue, 20 Jun 2006 15:01:12 +0200
Subject: [PATCH] splice: retrieve mapping after locking the page

Otherwise we could be racing with truncate/mapping removal.

Problem found/fixed by Nick Piggin <npiggin@suse.de>, logic rewritten
by me.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/splice.c | 46 +++++++++++++++++++++++++++++-----------------
 1 file changed, 29 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index a285fd746dc..05fd2787be9 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -55,31 +55,43 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
 				     struct pipe_buffer *buf)
 {
 	struct page *page = buf->page;
-	struct address_space *mapping = page_mapping(page);
+	struct address_space *mapping;
 
 	lock_page(page);
 
-	WARN_ON(!PageUptodate(page));
+	mapping = page_mapping(page);
+	if (mapping) {
+		WARN_ON(!PageUptodate(page));
 
-	/*
-	 * At least for ext2 with nobh option, we need to wait on writeback
-	 * completing on this page, since we'll remove it from the pagecache.
-	 * Otherwise truncate wont wait on the page, allowing the disk
-	 * blocks to be reused by someone else before we actually wrote our
-	 * data to them. fs corruption ensues.
-	 */
-	wait_on_page_writeback(page);
+		/*
+		 * At least for ext2 with nobh option, we need to wait on
+		 * writeback completing on this page, since we'll remove it
+		 * from the pagecache.  Otherwise truncate wont wait on the
+		 * page, allowing the disk blocks to be reused by someone else
+		 * before we actually wrote our data to them. fs corruption
+		 * ensues.
+		 */
+		wait_on_page_writeback(page);
 
-	if (PagePrivate(page))
-		try_to_release_page(page, mapping_gfp_mask(mapping));
+		if (PagePrivate(page))
+			try_to_release_page(page, mapping_gfp_mask(mapping));
 
-	if (!remove_mapping(mapping, page)) {
-		unlock_page(page);
-		return 1;
+		/*
+		 * If we succeeded in removing the mapping, set LRU flag
+		 * and return good.
+		 */
+		if (remove_mapping(mapping, page)) {
+			buf->flags |= PIPE_BUF_FLAG_LRU;
+			return 0;
+		}
 	}
 
-	buf->flags |= PIPE_BUF_FLAG_LRU;
-	return 0;
+	/*
+	 * Raced with truncate or failed to remove page from current
+	 * address space, unlock and return failure.
+	 */
+	unlock_page(page);
+	return 1;
 }
 
 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
-- 
cgit v1.2.3


From b1c5921c5b715c207d7fe77cd7aaafbb322f09f5 Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@netapp.com>
Date: Tue, 20 Jun 2006 12:55:19 -0400
Subject: NFS: Separate functions for counting outstanding NFS direct I/Os

Factor out the logic that increments and decrements the outstanding I/O
count.  This will be a commonly used bit of code in upcoming patches.
Also make this an atomic_t again, since it will be very often manipulated
outside dreq->spin lock.

Signed-off-by: Chuck Lever <cel@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c | 39 ++++++++++++++++++++-------------------
 1 file changed, 20 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 402005c35ab..d78c61a41ec 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -80,8 +80,8 @@ struct nfs_direct_req {
 	unsigned int		npages;		/* count of pages */
 
 	/* completion state */
+	atomic_t		io_count;	/* i/os we're waiting for */
 	spinlock_t		lock;		/* protect completion state */
-	int			outstanding;	/* i/os we're waiting for */
 	ssize_t			count,		/* bytes actually processed */
 				error;		/* any reported error */
 	struct completion	completion;	/* wait for i/o completion */
@@ -97,6 +97,16 @@ struct nfs_direct_req {
 static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync);
 static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode);
 
+static inline void get_dreq(struct nfs_direct_req *dreq)
+{
+	atomic_inc(&dreq->io_count);
+}
+
+static inline int put_dreq(struct nfs_direct_req *dreq)
+{
+	return atomic_dec_and_test(&dreq->io_count);
+}
+
 /**
  * nfs_direct_IO - NFS address space operation for direct I/O
  * @rw: direction (read or write)
@@ -180,7 +190,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
 	dreq->iocb = NULL;
 	dreq->ctx = NULL;
 	spin_lock_init(&dreq->lock);
-	dreq->outstanding = 0;
+	atomic_set(&dreq->io_count, 0);
 	dreq->count = 0;
 	dreq->error = 0;
 	dreq->flags = 0;
@@ -278,7 +288,7 @@ static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, size_t rsize)
 		list_add(&data->pages, list);
 
 		data->req = (struct nfs_page *) dreq;
-		dreq->outstanding++;
+		get_dreq(dreq);
 		if (nbytes <= rsize)
 			break;
 		nbytes -= rsize;
@@ -302,13 +312,10 @@ static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
 	else
 		dreq->error = task->tk_status;
 
-	if (--dreq->outstanding) {
-		spin_unlock(&dreq->lock);
-		return;
-	}
-
 	spin_unlock(&dreq->lock);
-	nfs_direct_complete(dreq);
+
+	if (put_dreq(dreq))
+		nfs_direct_complete(dreq);
 }
 
 static const struct rpc_call_ops nfs_read_direct_ops = {
@@ -432,7 +439,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 
 	list_splice_init(&dreq->rewrite_list, &dreq->list);
 	list_for_each(pos, &dreq->list)
-		dreq->outstanding++;
+		get_dreq(dreq);
 	dreq->count = 0;
 
 	nfs_direct_write_schedule(dreq, FLUSH_STABLE);
@@ -564,7 +571,7 @@ static struct nfs_direct_req *nfs_direct_write_alloc(size_t nbytes, size_t wsize
 		list_add(&data->pages, list);
 
 		data->req = (struct nfs_page *) dreq;
-		dreq->outstanding++;
+		get_dreq(dreq);
 		if (nbytes <= wsize)
 			break;
 		nbytes -= wsize;
@@ -620,14 +627,8 @@ static void nfs_direct_write_release(void *calldata)
 	struct nfs_write_data *data = calldata;
 	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
 
-	spin_lock(&dreq->lock);
-	if (--dreq->outstanding) {
-		spin_unlock(&dreq->lock);
-		return;
-	}
-	spin_unlock(&dreq->lock);
-
-	nfs_direct_write_complete(dreq, data->inode);
+	if (put_dreq(dreq))
+		nfs_direct_write_complete(dreq, data->inode);
 }
 
 static const struct rpc_call_ops nfs_write_direct_ops = {
-- 
cgit v1.2.3


From fedb595c66e1fbd5acafe0d43b7e95c13c936d61 Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@netapp.com>
Date: Tue, 20 Jun 2006 12:55:45 -0400
Subject: NFS: "open code" the NFS direct write rescheduler

An NFSv3/v4 client must reschedule on-the-wire writes if the writes are
UNSTABLE, and the server reboots before the client can complete a
subsequent COMMIT request.

To support direct asynchronous scatter-gather writes, the write
rescheduler in fs/nfs/direct.c must not depend on the I/O parameters
in the controlling nfs_direct_req structure.  iovecs can be somewhat
arbitrarily complex, so there could be an unbounded amount of information
to save for a rarely encountered requirement.

Refactor the direct write rescheduler so it uses information from each
nfs_write_data structure to reschedule writes, instead of caching that
information in the controlling nfs_direct_req structure.

Signed-off-by: Chuck Lever <cel@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c | 51 +++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 43 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index d78c61a41ec..7101405713e 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -94,8 +94,8 @@ struct nfs_direct_req {
 	struct nfs_writeverf	verf;		/* unstable write verifier */
 };
 
-static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync);
 static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode);
+static const struct rpc_call_ops nfs_write_direct_ops;
 
 static inline void get_dreq(struct nfs_direct_req *dreq)
 {
@@ -435,14 +435,51 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 {
-	struct list_head *pos;
+	struct inode *inode = dreq->inode;
+	struct list_head *p;
+	struct nfs_write_data *data;
 
-	list_splice_init(&dreq->rewrite_list, &dreq->list);
-	list_for_each(pos, &dreq->list)
-		get_dreq(dreq);
 	dreq->count = 0;
+	get_dreq(dreq);
+
+	list_for_each(p, &dreq->rewrite_list) {
+		data = list_entry(p, struct nfs_write_data, pages);
+
+		get_dreq(dreq);
+
+		/*
+		 * Reset data->res.
+		 */
+		nfs_fattr_init(&data->fattr);
+		data->res.count = data->args.count;
+		memset(&data->verf, 0, sizeof(data->verf));
+
+		/*
+		 * Reuse data->task; data->args should not have changed
+		 * since the original request was sent.
+		 */
+		rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
+				&nfs_write_direct_ops, data);
+		NFS_PROTO(inode)->write_setup(data, FLUSH_STABLE);
+
+		data->task.tk_priority = RPC_PRIORITY_NORMAL;
+		data->task.tk_cookie = (unsigned long) inode;
+
+		/*
+		 * We're called via an RPC callback, so BKL is already held.
+		 */
+		rpc_execute(&data->task);
+
+		dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n",
+				data->task.tk_pid,
+				inode->i_sb->s_id,
+				(long long)NFS_FILEID(inode),
+				data->args.count,
+				(unsigned long long)data->args.offset);
+	}
 
-	nfs_direct_write_schedule(dreq, FLUSH_STABLE);
+	if (put_dreq(dreq))
+		nfs_direct_write_complete(dreq, inode);
 }
 
 static void nfs_direct_commit_result(struct rpc_task *task, void *calldata)
@@ -612,8 +649,6 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
 				}
 		}
 	}
-	/* In case we have to resend */
-	data->args.stable = NFS_FILE_SYNC;
 
 	spin_unlock(&dreq->lock);
 }
-- 
cgit v1.2.3


From 51a7bc6caec94bab256b272bffd24d00ea81c698 Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@netapp.com>
Date: Tue, 20 Jun 2006 12:56:16 -0400
Subject: NFS: remove user_addr, user_count, and pos from nfs_direct_req

Make the user_addr, user_count, and pos parameters explicit to the
scheduler routines, and remove the fields from nfs_direct_req.  The
iovec API will be passing in a series of these, not just one set.

Signed-off-by: Chuck Lever <cel@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c | 29 ++++++++---------------------
 1 file changed, 8 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 7101405713e..e4c9e03aff1 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -73,9 +73,6 @@ struct nfs_direct_req {
 	struct nfs_open_context	*ctx;		/* file open context info */
 	struct kiocb *		iocb;		/* controlling i/o request */
 	struct inode *		inode;		/* target file of i/o */
-	unsigned long		user_addr;	/* location of user's buffer */
-	size_t			user_count;	/* total bytes to move */
-	loff_t			pos;		/* starting offset in file */
 	struct page **		pages;		/* pages in our buffer */
 	unsigned int		npages;		/* count of pages */
 
@@ -327,19 +324,17 @@ static const struct rpc_call_ops nfs_read_direct_ops = {
  * For each nfs_read_data struct that was allocated on the list, dispatch
  * an NFS READ operation
  */
-static void nfs_direct_read_schedule(struct nfs_direct_req *dreq)
+static void nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos)
 {
 	struct nfs_open_context *ctx = dreq->ctx;
 	struct inode *inode = ctx->dentry->d_inode;
 	struct list_head *list = &dreq->list;
 	struct page **pages = dreq->pages;
-	size_t count = dreq->user_count;
-	loff_t pos = dreq->pos;
 	size_t rsize = NFS_SERVER(inode)->rsize;
 	unsigned int curpage, pgbase;
 
 	curpage = 0;
-	pgbase = dreq->user_addr & ~PAGE_MASK;
+	pgbase = user_addr & ~PAGE_MASK;
 	do {
 		struct nfs_read_data *data;
 		size_t bytes;
@@ -403,9 +398,6 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
 	if (!dreq)
 		return -ENOMEM;
 
-	dreq->user_addr = user_addr;
-	dreq->user_count = count;
-	dreq->pos = pos;
 	dreq->pages = pages;
 	dreq->npages = nr_pages;
 	dreq->inode = inode;
@@ -415,7 +407,7 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
 
 	nfs_add_stats(inode, NFSIOS_DIRECTREADBYTES, count);
 	rpc_clnt_sigmask(clnt, &oldset);
-	nfs_direct_read_schedule(dreq);
+	nfs_direct_read_schedule(dreq, user_addr, count, pos);
 	result = nfs_direct_wait(dreq);
 	rpc_clnt_sigunmask(clnt, &oldset);
 
@@ -516,8 +508,8 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 	data->cred = dreq->ctx->cred;
 
 	data->args.fh = NFS_FH(data->inode);
-	data->args.offset = dreq->pos;
-	data->args.count = dreq->user_count;
+	data->args.offset = 0;
+	data->args.count = 0;
 	data->res.count = 0;
 	data->res.fattr = &data->fattr;
 	data->res.verf = &data->verf;
@@ -675,19 +667,17 @@ static const struct rpc_call_ops nfs_write_direct_ops = {
  * For each nfs_write_data struct that was allocated on the list, dispatch
  * an NFS WRITE operation
  */
-static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync)
+static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos, int sync)
 {
 	struct nfs_open_context *ctx = dreq->ctx;
 	struct inode *inode = ctx->dentry->d_inode;
 	struct list_head *list = &dreq->list;
 	struct page **pages = dreq->pages;
-	size_t count = dreq->user_count;
-	loff_t pos = dreq->pos;
 	size_t wsize = NFS_SERVER(inode)->wsize;
 	unsigned int curpage, pgbase;
 
 	curpage = 0;
-	pgbase = dreq->user_addr & ~PAGE_MASK;
+	pgbase = user_addr & ~PAGE_MASK;
 	do {
 		struct nfs_write_data *data;
 		size_t bytes;
@@ -756,9 +746,6 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 	if (dreq->commit_data == NULL || count < wsize)
 		sync = FLUSH_STABLE;
 
-	dreq->user_addr = user_addr;
-	dreq->user_count = count;
-	dreq->pos = pos;
 	dreq->pages = pages;
 	dreq->npages = nr_pages;
 	dreq->inode = inode;
@@ -771,7 +758,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 	nfs_begin_data_update(inode);
 
 	rpc_clnt_sigmask(clnt, &oldset);
-	nfs_direct_write_schedule(dreq, sync);
+	nfs_direct_write_schedule(dreq, user_addr, count, pos, sync);
 	result = nfs_direct_wait(dreq);
 	rpc_clnt_sigunmask(clnt, &oldset);
 
-- 
cgit v1.2.3


From 9c93ab7dff5eb22027ab15010557bb73f9b44c99 Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@netapp.com>
Date: Tue, 20 Jun 2006 12:56:31 -0400
Subject: NFS: refactor nfs_direct_free_user_pages

Clean-up and fix a minor bug: the logic was dirtying page cache pages on
both read and write operations.

Signed-off-by: Chuck Lever <cel@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e4c9e03aff1..4cb3446220b 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -126,16 +126,21 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_
 	return -EINVAL;
 }
 
-static void nfs_free_user_pages(struct page **pages, int npages, int do_dirty)
+static void nfs_direct_dirty_pages(struct page **pages, int npages)
 {
 	int i;
 	for (i = 0; i < npages; i++) {
 		struct page *page = pages[i];
-		if (do_dirty && !PageCompound(page))
+		if (!PageCompound(page))
 			set_page_dirty_lock(page);
-		page_cache_release(page);
 	}
-	kfree(pages);
+}
+
+static void nfs_direct_release_pages(struct page **pages, int npages)
+{
+	int i;
+	for (i = 0; i < npages; i++)
+		page_cache_release(pages[i]);
 }
 
 static inline int nfs_get_user_pages(int rw, unsigned long user_addr, size_t size, struct page ***pages)
@@ -162,7 +167,7 @@ static inline int nfs_get_user_pages(int rw, unsigned long user_addr, size_t siz
 			 * end of a mapping; return EFAULT.
 			 */
 			if (result >= 0) {
-				nfs_free_user_pages(*pages, result, 0);
+				nfs_direct_release_pages(*pages, result);
 				result = -EFAULT;
 			} else
 				kfree(*pages);
@@ -238,8 +243,6 @@ out:
  */
 static void nfs_direct_complete(struct nfs_direct_req *dreq)
 {
-	nfs_free_user_pages(dreq->pages, dreq->npages, 1);
-
 	if (dreq->iocb) {
 		long res = (long) dreq->error;
 		if (!res)
@@ -311,8 +314,11 @@ static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
 
 	spin_unlock(&dreq->lock);
 
-	if (put_dreq(dreq))
+	if (put_dreq(dreq)) {
+		nfs_direct_dirty_pages(dreq->pages, dreq->npages);
+		nfs_direct_release_pages(dreq->pages, dreq->npages);
 		nfs_direct_complete(dreq);
+	}
 }
 
 static const struct rpc_call_ops nfs_read_direct_ops = {
@@ -422,6 +428,7 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
 		list_del(&data->pages);
 		nfs_writedata_release(data);
 	}
+	nfs_direct_release_pages(dreq->pages, dreq->npages);
 }
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
-- 
cgit v1.2.3


From 06cf6f2ed0b19629700794727d86ed57b9c0583e Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@netapp.com>
Date: Tue, 20 Jun 2006 12:56:49 -0400
Subject: NFS: Eliminate nfs_get_user_pages()

Neil Brown observed that the kmalloc() in nfs_get_user_pages() is more
likely to fail if the I/O is large enough to require the allocation of more
than a single page to keep track of all the pinned pages in the user's
buffer.

Instead of tracking one large page array per dreq/iocb, track pages per
nfs_read/write_data, just like the cached I/O path does.  An array for
pages is already allocated for us by nfs_readdata_alloc() (and the write
and commit equivalents).

This is also required for adding support for vectored I/O to the NFS direct
I/O path.

The original reason to pin the user buffer and allocate all the NFS data
structures before trying to schedule I/O was to ensure all needed resources
are allocated on the client before starting to send requests.  This reduces
the chance that resource exhaustion on the client will cause a short read
or write.

On the other hand, for an application making very large application I/O
requests, this means that it will be nearly impossible for the application
to make forward progress on a resource-limited client.

Thus, moving the buffer pinning functionality into the I/O scheduling
loops should be good for scalability.  The next patch will do the same for
NFS data structure allocation.

Signed-off-by: Chuck Lever <cel@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c | 205 ++++++++++++++++++++++++++++++--------------------------
 1 file changed, 111 insertions(+), 94 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 4cb3446220b..b1630d53fbb 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -73,8 +73,6 @@ struct nfs_direct_req {
 	struct nfs_open_context	*ctx;		/* file open context info */
 	struct kiocb *		iocb;		/* controlling i/o request */
 	struct inode *		inode;		/* target file of i/o */
-	struct page **		pages;		/* pages in our buffer */
-	unsigned int		npages;		/* count of pages */
 
 	/* completion state */
 	atomic_t		io_count;	/* i/os we're waiting for */
@@ -104,6 +102,20 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
 	return atomic_dec_and_test(&dreq->io_count);
 }
 
+/*
+ * "size" is never larger than rsize or wsize.
+ */
+static inline int nfs_direct_count_pages(unsigned long user_addr, size_t size)
+{
+	int page_count;
+
+	page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	page_count -= user_addr >> PAGE_SHIFT;
+	BUG_ON(page_count < 0);
+
+	return page_count;
+}
+
 /**
  * nfs_direct_IO - NFS address space operation for direct I/O
  * @rw: direction (read or write)
@@ -143,40 +155,6 @@ static void nfs_direct_release_pages(struct page **pages, int npages)
 		page_cache_release(pages[i]);
 }
 
-static inline int nfs_get_user_pages(int rw, unsigned long user_addr, size_t size, struct page ***pages)
-{
-	int result = -ENOMEM;
-	unsigned long page_count;
-	size_t array_size;
-
-	page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	page_count -= user_addr >> PAGE_SHIFT;
-
-	array_size = (page_count * sizeof(struct page *));
-	*pages = kmalloc(array_size, GFP_KERNEL);
-	if (*pages) {
-		down_read(&current->mm->mmap_sem);
-		result = get_user_pages(current, current->mm, user_addr,
-					page_count, (rw == READ), 0,
-					*pages, NULL);
-		up_read(&current->mm->mmap_sem);
-		if (result != page_count) {
-			/*
-			 * If we got fewer pages than expected from
-			 * get_user_pages(), the user buffer runs off the
-			 * end of a mapping; return EFAULT.
-			 */
-			if (result >= 0) {
-				nfs_direct_release_pages(*pages, result);
-				result = -EFAULT;
-			} else
-				kfree(*pages);
-			*pages = NULL;
-		}
-	}
-	return result;
-}
-
 static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
 {
 	struct nfs_direct_req *dreq;
@@ -233,13 +211,8 @@ out:
 }
 
 /*
- * We must hold a reference to all the pages in this direct read request
- * until the RPCs complete.  This could be long *after* we are woken up in
- * nfs_direct_wait (for instance, if someone hits ^C on a slow server).
- *
- * In addition, synchronous I/O uses a stack-allocated iocb.  Thus we
- * can't trust the iocb is still valid here if this is a synchronous
- * request.  If the waiter is woken prematurely, the iocb is long gone.
+ * Synchronous I/O uses a stack-allocated iocb.  Thus we can't trust
+ * the iocb is still valid here if this is a synchronous request.
  */
 static void nfs_direct_complete(struct nfs_direct_req *dreq)
 {
@@ -297,6 +270,11 @@ static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, size_t rsize)
 	return dreq;
 }
 
+/*
+ * We must hold a reference to all the pages in this direct read request
+ * until the RPCs complete.  This could be long *after* we are woken up in
+ * nfs_direct_wait (for instance, if someone hits ^C on a slow server).
+ */
 static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
 {
 	struct nfs_read_data *data = calldata;
@@ -305,6 +283,9 @@ static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
 	if (nfs_readpage_result(task, data) != 0)
 		return;
 
+	nfs_direct_dirty_pages(data->pagevec, data->npages);
+	nfs_direct_release_pages(data->pagevec, data->npages);
+
 	spin_lock(&dreq->lock);
 
 	if (likely(task->tk_status >= 0))
@@ -314,11 +295,8 @@ static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
 
 	spin_unlock(&dreq->lock);
 
-	if (put_dreq(dreq)) {
-		nfs_direct_dirty_pages(dreq->pages, dreq->npages);
-		nfs_direct_release_pages(dreq->pages, dreq->npages);
+	if (put_dreq(dreq))
 		nfs_direct_complete(dreq);
-	}
 }
 
 static const struct rpc_call_ops nfs_read_direct_ops = {
@@ -328,21 +306,23 @@ static const struct rpc_call_ops nfs_read_direct_ops = {
 
 /*
  * For each nfs_read_data struct that was allocated on the list, dispatch
- * an NFS READ operation
+ * an NFS READ operation.  If get_user_pages() fails, we stop sending reads.
+ * Read length accounting is handled by nfs_direct_read_result().
+ * Otherwise, if no requests have been sent, just return an error.
  */
-static void nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos)
+static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos)
 {
 	struct nfs_open_context *ctx = dreq->ctx;
 	struct inode *inode = ctx->dentry->d_inode;
 	struct list_head *list = &dreq->list;
-	struct page **pages = dreq->pages;
 	size_t rsize = NFS_SERVER(inode)->rsize;
-	unsigned int curpage, pgbase;
+	unsigned int pgbase;
+	int result;
+	ssize_t started = 0;
+	struct nfs_read_data *data;
 
-	curpage = 0;
 	pgbase = user_addr & ~PAGE_MASK;
 	do {
-		struct nfs_read_data *data;
 		size_t bytes;
 
 		bytes = rsize;
@@ -353,13 +333,21 @@ static void nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long
 		data = list_entry(list->next, struct nfs_read_data, pages);
 		list_del_init(&data->pages);
 
+		data->npages = nfs_direct_count_pages(user_addr, bytes);
+		down_read(&current->mm->mmap_sem);
+		result = get_user_pages(current, current->mm, user_addr,
+					data->npages, 1, 0, data->pagevec, NULL);
+		up_read(&current->mm->mmap_sem);
+		if (unlikely(result < data->npages))
+			goto out_err;
+
 		data->inode = inode;
 		data->cred = ctx->cred;
 		data->args.fh = NFS_FH(inode);
 		data->args.context = ctx;
 		data->args.offset = pos;
 		data->args.pgbase = pgbase;
-		data->args.pages = &pages[curpage];
+		data->args.pages = data->pagevec;
 		data->args.count = bytes;
 		data->res.fattr = &data->fattr;
 		data->res.eof = 0;
@@ -382,17 +370,36 @@ static void nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long
 				bytes,
 				(unsigned long long)data->args.offset);
 
+		started += bytes;
+		user_addr += bytes;
 		pos += bytes;
 		pgbase += bytes;
-		curpage += pgbase >> PAGE_SHIFT;
 		pgbase &= ~PAGE_MASK;
 
 		count -= bytes;
 	} while (count != 0);
 	BUG_ON(!list_empty(list));
+	return 0;
+
+out_err:
+	if (result > 0)
+		nfs_direct_release_pages(data->pagevec, result);
+
+	list_add(&data->pages, list);
+	while (!list_empty(list)) {
+		data = list_entry(list->next, struct nfs_read_data, pages);
+		list_del(&data->pages);
+		nfs_readdata_free(data);
+		if (put_dreq(dreq))
+			nfs_direct_complete(dreq);
+	}
+
+	if (started)
+		return 0;
+	return result < 0 ? (ssize_t) result : -EFAULT;
 }
 
-static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, unsigned int nr_pages)
+static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos)
 {
 	ssize_t result;
 	sigset_t oldset;
@@ -404,8 +411,6 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
 	if (!dreq)
 		return -ENOMEM;
 
-	dreq->pages = pages;
-	dreq->npages = nr_pages;
 	dreq->inode = inode;
 	dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
 	if (!is_sync_kiocb(iocb))
@@ -413,8 +418,9 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
 
 	nfs_add_stats(inode, NFSIOS_DIRECTREADBYTES, count);
 	rpc_clnt_sigmask(clnt, &oldset);
-	nfs_direct_read_schedule(dreq, user_addr, count, pos);
-	result = nfs_direct_wait(dreq);
+	result = nfs_direct_read_schedule(dreq, user_addr, count, pos);
+	if (!result)
+		result = nfs_direct_wait(dreq);
 	rpc_clnt_sigunmask(clnt, &oldset);
 
 	return result;
@@ -426,9 +432,9 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
 	while (!list_empty(&dreq->list)) {
 		struct nfs_write_data *data = list_entry(dreq->list.next, struct nfs_write_data, pages);
 		list_del(&data->pages);
+		nfs_direct_release_pages(data->pagevec, data->npages);
 		nfs_writedata_release(data);
 	}
-	nfs_direct_release_pages(dreq->pages, dreq->npages);
 }
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -672,21 +678,23 @@ static const struct rpc_call_ops nfs_write_direct_ops = {
 
 /*
  * For each nfs_write_data struct that was allocated on the list, dispatch
- * an NFS WRITE operation
+ * an NFS WRITE operation.  If get_user_pages() fails, we stop sending writes.
+ * Write length accounting is handled by nfs_direct_write_result().
+ * Otherwise, if no requests have been sent, just return an error.
  */
-static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos, int sync)
+static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos, int sync)
 {
 	struct nfs_open_context *ctx = dreq->ctx;
 	struct inode *inode = ctx->dentry->d_inode;
 	struct list_head *list = &dreq->list;
-	struct page **pages = dreq->pages;
 	size_t wsize = NFS_SERVER(inode)->wsize;
-	unsigned int curpage, pgbase;
+	unsigned int pgbase;
+	int result;
+	ssize_t started = 0;
+	struct nfs_write_data *data;
 
-	curpage = 0;
 	pgbase = user_addr & ~PAGE_MASK;
 	do {
-		struct nfs_write_data *data;
 		size_t bytes;
 
 		bytes = wsize;
@@ -695,6 +703,15 @@ static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long
 
 		BUG_ON(list_empty(list));
 		data = list_entry(list->next, struct nfs_write_data, pages);
+
+		data->npages = nfs_direct_count_pages(user_addr, bytes);
+		down_read(&current->mm->mmap_sem);
+		result = get_user_pages(current, current->mm, user_addr,
+					data->npages, 0, 0, data->pagevec, NULL);
+		up_read(&current->mm->mmap_sem);
+		if (unlikely(result < data->npages))
+			goto out_err;
+
 		list_move_tail(&data->pages, &dreq->rewrite_list);
 
 		data->inode = inode;
@@ -703,7 +720,7 @@ static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long
 		data->args.context = ctx;
 		data->args.offset = pos;
 		data->args.pgbase = pgbase;
-		data->args.pages = &pages[curpage];
+		data->args.pages = data->pagevec;
 		data->args.count = bytes;
 		data->res.fattr = &data->fattr;
 		data->res.count = bytes;
@@ -727,17 +744,36 @@ static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long
 				bytes,
 				(unsigned long long)data->args.offset);
 
+		started += bytes;
+		user_addr += bytes;
 		pos += bytes;
 		pgbase += bytes;
-		curpage += pgbase >> PAGE_SHIFT;
 		pgbase &= ~PAGE_MASK;
 
 		count -= bytes;
 	} while (count != 0);
 	BUG_ON(!list_empty(list));
+	return 0;
+
+out_err:
+	if (result > 0)
+		nfs_direct_release_pages(data->pagevec, result);
+
+	list_add(&data->pages, list);
+	while (!list_empty(list)) {
+		data = list_entry(list->next, struct nfs_write_data, pages);
+		list_del(&data->pages);
+		nfs_writedata_free(data);
+		if (put_dreq(dreq))
+			nfs_direct_write_complete(dreq, inode);
+	}
+
+	if (started)
+		return 0;
+	return result < 0 ? (ssize_t) result : -EFAULT;
 }
 
-static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, int nr_pages)
+static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos)
 {
 	ssize_t result;
 	sigset_t oldset;
@@ -753,8 +789,6 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 	if (dreq->commit_data == NULL || count < wsize)
 		sync = FLUSH_STABLE;
 
-	dreq->pages = pages;
-	dreq->npages = nr_pages;
 	dreq->inode = inode;
 	dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
 	if (!is_sync_kiocb(iocb))
@@ -765,8 +799,9 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 	nfs_begin_data_update(inode);
 
 	rpc_clnt_sigmask(clnt, &oldset);
-	nfs_direct_write_schedule(dreq, user_addr, count, pos, sync);
-	result = nfs_direct_wait(dreq);
+	result = nfs_direct_write_schedule(dreq, user_addr, count, pos, sync);
+	if (!result)
+		result = nfs_direct_wait(dreq);
 	rpc_clnt_sigunmask(clnt, &oldset);
 
 	return result;
@@ -796,8 +831,6 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 ssize_t nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
 {
 	ssize_t retval = -EINVAL;
-	int page_count;
-	struct page **pages;
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 
@@ -819,14 +852,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count,
 	if (retval)
 		goto out;
 
-	retval = nfs_get_user_pages(READ, (unsigned long) buf,
-						count, &pages);
-	if (retval < 0)
-		goto out;
-	page_count = retval;
-
-	retval = nfs_direct_read(iocb, (unsigned long) buf, count, pos,
-						pages, page_count);
+	retval = nfs_direct_read(iocb, (unsigned long) buf, count, pos);
 	if (retval > 0)
 		iocb->ki_pos = pos + retval;
 
@@ -862,8 +888,6 @@ out:
 ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos)
 {
 	ssize_t retval;
-	int page_count;
-	struct page **pages;
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 
@@ -891,14 +915,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t
 	if (retval)
 		goto out;
 
-	retval = nfs_get_user_pages(WRITE, (unsigned long) buf,
-						count, &pages);
-	if (retval < 0)
-		goto out;
-	page_count = retval;
-
-	retval = nfs_direct_write(iocb, (unsigned long) buf, count,
-					pos, pages, page_count);
+	retval = nfs_direct_write(iocb, (unsigned long) buf, count, pos);
 
 	/*
 	 * XXX: nfs_end_data_update() already ensures this file's
-- 
cgit v1.2.3


From 82b145c5a572f7fa7211dffe2097234dc91bcecc Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@netapp.com>
Date: Tue, 20 Jun 2006 12:57:03 -0400
Subject: NFS: alloc nfs_read/write_data as direct I/O is scheduled

Re-arrange the logic in the NFS direct I/O path so that nfs_read/write_data
structs are allocated just before they are scheduled, rather than
allocating them all at once before we start scheduling requests.

Signed-off-by: Chuck Lever <cel@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c | 210 ++++++++++++++++++--------------------------------------
 1 file changed, 65 insertions(+), 145 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index b1630d53fbb..e25b7595b7a 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -68,8 +68,6 @@ struct nfs_direct_req {
 	struct kref		kref;		/* release manager */
 
 	/* I/O parameters */
-	struct list_head	list,		/* nfs_read/write_data structs */
-				rewrite_list;	/* saved nfs_write_data structs */
 	struct nfs_open_context	*ctx;		/* file open context info */
 	struct kiocb *		iocb;		/* controlling i/o request */
 	struct inode *		inode;		/* target file of i/o */
@@ -82,6 +80,7 @@ struct nfs_direct_req {
 	struct completion	completion;	/* wait for i/o completion */
 
 	/* commit state */
+	struct list_head	rewrite_list;	/* saved nfs_write_data structs */
 	struct nfs_write_data *	commit_data;	/* special write_data for commits */
 	int			flags;
 #define NFS_ODIRECT_DO_COMMIT		(1)	/* an unstable reply was received */
@@ -116,6 +115,11 @@ static inline int nfs_direct_count_pages(unsigned long user_addr, size_t size)
 	return page_count;
 }
 
+static inline unsigned int nfs_max_pages(unsigned int size)
+{
+	return (size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+}
+
 /**
  * nfs_direct_IO - NFS address space operation for direct I/O
  * @rw: direction (read or write)
@@ -164,8 +168,8 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
 		return NULL;
 
 	kref_init(&dreq->kref);
+	kref_get(&dreq->kref);
 	init_completion(&dreq->completion);
-	INIT_LIST_HEAD(&dreq->list);
 	INIT_LIST_HEAD(&dreq->rewrite_list);
 	dreq->iocb = NULL;
 	dreq->ctx = NULL;
@@ -227,49 +231,6 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq)
 	kref_put(&dreq->kref, nfs_direct_req_release);
 }
 
-/*
- * Note we also set the number of requests we have in the dreq when we are
- * done.  This prevents races with I/O completion so we will always wait
- * until all requests have been dispatched and completed.
- */
-static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, size_t rsize)
-{
-	struct list_head *list;
-	struct nfs_direct_req *dreq;
-	unsigned int rpages = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-
-	dreq = nfs_direct_req_alloc();
-	if (!dreq)
-		return NULL;
-
-	list = &dreq->list;
-	for(;;) {
-		struct nfs_read_data *data = nfs_readdata_alloc(rpages);
-
-		if (unlikely(!data)) {
-			while (!list_empty(list)) {
-				data = list_entry(list->next,
-						  struct nfs_read_data, pages);
-				list_del(&data->pages);
-				nfs_readdata_free(data);
-			}
-			kref_put(&dreq->kref, nfs_direct_req_release);
-			return NULL;
-		}
-
-		INIT_LIST_HEAD(&data->pages);
-		list_add(&data->pages, list);
-
-		data->req = (struct nfs_page *) dreq;
-		get_dreq(dreq);
-		if (nbytes <= rsize)
-			break;
-		nbytes -= rsize;
-	}
-	kref_get(&dreq->kref);
-	return dreq;
-}
-
 /*
  * We must hold a reference to all the pages in this direct read request
  * until the RPCs complete.  This could be long *after* we are woken up in
@@ -305,42 +266,53 @@ static const struct rpc_call_ops nfs_read_direct_ops = {
 };
 
 /*
- * For each nfs_read_data struct that was allocated on the list, dispatch
- * an NFS READ operation.  If get_user_pages() fails, we stop sending reads.
- * Read length accounting is handled by nfs_direct_read_result().
- * Otherwise, if no requests have been sent, just return an error.
+ * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
+ * operation.  If nfs_readdata_alloc() or get_user_pages() fails,
+ * bail and stop sending more reads.  Read length accounting is
+ * handled automatically by nfs_direct_read_result().  Otherwise, if
+ * no requests have been sent, just return an error.
  */
 static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos)
 {
 	struct nfs_open_context *ctx = dreq->ctx;
 	struct inode *inode = ctx->dentry->d_inode;
-	struct list_head *list = &dreq->list;
 	size_t rsize = NFS_SERVER(inode)->rsize;
+	unsigned int rpages = nfs_max_pages(rsize);
 	unsigned int pgbase;
 	int result;
 	ssize_t started = 0;
-	struct nfs_read_data *data;
+
+	get_dreq(dreq);
 
 	pgbase = user_addr & ~PAGE_MASK;
 	do {
+		struct nfs_read_data *data;
 		size_t bytes;
 
+		result = -ENOMEM;
+		data = nfs_readdata_alloc(rpages);
+		if (unlikely(!data))
+			break;
+
 		bytes = rsize;
 		if (count < rsize)
 			bytes = count;
 
-		BUG_ON(list_empty(list));
-		data = list_entry(list->next, struct nfs_read_data, pages);
-		list_del_init(&data->pages);
-
 		data->npages = nfs_direct_count_pages(user_addr, bytes);
 		down_read(&current->mm->mmap_sem);
 		result = get_user_pages(current, current->mm, user_addr,
 					data->npages, 1, 0, data->pagevec, NULL);
 		up_read(&current->mm->mmap_sem);
-		if (unlikely(result < data->npages))
-			goto out_err;
+		if (unlikely(result < data->npages)) {
+			if (result > 0)
+				nfs_direct_release_pages(data->pagevec, result);
+			nfs_readdata_release(data);
+			break;
+		}
 
+		get_dreq(dreq);
+
+		data->req = (struct nfs_page *) dreq;
 		data->inode = inode;
 		data->cred = ctx->cred;
 		data->args.fh = NFS_FH(inode);
@@ -378,21 +350,9 @@ static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned lo
 
 		count -= bytes;
 	} while (count != 0);
-	BUG_ON(!list_empty(list));
-	return 0;
 
-out_err:
-	if (result > 0)
-		nfs_direct_release_pages(data->pagevec, result);
-
-	list_add(&data->pages, list);
-	while (!list_empty(list)) {
-		data = list_entry(list->next, struct nfs_read_data, pages);
-		list_del(&data->pages);
-		nfs_readdata_free(data);
-		if (put_dreq(dreq))
-			nfs_direct_complete(dreq);
-	}
+	if (put_dreq(dreq))
+		nfs_direct_complete(dreq);
 
 	if (started)
 		return 0;
@@ -401,13 +361,13 @@ out_err:
 
 static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos)
 {
-	ssize_t result;
+	ssize_t result = 0;
 	sigset_t oldset;
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
 	struct rpc_clnt *clnt = NFS_CLIENT(inode);
 	struct nfs_direct_req *dreq;
 
-	dreq = nfs_direct_read_alloc(count, NFS_SERVER(inode)->rsize);
+	dreq = nfs_direct_req_alloc();
 	if (!dreq)
 		return -ENOMEM;
 
@@ -428,9 +388,8 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
 
 static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
 {
-	list_splice_init(&dreq->rewrite_list, &dreq->list);
-	while (!list_empty(&dreq->list)) {
-		struct nfs_write_data *data = list_entry(dreq->list.next, struct nfs_write_data, pages);
+	while (!list_empty(&dreq->rewrite_list)) {
+		struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages);
 		list_del(&data->pages);
 		nfs_direct_release_pages(data->pagevec, data->npages);
 		nfs_writedata_release(data);
@@ -584,47 +543,6 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
 }
 #endif
 
-static struct nfs_direct_req *nfs_direct_write_alloc(size_t nbytes, size_t wsize)
-{
-	struct list_head *list;
-	struct nfs_direct_req *dreq;
-	unsigned int wpages = (wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-
-	dreq = nfs_direct_req_alloc();
-	if (!dreq)
-		return NULL;
-
-	list = &dreq->list;
-	for(;;) {
-		struct nfs_write_data *data = nfs_writedata_alloc(wpages);
-
-		if (unlikely(!data)) {
-			while (!list_empty(list)) {
-				data = list_entry(list->next,
-						  struct nfs_write_data, pages);
-				list_del(&data->pages);
-				nfs_writedata_free(data);
-			}
-			kref_put(&dreq->kref, nfs_direct_req_release);
-			return NULL;
-		}
-
-		INIT_LIST_HEAD(&data->pages);
-		list_add(&data->pages, list);
-
-		data->req = (struct nfs_page *) dreq;
-		get_dreq(dreq);
-		if (nbytes <= wsize)
-			break;
-		nbytes -= wsize;
-	}
-
-	nfs_alloc_commit_data(dreq);
-
-	kref_get(&dreq->kref);
-	return dreq;
-}
-
 static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
 {
 	struct nfs_write_data *data = calldata;
@@ -677,43 +595,55 @@ static const struct rpc_call_ops nfs_write_direct_ops = {
 };
 
 /*
- * For each nfs_write_data struct that was allocated on the list, dispatch
- * an NFS WRITE operation.  If get_user_pages() fails, we stop sending writes.
- * Write length accounting is handled by nfs_direct_write_result().
- * Otherwise, if no requests have been sent, just return an error.
+ * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
+ * operation.  If nfs_writedata_alloc() or get_user_pages() fails,
+ * bail and stop sending more writes.  Write length accounting is
+ * handled automatically by nfs_direct_write_result().  Otherwise, if
+ * no requests have been sent, just return an error.
  */
 static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos, int sync)
 {
 	struct nfs_open_context *ctx = dreq->ctx;
 	struct inode *inode = ctx->dentry->d_inode;
-	struct list_head *list = &dreq->list;
 	size_t wsize = NFS_SERVER(inode)->wsize;
+	unsigned int wpages = nfs_max_pages(wsize);
 	unsigned int pgbase;
 	int result;
 	ssize_t started = 0;
-	struct nfs_write_data *data;
+
+	get_dreq(dreq);
 
 	pgbase = user_addr & ~PAGE_MASK;
 	do {
+		struct nfs_write_data *data;
 		size_t bytes;
 
+		result = -ENOMEM;
+		data = nfs_writedata_alloc(wpages);
+		if (unlikely(!data))
+			break;
+
 		bytes = wsize;
 		if (count < wsize)
 			bytes = count;
 
-		BUG_ON(list_empty(list));
-		data = list_entry(list->next, struct nfs_write_data, pages);
-
 		data->npages = nfs_direct_count_pages(user_addr, bytes);
 		down_read(&current->mm->mmap_sem);
 		result = get_user_pages(current, current->mm, user_addr,
 					data->npages, 0, 0, data->pagevec, NULL);
 		up_read(&current->mm->mmap_sem);
-		if (unlikely(result < data->npages))
-			goto out_err;
+		if (unlikely(result < data->npages)) {
+			if (result > 0)
+				nfs_direct_release_pages(data->pagevec, result);
+			nfs_writedata_release(data);
+			break;
+		}
+
+		get_dreq(dreq);
 
 		list_move_tail(&data->pages, &dreq->rewrite_list);
 
+		data->req = (struct nfs_page *) dreq;
 		data->inode = inode;
 		data->cred = ctx->cred;
 		data->args.fh = NFS_FH(inode);
@@ -752,21 +682,9 @@ static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned l
 
 		count -= bytes;
 	} while (count != 0);
-	BUG_ON(!list_empty(list));
-	return 0;
-
-out_err:
-	if (result > 0)
-		nfs_direct_release_pages(data->pagevec, result);
 
-	list_add(&data->pages, list);
-	while (!list_empty(list)) {
-		data = list_entry(list->next, struct nfs_write_data, pages);
-		list_del(&data->pages);
-		nfs_writedata_free(data);
-		if (put_dreq(dreq))
-			nfs_direct_write_complete(dreq, inode);
-	}
+	if (put_dreq(dreq))
+		nfs_direct_write_complete(dreq, inode);
 
 	if (started)
 		return 0;
@@ -775,7 +693,7 @@ out_err:
 
 static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos)
 {
-	ssize_t result;
+	ssize_t result = 0;
 	sigset_t oldset;
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
 	struct rpc_clnt *clnt = NFS_CLIENT(inode);
@@ -783,9 +701,11 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 	size_t wsize = NFS_SERVER(inode)->wsize;
 	int sync = 0;
 
-	dreq = nfs_direct_write_alloc(count, wsize);
+	dreq = nfs_direct_req_alloc();
 	if (!dreq)
 		return -ENOMEM;
+	nfs_alloc_commit_data(dreq);
+
 	if (dreq->commit_data == NULL || count < wsize)
 		sync = FLUSH_STABLE;
 
-- 
cgit v1.2.3


From 1b77c54ee11ebe36ce4d0fe805e50aaafa6304b4 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Thu, 27 Apr 2006 12:01:18 -0300
Subject: V4L/DVB (3809a): Remove compat stuff for DMX_GET_EVENT

The ioctl were removed by:
V4L/DVB (3727): Remove DMX_GET_EVENT and associated data structures
due to the ioctl DMX_GET_EVENT has never been implemented, and also
scrambling events can't be generated in a useful way by the hardware.

This patch removes the corresponding entry at fs/compat_ioctl.c

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Mauro Carvalho Chehab <mchehab@infradead.org>
---
 fs/compat_ioctl.c | 33 ---------------------------------
 1 file changed, 33 deletions(-)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index d2c38875ab2..9eb9824dd33 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -205,38 +205,6 @@ static int do_ext3_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
 	return sys_ioctl(fd, cmd, (unsigned long)compat_ptr(arg));
 }
 
-struct compat_dmx_event {
-	dmx_event_t	event;
-	compat_time_t	timeStamp;
-	union
-	{
-		dmx_scrambling_status_t scrambling;
-	} u;
-};
-
-static int do_dmx_get_event(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	struct dmx_event kevent;
-	mm_segment_t old_fs = get_fs();
-	int err;
-
-	set_fs(KERNEL_DS);
-	err = sys_ioctl(fd, cmd, (unsigned long) &kevent);
-	set_fs(old_fs);
-
-	if (!err) {
-		struct compat_dmx_event __user *up = compat_ptr(arg);
-
-		err  = put_user(kevent.event, &up->event);
-		err |= put_user(kevent.timeStamp, &up->timeStamp);
-		err |= put_user(kevent.u.scrambling, &up->u.scrambling);
-		if (err)
-			err = -EFAULT;
-	}
-
-	return err;
-}
-
 struct compat_video_event {
 	int32_t		type;
 	compat_time_t	timestamp;
@@ -2964,7 +2932,6 @@ HANDLE_IOCTL(NCP_IOC_SETPRIVATEDATA_32, do_ncp_setprivatedata)
 #endif
 
 /* dvb */
-HANDLE_IOCTL(DMX_GET_EVENT, do_dmx_get_event)
 HANDLE_IOCTL(VIDEO_GET_EVENT, do_video_get_event)
 HANDLE_IOCTL(VIDEO_STILLPICTURE, do_video_stillpicture)
 HANDLE_IOCTL(VIDEO_SET_SPU_PALETTE, do_video_set_spu_palette)
-- 
cgit v1.2.3


From ccf01ef7aa9c6c293a1c64c27331a2ce227916ec Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sun, 25 Jun 2006 06:27:31 -0400
Subject: Merge branch 'odirect'

---
 fs/nfs/direct.c | 435 ++++++++++++++++++++++++++++++--------------------------
 1 file changed, 234 insertions(+), 201 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e25b7595b7a..402005c35ab 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -68,19 +68,25 @@ struct nfs_direct_req {
 	struct kref		kref;		/* release manager */
 
 	/* I/O parameters */
+	struct list_head	list,		/* nfs_read/write_data structs */
+				rewrite_list;	/* saved nfs_write_data structs */
 	struct nfs_open_context	*ctx;		/* file open context info */
 	struct kiocb *		iocb;		/* controlling i/o request */
 	struct inode *		inode;		/* target file of i/o */
+	unsigned long		user_addr;	/* location of user's buffer */
+	size_t			user_count;	/* total bytes to move */
+	loff_t			pos;		/* starting offset in file */
+	struct page **		pages;		/* pages in our buffer */
+	unsigned int		npages;		/* count of pages */
 
 	/* completion state */
-	atomic_t		io_count;	/* i/os we're waiting for */
 	spinlock_t		lock;		/* protect completion state */
+	int			outstanding;	/* i/os we're waiting for */
 	ssize_t			count,		/* bytes actually processed */
 				error;		/* any reported error */
 	struct completion	completion;	/* wait for i/o completion */
 
 	/* commit state */
-	struct list_head	rewrite_list;	/* saved nfs_write_data structs */
 	struct nfs_write_data *	commit_data;	/* special write_data for commits */
 	int			flags;
 #define NFS_ODIRECT_DO_COMMIT		(1)	/* an unstable reply was received */
@@ -88,37 +94,8 @@ struct nfs_direct_req {
 	struct nfs_writeverf	verf;		/* unstable write verifier */
 };
 
+static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync);
 static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode);
-static const struct rpc_call_ops nfs_write_direct_ops;
-
-static inline void get_dreq(struct nfs_direct_req *dreq)
-{
-	atomic_inc(&dreq->io_count);
-}
-
-static inline int put_dreq(struct nfs_direct_req *dreq)
-{
-	return atomic_dec_and_test(&dreq->io_count);
-}
-
-/*
- * "size" is never larger than rsize or wsize.
- */
-static inline int nfs_direct_count_pages(unsigned long user_addr, size_t size)
-{
-	int page_count;
-
-	page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	page_count -= user_addr >> PAGE_SHIFT;
-	BUG_ON(page_count < 0);
-
-	return page_count;
-}
-
-static inline unsigned int nfs_max_pages(unsigned int size)
-{
-	return (size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-}
 
 /**
  * nfs_direct_IO - NFS address space operation for direct I/O
@@ -142,21 +119,50 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_
 	return -EINVAL;
 }
 
-static void nfs_direct_dirty_pages(struct page **pages, int npages)
+static void nfs_free_user_pages(struct page **pages, int npages, int do_dirty)
 {
 	int i;
 	for (i = 0; i < npages; i++) {
 		struct page *page = pages[i];
-		if (!PageCompound(page))
+		if (do_dirty && !PageCompound(page))
 			set_page_dirty_lock(page);
+		page_cache_release(page);
 	}
+	kfree(pages);
 }
 
-static void nfs_direct_release_pages(struct page **pages, int npages)
+static inline int nfs_get_user_pages(int rw, unsigned long user_addr, size_t size, struct page ***pages)
 {
-	int i;
-	for (i = 0; i < npages; i++)
-		page_cache_release(pages[i]);
+	int result = -ENOMEM;
+	unsigned long page_count;
+	size_t array_size;
+
+	page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	page_count -= user_addr >> PAGE_SHIFT;
+
+	array_size = (page_count * sizeof(struct page *));
+	*pages = kmalloc(array_size, GFP_KERNEL);
+	if (*pages) {
+		down_read(&current->mm->mmap_sem);
+		result = get_user_pages(current, current->mm, user_addr,
+					page_count, (rw == READ), 0,
+					*pages, NULL);
+		up_read(&current->mm->mmap_sem);
+		if (result != page_count) {
+			/*
+			 * If we got fewer pages than expected from
+			 * get_user_pages(), the user buffer runs off the
+			 * end of a mapping; return EFAULT.
+			 */
+			if (result >= 0) {
+				nfs_free_user_pages(*pages, result, 0);
+				result = -EFAULT;
+			} else
+				kfree(*pages);
+			*pages = NULL;
+		}
+	}
+	return result;
 }
 
 static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
@@ -168,13 +174,13 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
 		return NULL;
 
 	kref_init(&dreq->kref);
-	kref_get(&dreq->kref);
 	init_completion(&dreq->completion);
+	INIT_LIST_HEAD(&dreq->list);
 	INIT_LIST_HEAD(&dreq->rewrite_list);
 	dreq->iocb = NULL;
 	dreq->ctx = NULL;
 	spin_lock_init(&dreq->lock);
-	atomic_set(&dreq->io_count, 0);
+	dreq->outstanding = 0;
 	dreq->count = 0;
 	dreq->error = 0;
 	dreq->flags = 0;
@@ -215,11 +221,18 @@ out:
 }
 
 /*
- * Synchronous I/O uses a stack-allocated iocb.  Thus we can't trust
- * the iocb is still valid here if this is a synchronous request.
+ * We must hold a reference to all the pages in this direct read request
+ * until the RPCs complete.  This could be long *after* we are woken up in
+ * nfs_direct_wait (for instance, if someone hits ^C on a slow server).
+ *
+ * In addition, synchronous I/O uses a stack-allocated iocb.  Thus we
+ * can't trust the iocb is still valid here if this is a synchronous
+ * request.  If the waiter is woken prematurely, the iocb is long gone.
  */
 static void nfs_direct_complete(struct nfs_direct_req *dreq)
 {
+	nfs_free_user_pages(dreq->pages, dreq->npages, 1);
+
 	if (dreq->iocb) {
 		long res = (long) dreq->error;
 		if (!res)
@@ -232,10 +245,48 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq)
 }
 
 /*
- * We must hold a reference to all the pages in this direct read request
- * until the RPCs complete.  This could be long *after* we are woken up in
- * nfs_direct_wait (for instance, if someone hits ^C on a slow server).
+ * Note we also set the number of requests we have in the dreq when we are
+ * done.  This prevents races with I/O completion so we will always wait
+ * until all requests have been dispatched and completed.
  */
+static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, size_t rsize)
+{
+	struct list_head *list;
+	struct nfs_direct_req *dreq;
+	unsigned int rpages = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+	dreq = nfs_direct_req_alloc();
+	if (!dreq)
+		return NULL;
+
+	list = &dreq->list;
+	for(;;) {
+		struct nfs_read_data *data = nfs_readdata_alloc(rpages);
+
+		if (unlikely(!data)) {
+			while (!list_empty(list)) {
+				data = list_entry(list->next,
+						  struct nfs_read_data, pages);
+				list_del(&data->pages);
+				nfs_readdata_free(data);
+			}
+			kref_put(&dreq->kref, nfs_direct_req_release);
+			return NULL;
+		}
+
+		INIT_LIST_HEAD(&data->pages);
+		list_add(&data->pages, list);
+
+		data->req = (struct nfs_page *) dreq;
+		dreq->outstanding++;
+		if (nbytes <= rsize)
+			break;
+		nbytes -= rsize;
+	}
+	kref_get(&dreq->kref);
+	return dreq;
+}
+
 static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
 {
 	struct nfs_read_data *data = calldata;
@@ -244,9 +295,6 @@ static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
 	if (nfs_readpage_result(task, data) != 0)
 		return;
 
-	nfs_direct_dirty_pages(data->pagevec, data->npages);
-	nfs_direct_release_pages(data->pagevec, data->npages);
-
 	spin_lock(&dreq->lock);
 
 	if (likely(task->tk_status >= 0))
@@ -254,10 +302,13 @@ static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
 	else
 		dreq->error = task->tk_status;
 
-	spin_unlock(&dreq->lock);
+	if (--dreq->outstanding) {
+		spin_unlock(&dreq->lock);
+		return;
+	}
 
-	if (put_dreq(dreq))
-		nfs_direct_complete(dreq);
+	spin_unlock(&dreq->lock);
+	nfs_direct_complete(dreq);
 }
 
 static const struct rpc_call_ops nfs_read_direct_ops = {
@@ -266,60 +317,41 @@ static const struct rpc_call_ops nfs_read_direct_ops = {
 };
 
 /*
- * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
- * operation.  If nfs_readdata_alloc() or get_user_pages() fails,
- * bail and stop sending more reads.  Read length accounting is
- * handled automatically by nfs_direct_read_result().  Otherwise, if
- * no requests have been sent, just return an error.
+ * For each nfs_read_data struct that was allocated on the list, dispatch
+ * an NFS READ operation
  */
-static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos)
+static void nfs_direct_read_schedule(struct nfs_direct_req *dreq)
 {
 	struct nfs_open_context *ctx = dreq->ctx;
 	struct inode *inode = ctx->dentry->d_inode;
+	struct list_head *list = &dreq->list;
+	struct page **pages = dreq->pages;
+	size_t count = dreq->user_count;
+	loff_t pos = dreq->pos;
 	size_t rsize = NFS_SERVER(inode)->rsize;
-	unsigned int rpages = nfs_max_pages(rsize);
-	unsigned int pgbase;
-	int result;
-	ssize_t started = 0;
-
-	get_dreq(dreq);
+	unsigned int curpage, pgbase;
 
-	pgbase = user_addr & ~PAGE_MASK;
+	curpage = 0;
+	pgbase = dreq->user_addr & ~PAGE_MASK;
 	do {
 		struct nfs_read_data *data;
 		size_t bytes;
 
-		result = -ENOMEM;
-		data = nfs_readdata_alloc(rpages);
-		if (unlikely(!data))
-			break;
-
 		bytes = rsize;
 		if (count < rsize)
 			bytes = count;
 
-		data->npages = nfs_direct_count_pages(user_addr, bytes);
-		down_read(&current->mm->mmap_sem);
-		result = get_user_pages(current, current->mm, user_addr,
-					data->npages, 1, 0, data->pagevec, NULL);
-		up_read(&current->mm->mmap_sem);
-		if (unlikely(result < data->npages)) {
-			if (result > 0)
-				nfs_direct_release_pages(data->pagevec, result);
-			nfs_readdata_release(data);
-			break;
-		}
-
-		get_dreq(dreq);
+		BUG_ON(list_empty(list));
+		data = list_entry(list->next, struct nfs_read_data, pages);
+		list_del_init(&data->pages);
 
-		data->req = (struct nfs_page *) dreq;
 		data->inode = inode;
 		data->cred = ctx->cred;
 		data->args.fh = NFS_FH(inode);
 		data->args.context = ctx;
 		data->args.offset = pos;
 		data->args.pgbase = pgbase;
-		data->args.pages = data->pagevec;
+		data->args.pages = &pages[curpage];
 		data->args.count = bytes;
 		data->res.fattr = &data->fattr;
 		data->res.eof = 0;
@@ -342,35 +374,33 @@ static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned lo
 				bytes,
 				(unsigned long long)data->args.offset);
 
-		started += bytes;
-		user_addr += bytes;
 		pos += bytes;
 		pgbase += bytes;
+		curpage += pgbase >> PAGE_SHIFT;
 		pgbase &= ~PAGE_MASK;
 
 		count -= bytes;
 	} while (count != 0);
-
-	if (put_dreq(dreq))
-		nfs_direct_complete(dreq);
-
-	if (started)
-		return 0;
-	return result < 0 ? (ssize_t) result : -EFAULT;
+	BUG_ON(!list_empty(list));
 }
 
-static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos)
+static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, unsigned int nr_pages)
 {
-	ssize_t result = 0;
+	ssize_t result;
 	sigset_t oldset;
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
 	struct rpc_clnt *clnt = NFS_CLIENT(inode);
 	struct nfs_direct_req *dreq;
 
-	dreq = nfs_direct_req_alloc();
+	dreq = nfs_direct_read_alloc(count, NFS_SERVER(inode)->rsize);
 	if (!dreq)
 		return -ENOMEM;
 
+	dreq->user_addr = user_addr;
+	dreq->user_count = count;
+	dreq->pos = pos;
+	dreq->pages = pages;
+	dreq->npages = nr_pages;
 	dreq->inode = inode;
 	dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
 	if (!is_sync_kiocb(iocb))
@@ -378,9 +408,8 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
 
 	nfs_add_stats(inode, NFSIOS_DIRECTREADBYTES, count);
 	rpc_clnt_sigmask(clnt, &oldset);
-	result = nfs_direct_read_schedule(dreq, user_addr, count, pos);
-	if (!result)
-		result = nfs_direct_wait(dreq);
+	nfs_direct_read_schedule(dreq);
+	result = nfs_direct_wait(dreq);
 	rpc_clnt_sigunmask(clnt, &oldset);
 
 	return result;
@@ -388,10 +417,10 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size
 
 static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
 {
-	while (!list_empty(&dreq->rewrite_list)) {
-		struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages);
+	list_splice_init(&dreq->rewrite_list, &dreq->list);
+	while (!list_empty(&dreq->list)) {
+		struct nfs_write_data *data = list_entry(dreq->list.next, struct nfs_write_data, pages);
 		list_del(&data->pages);
-		nfs_direct_release_pages(data->pagevec, data->npages);
 		nfs_writedata_release(data);
 	}
 }
@@ -399,51 +428,14 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 {
-	struct inode *inode = dreq->inode;
-	struct list_head *p;
-	struct nfs_write_data *data;
+	struct list_head *pos;
 
+	list_splice_init(&dreq->rewrite_list, &dreq->list);
+	list_for_each(pos, &dreq->list)
+		dreq->outstanding++;
 	dreq->count = 0;
-	get_dreq(dreq);
-
-	list_for_each(p, &dreq->rewrite_list) {
-		data = list_entry(p, struct nfs_write_data, pages);
-
-		get_dreq(dreq);
-
-		/*
-		 * Reset data->res.
-		 */
-		nfs_fattr_init(&data->fattr);
-		data->res.count = data->args.count;
-		memset(&data->verf, 0, sizeof(data->verf));
-
-		/*
-		 * Reuse data->task; data->args should not have changed
-		 * since the original request was sent.
-		 */
-		rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
-				&nfs_write_direct_ops, data);
-		NFS_PROTO(inode)->write_setup(data, FLUSH_STABLE);
-
-		data->task.tk_priority = RPC_PRIORITY_NORMAL;
-		data->task.tk_cookie = (unsigned long) inode;
-
-		/*
-		 * We're called via an RPC callback, so BKL is already held.
-		 */
-		rpc_execute(&data->task);
-
-		dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n",
-				data->task.tk_pid,
-				inode->i_sb->s_id,
-				(long long)NFS_FILEID(inode),
-				data->args.count,
-				(unsigned long long)data->args.offset);
-	}
 
-	if (put_dreq(dreq))
-		nfs_direct_write_complete(dreq, inode);
+	nfs_direct_write_schedule(dreq, FLUSH_STABLE);
 }
 
 static void nfs_direct_commit_result(struct rpc_task *task, void *calldata)
@@ -480,8 +472,8 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 	data->cred = dreq->ctx->cred;
 
 	data->args.fh = NFS_FH(data->inode);
-	data->args.offset = 0;
-	data->args.count = 0;
+	data->args.offset = dreq->pos;
+	data->args.count = dreq->user_count;
 	data->res.count = 0;
 	data->res.fattr = &data->fattr;
 	data->res.verf = &data->verf;
@@ -543,6 +535,47 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
 }
 #endif
 
+static struct nfs_direct_req *nfs_direct_write_alloc(size_t nbytes, size_t wsize)
+{
+	struct list_head *list;
+	struct nfs_direct_req *dreq;
+	unsigned int wpages = (wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+	dreq = nfs_direct_req_alloc();
+	if (!dreq)
+		return NULL;
+
+	list = &dreq->list;
+	for(;;) {
+		struct nfs_write_data *data = nfs_writedata_alloc(wpages);
+
+		if (unlikely(!data)) {
+			while (!list_empty(list)) {
+				data = list_entry(list->next,
+						  struct nfs_write_data, pages);
+				list_del(&data->pages);
+				nfs_writedata_free(data);
+			}
+			kref_put(&dreq->kref, nfs_direct_req_release);
+			return NULL;
+		}
+
+		INIT_LIST_HEAD(&data->pages);
+		list_add(&data->pages, list);
+
+		data->req = (struct nfs_page *) dreq;
+		dreq->outstanding++;
+		if (nbytes <= wsize)
+			break;
+		nbytes -= wsize;
+	}
+
+	nfs_alloc_commit_data(dreq);
+
+	kref_get(&dreq->kref);
+	return dreq;
+}
+
 static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
 {
 	struct nfs_write_data *data = calldata;
@@ -572,6 +605,8 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
 				}
 		}
 	}
+	/* In case we have to resend */
+	data->args.stable = NFS_FILE_SYNC;
 
 	spin_unlock(&dreq->lock);
 }
@@ -585,8 +620,14 @@ static void nfs_direct_write_release(void *calldata)
 	struct nfs_write_data *data = calldata;
 	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
 
-	if (put_dreq(dreq))
-		nfs_direct_write_complete(dreq, data->inode);
+	spin_lock(&dreq->lock);
+	if (--dreq->outstanding) {
+		spin_unlock(&dreq->lock);
+		return;
+	}
+	spin_unlock(&dreq->lock);
+
+	nfs_direct_write_complete(dreq, data->inode);
 }
 
 static const struct rpc_call_ops nfs_write_direct_ops = {
@@ -595,62 +636,41 @@ static const struct rpc_call_ops nfs_write_direct_ops = {
 };
 
 /*
- * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
- * operation.  If nfs_writedata_alloc() or get_user_pages() fails,
- * bail and stop sending more writes.  Write length accounting is
- * handled automatically by nfs_direct_write_result().  Otherwise, if
- * no requests have been sent, just return an error.
+ * For each nfs_write_data struct that was allocated on the list, dispatch
+ * an NFS WRITE operation
  */
-static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned long user_addr, size_t count, loff_t pos, int sync)
+static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync)
 {
 	struct nfs_open_context *ctx = dreq->ctx;
 	struct inode *inode = ctx->dentry->d_inode;
+	struct list_head *list = &dreq->list;
+	struct page **pages = dreq->pages;
+	size_t count = dreq->user_count;
+	loff_t pos = dreq->pos;
 	size_t wsize = NFS_SERVER(inode)->wsize;
-	unsigned int wpages = nfs_max_pages(wsize);
-	unsigned int pgbase;
-	int result;
-	ssize_t started = 0;
+	unsigned int curpage, pgbase;
 
-	get_dreq(dreq);
-
-	pgbase = user_addr & ~PAGE_MASK;
+	curpage = 0;
+	pgbase = dreq->user_addr & ~PAGE_MASK;
 	do {
 		struct nfs_write_data *data;
 		size_t bytes;
 
-		result = -ENOMEM;
-		data = nfs_writedata_alloc(wpages);
-		if (unlikely(!data))
-			break;
-
 		bytes = wsize;
 		if (count < wsize)
 			bytes = count;
 
-		data->npages = nfs_direct_count_pages(user_addr, bytes);
-		down_read(&current->mm->mmap_sem);
-		result = get_user_pages(current, current->mm, user_addr,
-					data->npages, 0, 0, data->pagevec, NULL);
-		up_read(&current->mm->mmap_sem);
-		if (unlikely(result < data->npages)) {
-			if (result > 0)
-				nfs_direct_release_pages(data->pagevec, result);
-			nfs_writedata_release(data);
-			break;
-		}
-
-		get_dreq(dreq);
-
+		BUG_ON(list_empty(list));
+		data = list_entry(list->next, struct nfs_write_data, pages);
 		list_move_tail(&data->pages, &dreq->rewrite_list);
 
-		data->req = (struct nfs_page *) dreq;
 		data->inode = inode;
 		data->cred = ctx->cred;
 		data->args.fh = NFS_FH(inode);
 		data->args.context = ctx;
 		data->args.offset = pos;
 		data->args.pgbase = pgbase;
-		data->args.pages = data->pagevec;
+		data->args.pages = &pages[curpage];
 		data->args.count = bytes;
 		data->res.fattr = &data->fattr;
 		data->res.count = bytes;
@@ -674,26 +694,19 @@ static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned l
 				bytes,
 				(unsigned long long)data->args.offset);
 
-		started += bytes;
-		user_addr += bytes;
 		pos += bytes;
 		pgbase += bytes;
+		curpage += pgbase >> PAGE_SHIFT;
 		pgbase &= ~PAGE_MASK;
 
 		count -= bytes;
 	} while (count != 0);
-
-	if (put_dreq(dreq))
-		nfs_direct_write_complete(dreq, inode);
-
-	if (started)
-		return 0;
-	return result < 0 ? (ssize_t) result : -EFAULT;
+	BUG_ON(!list_empty(list));
 }
 
-static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos)
+static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, int nr_pages)
 {
-	ssize_t result = 0;
+	ssize_t result;
 	sigset_t oldset;
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
 	struct rpc_clnt *clnt = NFS_CLIENT(inode);
@@ -701,14 +714,17 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 	size_t wsize = NFS_SERVER(inode)->wsize;
 	int sync = 0;
 
-	dreq = nfs_direct_req_alloc();
+	dreq = nfs_direct_write_alloc(count, wsize);
 	if (!dreq)
 		return -ENOMEM;
-	nfs_alloc_commit_data(dreq);
-
 	if (dreq->commit_data == NULL || count < wsize)
 		sync = FLUSH_STABLE;
 
+	dreq->user_addr = user_addr;
+	dreq->user_count = count;
+	dreq->pos = pos;
+	dreq->pages = pages;
+	dreq->npages = nr_pages;
 	dreq->inode = inode;
 	dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
 	if (!is_sync_kiocb(iocb))
@@ -719,9 +735,8 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 	nfs_begin_data_update(inode);
 
 	rpc_clnt_sigmask(clnt, &oldset);
-	result = nfs_direct_write_schedule(dreq, user_addr, count, pos, sync);
-	if (!result)
-		result = nfs_direct_wait(dreq);
+	nfs_direct_write_schedule(dreq, sync);
+	result = nfs_direct_wait(dreq);
 	rpc_clnt_sigunmask(clnt, &oldset);
 
 	return result;
@@ -751,6 +766,8 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 ssize_t nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
 {
 	ssize_t retval = -EINVAL;
+	int page_count;
+	struct page **pages;
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 
@@ -772,7 +789,14 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count,
 	if (retval)
 		goto out;
 
-	retval = nfs_direct_read(iocb, (unsigned long) buf, count, pos);
+	retval = nfs_get_user_pages(READ, (unsigned long) buf,
+						count, &pages);
+	if (retval < 0)
+		goto out;
+	page_count = retval;
+
+	retval = nfs_direct_read(iocb, (unsigned long) buf, count, pos,
+						pages, page_count);
 	if (retval > 0)
 		iocb->ki_pos = pos + retval;
 
@@ -808,6 +832,8 @@ out:
 ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos)
 {
 	ssize_t retval;
+	int page_count;
+	struct page **pages;
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 
@@ -835,7 +861,14 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t
 	if (retval)
 		goto out;
 
-	retval = nfs_direct_write(iocb, (unsigned long) buf, count, pos);
+	retval = nfs_get_user_pages(WRITE, (unsigned long) buf,
+						count, &pages);
+	if (retval < 0)
+		goto out;
+	page_count = retval;
+
+	retval = nfs_direct_write(iocb, (unsigned long) buf, count,
+					pos, pages, page_count);
 
 	/*
 	 * XXX: nfs_end_data_update() already ensures this file's
-- 
cgit v1.2.3


From d75d54147db9db5194040bd1c5022df6ba36ee48 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 25 Jun 2006 02:41:26 -0700
Subject: git-nfs-build-fixes

Fix various problems with nfs4 disabled.  And various other things.

In file included from fs/nfs/inode.c:50:
fs/nfs/internal.h:24: error: static declaration of 'nfs_do_refmount' follows non-static declaration
include/linux/nfs_fs.h:320: error: previous declaration of 'nfs_do_refmount' was here
fs/nfs/internal.h:65: warning: 'struct nfs4_fs_locations' declared inside parameter list
fs/nfs/internal.h:65: warning: its scope is only this definition or declaration, which is probably not what you want
fs/nfs/internal.h: In function 'nfs4_path':
fs/nfs/internal.h:97: error: 'struct nfs_server' has no member named 'mnt_path'
fs/nfs/inode.c: In function 'init_once':
fs/nfs/inode.c:1116: error: 'struct nfs_inode' has no member named 'open_states'
fs/nfs/inode.c:1116: error: 'struct nfs_inode' has no member named 'delegation'
fs/nfs/inode.c:1116: error: 'struct nfs_inode' has no member named 'delegation_state'
fs/nfs/inode.c:1116: error: 'struct nfs_inode' has no member named 'rwsem'
distcc[26452] ERROR: compile fs/nfs/inode.c on g5/64 failed
make[1]: *** [fs/nfs/inode.o] Error 1
make: *** [fs/nfs/inode.o] Error 2
make: *** Waiting for unfinished jobs....
In file included from fs/nfs/nfs3xdr.c:26:
fs/nfs/internal.h:24: error: static declaration of 'nfs_do_refmount' follows non-static declaration
include/linux/nfs_fs.h:320: error: previous declaration of 'nfs_do_refmount' was here
fs/nfs/internal.h:65: warning: 'struct nfs4_fs_locations' declared inside parameter list
fs/nfs/internal.h:65: warning: its scope is only this definition or declaration, which is probably not what you want
fs/nfs/internal.h: In function 'nfs4_path':
fs/nfs/internal.h:97: error: 'struct nfs_server' has no member named 'mnt_path'
distcc[26486] ERROR: compile fs/nfs/nfs3xdr.c on g5/64 failed
make[1]: *** [fs/nfs/nfs3xdr.o] Error 1
make: *** [fs/nfs/nfs3xdr.o] Error 2
In file included from fs/nfs/nfs3proc.c:24:
fs/nfs/internal.h:24: error: static declaration of 'nfs_do_refmount' follows non-static declaration
include/linux/nfs_fs.h:320: error: previous declaration of 'nfs_do_refmount' was here
fs/nfs/internal.h:65: warning: 'struct nfs4_fs_locations' declared inside parameter list
fs/nfs/internal.h:65: warning: its scope is only this definition or declaration, which is probably not what you want
fs/nfs/internal.h: In function 'nfs4_path':
fs/nfs/internal.h:97: error: 'struct nfs_server' has no member named 'mnt_path'
distcc[26469] ERROR: compile fs/nfs/nfs3proc.c on bix/32 failed
make[1]: *** [fs/nfs/nfs3proc.o] Error 1
make: *** [fs/nfs/nfs3proc.o] Error 2
**FAILED**

Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andreas Gruenbacher <agruen@suse.de>
Cc: Andy Adamson <andros@citi.umich.edu>
Cc: Chuck Lever <cel@netapp.com>
Cc: David Howells <dhowells@redhat.com>
Cc: J. Bruce Fields <bfields@fieldses.org>
Cc: Manoj Naik <manoj@almaden.ibm.com>
Cc: Marc Eshel <eshel@almaden.ibm.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c    | 16 +++++++++-------
 fs/nfs/internal.h |  9 ++++++++-
 fs/nfs/nfs2xdr.c  |  2 ++
 3 files changed, 19 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 24a7139d344..51bc88b662f 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1089,13 +1089,15 @@ void nfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(nfs_inode_cachep, NFS_I(inode));
 }
 
-#define nfs4_init_once(nfsi) \
-	do { \
-		INIT_LIST_HEAD(&(nfsi)->open_states); \
-		nfsi->delegation = NULL; \
-		nfsi->delegation_state = 0; \
-		init_rwsem(&nfsi->rwsem); \
-	} while(0)
+static inline void nfs4_init_once(struct nfs_inode *nfsi)
+{
+#ifdef CONFIG_NFS_V4
+	INIT_LIST_HEAD(&nfsi->open_states);
+	nfsi->delegation = NULL;
+	nfsi->delegation_state = 0;
+	init_rwsem(&nfsi->rwsem);
+#endif
+}
 
 static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
 {
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 5e51c4535b6..bd2815e2dec 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -58,11 +58,13 @@ extern int nfs_stat_to_errno(int);
 extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus);
 
 /* nfs4proc.c */
+#ifdef CONFIG_NFS_V4
 extern struct rpc_procinfo nfs4_procedures[];
 
 extern int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry,
 				  struct nfs4_fs_locations *fs_locations,
 				  struct page *page);
+#endif
 
 /* inode.c */
 extern struct inode *nfs_alloc_inode(struct super_block *sb);
@@ -92,9 +94,14 @@ extern char *nfs_path(const char *base, const struct dentry *dentry,
 /*
  * Determine the mount path as a string
  */
-static inline char *nfs4_path(const struct dentry *dentry, char *buffer, ssize_t buflen)
+static inline char *
+nfs4_path(const struct dentry *dentry, char *buffer, ssize_t buflen)
 {
+#ifdef CONFIG_NFS_V4
 	return nfs_path(NFS_SB(dentry->d_sb)->mnt_path, dentry, buffer, buflen);
+#else
+	return NULL;
+#endif
 }
 
 /*
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 67391eef6b9..3b939e055a0 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -25,6 +25,8 @@
 #include <linux/nfs_fs.h>
 #include "internal.h"
 
+#include "internal.h"
+
 #define NFSDBG_FACILITY		NFSDBG_XDR
 /* #define NFS_PARANOIA 1 */
 
-- 
cgit v1.2.3


From 6ab86aa13045e7f6742af0b3c3c45f952f9fbb8d Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 25 Jun 2006 02:41:27 -0700
Subject: nfs-build-fix-99

fs/built-in.o:(__param+0x20): undefined reference to `nfs_idmap_cache_timeout'
fs/built-in.o:(__param+0x48): undefined reference to `nfs_callback_set_tcpport'

Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andreas Gruenbacher <agruen@suse.de>
Cc: Andy Adamson <andros@citi.umich.edu>
Cc: Chuck Lever <cel@netapp.com>
Cc: David Howells <dhowells@redhat.com>
Cc: J. Bruce Fields <bfields@fieldses.org>
Cc: Manoj Naik <manoj@almaden.ibm.com>
Cc: Marc Eshel <eshel@almaden.ibm.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index b977748553d..e8a9bee74d9 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -187,6 +187,7 @@ static struct super_operations nfs4_sops = {
 };
 #endif
 
+#ifdef CONFIG_NFS_V4
 static const int nfs_set_port_min = 0;
 static const int nfs_set_port_max = 65535;
 
@@ -202,7 +203,9 @@ static int param_set_port(const char *val, struct kernel_param *kp)
 
 module_param_call(callback_tcpport, param_set_port, param_get_int,
 		 &nfs_callback_set_tcpport, 0644);
+#endif
 
+#ifdef CONFIG_NFS_V4
 static int param_set_idmap_timeout(const char *val, struct kernel_param *kp)
 {
 	char *endp;
@@ -216,6 +219,7 @@ static int param_set_idmap_timeout(const char *val, struct kernel_param *kp)
 
 module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int,
 		 &nfs_idmap_cache_timeout, 0644);
+#endif
 
 /*
  * Register the NFS filesystems
-- 
cgit v1.2.3


From 9bf2aa129a107a0e9e2a5318d35aca731ae7e666 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sun, 25 Jun 2006 02:41:28 -0700
Subject: nfs: remove nfs_put_link()

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/symlink.c | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 636c479995b..600bbe630ab 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -75,22 +75,13 @@ read_failed:
 	return NULL;
 }
 
-static void nfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
-{
-	if (cookie) {
-		struct page *page = cookie;
-		kunmap(page);
-		page_cache_release(page);
-	}
-}
-
 /*
  * symlinks can't do much...
  */
 struct inode_operations nfs_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= nfs_follow_link,
-	.put_link	= nfs_put_link,
+	.put_link	= page_put_link,
 	.getattr	= nfs_getattr,
 	.setattr	= nfs_setattr,
 };
-- 
cgit v1.2.3


From f90f00a358fe887fe5c3454897521064bdace7cc Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Sun, 25 Jun 2006 15:59:32 +0000
Subject: [CIFS] Fix compile warning when CONFIG_CIFS_EXPERIMENTAL is off

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsproto.h | 2 --
 fs/cifs/cifssmb.c   | 2 --
 2 files changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 7ed06bfa192..a5ddc62d6fe 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -64,11 +64,9 @@ extern int map_smb_to_linux_error(struct smb_hdr *smb);
 extern void header_assemble(struct smb_hdr *, char /* command */ ,
 			    const struct cifsTconInfo *, int /* length of
 			    fixed section (word count) in two byte units */);
-#ifdef CONFIG_CIFS_EXPERIMENTAL
 extern int small_smb_init_no_tc(const int smb_cmd, const int wct,
 				struct cifsSesInfo *ses,
 				void ** request_buf);
-#endif
 extern int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
 			     const int stage, 
 			     const struct nls_table *nls_cp);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 271a037a386..38f83db2076 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -209,7 +209,6 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
 	return rc;
 }
 
-#ifdef CONFIG_CIFS_EXPERIMENTAL  
 int
 small_smb_init_no_tc(const int smb_command, const int wct, 
 		     struct cifsSesInfo *ses, void **request_buf)
@@ -235,7 +234,6 @@ small_smb_init_no_tc(const int smb_command, const int wct,
 
 	return rc;
 }
-#endif  /* CONFIG_CIFS_EXPERIMENTAL */
 
 /* If the return code is zero, this function must fill in request_buf pointer */
 static int
-- 
cgit v1.2.3


From 552c03483e49a69312c9e7384fda9282c991880a Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Sun, 25 Jun 2006 05:47:16 -0700
Subject: [PATCH] fs/freevxfs: cleanup of spelling errors

Fix of some spelling errors in fs/freevxfs error messages and comments

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/freevxfs/vxfs.h        |  4 ++--
 fs/freevxfs/vxfs_fshead.c | 12 ++++++------
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/freevxfs/vxfs.h b/fs/freevxfs/vxfs.h
index 583bd78086d..d35979a5874 100644
--- a/fs/freevxfs/vxfs.h
+++ b/fs/freevxfs/vxfs.h
@@ -159,11 +159,11 @@ struct vxfs_sb {
  * In core superblock filesystem private data for VxFS.
  */
 struct vxfs_sb_info {
-	struct vxfs_sb		*vsi_raw;	/* raw (on disk) supeblock */
+	struct vxfs_sb		*vsi_raw;	/* raw (on disk) superblock */
 	struct buffer_head	*vsi_bp;	/* buffer for raw superblock*/
 	struct inode		*vsi_fship;	/* fileset header inode */
 	struct inode		*vsi_ilist;	/* inode list inode */
-	struct inode		*vsi_stilist;	/* structual inode list inode */
+	struct inode		*vsi_stilist;	/* structural inode list inode */
 	u_long			vsi_iext;	/* initial inode list */
 	ino_t			vsi_fshino;	/* fileset header inode */
 	daddr_t			vsi_oltext;	/* OLT extent */
diff --git a/fs/freevxfs/vxfs_fshead.c b/fs/freevxfs/vxfs_fshead.c
index 6dee109aeea..78948b4b189 100644
--- a/fs/freevxfs/vxfs_fshead.c
+++ b/fs/freevxfs/vxfs_fshead.c
@@ -112,7 +112,7 @@ vxfs_read_fshead(struct super_block *sbp)
 
 	vip = vxfs_blkiget(sbp, infp->vsi_iext, infp->vsi_fshino);
 	if (!vip) {
-		printk(KERN_ERR "vxfs: unabled to read fsh inode\n");
+		printk(KERN_ERR "vxfs: unable to read fsh inode\n");
 		return -EINVAL;
 	}
 	if (!VXFS_ISFSH(vip)) {
@@ -129,13 +129,13 @@ vxfs_read_fshead(struct super_block *sbp)
 
 	infp->vsi_fship = vxfs_get_fake_inode(sbp, vip);
 	if (!infp->vsi_fship) {
-		printk(KERN_ERR "vxfs: unabled to get fsh inode\n");
+		printk(KERN_ERR "vxfs: unable to get fsh inode\n");
 		goto out_free_fship;
 	}
 
 	sfp = vxfs_getfsh(infp->vsi_fship, 0);
 	if (!sfp) {
-		printk(KERN_ERR "vxfs: unabled to get structural fsh\n");
+		printk(KERN_ERR "vxfs: unable to get structural fsh\n");
 		goto out_iput_fship;
 	} 
 
@@ -145,7 +145,7 @@ vxfs_read_fshead(struct super_block *sbp)
 
 	pfp = vxfs_getfsh(infp->vsi_fship, 1);
 	if (!pfp) {
-		printk(KERN_ERR "vxfs: unabled to get primary fsh\n");
+		printk(KERN_ERR "vxfs: unable to get primary fsh\n");
 		goto out_free_sfp;
 	}
 
@@ -159,7 +159,7 @@ vxfs_read_fshead(struct super_block *sbp)
 
 	infp->vsi_stilist = vxfs_get_fake_inode(sbp, tip);
 	if (!infp->vsi_stilist) {
-		printk(KERN_ERR "vxfs: unabled to get structual list inode\n");
+		printk(KERN_ERR "vxfs: unable to get structural list inode\n");
 		kfree(tip);
 		goto out_free_pfp;
 	}
@@ -174,7 +174,7 @@ vxfs_read_fshead(struct super_block *sbp)
 		goto out_iput_stilist;
 	infp->vsi_ilist = vxfs_get_fake_inode(sbp, tip);
 	if (!infp->vsi_ilist) {
-		printk(KERN_ERR "vxfs: unabled to get inode list inode\n");
+		printk(KERN_ERR "vxfs: unable to get inode list inode\n");
 		kfree(tip);
 		goto out_iput_stilist;
 	}
-- 
cgit v1.2.3


From 2061df0f89201c0abeb4c17d343309c9fae5b861 Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Sun, 25 Jun 2006 05:47:18 -0700
Subject: [PATCH] ufs: ufs_trunc_indirect: infinite cycle

Currently, ufs write support have two sets of problems: work with files and
work with directories.

This series of patches should solve the first problem.

This patch is similar to http://lkml.org/lkml/2006/1/17/61 this patch
complements it.

The situation the same: in ufs_trunc_(not direct), we read block, check if
count of links to it is equal to one, if so we finish cycle, if not
continue.  Because of "count of links" always >=2 this operation cause
infinite cycle and hang up the kernel.

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ufs/truncate.c | 55 +++++++++++++++++++++----------------------------------
 fs/ufs/util.c     | 12 ------------
 fs/ufs/util.h     |  1 -
 3 files changed, 21 insertions(+), 47 deletions(-)

(limited to 'fs')

diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 02e86291ef8..29c66e1e24d 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -238,18 +238,13 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p)
 		if (*ubh_get_addr32(ind_ubh,i))
 			break;
 	if (i >= uspi->s_apb) {
-		if (ubh_max_bcount(ind_ubh) != 1) {
-			retry = 1;
-		}
-		else {
-			tmp = fs32_to_cpu(sb, *p);
-			*p = 0;
-			inode->i_blocks -= uspi->s_nspb;
-			mark_inode_dirty(inode);
-			ufs_free_blocks (inode, tmp, uspi->s_fpb);
-			ubh_bforget(ind_ubh);
-			ind_ubh = NULL;
-		}
+		tmp = fs32_to_cpu(sb, *p);
+		*p = 0;
+		inode->i_blocks -= uspi->s_nspb;
+		mark_inode_dirty(inode);
+		ufs_free_blocks (inode, tmp, uspi->s_fpb);
+		ubh_bforget(ind_ubh);
+		ind_ubh = NULL;
 	}
 	if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) {
 		ubh_ll_rw_block (SWRITE, 1, &ind_ubh);
@@ -306,17 +301,13 @@ static int ufs_trunc_dindirect (struct inode *inode, unsigned offset, __fs32 *p)
 		if (*ubh_get_addr32 (dind_bh, i))
 			break;
 	if (i >= uspi->s_apb) {
-		if (ubh_max_bcount(dind_bh) != 1)
-			retry = 1;
-		else {
-			tmp = fs32_to_cpu(sb, *p);
-			*p = 0;
-			inode->i_blocks -= uspi->s_nspb;
-			mark_inode_dirty(inode);
-			ufs_free_blocks (inode, tmp, uspi->s_fpb);
-			ubh_bforget(dind_bh);
-			dind_bh = NULL;
-		}
+		tmp = fs32_to_cpu(sb, *p);
+		*p = 0;
+		inode->i_blocks -= uspi->s_nspb;
+		mark_inode_dirty(inode);
+		ufs_free_blocks (inode, tmp, uspi->s_fpb);
+		ubh_bforget(dind_bh);
+		dind_bh = NULL;
 	}
 	if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) {
 		ubh_ll_rw_block (SWRITE, 1, &dind_bh);
@@ -370,17 +361,13 @@ static int ufs_trunc_tindirect (struct inode * inode)
 		if (*ubh_get_addr32 (tind_bh, i))
 			break;
 	if (i >= uspi->s_apb) {
-		if (ubh_max_bcount(tind_bh) != 1)
-			retry = 1;
-		else {
-			tmp = fs32_to_cpu(sb, *p);
-			*p = 0;
-			inode->i_blocks -= uspi->s_nspb;
-			mark_inode_dirty(inode);
-			ufs_free_blocks (inode, tmp, uspi->s_fpb);
-			ubh_bforget(tind_bh);
-			tind_bh = NULL;
-		}
+		tmp = fs32_to_cpu(sb, *p);
+		*p = 0;
+		inode->i_blocks -= uspi->s_nspb;
+		mark_inode_dirty(inode);
+		ufs_free_blocks (inode, tmp, uspi->s_fpb);
+		ubh_bforget(tind_bh);
+		tind_bh = NULL;
 	}
 	if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) {
 		ubh_ll_rw_block (SWRITE, 1, &tind_bh);
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 59acc8f073a..72f91cc84bf 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -139,18 +139,6 @@ void ubh_wait_on_buffer (struct ufs_buffer_head * ubh)
 		wait_on_buffer (ubh->bh[i]);
 }
 
-unsigned ubh_max_bcount (struct ufs_buffer_head * ubh)
-{
-	unsigned i;
-	unsigned max = 0;
-	if (!ubh)
-		return 0;
-	for ( i = 0; i < ubh->count; i++ ) 
-		if ( atomic_read(&ubh->bh[i]->b_count) > max )
-			max = atomic_read(&ubh->bh[i]->b_count);
-	return max;
-}
-
 void ubh_bforget (struct ufs_buffer_head * ubh)
 {
 	unsigned i;
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 48d6d9bcc15..e10362d8f45 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -238,7 +238,6 @@ extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *);
 extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int);
 extern void ubh_ll_rw_block (int, unsigned, struct ufs_buffer_head **);
 extern void ubh_wait_on_buffer (struct ufs_buffer_head *);
-extern unsigned ubh_max_bcount (struct ufs_buffer_head *);
 extern void ubh_bforget (struct ufs_buffer_head *);
 extern int  ubh_buffer_dirty (struct ufs_buffer_head *);
 #define ubh_ubhcpymem(mem,ubh,size) _ubh_ubhcpymem_(uspi,mem,ubh,size)
-- 
cgit v1.2.3


From c9a27b5dca52bbd0955e065e49e56eb313d02c34 Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Sun, 25 Jun 2006 05:47:19 -0700
Subject: [PATCH] ufs: right block allocation

* After block allocation, we map it on the same "address" as 8 others
  blocks

* We nullify block several times: once in ufs/block.c and once in
  block_*write_full_page, and use different "caches" for this.

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ufs/balloc.c | 15 ---------------
 fs/ufs/inode.c  | 30 ++++++++++++++++++------------
 2 files changed, 18 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 3ada9dcf55b..cc0c8f15d8f 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -223,18 +223,6 @@ failed:
 }
 
 
-
-#define NULLIFY_FRAGMENTS \
-	for (i = oldcount; i < newcount; i++) { \
-		bh = sb_getblk(sb, result + i); \
-		memset (bh->b_data, 0, sb->s_blocksize); \
-		set_buffer_uptodate(bh); \
-		mark_buffer_dirty (bh); \
-		if (IS_SYNC(inode)) \
-			sync_dirty_buffer(bh); \
-		brelse (bh); \
-	}
-
 unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
 	unsigned goal, unsigned count, int * err )
 {
@@ -312,7 +300,6 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
 			*err = 0;
 			inode->i_blocks += count << uspi->s_nspfshift;
 			UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count);
-			NULLIFY_FRAGMENTS
 		}
 		unlock_super(sb);
 		UFSD(("EXIT, result %u\n", result))
@@ -327,7 +314,6 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
 		*err = 0;
 		inode->i_blocks += count << uspi->s_nspfshift;
 		UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count);
-		NULLIFY_FRAGMENTS
 		unlock_super(sb);
 		UFSD(("EXIT, result %u\n", result))
 		return result;
@@ -379,7 +365,6 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
 		*err = 0;
 		inode->i_blocks += count << uspi->s_nspfshift;
 		UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count);
-		NULLIFY_FRAGMENTS
 		unlock_super(sb);
 		if (newcount < request)
 			ufs_free_fragments (inode, result + newcount, request - newcount);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 3c3f62ce2ad..2b2366360e5 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -161,6 +161,17 @@ out:
 	return ret;
 }
 
+static void ufs_clear_block(struct inode *inode, struct buffer_head *bh)
+{
+	lock_buffer(bh);
+	memset(bh->b_data, 0, inode->i_sb->s_blocksize);
+	set_buffer_uptodate(bh);
+	mark_buffer_dirty(bh);
+	unlock_buffer(bh);
+	if (IS_SYNC(inode))
+		sync_dirty_buffer(bh);
+}
+
 static struct buffer_head * ufs_inode_getfrag (struct inode *inode,
 	unsigned int fragment, unsigned int new_fragment,
 	unsigned int required, int *err, int metadata, long *phys, int *new)
@@ -204,7 +215,7 @@ repeat:
 			brelse (result);
 			goto repeat;
 		} else {
-			*phys = tmp;
+			*phys = tmp + blockoff;
 			return NULL;
 		}
 	}
@@ -259,14 +270,11 @@ repeat:
 		return NULL;
 	}
 
-	/* The nullification of framgents done in ufs/balloc.c is
-	 * something I don't have the stomache to move into here right
-	 * now. -DaveM
-	 */
 	if (metadata) {
 		result = sb_getblk(inode->i_sb, tmp + blockoff);
+		ufs_clear_block(inode, result);
 	} else {
-		*phys = tmp;
+		*phys = tmp + blockoff;
 		result = NULL;
 		*err = 0;
 		*new = 1;
@@ -333,7 +341,7 @@ repeat:
 			brelse (result);
 			goto repeat;
 		} else {
-			*phys = tmp;
+			*phys = tmp + blockoff;
 			goto out;
 		}
 	}
@@ -349,14 +357,12 @@ repeat:
 		goto out;
 	}		
 
-	/* The nullification of framgents done in ufs/balloc.c is
-	 * something I don't have the stomache to move into here right
-	 * now. -DaveM
-	 */
+
 	if (metadata) {
 		result = sb_getblk(sb, tmp + blockoff);
+		ufs_clear_block(inode, result);
 	} else {
-		*phys = tmp;
+		*phys = tmp + blockoff;
 		*new = 1;
 	}
 
-- 
cgit v1.2.3


From 6ef4d6bf86a82965896eaa1a189177239ec2bbab Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Sun, 25 Jun 2006 05:47:20 -0700
Subject: [PATCH] ufs: change block number on the fly

First of all some necessary notes about UFS by it self: To avoid waste of disk
space the tail of file consists not from blocks (which is ordinary big enough,
16K usually), it consists from fragments(which is ordinary 2K).  When file is
growing its tail occupy 1 fragment, 2 fragments...  At some stage decision to
allocate whole block is made and all fragments are moved to one block.

How this situation was handled before:

  ufs_prepare_write
  ->block_prepare_write
    ->ufs_getfrag_block
      ->...
        ->ufs_new_fragments:

	bh = sb_bread
	bh->b_blocknr = result + i;
	mark_buffer_dirty (bh);

This is wrong solution, because:

- it didn't take into consideration that there is another cache: "inode page
  cache"

- because of sb_getblk uses not b_blocknr, (it uses page->index) to find
  certain block, this breaks sb_getblk.

How this situation is handled now: we go though all "page inode cache", if
there are no such page in cache we load it into cache, and change b_blocknr.

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ufs/balloc.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++----------
 fs/ufs/inode.c  |  44 ++++++++++--------
 2 files changed, 138 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index cc0c8f15d8f..06f970d02e3 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -39,7 +39,8 @@ static void ufs_clusteracct(struct super_block *, struct ufs_cg_private_info *,
 /*
  * Free 'count' fragments from fragment number 'fragment'
  */
-void ufs_free_fragments (struct inode * inode, unsigned fragment, unsigned count) {
+void ufs_free_fragments(struct inode *inode, unsigned fragment, unsigned count)
+{
 	struct super_block * sb;
 	struct ufs_sb_private_info * uspi;
 	struct ufs_super_block_first * usb1;
@@ -134,7 +135,8 @@ failed:
 /*
  * Free 'count' fragments from fragment number 'fragment' (free whole blocks)
  */
-void ufs_free_blocks (struct inode * inode, unsigned fragment, unsigned count) {
+void ufs_free_blocks(struct inode *inode, unsigned fragment, unsigned count)
+{
 	struct super_block * sb;
 	struct ufs_sb_private_info * uspi;
 	struct ufs_super_block_first * usb1;
@@ -222,15 +224,118 @@ failed:
 	return;
 }
 
+static struct page *ufs_get_locked_page(struct address_space *mapping,
+				  unsigned long index)
+{
+	struct page *page;
+
+try_again:
+	page = find_lock_page(mapping, index);
+	if (!page) {
+		page = read_cache_page(mapping, index,
+				       (filler_t*)mapping->a_ops->readpage,
+				       NULL);
+		if (IS_ERR(page)) {
+			printk(KERN_ERR "ufs_change_blocknr: "
+			       "read_cache_page error: ino %lu, index: %lu\n",
+			       mapping->host->i_ino, index);
+			goto out;
+		}
+
+		lock_page(page);
+
+		if (!PageUptodate(page) || PageError(page)) {
+			unlock_page(page);
+			page_cache_release(page);
+
+			printk(KERN_ERR "ufs_change_blocknr: "
+			       "can not read page: ino %lu, index: %lu\n",
+			       mapping->host->i_ino, index);
+
+			page = ERR_PTR(-EIO);
+			goto out;
+		}
+	}
+
+	if (unlikely(!page->mapping || !page_has_buffers(page))) {
+		unlock_page(page);
+		page_cache_release(page);
+		goto try_again;/*we really need these buffers*/
+	}
+out:
+	return page;
+}
+
+/*
+ * Modify inode page cache in such way:
+ * have - blocks with b_blocknr equal to oldb...oldb+count-1
+ * get - blocks with b_blocknr equal to newb...newb+count-1
+ * also we suppose that oldb...oldb+count-1 blocks
+ * situated at the end of file.
+ *
+ * We can come here from ufs_writepage or ufs_prepare_write,
+ * locked_page is argument of these functions, so we already lock it.
+ */
+static void ufs_change_blocknr(struct inode *inode, unsigned int count,
+			       unsigned int oldb, unsigned int newb,
+			       struct page *locked_page)
+{
+	unsigned int blk_per_page = 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	sector_t baseblk;
+	struct address_space *mapping = inode->i_mapping;
+	pgoff_t index, cur_index = locked_page->index;
+	unsigned int i, j;
+	struct page *page;
+	struct buffer_head *head, *bh;
+
+	baseblk = ((i_size_read(inode) - 1) >> inode->i_blkbits) + 1 - count;
+
+	UFSD(("ENTER, ino %lu, count %u, oldb %u, newb %u\n",
+	      inode->i_ino, count, oldb, newb));
+
+	BUG_ON(!PageLocked(locked_page));
+
+	for (i = 0; i < count; i += blk_per_page) {
+		index = (baseblk+i) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+		if (likely(cur_index != index)) {
+			page = ufs_get_locked_page(mapping, index);
+			if (IS_ERR(page))
+				continue;
+		} else
+			page = locked_page;
+
+		j = i;
+		head = page_buffers(page);
+		bh = head;
+		do {
+			if (likely(bh->b_blocknr == j + oldb && j < count)) {
+				unmap_underlying_metadata(bh->b_bdev,
+							  bh->b_blocknr);
+				bh->b_blocknr = newb + j++;
+				mark_buffer_dirty(bh);
+			}
+
+			bh = bh->b_this_page;
+		} while (bh != head);
+
+		set_page_dirty(page);
 
-unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
-	unsigned goal, unsigned count, int * err )
+		if (likely(cur_index != index)) {
+			unlock_page(page);
+			page_cache_release(page);
+		}
+ 	}
+	UFSD(("EXIT\n"));
+}
+
+unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
+			   unsigned goal, unsigned count, int * err, struct page *locked_page)
 {
 	struct super_block * sb;
 	struct ufs_sb_private_info * uspi;
 	struct ufs_super_block_first * usb1;
-	struct buffer_head * bh;
-	unsigned cgno, oldcount, newcount, tmp, request, i, result;
+	unsigned cgno, oldcount, newcount, tmp, request, result;
 	
 	UFSD(("ENTER, ino %lu, fragment %u, goal %u, count %u\n", inode->i_ino, fragment, goal, count))
 	
@@ -343,24 +448,8 @@ unsigned ufs_new_fragments (struct inode * inode, __fs32 * p, unsigned fragment,
 	}
 	result = ufs_alloc_fragments (inode, cgno, goal, request, err);
 	if (result) {
-		for (i = 0; i < oldcount; i++) {
-			bh = sb_bread(sb, tmp + i);
-			if(bh)
-			{
-				clear_buffer_dirty(bh);
-				bh->b_blocknr = result + i;
-				mark_buffer_dirty (bh);
-				if (IS_SYNC(inode))
-					sync_dirty_buffer(bh);
-				brelse (bh);
-			}
-			else
-			{
-				printk(KERN_ERR "ufs_new_fragments: bread fail\n");
-				unlock_super(sb);
-				return 0;
-			}
-		}
+		ufs_change_blocknr(inode, oldcount, tmp, result, locked_page);
+
 		*p = cpu_to_fs32(sb, result);
 		*err = 0;
 		inode->i_blocks += count << uspi->s_nspfshift;
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 2b2366360e5..ea2267316a7 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -172,9 +172,10 @@ static void ufs_clear_block(struct inode *inode, struct buffer_head *bh)
 		sync_dirty_buffer(bh);
 }
 
-static struct buffer_head * ufs_inode_getfrag (struct inode *inode,
-	unsigned int fragment, unsigned int new_fragment,
-	unsigned int required, int *err, int metadata, long *phys, int *new)
+static struct buffer_head *ufs_inode_getfrag(struct inode *inode,
+					     unsigned int fragment, unsigned int new_fragment,
+					     unsigned int required, int *err, int metadata,
+					     long *phys, int *new, struct page *locked_page)
 {
 	struct ufs_inode_info *ufsi = UFS_I(inode);
 	struct super_block * sb;
@@ -232,7 +233,8 @@ repeat:
 		if (lastblockoff) {
 			p2 = ufsi->i_u1.i_data + lastblock;
 			tmp = ufs_new_fragments (inode, p2, lastfrag, 
-				fs32_to_cpu(sb, *p2), uspi->s_fpb - lastblockoff, err);
+						 fs32_to_cpu(sb, *p2), uspi->s_fpb - lastblockoff,
+						 err, locked_page);
 			if (!tmp) {
 				if (lastfrag != ufsi->i_lastfrag)
 					goto repeat;
@@ -244,14 +246,16 @@ repeat:
 		}
 		goal = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock]) + uspi->s_fpb;
 		tmp = ufs_new_fragments (inode, p, fragment - blockoff, 
-			goal, required + blockoff, err);
+					 goal, required + blockoff,
+					 err, locked_page);
 	}
 	/*
 	 * We will extend last allocated block
 	 */
 	else if (lastblock == block) {
-		tmp = ufs_new_fragments (inode, p, fragment - (blockoff - lastblockoff),
-			fs32_to_cpu(sb, *p), required +  (blockoff - lastblockoff), err);
+		tmp = ufs_new_fragments(inode, p, fragment - (blockoff - lastblockoff),
+					fs32_to_cpu(sb, *p), required +  (blockoff - lastblockoff),
+					err, locked_page);
 	}
 	/*
 	 * We will allocate new block before last allocated block
@@ -259,8 +263,8 @@ repeat:
 	else /* (lastblock > block) */ {
 		if (lastblock && (tmp = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock-1])))
 			goal = tmp + uspi->s_fpb;
-		tmp = ufs_new_fragments (inode, p, fragment - blockoff, 
-			goal, uspi->s_fpb, err);
+		tmp = ufs_new_fragments(inode, p, fragment - blockoff,
+					goal, uspi->s_fpb, err, locked_page);
 	}
 	if (!tmp) {
 		if ((!blockoff && *p) || 
@@ -303,9 +307,10 @@ repeat2:
      */
 }
 
-static struct buffer_head * ufs_block_getfrag (struct inode *inode,
-	struct buffer_head *bh, unsigned int fragment, unsigned int new_fragment, 
-	unsigned int blocksize, int * err, int metadata, long *phys, int *new)
+static struct buffer_head *ufs_block_getfrag(struct inode *inode, struct buffer_head *bh,
+					     unsigned int fragment, unsigned int new_fragment,
+					     unsigned int blocksize, int * err, int metadata,
+					     long *phys, int *new, struct page *locked_page)
 {
 	struct super_block * sb;
 	struct ufs_sb_private_info * uspi;
@@ -350,7 +355,8 @@ repeat:
 		goal = tmp + uspi->s_fpb;
 	else
 		goal = bh->b_blocknr + uspi->s_fpb;
-	tmp = ufs_new_fragments (inode, p, ufs_blknum(new_fragment), goal, uspi->s_fpb, err);
+	tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), goal,
+				uspi->s_fpb, err, locked_page);
 	if (!tmp) {
 		if (fs32_to_cpu(sb, *p))
 			goto repeat;
@@ -424,15 +430,15 @@ int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_hea
 	 * it much more readable:
 	 */
 #define GET_INODE_DATABLOCK(x) \
-		ufs_inode_getfrag(inode, x, fragment, 1, &err, 0, &phys, &new)
+	ufs_inode_getfrag(inode, x, fragment, 1, &err, 0, &phys, &new, bh_result->b_page)
 #define GET_INODE_PTR(x) \
-		ufs_inode_getfrag(inode, x, fragment, uspi->s_fpb, &err, 1, NULL, NULL)
+	ufs_inode_getfrag(inode, x, fragment, uspi->s_fpb, &err, 1, NULL, NULL, bh_result->b_page)
 #define GET_INDIRECT_DATABLOCK(x) \
-		ufs_block_getfrag(inode, bh, x, fragment, sb->s_blocksize, \
-				  &err, 0, &phys, &new);
+	ufs_block_getfrag(inode, bh, x, fragment, sb->s_blocksize,	\
+			  &err, 0, &phys, &new, bh_result->b_page);
 #define GET_INDIRECT_PTR(x) \
-		ufs_block_getfrag(inode, bh, x, fragment, sb->s_blocksize, \
-				  &err, 1, NULL, NULL);
+	ufs_block_getfrag(inode, bh, x, fragment, sb->s_blocksize,	\
+			  &err, 1, NULL, NULL, bh_result->b_page);
 
 	if (ptr < UFS_NDIR_FRAGMENT) {
 		bh = GET_INODE_DATABLOCK(ptr);
-- 
cgit v1.2.3


From 826843a347cc8fd596a4c73d3fbdf04a1f130b8a Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Sun, 25 Jun 2006 05:47:21 -0700
Subject: [PATCH] ufs: directory and page cache: install aops

This series of patches finished "bugs fixing" mentioned
here http://lkml.org/lkml/2006/1/31/275 .

The main bugs:
* for i in `seq 1 1000`; do touch $i; done - crash system
* mkdir create directory without "." and ".." entries

The suggested solution is work with page cache instead of straight work
with blocks.  Such solution has following advantages

* reduce code size and its complexity
* some global locks go away
* fix bugs

The most part of code is stolen from ext2, because of it has similar
directory structure.

Patches testes with UFS1 and UFS2 file systems.

This patch installs i_mapping->a_ops for directory inodes and removes some
duplicated code.

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ufs/inode.c | 58 ++++++++++++++++++++++++----------------------------------
 fs/ufs/namei.c |  1 +
 2 files changed, 25 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index ea2267316a7..c28b7522c9e 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -552,6 +552,28 @@ struct address_space_operations ufs_aops = {
 	.bmap = ufs_bmap
 };
 
+static void ufs_set_inode_ops(struct inode *inode)
+{
+	if (S_ISREG(inode->i_mode)) {
+		inode->i_op = &ufs_file_inode_operations;
+		inode->i_fop = &ufs_file_operations;
+		inode->i_mapping->a_ops = &ufs_aops;
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &ufs_dir_inode_operations;
+		inode->i_fop = &ufs_dir_operations;
+		inode->i_mapping->a_ops = &ufs_aops;
+	} else if (S_ISLNK(inode->i_mode)) {
+		if (!inode->i_blocks)
+			inode->i_op = &ufs_fast_symlink_inode_operations;
+		else {
+			inode->i_op = &page_symlink_inode_operations;
+			inode->i_mapping->a_ops = &ufs_aops;
+		}
+	} else
+		init_special_inode(inode, inode->i_mode,
+				   ufs_get_inode_dev(inode->i_sb, UFS_I(inode)));
+}
+
 void ufs_read_inode (struct inode * inode)
 {
 	struct ufs_inode_info *ufsi = UFS_I(inode);
@@ -626,23 +648,7 @@ void ufs_read_inode (struct inode * inode)
 	}
 	ufsi->i_osync = 0;
 
-	if (S_ISREG(inode->i_mode)) {
-		inode->i_op = &ufs_file_inode_operations;
-		inode->i_fop = &ufs_file_operations;
-		inode->i_mapping->a_ops = &ufs_aops;
-	} else if (S_ISDIR(inode->i_mode)) {
-		inode->i_op = &ufs_dir_inode_operations;
-		inode->i_fop = &ufs_dir_operations;
-	} else if (S_ISLNK(inode->i_mode)) {
-		if (!inode->i_blocks)
-			inode->i_op = &ufs_fast_symlink_inode_operations;
-		else {
-			inode->i_op = &page_symlink_inode_operations;
-			inode->i_mapping->a_ops = &ufs_aops;
-		}
-	} else
-		init_special_inode(inode, inode->i_mode,
-			ufs_get_inode_dev(sb, ufsi));
+	ufs_set_inode_ops(inode);
 
 	brelse (bh);
 
@@ -702,23 +708,7 @@ ufs2_inode :
 	}
 	ufsi->i_osync = 0;
 
-	if (S_ISREG(inode->i_mode)) {
-		inode->i_op = &ufs_file_inode_operations;
-		inode->i_fop = &ufs_file_operations;
-		inode->i_mapping->a_ops = &ufs_aops;
-	} else if (S_ISDIR(inode->i_mode)) {
-		inode->i_op = &ufs_dir_inode_operations;
-		inode->i_fop = &ufs_dir_operations;
-	} else if (S_ISLNK(inode->i_mode)) {
-		if (!inode->i_blocks)
-			inode->i_op = &ufs_fast_symlink_inode_operations;
-		else {
-			inode->i_op = &page_symlink_inode_operations;
-			inode->i_mapping->a_ops = &ufs_aops;
-		}
-	} else   /* TODO  : here ...*/
-		init_special_inode(inode, inode->i_mode,
-			ufs_get_inode_dev(sb, ufsi));
+	ufs_set_inode_ops(inode);
 
 	brelse(bh);
 
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 8d5f98a01c7..51f70270030 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -205,6 +205,7 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
 
 	inode->i_op = &ufs_dir_inode_operations;
 	inode->i_fop = &ufs_dir_operations;
+	inode->i_mapping->a_ops = &ufs_aops;
 
 	inode_inc_link_count(inode);
 
-- 
cgit v1.2.3


From b71034e5e67d1577424cebe7bbb7d0ce134a4cd8 Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Sun, 25 Jun 2006 05:47:22 -0700
Subject: [PATCH] ufs: directory and page cache: from blocks to pages

Change function in fs/ufs/dir.c and fs/ufs/namei.c to work with pages
instead of straight work with blocks.  It fixed such bugs:

* for i in `seq 1 1000`; do touch $i; done - crash system
* mkdir create directory without "." and ".." entries

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ufs/dir.c   | 989 +++++++++++++++++++++++++++++----------------------------
 fs/ufs/namei.c |  62 ++--
 2 files changed, 547 insertions(+), 504 deletions(-)

(limited to 'fs')

diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 1a561202d3f..9473df5bff5 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -11,13 +11,15 @@
  * 4.4BSD (FreeBSD) support added on February 1st 1998 by
  * Niels Kristian Bech Jensen <nkbj@image.dk> partially based
  * on code by Martin von Loewis <martin@mira.isdn.cs.tu-berlin.de>.
+ *
+ * Migration to usage of "page cache" on May 2006 by
+ * Evgeniy Dushistov <dushistov@mail.ru> based on ext2 code base.
  */
 
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/ufs_fs.h>
 #include <linux/smp_lock.h>
-#include <linux/buffer_head.h>
 #include <linux/sched.h>
 
 #include "swab.h"
@@ -31,11 +33,6 @@
 #define UFSD(x)
 #endif
 
-static int
-ufs_check_dir_entry (const char *, struct inode *, struct ufs_dir_entry *,
-		     struct buffer_head *, unsigned long);
-
-
 /*
  * NOTE! unlike strncmp, ufs_match returns 1 for success, 0 for failure.
  *
@@ -51,495 +48,540 @@ static inline int ufs_match(struct super_block *sb, int len,
 	return !memcmp(name, de->d_name, len);
 }
 
-/*
- * This is blatantly stolen from ext2fs
- */
-static int
-ufs_readdir (struct file * filp, void * dirent, filldir_t filldir)
+static int ufs_commit_chunk(struct page *page, unsigned from, unsigned to)
 {
-	struct inode *inode = filp->f_dentry->d_inode;
-	int error = 0;
-	unsigned long offset, lblk;
-	int i, stored;
-	struct buffer_head * bh;
-	struct ufs_dir_entry * de;
-	struct super_block * sb;
-	int de_reclen;
-	unsigned flags;
-	u64     blk= 0L;
-
-	lock_kernel();
-
-	sb = inode->i_sb;
-	flags = UFS_SB(sb)->s_flags;
-
-	UFSD(("ENTER, ino %lu  f_pos %lu\n", inode->i_ino, (unsigned long) filp->f_pos))
-
-	stored = 0;
-	bh = NULL;
-	offset = filp->f_pos & (sb->s_blocksize - 1);
-
-	while (!error && !stored && filp->f_pos < inode->i_size) {
-		lblk = (filp->f_pos) >> sb->s_blocksize_bits;
-		blk = ufs_frag_map(inode, lblk);
-		if (!blk || !(bh = sb_bread(sb, blk))) {
-			/* XXX - error - skip to the next block */
-			printk("ufs_readdir: "
-			       "dir inode %lu has a hole at offset %lu\n",
-			       inode->i_ino, (unsigned long int)filp->f_pos);
-			filp->f_pos += sb->s_blocksize - offset;
-			continue;
-		}
-
-revalidate:
-		/* If the dir block has changed since the last call to
-		 * readdir(2), then we might be pointing to an invalid
-		 * dirent right now.  Scan from the start of the block
-		 * to make sure. */
-		if (filp->f_version != inode->i_version) {
-			for (i = 0; i < sb->s_blocksize && i < offset; ) {
-				de = (struct ufs_dir_entry *)(bh->b_data + i);
-				/* It's too expensive to do a full
-				 * dirent test each time round this
-				 * loop, but we do have to test at
-				 * least that it is non-zero.  A
-				 * failure will be detected in the
-				 * dirent test below. */
-				de_reclen = fs16_to_cpu(sb, de->d_reclen);
-				if (de_reclen < 1)
-					break;
-				i += de_reclen;
-			}
-			offset = i;
-			filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
-				| offset;
-			filp->f_version = inode->i_version;
-		}
+	struct inode *dir = page->mapping->host;
+	int err = 0;
+	dir->i_version++;
+	page->mapping->a_ops->commit_write(NULL, page, from, to);
+	if (IS_DIRSYNC(dir))
+		err = write_one_page(page, 1);
+	else
+		unlock_page(page);
+	return err;
+}
 
-		while (!error && filp->f_pos < inode->i_size
-		       && offset < sb->s_blocksize) {
-			de = (struct ufs_dir_entry *) (bh->b_data + offset);
-			/* XXX - put in a real ufs_check_dir_entry() */
-			if ((de->d_reclen == 0) || (ufs_get_de_namlen(sb, de) == 0)) {
-				filp->f_pos = (filp->f_pos &
-				              (sb->s_blocksize - 1)) +
-				               sb->s_blocksize;
-				brelse(bh);
-				unlock_kernel();
-				return stored;
-			}
-			if (!ufs_check_dir_entry ("ufs_readdir", inode, de,
-						   bh, offset)) {
-				/* On error, skip the f_pos to the
-				   next block. */
-				filp->f_pos = (filp->f_pos |
-				              (sb->s_blocksize - 1)) +
-					       1;
-				brelse (bh);
-				unlock_kernel();
-				return stored;
-			}
-			offset += fs16_to_cpu(sb, de->d_reclen);
-			if (de->d_ino) {
-				/* We might block in the next section
-				 * if the data destination is
-				 * currently swapped out.  So, use a
-				 * version stamp to detect whether or
-				 * not the directory has been modified
-				 * during the copy operation. */
-				unsigned long version = filp->f_version;
-				unsigned char d_type = DT_UNKNOWN;
+static inline void ufs_put_page(struct page *page)
+{
+	kunmap(page);
+	page_cache_release(page);
+}
 
-				UFSD(("filldir(%s,%u)\n", de->d_name,
-							fs32_to_cpu(sb, de->d_ino)))
-				UFSD(("namlen %u\n", ufs_get_de_namlen(sb, de)))
+static inline unsigned long ufs_dir_pages(struct inode *inode)
+{
+	return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
+}
 
-				if ((flags & UFS_DE_MASK) == UFS_DE_44BSD)
-					d_type = de->d_u.d_44.d_type;
-				error = filldir(dirent, de->d_name,
-						ufs_get_de_namlen(sb, de), filp->f_pos,
-						fs32_to_cpu(sb, de->d_ino), d_type);
-				if (error)
-					break;
-				if (version != filp->f_version)
-					goto revalidate;
-				stored ++;
-			}
-			filp->f_pos += fs16_to_cpu(sb, de->d_reclen);
-		}
-		offset = 0;
-		brelse (bh);
+ino_t ufs_inode_by_name(struct inode *dir, struct dentry *dentry)
+{
+	ino_t res = 0;
+	struct ufs_dir_entry *de;
+	struct page *page;
+	
+	de = ufs_find_entry(dir, dentry, &page);
+	if (de) {
+		res = fs32_to_cpu(dir->i_sb, de->d_ino);
+		ufs_put_page(page);
 	}
-	unlock_kernel();
-	return 0;
+	return res;
 }
 
-/*
- * define how far ahead to read directories while searching them.
- */
-#define NAMEI_RA_CHUNKS  2
-#define NAMEI_RA_BLOCKS  4
-#define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
-#define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
 
-/*
- *	ufs_find_entry()
- *
- * finds an entry in the specified directory with the wanted name. It
- * returns the cache buffer in which the entry was found, and the entry
- * itself (as a parameter - res_bh). It does NOT read the inode of the
- * entry - you'll have to do that yourself if you want to.
- */
-struct ufs_dir_entry * ufs_find_entry (struct dentry *dentry,
-	struct buffer_head ** res_bh)
+/* Releases the page */
+void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
+		  struct page *page, struct inode *inode)
 {
-	struct super_block * sb;
-	struct buffer_head * bh_use[NAMEI_RA_SIZE];
-	struct buffer_head * bh_read[NAMEI_RA_SIZE];
-	unsigned long offset;
-	int block, toread, i, err;
-	struct inode *dir = dentry->d_parent->d_inode;
-	const char *name = dentry->d_name.name;
-	int namelen = dentry->d_name.len;
+	unsigned from = (char *) de - (char *) page_address(page);
+	unsigned to = from + fs16_to_cpu(dir->i_sb, de->d_reclen);
+	int err;
 
-	UFSD(("ENTER, dir_ino %lu, name %s, namlen %u\n", dir->i_ino, name, namelen))
-	
-	*res_bh = NULL;
-	
-	sb = dir->i_sb;
-	
-	if (namelen > UFS_MAXNAMLEN)
-		return NULL;
+	lock_page(page);
+	err = page->mapping->a_ops->prepare_write(NULL, page, from, to);
+	BUG_ON(err);
+	de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino);
+	ufs_set_de_type(dir->i_sb, de, inode->i_mode);
+	err = ufs_commit_chunk(page, from, to);
+	ufs_put_page(page);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(dir);
+}
 
-	memset (bh_use, 0, sizeof (bh_use));
-	toread = 0;
-	for (block = 0; block < NAMEI_RA_SIZE; ++block) {
-		struct buffer_head * bh;
 
-		if ((block << sb->s_blocksize_bits) >= dir->i_size)
-			break;
-		bh = ufs_getfrag (dir, block, 0, &err);
-		bh_use[block] = bh;
-		if (bh && !buffer_uptodate(bh))
-			bh_read[toread++] = bh;
+static void ufs_check_page(struct page *page)
+{
+	struct inode *dir = page->mapping->host;
+	struct super_block *sb = dir->i_sb;
+	char *kaddr = page_address(page);
+	unsigned offs, rec_len;
+	unsigned limit = PAGE_CACHE_SIZE;
+	struct ufs_dir_entry *p;
+	char *error;
+
+	if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
+		limit = dir->i_size & ~PAGE_CACHE_MASK;
+		if (limit & (UFS_SECTOR_SIZE - 1))
+			goto Ebadsize;
+		if (!limit)
+			goto out;
 	}
+	for (offs = 0; offs <= limit - UFS_DIR_REC_LEN(1); offs += rec_len) {
+		p = (struct ufs_dir_entry *)(kaddr + offs);
+		rec_len = fs16_to_cpu(sb, p->d_reclen);
+
+		if (rec_len < UFS_DIR_REC_LEN(1))
+			goto Eshort;
+		if (rec_len & 3)
+			goto Ealign;
+		if (rec_len < UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, p)))
+			goto Enamelen;
+		if (((offs + rec_len - 1) ^ offs) & ~(UFS_SECTOR_SIZE-1))
+			goto Espan;
+		if (fs32_to_cpu(sb, p->d_ino) > (UFS_SB(sb)->s_uspi->s_ipg *
+						  UFS_SB(sb)->s_uspi->s_ncg))
+			goto Einumber;
+	}
+	if (offs != limit)
+		goto Eend;
+out:
+	SetPageChecked(page);
+	return;
+
+	/* Too bad, we had an error */
+
+Ebadsize:
+	ufs_error(sb, "ufs_check_page",
+		  "size of directory #%lu is not a multiple of chunk size",
+		  dir->i_ino
+	);
+	goto fail;
+Eshort:
+	error = "rec_len is smaller than minimal";
+	goto bad_entry;
+Ealign:
+	error = "unaligned directory entry";
+	goto bad_entry;
+Enamelen:
+	error = "rec_len is too small for name_len";
+	goto bad_entry;
+Espan:
+	error = "directory entry across blocks";
+	goto bad_entry;
+Einumber:
+	error = "inode out of bounds";
+bad_entry:
+	ufs_error (sb, "ufs_check_page", "bad entry in directory #%lu: %s - "
+		   "offset=%lu, rec_len=%d, name_len=%d",
+		   dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
+		   rec_len, ufs_get_de_namlen(sb, p));
+	goto fail;
+Eend:
+	p = (struct ufs_dir_entry *)(kaddr + offs);
+	ufs_error (sb, "ext2_check_page",
+		   "entry in directory #%lu spans the page boundary"
+		   "offset=%lu",
+		   dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs);
+fail:
+	SetPageChecked(page);
+	SetPageError(page);
+}
 
-	for (block = 0, offset = 0; offset < dir->i_size; block++) {
-		struct buffer_head * bh;
-		struct ufs_dir_entry * de;
-		char * dlimit;
-
-		if ((block % NAMEI_RA_BLOCKS) == 0 && toread) {
-			ll_rw_block (READ, toread, bh_read);
-			toread = 0;
-		}
-		bh = bh_use[block % NAMEI_RA_SIZE];
-		if (!bh) {
-			ufs_error (sb, "ufs_find_entry", 
-				"directory #%lu contains a hole at offset %lu",
-				dir->i_ino, offset);
-			offset += sb->s_blocksize;
-			continue;
-		}
-		wait_on_buffer (bh);
-		if (!buffer_uptodate(bh)) {
-			/*
-			 * read error: all bets are off
-			 */
-			break;
-		}
-
-		de = (struct ufs_dir_entry *) bh->b_data;
-		dlimit = bh->b_data + sb->s_blocksize;
-		while ((char *) de < dlimit && offset < dir->i_size) {
-			/* this code is executed quadratically often */
-			/* do minimal checking by hand */
-			int de_len;
-
-			if ((char *) de + namelen <= dlimit &&
-			    ufs_match(sb, namelen, name, de)) {
-				/* found a match -
-				just to be sure, do a full check */
-				if (!ufs_check_dir_entry("ufs_find_entry",
-				    dir, de, bh, offset))
-					goto failed;
-				for (i = 0; i < NAMEI_RA_SIZE; ++i) {
-					if (bh_use[i] != bh)
-						brelse (bh_use[i]);
-				}
-				*res_bh = bh;
-				return de;
-			}
-                        /* prevent looping on a bad block */
-			de_len = fs16_to_cpu(sb, de->d_reclen);
-			if (de_len <= 0)
-				goto failed;
-			offset += de_len;
-			de = (struct ufs_dir_entry *) ((char *) de + de_len);
-		}
-
-		brelse (bh);
-		if (((block + NAMEI_RA_SIZE) << sb->s_blocksize_bits ) >=
-		    dir->i_size)
-			bh = NULL;
-		else
-			bh = ufs_getfrag (dir, block + NAMEI_RA_SIZE, 0, &err);
-		bh_use[block % NAMEI_RA_SIZE] = bh;
-		if (bh && !buffer_uptodate(bh))
-			bh_read[toread++] = bh;
+static struct page *ufs_get_page(struct inode *dir, unsigned long n)
+{
+	struct address_space *mapping = dir->i_mapping;
+	struct page *page = read_cache_page(mapping, n,
+				(filler_t*)mapping->a_ops->readpage, NULL);
+	if (!IS_ERR(page)) {
+		wait_on_page_locked(page);
+		kmap(page);
+		if (!PageUptodate(page))
+			goto fail;
+		if (!PageChecked(page))
+			ufs_check_page(page);
+		if (PageError(page))
+			goto fail;
 	}
+	return page;
 
-failed:
-	for (i = 0; i < NAMEI_RA_SIZE; ++i) brelse (bh_use[i]);
-	UFSD(("EXIT\n"))
-	return NULL;
+fail:
+	ufs_put_page(page);
+	return ERR_PTR(-EIO);
 }
 
-static int
-ufs_check_dir_entry (const char *function, struct inode *dir,
-		     struct ufs_dir_entry *de, struct buffer_head *bh,
-		     unsigned long offset)
+/*
+ * Return the offset into page `page_nr' of the last valid
+ * byte in that page, plus one.
+ */
+static unsigned
+ufs_last_byte(struct inode *inode, unsigned long page_nr)
 {
-	struct super_block *sb = dir->i_sb;
-	const char *error_msg = NULL;
-	int rlen = fs16_to_cpu(sb, de->d_reclen);
-
-	if (rlen < UFS_DIR_REC_LEN(1))
-		error_msg = "reclen is smaller than minimal";
-	else if (rlen % 4 != 0)
-		error_msg = "reclen % 4 != 0";
-	else if (rlen < UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de)))
-		error_msg = "reclen is too small for namlen";
-	else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
-		error_msg = "directory entry across blocks";
-	else if (fs32_to_cpu(sb, de->d_ino) > (UFS_SB(sb)->s_uspi->s_ipg *
-				      UFS_SB(sb)->s_uspi->s_ncg))
-		error_msg = "inode out of bounds";
-
-	if (error_msg != NULL)
-		ufs_error (sb, function, "bad entry in directory #%lu, size %Lu: %s - "
-			    "offset=%lu, inode=%lu, reclen=%d, namlen=%d",
-			    dir->i_ino, dir->i_size, error_msg, offset,
-			    (unsigned long)fs32_to_cpu(sb, de->d_ino),
-			    rlen, ufs_get_de_namlen(sb, de));
-	
-	return (error_msg == NULL ? 1 : 0);
+	unsigned last_byte = inode->i_size;
+
+	last_byte -= page_nr << PAGE_CACHE_SHIFT;
+	if (last_byte > PAGE_CACHE_SIZE)
+		last_byte = PAGE_CACHE_SIZE;
+	return last_byte;
 }
 
-struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct buffer_head **p)
+static inline struct ufs_dir_entry *
+ufs_next_entry(struct super_block *sb, struct ufs_dir_entry *p)
 {
-	int err;
-	struct buffer_head *bh = ufs_bread (dir, 0, 0, &err);
-	struct ufs_dir_entry *res = NULL;
-
-	if (bh) {
-		res = (struct ufs_dir_entry *) bh->b_data;
-		res = (struct ufs_dir_entry *)((char *)res +
-			fs16_to_cpu(dir->i_sb, res->d_reclen));
-	}
-	*p = bh;
-	return res;
+	return (struct ufs_dir_entry *)((char *)p +
+					fs16_to_cpu(sb, p->d_reclen));
 }
-ino_t ufs_inode_by_name(struct inode * dir, struct dentry *dentry)
+
+struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct page **p)
 {
-	ino_t res = 0;
-	struct ufs_dir_entry * de;
-	struct buffer_head *bh;
+	struct page *page = ufs_get_page(dir, 0);
+	struct ufs_dir_entry *de = NULL;
 
-	de = ufs_find_entry (dentry, &bh);
-	if (de) {
-		res = fs32_to_cpu(dir->i_sb, de->d_ino);
-		brelse(bh);
+	if (!IS_ERR(page)) {
+		de = ufs_next_entry(dir->i_sb,
+				    (struct ufs_dir_entry *)page_address(page));
+		*p = page;
 	}
-	return res;
+	return de;
 }
 
-void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
-		struct buffer_head *bh, struct inode *inode)
+/*
+ *	ufs_find_entry()
+ *
+ * finds an entry in the specified directory with the wanted name. It
+ * returns the page in which the entry was found, and the entry itself
+ * (as a parameter - res_dir). Page is returned mapped and unlocked.
+ * Entry is guaranteed to be valid.
+ */
+struct ufs_dir_entry *ufs_find_entry(struct inode *dir, struct dentry *dentry,
+				     struct page **res_page)
 {
-	dir->i_version++;
-	de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino);
-	mark_buffer_dirty(bh);
-	if (IS_DIRSYNC(dir))
-		sync_dirty_buffer(bh);
-	brelse (bh);
+	struct super_block *sb = dir->i_sb;
+	const char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	unsigned reclen = UFS_DIR_REC_LEN(namelen);
+	unsigned long start, n;
+	unsigned long npages = ufs_dir_pages(dir);
+	struct page *page = NULL;
+	struct ufs_dir_entry *de;
+
+	UFSD(("ENTER, dir_ino %lu, name %s, namlen %u\n", dir->i_ino, name, namelen));
+
+	if (npages == 0 || namelen > UFS_MAXNAMLEN)
+		goto out;
+
+	/* OFFSET_CACHE */
+	*res_page = NULL;
+
+	/* start = ei->i_dir_start_lookup; */
+	start = 0;
+	if (start >= npages)
+		start = 0;
+	n = start;
+	do {
+		char *kaddr;
+		page = ufs_get_page(dir, n);
+		if (!IS_ERR(page)) {
+			kaddr = page_address(page);
+			de = (struct ufs_dir_entry *) kaddr;
+			kaddr += ufs_last_byte(dir, n) - reclen;
+			while ((char *) de <= kaddr) {
+				if (de->d_reclen == 0) {
+					ufs_error(dir->i_sb, __FUNCTION__,
+						  "zero-length directory entry");
+					ufs_put_page(page);
+					goto out;
+				}
+				if (ufs_match(sb, namelen, name, de))
+					goto found;
+				de = ufs_next_entry(sb, de);
+			}
+			ufs_put_page(page);
+		}
+		if (++n >= npages)
+			n = 0;
+	} while (n != start);
+out:
+	return NULL;
+
+found:
+	*res_page = page;
+	/* ei->i_dir_start_lookup = n; */
+	return de;
 }
 
 /*
- *	ufs_add_entry()
- *
- * adds a file entry to the specified directory, using the same
- * semantics as ufs_find_entry(). It returns NULL if it failed.
+ *	Parent is locked.
  */
 int ufs_add_link(struct dentry *dentry, struct inode *inode)
 {
-	struct super_block * sb;
-	struct ufs_sb_private_info * uspi;
-	unsigned long offset;
-	unsigned fragoff;
-	unsigned short rec_len;
-	struct buffer_head * bh;
-	struct ufs_dir_entry * de, * de1;
 	struct inode *dir = dentry->d_parent->d_inode;
 	const char *name = dentry->d_name.name;
 	int namelen = dentry->d_name.len;
+	struct super_block *sb = dir->i_sb;
+	unsigned reclen = UFS_DIR_REC_LEN(namelen);
+	unsigned short rec_len, name_len;
+	struct page *page = NULL;
+	struct ufs_dir_entry *de;
+	unsigned long npages = ufs_dir_pages(dir);
+	unsigned long n;
+	char *kaddr;
+	unsigned from, to;
 	int err;
 
-	UFSD(("ENTER, name %s, namelen %u\n", name, namelen))
-	
-	sb = dir->i_sb;
-	uspi = UFS_SB(sb)->s_uspi;
-
-	if (!namelen)
-		return -EINVAL;
-	bh = ufs_bread (dir, 0, 0, &err);
-	if (!bh)
-		return err;
-	rec_len = UFS_DIR_REC_LEN(namelen);
-	offset = 0;
-	de = (struct ufs_dir_entry *) bh->b_data;
-	while (1) {
-		if ((char *)de >= UFS_SECTOR_SIZE + bh->b_data) {
-			fragoff = offset & ~uspi->s_fmask;
-			if (fragoff != 0 && fragoff != UFS_SECTOR_SIZE)
-				ufs_error (sb, "ufs_add_entry", "internal error"
-					" fragoff %u", fragoff);
-			if (!fragoff) {
-				brelse (bh);
-				bh = ufs_bread (dir, offset >> sb->s_blocksize_bits, 1, &err);
-				if (!bh)
-					return err;
-			}
-			if (dir->i_size <= offset) {
-				if (dir->i_size == 0) {
-					brelse(bh);
-					return -ENOENT;
-				}
-				de = (struct ufs_dir_entry *) (bh->b_data + fragoff);
-				de->d_ino = 0;
+	UFSD(("ENTER, name %s, namelen %u\n", name, namelen));
+
+	/*
+	 * We take care of directory expansion in the same loop.
+	 * This code plays outside i_size, so it locks the page
+	 * to protect that region.
+	 */
+	for (n = 0; n <= npages; n++) {
+		char *dir_end;
+
+		page = ufs_get_page(dir, n);
+		err = PTR_ERR(page);
+		if (IS_ERR(page))
+			goto out;
+		lock_page(page);
+		kaddr = page_address(page);
+		dir_end = kaddr + ufs_last_byte(dir, n);
+		de = (struct ufs_dir_entry *)kaddr;
+		kaddr += PAGE_CACHE_SIZE - reclen;
+		while ((char *)de <= kaddr) {
+			if ((char *)de == dir_end) {
+				/* We hit i_size */
+				name_len = 0;
+				rec_len = UFS_SECTOR_SIZE;
 				de->d_reclen = cpu_to_fs16(sb, UFS_SECTOR_SIZE);
-				ufs_set_de_namlen(sb, de, 0);
-				dir->i_size = offset + UFS_SECTOR_SIZE;
-				mark_inode_dirty(dir);
-			} else {
-				de = (struct ufs_dir_entry *) bh->b_data;
+				de->d_ino = 0;
+				goto got_it;
 			}
+			if (de->d_reclen == 0) {
+				ufs_error(dir->i_sb, __FUNCTION__,
+					  "zero-length directory entry");
+				err = -EIO;
+				goto out_unlock;
+			}
+			err = -EEXIST;
+			if (ufs_match(sb, namelen, name, de))
+				goto out_unlock;
+			name_len = UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de));
+			rec_len = fs16_to_cpu(sb, de->d_reclen);
+			if (!de->d_ino && rec_len >= reclen)
+				goto got_it;
+			if (rec_len >= name_len + reclen)
+				goto got_it;
+			de = (struct ufs_dir_entry *) ((char *) de + rec_len);
 		}
-		if (!ufs_check_dir_entry ("ufs_add_entry", dir, de, bh, offset)) {
-			brelse (bh);
-			return -ENOENT;
-		}
-		if (ufs_match(sb, namelen, name, de)) {
-			brelse (bh);
-			return -EEXIST;
-		}
-		if (de->d_ino == 0 && fs16_to_cpu(sb, de->d_reclen) >= rec_len)
-			break;
-			
-		if (fs16_to_cpu(sb, de->d_reclen) >=
-		     UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de)) + rec_len)
-			break;
-		offset += fs16_to_cpu(sb, de->d_reclen);
-		de = (struct ufs_dir_entry *) ((char *) de + fs16_to_cpu(sb, de->d_reclen));
+		unlock_page(page);
+		ufs_put_page(page);
 	}
-
+	BUG();
+	return -EINVAL;
+
+got_it:
+	from = (char*)de - (char*)page_address(page);
+	to = from + rec_len;
+	err = page->mapping->a_ops->prepare_write(NULL, page, from, to);
+	if (err)
+		goto out_unlock;
 	if (de->d_ino) {
-		de1 = (struct ufs_dir_entry *) ((char *) de +
-			UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de)));
-		de1->d_reclen =
-			cpu_to_fs16(sb, fs16_to_cpu(sb, de->d_reclen) -
-				UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de)));
-		de->d_reclen =
-			cpu_to_fs16(sb, UFS_DIR_REC_LEN(ufs_get_de_namlen(sb, de)));
+		struct ufs_dir_entry *de1 =
+			(struct ufs_dir_entry *) ((char *) de + name_len);
+		de1->d_reclen = cpu_to_fs16(sb, rec_len - name_len);
+		de->d_reclen = cpu_to_fs16(sb, name_len);
+
 		de = de1;
 	}
-	de->d_ino = 0;
+
 	ufs_set_de_namlen(sb, de, namelen);
-	memcpy (de->d_name, name, namelen + 1);
+	memcpy(de->d_name, name, namelen + 1);
 	de->d_ino = cpu_to_fs32(sb, inode->i_ino);
 	ufs_set_de_type(sb, de, inode->i_mode);
-	mark_buffer_dirty(bh);
-	if (IS_DIRSYNC(dir))
-		sync_dirty_buffer(bh);
-	brelse (bh);
+
+	err = ufs_commit_chunk(page, from, to);
 	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
-	dir->i_version++;
+
 	mark_inode_dirty(dir);
+	/* OFFSET_CACHE */
+out_put:
+	ufs_put_page(page);
+out:
+	return err;
+out_unlock:
+	unlock_page(page);
+	goto out_put;
+}
+
+static inline unsigned
+ufs_validate_entry(struct super_block *sb, char *base,
+		   unsigned offset, unsigned mask)
+{
+	struct ufs_dir_entry *de = (struct ufs_dir_entry*)(base + offset);
+	struct ufs_dir_entry *p = (struct ufs_dir_entry*)(base + (offset&mask));
+	while ((char*)p < (char*)de) {
+		if (p->d_reclen == 0)
+			break;
+		p = ufs_next_entry(sb, p);
+	}
+	return (char *)p - base;
+}
 
-	UFSD(("EXIT\n"))
+
+/*
+ * This is blatantly stolen from ext2fs
+ */
+static int
+ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	loff_t pos = filp->f_pos;
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	unsigned int offset = pos & ~PAGE_CACHE_MASK;
+	unsigned long n = pos >> PAGE_CACHE_SHIFT;
+	unsigned long npages = ufs_dir_pages(inode);
+	unsigned chunk_mask = ~(UFS_SECTOR_SIZE - 1);
+	int need_revalidate = filp->f_version != inode->i_version;
+	unsigned flags = UFS_SB(sb)->s_flags;
+
+	UFSD(("BEGIN"));
+
+	if (pos > inode->i_size - UFS_DIR_REC_LEN(1))
+		return 0;
+
+	for ( ; n < npages; n++, offset = 0) {
+		char *kaddr, *limit;
+		struct ufs_dir_entry *de;
+
+		struct page *page = ufs_get_page(inode, n);
+
+		if (IS_ERR(page)) {
+			ufs_error(sb, __FUNCTION__,
+				  "bad page in #%lu",
+				  inode->i_ino);
+			filp->f_pos += PAGE_CACHE_SIZE - offset;
+			return -EIO;
+		}
+		kaddr = page_address(page);
+		if (unlikely(need_revalidate)) {
+			if (offset) {
+				offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask);
+				filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
+			}
+			filp->f_version = inode->i_version;
+			need_revalidate = 0;
+		}
+		de = (struct ufs_dir_entry *)(kaddr+offset);
+		limit = kaddr + ufs_last_byte(inode, n) - UFS_DIR_REC_LEN(1);
+		for ( ;(char*)de <= limit; de = ufs_next_entry(sb, de)) {
+			if (de->d_reclen == 0) {
+				ufs_error(sb, __FUNCTION__,
+					"zero-length directory entry");
+				ufs_put_page(page);
+				return -EIO;
+			}
+			if (de->d_ino) {
+				int over;
+				unsigned char d_type = DT_UNKNOWN;
+
+				offset = (char *)de - kaddr;
+
+				UFSD(("filldir(%s,%u)\n", de->d_name,
+				      fs32_to_cpu(sb, de->d_ino)));
+				UFSD(("namlen %u\n", ufs_get_de_namlen(sb, de)));
+
+				if ((flags & UFS_DE_MASK) == UFS_DE_44BSD)
+					d_type = de->d_u.d_44.d_type;
+
+				over = filldir(dirent, de->d_name,
+					       ufs_get_de_namlen(sb, de),
+						(n<<PAGE_CACHE_SHIFT) | offset,
+					       fs32_to_cpu(sb, de->d_ino), d_type);
+				if (over) {
+					ufs_put_page(page);
+					return 0;
+				}
+			}
+			filp->f_pos += fs16_to_cpu(sb, de->d_reclen);
+		}
+		ufs_put_page(page);
+	}
 	return 0;
 }
 
+
 /*
  * ufs_delete_entry deletes a directory entry by merging it with the
  * previous entry.
  */
-int ufs_delete_entry (struct inode * inode, struct ufs_dir_entry * dir,
-	struct buffer_head * bh )
-	
+int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
+		     struct page * page)
 {
-	struct super_block * sb;
-	struct ufs_dir_entry * de, * pde;
-	unsigned i;
-	
-	UFSD(("ENTER\n"))
+	struct super_block *sb = inode->i_sb;
+	struct address_space *mapping = page->mapping;
+	char *kaddr = page_address(page);
+	unsigned from = ((char*)dir - kaddr) & ~(UFS_SECTOR_SIZE - 1);
+	unsigned to = ((char*)dir - kaddr) + fs16_to_cpu(sb, dir->d_reclen);
+	struct ufs_dir_entry *pde = NULL;
+	struct ufs_dir_entry *de = (struct ufs_dir_entry *) (kaddr + from);
+	int err;
 
-	sb = inode->i_sb;
-	i = 0;
-	pde = NULL;
-	de = (struct ufs_dir_entry *) bh->b_data;
-	
-	UFSD(("ino %u, reclen %u, namlen %u, name %s\n",
-		fs32_to_cpu(sb, de->d_ino),
-		fs16_to_cpu(sb, de->d_reclen),
-		ufs_get_de_namlen(sb, de), de->d_name))
+	UFSD(("ENTER\n"));
 
-	while (i < bh->b_size) {
-		if (!ufs_check_dir_entry ("ufs_delete_entry", inode, de, bh, i)) {
-			brelse(bh);
-			return -EIO;
-		}
-		if (de == dir)  {
-			if (pde)
-				fs16_add(sb, &pde->d_reclen,
-					fs16_to_cpu(sb, dir->d_reclen));
-			dir->d_ino = 0;
-			inode->i_version++;
-			inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
-			mark_inode_dirty(inode);
-			mark_buffer_dirty(bh);
-			if (IS_DIRSYNC(inode))
-				sync_dirty_buffer(bh);
-			brelse(bh);
-			UFSD(("EXIT\n"))
-			return 0;
+	UFSD(("ino %u, reclen %u, namlen %u, name %s\n",
+	      fs32_to_cpu(sb, de->d_ino),
+	      fs16_to_cpu(sb, de->d_reclen),
+	      ufs_get_de_namlen(sb, de), de->d_name));
+
+	while ((char*)de < (char*)dir) {
+		if (de->d_reclen == 0) {
+			ufs_error(inode->i_sb, __FUNCTION__,
+				  "zero-length directory entry");
+			err = -EIO;
+			goto out;
 		}
-		i += fs16_to_cpu(sb, de->d_reclen);
-		if (i == UFS_SECTOR_SIZE) pde = NULL;
-		else pde = de;
-		de = (struct ufs_dir_entry *)
-		    ((char *) de + fs16_to_cpu(sb, de->d_reclen));
-		if (i == UFS_SECTOR_SIZE && de->d_reclen == 0)
-			break;
+		pde = de;
+		de = ufs_next_entry(sb, de);
 	}
-	UFSD(("EXIT\n"))
-	brelse(bh);
-	return -ENOENT;
+	if (pde)
+		from = (char*)pde - (char*)page_address(page);
+	lock_page(page);
+	err = mapping->a_ops->prepare_write(NULL, page, from, to);
+	BUG_ON(err);
+	if (pde)
+		pde->d_reclen = cpu_to_fs16(sb, to-from);
+	dir->d_ino = 0;
+	err = ufs_commit_chunk(page, from, to);
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
+	mark_inode_dirty(inode);
+out:
+	ufs_put_page(page);
+	UFSD(("EXIT\n"));
+	return err;
 }
 
 int ufs_make_empty(struct inode * inode, struct inode *dir)
 {
 	struct super_block * sb = dir->i_sb;
-	struct buffer_head * dir_block;
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page = grab_cache_page(mapping, 0);
 	struct ufs_dir_entry * de;
+	char *base;
 	int err;
 
-	dir_block = ufs_bread (inode, 0, 1, &err);
-	if (!dir_block)
-		return err;
+	if (!page)
+		return -ENOMEM;
+	kmap(page);
+	err = mapping->a_ops->prepare_write(NULL, page, 0, UFS_SECTOR_SIZE);
+	if (err) {
+		unlock_page(page);
+		goto fail;
+	}
+
+
+	base = (char*)page_address(page);
+	memset(base, 0, PAGE_CACHE_SIZE);
+
+	de = (struct ufs_dir_entry *) base;
 
-	inode->i_blocks = sb->s_blocksize / UFS_SECTOR_SIZE;
-	de = (struct ufs_dir_entry *) dir_block->b_data;
 	de->d_ino = cpu_to_fs32(sb, inode->i_ino);
 	ufs_set_de_type(sb, de, inode->i_mode);
 	ufs_set_de_namlen(sb, de, 1);
@@ -552,72 +594,65 @@ int ufs_make_empty(struct inode * inode, struct inode *dir)
 	de->d_reclen = cpu_to_fs16(sb, UFS_SECTOR_SIZE - UFS_DIR_REC_LEN(1));
 	ufs_set_de_namlen(sb, de, 2);
 	strcpy (de->d_name, "..");
-	mark_buffer_dirty(dir_block);
-	brelse (dir_block);
-	mark_inode_dirty(inode);
-	return 0;
+
+	err = ufs_commit_chunk(page, 0, UFS_SECTOR_SIZE);
+fail:
+	kunmap(page);
+	page_cache_release(page);
+	return err;
 }
 
 /*
  * routine to check that the specified directory is empty (for rmdir)
  */
-int ufs_empty_dir (struct inode * inode)
+int ufs_empty_dir(struct inode * inode)
 {
-	struct super_block * sb;
-	unsigned long offset;
-	struct buffer_head * bh;
-	struct ufs_dir_entry * de, * de1;
-	int err;
-	
-	sb = inode->i_sb;
-
-	if (inode->i_size < UFS_DIR_REC_LEN(1) + UFS_DIR_REC_LEN(2) ||
-	    !(bh = ufs_bread (inode, 0, 0, &err))) {
-	    	ufs_warning (inode->i_sb, "empty_dir",
-			      "bad directory (dir #%lu) - no data block",
-			      inode->i_ino);
-		return 1;
-	}
-	de = (struct ufs_dir_entry *) bh->b_data;
-	de1 = (struct ufs_dir_entry *)
-		((char *)de + fs16_to_cpu(sb, de->d_reclen));
-	if (fs32_to_cpu(sb, de->d_ino) != inode->i_ino || de1->d_ino == 0 ||
-	     strcmp (".", de->d_name) || strcmp ("..", de1->d_name)) {
-	    	ufs_warning (inode->i_sb, "empty_dir",
-			      "bad directory (dir #%lu) - no `.' or `..'",
-			      inode->i_ino);
-		return 1;
-	}
-	offset = fs16_to_cpu(sb, de->d_reclen) + fs16_to_cpu(sb, de1->d_reclen);
-	de = (struct ufs_dir_entry *)
-		((char *)de1 + fs16_to_cpu(sb, de1->d_reclen));
-	while (offset < inode->i_size ) {
-		if (!bh || (void *) de >= (void *) (bh->b_data + sb->s_blocksize)) {
-			brelse (bh);
-			bh = ufs_bread (inode, offset >> sb->s_blocksize_bits, 1, &err);
-	 		if (!bh) {
-				ufs_error (sb, "empty_dir",
-					    "directory #%lu contains a hole at offset %lu",
-					    inode->i_ino, offset);
-				offset += sb->s_blocksize;
-				continue;
+	struct super_block *sb = inode->i_sb;
+	struct page *page = NULL;
+	unsigned long i, npages = ufs_dir_pages(inode);
+
+	for (i = 0; i < npages; i++) {
+		char *kaddr;
+		struct ufs_dir_entry *de;
+		page = ufs_get_page(inode, i);
+
+		if (IS_ERR(page))
+			continue;
+
+		kaddr = page_address(page);
+		de = (struct ufs_dir_entry *)kaddr;
+		kaddr += ufs_last_byte(inode, i) - UFS_DIR_REC_LEN(1);
+
+		while ((char *)de <= kaddr) {
+			if (de->d_reclen == 0) {
+				ufs_error(inode->i_sb, __FUNCTION__,
+					"zero-length directory entry: "
+					"kaddr=%p, de=%p\n", kaddr, de);
+				goto not_empty;
 			}
-			de = (struct ufs_dir_entry *) bh->b_data;
-		}
-		if (!ufs_check_dir_entry ("empty_dir", inode, de, bh, offset)) {
-			brelse (bh);
-			return 1;
-		}
-		if (de->d_ino) {
-			brelse (bh);
-			return 0;
+			if (de->d_ino) {
+				u16 namelen=ufs_get_de_namlen(sb, de);
+				/* check for . and .. */
+				if (de->d_name[0] != '.')
+					goto not_empty;
+				if (namelen > 2)
+					goto not_empty;
+				if (namelen < 2) {
+					if (inode->i_ino !=
+					    fs32_to_cpu(sb, de->d_ino))
+						goto not_empty;
+				} else if (de->d_name[1] != '.')
+					goto not_empty;
+			}
+			de = ufs_next_entry(sb, de);
 		}
-		offset += fs16_to_cpu(sb, de->d_reclen);
-		de = (struct ufs_dir_entry *)
-			((char *)de + fs16_to_cpu(sb, de->d_reclen));
+		ufs_put_page(page);
 	}
-	brelse (bh);
 	return 1;
+
+not_empty:
+	ufs_put_page(page);
+	return 0;
 }
 
 const struct file_operations ufs_dir_operations = {
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 51f70270030..364bb92b091 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -1,6 +1,9 @@
 /*
  * linux/fs/ufs/namei.c
  *
+ * Migration to usage of "page cache" on May 2006 by
+ * Evgeniy Dushistov <dushistov@mail.ru> based on ext2 code base.
+ *
  * Copyright (C) 1998
  * Daniel Pirkl <daniel.pirkl@email.cz>
  * Charles University, Faculty of Mathematics and Physics
@@ -28,7 +31,6 @@
 #include <linux/fs.h>
 #include <linux/ufs_fs.h>
 #include <linux/smp_lock.h>
-#include <linux/buffer_head.h>
 #include "swab.h"	/* will go away - see comment in mknod() */
 #include "util.h"
 
@@ -232,19 +234,18 @@ out_dir:
 	goto out;
 }
 
-static int ufs_unlink(struct inode * dir, struct dentry *dentry)
+static int ufs_unlink(struct inode *dir, struct dentry *dentry)
 {
 	struct inode * inode = dentry->d_inode;
-	struct buffer_head * bh;
-	struct ufs_dir_entry * de;
+	struct ufs_dir_entry *de;
+	struct page *page;
 	int err = -ENOENT;
 
-	lock_kernel();
-	de = ufs_find_entry (dentry, &bh);
+	de = ufs_find_entry(dir, dentry, &page);
 	if (!de)
 		goto out;
 
-	err = ufs_delete_entry (dir, de, bh);
+	err = ufs_delete_entry(dir, de, page);
 	if (err)
 		goto out;
 
@@ -252,7 +253,6 @@ static int ufs_unlink(struct inode * dir, struct dentry *dentry)
 	inode_dec_link_count(inode);
 	err = 0;
 out:
-	unlock_kernel();
 	return err;
 }
 
@@ -274,42 +274,42 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
 	return err;
 }
 
-static int ufs_rename (struct inode * old_dir, struct dentry * old_dentry,
-	struct inode * new_dir,	struct dentry * new_dentry )
+static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
+		      struct inode *new_dir, struct dentry *new_dentry)
 {
 	struct inode *old_inode = old_dentry->d_inode;
 	struct inode *new_inode = new_dentry->d_inode;
-	struct buffer_head *dir_bh = NULL;
-	struct ufs_dir_entry *dir_de = NULL;
-	struct buffer_head *old_bh;
+	struct page *dir_page = NULL;
+	struct ufs_dir_entry * dir_de = NULL;
+	struct page *old_page;
 	struct ufs_dir_entry *old_de;
 	int err = -ENOENT;
 
-	lock_kernel();
-	old_de = ufs_find_entry (old_dentry, &old_bh);
+	old_de = ufs_find_entry(old_dir, old_dentry, &old_page);
 	if (!old_de)
 		goto out;
 
 	if (S_ISDIR(old_inode->i_mode)) {
 		err = -EIO;
-		dir_de = ufs_dotdot(old_inode, &dir_bh);
+		dir_de = ufs_dotdot(old_inode, &dir_page);
 		if (!dir_de)
 			goto out_old;
 	}
 
 	if (new_inode) {
-		struct buffer_head *new_bh;
+		struct page *new_page;
 		struct ufs_dir_entry *new_de;
 
 		err = -ENOTEMPTY;
-		if (dir_de && !ufs_empty_dir (new_inode))
+		if (dir_de && !ufs_empty_dir(new_inode))
 			goto out_dir;
+
 		err = -ENOENT;
-		new_de = ufs_find_entry (new_dentry, &new_bh);
+		new_de = ufs_find_entry(new_dir, new_dentry, &new_page);
 		if (!new_de)
 			goto out_dir;
 		inode_inc_link_count(old_inode);
-		ufs_set_link(new_dir, new_de, new_bh, old_inode);
+		ufs_set_link(new_dir, new_de, new_page, old_inode);
 		new_inode->i_ctime = CURRENT_TIME_SEC;
 		if (dir_de)
 			new_inode->i_nlink--;
@@ -330,24 +330,32 @@ static int ufs_rename (struct inode * old_dir, struct dentry * old_dentry,
 			inode_inc_link_count(new_dir);
 	}
 
-	ufs_delete_entry (old_dir, old_de, old_bh);
+	/*
+	 * Like most other Unix systems, set the ctime for inodes on a
+ 	 * rename.
+	 * inode_dec_link_count() will mark the inode dirty.
+	 */
+	old_inode->i_ctime = CURRENT_TIME_SEC;
 
+	ufs_delete_entry(old_dir, old_de, old_page);
 	inode_dec_link_count(old_inode);
 
 	if (dir_de) {
-		ufs_set_link(old_inode, dir_de, dir_bh, new_dir);
+		ufs_set_link(old_inode, dir_de, dir_page, new_dir);
 		inode_dec_link_count(old_dir);
 	}
-	unlock_kernel();
 	return 0;
 
+
 out_dir:
-	if (dir_de)
-		brelse(dir_bh);
+	if (dir_de) {
+		kunmap(dir_page);
+		page_cache_release(dir_page);
+	}
 out_old:
-	brelse (old_bh);
+	kunmap(old_page);
+	page_cache_release(old_page);
 out:
-	unlock_kernel();
 	return err;
 }
 
-- 
cgit v1.2.3


From 9695ef16ed4e00b59303f39f9a4a422a2c6a3b89 Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Sun, 25 Jun 2006 05:47:22 -0700
Subject: [PATCH] ufs: wrong type cast

There are two ugly macros in ufs code:
#define UCPI_UBH ((struct ufs_buffer_head *)ucpi)
#define USPI_UBH ((struct ufs_buffer_head *)uspi)
when uspi looks like
struct {
struct ufs_buffer_head ;
}
and USPI_UBH has some sence,
ucpi looks like
struct {
struct not_ufs_buffer_head;
}

To prevent bugs in future, this patch convert macros to inline function and
fix "ucpi" structure.

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ufs/balloc.c   | 84 +++++++++++++++++++++++++++----------------------------
 fs/ufs/cylinder.c | 18 ++++++------
 fs/ufs/ialloc.c   | 28 +++++++++----------
 fs/ufs/super.c    |  8 +++---
 fs/ufs/util.c     | 20 ++++++-------
 fs/ufs/util.h     | 16 +++++++----
 6 files changed, 90 insertions(+), 84 deletions(-)

(limited to 'fs')

diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 06f970d02e3..68de1312e4b 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -69,7 +69,7 @@ void ufs_free_fragments(struct inode *inode, unsigned fragment, unsigned count)
 	ucpi = ufs_load_cylinder (sb, cgno);
 	if (!ucpi) 
 		goto failed;
-	ucg = ubh_get_ucg (UCPI_UBH);
+	ucg = ubh_get_ucg (UCPI_UBH(ucpi));
 	if (!ufs_cg_chkmagic(sb, ucg)) {
 		ufs_panic (sb, "ufs_free_fragments", "internal error, bad magic number on cg %u", cgno);
 		goto failed;
@@ -77,11 +77,11 @@ void ufs_free_fragments(struct inode *inode, unsigned fragment, unsigned count)
 
 	end_bit = bit + count;
 	bbase = ufs_blknum (bit);
-	blkmap = ubh_blkmap (UCPI_UBH, ucpi->c_freeoff, bbase);
+	blkmap = ubh_blkmap (UCPI_UBH(ucpi), ucpi->c_freeoff, bbase);
 	ufs_fragacct (sb, blkmap, ucg->cg_frsum, -1);
 	for (i = bit; i < end_bit; i++) {
-		if (ubh_isclr (UCPI_UBH, ucpi->c_freeoff, i))
-			ubh_setbit (UCPI_UBH, ucpi->c_freeoff, i);
+		if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, i))
+			ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, i);
 		else 
 			ufs_error (sb, "ufs_free_fragments",
 				   "bit already cleared for fragment %u", i);
@@ -93,14 +93,14 @@ void ufs_free_fragments(struct inode *inode, unsigned fragment, unsigned count)
 	fs32_add(sb, &ucg->cg_cs.cs_nffree, count);
 	fs32_add(sb, &usb1->fs_cstotal.cs_nffree, count);
 	fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
-	blkmap = ubh_blkmap (UCPI_UBH, ucpi->c_freeoff, bbase);
+	blkmap = ubh_blkmap (UCPI_UBH(ucpi), ucpi->c_freeoff, bbase);
 	ufs_fragacct(sb, blkmap, ucg->cg_frsum, 1);
 
 	/*
 	 * Trying to reassemble free fragments into block
 	 */
 	blkno = ufs_fragstoblks (bbase);
-	if (ubh_isblockset(UCPI_UBH, ucpi->c_freeoff, blkno)) {
+	if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) {
 		fs32_sub(sb, &ucg->cg_cs.cs_nffree, uspi->s_fpb);
 		fs32_sub(sb, &usb1->fs_cstotal.cs_nffree, uspi->s_fpb);
 		fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, uspi->s_fpb);
@@ -114,11 +114,11 @@ void ufs_free_fragments(struct inode *inode, unsigned fragment, unsigned count)
 		fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1);
 	}
 	
-	ubh_mark_buffer_dirty (USPI_UBH);
-	ubh_mark_buffer_dirty (UCPI_UBH);
+	ubh_mark_buffer_dirty (USPI_UBH(uspi));
+	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
 	if (sb->s_flags & MS_SYNCHRONOUS) {
 		ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi);
-		ubh_wait_on_buffer (UCPI_UBH);
+		ubh_wait_on_buffer (UCPI_UBH(ucpi));
 	}
 	sb->s_dirt = 1;
 	
@@ -176,7 +176,7 @@ do_more:
 	ucpi = ufs_load_cylinder (sb, cgno);
 	if (!ucpi) 
 		goto failed;
-	ucg = ubh_get_ucg (UCPI_UBH);
+	ucg = ubh_get_ucg (UCPI_UBH(ucpi));
 	if (!ufs_cg_chkmagic(sb, ucg)) {
 		ufs_panic (sb, "ufs_free_blocks", "internal error, bad magic number on cg %u", cgno);
 		goto failed;
@@ -184,10 +184,10 @@ do_more:
 
 	for (i = bit; i < end_bit; i += uspi->s_fpb) {
 		blkno = ufs_fragstoblks(i);
-		if (ubh_isblockset(UCPI_UBH, ucpi->c_freeoff, blkno)) {
+		if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) {
 			ufs_error(sb, "ufs_free_blocks", "freeing free fragment");
 		}
-		ubh_setblock(UCPI_UBH, ucpi->c_freeoff, blkno);
+		ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
 		if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
 			ufs_clusteracct (sb, ucpi, blkno, 1);
 		DQUOT_FREE_BLOCK(inode, uspi->s_fpb);
@@ -200,11 +200,11 @@ do_more:
 		fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1);
 	}
 
-	ubh_mark_buffer_dirty (USPI_UBH);
-	ubh_mark_buffer_dirty (UCPI_UBH);
+	ubh_mark_buffer_dirty (USPI_UBH(uspi));
+	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
 	if (sb->s_flags & MS_SYNCHRONOUS) {
 		ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi);
-		ubh_wait_on_buffer (UCPI_UBH);
+		ubh_wait_on_buffer (UCPI_UBH(ucpi));
 	}
 
 	if (overflow) {
@@ -493,7 +493,7 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
 	ucpi = ufs_load_cylinder (sb, cgno);
 	if (!ucpi)
 		return 0;
-	ucg = ubh_get_ucg (UCPI_UBH);
+	ucg = ubh_get_ucg (UCPI_UBH(ucpi));
 	if (!ufs_cg_chkmagic(sb, ucg)) {
 		ufs_panic (sb, "ufs_add_fragments",
 			"internal error, bad magic number on cg %u", cgno);
@@ -503,14 +503,14 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
 	fragno = ufs_dtogd (fragment);
 	fragoff = ufs_fragnum (fragno);
 	for (i = oldcount; i < newcount; i++)
-		if (ubh_isclr (UCPI_UBH, ucpi->c_freeoff, fragno + i))
+		if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i))
 			return 0;
 	/*
 	 * Block can be extended
 	 */
 	ucg->cg_time = cpu_to_fs32(sb, get_seconds());
 	for (i = newcount; i < (uspi->s_fpb - fragoff); i++)
-		if (ubh_isclr (UCPI_UBH, ucpi->c_freeoff, fragno + i))
+		if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i))
 			break;
 	fragsize = i - oldcount;
 	if (!fs32_to_cpu(sb, ucg->cg_frsum[fragsize]))
@@ -520,7 +520,7 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
 	if (fragsize != count)
 		fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1);
 	for (i = oldcount; i < newcount; i++)
-		ubh_clrbit (UCPI_UBH, ucpi->c_freeoff, fragno + i);
+		ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i);
 	if(DQUOT_ALLOC_BLOCK(inode, count)) {
 		*err = -EDQUOT;
 		return 0;
@@ -530,11 +530,11 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
 	fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
 	fs32_sub(sb, &usb1->fs_cstotal.cs_nffree, count);
 	
-	ubh_mark_buffer_dirty (USPI_UBH);
-	ubh_mark_buffer_dirty (UCPI_UBH);
+	ubh_mark_buffer_dirty (USPI_UBH(uspi));
+	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
 	if (sb->s_flags & MS_SYNCHRONOUS) {
 		ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi);
-		ubh_wait_on_buffer (UCPI_UBH);
+		ubh_wait_on_buffer (UCPI_UBH(ucpi));
 	}
 	sb->s_dirt = 1;
 
@@ -602,7 +602,7 @@ cg_found:
 	ucpi = ufs_load_cylinder (sb, cgno);
 	if (!ucpi)
 		return 0;
-	ucg = ubh_get_ucg (UCPI_UBH);
+	ucg = ubh_get_ucg (UCPI_UBH(ucpi));
 	if (!ufs_cg_chkmagic(sb, ucg)) 
 		ufs_panic (sb, "ufs_alloc_fragments",
 			"internal error, bad magic number on cg %u", cgno);
@@ -625,7 +625,7 @@ cg_found:
 			return 0;
 		goal = ufs_dtogd (result);
 		for (i = count; i < uspi->s_fpb; i++)
-			ubh_setbit (UCPI_UBH, ucpi->c_freeoff, goal + i);
+			ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i);
 		i = uspi->s_fpb - count;
 		DQUOT_FREE_BLOCK(inode, i);
 
@@ -644,7 +644,7 @@ cg_found:
 		return 0;
 	}
 	for (i = 0; i < count; i++)
-		ubh_clrbit (UCPI_UBH, ucpi->c_freeoff, result + i);
+		ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i);
 	
 	fs32_sub(sb, &ucg->cg_cs.cs_nffree, count);
 	fs32_sub(sb, &usb1->fs_cstotal.cs_nffree, count);
@@ -655,11 +655,11 @@ cg_found:
 		fs32_add(sb, &ucg->cg_frsum[allocsize - count], 1);
 
 succed:
-	ubh_mark_buffer_dirty (USPI_UBH);
-	ubh_mark_buffer_dirty (UCPI_UBH);
+	ubh_mark_buffer_dirty (USPI_UBH(uspi));
+	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
 	if (sb->s_flags & MS_SYNCHRONOUS) {
 		ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi);
-		ubh_wait_on_buffer (UCPI_UBH);
+		ubh_wait_on_buffer (UCPI_UBH(ucpi));
 	}
 	sb->s_dirt = 1;
 
@@ -682,7 +682,7 @@ static unsigned ufs_alloccg_block (struct inode * inode,
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
 	usb1 = ubh_get_usb_first(uspi);
-	ucg = ubh_get_ucg(UCPI_UBH);
+	ucg = ubh_get_ucg(UCPI_UBH(ucpi));
 
 	if (goal == 0) {
 		goal = ucpi->c_rotor;
@@ -694,7 +694,7 @@ static unsigned ufs_alloccg_block (struct inode * inode,
 	/*
 	 * If the requested block is available, use it.
 	 */
-	if (ubh_isblockset(UCPI_UBH, ucpi->c_freeoff, ufs_fragstoblks(goal))) {
+	if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, ufs_fragstoblks(goal))) {
 		result = goal;
 		goto gotit;
 	}
@@ -706,7 +706,7 @@ norot:
 	ucpi->c_rotor = result;
 gotit:
 	blkno = ufs_fragstoblks(result);
-	ubh_clrblock (UCPI_UBH, ucpi->c_freeoff, blkno);
+	ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
 	if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
 		ufs_clusteracct (sb, ucpi, blkno, -1);
 	if(DQUOT_ALLOC_BLOCK(inode, uspi->s_fpb)) {
@@ -739,7 +739,7 @@ static unsigned ufs_bitmap_search (struct super_block * sb,
 
 	uspi = UFS_SB(sb)->s_uspi;
 	usb1 = ubh_get_usb_first (uspi);
-	ucg = ubh_get_ucg(UCPI_UBH);
+	ucg = ubh_get_ucg(UCPI_UBH(ucpi));
 
 	if (goal)
 		start = ufs_dtogd(goal) >> 3;
@@ -747,12 +747,12 @@ static unsigned ufs_bitmap_search (struct super_block * sb,
 		start = ucpi->c_frotor >> 3;
 		
 	length = ((uspi->s_fpg + 7) >> 3) - start;
-	location = ubh_scanc(UCPI_UBH, ucpi->c_freeoff + start, length,
+	location = ubh_scanc(UCPI_UBH(ucpi), ucpi->c_freeoff + start, length,
 		(uspi->s_fpb == 8) ? ufs_fragtable_8fpb : ufs_fragtable_other,
 		1 << (count - 1 + (uspi->s_fpb & 7))); 
 	if (location == 0) {
 		length = start + 1;
-		location = ubh_scanc(UCPI_UBH, ucpi->c_freeoff, length, 
+		location = ubh_scanc(UCPI_UBH(ucpi), ucpi->c_freeoff, length,
 			(uspi->s_fpb == 8) ? ufs_fragtable_8fpb : ufs_fragtable_other,
 			1 << (count - 1 + (uspi->s_fpb & 7)));
 		if (location == 0) {
@@ -769,7 +769,7 @@ static unsigned ufs_bitmap_search (struct super_block * sb,
 	/*
 	 * found the byte in the map
 	 */
-	blockmap = ubh_blkmap(UCPI_UBH, ucpi->c_freeoff, result);
+	blockmap = ubh_blkmap(UCPI_UBH(ucpi), ucpi->c_freeoff, result);
 	fragsize = 0;
 	for (possition = 0, mask = 1; possition < 8; possition++, mask <<= 1) {
 		if (blockmap & mask) {
@@ -808,9 +808,9 @@ static void ufs_clusteracct(struct super_block * sb,
 		return;
 
 	if (cnt > 0)
-		ubh_setbit(UCPI_UBH, ucpi->c_clusteroff, blkno);
+		ubh_setbit(UCPI_UBH(ucpi), ucpi->c_clusteroff, blkno);
 	else
-		ubh_clrbit(UCPI_UBH, ucpi->c_clusteroff, blkno);
+		ubh_clrbit(UCPI_UBH(ucpi), ucpi->c_clusteroff, blkno);
 
 	/*
 	 * Find the size of the cluster going forward.
@@ -819,7 +819,7 @@ static void ufs_clusteracct(struct super_block * sb,
 	end = start + uspi->s_contigsumsize;
 	if ( end >= ucpi->c_nclusterblks)
 		end = ucpi->c_nclusterblks;
-	i = ubh_find_next_zero_bit (UCPI_UBH, ucpi->c_clusteroff, end, start);
+	i = ubh_find_next_zero_bit (UCPI_UBH(ucpi), ucpi->c_clusteroff, end, start);
 	if (i > end)
 		i = end;
 	forw = i - start;
@@ -831,7 +831,7 @@ static void ufs_clusteracct(struct super_block * sb,
 	end = start - uspi->s_contigsumsize;
 	if (end < 0 ) 
 		end = -1;
-	i = ubh_find_last_zero_bit (UCPI_UBH, ucpi->c_clusteroff, start, end);
+	i = ubh_find_last_zero_bit (UCPI_UBH(ucpi), ucpi->c_clusteroff, start, end);
 	if ( i < end) 
 		i = end;
 	back = start - i;
@@ -843,11 +843,11 @@ static void ufs_clusteracct(struct super_block * sb,
 	i = back + forw + 1;
 	if (i > uspi->s_contigsumsize)
 		i = uspi->s_contigsumsize;
-	fs32_add(sb, (__fs32*)ubh_get_addr(UCPI_UBH, ucpi->c_clustersumoff + (i << 2)), cnt);
+	fs32_add(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (i << 2)), cnt);
 	if (back > 0)
-		fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH, ucpi->c_clustersumoff + (back << 2)), cnt);
+		fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (back << 2)), cnt);
 	if (forw > 0)
-		fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH, ucpi->c_clustersumoff + (forw << 2)), cnt);
+		fs32_sub(sb, (__fs32*)ubh_get_addr(UCPI_UBH(ucpi), ucpi->c_clustersumoff + (forw << 2)), cnt);
 }
 
 
diff --git a/fs/ufs/cylinder.c b/fs/ufs/cylinder.c
index 14abb8b835f..65fe0681017 100644
--- a/fs/ufs/cylinder.c
+++ b/fs/ufs/cylinder.c
@@ -47,14 +47,14 @@ static void ufs_read_cylinder (struct super_block * sb,
 	ucpi = sbi->s_ucpi[bitmap_nr];
 	ucg = (struct ufs_cylinder_group *)sbi->s_ucg[cgno]->b_data;
 
-	UCPI_UBH->fragment = ufs_cgcmin(cgno);
-	UCPI_UBH->count = uspi->s_cgsize >> sb->s_blocksize_bits;
+	UCPI_UBH(ucpi)->fragment = ufs_cgcmin(cgno);
+	UCPI_UBH(ucpi)->count = uspi->s_cgsize >> sb->s_blocksize_bits;
 	/*
 	 * We have already the first fragment of cylinder group block in buffer
 	 */
-	UCPI_UBH->bh[0] = sbi->s_ucg[cgno];
-	for (i = 1; i < UCPI_UBH->count; i++)
-		if (!(UCPI_UBH->bh[i] = sb_bread(sb, UCPI_UBH->fragment + i)))
+	UCPI_UBH(ucpi)->bh[0] = sbi->s_ucg[cgno];
+	for (i = 1; i < UCPI_UBH(ucpi)->count; i++)
+		if (!(UCPI_UBH(ucpi)->bh[i] = sb_bread(sb, UCPI_UBH(ucpi)->fragment + i)))
 			goto failed;
 	sbi->s_cgno[bitmap_nr] = cgno;
 			
@@ -103,7 +103,7 @@ void ufs_put_cylinder (struct super_block * sb, unsigned bitmap_nr)
 		return;
 	}
 	ucpi = sbi->s_ucpi[bitmap_nr];
-	ucg = ubh_get_ucg(UCPI_UBH);
+	ucg = ubh_get_ucg(UCPI_UBH(ucpi));
 
 	if (uspi->s_ncg > UFS_MAX_GROUP_LOADED && bitmap_nr >= sbi->s_cg_loaded) {
 		ufs_panic (sb, "ufs_put_cylinder", "internal error");
@@ -116,9 +116,9 @@ void ufs_put_cylinder (struct super_block * sb, unsigned bitmap_nr)
 	ucg->cg_rotor = cpu_to_fs32(sb, ucpi->c_rotor);
 	ucg->cg_frotor = cpu_to_fs32(sb, ucpi->c_frotor);
 	ucg->cg_irotor = cpu_to_fs32(sb, ucpi->c_irotor);
-	ubh_mark_buffer_dirty (UCPI_UBH);
-	for (i = 1; i < UCPI_UBH->count; i++) {
-		brelse (UCPI_UBH->bh[i]);
+	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
+	for (i = 1; i < UCPI_UBH(ucpi)->count; i++) {
+		brelse (UCPI_UBH(ucpi)->bh[i]);
 	}
 
 	sbi->s_cgno[bitmap_nr] = UFS_CGNO_EMPTY;
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index c7a47ed4f43..2da0ffda82c 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -91,7 +91,7 @@ void ufs_free_inode (struct inode * inode)
 		unlock_super (sb);
 		return;
 	}
-	ucg = ubh_get_ucg(UCPI_UBH);
+	ucg = ubh_get_ucg(UCPI_UBH(ucpi));
 	if (!ufs_cg_chkmagic(sb, ucg))
 		ufs_panic (sb, "ufs_free_fragments", "internal error, bad cg magic number");
 
@@ -104,10 +104,10 @@ void ufs_free_inode (struct inode * inode)
 
 	clear_inode (inode);
 
-	if (ubh_isclr (UCPI_UBH, ucpi->c_iusedoff, bit))
+	if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit))
 		ufs_error(sb, "ufs_free_inode", "bit already cleared for inode %u", ino);
 	else {
-		ubh_clrbit (UCPI_UBH, ucpi->c_iusedoff, bit);
+		ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit);
 		if (ino < ucpi->c_irotor)
 			ucpi->c_irotor = ino;
 		fs32_add(sb, &ucg->cg_cs.cs_nifree, 1);
@@ -121,11 +121,11 @@ void ufs_free_inode (struct inode * inode)
 		}
 	}
 
-	ubh_mark_buffer_dirty (USPI_UBH);
-	ubh_mark_buffer_dirty (UCPI_UBH);
+	ubh_mark_buffer_dirty (USPI_UBH(uspi));
+	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
 	if (sb->s_flags & MS_SYNCHRONOUS) {
 		ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **) &ucpi);
-		ubh_wait_on_buffer (UCPI_UBH);
+		ubh_wait_on_buffer (UCPI_UBH(ucpi));
 	}
 	
 	sb->s_dirt = 1;
@@ -213,14 +213,14 @@ cg_found:
 	ucpi = ufs_load_cylinder (sb, cg);
 	if (!ucpi)
 		goto failed;
-	ucg = ubh_get_ucg(UCPI_UBH);
+	ucg = ubh_get_ucg(UCPI_UBH(ucpi));
 	if (!ufs_cg_chkmagic(sb, ucg)) 
 		ufs_panic (sb, "ufs_new_inode", "internal error, bad cg magic number");
 
 	start = ucpi->c_irotor;
-	bit = ubh_find_next_zero_bit (UCPI_UBH, ucpi->c_iusedoff, uspi->s_ipg, start);
+	bit = ubh_find_next_zero_bit (UCPI_UBH(ucpi), ucpi->c_iusedoff, uspi->s_ipg, start);
 	if (!(bit < uspi->s_ipg)) {
-		bit = ubh_find_first_zero_bit (UCPI_UBH, ucpi->c_iusedoff, start);
+		bit = ubh_find_first_zero_bit (UCPI_UBH(ucpi), ucpi->c_iusedoff, start);
 		if (!(bit < start)) {
 			ufs_error (sb, "ufs_new_inode",
 			    "cylinder group %u corrupted - error in inode bitmap\n", cg);
@@ -228,8 +228,8 @@ cg_found:
 		}
 	}
 	UFSD(("start = %u, bit = %u, ipg = %u\n", start, bit, uspi->s_ipg))
-	if (ubh_isclr (UCPI_UBH, ucpi->c_iusedoff, bit))
-		ubh_setbit (UCPI_UBH, ucpi->c_iusedoff, bit);
+	if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit))
+		ubh_setbit (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit);
 	else {
 		ufs_panic (sb, "ufs_new_inode", "internal error");
 		goto failed;
@@ -245,11 +245,11 @@ cg_found:
 		fs32_add(sb, &sbi->fs_cs(cg).cs_ndir, 1);
 	}
 
-	ubh_mark_buffer_dirty (USPI_UBH);
-	ubh_mark_buffer_dirty (UCPI_UBH);
+	ubh_mark_buffer_dirty (USPI_UBH(uspi));
+	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
 	if (sb->s_flags & MS_SYNCHRONOUS) {
 		ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **) &ucpi);
-		ubh_wait_on_buffer (UCPI_UBH);
+		ubh_wait_on_buffer (UCPI_UBH(ucpi));
 	}
 	sb->s_dirt = 1;
 
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index fe5ab2aa289..c00d1e74152 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -225,7 +225,7 @@ void ufs_error (struct super_block * sb, const char * function,
 	
 	if (!(sb->s_flags & MS_RDONLY)) {
 		usb1->fs_clean = UFS_FSBAD;
-		ubh_mark_buffer_dirty(USPI_UBH);
+		ubh_mark_buffer_dirty(USPI_UBH(uspi));
 		sb->s_dirt = 1;
 		sb->s_flags |= MS_RDONLY;
 	}
@@ -257,7 +257,7 @@ void ufs_panic (struct super_block * sb, const char * function,
 	
 	if (!(sb->s_flags & MS_RDONLY)) {
 		usb1->fs_clean = UFS_FSBAD;
-		ubh_mark_buffer_dirty(USPI_UBH);
+		ubh_mark_buffer_dirty(USPI_UBH(uspi));
 		sb->s_dirt = 1;
 	}
 	va_start (args, fmt);
@@ -1014,7 +1014,7 @@ static void ufs_write_super (struct super_block *sb) {
 		  || (flags & UFS_ST_MASK) == UFS_ST_SUNx86)
 			ufs_set_fs_state(sb, usb1, usb3,
 					UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time));
-		ubh_mark_buffer_dirty (USPI_UBH);
+		ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	}
 	sb->s_dirt = 0;
 	UFSD(("EXIT\n"))
@@ -1083,7 +1083,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 		  || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) 
 			ufs_set_fs_state(sb, usb1, usb3,
 				UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time));
-		ubh_mark_buffer_dirty (USPI_UBH);
+		ubh_mark_buffer_dirty (USPI_UBH(uspi));
 		sb->s_dirt = 0;
 		sb->s_flags |= MS_RDONLY;
 	}
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 72f91cc84bf..f9556bc484e 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -63,17 +63,17 @@ struct ufs_buffer_head * ubh_bread_uspi (struct ufs_sb_private_info * uspi,
 	count = size >> uspi->s_fshift;
 	if (count <= 0 || count > UFS_MAXFRAG)
 		return NULL;
-	USPI_UBH->fragment = fragment;
-	USPI_UBH->count = count;
+	USPI_UBH(uspi)->fragment = fragment;
+	USPI_UBH(uspi)->count = count;
 	for (i = 0; i < count; i++)
-		if (!(USPI_UBH->bh[i] = sb_bread(sb, fragment + i)))
+		if (!(USPI_UBH(uspi)->bh[i] = sb_bread(sb, fragment + i)))
 			goto failed;
 	for (; i < UFS_MAXFRAG; i++)
-		USPI_UBH->bh[i] = NULL;
-	return USPI_UBH;
+		USPI_UBH(uspi)->bh[i] = NULL;
+	return USPI_UBH(uspi);
 failed:
 	for (j = 0; j < i; j++)
-		brelse (USPI_UBH->bh[j]);
+		brelse (USPI_UBH(uspi)->bh[j]);
 	return NULL;
 }
 
@@ -90,11 +90,11 @@ void ubh_brelse (struct ufs_buffer_head * ubh)
 void ubh_brelse_uspi (struct ufs_sb_private_info * uspi)
 {
 	unsigned i;
-	if (!USPI_UBH)
+	if (!USPI_UBH(uspi))
 		return;
-	for ( i = 0; i < USPI_UBH->count; i++ ) {
-		brelse (USPI_UBH->bh[i]);
-		USPI_UBH->bh[i] = NULL;
+	for ( i = 0; i < USPI_UBH(uspi)->count; i++ ) {
+		brelse (USPI_UBH(uspi)->bh[i]);
+		USPI_UBH(uspi)->bh[i] = NULL;
 	}
 }
 
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index e10362d8f45..6a0b48cf9ce 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -17,10 +17,16 @@
 #define in_range(b,first,len)	((b)>=(first)&&(b)<(first)+(len))
 
 /*
- * macros used for retyping
+ * functions used for retyping
  */
-#define UCPI_UBH ((struct ufs_buffer_head *)ucpi)
-#define USPI_UBH ((struct ufs_buffer_head *)uspi)
+static inline struct ufs_buffer_head *UCPI_UBH(struct ufs_cg_private_info *cpi)
+{
+	return &cpi->c_ubh;
+}
+static inline struct ufs_buffer_head *USPI_UBH(struct ufs_sb_private_info *spi)
+{
+	return &spi->s_ubh;
+}
 
 
@@ -326,10 +332,10 @@ static inline void *get_usb_offset(struct ufs_sb_private_info *uspi,
  * Macros to access cylinder group array structures
  */
 #define ubh_cg_blktot(ucpi,cylno) \
-	(*((__fs32*)ubh_get_addr(UCPI_UBH, (ucpi)->c_btotoff + ((cylno) << 2))))
+	(*((__fs32*)ubh_get_addr(UCPI_UBH(ucpi), (ucpi)->c_btotoff + ((cylno) << 2))))
 
 #define ubh_cg_blks(ucpi,cylno,rpos) \
-	(*((__fs16*)ubh_get_addr(UCPI_UBH, \
+	(*((__fs16*)ubh_get_addr(UCPI_UBH(ucpi), \
 	(ucpi)->c_boff + (((cylno) * uspi->s_nrpos + (rpos)) << 1 ))))
 
 /*
-- 
cgit v1.2.3


From 3e41f597b1595479e4a1b2e6b17b3542120ef165 Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Sun, 25 Jun 2006 05:47:23 -0700
Subject: [PATCH] ufs: not usual amounts of fragments per block

The writing to UFS file system with block/fragment!=8 may cause bogus
behaviour.  The problem in "ufs_bitmap_search" function, which doesn't work
correctly in "block/fragment!=8" case.  The idea is stolen from BSD code.

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ufs/balloc.c | 132 +++++++++++++++++++++++++++++++++++++-------------------
 fs/ufs/util.h   |  26 -----------
 2 files changed, 87 insertions(+), 71 deletions(-)

(limited to 'fs')

diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 68de1312e4b..343eaf4542f 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -726,18 +726,63 @@ gotit:
 	return result;
 }
 
-static unsigned ufs_bitmap_search (struct super_block * sb,
-	struct ufs_cg_private_info * ucpi, unsigned goal, unsigned count)
+static unsigned ubh_scanc(struct ufs_sb_private_info *uspi,
+			  struct ufs_buffer_head *ubh,
+			  unsigned begin, unsigned size,
+			  unsigned char *table, unsigned char mask)
 {
-	struct ufs_sb_private_info * uspi;
-	struct ufs_super_block_first * usb1;
-	struct ufs_cylinder_group * ucg;
-	unsigned start, length, location, result;
-	unsigned possition, fragsize, blockmap, mask;
-	
-	UFSD(("ENTER, cg %u, goal %u, count %u\n", ucpi->c_cgx, goal, count))
+	unsigned rest, offset;
+	unsigned char *cp;
+	
+
+	offset = begin & ~uspi->s_fmask;
+	begin >>= uspi->s_fshift;
+	for (;;) {
+		if ((offset + size) < uspi->s_fsize)
+			rest = size;
+		else
+			rest = uspi->s_fsize - offset;
+		size -= rest;
+		cp = ubh->bh[begin]->b_data + offset;
+		while ((table[*cp++] & mask) == 0 && --rest)
+			;
+		if (rest || !size)
+			break;
+		begin++;
+		offset = 0;
+	}
+	return (size + rest);
+}
+
+/*
+ * Find a block of the specified size in the specified cylinder group.
+ * @sp: pointer to super block
+ * @ucpi: pointer to cylinder group info
+ * @goal: near which block we want find new one
+ * @count: specified size
+ */
+static unsigned ufs_bitmap_search(struct super_block *sb,
+				  struct ufs_cg_private_info *ucpi,
+				  unsigned goal, unsigned count)
+{
+	/*
+	 * Bit patterns for identifying fragments in the block map
+	 * used as ((map & mask_arr) == want_arr)
+	 */
+	static const int mask_arr[9] = {
+		0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff
+	};
+	static const int want_arr[9] = {
+		0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe
+	};
+	struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+	struct ufs_super_block_first *usb1;
+	struct ufs_cylinder_group *ucg;
+	unsigned start, length, loc, result;
+	unsigned pos, want, blockmap, mask, end;
+
+	UFSD(("ENTER, cg %u, goal %u, count %u\n", ucpi->c_cgx, goal, count));
 
-	uspi = UFS_SB(sb)->s_uspi;
 	usb1 = ubh_get_usb_first (uspi);
 	ucg = ubh_get_ucg(UCPI_UBH(ucpi));
 
@@ -747,53 +792,50 @@ static unsigned ufs_bitmap_search (struct super_block * sb,
 		start = ucpi->c_frotor >> 3;
 		
 	length = ((uspi->s_fpg + 7) >> 3) - start;
-	location = ubh_scanc(UCPI_UBH(ucpi), ucpi->c_freeoff + start, length,
+	loc = ubh_scanc(uspi, UCPI_UBH(ucpi), ucpi->c_freeoff + start, length,
 		(uspi->s_fpb == 8) ? ufs_fragtable_8fpb : ufs_fragtable_other,
 		1 << (count - 1 + (uspi->s_fpb & 7))); 
-	if (location == 0) {
+	if (loc == 0) {
 		length = start + 1;
-		location = ubh_scanc(UCPI_UBH(ucpi), ucpi->c_freeoff, length,
-			(uspi->s_fpb == 8) ? ufs_fragtable_8fpb : ufs_fragtable_other,
-			1 << (count - 1 + (uspi->s_fpb & 7)));
-		if (location == 0) {
-			ufs_error (sb, "ufs_bitmap_search",
-			"bitmap corrupted on cg %u, start %u, length %u, count %u, freeoff %u\n",
-			ucpi->c_cgx, start, length, count, ucpi->c_freeoff);
+		loc = ubh_scanc(uspi, UCPI_UBH(ucpi), ucpi->c_freeoff, length,
+				(uspi->s_fpb == 8) ? ufs_fragtable_8fpb :
+				ufs_fragtable_other,
+				1 << (count - 1 + (uspi->s_fpb & 7)));
+		if (loc == 0) {
+			ufs_error(sb, "ufs_bitmap_search",
+				  "bitmap corrupted on cg %u, start %u,"
+				  " length %u, count %u, freeoff %u\n",
+				  ucpi->c_cgx, start, length, count,
+				  ucpi->c_freeoff);
 			return (unsigned)-1;
 		}
 		start = 0;
 	}
-	result = (start + length - location) << 3;
+	result = (start + length - loc) << 3;
 	ucpi->c_frotor = result;
 
 	/*
 	 * found the byte in the map
 	 */
-	blockmap = ubh_blkmap(UCPI_UBH(ucpi), ucpi->c_freeoff, result);
-	fragsize = 0;
-	for (possition = 0, mask = 1; possition < 8; possition++, mask <<= 1) {
-		if (blockmap & mask) {
-			if (!(possition & uspi->s_fpbmask))
-				fragsize = 1;
-			else 
-				fragsize++;
-		}
-		else {
-			if (fragsize == count) {
-				result += possition - count;
-				UFSD(("EXIT, result %u\n", result))
-				return result;
-			}
-			fragsize = 0;
-		}
-	}
-	if (fragsize == count) {
-		result += possition - count;
-		UFSD(("EXIT, result %u\n", result))
-		return result;
-	}
-	ufs_error (sb, "ufs_bitmap_search", "block not in map on cg %u\n", ucpi->c_cgx);
-	UFSD(("EXIT (FAILED)\n"))
+
+	for (end = result + 8; result < end; result += uspi->s_fpb) {
+		blockmap = ubh_blkmap(UCPI_UBH(ucpi), ucpi->c_freeoff, result);
+		blockmap <<= 1;
+		mask = mask_arr[count];
+		want = want_arr[count];
+		for (pos = 0; pos <= uspi->s_fpb - count; pos++) {
+			if ((blockmap & mask) == want) {
+				UFSD(("EXIT, result %u\n", result));
+				return result + pos;
+ 			}
+			mask <<= 1;
+			want <<= 1;
+ 		}
+ 	}
+
+	ufs_error(sb, "ufs_bitmap_search", "block not in map on cg %u\n",
+		  ucpi->c_cgx);
+	UFSD(("EXIT (FAILED)\n"));
 	return (unsigned)-1;
 }
 
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 6a0b48cf9ce..e95d1c46461 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -513,29 +513,3 @@ static inline void ufs_fragacct (struct super_block * sb, unsigned blockmap,
 	if (fragsize > 0 && fragsize < uspi->s_fpb)
 		fs32_add(sb, &fraglist[fragsize], cnt);
 }
-
-#define ubh_scanc(ubh,begin,size,table,mask) _ubh_scanc_(uspi,ubh,begin,size,table,mask)
-static inline unsigned _ubh_scanc_(struct ufs_sb_private_info * uspi, struct ufs_buffer_head * ubh, 
-	unsigned begin, unsigned size, unsigned char * table, unsigned char mask)
-{
-	unsigned rest, offset;
-	unsigned char * cp;
-	
-
-	offset = begin & ~uspi->s_fmask;
-	begin >>= uspi->s_fshift;
-	for (;;) {
-		if ((offset + size) < uspi->s_fsize)
-			rest = size;
-		else
-			rest = uspi->s_fsize - offset;
-		size -= rest;
-		cp = ubh->bh[begin]->b_data + offset;
-		while ((table[*cp++] & mask) == 0 && --rest);
-		if (rest || !size)
-			break;
-		begin++;
-		offset = 0;
-	}
-	return (size + rest);
-}
-- 
cgit v1.2.3


From 5afb3145c9a733166174e1f5a07c46410b5c4091 Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Sun, 25 Jun 2006 05:47:24 -0700
Subject: [PATCH] ufs: Unmark CONFIG_UFS_FS_WRITE as BROKEN

To find new bugs, I suggest revert this patch:
http://lkml.org/lkml/2006/1/31/275 in -mm tree.

So others can test "write support" of UFS.

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 2aa4624cc01..c0afaccad60 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1370,7 +1370,7 @@ config UFS_FS
 
 config UFS_FS_WRITE
 	bool "UFS file system write support (DANGEROUS)"
-	depends on UFS_FS && EXPERIMENTAL && BROKEN
+	depends on UFS_FS && EXPERIMENTAL
 	help
 	  Say Y here if you want to try writing to UFS partitions. This is
 	  experimental, so you should back up your UFS partitions beforehand.
-- 
cgit v1.2.3


From abf5d15fd2e52517dd56a17a846d5a1f900b7db4 Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Sun, 25 Jun 2006 05:47:24 -0700
Subject: [PATCH] ufs: easy debug

Currently to turn on debug mode "user" has to edit ~10 files, to turn off he
has to do it again.

This patch introduce such changes:
1)turn on(off) debug messages via ".config"
2)remove unnecessary duplication of code
3)make "UFSD" macros more similar to function
4)fix some compiler warnings

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/Kconfig        |   8 ++++
 fs/ufs/balloc.c   |  62 +++++++++++++----------------
 fs/ufs/cylinder.c |  31 ++++++---------
 fs/ufs/dir.c      |  28 +++++---------
 fs/ufs/ialloc.c   |  22 ++++-------
 fs/ufs/inode.c    |  47 +++++++++-------------
 fs/ufs/namei.c    |  21 ++++------
 fs/ufs/super.c    | 114 ++++++++++++++++++++++++++----------------------------
 fs/ufs/truncate.c |  30 ++++++--------
 fs/ufs/util.c     |   9 -----
 10 files changed, 155 insertions(+), 217 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index c0afaccad60..ea60e83e7fe 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1375,6 +1375,14 @@ config UFS_FS_WRITE
 	  Say Y here if you want to try writing to UFS partitions. This is
 	  experimental, so you should back up your UFS partitions beforehand.
 
+config UFS_DEBUG
+	bool "UFS debugging"
+	depends on UFS_FS
+	help
+	  If you are experiencing any problems with the UFS filesystem, say
+	  Y here.  This will result in _many_ additional debugging messages to be
+	  written to the system log.
+
 endmenu
 
 menu "Network File Systems"
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 343eaf4542f..7a4735f591b 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -21,14 +21,6 @@
 #include "swab.h"
 #include "util.h"
 
-#undef UFS_BALLOC_DEBUG
-
-#ifdef UFS_BALLOC_DEBUG
-#define UFSD(x) printk("(%s, %d), %s:", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
-
 static unsigned ufs_add_fragments (struct inode *, unsigned, unsigned, unsigned, int *);
 static unsigned ufs_alloc_fragments (struct inode *, unsigned, unsigned, unsigned, int *);
 static unsigned ufs_alloccg_block (struct inode *, struct ufs_cg_private_info *, unsigned, int *);
@@ -52,7 +44,7 @@ void ufs_free_fragments(struct inode *inode, unsigned fragment, unsigned count)
 	uspi = UFS_SB(sb)->s_uspi;
 	usb1 = ubh_get_usb_first(uspi);
 	
-	UFSD(("ENTER, fragment %u, count %u\n", fragment, count))
+	UFSD("ENTER, fragment %u, count %u\n", fragment, count);
 	
 	if (ufs_fragnum(fragment) + count > uspi->s_fpg)
 		ufs_error (sb, "ufs_free_fragments", "internal error");
@@ -123,12 +115,12 @@ void ufs_free_fragments(struct inode *inode, unsigned fragment, unsigned count)
 	sb->s_dirt = 1;
 	
 	unlock_super (sb);
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return;
 
 failed:
 	unlock_super (sb);
-	UFSD(("EXIT (FAILED)\n"))
+	UFSD("EXIT (FAILED)\n");
 	return;
 }
 
@@ -148,7 +140,7 @@ void ufs_free_blocks(struct inode *inode, unsigned fragment, unsigned count)
 	uspi = UFS_SB(sb)->s_uspi;
 	usb1 = ubh_get_usb_first(uspi);
 
-	UFSD(("ENTER, fragment %u, count %u\n", fragment, count))
+	UFSD("ENTER, fragment %u, count %u\n", fragment, count);
 	
 	if ((fragment & uspi->s_fpbmask) || (count & uspi->s_fpbmask)) {
 		ufs_error (sb, "ufs_free_blocks", "internal error, "
@@ -215,12 +207,12 @@ do_more:
 
 	sb->s_dirt = 1;
 	unlock_super (sb);
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return;
 
 failed:
 	unlock_super (sb);
-	UFSD(("EXIT (FAILED)\n"))
+	UFSD("EXIT (FAILED)\n");
 	return;
 }
 
@@ -290,8 +282,8 @@ static void ufs_change_blocknr(struct inode *inode, unsigned int count,
 
 	baseblk = ((i_size_read(inode) - 1) >> inode->i_blkbits) + 1 - count;
 
-	UFSD(("ENTER, ino %lu, count %u, oldb %u, newb %u\n",
-	      inode->i_ino, count, oldb, newb));
+	UFSD("ENTER, ino %lu, count %u, oldb %u, newb %u\n",
+	      inode->i_ino, count, oldb, newb);
 
 	BUG_ON(!PageLocked(locked_page));
 
@@ -326,7 +318,7 @@ static void ufs_change_blocknr(struct inode *inode, unsigned int count,
 			page_cache_release(page);
 		}
  	}
-	UFSD(("EXIT\n"));
+	UFSD("EXIT\n");
 }
 
 unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
@@ -337,7 +329,7 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
 	struct ufs_super_block_first * usb1;
 	unsigned cgno, oldcount, newcount, tmp, request, result;
 	
-	UFSD(("ENTER, ino %lu, fragment %u, goal %u, count %u\n", inode->i_ino, fragment, goal, count))
+	UFSD("ENTER, ino %lu, fragment %u, goal %u, count %u\n", inode->i_ino, fragment, goal, count);
 	
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -366,14 +358,14 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
 			return (unsigned)-1;
 		}
 		if (fragment < UFS_I(inode)->i_lastfrag) {
-			UFSD(("EXIT (ALREADY ALLOCATED)\n"))
+			UFSD("EXIT (ALREADY ALLOCATED)\n");
 			unlock_super (sb);
 			return 0;
 		}
 	}
 	else {
 		if (tmp) {
-			UFSD(("EXIT (ALREADY ALLOCATED)\n"))
+			UFSD("EXIT (ALREADY ALLOCATED)\n");
 			unlock_super(sb);
 			return 0;
 		}
@@ -384,7 +376,7 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
 	 */
 	if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(usb1, UFS_MINFREE) <= 0) {
 		unlock_super (sb);
-		UFSD(("EXIT (FAILED)\n"))
+		UFSD("EXIT (FAILED)\n");
 		return 0;
 	}
 
@@ -407,7 +399,7 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
 			UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count);
 		}
 		unlock_super(sb);
-		UFSD(("EXIT, result %u\n", result))
+		UFSD("EXIT, result %u\n", result);
 		return result;
 	}
 
@@ -420,7 +412,7 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
 		inode->i_blocks += count << uspi->s_nspfshift;
 		UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count);
 		unlock_super(sb);
-		UFSD(("EXIT, result %u\n", result))
+		UFSD("EXIT, result %u\n", result);
 		return result;
 	}
 
@@ -458,12 +450,12 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
 		if (newcount < request)
 			ufs_free_fragments (inode, result + newcount, request - newcount);
 		ufs_free_fragments (inode, tmp, oldcount);
-		UFSD(("EXIT, result %u\n", result))
+		UFSD("EXIT, result %u\n", result);
 		return result;
 	}
 
 	unlock_super(sb);
-	UFSD(("EXIT (FAILED)\n"))
+	UFSD("EXIT (FAILED)\n");
 	return 0;
 }		
 
@@ -478,7 +470,7 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
 	struct ufs_cylinder_group * ucg;
 	unsigned cgno, fragno, fragoff, count, fragsize, i;
 	
-	UFSD(("ENTER, fragment %u, oldcount %u, newcount %u\n", fragment, oldcount, newcount))
+	UFSD("ENTER, fragment %u, oldcount %u, newcount %u\n", fragment, oldcount, newcount);
 	
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -538,7 +530,7 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
 	}
 	sb->s_dirt = 1;
 
-	UFSD(("EXIT, fragment %u\n", fragment))
+	UFSD("EXIT, fragment %u\n", fragment);
 	
 	return fragment;
 }
@@ -561,7 +553,7 @@ static unsigned ufs_alloc_fragments (struct inode * inode, unsigned cgno,
 	struct ufs_cylinder_group * ucg;
 	unsigned oldcg, i, j, k, result, allocsize;
 	
-	UFSD(("ENTER, ino %lu, cgno %u, goal %u, count %u\n", inode->i_ino, cgno, goal, count))
+	UFSD("ENTER, ino %lu, cgno %u, goal %u, count %u\n", inode->i_ino, cgno, goal, count);
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -595,7 +587,7 @@ static unsigned ufs_alloc_fragments (struct inode * inode, unsigned cgno,
 		UFS_TEST_FREE_SPACE_CG
 	}
 	
-	UFSD(("EXIT (FAILED)\n"))
+	UFSD("EXIT (FAILED)\n");
 	return 0;
 
 cg_found:
@@ -664,7 +656,7 @@ succed:
 	sb->s_dirt = 1;
 
 	result += cgno * uspi->s_fpg;
-	UFSD(("EXIT3, result %u\n", result))
+	UFSD("EXIT3, result %u\n", result);
 	return result;
 }
 
@@ -677,7 +669,7 @@ static unsigned ufs_alloccg_block (struct inode * inode,
 	struct ufs_cylinder_group * ucg;
 	unsigned result, cylno, blkno;
 
-	UFSD(("ENTER, goal %u\n", goal))
+	UFSD("ENTER, goal %u\n", goal);
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -721,7 +713,7 @@ gotit:
 	fs16_sub(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(result)), 1);
 	fs32_sub(sb, &ubh_cg_blktot(ucpi, cylno), 1);
 	
-	UFSD(("EXIT, result %u\n", result))
+	UFSD("EXIT, result %u\n", result);
 
 	return result;
 }
@@ -781,7 +773,7 @@ static unsigned ufs_bitmap_search(struct super_block *sb,
 	unsigned start, length, loc, result;
 	unsigned pos, want, blockmap, mask, end;
 
-	UFSD(("ENTER, cg %u, goal %u, count %u\n", ucpi->c_cgx, goal, count));
+	UFSD("ENTER, cg %u, goal %u, count %u\n", ucpi->c_cgx, goal, count);
 
 	usb1 = ubh_get_usb_first (uspi);
 	ucg = ubh_get_ucg(UCPI_UBH(ucpi));
@@ -825,7 +817,7 @@ static unsigned ufs_bitmap_search(struct super_block *sb,
 		want = want_arr[count];
 		for (pos = 0; pos <= uspi->s_fpb - count; pos++) {
 			if ((blockmap & mask) == want) {
-				UFSD(("EXIT, result %u\n", result));
+				UFSD("EXIT, result %u\n", result);
 				return result + pos;
  			}
 			mask <<= 1;
@@ -835,7 +827,7 @@ static unsigned ufs_bitmap_search(struct super_block *sb,
 
 	ufs_error(sb, "ufs_bitmap_search", "block not in map on cg %u\n",
 		  ucpi->c_cgx);
-	UFSD(("EXIT (FAILED)\n"));
+	UFSD("EXIT (FAILED)\n");
 	return (unsigned)-1;
 }
 
diff --git a/fs/ufs/cylinder.c b/fs/ufs/cylinder.c
index 65fe0681017..09c39e5e638 100644
--- a/fs/ufs/cylinder.c
+++ b/fs/ufs/cylinder.c
@@ -20,15 +20,6 @@
 #include "swab.h"
 #include "util.h"
 
-#undef UFS_CYLINDER_DEBUG
-
-#ifdef UFS_CYLINDER_DEBUG
-#define UFSD(x) printk("(%s, %d), %s:", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
-
-
 /*
  * Read cylinder group into cache. The memory space for ufs_cg_private_info
  * structure is already allocated during ufs_read_super.
@@ -42,7 +33,7 @@ static void ufs_read_cylinder (struct super_block * sb,
 	struct ufs_cylinder_group * ucg;
 	unsigned i, j;
 
-	UFSD(("ENTER, cgno %u, bitmap_nr %u\n", cgno, bitmap_nr))
+	UFSD("ENTER, cgno %u, bitmap_nr %u\n", cgno, bitmap_nr);
 	uspi = sbi->s_uspi;
 	ucpi = sbi->s_ucpi[bitmap_nr];
 	ucg = (struct ufs_cylinder_group *)sbi->s_ucg[cgno]->b_data;
@@ -73,7 +64,7 @@ static void ufs_read_cylinder (struct super_block * sb,
 	ucpi->c_clustersumoff = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_clustersumoff);
 	ucpi->c_clusteroff = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_clusteroff);
 	ucpi->c_nclusterblks = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_nclusterblks);
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return;	
 	
 failed:
@@ -95,11 +86,11 @@ void ufs_put_cylinder (struct super_block * sb, unsigned bitmap_nr)
 	struct ufs_cylinder_group * ucg;
 	unsigned i;
 
-	UFSD(("ENTER, bitmap_nr %u\n", bitmap_nr))
+	UFSD("ENTER, bitmap_nr %u\n", bitmap_nr);
 
 	uspi = sbi->s_uspi;
 	if (sbi->s_cgno[bitmap_nr] == UFS_CGNO_EMPTY) {
-		UFSD(("EXIT\n"))
+		UFSD("EXIT\n");
 		return;
 	}
 	ucpi = sbi->s_ucpi[bitmap_nr];
@@ -122,7 +113,7 @@ void ufs_put_cylinder (struct super_block * sb, unsigned bitmap_nr)
 	}
 
 	sbi->s_cgno[bitmap_nr] = UFS_CGNO_EMPTY;
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 }
 
 /*
@@ -139,7 +130,7 @@ struct ufs_cg_private_info * ufs_load_cylinder (
 	struct ufs_cg_private_info * ucpi;
 	unsigned cg, i, j;
 
-	UFSD(("ENTER, cgno %u\n", cgno))
+	UFSD("ENTER, cgno %u\n", cgno);
 
 	uspi = sbi->s_uspi;
 	if (cgno >= uspi->s_ncg) {
@@ -150,7 +141,7 @@ struct ufs_cg_private_info * ufs_load_cylinder (
 	 * Cylinder group number cg it in cache and it was last used
 	 */
 	if (sbi->s_cgno[0] == cgno) {
-		UFSD(("EXIT\n"))
+		UFSD("EXIT\n");
 		return sbi->s_ucpi[0];
 	}
 	/*
@@ -160,16 +151,16 @@ struct ufs_cg_private_info * ufs_load_cylinder (
 		if (sbi->s_cgno[cgno] != UFS_CGNO_EMPTY) {
 			if (sbi->s_cgno[cgno] != cgno) {
 				ufs_panic (sb, "ufs_load_cylinder", "internal error, wrong number of cg in cache");
-				UFSD(("EXIT (FAILED)\n"))
+				UFSD("EXIT (FAILED)\n");
 				return NULL;
 			}
 			else {
-				UFSD(("EXIT\n"))
+				UFSD("EXIT\n");
 				return sbi->s_ucpi[cgno];
 			}
 		} else {
 			ufs_read_cylinder (sb, cgno, cgno);
-			UFSD(("EXIT\n"))
+			UFSD("EXIT\n");
 			return sbi->s_ucpi[cgno];
 		}
 	}
@@ -204,6 +195,6 @@ struct ufs_cg_private_info * ufs_load_cylinder (
 		sbi->s_ucpi[0] = ucpi;
 		ufs_read_cylinder (sb, cgno, 0);
 	}
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return sbi->s_ucpi[0];
 }
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 9473df5bff5..732c3fd2b6f 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -25,14 +25,6 @@
 #include "swab.h"
 #include "util.h"
 
-#undef UFS_DIR_DEBUG
-
-#ifdef UFS_DIR_DEBUG
-#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
-
 /*
  * NOTE! unlike strncmp, ufs_match returns 1 for success, 0 for failure.
  *
@@ -262,7 +254,7 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, struct dentry *dentry,
 	struct page *page = NULL;
 	struct ufs_dir_entry *de;
 
-	UFSD(("ENTER, dir_ino %lu, name %s, namlen %u\n", dir->i_ino, name, namelen));
+	UFSD("ENTER, dir_ino %lu, name %s, namlen %u\n", dir->i_ino, name, namelen);
 
 	if (npages == 0 || namelen > UFS_MAXNAMLEN)
 		goto out;
@@ -326,7 +318,7 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode)
 	unsigned from, to;
 	int err;
 
-	UFSD(("ENTER, name %s, namelen %u\n", name, namelen));
+	UFSD("ENTER, name %s, namelen %u\n", name, namelen);
 
 	/*
 	 * We take care of directory expansion in the same loop.
@@ -442,7 +434,7 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	int need_revalidate = filp->f_version != inode->i_version;
 	unsigned flags = UFS_SB(sb)->s_flags;
 
-	UFSD(("BEGIN"));
+	UFSD("BEGIN\n");
 
 	if (pos > inode->i_size - UFS_DIR_REC_LEN(1))
 		return 0;
@@ -484,9 +476,9 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 				offset = (char *)de - kaddr;
 
-				UFSD(("filldir(%s,%u)\n", de->d_name,
-				      fs32_to_cpu(sb, de->d_ino)));
-				UFSD(("namlen %u\n", ufs_get_de_namlen(sb, de)));
+				UFSD("filldir(%s,%u)\n", de->d_name,
+				      fs32_to_cpu(sb, de->d_ino));
+				UFSD("namlen %u\n", ufs_get_de_namlen(sb, de));
 
 				if ((flags & UFS_DE_MASK) == UFS_DE_44BSD)
 					d_type = de->d_u.d_44.d_type;
@@ -524,12 +516,12 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
 	struct ufs_dir_entry *de = (struct ufs_dir_entry *) (kaddr + from);
 	int err;
 
-	UFSD(("ENTER\n"));
+	UFSD("ENTER\n");
 
-	UFSD(("ino %u, reclen %u, namlen %u, name %s\n",
+	UFSD("ino %u, reclen %u, namlen %u, name %s\n",
 	      fs32_to_cpu(sb, de->d_ino),
 	      fs16_to_cpu(sb, de->d_reclen),
-	      ufs_get_de_namlen(sb, de), de->d_name));
+	      ufs_get_de_namlen(sb, de), de->d_name);
 
 	while ((char*)de < (char*)dir) {
 		if (de->d_reclen == 0) {
@@ -554,7 +546,7 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
 	mark_inode_dirty(inode);
 out:
 	ufs_put_page(page);
-	UFSD(("EXIT\n"));
+	UFSD("EXIT\n");
 	return err;
 }
 
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 2da0ffda82c..ad017fa2dd2 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -34,14 +34,6 @@
 #include "swab.h"
 #include "util.h"
 
-#undef UFS_IALLOC_DEBUG
-
-#ifdef UFS_IALLOC_DEBUG
-#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
-
 /*
  * NOTE! When we get the inode, we're the only people
  * that have access to it, and as such there are no
@@ -68,7 +60,7 @@ void ufs_free_inode (struct inode * inode)
 	int is_directory;
 	unsigned ino, cg, bit;
 	
-	UFSD(("ENTER, ino %lu\n", inode->i_ino))
+	UFSD("ENTER, ino %lu\n", inode->i_ino);
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -130,7 +122,7 @@ void ufs_free_inode (struct inode * inode)
 	
 	sb->s_dirt = 1;
 	unlock_super (sb);
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 }
 
 /*
@@ -155,7 +147,7 @@ struct inode * ufs_new_inode(struct inode * dir, int mode)
 	unsigned cg, bit, i, j, start;
 	struct ufs_inode_info *ufsi;
 
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 	
 	/* Cannot create files in a deleted directory */
 	if (!dir || !dir->i_nlink)
@@ -227,7 +219,7 @@ cg_found:
 			goto failed;
 		}
 	}
-	UFSD(("start = %u, bit = %u, ipg = %u\n", start, bit, uspi->s_ipg))
+	UFSD("start = %u, bit = %u, ipg = %u\n", start, bit, uspi->s_ipg);
 	if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit))
 		ubh_setbit (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit);
 	else {
@@ -287,14 +279,14 @@ cg_found:
 		return ERR_PTR(-EDQUOT);
 	}
 
-	UFSD(("allocating inode %lu\n", inode->i_ino))
-	UFSD(("EXIT\n"))
+	UFSD("allocating inode %lu\n", inode->i_ino);
+	UFSD("EXIT\n");
 	return inode;
 
 failed:
 	unlock_super (sb);
 	make_bad_inode(inode);
 	iput (inode);
-	UFSD(("EXIT (FAILED)\n"))
+	UFSD("EXIT (FAILED)\n");
 	return ERR_PTR(-ENOSPC);
 }
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index c28b7522c9e..01f75446234 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -41,15 +41,6 @@
 #include "swab.h"
 #include "util.h"
 
-#undef UFS_INODE_DEBUG
-#undef UFS_INODE_DEBUG_MORE
-
-#ifdef UFS_INODE_DEBUG
-#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
-
 static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4])
 {
 	struct ufs_sb_private_info *uspi = UFS_SB(inode->i_sb)->s_uspi;
@@ -61,7 +52,7 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off
 	int n = 0;
 
 
-	UFSD(("ptrs=uspi->s_apb = %d,double_blocks=%ld \n",ptrs,double_blocks));
+	UFSD("ptrs=uspi->s_apb = %d,double_blocks=%ld \n",ptrs,double_blocks);
 	if (i_block < 0) {
 		ufs_warning(inode->i_sb, "ufs_block_to_path", "block < 0");
 	} else if (i_block < direct_blocks) {
@@ -104,8 +95,8 @@ u64  ufs_frag_map(struct inode *inode, sector_t frag)
 	unsigned flags = UFS_SB(sb)->s_flags;
 	u64 temp = 0L;
 
-	UFSD((": frag = %llu  depth = %d\n", (unsigned long long)frag, depth));
-	UFSD((": uspi->s_fpbshift = %d ,uspi->s_apbmask = %x, mask=%llx\n",uspi->s_fpbshift,uspi->s_apbmask,mask));
+	UFSD(": frag = %llu  depth = %d\n", (unsigned long long)frag, depth);
+	UFSD(": uspi->s_fpbshift = %d ,uspi->s_apbmask = %x, mask=%llx\n",uspi->s_fpbshift,uspi->s_apbmask,mask);
 
 	if (depth == 0)
 		return 0;
@@ -186,8 +177,8 @@ static struct buffer_head *ufs_inode_getfrag(struct inode *inode,
 	__fs32 * p, * p2;
 	unsigned flags = 0;
 
-	UFSD(("ENTER, ino %lu, fragment %u, new_fragment %u, required %u\n",
-		inode->i_ino, fragment, new_fragment, required))         
+	UFSD("ENTER, ino %lu, fragment %u, new_fragment %u, required %u\n",
+		inode->i_ino, fragment, new_fragment, required);
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -210,7 +201,7 @@ repeat:
 		if (metadata) {
 			result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff);
 			if (tmp == fs32_to_cpu(sb, *p)) {
-				UFSD(("EXIT, result %u\n", tmp + blockoff))
+				UFSD("EXIT, result %u\n", tmp + blockoff);
 				return result;
 			}
 			brelse (result);
@@ -288,7 +279,7 @@ repeat:
 	if (IS_SYNC(inode))
 		ufs_sync_inode (inode);
 	mark_inode_dirty(inode);
-	UFSD(("EXIT, result %u\n", tmp + blockoff))
+	UFSD("EXIT, result %u\n", tmp + blockoff);
 	return result;
 
      /* This part : To be implemented ....
@@ -323,7 +314,7 @@ static struct buffer_head *ufs_block_getfrag(struct inode *inode, struct buffer_
 	block = ufs_fragstoblks (fragment);
 	blockoff = ufs_fragnum (fragment);
 
-	UFSD(("ENTER, ino %lu, fragment %u, new_fragment %u\n", inode->i_ino, fragment, new_fragment))	
+	UFSD("ENTER, ino %lu, fragment %u, new_fragment %u\n", inode->i_ino, fragment, new_fragment);
 
 	result = NULL;
 	if (!bh)
@@ -377,10 +368,10 @@ repeat:
 		sync_dirty_buffer(bh);
 	inode->i_ctime = CURRENT_TIME_SEC;
 	mark_inode_dirty(inode);
-	UFSD(("result %u\n", tmp + blockoff));
+	UFSD("result %u\n", tmp + blockoff);
 out:
 	brelse (bh);
-	UFSD(("EXIT\n"));
+	UFSD("EXIT\n");
 	return result;
 }
 
@@ -399,7 +390,7 @@ int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_hea
 	
 	if (!create) {
 		phys64 = ufs_frag_map(inode, fragment);
-		UFSD(("phys64 = %llu \n",phys64));
+		UFSD("phys64 = %llu \n",phys64);
 		if (phys64)
 			map_bh(bh_result, sb, phys64);
 		return 0;
@@ -414,7 +405,7 @@ int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_hea
 
 	lock_kernel();
 
-	UFSD(("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment))
+	UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment);
 	if (fragment < 0)
 		goto abort_negative;
 	if (fragment >
@@ -514,7 +505,7 @@ struct buffer_head * ufs_bread (struct inode * inode, unsigned fragment,
 {
 	struct buffer_head * bh;
 
-	UFSD(("ENTER, ino %lu, fragment %u\n", inode->i_ino, fragment))
+	UFSD("ENTER, ino %lu, fragment %u\n", inode->i_ino, fragment);
 	bh = ufs_getfrag (inode, fragment, create, err);
 	if (!bh || buffer_uptodate(bh)) 		
 		return bh;
@@ -586,7 +577,7 @@ void ufs_read_inode (struct inode * inode)
 	unsigned i;
 	unsigned flags;
 	
-	UFSD(("ENTER, ino %lu\n", inode->i_ino))
+	UFSD("ENTER, ino %lu\n", inode->i_ino);
 	
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -652,7 +643,7 @@ void ufs_read_inode (struct inode * inode)
 
 	brelse (bh);
 
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return;
 
 bad_inode:
@@ -660,7 +651,7 @@ bad_inode:
 	return;
 
 ufs2_inode :
-	UFSD(("Reading ufs2 inode, ino %lu\n", inode->i_ino))
+	UFSD("Reading ufs2 inode, ino %lu\n", inode->i_ino);
 
 	ufs2_inode = (struct ufs2_inode *)(bh->b_data + sizeof(struct ufs2_inode) * ufs_inotofsbo(inode->i_ino));
 
@@ -712,7 +703,7 @@ ufs2_inode :
 
 	brelse(bh);
 
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return;
 }
 
@@ -726,7 +717,7 @@ static int ufs_update_inode(struct inode * inode, int do_sync)
 	unsigned i;
 	unsigned flags;
 
-	UFSD(("ENTER, ino %lu\n", inode->i_ino))
+	UFSD("ENTER, ino %lu\n", inode->i_ino);
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -787,7 +778,7 @@ static int ufs_update_inode(struct inode * inode, int do_sync)
 		sync_dirty_buffer(bh);
 	brelse (bh);
 	
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return 0;
 }
 
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 364bb92b091..abd5f23a426 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -34,17 +34,6 @@
 #include "swab.h"	/* will go away - see comment in mknod() */
 #include "util.h"
 
-/*
-#undef UFS_NAMEI_DEBUG
-*/
-#define UFS_NAMEI_DEBUG
-
-#ifdef UFS_NAMEI_DEBUG
-#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
-
 static inline int ufs_add_nondir(struct dentry *dentry, struct inode *inode)
 {
 	int err = ufs_add_link(dentry, inode);
@@ -90,8 +79,13 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, stru
 static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
 		struct nameidata *nd)
 {
-	struct inode * inode = ufs_new_inode(dir, mode);
-	int err = PTR_ERR(inode);
+	struct inode *inode;
+	int err;
+
+	UFSD("BEGIN\n");
+	inode = ufs_new_inode(dir, mode);
+	err = PTR_ERR(inode);
+
 	if (!IS_ERR(inode)) {
 		inode->i_op = &ufs_file_inode_operations;
 		inode->i_fop = &ufs_file_operations;
@@ -101,6 +95,7 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
 		err = ufs_add_nondir(dentry, inode);
 		unlock_kernel();
 	}
+	UFSD("END: err=%d\n", err);
 	return err;
 }
 
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index c00d1e74152..42425999d2d 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -90,18 +90,7 @@
 #include "swab.h"
 #include "util.h"
 
-#undef UFS_SUPER_DEBUG
-#undef UFS_SUPER_DEBUG_MORE
-
-
-#undef UFS_SUPER_DEBUG_MORE
-#ifdef UFS_SUPER_DEBUG
-#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
-
-#ifdef UFS_SUPER_DEBUG_MORE
+#ifdef CONFIG_UFS_DEBUG
 /*
  * Print contents of ufs_super_block, useful for debugging
  */
@@ -157,18 +146,23 @@ void ufs2_print_super_stuff(
 	printk("ufs_print_super_stuff\n");
 	printk("size of usb:     %u\n", sizeof(struct ufs_super_block));
 	printk("  magic:         0x%x\n", fs32_to_cpu(sb, usb->fs_magic));
-	printk("  fs_size:   %u\n",fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_size));
-	printk("  fs_dsize:  %u\n",fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_dsize));
-	printk("  bsize:         %u\n", fs32_to_cpu(usb, usb->fs_bsize));
-	printk("  fsize:         %u\n", fs32_to_cpu(usb, usb->fs_fsize));
+	printk("  fs_size:   %llu\n",
+	       (unsigned long long)fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_size));
+	printk("  fs_dsize:  %llu\n",
+	       (unsigned long long)fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_dsize));
+	printk("  bsize:         %u\n", fs32_to_cpu(sb, usb->fs_bsize));
+	printk("  fsize:         %u\n", fs32_to_cpu(sb, usb->fs_fsize));
 	printk("  fs_volname:  %s\n", usb->fs_u11.fs_u2.fs_volname);
 	printk("  fs_fsmnt:  %s\n", usb->fs_u11.fs_u2.fs_fsmnt);
-	printk("  fs_sblockloc: %u\n",fs64_to_cpu(sb,
-			usb->fs_u11.fs_u2.fs_sblockloc));
-	printk("  cs_ndir(No of dirs):  %u\n",fs64_to_cpu(sb,
-			usb->fs_u11.fs_u2.fs_cstotal.cs_ndir));
-	printk("  cs_nbfree(No of free blocks):  %u\n",fs64_to_cpu(sb,
-			usb->fs_u11.fs_u2.fs_cstotal.cs_nbfree));
+	printk("  fs_sblockloc: %llu\n",
+	       (unsigned long long)fs64_to_cpu(sb,
+					       usb->fs_u11.fs_u2.fs_sblockloc));
+	printk("  cs_ndir(No of dirs):  %llu\n",
+	       (unsigned long long)fs64_to_cpu(sb,
+				         usb->fs_u11.fs_u2.fs_cstotal.cs_ndir));
+	printk("  cs_nbfree(No of free blocks):  %llu\n",
+	       (unsigned long long)fs64_to_cpu(sb,
+				       usb->fs_u11.fs_u2.fs_cstotal.cs_nbfree));
 	printk("\n");
 }
 
@@ -207,7 +201,7 @@ void ufs_print_cylinder_stuff(struct super_block *sb, struct ufs_cylinder_group
 	printk("  nclusterblks  %u\n", fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks));
 	printk("\n");
 }
-#endif /* UFS_SUPER_DEBUG_MORE */
+#endif /* CONFIG_UFS_DEBUG */
 
 static struct super_operations ufs_super_ops;
 
@@ -309,7 +303,7 @@ static int ufs_parse_options (char * options, unsigned * mount_options)
 {
 	char * p;
 	
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 	
 	if (!options)
 		return 1;
@@ -398,7 +392,7 @@ static int ufs_read_cylinder_structures (struct super_block *sb)
 	unsigned size, blks, i;
 	unsigned flags = 0;
 	
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 	
 	uspi = sbi->s_uspi;
 
@@ -451,12 +445,12 @@ static int ufs_read_cylinder_structures (struct super_block *sb)
 		sbi->s_cgno[i] = UFS_CGNO_EMPTY;
 	}
 	for (i = 0; i < uspi->s_ncg; i++) {
-		UFSD(("read cg %u\n", i))
+		UFSD("read cg %u\n", i);
 		if (!(sbi->s_ucg[i] = sb_bread(sb, ufs_cgcmin(i))))
 			goto failed;
 		if (!ufs_cg_chkmagic (sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data))
 			goto failed;
-#ifdef UFS_SUPER_DEBUG_MORE
+#ifdef CONFIG_UFS_DEBUG
 		ufs_print_cylinder_stuff(sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data);
 #endif
 	}
@@ -466,7 +460,7 @@ static int ufs_read_cylinder_structures (struct super_block *sb)
 		sbi->s_cgno[i] = UFS_CGNO_EMPTY;
 	}
 	sbi->s_cg_loaded = 0;
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return 1;
 
 failed:
@@ -479,7 +473,7 @@ failed:
 		for (i = 0; i < UFS_MAX_GROUP_LOADED; i++)
 			kfree (sbi->s_ucpi[i]);
 	}
-	UFSD(("EXIT (FAILED)\n"))
+	UFSD("EXIT (FAILED)\n");
 	return 0;
 }
 
@@ -495,7 +489,7 @@ static void ufs_put_cylinder_structures (struct super_block *sb)
 	unsigned char * base, * space;
 	unsigned blks, size, i;
 	
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 	
 	uspi = sbi->s_uspi;
 
@@ -523,7 +517,7 @@ static void ufs_put_cylinder_structures (struct super_block *sb)
 		brelse (sbi->s_ucg[i]);
 	kfree (sbi->s_ucg);
 	kfree (base);
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 }
 
 static int ufs_fill_super(struct super_block *sb, void *data, int silent)
@@ -544,7 +538,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 	ubh = NULL;
 	flags = 0;
 	
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 		
 	sbi = kmalloc(sizeof(struct ufs_sb_info), GFP_KERNEL);
 	if (!sbi)
@@ -552,7 +546,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_fs_info = sbi;
 	memset(sbi, 0, sizeof(struct ufs_sb_info));
 
-	UFSD(("flag %u\n", (int)(sb->s_flags & MS_RDONLY)))
+	UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY));
 	
 #ifndef CONFIG_UFS_FS_WRITE
 	if (!(sb->s_flags & MS_RDONLY)) {
@@ -593,7 +587,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 	   the rules */
 	switch (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) {
 	case UFS_MOUNT_UFSTYPE_44BSD:
-		UFSD(("ufstype=44bsd\n"))
+		UFSD("ufstype=44bsd\n");
 		uspi->s_fsize = block_size = 512;
 		uspi->s_fmask = ~(512 - 1);
 		uspi->s_fshift = 9;
@@ -602,7 +596,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD;
 		break;
 	case UFS_MOUNT_UFSTYPE_UFS2:
-		UFSD(("ufstype=ufs2\n"));
+		UFSD("ufstype=ufs2\n");
 		super_block_offset=SBLOCK_UFS2;
 		uspi->s_fsize = block_size = 512;
 		uspi->s_fmask = ~(512 - 1);
@@ -617,7 +611,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		break;
 		
 	case UFS_MOUNT_UFSTYPE_SUN:
-		UFSD(("ufstype=sun\n"))
+		UFSD("ufstype=sun\n");
 		uspi->s_fsize = block_size = 1024;
 		uspi->s_fmask = ~(1024 - 1);
 		uspi->s_fshift = 10;
@@ -628,7 +622,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		break;
 
 	case UFS_MOUNT_UFSTYPE_SUNx86:
-		UFSD(("ufstype=sunx86\n"))
+		UFSD("ufstype=sunx86\n");
 		uspi->s_fsize = block_size = 1024;
 		uspi->s_fmask = ~(1024 - 1);
 		uspi->s_fshift = 10;
@@ -639,7 +633,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		break;
 
 	case UFS_MOUNT_UFSTYPE_OLD:
-		UFSD(("ufstype=old\n"))
+		UFSD("ufstype=old\n");
 		uspi->s_fsize = block_size = 1024;
 		uspi->s_fmask = ~(1024 - 1);
 		uspi->s_fshift = 10;
@@ -654,7 +648,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		break;
 	
 	case UFS_MOUNT_UFSTYPE_NEXTSTEP:
-		UFSD(("ufstype=nextstep\n"))
+		UFSD("ufstype=nextstep\n");
 		uspi->s_fsize = block_size = 1024;
 		uspi->s_fmask = ~(1024 - 1);
 		uspi->s_fshift = 10;
@@ -669,7 +663,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		break;
 	
 	case UFS_MOUNT_UFSTYPE_NEXTSTEP_CD:
-		UFSD(("ufstype=nextstep-cd\n"))
+		UFSD("ufstype=nextstep-cd\n");
 		uspi->s_fsize = block_size = 2048;
 		uspi->s_fmask = ~(2048 - 1);
 		uspi->s_fshift = 11;
@@ -684,7 +678,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		break;
 	
 	case UFS_MOUNT_UFSTYPE_OPENSTEP:
-		UFSD(("ufstype=openstep\n"))
+		UFSD("ufstype=openstep\n");
 		uspi->s_fsize = block_size = 1024;
 		uspi->s_fmask = ~(1024 - 1);
 		uspi->s_fshift = 10;
@@ -699,7 +693,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 		break;
 	
 	case UFS_MOUNT_UFSTYPE_HP:
-		UFSD(("ufstype=hp\n"))
+		UFSD("ufstype=hp\n");
 		uspi->s_fsize = block_size = 1024;
 		uspi->s_fmask = ~(1024 - 1);
 		uspi->s_fshift = 10;
@@ -820,11 +814,11 @@ magic_found:
 		ubh = NULL;
 		block_size = uspi->s_fsize;
 		super_block_size = uspi->s_sbsize;
-		UFSD(("another value of block_size or super_block_size %u, %u\n", block_size, super_block_size))
+		UFSD("another value of block_size or super_block_size %u, %u\n", block_size, super_block_size);
 		goto again;
 	}
 
-#ifdef UFS_SUPER_DEBUG_MORE
+#ifdef CONFIG_UFS_DEBUG
         if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
 		ufs2_print_super_stuff(sb,usb);
         else
@@ -842,13 +836,13 @@ magic_found:
 	  (ufs_get_fs_state(sb, usb1, usb3) == (UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time))))) {
 		switch(usb1->fs_clean) {
 		case UFS_FSCLEAN:
-			UFSD(("fs is clean\n"))
+			UFSD("fs is clean\n");
 			break;
 		case UFS_FSSTABLE:
-			UFSD(("fs is stable\n"))
+			UFSD("fs is stable\n");
 			break;
 		case UFS_FSOSF1:
-			UFSD(("fs is DEC OSF/1\n"))
+			UFSD("fs is DEC OSF/1\n");
 			break;
 		case UFS_FSACTIVE:
 			printk("ufs_read_super: fs is active\n");
@@ -901,8 +895,8 @@ magic_found:
 	uspi->s_fmask = fs32_to_cpu(sb, usb1->fs_fmask);
 	uspi->s_bshift = fs32_to_cpu(sb, usb1->fs_bshift);
 	uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift);
-	UFSD(("uspi->s_bshift = %d,uspi->s_fshift = %d", uspi->s_bshift,
-		uspi->s_fshift));
+	UFSD("uspi->s_bshift = %d,uspi->s_fshift = %d", uspi->s_bshift,
+		uspi->s_fshift);
 	uspi->s_fpbshift = fs32_to_cpu(sb, usb1->fs_fragshift);
 	uspi->s_fsbtodb = fs32_to_cpu(sb, usb1->fs_fsbtodb);
 	/* s_sbsize already set */
@@ -935,12 +929,11 @@ magic_found:
 	 * Compute another frequently used values
 	 */
 	uspi->s_fpbmask = uspi->s_fpb - 1;
-	if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
+	if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
 		uspi->s_apbshift = uspi->s_bshift - 3;
-	}
-	else {
+	else
 		uspi->s_apbshift = uspi->s_bshift - 2;
-	}
+
 	uspi->s_2apbshift = uspi->s_apbshift * 2;
 	uspi->s_3apbshift = uspi->s_apbshift * 3;
 	uspi->s_apb = 1 << uspi->s_apbshift;
@@ -975,7 +968,7 @@ magic_found:
 		if (!ufs_read_cylinder_structures(sb))
 			goto failed;
 
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return 0;
 
 dalloc_failed:
@@ -986,11 +979,11 @@ failed:
 	kfree (uspi);
 	kfree(sbi);
 	sb->s_fs_info = NULL;
-	UFSD(("EXIT (FAILED)\n"))
+	UFSD("EXIT (FAILED)\n");
 	return -EINVAL;
 
 failed_nomem:
-	UFSD(("EXIT (NOMEM)\n"))
+	UFSD("EXIT (NOMEM)\n");
 	return -ENOMEM;
 }
 
@@ -1002,7 +995,7 @@ static void ufs_write_super (struct super_block *sb) {
 
 	lock_kernel();
 
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 	flags = UFS_SB(sb)->s_flags;
 	uspi = UFS_SB(sb)->s_uspi;
 	usb1 = ubh_get_usb_first(uspi);
@@ -1017,15 +1010,15 @@ static void ufs_write_super (struct super_block *sb) {
 		ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	}
 	sb->s_dirt = 0;
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	unlock_kernel();
 }
 
-static void ufs_put_super (struct super_block *sb)
+static void ufs_put_super(struct super_block *sb)
 {
 	struct ufs_sb_info * sbi = UFS_SB(sb);
 		
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 
 	if (!(sb->s_flags & MS_RDONLY))
 		ufs_put_cylinder_structures (sb);
@@ -1034,6 +1027,7 @@ static void ufs_put_super (struct super_block *sb)
 	kfree (sbi->s_uspi);
 	kfree (sbi);
 	sb->s_fs_info = NULL;
+UFSD("EXIT\n");
 	return;
 }
 
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 29c66e1e24d..716183d834e 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -49,14 +49,6 @@
 #include "swab.h"
 #include "util.h"
 
-#undef UFS_TRUNCATE_DEBUG
-
-#ifdef UFS_TRUNCATE_DEBUG
-#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
- 
 /*
  * Secure deletion currently doesn't work. It interacts very badly
  * with buffers shared with memory mappings, and for that reason
@@ -82,7 +74,7 @@ static int ufs_trunc_direct (struct inode * inode)
 	unsigned i, tmp;
 	int retry;
 	
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -105,7 +97,7 @@ static int ufs_trunc_direct (struct inode * inode)
 		block2 = ufs_fragstoblks (frag3);
 	}
 
-	UFSD(("frag1 %u, frag2 %u, block1 %u, block2 %u, frag3 %u, frag4 %u\n", frag1, frag2, block1, block2, frag3, frag4))
+	UFSD("frag1 %u, frag2 %u, block1 %u, block2 %u, frag3 %u, frag4 %u\n", frag1, frag2, block1, block2, frag3, frag4);
 
 	if (frag1 >= frag2)
 		goto next1;		
@@ -171,7 +163,7 @@ next1:
 	ufs_free_fragments (inode, tmp, frag4);
  next3:
 
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return retry;
 }
 
@@ -186,7 +178,7 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p)
 	unsigned frag_to_free, free_count;
 	int retry;
 
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 		
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -252,7 +244,7 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p)
 	}
 	ubh_brelse (ind_ubh);
 	
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	
 	return retry;
 }
@@ -266,7 +258,7 @@ static int ufs_trunc_dindirect (struct inode *inode, unsigned offset, __fs32 *p)
 	__fs32 * dind;
 	int retry = 0;
 	
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 	
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -315,7 +307,7 @@ static int ufs_trunc_dindirect (struct inode *inode, unsigned offset, __fs32 *p)
 	}
 	ubh_brelse (dind_bh);
 	
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	
 	return retry;
 }
@@ -330,7 +322,7 @@ static int ufs_trunc_tindirect (struct inode * inode)
 	__fs32 * tind, * p;
 	int retry;
 	
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
@@ -375,7 +367,7 @@ static int ufs_trunc_tindirect (struct inode * inode)
 	}
 	ubh_brelse (tind_bh);
 	
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 	return retry;
 }
 		
@@ -386,7 +378,7 @@ void ufs_truncate (struct inode * inode)
 	struct ufs_sb_private_info * uspi;
 	int retry;
 	
-	UFSD(("ENTER\n"))
+	UFSD("ENTER\n");
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
 
@@ -417,5 +409,5 @@ void ufs_truncate (struct inode * inode)
 	ufsi->i_lastfrag = DIRECT_FRAGMENT;
 	unlock_kernel();
 	mark_inode_dirty(inode);
-	UFSD(("EXIT\n"))
+	UFSD("EXIT\n");
 }
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index f9556bc484e..4685f7cb70b 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -14,15 +14,6 @@
 #include "swab.h"
 #include "util.h"
 
-#undef UFS_UTILS_DEBUG
-
-#ifdef UFS_UTILS_DEBUG
-#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x;
-#else
-#define UFSD(x)
-#endif
-
-
 struct ufs_buffer_head * _ubh_bread_ (struct ufs_sb_private_info * uspi,
 	struct super_block *sb, u64 fragment, u64 size)
 {
-- 
cgit v1.2.3


From dd187a2603d9904ddc410441348f0cfc558a5233 Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Sun, 25 Jun 2006 05:47:25 -0700
Subject: [PATCH] ufs: little directory lookup optimization

This patch make little optimization of ufs_find_entry like "ext2" does.  Save
number of page and reuse it again in the next call.

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ufs/dir.c    | 7 ++++---
 fs/ufs/ialloc.c | 1 +
 fs/ufs/inode.c  | 4 ++--
 3 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 732c3fd2b6f..7f0a0aa6358 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -252,6 +252,7 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, struct dentry *dentry,
 	unsigned long start, n;
 	unsigned long npages = ufs_dir_pages(dir);
 	struct page *page = NULL;
+	struct ufs_inode_info *ui = UFS_I(dir);
 	struct ufs_dir_entry *de;
 
 	UFSD("ENTER, dir_ino %lu, name %s, namlen %u\n", dir->i_ino, name, namelen);
@@ -262,8 +263,8 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, struct dentry *dentry,
 	/* OFFSET_CACHE */
 	*res_page = NULL;
 
-	/* start = ei->i_dir_start_lookup; */
-	start = 0;
+	start = ui->i_dir_start_lookup;
+
 	if (start >= npages)
 		start = 0;
 	n = start;
@@ -295,7 +296,7 @@ out:
 
 found:
 	*res_page = page;
-	/* ei->i_dir_start_lookup = n; */
+	ui->i_dir_start_lookup = n;
 	return de;
 }
 
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index ad017fa2dd2..c684aaad999 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -264,6 +264,7 @@ cg_found:
 	ufsi->i_shadow = 0;
 	ufsi->i_osync = 0;
 	ufsi->i_oeftflag = 0;
+	ufsi->i_dir_start_lookup = 0;
 	memset(&ufsi->i_u1, 0, sizeof(ufsi->i_u1));
 
 	insert_inode_hash(inode);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 01f75446234..c57612d443d 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -628,12 +628,12 @@ void ufs_read_inode (struct inode * inode)
 	ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow);
 	ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag);
 	ufsi->i_lastfrag = (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift;
+	ufsi->i_dir_start_lookup = 0;
 	
 	if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) {
 		for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++)
 			ufsi->i_u1.i_data[i] = ufs_inode->ui_u2.ui_addr.ui_db[i];
-	}
-	else {
+	} else {
 		for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++)
 			ufsi->i_u1.i_symlink[i] = ufs_inode->ui_u2.ui_symlink[i];
 	}
-- 
cgit v1.2.3


From 50aa4eb0b978f4a0283471c776ed812269ac8af5 Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Sun, 25 Jun 2006 05:47:26 -0700
Subject: [PATCH] ufs: i_blocks wrong count

At now UFS code uses DQUOT_* mechanism, but it also update inode->i_blocks
manually, this cause wrong i_blocks value.

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ufs/balloc.c   |  3 ---
 fs/ufs/truncate.c | 25 ++++++++++++-------------
 2 files changed, 12 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 7a4735f591b..62193db3434 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -395,7 +395,6 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
 		if (result) {
 			*p = cpu_to_fs32(sb, result);
 			*err = 0;
-			inode->i_blocks += count << uspi->s_nspfshift;
 			UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count);
 		}
 		unlock_super(sb);
@@ -409,7 +408,6 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
 	result = ufs_add_fragments (inode, tmp, oldcount, newcount, err);
 	if (result) {
 		*err = 0;
-		inode->i_blocks += count << uspi->s_nspfshift;
 		UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count);
 		unlock_super(sb);
 		UFSD("EXIT, result %u\n", result);
@@ -444,7 +442,6 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
 
 		*p = cpu_to_fs32(sb, result);
 		*err = 0;
-		inode->i_blocks += count << uspi->s_nspfshift;
 		UFS_I(inode)->i_lastfrag = max_t(u32, UFS_I(inode)->i_lastfrag, fragment + count);
 		unlock_super(sb);
 		if (newcount < request)
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 716183d834e..b860a29c4ef 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -112,9 +112,8 @@ static int ufs_trunc_direct (struct inode * inode)
 	frag1 = ufs_fragnum (frag1);
 	frag2 = ufs_fragnum (frag2);
 
-	inode->i_blocks -= (frag2-frag1) << uspi->s_nspfshift;
-	mark_inode_dirty(inode);
 	ufs_free_fragments (inode, tmp + frag1, frag2 - frag1);
+	mark_inode_dirty(inode);
 	frag_to_free = tmp + frag1;
 
 next1:
@@ -128,8 +127,7 @@ next1:
 			continue;
 
 		*p = 0;
-		inode->i_blocks -= uspi->s_nspb;
-		mark_inode_dirty(inode);
+
 		if (free_count == 0) {
 			frag_to_free = tmp;
 			free_count = uspi->s_fpb;
@@ -140,6 +138,7 @@ next1:
 			frag_to_free = tmp;
 			free_count = uspi->s_fpb;
 		}
+		mark_inode_dirty(inode);
 	}
 	
 	if (free_count > 0)
@@ -158,9 +157,9 @@ next1:
 	frag4 = ufs_fragnum (frag4);
 
 	*p = 0;
-	inode->i_blocks -= frag4 << uspi->s_nspfshift;
-	mark_inode_dirty(inode);
+
 	ufs_free_fragments (inode, tmp, frag4);
+	mark_inode_dirty(inode);
  next3:
 
 	UFSD("EXIT\n");
@@ -219,7 +218,7 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p)
 			frag_to_free = tmp;
 			free_count = uspi->s_fpb;
 		}
-		inode->i_blocks -= uspi->s_nspb;
+
 		mark_inode_dirty(inode);
 	}
 
@@ -232,9 +231,9 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p)
 	if (i >= uspi->s_apb) {
 		tmp = fs32_to_cpu(sb, *p);
 		*p = 0;
-		inode->i_blocks -= uspi->s_nspb;
-		mark_inode_dirty(inode);
+
 		ufs_free_blocks (inode, tmp, uspi->s_fpb);
+		mark_inode_dirty(inode);
 		ubh_bforget(ind_ubh);
 		ind_ubh = NULL;
 	}
@@ -295,9 +294,9 @@ static int ufs_trunc_dindirect (struct inode *inode, unsigned offset, __fs32 *p)
 	if (i >= uspi->s_apb) {
 		tmp = fs32_to_cpu(sb, *p);
 		*p = 0;
-		inode->i_blocks -= uspi->s_nspb;
+
+		ufs_free_blocks(inode, tmp, uspi->s_fpb);
 		mark_inode_dirty(inode);
-		ufs_free_blocks (inode, tmp, uspi->s_fpb);
 		ubh_bforget(dind_bh);
 		dind_bh = NULL;
 	}
@@ -355,9 +354,9 @@ static int ufs_trunc_tindirect (struct inode * inode)
 	if (i >= uspi->s_apb) {
 		tmp = fs32_to_cpu(sb, *p);
 		*p = 0;
-		inode->i_blocks -= uspi->s_nspb;
+
+		ufs_free_blocks(inode, tmp, uspi->s_fpb);
 		mark_inode_dirty(inode);
-		ufs_free_blocks (inode, tmp, uspi->s_fpb);
 		ubh_bforget(tind_bh);
 		tind_bh = NULL;
 	}
-- 
cgit v1.2.3


From 2e006393ba5b599d9c43f94f8d8989e68131433e Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Sun, 25 Jun 2006 05:47:26 -0700
Subject: [PATCH] ufs: unlock_super without lock

ufs_free_blocks function looks now in so way:
if (err)
 goto failed;
 lock_super();
failed:
 unlock_super();

So if error happen we'll unlock not locked super.

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ufs/balloc.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 62193db3434..ac709e70f64 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -156,7 +156,7 @@ do_more:
 	bit = ufs_dtogd (fragment);
 	if (cgno >= uspi->s_ncg) {
 		ufs_panic (sb, "ufs_free_blocks", "freeing blocks are outside device");
-		goto failed;
+		goto failed_unlock;
 	}
 	end_bit = bit + count;
 	if (end_bit > uspi->s_fpg) {
@@ -167,11 +167,11 @@ do_more:
 
 	ucpi = ufs_load_cylinder (sb, cgno);
 	if (!ucpi) 
-		goto failed;
+		goto failed_unlock;
 	ucg = ubh_get_ucg (UCPI_UBH(ucpi));
 	if (!ufs_cg_chkmagic(sb, ucg)) {
 		ufs_panic (sb, "ufs_free_blocks", "internal error, bad magic number on cg %u", cgno);
-		goto failed;
+		goto failed_unlock;
 	}
 
 	for (i = bit; i < end_bit; i += uspi->s_fpb) {
@@ -210,8 +210,9 @@ do_more:
 	UFSD("EXIT\n");
 	return;
 
-failed:
+failed_unlock:
 	unlock_super (sb);
+failed:
 	UFSD("EXIT (FAILED)\n");
 	return;
 }
-- 
cgit v1.2.3


From 022a6dc5f461a30615bcd1687569abeee7ef8ba2 Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Sun, 25 Jun 2006 05:47:27 -0700
Subject: [PATCH] ufs: zero metadata

Presently if we allocate several "metadata" blocks (pointers to indirect
blocks for example), we fill with zeroes only the first block.  This cause
some problems in "truncate" function.  Also this patch remove some unused
arguments from several functions and add comments.

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ufs/inode.c | 116 +++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 76 insertions(+), 40 deletions(-)

(limited to 'fs')

diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index c57612d443d..72282043a8f 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -152,7 +152,7 @@ out:
 	return ret;
 }
 
-static void ufs_clear_block(struct inode *inode, struct buffer_head *bh)
+static void ufs_clear_frag(struct inode *inode, struct buffer_head *bh)
 {
 	lock_buffer(bh);
 	memset(bh->b_data, 0, inode->i_sb->s_blocksize);
@@ -163,27 +163,52 @@ static void ufs_clear_block(struct inode *inode, struct buffer_head *bh)
 		sync_dirty_buffer(bh);
 }
 
-static struct buffer_head *ufs_inode_getfrag(struct inode *inode,
-					     unsigned int fragment, unsigned int new_fragment,
-					     unsigned int required, int *err, int metadata,
-					     long *phys, int *new, struct page *locked_page)
+static struct buffer_head *
+ufs_clear_frags(struct inode *inode, sector_t beg,
+		unsigned int n)
+{
+	struct buffer_head *res, *bh;
+	sector_t end = beg + n;
+
+	res = sb_getblk(inode->i_sb, beg);
+	ufs_clear_frag(inode, res);
+	for (++beg; beg < end; ++beg) {
+		bh = sb_getblk(inode->i_sb, beg);
+		ufs_clear_frag(inode, bh);
+	}
+	return res;
+}
+
+/**
+ * ufs_inode_getfrag() - allocate new fragment(s)
+ * @inode - pointer to inode
+ * @fragment - number of `fragment' which hold pointer
+ *   to new allocated fragment(s)
+ * @new_fragment - number of new allocated fragment(s)
+ * @required - how many fragment(s) we require
+ * @err - we set it if something wrong
+ * @phys - pointer to where we save physical number of new allocated fragments,
+ *   NULL if we allocate not data(indirect blocks for example).
+ * @new - we set it if we allocate new block
+ * @locked_page - for ufs_new_fragments()
+ */
+static struct buffer_head *
+ufs_inode_getfrag(struct inode *inode, unsigned int fragment,
+		  sector_t new_fragment, unsigned int required, int *err,
+		  long *phys, int *new, struct page *locked_page)
 {
 	struct ufs_inode_info *ufsi = UFS_I(inode);
-	struct super_block * sb;
-	struct ufs_sb_private_info * uspi;
+	struct super_block *sb = inode->i_sb;
+	struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
 	struct buffer_head * result;
 	unsigned block, blockoff, lastfrag, lastblock, lastblockoff;
 	unsigned tmp, goal;
 	__fs32 * p, * p2;
-	unsigned flags = 0;
 
-	UFSD("ENTER, ino %lu, fragment %u, new_fragment %u, required %u\n",
-		inode->i_ino, fragment, new_fragment, required);
+	UFSD("ENTER, ino %lu, fragment %u, new_fragment %llu, required %u, "
+	     "metadata %d\n", inode->i_ino, fragment,
+	     (unsigned long long)new_fragment, required, !phys);
 
-	sb = inode->i_sb;
-	uspi = UFS_SB(sb)->s_uspi;
-
-	flags = UFS_SB(sb)->s_flags;
         /* TODO : to be done for write support
         if ( (flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
              goto ufs2;
@@ -198,7 +223,7 @@ repeat:
 	tmp = fs32_to_cpu(sb, *p);
 	lastfrag = ufsi->i_lastfrag;
 	if (tmp && fragment < lastfrag) {
-		if (metadata) {
+		if (!phys) {
 			result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff);
 			if (tmp == fs32_to_cpu(sb, *p)) {
 				UFSD("EXIT, result %u\n", tmp + blockoff);
@@ -265,9 +290,8 @@ repeat:
 		return NULL;
 	}
 
-	if (metadata) {
-		result = sb_getblk(inode->i_sb, tmp + blockoff);
-		ufs_clear_block(inode, result);
+	if (!phys) {
+		result = ufs_clear_frags(inode, tmp + blockoff, required);
 	} else {
 		*phys = tmp + blockoff;
 		result = NULL;
@@ -298,23 +322,35 @@ repeat2:
      */
 }
 
-static struct buffer_head *ufs_block_getfrag(struct inode *inode, struct buffer_head *bh,
-					     unsigned int fragment, unsigned int new_fragment,
-					     unsigned int blocksize, int * err, int metadata,
-					     long *phys, int *new, struct page *locked_page)
+/**
+ * ufs_inode_getblock() - allocate new block
+ * @inode - pointer to inode
+ * @bh - pointer to block which hold "pointer" to new allocated block
+ * @fragment - number of `fragment' which hold pointer
+ *   to new allocated block
+ * @new_fragment - number of new allocated fragment
+ *  (block will hold this fragment and also uspi->s_fpb-1)
+ * @err - see ufs_inode_getfrag()
+ * @phys - see ufs_inode_getfrag()
+ * @new - see ufs_inode_getfrag()
+ * @locked_page - see ufs_inode_getfrag()
+ */
+static struct buffer_head *
+ufs_inode_getblock(struct inode *inode, struct buffer_head *bh,
+		  unsigned int fragment, sector_t new_fragment, int *err,
+		  long *phys, int *new, struct page *locked_page)
 {
-	struct super_block * sb;
-	struct ufs_sb_private_info * uspi;
+	struct super_block *sb = inode->i_sb;
+	struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
 	struct buffer_head * result;
 	unsigned tmp, goal, block, blockoff;
 	__fs32 * p;
 
-	sb = inode->i_sb;
-	uspi = UFS_SB(sb)->s_uspi;
 	block = ufs_fragstoblks (fragment);
 	blockoff = ufs_fragnum (fragment);
 
-	UFSD("ENTER, ino %lu, fragment %u, new_fragment %u\n", inode->i_ino, fragment, new_fragment);
+	UFSD("ENTER, ino %lu, fragment %u, new_fragment %llu, metadata %d\n",
+	     inode->i_ino, fragment, (unsigned long long)new_fragment, !phys);
 
 	result = NULL;
 	if (!bh)
@@ -330,7 +366,7 @@ static struct buffer_head *ufs_block_getfrag(struct inode *inode, struct buffer_
 repeat:
 	tmp = fs32_to_cpu(sb, *p);
 	if (tmp) {
-		if (metadata) {
+		if (!phys) {
 			result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff);
 			if (tmp == fs32_to_cpu(sb, *p))
 				goto out;
@@ -355,9 +391,8 @@ repeat:
 	}		
 
 
-	if (metadata) {
-		result = sb_getblk(sb, tmp + blockoff);
-		ufs_clear_block(inode, result);
+	if (!phys) {
+		result = ufs_clear_frags(inode, tmp + blockoff, uspi->s_fpb);
 	} else {
 		*phys = tmp + blockoff;
 		*new = 1;
@@ -375,11 +410,12 @@ out:
 	return result;
 }
 
-/*
- * This function gets the block which contains the fragment.
+/**
+ * ufs_getfrag_bloc() - `get_block_t' function, interface between UFS and
+ * readpage, writepage and so on
  */
 
-int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create)
+int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create)
 {
 	struct super_block * sb = inode->i_sb;
 	struct ufs_sb_private_info * uspi = UFS_SB(sb)->s_uspi;
@@ -421,15 +457,15 @@ int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_hea
 	 * it much more readable:
 	 */
 #define GET_INODE_DATABLOCK(x) \
-	ufs_inode_getfrag(inode, x, fragment, 1, &err, 0, &phys, &new, bh_result->b_page)
+	ufs_inode_getfrag(inode, x, fragment, 1, &err, &phys, &new, bh_result->b_page)
 #define GET_INODE_PTR(x) \
-	ufs_inode_getfrag(inode, x, fragment, uspi->s_fpb, &err, 1, NULL, NULL, bh_result->b_page)
+	ufs_inode_getfrag(inode, x, fragment, uspi->s_fpb, &err, NULL, NULL, bh_result->b_page)
 #define GET_INDIRECT_DATABLOCK(x) \
-	ufs_block_getfrag(inode, bh, x, fragment, sb->s_blocksize,	\
-			  &err, 0, &phys, &new, bh_result->b_page);
+	ufs_inode_getblock(inode, bh, x, fragment,	\
+			  &err, &phys, &new, bh_result->b_page);
 #define GET_INDIRECT_PTR(x) \
-	ufs_block_getfrag(inode, bh, x, fragment, sb->s_blocksize,	\
-			  &err, 1, NULL, NULL, bh_result->b_page);
+	ufs_inode_getblock(inode, bh, x, fragment,	\
+			  &err, NULL, NULL, bh_result->b_page);
 
 	if (ptr < UFS_NDIR_FRAGMENT) {
 		bh = GET_INODE_DATABLOCK(ptr);
-- 
cgit v1.2.3


From 96710b29e05f3b470bc4206366021b56e28d5208 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 25 Jun 2006 05:47:28 -0700
Subject: [PATCH] ufs: printk warning fixes

fs/ufs/super.c: In function `ufs_print_super_stuff':
fs/ufs/super.c:103: warning: unsigned int format, different type arg (arg 2)    fs/ufs/super.c: In function `ufs2_print_super_stuff':                           fs/ufs/super.c:147: warning: unsigned int format, different type arg (arg 2)    fs/ufs/super.c: In function `ufs_print_cylinder_stuff':
fs/ufs/super.c:175: warning: unsigned int format, different type arg (arg 2)

Cc: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ufs/super.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 42425999d2d..c8339e56638 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -100,7 +100,7 @@ void ufs_print_super_stuff(struct super_block *sb,
 	struct ufs_super_block_third * usb3)
 {
 	printk("ufs_print_super_stuff\n");
-	printk("size of usb:     %u\n", sizeof(struct ufs_super_block));
+	printk("size of usb:     %zu\n", sizeof(struct ufs_super_block));
 	printk("  magic:         0x%x\n", fs32_to_cpu(sb, usb3->fs_magic));
 	printk("  sblkno:        %u\n", fs32_to_cpu(sb, usb1->fs_sblkno));
 	printk("  cblkno:        %u\n", fs32_to_cpu(sb, usb1->fs_cblkno));
@@ -144,7 +144,7 @@ void ufs2_print_super_stuff(
       struct ufs_super_block *usb)
 {
 	printk("ufs_print_super_stuff\n");
-	printk("size of usb:     %u\n", sizeof(struct ufs_super_block));
+	printk("size of usb:     %zu\n", sizeof(struct ufs_super_block));
 	printk("  magic:         0x%x\n", fs32_to_cpu(sb, usb->fs_magic));
 	printk("  fs_size:   %llu\n",
 	       (unsigned long long)fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_size));
@@ -172,7 +172,7 @@ void ufs2_print_super_stuff(
 void ufs_print_cylinder_stuff(struct super_block *sb, struct ufs_cylinder_group *cg)
 {
 	printk("\nufs_print_cylinder_stuff\n");
-	printk("size of ucg: %u\n", sizeof(struct ufs_cylinder_group));
+	printk("size of ucg: %zu\n", sizeof(struct ufs_cylinder_group));
 	printk("  magic:        %x\n", fs32_to_cpu(sb, cg->cg_magic));
 	printk("  time:         %u\n", fs32_to_cpu(sb, cg->cg_time));
 	printk("  cgx:          %u\n", fs32_to_cpu(sb, cg->cg_cgx));
-- 
cgit v1.2.3


From f391475812ba39afa322c835217ffe936f5e754a Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Sun, 25 Jun 2006 05:47:28 -0700
Subject: [PATCH] ufs: missed brelse and wrong baseblk

This patch fixes two bugs, which introduced by previous patches:

1) Missed "brelse"

2) Sometimes "baseblk" may be wrongly calculated, if i_size is equal to
   zero, which lead infinite cycle in "mpage_writepages".

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ufs/balloc.c | 12 +++++-------
 fs/ufs/inode.c  |  1 +
 2 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index ac709e70f64..99d881812ad 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -269,20 +269,17 @@ out:
  * We can come here from ufs_writepage or ufs_prepare_write,
  * locked_page is argument of these functions, so we already lock it.
  */
-static void ufs_change_blocknr(struct inode *inode, unsigned int count,
-			       unsigned int oldb, unsigned int newb,
-			       struct page *locked_page)
+static void ufs_change_blocknr(struct inode *inode, unsigned int baseblk,
+			       unsigned int count, unsigned int oldb,
+			       unsigned int newb, struct page *locked_page)
 {
 	unsigned int blk_per_page = 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	sector_t baseblk;
 	struct address_space *mapping = inode->i_mapping;
 	pgoff_t index, cur_index = locked_page->index;
 	unsigned int i, j;
 	struct page *page;
 	struct buffer_head *head, *bh;
 
-	baseblk = ((i_size_read(inode) - 1) >> inode->i_blkbits) + 1 - count;
-
 	UFSD("ENTER, ino %lu, count %u, oldb %u, newb %u\n",
 	      inode->i_ino, count, oldb, newb);
 
@@ -439,7 +436,8 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
 	}
 	result = ufs_alloc_fragments (inode, cgno, goal, request, err);
 	if (result) {
-		ufs_change_blocknr(inode, oldcount, tmp, result, locked_page);
+		ufs_change_blocknr(inode, fragment - oldcount, oldcount, tmp,
+				   result, locked_page);
 
 		*p = cpu_to_fs32(sb, result);
 		*err = 0;
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 72282043a8f..01c5f19cbab 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -175,6 +175,7 @@ ufs_clear_frags(struct inode *inode, sector_t beg,
 	for (++beg; beg < end; ++beg) {
 		bh = sb_getblk(inode->i_sb, beg);
 		ufs_clear_frag(inode, bh);
+		brelse(bh);
 	}
 	return res;
 }
-- 
cgit v1.2.3


From 647b7e87b56f594daf648f44abfbeeb5eb6a9457 Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Sun, 25 Jun 2006 05:47:29 -0700
Subject: [PATCH] ufs: one way to access super block

Super block of UFS usually has size >512, because of fragment size may be 512,
this cause some problems.

Currently, there are two methods to work with ufs super block:

1) split structure which describes ufs super blocks into structures with
   size <=512

2) use one structure which describes ufs super block, and hope that array
   of "buffer_head" which holds "super block", has such construction:

	bh[n]->b_data + bh[n]->b_size == bh[n + 1]->b_data

The second variant may cause some problems in the future, and usage of two
variants cause unnecessary code duplication.

This patch remove the second variant.  Also patch contains some CodingStyle
fixes.

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ufs/super.c | 217 +++++++++++++++++++++++++++------------------------------
 fs/ufs/util.h  |  52 +++++---------
 2 files changed, 121 insertions(+), 148 deletions(-)

(limited to 'fs')

diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index c8339e56638..3aadbd3167a 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -94,82 +94,77 @@
 /*
  * Print contents of ufs_super_block, useful for debugging
  */
-void ufs_print_super_stuff(struct super_block *sb,
-	struct ufs_super_block_first * usb1,
-	struct ufs_super_block_second * usb2, 
-	struct ufs_super_block_third * usb3)
+static void ufs_print_super_stuff(struct super_block *sb, unsigned flags,
+				  struct ufs_super_block_first *usb1,
+				  struct ufs_super_block_second *usb2,
+				  struct ufs_super_block_third *usb3)
 {
 	printk("ufs_print_super_stuff\n");
-	printk("size of usb:     %zu\n", sizeof(struct ufs_super_block));
-	printk("  magic:         0x%x\n", fs32_to_cpu(sb, usb3->fs_magic));
-	printk("  sblkno:        %u\n", fs32_to_cpu(sb, usb1->fs_sblkno));
-	printk("  cblkno:        %u\n", fs32_to_cpu(sb, usb1->fs_cblkno));
-	printk("  iblkno:        %u\n", fs32_to_cpu(sb, usb1->fs_iblkno));
-	printk("  dblkno:        %u\n", fs32_to_cpu(sb, usb1->fs_dblkno));
-	printk("  cgoffset:      %u\n", fs32_to_cpu(sb, usb1->fs_cgoffset));
-	printk("  ~cgmask:       0x%x\n", ~fs32_to_cpu(sb, usb1->fs_cgmask));
-	printk("  size:          %u\n", fs32_to_cpu(sb, usb1->fs_size));
-	printk("  dsize:         %u\n", fs32_to_cpu(sb, usb1->fs_dsize));
-	printk("  ncg:           %u\n", fs32_to_cpu(sb, usb1->fs_ncg));
-	printk("  bsize:         %u\n", fs32_to_cpu(sb, usb1->fs_bsize));
-	printk("  fsize:         %u\n", fs32_to_cpu(sb, usb1->fs_fsize));
-	printk("  frag:          %u\n", fs32_to_cpu(sb, usb1->fs_frag));
-	printk("  fragshift:     %u\n", fs32_to_cpu(sb, usb1->fs_fragshift));
-	printk("  ~fmask:        %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask));
-	printk("  fshift:        %u\n", fs32_to_cpu(sb, usb1->fs_fshift));
-	printk("  sbsize:        %u\n", fs32_to_cpu(sb, usb1->fs_sbsize));
-	printk("  spc:           %u\n", fs32_to_cpu(sb, usb1->fs_spc));
-	printk("  cpg:           %u\n", fs32_to_cpu(sb, usb1->fs_cpg));
-	printk("  ipg:           %u\n", fs32_to_cpu(sb, usb1->fs_ipg));
-	printk("  fpg:           %u\n", fs32_to_cpu(sb, usb1->fs_fpg));
-	printk("  csaddr:        %u\n", fs32_to_cpu(sb, usb1->fs_csaddr));
-	printk("  cssize:        %u\n", fs32_to_cpu(sb, usb1->fs_cssize));
-	printk("  cgsize:        %u\n", fs32_to_cpu(sb, usb1->fs_cgsize));
-	printk("  fstodb:        %u\n", fs32_to_cpu(sb, usb1->fs_fsbtodb));
-	printk("  contigsumsize: %d\n", fs32_to_cpu(sb, usb3->fs_u2.fs_44.fs_contigsumsize));
-	printk("  postblformat:  %u\n", fs32_to_cpu(sb, usb3->fs_postblformat));
-	printk("  nrpos:         %u\n", fs32_to_cpu(sb, usb3->fs_nrpos));
-	printk("  ndir           %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir));
-	printk("  nifree         %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree));
-	printk("  nbfree         %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree));
-	printk("  nffree         %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree));
-	printk("\n");
-}
-
-/*
- * Print contents of ufs2 ufs_super_block, useful for debugging
- */
-void ufs2_print_super_stuff(
-     struct super_block *sb,
-      struct ufs_super_block *usb)
-{
-	printk("ufs_print_super_stuff\n");
-	printk("size of usb:     %zu\n", sizeof(struct ufs_super_block));
-	printk("  magic:         0x%x\n", fs32_to_cpu(sb, usb->fs_magic));
-	printk("  fs_size:   %llu\n",
-	       (unsigned long long)fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_size));
-	printk("  fs_dsize:  %llu\n",
-	       (unsigned long long)fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_dsize));
-	printk("  bsize:         %u\n", fs32_to_cpu(sb, usb->fs_bsize));
-	printk("  fsize:         %u\n", fs32_to_cpu(sb, usb->fs_fsize));
-	printk("  fs_volname:  %s\n", usb->fs_u11.fs_u2.fs_volname);
-	printk("  fs_fsmnt:  %s\n", usb->fs_u11.fs_u2.fs_fsmnt);
-	printk("  fs_sblockloc: %llu\n",
-	       (unsigned long long)fs64_to_cpu(sb,
-					       usb->fs_u11.fs_u2.fs_sblockloc));
-	printk("  cs_ndir(No of dirs):  %llu\n",
-	       (unsigned long long)fs64_to_cpu(sb,
-				         usb->fs_u11.fs_u2.fs_cstotal.cs_ndir));
-	printk("  cs_nbfree(No of free blocks):  %llu\n",
-	       (unsigned long long)fs64_to_cpu(sb,
-				       usb->fs_u11.fs_u2.fs_cstotal.cs_nbfree));
+	printk("  magic:     0x%x\n", fs32_to_cpu(sb, usb3->fs_magic));
+	if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
+		printk("  fs_size:   %llu\n", (unsigned long long)
+		       fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size));
+		printk("  fs_dsize:  %llu\n", (unsigned long long)
+		       fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize));
+		printk("  bsize:         %u\n",
+		       fs32_to_cpu(sb, usb1->fs_bsize));
+		printk("  fsize:         %u\n",
+		       fs32_to_cpu(sb, usb1->fs_fsize));
+		printk("  fs_volname:  %s\n", usb2->fs_un.fs_u2.fs_volname);
+		printk("  fs_sblockloc: %llu\n", (unsigned long long)
+		       fs64_to_cpu(sb, usb2->fs_un.fs_u2.fs_sblockloc));
+		printk("  cs_ndir(No of dirs):  %llu\n", (unsigned long long)
+		       fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir));
+		printk("  cs_nbfree(No of free blocks):  %llu\n",
+		       (unsigned long long)
+		       fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree));
+	} else {
+		printk(" sblkno:      %u\n", fs32_to_cpu(sb, usb1->fs_sblkno));
+		printk(" cblkno:      %u\n", fs32_to_cpu(sb, usb1->fs_cblkno));
+		printk(" iblkno:      %u\n", fs32_to_cpu(sb, usb1->fs_iblkno));
+		printk(" dblkno:      %u\n", fs32_to_cpu(sb, usb1->fs_dblkno));
+		printk(" cgoffset:    %u\n",
+		       fs32_to_cpu(sb, usb1->fs_cgoffset));
+		printk(" ~cgmask:     0x%x\n",
+		       ~fs32_to_cpu(sb, usb1->fs_cgmask));
+		printk(" size:        %u\n", fs32_to_cpu(sb, usb1->fs_size));
+		printk(" dsize:       %u\n", fs32_to_cpu(sb, usb1->fs_dsize));
+		printk(" ncg:         %u\n", fs32_to_cpu(sb, usb1->fs_ncg));
+		printk(" bsize:       %u\n", fs32_to_cpu(sb, usb1->fs_bsize));
+		printk(" fsize:       %u\n", fs32_to_cpu(sb, usb1->fs_fsize));
+		printk(" frag:        %u\n", fs32_to_cpu(sb, usb1->fs_frag));
+		printk(" fragshift:   %u\n",
+		       fs32_to_cpu(sb, usb1->fs_fragshift));
+		printk(" ~fmask:      %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask));
+		printk(" fshift:      %u\n", fs32_to_cpu(sb, usb1->fs_fshift));
+		printk(" sbsize:      %u\n", fs32_to_cpu(sb, usb1->fs_sbsize));
+		printk(" spc:         %u\n", fs32_to_cpu(sb, usb1->fs_spc));
+		printk(" cpg:         %u\n", fs32_to_cpu(sb, usb1->fs_cpg));
+		printk(" ipg:         %u\n", fs32_to_cpu(sb, usb1->fs_ipg));
+		printk(" fpg:         %u\n", fs32_to_cpu(sb, usb1->fs_fpg));
+		printk(" csaddr:      %u\n", fs32_to_cpu(sb, usb1->fs_csaddr));
+		printk(" cssize:      %u\n", fs32_to_cpu(sb, usb1->fs_cssize));
+		printk(" cgsize:      %u\n", fs32_to_cpu(sb, usb1->fs_cgsize));
+		printk(" fstodb:      %u\n",
+		       fs32_to_cpu(sb, usb1->fs_fsbtodb));
+		printk(" nrpos:       %u\n", fs32_to_cpu(sb, usb3->fs_nrpos));
+		printk(" ndir         %u\n",
+		       fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir));
+		printk(" nifree       %u\n",
+		       fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree));
+		printk(" nbfree       %u\n",
+		       fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree));
+		printk(" nffree       %u\n",
+		       fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree));
+	}
 	printk("\n");
 }
 
 /*
  * Print contents of ufs_cylinder_group, useful for debugging
  */
-void ufs_print_cylinder_stuff(struct super_block *sb, struct ufs_cylinder_group *cg)
+static void ufs_print_cylinder_stuff(struct super_block *sb,
+				     struct ufs_cylinder_group *cg)
 {
 	printk("\nufs_print_cylinder_stuff\n");
 	printk("size of ucg: %zu\n", sizeof(struct ufs_cylinder_group));
@@ -196,11 +191,17 @@ void ufs_print_cylinder_stuff(struct super_block *sb, struct ufs_cylinder_group
 	printk("  iuseoff:      %u\n", fs32_to_cpu(sb, cg->cg_iusedoff));
 	printk("  freeoff:      %u\n", fs32_to_cpu(sb, cg->cg_freeoff));
 	printk("  nextfreeoff:  %u\n", fs32_to_cpu(sb, cg->cg_nextfreeoff));
-	printk("  clustersumoff %u\n", fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff));
-	printk("  clusteroff    %u\n", fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff));
-	printk("  nclusterblks  %u\n", fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks));
+	printk("  clustersumoff %u\n",
+	       fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff));
+	printk("  clusteroff    %u\n",
+	       fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff));
+	printk("  nclusterblks  %u\n",
+	       fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks));
 	printk("\n");
 }
+#else
+#  define ufs_print_super_stuff(sb, flags, usb1, usb2, usb3) /**/
+#  define ufs_print_cylinder_stuff(sb, cg) /**/
 #endif /* CONFIG_UFS_DEBUG */
 
 static struct super_operations ufs_super_ops;
@@ -384,9 +385,9 @@ static int ufs_parse_options (char * options, unsigned * mount_options)
  */
 static int ufs_read_cylinder_structures (struct super_block *sb)
 {
-	struct ufs_sb_info * sbi = UFS_SB(sb);
-	struct ufs_sb_private_info * uspi;
-	struct ufs_super_block *usb;
+	struct ufs_sb_info *sbi = UFS_SB(sb);
+	struct ufs_sb_private_info *uspi = sbi->s_uspi;
+	struct ufs_super_block_third *usb3;
 	struct ufs_buffer_head * ubh;
 	unsigned char * base, * space;
 	unsigned size, blks, i;
@@ -394,10 +395,7 @@ static int ufs_read_cylinder_structures (struct super_block *sb)
 	
 	UFSD("ENTER\n");
 	
-	uspi = sbi->s_uspi;
-
-	usb  = (struct ufs_super_block *)
-		((struct ufs_buffer_head *)uspi)->bh[0]->b_data;
+	usb3 = ubh_get_usb_third(uspi);
 
         flags = UFS_SB(sb)->s_flags;
 	
@@ -418,7 +416,7 @@ static int ufs_read_cylinder_structures (struct super_block *sb)
 
 		if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) 
 			ubh = ubh_bread(sb,
-				fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_csaddr) + i, size);
+				fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_csaddr) + i, size);
 		else 
 			ubh = ubh_bread(sb, uspi->s_csaddr + i, size);
 		
@@ -450,9 +448,8 @@ static int ufs_read_cylinder_structures (struct super_block *sb)
 			goto failed;
 		if (!ufs_cg_chkmagic (sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data))
 			goto failed;
-#ifdef CONFIG_UFS_DEBUG
+
 		ufs_print_cylinder_stuff(sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data);
-#endif
 	}
 	for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) {
 		if (!(sbi->s_ucpi[i] = kmalloc (sizeof(struct ufs_cg_private_info), GFP_KERNEL)))
@@ -818,12 +815,8 @@ magic_found:
 		goto again;
 	}
 
-#ifdef CONFIG_UFS_DEBUG
-        if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
-		ufs2_print_super_stuff(sb,usb);
-        else
-		ufs_print_super_stuff(sb, usb1, usb2, usb3);
-#endif
+
+	ufs_print_super_stuff(sb, flags, usb1, usb2, usb3);
 
 	/*
 	 * Check, if file system was correctly unmounted.
@@ -878,10 +871,9 @@ magic_found:
 	uspi->s_cgmask = fs32_to_cpu(sb, usb1->fs_cgmask);
 
 	if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
-		uspi->s_u2_size  = fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_size);
-		uspi->s_u2_dsize = fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_dsize);
-	}
-	else {
+		uspi->s_u2_size  = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size);
+		uspi->s_u2_dsize = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize);
+	} else {
 		uspi->s_size  =  fs32_to_cpu(sb, usb1->fs_size);
 		uspi->s_dsize =  fs32_to_cpu(sb, usb1->fs_dsize);
 	}
@@ -916,8 +908,8 @@ magic_found:
 	uspi->s_spc = fs32_to_cpu(sb, usb1->fs_spc);
 	uspi->s_ipg = fs32_to_cpu(sb, usb1->fs_ipg);
 	uspi->s_fpg = fs32_to_cpu(sb, usb1->fs_fpg);
-	uspi->s_cpc = fs32_to_cpu(sb, usb2->fs_cpc);
-	uspi->s_contigsumsize = fs32_to_cpu(sb, usb3->fs_u2.fs_44.fs_contigsumsize);
+	uspi->s_cpc = fs32_to_cpu(sb, usb2->fs_un.fs_u1.fs_cpc);
+	uspi->s_contigsumsize = fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_contigsumsize);
 	uspi->s_qbmask = ufs_get_fs_qbmask(sb, usb3);
 	uspi->s_qfmask = ufs_get_fs_qfmask(sb, usb3);
 	uspi->s_postblformat = fs32_to_cpu(sb, usb3->fs_postblformat);
@@ -949,7 +941,7 @@ magic_found:
 	if ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) ==
 	    UFS_MOUNT_UFSTYPE_44BSD)
 		uspi->s_maxsymlinklen =
-		    fs32_to_cpu(sb, usb3->fs_u2.fs_44.fs_maxsymlinklen);
+		    fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen);
 	
 	sbi->s_flags = flags;
 
@@ -987,7 +979,8 @@ failed_nomem:
 	return -ENOMEM;
 }
 
-static void ufs_write_super (struct super_block *sb) {
+static void ufs_write_super(struct super_block *sb)
+{
 	struct ufs_sb_private_info * uspi;
 	struct ufs_super_block_first * usb1;
 	struct ufs_super_block_third * usb3;
@@ -1027,7 +1020,7 @@ static void ufs_put_super(struct super_block *sb)
 	kfree (sbi->s_uspi);
 	kfree (sbi);
 	sb->s_fs_info = NULL;
-UFSD("EXIT\n");
+	UFSD("EXIT\n");
 	return;
 }
 
@@ -1107,31 +1100,29 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	return 0;
 }
 
-static int ufs_statfs (struct dentry *dentry, struct kstatfs *buf)
+static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
-	struct ufs_sb_private_info * uspi;
-	struct ufs_super_block_first * usb1;
-	struct ufs_super_block * usb;
-	unsigned  flags = 0;
+	struct ufs_sb_private_info *uspi= UFS_SB(sb)->s_uspi;
+	unsigned  flags = UFS_SB(sb)->s_flags;
+	struct ufs_super_block_first *usb1;
+	struct ufs_super_block_second *usb2;
+	struct ufs_super_block_third *usb3;
 
 	lock_kernel();
 
-	uspi = UFS_SB(sb)->s_uspi;
-	usb1 = ubh_get_usb_first (uspi);
-	usb  = (struct ufs_super_block *)
-		((struct ufs_buffer_head *)uspi)->bh[0]->b_data ;
+	usb1 = ubh_get_usb_first(uspi);
+	usb2 = ubh_get_usb_second(uspi);
+	usb3 = ubh_get_usb_third(uspi);
 	
-	flags = UFS_SB(sb)->s_flags;
 	if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
 		buf->f_type = UFS2_MAGIC;
-		buf->f_blocks = fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_dsize);
-		buf->f_bfree = ufs_blkstofrags(fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_cstotal.cs_nbfree)) +
-			fs64_to_cpu(sb, usb->fs_u11.fs_u2.fs_cstotal.cs_nffree);
-		buf->f_ffree = fs64_to_cpu(sb,
-        		usb->fs_u11.fs_u2.fs_cstotal.cs_nifree);
-	}
-	else {
+		buf->f_blocks = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize);
+		buf->f_bfree = ufs_blkstofrags(
+			fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree)) +
+			fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree);
+		buf->f_ffree = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree);
+	} else {
 		buf->f_type = UFS_MAGIC;
 		buf->f_blocks = uspi->s_dsize;
 		buf->f_bfree = ufs_blkstofrags(fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree)) +
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index e95d1c46461..eacd5e37b8e 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -39,12 +39,12 @@ ufs_get_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1,
 {
 	switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
 	case UFS_ST_SUN:
-		return fs32_to_cpu(sb, usb3->fs_u2.fs_sun.fs_state);
+		return fs32_to_cpu(sb, usb3->fs_un2.fs_sun.fs_state);
 	case UFS_ST_SUNx86:
 		return fs32_to_cpu(sb, usb1->fs_u1.fs_sunx86.fs_state);
 	case UFS_ST_44BSD:
 	default:
-		return fs32_to_cpu(sb, usb3->fs_u2.fs_44.fs_state);
+		return fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_state);
 	}
 }
 
@@ -54,13 +54,13 @@ ufs_set_fs_state(struct super_block *sb, struct ufs_super_block_first *usb1,
 {
 	switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
 	case UFS_ST_SUN:
-		usb3->fs_u2.fs_sun.fs_state = cpu_to_fs32(sb, value);
+		usb3->fs_un2.fs_sun.fs_state = cpu_to_fs32(sb, value);
 		break;
 	case UFS_ST_SUNx86:
 		usb1->fs_u1.fs_sunx86.fs_state = cpu_to_fs32(sb, value);
 		break;
 	case UFS_ST_44BSD:
-		usb3->fs_u2.fs_44.fs_state = cpu_to_fs32(sb, value);
+		usb3->fs_un2.fs_44.fs_state = cpu_to_fs32(sb, value);
 		break;
 	}
 }
@@ -70,7 +70,7 @@ ufs_get_fs_npsect(struct super_block *sb, struct ufs_super_block_first *usb1,
 		  struct ufs_super_block_third *usb3)
 {
 	if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86)
-		return fs32_to_cpu(sb, usb3->fs_u2.fs_sunx86.fs_npsect);
+		return fs32_to_cpu(sb, usb3->fs_un2.fs_sunx86.fs_npsect);
 	else
 		return fs32_to_cpu(sb, usb1->fs_u1.fs_sun.fs_npsect);
 }
@@ -82,16 +82,16 @@ ufs_get_fs_qbmask(struct super_block *sb, struct ufs_super_block_third *usb3)
 
 	switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
 	case UFS_ST_SUN:
-		((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_sun.fs_qbmask[0];
-		((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_sun.fs_qbmask[1];
+		((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sun.fs_qbmask[0];
+		((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sun.fs_qbmask[1];
 		break;
 	case UFS_ST_SUNx86:
-		((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_sunx86.fs_qbmask[0];
-		((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_sunx86.fs_qbmask[1];
+		((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sunx86.fs_qbmask[0];
+		((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sunx86.fs_qbmask[1];
 		break;
 	case UFS_ST_44BSD:
-		((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_44.fs_qbmask[0];
-		((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_44.fs_qbmask[1];
+		((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_44.fs_qbmask[0];
+		((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_44.fs_qbmask[1];
 		break;
 	}
 
@@ -105,16 +105,16 @@ ufs_get_fs_qfmask(struct super_block *sb, struct ufs_super_block_third *usb3)
 
 	switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
 	case UFS_ST_SUN:
-		((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_sun.fs_qfmask[0];
-		((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_sun.fs_qfmask[1];
+		((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sun.fs_qfmask[0];
+		((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sun.fs_qfmask[1];
 		break;
 	case UFS_ST_SUNx86:
-		((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_sunx86.fs_qfmask[0];
-		((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_sunx86.fs_qfmask[1];
+		((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_sunx86.fs_qfmask[0];
+		((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_sunx86.fs_qfmask[1];
 		break;
 	case UFS_ST_44BSD:
-		((__fs32 *)&tmp)[0] = usb3->fs_u2.fs_44.fs_qfmask[0];
-		((__fs32 *)&tmp)[1] = usb3->fs_u2.fs_44.fs_qfmask[1];
+		((__fs32 *)&tmp)[0] = usb3->fs_un2.fs_44.fs_qfmask[0];
+		((__fs32 *)&tmp)[1] = usb3->fs_un2.fs_44.fs_qfmask[1];
 		break;
 	}
 
@@ -302,24 +302,6 @@ static inline void *get_usb_offset(struct ufs_sb_private_info *uspi,
 #define ubh_blkmap(ubh,begin,bit) \
 	((*ubh_get_addr(ubh, (begin) + ((bit) >> 3)) >> ((bit) & 7)) & (0xff >> (UFS_MAXFRAG - uspi->s_fpb)))
 
-
-/*
- * Macros for access to superblock array structures
- */
-#define ubh_postbl(ubh,cylno,i) \
-	((uspi->s_postblformat != UFS_DYNAMICPOSTBLFMT) \
-	? (*(__s16*)(ubh_get_addr(ubh, \
-	(unsigned)(&((struct ufs_super_block *)0)->fs_opostbl) \
-	+ (((cylno) * 16 + (i)) << 1) ) )) \
-	: (*(__s16*)(ubh_get_addr(ubh, \
-	uspi->s_postbloff + (((cylno) * uspi->s_nrpos + (i)) << 1) ))))
-
-#define ubh_rotbl(ubh,i) \
-	((uspi->s_postblformat != UFS_DYNAMICPOSTBLFMT) \
-	? (*(__u8*)(ubh_get_addr(ubh, \
-	(unsigned)(&((struct ufs_super_block *)0)->fs_space) + (i)))) \
-	: (*(__u8*)(ubh_get_addr(ubh, uspi->s_rotbloff + (i)))))
-
 /*
  * Determine the number of available frags given a
  * percentage to hold in reserve.
-- 
cgit v1.2.3


From 577a82752f95a5680d7c14569ffd3fd630d9fb22 Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Sun, 25 Jun 2006 05:47:30 -0700
Subject: [PATCH] ufs: fsync implementation

Presently ufs doesn't support "fsync", this make some applications unhappy,
for example vim.  This patch fixes this situation.

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ufs/file.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'fs')

diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 312fd3f8631..0e5001512a9 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -25,6 +25,26 @@
 
 #include <linux/fs.h>
 #include <linux/ufs_fs.h>
+#include <linux/buffer_head.h>	/* for sync_mapping_buffers() */
+
+static int ufs_sync_file(struct file *file, struct dentry *dentry, int datasync)
+{
+	struct inode *inode = dentry->d_inode;
+	int err;
+	int ret;
+
+	ret = sync_mapping_buffers(inode->i_mapping);
+	if (!(inode->i_state & I_DIRTY))
+		return ret;
+	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+		return ret;
+
+	err = ufs_sync_inode(inode);
+	if (ret == 0)
+		ret = err;
+	return ret;
+}
+
 
 /*
  * We have mostly NULL's here: the current defaults are ok for
@@ -37,6 +57,7 @@ const struct file_operations ufs_file_operations = {
 	.write		= generic_file_write,
 	.mmap		= generic_file_mmap,
 	.open           = generic_file_open,
+	.fsync		= ufs_sync_file,
 	.sendfile	= generic_file_sendfile,
 };
 
-- 
cgit v1.2.3


From ee3ffd6c126323693b3b32a71a1f1acfce30bd66 Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Sun, 25 Jun 2006 05:47:30 -0700
Subject: [PATCH] ufs: make fsck -f happy

ufs super block contains some statistic about file systems, like amount of
directories, free blocks, inodes and so on.

UFS1 hold this information in one location and uses 32bit integers for such
information, UFS2 hold statistic in another location and uses 64bit integers.

There is transition variant, if UFS1 has type 44BSD and flags field in super
block has some special value this mean that we work with statistic like UFS2
does.  and this also means that nobody care about old(UFS1) statistic.

So if start fsck against such file system, after usage linux ufs driver, it
found error: at now only UFS1 like statistic is updated.

This patch should fix this.  Also it contains some minor cleanup: CodingSytle
and remove unused variables.

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ufs/balloc.c |  24 +++++-----
 fs/ufs/ialloc.c |   8 ++--
 fs/ufs/super.c  | 140 +++++++++++++++++++++++++++++++++++++++++---------------
 fs/ufs/util.h   |  10 ++--
 4 files changed, 126 insertions(+), 56 deletions(-)

(limited to 'fs')

diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 99d881812ad..cb36d2dadef 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -83,7 +83,7 @@ void ufs_free_fragments(struct inode *inode, unsigned fragment, unsigned count)
 
 	
 	fs32_add(sb, &ucg->cg_cs.cs_nffree, count);
-	fs32_add(sb, &usb1->fs_cstotal.cs_nffree, count);
+	uspi->cs_total.cs_nffree += count;
 	fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
 	blkmap = ubh_blkmap (UCPI_UBH(ucpi), ucpi->c_freeoff, bbase);
 	ufs_fragacct(sb, blkmap, ucg->cg_frsum, 1);
@@ -94,12 +94,12 @@ void ufs_free_fragments(struct inode *inode, unsigned fragment, unsigned count)
 	blkno = ufs_fragstoblks (bbase);
 	if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) {
 		fs32_sub(sb, &ucg->cg_cs.cs_nffree, uspi->s_fpb);
-		fs32_sub(sb, &usb1->fs_cstotal.cs_nffree, uspi->s_fpb);
+		uspi->cs_total.cs_nffree -= uspi->s_fpb;
 		fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, uspi->s_fpb);
 		if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
 			ufs_clusteracct (sb, ucpi, blkno, 1);
 		fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
-		fs32_add(sb, &usb1->fs_cstotal.cs_nbfree, 1);
+		uspi->cs_total.cs_nbfree++;
 		fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1);
 		cylno = ufs_cbtocylno (bbase);
 		fs16_add(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(bbase)), 1);
@@ -185,7 +185,7 @@ do_more:
 		DQUOT_FREE_BLOCK(inode, uspi->s_fpb);
 
 		fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
-		fs32_add(sb, &usb1->fs_cstotal.cs_nbfree, 1);
+		uspi->cs_total.cs_nbfree++;
 		fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1);
 		cylno = ufs_cbtocylno(i);
 		fs16_add(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(i)), 1);
@@ -372,7 +372,7 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
 	/*
 	 * There is not enough space for user on the device
 	 */
-	if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(usb1, UFS_MINFREE) <= 0) {
+	if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) {
 		unlock_super (sb);
 		UFSD("EXIT (FAILED)\n");
 		return 0;
@@ -418,8 +418,8 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
 	switch (fs32_to_cpu(sb, usb1->fs_optim)) {
 	    case UFS_OPTSPACE:
 		request = newcount;
-		if (uspi->s_minfree < 5 || fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree) 
-		    > uspi->s_dsize * uspi->s_minfree / (2 * 100) )
+		if (uspi->s_minfree < 5 || uspi->cs_total.cs_nffree
+		    > uspi->s_dsize * uspi->s_minfree / (2 * 100))
 			break;
 		usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME);
 		break;
@@ -428,7 +428,7 @@ unsigned ufs_new_fragments(struct inode * inode, __fs32 * p, unsigned fragment,
 	
 	    case UFS_OPTTIME:
 		request = uspi->s_fpb;
-		if (fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree) < uspi->s_dsize *
+		if (uspi->cs_total.cs_nffree < uspi->s_dsize *
 		    (uspi->s_minfree - 2) / 100)
 			break;
 		usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME);
@@ -516,7 +516,7 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
 
 	fs32_sub(sb, &ucg->cg_cs.cs_nffree, count);
 	fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
-	fs32_sub(sb, &usb1->fs_cstotal.cs_nffree, count);
+	uspi->cs_total.cs_nffree -= count;
 	
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
@@ -618,7 +618,7 @@ cg_found:
 		DQUOT_FREE_BLOCK(inode, i);
 
 		fs32_add(sb, &ucg->cg_cs.cs_nffree, i);
-		fs32_add(sb, &usb1->fs_cstotal.cs_nffree, i);
+		uspi->cs_total.cs_nffree += i;
 		fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, i);
 		fs32_add(sb, &ucg->cg_frsum[i], 1);
 		goto succed;
@@ -635,7 +635,7 @@ cg_found:
 		ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i);
 	
 	fs32_sub(sb, &ucg->cg_cs.cs_nffree, count);
-	fs32_sub(sb, &usb1->fs_cstotal.cs_nffree, count);
+	uspi->cs_total.cs_nffree -= count;
 	fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
 	fs32_sub(sb, &ucg->cg_frsum[allocsize], 1);
 
@@ -703,7 +703,7 @@ gotit:
 	}
 
 	fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1);
-	fs32_sub(sb, &usb1->fs_cstotal.cs_nbfree, 1);
+	uspi->cs_total.cs_nbfree--;
 	fs32_sub(sb, &UFS_SB(sb)->fs_cs(ucpi->c_cgx).cs_nbfree, 1);
 	cylno = ufs_cbtocylno(result);
 	fs16_sub(sb, &ubh_cg_blks(ucpi, cylno, ufs_cbtorpos(result)), 1);
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index c684aaad999..6d752735002 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -103,12 +103,12 @@ void ufs_free_inode (struct inode * inode)
 		if (ino < ucpi->c_irotor)
 			ucpi->c_irotor = ino;
 		fs32_add(sb, &ucg->cg_cs.cs_nifree, 1);
-		fs32_add(sb, &usb1->fs_cstotal.cs_nifree, 1);
+		uspi->cs_total.cs_nifree++;
 		fs32_add(sb, &UFS_SB(sb)->fs_cs(cg).cs_nifree, 1);
 
 		if (is_directory) {
 			fs32_sub(sb, &ucg->cg_cs.cs_ndir, 1);
-			fs32_sub(sb, &usb1->fs_cstotal.cs_ndir, 1);
+			uspi->cs_total.cs_ndir--;
 			fs32_sub(sb, &UFS_SB(sb)->fs_cs(cg).cs_ndir, 1);
 		}
 	}
@@ -228,12 +228,12 @@ cg_found:
 	}
 	
 	fs32_sub(sb, &ucg->cg_cs.cs_nifree, 1);
-	fs32_sub(sb, &usb1->fs_cstotal.cs_nifree, 1);
+	uspi->cs_total.cs_nifree--;
 	fs32_sub(sb, &sbi->fs_cs(cg).cs_nifree, 1);
 	
 	if (S_ISDIR(mode)) {
 		fs32_add(sb, &ucg->cg_cs.cs_ndir, 1);
-		fs32_add(sb, &usb1->fs_cstotal.cs_ndir, 1);
+		uspi->cs_total.cs_ndir++;
 		fs32_add(sb, &sbi->fs_cs(cg).cs_ndir, 1);
 	}
 
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 3aadbd3167a..74ef5e9bedf 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -381,24 +381,57 @@ static int ufs_parse_options (char * options, unsigned * mount_options)
 }
 
 /*
- * Read on-disk structures associated with cylinder groups
+ * Diffrent types of UFS hold fs_cstotal in different
+ * places, and use diffrent data structure for it.
+ * To make things simplier we just copy fs_cstotal to ufs_sb_private_info
  */
-static int ufs_read_cylinder_structures (struct super_block *sb)
+static void ufs_setup_cstotal(struct super_block *sb)
 {
 	struct ufs_sb_info *sbi = UFS_SB(sb);
 	struct ufs_sb_private_info *uspi = sbi->s_uspi;
+	struct ufs_super_block_first *usb1;
+	struct ufs_super_block_second *usb2;
 	struct ufs_super_block_third *usb3;
+	unsigned mtype = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE;
+
+	UFSD("ENTER, mtype=%u\n", mtype);
+	usb1 = ubh_get_usb_first(uspi);
+	usb2 = ubh_get_usb_second(uspi);
+	usb3 = ubh_get_usb_third(uspi);
+
+	if ((mtype == UFS_MOUNT_UFSTYPE_44BSD &&
+	     (usb1->fs_flags & UFS_FLAGS_UPDATED)) ||
+	    mtype == UFS_MOUNT_UFSTYPE_UFS2) {
+		/*we have statistic in different place, then usual*/
+		uspi->cs_total.cs_ndir = fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir);
+		uspi->cs_total.cs_nbfree = fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree);
+		uspi->cs_total.cs_nifree = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree);
+		uspi->cs_total.cs_nffree = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree);
+	} else {
+		uspi->cs_total.cs_ndir = fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir);
+		uspi->cs_total.cs_nbfree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree);
+		uspi->cs_total.cs_nifree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree);
+		uspi->cs_total.cs_nffree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree);
+	}
+	UFSD("EXIT\n");
+}
+
+/*
+ * Read on-disk structures associated with cylinder groups
+ */
+static int ufs_read_cylinder_structures(struct super_block *sb)
+{
+	struct ufs_sb_info *sbi = UFS_SB(sb);
+	struct ufs_sb_private_info *uspi = sbi->s_uspi;
+	unsigned flags = sbi->s_flags;
 	struct ufs_buffer_head * ubh;
 	unsigned char * base, * space;
 	unsigned size, blks, i;
-	unsigned flags = 0;
-	
+	struct ufs_super_block_third *usb3;
+
 	UFSD("ENTER\n");
-	
-	usb3 = ubh_get_usb_third(uspi);
 
-        flags = UFS_SB(sb)->s_flags;
-	
+	usb3 = ubh_get_usb_third(uspi);
 	/*
 	 * Read cs structures from (usually) first data block
 	 * on the device. 
@@ -475,21 +508,64 @@ failed:
 }
 
 /*
- * Put on-disk structures associated with cylinder groups and 
- * write them back to disk
+ * Sync our internal copy of fs_cstotal with disk
  */
-static void ufs_put_cylinder_structures (struct super_block *sb)
+static void ufs_put_cstotal(struct super_block *sb)
 {
-	struct ufs_sb_info * sbi = UFS_SB(sb);
-	struct ufs_sb_private_info * uspi;
+	unsigned mtype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE;
+	struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+	struct ufs_super_block_first *usb1;
+	struct ufs_super_block_second *usb2;
+	struct ufs_super_block_third *usb3;
+
+	UFSD("ENTER\n");
+	usb1 = ubh_get_usb_first(uspi);
+	usb2 = ubh_get_usb_second(uspi);
+	usb3 = ubh_get_usb_third(uspi);
+
+	if ((mtype == UFS_MOUNT_UFSTYPE_44BSD &&
+	     (usb1->fs_flags & UFS_FLAGS_UPDATED)) ||
+	    mtype == UFS_MOUNT_UFSTYPE_UFS2) {
+		/*we have statistic in different place, then usual*/
+		usb2->fs_un.fs_u2.cs_ndir =
+			cpu_to_fs64(sb, uspi->cs_total.cs_ndir);
+		usb2->fs_un.fs_u2.cs_nbfree =
+			cpu_to_fs64(sb, uspi->cs_total.cs_nbfree);
+		usb3->fs_un1.fs_u2.cs_nifree =
+			cpu_to_fs64(sb, uspi->cs_total.cs_nifree);
+		usb3->fs_un1.fs_u2.cs_nffree =
+			cpu_to_fs64(sb, uspi->cs_total.cs_nffree);
+	} else {
+		usb1->fs_cstotal.cs_ndir =
+			cpu_to_fs32(sb, uspi->cs_total.cs_ndir);
+		usb1->fs_cstotal.cs_nbfree =
+			cpu_to_fs32(sb, uspi->cs_total.cs_nbfree);
+		usb1->fs_cstotal.cs_nifree =
+			cpu_to_fs32(sb, uspi->cs_total.cs_nifree);
+		usb1->fs_cstotal.cs_nffree =
+			cpu_to_fs32(sb, uspi->cs_total.cs_nffree);
+	}
+	ubh_mark_buffer_dirty(USPI_UBH(uspi));
+	UFSD("EXIT\n");
+}
+
+/**
+ * ufs_put_super_internal() - put on-disk intrenal structures
+ * @sb: pointer to super_block structure
+ * Put on-disk structures associated with cylinder groups
+ * and write them back to disk, also update cs_total on disk
+ */
+static void ufs_put_super_internal(struct super_block *sb)
+{
+	struct ufs_sb_info *sbi = UFS_SB(sb);
+	struct ufs_sb_private_info *uspi = sbi->s_uspi;
 	struct ufs_buffer_head * ubh;
 	unsigned char * base, * space;
 	unsigned blks, size, i;
+
 	
 	UFSD("ENTER\n");
-	
-	uspi = sbi->s_uspi;
-
+	ufs_put_cstotal(sb);
 	size = uspi->s_cssize;
 	blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
 	base = space = (char*) sbi->s_csp;
@@ -524,7 +600,6 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 	struct ufs_super_block_first * usb1;
 	struct ufs_super_block_second * usb2;
 	struct ufs_super_block_third * usb3;
-	struct ufs_super_block *usb;
 	struct ufs_buffer_head * ubh;	
 	struct inode *inode;
 	unsigned block_size, super_block_size;
@@ -728,8 +803,6 @@ again:
 	usb1 = ubh_get_usb_first(uspi);
 	usb2 = ubh_get_usb_second(uspi);
 	usb3 = ubh_get_usb_third(uspi);
-	usb  = (struct ufs_super_block *)
-		((struct ufs_buffer_head *)uspi)->bh[0]->b_data ;
 
 	/*
 	 * Check ufs magic number
@@ -850,8 +923,7 @@ magic_found:
 			sb->s_flags |= MS_RDONLY;
 			break;
 		}
-	}
-	else {
+	} else {
 		printk("ufs_read_super: fs needs fsck\n");
 		sb->s_flags |= MS_RDONLY;
 	}
@@ -952,7 +1024,7 @@ magic_found:
 	if (!sb->s_root)
 		goto dalloc_failed;
 
-
+	ufs_setup_cstotal(sb);
 	/*
 	 * Read cylinder group structures
 	 */
@@ -1000,7 +1072,7 @@ static void ufs_write_super(struct super_block *sb)
 		  || (flags & UFS_ST_MASK) == UFS_ST_SUNx86)
 			ufs_set_fs_state(sb, usb1, usb3,
 					UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time));
-		ubh_mark_buffer_dirty (USPI_UBH(uspi));
+		ufs_put_cstotal(sb);
 	}
 	sb->s_dirt = 0;
 	UFSD("EXIT\n");
@@ -1014,7 +1086,7 @@ static void ufs_put_super(struct super_block *sb)
 	UFSD("ENTER\n");
 
 	if (!(sb->s_flags & MS_RDONLY))
-		ufs_put_cylinder_structures (sb);
+		ufs_put_super_internal(sb);
 	
 	ubh_brelse_uspi (sbi->s_uspi);
 	kfree (sbi->s_uspi);
@@ -1049,8 +1121,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 		return -EINVAL;
 	if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) {
 		new_mount_opt |= ufstype;
-	}
-	else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
+	} else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
 		printk("ufstype can't be changed during remount\n");
 		return -EINVAL;
 	}
@@ -1064,7 +1135,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	 * fs was mouted as rw, remounting ro
 	 */
 	if (*mount_flags & MS_RDONLY) {
-		ufs_put_cylinder_structures(sb);
+		ufs_put_super_internal(sb);
 		usb1->fs_time = cpu_to_fs32(sb, get_seconds());
 		if ((flags & UFS_ST_MASK) == UFS_ST_SUN
 		  || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) 
@@ -1073,11 +1144,10 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 		ubh_mark_buffer_dirty (USPI_UBH(uspi));
 		sb->s_dirt = 0;
 		sb->s_flags |= MS_RDONLY;
-	}
+	} else {
 	/*
 	 * fs was mounted as ro, remounting rw
 	 */
-	else {
 #ifndef CONFIG_UFS_FS_WRITE
 		printk("ufs was compiled with read-only support, "
 		"can't be mounted as read-write\n");
@@ -1089,7 +1159,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 			printk("this ufstype is read-only supported\n");
 			return -EINVAL;
 		}
-		if (!ufs_read_cylinder_structures (sb)) {
+		if (!ufs_read_cylinder_structures(sb)) {
 			printk("failed during remounting\n");
 			return -EPERM;
 		}
@@ -1118,17 +1188,13 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
 		buf->f_type = UFS2_MAGIC;
 		buf->f_blocks = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize);
-		buf->f_bfree = ufs_blkstofrags(
-			fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree)) +
-			fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree);
-		buf->f_ffree = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree);
 	} else {
 		buf->f_type = UFS_MAGIC;
 		buf->f_blocks = uspi->s_dsize;
-		buf->f_bfree = ufs_blkstofrags(fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree)) +
-			fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree);
-		buf->f_ffree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree);
 	}
+	buf->f_bfree = ufs_blkstofrags(uspi->cs_total.cs_nbfree) +
+		uspi->cs_total.cs_nffree;
+	buf->f_ffree = uspi->cs_total.cs_nifree;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_bavail = (buf->f_bfree > (((long)buf->f_blocks / 100) * uspi->s_minfree))
 		? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0;
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index eacd5e37b8e..99bfd6bba6d 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -306,9 +306,13 @@ static inline void *get_usb_offset(struct ufs_sb_private_info *uspi,
  * Determine the number of available frags given a
  * percentage to hold in reserve.
  */
-#define ufs_freespace(usb, percentreserved) \
-	(ufs_blkstofrags(fs32_to_cpu(sb, (usb)->fs_cstotal.cs_nbfree)) + \
-	fs32_to_cpu(sb, (usb)->fs_cstotal.cs_nffree) - (uspi->s_dsize * (percentreserved) / 100))
+static inline u64
+ufs_freespace(struct ufs_sb_private_info *uspi, int percentreserved)
+{
+	return ufs_blkstofrags(uspi->cs_total.cs_nbfree) +
+		uspi->cs_total.cs_nffree -
+		(uspi->s_dsize * (percentreserved) / 100);
+}
 
 /*
  * Macros to access cylinder group array structures
-- 
cgit v1.2.3


From 098d5af7be694c66af44093f7217da3d22af1057 Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Sun, 25 Jun 2006 05:47:31 -0700
Subject: [PATCH] ufs: ubh_ll_rw_block cleanup

In ufs code there is function: ubh_ll_rw_block, it has parameter how many
ufs_buffer_head it should handle, but it always called with "1" on the place
of this parameter.  This patch removes unused parameter of "ubh_ll_wr_block".

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ufs/balloc.c   | 8 ++++----
 fs/ufs/ialloc.c   | 4 ++--
 fs/ufs/truncate.c | 6 +++---
 fs/ufs/util.c     | 7 +++----
 fs/ufs/util.h     | 2 +-
 5 files changed, 13 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index cb36d2dadef..95b878e5c7a 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -109,7 +109,7 @@ void ufs_free_fragments(struct inode *inode, unsigned fragment, unsigned count)
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
 	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi);
+		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
 		ubh_wait_on_buffer (UCPI_UBH(ucpi));
 	}
 	sb->s_dirt = 1;
@@ -195,7 +195,7 @@ do_more:
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
 	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi);
+		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
 		ubh_wait_on_buffer (UCPI_UBH(ucpi));
 	}
 
@@ -521,7 +521,7 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
 	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi);
+		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
 		ubh_wait_on_buffer (UCPI_UBH(ucpi));
 	}
 	sb->s_dirt = 1;
@@ -646,7 +646,7 @@ succed:
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
 	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi);
+		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
 		ubh_wait_on_buffer (UCPI_UBH(ucpi));
 	}
 	sb->s_dirt = 1;
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 6d752735002..9501dcd3b21 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -116,7 +116,7 @@ void ufs_free_inode (struct inode * inode)
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
 	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **) &ucpi);
+		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
 		ubh_wait_on_buffer (UCPI_UBH(ucpi));
 	}
 	
@@ -240,7 +240,7 @@ cg_found:
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
 	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **) &ucpi);
+		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
 		ubh_wait_on_buffer (UCPI_UBH(ucpi));
 	}
 	sb->s_dirt = 1;
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index b860a29c4ef..3c3b301f870 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -238,7 +238,7 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p)
 		ind_ubh = NULL;
 	}
 	if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) {
-		ubh_ll_rw_block (SWRITE, 1, &ind_ubh);
+		ubh_ll_rw_block(SWRITE, ind_ubh);
 		ubh_wait_on_buffer (ind_ubh);
 	}
 	ubh_brelse (ind_ubh);
@@ -301,7 +301,7 @@ static int ufs_trunc_dindirect (struct inode *inode, unsigned offset, __fs32 *p)
 		dind_bh = NULL;
 	}
 	if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) {
-		ubh_ll_rw_block (SWRITE, 1, &dind_bh);
+		ubh_ll_rw_block(SWRITE, dind_bh);
 		ubh_wait_on_buffer (dind_bh);
 	}
 	ubh_brelse (dind_bh);
@@ -361,7 +361,7 @@ static int ufs_trunc_tindirect (struct inode * inode)
 		tind_bh = NULL;
 	}
 	if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) {
-		ubh_ll_rw_block (SWRITE, 1, &tind_bh);
+		ubh_ll_rw_block(SWRITE, tind_bh);
 		ubh_wait_on_buffer (tind_bh);
 	}
 	ubh_brelse (tind_bh);
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 4685f7cb70b..a2f13f45708 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -112,13 +112,12 @@ void ubh_mark_buffer_uptodate (struct ufs_buffer_head * ubh, int flag)
 	}
 }
 
-void ubh_ll_rw_block (int rw, unsigned nr, struct ufs_buffer_head * ubh[])
+void ubh_ll_rw_block(int rw, struct ufs_buffer_head *ubh)
 {
-	unsigned i;
 	if (!ubh)
 		return;
-	for ( i = 0; i < nr; i++ )
-		ll_rw_block (rw, ubh[i]->count, ubh[i]->bh);
+
+	ll_rw_block(rw, ubh->count, ubh->bh);
 }
 
 void ubh_wait_on_buffer (struct ufs_buffer_head * ubh)
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 99bfd6bba6d..406981fff5e 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -242,7 +242,7 @@ extern void ubh_brelse (struct ufs_buffer_head *);
 extern void ubh_brelse_uspi (struct ufs_sb_private_info *);
 extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *);
 extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int);
-extern void ubh_ll_rw_block (int, unsigned, struct ufs_buffer_head **);
+extern void ubh_ll_rw_block(int, struct ufs_buffer_head *);
 extern void ubh_wait_on_buffer (struct ufs_buffer_head *);
 extern void ubh_bforget (struct ufs_buffer_head *);
 extern int  ubh_buffer_dirty (struct ufs_buffer_head *);
-- 
cgit v1.2.3


From 138bb68ac9d49b0ea7eeecb3a245dc4e20f181da Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Sun, 25 Jun 2006 05:47:32 -0700
Subject: [PATCH] fs/ufs/inode.c: make 2 functions static

Make two needlessly global functions static.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ufs/inode.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 01c5f19cbab..f2dbdf5a876 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -41,6 +41,8 @@
 #include "swab.h"
 #include "util.h"
 
+static u64 ufs_frag_map(struct inode *inode, sector_t frag);
+
 static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4])
 {
 	struct ufs_sb_private_info *uspi = UFS_SB(inode->i_sb)->s_uspi;
@@ -80,7 +82,7 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off
  * the begining of the filesystem.
  */
 
-u64  ufs_frag_map(struct inode *inode, sector_t frag)
+static u64 ufs_frag_map(struct inode *inode, sector_t frag)
 {
 	struct ufs_inode_info *ufsi = UFS_I(inode);
 	struct super_block *sb = inode->i_sb;
@@ -514,8 +516,9 @@ abort_too_big:
 	goto abort;
 }
 
-struct buffer_head *ufs_getfrag(struct inode *inode, unsigned int fragment,
-				int create, int *err)
+static struct buffer_head *ufs_getfrag(struct inode *inode,
+				       unsigned int fragment,
+				       int create, int *err)
 {
 	struct buffer_head dummy;
 	int error;
-- 
cgit v1.2.3


From 0928d68056fa25456830b1de9f0ee89bc37447cd Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@gmx.de>
Date: Sun, 25 Jun 2006 05:47:35 -0700
Subject: [PATCH] openpromfs: fix missing NUL

tchars is not '\0'-terminated so the strtoul may run into problems.  Fix that.
 Also make tchars as big as a long in hexadecimal form would take rather than
just 16.

Signed-off-by: Jan Engelhardt <jengelh@gmx.de>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/openpromfs/inode.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 464e2bce020..c0cbe97cdc6 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -448,10 +448,11 @@ static ssize_t property_write(struct file *filp, const char __user *buf,
 					*q |= simple_strtoul (tmp, NULL, 16);
 					buf += last_cnt;
 				} else {
-					char tchars[17]; /* XXX yuck... */
+					char tchars[2 * sizeof(long) + 1];
 
-					if (copy_from_user(tchars, buf, 16))
+					if (copy_from_user(tchars, buf, sizeof(tchars) - 1))
 						return -EFAULT;
+                                        tchars[sizeof(tchars) - 1] = '\0';
 					*q = simple_strtoul (tchars, NULL, 16);
 					buf += 9;
 				}
-- 
cgit v1.2.3


From 515decdccf81cfbf5273d7f0085aea954ecd26c4 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@gmx.de>
Date: Sun, 25 Jun 2006 05:47:35 -0700
Subject: [PATCH] openpromfs: remove unnecessary casts

Remove unnecessary casts in fs/openpromfs/inode.c

Signed-off-by: Jan Engelhardt <jengelh@gmx.de>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/openpromfs/inode.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index c0cbe97cdc6..3bd2eeccb2a 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -72,7 +72,7 @@ static ssize_t nodenum_read(struct file *file, char __user *buf,
 	
 	if (count < 0 || !inode->u.generic_ip)
 		return -EINVAL;
-	sprintf (buffer, "%8.8x\n", (u32)(long)(inode->u.generic_ip));
+	sprintf (buffer, "%8.8lx\n", (long)inode->u.generic_ip);
 	if (file->f_pos >= 9)
 		return 0;
 	if (count > 9 - file->f_pos)
@@ -123,7 +123,7 @@ static ssize_t property_read(struct file *filp, char __user *buf,
 					      GFP_KERNEL);
 		if (!filp->private_data)
 			return -ENOMEM;
-		op = (openprom_property *)filp->private_data;
+		op = filp->private_data;
 		op->flag = 0;
 		op->alloclen = 2 * i;
 		strcpy (op->name, p);
@@ -163,7 +163,7 @@ static ssize_t property_read(struct file *filp, char __user *buf,
 				op->len--;
 		}
 	} else
-		op = (openprom_property *)filp->private_data;
+		op = filp->private_data;
 	if (!count || !(op->len || (op->flag & OPP_ASCIIZ)))
 		return 0;
 	if (*ppos >= 0xffffff || count >= 0xffffff)
@@ -335,7 +335,7 @@ static ssize_t property_write(struct file *filp, const char __user *buf,
 			return i;
 	}
 	k = *ppos;
-	op = (openprom_property *)filp->private_data;
+	op = filp->private_data;
 	if (!(op->flag & OPP_STRING)) {
 		u32 *first, *last;
 		int first_off, last_cnt;
@@ -388,13 +388,13 @@ static ssize_t property_write(struct file *filp, const char __user *buf,
 			memcpy (b, filp->private_data,
 				sizeof (openprom_property)
 				+ strlen (op->name) + op->alloclen);
-			memset (((char *)b) + sizeof (openprom_property)
+			memset (b + sizeof (openprom_property)
 				+ strlen (op->name) + op->alloclen, 
 				0, 2 * i - op->alloclen);
-			op = (openprom_property *)b;
+			op = b;
 			op->alloclen = 2*i;
 			b = filp->private_data;
-			filp->private_data = (void *)op;
+			filp->private_data = op;
 			kfree (b);
 		}
 		first = ((u32 *)op->value) + (k / 9);
@@ -498,13 +498,13 @@ write_try_string:
 			memcpy (b, filp->private_data,
 				sizeof (openprom_property)
 				+ strlen (op->name) + op->alloclen);
-			memset (((char *)b) + sizeof (openprom_property)
+			memset (b + sizeof (openprom_property)
 				+ strlen (op->name) + op->alloclen, 
 				0, 2*(count - *ppos) - op->alloclen);
-			op = (openprom_property *)b;
+			op = b;
 			op->alloclen = 2*(count + *ppos);
 			b = filp->private_data;
-			filp->private_data = (void *)op;
+			filp->private_data = op;
 			kfree (b);
 		}
 		p = op->value + *ppos - ((op->flag & OPP_QUOTED) ? 1 : 0);
@@ -533,7 +533,7 @@ write_try_string:
 
 int property_release (struct inode *inode, struct file *filp)
 {
-	openprom_property *op = (openprom_property *)filp->private_data;
+	openprom_property *op = filp->private_data;
 	int error;
 	u32 node;
 	
@@ -932,7 +932,7 @@ static int __init check_space (u16 n)
 			return -1;
 
 		if (nodes) {
-			memcpy ((char *)pages, (char *)nodes,
+			memcpy ((char *)pages, nodes,
 				(1 << alloced) * PAGE_SIZE);
 			free_pages ((unsigned long)nodes, alloced);
 		}
-- 
cgit v1.2.3


From a04ee14636fa339c4609766bd6173629d4f9026e Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@gmx.de>
Date: Sun, 25 Jun 2006 05:47:36 -0700
Subject: [PATCH] openpromfs: factorize out

"Move" "common code" out to PTR_NOD, which does the conversion from private
pointer to node number.  This is to reduce potential casting/conversion errors
due to redundancy.  (The naming PTR_NOD follows PTR_ERR, turning a pointer
into xyz.)

[akpm@osdl.org: cleanups]
Signed-off-by: Jan Engelhardt <jengelh@gmx.de>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/openpromfs/inode.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 3bd2eeccb2a..efc7c91128a 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -64,6 +64,11 @@ static int openpromfs_readdir(struct file *, void *, filldir_t);
 static struct dentry *openpromfs_lookup(struct inode *, struct dentry *dentry, struct nameidata *nd);
 static int openpromfs_unlink (struct inode *, struct dentry *dentry);
 
+static inline u16 ptr_nod(void *p)
+{
+    return (long)p & 0xFFFF;
+}
+
 static ssize_t nodenum_read(struct file *file, char __user *buf,
 			    size_t count, loff_t *ppos)
 {
@@ -95,9 +100,9 @@ static ssize_t property_read(struct file *filp, char __user *buf,
 	char buffer[64];
 	
 	if (!filp->private_data) {
-		node = nodes[(u16)((long)inode->u.generic_ip)].node;
+		node = nodes[ptr_nod(inode->u.generic_ip)].node;
 		i = ((u32)(long)inode->u.generic_ip) >> 16;
-		if ((u16)((long)inode->u.generic_ip) == aliases) {
+		if (ptr_nod(inode->u.generic_ip) == aliases) {
 			if (i >= aliases_nodes)
 				p = NULL;
 			else
@@ -111,7 +116,7 @@ static ssize_t property_read(struct file *filp, char __user *buf,
 			return -EIO;
 		i = prom_getproplen (node, p);
 		if (i < 0) {
-			if ((u16)((long)inode->u.generic_ip) == aliases)
+			if (ptr_nod(inode->u.generic_ip) == aliases)
 				i = 0;
 			else
 				return -EIO;
@@ -540,8 +545,8 @@ int property_release (struct inode *inode, struct file *filp)
 	if (!op)
 		return 0;
 	lock_kernel();
-	node = nodes[(u16)((long)inode->u.generic_ip)].node;
-	if ((u16)((long)inode->u.generic_ip) == aliases) {
+	node = nodes[ptr_nod(inode->u.generic_ip)].node;
+	if (ptr_nod(inode->u.generic_ip) == aliases) {
 		if ((op->flag & OPP_DIRTY) && (op->flag & OPP_STRING)) {
 			char *p = op->name;
 			int i = (op->value - op->name) - strlen (op->name) - 1;
-- 
cgit v1.2.3


From fcd5df35882b128ef3e160fab3074e6fe7ae501b Mon Sep 17 00:00:00 2001
From: Mingming Cao <cmm@us.ibm.com>
Date: Sun, 25 Jun 2006 05:47:50 -0700
Subject: [PATCH] Avoid disk sector_t overflow for >2TB ext3 filesystem

If ext3 filesystem is larger than 2TB, and sector_t is a u32 (i.e.
CONFIG_LBD not defined in the kernel), the calculation of the disk sector
will overflow.  Add check at ext3_fill_super() and ext3_group_extend() to
prevent mount/remount/resize >2TB ext3 filesystem if sector_t size is 4
bytes.

Verified this patch on a 32 bit platform without CONFIG_LBD defined
(sector_t is 32 bits long), mount refuse to mount a 10TB ext3.

Signed-off-by: Mingming Cao<cmm@us.ibm.com>
Acked-by: Andreas Dilger <adilger@clusterfs.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext3/resize.c | 10 ++++++++++
 fs/ext3/super.c  | 10 ++++++++++
 2 files changed, 20 insertions(+)

(limited to 'fs')

diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 34b39e9a1e5..a31dff81ed7 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -925,6 +925,16 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 	if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
 		return 0;
 
+	if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
+		printk(KERN_ERR "EXT3-fs: filesystem on %s:"
+			" too large to resize to %lu blocks safely\n",
+			sb->s_id, n_blocks_count);
+		if (sizeof(sector_t) < 8)
+			ext3_warning(sb, __FUNCTION__,
+			"CONFIG_LBD not enabled\n");
+		return -EINVAL;
+	}
+
 	if (n_blocks_count < o_blocks_count) {
 		ext3_warning(sb, __FUNCTION__,
 			     "can't shrink FS - resize aborted");
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index a60cc6ec130..df98a770925 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1565,6 +1565,16 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
+	if (le32_to_cpu(es->s_blocks_count) >
+		    (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
+		printk(KERN_ERR "EXT3-fs: filesystem on %s:"
+			" too large to mount safely\n", sb->s_id);
+		if (sizeof(sector_t) < 8)
+			printk(KERN_WARNING "EXT3-fs: CONFIG_LBD not "
+					"enabled\n");
+		goto failed_mount;
+	}
+
 	if (EXT3_BLOCKS_PER_GROUP(sb) == 0)
 		goto cantfind_ext3;
 	sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) -
-- 
cgit v1.2.3


From f16fdadba28add689b567cf03c21dd6dec8e43be Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 25 Jun 2006 05:47:50 -0700
Subject: [PATCH] ext2: clean up dead code from mount code

The variable i is guaranteed to be the same as db_count given the previous
for loop.  So get rid of it since it's dead code.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext2/super.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index ee4ba759581..fec55ff23de 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -854,7 +854,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	}
 	if (!ext2_check_descriptors (sb)) {
 		printk ("EXT2-fs: group descriptors corrupted!\n");
-		db_count = i;
 		goto failed_mount2;
 	}
 	sbi->s_gdb_count = db_count;
-- 
cgit v1.2.3


From e8f1c6227a0bc9b1e3a7e87cd31f650a909f647f Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 25 Jun 2006 05:47:51 -0700
Subject: [PATCH] ext3: fix memory leak when the journal file is corrupted

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/jbd/recovery.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 80d7f53fd0a..de5bafb4e85 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -531,6 +531,7 @@ static int do_one_pass(journal_t *journal,
 		default:
 			jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
 				  blocktype);
+			brelse(bh);
 			goto done;
 		}
 	}
-- 
cgit v1.2.3


From d2e5b13c4a7c68fdbcf389c9fffc12cfa2c185af Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 25 Jun 2006 05:47:52 -0700
Subject: [PATCH] ext3: remove inconsistent space before exclamation point in
 mount code

This was reported as Debian bug #336604.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext3/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index df98a770925..e0fc0c83be9 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1603,7 +1603,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		}
 	}
 	if (!ext3_check_descriptors (sb)) {
-		printk (KERN_ERR "EXT3-fs: group descriptors corrupted !\n");
+		printk(KERN_ERR "EXT3-fs: group descriptors corrupted!\n");
 		goto failed_mount2;
 	}
 	sbi->s_gdb_count = db_count;
-- 
cgit v1.2.3


From 01408c4939479ec46c15aa7ef6e2406be50eeeca Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Sun, 25 Jun 2006 05:47:58 -0700
Subject: [PATCH] Prepare for __copy_from_user_inatomic to not zero missed
 bytes

The problem is that when we write to a file, the copy from userspace to
pagecache is first done with preemption disabled, so if the source address is
not immediately available the copy fails *and* *zeros* *the* *destination*.

This is a problem because a concurrent read (which admittedly is an odd thing
to do) might see zeros rather that was there before the write, or what was
there after, or some mixture of the two (any of these being a reasonable thing
to see).

If the copy did fail, it will immediately be retried with preemption
re-enabled so any transient problem with accessing the source won't cause an
error.

The first copying does not need to zero any uncopied bytes, and doing so
causes the problem.  It uses copy_from_user_atomic rather than copy_from_user
so the simple expedient is to change copy_from_user_atomic to *not* zero out
bytes on failure.

The first of these two patches prepares for the change by fixing two places
which assume copy_from_user_atomic does zero the tail.  The two usages are
very similar pieces of code which copy from a userspace iovec into one or more
page-cache pages.  These are changed to remove the assumption.

The second patch changes __copy_from_user_inatomic* to not zero the tail.
Once these are accepted, I will look at similar patches of other architectures
where this is important (ppc, mips and sparc being the ones I can find).

This patch:

There is a problem with __copy_from_user_inatomic zeroing the tail of the
buffer in the case of an error.  As it is called in atomic context, the error
may be transient, so it results in zeros being written where maybe they
shouldn't be.

In the usage in filemap, this opens a window for a well timed read to see data
(zeros) which is not consistent with any ordering of reads and writes.

Most cases where __copy_from_user_inatomic is called, a failure results in
__copy_from_user being called immediately.  As long as the latter zeros the
tail, the former doesn't need to.  However in *copy_from_user_iovec
implementations (in both filemap and ntfs/file), it is assumed that
copy_from_user_inatomic will zero the tail.

This patch removes that assumption, so that after this patch it will
be safe for copy_from_user_inatomic to not zero the tail.

This patch also adds some commentary to filemap.h and asm-i386/uaccess.h.

After this patch, all architectures that might disable preempt when
kmap_atomic is called need to have their __copy_from_user_inatomic* "fixed".
This includes
 - powerpc
 - i386
 - mips
 - sparc

Signed-off-by: Neil Brown <neilb@suse.de>
Cc: David Howells <dhowells@redhat.com>
Cc: Anton Altaparmakov <aia21@cantab.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ntfs/file.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 88292f9e4b9..2e42c2dcae1 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1358,7 +1358,7 @@ err_out:
 	goto out;
 }
 
-static size_t __ntfs_copy_from_user_iovec(char *vaddr,
+static size_t __ntfs_copy_from_user_iovec_inatomic(char *vaddr,
 		const struct iovec *iov, size_t iov_ofs, size_t bytes)
 {
 	size_t total = 0;
@@ -1376,10 +1376,6 @@ static size_t __ntfs_copy_from_user_iovec(char *vaddr,
 		bytes -= len;
 		vaddr += len;
 		if (unlikely(left)) {
-			/*
-			 * Zero the rest of the target like __copy_from_user().
-			 */
-			memset(vaddr, 0, bytes);
 			total -= left;
 			break;
 		}
@@ -1420,11 +1416,13 @@ static inline void ntfs_set_next_iovec(const struct iovec **iovp,
  * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s
  * single-segment behaviour.
  *
- * We call the same helper (__ntfs_copy_from_user_iovec()) both when atomic and
- * when not atomic.  This is ok because __ntfs_copy_from_user_iovec() calls
- * __copy_from_user_inatomic() and it is ok to call this when non-atomic.  In
- * fact, the only difference between __copy_from_user_inatomic() and
- * __copy_from_user() is that the latter calls might_sleep().  And on many
+ * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both
+ * when atomic and when not atomic.  This is ok because
+ * __ntfs_copy_from_user_iovec_inatomic() calls __copy_from_user_inatomic()
+ * and it is ok to call this when non-atomic.
+ * Infact, the only difference between __copy_from_user_inatomic() and
+ * __copy_from_user() is that the latter calls might_sleep() and the former
+ * should not zero the tail of the buffer on error.  And on many
  * architectures __copy_from_user_inatomic() is just defined to
  * __copy_from_user() so it makes no difference at all on those architectures.
  */
@@ -1441,14 +1439,18 @@ static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
 		if (len > bytes)
 			len = bytes;
 		kaddr = kmap_atomic(*pages, KM_USER0);
-		copied = __ntfs_copy_from_user_iovec(kaddr + ofs,
+		copied = __ntfs_copy_from_user_iovec_inatomic(kaddr + ofs,
 				*iov, *iov_ofs, len);
 		kunmap_atomic(kaddr, KM_USER0);
 		if (unlikely(copied != len)) {
 			/* Do it the slow way. */
 			kaddr = kmap(*pages);
-			copied = __ntfs_copy_from_user_iovec(kaddr + ofs,
+			copied = __ntfs_copy_from_user_iovec_inatomic(kaddr + ofs,
 					*iov, *iov_ofs, len);
+			/*
+			 * Zero the rest of the target like __copy_from_user().
+			 */
+			memset(kaddr + ofs + copied, 0, len - copied);
 			kunmap(*pages);
 			if (unlikely(copied != len))
 				goto err_out;
-- 
cgit v1.2.3


From 1c2bf374a4b8c2e1a3e6ff3a64fb67272a8cd2e2 Mon Sep 17 00:00:00 2001
From: Mingming Cao <cmm@us.ibm.com>
Date: Sun, 25 Jun 2006 05:48:06 -0700
Subject: [PATCH] ext3_fsblk_t: filesystem, group blocks and bug fixes

Some of the in-kernel ext3 block variable type are treated as signed 4 bytes
int type, thus limited ext3 filesystem to 8TB (4kblock size based).  While
trying to fix them, it seems quite confusing in the ext3 code where some
blocks are filesystem-wide blocks, some are group relative offsets that need
to be signed value (as -1 has special meaning).  So it seem saner to define
two types of physical blocks: one is filesystem wide blocks, another is
group-relative blocks.  The following patches clarify these two types of
blocks in the ext3 code, and fix the type bugs which limit current 32 bit ext3
filesystem limit to 8TB.

With this series of patches and the percpu counter data type changes in the mm
tree, we are able to extend exts filesystem limit to 16TB.

This work is also a pre-request for the recent >32 bit ext3 work, and makes
the kernel to able to address 48 bit ext3 block a lot easier: Simply redefine
ext3_fsblk_t from unsigned long to sector_t and redefine the format string for
ext3 filesystem block corresponding.

Two RFC with a series patches have been posted to ext2-devel list and have
been reviewed and discussed:
http://marc.theaimsgroup.com/?l=ext2-devel&m=114722190816690&w=2

http://marc.theaimsgroup.com/?l=ext2-devel&m=114784919525942&w=2

Patches are tested on both 32 bit machine and 64 bit machine, <8TB ext3 and
>8TB ext3 filesystem(with the latest to be released e2fsprogs-1.39).  Tests
includes overnight fsx, tiobench, dbench and fsstress.

This patch:

Defines ext3_fsblk_t and ext3_grpblk_t, and the printk format string for
filesystem wide blocks.

This patch classifies all block group relative blocks, and ext3_fsblk_t blocks
occurs in the same function where used to be confusing before.  Also include
kernel bug fixes for filesystem wide in-kernel block variables.  There are
some fileystem wide blocks are treated as int/unsigned int type in the kernel
currently, especially in ext3 block allocation and reservation code.  This
patch fixed those bugs by converting those variables to ext3_fsblk_t(unsigned
long) type.

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext3/balloc.c | 215 +++++++++++++++++++++++++++++--------------------------
 fs/ext3/ialloc.c |  10 +--
 fs/ext3/inode.c  |   2 +-
 fs/ext3/resize.c |  43 ++++++-----
 fs/ext3/super.c  |   2 +-
 fs/ext3/xattr.c  |  27 +++----
 6 files changed, 158 insertions(+), 141 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 77927d6938f..b1633cd28ec 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -163,10 +163,10 @@ restart:
 #endif
 
 static int
-goal_in_my_reservation(struct ext3_reserve_window *rsv, int goal,
+goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal,
 			unsigned int group, struct super_block * sb)
 {
-	unsigned long group_first_block, group_last_block;
+	ext3_fsblk_t group_first_block, group_last_block;
 
 	group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
 				group * EXT3_BLOCKS_PER_GROUP(sb);
@@ -175,8 +175,8 @@ goal_in_my_reservation(struct ext3_reserve_window *rsv, int goal,
 	if ((rsv->_rsv_start > group_last_block) ||
 	    (rsv->_rsv_end < group_first_block))
 		return 0;
-	if ((goal >= 0) && ((goal + group_first_block < rsv->_rsv_start)
-		|| (goal + group_first_block > rsv->_rsv_end)))
+	if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
+		|| (grp_goal + group_first_block > rsv->_rsv_end)))
 		return 0;
 	return 1;
 }
@@ -187,7 +187,7 @@ goal_in_my_reservation(struct ext3_reserve_window *rsv, int goal,
  * Returns NULL if there are no windows or if all windows start after the goal.
  */
 static struct ext3_reserve_window_node *
-search_reserve_window(struct rb_root *root, unsigned long goal)
+search_reserve_window(struct rb_root *root, ext3_fsblk_t goal)
 {
 	struct rb_node *n = root->rb_node;
 	struct ext3_reserve_window_node *rsv;
@@ -223,7 +223,7 @@ void ext3_rsv_window_add(struct super_block *sb,
 {
 	struct rb_root *root = &EXT3_SB(sb)->s_rsv_window_root;
 	struct rb_node *node = &rsv->rsv_node;
-	unsigned int start = rsv->rsv_start;
+	ext3_fsblk_t start = rsv->rsv_start;
 
 	struct rb_node ** p = &root->rb_node;
 	struct rb_node * parent = NULL;
@@ -310,20 +310,20 @@ void ext3_discard_reservation(struct inode *inode)
 
 /* Free given blocks, update quota and i_blocks field */
 void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb,
-			 unsigned long block, unsigned long count,
-			 int *pdquot_freed_blocks)
+			 ext3_fsblk_t block, unsigned long count,
+			 unsigned long *pdquot_freed_blocks)
 {
 	struct buffer_head *bitmap_bh = NULL;
 	struct buffer_head *gd_bh;
 	unsigned long block_group;
-	unsigned long bit;
+	ext3_grpblk_t bit;
 	unsigned long i;
 	unsigned long overflow;
 	struct ext3_group_desc * desc;
 	struct ext3_super_block * es;
 	struct ext3_sb_info *sbi;
 	int err = 0, ret;
-	unsigned group_freed;
+	ext3_grpblk_t group_freed;
 
 	*pdquot_freed_blocks = 0;
 	sbi = EXT3_SB(sb);
@@ -333,7 +333,7 @@ void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb,
 	    block + count > le32_to_cpu(es->s_blocks_count)) {
 		ext3_error (sb, "ext3_free_blocks",
 			    "Freeing blocks not in datazone - "
-			    "block = %lu, count = %lu", block, count);
+			    "block = "E3FSBLK", count = %lu", block, count);
 		goto error_return;
 	}
 
@@ -369,7 +369,7 @@ do_more:
 		      sbi->s_itb_per_group))
 		ext3_error (sb, "ext3_free_blocks",
 			    "Freeing blocks in system zones - "
-			    "Block = %lu, count = %lu",
+			    "Block = "E3FSBLK", count = %lu",
 			    block, count);
 
 	/*
@@ -453,7 +453,8 @@ do_more:
 						bit + i, bitmap_bh->b_data)) {
 			jbd_unlock_bh_state(bitmap_bh);
 			ext3_error(sb, __FUNCTION__,
-				"bit already cleared for block %lu", block + i);
+				"bit already cleared for block "E3FSBLK,
+				 block + i);
 			jbd_lock_bh_state(bitmap_bh);
 			BUFFER_TRACE(bitmap_bh, "bit already cleared");
 		} else {
@@ -493,10 +494,10 @@ error_return:
 
 /* Free given blocks, update quota and i_blocks field */
 void ext3_free_blocks(handle_t *handle, struct inode *inode,
-			unsigned long block, unsigned long count)
+			ext3_fsblk_t block, unsigned long count)
 {
 	struct super_block * sb;
-	int dquot_freed_blocks;
+	unsigned long dquot_freed_blocks;
 
 	sb = inode->i_sb;
 	if (!sb) {
@@ -525,7 +526,7 @@ void ext3_free_blocks(handle_t *handle, struct inode *inode,
  * data-writes at some point, and disable it for metadata allocations or
  * sync-data inodes.
  */
-static int ext3_test_allocatable(int nr, struct buffer_head *bh)
+static int ext3_test_allocatable(ext3_grpblk_t nr, struct buffer_head *bh)
 {
 	int ret;
 	struct journal_head *jh = bh2jh(bh);
@@ -542,11 +543,11 @@ static int ext3_test_allocatable(int nr, struct buffer_head *bh)
 	return ret;
 }
 
-static int
-bitmap_search_next_usable_block(int start, struct buffer_head *bh,
-					int maxblocks)
+static ext3_grpblk_t
+bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
+					ext3_grpblk_t maxblocks)
 {
-	int next;
+	ext3_grpblk_t next;
 	struct journal_head *jh = bh2jh(bh);
 
 	/*
@@ -576,10 +577,11 @@ bitmap_search_next_usable_block(int start, struct buffer_head *bh,
  * the initial goal; then for a free byte somewhere in the bitmap; then
  * for any free bit in the bitmap.
  */
-static int
-find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
+static ext3_grpblk_t
+find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
+			ext3_grpblk_t maxblocks)
 {
-	int here, next;
+	ext3_grpblk_t here, next;
 	char *p, *r;
 
 	if (start > 0) {
@@ -591,7 +593,7 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
 		 * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the
 		 * next 64-bit boundary is simple..
 		 */
-		int end_goal = (start + 63) & ~63;
+		ext3_grpblk_t end_goal = (start + 63) & ~63;
 		if (end_goal > maxblocks)
 			end_goal = maxblocks;
 		here = ext3_find_next_zero_bit(bh->b_data, end_goal, start);
@@ -628,7 +630,7 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
  * zero (failure).
  */
 static inline int
-claim_block(spinlock_t *lock, int block, struct buffer_head *bh)
+claim_block(spinlock_t *lock, ext3_grpblk_t block, struct buffer_head *bh)
 {
 	struct journal_head *jh = bh2jh(bh);
 	int ret;
@@ -651,12 +653,13 @@ claim_block(spinlock_t *lock, int block, struct buffer_head *bh)
  * new bitmap.  In that case we must release write access to the old one via
  * ext3_journal_release_buffer(), else we'll run out of credits.
  */
-static int
+static ext3_grpblk_t
 ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
-			struct buffer_head *bitmap_bh, int goal,
+			struct buffer_head *bitmap_bh, ext3_grpblk_t grp_goal,
 			unsigned long *count, struct ext3_reserve_window *my_rsv)
 {
-	int group_first_block, start, end;
+	ext3_fsblk_t group_first_block;
+	ext3_grpblk_t start, end;
 	unsigned long num = 0;
 
 	/* we do allocation within the reservation window if we have a window */
@@ -673,13 +676,13 @@ ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
 		if (end > EXT3_BLOCKS_PER_GROUP(sb))
 			/* reservation window crosses group boundary */
 			end = EXT3_BLOCKS_PER_GROUP(sb);
-		if ((start <= goal) && (goal < end))
-			start = goal;
+		if ((start <= grp_goal) && (grp_goal < end))
+			start = grp_goal;
 		else
-			goal = -1;
+			grp_goal = -1;
 	} else {
-		if (goal > 0)
-			start = goal;
+		if (grp_goal > 0)
+			start = grp_goal;
 		else
 			start = 0;
 		end = EXT3_BLOCKS_PER_GROUP(sb);
@@ -688,43 +691,43 @@ ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
 	BUG_ON(start > EXT3_BLOCKS_PER_GROUP(sb));
 
 repeat:
-	if (goal < 0 || !ext3_test_allocatable(goal, bitmap_bh)) {
-		goal = find_next_usable_block(start, bitmap_bh, end);
-		if (goal < 0)
+	if (grp_goal < 0 || !ext3_test_allocatable(grp_goal, bitmap_bh)) {
+		grp_goal = find_next_usable_block(start, bitmap_bh, end);
+		if (grp_goal < 0)
 			goto fail_access;
 		if (!my_rsv) {
 			int i;
 
-			for (i = 0; i < 7 && goal > start &&
-					ext3_test_allocatable(goal - 1,
+			for (i = 0; i < 7 && grp_goal > start &&
+					ext3_test_allocatable(grp_goal - 1,
 								bitmap_bh);
-					i++, goal--)
+					i++, grp_goal--)
 				;
 		}
 	}
-	start = goal;
+	start = grp_goal;
 
-	if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), goal, bitmap_bh)) {
+	if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), grp_goal, bitmap_bh)) {
 		/*
 		 * The block was allocated by another thread, or it was
 		 * allocated and then freed by another thread
 		 */
 		start++;
-		goal++;
+		grp_goal++;
 		if (start >= end)
 			goto fail_access;
 		goto repeat;
 	}
 	num++;
-	goal++;
-	while (num < *count && goal < end
-		&& ext3_test_allocatable(goal, bitmap_bh)
-		&& claim_block(sb_bgl_lock(EXT3_SB(sb), group), goal, bitmap_bh)) {
+	grp_goal++;
+	while (num < *count && grp_goal < end
+		&& ext3_test_allocatable(grp_goal, bitmap_bh)
+		&& claim_block(sb_bgl_lock(EXT3_SB(sb), group), grp_goal, bitmap_bh)) {
 		num++;
-		goal++;
+		grp_goal++;
 	}
 	*count = num;
-	return goal - num;
+	return grp_goal - num;
 fail_access:
 	*count = num;
 	return -1;
@@ -766,12 +769,13 @@ fail_access:
 static int find_next_reservable_window(
 				struct ext3_reserve_window_node *search_head,
 				struct ext3_reserve_window_node *my_rsv,
-				struct super_block * sb, int start_block,
-				int last_block)
+				struct super_block * sb,
+				ext3_fsblk_t start_block,
+				ext3_fsblk_t last_block)
 {
 	struct rb_node *next;
 	struct ext3_reserve_window_node *rsv, *prev;
-	int cur;
+	ext3_fsblk_t cur;
 	int size = my_rsv->rsv_goal_size;
 
 	/* TODO: make the start of the reservation window byte-aligned */
@@ -873,10 +877,10 @@ static int find_next_reservable_window(
  *
  *	@rsv: the reservation
  *
- *	@goal: The goal (group-relative).  It is where the search for a
+ *	@grp_goal: The goal (group-relative).  It is where the search for a
  *		free reservable space should start from.
- *		if we have a goal(goal >0 ), then start from there,
- *		no goal(goal = -1), we start from the first block
+ *		if we have a grp_goal(grp_goal >0 ), then start from there,
+ *		no grp_goal(grp_goal = -1), we start from the first block
  *		of the group.
  *
  *	@sb: the super block
@@ -885,12 +889,12 @@ static int find_next_reservable_window(
  *
  */
 static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv,
-		int goal, struct super_block *sb,
+		ext3_grpblk_t grp_goal, struct super_block *sb,
 		unsigned int group, struct buffer_head *bitmap_bh)
 {
 	struct ext3_reserve_window_node *search_head;
-	int group_first_block, group_end_block, start_block;
-	int first_free_block;
+	ext3_fsblk_t group_first_block, group_end_block, start_block;
+	ext3_grpblk_t first_free_block;
 	struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root;
 	unsigned long size;
 	int ret;
@@ -900,10 +904,10 @@ static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv,
 				group * EXT3_BLOCKS_PER_GROUP(sb);
 	group_end_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1;
 
-	if (goal < 0)
+	if (grp_goal < 0)
 		start_block = group_first_block;
 	else
-		start_block = goal + group_first_block;
+		start_block = grp_goal + group_first_block;
 
 	size = my_rsv->rsv_goal_size;
 
@@ -1057,14 +1061,15 @@ static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv,
  * sorted double linked list should be fast.
  *
  */
-static int
+static ext3_grpblk_t
 ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 			unsigned int group, struct buffer_head *bitmap_bh,
-			int goal, struct ext3_reserve_window_node * my_rsv,
+			ext3_grpblk_t grp_goal,
+			struct ext3_reserve_window_node * my_rsv,
 			unsigned long *count, int *errp)
 {
-	unsigned long group_first_block;
-	int ret = 0;
+	ext3_fsblk_t group_first_block;
+	ext3_grpblk_t ret = 0;
 	int fatal;
 	unsigned long num = *count;
 
@@ -1090,12 +1095,12 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 	 */
 	if (my_rsv == NULL ) {
 		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh,
-						goal, count, NULL);
+						grp_goal, count, NULL);
 		goto out;
 	}
 	/*
-	 * goal is a group relative block number (if there is a goal)
-	 * 0 < goal < EXT3_BLOCKS_PER_GROUP(sb)
+	 * grp_goal is a group relative block number (if there is a goal)
+	 * 0 < grp_goal < EXT3_BLOCKS_PER_GROUP(sb)
 	 * first block is a filesystem wide block number
 	 * first block is the block number of the first block in this group
 	 */
@@ -1119,24 +1124,24 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 	 */
 	while (1) {
 		if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
-			!goal_in_my_reservation(&my_rsv->rsv_window, goal, group, sb)) {
+			!goal_in_my_reservation(&my_rsv->rsv_window, grp_goal, group, sb)) {
 			if (my_rsv->rsv_goal_size < *count)
 				my_rsv->rsv_goal_size = *count;
-			ret = alloc_new_reservation(my_rsv, goal, sb,
+			ret = alloc_new_reservation(my_rsv, grp_goal, sb,
 							group, bitmap_bh);
 			if (ret < 0)
 				break;			/* failed */
 
-			if (!goal_in_my_reservation(&my_rsv->rsv_window, goal, group, sb))
-				goal = -1;
-		} else if (goal > 0 && (my_rsv->rsv_end-goal+1) < *count)
+			if (!goal_in_my_reservation(&my_rsv->rsv_window, grp_goal, group, sb))
+				grp_goal = -1;
+		} else if (grp_goal > 0 && (my_rsv->rsv_end-grp_goal+1) < *count)
 			try_to_extend_reservation(my_rsv, sb,
-					*count-my_rsv->rsv_end + goal - 1);
+					*count-my_rsv->rsv_end + grp_goal - 1);
 
 		if ((my_rsv->rsv_start >= group_first_block + EXT3_BLOCKS_PER_GROUP(sb))
 		    || (my_rsv->rsv_end < group_first_block))
 			BUG();
-		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal,
+		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, grp_goal,
 					   &num, &my_rsv->rsv_window);
 		if (ret >= 0) {
 			my_rsv->rsv_alloc_hit += num;
@@ -1164,7 +1169,7 @@ out:
 
 static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
 {
-	int free_blocks, root_blocks;
+	ext3_fsblk_t free_blocks, root_blocks;
 
 	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
 	root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
@@ -1200,19 +1205,20 @@ int ext3_should_retry_alloc(struct super_block *sb, int *retries)
  * bitmap, and then for any free bit if that fails.
  * This function also updates quota and i_blocks field.
  */
-int ext3_new_blocks(handle_t *handle, struct inode *inode,
-			unsigned long goal, unsigned long *count, int *errp)
+ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
+			ext3_fsblk_t goal, unsigned long *count, int *errp)
 {
 	struct buffer_head *bitmap_bh = NULL;
 	struct buffer_head *gdp_bh;
 	int group_no;
 	int goal_group;
-	int ret_block;
+	ext3_grpblk_t grp_target_blk;	/* blockgroup relative goal block */
+	ext3_grpblk_t grp_alloc_blk;	/* blockgroup-relative allocated block*/
+	ext3_fsblk_t ret_block;		/* filesyetem-wide allocated block */
 	int bgi;			/* blockgroup iteration index */
-	int target_block;
 	int fatal = 0, err;
 	int performed_allocation = 0;
-	int free_blocks;
+	ext3_grpblk_t free_blocks;	/* number of free blocks in a group */
 	struct super_block *sb;
 	struct ext3_group_desc *gdp;
 	struct ext3_super_block *es;
@@ -1285,16 +1291,17 @@ retry:
 		my_rsv = NULL;
 
 	if (free_blocks > 0) {
-		ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) %
+		grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) %
 				EXT3_BLOCKS_PER_GROUP(sb));
 		bitmap_bh = read_block_bitmap(sb, group_no);
 		if (!bitmap_bh)
 			goto io_error;
-		ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
-					bitmap_bh, ret_block, my_rsv, &num, &fatal);
+		grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
+					group_no, bitmap_bh, grp_target_blk,
+					my_rsv,	&num, &fatal);
 		if (fatal)
 			goto out;
-		if (ret_block >= 0)
+		if (grp_alloc_blk >= 0)
 			goto allocated;
 	}
 
@@ -1327,11 +1334,15 @@ retry:
 		bitmap_bh = read_block_bitmap(sb, group_no);
 		if (!bitmap_bh)
 			goto io_error;
-		ret_block = ext3_try_to_allocate_with_rsv(sb, handle, group_no,
-					bitmap_bh, -1, my_rsv, &num, &fatal);
+		/*
+		 * try to allocate block(s) from this group, without a goal(-1).
+		 */
+		grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
+					group_no, bitmap_bh, -1, my_rsv,
+					&num, &fatal);
 		if (fatal)
 			goto out;
-		if (ret_block >= 0) 
+		if (grp_alloc_blk >= 0)
 			goto allocated;
 	}
 	/*
@@ -1360,18 +1371,19 @@ allocated:
 	if (fatal)
 		goto out;
 
-	target_block = ret_block + group_no * EXT3_BLOCKS_PER_GROUP(sb)
+	ret_block = grp_alloc_blk + group_no * EXT3_BLOCKS_PER_GROUP(sb)
 				+ le32_to_cpu(es->s_first_data_block);
 
-	if (in_range(le32_to_cpu(gdp->bg_block_bitmap), target_block, num) ||
-	    in_range(le32_to_cpu(gdp->bg_inode_bitmap), target_block, num) ||
-	    in_range(target_block, le32_to_cpu(gdp->bg_inode_table),
+	if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) ||
+	    in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) ||
+	    in_range(ret_block, le32_to_cpu(gdp->bg_inode_table),
 		      EXT3_SB(sb)->s_itb_per_group) ||
-	    in_range(target_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
+	    in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
 		      EXT3_SB(sb)->s_itb_per_group))
 		ext3_error(sb, "ext3_new_block",
 			    "Allocating block in system zone - "
-			    "blocks from %u, length %lu", target_block, num);
+			    "blocks from "E3FSBLK", length %lu",
+			     ret_block, num);
 
 	performed_allocation = 1;
 
@@ -1380,7 +1392,7 @@ allocated:
 		struct buffer_head *debug_bh;
 
 		/* Record bitmap buffer state in the newly allocated block */
-		debug_bh = sb_find_get_block(sb, target_block);
+		debug_bh = sb_find_get_block(sb, ret_block);
 		if (debug_bh) {
 			BUFFER_TRACE(debug_bh, "state when allocated");
 			BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
@@ -1393,24 +1405,21 @@ allocated:
 		int i;
 
 		for (i = 0; i < num; i++) {
-			if (ext3_test_bit(ret_block,
+			if (ext3_test_bit(grp_alloc_blk+i,
 					bh2jh(bitmap_bh)->b_committed_data)) {
 				printk("%s: block was unexpectedly set in "
 					"b_committed_data\n", __FUNCTION__);
 			}
 		}
 	}
-	ext3_debug("found bit %d\n", ret_block);
+	ext3_debug("found bit %d\n", grp_alloc_blk);
 	spin_unlock(sb_bgl_lock(sbi, group_no));
 	jbd_unlock_bh_state(bitmap_bh);
 #endif
 
-	/* ret_block was blockgroup-relative.  Now it becomes fs-relative */
-	ret_block = target_block;
-
 	if (ret_block + num - 1 >= le32_to_cpu(es->s_blocks_count)) {
 		ext3_error(sb, "ext3_new_block",
-			    "block(%d) >= blocks count(%d) - "
+			    "block("E3FSBLK") >= blocks count(%d) - "
 			    "block_group = %d, es == %p ", ret_block,
 			le32_to_cpu(es->s_blocks_count), group_no, es);
 		goto out;
@@ -1421,7 +1430,7 @@ allocated:
 	 * list of some description.  We don't know in advance whether
 	 * the caller wants to use it as metadata or data.
 	 */
-	ext3_debug("allocating block %d. Goal hits %d of %d.\n",
+	ext3_debug("allocating block %lu. Goal hits %d of %d.\n",
 			ret_block, goal_hits, goal_attempts);
 
 	spin_lock(sb_bgl_lock(sbi, group_no));
@@ -1461,8 +1470,8 @@ out:
 	return 0;
 }
 
-int ext3_new_block(handle_t *handle, struct inode *inode,
-			unsigned long goal, int *errp)
+ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode,
+			ext3_fsblk_t goal, int *errp)
 {
 	unsigned long count = 1;
 
@@ -1520,7 +1529,7 @@ unsigned long ext3_count_free_blocks(struct super_block *sb)
 }
 
 static inline int
-block_in_use(unsigned long block, struct super_block *sb, unsigned char *map)
+block_in_use(ext3_fsblk_t block, struct super_block *sb, unsigned char *map)
 {
 	return ext3_test_bit ((block -
 		le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) %
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index dc826464f31..36546ed36a1 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -262,9 +262,11 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
 	int ngroups = sbi->s_groups_count;
 	int inodes_per_group = EXT3_INODES_PER_GROUP(sb);
 	int freei, avefreei;
-	int freeb, avefreeb;
-	int blocks_per_dir, ndirs;
-	int max_debt, max_dirs, min_blocks, min_inodes;
+	ext3_fsblk_t freeb, avefreeb;
+	ext3_fsblk_t blocks_per_dir;
+	int ndirs;
+	int max_debt, max_dirs, min_inodes;
+	ext3_grpblk_t min_blocks;
 	int group = -1, i;
 	struct ext3_group_desc *desc;
 	struct buffer_head *bh;
@@ -307,7 +309,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
 	min_inodes = avefreei - inodes_per_group / 4;
 	min_blocks = avefreeb - EXT3_BLOCKS_PER_GROUP(sb) / 4;
 
-	max_debt = EXT3_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, BLOCK_COST);
+	max_debt = EXT3_BLOCKS_PER_GROUP(sb) / max(blocks_per_dir, (ext3_fsblk_t)BLOCK_COST);
 	if (max_debt * INODE_COST > inodes_per_group)
 		max_debt = inodes_per_group / INODE_COST;
 	if (max_debt > 255)
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 2edd7eec88f..b02bc32c57a 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -62,7 +62,7 @@ static int ext3_inode_is_fast_symlink(struct inode *inode)
  * still needs to be revoked.
  */
 int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
-			struct buffer_head *bh, int blocknr)
+			struct buffer_head *bh, ext3_fsblk_t blocknr)
 {
 	int err;
 
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index a31dff81ed7..82c678e9268 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -28,16 +28,16 @@ static int verify_group_input(struct super_block *sb,
 {
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
 	struct ext3_super_block *es = sbi->s_es;
-	unsigned start = le32_to_cpu(es->s_blocks_count);
-	unsigned end = start + input->blocks_count;
+	ext3_fsblk_t start = le32_to_cpu(es->s_blocks_count);
+	ext3_fsblk_t end = start + input->blocks_count;
 	unsigned group = input->group;
-	unsigned itend = input->inode_table + sbi->s_itb_per_group;
+	ext3_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
 	unsigned overhead = ext3_bg_has_super(sb, group) ?
 		(1 + ext3_bg_num_gdb(sb, group) +
 		 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
-	unsigned metaend = start + overhead;
+	ext3_fsblk_t metaend = start + overhead;
 	struct buffer_head *bh = NULL;
-	int free_blocks_count;
+	ext3_grpblk_t free_blocks_count;
 	int err = -EINVAL;
 
 	input->free_blocks_count = free_blocks_count =
@@ -64,7 +64,8 @@ static int verify_group_input(struct super_block *sb,
 		ext3_warning(sb, __FUNCTION__, "Bad blocks count %u",
 			     input->blocks_count);
 	else if (!(bh = sb_bread(sb, end - 1)))
-		ext3_warning(sb, __FUNCTION__, "Cannot read last block (%u)",
+		ext3_warning(sb, __FUNCTION__,
+			     "Cannot read last block ("E3FSBLK")",
 			     end - 1);
 	else if (outside(input->block_bitmap, start, end))
 		ext3_warning(sb, __FUNCTION__,
@@ -77,7 +78,7 @@ static int verify_group_input(struct super_block *sb,
 	else if (outside(input->inode_table, start, end) ||
 	         outside(itend - 1, start, end))
 		ext3_warning(sb, __FUNCTION__,
-			     "Inode table not in group (blocks %u-%u)",
+			     "Inode table not in group (blocks %u-"E3FSBLK")",
 			     input->inode_table, itend - 1);
 	else if (input->inode_bitmap == input->block_bitmap)
 		ext3_warning(sb, __FUNCTION__,
@@ -85,24 +86,27 @@ static int verify_group_input(struct super_block *sb,
 			     input->block_bitmap);
 	else if (inside(input->block_bitmap, input->inode_table, itend))
 		ext3_warning(sb, __FUNCTION__,
-			     "Block bitmap (%u) in inode table (%u-%u)",
+			     "Block bitmap (%u) in inode table (%u-"E3FSBLK")",
 			     input->block_bitmap, input->inode_table, itend-1);
 	else if (inside(input->inode_bitmap, input->inode_table, itend))
 		ext3_warning(sb, __FUNCTION__,
-			     "Inode bitmap (%u) in inode table (%u-%u)",
+			     "Inode bitmap (%u) in inode table (%u-"E3FSBLK")",
 			     input->inode_bitmap, input->inode_table, itend-1);
 	else if (inside(input->block_bitmap, start, metaend))
 		ext3_warning(sb, __FUNCTION__,
-			     "Block bitmap (%u) in GDT table (%u-%u)",
+			     "Block bitmap (%u) in GDT table"
+			     " ("E3FSBLK"-"E3FSBLK")",
 			     input->block_bitmap, start, metaend - 1);
 	else if (inside(input->inode_bitmap, start, metaend))
 		ext3_warning(sb, __FUNCTION__,
-			     "Inode bitmap (%u) in GDT table (%u-%u)",
+			     "Inode bitmap (%u) in GDT table"
+			     " ("E3FSBLK"-"E3FSBLK")",
 			     input->inode_bitmap, start, metaend - 1);
 	else if (inside(input->inode_table, start, metaend) ||
 	         inside(itend - 1, start, metaend))
 		ext3_warning(sb, __FUNCTION__,
-			     "Inode table (%u-%u) overlaps GDT table (%u-%u)",
+			     "Inode table (%u-"E3FSBLK") overlaps"
+			     "GDT table ("E3FSBLK"-"E3FSBLK")",
 			     input->inode_table, itend - 1, start, metaend - 1);
 	else
 		err = 0;
@@ -171,7 +175,7 @@ static int setup_new_group_blocks(struct super_block *sb,
 	struct buffer_head *bh;
 	handle_t *handle;
 	unsigned long block;
-	int bit;
+	ext3_grpblk_t bit;
 	int i;
 	int err = 0, err2;
 
@@ -340,7 +344,7 @@ static int verify_reserved_gdb(struct super_block *sb,
 	while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) {
 		if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){
 			ext3_warning(sb, __FUNCTION__,
-				     "reserved GDT %ld missing grp %d (%ld)",
+				     "reserved GDT %lu missing grp %d (%lu)",
 				     blk, grp,
 				     grp * EXT3_BLOCKS_PER_GROUP(sb) + blk);
 			return -EINVAL;
@@ -906,11 +910,12 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 {
 	unsigned long o_blocks_count;
 	unsigned long o_groups_count;
-	unsigned long last;
-	int add;
+	ext3_grpblk_t last;
+	ext3_grpblk_t add;
 	struct buffer_head * bh;
 	handle_t *handle;
-	int err, freed_blocks;
+	int err;
+	unsigned long freed_blocks;
 
 	/* We don't need to worry about locking wrt other resizers just
 	 * yet: we're going to revalidate es->s_blocks_count after
@@ -1001,10 +1006,10 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 	ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
 	sb->s_dirt = 1;
 	unlock_super(sb);
-	ext3_debug("freeing blocks %ld through %ld\n", o_blocks_count,
+	ext3_debug("freeing blocks %lu through %lu\n", o_blocks_count,
 		   o_blocks_count + add);
 	ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
-	ext3_debug("freed blocks %ld through %ld\n", o_blocks_count,
+	ext3_debug("freed blocks %lu through %lu\n", o_blocks_count,
 		   o_blocks_count + add);
 	if ((err = ext3_journal_stop(handle)))
 		goto exit_put;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index e0fc0c83be9..94113500fc5 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1841,7 +1841,7 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
 	struct buffer_head * bh;
 	journal_t *journal;
 	int start;
-	int len;
+	ext3_fsblk_t len;
 	int hblock, blocksize;
 	unsigned long sb_block;
 	unsigned long offset;
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index e8d60bf6b7d..1ba515de5a7 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -225,7 +225,7 @@ ext3_xattr_block_get(struct inode *inode, int name_index, const char *name,
 	error = -ENODATA;
 	if (!EXT3_I(inode)->i_file_acl)
 		goto cleanup;
-	ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl);
+	ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl);
 	bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
 	if (!bh)
 		goto cleanup;
@@ -233,7 +233,7 @@ ext3_xattr_block_get(struct inode *inode, int name_index, const char *name,
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ext3_xattr_check_block(bh)) {
 bad_block:	ext3_error(inode->i_sb, __FUNCTION__,
-			   "inode %ld: bad block %d", inode->i_ino,
+			   "inode %ld: bad block %u", inode->i_ino,
 			   EXT3_I(inode)->i_file_acl);
 		error = -EIO;
 		goto cleanup;
@@ -366,7 +366,7 @@ ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
 	error = 0;
 	if (!EXT3_I(inode)->i_file_acl)
 		goto cleanup;
-	ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl);
+	ea_idebug(inode, "reading block %u", EXT3_I(inode)->i_file_acl);
 	bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
 	error = -EIO;
 	if (!bh)
@@ -375,7 +375,7 @@ ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ext3_xattr_check_block(bh)) {
 		ext3_error(inode->i_sb, __FUNCTION__,
-			   "inode %ld: bad block %d", inode->i_ino,
+			   "inode %ld: bad block %u", inode->i_ino,
 			   EXT3_I(inode)->i_file_acl);
 		error = -EIO;
 		goto cleanup;
@@ -647,7 +647,7 @@ ext3_xattr_block_find(struct inode *inode, struct ext3_xattr_info *i,
 			le32_to_cpu(BHDR(bs->bh)->h_refcount));
 		if (ext3_xattr_check_block(bs->bh)) {
 			ext3_error(sb, __FUNCTION__,
-				"inode %ld: bad block %d", inode->i_ino,
+				"inode %ld: bad block %u", inode->i_ino,
 				EXT3_I(inode)->i_file_acl);
 			error = -EIO;
 			goto cleanup;
@@ -792,11 +792,12 @@ inserted:
 			get_bh(new_bh);
 		} else {
 			/* We need to allocate a new block */
-			int goal = le32_to_cpu(
+			ext3_fsblk_t goal = le32_to_cpu(
 					EXT3_SB(sb)->s_es->s_first_data_block) +
-				EXT3_I(inode)->i_block_group *
+				(ext3_fsblk_t)EXT3_I(inode)->i_block_group *
 				EXT3_BLOCKS_PER_GROUP(sb);
-			int block = ext3_new_block(handle, inode, goal, &error);
+			ext3_fsblk_t block = ext3_new_block(handle, inode,
+							goal, &error);
 			if (error)
 				goto cleanup;
 			ea_idebug(inode, "creating block %d", block);
@@ -847,7 +848,7 @@ cleanup_dquot:
 
 bad_block:
 	ext3_error(inode->i_sb, __FUNCTION__,
-		   "inode %ld: bad block %d", inode->i_ino,
+		   "inode %ld: bad block %u", inode->i_ino,
 		   EXT3_I(inode)->i_file_acl);
 	goto cleanup;
 
@@ -1076,14 +1077,14 @@ ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
 	bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
 	if (!bh) {
 		ext3_error(inode->i_sb, __FUNCTION__,
-			"inode %ld: block %d read error", inode->i_ino,
+			"inode %ld: block %u read error", inode->i_ino,
 			EXT3_I(inode)->i_file_acl);
 		goto cleanup;
 	}
 	if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
 	    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
 		ext3_error(inode->i_sb, __FUNCTION__,
-			"inode %ld: bad block %d", inode->i_ino,
+			"inode %ld: bad block %u", inode->i_ino,
 			EXT3_I(inode)->i_file_acl);
 		goto cleanup;
 	}
@@ -1210,11 +1211,11 @@ again:
 		bh = sb_bread(inode->i_sb, ce->e_block);
 		if (!bh) {
 			ext3_error(inode->i_sb, __FUNCTION__,
-				"inode %ld: block %ld read error",
+				"inode %ld: block %lu read error",
 				inode->i_ino, (unsigned long) ce->e_block);
 		} else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
 				EXT3_XATTR_REFCOUNT_MAX) {
-			ea_idebug(inode, "block %ld refcount %d>=%d",
+			ea_idebug(inode, "block %lu refcount %d>=%d",
 				  (unsigned long) ce->e_block,
 				  le32_to_cpu(BHDR(bh)->h_refcount),
 					  EXT3_XATTR_REFCOUNT_MAX);
-- 
cgit v1.2.3


From 43d23f9039fc810ecd621f1e4f9d578eadce058a Mon Sep 17 00:00:00 2001
From: Mingming Cao <cmm@us.ibm.com>
Date: Sun, 25 Jun 2006 05:48:07 -0700
Subject: [PATCH] ext3_fsblk_t: the rest of in-kernel filesystem blocks
 conversion

Convert the ext3 in-kernel filesystem blocks to ext3_fsblk_t.  Convert the
rest of all unsigned long type in-kernel filesystem blocks to ext3_fsblk_t,
and replace the printk format string respondingly.

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext3/balloc.c | 29 +++++++++++++----------------
 fs/ext3/inode.c  | 55 ++++++++++++++++++++++++++++---------------------------
 fs/ext3/ioctl.c  |  2 +-
 fs/ext3/resize.c | 34 ++++++++++++++++++----------------
 fs/ext3/super.c  | 32 ++++++++++++++++----------------
 fs/ext3/xattr.c  | 12 ++++++------
 6 files changed, 82 insertions(+), 82 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index b1633cd28ec..96172e89ddc 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -168,8 +168,7 @@ goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal,
 {
 	ext3_fsblk_t group_first_block, group_last_block;
 
-	group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
-				group * EXT3_BLOCKS_PER_GROUP(sb);
+	group_first_block = ext3_group_first_block_no(sb, group);
 	group_last_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1;
 
 	if ((rsv->_rsv_start > group_last_block) ||
@@ -664,9 +663,7 @@ ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
 
 	/* we do allocation within the reservation window if we have a window */
 	if (my_rsv) {
-		group_first_block =
-			le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
-			group * EXT3_BLOCKS_PER_GROUP(sb);
+		group_first_block = ext3_group_first_block_no(sb, group);
 		if (my_rsv->_rsv_start >= group_first_block)
 			start = my_rsv->_rsv_start - group_first_block;
 		else
@@ -900,8 +897,7 @@ static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv,
 	int ret;
 	spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;
 
-	group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
-				group * EXT3_BLOCKS_PER_GROUP(sb);
+	group_first_block = ext3_group_first_block_no(sb, group);
 	group_end_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1;
 
 	if (grp_goal < 0)
@@ -1104,8 +1100,7 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 	 * first block is a filesystem wide block number
 	 * first block is the block number of the first block in this group
 	 */
-	group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
-			group * EXT3_BLOCKS_PER_GROUP(sb);
+	group_first_block = ext3_group_first_block_no(sb, group);
 
 	/*
 	 * Basically we will allocate a new block from inode's reservation
@@ -1371,8 +1366,7 @@ allocated:
 	if (fatal)
 		goto out;
 
-	ret_block = grp_alloc_blk + group_no * EXT3_BLOCKS_PER_GROUP(sb)
-				+ le32_to_cpu(es->s_first_data_block);
+	ret_block = grp_alloc_blk + ext3_group_first_block_no(sb, group_no);
 
 	if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) ||
 	    in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) ||
@@ -1478,15 +1472,16 @@ ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode,
 	return ext3_new_blocks(handle, inode, goal, &count, errp);
 }
 
-unsigned long ext3_count_free_blocks(struct super_block *sb)
+ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb)
 {
-	unsigned long desc_count;
+	ext3_fsblk_t desc_count;
 	struct ext3_group_desc *gdp;
 	int i;
 	unsigned long ngroups = EXT3_SB(sb)->s_groups_count;
 #ifdef EXT3FS_DEBUG
 	struct ext3_super_block *es;
-	unsigned long bitmap_count, x;
+	ext3_fsblk_t bitmap_count;
+	unsigned long x;
 	struct buffer_head *bitmap_bh = NULL;
 
 	es = EXT3_SB(sb)->s_es;
@@ -1511,8 +1506,10 @@ unsigned long ext3_count_free_blocks(struct super_block *sb)
 		bitmap_count += x;
 	}
 	brelse(bitmap_bh);
-	printk("ext3_count_free_blocks: stored = %u, computed = %lu, %lu\n",
-	       le32_to_cpu(es->s_free_blocks_count), desc_count, bitmap_count);
+	printk("ext3_count_free_blocks: stored = "E3FSBLK
+		", computed = "E3FSBLK", "E3FSBLK"\n",
+	       le32_to_cpu(es->s_free_blocks_count),
+		desc_count, bitmap_count);
 	return bitmap_count;
 #else
 	desc_count = 0;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index b02bc32c57a..0321e1b9034 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -407,13 +407,13 @@ no_block:
  *
  *	Caller must make sure that @ind is valid and will stay that way.
  */
-static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
+static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind)
 {
 	struct ext3_inode_info *ei = EXT3_I(inode);
 	__le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
 	__le32 *p;
-	unsigned long bg_start;
-	unsigned long colour;
+	ext3_fsblk_t bg_start;
+	ext3_grpblk_t colour;
 
 	/* Try to find previous block */
 	for (p = ind->p - 1; p >= start; p--) {
@@ -429,8 +429,7 @@ static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
 	 * It is going to be referred to from the inode itself? OK, just put it
 	 * into the same cylinder group then.
 	 */
-	bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
-		le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
+	bg_start = ext3_group_first_block_no(inode->i_sb, ei->i_block_group);
 	colour = (current->pid % 16) *
 			(EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
 	return bg_start + colour;
@@ -448,7 +447,7 @@ static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
  *	stores it in *@goal and returns zero.
  */
 
-static unsigned long ext3_find_goal(struct inode *inode, long block,
+static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block,
 		Indirect chain[4], Indirect *partial)
 {
 	struct ext3_block_alloc_info *block_i;
@@ -516,13 +515,13 @@ static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
  *		direct blocks
  */
 static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
-			unsigned long goal, int indirect_blks, int blks,
-			unsigned long long new_blocks[4], int *err)
+			ext3_fsblk_t goal, int indirect_blks, int blks,
+			ext3_fsblk_t new_blocks[4], int *err)
 {
 	int target, i;
 	unsigned long count = 0;
 	int index = 0;
-	unsigned long current_block = 0;
+	ext3_fsblk_t current_block = 0;
 	int ret = 0;
 
 	/*
@@ -592,7 +591,7 @@ failed_out:
  *	as described above and return 0.
  */
 static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
-			int indirect_blks, int *blks, unsigned long goal,
+			int indirect_blks, int *blks, ext3_fsblk_t goal,
 			int *offsets, Indirect *branch)
 {
 	int blocksize = inode->i_sb->s_blocksize;
@@ -600,8 +599,8 @@ static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
 	int err = 0;
 	struct buffer_head *bh;
 	int num;
-	unsigned long long new_blocks[4];
-	unsigned long long current_block;
+	ext3_fsblk_t new_blocks[4];
+	ext3_fsblk_t current_block;
 
 	num = ext3_alloc_blocks(handle, inode, goal, indirect_blks,
 				*blks, new_blocks, &err);
@@ -688,7 +687,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
 	int i;
 	int err = 0;
 	struct ext3_block_alloc_info *block_i;
-	unsigned long current_block;
+	ext3_fsblk_t current_block;
 
 	block_i = EXT3_I(inode)->i_block_alloc_info;
 	/*
@@ -795,13 +794,13 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
 	int offsets[4];
 	Indirect chain[4];
 	Indirect *partial;
-	unsigned long goal;
+	ext3_fsblk_t goal;
 	int indirect_blks;
 	int blocks_to_boundary = 0;
 	int depth;
 	struct ext3_inode_info *ei = EXT3_I(inode);
 	int count = 0;
-	unsigned long first_block = 0;
+	ext3_fsblk_t first_block = 0;
 
 
 	J_ASSERT(handle != NULL || create == 0);
@@ -819,7 +818,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
 		count++;
 		/*map more blocks*/
 		while (count < maxblocks && count <= blocks_to_boundary) {
-			unsigned long blk;
+			ext3_fsblk_t blk;
 
 			if (!verify_chain(chain, partial)) {
 				/*
@@ -1759,7 +1758,7 @@ void ext3_set_aops(struct inode *inode)
 static int ext3_block_truncate_page(handle_t *handle, struct page *page,
 		struct address_space *mapping, loff_t from)
 {
-	unsigned long index = from >> PAGE_CACHE_SHIFT;
+	ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT;
 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
 	unsigned blocksize, iblock, length, pos;
 	struct inode *inode = mapping->host;
@@ -1960,7 +1959,7 @@ no_top:
  * than `count' because there can be holes in there.
  */
 static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
-		struct buffer_head *bh, unsigned long block_to_free,
+		struct buffer_head *bh, ext3_fsblk_t block_to_free,
 		unsigned long count, __le32 *first, __le32 *last)
 {
 	__le32 *p;
@@ -2022,12 +2021,12 @@ static void ext3_free_data(handle_t *handle, struct inode *inode,
 			   struct buffer_head *this_bh,
 			   __le32 *first, __le32 *last)
 {
-	unsigned long block_to_free = 0;    /* Starting block # of a run */
+	ext3_fsblk_t block_to_free = 0;    /* Starting block # of a run */
 	unsigned long count = 0;	    /* Number of blocks in the run */ 
 	__le32 *block_to_free_p = NULL;	    /* Pointer into inode/ind
 					       corresponding to
 					       block_to_free */
-	unsigned long nr;		    /* Current block # */
+	ext3_fsblk_t nr;		    /* Current block # */
 	__le32 *p;			    /* Pointer into inode/ind
 					       for current block */
 	int err;
@@ -2089,7 +2088,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
 			       struct buffer_head *parent_bh,
 			       __le32 *first, __le32 *last, int depth)
 {
-	unsigned long nr;
+	ext3_fsblk_t nr;
 	__le32 *p;
 
 	if (is_handle_aborted(handle))
@@ -2113,7 +2112,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
 			 */
 			if (!bh) {
 				ext3_error(inode->i_sb, "ext3_free_branches",
-					   "Read failure, inode=%ld, block=%ld",
+					   "Read failure, inode=%ld, block="E3FSBLK,
 					   inode->i_ino, nr);
 				continue;
 			}
@@ -2394,11 +2393,12 @@ out_stop:
 	ext3_journal_stop(handle);
 }
 
-static unsigned long ext3_get_inode_block(struct super_block *sb,
+static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb,
 		unsigned long ino, struct ext3_iloc *iloc)
 {
 	unsigned long desc, group_desc, block_group;
-	unsigned long offset, block;
+	unsigned long offset;
+	ext3_fsblk_t block;
 	struct buffer_head *bh;
 	struct ext3_group_desc * gdp;
 
@@ -2448,7 +2448,7 @@ static unsigned long ext3_get_inode_block(struct super_block *sb,
 static int __ext3_get_inode_loc(struct inode *inode,
 				struct ext3_iloc *iloc, int in_mem)
 {
-	unsigned long block;
+	ext3_fsblk_t block;
 	struct buffer_head *bh;
 
 	block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
@@ -2459,7 +2459,8 @@ static int __ext3_get_inode_loc(struct inode *inode,
 	if (!bh) {
 		ext3_error (inode->i_sb, "ext3_get_inode_loc",
 				"unable to read inode block - "
-				"inode=%lu, block=%lu", inode->i_ino, block);
+				"inode=%lu, block="E3FSBLK,
+				 inode->i_ino, block);
 		return -EIO;
 	}
 	if (!buffer_uptodate(bh)) {
@@ -2540,7 +2541,7 @@ make_io:
 		if (!buffer_uptodate(bh)) {
 			ext3_error(inode->i_sb, "ext3_get_inode_loc",
 					"unable to read inode block - "
-					"inode=%lu, block=%lu",
+					"inode=%lu, block="E3FSBLK,
 					inode->i_ino, block);
 			brelse(bh);
 			return -EIO;
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 8c22aa9a7fb..3a6b012d120 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -204,7 +204,7 @@ flags_err:
 		return 0;
 	}
 	case EXT3_IOC_GROUP_EXTEND: {
-		unsigned long n_blocks_count;
+		ext3_fsblk_t n_blocks_count;
 		struct super_block *sb = inode->i_sb;
 		int err;
 
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 82c678e9268..dfd811895d8 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -116,7 +116,7 @@ static int verify_group_input(struct super_block *sb,
 }
 
 static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
-				  unsigned long blk)
+				  ext3_fsblk_t blk)
 {
 	struct buffer_head *bh;
 	int err;
@@ -167,14 +167,13 @@ static int setup_new_group_blocks(struct super_block *sb,
 				  struct ext3_new_group_data *input)
 {
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
-	unsigned long start = input->group * sbi->s_blocks_per_group +
-		le32_to_cpu(sbi->s_es->s_first_data_block);
+	ext3_fsblk_t start = ext3_group_first_block_no(sb, input->group);
 	int reserved_gdb = ext3_bg_has_super(sb, input->group) ?
 		le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0;
 	unsigned long gdblocks = ext3_bg_num_gdb(sb, input->group);
 	struct buffer_head *bh;
 	handle_t *handle;
-	unsigned long block;
+	ext3_fsblk_t block;
 	ext3_grpblk_t bit;
 	int i;
 	int err = 0, err2;
@@ -332,7 +331,7 @@ static unsigned ext3_list_backups(struct super_block *sb, unsigned *three,
 static int verify_reserved_gdb(struct super_block *sb,
 			       struct buffer_head *primary)
 {
-	const unsigned long blk = primary->b_blocknr;
+	const ext3_fsblk_t blk = primary->b_blocknr;
 	const unsigned long end = EXT3_SB(sb)->s_groups_count;
 	unsigned three = 1;
 	unsigned five = 5;
@@ -344,7 +343,8 @@ static int verify_reserved_gdb(struct super_block *sb,
 	while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) {
 		if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){
 			ext3_warning(sb, __FUNCTION__,
-				     "reserved GDT %lu missing grp %d (%lu)",
+				     "reserved GDT "E3FSBLK
+				     " missing grp %d ("E3FSBLK")",
 				     blk, grp,
 				     grp * EXT3_BLOCKS_PER_GROUP(sb) + blk);
 			return -EINVAL;
@@ -376,7 +376,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	struct super_block *sb = inode->i_sb;
 	struct ext3_super_block *es = EXT3_SB(sb)->s_es;
 	unsigned long gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb);
-	unsigned long gdblock = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
+	ext3_fsblk_t gdblock = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
 	struct buffer_head **o_group_desc, **n_group_desc;
 	struct buffer_head *dind;
 	int gdbackups;
@@ -421,7 +421,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	data = (__u32 *)dind->b_data;
 	if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) {
 		ext3_warning(sb, __FUNCTION__,
-			     "new group %u GDT block %lu not reserved",
+			     "new group %u GDT block "E3FSBLK" not reserved",
 			     input->group, gdblock);
 		err = -EINVAL;
 		goto exit_dind;
@@ -519,7 +519,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
 	struct buffer_head **primary;
 	struct buffer_head *dind;
 	struct ext3_iloc iloc;
-	unsigned long blk;
+	ext3_fsblk_t blk;
 	__u32 *data, *end;
 	int gdbackups = 0;
 	int res, i;
@@ -544,7 +544,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
 	for (res = 0; res < reserved_gdb; res++, blk++) {
 		if (le32_to_cpu(*data) != blk) {
 			ext3_warning(sb, __FUNCTION__,
-				     "reserved block %lu not at offset %ld",
+				     "reserved block "E3FSBLK
+				     " not at offset %ld",
 				     blk, (long)(data - (__u32 *)dind->b_data));
 			err = -EINVAL;
 			goto exit_bh;
@@ -906,9 +907,9 @@ exit_put:
  * GDT blocks are reserved to grow to the desired size.
  */
 int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
-		      unsigned long n_blocks_count)
+		      ext3_fsblk_t n_blocks_count)
 {
-	unsigned long o_blocks_count;
+	ext3_fsblk_t o_blocks_count;
 	unsigned long o_groups_count;
 	ext3_grpblk_t last;
 	ext3_grpblk_t add;
@@ -924,7 +925,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 	o_groups_count = EXT3_SB(sb)->s_groups_count;
 
 	if (test_opt(sb, DEBUG))
-		printk(KERN_DEBUG "EXT3-fs: extending last group from %lu to %lu blocks\n",
+		printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK" uto "E3FSBLK" blocks\n",
 		       o_blocks_count, n_blocks_count);
 
 	if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
@@ -963,7 +964,8 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 
 	if (o_blocks_count + add < n_blocks_count)
 		ext3_warning(sb, __FUNCTION__,
-			     "will only finish group (%lu blocks, %u new)",
+			     "will only finish group ("E3FSBLK
+			     " blocks, %u new)",
 			     o_blocks_count + add, add);
 
 	/* See if the device is actually as big as what was requested */
@@ -1006,10 +1008,10 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 	ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
 	sb->s_dirt = 1;
 	unlock_super(sb);
-	ext3_debug("freeing blocks %lu through %lu\n", o_blocks_count,
+	ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count,
 		   o_blocks_count + add);
 	ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
-	ext3_debug("freed blocks %lu through %lu\n", o_blocks_count,
+	ext3_debug("freed blocks "E3FSBLK" through "E3FSBLK"\n", o_blocks_count,
 		   o_blocks_count + add);
 	if ((err = ext3_journal_stop(handle)))
 		goto exit_put;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 94113500fc5..b2891cc29db 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -689,14 +689,15 @@ static match_table_t tokens = {
 	{Opt_resize, "resize"},
 };
 
-static unsigned long get_sb_block(void **data)
+static ext3_fsblk_t get_sb_block(void **data)
 {
-	unsigned long 	sb_block;
+	ext3_fsblk_t 	sb_block;
 	char 		*options = (char *) *data;
 
 	if (!options || strncmp(options, "sb=", 3) != 0)
 		return 1;	/* Default location */
 	options += 3;
+	/*todo: use simple_strtoll with >32bit ext3 */
 	sb_block = simple_strtoul(options, &options, 0);
 	if (*options && *options != ',') {
 		printk("EXT3-fs: Invalid sb specification: %s\n",
@@ -711,7 +712,7 @@ static unsigned long get_sb_block(void **data)
 
 static int parse_options (char *options, struct super_block *sb,
 			  unsigned long *inum, unsigned long *journal_devnum,
-			  unsigned long *n_blocks_count, int is_remount)
+			  ext3_fsblk_t *n_blocks_count, int is_remount)
 {
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
 	char * p;
@@ -1128,7 +1129,7 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
 static int ext3_check_descriptors (struct super_block * sb)
 {
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
-	unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block);
+	ext3_fsblk_t block = le32_to_cpu(sbi->s_es->s_first_data_block);
 	struct ext3_group_desc * gdp = NULL;
 	int desc_block = 0;
 	int i;
@@ -1315,15 +1316,14 @@ static loff_t ext3_max_size(int bits)
 	return res;
 }
 
-static unsigned long descriptor_loc(struct super_block *sb,
-				    unsigned long logic_sb_block,
+static ext3_fsblk_t descriptor_loc(struct super_block *sb,
+				    ext3_fsblk_t logic_sb_block,
 				    int nr)
 {
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
-	unsigned long bg, first_data_block, first_meta_bg;
+	unsigned long bg, first_meta_bg;
 	int has_super = 0;
 
-	first_data_block = le32_to_cpu(sbi->s_es->s_first_data_block);
 	first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
 
 	if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_META_BG) ||
@@ -1332,7 +1332,7 @@ static unsigned long descriptor_loc(struct super_block *sb,
 	bg = sbi->s_desc_per_block * nr;
 	if (ext3_bg_has_super(sb, bg))
 		has_super = 1;
-	return (first_data_block + has_super + (bg * sbi->s_blocks_per_group));
+	return (has_super + ext3_group_first_block_no(sb, bg));
 }
 
 
@@ -1341,9 +1341,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	struct buffer_head * bh;
 	struct ext3_super_block *es = NULL;
 	struct ext3_sb_info *sbi;
-	unsigned long block;
-	unsigned long sb_block = get_sb_block(&data);
-	unsigned long logic_sb_block;
+	ext3_fsblk_t block;
+	ext3_fsblk_t sb_block = get_sb_block(&data);
+	ext3_fsblk_t logic_sb_block;
 	unsigned long offset = 0;
 	unsigned long journal_inum = 0;
 	unsigned long journal_devnum = 0;
@@ -1840,10 +1840,10 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
 {
 	struct buffer_head * bh;
 	journal_t *journal;
-	int start;
+	ext3_fsblk_t start;
 	ext3_fsblk_t len;
 	int hblock, blocksize;
-	unsigned long sb_block;
+	ext3_fsblk_t sb_block;
 	unsigned long offset;
 	struct ext3_super_block * es;
 	struct block_device *bdev;
@@ -2216,7 +2216,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 {
 	struct ext3_super_block * es;
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
-	unsigned long n_blocks_count = 0;
+	ext3_fsblk_t n_blocks_count = 0;
 	unsigned long old_sb_flags;
 	struct ext3_mount_options old_opts;
 	int err;
@@ -2336,7 +2336,7 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
 	struct super_block *sb = dentry->d_sb;
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
 	struct ext3_super_block *es = sbi->s_es;
-	unsigned long overhead;
+	ext3_fsblk_t overhead;
 	int i;
 
 	if (test_opt (sb, MINIX_DF))
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 1ba515de5a7..a44a0562203 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -233,7 +233,7 @@ ext3_xattr_block_get(struct inode *inode, int name_index, const char *name,
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ext3_xattr_check_block(bh)) {
 bad_block:	ext3_error(inode->i_sb, __FUNCTION__,
-			   "inode %ld: bad block %u", inode->i_ino,
+			   "inode %ld: bad block "E3FSBLK, inode->i_ino,
 			   EXT3_I(inode)->i_file_acl);
 		error = -EIO;
 		goto cleanup;
@@ -375,7 +375,7 @@ ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ext3_xattr_check_block(bh)) {
 		ext3_error(inode->i_sb, __FUNCTION__,
-			   "inode %ld: bad block %u", inode->i_ino,
+			   "inode %ld: bad block "E3FSBLK, inode->i_ino,
 			   EXT3_I(inode)->i_file_acl);
 		error = -EIO;
 		goto cleanup;
@@ -647,7 +647,7 @@ ext3_xattr_block_find(struct inode *inode, struct ext3_xattr_info *i,
 			le32_to_cpu(BHDR(bs->bh)->h_refcount));
 		if (ext3_xattr_check_block(bs->bh)) {
 			ext3_error(sb, __FUNCTION__,
-				"inode %ld: bad block %u", inode->i_ino,
+				"inode %ld: bad block "E3FSBLK, inode->i_ino,
 				EXT3_I(inode)->i_file_acl);
 			error = -EIO;
 			goto cleanup;
@@ -848,7 +848,7 @@ cleanup_dquot:
 
 bad_block:
 	ext3_error(inode->i_sb, __FUNCTION__,
-		   "inode %ld: bad block %u", inode->i_ino,
+		   "inode %ld: bad block "E3FSBLK, inode->i_ino,
 		   EXT3_I(inode)->i_file_acl);
 	goto cleanup;
 
@@ -1077,14 +1077,14 @@ ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
 	bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
 	if (!bh) {
 		ext3_error(inode->i_sb, __FUNCTION__,
-			"inode %ld: block %u read error", inode->i_ino,
+			"inode %ld: block "E3FSBLK" read error", inode->i_ino,
 			EXT3_I(inode)->i_file_acl);
 		goto cleanup;
 	}
 	if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
 	    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
 		ext3_error(inode->i_sb, __FUNCTION__,
-			"inode %ld: bad block %u", inode->i_ino,
+			"inode %ld: bad block "E3FSBLK, inode->i_ino,
 			EXT3_I(inode)->i_file_acl);
 		goto cleanup;
 	}
-- 
cgit v1.2.3


From 69755652c92106855b4b096b7c2935b59e6252c6 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Sun, 25 Jun 2006 05:48:08 -0700
Subject: [PATCH] Make procfs obligatory except under CONFIG_EMBEDDED

Make procfs non-optional unless EMBEDDED is set, just like sysfs.  procfs
is already de facto required for a large subset of Linux functionality.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/Kconfig | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index ea60e83e7fe..1cdc043922d 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -776,7 +776,8 @@ endmenu
 menu "Pseudo filesystems"
 
 config PROC_FS
-	bool "/proc file system support"
+	bool "/proc file system support" if EMBEDDED
+	default y
 	help
 	  This is a virtual file system providing information about the status
 	  of the system. "Virtual" means that it doesn't take up any space on
-- 
cgit v1.2.3


From 21730eed11de42f22afcbd43f450a1872a0b5ea1 Mon Sep 17 00:00:00 2001
From: Valerie Henson <val_henson@linux.intel.com>
Date: Sun, 25 Jun 2006 05:48:12 -0700
Subject: [PATCH] Make EXT2_DEBUG work again

This patch makes EXT2_DEBUG work again.  Due to lack of proper include
file, EXT2_DEBUG was undefined in bitmap.c and ext2_count_free() is left
out.  Moved to balloc.c and removed bitmap.c entirely.

Second, debug versions of ext2_count_free_{inodes/blocks} reacquires
superblock lock.  Moved lock into callers.

Signed-off-by: Val Henson <val_henson@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext2/Makefile |  2 +-
 fs/ext2/balloc.c | 22 ++++++++++++++++++++--
 fs/ext2/bitmap.c | 32 --------------------------------
 fs/ext2/ialloc.c |  3 +--
 fs/ext2/super.c  |  2 ++
 5 files changed, 24 insertions(+), 37 deletions(-)
 delete mode 100644 fs/ext2/bitmap.c

(limited to 'fs')

diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile
index c5d02da73bc..e0b2b43c1fd 100644
--- a/fs/ext2/Makefile
+++ b/fs/ext2/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_EXT2_FS) += ext2.o
 
-ext2-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+ext2-y := balloc.o dir.o file.o fsync.o ialloc.o inode.o \
 	  ioctl.o namei.o super.o symlink.o
 
 ext2-$(CONFIG_EXT2_FS_XATTR)	 += xattr.o xattr_user.o xattr_trusted.o
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 2c00953d4b0..433a213a8bd 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -521,6 +521,26 @@ io_error:
 	goto out_release;
 }
 
+#ifdef EXT2FS_DEBUG
+
+static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
+
+unsigned long ext2_count_free (struct buffer_head * map, unsigned int numchars)
+{
+	unsigned int i;
+	unsigned long sum = 0;
+
+	if (!map)
+		return (0);
+	for (i = 0; i < numchars; i++)
+		sum += nibblemap[map->b_data[i] & 0xf] +
+			nibblemap[(map->b_data[i] >> 4) & 0xf];
+	return (sum);
+}
+
+#endif  /*  EXT2FS_DEBUG  */
+
+/* Superblock must be locked */
 unsigned long ext2_count_free_blocks (struct super_block * sb)
 {
 	struct ext2_group_desc * desc;
@@ -530,7 +550,6 @@ unsigned long ext2_count_free_blocks (struct super_block * sb)
 	unsigned long bitmap_count, x;
 	struct ext2_super_block *es;
 
-	lock_super (sb);
 	es = EXT2_SB(sb)->s_es;
 	desc_count = 0;
 	bitmap_count = 0;
@@ -554,7 +573,6 @@ unsigned long ext2_count_free_blocks (struct super_block * sb)
 	printk("ext2_count_free_blocks: stored = %lu, computed = %lu, %lu\n",
 		(long)le32_to_cpu(es->s_free_blocks_count),
 		desc_count, bitmap_count);
-	unlock_super (sb);
 	return bitmap_count;
 #else
         for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
diff --git a/fs/ext2/bitmap.c b/fs/ext2/bitmap.c
deleted file mode 100644
index e9983a0dd39..00000000000
--- a/fs/ext2/bitmap.c
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- *  linux/fs/ext2/bitmap.c
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- */
-
-#ifdef EXT2FS_DEBUG
-
-#include <linux/buffer_head.h>
-
-#include "ext2.h"
-
-static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
-
-unsigned long ext2_count_free (struct buffer_head * map, unsigned int numchars)
-{
-	unsigned int i;
-	unsigned long sum = 0;
-	
-	if (!map) 
-		return (0);
-	for (i = 0; i < numchars; i++)
-		sum += nibblemap[map->b_data[i] & 0xf] +
-			nibblemap[(map->b_data[i] >> 4) & 0xf];
-	return (sum);
-}
-
-#endif  /*  EXT2FS_DEBUG  */
-
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index e52765219e1..308c252568c 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -638,6 +638,7 @@ fail:
 	return ERR_PTR(err);
 }
 
+/* Superblock must be locked */
 unsigned long ext2_count_free_inodes (struct super_block * sb)
 {
 	struct ext2_group_desc *desc;
@@ -649,7 +650,6 @@ unsigned long ext2_count_free_inodes (struct super_block * sb)
 	unsigned long bitmap_count = 0;
 	struct buffer_head *bitmap_bh = NULL;
 
-	lock_super (sb);
 	es = EXT2_SB(sb)->s_es;
 	for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
 		unsigned x;
@@ -672,7 +672,6 @@ unsigned long ext2_count_free_inodes (struct super_block * sb)
 	printk("ext2_count_free_inodes: stored = %lu, computed = %lu, %lu\n",
 		percpu_counter_read(&EXT2_SB(sb)->s_freeinodes_counter),
 		desc_count, bitmap_count);
-	unlock_super(sb);
 	return desc_count;
 #else
 	for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index fec55ff23de..d4233b2e643 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1045,6 +1045,7 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
 	unsigned long overhead;
 	int i;
 
+	lock_super(sb);
 	if (test_opt (sb, MINIX_DF))
 		overhead = 0;
 	else {
@@ -1085,6 +1086,7 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
 	buf->f_files = le32_to_cpu(sbi->s_es->s_inodes_count);
 	buf->f_ffree = ext2_count_free_inodes (sb);
 	buf->f_namelen = EXT2_NAME_LEN;
+	unlock_super(sb);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 3419b23a919698f75944d3e0d97eb1d9c51e4bb6 Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Sun, 25 Jun 2006 05:48:14 -0700
Subject: [PATCH] epoll: use unlocked wqueue operations

A few days ago Arjan signaled a lockdep red flag on epoll locks, and
precisely between the epoll's device structure lock (->lock) and the wait
queue head lock (->lock).

Like I explained in another email, and directly to Arjan, this can't happen
in reality because of the explicit check at eventpoll.c:592, that does not
allow to drop an epoll fd inside the same epoll fd.  Since lockdep is
working on per-structure locks, it will never be able to know of policies
enforced in other parts of the code.

It was decided time ago of having the ability to drop epoll fds inside
other epoll fds, that triggers a very trick wakeup operations (due to
possibly reentrant callback-driven wakeups) handled by the
ep_poll_safewake() function.  While looking again at the code though, I
noticed that all the operations done on the epoll's main structure wait
queue head (->wq) are already protected by the epoll lock (->lock), so that
locked-style functions can be used to manipulate the ->wq member.  This
makes both a lock-acquire save, and lockdep happy.

Running totalmess on my dual opteron for a while did not reveal any problem
so far:

http://www.xmailserver.org/totalmess.c

Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/eventpoll.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 08e7e6a555c..9c677bbd0b0 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1,6 +1,6 @@
 /*
  *  fs/eventpoll.c ( Efficent event polling implementation )
- *  Copyright (C) 2001,...,2003	 Davide Libenzi
+ *  Copyright (C) 2001,...,2006	 Davide Libenzi
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -1004,7 +1004,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 
 		/* Notify waiting tasks that events are available */
 		if (waitqueue_active(&ep->wq))
-			wake_up(&ep->wq);
+			__wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE);
 		if (waitqueue_active(&ep->poll_wait))
 			pwake++;
 	}
@@ -1083,7 +1083,8 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 
 				/* Notify waiting tasks that events are available */
 				if (waitqueue_active(&ep->wq))
-					wake_up(&ep->wq);
+					__wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
+							 TASK_INTERRUPTIBLE);
 				if (waitqueue_active(&ep->poll_wait))
 					pwake++;
 			}
@@ -1260,7 +1261,8 @@ is_linked:
 	 * wait list.
 	 */
 	if (waitqueue_active(&ep->wq))
-		wake_up(&ep->wq);
+		__wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
+				 TASK_INTERRUPTIBLE);
 	if (waitqueue_active(&ep->poll_wait))
 		pwake++;
 
@@ -1444,7 +1446,8 @@ static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist)
 		 * wait list.
 		 */
 		if (waitqueue_active(&ep->wq))
-			wake_up(&ep->wq);
+			__wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
+					 TASK_INTERRUPTIBLE);
 		if (waitqueue_active(&ep->poll_wait))
 			pwake++;
 	}
@@ -1516,7 +1519,7 @@ retry:
 		 * ep_poll_callback() when events will become available.
 		 */
 		init_waitqueue_entry(&wait, current);
-		add_wait_queue(&ep->wq, &wait);
+		__add_wait_queue(&ep->wq, &wait);
 
 		for (;;) {
 			/*
@@ -1536,7 +1539,7 @@ retry:
 			jtimeout = schedule_timeout(jtimeout);
 			write_lock_irqsave(&ep->lock, flags);
 		}
-		remove_wait_queue(&ep->wq, &wait);
+		__remove_wait_queue(&ep->wq, &wait);
 
 		set_current_state(TASK_RUNNING);
 	}
-- 
cgit v1.2.3


From 0710d36a0fd5db3176369397f0fc49db32a63507 Mon Sep 17 00:00:00 2001
From: Florin Malita <fmalita@gmail.com>
Date: Sun, 25 Jun 2006 05:48:31 -0700
Subject: [PATCH] 9pfs: missing result check in v9fs_vfs_readlink() and
 v9fs_vfs_link()

__getname() may fail and return NULL (as pointed out by Coverity 437 &
1220).

Signed-off-by: Florin Malita <fmalita@gmail.com>
Acked-by: Eric Van Hensbergen <ericvh@gmail.com>
Cc: <rminnich@lanl.gov>
Cc: Latchesar Ionkov <lucho@ionkov.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/vfs_inode.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 2cb87ba4b1c..8e60dc7ec4a 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1054,6 +1054,9 @@ static int v9fs_vfs_readlink(struct dentry *dentry, char __user * buffer,
 	int ret;
 	char *link = __getname();
 
+	if (unlikely(!link))
+		return -ENOMEM;
+
 	if (buflen > PATH_MAX)
 		buflen = PATH_MAX;
 
@@ -1227,6 +1230,9 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
 	}
 
 	name = __getname();
+	if (unlikely(!name))
+		return -ENOMEM;
+
 	sprintf(name, "%d\n", oldfid->fid);
 	retval = v9fs_vfs_mkspecial(dir, dentry, V9FS_DMLINK, name);
 	__putname(name);
-- 
cgit v1.2.3


From 92eeccd8badbfebe12383b6e5326b27dc707601d Mon Sep 17 00:00:00 2001
From: Johann Lombardi <johann.lombardi@bull.net>
Date: Sun, 25 Jun 2006 05:48:33 -0700
Subject: [PATCH] ext3: cleanup dead code in ext3_add_entry()

The variables nlen and rlen are defined/initialized but not used in
ext3_add_entry().

Signed-off-by: Johann Lombardi <johann.lombardi@bull.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext3/namei.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index b8f5cd1e540..d9176dba369 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1379,7 +1379,6 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
 	int	dx_fallback=0;
 #endif
 	unsigned blocksize;
-	unsigned nlen, rlen;
 	u32 block, blocks;
 
 	sb = dir->i_sb;
@@ -1417,8 +1416,7 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
 		return retval;
 	de = (struct ext3_dir_entry_2 *) bh->b_data;
 	de->inode = 0;
-	de->rec_len = cpu_to_le16(rlen = blocksize);
-	nlen = 0;
+	de->rec_len = cpu_to_le16(blocksize);
 	return add_dirent_to_buf(handle, dentry, inode, de, bh);
 }
 
-- 
cgit v1.2.3


From 6e656be899993f450a765056cdc8d87e58906508 Mon Sep 17 00:00:00 2001
From: Peter Staubach <staubach@redhat.com>
Date: Sun, 25 Jun 2006 05:48:36 -0700
Subject: [PATCH] ftruncate does not always update m/ctime

In the course of trying to track down a bug where a file mtime was not
being updated correctly, it was discovered that the m/ctime updates were
not quite being handled correctly for ftruncate() calls.

Quoth SUSv3:

open(2):

        If O_TRUNC is set and the file did previously exist, upon
        successful completion, open() shall mark for update the st_ctime
        and st_mtime fields of the file.

truncate(2):

        Upon successful completion, if the file size is changed, this
        function shall mark for update the st_ctime and st_mtime fields
        of the file, and the S_ISUID and S_ISGID bits of the file mode
        may be cleared.

ftruncate(2):

        Upon successful completion, if fildes refers to a regular file,
        the ftruncate() function shall mark for update the st_ctime and
        st_mtime fields of the file and the S_ISUID and S_ISGID bits of
        the file mode may be cleared. If the ftruncate() function is
        unsuccessful, the file is unaffected.

The open(O_TRUNC) and truncate cases were being handled correctly, but the
ftruncate case was being handled like the truncate case.  The semantics of
truncate and ftruncate don't quite match, so ftruncate needs to be handled
slightly differently.

The attached patch addresses this issue for ftruncate(2).

My thanx to Stephen Tweedie and Trond Myklebust for their help in
understanding the situation and semantics.

Signed-off-by: Peter Staubach <staubach@redhat.com>
Cc: "Stephen C. Tweedie" <sct@redhat.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/open.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/open.c b/fs/open.c
index 5fb16e5267d..303f06d2a7b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -322,7 +322,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
 
 	error = locks_verify_truncate(inode, file, length);
 	if (!error)
-		error = do_truncate(dentry, length, 0, file);
+		error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file);
 out_putf:
 	fput(file);
 out:
-- 
cgit v1.2.3


From f9022f66336b8ad680884d4810f1c421fff899e3 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Sun, 25 Jun 2006 05:48:47 -0700
Subject: [PATCH] autofs4: need to invalidate children on tree mount expire

I've found a case where invalid dentrys in a mount tree, waiting to be
cleaned up by d_invalidate, prevent the expected expire.

In this case dentrys created during a lookup for which a mount fails or has
no entry in the mount map contribute to the d_count of the parent dentry.
These dentrys may not be invalidated prior to comparing the interanl usage
count of valid autofs dentrys against the dentry d_count which makes a
mount tree appear busy so it doesn't expire.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/autofs4/expire.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index b8ce02607d6..4456d1daa40 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -174,6 +174,12 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
 			struct autofs_info *ino = autofs4_dentry_ino(p);
 			unsigned int ino_count = atomic_read(&ino->count);
 
+			/*
+			 * Clean stale dentries below that have not been
+			 * invalidated after a mount fail during lookup
+			 */
+			d_invalidate(p);
+
 			/* allow for dget above and top is already dgot */
 			if (p == top)
 				ino_count += 2;
-- 
cgit v1.2.3


From 51eb01e73599efb88c6c20b1c226d20309a75450 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Sun, 25 Jun 2006 05:48:50 -0700
Subject: [PATCH] fuse: no backgrounding on interrupt

Don't put requests into the background when a fatal interrupt occurs while the
request is in userspace.  This removes a major wart from the implementation.

Backgrounding of requests was introduced to allow breaking of deadlocks.
However now the same can be achieved by aborting the filesystem through the
'abort' sysfs attribute.

This is a change in the interface, but should not cause problems, since these
kinds of deadlocks never happen during normal operation.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/dev.c    | 157 ++++++++++++++++---------------------------------------
 fs/fuse/dir.c    |  56 +++++++++++---------
 fs/fuse/file.c   |  49 ++++++-----------
 fs/fuse/fuse_i.h |  49 +++++------------
 fs/fuse/inode.c  |  14 +----
 5 files changed, 106 insertions(+), 219 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 104a62dadb9..fec4779e2b5 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -64,18 +64,6 @@ static void restore_sigs(sigset_t *oldset)
 	sigprocmask(SIG_SETMASK, oldset, NULL);
 }
 
-/*
- * Reset request, so that it can be reused
- *
- * The caller must be _very_ careful to make sure, that it is holding
- * the only reference to req
- */
-void fuse_reset_request(struct fuse_req *req)
-{
-	BUG_ON(atomic_read(&req->count) != 1);
-	fuse_request_init(req);
-}
-
 static void __fuse_get_request(struct fuse_req *req)
 {
 	atomic_inc(&req->count);
@@ -103,6 +91,10 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc)
 	if (intr)
 		goto out;
 
+	err = -ENOTCONN;
+	if (!fc->connected)
+		goto out;
+
 	req = fuse_request_alloc();
 	err = -ENOMEM;
 	if (!req)
@@ -128,114 +120,39 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
 	}
 }
 
-/*
- * Called with sbput_sem held for read (request_end) or write
- * (fuse_put_super).  By the time fuse_put_super() is finished, all
- * inodes belonging to background requests must be released, so the
- * iputs have to be done within the locked region.
- */
-void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req)
-{
-	iput(req->inode);
-	iput(req->inode2);
-	spin_lock(&fc->lock);
-	list_del(&req->bg_entry);
-	if (fc->num_background == FUSE_MAX_BACKGROUND) {
-		fc->blocked = 0;
-		wake_up_all(&fc->blocked_waitq);
-	}
-	fc->num_background--;
-	spin_unlock(&fc->lock);
-}
-
 /*
  * This function is called when a request is finished.  Either a reply
  * has arrived or it was interrupted (and not yet sent) or some error
  * occurred during communication with userspace, or the device file
- * was closed.  In case of a background request the reference to the
- * stored objects are released.  The requester thread is woken up (if
- * still waiting), the 'end' callback is called if given, else the
- * reference to the request is released
- *
- * Releasing extra reference for foreground requests must be done
- * within the same locked region as setting state to finished.  This
- * is because fuse_reset_request() may be called after request is
- * finished and it must be the sole possessor.  If request is
- * interrupted and put in the background, it will return with an error
- * and hence never be reset and reused.
+ * was closed.  The requester thread is woken up (if still waiting),
+ * the 'end' callback is called if given, else the reference to the
+ * request is released
  *
  * Called with fc->lock, unlocks it
  */
 static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 {
+	void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
+	req->end = NULL;
 	list_del(&req->list);
 	req->state = FUSE_REQ_FINISHED;
-	if (!req->background) {
-		spin_unlock(&fc->lock);
-		wake_up(&req->waitq);
-		fuse_put_request(fc, req);
-	} else {
-		void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
-		req->end = NULL;
-		spin_unlock(&fc->lock);
-		down_read(&fc->sbput_sem);
-		if (fc->mounted)
-			fuse_release_background(fc, req);
-		up_read(&fc->sbput_sem);
-
-		/* fput must go outside sbput_sem, otherwise it can deadlock */
-		if (req->file)
-			fput(req->file);
-
-		if (end)
-			end(fc, req);
-		else
-			fuse_put_request(fc, req);
+	if (req->background) {
+		if (fc->num_background == FUSE_MAX_BACKGROUND) {
+			fc->blocked = 0;
+			wake_up_all(&fc->blocked_waitq);
+		}
+		fc->num_background--;
 	}
-}
-
-/*
- * Unfortunately request interruption not just solves the deadlock
- * problem, it causes problems too.  These stem from the fact, that an
- * interrupted request is continued to be processed in userspace,
- * while all the locks and object references (inode and file) held
- * during the operation are released.
- *
- * To release the locks is exactly why there's a need to interrupt the
- * request, so there's not a lot that can be done about this, except
- * introduce additional locking in userspace.
- *
- * More important is to keep inode and file references until userspace
- * has replied, otherwise FORGET and RELEASE could be sent while the
- * inode/file is still used by the filesystem.
- *
- * For this reason the concept of "background" request is introduced.
- * An interrupted request is backgrounded if it has been already sent
- * to userspace.  Backgrounding involves getting an extra reference to
- * inode(s) or file used in the request, and adding the request to
- * fc->background list.  When a reply is received for a background
- * request, the object references are released, and the request is
- * removed from the list.  If the filesystem is unmounted while there
- * are still background requests, the list is walked and references
- * are released as if a reply was received.
- *
- * There's one more use for a background request.  The RELEASE message is
- * always sent as background, since it doesn't return an error or
- * data.
- */
-static void background_request(struct fuse_conn *fc, struct fuse_req *req)
-{
-	req->background = 1;
-	list_add(&req->bg_entry, &fc->background);
-	fc->num_background++;
-	if (fc->num_background == FUSE_MAX_BACKGROUND)
-		fc->blocked = 1;
-	if (req->inode)
-		req->inode = igrab(req->inode);
-	if (req->inode2)
-		req->inode2 = igrab(req->inode2);
+	spin_unlock(&fc->lock);
+	dput(req->dentry);
+	mntput(req->vfsmount);
 	if (req->file)
-		get_file(req->file);
+		fput(req->file);
+	wake_up(&req->waitq);
+	if (end)
+		end(fc, req);
+	else
+		fuse_put_request(fc, req);
 }
 
 /* Called with fc->lock held.  Releases, and then reacquires it. */
@@ -244,9 +161,14 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
 	sigset_t oldset;
 
 	spin_unlock(&fc->lock);
-	block_sigs(&oldset);
-	wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED);
-	restore_sigs(&oldset);
+	if (req->force)
+		wait_event(req->waitq, req->state == FUSE_REQ_FINISHED);
+	else {
+		block_sigs(&oldset);
+		wait_event_interruptible(req->waitq,
+					 req->state == FUSE_REQ_FINISHED);
+		restore_sigs(&oldset);
+	}
 	spin_lock(&fc->lock);
 	if (req->state == FUSE_REQ_FINISHED && !req->interrupted)
 		return;
@@ -268,8 +190,11 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
 	if (req->state == FUSE_REQ_PENDING) {
 		list_del(&req->list);
 		__fuse_put_request(req);
-	} else if (req->state == FUSE_REQ_SENT)
-		background_request(fc, req);
+	} else if (req->state == FUSE_REQ_SENT) {
+		spin_unlock(&fc->lock);
+		wait_event(req->waitq, req->state == FUSE_REQ_FINISHED);
+		spin_lock(&fc->lock);
+	}
 }
 
 static unsigned len_args(unsigned numargs, struct fuse_arg *args)
@@ -327,8 +252,12 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req)
 static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
 {
 	spin_lock(&fc->lock);
-	background_request(fc, req);
 	if (fc->connected) {
+		req->background = 1;
+		fc->num_background++;
+		if (fc->num_background == FUSE_MAX_BACKGROUND)
+			fc->blocked = 1;
+
 		queue_request(fc, req);
 		spin_unlock(&fc->lock);
 	} else {
@@ -883,10 +812,12 @@ void fuse_abort_conn(struct fuse_conn *fc)
 	spin_lock(&fc->lock);
 	if (fc->connected) {
 		fc->connected = 0;
+		fc->blocked = 0;
 		end_io_requests(fc);
 		end_requests(fc, &fc->pending);
 		end_requests(fc, &fc->processing);
 		wake_up_all(&fc->waitq);
+		wake_up_all(&fc->blocked_waitq);
 		kill_fasync(&fc->fasync, SIGIO, POLL_IN);
 	}
 	spin_unlock(&fc->lock);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 8d7546e832e..72a74cde6de 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1,6 +1,6 @@
 /*
   FUSE: Filesystem in Userspace
-  Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>
+  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
 
   This program can be distributed under the terms of the GNU GPL.
   See the file COPYING.
@@ -79,7 +79,6 @@ static void fuse_lookup_init(struct fuse_req *req, struct inode *dir,
 {
 	req->in.h.opcode = FUSE_LOOKUP;
 	req->in.h.nodeid = get_node_id(dir);
-	req->inode = dir;
 	req->in.numargs = 1;
 	req->in.args[0].size = entry->d_name.len + 1;
 	req->in.args[0].value = entry->d_name.name;
@@ -224,6 +223,20 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
 	return NULL;
 }
 
+/*
+ * Synchronous release for the case when something goes wrong in CREATE_OPEN
+ */
+static void fuse_sync_release(struct fuse_conn *fc, struct fuse_file *ff,
+			      u64 nodeid, int flags)
+{
+	struct fuse_req *req;
+
+	req = fuse_release_fill(ff, nodeid, flags, FUSE_RELEASE);
+	req->force = 1;
+	request_send(fc, req);
+	fuse_put_request(fc, req);
+}
+
 /*
  * Atomic create+open operation
  *
@@ -237,6 +250,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	struct inode *inode;
 	struct fuse_conn *fc = get_fuse_conn(dir);
 	struct fuse_req *req;
+	struct fuse_req *forget_req;
 	struct fuse_open_in inarg;
 	struct fuse_open_out outopen;
 	struct fuse_entry_out outentry;
@@ -247,9 +261,14 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	if (fc->no_create)
 		return -ENOSYS;
 
+	forget_req = fuse_get_req(fc);
+	if (IS_ERR(forget_req))
+		return PTR_ERR(forget_req);
+
 	req = fuse_get_req(fc);
+	err = PTR_ERR(req);
 	if (IS_ERR(req))
-		return PTR_ERR(req);
+		goto out_put_forget_req;
 
 	err = -ENOMEM;
 	ff = fuse_file_alloc();
@@ -262,7 +281,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	inarg.mode = mode;
 	req->in.h.opcode = FUSE_CREATE;
 	req->in.h.nodeid = get_node_id(dir);
-	req->inode = dir;
 	req->in.numargs = 2;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
@@ -285,25 +303,23 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid))
 		goto out_free_ff;
 
+	fuse_put_request(fc, req);
 	inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
 			  &outentry.attr);
-	err = -ENOMEM;
 	if (!inode) {
 		flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
 		ff->fh = outopen.fh;
-		/* Special release, with inode = NULL, this will
-		   trigger a 'forget' request when the release is
-		   complete */
-		fuse_send_release(fc, ff, outentry.nodeid, NULL, flags, 0);
-		goto out_put_request;
+		fuse_sync_release(fc, ff, outentry.nodeid, flags);
+		fuse_send_forget(fc, forget_req, outentry.nodeid, 1);
+		return -ENOMEM;
 	}
-	fuse_put_request(fc, req);
+	fuse_put_request(fc, forget_req);
 	d_instantiate(entry, inode);
 	fuse_change_timeout(entry, &outentry);
 	file = lookup_instantiate_filp(nd, entry, generic_file_open);
 	if (IS_ERR(file)) {
 		ff->fh = outopen.fh;
-		fuse_send_release(fc, ff, outentry.nodeid, inode, flags, 0);
+		fuse_sync_release(fc, ff, outentry.nodeid, flags);
 		return PTR_ERR(file);
 	}
 	fuse_finish_open(inode, file, ff, &outopen);
@@ -313,6 +329,8 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	fuse_file_free(ff);
  out_put_request:
 	fuse_put_request(fc, req);
+ out_put_forget_req:
+	fuse_put_request(fc, forget_req);
 	return err;
 }
 
@@ -328,7 +346,6 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
 	int err;
 
 	req->in.h.nodeid = get_node_id(dir);
-	req->inode = dir;
 	req->out.numargs = 1;
 	req->out.args[0].size = sizeof(outarg);
 	req->out.args[0].value = &outarg;
@@ -448,7 +465,6 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
 
 	req->in.h.opcode = FUSE_UNLINK;
 	req->in.h.nodeid = get_node_id(dir);
-	req->inode = dir;
 	req->in.numargs = 1;
 	req->in.args[0].size = entry->d_name.len + 1;
 	req->in.args[0].value = entry->d_name.name;
@@ -480,7 +496,6 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
 
 	req->in.h.opcode = FUSE_RMDIR;
 	req->in.h.nodeid = get_node_id(dir);
-	req->inode = dir;
 	req->in.numargs = 1;
 	req->in.args[0].size = entry->d_name.len + 1;
 	req->in.args[0].value = entry->d_name.name;
@@ -510,8 +525,6 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
 	inarg.newdir = get_node_id(newdir);
 	req->in.h.opcode = FUSE_RENAME;
 	req->in.h.nodeid = get_node_id(olddir);
-	req->inode = olddir;
-	req->inode2 = newdir;
 	req->in.numargs = 3;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
@@ -558,7 +571,6 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.oldnodeid = get_node_id(inode);
 	req->in.h.opcode = FUSE_LINK;
-	req->inode2 = inode;
 	req->in.numargs = 2;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
@@ -587,7 +599,6 @@ int fuse_do_getattr(struct inode *inode)
 
 	req->in.h.opcode = FUSE_GETATTR;
 	req->in.h.nodeid = get_node_id(inode);
-	req->inode = inode;
 	req->out.numargs = 1;
 	req->out.args[0].size = sizeof(arg);
 	req->out.args[0].value = &arg;
@@ -679,7 +690,6 @@ static int fuse_access(struct inode *inode, int mask)
 	inarg.mask = mask;
 	req->in.h.opcode = FUSE_ACCESS;
 	req->in.h.nodeid = get_node_id(inode);
-	req->inode = inode;
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
@@ -820,7 +830,6 @@ static char *read_link(struct dentry *dentry)
 	}
 	req->in.h.opcode = FUSE_READLINK;
 	req->in.h.nodeid = get_node_id(inode);
-	req->inode = inode;
 	req->out.argvar = 1;
 	req->out.numargs = 1;
 	req->out.args[0].size = PAGE_SIZE - 1;
@@ -939,7 +948,6 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr)
 	iattr_to_fattr(attr, &inarg);
 	req->in.h.opcode = FUSE_SETATTR;
 	req->in.h.nodeid = get_node_id(inode);
-	req->inode = inode;
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
@@ -1002,7 +1010,6 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
 	inarg.flags = flags;
 	req->in.h.opcode = FUSE_SETXATTR;
 	req->in.h.nodeid = get_node_id(inode);
-	req->inode = inode;
 	req->in.numargs = 3;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
@@ -1041,7 +1048,6 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
 	inarg.size = size;
 	req->in.h.opcode = FUSE_GETXATTR;
 	req->in.h.nodeid = get_node_id(inode);
-	req->inode = inode;
 	req->in.numargs = 2;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
@@ -1091,7 +1097,6 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
 	inarg.size = size;
 	req->in.h.opcode = FUSE_LISTXATTR;
 	req->in.h.nodeid = get_node_id(inode);
-	req->inode = inode;
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
@@ -1135,7 +1140,6 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
 
 	req->in.h.opcode = FUSE_REMOVEXATTR;
 	req->in.h.nodeid = get_node_id(inode);
-	req->inode = inode;
 	req->in.numargs = 1;
 	req->in.args[0].size = strlen(name) + 1;
 	req->in.args[0].value = name;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 087f3b734f4..1d59af306b2 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -30,7 +30,6 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
 	inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
 	req->in.h.opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
 	req->in.h.nodeid = get_node_id(inode);
-	req->inode = inode;
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
@@ -113,37 +112,22 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir)
 	return err;
 }
 
-/* Special case for failed iget in CREATE */
-static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
+struct fuse_req *fuse_release_fill(struct fuse_file *ff, u64 nodeid, int flags,
+				   int opcode)
 {
-	/* If called from end_io_requests(), req has more than one
-	   reference and fuse_reset_request() cannot work */
-	if (fc->connected) {
-		u64 nodeid = req->in.h.nodeid;
-		fuse_reset_request(req);
-		fuse_send_forget(fc, req, nodeid, 1);
-	} else
-		fuse_put_request(fc, req);
-}
-
-void fuse_send_release(struct fuse_conn *fc, struct fuse_file *ff,
-		       u64 nodeid, struct inode *inode, int flags, int isdir)
-{
-	struct fuse_req * req = ff->release_req;
+	struct fuse_req *req = ff->release_req;
 	struct fuse_release_in *inarg = &req->misc.release_in;
 
 	inarg->fh = ff->fh;
 	inarg->flags = flags;
-	req->in.h.opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;
+	req->in.h.opcode = opcode;
 	req->in.h.nodeid = nodeid;
-	req->inode = inode;
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(struct fuse_release_in);
 	req->in.args[0].value = inarg;
-	request_send_background(fc, req);
-	if (!inode)
-		req->end = fuse_release_end;
 	kfree(ff);
+
+	return req;
 }
 
 int fuse_release_common(struct inode *inode, struct file *file, int isdir)
@@ -151,8 +135,15 @@ int fuse_release_common(struct inode *inode, struct file *file, int isdir)
 	struct fuse_file *ff = file->private_data;
 	if (ff) {
 		struct fuse_conn *fc = get_fuse_conn(inode);
-		u64 nodeid = get_node_id(inode);
-		fuse_send_release(fc, ff, nodeid, inode, file->f_flags, isdir);
+		struct fuse_req *req;
+
+		req = fuse_release_fill(ff, get_node_id(inode), file->f_flags,
+					isdir ? FUSE_RELEASEDIR : FUSE_RELEASE);
+
+		/* Hold vfsmount and dentry until release is finished */
+		req->vfsmount = mntget(file->f_vfsmnt);
+		req->dentry = dget(file->f_dentry);
+		request_send_background(fc, req);
 	}
 
 	/* Return value is ignored by VFS */
@@ -192,8 +183,6 @@ static int fuse_flush(struct file *file, fl_owner_t id)
 	inarg.fh = ff->fh;
 	req->in.h.opcode = FUSE_FLUSH;
 	req->in.h.nodeid = get_node_id(inode);
-	req->inode = inode;
-	req->file = file;
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
@@ -232,8 +221,6 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
 	inarg.fsync_flags = datasync ? 1 : 0;
 	req->in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC;
 	req->in.h.nodeid = get_node_id(inode);
-	req->inode = inode;
-	req->file = file;
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
@@ -266,8 +253,6 @@ void fuse_read_fill(struct fuse_req *req, struct file *file,
 	inarg->size = count;
 	req->in.h.opcode = opcode;
 	req->in.h.nodeid = get_node_id(inode);
-	req->inode = inode;
-	req->file = file;
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(struct fuse_read_in);
 	req->in.args[0].value = inarg;
@@ -342,6 +327,8 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file,
 	req->out.page_zeroing = 1;
 	fuse_read_fill(req, file, inode, pos, count, FUSE_READ);
 	if (fc->async_read) {
+		get_file(file);
+		req->file = file;
 		req->end = fuse_readpages_end;
 		request_send_background(fc, req);
 	} else {
@@ -420,8 +407,6 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file,
 	inarg.size = count;
 	req->in.h.opcode = FUSE_WRITE;
 	req->in.h.nodeid = get_node_id(inode);
-	req->inode = inode;
-	req->file = file;
 	req->in.argpages = 1;
 	req->in.numargs = 2;
 	req->in.args[0].size = sizeof(struct fuse_write_in);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 0474202cb5d..25f8581a770 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -8,12 +8,12 @@
 
 #include <linux/fuse.h>
 #include <linux/fs.h>
+#include <linux/mount.h>
 #include <linux/wait.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/mm.h>
 #include <linux/backing-dev.h>
-#include <asm/semaphore.h>
 
 /** Max number of pages that can be used in a single read request */
 #define FUSE_MAX_PAGES_PER_REQ 32
@@ -135,9 +135,6 @@ struct fuse_req {
 	    fuse_conn */
 	struct list_head list;
 
-	/** Entry on the background list */
-	struct list_head bg_entry;
-
 	/** refcount */
 	atomic_t count;
 
@@ -150,6 +147,9 @@ struct fuse_req {
 	/** True if the request has reply */
 	unsigned isreply:1;
 
+	/** Force sending of the request even if interrupted */
+	unsigned force:1;
+
 	/** The request was interrupted */
 	unsigned interrupted:1;
 
@@ -192,15 +192,15 @@ struct fuse_req {
 	/** offset of data on first page */
 	unsigned page_offset;
 
-	/** Inode used in the request */
-	struct inode *inode;
-
-	/** Second inode used in the request (or NULL) */
-	struct inode *inode2;
-
 	/** File used in the request (or NULL) */
 	struct file *file;
 
+	/** vfsmount used in release */
+	struct vfsmount *vfsmount;
+
+	/** dentry used in release */
+	struct dentry *dentry;
+
 	/** Request completion callback */
 	void (*end)(struct fuse_conn *, struct fuse_req *);
 };
@@ -243,10 +243,6 @@ struct fuse_conn {
 	/** The list of requests under I/O */
 	struct list_head io;
 
-	/** Requests put in the background (RELEASE or any other
-	    interrupted request) */
-	struct list_head background;
-
 	/** Number of requests currently in the background */
 	unsigned num_background;
 
@@ -258,15 +254,9 @@ struct fuse_conn {
 	/** waitq for blocked connection */
 	wait_queue_head_t blocked_waitq;
 
-	/** RW semaphore for exclusion with fuse_put_super() */
-	struct rw_semaphore sbput_sem;
-
 	/** The next unique request id */
 	u64 reqctr;
 
-	/** Mount is active */
-	unsigned mounted;
-
 	/** Connection established, cleared on umount, connection
 	    abort and device release */
 	unsigned connected;
@@ -383,12 +373,9 @@ void fuse_file_free(struct fuse_file *ff);
 void fuse_finish_open(struct inode *inode, struct file *file,
 		      struct fuse_file *ff, struct fuse_open_out *outarg);
 
-/**
- * Send a RELEASE request
- */
-void fuse_send_release(struct fuse_conn *fc, struct fuse_file *ff,
-		       u64 nodeid, struct inode *inode, int flags, int isdir);
-
+/** */
+struct fuse_req *fuse_release_fill(struct fuse_file *ff, u64 nodeid, int flags,
+				   int opcode);
 /**
  * Send RELEASE or RELEASEDIR request
  */
@@ -445,11 +432,6 @@ struct fuse_req *fuse_request_alloc(void);
  */
 void fuse_request_free(struct fuse_req *req);
 
-/**
- * Reinitialize a request, the preallocated flag is left unmodified
- */
-void fuse_reset_request(struct fuse_req *req);
-
 /**
  * Reserve a preallocated request
  */
@@ -476,11 +458,6 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
  */
 void request_send_background(struct fuse_conn *fc, struct fuse_req *req);
 
-/**
- * Release inodes and file associated with background request
- */
-void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req);
-
 /* Abort all requests */
 void fuse_abort_conn(struct fuse_conn *fc);
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index a13c0f52905..0225729977c 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -11,7 +11,6 @@
 #include <linux/pagemap.h>
 #include <linux/slab.h>
 #include <linux/file.h>
-#include <linux/mount.h>
 #include <linux/seq_file.h>
 #include <linux/init.h>
 #include <linux/module.h>
@@ -204,20 +203,14 @@ static void fuse_put_super(struct super_block *sb)
 {
 	struct fuse_conn *fc = get_fuse_conn_super(sb);
 
-	down_write(&fc->sbput_sem);
-	while (!list_empty(&fc->background))
-		fuse_release_background(fc,
-					list_entry(fc->background.next,
-						   struct fuse_req, bg_entry));
-
 	spin_lock(&fc->lock);
-	fc->mounted = 0;
 	fc->connected = 0;
+	fc->blocked = 0;
 	spin_unlock(&fc->lock);
-	up_write(&fc->sbput_sem);
 	/* Flush all readers on this fs */
 	kill_fasync(&fc->fasync, SIGIO, POLL_IN);
 	wake_up_all(&fc->waitq);
+	wake_up_all(&fc->blocked_waitq);
 	kobject_del(&fc->kobj);
 	kobject_put(&fc->kobj);
 }
@@ -386,8 +379,6 @@ static struct fuse_conn *new_conn(void)
 		INIT_LIST_HEAD(&fc->pending);
 		INIT_LIST_HEAD(&fc->processing);
 		INIT_LIST_HEAD(&fc->io);
-		INIT_LIST_HEAD(&fc->background);
-		init_rwsem(&fc->sbput_sem);
 		kobj_set_kset_s(fc, connections_subsys);
 		kobject_init(&fc->kobj);
 		atomic_set(&fc->num_waiting, 0);
@@ -543,7 +534,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 		goto err_kobject_del;
 
 	sb->s_root = root_dentry;
-	fc->mounted = 1;
 	fc->connected = 1;
 	kobject_get(&fc->kobj);
 	file->private_data = fc;
-- 
cgit v1.2.3


From bafa96541b250a7051e3fbc5de6e8369daf8ffec Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Sun, 25 Jun 2006 05:48:51 -0700
Subject: [PATCH] fuse: add control filesystem

Add a control filesystem to fuse, replacing the attributes currently exported
through sysfs.  An empty directory '/sys/fs/fuse/connections' is still created
in sysfs, and mounting the control filesystem here provides backward
compatibility.

Advantages of the control filesystem over the previous solution:

  - allows the object directory and the attributes to be owned by the
    filesystem owner, hence letting unpriviled users abort the
    filesystem connection

  - does not suffer from module unload race

[akpm@osdl.org: fix this fs for recent dhowells depredations]
[akpm@osdl.org: fix 64-bit printk warnings]
Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/Makefile  |   2 +-
 fs/fuse/control.c | 218 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/fuse/dev.c     |   2 +-
 fs/fuse/fuse_i.h  |  53 +++++++++++--
 fs/fuse/inode.c   | 141 +++++++++++------------------------
 5 files changed, 310 insertions(+), 106 deletions(-)
 create mode 100644 fs/fuse/control.c

(limited to 'fs')

diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index c3e1f760cac..72437065f6a 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_FUSE_FS) += fuse.o
 
-fuse-objs := dev.o dir.o file.o inode.o
+fuse-objs := dev.o dir.o file.o inode.o control.o
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
new file mode 100644
index 00000000000..a3bce3a7725
--- /dev/null
+++ b/fs/fuse/control.c
@@ -0,0 +1,218 @@
+/*
+  FUSE: Filesystem in Userspace
+  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
+
+  This program can be distributed under the terms of the GNU GPL.
+  See the file COPYING.
+*/
+
+#include "fuse_i.h"
+
+#include <linux/init.h>
+#include <linux/module.h>
+
+#define FUSE_CTL_SUPER_MAGIC 0x65735543
+
+/*
+ * This is non-NULL when the single instance of the control filesystem
+ * exists.  Protected by fuse_mutex
+ */
+static struct super_block *fuse_control_sb;
+
+static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file)
+{
+	struct fuse_conn *fc;
+	mutex_lock(&fuse_mutex);
+	fc = file->f_dentry->d_inode->u.generic_ip;
+	if (fc)
+		fc = fuse_conn_get(fc);
+	mutex_unlock(&fuse_mutex);
+	return fc;
+}
+
+static ssize_t fuse_conn_abort_write(struct file *file, const char __user *buf,
+				     size_t count, loff_t *ppos)
+{
+	struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
+	if (fc) {
+		fuse_abort_conn(fc);
+		fuse_conn_put(fc);
+	}
+	return count;
+}
+
+static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf,
+				      size_t len, loff_t *ppos)
+{
+	char tmp[32];
+	size_t size;
+
+	if (!*ppos) {
+		struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
+		if (!fc)
+			return 0;
+
+		file->private_data=(void *)(long)atomic_read(&fc->num_waiting);
+		fuse_conn_put(fc);
+	}
+	size = sprintf(tmp, "%ld\n", (long)file->private_data);
+	return simple_read_from_buffer(buf, len, ppos, tmp, size);
+}
+
+static const struct file_operations fuse_ctl_abort_ops = {
+	.open = nonseekable_open,
+	.write = fuse_conn_abort_write,
+};
+
+static const struct file_operations fuse_ctl_waiting_ops = {
+	.open = nonseekable_open,
+	.read = fuse_conn_waiting_read,
+};
+
+static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
+					  struct fuse_conn *fc,
+					  const char *name,
+					  int mode, int nlink,
+					  struct inode_operations *iop,
+					  const struct file_operations *fop)
+{
+	struct dentry *dentry;
+	struct inode *inode;
+
+	BUG_ON(fc->ctl_ndents >= FUSE_CTL_NUM_DENTRIES);
+	dentry = d_alloc_name(parent, name);
+	if (!dentry)
+		return NULL;
+
+	fc->ctl_dentry[fc->ctl_ndents++] = dentry;
+	inode = new_inode(fuse_control_sb);
+	if (!inode)
+		return NULL;
+
+	inode->i_mode = mode;
+	inode->i_uid = fc->user_id;
+	inode->i_gid = fc->group_id;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	/* setting ->i_op to NULL is not allowed */
+	if (iop)
+		inode->i_op = iop;
+	inode->i_fop = fop;
+	inode->i_nlink = nlink;
+	inode->u.generic_ip = fc;
+	d_add(dentry, inode);
+	return dentry;
+}
+
+/*
+ * Add a connection to the control filesystem (if it exists).  Caller
+ * must host fuse_mutex
+ */
+int fuse_ctl_add_conn(struct fuse_conn *fc)
+{
+	struct dentry *parent;
+	char name[32];
+
+	if (!fuse_control_sb)
+		return 0;
+
+	parent = fuse_control_sb->s_root;
+	parent->d_inode->i_nlink++;
+	sprintf(name, "%llu", (unsigned long long) fc->id);
+	parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500, 2,
+				     &simple_dir_inode_operations,
+				     &simple_dir_operations);
+	if (!parent)
+		goto err;
+
+	if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1,
+				NULL, &fuse_ctl_waiting_ops) ||
+	    !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1,
+				 NULL, &fuse_ctl_abort_ops))
+		goto err;
+
+	return 0;
+
+ err:
+	fuse_ctl_remove_conn(fc);
+	return -ENOMEM;
+}
+
+/*
+ * Remove a connection from the control filesystem (if it exists).
+ * Caller must host fuse_mutex
+ */
+void fuse_ctl_remove_conn(struct fuse_conn *fc)
+{
+	int i;
+
+	if (!fuse_control_sb)
+		return;
+
+	for (i = fc->ctl_ndents - 1; i >= 0; i--) {
+		struct dentry *dentry = fc->ctl_dentry[i];
+		dentry->d_inode->u.generic_ip = NULL;
+		d_drop(dentry);
+		dput(dentry);
+	}
+	fuse_control_sb->s_root->d_inode->i_nlink--;
+}
+
+static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct tree_descr empty_descr = {""};
+	struct fuse_conn *fc;
+	int err;
+
+	err = simple_fill_super(sb, FUSE_CTL_SUPER_MAGIC, &empty_descr);
+	if (err)
+		return err;
+
+	mutex_lock(&fuse_mutex);
+	BUG_ON(fuse_control_sb);
+	fuse_control_sb = sb;
+	list_for_each_entry(fc, &fuse_conn_list, entry) {
+		err = fuse_ctl_add_conn(fc);
+		if (err) {
+			fuse_control_sb = NULL;
+			mutex_unlock(&fuse_mutex);
+			return err;
+		}
+	}
+	mutex_unlock(&fuse_mutex);
+
+	return 0;
+}
+
+static int fuse_ctl_get_sb(struct file_system_type *fs_type, int flags,
+			const char *dev_name, void *raw_data,
+			struct vfsmount *mnt)
+{
+	return get_sb_single(fs_type, flags, raw_data,
+				fuse_ctl_fill_super, mnt);
+}
+
+static void fuse_ctl_kill_sb(struct super_block *sb)
+{
+	mutex_lock(&fuse_mutex);
+	fuse_control_sb = NULL;
+	mutex_unlock(&fuse_mutex);
+
+	kill_litter_super(sb);
+}
+
+static struct file_system_type fuse_ctl_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "fusectl",
+	.get_sb		= fuse_ctl_get_sb,
+	.kill_sb	= fuse_ctl_kill_sb,
+};
+
+int __init fuse_ctl_init(void)
+{
+	return register_filesystem(&fuse_ctl_fs_type);
+}
+
+void fuse_ctl_cleanup(void)
+{
+	unregister_filesystem(&fuse_ctl_fs_type);
+}
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index fec4779e2b5..fe3adf58917 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -833,7 +833,7 @@ static int fuse_dev_release(struct inode *inode, struct file *file)
 		end_requests(fc, &fc->processing);
 		spin_unlock(&fc->lock);
 		fasync_helper(-1, file, 0, &fc->fasync);
-		kobject_put(&fc->kobj);
+		fuse_conn_put(fc);
 	}
 
 	return 0;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 25f8581a770..ac12b01f444 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -14,6 +14,7 @@
 #include <linux/spinlock.h>
 #include <linux/mm.h>
 #include <linux/backing-dev.h>
+#include <linux/mutex.h>
 
 /** Max number of pages that can be used in a single read request */
 #define FUSE_MAX_PAGES_PER_REQ 32
@@ -24,6 +25,9 @@
 /** It could be as large as PATH_MAX, but would that have any uses? */
 #define FUSE_NAME_MAX 1024
 
+/** Number of dentries for each connection in the control filesystem */
+#define FUSE_CTL_NUM_DENTRIES 3
+
 /** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem
     module will check permissions based on the file mode.  Otherwise no
     permission checking is done in the kernel */
@@ -33,6 +37,11 @@
     doing the mount will be allowed to access the filesystem */
 #define FUSE_ALLOW_OTHER         (1 << 1)
 
+/** List of active connections */
+extern struct list_head fuse_conn_list;
+
+/** Global mutex protecting fuse_conn_list and the control filesystem */
+extern struct mutex fuse_mutex;
 
 /** FUSE inode */
 struct fuse_inode {
@@ -216,6 +225,9 @@ struct fuse_conn {
 	/** Lock protecting accessess to  members of this structure */
 	spinlock_t lock;
 
+	/** Refcount */
+	atomic_t count;
+
 	/** The user id for this mount */
 	uid_t user_id;
 
@@ -310,8 +322,17 @@ struct fuse_conn {
 	/** Backing dev info */
 	struct backing_dev_info bdi;
 
-	/** kobject */
-	struct kobject kobj;
+	/** Entry on the fuse_conn_list */
+	struct list_head entry;
+
+	/** Unique ID */
+	u64 id;
+
+	/** Dentries in the control filesystem */
+	struct dentry *ctl_dentry[FUSE_CTL_NUM_DENTRIES];
+
+	/** number of dentries used in the above array */
+	int ctl_ndents;
 
 	/** O_ASYNC requests */
 	struct fasync_struct *fasync;
@@ -327,11 +348,6 @@ static inline struct fuse_conn *get_fuse_conn(struct inode *inode)
 	return get_fuse_conn_super(inode->i_sb);
 }
 
-static inline struct fuse_conn *get_fuse_conn_kobj(struct kobject *obj)
-{
-	return container_of(obj, struct fuse_conn, kobj);
-}
-
 static inline struct fuse_inode *get_fuse_inode(struct inode *inode)
 {
 	return container_of(inode, struct fuse_inode, inode);
@@ -422,6 +438,9 @@ int fuse_dev_init(void);
  */
 void fuse_dev_cleanup(void);
 
+int fuse_ctl_init(void);
+void fuse_ctl_cleanup(void);
+
 /**
  * Allocate a request
  */
@@ -470,3 +489,23 @@ int fuse_do_getattr(struct inode *inode);
  * Invalidate inode attributes
  */
 void fuse_invalidate_attr(struct inode *inode);
+
+/**
+ * Acquire reference to fuse_conn
+ */
+struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
+
+/**
+ * Release reference to fuse_conn
+ */
+void fuse_conn_put(struct fuse_conn *fc);
+
+/**
+ * Add connection to control filesystem
+ */
+int fuse_ctl_add_conn(struct fuse_conn *fc);
+
+/**
+ * Remove connection from control filesystem
+ */
+void fuse_ctl_remove_conn(struct fuse_conn *fc);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 0225729977c..13a7e8ab7a7 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -22,13 +22,8 @@ MODULE_DESCRIPTION("Filesystem in Userspace");
 MODULE_LICENSE("GPL");
 
 static kmem_cache_t *fuse_inode_cachep;
-static struct subsystem connections_subsys;
-
-struct fuse_conn_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct fuse_conn *, char *);
-	ssize_t (*store)(struct fuse_conn *, const char *, size_t);
-};
+struct list_head fuse_conn_list;
+DEFINE_MUTEX(fuse_mutex);
 
 #define FUSE_SUPER_MAGIC 0x65735546
 
@@ -211,8 +206,11 @@ static void fuse_put_super(struct super_block *sb)
 	kill_fasync(&fc->fasync, SIGIO, POLL_IN);
 	wake_up_all(&fc->waitq);
 	wake_up_all(&fc->blocked_waitq);
-	kobject_del(&fc->kobj);
-	kobject_put(&fc->kobj);
+	mutex_lock(&fuse_mutex);
+	list_del(&fc->entry);
+	fuse_ctl_remove_conn(fc);
+	mutex_unlock(&fuse_mutex);
+	fuse_conn_put(fc);
 }
 
 static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr)
@@ -362,11 +360,6 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
 	return 0;
 }
 
-static void fuse_conn_release(struct kobject *kobj)
-{
-	kfree(get_fuse_conn_kobj(kobj));
-}
-
 static struct fuse_conn *new_conn(void)
 {
 	struct fuse_conn *fc;
@@ -374,13 +367,12 @@ static struct fuse_conn *new_conn(void)
 	fc = kzalloc(sizeof(*fc), GFP_KERNEL);
 	if (fc) {
 		spin_lock_init(&fc->lock);
+		atomic_set(&fc->count, 1);
 		init_waitqueue_head(&fc->waitq);
 		init_waitqueue_head(&fc->blocked_waitq);
 		INIT_LIST_HEAD(&fc->pending);
 		INIT_LIST_HEAD(&fc->processing);
 		INIT_LIST_HEAD(&fc->io);
-		kobj_set_kset_s(fc, connections_subsys);
-		kobject_init(&fc->kobj);
 		atomic_set(&fc->num_waiting, 0);
 		fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 		fc->bdi.unplug_io_fn = default_unplug_io_fn;
@@ -390,6 +382,18 @@ static struct fuse_conn *new_conn(void)
 	return fc;
 }
 
+void fuse_conn_put(struct fuse_conn *fc)
+{
+	if (atomic_dec_and_test(&fc->count))
+		kfree(fc);
+}
+
+struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
+{
+	atomic_inc(&fc->count);
+	return fc;
+}
+
 static struct inode *get_root_inode(struct super_block *sb, unsigned mode)
 {
 	struct fuse_attr attr;
@@ -459,10 +463,9 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
 	request_send_background(fc, req);
 }
 
-static unsigned long long conn_id(void)
+static u64 conn_id(void)
 {
-	/* BKL is held for ->get_sb() */
-	static unsigned long long ctr = 1;
+	static u64 ctr = 1;
 	return ctr++;
 }
 
@@ -519,24 +522,21 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (!init_req)
 		goto err_put_root;
 
-	err = kobject_set_name(&fc->kobj, "%llu", conn_id());
-	if (err)
-		goto err_free_req;
-
-	err = kobject_add(&fc->kobj);
-	if (err)
-		goto err_free_req;
-
-	/* Setting file->private_data can't race with other mount()
-	   instances, since BKL is held for ->get_sb() */
+	mutex_lock(&fuse_mutex);
 	err = -EINVAL;
 	if (file->private_data)
-		goto err_kobject_del;
+		goto err_unlock;
 
+	fc->id = conn_id();
+	err = fuse_ctl_add_conn(fc);
+	if (err)
+		goto err_unlock;
+
+	list_add_tail(&fc->entry, &fuse_conn_list);
 	sb->s_root = root_dentry;
 	fc->connected = 1;
-	kobject_get(&fc->kobj);
-	file->private_data = fc;
+	file->private_data = fuse_conn_get(fc);
+	mutex_unlock(&fuse_mutex);
 	/*
 	 * atomic_dec_and_test() in fput() provides the necessary
 	 * memory barrier for file->private_data to be visible on all
@@ -548,15 +548,14 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 
 	return 0;
 
- err_kobject_del:
-	kobject_del(&fc->kobj);
- err_free_req:
+ err_unlock:
+	mutex_unlock(&fuse_mutex);
 	fuse_request_free(init_req);
  err_put_root:
 	dput(root_dentry);
  err:
 	fput(file);
-	kobject_put(&fc->kobj);
+	fuse_conn_put(fc);
 	return err;
 }
 
@@ -574,68 +573,8 @@ static struct file_system_type fuse_fs_type = {
 	.kill_sb	= kill_anon_super,
 };
 
-static ssize_t fuse_conn_waiting_show(struct fuse_conn *fc, char *page)
-{
-	return sprintf(page, "%i\n", atomic_read(&fc->num_waiting));
-}
-
-static ssize_t fuse_conn_abort_store(struct fuse_conn *fc, const char *page,
-				     size_t count)
-{
-	fuse_abort_conn(fc);
-	return count;
-}
-
-static struct fuse_conn_attr fuse_conn_waiting =
-	__ATTR(waiting, 0400, fuse_conn_waiting_show, NULL);
-static struct fuse_conn_attr fuse_conn_abort =
-	__ATTR(abort, 0600, NULL, fuse_conn_abort_store);
-
-static struct attribute *fuse_conn_attrs[] = {
-	&fuse_conn_waiting.attr,
-	&fuse_conn_abort.attr,
-	NULL,
-};
-
-static ssize_t fuse_conn_attr_show(struct kobject *kobj,
-				   struct attribute *attr,
-				   char *page)
-{
-	struct fuse_conn_attr *fca =
-		container_of(attr, struct fuse_conn_attr, attr);
-
-	if (fca->show)
-		return fca->show(get_fuse_conn_kobj(kobj), page);
-	else
-		return -EACCES;
-}
-
-static ssize_t fuse_conn_attr_store(struct kobject *kobj,
-				    struct attribute *attr,
-				    const char *page, size_t count)
-{
-	struct fuse_conn_attr *fca =
-		container_of(attr, struct fuse_conn_attr, attr);
-
-	if (fca->store)
-		return fca->store(get_fuse_conn_kobj(kobj), page, count);
-	else
-		return -EACCES;
-}
-
-static struct sysfs_ops fuse_conn_sysfs_ops = {
-	.show	= &fuse_conn_attr_show,
-	.store	= &fuse_conn_attr_store,
-};
-
-static struct kobj_type ktype_fuse_conn = {
-	.release	= fuse_conn_release,
-	.sysfs_ops	= &fuse_conn_sysfs_ops,
-	.default_attrs	= fuse_conn_attrs,
-};
-
 static decl_subsys(fuse, NULL, NULL);
-static decl_subsys(connections, &ktype_fuse_conn, NULL);
+static decl_subsys(connections, NULL, NULL);
 
 static void fuse_inode_init_once(void *foo, kmem_cache_t *cachep,
 				 unsigned long flags)
@@ -709,6 +648,7 @@ static int __init fuse_init(void)
 	printk("fuse init (API version %i.%i)\n",
 	       FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION);
 
+	INIT_LIST_HEAD(&fuse_conn_list);
 	res = fuse_fs_init();
 	if (res)
 		goto err;
@@ -721,8 +661,14 @@ static int __init fuse_init(void)
 	if (res)
 		goto err_dev_cleanup;
 
+	res = fuse_ctl_init();
+	if (res)
+		goto err_sysfs_cleanup;
+
 	return 0;
 
+ err_sysfs_cleanup:
+	fuse_sysfs_cleanup();
  err_dev_cleanup:
 	fuse_dev_cleanup();
  err_fs_cleanup:
@@ -735,6 +681,7 @@ static void __exit fuse_exit(void)
 {
 	printk(KERN_DEBUG "fuse exit\n");
 
+	fuse_ctl_cleanup();
 	fuse_sysfs_cleanup();
 	fuse_fs_cleanup();
 	fuse_dev_cleanup();
-- 
cgit v1.2.3


From 7142125937e1482ad3ae4366594c6586153dfc86 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Sun, 25 Jun 2006 05:48:52 -0700
Subject: [PATCH] fuse: add POSIX file locking support

This patch adds POSIX file locking support to the fuse interface.

This implementation doesn't keep any locking state in kernel.  Unlocking on
close() is handled by the FLUSH message, which now contains the lock owner id.

Mandatory locking is not supported.  The filesystem may enfoce mandatory
locking in userspace if needed.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/file.c   | 132 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/fuse/fuse_i.h |   4 ++
 fs/fuse/inode.c  |  20 ++++++++-
 3 files changed, 154 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 1d59af306b2..d9a8289297c 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -160,6 +160,18 @@ static int fuse_release(struct inode *inode, struct file *file)
 	return fuse_release_common(inode, file, 0);
 }
 
+/*
+ * It would be nice to scramble the ID space, so that the value of the
+ * files_struct pointer is not exposed to userspace.  Symmetric crypto
+ * functions are overkill, since the inverse function doesn't need to
+ * be implemented (though it does have to exist).  Is there something
+ * simpler?
+ */
+static inline u64 fuse_lock_owner_id(fl_owner_t id)
+{
+	return (unsigned long) id;
+}
+
 static int fuse_flush(struct file *file, fl_owner_t id)
 {
 	struct inode *inode = file->f_dentry->d_inode;
@@ -181,11 +193,13 @@ static int fuse_flush(struct file *file, fl_owner_t id)
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.fh = ff->fh;
+	inarg.lock_owner = fuse_lock_owner_id(id);
 	req->in.h.opcode = FUSE_FLUSH;
 	req->in.h.nodeid = get_node_id(inode);
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
+	req->force = 1;
 	request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
@@ -604,6 +618,122 @@ static int fuse_set_page_dirty(struct page *page)
 	return 0;
 }
 
+static int convert_fuse_file_lock(const struct fuse_file_lock *ffl,
+				  struct file_lock *fl)
+{
+	switch (ffl->type) {
+	case F_UNLCK:
+		break;
+
+	case F_RDLCK:
+	case F_WRLCK:
+		if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX ||
+		    ffl->end < ffl->start)
+			return -EIO;
+
+		fl->fl_start = ffl->start;
+		fl->fl_end = ffl->end;
+		fl->fl_pid = ffl->pid;
+		break;
+
+	default:
+		return -EIO;
+	}
+	fl->fl_type = ffl->type;
+	return 0;
+}
+
+static void fuse_lk_fill(struct fuse_req *req, struct file *file,
+			 const struct file_lock *fl, int opcode, pid_t pid)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct fuse_file *ff = file->private_data;
+	struct fuse_lk_in *arg = &req->misc.lk_in;
+
+	arg->fh = ff->fh;
+	arg->owner = fuse_lock_owner_id(fl->fl_owner);
+	arg->lk.start = fl->fl_start;
+	arg->lk.end = fl->fl_end;
+	arg->lk.type = fl->fl_type;
+	arg->lk.pid = pid;
+	req->in.h.opcode = opcode;
+	req->in.h.nodeid = get_node_id(inode);
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(*arg);
+	req->in.args[0].value = arg;
+}
+
+static int fuse_getlk(struct file *file, struct file_lock *fl)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req;
+	struct fuse_lk_out outarg;
+	int err;
+
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	fuse_lk_fill(req, file, fl, FUSE_GETLK, 0);
+	req->out.numargs = 1;
+	req->out.args[0].size = sizeof(outarg);
+	req->out.args[0].value = &outarg;
+	request_send(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	if (!err)
+		err = convert_fuse_file_lock(&outarg.lk, fl);
+
+	return err;
+}
+
+static int fuse_setlk(struct file *file, struct file_lock *fl)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req;
+	int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
+	pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0;
+	int err;
+
+	/* Unlock on close is handled by the flush method */
+	if (fl->fl_flags & FL_CLOSE)
+		return 0;
+
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	fuse_lk_fill(req, file, fl, opcode, pid);
+	request_send(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+	return err;
+}
+
+static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	int err;
+
+	if (cmd == F_GETLK) {
+		if (fc->no_lock) {
+			if (!posix_test_lock(file, fl, fl))
+				fl->fl_type = F_UNLCK;
+			err = 0;
+		} else
+			err = fuse_getlk(file, fl);
+	} else {
+		if (fc->no_lock)
+			err = posix_lock_file_wait(file, fl);
+		else
+			err = fuse_setlk(file, fl);
+	}
+	return err;
+}
+
 static const struct file_operations fuse_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_file_read,
@@ -613,6 +743,7 @@ static const struct file_operations fuse_file_operations = {
 	.flush		= fuse_flush,
 	.release	= fuse_release,
 	.fsync		= fuse_fsync,
+	.lock		= fuse_file_lock,
 	.sendfile	= generic_file_sendfile,
 };
 
@@ -624,6 +755,7 @@ static const struct file_operations fuse_direct_io_file_operations = {
 	.flush		= fuse_flush,
 	.release	= fuse_release,
 	.fsync		= fuse_fsync,
+	.lock		= fuse_file_lock,
 	/* no mmap and sendfile */
 };
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index ac12b01f444..eb3166625ca 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -190,6 +190,7 @@ struct fuse_req {
 		struct fuse_init_in init_in;
 		struct fuse_init_out init_out;
 		struct fuse_read_in read_in;
+		struct fuse_lk_in lk_in;
 	} misc;
 
 	/** page vector */
@@ -307,6 +308,9 @@ struct fuse_conn {
 	/** Is removexattr not implemented by fs? */
 	unsigned no_removexattr : 1;
 
+	/** Are file locking primitives not implemented by fs? */
+	unsigned no_lock : 1;
+
 	/** Is access not implemented by fs? */
 	unsigned no_access : 1;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 13a7e8ab7a7..41289290583 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -98,6 +98,14 @@ static void fuse_clear_inode(struct inode *inode)
 	}
 }
 
+static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+	if (*flags & MS_MANDLOCK)
+		return -EINVAL;
+
+	return 0;
+}
+
 void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr)
 {
 	if (S_ISREG(inode->i_mode) && i_size_read(inode) != attr->size)
@@ -409,6 +417,7 @@ static struct super_operations fuse_super_operations = {
 	.destroy_inode  = fuse_destroy_inode,
 	.read_inode	= fuse_read_inode,
 	.clear_inode	= fuse_clear_inode,
+	.remount_fs	= fuse_remount_fs,
 	.put_super	= fuse_put_super,
 	.umount_begin	= fuse_umount_begin,
 	.statfs		= fuse_statfs,
@@ -428,8 +437,12 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 			ra_pages = arg->max_readahead / PAGE_CACHE_SIZE;
 			if (arg->flags & FUSE_ASYNC_READ)
 				fc->async_read = 1;
-		} else
+			if (!(arg->flags & FUSE_POSIX_LOCKS))
+				fc->no_lock = 1;
+		} else {
 			ra_pages = fc->max_read / PAGE_CACHE_SIZE;
+			fc->no_lock = 1;
+		}
 
 		fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages);
 		fc->minor = arg->minor;
@@ -447,7 +460,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
 	arg->major = FUSE_KERNEL_VERSION;
 	arg->minor = FUSE_KERNEL_MINOR_VERSION;
 	arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE;
-	arg->flags |= FUSE_ASYNC_READ;
+	arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS;
 	req->in.h.opcode = FUSE_INIT;
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(*arg);
@@ -479,6 +492,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	struct fuse_req *init_req;
 	int err;
 
+	if (sb->s_flags & MS_MANDLOCK)
+		return -EINVAL;
+
 	if (!parse_fuse_opt((char *) data, &d))
 		return -EINVAL;
 
-- 
cgit v1.2.3


From 33649c91a3df57c1090a657637d44b896de367e7 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Sun, 25 Jun 2006 05:48:52 -0700
Subject: [PATCH] fuse: ensure FLUSH reaches userspace

All POSIX locks owned by the current task are removed on close().  If the
FLUSH request resulting initiated by close() fails to reach userspace, there
might be locks remaining, which cannot be removed.

The only reason it could fail, is if allocating the request fails.  In this
case use the request reserved for RELEASE, or if that is currently used by
another FLUSH, wait for it to become available.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/dev.c    | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
 fs/fuse/file.c   | 13 ++++-----
 fs/fuse/fuse_i.h | 12 ++++++--
 3 files changed, 99 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index fe3adf58917..ae26f37e53a 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -76,6 +76,13 @@ static void __fuse_put_request(struct fuse_req *req)
 	atomic_dec(&req->count);
 }
 
+static void fuse_req_init_context(struct fuse_req *req)
+{
+	req->in.h.uid = current->fsuid;
+	req->in.h.gid = current->fsgid;
+	req->in.h.pid = current->pid;
+}
+
 struct fuse_req *fuse_get_req(struct fuse_conn *fc)
 {
 	struct fuse_req *req;
@@ -100,9 +107,7 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc)
 	if (!req)
 		goto out;
 
-	req->in.h.uid = current->fsuid;
-	req->in.h.gid = current->fsgid;
-	req->in.h.pid = current->pid;
+	fuse_req_init_context(req);
 	req->waiting = 1;
 	return req;
 
@@ -111,12 +116,87 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc)
 	return ERR_PTR(err);
 }
 
+/*
+ * Return request in fuse_file->reserved_req.  However that may
+ * currently be in use.  If that is the case, wait for it to become
+ * available.
+ */
+static struct fuse_req *get_reserved_req(struct fuse_conn *fc,
+					 struct file *file)
+{
+	struct fuse_req *req = NULL;
+	struct fuse_file *ff = file->private_data;
+
+	do {
+		wait_event(fc->blocked_waitq, ff->reserved_req);
+		spin_lock(&fc->lock);
+		if (ff->reserved_req) {
+			req = ff->reserved_req;
+			ff->reserved_req = NULL;
+			get_file(file);
+			req->stolen_file = file;
+		}
+		spin_unlock(&fc->lock);
+	} while (!req);
+
+	return req;
+}
+
+/*
+ * Put stolen request back into fuse_file->reserved_req
+ */
+static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req)
+{
+	struct file *file = req->stolen_file;
+	struct fuse_file *ff = file->private_data;
+
+	spin_lock(&fc->lock);
+	fuse_request_init(req);
+	BUG_ON(ff->reserved_req);
+	ff->reserved_req = req;
+	wake_up(&fc->blocked_waitq);
+	spin_unlock(&fc->lock);
+	fput(file);
+}
+
+/*
+ * Gets a requests for a file operation, always succeeds
+ *
+ * This is used for sending the FLUSH request, which must get to
+ * userspace, due to POSIX locks which may need to be unlocked.
+ *
+ * If allocation fails due to OOM, use the reserved request in
+ * fuse_file.
+ *
+ * This is very unlikely to deadlock accidentally, since the
+ * filesystem should not have it's own file open.  If deadlock is
+ * intentional, it can still be broken by "aborting" the filesystem.
+ */
+struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file)
+{
+	struct fuse_req *req;
+
+	atomic_inc(&fc->num_waiting);
+	wait_event(fc->blocked_waitq, !fc->blocked);
+	req = fuse_request_alloc();
+	if (!req)
+		req = get_reserved_req(fc, file);
+
+	fuse_req_init_context(req);
+	req->waiting = 1;
+	return req;
+}
+
 void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
 {
 	if (atomic_dec_and_test(&req->count)) {
 		if (req->waiting)
 			atomic_dec(&fc->num_waiting);
-		fuse_request_free(req);
+
+		if (req->stolen_file)
+			put_reserved_req(fc, req);
+		else
+			fuse_request_free(req);
 	}
 }
 
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index d9a8289297c..ce759414cff 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -48,8 +48,8 @@ struct fuse_file *fuse_file_alloc(void)
 	struct fuse_file *ff;
 	ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL);
 	if (ff) {
-		ff->release_req = fuse_request_alloc();
-		if (!ff->release_req) {
+		ff->reserved_req = fuse_request_alloc();
+		if (!ff->reserved_req) {
 			kfree(ff);
 			ff = NULL;
 		}
@@ -59,7 +59,7 @@ struct fuse_file *fuse_file_alloc(void)
 
 void fuse_file_free(struct fuse_file *ff)
 {
-	fuse_request_free(ff->release_req);
+	fuse_request_free(ff->reserved_req);
 	kfree(ff);
 }
 
@@ -115,7 +115,7 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir)
 struct fuse_req *fuse_release_fill(struct fuse_file *ff, u64 nodeid, int flags,
 				   int opcode)
 {
-	struct fuse_req *req = ff->release_req;
+	struct fuse_req *req = ff->reserved_req;
 	struct fuse_release_in *inarg = &req->misc.release_in;
 
 	inarg->fh = ff->fh;
@@ -187,10 +187,7 @@ static int fuse_flush(struct file *file, fl_owner_t id)
 	if (fc->no_flush)
 		return 0;
 
-	req = fuse_get_req(fc);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
-
+	req = fuse_get_req_nofail(fc, file);
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.fh = ff->fh;
 	inarg.lock_owner = fuse_lock_owner_id(id);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index eb3166625ca..f7c74516f3a 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -65,7 +65,7 @@ struct fuse_inode {
 /** FUSE specific file data */
 struct fuse_file {
 	/** Request reserved for flush and release */
-	struct fuse_req *release_req;
+	struct fuse_req *reserved_req;
 
 	/** File handle used by userspace */
 	u64 fh;
@@ -213,6 +213,9 @@ struct fuse_req {
 
 	/** Request completion callback */
 	void (*end)(struct fuse_conn *, struct fuse_req *);
+
+	/** Request is stolen from fuse_file->reserved_req */
+	struct file *stolen_file;
 };
 
 /**
@@ -456,10 +459,15 @@ struct fuse_req *fuse_request_alloc(void);
 void fuse_request_free(struct fuse_req *req);
 
 /**
- * Reserve a preallocated request
+ * Get a request, may fail with -ENOMEM
  */
 struct fuse_req *fuse_get_req(struct fuse_conn *fc);
 
+/**
+ * Gets a requests for a file operation, always succeeds
+ */
+struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file);
+
 /**
  * Decrement reference count of a request.  If count goes to zero free
  * the request.
-- 
cgit v1.2.3


From f9a2842e5612b93fa20a624a8baa6c2a7ecea504 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Sun, 25 Jun 2006 05:48:53 -0700
Subject: [PATCH] fuse: rename the interrupted flag

Rename the 'interrupted' flag to 'aborted', since it indicates exactly that,
and next patch will introduce an 'interrupted' flag for a

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/dev.c    | 39 +++++++++++++++++++--------------------
 fs/fuse/fuse_i.h |  4 ++--
 2 files changed, 21 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ae26f37e53a..6b5f74cb7b5 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -202,7 +202,7 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
 
 /*
  * This function is called when a request is finished.  Either a reply
- * has arrived or it was interrupted (and not yet sent) or some error
+ * has arrived or it was aborted (and not yet sent) or some error
  * occurred during communication with userspace, or the device file
  * was closed.  The requester thread is woken up (if still waiting),
  * the 'end' callback is called if given, else the reference to the
@@ -250,12 +250,12 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
 		restore_sigs(&oldset);
 	}
 	spin_lock(&fc->lock);
-	if (req->state == FUSE_REQ_FINISHED && !req->interrupted)
+	if (req->state == FUSE_REQ_FINISHED && !req->aborted)
 		return;
 
-	if (!req->interrupted) {
+	if (!req->aborted) {
 		req->out.h.error = -EINTR;
-		req->interrupted = 1;
+		req->aborted = 1;
 	}
 	if (req->locked) {
 		/* This is uninterruptible sleep, because data is
@@ -361,14 +361,14 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
 /*
  * Lock the request.  Up to the next unlock_request() there mustn't be
  * anything that could cause a page-fault.  If the request was already
- * interrupted bail out.
+ * aborted bail out.
  */
 static int lock_request(struct fuse_conn *fc, struct fuse_req *req)
 {
 	int err = 0;
 	if (req) {
 		spin_lock(&fc->lock);
-		if (req->interrupted)
+		if (req->aborted)
 			err = -ENOENT;
 		else
 			req->locked = 1;
@@ -378,7 +378,7 @@ static int lock_request(struct fuse_conn *fc, struct fuse_req *req)
 }
 
 /*
- * Unlock request.  If it was interrupted during being locked, the
+ * Unlock request.  If it was aborted during being locked, the
  * requester thread is currently waiting for it to be unlocked, so
  * wake it up.
  */
@@ -387,7 +387,7 @@ static void unlock_request(struct fuse_conn *fc, struct fuse_req *req)
 	if (req) {
 		spin_lock(&fc->lock);
 		req->locked = 0;
-		if (req->interrupted)
+		if (req->aborted)
 			wake_up(&req->waitq);
 		spin_unlock(&fc->lock);
 	}
@@ -589,8 +589,8 @@ static void request_wait(struct fuse_conn *fc)
  * Read a single request into the userspace filesystem's buffer.  This
  * function waits until a request is available, then removes it from
  * the pending list and copies request data to userspace buffer.  If
- * no reply is needed (FORGET) or request has been interrupted or
- * there was an error during the copying then it's finished by calling
+ * no reply is needed (FORGET) or request has been aborted or there
+ * was an error during the copying then it's finished by calling
  * request_end().  Otherwise add it to the processing list, and set
  * the 'sent' flag.
  */
@@ -645,10 +645,10 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
 	fuse_copy_finish(&cs);
 	spin_lock(&fc->lock);
 	req->locked = 0;
-	if (!err && req->interrupted)
+	if (!err && req->aborted)
 		err = -ENOENT;
 	if (err) {
-		if (!req->interrupted)
+		if (!req->aborted)
 			req->out.h.error = -EIO;
 		request_end(fc, req);
 		return err;
@@ -754,7 +754,7 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
 	if (!req)
 		goto err_unlock;
 
-	if (req->interrupted) {
+	if (req->aborted) {
 		spin_unlock(&fc->lock);
 		fuse_copy_finish(&cs);
 		spin_lock(&fc->lock);
@@ -773,9 +773,9 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
 	spin_lock(&fc->lock);
 	req->locked = 0;
 	if (!err) {
-		if (req->interrupted)
+		if (req->aborted)
 			err = -ENOENT;
-	} else if (!req->interrupted)
+	} else if (!req->aborted)
 		req->out.h.error = -EIO;
 	request_end(fc, req);
 
@@ -835,7 +835,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head)
 /*
  * Abort requests under I/O
  *
- * The requests are set to interrupted and finished, and the request
+ * The requests are set to aborted and finished, and the request
  * waiter is woken up.  This will make request_wait_answer() wait
  * until the request is unlocked and then return.
  *
@@ -850,7 +850,7 @@ static void end_io_requests(struct fuse_conn *fc)
 			list_entry(fc->io.next, struct fuse_req, list);
 		void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
 
-		req->interrupted = 1;
+		req->aborted = 1;
 		req->out.h.error = -ECONNABORTED;
 		req->state = FUSE_REQ_FINISHED;
 		list_del_init(&req->list);
@@ -883,9 +883,8 @@ static void end_io_requests(struct fuse_conn *fc)
  * onto the pending list is prevented by req->connected being false.
  *
  * Progression of requests under I/O to the processing list is
- * prevented by the req->interrupted flag being true for these
- * requests.  For this reason requests on the io list must be aborted
- * first.
+ * prevented by the req->aborted flag being true for these requests.
+ * For this reason requests on the io list must be aborted first.
  */
 void fuse_abort_conn(struct fuse_conn *fc)
 {
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index f7c74516f3a..fd65e75e162 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -159,8 +159,8 @@ struct fuse_req {
 	/** Force sending of the request even if interrupted */
 	unsigned force:1;
 
-	/** The request was interrupted */
-	unsigned interrupted:1;
+	/** The request was aborted */
+	unsigned aborted:1;
 
 	/** Request is sent in the background */
 	unsigned background:1;
-- 
cgit v1.2.3


From a4d27e75ffb7b8ecb7eed0c7db0df975525f3fd7 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Sun, 25 Jun 2006 05:48:54 -0700
Subject: [PATCH] fuse: add request interruption

Add synchronous request interruption.  This is needed for file locking
operations which have to be interruptible.  However filesystem may implement
interruptibility of other operations (e.g.  like NFS 'intr' mount option).

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/dev.c    | 162 +++++++++++++++++++++++++++++++++++++++++++++----------
 fs/fuse/file.c   |   3 ++
 fs/fuse/fuse_i.h |  16 ++++++
 fs/fuse/inode.c  |   1 +
 4 files changed, 155 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 6b5f74cb7b5..1e2006caf15 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -34,6 +34,7 @@ static void fuse_request_init(struct fuse_req *req)
 {
 	memset(req, 0, sizeof(*req));
 	INIT_LIST_HEAD(&req->list);
+	INIT_LIST_HEAD(&req->intr_entry);
 	init_waitqueue_head(&req->waitq);
 	atomic_set(&req->count, 1);
 }
@@ -215,6 +216,7 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 	void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
 	req->end = NULL;
 	list_del(&req->list);
+	list_del(&req->intr_entry);
 	req->state = FUSE_REQ_FINISHED;
 	if (req->background) {
 		if (fc->num_background == FUSE_MAX_BACKGROUND) {
@@ -235,28 +237,63 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 		fuse_put_request(fc, req);
 }
 
+static void wait_answer_interruptible(struct fuse_conn *fc,
+				      struct fuse_req *req)
+{
+	if (signal_pending(current))
+		return;
+
+	spin_unlock(&fc->lock);
+	wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED);
+	spin_lock(&fc->lock);
+}
+
+static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req)
+{
+	list_add_tail(&req->intr_entry, &fc->interrupts);
+	wake_up(&fc->waitq);
+	kill_fasync(&fc->fasync, SIGIO, POLL_IN);
+}
+
 /* Called with fc->lock held.  Releases, and then reacquires it. */
 static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
 {
-	sigset_t oldset;
+	if (!fc->no_interrupt) {
+		/* Any signal may interrupt this */
+		wait_answer_interruptible(fc, req);
 
-	spin_unlock(&fc->lock);
-	if (req->force)
+		if (req->aborted)
+			goto aborted;
+		if (req->state == FUSE_REQ_FINISHED)
+			return;
+
+		req->interrupted = 1;
+		if (req->state == FUSE_REQ_SENT)
+			queue_interrupt(fc, req);
+	}
+
+	if (req->force) {
+		spin_unlock(&fc->lock);
 		wait_event(req->waitq, req->state == FUSE_REQ_FINISHED);
-	else {
+		spin_lock(&fc->lock);
+	} else {
+		sigset_t oldset;
+
+		/* Only fatal signals may interrupt this */
 		block_sigs(&oldset);
-		wait_event_interruptible(req->waitq,
-					 req->state == FUSE_REQ_FINISHED);
+		wait_answer_interruptible(fc, req);
 		restore_sigs(&oldset);
 	}
-	spin_lock(&fc->lock);
-	if (req->state == FUSE_REQ_FINISHED && !req->aborted)
-		return;
 
-	if (!req->aborted) {
-		req->out.h.error = -EINTR;
-		req->aborted = 1;
-	}
+	if (req->aborted)
+		goto aborted;
+	if (req->state == FUSE_REQ_FINISHED)
+ 		return;
+
+	req->out.h.error = -EINTR;
+	req->aborted = 1;
+
+ aborted:
 	if (req->locked) {
 		/* This is uninterruptible sleep, because data is
 		   being copied to/from the buffers of req.  During
@@ -288,13 +325,19 @@ static unsigned len_args(unsigned numargs, struct fuse_arg *args)
 	return nbytes;
 }
 
+static u64 fuse_get_unique(struct fuse_conn *fc)
+ {
+ 	fc->reqctr++;
+ 	/* zero is special */
+ 	if (fc->reqctr == 0)
+ 		fc->reqctr = 1;
+
+	return fc->reqctr;
+}
+
 static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
 {
-	fc->reqctr++;
-	/* zero is special */
-	if (fc->reqctr == 0)
-		fc->reqctr = 1;
-	req->in.h.unique = fc->reqctr;
+	req->in.h.unique = fuse_get_unique(fc);
 	req->in.h.len = sizeof(struct fuse_in_header) +
 		len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
 	list_add_tail(&req->list, &fc->pending);
@@ -307,9 +350,6 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
 	kill_fasync(&fc->fasync, SIGIO, POLL_IN);
 }
 
-/*
- * This can only be interrupted by a SIGKILL
- */
 void request_send(struct fuse_conn *fc, struct fuse_req *req)
 {
 	req->isreply = 1;
@@ -566,13 +606,18 @@ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
 	return err;
 }
 
+static int request_pending(struct fuse_conn *fc)
+{
+	return !list_empty(&fc->pending) || !list_empty(&fc->interrupts);
+}
+
 /* Wait until a request is available on the pending list */
 static void request_wait(struct fuse_conn *fc)
 {
 	DECLARE_WAITQUEUE(wait, current);
 
 	add_wait_queue_exclusive(&fc->waitq, &wait);
-	while (fc->connected && list_empty(&fc->pending)) {
+	while (fc->connected && !request_pending(fc)) {
 		set_current_state(TASK_INTERRUPTIBLE);
 		if (signal_pending(current))
 			break;
@@ -585,6 +630,45 @@ static void request_wait(struct fuse_conn *fc)
 	remove_wait_queue(&fc->waitq, &wait);
 }
 
+/*
+ * Transfer an interrupt request to userspace
+ *
+ * Unlike other requests this is assembled on demand, without a need
+ * to allocate a separate fuse_req structure.
+ *
+ * Called with fc->lock held, releases it
+ */
+static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req,
+			       const struct iovec *iov, unsigned long nr_segs)
+{
+	struct fuse_copy_state cs;
+	struct fuse_in_header ih;
+	struct fuse_interrupt_in arg;
+	unsigned reqsize = sizeof(ih) + sizeof(arg);
+	int err;
+
+	list_del_init(&req->intr_entry);
+	req->intr_unique = fuse_get_unique(fc);
+	memset(&ih, 0, sizeof(ih));
+	memset(&arg, 0, sizeof(arg));
+	ih.len = reqsize;
+	ih.opcode = FUSE_INTERRUPT;
+	ih.unique = req->intr_unique;
+	arg.unique = req->in.h.unique;
+
+	spin_unlock(&fc->lock);
+	if (iov_length(iov, nr_segs) < reqsize)
+		return -EINVAL;
+
+	fuse_copy_init(&cs, fc, 1, NULL, iov, nr_segs);
+	err = fuse_copy_one(&cs, &ih, sizeof(ih));
+	if (!err)
+		err = fuse_copy_one(&cs, &arg, sizeof(arg));
+	fuse_copy_finish(&cs);
+
+	return err ? err : reqsize;
+}
+
 /*
  * Read a single request into the userspace filesystem's buffer.  This
  * function waits until a request is available, then removes it from
@@ -610,7 +694,7 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
 	spin_lock(&fc->lock);
 	err = -EAGAIN;
 	if ((file->f_flags & O_NONBLOCK) && fc->connected &&
-	    list_empty(&fc->pending))
+	    !request_pending(fc))
 		goto err_unlock;
 
 	request_wait(fc);
@@ -618,9 +702,15 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
 	if (!fc->connected)
 		goto err_unlock;
 	err = -ERESTARTSYS;
-	if (list_empty(&fc->pending))
+	if (!request_pending(fc))
 		goto err_unlock;
 
+	if (!list_empty(&fc->interrupts)) {
+		req = list_entry(fc->interrupts.next, struct fuse_req,
+				 intr_entry);
+		return fuse_read_interrupt(fc, req, iov, nr_segs);
+	}
+
 	req = list_entry(fc->pending.next, struct fuse_req, list);
 	req->state = FUSE_REQ_READING;
 	list_move(&req->list, &fc->io);
@@ -658,6 +748,8 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
 	else {
 		req->state = FUSE_REQ_SENT;
 		list_move_tail(&req->list, &fc->processing);
+		if (req->interrupted)
+			queue_interrupt(fc, req);
 		spin_unlock(&fc->lock);
 	}
 	return reqsize;
@@ -684,7 +776,7 @@ static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
 	list_for_each(entry, &fc->processing) {
 		struct fuse_req *req;
 		req = list_entry(entry, struct fuse_req, list);
-		if (req->in.h.unique == unique)
+		if (req->in.h.unique == unique || req->intr_unique == unique)
 			return req;
 	}
 	return NULL;
@@ -750,7 +842,6 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
 		goto err_unlock;
 
 	req = request_find(fc, oh.unique);
-	err = -EINVAL;
 	if (!req)
 		goto err_unlock;
 
@@ -761,6 +852,23 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
 		request_end(fc, req);
 		return -ENOENT;
 	}
+	/* Is it an interrupt reply? */
+	if (req->intr_unique == oh.unique) {
+		err = -EINVAL;
+		if (nbytes != sizeof(struct fuse_out_header))
+			goto err_unlock;
+
+		if (oh.error == -ENOSYS)
+			fc->no_interrupt = 1;
+		else if (oh.error == -EAGAIN)
+			queue_interrupt(fc, req);
+
+		spin_unlock(&fc->lock);
+		fuse_copy_finish(&cs);
+		return nbytes;
+	}
+
+	req->state = FUSE_REQ_WRITING;
 	list_move(&req->list, &fc->io);
 	req->out.h = oh;
 	req->locked = 1;
@@ -809,7 +917,7 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
 	spin_lock(&fc->lock);
 	if (!fc->connected)
 		mask = POLLERR;
-	else if (!list_empty(&fc->pending))
+	else if (request_pending(fc))
 		mask |= POLLIN | POLLRDNORM;
 	spin_unlock(&fc->lock);
 
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index ce759414cff..36f92f181d2 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -705,6 +705,9 @@ static int fuse_setlk(struct file *file, struct file_lock *fl)
 	fuse_lk_fill(req, file, fl, opcode, pid);
 	request_send(fc, req);
 	err = req->out.h.error;
+	/* locking is restartable */
+	if (err == -EINTR)
+		err = -ERESTARTSYS;
 	fuse_put_request(fc, req);
 	return err;
 }
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index fd65e75e162..c862df58da9 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -131,6 +131,7 @@ enum fuse_req_state {
 	FUSE_REQ_PENDING,
 	FUSE_REQ_READING,
 	FUSE_REQ_SENT,
+	FUSE_REQ_WRITING,
 	FUSE_REQ_FINISHED
 };
 
@@ -144,9 +145,15 @@ struct fuse_req {
 	    fuse_conn */
 	struct list_head list;
 
+	/** Entry on the interrupts list  */
+	struct list_head intr_entry;
+
 	/** refcount */
 	atomic_t count;
 
+	/** Unique ID for the interrupt request */
+	u64 intr_unique;
+
 	/*
 	 * The following bitfields are either set once before the
 	 * request is queued or setting/clearing them is protected by
@@ -165,6 +172,9 @@ struct fuse_req {
 	/** Request is sent in the background */
 	unsigned background:1;
 
+	/** The request has been interrupted */
+	unsigned interrupted:1;
+
 	/** Data is being copied to/from the request */
 	unsigned locked:1;
 
@@ -262,6 +272,9 @@ struct fuse_conn {
 	/** Number of requests currently in the background */
 	unsigned num_background;
 
+	/** Pending interrupts */
+	struct list_head interrupts;
+
 	/** Flag indicating if connection is blocked.  This will be
 	    the case before the INIT reply is received, and if there
 	    are too many outstading backgrounds requests */
@@ -320,6 +333,9 @@ struct fuse_conn {
 	/** Is create not implemented by fs? */
 	unsigned no_create : 1;
 
+	/** Is interrupt not implemented by fs? */
+	unsigned no_interrupt : 1;
+
 	/** The number of requests waiting for completion */
 	atomic_t num_waiting;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 41289290583..e21ef8a3ad3 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -381,6 +381,7 @@ static struct fuse_conn *new_conn(void)
 		INIT_LIST_HEAD(&fc->pending);
 		INIT_LIST_HEAD(&fc->processing);
 		INIT_LIST_HEAD(&fc->io);
+		INIT_LIST_HEAD(&fc->interrupts);
 		atomic_set(&fc->num_waiting, 0);
 		fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 		fc->bdi.unplug_io_fn = default_unplug_io_fn;
-- 
cgit v1.2.3


From 9c8ef5614da22666e339b125263d315cfaa89109 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Sun, 25 Jun 2006 05:48:55 -0700
Subject: [PATCH] fuse: scramble lock owner ID

VFS uses current->files pointer as lock owner ID, and it wouldn't be
prudent to expose this value to userspace.  So scramble it with XTEA using
a per connection random key, known only to the kernel.  Only one direction
needs to be implemented, since the ID is never sent in the reverse
direction.

The XTEA algorithm is implemented inline since it's simple enough to do so,
and this adds less complexity than if the crypto API were used.

Thanks to Jesper Juhl for the idea.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fuse/file.c   | 29 ++++++++++++++++++++---------
 fs/fuse/fuse_i.h |  3 +++
 fs/fuse/inode.c  |  2 ++
 3 files changed, 25 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 36f92f181d2..28aa81eae2c 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -161,15 +161,25 @@ static int fuse_release(struct inode *inode, struct file *file)
 }
 
 /*
- * It would be nice to scramble the ID space, so that the value of the
- * files_struct pointer is not exposed to userspace.  Symmetric crypto
- * functions are overkill, since the inverse function doesn't need to
- * be implemented (though it does have to exist).  Is there something
- * simpler?
+ * Scramble the ID space with XTEA, so that the value of the files_struct
+ * pointer is not exposed to userspace.
  */
-static inline u64 fuse_lock_owner_id(fl_owner_t id)
+static u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
 {
-	return (unsigned long) id;
+	u32 *k = fc->scramble_key;
+	u64 v = (unsigned long) id;
+	u32 v0 = v;
+	u32 v1 = v >> 32;
+	u32 sum = 0;
+	int i;
+
+	for (i = 0; i < 32; i++) {
+		v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
+		sum += 0x9E3779B9;
+		v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]);
+	}
+
+	return (u64) v0 + ((u64) v1 << 32);
 }
 
 static int fuse_flush(struct file *file, fl_owner_t id)
@@ -190,7 +200,7 @@ static int fuse_flush(struct file *file, fl_owner_t id)
 	req = fuse_get_req_nofail(fc, file);
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.fh = ff->fh;
-	inarg.lock_owner = fuse_lock_owner_id(id);
+	inarg.lock_owner = fuse_lock_owner_id(fc, id);
 	req->in.h.opcode = FUSE_FLUSH;
 	req->in.h.nodeid = get_node_id(inode);
 	req->in.numargs = 1;
@@ -644,11 +654,12 @@ static void fuse_lk_fill(struct fuse_req *req, struct file *file,
 			 const struct file_lock *fl, int opcode, pid_t pid)
 {
 	struct inode *inode = file->f_dentry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_file *ff = file->private_data;
 	struct fuse_lk_in *arg = &req->misc.lk_in;
 
 	arg->fh = ff->fh;
-	arg->owner = fuse_lock_owner_id(fl->fl_owner);
+	arg->owner = fuse_lock_owner_id(fc, fl->fl_owner);
 	arg->lk.start = fl->fl_start;
 	arg->lk.end = fl->fl_end;
 	arg->lk.type = fl->fl_type;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index c862df58da9..0dbf9662184 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -359,6 +359,9 @@ struct fuse_conn {
 
 	/** O_ASYNC requests */
 	struct fasync_struct *fasync;
+
+	/** Key for lock owner ID scrambling */
+	u32 scramble_key[4];
 };
 
 static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index e21ef8a3ad3..5ceb8bd7a18 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -16,6 +16,7 @@
 #include <linux/module.h>
 #include <linux/parser.h>
 #include <linux/statfs.h>
+#include <linux/random.h>
 
 MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
 MODULE_DESCRIPTION("Filesystem in Userspace");
@@ -387,6 +388,7 @@ static struct fuse_conn *new_conn(void)
 		fc->bdi.unplug_io_fn = default_unplug_io_fn;
 		fc->reqctr = 0;
 		fc->blocked = 1;
+		get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
 	}
 	return fc;
 }
-- 
cgit v1.2.3


From 099a71d99578a53bdf5f383c55e4095f1c59410c Mon Sep 17 00:00:00 2001
From: Eric Sesterhenn <snakebyte@gmx.de>
Date: Sun, 25 Jun 2006 05:48:57 -0700
Subject: [PATCH] Remove needless checks in fs/9p/vfs_inode.c

coverity found two needless checks in vfs_inode.c (cid #1165 and #1164)
In both cases inode is always NULL when we goto error; either because it
is still initialized to NULL or is set to NULL explicitly. This patch
simply removes these checks to save some code.

Signed-off-by: Eric Sesterhenn <snakebyte@gmx.de>
Acked-by: Eric Van Hensbergen <ericvh@gmail.com>
Cc: Ron Minnich <rminnich@lanl.gov>
Cc: Latchesar Ionkov <lucho@ionkov.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/vfs_inode.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 8e60dc7ec4a..5c6bdf82146 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -530,9 +530,6 @@ error:
 	if (vfid)
 		v9fs_fid_destroy(vfid);
 
-	if (inode)
-		iput(inode);
-
 	return err;
 }
 
@@ -1174,9 +1171,6 @@ error:
 	if (vfid)
 		v9fs_fid_destroy(vfid);
 
-	if (inode)
-		iput(inode);
-
 	return err;
 
 }
-- 
cgit v1.2.3


From fa366ad5d7fe05abaae44a1cd216348669e42ef8 Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Sun, 25 Jun 2006 05:49:00 -0700
Subject: [PATCH] kthread: convert smbiod

Update smbiod to use kthread instead of deprecated kernel_thread.

[akpm@osdl.org: cleanup]
Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/smbfs/smbiod.c | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c
index 481a97a423f..3f71384020c 100644
--- a/fs/smbfs/smbiod.c
+++ b/fs/smbfs/smbiod.c
@@ -20,6 +20,7 @@
 #include <linux/smp_lock.h>
 #include <linux/module.h>
 #include <linux/net.h>
+#include <linux/kthread.h>
 #include <net/ip.h>
 
 #include <linux/smb_fs.h>
@@ -40,7 +41,7 @@ enum smbiod_state {
 };
 
 static enum smbiod_state smbiod_state = SMBIOD_DEAD;
-static pid_t smbiod_pid;
+static struct task_struct *smbiod_thread;
 static DECLARE_WAIT_QUEUE_HEAD(smbiod_wait);
 static LIST_HEAD(smb_servers);
 static DEFINE_SPINLOCK(servers_lock);
@@ -67,20 +68,29 @@ void smbiod_wake_up(void)
  */
 static int smbiod_start(void)
 {
-	pid_t pid;
+	struct task_struct *tsk;
+	int err = 0;
+
 	if (smbiod_state != SMBIOD_DEAD)
 		return 0;
 	smbiod_state = SMBIOD_STARTING;
 	__module_get(THIS_MODULE);
 	spin_unlock(&servers_lock);
-	pid = kernel_thread(smbiod, NULL, 0);
-	if (pid < 0)
+	tsk = kthread_run(smbiod, NULL, "smbiod");
+	if (IS_ERR(tsk)) {
+		err = PTR_ERR(tsk);
 		module_put(THIS_MODULE);
+	}
 
 	spin_lock(&servers_lock);
-	smbiod_state = pid < 0 ? SMBIOD_DEAD : SMBIOD_RUNNING;
-	smbiod_pid = pid;
-	return pid;
+	if (err < 0) {
+		smbiod_state = SMBIOD_DEAD;
+		smbiod_thread = NULL;
+	} else {
+		smbiod_state = SMBIOD_RUNNING;
+		smbiod_thread = tsk;
+	}
+	return err;
 }
 
 /*
@@ -290,8 +300,6 @@ out:
  */
 static int smbiod(void *unused)
 {
-	daemonize("smbiod");
-
 	allow_signal(SIGKILL);
 
 	VERBOSE("SMB Kernel thread starting (%d) ...\n", current->pid);
-- 
cgit v1.2.3


From 2b943cf09d75c0dfbfd22548988df48f6c24c727 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Sun, 25 Jun 2006 05:49:08 -0700
Subject: [PATCH] fix %s in affs_fill_super()

%s is only valid if array is known to contain NUL or precision is given and
does not exceed the size of array.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/affs/super.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/affs/super.c b/fs/affs/super.c
index 8765cba35bb..5200f4938df 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -271,6 +271,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
 	int			 reserved;
 	unsigned long		 mount_flags;
 	int			 tmp_flags;	/* fix remount prototype... */
+	u8			 sig[4];
 
 	pr_debug("AFFS: read_super(%s)\n",data ? (const char *)data : "no options");
 
@@ -370,8 +371,9 @@ got_root:
 		printk(KERN_ERR "AFFS: Cannot read boot block\n");
 		goto out_error;
 	}
-	chksum = be32_to_cpu(*(__be32 *)boot_bh->b_data);
+	memcpy(sig, boot_bh->b_data, 4);
 	brelse(boot_bh);
+	chksum = be32_to_cpu(*(__be32 *)sig);
 
 	/* Dircache filesystems are compatible with non-dircache ones
 	 * when reading. As long as they aren't supported, writing is
@@ -420,11 +422,11 @@ got_root:
 	}
 
 	if (mount_flags & SF_VERBOSE) {
-		chksum = cpu_to_be32(chksum);
-		printk(KERN_NOTICE "AFFS: Mounting volume \"%*s\": Type=%.3s\\%c, Blocksize=%d\n",
-			AFFS_ROOT_TAIL(sb, root_bh)->disk_name[0],
+		u8 len = AFFS_ROOT_TAIL(sb, root_bh)->disk_name[0];
+		printk(KERN_NOTICE "AFFS: Mounting volume \"%.*s\": Type=%.3s\\%c, Blocksize=%d\n",
+			len > 31 ? 31 : len,
 			AFFS_ROOT_TAIL(sb, root_bh)->disk_name + 1,
-			(char *)&chksum,((char *)&chksum)[3] + '0',blocksize);
+			sig, sig[3] + '0', blocksize);
 	}
 
 	sb->s_flags |= MS_NODEV | MS_NOSUID;
-- 
cgit v1.2.3


From 04a3446c90137a473837288b04b517b19dc67a0d Mon Sep 17 00:00:00 2001
From: Frode Isaksen <frode.isaksen@gmail.com>
Date: Sun, 25 Jun 2006 05:49:09 -0700
Subject: [PATCH] fs: sys_poll with timeout -1 bug fix

If you do a poll() call with timeout -1, the wait will be a big number
(depending on HZ) instead of infinite wait, since -1 is passed to the
msecs_to_jiffies function.

Signed-off-by: Frode Isaksen <frode.isaksen@gmail.com>
Acked-by: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/select.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/select.c b/fs/select.c
index 9c4f0f2604f..33b72ba0f86 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -746,9 +746,9 @@ out_fds:
 asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
 			long timeout_msecs)
 {
-	s64 timeout_jiffies = 0;
+	s64 timeout_jiffies;
 
-	if (timeout_msecs) {
+	if (timeout_msecs > 0) {
 #if HZ > 1000
 		/* We can only overflow if HZ > 1000 */
 		if (timeout_msecs / 1000 > (s64)0x7fffffffffffffffULL / (s64)HZ)
@@ -756,6 +756,9 @@ asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
 		else
 #endif
 			timeout_jiffies = msecs_to_jiffies(timeout_msecs);
+	} else {
+		/* Infinite (< 0) or no (0) timeout */
+		timeout_jiffies = timeout_msecs;
 	}
 
 	return do_sys_poll(ufds, nfds, &timeout_jiffies);
-- 
cgit v1.2.3


From 45c9b11a1d07770cabb48cb0f7960a77650ffc64 Mon Sep 17 00:00:00 2001
From: Ulrich Drepper <drepper@redhat.com>
Date: Sun, 25 Jun 2006 05:49:11 -0700
Subject: [PATCH] Implement AT_SYMLINK_FOLLOW flag for linkat

When the linkat() syscall was added the flag parameter was added in the
last minute but it wasn't used so far.  The following patch should change
that.  My tests show that this is all that's needed.

If OLDNAME is a symlink setting the flag causes linkat to follow the
symlink and create a hardlink with the target.  This is actually the
behavior POSIX demands for link() as well but Linux wisely does not do
this.  With this flag (which will most likely be in the next POSIX
revision) the programmer can choose the behavior, defaulting to the safe
variant.  As a side effect it is now possible to implement a
POSIX-compliant link(2) function for those who are interested.

  touch file
  ln -s file symlink

  linkat(fd, "symlink", fd, "newlink", 0)
    -> newlink is hardlink of symlink

  linkat(fd, "symlink", fd, "newlink", AT_SYMLINK_FOLLOW)
    -> newlink is hardlink of file

The value of AT_SYMLINK_FOLLOW is determined by the definition we already
use in glibc.

Signed-off-by: Ulrich Drepper <drepper@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namei.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index bb4a3e40e43..c784e8bb57a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2243,14 +2243,16 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
 	int error;
 	char * to;
 
-	if (flags != 0)
+	if ((flags & ~AT_SYMLINK_FOLLOW) != 0)
 		return -EINVAL;
 
 	to = getname(newname);
 	if (IS_ERR(to))
 		return PTR_ERR(to);
 
-	error = __user_walk_fd(olddfd, oldname, 0, &old_nd);
+	error = __user_walk_fd(olddfd, oldname,
+			       flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
+			       &old_nd);
 	if (error)
 		goto exit;
 	error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd);
-- 
cgit v1.2.3


From 7d93a1a53a72aa704a8fef6e0fb4f6c6cd6aa07a Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Sun, 25 Jun 2006 05:49:27 -0700
Subject: [PATCH] ext2: cleanup: put_page and comment fix

Things which force me think a little: why so?

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext2/dir.c   | 3 +--
 fs/ext2/fsync.c | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 3c1c9aaaca6..92ea8265d7d 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -399,8 +399,7 @@ ino_t ext2_inode_by_name(struct inode * dir, struct dentry *dentry)
 	de = ext2_find_entry (dir, dentry, &page);
 	if (de) {
 		res = le32_to_cpu(de->inode);
-		kunmap(page);
-		page_cache_release(page);
+		ext2_put_page(page);
 	}
 	return res;
 }
diff --git a/fs/ext2/fsync.c b/fs/ext2/fsync.c
index c9c2e5ffa48..7806b9e8155 100644
--- a/fs/ext2/fsync.c
+++ b/fs/ext2/fsync.c
@@ -24,7 +24,7 @@
 
 #include "ext2.h"
 #include <linux/smp_lock.h>
-#include <linux/buffer_head.h>		/* for fsync_inode_buffers() */
+#include <linux/buffer_head.h>		/* for sync_mapping_buffers() */
 
 
 /*
-- 
cgit v1.2.3


From f58a1ebb22f128fccfeafb2b18557c87741cd627 Mon Sep 17 00:00:00 2001
From: Hua Zhong <hzhong@gmail.com>
Date: Sun, 25 Jun 2006 05:49:32 -0700
Subject: [PATCH] remove unlikely(sb) in prune_dcache

likely profiling shows that the following is a miss.

After boot:
[+- ] Type | # True | # False | Function:Filename@Line
+unlikely |     1074|        0  prune_dcache()@:fs/dcache.c@409

After a bonnie++ run:
+unlikely |    66716|    19584  prune_dcache()@:fs/dcache.c@409

So remove it.

Signed-off-by: Hua Zhong <hzhong@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/dcache.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 313b54b2b8f..b85fda36053 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -406,7 +406,7 @@ static void prune_dcache(int count, struct super_block *sb)
 		cond_resched_lock(&dcache_lock);
 
 		tmp = dentry_unused.prev;
-		if (unlikely(sb)) {
+		if (sb) {
 			/* Try to find a dentry for this sb, but don't try
 			 * too hard, if they aren't near the tail they will
 			 * be moved down again soon
-- 
cgit v1.2.3


From f36f44de721db44b4c2944133c3c5c2e06f633f0 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Sun, 25 Jun 2006 12:30:33 -0700
Subject: Fix NFS2 compile error

Trond had apparently merged the same patch twice, causing a duplicate
include of the "internal.h" file, with resulting obvious confusion.

Tssk.  I'm the only one allowed to send out trees that don't even
compile! Who does this Trond guy think he is?

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfs/nfs2xdr.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 3b939e055a0..67391eef6b9 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -25,8 +25,6 @@
 #include <linux/nfs_fs.h>
 #include "internal.h"
 
-#include "internal.h"
-
 #define NFSDBG_FACILITY		NFSDBG_XDR
 /* #define NFS_PARANOIA 1 */
 
-- 
cgit v1.2.3


From 1e788f8d1ad71fea99a763114eae73c3ad55bc30 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sun, 25 Jun 2006 16:42:33 -0700
Subject: [PATCH] xfs: update ->flush method proto

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Nathan Scott <nathans@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/xfs/linux-2.6/xfs_file.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 70662371bb1..3d4f6dff211 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -299,7 +299,8 @@ xfs_file_open(
 
 STATIC int
 xfs_file_close(
-	struct file	*filp)
+	struct file	*filp,
+	fl_owner_t	id)
 {
 	return -bhv_vop_close(vn_from_inode(filp->f_dentry->d_inode), 0,
 				file_count(filp) > 1 ? L_FALSE : L_TRUE, NULL);
-- 
cgit v1.2.3


From fcc18e83e1f6fd9fa6b333735bf0fcd530655511 Mon Sep 17 00:00:00 2001
From: Malcolm Parsons <malcolm.parsons@gmail.com>
Date: Mon, 26 Jun 2006 11:49:41 +1000
Subject: [PATCH] uclinux: use PER_LINUX_32BIT in binfmt_flat

binfmt_flat.c calls set_personality with PER_LINUX as the personality.
On the arm architecture this results in the program running in 26bit
usermode.  PER_LINUX_32BIT should be used instead.  This doesn't affect
other architectures that use binfmt_flat.

Signed-off-by: Greg Ungerer <gerg@uclinux.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/binfmt_flat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index b1c902e319c..c94d52eafd1 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -510,7 +510,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 		}
 
 		/* OK, This is the point of no return */
-		set_personality(PER_LINUX);
+		set_personality(PER_LINUX_32BIT);
 	}
 
 	/*
-- 
cgit v1.2.3


From cd49b492fe3fef0b6348f18001f0d73f2577eb01 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 26 Jun 2006 04:22:36 +0000
Subject: [CIFS] remove some redundant null pointer checks

some of them pointed out by Dave Jones

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 45 +++++++++++++++------------------------------
 1 file changed, 15 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 01608bb4d67..c0f98ddea88 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2173,8 +2173,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 				if (remaining_words > 0) {
 					len = UniStrnlen((wchar_t *)bcc_ptr,
 							 remaining_words-1);
-					if(ses->serverNOS)
-						kfree(ses->serverNOS);
+					kfree(ses->serverNOS);
 					ses->serverNOS = kzalloc(2 * (len + 1),GFP_KERNEL);
 					if(ses->serverNOS == NULL)
 						goto sesssetup_nomem;
@@ -2214,12 +2213,10 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 					/* if these kcallocs fail not much we
 					   can do, but better to not fail the
 					   sesssetup itself */
-					if(ses->serverDomain)
-						kfree(ses->serverDomain);
+					kfree(ses->serverDomain);
 					ses->serverDomain =
 					    kzalloc(2, GFP_KERNEL);
-					if(ses->serverNOS)
-						kfree(ses->serverNOS);
+					kfree(ses->serverNOS);
 					ses->serverNOS =
 					    kzalloc(2, GFP_KERNEL);
 				}
@@ -2228,8 +2225,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 				if (((long) bcc_ptr + len) - (long)
 				    pByteArea(smb_buffer_response)
 					    <= BCC(smb_buffer_response)) {
-					if(ses->serverOS)
-						kfree(ses->serverOS);
+					kfree(ses->serverOS);
 					ses->serverOS = kzalloc(len + 1,GFP_KERNEL);
 					if(ses->serverOS == NULL)
 						goto sesssetup_nomem;
@@ -2240,8 +2236,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 					bcc_ptr++;
 
 					len = strnlen(bcc_ptr, 1024);
-					if(ses->serverNOS)
-						kfree(ses->serverNOS);
+					kfree(ses->serverNOS);
 					ses->serverNOS = kzalloc(len + 1,GFP_KERNEL);
 					if(ses->serverNOS == NULL)
 						goto sesssetup_nomem;
@@ -2508,8 +2503,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
 								 bcc_ptr,
 								 remaining_words
 								 - 1);
-						if(ses->serverNOS)
-							kfree(ses->serverNOS);
+						kfree(ses->serverNOS);
 						ses->serverNOS =
 						    kzalloc(2 * (len + 1),
 							    GFP_KERNEL);
@@ -2527,8 +2521,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
 						if (remaining_words > 0) {
 							len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words);	
            /* last string is not always null terminated (for e.g. for Windows XP & 2000) */
-							if(ses->serverDomain)
-								kfree(ses->serverDomain);
+							kfree(ses->serverDomain);
 							ses->serverDomain =
 							    kzalloc(2 *
 								    (len +
@@ -2547,19 +2540,16 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
 							    = 0;
 						} /* else no more room so create dummy domain string */
 						else {
-							if(ses->serverDomain)
-								kfree(ses->serverDomain);
+							kfree(ses->serverDomain);
 							ses->serverDomain =
 							    kzalloc(2,
 								    GFP_KERNEL);
 						}
 					} else {	/* no room so create dummy domain and NOS string */
-						if(ses->serverDomain);
-							kfree(ses->serverDomain);
+						kfree(ses->serverDomain);
 						ses->serverDomain =
 						    kzalloc(2, GFP_KERNEL);
-						if(ses->serverNOS)
-							kfree(ses->serverNOS);
+						kfree(ses->serverNOS);
 						ses->serverNOS =
 						    kzalloc(2, GFP_KERNEL);
 					}
@@ -2581,8 +2571,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
 						bcc_ptr++;
 
 						len = strnlen(bcc_ptr, 1024);
-						if(ses->serverNOS)
-							kfree(ses->serverNOS);
+						kfree(ses->serverNOS);
 						ses->serverNOS =
 						    kzalloc(len + 1,
 							    GFP_KERNEL);
@@ -2592,8 +2581,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
 						bcc_ptr++;
 
 						len = strnlen(bcc_ptr, 1024);
-						if(ses->serverDomain)
-							kfree(ses->serverDomain);
+						kfree(ses->serverDomain);
 						ses->serverDomain =
 						    kzalloc(len + 1,
 							    GFP_KERNEL);
@@ -2915,8 +2903,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 								 bcc_ptr,
 								 remaining_words
 								 - 1);
-						if(ses->serverNOS)
-							kfree(ses->serverNOS);
+						kfree(ses->serverNOS);
 						ses->serverNOS =
 						    kzalloc(2 * (len + 1),
 							    GFP_KERNEL);
@@ -2969,8 +2956,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 						if(ses->serverDomain)
 							kfree(ses->serverDomain);
 						ses->serverDomain = kzalloc(2, GFP_KERNEL);
-						if(ses->serverNOS)
-							kfree(ses->serverNOS);
+						kfree(ses->serverNOS);
 						ses->serverNOS = kzalloc(2, GFP_KERNEL);
 					}
 				} else {	/* ASCII */
@@ -2988,8 +2974,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 						bcc_ptr++;
 
 						len = strnlen(bcc_ptr, 1024);
-						if(ses->serverNOS)
-							kfree(ses->serverNOS);
+						kfree(ses->serverNOS);
 						ses->serverNOS = kzalloc(len+1,GFP_KERNEL);
 						strncpy(ses->serverNOS, bcc_ptr, len);	
 						bcc_ptr += len;
-- 
cgit v1.2.3


From 3d824a46b7210ea3b0a13ab0d0fbd7f6e2e91ddf Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sun, 25 Jun 2006 23:19:14 -0700
Subject: [OPENPROMFS]: Rewrite using in-kernel device tree and seq_file.

We lose property writing functionality for the time being, but
that will be easy to add back.  The code and framework is so
much simpler now.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/openpromfs/inode.c | 1158 +++++++++++--------------------------------------
 1 file changed, 255 insertions(+), 903 deletions(-)

(limited to 'fs')

diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index efc7c91128a..93a56bd4a2b 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -1,5 +1,4 @@
-/* $Id: inode.c,v 1.15 2001/11/12 09:43:39 davem Exp $
- * openpromfs.c: /proc/openprom handling routines
+/* inode.c: /proc/openprom handling routines
  *
  * Copyright (C) 1996-1999 Jakub Jelinek  (jakub@redhat.com)
  * Copyright (C) 1998      Eddie C. Dost  (ecd@skynet.be)
@@ -12,762 +11,245 @@
 #include <linux/openprom_fs.h>
 #include <linux/init.h>
 #include <linux/slab.h>
-#include <linux/smp_lock.h>
+#include <linux/seq_file.h>
 
 #include <asm/openprom.h>
 #include <asm/oplib.h>
+#include <asm/prom.h>
 #include <asm/uaccess.h>
 
-#define ALIASES_NNODES 64
-
-typedef struct {
-	u16	parent;
-	u16	next;
-	u16	child;
-	u16	first_prop;
-	u32	node;
-} openpromfs_node;
-
-typedef struct {
-#define OPP_STRING	0x10
-#define OPP_STRINGLIST	0x20
-#define OPP_BINARY	0x40
-#define OPP_HEXSTRING	0x80
-#define OPP_DIRTY	0x01
-#define OPP_QUOTED	0x02
-#define OPP_NOTQUOTED	0x04
-#define OPP_ASCIIZ	0x08
-	u32	flag;
-	u32	alloclen;
-	u32	len;
-	char	*value;
-	char	name[8];
-} openprom_property;
-
-static openpromfs_node *nodes;
-static int alloced;
-static u16 last_node;
-static u16 first_prop;
-static u16 options = 0xffff;
-static u16 aliases = 0xffff;
-static int aliases_nodes;
-static char *alias_names [ALIASES_NNODES];
-
-#define OPENPROM_ROOT_INO	16
-#define OPENPROM_FIRST_INO	OPENPROM_ROOT_INO
-#define NODE(ino) nodes[ino - OPENPROM_FIRST_INO]
-#define NODE2INO(node) (node + OPENPROM_FIRST_INO)
-#define NODEP2INO(no) (no + OPENPROM_FIRST_INO + last_node)
-
-static int openpromfs_create (struct inode *, struct dentry *, int, struct nameidata *);
-static int openpromfs_readdir(struct file *, void *, filldir_t);
-static struct dentry *openpromfs_lookup(struct inode *, struct dentry *dentry, struct nameidata *nd);
-static int openpromfs_unlink (struct inode *, struct dentry *dentry);
+static DEFINE_MUTEX(op_mutex);
 
-static inline u16 ptr_nod(void *p)
-{
-    return (long)p & 0xFFFF;
-}
+#define OPENPROM_ROOT_INO	0
 
-static ssize_t nodenum_read(struct file *file, char __user *buf,
-			    size_t count, loff_t *ppos)
+enum op_inode_type {
+	op_inode_node,
+	op_inode_prop,
+};
+
+union op_inode_data {
+	struct device_node	*node;
+	struct property		*prop;
+};
+
+struct op_inode_info {
+	struct inode		vfs_inode;
+	enum op_inode_type	type;
+	union op_inode_data	u;
+};
+
+static inline struct op_inode_info *OP_I(struct inode *inode)
 {
-	struct inode *inode = file->f_dentry->d_inode;
-	char buffer[10];
-	
-	if (count < 0 || !inode->u.generic_ip)
-		return -EINVAL;
-	sprintf (buffer, "%8.8lx\n", (long)inode->u.generic_ip);
-	if (file->f_pos >= 9)
-		return 0;
-	if (count > 9 - file->f_pos)
-		count = 9 - file->f_pos;
-	if (copy_to_user(buf, buffer + file->f_pos, count))
-		return -EFAULT;
-	*ppos += count;
-	return count;
+	return container_of(inode, struct op_inode_info, vfs_inode);
 }
 
-static ssize_t property_read(struct file *filp, char __user *buf,
-			     size_t count, loff_t *ppos)
+static int is_string(unsigned char *p, int len)
 {
-	struct inode *inode = filp->f_dentry->d_inode;
-	int i, j, k;
-	u32 node;
-	char *p, *s;
-	u32 *q;
-	openprom_property *op;
-	char buffer[64];
-	
-	if (!filp->private_data) {
-		node = nodes[ptr_nod(inode->u.generic_ip)].node;
-		i = ((u32)(long)inode->u.generic_ip) >> 16;
-		if (ptr_nod(inode->u.generic_ip) == aliases) {
-			if (i >= aliases_nodes)
-				p = NULL;
-			else
-				p = alias_names [i];
-		} else
-			for (p = prom_firstprop (node, buffer);
-			     i && p && *p;
-			     p = prom_nextprop (node, p, buffer), i--)
-				/* nothing */ ;
-		if (!p || !*p)
-			return -EIO;
-		i = prom_getproplen (node, p);
-		if (i < 0) {
-			if (ptr_nod(inode->u.generic_ip) == aliases)
-				i = 0;
-			else
-				return -EIO;
-		}
-		k = i;
-		if (i < 64) i = 64;
-		filp->private_data = kmalloc (sizeof (openprom_property)
-					      + (j = strlen (p)) + 2 * i,
-					      GFP_KERNEL);
-		if (!filp->private_data)
-			return -ENOMEM;
-		op = filp->private_data;
-		op->flag = 0;
-		op->alloclen = 2 * i;
-		strcpy (op->name, p);
-		op->value = (char *)(((unsigned long)(op->name + j + 4)) & ~3);
-		op->len = k;
-		if (k && prom_getproperty (node, p, op->value, i) < 0)
-			return -EIO;
-		op->value [k] = 0;
-		if (k) {
-			for (s = NULL, p = op->value; p < op->value + k; p++) {
-				if ((*p >= ' ' && *p <= '~') || *p == '\n') {
-					op->flag |= OPP_STRING;
-					s = p;
-					continue;
-				}
-				if (p > op->value && !*p && s == p - 1) {
-					if (p < op->value + k - 1)
-						op->flag |= OPP_STRINGLIST;
-					else
-						op->flag |= OPP_ASCIIZ;
-					continue;
-				}
-				if (k == 1 && !*p) {
-					op->flag |= (OPP_STRING|OPP_ASCIIZ);
-					break;
-				}
-				op->flag &= ~(OPP_STRING|OPP_STRINGLIST);
-				if (k & 3)
-					op->flag |= OPP_HEXSTRING;
-				else
-					op->flag |= OPP_BINARY;
-				break;
-			}
-			if (op->flag & OPP_STRINGLIST)
-				op->flag &= ~(OPP_STRING);
-			if (op->flag & OPP_ASCIIZ)
-				op->len--;
-		}
-	} else
-		op = filp->private_data;
-	if (!count || !(op->len || (op->flag & OPP_ASCIIZ)))
-		return 0;
-	if (*ppos >= 0xffffff || count >= 0xffffff)
-		return -EINVAL;
-	if (op->flag & OPP_STRINGLIST) {
-		for (k = 0, p = op->value; p < op->value + op->len; p++)
-			if (!*p)
-				k++;
-		i = op->len + 4 * k + 3;
-	} else if (op->flag & OPP_STRING) {
-		i = op->len + 3;
-	} else if (op->flag & OPP_BINARY) {
-		i = (op->len * 9) >> 2;
-	} else {
-		i = (op->len << 1) + 1;
-	}
-	k = *ppos;
-	if (k >= i) return 0;
-	if (count > i - k) count = i - k;
-	if (op->flag & OPP_STRING) {
-		if (!k) {
-			if (put_user('\'', buf))
-				return -EFAULT;
-			k++;
-			count--;
-		}
+	int i;
 
-		if (k + count >= i - 2)
-			j = i - 2 - k;
-		else
-			j = count;
-
-		if (j >= 0) {
-			if (copy_to_user(buf + k - *ppos,
-					 op->value + k - 1, j))
-				return -EFAULT;
-			count -= j;
-			k += j;
-		}
+	for (i = 0; i < len; i++) {
+		unsigned char val = p[i];
 
-		if (count) {
-			if (put_user('\'', &buf [k++ - *ppos]))
-				return -EFAULT;
-		}
-		if (count > 1) {
-			if (put_user('\n', &buf [k++ - *ppos]))
-				return -EFAULT;
-		}
-	} else if (op->flag & OPP_STRINGLIST) {
-		char *tmp;
-
-		tmp = kmalloc (i, GFP_KERNEL);
-		if (!tmp)
-			return -ENOMEM;
-
-		s = tmp;
-		*s++ = '\'';
-		for (p = op->value; p < op->value + op->len; p++) {
-			if (!*p) {
-				strcpy(s, "' + '");
-				s += 5;
-				continue;
-			}
-			*s++ = *p;
-		}
-		strcpy(s, "'\n");
-
-		if (copy_to_user(buf, tmp + k, count))
-			return -EFAULT;
-
-		kfree(tmp);
-		k += count;
-
-	} else if (op->flag & OPP_BINARY) {
-		char buffer[10];
-		u32 *first, *last;
-		int first_off, last_cnt;
-
-		first = ((u32 *)op->value) + k / 9;
-		first_off = k % 9;
-		last = ((u32 *)op->value) + (k + count - 1) / 9;
-		last_cnt = (k + count) % 9;
-		if (!last_cnt) last_cnt = 9;
-
-		if (first == last) {
-			sprintf (buffer, "%08x.", *first);
-			if (copy_to_user(buf, buffer + first_off,
-					 last_cnt - first_off))
-				return -EFAULT;
-			buf += last_cnt - first_off;
-		} else {		
-			for (q = first; q <= last; q++) {
-				sprintf (buffer, "%08x.", *q);
-				if (q == first) {
-					if (copy_to_user(buf, buffer + first_off,
-							 9 - first_off))
-						return -EFAULT;
-					buf += 9 - first_off;
-				} else if (q == last) {
-					if (copy_to_user(buf, buffer, last_cnt))
-						return -EFAULT;
-					buf += last_cnt;
-				} else {
-					if (copy_to_user(buf, buffer, 9))
-						return -EFAULT;
-					buf += 9;
-				}
-			}
-		}
+		if ((i && !val) ||
+		    (val >= ' ' && val <= '~'))
+			continue;
 
-		if (last == (u32 *)(op->value + op->len - 4) && last_cnt == 9) {
-			if (put_user('\n', (buf - 1)))
-				return -EFAULT;
-		}
+		return 0;
+	}
 
-		k += count;
+	return 1;
+}
 
-	} else if (op->flag & OPP_HEXSTRING) {
-		char buffer[3];
+static int property_show(struct seq_file *f, void *v)
+{
+	struct property *prop = f->private;
+	void *pval;
+	int len;
 
-		if ((k < i - 1) && (k & 1)) {
-			sprintf (buffer, "%02x",
-				 (unsigned char) *(op->value + (k >> 1)) & 0xff);
-			if (put_user(buffer[1], &buf[k++ - *ppos]))
-				return -EFAULT;
-			count--;
-		}
+	len = prop->length;
+	pval = prop->value;
 
-		for (; (count > 1) && (k < i - 1); k += 2) {
-			sprintf (buffer, "%02x",
-				 (unsigned char) *(op->value + (k >> 1)) & 0xff);
-			if (copy_to_user(buf + k - *ppos, buffer, 2))
-				return -EFAULT;
-			count -= 2;
-		}
+	if (is_string(pval, len)) {
+		while (len > 0) {
+			int n = strlen(pval);
 
-		if (count && (k < i - 1)) {
-			sprintf (buffer, "%02x",
-				 (unsigned char) *(op->value + (k >> 1)) & 0xff);
-			if (put_user(buffer[0], &buf[k++ - *ppos]))
-				return -EFAULT;
-			count--;
-		}
+			seq_printf(f, "%s", (char *) pval);
 
-		if (count) {
-			if (put_user('\n', &buf [k++ - *ppos]))
-				return -EFAULT;
-		}
-	}
-	count = k - *ppos;
-	*ppos = k;
-	return count;
-}
+			/* Skip over the NULL byte too.  */
+			pval += n + 1;
+			len -= n + 1;
 
-static ssize_t property_write(struct file *filp, const char __user *buf,
-			      size_t count, loff_t *ppos)
-{
-	int i, j, k;
-	char *p;
-	u32 *q;
-	void *b;
-	openprom_property *op;
-	
-	if (*ppos >= 0xffffff || count >= 0xffffff)
-		return -EINVAL;
-	if (!filp->private_data) {
-		i = property_read (filp, NULL, 0, NULL);
-		if (i)
-			return i;
-	}
-	k = *ppos;
-	op = filp->private_data;
-	if (!(op->flag & OPP_STRING)) {
-		u32 *first, *last;
-		int first_off, last_cnt;
-		u32 mask, mask2;
-		char tmp [9];
-		int forcelen = 0;
-		
-		j = k % 9;
-		for (i = 0; i < count; i++, j++) {
-			if (j == 9) j = 0;
-			if (!j) {
-				char ctmp;
-				if (get_user(ctmp, &buf[i]))
-					return -EFAULT;
-				if (ctmp != '.') {
-					if (ctmp != '\n') {
-						if (op->flag & OPP_BINARY)
-							return -EINVAL;
-						else
-							goto write_try_string;
-					} else {
-						count = i + 1;
-						forcelen = 1;
-						break;
-					}
-				}
-			} else {
-				char ctmp;
-				if (get_user(ctmp, &buf[i]))
-					return -EFAULT;
-				if (ctmp < '0' || 
-				    (ctmp > '9' && ctmp < 'A') ||
-				    (ctmp > 'F' && ctmp < 'a') ||
-				    ctmp > 'f') {
-					if (op->flag & OPP_BINARY)
-						return -EINVAL;
-					else
-						goto write_try_string;
-				}
-			}
-		}
-		op->flag |= OPP_BINARY;
-		tmp [8] = 0;
-		i = ((count + k + 8) / 9) << 2;
-		if (op->alloclen <= i) {
-			b = kmalloc (sizeof (openprom_property) + 2 * i,
-				     GFP_KERNEL);
-			if (!b)
-				return -ENOMEM;
-			memcpy (b, filp->private_data,
-				sizeof (openprom_property)
-				+ strlen (op->name) + op->alloclen);
-			memset (b + sizeof (openprom_property)
-				+ strlen (op->name) + op->alloclen, 
-				0, 2 * i - op->alloclen);
-			op = b;
-			op->alloclen = 2*i;
-			b = filp->private_data;
-			filp->private_data = op;
-			kfree (b);
+			if (len > 0)
+				seq_printf(f, " + ");
 		}
-		first = ((u32 *)op->value) + (k / 9);
-		first_off = k % 9;
-		last = (u32 *)(op->value + i);
-		last_cnt = (k + count) % 9;
-		if (first + 1 == last) {
-			memset (tmp, '0', 8);
-			if (copy_from_user(tmp + first_off, buf,
-					   (count + first_off > 8) ?
-					   8 - first_off : count))
-				return -EFAULT;
-			mask = 0xffffffff;
-			mask2 = 0xffffffff;
-			for (j = 0; j < first_off; j++)
-				mask >>= 1;
-			for (j = 8 - count - first_off; j > 0; j--)
-				mask2 <<= 1;
-			mask &= mask2;
-			if (mask) {
-				*first &= ~mask;
-				*first |= simple_strtoul (tmp, NULL, 16);
-				op->flag |= OPP_DIRTY;
+	} else {
+		if (len & 3) {
+			while (len) {
+				len--;
+				if (len)
+					seq_printf(f, "%02x.",
+						   *(unsigned char *) pval);
+				else
+					seq_printf(f, "%02x",
+						   *(unsigned char *) pval);
+				pval++;
 			}
 		} else {
-			op->flag |= OPP_DIRTY;
-			for (q = first; q < last; q++) {
-				if (q == first) {
-					if (first_off < 8) {
-						memset (tmp, '0', 8);
-						if (copy_from_user(tmp + first_off,
-								   buf,
-								   8 - first_off))
-							return -EFAULT;
-						mask = 0xffffffff;
-						for (j = 0; j < first_off; j++)
-							mask >>= 1;
-						*q &= ~mask;
-						*q |= simple_strtoul (tmp,NULL,16);
-					}
-					buf += 9;
-				} else if ((q == last - 1) && last_cnt
-					   && (last_cnt < 8)) {
-					memset (tmp, '0', 8);
-					if (copy_from_user(tmp, buf, last_cnt))
-						return -EFAULT;
-					mask = 0xffffffff;
-					for (j = 0; j < 8 - last_cnt; j++)
-						mask <<= 1;
-					*q &= ~mask;
-					*q |= simple_strtoul (tmp, NULL, 16);
-					buf += last_cnt;
-				} else {
-					char tchars[2 * sizeof(long) + 1];
-
-					if (copy_from_user(tchars, buf, sizeof(tchars) - 1))
-						return -EFAULT;
-                                        tchars[sizeof(tchars) - 1] = '\0';
-					*q = simple_strtoul (tchars, NULL, 16);
-					buf += 9;
-				}
-			}
-		}
-		if (!forcelen) {
-			if (op->len < i)
-				op->len = i;
-		} else
-			op->len = i;
-		*ppos += count;
-	}
-write_try_string:
-	if (!(op->flag & OPP_BINARY)) {
-		if (!(op->flag & (OPP_QUOTED | OPP_NOTQUOTED))) {
-			char ctmp;
-
-			/* No way, if somebody starts writing from the middle, 
-			 * we don't know whether he uses quotes around or not 
-			 */
-			if (k > 0)
-				return -EINVAL;
-			if (get_user(ctmp, buf))
-				return -EFAULT;
-			if (ctmp == '\'') {
-				op->flag |= OPP_QUOTED;
-				buf++;
-				count--;
-				(*ppos)++;
-				if (!count) {
-					op->flag |= OPP_STRING;
-					return 1;
-				}
-			} else
-				op->flag |= OPP_NOTQUOTED;
-		}
-		op->flag |= OPP_STRING;
-		if (op->alloclen <= count + *ppos) {
-			b = kmalloc (sizeof (openprom_property)
-				     + 2 * (count + *ppos), GFP_KERNEL);
-			if (!b)
-				return -ENOMEM;
-			memcpy (b, filp->private_data,
-				sizeof (openprom_property)
-				+ strlen (op->name) + op->alloclen);
-			memset (b + sizeof (openprom_property)
-				+ strlen (op->name) + op->alloclen, 
-				0, 2*(count - *ppos) - op->alloclen);
-			op = b;
-			op->alloclen = 2*(count + *ppos);
-			b = filp->private_data;
-			filp->private_data = op;
-			kfree (b);
-		}
-		p = op->value + *ppos - ((op->flag & OPP_QUOTED) ? 1 : 0);
-		if (copy_from_user(p, buf, count))
-			return -EFAULT;
-		op->flag |= OPP_DIRTY;
-		for (i = 0; i < count; i++, p++)
-			if (*p == '\n') {
-				*p = 0;
-				break;
+			while (len >= 4) {
+				len -= 4;
+
+				if (len)
+					seq_printf(f, "%08x.",
+						   *(unsigned int *) pval);
+				else
+					seq_printf(f, "%08x",
+						   *(unsigned int *) pval);
+				pval += 4;
 			}
-		if (i < count) {
-			op->len = p - op->value;
-			*ppos += i + 1;
-			if ((p > op->value) && (op->flag & OPP_QUOTED)
-			    && (*(p - 1) == '\''))
-				op->len--;
-		} else {
-			if (p - op->value > op->len)
-				op->len = p - op->value;
-			*ppos += count;
 		}
 	}
-	return *ppos - k;
+	seq_printf(f, "\n");
+
+	return 0;
 }
 
-int property_release (struct inode *inode, struct file *filp)
+static void *property_start(struct seq_file *f, loff_t *pos)
 {
-	openprom_property *op = filp->private_data;
-	int error;
-	u32 node;
-	
-	if (!op)
-		return 0;
-	lock_kernel();
-	node = nodes[ptr_nod(inode->u.generic_ip)].node;
-	if (ptr_nod(inode->u.generic_ip) == aliases) {
-		if ((op->flag & OPP_DIRTY) && (op->flag & OPP_STRING)) {
-			char *p = op->name;
-			int i = (op->value - op->name) - strlen (op->name) - 1;
-			op->value [op->len] = 0;
-			*(op->value - 1) = ' ';
-			if (i) {
-				for (p = op->value - i - 2; p >= op->name; p--)
-					p[i] = *p;
-				p = op->name + i;
-			}
-			memcpy (p - 8, "nvalias ", 8);
-			prom_feval (p - 8);
-		}
-	} else if (op->flag & OPP_DIRTY) {
-		if (op->flag & OPP_STRING) {
-			op->value [op->len] = 0;
-			error = prom_setprop (node, op->name,
-					      op->value, op->len + 1);
-			if (error <= 0)
-				printk (KERN_WARNING "openpromfs: "
-					"Couldn't write property %s\n",
-					op->name);
-		} else if ((op->flag & OPP_BINARY) || !op->len) {
-			error = prom_setprop (node, op->name,
-					      op->value, op->len);
-			if (error <= 0)
-				printk (KERN_WARNING "openpromfs: "
-					"Couldn't write property %s\n",
-					op->name);
-		} else {
-			printk (KERN_WARNING "openpromfs: "
-				"Unknown property type of %s\n",
-				op->name);
-		}
+	if (*pos == 0)
+		return pos;
+	return NULL;
+}
+
+static void *property_next(struct seq_file *f, void *v, loff_t *pos)
+{
+	(*pos)++;
+	return NULL;
+}
+
+static void property_stop(struct seq_file *f, void *v)
+{
+	/* Nothing to do */
+}
+
+static struct seq_operations property_op = {
+	.start		= property_start,
+	.next		= property_next,
+	.stop		= property_stop,
+	.show		= property_show
+};
+
+static int property_open(struct inode *inode, struct file *file)
+{
+	struct op_inode_info *oi = OP_I(inode);
+	int ret;
+
+	BUG_ON(oi->type != op_inode_prop);
+
+	ret = seq_open(file, &property_op);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+		m->private = oi->u.prop;
 	}
-	unlock_kernel();
-	kfree (filp->private_data);
-	return 0;
+	return ret;
 }
 
 static const struct file_operations openpromfs_prop_ops = {
-	.read		= property_read,
-	.write		= property_write,
-	.release	= property_release,
+	.open		= property_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
 };
 
-static const struct file_operations openpromfs_nodenum_ops = {
-	.read		= nodenum_read,
-};
+static int openpromfs_readdir(struct file *, void *, filldir_t);
 
 static const struct file_operations openprom_operations = {
 	.read		= generic_read_dir,
 	.readdir	= openpromfs_readdir,
 };
 
-static struct inode_operations openprom_alias_inode_operations = {
-	.create		= openpromfs_create,
-	.lookup		= openpromfs_lookup,
-	.unlink		= openpromfs_unlink,
-};
+static struct dentry *openpromfs_lookup(struct inode *, struct dentry *, struct nameidata *);
 
 static struct inode_operations openprom_inode_operations = {
 	.lookup		= openpromfs_lookup,
 };
 
-static int lookup_children(u16 n, const char * name, int len)
+static struct dentry *openpromfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 {
-	int ret;
-	u16 node;
-	for (; n != 0xffff; n = nodes[n].next) {
-		node = nodes[n].child;
-		if (node != 0xffff) {
-			char buffer[128];
-			int i;
-			char *p;
-			
-			while (node != 0xffff) {
-				if (prom_getname (nodes[node].node,
-						  buffer, 128) >= 0) {
-					i = strlen (buffer);
-					if ((len == i)
-					    && !strncmp (buffer, name, len))
-						return NODE2INO(node);
-					p = strchr (buffer, '@');
-					if (p && (len == p - buffer)
-					    && !strncmp (buffer, name, len))
-						return NODE2INO(node);
-				}
-				node = nodes[node].next;
-			}
-		} else
-			continue;
-		ret = lookup_children (nodes[n].child, name, len);
-		if (ret) return ret;
-	}
-	return 0;
-}
-
-static struct dentry *openpromfs_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
-{
-	int ino = 0;
-#define OPFSL_DIR	0
-#define OPFSL_PROPERTY	1
-#define OPFSL_NODENUM	2
-	int type = 0;
-	char buffer[128];
-	char *p;
+	struct op_inode_info *ent_oi, *oi = OP_I(dir);
+	struct device_node *dp, *child;
+	struct property *prop;
+	enum op_inode_type ent_type;
+	union op_inode_data ent_data;
 	const char *name;
-	u32 n;
-	u16 dirnode;
-	unsigned int len;
-	int i;
 	struct inode *inode;
-	char buffer2[64];
+	unsigned int ino;
+	int len;
 	
-	inode = NULL;
+	BUG_ON(oi->type != op_inode_node);
+
+	dp = oi->u.node;
+
 	name = dentry->d_name.name;
 	len = dentry->d_name.len;
-	lock_kernel();
-	if (name [0] == '.' && len == 5 && !strncmp (name + 1, "node", 4)) {
-		ino = NODEP2INO(NODE(dir->i_ino).first_prop);
-		type = OPFSL_NODENUM;
-	}
-	if (!ino) {
-		u16 node = NODE(dir->i_ino).child;
-		while (node != 0xffff) {
-			if (prom_getname (nodes[node].node, buffer, 128) >= 0) {
-				i = strlen (buffer);
-				if (len == i && !strncmp (buffer, name, len)) {
-					ino = NODE2INO(node);
-					type = OPFSL_DIR;
-					break;
-				}
-				p = strchr (buffer, '@');
-				if (p && (len == p - buffer)
-				    && !strncmp (buffer, name, len)) {
-					ino = NODE2INO(node);
-					type = OPFSL_DIR;
-					break;
-				}
-			}
-			node = nodes[node].next;
-		}
-	}
-	n = NODE(dir->i_ino).node;
-	dirnode = dir->i_ino - OPENPROM_FIRST_INO;
-	if (!ino) {
-		int j = NODEP2INO(NODE(dir->i_ino).first_prop);
-		if (dirnode != aliases) {
-			for (p = prom_firstprop (n, buffer2);
-			     p && *p;
-			     p = prom_nextprop (n, p, buffer2)) {
-				j++;
-				if ((len == strlen (p))
-				    && !strncmp (p, name, len)) {
-					ino = j;
-					type = OPFSL_PROPERTY;
-					break;
-				}
-			}
-		} else {
-			int k;
-			for (k = 0; k < aliases_nodes; k++) {
-				j++;
-				if (alias_names [k]
-				    && (len == strlen (alias_names [k]))
-				    && !strncmp (alias_names [k], name, len)) {
-					ino = j;
-					type = OPFSL_PROPERTY;
-					break;
-				}
-			}
+
+	mutex_lock(&op_mutex);
+
+	child = dp->child;
+	while (child) {
+		int n = strlen(child->path_component_name);
+
+		if (len == n &&
+		    !strncmp(child->path_component_name, name, len)) {
+			ent_type = op_inode_node;
+			ent_data.node = child;
+			ino = child->unique_id;
+			goto found;
 		}
+		child = child->sibling;
 	}
-	if (!ino) {
-		ino = lookup_children (NODE(dir->i_ino).child, name, len);
-		if (ino)
-			type = OPFSL_DIR;
-		else {
-			unlock_kernel();
-			return ERR_PTR(-ENOENT);
+
+	prop = dp->properties;
+	while (prop) {
+		int n = strlen(prop->name);
+
+		if (len == n && !strncmp(prop->name, name, len)) {
+			ent_type = op_inode_prop;
+			ent_data.prop = prop;
+			ino = prop->unique_id;
+			goto found;
 		}
+
+		prop = prop->next;
 	}
-	inode = iget (dir->i_sb, ino);
-	unlock_kernel();
+
+	mutex_unlock(&op_mutex);
+	return ERR_PTR(-ENOENT);
+
+found:
+	inode = iget(dir->i_sb, ino);
+	mutex_unlock(&op_mutex);
 	if (!inode)
 		return ERR_PTR(-EINVAL);
-	switch (type) {
-	case OPFSL_DIR:
+	ent_oi = OP_I(inode);
+	ent_oi->type = ent_type;
+	ent_oi->u = ent_data;
+
+	switch (ent_type) {
+	case op_inode_node:
 		inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
-		if (ino == OPENPROM_FIRST_INO + aliases) {
-			inode->i_mode |= S_IWUSR;
-			inode->i_op = &openprom_alias_inode_operations;
-		} else
-			inode->i_op = &openprom_inode_operations;
+		inode->i_op = &openprom_inode_operations;
 		inode->i_fop = &openprom_operations;
 		inode->i_nlink = 2;
 		break;
-	case OPFSL_NODENUM:
-		inode->i_mode = S_IFREG | S_IRUGO;
-		inode->i_fop = &openpromfs_nodenum_ops;
-		inode->i_nlink = 1;
-		inode->u.generic_ip = (void *)(long)(n);
-		break;
-	case OPFSL_PROPERTY:
-		if ((dirnode == options) && (len == 17)
-		    && !strncmp (name, "security-password", 17))
+	case op_inode_prop:
+		if (!strcmp(dp->name, "options") && (len == 17) &&
+		    !strncmp (name, "security-password", 17))
 			inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
-		else {
+		else
 			inode->i_mode = S_IFREG | S_IRUGO;
-			if (dirnode == options || dirnode == aliases) {
-				if (len != 4 || strncmp (name, "name", 4))
-					inode->i_mode |= S_IWUSR;
-			}
-		}
 		inode->i_fop = &openpromfs_prop_ops;
 		inode->i_nlink = 1;
-		if (inode->i_size < 0)
-			inode->i_size = 0;
-		inode->u.generic_ip = (void *)(long)(((u16)dirnode) | 
-			(((u16)(ino - NODEP2INO(NODE(dir->i_ino).first_prop) - 1)) << 16));
+		inode->i_size = ent_oi->u.prop->length;
 		break;
 	}
 
@@ -781,237 +263,89 @@ static struct dentry *openpromfs_lookup(struct inode * dir, struct dentry *dentr
 static int openpromfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
 {
 	struct inode *inode = filp->f_dentry->d_inode;
+	struct op_inode_info *oi = OP_I(inode);
+	struct device_node *dp = oi->u.node;
+	struct device_node *child;
+	struct property *prop;
 	unsigned int ino;
-	u32 n;
-	int i, j;
-	char buffer[128];
-	u16 node;
-	char *p;
-	char buffer2[64];
-
-	lock_kernel();
+	int i;
+
+	mutex_lock(&op_mutex);
 	
 	ino = inode->i_ino;
 	i = filp->f_pos;
 	switch (i) {
 	case 0:
-		if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) goto out;
+		if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
+			goto out;
 		i++;
 		filp->f_pos++;
 		/* fall thru */
 	case 1:
-		if (filldir(dirent, "..", 2, i, 
-			(NODE(ino).parent == 0xffff) ? 
-			OPENPROM_ROOT_INO : NODE2INO(NODE(ino).parent), DT_DIR) < 0) 
+		if (filldir(dirent, "..", 2, i,
+			    (dp->parent == NULL ?
+			     OPENPROM_ROOT_INO :
+			     dp->parent->unique_id), DT_DIR) < 0) 
 			goto out;
 		i++;
 		filp->f_pos++;
 		/* fall thru */
 	default:
 		i -= 2;
-		node = NODE(ino).child;
-		while (i && node != 0xffff) {
-			node = nodes[node].next;
+
+		/* First, the children nodes as directories.  */
+		child = dp->child;
+		while (i && child) {
+			child = child->sibling;
 			i--;
 		}
-		while (node != 0xffff) {
-			if (prom_getname (nodes[node].node, buffer, 128) < 0)
-				goto out;
-			if (filldir(dirent, buffer, strlen(buffer),
-				    filp->f_pos, NODE2INO(node), DT_DIR) < 0)
+		while (child) {
+			if (filldir(dirent,
+				    child->path_component_name,
+				    strlen(child->path_component_name),
+				    filp->f_pos, child->unique_id, DT_DIR) < 0)
 				goto out;
+
 			filp->f_pos++;
-			node = nodes[node].next;
+			child = child->sibling;
 		}
-		j = NODEP2INO(NODE(ino).first_prop);
-		if (!i) {
-			if (filldir(dirent, ".node", 5, filp->f_pos, j, DT_REG) < 0)
+
+		/* Next, the properties as files.  */
+		prop = dp->properties;
+		while (i && prop) {
+			prop = prop->next;
+			i--;
+		}
+		while (prop) {
+			if (filldir(dirent, prop->name, strlen(prop->name),
+				    filp->f_pos, prop->unique_id, DT_REG) < 0)
 				goto out;
+
 			filp->f_pos++;
-		} else
-			i--;
-		n = NODE(ino).node;
-		if (ino == OPENPROM_FIRST_INO + aliases) {
-			for (j++; i < aliases_nodes; i++, j++) {
-				if (alias_names [i]) {
-					if (filldir (dirent, alias_names [i], 
-						strlen (alias_names [i]), 
-						filp->f_pos, j, DT_REG) < 0) goto out; 
-					filp->f_pos++;
-				}
-			}
-		} else {
-			for (p = prom_firstprop (n, buffer2);
-			     p && *p;
-			     p = prom_nextprop (n, p, buffer2)) {
-				j++;
-				if (i) i--;
-				else {
-					if (filldir(dirent, p, strlen(p),
-						    filp->f_pos, j, DT_REG) < 0)
-						goto out;
-					filp->f_pos++;
-				}
-			}
+			prop = prop->next;
 		}
 	}
 out:
-	unlock_kernel();
-	return 0;
-}
-
-static int openpromfs_create (struct inode *dir, struct dentry *dentry, int mode,
-		struct nameidata *nd)
-{
-	char *p;
-	struct inode *inode;
-	
-	if (!dir)
-		return -ENOENT;
-	if (dentry->d_name.len > 256)
-		return -EINVAL;
-	p = kmalloc (dentry->d_name.len + 1, GFP_KERNEL);
-	if (!p)
-		return -ENOMEM;
-	strncpy (p, dentry->d_name.name, dentry->d_name.len);
-	p [dentry->d_name.len] = 0;
-	lock_kernel();
-	if (aliases_nodes == ALIASES_NNODES) {
-		kfree(p);
-		unlock_kernel();
-		return -EIO;
-	}
-	alias_names [aliases_nodes++] = p;
-	inode = iget (dir->i_sb,
-			NODEP2INO(NODE(dir->i_ino).first_prop) + aliases_nodes);
-	if (!inode) {
-		unlock_kernel();
-		return -EINVAL;
-	}
-	inode->i_mode = S_IFREG | S_IRUGO | S_IWUSR;
-	inode->i_fop = &openpromfs_prop_ops;
-	inode->i_nlink = 1;
-	if (inode->i_size < 0) inode->i_size = 0;
-	inode->u.generic_ip = (void *)(long)(((u16)aliases) | 
-			(((u16)(aliases_nodes - 1)) << 16));
-	unlock_kernel();
-	d_instantiate(dentry, inode);
+	mutex_unlock(&op_mutex);
 	return 0;
 }
 
-static int openpromfs_unlink (struct inode *dir, struct dentry *dentry)
-{
-	unsigned int len;
-	char *p;
-	const char *name;
-	int i;
-	
-	name = dentry->d_name.name;
-	len = dentry->d_name.len;
-	lock_kernel();
-	for (i = 0; i < aliases_nodes; i++)
-		if ((strlen (alias_names [i]) == len)
-		    && !strncmp (name, alias_names[i], len)) {
-			char buffer[512];
-			
-			p = alias_names [i];
-			alias_names [i] = NULL;
-			kfree (p);
-			strcpy (buffer, "nvunalias ");
-			memcpy (buffer + 10, name, len);
-			buffer [10 + len] = 0;
-			prom_feval (buffer);
-		}
-	unlock_kernel();
-	return 0;
-}
+static kmem_cache_t *op_inode_cachep;
 
-/* {{{ init section */
-static int __init check_space (u16 n)
+static struct inode *openprom_alloc_inode(struct super_block *sb)
 {
-	unsigned long pages;
+	struct op_inode_info *oi;
 
-	if ((1 << alloced) * PAGE_SIZE < (n + 2) * sizeof(openpromfs_node)) {
-		pages = __get_free_pages (GFP_KERNEL, alloced + 1);
-		if (!pages)
-			return -1;
+	oi = kmem_cache_alloc(op_inode_cachep, SLAB_KERNEL);
+	if (!oi)
+		return NULL;
 
-		if (nodes) {
-			memcpy ((char *)pages, nodes,
-				(1 << alloced) * PAGE_SIZE);
-			free_pages ((unsigned long)nodes, alloced);
-		}
-		alloced++;
-		nodes = (openpromfs_node *)pages;
-	}
-	return 0;
+	return &oi->vfs_inode;
 }
 
-static u16 __init get_nodes (u16 parent, u32 node)
+static void openprom_destroy_inode(struct inode *inode)
 {
-	char *p;
-	u16 n = last_node++, i;
-	char buffer[64];
-
-	if (check_space (n) < 0)
-		return 0xffff;
-	nodes[n].parent = parent;
-	nodes[n].node = node;
-	nodes[n].next = 0xffff;
-	nodes[n].child = 0xffff;
-	nodes[n].first_prop = first_prop++;
-	if (!parent) {
-		char buffer[8];
-		int j;
-		
-		if ((j = prom_getproperty (node, "name", buffer, 8)) >= 0) {
-		    buffer[j] = 0;
-		    if (!strcmp (buffer, "options"))
-			options = n;
-		    else if (!strcmp (buffer, "aliases"))
-		        aliases = n;
-		}
-	}
-	if (n != aliases)
-		for (p = prom_firstprop (node, buffer);
-		     p && p != (char *)-1 && *p;
-		     p = prom_nextprop (node, p, buffer))
-			first_prop++;
-	else {
-		char *q;
-		for (p = prom_firstprop (node, buffer);
-		     p && p != (char *)-1 && *p;
-		     p = prom_nextprop (node, p, buffer)) {
-			if (aliases_nodes == ALIASES_NNODES)
-				break;
-			for (i = 0; i < aliases_nodes; i++)
-				if (!strcmp (p, alias_names [i]))
-					break;
-			if (i < aliases_nodes)
-				continue;
-			q = kmalloc (strlen (p) + 1, GFP_KERNEL);
-			if (!q)
-				return 0xffff;
-			strcpy (q, p);
-			alias_names [aliases_nodes++] = q;
-		}
-		first_prop += ALIASES_NNODES;
-	}
-	node = prom_getchild (node);
-	if (node) {
-		parent = get_nodes (n, node);
-		if (parent == 0xffff)
-			return 0xffff;
-		nodes[n].child = parent;
-		while ((node = prom_getsibling (node)) != 0) {
-			i = get_nodes (n, node);
-			if (i == 0xffff)
-				return 0xffff;
-			nodes[parent].next = i;
-			parent = i;
-		}
-	}
-	return n;
+	kmem_cache_free(op_inode_cachep, OP_I(inode));
 }
 
 static void openprom_read_inode(struct inode * inode)
@@ -1031,6 +365,8 @@ static int openprom_remount(struct super_block *sb, int *flags, char *data)
 }
 
 static struct super_operations openprom_sops = { 
+	.alloc_inode	= openprom_alloc_inode,
+	.destroy_inode	= openprom_destroy_inode,
 	.read_inode	= openprom_read_inode,
 	.statfs		= simple_statfs,
 	.remount_fs	= openprom_remount,
@@ -1038,7 +374,8 @@ static struct super_operations openprom_sops = {
 
 static int openprom_fill_super(struct super_block *s, void *data, int silent)
 {
-	struct inode * root_inode;
+	struct inode *root_inode;
+	struct op_inode_info *oi;
 
 	s->s_flags |= MS_NOATIME;
 	s->s_blocksize = 1024;
@@ -1049,6 +386,11 @@ static int openprom_fill_super(struct super_block *s, void *data, int silent)
 	root_inode = iget(s, OPENPROM_ROOT_INO);
 	if (!root_inode)
 		goto out_no_root;
+
+	oi = OP_I(root_inode);
+	oi->type = op_inode_node;
+	oi->u.node = of_find_node_by_path("/");
+
 	s->s_root = d_alloc_root(root_inode);
 	if (!s->s_root)
 		goto out_no_root;
@@ -1073,29 +415,39 @@ static struct file_system_type openprom_fs_type = {
 	.kill_sb	= kill_anon_super,
 };
 
+static void op_inode_init_once(void *data, kmem_cache_t * cachep, unsigned long flags)
+{
+	struct op_inode_info *oi = (struct op_inode_info *) data;
+
+	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+	    SLAB_CTOR_CONSTRUCTOR)
+		inode_init_once(&oi->vfs_inode);
+}
+
 static int __init init_openprom_fs(void)
 {
-	nodes = (openpromfs_node *)__get_free_pages(GFP_KERNEL, 0);
-	if (!nodes) {
-		printk (KERN_WARNING "openpromfs: can't get free page\n");
-		return -EIO;
-	}
-	if (get_nodes (0xffff, prom_root_node) == 0xffff) {
-		printk (KERN_WARNING "openpromfs: couldn't setup tree\n");
-		return -EIO;
-	}
-	nodes[last_node].first_prop = first_prop;
-	return register_filesystem(&openprom_fs_type);
+	int err;
+
+	op_inode_cachep = kmem_cache_create("op_inode_cache",
+					    sizeof(struct op_inode_info),
+					    0,
+					    (SLAB_RECLAIM_ACCOUNT |
+					     SLAB_MEM_SPREAD),
+					    op_inode_init_once, NULL);
+	if (!op_inode_cachep)
+		return -ENOMEM;
+
+	err = register_filesystem(&openprom_fs_type);
+	if (err)
+		kmem_cache_destroy(op_inode_cachep);
+
+	return err;
 }
 
 static void __exit exit_openprom_fs(void)
 {
-	int i;
 	unregister_filesystem(&openprom_fs_type);
-	free_pages ((unsigned long)nodes, alloced);
-	for (i = 0; i < aliases_nodes; i++)
-		kfree (alias_names [i]);
-	nodes = NULL;
+	kmem_cache_destroy(op_inode_cachep);
 }
 
 module_init(init_openprom_fs)
-- 
cgit v1.2.3


From 124a27fe32398a69d16bae374aeb17ad67a0ebbf Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 26 Jun 2006 13:47:59 +0000
Subject: [CIFS] Remove calls to to take f_owner.lock

CIFS takes/releases f_owner.lock - why?  It does not change anything in the
fowner state.  Remove this locking.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/file.c | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 616b140534b..e9c1573f6aa 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -110,7 +110,6 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
 			 &pCifsInode->openFileList);
 	}
 	write_unlock(&GlobalSMBSeslock);
-	write_unlock(&file->f_owner.lock);
 	if (pCifsInode->clientCanCacheRead) {
 		/* we have the inode open somewhere else
 		   no need to discard cache data */
@@ -287,7 +286,6 @@ int cifs_open(struct inode *inode, struct file *file)
 		goto out;
 	}
 	pCifsFile = cifs_init_private(file->private_data, inode, file, netfid);
-	write_lock(&file->f_owner.lock);
 	write_lock(&GlobalSMBSeslock);
 	list_add(&pCifsFile->tlist, &pTcon->openFileList);
 
@@ -298,7 +296,6 @@ int cifs_open(struct inode *inode, struct file *file)
 					    &oplock, buf, full_path, xid);
 	} else {
 		write_unlock(&GlobalSMBSeslock);
-		write_unlock(&file->f_owner.lock);
 	}
 
 	if (oplock & CIFS_CREATE_ACTION) {           
@@ -477,7 +474,6 @@ int cifs_close(struct inode *inode, struct file *file)
 	pTcon = cifs_sb->tcon;
 	if (pSMBFile) {
 		pSMBFile->closePend = TRUE;
-		write_lock(&file->f_owner.lock);
 		if (pTcon) {
 			/* no sense reconnecting to close a file that is
 			   already closed */
@@ -492,23 +488,18 @@ int cifs_close(struct inode *inode, struct file *file)
 					the struct would be in each open file,
 					but this should give enough time to 
 					clear the socket */
-					write_unlock(&file->f_owner.lock);
 					cERROR(1,("close with pending writes"));
 					msleep(timeout);
-					write_lock(&file->f_owner.lock);
 					timeout *= 4;
 				} 
-				write_unlock(&file->f_owner.lock);
 				rc = CIFSSMBClose(xid, pTcon,
 						  pSMBFile->netfid);
-				write_lock(&file->f_owner.lock);
 			}
 		}
 		write_lock(&GlobalSMBSeslock);
 		list_del(&pSMBFile->flist);
 		list_del(&pSMBFile->tlist);
 		write_unlock(&GlobalSMBSeslock);
-		write_unlock(&file->f_owner.lock);
 		kfree(pSMBFile->search_resume_name);
 		kfree(file->private_data);
 		file->private_data = NULL;
-- 
cgit v1.2.3


From d6e05edc59ecd79e8badf440c0d295a979bdfa3e Mon Sep 17 00:00:00 2001
From: Andreas Mohr <andi@lisas.de>
Date: Mon, 26 Jun 2006 18:35:02 +0200
Subject: spelling fixes

acquired (aquired)
contiguous (contigious)
successful (succesful, succesfull)
surprise (suprise)
whether (weather)
some other misspellings

Signed-off-by: Andreas Mohr <andi@lisas.de>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
---
 fs/9p/mux.c         | 2 +-
 fs/aio.c            | 2 +-
 fs/jffs2/summary.c  | 2 +-
 fs/jfs/jfs_extent.c | 8 ++++----
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/mux.c b/fs/9p/mux.c
index f4407eb276c..12e1baa4508 100644
--- a/fs/9p/mux.c
+++ b/fs/9p/mux.c
@@ -712,7 +712,7 @@ static void v9fs_read_work(void *a)
  * v9fs_send_request - send 9P request
  * The function can sleep until the request is scheduled for sending.
  * The function can be interrupted. Return from the function is not
- * a guarantee that the request is sent succesfully. Can return errors
+ * a guarantee that the request is sent successfully. Can return errors
  * that can be retrieved by PTR_ERR macros.
  *
  * @m: mux data
diff --git a/fs/aio.c b/fs/aio.c
index 8c34a62df7d..950630187ac 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -641,7 +641,7 @@ static inline int __queue_kicked_iocb(struct kiocb *iocb)
  *	invoked both for initial i/o submission and
  *	subsequent retries via the aio_kick_handler.
  *	Expects to be invoked with iocb->ki_ctx->lock
- *	already held. The lock is released and reaquired
+ *	already held. The lock is released and reacquired
  *	as needed during processing.
  *
  * Calls the iocb retry method (already setup for the
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 0b02fc79e4d..be1acc3dad9 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -43,7 +43,7 @@ int jffs2_sum_init(struct jffs2_sb_info *c)
 		return -ENOMEM;
 	}
 
-	dbg_summary("returned succesfully\n");
+	dbg_summary("returned successfully\n");
 
 	return 0;
 }
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index 5549378358b..4d52593a5fc 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -126,7 +126,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, boolean_t abnr)
 
 	/* allocate the disk blocks for the extent.  initially, extBalloc()
 	 * will try to allocate disk blocks for the requested size (xlen). 
-	 * if this fails (xlen contigious free blocks not avaliable), it'll
+	 * if this fails (xlen contiguous free blocks not avaliable), it'll
 	 * try to allocate a smaller number of blocks (producing a smaller
 	 * extent), with this smaller number of blocks consisting of the
 	 * requested number of blocks rounded down to the next smaller
@@ -493,7 +493,7 @@ int extFill(struct inode *ip, xad_t * xp)
  *
  *		initially, we will try to allocate disk blocks for the
  *		requested size (nblocks).  if this fails (nblocks 
- *		contigious free blocks not avaliable), we'll try to allocate
+ *		contiguous free blocks not avaliable), we'll try to allocate
  *		a smaller number of blocks (producing a smaller extent), with
  *		this smaller number of blocks consisting of the requested
  *		number of blocks rounded down to the next smaller power of 2
@@ -529,7 +529,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
 
 	/* get the number of blocks to initially attempt to allocate.
 	 * we'll first try the number of blocks requested unless this
-	 * number is greater than the maximum number of contigious free
+	 * number is greater than the maximum number of contiguous free
 	 * blocks in the map. in that case, we'll start off with the 
 	 * maximum free.
 	 */
@@ -586,7 +586,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
  *		in place.  if this fails, we'll try to move the extent
  *		to a new set of blocks. if moving the extent, we initially
  *		will try to allocate disk blocks for the requested size
- *		(nnew).  if this fails 	(nnew contigious free blocks not
+ *		(nnew).  if this fails 	(new contiguous free blocks not
  *		avaliable), we'll try  to allocate a smaller number of
  *		blocks (producing a smaller extent), with this smaller
  *		number of blocks consisting of the requested number of
-- 
cgit v1.2.3


From 8e13059a37252c45ab7173a0e4bac05e4a444ab6 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <mita@miraclelinux.com>
Date: Mon, 26 Jun 2006 00:24:37 -0700
Subject: [PATCH] use list_add_tail() instead of list_add()

This patch converts list_add(A, B.prev) to list_add_tail(A, &B) for
readability.

Acked-by: Karsten Keil <kkeil@suse.de>
Cc: Jan Harkes <jaharkes@cs.cmu.edu>
Acked-by: Jan Kara <jack@suse.cz>
AOLed-by: David Woodhouse <dwmw2@infradead.org>
Cc: Sridhar Samudrala <sri@us.ibm.com>
Signed-off-by: Akinobu Mita <mita@miraclelinux.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/coda/psdev.c  | 2 +-
 fs/coda/upcall.c | 2 +-
 fs/dcache.c      | 2 +-
 fs/dquot.c       | 4 ++--
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 6c6771db36d..7caee8d8ea3 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -259,7 +259,7 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
 	/* If request was not a signal, enqueue and don't free */
 	if (!(req->uc_flags & REQ_ASYNC)) {
 		req->uc_flags |= REQ_READ;
-		list_add(&(req->uc_chain), vcp->vc_processing.prev);
+		list_add_tail(&(req->uc_chain), &vcp->vc_processing);
 		goto out;
 	}
 
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index b040eba13a7..a5b5e631ba6 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -725,7 +725,7 @@ static int coda_upcall(struct coda_sb_info *sbi,
 	((union inputArgs *)buffer)->ih.unique = req->uc_unique;
 
 	/* Append msg to pending queue and poke Venus. */
-	list_add(&(req->uc_chain), vcommp->vc_pending.prev);
+	list_add_tail(&(req->uc_chain), &vcommp->vc_pending);
         
 	wake_up_interruptible(&vcommp->vc_waitq);
 	/* We can be interrupted while we wait for Venus to process
diff --git a/fs/dcache.c b/fs/dcache.c
index b85fda36053..6aa635fbfa0 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -638,7 +638,7 @@ resume:
 		 * of the unused list for prune_dcache
 		 */
 		if (!atomic_read(&dentry->d_count)) {
-			list_add(&dentry->d_lru, dentry_unused.prev);
+			list_add_tail(&dentry->d_lru, &dentry_unused);
 			dentry_stat.nr_unused++;
 			found++;
 		}
diff --git a/fs/dquot.c b/fs/dquot.c
index 81d87a413c6..0122a279106 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -250,7 +250,7 @@ static inline struct dquot *find_dquot(unsigned int hashent, struct super_block
 /* Add a dquot to the tail of the free list */
 static inline void put_dquot_last(struct dquot *dquot)
 {
-	list_add(&dquot->dq_free, free_dquots.prev);
+	list_add_tail(&dquot->dq_free, &free_dquots);
 	dqstats.free_dquots++;
 }
 
@@ -266,7 +266,7 @@ static inline void put_inuse(struct dquot *dquot)
 {
 	/* We add to the back of inuse list so we don't have to restart
 	 * when traversing this list and we block */
-	list_add(&dquot->dq_inuse, inuse_list.prev);
+	list_add_tail(&dquot->dq_inuse, &inuse_list);
 	dqstats.allocated_dquots++;
 }
 
-- 
cgit v1.2.3


From 1bfba4e8ea0e555e3a0296051517d96253660ccc Mon Sep 17 00:00:00 2001
From: Akinobu Mita <mita@miraclelinux.com>
Date: Mon, 26 Jun 2006 00:24:40 -0700
Subject: [PATCH] core: use list_move()

This patch converts the combination of list_del(A) and list_add(A, B) to
list_move(A, B).

Cc: Greg Kroah-Hartman <gregkh@suse.de>
Cc: Ram Pai <linuxram@us.ibm.com>
Signed-off-by: Akinobu Mita <mita@miraclelinux.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/dcache.c    |  3 +--
 fs/libfs.c     | 10 ++++------
 fs/namespace.c |  6 ++----
 fs/pnode.c     |  9 +++------
 fs/sysfs/dir.c | 10 ++++------
 5 files changed, 14 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 6aa635fbfa0..48b44a714b3 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -522,8 +522,7 @@ void shrink_dcache_sb(struct super_block * sb)
 		dentry = list_entry(tmp, struct dentry, d_lru);
 		if (dentry->d_sb != sb)
 			continue;
-		list_del(tmp);
-		list_add(tmp, &dentry_unused);
+		list_move(tmp, &dentry_unused);
 	}
 
 	/*
diff --git a/fs/libfs.c b/fs/libfs.c
index fc785d8befb..ac02ea602c3 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -149,10 +149,9 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
 			/* fallthrough */
 		default:
 			spin_lock(&dcache_lock);
-			if (filp->f_pos == 2) {
-				list_del(q);
-				list_add(q, &dentry->d_subdirs);
-			}
+			if (filp->f_pos == 2)
+				list_move(q, &dentry->d_subdirs);
+
 			for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
 				struct dentry *next;
 				next = list_entry(p, struct dentry, d_u.d_child);
@@ -164,8 +163,7 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
 					return 0;
 				spin_lock(&dcache_lock);
 				/* next is still alive */
-				list_del(q);
-				list_add(q, p);
+				list_move(q, p);
 				p = q;
 				filp->f_pos++;
 			}
diff --git a/fs/namespace.c b/fs/namespace.c
index 866430bb024..b3ed212ea41 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -526,10 +526,8 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
 {
 	struct vfsmount *p;
 
-	for (p = mnt; p; p = next_mnt(p, mnt)) {
-		list_del(&p->mnt_hash);
-		list_add(&p->mnt_hash, kill);
-	}
+	for (p = mnt; p; p = next_mnt(p, mnt))
+		list_move(&p->mnt_hash, kill);
 
 	if (propagate)
 		propagate_umount(kill);
diff --git a/fs/pnode.c b/fs/pnode.c
index 37b568ed0e0..da42ee61c1d 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -53,8 +53,7 @@ static int do_make_slave(struct vfsmount *mnt)
 	if (master) {
 		list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave)
 			slave_mnt->mnt_master = master;
-		list_del(&mnt->mnt_slave);
-		list_add(&mnt->mnt_slave, &master->mnt_slave_list);
+		list_move(&mnt->mnt_slave, &master->mnt_slave_list);
 		list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev);
 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
 	} else {
@@ -283,10 +282,8 @@ static void __propagate_umount(struct vfsmount *mnt)
 		 * umount the child only if the child has no
 		 * other children
 		 */
-		if (child && list_empty(&child->mnt_mounts)) {
-			list_del(&child->mnt_hash);
-			list_add_tail(&child->mnt_hash, &mnt->mnt_hash);
-		}
+		if (child && list_empty(&child->mnt_mounts))
+			list_move_tail(&child->mnt_hash, &mnt->mnt_hash);
 	}
 }
 
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 610b5bdbe75..61c42430cba 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -430,10 +430,9 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
 			i++;
 			/* fallthrough */
 		default:
-			if (filp->f_pos == 2) {
-				list_del(q);
-				list_add(q, &parent_sd->s_children);
-			}
+			if (filp->f_pos == 2)
+				list_move(q, &parent_sd->s_children);
+
 			for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
 				struct sysfs_dirent *next;
 				const char * name;
@@ -455,8 +454,7 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
 						 dt_type(next)) < 0)
 					return 0;
 
-				list_del(q);
-				list_add(q, p);
+				list_move(q, p);
 				p = q;
 				filp->f_pos++;
 			}
-- 
cgit v1.2.3


From f116629d03655adaf7832b93b03c99391d09d4a7 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <mita@miraclelinux.com>
Date: Mon, 26 Jun 2006 00:24:46 -0700
Subject: [PATCH] fs: use list_move()

This patch converts the combination of list_del(A) and list_add(A, B) to
list_move(A, B) under fs/.

Cc: Ian Kent <raven@themaw.net>
Acked-by: Joel Becker <joel.becker@oracle.com>
Cc: Neil Brown <neilb@cse.unsw.edu.au>
Cc: Hans Reiser <reiserfs-dev@namesys.com>
Cc: Urban Widmark <urban@teststation.com>
Acked-by: David Howells <dhowells@redhat.com>
Acked-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Akinobu Mita <mita@miraclelinux.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/afs/cell.c              |  3 +--
 fs/afs/kafsasyncd.c        |  9 +++------
 fs/afs/server.c            |  6 ++----
 fs/afs/vlocation.c         |  6 ++----
 fs/afs/vnode.c             |  3 +--
 fs/autofs4/expire.c        |  3 +--
 fs/configfs/dir.c          |  6 ++----
 fs/jffs2/erase.c           | 15 +++++----------
 fs/jffs2/nodemgmt.c        |  3 +--
 fs/jffs2/wbuf.c            |  3 +--
 fs/nfsd/nfs4state.c        |  3 +--
 fs/nfsd/nfscache.c         |  3 +--
 fs/ocfs2/dlm/dlmast.c      |  3 +--
 fs/ocfs2/dlm/dlmconvert.c  |  9 +++------
 fs/ocfs2/dlm/dlmlock.c     |  3 +--
 fs/ocfs2/dlm/dlmrecovery.c |  9 +++------
 fs/ocfs2/dlm/dlmthread.c   |  6 ++----
 fs/ocfs2/dlm/dlmunlock.c   |  3 +--
 fs/ocfs2/journal.c         |  3 +--
 fs/reiserfs/journal.c      |  6 ++----
 fs/smbfs/request.c         |  6 ++----
 fs/smbfs/smbiod.c          |  3 +--
 22 files changed, 38 insertions(+), 76 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 009a9ae88d6..bfc1fd22d5b 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -413,8 +413,7 @@ int afs_server_find_by_peer(const struct rxrpc_peer *peer,
 
 	/* we found it in the graveyard - resurrect it */
  found_dead_server:
-	list_del(&server->link);
-	list_add_tail(&server->link, &cell->sv_list);
+	list_move_tail(&server->link, &cell->sv_list);
 	afs_get_server(server);
 	afs_kafstimod_del_timer(&server->timeout);
 	spin_unlock(&cell->sv_gylock);
diff --git a/fs/afs/kafsasyncd.c b/fs/afs/kafsasyncd.c
index 7ac07d0d47b..f09a794f248 100644
--- a/fs/afs/kafsasyncd.c
+++ b/fs/afs/kafsasyncd.c
@@ -136,8 +136,7 @@ static int kafsasyncd(void *arg)
 			if (!list_empty(&kafsasyncd_async_attnq)) {
 				op = list_entry(kafsasyncd_async_attnq.next,
 						struct afs_async_op, link);
-				list_del(&op->link);
-				list_add_tail(&op->link,
+				list_move_tail(&op->link,
 					      &kafsasyncd_async_busyq);
 			}
 
@@ -204,8 +203,7 @@ void afs_kafsasyncd_begin_op(struct afs_async_op *op)
 	init_waitqueue_entry(&op->waiter, kafsasyncd_task);
 	add_wait_queue(&op->call->waitq, &op->waiter);
 
-	list_del(&op->link);
-	list_add_tail(&op->link, &kafsasyncd_async_busyq);
+	list_move_tail(&op->link, &kafsasyncd_async_busyq);
 
 	spin_unlock(&kafsasyncd_async_lock);
 
@@ -223,8 +221,7 @@ void afs_kafsasyncd_attend_op(struct afs_async_op *op)
 
 	spin_lock(&kafsasyncd_async_lock);
 
-	list_del(&op->link);
-	list_add_tail(&op->link, &kafsasyncd_async_attnq);
+	list_move_tail(&op->link, &kafsasyncd_async_attnq);
 
 	spin_unlock(&kafsasyncd_async_lock);
 
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 62b093aa41c..22afaae1a4c 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -123,8 +123,7 @@ int afs_server_lookup(struct afs_cell *cell, const struct in_addr *addr,
  resurrect_server:
 	_debug("resurrecting server");
 
-	list_del(&zombie->link);
-	list_add_tail(&zombie->link, &cell->sv_list);
+	list_move_tail(&zombie->link, &cell->sv_list);
 	afs_get_server(zombie);
 	afs_kafstimod_del_timer(&zombie->timeout);
 	spin_unlock(&cell->sv_gylock);
@@ -168,8 +167,7 @@ void afs_put_server(struct afs_server *server)
 	}
 
 	spin_lock(&cell->sv_gylock);
-	list_del(&server->link);
-	list_add_tail(&server->link, &cell->sv_graveyard);
+	list_move_tail(&server->link, &cell->sv_graveyard);
 
 	/* time out in 10 secs */
 	afs_kafstimod_add_timer(&server->timeout, 10 * HZ);
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index eced20618ec..331f730a1fb 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -326,8 +326,7 @@ int afs_vlocation_lookup(struct afs_cell *cell,
 	/* found in the graveyard - resurrect */
 	_debug("found in graveyard");
 	atomic_inc(&vlocation->usage);
-	list_del(&vlocation->link);
-	list_add_tail(&vlocation->link, &cell->vl_list);
+	list_move_tail(&vlocation->link, &cell->vl_list);
 	spin_unlock(&cell->vl_gylock);
 
 	afs_kafstimod_del_timer(&vlocation->timeout);
@@ -478,8 +477,7 @@ static void __afs_put_vlocation(struct afs_vlocation *vlocation)
 	}
 
 	/* move to graveyard queue */
-	list_del(&vlocation->link);
-	list_add_tail(&vlocation->link,&cell->vl_graveyard);
+	list_move_tail(&vlocation->link,&cell->vl_graveyard);
 
 	/* remove from pending timeout queue (refcounted if actually being
 	 * updated) */
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index 9867fef3261..cf62da5d782 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -104,8 +104,7 @@ static void afs_vnode_finalise_status_update(struct afs_vnode *vnode,
 					vnode->cb_expiry * HZ);
 
 		spin_lock(&afs_cb_hash_lock);
-		list_del(&vnode->cb_hash_link);
-		list_add_tail(&vnode->cb_hash_link,
+		list_move_tail(&vnode->cb_hash_link,
 			      &afs_cb_hash(server, &vnode->fid));
 		spin_unlock(&afs_cb_hash_lock);
 
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 4456d1daa40..8dbd44f10e9 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -376,8 +376,7 @@ next:
 		DPRINTK("returning %p %.*s",
 			expired, (int)expired->d_name.len, expired->d_name.name);
 		spin_lock(&dcache_lock);
-		list_del(&expired->d_parent->d_subdirs);
-		list_add(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
+		list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
 		spin_unlock(&dcache_lock);
 		return expired;
 	}
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 5f952187fc5..207f8006fd6 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1009,8 +1009,7 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir
 			/* fallthrough */
 		default:
 			if (filp->f_pos == 2) {
-				list_del(q);
-				list_add(q, &parent_sd->s_children);
+				list_move(q, &parent_sd->s_children);
 			}
 			for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
 				struct configfs_dirent *next;
@@ -1033,8 +1032,7 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir
 						 dt_type(next)) < 0)
 					return 0;
 
-				list_del(q);
-				list_add(q, p);
+				list_move(q, p);
 				p = q;
 				filp->f_pos++;
 			}
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index 1862e8bc101..b8886f048ea 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -53,8 +53,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
 	if (!instr) {
 		printk(KERN_WARNING "kmalloc for struct erase_info in jffs2_erase_block failed. Refiling block for later\n");
 		spin_lock(&c->erase_completion_lock);
-		list_del(&jeb->list);
-		list_add(&jeb->list, &c->erase_pending_list);
+		list_move(&jeb->list, &c->erase_pending_list);
 		c->erasing_size -= c->sector_size;
 		c->dirty_size += c->sector_size;
 		jeb->dirty_size = c->sector_size;
@@ -86,8 +85,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
 		/* Erase failed immediately. Refile it on the list */
 		D1(printk(KERN_DEBUG "Erase at 0x%08x failed: %d. Refiling on erase_pending_list\n", jeb->offset, ret));
 		spin_lock(&c->erase_completion_lock);
-		list_del(&jeb->list);
-		list_add(&jeb->list, &c->erase_pending_list);
+		list_move(&jeb->list, &c->erase_pending_list);
 		c->erasing_size -= c->sector_size;
 		c->dirty_size += c->sector_size;
 		jeb->dirty_size = c->sector_size;
@@ -161,8 +159,7 @@ static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblo
 {
 	D1(printk(KERN_DEBUG "Erase completed successfully at 0x%08x\n", jeb->offset));
 	spin_lock(&c->erase_completion_lock);
-	list_del(&jeb->list);
-	list_add_tail(&jeb->list, &c->erase_complete_list);
+	list_move_tail(&jeb->list, &c->erase_complete_list);
 	spin_unlock(&c->erase_completion_lock);
 	/* Ensure that kupdated calls us again to mark them clean */
 	jffs2_erase_pending_trigger(c);
@@ -178,8 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock
 		if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) {
 			/* We'd like to give this block another try. */
 			spin_lock(&c->erase_completion_lock);
-			list_del(&jeb->list);
-			list_add(&jeb->list, &c->erase_pending_list);
+			list_move(&jeb->list, &c->erase_pending_list);
 			c->erasing_size -= c->sector_size;
 			c->dirty_size += c->sector_size;
 			jeb->dirty_size = c->sector_size;
@@ -191,8 +187,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock
 	spin_lock(&c->erase_completion_lock);
 	c->erasing_size -= c->sector_size;
 	c->bad_size += c->sector_size;
-	list_del(&jeb->list);
-	list_add(&jeb->list, &c->bad_list);
+	list_move(&jeb->list, &c->bad_list);
 	c->nr_erasing_blocks--;
 	spin_unlock(&c->erase_completion_lock);
 	wake_up(&c->erase_wait);
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 8bedfd2ff68..ac0c350ed7d 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -211,8 +211,7 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c)
 			struct jffs2_eraseblock *ejeb;
 
 			ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list);
-			list_del(&ejeb->list);
-			list_add_tail(&ejeb->list, &c->erase_pending_list);
+			list_move_tail(&ejeb->list, &c->erase_pending_list);
 			c->nr_erasing_blocks++;
 			jffs2_erase_pending_trigger(c);
 			D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n",
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index a7f153f79ec..b9b700730df 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -495,8 +495,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
 	/* Fix up the original jeb now it's on the bad_list */
 	if (first_raw == jeb->first_node) {
 		D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset));
-		list_del(&jeb->list);
-		list_add(&jeb->list, &c->erase_pending_list);
+		list_move(&jeb->list, &c->erase_pending_list);
 		c->nr_erasing_blocks++;
 		jffs2_erase_pending_trigger(c);
 	}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 96c7578cbe1..1630b5670dc 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -529,8 +529,7 @@ move_to_confirmed(struct nfs4_client *clp)
 
 	dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp);
 	list_del_init(&clp->cl_strhash);
-	list_del_init(&clp->cl_idhash);
-	list_add(&clp->cl_idhash, &conf_id_hashtbl[idhashval]);
+	list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]);
 	strhashval = clientstr_hashval(clp->cl_recdir);
 	list_add(&clp->cl_strhash, &conf_str_hashtbl[strhashval]);
 	renew_client(clp);
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index d852ebb538e..fdf7cf3dfad 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -103,8 +103,7 @@ nfsd_cache_shutdown(void)
 static void
 lru_put_end(struct svc_cacherep *rp)
 {
-	list_del(&rp->c_lru);
-	list_add_tail(&rp->c_lru, &lru_head);
+	list_move_tail(&rp->c_lru, &lru_head);
 }
 
 /*
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 355593dd8ef..87ee29cad50 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -381,8 +381,7 @@ do_ast:
 	ret = DLM_NORMAL;
 	if (past->type == DLM_AST) {
 		/* do not alter lock refcount.  switching lists. */
-		list_del_init(&lock->list);
-		list_add_tail(&lock->list, &res->granted);
+		list_move_tail(&lock->list, &res->granted);
 		mlog(0, "ast: adding to granted list... type=%d, "
 			  "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
 		if (lock->ml.convert_type != LKM_IVMODE) {
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 8285228d9e3..70888b31e75 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -231,8 +231,7 @@ switch_queues:
 
 	lock->ml.convert_type = type;
 	/* do not alter lock refcount.  switching lists. */
-	list_del_init(&lock->list);
-	list_add_tail(&lock->list, &res->converting);
+	list_move_tail(&lock->list, &res->converting);
 
 unlock_exit:
 	spin_unlock(&lock->spinlock);
@@ -248,8 +247,7 @@ void dlm_revert_pending_convert(struct dlm_lock_resource *res,
 				struct dlm_lock *lock)
 {
 	/* do not alter lock refcount.  switching lists. */
-	list_del_init(&lock->list);
-	list_add_tail(&lock->list, &res->granted);
+	list_move_tail(&lock->list, &res->granted);
 	lock->ml.convert_type = LKM_IVMODE;
 	lock->lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB);
 }
@@ -294,8 +292,7 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
 	res->state |= DLM_LOCK_RES_IN_PROGRESS;
 	/* move lock to local convert queue */
 	/* do not alter lock refcount.  switching lists. */
-	list_del_init(&lock->list);
-	list_add_tail(&lock->list, &res->converting);
+	list_move_tail(&lock->list, &res->converting);
 	lock->convert_pending = 1;
 	lock->ml.convert_type = type;
 
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 6fea28318d6..55cda25ae11 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -239,8 +239,7 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
 		mlog(0, "%s: $RECOVERY lock for this node (%u) is "
 		     "mastered by %u; got lock, manually granting (no ast)\n",
 		     dlm->name, dlm->node_num, res->owner);
-		list_del_init(&lock->list);
-		list_add_tail(&lock->list, &res->granted);
+		list_move_tail(&lock->list, &res->granted);
 	}
 	spin_unlock(&res->spinlock);
 
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 805cbabac05..9962190e741 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -905,13 +905,11 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
 			mlog(0, "found lockres owned by dead node while "
 				  "doing recovery for node %u. sending it.\n",
 				  dead_node);
-			list_del_init(&res->recovering);
-			list_add_tail(&res->recovering, list);
+			list_move_tail(&res->recovering, list);
 		} else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
 			mlog(0, "found UNKNOWN owner while doing recovery "
 				  "for node %u. sending it.\n", dead_node);
-			list_del_init(&res->recovering);
-			list_add_tail(&res->recovering, list);
+			list_move_tail(&res->recovering, list);
 		}
 	}
 	spin_unlock(&dlm->spinlock);
@@ -1529,8 +1527,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 
 			/* move the lock to its proper place */
 			/* do not alter lock refcount.  switching lists. */
-			list_del_init(&lock->list);
-			list_add_tail(&lock->list, queue);
+			list_move_tail(&lock->list, queue);
 			spin_unlock(&res->spinlock);
 
 			mlog(0, "just reordered a local lock!\n");
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 5be9d14f12c..44d3b57ae8a 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -318,8 +318,7 @@ converting:
 
 		target->ml.type = target->ml.convert_type;
 		target->ml.convert_type = LKM_IVMODE;
-		list_del_init(&target->list);
-		list_add_tail(&target->list, &res->granted);
+		list_move_tail(&target->list, &res->granted);
 
 		BUG_ON(!target->lksb);
 		target->lksb->status = DLM_NORMAL;
@@ -380,8 +379,7 @@ blocked:
 		     target->ml.type, target->ml.node);
 
 		// target->ml.type is already correct
-		list_del_init(&target->list);
-		list_add_tail(&target->list, &res->granted);
+		list_move_tail(&target->list, &res->granted);
 
 		BUG_ON(!target->lksb);
 		target->lksb->status = DLM_NORMAL;
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 7b1a2754267..ac89c509daf 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -271,8 +271,7 @@ void dlm_commit_pending_unlock(struct dlm_lock_resource *res,
 void dlm_commit_pending_cancel(struct dlm_lock_resource *res,
 			       struct dlm_lock *lock)
 {
-	list_del_init(&lock->list);
-	list_add_tail(&lock->list, &res->granted);
+	list_move_tail(&lock->list, &res->granted);
 	lock->ml.convert_type = LKM_IVMODE;
 }
 
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index eebc3cfa6be..3fe8781c22c 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -222,8 +222,7 @@ void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle,
 	BUG_ON(!list_empty(&OCFS2_I(inode)->ip_handle_list));
 
 	OCFS2_I(inode)->ip_handle = handle;
-	list_del(&(OCFS2_I(inode)->ip_handle_list));
-	list_add_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list));
+	list_move_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list));
 }
 
 static void ocfs2_handle_unlock_inodes(struct ocfs2_journal_handle *handle)
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 1b73529b809..49d1a53dbef 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -834,8 +834,7 @@ static int write_ordered_buffers(spinlock_t * lock,
 		get_bh(bh);
 		if (test_set_buffer_locked(bh)) {
 			if (!buffer_dirty(bh)) {
-				list_del_init(&jh->list);
-				list_add(&jh->list, &tmp);
+				list_move(&jh->list, &tmp);
 				goto loop_next;
 			}
 			spin_unlock(lock);
@@ -855,8 +854,7 @@ static int write_ordered_buffers(spinlock_t * lock,
 			ret = -EIO;
 		}
 		if (buffer_dirty(bh)) {
-			list_del_init(&jh->list);
-			list_add(&jh->list, &tmp);
+			list_move(&jh->list, &tmp);
 			add_to_chunk(&chunk, bh, lock, write_ordered_chunk);
 		} else {
 			reiserfs_free_jh(bh);
diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c
index c71dd2760d3..c8e96195b96 100644
--- a/fs/smbfs/request.c
+++ b/fs/smbfs/request.c
@@ -400,8 +400,7 @@ static int smb_request_send_req(struct smb_request *req)
 	if (!(req->rq_flags & SMB_REQ_TRANSMITTED))
 		goto out;
 
-	list_del_init(&req->rq_queue);
-	list_add_tail(&req->rq_queue, &server->recvq);
+	list_move_tail(&req->rq_queue, &server->recvq);
 	result = 1;
 out:
 	return result;
@@ -435,8 +434,7 @@ int smb_request_send_server(struct smb_sb_info *server)
 	result = smb_request_send_req(req);
 	if (result < 0) {
 		server->conn_error = result;
-		list_del_init(&req->rq_queue);
-		list_add(&req->rq_queue, &server->xmitq);
+		list_move(&req->rq_queue, &server->xmitq);
 		result = -EIO;
 		goto out;
 	}
diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c
index 3f71384020c..24577e2c489 100644
--- a/fs/smbfs/smbiod.c
+++ b/fs/smbfs/smbiod.c
@@ -193,8 +193,7 @@ int smbiod_retry(struct smb_sb_info *server)
 		if (req->rq_flags & SMB_REQ_RETRY) {
 			/* must move the request to the xmitq */
 			VERBOSE("retrying request %p on recvq\n", req);
-			list_del(&req->rq_queue);
-			list_add(&req->rq_queue, &server->xmitq);
+			list_move(&req->rq_queue, &server->xmitq);
 			continue;
 		}
 #endif
-- 
cgit v1.2.3


From 4eb582cf1fbd7b9e5f466e3718a59c957e75254e Mon Sep 17 00:00:00 2001
From: Michael LeMay <mdlemay@epoch.ncsc.mil>
Date: Mon, 26 Jun 2006 00:24:57 -0700
Subject: [PATCH] keys: add a way to store the appropriate context for
 newly-created keys

Add a /proc/<pid>/attr/keycreate entry that stores the appropriate context for
newly-created keys.  Modify the selinux_key_alloc hook to make use of the new
entry.  Update the flask headers to include a new "setkeycreate" permission
for processes.  Update the flask headers to include a new "create" permission
for keys.  Use the create permission to restrict which SIDs each task can
assign to newly-created keys.  Add a new parameter to the security hook
"security_key_alloc" to indicate whether it is being invoked by the kernel, or
from userspace.  If it is being invoked by the kernel, the security hook
should never fail.  Update the documentation to reflect these changes.

Signed-off-by: Michael LeMay <mdlemay@epoch.ncsc.mil>
Signed-off-by: James Morris <jmorris@namei.org>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6afff725a8c..c4a1ff371b8 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -121,6 +121,7 @@ enum pid_directory_inos {
 	PROC_TGID_ATTR_PREV,
 	PROC_TGID_ATTR_EXEC,
 	PROC_TGID_ATTR_FSCREATE,
+	PROC_TGID_ATTR_KEYCREATE,
 #endif
 #ifdef CONFIG_AUDITSYSCALL
 	PROC_TGID_LOGINUID,
@@ -162,6 +163,7 @@ enum pid_directory_inos {
 	PROC_TID_ATTR_PREV,
 	PROC_TID_ATTR_EXEC,
 	PROC_TID_ATTR_FSCREATE,
+	PROC_TID_ATTR_KEYCREATE,
 #endif
 #ifdef CONFIG_AUDITSYSCALL
 	PROC_TID_LOGINUID,
@@ -275,6 +277,7 @@ static struct pid_entry tgid_attr_stuff[] = {
 	E(PROC_TGID_ATTR_PREV,     "prev",     S_IFREG|S_IRUGO),
 	E(PROC_TGID_ATTR_EXEC,     "exec",     S_IFREG|S_IRUGO|S_IWUGO),
 	E(PROC_TGID_ATTR_FSCREATE, "fscreate", S_IFREG|S_IRUGO|S_IWUGO),
+	E(PROC_TGID_ATTR_KEYCREATE, "keycreate", S_IFREG|S_IRUGO|S_IWUGO),
 	{0,0,NULL,0}
 };
 static struct pid_entry tid_attr_stuff[] = {
@@ -282,6 +285,7 @@ static struct pid_entry tid_attr_stuff[] = {
 	E(PROC_TID_ATTR_PREV,      "prev",     S_IFREG|S_IRUGO),
 	E(PROC_TID_ATTR_EXEC,      "exec",     S_IFREG|S_IRUGO|S_IWUGO),
 	E(PROC_TID_ATTR_FSCREATE,  "fscreate", S_IFREG|S_IRUGO|S_IWUGO),
+	E(PROC_TID_ATTR_KEYCREATE, "keycreate", S_IFREG|S_IRUGO|S_IWUGO),
 	{0,0,NULL,0}
 };
 #endif
@@ -1801,6 +1805,8 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
 		case PROC_TGID_ATTR_EXEC:
 		case PROC_TID_ATTR_FSCREATE:
 		case PROC_TGID_ATTR_FSCREATE:
+		case PROC_TID_ATTR_KEYCREATE:
+		case PROC_TGID_ATTR_KEYCREATE:
 			inode->i_fop = &proc_pid_attr_operations;
 			break;
 #endif
-- 
cgit v1.2.3


From 9637f28f8b9facff53b00bea6b5d27c9b150b422 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 26 Jun 2006 00:24:57 -0700
Subject: [PATCH] reiserfs: remove reiserfs_aio_write()

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: <reiserfs-dev@namesys.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/reiserfs/file.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index cf6e1cf4035..752cea12e30 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -1560,12 +1560,6 @@ static ssize_t reiserfs_file_write(struct file *file,	/* the file we are going t
 	return res;
 }
 
-static ssize_t reiserfs_aio_write(struct kiocb *iocb, const char __user * buf,
-				  size_t count, loff_t pos)
-{
-	return generic_file_aio_write(iocb, buf, count, pos);
-}
-
 const struct file_operations reiserfs_file_operations = {
 	.read = generic_file_read,
 	.write = reiserfs_file_write,
@@ -1575,7 +1569,7 @@ const struct file_operations reiserfs_file_operations = {
 	.fsync = reiserfs_sync_file,
 	.sendfile = generic_file_sendfile,
 	.aio_read = generic_file_aio_read,
-	.aio_write = reiserfs_aio_write,
+	.aio_write = generic_file_aio_write,
 	.splice_read = generic_file_splice_read,
 	.splice_write = generic_file_splice_write,
 };
-- 
cgit v1.2.3


From ade1a29e168ba08b699a418ff5e762315fa33f70 Mon Sep 17 00:00:00 2001
From: Badari Pulavarty <pbadari@us.ibm.com>
Date: Mon, 26 Jun 2006 00:25:04 -0700
Subject: [PATCH] ext3: Add "-o bh" option

This patch adds "-o bh" option to force use of buffer_heads.  This option
is needed when we make "nobh" as default - and if we run into problems.

Signed-off-by: Badari Pulavarty <pbadari@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext3/super.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index b2891cc29db..b7483360a2d 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -630,7 +630,7 @@ enum {
 	Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
 	Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
 	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
-	Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh,
+	Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
 	Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
 	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
@@ -666,6 +666,7 @@ static match_table_t tokens = {
 	{Opt_noreservation, "noreservation"},
 	{Opt_noload, "noload"},
 	{Opt_nobh, "nobh"},
+	{Opt_bh, "bh"},
 	{Opt_commit, "commit=%u"},
 	{Opt_journal_update, "journal=update"},
 	{Opt_journal_inum, "journal=%u"},
@@ -1014,6 +1015,9 @@ clear_qf_name:
 		case Opt_nobh:
 			set_opt(sbi->s_mount_opt, NOBH);
 			break;
+		case Opt_bh:
+			clear_opt(sbi->s_mount_opt, NOBH);
+			break;
 		default:
 			printk (KERN_ERR
 				"EXT3-fs: Unrecognized mount option \"%s\" "
-- 
cgit v1.2.3


From 5024ad4af69b3570e18d312786dc46318a1bad1a Mon Sep 17 00:00:00 2001
From: Hansjoerg Lipp <hjlipp@web.de>
Date: Mon, 26 Jun 2006 00:25:35 -0700
Subject: [PATCH] i4l: Gigaset drivers: add IOCTLs to compat_ioctl.h

Add the IOCTLs of the Gigaset drivers to compat_ioctl.h in order to make
them available for 32 bit programs on 64 bit platforms.  Please merge.

Signed-off-by: Hansjoerg Lipp <hjlipp@web.de>
Acked-by: Tilman Schmidt <tilman@imap.cc>
Cc: Karsten Keil <kkeil@suse.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/compat_ioctl.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 9eb9824dd33..d8ecfedef18 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -80,6 +80,7 @@
 #include <net/bluetooth/rfcomm.h>
 
 #include <linux/capi.h>
+#include <linux/gigaset_dev.h>
 
 #include <scsi/scsi.h>
 #include <scsi/scsi_ioctl.h>
-- 
cgit v1.2.3


From f05e15b59467d97c30fbffed80db9e00b73d640b Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 26 Jun 2006 00:25:39 -0700
Subject: [PATCH] nfsd kconfig: select things at the closest tristate instead
 of bool

I noticed recently that my CONFIG_CRYPTO_MD5 turned into a y again instead
of m.  It turns out that CONFIG_NFSD_V4 is selecting it to be y even though
I've chosen to compile nfsd as a module.

In general when we have a bool sitting under a tristate it is better to
select things you need from the tristate rather than the bool since that
allows the things you select to be modules.

The following patch does it for nfsd.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/Kconfig | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 1cdc043922d..6c5051802bd 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1490,7 +1490,12 @@ config NFSD
 	select LOCKD
 	select SUNRPC
 	select EXPORTFS
-	select NFS_ACL_SUPPORT if NFSD_V3_ACL || NFSD_V2_ACL
+	select NFSD_V2_ACL if NFSD_V3_ACL
+	select NFS_ACL_SUPPORT if NFSD_V2_ACL
+	select NFSD_TCP if NFSD_V4
+	select CRYPTO_MD5 if NFSD_V4
+	select CRYPTO if NFSD_V4
+	select FS_POSIX_ACL if NFSD_V4
 	help
 	  If you want your Linux box to act as an NFS *server*, so that other
 	  computers on your local network which support NFS can access certain
@@ -1528,7 +1533,6 @@ config NFSD_V3
 config NFSD_V3_ACL
 	bool "Provide server support for the NFSv3 ACL protocol extension"
 	depends on NFSD_V3
-	select NFSD_V2_ACL
 	help
 	  Implement the NFSv3 ACL protocol extension for manipulating POSIX
 	  Access Control Lists on exported file systems. NFS clients should
@@ -1538,10 +1542,6 @@ config NFSD_V3_ACL
 config NFSD_V4
 	bool "Provide NFSv4 server support (EXPERIMENTAL)"
 	depends on NFSD_V3 && EXPERIMENTAL
-	select NFSD_TCP
-	select CRYPTO_MD5
-	select CRYPTO
-	select FS_POSIX_ACL
 	help
 	  If you would like to include the NFSv4 server as well as the NFSv2
 	  and NFSv3 servers, say Y here.  This feature is experimental, and
-- 
cgit v1.2.3


From 5634708b5fce807bdf2091cfafc2fb24d791f0c8 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 26 Jun 2006 00:25:40 -0700
Subject: [PATCH] proc: Fix the .. inode number on /proc/<pid>/fd

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index c4a1ff371b8..13e3ab99eb7 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1196,7 +1196,8 @@ static struct inode_operations proc_pid_link_inode_operations = {
 
 static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
 {
-	struct inode *inode = filp->f_dentry->d_inode;
+	struct dentry *dentry = filp->f_dentry;
+	struct inode *inode = dentry->d_inode;
 	struct task_struct *p = proc_task(inode);
 	unsigned int fd, tid, ino;
 	int retval;
@@ -1217,7 +1218,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
 				goto out;
 			filp->f_pos++;
 		case 1:
-			ino = fake_ino(tid, PROC_TID_INO);
+			ino = parent_ino(dentry);
 			if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
 				goto out;
 			filp->f_pos++;
-- 
cgit v1.2.3


From ff9724a3f7a69f7b443c05981f84e28017c2fc5a Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 26 Jun 2006 00:25:42 -0700
Subject: [PATCH] proc: Remove useless BKL in proc_pid_readlink

We already call everything except do_proc_readlink outside of the BKL in
proc_pid_followlink, and there appears to be nothing in do_proc_readlink that
needs any special protection.

So remove this leftover from one of the BKL cleanup efforts.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 13e3ab99eb7..7169a713da1 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1167,7 +1167,6 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b
 	struct dentry *de;
 	struct vfsmount *mnt = NULL;
 
-	lock_kernel();
 
 	if (current->fsuid != inode->i_uid && !capable(CAP_DAC_OVERRIDE))
 		goto out;
@@ -1183,7 +1182,6 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b
 	dput(de);
 	mntput(mnt);
 out:
-	unlock_kernel();
 	return error;
 }
 
-- 
cgit v1.2.3


From 167965495153ba9829d0a34b6800dbf5d94de8c8 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 26 Jun 2006 00:25:42 -0700
Subject: [PATCH] proc: Remove unnecessary and misleading assignments from
 proc_pid_make_inode

The removed fields are already set by proc_alloc_inode.  Initializing them in
proc_alloc_inode implies they need it for proper cleanup.  At least ei->pde
was not set on all paths making it look like proc_alloc_inode was buggy.  So
just remove the redundant assignments.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 7169a713da1..0762bc34d98 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1357,7 +1357,6 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
 
 	/* Common stuff */
 	ei = PROC_I(inode);
-	ei->task = NULL;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	inode->i_ino = fake_ino(task->pid, ino);
 
@@ -1382,7 +1381,6 @@ out:
 	return inode;
 
 out_unlock:
-	ei->pde = NULL;
 	iput(inode);
 	return NULL;
 }
-- 
cgit v1.2.3


From 87bfbf679ffb1e95dd9ada694f66aafc4bfa5959 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 26 Jun 2006 00:25:43 -0700
Subject: [PATCH] proc: Simplify the ownership rules for /proc

Currently in /proc if the task is dumpable all of files are owned by the tasks
effective users.  Otherwise the files are owned by root.  Unless it is the
/proc/<tgid>/ or /proc/<tgid>/task/<pid> directory in that case we always make
the directory owned by the effective user.

However the special case for directories is pointless except as a way to read
the effective user, because the permissions on both of those directories are
world readable, and executable.

/proc/<tgid>/status provides a much better way to read a processes effecitve
userid, so it is silly to try to provide that on the directory.

So this patch simplifies the code by removing a pointless special case and
gets us one step closer to being able to remove the hard coded /proc inode
numbers.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 0762bc34d98..fa0e6bee40f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1371,7 +1371,7 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
 	ei->type = ino;
 	inode->i_uid = 0;
 	inode->i_gid = 0;
-	if (ino == PROC_TGID_INO || ino == PROC_TID_INO || task_dumpable(task)) {
+	if (task_dumpable(task)) {
 		inode->i_uid = task->euid;
 		inode->i_gid = task->egid;
 	}
@@ -1400,7 +1400,7 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
 	struct inode *inode = dentry->d_inode;
 	struct task_struct *task = proc_task(inode);
 	if (pid_alive(task)) {
-		if (proc_type(inode) == PROC_TGID_INO || proc_type(inode) == PROC_TID_INO || task_dumpable(task)) {
+		if (task_dumpable(task)) {
 			inode->i_uid = task->euid;
 			inode->i_gid = task->egid;
 		} else {
-- 
cgit v1.2.3


From aed7a6c476d90660ac5af860158407ae9fe61c68 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 26 Jun 2006 00:25:44 -0700
Subject: [PATCH] proc: Replace proc_inode.type with proc_inode.fd

The sole renaming use of proc_inode.type is to discover the file descriptor
number, so just store the file descriptor number and don't wory about
processing this field.  This removes any /proc limits on the maximum number of
file descriptors, and clears the path to make the hard coded /proc inode
numbers go away.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c     | 6 +++---
 fs/proc/inode.c    | 2 +-
 fs/proc/internal.h | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index fa0e6bee40f..9562df76090 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -297,7 +297,7 @@ static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsm
 	struct task_struct *task = proc_task(inode);
 	struct files_struct *files;
 	struct file *file;
-	int fd = proc_type(inode) - PROC_TID_FD_DIR;
+	int fd = proc_fd(inode);
 
 	files = get_files_struct(task);
 	if (files) {
@@ -1368,7 +1368,6 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
 	 */
 	get_task_struct(task);
 	ei->task = task;
-	ei->type = ino;
 	inode->i_uid = 0;
 	inode->i_gid = 0;
 	if (task_dumpable(task)) {
@@ -1418,7 +1417,7 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *inode = dentry->d_inode;
 	struct task_struct *task = proc_task(inode);
-	int fd = proc_type(inode) - PROC_TID_FD_DIR;
+	int fd = proc_fd(inode);
 	struct files_struct *files;
 
 	files = get_files_struct(task);
@@ -1525,6 +1524,7 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
 	if (!inode)
 		goto out;
 	ei = PROC_I(inode);
+	ei->fd = fd;
 	files = get_files_struct(task);
 	if (!files)
 		goto out_unlock;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 722b9c46311..fbc94df138a 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -95,7 +95,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
 	if (!ei)
 		return NULL;
 	ei->task = NULL;
-	ei->type = 0;
+	ei->fd = 0;
 	ei->op.proc_get_link = NULL;
 	ei->pde = NULL;
 	inode = &ei->vfs_inode;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 0502f17b860..6264b7a3a9f 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -46,7 +46,7 @@ static inline struct task_struct *proc_task(struct inode *inode)
 	return PROC_I(inode)->task;
 }
 
-static inline int proc_type(struct inode *inode)
+static inline int proc_fd(struct inode *inode)
 {
-	return PROC_I(inode)->type;
+	return PROC_I(inode)->fd;
 }
-- 
cgit v1.2.3


From 68602066c3327fa340899609d715781eda423751 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 26 Jun 2006 00:25:45 -0700
Subject: [PATCH] proc: Remove bogus proc_task_permission

First we can access every /proc/<tgid>/task/<pid> directory as /proc/<pid> so
proc_task_permission is not usefully limiting visibility.

Second having related filesystems information should have nothing to do with
process visibility.  kill does not implement any checks like that.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c | 63 ----------------------------------------------------------
 1 file changed, 63 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9562df76090..6080672276d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -361,54 +361,6 @@ static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vf
 	return result;
 }
 
-
-/* Same as proc_root_link, but this addionally tries to get fs from other
- * threads in the group */
-static int proc_task_root_link(struct inode *inode, struct dentry **dentry,
-				struct vfsmount **mnt)
-{
-	struct fs_struct *fs;
-	int result = -ENOENT;
-	struct task_struct *leader = proc_task(inode);
-
-	task_lock(leader);
-	fs = leader->fs;
-	if (fs) {
-		atomic_inc(&fs->count);
-		task_unlock(leader);
-	} else {
-		/* Try to get fs from other threads */
-		task_unlock(leader);
-		read_lock(&tasklist_lock);
-		if (pid_alive(leader)) {
-			struct task_struct *task = leader;
-
-			while ((task = next_thread(task)) != leader) {
-				task_lock(task);
-				fs = task->fs;
-				if (fs) {
-					atomic_inc(&fs->count);
-					task_unlock(task);
-					break;
-				}
-				task_unlock(task);
-			}
-		}
-		read_unlock(&tasklist_lock);
-	}
-
-	if (fs) {
-		read_lock(&fs->lock);
-		*mnt = mntget(fs->rootmnt);
-		*dentry = dget(fs->root);
-		read_unlock(&fs->lock);
-		result = 0;
-		put_fs_struct(fs);
-	}
-	return result;
-}
-
-
 #define MAY_PTRACE(task) \
 	(task == current || \
 	(task->parent == current && \
@@ -600,20 +552,6 @@ static int proc_permission(struct inode *inode, int mask, struct nameidata *nd)
 	return proc_check_root(inode);
 }
 
-static int proc_task_permission(struct inode *inode, int mask, struct nameidata *nd)
-{
-	struct dentry *root;
-	struct vfsmount *vfsmnt;
-
-	if (generic_permission(inode, mask, NULL) != 0)
-		return -EACCES;
-
-	if (proc_task_root_link(inode, &root, &vfsmnt))
-		return -ENOENT;
-
-	return proc_check_chroot(root, vfsmnt);
-}
-
 extern struct seq_operations proc_pid_maps_op;
 static int maps_open(struct inode *inode, struct file *file)
 {
@@ -1583,7 +1521,6 @@ static struct inode_operations proc_fd_inode_operations = {
 
 static struct inode_operations proc_task_inode_operations = {
 	.lookup		= proc_task_lookup,
-	.permission	= proc_task_permission,
 };
 
 #ifdef CONFIG_SECURITY
-- 
cgit v1.2.3


From 22c2c5d75e6ad4b9ac41269476b32ba8c9fe263f Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 26 Jun 2006 00:25:45 -0700
Subject: [PATCH] proc: Kill proc_mem_inode_operations

The inode operations only exist to support the proc_permission function.
Currently mem_read and mem_write have all the same permission checks as
ptrace.  The fs check makes no sense in this context, and we can trivially get
around it by calling ptrace.

So simply the code by killing the strange weird case.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6080672276d..2e4356f5d5e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -927,10 +927,6 @@ static struct file_operations proc_oom_adjust_operations = {
 	.write		= oom_adjust_write,
 };
 
-static struct inode_operations proc_mem_inode_operations = {
-	.permission	= proc_permission,
-};
-
 #ifdef CONFIG_AUDITSYSCALL
 #define TMPBUFLEN 21
 static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
@@ -1697,7 +1693,6 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
 #endif
 		case PROC_TID_MEM:
 		case PROC_TGID_MEM:
-			inode->i_op = &proc_mem_inode_operations;
 			inode->i_fop = &proc_mem_operations;
 			break;
 #ifdef CONFIG_SECCOMP
-- 
cgit v1.2.3


From 0f2fe20f55c85f26efaf14feeb69c7c2eb3f7a75 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 26 Jun 2006 00:25:46 -0700
Subject: [PATCH] proc: Properly filter out files that are not visible to a
 process

Long ago and far away in 2.2 we started checking to ensure the files we
displayed in /proc were visible to the current process.  It was an
unsophisticated time and no one was worried about functions full of FIXMES in
a stable kernel.  As time passed the function became sacred and was enshrined
in the shrine of how things have always been.  The fixes came in but only to
keep the function working no one really remembering or documenting why we did
things that way.

The intent and the functionality make a lot of sense.  Don't let /proc be an
access point for files a process can see no other way.  The implementation
however is completely wrong.

We are currently checking the root directories of the two processes, we are
not checking the actual file descriptors themselves.

We are strangely checking with a permission method instead of just when we use
the data.

This patch fixes the logic to actually check the file descriptors and make a
note that implementing a permission method for this part of /proc almost
certainly indicates a bug in the reasoning.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c | 101 ++++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 68 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 2e4356f5d5e..a85b073408e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -74,6 +74,16 @@
 #include <linux/poll.h>
 #include "internal.h"
 
+/* NOTE:
+ *	Implementing inode permission operations in /proc is almost
+ *	certainly an error.  Permission checks need to happen during
+ *	each system call not at open time.  The reason is that most of
+ *	what we wish to check for permissions in /proc varies at runtime.
+ *
+ *	The classic example of a problem is opening file descriptors
+ *	in /proc for a task before it execs a suid executable.
+ */
+
 /*
  * For hysterical raisins we keep the same inumbers as in the old procfs.
  * Feel free to change the macro below - just keep the range distinct from
@@ -494,13 +504,11 @@ static int proc_oom_score(struct task_struct *task, char *buffer)
 
 /* If the process being read is separated by chroot from the reading process,
  * don't let the reader access the threads.
- *
- * note: this does dput(root) and mntput(vfsmnt) on exit.
  */
-static int proc_check_chroot(struct dentry *root, struct vfsmount *vfsmnt)
+static int proc_check_chroot(struct dentry *de, struct vfsmount *mnt)
 {
-	struct dentry *de, *base;
-	struct vfsmount *our_vfsmnt, *mnt;
+	struct dentry *base;
+	struct vfsmount *our_vfsmnt;
 	int res = 0;
 
 	read_lock(&current->fs->lock);
@@ -509,8 +517,6 @@ static int proc_check_chroot(struct dentry *root, struct vfsmount *vfsmnt)
 	read_unlock(&current->fs->lock);
 
 	spin_lock(&vfsmount_lock);
-	de = root;
-	mnt = vfsmnt;
 
 	while (mnt != our_vfsmnt) {
 		if (mnt == mnt->mnt_parent)
@@ -526,8 +532,6 @@ static int proc_check_chroot(struct dentry *root, struct vfsmount *vfsmnt)
 exit:
 	dput(base);
 	mntput(our_vfsmnt);
-	dput(root);
-	mntput(vfsmnt);
 	return res;
 out:
 	spin_unlock(&vfsmount_lock);
@@ -535,23 +539,6 @@ out:
 	goto exit;
 }
 
-static int proc_check_root(struct inode *inode)
-{
-	struct dentry *root;
-	struct vfsmount *vfsmnt;
-
-	if (proc_root_link(inode, &root, &vfsmnt)) /* Ewww... */
-		return -ENOENT;
-	return proc_check_chroot(root, vfsmnt);
-}
-
-static int proc_permission(struct inode *inode, int mask, struct nameidata *nd)
-{
-	if (generic_permission(inode, mask, NULL) != 0)
-		return -EACCES;
-	return proc_check_root(inode);
-}
-
 extern struct seq_operations proc_pid_maps_op;
 static int maps_open(struct inode *inode, struct file *file)
 {
@@ -1048,6 +1035,48 @@ static struct file_operations proc_seccomp_operations = {
 };
 #endif /* CONFIG_SECCOMP */
 
+static int proc_check_dentry_visible(struct inode *inode,
+	struct dentry *dentry, struct vfsmount *mnt)
+{
+	/* Verify that the current process can already see the
+	 * file pointed at by the file descriptor.
+	 * This prevents /proc from being an accidental information leak.
+	 *
+	 * This prevents access to files that are not visible do to
+	 * being on the otherside of a chroot, in a different
+	 * namespace, or are simply process local (like pipes).
+	 */
+	struct task_struct *task;
+	struct files_struct *task_files, *files;
+	int error = -EACCES;
+
+	/* See if the the two tasks share a commone set of
+	 * file descriptors.  If so everything is visible.
+	 */
+	task = proc_task(inode);
+	if (!task)
+		goto out;
+	files = get_files_struct(current);
+	task_files = get_files_struct(task);
+	if (files && task_files && (files == task_files))
+		error = 0;
+	if (task_files)
+		put_files_struct(task_files);
+	if (files)
+		put_files_struct(files);
+	if (!error)
+		goto out;
+
+	/* If the two tasks don't share a common set of file
+	 * descriptors see if the destination dentry is already
+	 * visible in the current tasks filesystem namespace.
+	 */
+	error = proc_check_chroot(dentry, mnt);
+out:
+	return error;
+
+}
+
 static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *inode = dentry->d_inode;
@@ -1058,12 +1087,16 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
 
 	if (current->fsuid != inode->i_uid && !capable(CAP_DAC_OVERRIDE))
 		goto out;
-	error = proc_check_root(inode);
-	if (error)
-		goto out;
 
 	error = PROC_I(inode)->op.proc_get_link(inode, &nd->dentry, &nd->mnt);
 	nd->last_type = LAST_BIND;
+	if (error)
+		goto out;
+
+	/* Only return files this task can already see */
+	error = proc_check_dentry_visible(inode, nd->dentry, nd->mnt);
+	if (error)
+		path_release(nd);
 out:
 	return ERR_PTR(error);
 }
@@ -1104,15 +1137,18 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b
 
 	if (current->fsuid != inode->i_uid && !capable(CAP_DAC_OVERRIDE))
 		goto out;
-	error = proc_check_root(inode);
-	if (error)
-		goto out;
 
 	error = PROC_I(inode)->op.proc_get_link(inode, &de, &mnt);
 	if (error)
 		goto out;
 
+	/* Only return files this task can already see */
+	error = proc_check_dentry_visible(inode, de, mnt);
+	if (error)
+		goto out_put;
+
 	error = do_proc_readlink(de, mnt, buffer, buflen);
+out_put:
 	dput(de);
 	mntput(mnt);
 out:
@@ -1512,7 +1548,6 @@ static struct file_operations proc_task_operations = {
  */
 static struct inode_operations proc_fd_inode_operations = {
 	.lookup		= proc_lookupfd,
-	.permission	= proc_permission,
 };
 
 static struct inode_operations proc_task_inode_operations = {
-- 
cgit v1.2.3


From 6e66b52bf587f0dd9a8e0a581b9570e5c1969e33 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 26 Jun 2006 00:25:47 -0700
Subject: [PATCH] proc: Fix the link count for /proc/<pid>/task

Use getattr to get an accurate link count when needed.  This is cheaper and
more accurate than trying to derive it by walking the thread list of a
process.

Especially as it happens when needed stat instead of at readdir time.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index a85b073408e..29539c2268a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1532,6 +1532,7 @@ out:
 
 static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir);
 static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd);
+static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
 
 static struct file_operations proc_fd_operations = {
 	.read		= generic_read_dir,
@@ -1552,6 +1553,7 @@ static struct inode_operations proc_fd_inode_operations = {
 
 static struct inode_operations proc_task_inode_operations = {
 	.lookup		= proc_task_lookup,
+	.getattr	= proc_task_getattr,
 };
 
 #ifdef CONFIG_SECURITY
@@ -1658,7 +1660,7 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
 	 */
 	switch(p->type) {
 		case PROC_TGID_TASK:
-			inode->i_nlink = 2 + get_tid_list(2, NULL, dir);
+			inode->i_nlink = 2;
 			inode->i_op = &proc_task_inode_operations;
 			inode->i_fop = &proc_task_operations;
 			break;
@@ -2261,7 +2263,6 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
 	}
 
 	nr_tids = get_tid_list(pos, tid_array, inode);
-	inode->i_nlink = pos + nr_tids;
 
 	for (i = 0; i < nr_tids; i++) {
 		unsigned long j = PROC_NUMBUF;
@@ -2281,3 +2282,19 @@ out:
 	filp->f_pos = pos;
 	return retval;
 }
+
+static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	struct task_struct *p = proc_task(inode);
+	generic_fillattr(inode, stat);
+
+	if (pid_alive(p)) {
+		task_lock(p);
+		if (p->signal)
+			stat->nlink += atomic_read(&p->signal->count);
+		task_unlock(p);
+	}
+
+	return 0;
+}
-- 
cgit v1.2.3


From 662795deb854b31501e0ffb42b7f0cce802c134a Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 26 Jun 2006 00:25:48 -0700
Subject: [PATCH] proc: Move proc_maps_operations into task_mmu.c

All of the functions for proc_maps_operations are already defined in
task_mmu.c so move the operations structure to keep the functionality
together.

Since task_nommu.c implements a dummy version of /proc/<pid>/maps give it a
simplified version of proc_maps_operations that it can modify to best suit its
needs.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c       | 61 ----------------------------------------------------
 fs/proc/internal.h   |  4 ++++
 fs/proc/task_mmu.c   | 54 +++++++++++++++++++++++++++++++++++++++++++---
 fs/proc/task_nommu.c | 21 +++++++++++++++++-
 4 files changed, 75 insertions(+), 65 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 29539c2268a..c8636841bbc 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -539,67 +539,6 @@ out:
 	goto exit;
 }
 
-extern struct seq_operations proc_pid_maps_op;
-static int maps_open(struct inode *inode, struct file *file)
-{
-	struct task_struct *task = proc_task(inode);
-	int ret = seq_open(file, &proc_pid_maps_op);
-	if (!ret) {
-		struct seq_file *m = file->private_data;
-		m->private = task;
-	}
-	return ret;
-}
-
-static struct file_operations proc_maps_operations = {
-	.open		= maps_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
-#ifdef CONFIG_NUMA
-extern struct seq_operations proc_pid_numa_maps_op;
-static int numa_maps_open(struct inode *inode, struct file *file)
-{
-	struct task_struct *task = proc_task(inode);
-	int ret = seq_open(file, &proc_pid_numa_maps_op);
-	if (!ret) {
-		struct seq_file *m = file->private_data;
-		m->private = task;
-	}
-	return ret;
-}
-
-static struct file_operations proc_numa_maps_operations = {
-	.open		= numa_maps_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-#endif
-
-#ifdef CONFIG_MMU
-extern struct seq_operations proc_pid_smaps_op;
-static int smaps_open(struct inode *inode, struct file *file)
-{
-	struct task_struct *task = proc_task(inode);
-	int ret = seq_open(file, &proc_pid_smaps_op);
-	if (!ret) {
-		struct seq_file *m = file->private_data;
-		m->private = task;
-	}
-	return ret;
-}
-
-static struct file_operations proc_smaps_operations = {
-	.open		= smaps_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-#endif
-
 extern struct seq_operations mounts_op;
 struct proc_mounts {
 	struct seq_file m;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 6264b7a3a9f..548e7447ea4 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -37,6 +37,10 @@ extern int proc_tgid_stat(struct task_struct *, char *);
 extern int proc_pid_status(struct task_struct *, char *);
 extern int proc_pid_statm(struct task_struct *, char *);
 
+extern struct file_operations proc_maps_operations;
+extern struct file_operations proc_numa_maps_operations;
+extern struct file_operations proc_smaps_operations;
+
 void free_proc_entry(struct proc_dir_entry *de);
 
 int proc_init_inodecache(void);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 91b7c15ab37..4187b4e9cdb 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -374,27 +374,75 @@ static void *m_next(struct seq_file *m, void *v, loff_t *pos)
 	return (vma != tail_vma)? tail_vma: NULL;
 }
 
-struct seq_operations proc_pid_maps_op = {
+static struct seq_operations proc_pid_maps_op = {
 	.start	= m_start,
 	.next	= m_next,
 	.stop	= m_stop,
 	.show	= show_map
 };
 
-struct seq_operations proc_pid_smaps_op = {
+static struct seq_operations proc_pid_smaps_op = {
 	.start	= m_start,
 	.next	= m_next,
 	.stop	= m_stop,
 	.show	= show_smap
 };
 
+static int do_maps_open(struct inode *inode, struct file *file,
+			struct seq_operations *ops)
+{
+	struct task_struct *task = proc_task(inode);
+	int ret = seq_open(file, ops);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+		m->private = task;
+	}
+	return ret;
+}
+
+static int maps_open(struct inode *inode, struct file *file)
+{
+	return do_maps_open(inode, file, &proc_pid_maps_op);
+}
+
+struct file_operations proc_maps_operations = {
+	.open		= maps_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
 #ifdef CONFIG_NUMA
 extern int show_numa_map(struct seq_file *m, void *v);
 
-struct seq_operations proc_pid_numa_maps_op = {
+static struct seq_operations proc_pid_numa_maps_op = {
         .start  = m_start,
         .next   = m_next,
         .stop   = m_stop,
         .show   = show_numa_map
 };
+
+static int numa_maps_open(struct inode *inode, struct file *file)
+{
+	return do_maps_open(inode, file, &proc_pid_numa_maps_op);
+}
+
+struct file_operations proc_numa_maps_operations = {
+	.open		= numa_maps_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
 #endif
+
+static int smaps_open(struct inode *inode, struct file *file)
+{
+	return do_maps_open(inode, file, &proc_pid_smaps_op);
+}
+
+struct file_operations proc_smaps_operations = {
+	.open		= smaps_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 8f68827ed10..af69f28277b 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -156,9 +156,28 @@ static void *m_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	return NULL;
 }
-struct seq_operations proc_pid_maps_op = {
+static struct seq_operations proc_pid_maps_op = {
 	.start	= m_start,
 	.next	= m_next,
 	.stop	= m_stop,
 	.show	= show_map
 };
+
+static int maps_open(struct inode *inode, struct file *file)
+{
+	int ret;
+	ret = seq_open(file, &proc_pid_maps_op);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+		m->private = NULL;
+	}
+	return ret;
+}
+
+struct file_operations proc_maps_operations = {
+	.open		= maps_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
-- 
cgit v1.2.3


From 48e6484d49020dba3578ad117b461e8a391e8f0f Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 26 Jun 2006 00:25:48 -0700
Subject: [PATCH] proc: Rewrite the proc dentry flush on exit optimization

To keep the dcache from filling up with dead /proc entries we flush them on
process exit.  However over the years that code has gotten hairy with a
dentry_pointer and a lock in task_struct and misdocumented as a correctness
feature.

I have rewritten this code to look and see if we have a corresponding entry in
the dcache and if so flush it on process exit.  This removes the extra fields
in the task_struct and allows me to trivially handle the case of a
/proc/<tgid>/task/<pid> entry as well as the current /proc/<pid> entries.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c      |  10 -----
 fs/proc/base.c | 134 ++++++++++++++++++++++++++-------------------------------
 2 files changed, 61 insertions(+), 83 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 0b88bf64614..8c5196087f3 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -666,8 +666,6 @@ static int de_thread(struct task_struct *tsk)
 	 * and to assume its PID:
 	 */
 	if (!thread_group_leader(current)) {
-		struct dentry *proc_dentry1, *proc_dentry2;
-
 		/*
 		 * Wait for the thread group leader to be a zombie.
 		 * It should already be zombie at this point, most
@@ -689,10 +687,6 @@ static int de_thread(struct task_struct *tsk)
 		 */
 		current->start_time = leader->start_time;
 
-		spin_lock(&leader->proc_lock);
-		spin_lock(&current->proc_lock);
-		proc_dentry1 = proc_pid_unhash(current);
-		proc_dentry2 = proc_pid_unhash(leader);
 		write_lock_irq(&tasklist_lock);
 
 		BUG_ON(leader->tgid != current->tgid);
@@ -729,10 +723,6 @@ static int de_thread(struct task_struct *tsk)
 		leader->exit_state = EXIT_DEAD;
 
 		write_unlock_irq(&tasklist_lock);
-		spin_unlock(&leader->proc_lock);
-		spin_unlock(&current->proc_lock);
-		proc_pid_flush(proc_dentry1);
-		proc_pid_flush(proc_dentry2);
         }
 
 	/*
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c8636841bbc..f435932e643 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1352,16 +1352,6 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
 	return 0;
 }
 
-static void pid_base_iput(struct dentry *dentry, struct inode *inode)
-{
-	struct task_struct *task = proc_task(inode);
-	spin_lock(&task->proc_lock);
-	if (task->proc_dentry == dentry)
-		task->proc_dentry = NULL;
-	spin_unlock(&task->proc_lock);
-	iput(inode);
-}
-
 static int pid_delete_dentry(struct dentry * dentry)
 {
 	/* Is the task we represent dead?
@@ -1383,13 +1373,6 @@ static struct dentry_operations pid_dentry_operations =
 	.d_delete	= pid_delete_dentry,
 };
 
-static struct dentry_operations pid_base_dentry_operations =
-{
-	.d_revalidate	= pid_revalidate,
-	.d_iput		= pid_base_iput,
-	.d_delete	= pid_delete_dentry,
-};
-
 /* Lookups */
 
 static unsigned name_to_int(struct dentry *dentry)
@@ -1859,57 +1842,70 @@ static struct inode_operations proc_self_inode_operations = {
 };
 
 /**
- * proc_pid_unhash -  Unhash /proc/@pid entry from the dcache.
- * @p: task that should be flushed.
+ * proc_flush_task -  Remove dcache entries for @task from the /proc dcache.
+ *
+ * @task: task that should be flushed.
+ *
+ * Looks in the dcache for
+ * /proc/@pid
+ * /proc/@tgid/task/@pid
+ * if either directory is present flushes it and all of it'ts children
+ * from the dcache.
  *
- * Drops the /proc/@pid dcache entry from the hash chains.
+ * It is safe and reasonable to cache /proc entries for a task until
+ * that task exits.  After that they just clog up the dcache with
+ * useless entries, possibly causing useful dcache entries to be
+ * flushed instead.  This routine is proved to flush those useless
+ * dcache entries at process exit time.
  *
- * Dropping /proc/@pid entries and detach_pid must be synchroneous,
- * otherwise e.g. /proc/@pid/exe might point to the wrong executable,
- * if the pid value is immediately reused. This is enforced by
- * - caller must acquire spin_lock(p->proc_lock)
- * - must be called before detach_pid()
- * - proc_pid_lookup acquires proc_lock, and checks that
- *   the target is not dead by looking at the attach count
- *   of PIDTYPE_PID.
+ * NOTE: This routine is just an optimization so it does not guarantee
+ *       that no dcache entries will exist at process exit time it
+ *       just makes it very unlikely that any will persist.
  */
-
-struct dentry *proc_pid_unhash(struct task_struct *p)
+void proc_flush_task(struct task_struct *task)
 {
-	struct dentry *proc_dentry;
+	struct dentry *dentry, *leader, *dir;
+	char buf[30];
+	struct qstr name;
+
+	name.name = buf;
+	name.len = snprintf(buf, sizeof(buf), "%d", task->pid);
+	dentry = d_hash_and_lookup(proc_mnt->mnt_root, &name);
+	if (dentry) {
+		shrink_dcache_parent(dentry);
+		d_drop(dentry);
+		dput(dentry);
+	}
 
-	proc_dentry = p->proc_dentry;
-	if (proc_dentry != NULL) {
+	if (thread_group_leader(task))
+		goto out;
 
-		spin_lock(&dcache_lock);
-		spin_lock(&proc_dentry->d_lock);
-		if (!d_unhashed(proc_dentry)) {
-			dget_locked(proc_dentry);
-			__d_drop(proc_dentry);
-			spin_unlock(&proc_dentry->d_lock);
-		} else {
-			spin_unlock(&proc_dentry->d_lock);
-			proc_dentry = NULL;
-		}
-		spin_unlock(&dcache_lock);
-	}
-	return proc_dentry;
-}
+	name.name = buf;
+	name.len = snprintf(buf, sizeof(buf), "%d", task->tgid);
+	leader = d_hash_and_lookup(proc_mnt->mnt_root, &name);
+	if (!leader)
+		goto out;
 
-/**
- * proc_pid_flush - recover memory used by stale /proc/@pid/x entries
- * @proc_dentry: directoy to prune.
- *
- * Shrink the /proc directory that was used by the just killed thread.
- */
-	
-void proc_pid_flush(struct dentry *proc_dentry)
-{
-	might_sleep();
-	if(proc_dentry != NULL) {
-		shrink_dcache_parent(proc_dentry);
-		dput(proc_dentry);
+	name.name = "task";
+	name.len = strlen(name.name);
+	dir = d_hash_and_lookup(leader, &name);
+	if (!dir)
+		goto out_put_leader;
+
+	name.name = buf;
+	name.len = snprintf(buf, sizeof(buf), "%d", task->pid);
+	dentry = d_hash_and_lookup(dir, &name);
+	if (dentry) {
+		shrink_dcache_parent(dentry);
+		d_drop(dentry);
+		dput(dentry);
 	}
+
+	dput(dir);
+out_put_leader:
+	dput(leader);
+out:
+	return;
 }
 
 /* SMP-safe */
@@ -1919,7 +1915,6 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct
 	struct inode *inode;
 	struct proc_inode *ei;
 	unsigned tgid;
-	int died;
 
 	if (dentry->d_name.len == 4 && !memcmp(dentry->d_name.name,"self",4)) {
 		inode = new_inode(dir->i_sb);
@@ -1965,23 +1960,16 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct
 	inode->i_nlink = 4;
 #endif
 
-	dentry->d_op = &pid_base_dentry_operations;
+	dentry->d_op = &pid_dentry_operations;
 
-	died = 0;
 	d_add(dentry, inode);
-	spin_lock(&task->proc_lock);
-	task->proc_dentry = dentry;
 	if (!pid_alive(task)) {
-		dentry = proc_pid_unhash(task);
-		died = 1;
+		d_drop(dentry);
+		shrink_dcache_parent(dentry);
+		goto out;
 	}
-	spin_unlock(&task->proc_lock);
 
 	put_task_struct(task);
-	if (died) {
-		proc_pid_flush(dentry);
-		goto out;
-	}
 	return NULL;
 out:
 	return ERR_PTR(-ENOENT);
@@ -2024,7 +2012,7 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry
 	inode->i_nlink = 3;
 #endif
 
-	dentry->d_op = &pid_base_dentry_operations;
+	dentry->d_op = &pid_dentry_operations;
 
 	d_add(dentry, inode);
 
-- 
cgit v1.2.3


From cd6a3ce9ec040c0b56ea92a81ff710417798c559 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 26 Jun 2006 00:25:49 -0700
Subject: [PATCH] proc: Close the race of a process dying durning lookup

proc_lookup and task exiting are not synchronized, although some of the
previous code may have suggested that.  Every time before we reuse a dentry
namei.c calls d_op->derevalidate which prevents us from reusing a stale dcache
entry.  Unfortunately it does not prevent us from returning a stale dcache
entry.  This race has been explicitly plugged in proc_pid_lookup but there is
nothing to confine it to just that proc lookup function.

So to prevent the race I call revalidate explictily in all of the proc lookup
functions after I call d_add, and report an error if the revalidate does not
succeed.

Years ago Al Viro did something similar but those changes got lost in the
churn.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c | 54 +++++++++++++++++++++++++++++-------------------------
 1 file changed, 29 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index f435932e643..98eaeaa9fdd 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1402,6 +1402,7 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
 {
 	struct task_struct *task = proc_task(dir);
 	unsigned fd = name_to_int(dentry);
+	struct dentry *result = ERR_PTR(-ENOENT);
 	struct file * file;
 	struct files_struct * files;
 	struct inode *inode;
@@ -1441,15 +1442,18 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
 	ei->op.proc_get_link = proc_fd_link;
 	dentry->d_op = &tid_fd_dentry_operations;
 	d_add(dentry, inode);
-	return NULL;
+	/* Close the race of the process dying before we return the dentry */
+	if (tid_fd_revalidate(dentry, NULL))
+		result = NULL;
+out:
+	return result;
 
 out_unlock2:
 	spin_unlock(&files->file_lock);
 	put_files_struct(files);
 out_unlock:
 	iput(inode);
-out:
-	return ERR_PTR(-ENOENT);
+	goto out;
 }
 
 static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir);
@@ -1549,12 +1553,12 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
 					 struct pid_entry *ents)
 {
 	struct inode *inode;
-	int error;
+	struct dentry *error;
 	struct task_struct *task = proc_task(dir);
 	struct pid_entry *p;
 	struct proc_inode *ei;
 
-	error = -ENOENT;
+	error = ERR_PTR(-ENOENT);
 	inode = NULL;
 
 	if (!pid_alive(task))
@@ -1569,7 +1573,7 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
 	if (!p->name)
 		goto out;
 
-	error = -EINVAL;
+	error = ERR_PTR(-EINVAL);
 	inode = proc_pid_make_inode(dir->i_sb, task, p->type);
 	if (!inode)
 		goto out;
@@ -1736,14 +1740,16 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
 		default:
 			printk("procfs: impossible type (%d)",p->type);
 			iput(inode);
-			return ERR_PTR(-EINVAL);
+			error = ERR_PTR(-EINVAL);
+			goto out;
 	}
 	dentry->d_op = &pid_dentry_operations;
 	d_add(dentry, inode);
-	return NULL;
-
+	/* Close the race of the process dying before we return the dentry */
+	if (pid_revalidate(dentry, NULL))
+		error = NULL;
 out:
-	return ERR_PTR(error);
+	return error;
 }
 
 static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
@@ -1911,6 +1917,7 @@ out:
 /* SMP-safe */
 struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
 {
+	struct dentry *result = ERR_PTR(-ENOENT);
 	struct task_struct *task;
 	struct inode *inode;
 	struct proc_inode *ei;
@@ -1944,12 +1951,9 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct
 		goto out;
 
 	inode = proc_pid_make_inode(dir->i_sb, task, PROC_TGID_INO);
+	if (!inode)
+		goto out_put_task;
 
-
-	if (!inode) {
-		put_task_struct(task);
-		goto out;
-	}
 	inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO;
 	inode->i_op = &proc_tgid_base_inode_operations;
 	inode->i_fop = &proc_tgid_base_operations;
@@ -1963,21 +1967,20 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct
 	dentry->d_op = &pid_dentry_operations;
 
 	d_add(dentry, inode);
-	if (!pid_alive(task)) {
-		d_drop(dentry);
-		shrink_dcache_parent(dentry);
-		goto out;
-	}
+	/* Close the race of the process dying before we return the dentry */
+	if (pid_revalidate(dentry, NULL))
+		result = NULL;
 
+out_put_task:
 	put_task_struct(task);
-	return NULL;
 out:
-	return ERR_PTR(-ENOENT);
+	return result;
 }
 
 /* SMP-safe */
 static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
 {
+	struct dentry *result = ERR_PTR(-ENOENT);
 	struct task_struct *task;
 	struct task_struct *leader = proc_task(dir);
 	struct inode *inode;
@@ -2015,13 +2018,14 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry
 	dentry->d_op = &pid_dentry_operations;
 
 	d_add(dentry, inode);
+	/* Close the race of the process dying before we return the dentry */
+	if (pid_revalidate(dentry, NULL))
+		result = NULL;
 
-	put_task_struct(task);
-	return NULL;
 out_drop_task:
 	put_task_struct(task);
 out:
-	return ERR_PTR(-ENOENT);
+	return result;
 }
 
 #define PROC_NUMBUF 10
-- 
cgit v1.2.3


From 0bc58a910220be3446eedc8e77fd45c0a16d8f25 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 26 Jun 2006 00:25:50 -0700
Subject: [PATCH] proc: refactor reading directories of tasks

There are a couple of problems this patch addresses.
- /proc/<tgid>/task currently does not work correctly if you stop reading
  in the middle of a directory.

- /proc/ currently requires a full pass through the task list with
  the tasklist lock held, to determine there are no more processes to read.

- The hand rolled integer to string conversion does not properly running
  out of buffer space.

- We seem to be batching reading of pids from the tasklist without reason,
  and complicating the logic of the code.

This patch addresses that by changing how tasks are processed.  A
first_<task_type> function is built that handles restarts, and a
next_<task_type> function is built that just advances to the next task.

first_<task_type> when it detects a restart usually uses find_task_by_pid.  If
that doesn't work because there has been a seek on the directory, or we have
already given a complete directory listing, it first checks the number tasks
of that type, and only if we are under that count does it walk through all of
the tasks to find the one we are interested in.

The code that fills in the directory is simpler because there is only a single
for loop.

The hand rolled integer to string conversion is replaced by snprintf which
should handle the the out of buffer case correctly.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c | 268 +++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 163 insertions(+), 105 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 98eaeaa9fdd..2236f7d3878 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1545,8 +1545,6 @@ static struct file_operations proc_tgid_attr_operations;
 static struct inode_operations proc_tgid_attr_inode_operations;
 #endif
 
-static int get_tid_list(int index, unsigned int *tids, struct inode *dir);
-
 /* SMP-safe */
 static struct dentry *proc_pident_lookup(struct inode *dir, 
 					 struct dentry *dentry,
@@ -2029,88 +2027,83 @@ out:
 }
 
 #define PROC_NUMBUF 10
-#define PROC_MAXPIDS 20
 
 /*
- * Get a few tgid's to return for filldir - we need to hold the
- * tasklist lock while doing this, and we must release it before
- * we actually do the filldir itself, so we use a temp buffer..
+ * Find the first tgid to return to user space.
+ *
+ * Usually this is just whatever follows &init_task, but if the users
+ * buffer was too small to hold the full list or there was a seek into
+ * the middle of the directory we have more work to do.
+ *
+ * In the case of a short read we start with find_task_by_pid.
+ *
+ * In the case of a seek we start with &init_task and walk nr
+ * threads past it.
  */
-static int get_tgid_list(int index, unsigned long version, unsigned int *tgids)
+static struct task_struct *first_tgid(int tgid, int nr)
 {
-	struct task_struct *p;
-	int nr_tgids = 0;
-
-	index--;
+	struct task_struct *pos = NULL;
 	read_lock(&tasklist_lock);
-	p = NULL;
-	if (version) {
-		p = find_task_by_pid(version);
-		if (p && !thread_group_leader(p))
-			p = NULL;
+	if (tgid && nr) {
+		pos = find_task_by_pid(tgid);
+		if (pos && !thread_group_leader(pos))
+			pos = NULL;
+		if (pos)
+			nr = 0;
 	}
+	/* If nr exceeds the number of processes get out quickly */
+	if (nr && nr >= nr_processes())
+		goto done;
 
-	if (p)
-		index = 0;
-	else
-		p = next_task(&init_task);
+	/* If we haven't found our starting place yet start with
+	 * the init_task and walk nr tasks forward.
+	 */
+	if (!pos && (nr >= 0))
+		pos = next_task(&init_task);
 
-	for ( ; p != &init_task; p = next_task(p)) {
-		int tgid = p->pid;
-		if (!pid_alive(p))
-			continue;
-		if (--index >= 0)
+	for (; pos && pid_alive(pos); pos = next_task(pos)) {
+		if (--nr > 0)
 			continue;
-		tgids[nr_tgids] = tgid;
-		nr_tgids++;
-		if (nr_tgids >= PROC_MAXPIDS)
-			break;
+		get_task_struct(pos);
+		goto done;
 	}
+	pos = NULL;
+done:
 	read_unlock(&tasklist_lock);
-	return nr_tgids;
+	return pos;
 }
 
 /*
- * Get a few tid's to return for filldir - we need to hold the
- * tasklist lock while doing this, and we must release it before
- * we actually do the filldir itself, so we use a temp buffer..
+ * Find the next task in the task list.
+ * Return NULL if we loop or there is any error.
+ *
+ * The reference to the input task_struct is released.
  */
-static int get_tid_list(int index, unsigned int *tids, struct inode *dir)
+static struct task_struct *next_tgid(struct task_struct *start)
 {
-	struct task_struct *leader_task = proc_task(dir);
-	struct task_struct *task = leader_task;
-	int nr_tids = 0;
-
-	index -= 2;
+	struct task_struct *pos;
 	read_lock(&tasklist_lock);
-	/*
-	 * The starting point task (leader_task) might be an already
-	 * unlinked task, which cannot be used to access the task-list
-	 * via next_thread().
-	 */
-	if (pid_alive(task)) do {
-		int tid = task->pid;
-
-		if (--index >= 0)
-			continue;
-		if (tids != NULL)
-			tids[nr_tids] = tid;
-		nr_tids++;
-		if (nr_tids >= PROC_MAXPIDS)
-			break;
-	} while ((task = next_thread(task)) != leader_task);
+	pos = start;
+	if (pid_alive(start))
+		pos = next_task(start);
+	if (pid_alive(pos) && (pos != &init_task)) {
+		get_task_struct(pos);
+		goto done;
+	}
+	pos = NULL;
+done:
 	read_unlock(&tasklist_lock);
-	return nr_tids;
+	put_task_struct(start);
+	return pos;
 }
 
 /* for the /proc/ directory itself, after non-process stuff has been done */
 int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
 {
-	unsigned int tgid_array[PROC_MAXPIDS];
 	char buf[PROC_NUMBUF];
 	unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY;
-	unsigned int nr_tgids, i;
-	int next_tgid;
+	struct task_struct *task;
+	int tgid;
 
 	if (!nr) {
 		ino_t ino = fake_ino(0,PROC_TGID_INO);
@@ -2119,62 +2112,123 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
 		filp->f_pos++;
 		nr++;
 	}
+	nr -= 1;
 
 	/* f_version caches the tgid value that the last readdir call couldn't
 	 * return. lseek aka telldir automagically resets f_version to 0.
 	 */
-	next_tgid = filp->f_version;
+	tgid = filp->f_version;
 	filp->f_version = 0;
-	for (;;) {
-		nr_tgids = get_tgid_list(nr, next_tgid, tgid_array);
-		if (!nr_tgids) {
-			/* no more entries ! */
+	for (task = first_tgid(tgid, nr);
+	     task;
+	     task = next_tgid(task), filp->f_pos++) {
+		int len;
+		ino_t ino;
+		tgid = task->pid;
+		len = snprintf(buf, sizeof(buf), "%d", tgid);
+		ino = fake_ino(tgid, PROC_TGID_INO);
+		if (filldir(dirent, buf, len, filp->f_pos, ino, DT_DIR) < 0) {
+			/* returning this tgid failed, save it as the first
+			 * pid for the next readir call */
+			filp->f_version = tgid;
+			put_task_struct(task);
 			break;
 		}
-		next_tgid = 0;
+	}
+	return 0;
+}
 
-		/* do not use the last found pid, reserve it for next_tgid */
-		if (nr_tgids == PROC_MAXPIDS) {
-			nr_tgids--;
-			next_tgid = tgid_array[nr_tgids];
-		}
+/*
+ * Find the first tid of a thread group to return to user space.
+ *
+ * Usually this is just the thread group leader, but if the users
+ * buffer was too small or there was a seek into the middle of the
+ * directory we have more work todo.
+ *
+ * In the case of a short read we start with find_task_by_pid.
+ *
+ * In the case of a seek we start with the leader and walk nr
+ * threads past it.
+ */
+static struct task_struct *first_tid(struct task_struct *leader, int tid, int nr)
+{
+	struct task_struct *pos = NULL;
+	read_lock(&tasklist_lock);
 
-		for (i=0;i<nr_tgids;i++) {
-			int tgid = tgid_array[i];
-			ino_t ino = fake_ino(tgid,PROC_TGID_INO);
-			unsigned long j = PROC_NUMBUF;
+	/* Attempt to start with the pid of a thread */
+	if (tid && (nr > 0)) {
+		pos = find_task_by_pid(tid);
+		if (pos && (pos->group_leader != leader))
+			pos = NULL;
+		if (pos)
+			nr = 0;
+	}
 
-			do
-				buf[--j] = '0' + (tgid % 10);
-			while ((tgid /= 10) != 0);
+	/* If nr exceeds the number of threads there is nothing todo */
+	if (nr) {
+		int threads = 0;
+		task_lock(leader);
+		if (leader->signal)
+			threads = atomic_read(&leader->signal->count);
+		task_unlock(leader);
+		if (nr >= threads)
+			goto done;
+	}
 
-			if (filldir(dirent, buf+j, PROC_NUMBUF-j, filp->f_pos, ino, DT_DIR) < 0) {
-				/* returning this tgid failed, save it as the first
-				 * pid for the next readir call */
-				filp->f_version = tgid_array[i];
-				goto out;
-			}
-			filp->f_pos++;
-			nr++;
-		}
+	/* If we haven't found our starting place yet start with the
+	 * leader and walk nr threads forward.
+	 */
+	if (!pos && (nr >= 0))
+		pos = leader;
+
+	for (; pos && pid_alive(pos); pos = next_thread(pos)) {
+		if (--nr > 0)
+			continue;
+		get_task_struct(pos);
+		goto done;
 	}
-out:
-	return 0;
+	pos = NULL;
+done:
+	read_unlock(&tasklist_lock);
+	return pos;
+}
+
+/*
+ * Find the next thread in the thread list.
+ * Return NULL if there is an error or no next thread.
+ *
+ * The reference to the input task_struct is released.
+ */
+static struct task_struct *next_tid(struct task_struct *start)
+{
+	struct task_struct *pos;
+	read_lock(&tasklist_lock);
+	pos = start;
+	if (pid_alive(start))
+		pos = next_thread(start);
+	if (pid_alive(pos) && (pos != start->group_leader))
+		get_task_struct(pos);
+	else
+		pos = NULL;
+	read_unlock(&tasklist_lock);
+	put_task_struct(start);
+	return pos;
 }
 
 /* for the /proc/TGID/task/ directories */
 static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir)
 {
-	unsigned int tid_array[PROC_MAXPIDS];
 	char buf[PROC_NUMBUF];
-	unsigned int nr_tids, i;
 	struct dentry *dentry = filp->f_dentry;
 	struct inode *inode = dentry->d_inode;
+	struct task_struct *leader = proc_task(inode);
+	struct task_struct *task;
 	int retval = -ENOENT;
 	ino_t ino;
+	int tid;
 	unsigned long pos = filp->f_pos;  /* avoiding "long long" filp->f_pos */
 
-	if (!pid_alive(proc_task(inode)))
+	if (!pid_alive(leader))
 		goto out;
 	retval = 0;
 
@@ -2193,21 +2247,25 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
 		/* fall through */
 	}
 
-	nr_tids = get_tid_list(pos, tid_array, inode);
-
-	for (i = 0; i < nr_tids; i++) {
-		unsigned long j = PROC_NUMBUF;
-		int tid = tid_array[i];
-
-		ino = fake_ino(tid,PROC_TID_INO);
-
-		do
-			buf[--j] = '0' + (tid % 10);
-		while ((tid /= 10) != 0);
-
-		if (filldir(dirent, buf+j, PROC_NUMBUF-j, pos, ino, DT_DIR) < 0)
+	/* f_version caches the tgid value that the last readdir call couldn't
+	 * return. lseek aka telldir automagically resets f_version to 0.
+	 */
+	tid = filp->f_version;
+	filp->f_version = 0;
+	for (task = first_tid(leader, tid, pos - 2);
+	     task;
+	     task = next_tid(task), pos++) {
+		int len;
+		tid = task->pid;
+		len = snprintf(buf, sizeof(buf), "%d", tid);
+		ino = fake_ino(tid, PROC_TID_INO);
+		if (filldir(dirent, buf, len, pos, ino, DT_DIR < 0)) {
+			/* returning this tgid failed, save it as the first
+			 * pid for the next readir call */
+			filp->f_version = tid;
+			put_task_struct(task);
 			break;
-		pos++;
+		}
 	}
 out:
 	filp->f_pos = pos;
-- 
cgit v1.2.3


From 454cc105ef690f2a0ba7c6b194d55666b4e918ce Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 26 Jun 2006 00:25:51 -0700
Subject: [PATCH] proc: Remove tasklist_lock from proc_pid_readdir

We don't need the tasklist_lock to safely iterate through processes
anymore.

This depends on my previous to task patches that make get_task_struct rcu
safe, and that make next_task() rcu safe.  I haven't gotten
first_tid/next_tid yet only because next_thread is missing an
rcu_dereference.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 2236f7d3878..cc578a300a2 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2043,7 +2043,7 @@ out:
 static struct task_struct *first_tgid(int tgid, int nr)
 {
 	struct task_struct *pos = NULL;
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	if (tgid && nr) {
 		pos = find_task_by_pid(tgid);
 		if (pos && !thread_group_leader(pos))
@@ -2069,7 +2069,7 @@ static struct task_struct *first_tgid(int tgid, int nr)
 	}
 	pos = NULL;
 done:
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 	return pos;
 }
 
@@ -2082,7 +2082,7 @@ done:
 static struct task_struct *next_tgid(struct task_struct *start)
 {
 	struct task_struct *pos;
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	pos = start;
 	if (pid_alive(start))
 		pos = next_task(start);
@@ -2092,7 +2092,7 @@ static struct task_struct *next_tgid(struct task_struct *start)
 	}
 	pos = NULL;
 done:
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 	put_task_struct(start);
 	return pos;
 }
-- 
cgit v1.2.3


From de7587343bfebc186995ad294e3de0da382eb9bc Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 26 Jun 2006 00:25:51 -0700
Subject: [PATCH] proc: Remove tasklist_lock from proc_pid_lookup() and
 proc_task_lookup()

Since we no longer need the tasklist_lock for get_task_struct the lookup
methods no longer need the tasklist_lock.

This just depends on my previous patch that makes get_task_struct() rcu
safe.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index cc578a300a2..8180579f879 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1940,11 +1940,11 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct
 	if (tgid == ~0U)
 		goto out;
 
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	task = find_task_by_pid(tgid);
 	if (task)
 		get_task_struct(task);
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 	if (!task)
 		goto out;
 
@@ -1988,11 +1988,11 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry
 	if (tid == ~0U)
 		goto out;
 
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	task = find_task_by_pid(tid);
 	if (task)
 		get_task_struct(task);
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 	if (!task)
 		goto out;
 	if (leader->tgid != task->tgid)
-- 
cgit v1.2.3


From 9cc8cbc7f8b7bc3db48bf6d59a731af728e786ce Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 26 Jun 2006 00:25:52 -0700
Subject: [PATCH] simply fix first_tgid

Like the bug Oleg spotted in first_tid there was also a small off by one
error in first_tgid, when a seek was done on the /proc directory.  This
fixes that and changes the code structure to make it a little more obvious
what is going on.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8180579f879..bbcaef9adb5 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2040,34 +2040,32 @@ out:
  * In the case of a seek we start with &init_task and walk nr
  * threads past it.
  */
-static struct task_struct *first_tgid(int tgid, int nr)
+static struct task_struct *first_tgid(int tgid, unsigned int nr)
 {
-	struct task_struct *pos = NULL;
+	struct task_struct *pos;
 	rcu_read_lock();
 	if (tgid && nr) {
 		pos = find_task_by_pid(tgid);
-		if (pos && !thread_group_leader(pos))
-			pos = NULL;
-		if (pos)
-			nr = 0;
+		if (pos && thread_group_leader(pos))
+			goto found;
 	}
 	/* If nr exceeds the number of processes get out quickly */
+	pos = NULL;
 	if (nr && nr >= nr_processes())
 		goto done;
 
 	/* If we haven't found our starting place yet start with
 	 * the init_task and walk nr tasks forward.
 	 */
-	if (!pos && (nr >= 0))
-		pos = next_task(&init_task);
-
-	for (; pos && pid_alive(pos); pos = next_task(pos)) {
-		if (--nr > 0)
-			continue;
-		get_task_struct(pos);
-		goto done;
+	for (pos = next_task(&init_task); nr > 0; --nr) {
+		pos = next_task(pos);
+		if (pos == &init_task) {
+			pos = NULL;
+			goto done;
+		}
 	}
-	pos = NULL;
+found:
+	get_task_struct(pos);
 done:
 	rcu_read_unlock();
 	return pos;
-- 
cgit v1.2.3


From 8578cea7509cbdec25b31d08b48a92fcc3b1a9e3 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 26 Jun 2006 00:25:54 -0700
Subject: [PATCH] proc: make PROC_NUMBUF the buffer size for holding integers
 as strings

Currently in /proc at several different places we define buffers to hold a
process id, or a file descriptor .  In most of them we use either a hard coded
number or a different define.  Modify them all to use PROC_NUMBUF, so the code
has a chance of being maintained.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index bbcaef9adb5..20746e12440 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -185,6 +185,9 @@ enum pid_directory_inos {
 	PROC_TID_FD_DIR = 0x8000,	/* 0x8000-0xffff */
 };
 
+/* Worst case buffer size needed for holding an integer. */
+#define PROC_NUMBUF 10
+
 struct pid_entry {
 	int type;
 	int len;
@@ -807,12 +810,12 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf,
 				size_t count, loff_t *ppos)
 {
 	struct task_struct *task = proc_task(file->f_dentry->d_inode);
-	char buffer[8];
+	char buffer[PROC_NUMBUF];
 	size_t len;
 	int oom_adjust = task->oomkilladj;
 	loff_t __ppos = *ppos;
 
-	len = sprintf(buffer, "%i\n", oom_adjust);
+	len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
 	if (__ppos >= len)
 		return 0;
 	if (count > len-__ppos)
@@ -827,14 +830,14 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 				size_t count, loff_t *ppos)
 {
 	struct task_struct *task = proc_task(file->f_dentry->d_inode);
-	char buffer[8], *end;
+	char buffer[PROC_NUMBUF], *end;
 	int oom_adjust;
 
 	if (!capable(CAP_SYS_RESOURCE))
 		return -EPERM;
-	memset(buffer, 0, 8);
-	if (count > 6)
-		count = 6;
+	memset(buffer, 0, sizeof(buffer));
+	if (count > sizeof(buffer) - 1)
+		count = sizeof(buffer) - 1;
 	if (copy_from_user(buffer, buf, count))
 		return -EFAULT;
 	oom_adjust = simple_strtol(buffer, &end, 0);
@@ -1099,8 +1102,6 @@ static struct inode_operations proc_pid_link_inode_operations = {
 	.follow_link	= proc_pid_follow_link
 };
 
-#define NUMBUF 10
-
 static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
 {
 	struct dentry *dentry = filp->f_dentry;
@@ -1108,7 +1109,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
 	struct task_struct *p = proc_task(inode);
 	unsigned int fd, tid, ino;
 	int retval;
-	char buf[NUMBUF];
+	char buf[PROC_NUMBUF];
 	struct files_struct * files;
 	struct fdtable *fdt;
 
@@ -1144,7 +1145,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
 					continue;
 				rcu_read_unlock();
 
-				j = NUMBUF;
+				j = PROC_NUMBUF;
 				i = fd;
 				do {
 					j--;
@@ -1153,7 +1154,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
 				} while (i);
 
 				ino = fake_ino(tid, PROC_TID_FD_DIR + fd);
-				if (filldir(dirent, buf+j, NUMBUF-j, fd+2, ino, DT_LNK) < 0) {
+				if (filldir(dirent, buf+j, PROC_NUMBUF-j, fd+2, ino, DT_LNK) < 0) {
 					rcu_read_lock();
 					break;
 				}
@@ -1828,14 +1829,14 @@ static struct inode_operations proc_tid_attr_inode_operations = {
 static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
 			      int buflen)
 {
-	char tmp[30];
+	char tmp[PROC_NUMBUF];
 	sprintf(tmp, "%d", current->tgid);
 	return vfs_readlink(dentry,buffer,buflen,tmp);
 }
 
 static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
-	char tmp[30];
+	char tmp[PROC_NUMBUF];
 	sprintf(tmp, "%d", current->tgid);
 	return ERR_PTR(vfs_follow_link(nd,tmp));
 }	
@@ -1869,7 +1870,7 @@ static struct inode_operations proc_self_inode_operations = {
 void proc_flush_task(struct task_struct *task)
 {
 	struct dentry *dentry, *leader, *dir;
-	char buf[30];
+	char buf[PROC_NUMBUF];
 	struct qstr name;
 
 	name.name = buf;
@@ -2026,8 +2027,6 @@ out:
 	return result;
 }
 
-#define PROC_NUMBUF 10
-
 /*
  * Find the first tgid to return to user space.
  *
-- 
cgit v1.2.3


From 99f895518368252ba862cc15ce4eb98ebbe1bec6 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 26 Jun 2006 00:25:55 -0700
Subject: [PATCH] proc: don't lock task_structs indefinitely

Every inode in /proc holds a reference to a struct task_struct.  If a
directory or file is opened and remains open after the the task exits this
pinning continues.  With 8K stacks on a 32bit machine the amount pinned per
file descriptor is about 10K.

Normally I would figure a reasonable per user process limit is about 100
processes.  With 80 processes, with a 1000 file descriptors each I can trigger
the 00M killer on a 32bit kernel, because I have pinned about 800MB of useless
data.

This patch replaces the struct task_struct pointer with a pointer to a struct
task_ref which has a struct task_struct pointer.  The so the pinning of dead
tasks does not happen.

The code now has to contend with the fact that the task may now exit at any
time.  Which is a little but not muh more complicated.

With this change it takes about 1000 processes each opening up 1000 file
descriptors before I can trigger the OOM killer.  Much better.

[mlp@google.com: task_mmu small fixes]
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Paul Jackson <pj@sgi.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Albert Cahalan <acahalan@gmail.com>
Signed-off-by: Prasanna Meda <mlp@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c     | 355 ++++++++++++++++++++++++++++++++++++++---------------
 fs/proc/inode.c    |   9 +-
 fs/proc/internal.h |  15 ++-
 fs/proc/task_mmu.c |  72 +++++++----
 4 files changed, 319 insertions(+), 132 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 20746e12440..489810abc72 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -307,12 +307,15 @@ static struct pid_entry tid_attr_stuff[] = {
 
 static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
 {
-	struct task_struct *task = proc_task(inode);
-	struct files_struct *files;
+	struct task_struct *task = get_proc_task(inode);
+	struct files_struct *files = NULL;
 	struct file *file;
 	int fd = proc_fd(inode);
 
-	files = get_files_struct(task);
+	if (task) {
+		files = get_files_struct(task);
+		put_task_struct(task);
+	}
 	if (files) {
 		/*
 		 * We are not taking a ref to the file structure, so we must
@@ -344,10 +347,29 @@ static struct fs_struct *get_fs_struct(struct task_struct *task)
 	return fs;
 }
 
+static int get_nr_threads(struct task_struct *tsk)
+{
+	/* Must be called with the rcu_read_lock held */
+	unsigned long flags;
+	int count = 0;
+
+	if (lock_task_sighand(tsk, &flags)) {
+		count = atomic_read(&tsk->signal->count);
+		unlock_task_sighand(tsk, &flags);
+	}
+	return count;
+}
+
 static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
 {
-	struct fs_struct *fs = get_fs_struct(proc_task(inode));
+	struct task_struct *task = get_proc_task(inode);
+	struct fs_struct *fs = NULL;
 	int result = -ENOENT;
+
+	if (task) {
+		fs = get_fs_struct(task);
+		put_task_struct(task);
+	}
 	if (fs) {
 		read_lock(&fs->lock);
 		*mnt = mntget(fs->pwdmnt);
@@ -361,8 +383,14 @@ static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfs
 
 static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
 {
-	struct fs_struct *fs = get_fs_struct(proc_task(inode));
+	struct task_struct *task = get_proc_task(inode);
+	struct fs_struct *fs = NULL;
 	int result = -ENOENT;
+
+	if (task) {
+		fs = get_fs_struct(task);
+		put_task_struct(task);
+	}
 	if (fs) {
 		read_lock(&fs->lock);
 		*mnt = mntget(fs->rootmnt);
@@ -550,16 +578,19 @@ struct proc_mounts {
 
 static int mounts_open(struct inode *inode, struct file *file)
 {
-	struct task_struct *task = proc_task(inode);
-	struct namespace *namespace;
+	struct task_struct *task = get_proc_task(inode);
+	struct namespace *namespace = NULL;
 	struct proc_mounts *p;
 	int ret = -EINVAL;
 
-	task_lock(task);
-	namespace = task->namespace;
-	if (namespace)
-		get_namespace(namespace);
-	task_unlock(task);
+	if (task) {
+		task_lock(task);
+		namespace = task->namespace;
+		if (namespace)
+			get_namespace(namespace);
+		task_unlock(task);
+		put_task_struct(task);
+	}
 
 	if (namespace) {
 		ret = -ENOMEM;
@@ -616,17 +647,21 @@ static struct file_operations proc_mounts_operations = {
 extern struct seq_operations mountstats_op;
 static int mountstats_open(struct inode *inode, struct file *file)
 {
-	struct task_struct *task = proc_task(inode);
 	int ret = seq_open(file, &mountstats_op);
 
 	if (!ret) {
 		struct seq_file *m = file->private_data;
-		struct namespace *namespace;
-		task_lock(task);
-		namespace = task->namespace;
-		if (namespace)
-			get_namespace(namespace);
-		task_unlock(task);
+		struct namespace *namespace = NULL;
+		struct task_struct *task = get_proc_task(inode);
+
+		if (task) {
+			task_lock(task);
+			namespace = task->namespace;
+			if (namespace)
+				get_namespace(namespace);
+			task_unlock(task);
+			put_task_struct(task);
+		}
 
 		if (namespace)
 			m->private = namespace;
@@ -653,18 +688,27 @@ static ssize_t proc_info_read(struct file * file, char __user * buf,
 	struct inode * inode = file->f_dentry->d_inode;
 	unsigned long page;
 	ssize_t length;
-	struct task_struct *task = proc_task(inode);
+	struct task_struct *task = get_proc_task(inode);
+
+	length = -ESRCH;
+	if (!task)
+		goto out_no_task;
 
 	if (count > PROC_BLOCK_SIZE)
 		count = PROC_BLOCK_SIZE;
+
+	length = -ENOMEM;
 	if (!(page = __get_free_page(GFP_KERNEL)))
-		return -ENOMEM;
+		goto out;
 
 	length = PROC_I(inode)->op.proc_read(task, (char*)page);
 
 	if (length >= 0)
 		length = simple_read_from_buffer(buf, count, ppos, (char *)page, length);
 	free_page(page);
+out:
+	put_task_struct(task);
+out_no_task:
 	return length;
 }
 
@@ -681,12 +725,15 @@ static int mem_open(struct inode* inode, struct file* file)
 static ssize_t mem_read(struct file * file, char __user * buf,
 			size_t count, loff_t *ppos)
 {
-	struct task_struct *task = proc_task(file->f_dentry->d_inode);
+	struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
 	char *page;
 	unsigned long src = *ppos;
 	int ret = -ESRCH;
 	struct mm_struct *mm;
 
+	if (!task)
+		goto out_no_task;
+
 	if (!MAY_PTRACE(task) || !ptrace_may_attach(task))
 		goto out;
 
@@ -736,6 +783,8 @@ out_put:
 out_free:
 	free_page((unsigned long) page);
 out:
+	put_task_struct(task);
+out_no_task:
 	return ret;
 }
 
@@ -748,15 +797,20 @@ static ssize_t mem_write(struct file * file, const char * buf,
 {
 	int copied = 0;
 	char *page;
-	struct task_struct *task = proc_task(file->f_dentry->d_inode);
+	struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
 	unsigned long dst = *ppos;
 
+	copied = -ESRCH;
+	if (!task)
+		goto out_no_task;
+
 	if (!MAY_PTRACE(task) || !ptrace_may_attach(task))
-		return -ESRCH;
+		goto out;
 
+	copied = -ENOMEM;
 	page = (char *)__get_free_page(GFP_USER);
 	if (!page)
-		return -ENOMEM;
+		goto out;
 
 	while (count > 0) {
 		int this_len, retval;
@@ -779,6 +833,9 @@ static ssize_t mem_write(struct file * file, const char * buf,
 	}
 	*ppos = dst;
 	free_page((unsigned long) page);
+out:
+	put_task_struct(task);
+out_no_task:
 	return copied;
 }
 #endif
@@ -809,12 +866,17 @@ static struct file_operations proc_mem_operations = {
 static ssize_t oom_adjust_read(struct file *file, char __user *buf,
 				size_t count, loff_t *ppos)
 {
-	struct task_struct *task = proc_task(file->f_dentry->d_inode);
+	struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
 	char buffer[PROC_NUMBUF];
 	size_t len;
-	int oom_adjust = task->oomkilladj;
+	int oom_adjust;
 	loff_t __ppos = *ppos;
 
+	if (!task)
+		return -ESRCH;
+	oom_adjust = task->oomkilladj;
+	put_task_struct(task);
+
 	len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
 	if (__ppos >= len)
 		return 0;
@@ -829,7 +891,7 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf,
 static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 				size_t count, loff_t *ppos)
 {
-	struct task_struct *task = proc_task(file->f_dentry->d_inode);
+	struct task_struct *task;
 	char buffer[PROC_NUMBUF], *end;
 	int oom_adjust;
 
@@ -845,7 +907,11 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 		return -EINVAL;
 	if (*end == '\n')
 		end++;
+	task = get_proc_task(file->f_dentry->d_inode);
+	if (!task)
+		return -ESRCH;
 	task->oomkilladj = oom_adjust;
+	put_task_struct(task);
 	if (end - buffer == 0)
 		return -EIO;
 	return end - buffer;
@@ -862,12 +928,15 @@ static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
 				  size_t count, loff_t *ppos)
 {
 	struct inode * inode = file->f_dentry->d_inode;
-	struct task_struct *task = proc_task(inode);
+	struct task_struct *task = get_proc_task(inode);
 	ssize_t length;
 	char tmpbuf[TMPBUFLEN];
 
+	if (!task)
+		return -ESRCH;
 	length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
 				audit_get_loginuid(task->audit_context));
+	put_task_struct(task);
 	return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
 }
 
@@ -877,13 +946,12 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
 	struct inode * inode = file->f_dentry->d_inode;
 	char *page, *tmp;
 	ssize_t length;
-	struct task_struct *task = proc_task(inode);
 	uid_t loginuid;
 
 	if (!capable(CAP_AUDIT_CONTROL))
 		return -EPERM;
 
-	if (current != task)
+	if (current != proc_tref(inode)->task)
 		return -EPERM;
 
 	if (count >= PAGE_SIZE)
@@ -907,7 +975,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
 		goto out_free_page;
 
 	}
-	length = audit_set_loginuid(task, loginuid);
+	length = audit_set_loginuid(current, loginuid);
 	if (likely(length == 0))
 		length = count;
 
@@ -926,13 +994,16 @@ static struct file_operations proc_loginuid_operations = {
 static ssize_t seccomp_read(struct file *file, char __user *buf,
 			    size_t count, loff_t *ppos)
 {
-	struct task_struct *tsk = proc_task(file->f_dentry->d_inode);
+	struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode);
 	char __buf[20];
 	loff_t __ppos = *ppos;
 	size_t len;
 
+	if (!tsk)
+		return -ESRCH;
 	/* no need to print the trailing zero, so use only len */
 	len = sprintf(__buf, "%u\n", tsk->seccomp.mode);
+	put_task_struct(tsk);
 	if (__ppos >= len)
 		return 0;
 	if (count > len - __ppos)
@@ -946,29 +1017,43 @@ static ssize_t seccomp_read(struct file *file, char __user *buf,
 static ssize_t seccomp_write(struct file *file, const char __user *buf,
 			     size_t count, loff_t *ppos)
 {
-	struct task_struct *tsk = proc_task(file->f_dentry->d_inode);
+	struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode);
 	char __buf[20], *end;
 	unsigned int seccomp_mode;
+	ssize_t result;
+
+	result = -ESRCH;
+	if (!tsk)
+		goto out_no_task;
 
 	/* can set it only once to be even more secure */
+	result = -EPERM;
 	if (unlikely(tsk->seccomp.mode))
-		return -EPERM;
+		goto out;
 
+	result = -EFAULT;
 	memset(__buf, 0, sizeof(__buf));
 	count = min(count, sizeof(__buf) - 1);
 	if (copy_from_user(__buf, buf, count))
-		return -EFAULT;
+		goto out;
+
 	seccomp_mode = simple_strtoul(__buf, &end, 0);
 	if (*end == '\n')
 		end++;
+	result = -EINVAL;
 	if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
 		tsk->seccomp.mode = seccomp_mode;
 		set_tsk_thread_flag(tsk, TIF_SECCOMP);
 	} else
-		return -EINVAL;
+		goto out;
+	result = -EIO;
 	if (unlikely(!(end - __buf)))
-		return -EIO;
-	return end - __buf;
+		goto out;
+	result = end - __buf;
+out:
+	put_task_struct(tsk);
+out_no_task:
+	return result;
 }
 
 static struct file_operations proc_seccomp_operations = {
@@ -995,7 +1080,7 @@ static int proc_check_dentry_visible(struct inode *inode,
 	/* See if the the two tasks share a commone set of
 	 * file descriptors.  If so everything is visible.
 	 */
-	task = proc_task(inode);
+	task = get_proc_task(inode);
 	if (!task)
 		goto out;
 	files = get_files_struct(current);
@@ -1006,6 +1091,7 @@ static int proc_check_dentry_visible(struct inode *inode,
 		put_files_struct(task_files);
 	if (files)
 		put_files_struct(files);
+	put_task_struct(task);
 	if (!error)
 		goto out;
 
@@ -1106,7 +1192,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
 {
 	struct dentry *dentry = filp->f_dentry;
 	struct inode *inode = dentry->d_inode;
-	struct task_struct *p = proc_task(inode);
+	struct task_struct *p = get_proc_task(inode);
 	unsigned int fd, tid, ino;
 	int retval;
 	char buf[PROC_NUMBUF];
@@ -1114,8 +1200,8 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
 	struct fdtable *fdt;
 
 	retval = -ENOENT;
-	if (!pid_alive(p))
-		goto out;
+	if (!p)
+		goto out_no_task;
 	retval = 0;
 	tid = p->pid;
 
@@ -1164,6 +1250,8 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
 			put_files_struct(files);
 	}
 out:
+	put_task_struct(p);
+out_no_task:
 	return retval;
 }
 
@@ -1175,16 +1263,18 @@ static int proc_pident_readdir(struct file *filp,
 	int pid;
 	struct dentry *dentry = filp->f_dentry;
 	struct inode *inode = dentry->d_inode;
+	struct task_struct *task = get_proc_task(inode);
 	struct pid_entry *p;
 	ino_t ino;
 	int ret;
 
 	ret = -ENOENT;
-	if (!pid_alive(proc_task(inode)))
+	if (!task)
 		goto out;
 
 	ret = 0;
-	pid = proc_task(inode)->pid;
+	pid = task->pid;
+	put_task_struct(task);
 	i = filp->f_pos;
 	switch (i) {
 	case 0:
@@ -1270,14 +1360,13 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	inode->i_ino = fake_ino(task->pid, ino);
 
-	if (!pid_alive(task))
-		goto out_unlock;
-
 	/*
 	 * grab the reference to task.
 	 */
-	get_task_struct(task);
-	ei->task = task;
+	ei->tref = tref_get_by_task(task);
+	if (!tref_task(ei->tref))
+		goto out_unlock;
+
 	inode->i_uid = 0;
 	inode->i_gid = 0;
 	if (task_dumpable(task)) {
@@ -1303,13 +1392,21 @@ out_unlock:
  *
  * Rewrite the inode's ownerships here because the owning task may have
  * performed a setuid(), etc.
+ *
+ * Before the /proc/pid/status file was created the only way to read
+ * the effective uid of a /process was to stat /proc/pid.  Reading
+ * /proc/pid/status is slow enough that procps and other packages
+ * kept stating /proc/pid.  To keep the rules in /proc simple I have
+ * made this apply to all per process world readable and executable
+ * directories.
  */
 static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *inode = dentry->d_inode;
-	struct task_struct *task = proc_task(inode);
-	if (pid_alive(task)) {
-		if (task_dumpable(task)) {
+	struct task_struct *task = get_proc_task(inode);
+	if (task) {
+		if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
+		    task_dumpable(task)) {
 			inode->i_uid = task->euid;
 			inode->i_gid = task->egid;
 		} else {
@@ -1317,37 +1414,63 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
 			inode->i_gid = 0;
 		}
 		security_task_to_inode(task, inode);
+		put_task_struct(task);
 		return 1;
 	}
 	d_drop(dentry);
 	return 0;
 }
 
+static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	struct task_struct *task;
+	generic_fillattr(inode, stat);
+
+	rcu_read_lock();
+	stat->uid = 0;
+	stat->gid = 0;
+	task = pid_task(proc_pid(inode), PIDTYPE_PID);
+	if (task) {
+		if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
+		    task_dumpable(task)) {
+			stat->uid = task->euid;
+			stat->gid = task->egid;
+		}
+	}
+	rcu_read_unlock();
+	return 0;
+}
+
 static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *inode = dentry->d_inode;
-	struct task_struct *task = proc_task(inode);
+	struct task_struct *task = get_proc_task(inode);
 	int fd = proc_fd(inode);
 	struct files_struct *files;
 
-	files = get_files_struct(task);
-	if (files) {
-		rcu_read_lock();
-		if (fcheck_files(files, fd)) {
+	if (task) {
+		files = get_files_struct(task);
+		if (files) {
+			rcu_read_lock();
+			if (fcheck_files(files, fd)) {
+				rcu_read_unlock();
+				put_files_struct(files);
+				if (task_dumpable(task)) {
+					inode->i_uid = task->euid;
+					inode->i_gid = task->egid;
+				} else {
+					inode->i_uid = 0;
+					inode->i_gid = 0;
+				}
+				security_task_to_inode(task, inode);
+				put_task_struct(task);
+				return 1;
+			}
 			rcu_read_unlock();
 			put_files_struct(files);
-			if (task_dumpable(task)) {
-				inode->i_uid = task->euid;
-				inode->i_gid = task->egid;
-			} else {
-				inode->i_uid = 0;
-				inode->i_gid = 0;
-			}
-			security_task_to_inode(task, inode);
-			return 1;
 		}
-		rcu_read_unlock();
-		put_files_struct(files);
+		put_task_struct(task);
 	}
 	d_drop(dentry);
 	return 0;
@@ -1359,7 +1482,7 @@ static int pid_delete_dentry(struct dentry * dentry)
 	 * If so, then don't put the dentry on the lru list,
 	 * kill it immediately.
 	 */
-	return !pid_alive(proc_task(dentry->d_inode));
+	return !proc_tref(dentry->d_inode)->task;
 }
 
 static struct dentry_operations tid_fd_dentry_operations =
@@ -1401,7 +1524,7 @@ out:
 /* SMP-safe */
 static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct nameidata *nd)
 {
-	struct task_struct *task = proc_task(dir);
+	struct task_struct *task = get_proc_task(dir);
 	unsigned fd = name_to_int(dentry);
 	struct dentry *result = ERR_PTR(-ENOENT);
 	struct file * file;
@@ -1409,10 +1532,10 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
 	struct inode *inode;
 	struct proc_inode *ei;
 
+	if (!task)
+		goto out_no_task;
 	if (fd == ~0U)
 		goto out;
-	if (!pid_alive(task))
-		goto out;
 
 	inode = proc_pid_make_inode(dir->i_sb, task, PROC_TID_FD_DIR+fd);
 	if (!inode)
@@ -1447,6 +1570,8 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
 	if (tid_fd_revalidate(dentry, NULL))
 		result = NULL;
 out:
+	put_task_struct(task);
+out_no_task:
 	return result;
 
 out_unlock2:
@@ -1490,12 +1615,17 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
 	struct inode * inode = file->f_dentry->d_inode;
 	unsigned long page;
 	ssize_t length;
-	struct task_struct *task = proc_task(inode);
+	struct task_struct *task = get_proc_task(inode);
+
+	length = -ESRCH;
+	if (!task)
+		goto out_no_task;
 
 	if (count > PAGE_SIZE)
 		count = PAGE_SIZE;
+	length = -ENOMEM;
 	if (!(page = __get_free_page(GFP_KERNEL)))
-		return -ENOMEM;
+		goto out;
 
 	length = security_getprocattr(task, 
 				      (char*)file->f_dentry->d_name.name, 
@@ -1503,6 +1633,9 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
 	if (length >= 0)
 		length = simple_read_from_buffer(buf, count, ppos, (char *)page, length);
 	free_page(page);
+out:
+	put_task_struct(task);
+out_no_task:
 	return length;
 }
 
@@ -1512,26 +1645,36 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
 	struct inode * inode = file->f_dentry->d_inode;
 	char *page; 
 	ssize_t length; 
-	struct task_struct *task = proc_task(inode); 
+	struct task_struct *task = get_proc_task(inode);
 
+	length = -ESRCH;
+	if (!task)
+		goto out_no_task;
 	if (count > PAGE_SIZE) 
 		count = PAGE_SIZE; 
-	if (*ppos != 0) {
-		/* No partial writes. */
-		return -EINVAL;
-	}
+
+	/* No partial writes. */
+	length = -EINVAL;
+	if (*ppos != 0)
+		goto out;
+
+	length = -ENOMEM;
 	page = (char*)__get_free_page(GFP_USER); 
 	if (!page) 
-		return -ENOMEM;
+		goto out;
+
 	length = -EFAULT; 
 	if (copy_from_user(page, buf, count)) 
-		goto out;
+		goto out_free;
 
 	length = security_setprocattr(task, 
 				      (char*)file->f_dentry->d_name.name, 
 				      (void*)page, count);
-out:
+out_free:
 	free_page((unsigned long) page);
+out:
+	put_task_struct(task);
+out_no_task:
 	return length;
 } 
 
@@ -1553,15 +1696,15 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
 {
 	struct inode *inode;
 	struct dentry *error;
-	struct task_struct *task = proc_task(dir);
+	struct task_struct *task = get_proc_task(dir);
 	struct pid_entry *p;
 	struct proc_inode *ei;
 
 	error = ERR_PTR(-ENOENT);
 	inode = NULL;
 
-	if (!pid_alive(task))
-		goto out;
+	if (!task)
+		goto out_no_task;
 
 	for (p = ents; p->name; p++) {
 		if (p->len != dentry->d_name.len)
@@ -1748,6 +1891,8 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
 	if (pid_revalidate(dentry, NULL))
 		error = NULL;
 out:
+	put_task_struct(task);
+out_no_task:
 	return error;
 }
 
@@ -1771,10 +1916,12 @@ static struct file_operations proc_tid_base_operations = {
 
 static struct inode_operations proc_tgid_base_inode_operations = {
 	.lookup		= proc_tgid_base_lookup,
+	.getattr	= pid_getattr,
 };
 
 static struct inode_operations proc_tid_base_inode_operations = {
 	.lookup		= proc_tid_base_lookup,
+	.getattr	= pid_getattr,
 };
 
 #ifdef CONFIG_SECURITY
@@ -1816,10 +1963,12 @@ static struct dentry *proc_tid_attr_lookup(struct inode *dir,
 
 static struct inode_operations proc_tgid_attr_inode_operations = {
 	.lookup		= proc_tgid_attr_lookup,
+	.getattr	= pid_getattr,
 };
 
 static struct inode_operations proc_tid_attr_inode_operations = {
 	.lookup		= proc_tid_attr_lookup,
+	.getattr	= pid_getattr,
 };
 #endif
 
@@ -1981,10 +2130,13 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry
 {
 	struct dentry *result = ERR_PTR(-ENOENT);
 	struct task_struct *task;
-	struct task_struct *leader = proc_task(dir);
+	struct task_struct *leader = get_proc_task(dir);
 	struct inode *inode;
 	unsigned tid;
 
+	if (!leader)
+		goto out_no_task;
+
 	tid = name_to_int(dentry);
 	if (tid == ~0U)
 		goto out;
@@ -2024,6 +2176,8 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry
 out_drop_task:
 	put_task_struct(task);
 out:
+	put_task_struct(leader);
+out_no_task:
 	return result;
 }
 
@@ -2163,12 +2317,7 @@ static struct task_struct *first_tid(struct task_struct *leader, int tid, int nr
 
 	/* If nr exceeds the number of threads there is nothing todo */
 	if (nr) {
-		int threads = 0;
-		task_lock(leader);
-		if (leader->signal)
-			threads = atomic_read(&leader->signal->count);
-		task_unlock(leader);
-		if (nr >= threads)
+		if (nr >= get_nr_threads(leader))
 			goto done;
 	}
 
@@ -2218,15 +2367,15 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
 	char buf[PROC_NUMBUF];
 	struct dentry *dentry = filp->f_dentry;
 	struct inode *inode = dentry->d_inode;
-	struct task_struct *leader = proc_task(inode);
+	struct task_struct *leader = get_proc_task(inode);
 	struct task_struct *task;
 	int retval = -ENOENT;
 	ino_t ino;
 	int tid;
 	unsigned long pos = filp->f_pos;  /* avoiding "long long" filp->f_pos */
 
-	if (!pid_alive(leader))
-		goto out;
+	if (!leader)
+		goto out_no_task;
 	retval = 0;
 
 	switch (pos) {
@@ -2266,20 +2415,22 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
 	}
 out:
 	filp->f_pos = pos;
+	put_task_struct(leader);
+out_no_task:
 	return retval;
 }
 
 static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 {
 	struct inode *inode = dentry->d_inode;
-	struct task_struct *p = proc_task(inode);
+	struct task_struct *p = get_proc_task(inode);
 	generic_fillattr(inode, stat);
 
-	if (pid_alive(p)) {
-		task_lock(p);
-		if (p->signal)
-			stat->nlink += atomic_read(&p->signal->count);
-		task_unlock(p);
+	if (p) {
+		rcu_read_lock();
+		stat->nlink += get_nr_threads(p);
+		rcu_read_unlock();
+		put_task_struct(p);
 	}
 
 	return 0;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index fbc94df138a..31e0475c6cb 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -58,14 +58,11 @@ static void de_put(struct proc_dir_entry *de)
 static void proc_delete_inode(struct inode *inode)
 {
 	struct proc_dir_entry *de;
-	struct task_struct *tsk;
 
 	truncate_inode_pages(&inode->i_data, 0);
 
-	/* Let go of any associated process */
-	tsk = PROC_I(inode)->task;
-	if (tsk)
-		put_task_struct(tsk);
+	/* Stop tracking associated processes */
+	tref_put(PROC_I(inode)->tref);
 
 	/* Let go of any associated proc directory entry */
 	de = PROC_I(inode)->pde;
@@ -94,7 +91,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
 	ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, SLAB_KERNEL);
 	if (!ei)
 		return NULL;
-	ei->task = NULL;
+	ei->tref = NULL;
 	ei->fd = 0;
 	ei->op.proc_get_link = NULL;
 	ei->pde = NULL;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 548e7447ea4..37f1648adc2 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -10,6 +10,7 @@
  */
 
 #include <linux/proc_fs.h>
+#include <linux/task_ref.h>
 
 struct vmalloc_info {
 	unsigned long	used;
@@ -41,13 +42,23 @@ extern struct file_operations proc_maps_operations;
 extern struct file_operations proc_numa_maps_operations;
 extern struct file_operations proc_smaps_operations;
 
+extern struct file_operations proc_maps_operations;
+extern struct file_operations proc_numa_maps_operations;
+extern struct file_operations proc_smaps_operations;
+
+
 void free_proc_entry(struct proc_dir_entry *de);
 
 int proc_init_inodecache(void);
 
-static inline struct task_struct *proc_task(struct inode *inode)
+static inline struct task_ref *proc_tref(struct inode *inode)
+{
+	return PROC_I(inode)->tref;
+}
+
+static inline struct task_struct *get_proc_task(struct inode *inode)
 {
-	return PROC_I(inode)->task;
+	return get_tref_task(proc_tref(inode));
 }
 
 static inline int proc_fd(struct inode *inode)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 4187b4e9cdb..abf3208c3f6 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -75,9 +75,13 @@ int proc_exe_link(struct inode *inode, struct dentry **dentry, struct vfsmount *
 {
 	struct vm_area_struct * vma;
 	int result = -ENOENT;
-	struct task_struct *task = proc_task(inode);
-	struct mm_struct * mm = get_task_mm(task);
+	struct task_struct *task = get_proc_task(inode);
+	struct mm_struct * mm = NULL;
 
+	if (task) {
+		mm = get_task_mm(task);
+		put_task_struct(task);
+	}
 	if (!mm)
 		goto out;
 	down_read(&mm->mmap_sem);
@@ -120,7 +124,8 @@ struct mem_size_stats
 
 static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss)
 {
-	struct task_struct *task = m->private;
+	struct proc_maps_private *priv = m->private;
+	struct task_struct *task = priv->task;
 	struct vm_area_struct *vma = v;
 	struct mm_struct *mm = vma->vm_mm;
 	struct file *file = vma->vm_file;
@@ -295,12 +300,16 @@ static int show_smap(struct seq_file *m, void *v)
 
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
-	struct task_struct *task = m->private;
+	struct proc_maps_private *priv = m->private;
 	unsigned long last_addr = m->version;
 	struct mm_struct *mm;
-	struct vm_area_struct *vma, *tail_vma;
+	struct vm_area_struct *vma, *tail_vma = NULL;
 	loff_t l = *pos;
 
+	/* Clear the per syscall fields in priv */
+	priv->task = NULL;
+	priv->tail_vma = NULL;
+
 	/*
 	 * We remember last_addr rather than next_addr to hit with
 	 * mmap_cache most of the time. We have zero last_addr at
@@ -311,11 +320,15 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 	if (last_addr == -1UL)
 		return NULL;
 
-	mm = get_task_mm(task);
+	priv->task = get_tref_task(priv->tref);
+	if (!priv->task)
+		return NULL;
+
+	mm = get_task_mm(priv->task);
 	if (!mm)
 		return NULL;
 
-	tail_vma = get_gate_vma(task);
+	priv->tail_vma = tail_vma = get_gate_vma(priv->task);
 	down_read(&mm->mmap_sem);
 
 	/* Start with last addr hint */
@@ -350,11 +363,9 @@ out:
 	return tail_vma;
 }
 
-static void m_stop(struct seq_file *m, void *v)
+static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma)
 {
-	struct task_struct *task = m->private;
-	struct vm_area_struct *vma = v;
-	if (vma && vma != get_gate_vma(task)) {
+	if (vma && vma != priv->tail_vma) {
 		struct mm_struct *mm = vma->vm_mm;
 		up_read(&mm->mmap_sem);
 		mmput(mm);
@@ -363,17 +374,27 @@ static void m_stop(struct seq_file *m, void *v)
 
 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	struct task_struct *task = m->private;
+	struct proc_maps_private *priv = m->private;
 	struct vm_area_struct *vma = v;
-	struct vm_area_struct *tail_vma = get_gate_vma(task);
+	struct vm_area_struct *tail_vma = priv->tail_vma;
 
 	(*pos)++;
 	if (vma && (vma != tail_vma) && vma->vm_next)
 		return vma->vm_next;
-	m_stop(m, v);
+	vma_stop(priv, vma);
 	return (vma != tail_vma)? tail_vma: NULL;
 }
 
+static void m_stop(struct seq_file *m, void *v)
+{
+	struct proc_maps_private *priv = m->private;
+	struct vm_area_struct *vma = v;
+
+	vma_stop(priv, vma);
+	if (priv->task)
+		put_task_struct(priv->task);
+}
+
 static struct seq_operations proc_pid_maps_op = {
 	.start	= m_start,
 	.next	= m_next,
@@ -391,11 +412,18 @@ static struct seq_operations proc_pid_smaps_op = {
 static int do_maps_open(struct inode *inode, struct file *file,
 			struct seq_operations *ops)
 {
-	struct task_struct *task = proc_task(inode);
-	int ret = seq_open(file, ops);
-	if (!ret) {
-		struct seq_file *m = file->private_data;
-		m->private = task;
+	struct proc_maps_private *priv;
+	int ret = -ENOMEM;
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (priv) {
+		priv->tref = proc_tref(inode);
+		ret = seq_open(file, ops);
+		if (!ret) {
+			struct seq_file *m = file->private_data;
+			m->private = priv;
+		} else {
+			kfree(priv);
+		}
 	}
 	return ret;
 }
@@ -409,7 +437,7 @@ struct file_operations proc_maps_operations = {
 	.open		= maps_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= seq_release,
+	.release	= seq_release_private,
 };
 
 #ifdef CONFIG_NUMA
@@ -431,7 +459,7 @@ struct file_operations proc_numa_maps_operations = {
 	.open		= numa_maps_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= seq_release,
+	.release	= seq_release_private,
 };
 #endif
 
@@ -444,5 +472,5 @@ struct file_operations proc_smaps_operations = {
 	.open		= smaps_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= seq_release,
+	.release	= seq_release_private,
 };
-- 
cgit v1.2.3


From 13b41b09491e5d75e8027dca1ee78f5e073bc4c0 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 26 Jun 2006 00:25:56 -0700
Subject: [PATCH] proc: Use struct pid not struct task_ref

Incrementally update my proc-dont-lock-task_structs-indefinitely patches so
that they work with struct pid instead of struct task_ref.

Mostly this is a straight 1-1 substitution.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c     | 8 ++++----
 fs/proc/inode.c    | 4 ++--
 fs/proc/internal.h | 7 +++----
 fs/proc/task_mmu.c | 4 ++--
 4 files changed, 11 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 489810abc72..c7f85544157 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -951,7 +951,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
 	if (!capable(CAP_AUDIT_CONTROL))
 		return -EPERM;
 
-	if (current != proc_tref(inode)->task)
+	if (current != pid_task(proc_pid(inode), PIDTYPE_PID))
 		return -EPERM;
 
 	if (count >= PAGE_SIZE)
@@ -1363,8 +1363,8 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
 	/*
 	 * grab the reference to task.
 	 */
-	ei->tref = tref_get_by_task(task);
-	if (!tref_task(ei->tref))
+	ei->pid = get_pid(task->pids[PIDTYPE_PID].pid);
+	if (!ei->pid)
 		goto out_unlock;
 
 	inode->i_uid = 0;
@@ -1482,7 +1482,7 @@ static int pid_delete_dentry(struct dentry * dentry)
 	 * If so, then don't put the dentry on the lru list,
 	 * kill it immediately.
 	 */
-	return !proc_tref(dentry->d_inode)->task;
+	return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
 }
 
 static struct dentry_operations tid_fd_dentry_operations =
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 31e0475c6cb..6dcef089e18 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -62,7 +62,7 @@ static void proc_delete_inode(struct inode *inode)
 	truncate_inode_pages(&inode->i_data, 0);
 
 	/* Stop tracking associated processes */
-	tref_put(PROC_I(inode)->tref);
+	put_pid(PROC_I(inode)->pid);
 
 	/* Let go of any associated proc directory entry */
 	de = PROC_I(inode)->pde;
@@ -91,7 +91,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
 	ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, SLAB_KERNEL);
 	if (!ei)
 		return NULL;
-	ei->tref = NULL;
+	ei->pid = NULL;
 	ei->fd = 0;
 	ei->op.proc_get_link = NULL;
 	ei->pde = NULL;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 37f1648adc2..146a434ba94 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -10,7 +10,6 @@
  */
 
 #include <linux/proc_fs.h>
-#include <linux/task_ref.h>
 
 struct vmalloc_info {
 	unsigned long	used;
@@ -51,14 +50,14 @@ void free_proc_entry(struct proc_dir_entry *de);
 
 int proc_init_inodecache(void);
 
-static inline struct task_ref *proc_tref(struct inode *inode)
+static inline struct pid *proc_pid(struct inode *inode)
 {
-	return PROC_I(inode)->tref;
+	return PROC_I(inode)->pid;
 }
 
 static inline struct task_struct *get_proc_task(struct inode *inode)
 {
-	return get_tref_task(proc_tref(inode));
+	return get_pid_task(proc_pid(inode), PIDTYPE_PID);
 }
 
 static inline int proc_fd(struct inode *inode)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index abf3208c3f6..0137ec4c136 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -320,7 +320,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 	if (last_addr == -1UL)
 		return NULL;
 
-	priv->task = get_tref_task(priv->tref);
+	priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
 	if (!priv->task)
 		return NULL;
 
@@ -416,7 +416,7 @@ static int do_maps_open(struct inode *inode, struct file *file,
 	int ret = -ENOMEM;
 	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
 	if (priv) {
-		priv->tref = proc_tref(inode);
+		priv->pid = proc_pid(inode);
 		ret = seq_open(file, ops);
 		if (!ret) {
 			struct seq_file *m = file->private_data;
-- 
cgit v1.2.3


From 5b0c1dd38b66e2dd0cf655aa845e341b50b93ddd Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 26 Jun 2006 00:25:57 -0700
Subject: [PATCH] proc: optimize proc_check_dentry_visible

The code doesn't need to sleep to when making this check so I can just do the
comparison and not worry about the reference counts.

TODO: While looking at this I realized that my original cleanup did not push
the permission check far enough down into the stack.  The call of
proc_check_dentry_visible needs to move out of the generic proc
readlink/follow link code and into the individual get_link instances.
Otherwise the shared resources checks are not quite correct (shared
files_struct does not require a shared fs_struct), and there are races with
unshare.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index c7f85544157..f0db7f616ac 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1074,24 +1074,27 @@ static int proc_check_dentry_visible(struct inode *inode,
 	 * namespace, or are simply process local (like pipes).
 	 */
 	struct task_struct *task;
-	struct files_struct *task_files, *files;
 	int error = -EACCES;
 
 	/* See if the the two tasks share a commone set of
 	 * file descriptors.  If so everything is visible.
 	 */
-	task = get_proc_task(inode);
-	if (!task)
-		goto out;
-	files = get_files_struct(current);
-	task_files = get_files_struct(task);
-	if (files && task_files && (files == task_files))
-		error = 0;
-	if (task_files)
-		put_files_struct(task_files);
-	if (files)
-		put_files_struct(files);
-	put_task_struct(task);
+	rcu_read_lock();
+	task = tref_task(proc_tref(inode));
+	if (task) {
+		struct files_struct *task_files, *files;
+		/* This test answeres the question:
+		 * Is there a point in time since we looked up the
+		 * file descriptor where the two tasks share the
+		 * same files struct?
+		 */
+		rmb();
+		files = current->files;
+		task_files = task->files;
+		if (files && (files == task_files))
+			error = 0;
+	}
+	rcu_read_unlock();
 	if (!error)
 		goto out;
 
-- 
cgit v1.2.3


From 778c1144771f0064b6f51bee865cceb0d996f2f9 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 26 Jun 2006 00:25:58 -0700
Subject: [PATCH] proc: Use sane permission checks on the /proc/<pid>/fd/
 symlinks

Since 2.2 we have been doing a chroot check to see if it is appropriate to
return a read or follow one of these magic symlinks.  The chroot check was
asking a question about the visibility of files to the calling process and
it was actually checking the destination process, and not the files
themselves.  That test was clearly bogus.

In my first pass through I simply fixed the test to check the visibility of
the files themselves.  That naive approach to fixing the permissions was
too strict and resulted in cases where a task could not even see all of
it's file descriptors.

What has disturbed me about relaxing this check is that file descriptors
are per-process private things, and they are occasionaly used a user space
capability tokens.  Looking a little farther into the symlink path on /proc
I did find userid checks and a check for capability (CAP_DAC_OVERRIDE) so
there were permissions checking this.

But I was still concerned about privacy.  Besides /proc there is only one
other way to find out this kind of information, and that is ptrace.  ptrace
has been around for a long time and it has a well established security
model.

So after thinking about it I finally realized that the permission checks
that make sense are the permission checks applied to ptrace_attach.  The
checks are simple per process, and won't cause nasty surprises for people
coming from less capable unices.

Unfortunately there is one case that the current ptrace_attach test does
not cover: Zombies and kernel threads.  Single stepping those kinds of
processes is impossible.  Being able to see which file descriptors are open
on these tasks is important to lsof, fuser and friends.  So for these
special processes I made the rule you can't find out unless you have
CAP_SYS_PTRACE.

These proc permission checks should now conform to the principle of least
surprise.  As well as using much less code to implement :)

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c | 124 ++++++++++++++-------------------------------------------
 1 file changed, 29 insertions(+), 95 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index f0db7f616ac..f38da6bda26 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -532,42 +532,34 @@ static int proc_oom_score(struct task_struct *task, char *buffer)
 /************************************************************************/
 
 /* permission checks */
-
-/* If the process being read is separated by chroot from the reading process,
- * don't let the reader access the threads.
- */
-static int proc_check_chroot(struct dentry *de, struct vfsmount *mnt)
+static int proc_fd_access_allowed(struct inode *inode)
 {
-	struct dentry *base;
-	struct vfsmount *our_vfsmnt;
-	int res = 0;
-
-	read_lock(&current->fs->lock);
-	our_vfsmnt = mntget(current->fs->rootmnt);
-	base = dget(current->fs->root);
-	read_unlock(&current->fs->lock);
-
-	spin_lock(&vfsmount_lock);
+	struct task_struct *task;
+	int allowed = 0;
+	/* Allow access to a task's file descriptors if either we may
+	 * use ptrace attach to the process and find out that
+	 * information, or if the task cannot possibly be ptraced
+	 * allow access if we have the proper capability.
+	 */
+	task = get_proc_task(inode);
+	if (task == current)
+		allowed = 1;
+	if (task && !allowed) {
+		int alive;
 
-	while (mnt != our_vfsmnt) {
-		if (mnt == mnt->mnt_parent)
-			goto out;
-		de = mnt->mnt_mountpoint;
-		mnt = mnt->mnt_parent;
+		task_lock(task);
+		alive = !!task->mm;
+		task_unlock(task);
+		if (alive)
+			/* For a living task obey ptrace_may_attach */
+			allowed = ptrace_may_attach(task);
+		else
+			/* For a special task simply check the capability */
+			allowed = capable(CAP_SYS_PTRACE);
 	}
-
-	if (!is_subdir(de, base))
-		goto out;
-	spin_unlock(&vfsmount_lock);
-
-exit:
-	dput(base);
-	mntput(our_vfsmnt);
-	return res;
-out:
-	spin_unlock(&vfsmount_lock);
-	res = -EACCES;
-	goto exit;
+	if (task)
+		put_task_struct(task);
+	return allowed;
 }
 
 extern struct seq_operations mounts_op;
@@ -1062,52 +1054,6 @@ static struct file_operations proc_seccomp_operations = {
 };
 #endif /* CONFIG_SECCOMP */
 
-static int proc_check_dentry_visible(struct inode *inode,
-	struct dentry *dentry, struct vfsmount *mnt)
-{
-	/* Verify that the current process can already see the
-	 * file pointed at by the file descriptor.
-	 * This prevents /proc from being an accidental information leak.
-	 *
-	 * This prevents access to files that are not visible do to
-	 * being on the otherside of a chroot, in a different
-	 * namespace, or are simply process local (like pipes).
-	 */
-	struct task_struct *task;
-	int error = -EACCES;
-
-	/* See if the the two tasks share a commone set of
-	 * file descriptors.  If so everything is visible.
-	 */
-	rcu_read_lock();
-	task = tref_task(proc_tref(inode));
-	if (task) {
-		struct files_struct *task_files, *files;
-		/* This test answeres the question:
-		 * Is there a point in time since we looked up the
-		 * file descriptor where the two tasks share the
-		 * same files struct?
-		 */
-		rmb();
-		files = current->files;
-		task_files = task->files;
-		if (files && (files == task_files))
-			error = 0;
-	}
-	rcu_read_unlock();
-	if (!error)
-		goto out;
-
-	/* If the two tasks don't share a common set of file
-	 * descriptors see if the destination dentry is already
-	 * visible in the current tasks filesystem namespace.
-	 */
-	error = proc_check_chroot(dentry, mnt);
-out:
-	return error;
-
-}
-
 static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *inode = dentry->d_inode;
@@ -1116,18 +1062,12 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
 	/* We don't need a base pointer in the /proc filesystem */
 	path_release(nd);
 
-	if (current->fsuid != inode->i_uid && !capable(CAP_DAC_OVERRIDE))
+	/* Are we allowed to snoop on the tasks file descriptors? */
+	if (!proc_fd_access_allowed(inode))
 		goto out;
 
 	error = PROC_I(inode)->op.proc_get_link(inode, &nd->dentry, &nd->mnt);
 	nd->last_type = LAST_BIND;
-	if (error)
-		goto out;
-
-	/* Only return files this task can already see */
-	error = proc_check_dentry_visible(inode, nd->dentry, nd->mnt);
-	if (error)
-		path_release(nd);
 out:
 	return ERR_PTR(error);
 }
@@ -1165,21 +1105,15 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b
 	struct dentry *de;
 	struct vfsmount *mnt = NULL;
 
-
-	if (current->fsuid != inode->i_uid && !capable(CAP_DAC_OVERRIDE))
+	/* Are we allowed to snoop on the tasks file descriptors? */
+	if (!proc_fd_access_allowed(inode))
 		goto out;
 
 	error = PROC_I(inode)->op.proc_get_link(inode, &de, &mnt);
 	if (error)
 		goto out;
 
-	/* Only return files this task can already see */
-	error = proc_check_dentry_visible(inode, de, mnt);
-	if (error)
-		goto out_put;
-
 	error = do_proc_readlink(de, mnt, buffer, buflen);
-out_put:
 	dput(de);
 	mntput(mnt);
 out:
-- 
cgit v1.2.3


From df26c40e567356caeefe2861311e19c54444d917 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 26 Jun 2006 00:25:59 -0700
Subject: [PATCH] proc: Cleanup proc_fd_access_allowed

In process of getting proc_fd_access_allowed to work it has developed a few
warts.  In particular the special case that always allows introspection and
the special case to allow inspection of kernel threads.

The special case for introspection is needed for /proc/self/mem.

The special case for kernel threads really should be overridable
by security modules.

So consolidate these checks into ptrace.c:may_attach().

The check to always allow introspection is trivial.

The check to allow access to kernel threads, and zombies is a little
trickier.  mem_read and mem_write already verify an mm exists so it isn't
needed twice.  proc_fd_access_allowed only doesn't want a check to verify
task->mm exits, s it prevents all access to kernel threads.  So just move
the task->mm check into ptrace_attach where it is needed for practical
reasons.

I did a quick audit and none of the security modules in the kernel seem to
care if they are passed a task without an mm into security_ptrace.  So the
above move should be safe and it allows security modules to come up with
more restrictive policy.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: Chris Wright <chrisw@sous-sol.org>
Cc: James Morris <jmorris@namei.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c | 26 ++++++--------------------
 1 file changed, 6 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index f38da6bda26..773469703c6 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -536,29 +536,15 @@ static int proc_fd_access_allowed(struct inode *inode)
 {
 	struct task_struct *task;
 	int allowed = 0;
-	/* Allow access to a task's file descriptors if either we may
-	 * use ptrace attach to the process and find out that
-	 * information, or if the task cannot possibly be ptraced
-	 * allow access if we have the proper capability.
+	/* Allow access to a task's file descriptors if it is us or we
+	 * may use ptrace attach to the process and find out that
+	 * information.
 	 */
 	task = get_proc_task(inode);
-	if (task == current)
-		allowed = 1;
-	if (task && !allowed) {
-		int alive;
-
-		task_lock(task);
-		alive = !!task->mm;
-		task_unlock(task);
-		if (alive)
-			/* For a living task obey ptrace_may_attach */
-			allowed = ptrace_may_attach(task);
-		else
-			/* For a special task simply check the capability */
-			allowed = capable(CAP_SYS_PTRACE);
-	}
-	if (task)
+	if (task) {
+		allowed = ptrace_may_attach(task);
 		put_task_struct(task);
+	}
 	return allowed;
 }
 
-- 
cgit v1.2.3


From cc288738c9ae3c64d3c50b86604044d1f6d22941 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 26 Jun 2006 00:26:01 -0700
Subject: [PATCH] proc: Remove tasklist_lock from proc_task_readdir.

This is just like my previous removal of tasklist_lock from first_tgid, and
next_tgid.  It simply had to wait until it was rcu safe to walk the thread
list.

This should be the last instance of the tasklist_lock in proc.  So user
processes should not be able to influence the tasklist lock hold times.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 773469703c6..6092a6e2c5a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2224,11 +2224,12 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
  * In the case of a seek we start with the leader and walk nr
  * threads past it.
  */
-static struct task_struct *first_tid(struct task_struct *leader, int tid, int nr)
+static struct task_struct *first_tid(struct task_struct *leader,
+					int tid, int nr)
 {
 	struct task_struct *pos = NULL;
-	read_lock(&tasklist_lock);
 
+	rcu_read_lock();
 	/* Attempt to start with the pid of a thread */
 	if (tid && (nr > 0)) {
 		pos = find_task_by_pid(tid);
@@ -2258,7 +2259,7 @@ static struct task_struct *first_tid(struct task_struct *leader, int tid, int nr
 	}
 	pos = NULL;
 done:
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 	return pos;
 }
 
@@ -2271,7 +2272,7 @@ done:
 static struct task_struct *next_tid(struct task_struct *start)
 {
 	struct task_struct *pos;
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	pos = start;
 	if (pid_alive(start))
 		pos = next_thread(start);
@@ -2279,7 +2280,7 @@ static struct task_struct *next_tid(struct task_struct *start)
 		get_task_struct(pos);
 	else
 		pos = NULL;
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 	put_task_struct(start);
 	return pos;
 }
-- 
cgit v1.2.3


From a872ff0cb2218dc9688b990c5ccda064dc40946b Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 26 Jun 2006 00:26:01 -0700
Subject: [PATCH] simplify/fix first_tid()

first_tid:

	/* If nr exceeds the number of threads there is nothing todo */
	if (nr) {
		if (nr >= get_nr_threads(leader))
			goto done;
	}

This is not reliable: sub-threads can exit after this check, so the
'for' loop below can overlap and proc_task_readdir() can return an
already filldir'ed dirents.

	for (; pos && pid_alive(pos); pos = next_thread(pos)) {
		if (--nr > 0)
			continue;

Off-by-one error, will return 'leader' when nr == 1.

This patch tries to fix these problems and simplify the code.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c | 38 +++++++++++++++++---------------------
 1 file changed, 17 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6092a6e2c5a..5ee46d3a5ca 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2227,38 +2227,34 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
 static struct task_struct *first_tid(struct task_struct *leader,
 					int tid, int nr)
 {
-	struct task_struct *pos = NULL;
+	struct task_struct *pos;
 
 	rcu_read_lock();
 	/* Attempt to start with the pid of a thread */
 	if (tid && (nr > 0)) {
 		pos = find_task_by_pid(tid);
-		if (pos && (pos->group_leader != leader))
-			pos = NULL;
-		if (pos)
-			nr = 0;
+		if (pos && (pos->group_leader == leader))
+			goto found;
 	}
 
 	/* If nr exceeds the number of threads there is nothing todo */
-	if (nr) {
-		if (nr >= get_nr_threads(leader))
-			goto done;
-	}
+	pos = NULL;
+	if (nr && nr >= get_nr_threads(leader))
+		goto out;
 
-	/* If we haven't found our starting place yet start with the
-	 * leader and walk nr threads forward.
+	/* If we haven't found our starting place yet start
+	 * with the leader and walk nr threads forward.
 	 */
-	if (!pos && (nr >= 0))
-		pos = leader;
-
-	for (; pos && pid_alive(pos); pos = next_thread(pos)) {
-		if (--nr > 0)
-			continue;
-		get_task_struct(pos);
-		goto done;
+	for (pos = leader; nr > 0; --nr) {
+		pos = next_thread(pos);
+		if (pos == leader) {
+			pos = NULL;
+			goto out;
+		}
 	}
-	pos = NULL;
-done:
+found:
+	get_task_struct(pos);
+out:
 	rcu_read_unlock();
 	return pos;
 }
-- 
cgit v1.2.3


From c1df7fb88a011b39ea722ac00975c5b8a803261b Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 26 Jun 2006 00:26:02 -0700
Subject: [PATCH] cleanup next_tid()

Try to make next_tid() a bit more readable and deletes unnecessary
"pid_alive(pos)" check.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 5ee46d3a5ca..43871c85729 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2267,15 +2267,15 @@ out:
  */
 static struct task_struct *next_tid(struct task_struct *start)
 {
-	struct task_struct *pos;
+	struct task_struct *pos = NULL;
 	rcu_read_lock();
-	pos = start;
-	if (pid_alive(start))
+	if (pid_alive(start)) {
 		pos = next_thread(start);
-	if (pid_alive(pos) && (pos != start->group_leader))
-		get_task_struct(pos);
-	else
-		pos = NULL;
+		if (thread_group_leader(pos))
+			pos = NULL;
+		else
+			get_task_struct(pos);
+	}
 	rcu_read_unlock();
 	put_task_struct(start);
 	return pos;
-- 
cgit v1.2.3


From 42c3e03ef6b298813557cdb997bd6db619cd65a2 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 26 Jun 2006 00:26:03 -0700
Subject: [PATCH] SELinux: Add sockcreate node to procattr API

Below is a patch to add a new /proc/self/attr/sockcreate A process may write a
context into this interface and all subsequent sockets created will be labeled
with that context.  This is the same idea as the fscreate interface where a
process can specify the label of a file about to be created.  At this time one
envisioned user of this will be xinetd.  It will be able to better label
sockets for the actual services.  At this time all sockets take the label of
the creating process, so all xinitd sockets would just be labeled the same.

I tested this by creating a tcp sender and listener.  The sender was able to
write to this new proc file and then create sockets with the specified label.
I am able to be sure the new label was used since the avc denial messages
kicked out by the kernel included both the new security permission
setsockcreate and all the socket denials were for the new label, not the label
of the running process.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
Cc: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/base.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 43871c85729..6ba7785319d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -132,6 +132,7 @@ enum pid_directory_inos {
 	PROC_TGID_ATTR_EXEC,
 	PROC_TGID_ATTR_FSCREATE,
 	PROC_TGID_ATTR_KEYCREATE,
+	PROC_TGID_ATTR_SOCKCREATE,
 #endif
 #ifdef CONFIG_AUDITSYSCALL
 	PROC_TGID_LOGINUID,
@@ -174,6 +175,7 @@ enum pid_directory_inos {
 	PROC_TID_ATTR_EXEC,
 	PROC_TID_ATTR_FSCREATE,
 	PROC_TID_ATTR_KEYCREATE,
+	PROC_TID_ATTR_SOCKCREATE,
 #endif
 #ifdef CONFIG_AUDITSYSCALL
 	PROC_TID_LOGINUID,
@@ -291,6 +293,7 @@ static struct pid_entry tgid_attr_stuff[] = {
 	E(PROC_TGID_ATTR_EXEC,     "exec",     S_IFREG|S_IRUGO|S_IWUGO),
 	E(PROC_TGID_ATTR_FSCREATE, "fscreate", S_IFREG|S_IRUGO|S_IWUGO),
 	E(PROC_TGID_ATTR_KEYCREATE, "keycreate", S_IFREG|S_IRUGO|S_IWUGO),
+	E(PROC_TGID_ATTR_SOCKCREATE, "sockcreate", S_IFREG|S_IRUGO|S_IWUGO),
 	{0,0,NULL,0}
 };
 static struct pid_entry tid_attr_stuff[] = {
@@ -299,6 +302,7 @@ static struct pid_entry tid_attr_stuff[] = {
 	E(PROC_TID_ATTR_EXEC,      "exec",     S_IFREG|S_IRUGO|S_IWUGO),
 	E(PROC_TID_ATTR_FSCREATE,  "fscreate", S_IFREG|S_IRUGO|S_IWUGO),
 	E(PROC_TID_ATTR_KEYCREATE, "keycreate", S_IFREG|S_IRUGO|S_IWUGO),
+	E(PROC_TID_ATTR_SOCKCREATE, "sockcreate", S_IFREG|S_IRUGO|S_IWUGO),
 	{0,0,NULL,0}
 };
 #endif
@@ -1764,6 +1768,8 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
 		case PROC_TGID_ATTR_FSCREATE:
 		case PROC_TID_ATTR_KEYCREATE:
 		case PROC_TGID_ATTR_KEYCREATE:
+		case PROC_TID_ATTR_SOCKCREATE:
+		case PROC_TGID_ATTR_SOCKCREATE:
 			inode->i_fop = &proc_pid_attr_operations;
 			break;
 #endif
-- 
cgit v1.2.3


From 2ceb8693ef63ae3d154ce1a05d275f2bb20a5e4c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 26 Jun 2006 00:26:04 -0700
Subject: [PATCH] de_thread: fix lockless do_each_thread

We should keep the value of old_leader->tasks.next in de_thread, otherwise
we can't do for_each_process/do_each_thread without tasklist_lock held.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 8c5196087f3..fffea1eef8d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -707,7 +707,7 @@ static int de_thread(struct task_struct *tsk)
 		attach_pid(current, PIDTYPE_PID,  current->pid);
 		attach_pid(current, PIDTYPE_PGID, current->signal->pgrp);
 		attach_pid(current, PIDTYPE_SID,  current->signal->session);
-		list_add_tail_rcu(&current->tasks, &init_task.tasks);
+		list_replace_rcu(&leader->tasks, &current->tasks);
 
 		current->group_leader = current;
 		leader->group_leader = current;
@@ -715,7 +715,6 @@ static int de_thread(struct task_struct *tsk)
 		/* Reduce leader to a thread */
 		detach_pid(leader, PIDTYPE_PGID);
 		detach_pid(leader, PIDTYPE_SID);
-		list_del_init(&leader->tasks);
 
 		current->exit_signal = SIGCHLD;
 
-- 
cgit v1.2.3


From aceecc041217b35df753d1ed6e25bd17c0c558d8 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 26 Jun 2006 00:26:05 -0700
Subject: [PATCH] coredump: optimize ->mm users traversal

zap_threads() iterates over all threads to find those ones which share
current->mm.  All threads in the thread group share the same ->mm, so we can
skip entire thread group if it has another ->mm.

This patch shifts the killing of thread group into the newly added
zap_process() function.  This looks as unnecessary complication, but it is
used in further patches.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c | 36 ++++++++++++++++++++++++++----------
 1 file changed, 26 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index fffea1eef8d..80fe7bcfa09 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1368,6 +1368,22 @@ static void format_corename(char *corename, const char *pattern, long signr)
 	*out_ptr = 0;
 }
 
+static void zap_process(struct task_struct *start, int *ptraced)
+{
+	struct task_struct *t;
+
+	t = start;
+	do {
+		if (t != current && t->mm) {
+			t->mm->core_waiters++;
+			force_sig_specific(SIGKILL, t);
+			if (unlikely(t->ptrace) &&
+			    unlikely(t->parent->mm == t->mm))
+				*ptraced = 1;
+		}
+	} while ((t = next_thread(t)) != start);
+}
+
 static void zap_threads (struct mm_struct *mm)
 {
 	struct task_struct *g, *p;
@@ -1385,16 +1401,16 @@ static void zap_threads (struct mm_struct *mm)
 	}
 
 	read_lock(&tasklist_lock);
-	do_each_thread(g,p)
-		if (mm == p->mm && p != tsk) {
-			force_sig_specific(SIGKILL, p);
-			mm->core_waiters++;
-			if (unlikely(p->ptrace) &&
-			    unlikely(p->parent->mm == mm))
-				traced = 1;
-		}
-	while_each_thread(g,p);
-
+	for_each_process(g) {
+		p = g;
+		do {
+			if (p->mm) {
+				if (p->mm == mm)
+					zap_process(p, &traced);
+				break;
+			}
+		} while ((p = next_thread(p)) != g);
+	}
 	read_unlock(&tasklist_lock);
 
 	if (unlikely(traced)) {
-- 
cgit v1.2.3


From 281de339ceb822ca6c04d4373ecb9a45c1890ce4 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 26 Jun 2006 00:26:06 -0700
Subject: [PATCH] coredump: speedup SIGKILL sending

With this patch a thread group is killed atomically under ->siglock.  This is
faster because we can use sigaddset() instead of force_sig_info() and this is
used in further patches.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 80fe7bcfa09..a5c51646d1a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1371,17 +1371,24 @@ static void format_corename(char *corename, const char *pattern, long signr)
 static void zap_process(struct task_struct *start, int *ptraced)
 {
 	struct task_struct *t;
+	unsigned long flags;
+
+	spin_lock_irqsave(&start->sighand->siglock, flags);
 
 	t = start;
 	do {
 		if (t != current && t->mm) {
 			t->mm->core_waiters++;
-			force_sig_specific(SIGKILL, t);
+			sigaddset(&t->pending.signal, SIGKILL);
+			signal_wake_up(t, 1);
+
 			if (unlikely(t->ptrace) &&
 			    unlikely(t->parent->mm == t->mm))
 				*ptraced = 1;
 		}
 	} while ((t = next_thread(t)) != start);
+
+	spin_unlock_irqrestore(&start->sighand->siglock, flags);
 }
 
 static void zap_threads (struct mm_struct *mm)
-- 
cgit v1.2.3


From d5f70c00ad24cd1158d3678b44ff969b4c971d49 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 26 Jun 2006 00:26:07 -0700
Subject: [PATCH] coredump: kill ptrace related stuff

With this patch zap_process() sets SIGNAL_GROUP_EXIT while sending SIGKILL to
the thread group.  This means that a TASK_TRACED task

	1. Will be awakened by signal_wake_up(1)

	2. Can't sleep again via ptrace_notify()

	3. Can't go to do_signal_stop() after return
	   from ptrace_stop() in get_signal_to_deliver()

So we can remove all ptrace related stuff from coredump path.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c | 30 +++++-------------------------
 1 file changed, 5 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index a5c51646d1a..b58ba7d127e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1368,12 +1368,14 @@ static void format_corename(char *corename, const char *pattern, long signr)
 	*out_ptr = 0;
 }
 
-static void zap_process(struct task_struct *start, int *ptraced)
+static void zap_process(struct task_struct *start)
 {
 	struct task_struct *t;
 	unsigned long flags;
 
 	spin_lock_irqsave(&start->sighand->siglock, flags);
+	start->signal->flags = SIGNAL_GROUP_EXIT;
+	start->signal->group_stop_count = 0;
 
 	t = start;
 	do {
@@ -1381,22 +1383,17 @@ static void zap_process(struct task_struct *start, int *ptraced)
 			t->mm->core_waiters++;
 			sigaddset(&t->pending.signal, SIGKILL);
 			signal_wake_up(t, 1);
-
-			if (unlikely(t->ptrace) &&
-			    unlikely(t->parent->mm == t->mm))
-				*ptraced = 1;
 		}
 	} while ((t = next_thread(t)) != start);
 
 	spin_unlock_irqrestore(&start->sighand->siglock, flags);
 }
 
-static void zap_threads (struct mm_struct *mm)
+static void zap_threads(struct mm_struct *mm)
 {
 	struct task_struct *g, *p;
 	struct task_struct *tsk = current;
 	struct completion *vfork_done = tsk->vfork_done;
-	int traced = 0;
 
 	/*
 	 * Make sure nobody is waiting for us to release the VM,
@@ -1413,29 +1410,12 @@ static void zap_threads (struct mm_struct *mm)
 		do {
 			if (p->mm) {
 				if (p->mm == mm)
-					zap_process(p, &traced);
+					zap_process(p);
 				break;
 			}
 		} while ((p = next_thread(p)) != g);
 	}
 	read_unlock(&tasklist_lock);
-
-	if (unlikely(traced)) {
-		/*
-		 * We are zapping a thread and the thread it ptraces.
-		 * If the tracee went into a ptrace stop for exit tracing,
-		 * we could deadlock since the tracer is waiting for this
-		 * coredump to finish.  Detach them so they can both die.
-		 */
-		write_lock_irq(&tasklist_lock);
-		do_each_thread(g,p) {
-			if (mm == p->mm && p != tsk &&
-			    p->ptrace && p->parent->mm == mm) {
-				__ptrace_detach(p, 0);
-			}
-		} while_each_thread(g,p);
-		write_unlock_irq(&tasklist_lock);
-	}
 }
 
 static void coredump_wait(struct mm_struct *mm)
-- 
cgit v1.2.3


From 7b1c6154fa8bb937e0b1b4f2adbb315d70270f10 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 26 Jun 2006 00:26:08 -0700
Subject: [PATCH] coredump: don't take tasklist_lock

This patch removes tasklist_lock from zap_threads().
This is safe wrt:

	do_exit:
		The caller holds mm->mmap_sem. This means that task which
		shares the same ->mm can't pass exit_mm(), so it can't be
		unhashed from init_task.tasks or ->thread_group lists.

	fork:
		None of sub-threads can fork after zap_process(leader). All
		processes which were created before this point should be
		visible to zap_threads() because copy_process() adds the new
		process to the tail of init_task.tasks list, and ->siglock
		lock/unlock provides a memory barrier.

	de_thread:
		It does list_replace_rcu(&leader->tasks, &current->tasks).
		So zap_threads() will see either old or new leader, it does
		not matter. However, it can change p->sighand, so we should
		use lock_task_sighand() in zap_process().

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index b58ba7d127e..49fa0127a33 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1373,7 +1373,11 @@ static void zap_process(struct task_struct *start)
 	struct task_struct *t;
 	unsigned long flags;
 
-	spin_lock_irqsave(&start->sighand->siglock, flags);
+	/*
+	 * start->sighand can't disappear, but may be
+	 * changed by de_thread()
+	 */
+	lock_task_sighand(start, &flags);
 	start->signal->flags = SIGNAL_GROUP_EXIT;
 	start->signal->group_stop_count = 0;
 
@@ -1386,7 +1390,7 @@ static void zap_process(struct task_struct *start)
 		}
 	} while ((t = next_thread(t)) != start);
 
-	spin_unlock_irqrestore(&start->sighand->siglock, flags);
+	unlock_task_sighand(start, &flags);
 }
 
 static void zap_threads(struct mm_struct *mm)
@@ -1404,7 +1408,7 @@ static void zap_threads(struct mm_struct *mm)
 		complete(vfork_done);
 	}
 
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	for_each_process(g) {
 		p = g;
 		do {
@@ -1415,7 +1419,7 @@ static void zap_threads(struct mm_struct *mm)
 			}
 		} while ((p = next_thread(p)) != g);
 	}
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 }
 
 static void coredump_wait(struct mm_struct *mm)
-- 
cgit v1.2.3


From dcf560c59330945a231d5e52f95dfedde4e32c9d Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 26 Jun 2006 00:26:08 -0700
Subject: [PATCH] coredump: some code relocations

This is a preparation for the next patch.  No functional changes.
Basically, this patch moves '->flags & SIGNAL_GROUP_EXIT' check into
zap_threads(), and 'complete(vfork_done)' into coredump_wait outside of
->mmap_sem protected area.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c | 70 ++++++++++++++++++++++++++++++++++++---------------------------
 1 file changed, 40 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 49fa0127a33..8c8f2894949 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1393,20 +1393,22 @@ static void zap_process(struct task_struct *start)
 	unlock_task_sighand(start, &flags);
 }
 
-static void zap_threads(struct mm_struct *mm)
+static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
+				int exit_code)
 {
 	struct task_struct *g, *p;
-	struct task_struct *tsk = current;
-	struct completion *vfork_done = tsk->vfork_done;
-
-	/*
-	 * Make sure nobody is waiting for us to release the VM,
-	 * otherwise we can deadlock when we wait on each other
-	 */
-	if (vfork_done) {
-		tsk->vfork_done = NULL;
-		complete(vfork_done);
+	int err = -EAGAIN;
+
+	spin_lock_irq(&tsk->sighand->siglock);
+	if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) {
+		tsk->signal->flags = SIGNAL_GROUP_EXIT;
+		tsk->signal->group_exit_code = exit_code;
+		tsk->signal->group_stop_count = 0;
+		err = 0;
 	}
+	spin_unlock_irq(&tsk->sighand->siglock);
+	if (err)
+		return err;
 
 	rcu_read_lock();
 	for_each_process(g) {
@@ -1420,22 +1422,43 @@ static void zap_threads(struct mm_struct *mm)
 		} while ((p = next_thread(p)) != g);
 	}
 	rcu_read_unlock();
+
+	return mm->core_waiters;
 }
 
-static void coredump_wait(struct mm_struct *mm)
+static int coredump_wait(int exit_code)
 {
-	DECLARE_COMPLETION(startup_done);
+	struct task_struct *tsk = current;
+	struct mm_struct *mm = tsk->mm;
+	struct completion startup_done;
+	struct completion *vfork_done;
 	int core_waiters;
 
+	init_completion(&mm->core_done);
+	init_completion(&startup_done);
 	mm->core_startup_done = &startup_done;
 
-	zap_threads(mm);
-	core_waiters = mm->core_waiters;
+	core_waiters = zap_threads(tsk, mm, exit_code);
 	up_write(&mm->mmap_sem);
 
+	if (unlikely(core_waiters < 0))
+		goto fail;
+
+	/*
+	 * Make sure nobody is waiting for us to release the VM,
+	 * otherwise we can deadlock when we wait on each other
+	 */
+	vfork_done = tsk->vfork_done;
+	if (vfork_done) {
+		tsk->vfork_done = NULL;
+		complete(vfork_done);
+	}
+
 	if (core_waiters)
 		wait_for_completion(&startup_done);
+fail:
 	BUG_ON(mm->core_waiters);
+	return core_waiters;
 }
 
 int do_coredump(long signr, int exit_code, struct pt_regs * regs)
@@ -1469,22 +1492,9 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 	}
 	mm->dumpable = 0;
 
-	retval = -EAGAIN;
-	spin_lock_irq(&current->sighand->siglock);
-	if (!(current->signal->flags & SIGNAL_GROUP_EXIT)) {
-		current->signal->flags = SIGNAL_GROUP_EXIT;
-		current->signal->group_exit_code = exit_code;
-		current->signal->group_stop_count = 0;
-		retval = 0;
-	}
-	spin_unlock_irq(&current->sighand->siglock);
-	if (retval) {
-		up_write(&mm->mmap_sem);
+	retval = coredump_wait(exit_code);
+	if (retval < 0)
 		goto fail;
-	}
-
-	init_completion(&mm->core_done);
-	coredump_wait(mm);
 
 	/*
 	 * Clear any false indication of pending signals that might
-- 
cgit v1.2.3


From 5debfa6da5b06954bc79fe8deed0d1062c58dcec Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 26 Jun 2006 00:26:09 -0700
Subject: [PATCH] coredump: shutdown current process first

This patch optimizes zap_threads() for the case when there are no ->mm
users except the current's thread group.  In that case we can avoid
'for_each_process()' loop.

It also adds a useful invariant: SIGNAL_GROUP_EXIT (if checked under
->siglock) always implies that all threads (except may be current) have
pending SIGKILL.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 8c8f2894949..c8494f513ea 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1371,13 +1371,7 @@ static void format_corename(char *corename, const char *pattern, long signr)
 static void zap_process(struct task_struct *start)
 {
 	struct task_struct *t;
-	unsigned long flags;
 
-	/*
-	 * start->sighand can't disappear, but may be
-	 * changed by de_thread()
-	 */
-	lock_task_sighand(start, &flags);
 	start->signal->flags = SIGNAL_GROUP_EXIT;
 	start->signal->group_stop_count = 0;
 
@@ -1389,40 +1383,51 @@ static void zap_process(struct task_struct *start)
 			signal_wake_up(t, 1);
 		}
 	} while ((t = next_thread(t)) != start);
-
-	unlock_task_sighand(start, &flags);
 }
 
 static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
 				int exit_code)
 {
 	struct task_struct *g, *p;
+	unsigned long flags;
 	int err = -EAGAIN;
 
 	spin_lock_irq(&tsk->sighand->siglock);
 	if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) {
-		tsk->signal->flags = SIGNAL_GROUP_EXIT;
 		tsk->signal->group_exit_code = exit_code;
-		tsk->signal->group_stop_count = 0;
+		zap_process(tsk);
 		err = 0;
 	}
 	spin_unlock_irq(&tsk->sighand->siglock);
 	if (err)
 		return err;
 
+	if (atomic_read(&mm->mm_users) == mm->core_waiters + 1)
+		goto done;
+
 	rcu_read_lock();
 	for_each_process(g) {
+		if (g == tsk->group_leader)
+			continue;
+
 		p = g;
 		do {
 			if (p->mm) {
-				if (p->mm == mm)
+				if (p->mm == mm) {
+					/*
+					 * p->sighand can't disappear, but
+					 * may be changed by de_thread()
+					 */
+					lock_task_sighand(p, &flags);
 					zap_process(p);
+					unlock_task_sighand(p, &flags);
+				}
 				break;
 			}
 		} while ((p = next_thread(p)) != g);
 	}
 	rcu_read_unlock();
-
+done:
 	return mm->core_waiters;
 }
 
-- 
cgit v1.2.3


From bebfa1013eee1d91b3242e5801cc8fbdfaf148ec Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Mon, 26 Jun 2006 13:56:52 +0200
Subject: [PATCH] x86_64: Add compat_printk and sysctl to turn off compat layer
 warnings

Sometimes e.g. with crashme the compat layer warnings can be noisy.
Add a way to turn them off by gating all output through compat_printk
that checks a global sysctl. The default is not changed.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/compat.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index 7e7e5bc4f3c..e31e9cf9664 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -55,6 +55,20 @@
 
 extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat);
 
+int compat_log = 1;
+
+int compat_printk(const char *fmt, ...)
+{
+	va_list ap;
+	int ret;
+	if (!compat_log)
+		return 0;
+	va_start(ap, fmt);
+	ret = vprintk(fmt, ap);
+	va_end(ap);
+	return ret;
+}
+
 /*
  * Not all architectures have sys_utime, so implement this in terms
  * of sys_utimes.
@@ -359,7 +373,7 @@ static void compat_ioctl_error(struct file *filp, unsigned int fd,
 	sprintf(buf,"'%c'", (cmd>>24) & 0x3f);
 	if (!isprint(buf[1]))
 		sprintf(buf, "%02x", buf[1]);
-	printk("ioctl32(%s:%d): Unknown cmd fd(%d) "
+	compat_printk("ioctl32(%s:%d): Unknown cmd fd(%d) "
 			"cmd(%08x){%s} arg(%08x) on %s\n",
 			current->comm, current->pid,
 			(int)fd, (unsigned int)cmd, buf,
-- 
cgit v1.2.3


From d8deac5094988c7ad1127ee61f52c59a952fcabb Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 20 Jun 2005 21:15:16 -0700
Subject: [PATCH] devfs: Remove devfs from the kernel tree

This is the first patch in a series of patches that removes devfs
support from the kernel.  This patch removes the core devfs code, and
its private header file.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/Makefile       |    1 -
 fs/compat_ioctl.c |    1 -
 fs/devfs/Makefile |    8 -
 fs/devfs/base.c   | 2836 -----------------------------------------------------
 fs/devfs/util.c   |   97 --
 5 files changed, 2943 deletions(-)
 delete mode 100644 fs/devfs/Makefile
 delete mode 100644 fs/devfs/base.c
 delete mode 100644 fs/devfs/util.c

(limited to 'fs')

diff --git a/fs/Makefile b/fs/Makefile
index d0ea6bfccf2..89135428a53 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -66,7 +66,6 @@ obj-$(CONFIG_MSDOS_FS)		+= msdos/
 obj-$(CONFIG_VFAT_FS)		+= vfat/
 obj-$(CONFIG_BFS_FS)		+= bfs/
 obj-$(CONFIG_ISO9660_FS)	+= isofs/
-obj-$(CONFIG_DEVFS_FS)		+= devfs/
 obj-$(CONFIG_HFSPLUS_FS)	+= hfsplus/ # Before hfs to find wrapped HFS+
 obj-$(CONFIG_HFS_FS)		+= hfs/
 obj-$(CONFIG_VXFS_FS)		+= freevxfs/
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index d8ecfedef18..d8d50a70c58 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -44,7 +44,6 @@
 #include <linux/loop.h>
 #include <linux/auto_fs.h>
 #include <linux/auto_fs4.h>
-#include <linux/devfs_fs.h>
 #include <linux/tty.h>
 #include <linux/vt_kern.h>
 #include <linux/fb.h>
diff --git a/fs/devfs/Makefile b/fs/devfs/Makefile
deleted file mode 100644
index 6dd8d1245e2..00000000000
--- a/fs/devfs/Makefile
+++ /dev/null
@@ -1,8 +0,0 @@
-#
-# Makefile for the linux devfs-filesystem routines.
-#
-
-obj-$(CONFIG_DEVFS_FS) += devfs.o
-
-devfs-objs := base.o util.o
-
diff --git a/fs/devfs/base.c b/fs/devfs/base.c
deleted file mode 100644
index 51a97f13274..00000000000
--- a/fs/devfs/base.c
+++ /dev/null
@@ -1,2836 +0,0 @@
-/*  devfs (Device FileSystem) driver.
-
-    Copyright (C) 1998-2002  Richard Gooch
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Library General Public
-    License as published by the Free Software Foundation; either
-    version 2 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Library General Public License for more details.
-
-    You should have received a copy of the GNU Library General Public
-    License along with this library; if not, write to the Free
-    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-    Richard Gooch may be reached by email at  rgooch@atnf.csiro.au
-    The postal address is:
-      Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia.
-
-    ChangeLog
-
-    19980110   Richard Gooch <rgooch@atnf.csiro.au>
-               Original version.
-  v0.1
-    19980111   Richard Gooch <rgooch@atnf.csiro.au>
-               Created per-fs inode table rather than using inode->u.generic_ip
-  v0.2
-    19980111   Richard Gooch <rgooch@atnf.csiro.au>
-               Created .epoch inode which has a ctime of 0.
-	       Fixed loss of named pipes when dentries lost.
-	       Fixed loss of inode data when devfs_register() follows mknod().
-  v0.3
-    19980111   Richard Gooch <rgooch@atnf.csiro.au>
-               Fix for when compiling with CONFIG_KERNELD.
-    19980112   Richard Gooch <rgooch@atnf.csiro.au>
-               Fix for readdir() which sometimes didn't show entries.
-	       Added <<tolerant>> option to <devfs_register>.
-  v0.4
-    19980113   Richard Gooch <rgooch@atnf.csiro.au>
-               Created <devfs_fill_file> function.
-  v0.5
-    19980115   Richard Gooch <rgooch@atnf.csiro.au>
-               Added subdirectory support. Major restructuring.
-    19980116   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed <find_by_dev> to not search major=0,minor=0.
-	       Added symlink support.
-  v0.6
-    19980120   Richard Gooch <rgooch@atnf.csiro.au>
-               Created <devfs_mk_dir> function and support directory unregister
-    19980120   Richard Gooch <rgooch@atnf.csiro.au>
-               Auto-ownership uses real uid/gid rather than effective uid/gid.
-  v0.7
-    19980121   Richard Gooch <rgooch@atnf.csiro.au>
-               Supported creation of sockets.
-  v0.8
-    19980122   Richard Gooch <rgooch@atnf.csiro.au>
-               Added DEVFS_FL_HIDE_UNREG flag.
-	       Interface change to <devfs_mk_symlink>.
-               Created <devfs_symlink> to support symlink(2).
-  v0.9
-    19980123   Richard Gooch <rgooch@atnf.csiro.au>
-               Added check to <devfs_fill_file> to check inode is in devfs.
-	       Added optional traversal of symlinks.
-  v0.10
-    19980124   Richard Gooch <rgooch@atnf.csiro.au>
-               Created <devfs_get_flags> and <devfs_set_flags>.
-  v0.11
-    19980125   C. Scott Ananian <cananian@alumni.princeton.edu>
-               Created <devfs_find_handle>.
-    19980125   Richard Gooch <rgooch@atnf.csiro.au>
-               Allow removal of symlinks.
-  v0.12
-    19980125   Richard Gooch <rgooch@atnf.csiro.au>
-               Created <devfs_set_symlink_destination>.
-    19980126   Richard Gooch <rgooch@atnf.csiro.au>
-               Moved DEVFS_SUPER_MAGIC into header file.
-	       Added DEVFS_FL_HIDE flag.
-	       Created <devfs_get_maj_min>.
-	       Created <devfs_get_handle_from_inode>.
-	       Fixed minor bug in <find_by_dev>.
-    19980127   Richard Gooch <rgooch@atnf.csiro.au>
-	       Changed interface to <find_by_dev>, <find_entry>,
-	       <devfs_unregister>, <devfs_fill_file> and <devfs_find_handle>.
-	       Fixed inode times when symlink created with symlink(2).
-  v0.13
-    19980129   C. Scott Ananian <cananian@alumni.princeton.edu>
-               Exported <devfs_set_symlink_destination>, <devfs_get_maj_min>
-	       and <devfs_get_handle_from_inode>.
-    19980129   Richard Gooch <rgooch@atnf.csiro.au>
-	       Created <devfs_unlink> to support unlink(2).
-  v0.14
-    19980129   Richard Gooch <rgooch@atnf.csiro.au>
-	       Fixed kerneld support for entries in devfs subdirectories.
-    19980130   Richard Gooch <rgooch@atnf.csiro.au>
-	       Bugfixes in <call_kerneld>.
-  v0.15
-    19980207   Richard Gooch <rgooch@atnf.csiro.au>
-	       Call kerneld when looking up unregistered entries.
-  v0.16
-    19980326   Richard Gooch <rgooch@atnf.csiro.au>
-	       Modified interface to <devfs_find_handle> for symlink traversal.
-  v0.17
-    19980331   Richard Gooch <rgooch@atnf.csiro.au>
-	       Fixed persistence bug with device numbers for manually created
-	       device files.
-	       Fixed problem with recreating symlinks with different content.
-  v0.18
-    19980401   Richard Gooch <rgooch@atnf.csiro.au>
-	       Changed to CONFIG_KMOD.
-	       Hide entries which are manually unlinked.
-	       Always invalidate devfs dentry cache when registering entries.
-	       Created <devfs_rmdir> to support rmdir(2).
-	       Ensure directories created by <devfs_mk_dir> are visible.
-  v0.19
-    19980402   Richard Gooch <rgooch@atnf.csiro.au>
-	       Invalidate devfs dentry cache when making directories.
-	       Invalidate devfs dentry cache when removing entries.
-	       Fixed persistence bug with fifos.
-  v0.20
-    19980421   Richard Gooch <rgooch@atnf.csiro.au>
-	       Print process command when debugging kerneld/kmod.
-	       Added debugging for register/unregister/change operations.
-    19980422   Richard Gooch <rgooch@atnf.csiro.au>
-	       Added "devfs=" boot options.
-  v0.21
-    19980426   Richard Gooch <rgooch@atnf.csiro.au>
-	       No longer lock/unlock superblock in <devfs_put_super>.
-	       Drop negative dentries when they are released.
-	       Manage dcache more efficiently.
-  v0.22
-    19980427   Richard Gooch <rgooch@atnf.csiro.au>
-	       Added DEVFS_FL_AUTO_DEVNUM flag.
-  v0.23
-    19980430   Richard Gooch <rgooch@atnf.csiro.au>
-	       No longer set unnecessary methods.
-  v0.24
-    19980504   Richard Gooch <rgooch@atnf.csiro.au>
-	       Added PID display to <call_kerneld> debugging message.
-	       Added "after" debugging message to <call_kerneld>.
-    19980519   Richard Gooch <rgooch@atnf.csiro.au>
-	       Added "diread" and "diwrite" boot options.
-    19980520   Richard Gooch <rgooch@atnf.csiro.au>
-	       Fixed persistence problem with permissions.
-  v0.25
-    19980602   Richard Gooch <rgooch@atnf.csiro.au>
-	       Support legacy device nodes.
-	       Fixed bug where recreated inodes were hidden.
-  v0.26
-    19980602   Richard Gooch <rgooch@atnf.csiro.au>
-	       Improved debugging in <get_vfs_inode>.
-    19980607   Richard Gooch <rgooch@atnf.csiro.au>
-	       No longer free old dentries in <devfs_mk_dir>.
-	       Free all dentries for a given entry when deleting inodes.
-  v0.27
-    19980627   Richard Gooch <rgooch@atnf.csiro.au>
-	       Limit auto-device numbering to majors 128 to 239.
-  v0.28
-    19980629   Richard Gooch <rgooch@atnf.csiro.au>
-	       Fixed inode times persistence problem.
-  v0.29
-    19980704   Richard Gooch <rgooch@atnf.csiro.au>
-	       Fixed spelling in <devfs_readlink> debug.
-	       Fixed bug in <devfs_setup> parsing "dilookup".
-  v0.30
-    19980705   Richard Gooch <rgooch@atnf.csiro.au>
-	       Fixed devfs inode leak when manually recreating inodes.
-	       Fixed permission persistence problem when recreating inodes.
-  v0.31
-    19980727   Richard Gooch <rgooch@atnf.csiro.au>
-	       Removed harmless "unused variable" compiler warning.
-	       Fixed modes for manually recreated device nodes.
-  v0.32
-    19980728   Richard Gooch <rgooch@atnf.csiro.au>
-	       Added NULL devfs inode warning in <devfs_read_inode>.
-	       Force all inode nlink values to 1.
-  v0.33
-    19980730   Richard Gooch <rgooch@atnf.csiro.au>
-	       Added "dimknod" boot option.
-	       Set inode nlink to 0 when freeing dentries.
-	       Fixed modes for manually recreated symlinks.
-  v0.34
-    19980802   Richard Gooch <rgooch@atnf.csiro.au>
-	       Fixed bugs in recreated directories and symlinks.
-  v0.35
-    19980806   Richard Gooch <rgooch@atnf.csiro.au>
-	       Fixed bugs in recreated device nodes.
-    19980807   Richard Gooch <rgooch@atnf.csiro.au>
-	       Fixed bug in currently unused <devfs_get_handle_from_inode>.
-	       Defined new <devfs_handle_t> type.
-	       Improved debugging when getting entries.
-	       Fixed bug where directories could be emptied.
-  v0.36
-    19980809   Richard Gooch <rgooch@atnf.csiro.au>
-	       Replaced dummy .epoch inode with .devfsd character device.
-    19980810   Richard Gooch <rgooch@atnf.csiro.au>
-	       Implemented devfsd protocol revision 0.
-  v0.37
-    19980819   Richard Gooch <rgooch@atnf.csiro.au>
-	       Added soothing message to warning in <devfs_d_iput>.
-  v0.38
-    19980829   Richard Gooch <rgooch@atnf.csiro.au>
-	       Use GCC extensions for structure initialisations.
-	       Implemented async open notification.
-	       Incremented devfsd protocol revision to 1.
-  v0.39
-    19980908   Richard Gooch <rgooch@atnf.csiro.au>
-	       Moved async open notification to end of <devfs_open>.
-  v0.40
-    19980910   Richard Gooch <rgooch@atnf.csiro.au>
-	       Prepended "/dev/" to module load request.
-	       Renamed <call_kerneld> to <call_kmod>.
-  v0.41
-    19980910   Richard Gooch <rgooch@atnf.csiro.au>
-	       Fixed typo "AYSNC" -> "ASYNC".
-  v0.42
-    19980910   Richard Gooch <rgooch@atnf.csiro.au>
-	       Added open flag for files.
-  v0.43
-    19980927   Richard Gooch <rgooch@atnf.csiro.au>
-	       Set i_blocks=0 and i_blksize=1024 in <devfs_read_inode>.
-  v0.44
-    19981005   Richard Gooch <rgooch@atnf.csiro.au>
-	       Added test for empty <<name>> in <devfs_find_handle>.
-	       Renamed <generate_path> to <devfs_generate_path> and published.
-  v0.45
-    19981006   Richard Gooch <rgooch@atnf.csiro.au>
-	       Created <devfs_get_fops>.
-  v0.46
-    19981007   Richard Gooch <rgooch@atnf.csiro.au>
-	       Limit auto-device numbering to majors 144 to 239.
-  v0.47
-    19981010   Richard Gooch <rgooch@atnf.csiro.au>
-	       Updated <devfs_follow_link> for VFS change in 2.1.125.
-  v0.48
-    19981022   Richard Gooch <rgooch@atnf.csiro.au>
-	       Created DEVFS_ FL_COMPAT flag.
-  v0.49
-    19981023   Richard Gooch <rgooch@atnf.csiro.au>
-	       Created "nocompat" boot option.
-  v0.50
-    19981025   Richard Gooch <rgooch@atnf.csiro.au>
-	       Replaced "mount" boot option with "nomount".
-  v0.51
-    19981110   Richard Gooch <rgooch@atnf.csiro.au>
-	       Created "only" boot option.
-  v0.52
-    19981112   Richard Gooch <rgooch@atnf.csiro.au>
-	       Added DEVFS_FL_REMOVABLE flag.
-  v0.53
-    19981114   Richard Gooch <rgooch@atnf.csiro.au>
-	       Only call <scan_dir_for_removable> on first call to
-	       <devfs_readdir>.
-  v0.54
-    19981205   Richard Gooch <rgooch@atnf.csiro.au>
-	       Updated <devfs_rmdir> for VFS change in 2.1.131.
-  v0.55
-    19981218   Richard Gooch <rgooch@atnf.csiro.au>
-	       Created <devfs_mk_compat>.
-    19981220   Richard Gooch <rgooch@atnf.csiro.au>
-	       Check for partitions on removable media in <devfs_lookup>.
-  v0.56
-    19990118   Richard Gooch <rgooch@atnf.csiro.au>
-	       Added support for registering regular files.
-	       Created <devfs_set_file_size>.
-	       Update devfs inodes from entries if not changed through FS.
-  v0.57
-    19990124   Richard Gooch <rgooch@atnf.csiro.au>
-	       Fixed <devfs_fill_file> to only initialise temporary inodes.
-	       Trap for NULL fops in <devfs_register>.
-	       Return -ENODEV in <devfs_fill_file> for non-driver inodes.
-  v0.58
-    19990126   Richard Gooch <rgooch@atnf.csiro.au>
-	       Switched from PATH_MAX to DEVFS_PATHLEN.
-  v0.59
-    19990127   Richard Gooch <rgooch@atnf.csiro.au>
-	       Created "nottycompat" boot option.
-  v0.60
-    19990318   Richard Gooch <rgooch@atnf.csiro.au>
-	       Fixed <devfsd_read> to not overrun event buffer.
-  v0.61
-    19990329   Richard Gooch <rgooch@atnf.csiro.au>
-	       Created <devfs_auto_unregister>.
-  v0.62
-    19990330   Richard Gooch <rgooch@atnf.csiro.au>
-	       Don't return unregistred entries in <devfs_find_handle>.
-	       Panic in <devfs_unregister> if entry unregistered.
-    19990401   Richard Gooch <rgooch@atnf.csiro.au>
-	       Don't panic in <devfs_auto_unregister> for duplicates.
-  v0.63
-    19990402   Richard Gooch <rgooch@atnf.csiro.au>
-	       Don't unregister already unregistered entries in <unregister>.
-  v0.64
-    19990510   Richard Gooch <rgooch@atnf.csiro.au>
-	       Disable warning messages when unable to read partition table for
-	       removable media.
-  v0.65
-    19990512   Richard Gooch <rgooch@atnf.csiro.au>
-	       Updated <devfs_lookup> for VFS change in 2.3.1-pre1.
-	       Created "oops-on-panic" boot option.
-	       Improved debugging in <devfs_register> and <devfs_unregister>.
-  v0.66
-    19990519   Richard Gooch <rgooch@atnf.csiro.au>
-	       Added documentation for some functions.
-    19990525   Richard Gooch <rgooch@atnf.csiro.au>
-	       Removed "oops-on-panic" boot option: now always Oops.
-  v0.67
-    19990531   Richard Gooch <rgooch@atnf.csiro.au>
-	       Improved debugging in <devfs_register>.
-  v0.68
-    19990604   Richard Gooch <rgooch@atnf.csiro.au>
-	       Added "diunlink" and "nokmod" boot options.
-	       Removed superfluous warning message in <devfs_d_iput>.
-  v0.69
-    19990611   Richard Gooch <rgooch@atnf.csiro.au>
-	       Took account of change to <d_alloc_root>.
-  v0.70
-    19990614   Richard Gooch <rgooch@atnf.csiro.au>
-	       Created separate event queue for each mounted devfs.
-	       Removed <devfs_invalidate_dcache>.
-	       Created new ioctl()s.
-	       Incremented devfsd protocol revision to 3.
-	       Fixed bug when re-creating directories: contents were lost.
-	       Block access to inodes until devfsd updates permissions.
-    19990615   Richard Gooch <rgooch@atnf.csiro.au>
-	       Support 2.2.x kernels.
-  v0.71
-    19990623   Richard Gooch <rgooch@atnf.csiro.au>
-	       Switched to sending process uid/gid to devfsd.
-	       Renamed <call_kmod> to <try_modload>.
-	       Added DEVFSD_NOTIFY_LOOKUP event.
-    19990624   Richard Gooch <rgooch@atnf.csiro.au>
-	       Added DEVFSD_NOTIFY_CHANGE event.
-	       Incremented devfsd protocol revision to 4.
-  v0.72
-    19990713   Richard Gooch <rgooch@atnf.csiro.au>
-	       Return EISDIR rather than EINVAL for read(2) on directories.
-  v0.73
-    19990809   Richard Gooch <rgooch@atnf.csiro.au>
-	       Changed <devfs_setup> to new __init scheme.
-  v0.74
-    19990901   Richard Gooch <rgooch@atnf.csiro.au>
-	       Changed remaining function declarations to new __init scheme.
-  v0.75
-    19991013   Richard Gooch <rgooch@atnf.csiro.au>
-	       Created <devfs_get_info>, <devfs_set_info>,
-	       <devfs_get_first_child> and <devfs_get_next_sibling>.
-	       Added <<dir>> parameter to <devfs_register>, <devfs_mk_compat>,
-	       <devfs_mk_dir> and <devfs_find_handle>.
-	       Work sponsored by SGI.
-  v0.76
-    19991017   Richard Gooch <rgooch@atnf.csiro.au>
-	       Allow multiple unregistrations.
-	       Work sponsored by SGI.
-  v0.77
-    19991026   Richard Gooch <rgooch@atnf.csiro.au>
-	       Added major and minor number to devfsd protocol.
-	       Incremented devfsd protocol revision to 5.
-	       Work sponsored by SGI.
-  v0.78
-    19991030   Richard Gooch <rgooch@atnf.csiro.au>
-	       Support info pointer for all devfs entry types.
-	       Added <<info>> parameter to <devfs_mk_dir> and
-	       <devfs_mk_symlink>.
-	       Work sponsored by SGI.
-  v0.79
-    19991031   Richard Gooch <rgooch@atnf.csiro.au>
-	       Support "../" when searching devfs namespace.
-	       Work sponsored by SGI.
-  v0.80
-    19991101   Richard Gooch <rgooch@atnf.csiro.au>
-	       Created <devfs_get_unregister_slave>.
-	       Work sponsored by SGI.
-  v0.81
-    19991103   Richard Gooch <rgooch@atnf.csiro.au>
-	       Exported <devfs_get_parent>.
-	       Work sponsored by SGI.
-  v0.82
-    19991104   Richard Gooch <rgooch@atnf.csiro.au>
-               Removed unused <devfs_set_symlink_destination>.
-    19991105   Richard Gooch <rgooch@atnf.csiro.au>
-               Do not hide entries from devfsd or children.
-	       Removed DEVFS_ FL_TTY_COMPAT flag.
-	       Removed "nottycompat" boot option.
-	       Removed <devfs_mk_compat>.
-	       Work sponsored by SGI.
-  v0.83
-    19991107   Richard Gooch <rgooch@atnf.csiro.au>
-	       Added DEVFS_FL_WAIT flag.
-	       Work sponsored by SGI.
-  v0.84
-    19991107   Richard Gooch <rgooch@atnf.csiro.au>
-	       Support new "disc" naming scheme in <get_removable_partition>.
-	       Allow NULL fops in <devfs_register>.
-	       Work sponsored by SGI.
-  v0.85
-    19991110   Richard Gooch <rgooch@atnf.csiro.au>
-	       Fall back to major table if NULL fops given to <devfs_register>.
-	       Work sponsored by SGI.
-  v0.86
-    19991204   Richard Gooch <rgooch@atnf.csiro.au>
-	       Support fifos when unregistering.
-	       Work sponsored by SGI.
-  v0.87
-    19991209   Richard Gooch <rgooch@atnf.csiro.au>
-	       Removed obsolete DEVFS_ FL_COMPAT and DEVFS_ FL_TOLERANT flags.
-	       Work sponsored by SGI.
-  v0.88
-    19991214   Richard Gooch <rgooch@atnf.csiro.au>
-	       Removed kmod support.
-	       Work sponsored by SGI.
-  v0.89
-    19991216   Richard Gooch <rgooch@atnf.csiro.au>
-	       Improved debugging in <get_vfs_inode>.
-	       Ensure dentries created by devfsd will be cleaned up.
-	       Work sponsored by SGI.
-  v0.90
-    19991223   Richard Gooch <rgooch@atnf.csiro.au>
-	       Created <devfs_get_name>.
-	       Work sponsored by SGI.
-  v0.91
-    20000203   Richard Gooch <rgooch@atnf.csiro.au>
-	       Ported to kernel 2.3.42.
-	       Removed <devfs_fill_file>.
-	       Work sponsored by SGI.
-  v0.92
-    20000306   Richard Gooch <rgooch@atnf.csiro.au>
-	       Added DEVFS_ FL_NO_PERSISTENCE flag.
-	       Removed unnecessary call to <update_devfs_inode_from_entry> in
-	       <devfs_readdir>.
-	       Work sponsored by SGI.
-  v0.93
-    20000413   Richard Gooch <rgooch@atnf.csiro.au>
-	       Set inode->i_size to correct size for symlinks.
-    20000414   Richard Gooch <rgooch@atnf.csiro.au>
-	       Only give lookup() method to directories to comply with new VFS
-	       assumptions.
-	       Work sponsored by SGI.
-    20000415   Richard Gooch <rgooch@atnf.csiro.au>
-	       Remove unnecessary tests in symlink methods.
-	       Don't kill existing block ops in <devfs_read_inode>.
-	       Work sponsored by SGI.
-  v0.94
-    20000424   Richard Gooch <rgooch@atnf.csiro.au>
-	       Don't create missing directories in <devfs_find_handle>.
-	       Work sponsored by SGI.
-  v0.95
-    20000430   Richard Gooch <rgooch@atnf.csiro.au>
-	       Added CONFIG_DEVFS_MOUNT.
-	       Work sponsored by SGI.
-  v0.96
-    20000608   Richard Gooch <rgooch@atnf.csiro.au>
-	       Disabled multi-mount capability (use VFS bindings instead).
-	       Work sponsored by SGI.
-  v0.97
-    20000610   Richard Gooch <rgooch@atnf.csiro.au>
-	       Switched to FS_SINGLE to disable multi-mounts.
-    20000612   Richard Gooch <rgooch@atnf.csiro.au>
-	       Removed module support.
-	       Removed multi-mount code.
-	       Removed compatibility macros: VFS has changed too much.
-	       Work sponsored by SGI.
-  v0.98
-    20000614   Richard Gooch <rgooch@atnf.csiro.au>
-	       Merged devfs inode into devfs entry.
-	       Work sponsored by SGI.
-  v0.99
-    20000619   Richard Gooch <rgooch@atnf.csiro.au>
-	       Removed dead code in <devfs_register> which used to call
-	       <free_dentries>.
-	       Work sponsored by SGI.
-  v0.100
-    20000621   Richard Gooch <rgooch@atnf.csiro.au>
-	       Changed interface to <devfs_register>.
-	       Work sponsored by SGI.
-  v0.101
-    20000622   Richard Gooch <rgooch@atnf.csiro.au>
-	       Simplified interface to <devfs_mk_symlink> and <devfs_mk_dir>.
-	       Simplified interface to <devfs_find_handle>.
-	       Work sponsored by SGI.
-  v0.102
-    20010519   Richard Gooch <rgooch@atnf.csiro.au>
-	       Ensure <devfs_generate_path> terminates string for root entry.
-	       Exported <devfs_get_name> to modules.
-    20010520   Richard Gooch <rgooch@atnf.csiro.au>
-	       Make <devfs_mk_symlink> send events to devfsd.
-	       Cleaned up option processing in <devfs_setup>.
-    20010521   Richard Gooch <rgooch@atnf.csiro.au>
-	       Fixed bugs in handling symlinks: could leak or cause Oops.
-    20010522   Richard Gooch <rgooch@atnf.csiro.au>
-	       Cleaned up directory handling by separating fops.
-  v0.103
-    20010601   Richard Gooch <rgooch@atnf.csiro.au>
-	       Fixed handling of inverted options in <devfs_setup>.
-  v0.104
-    20010604   Richard Gooch <rgooch@atnf.csiro.au>
-	       Adjusted <try_modload> to account for <devfs_generate_path> fix.
-  v0.105
-    20010617   Richard Gooch <rgooch@atnf.csiro.au>
-	       Answered question posed by Al Viro and removed his comments.
-	       Moved setting of registered flag after other fields are changed.
-	       Fixed race between <devfsd_close> and <devfsd_notify_one>.
-	       Global VFS changes added bogus BKL to <devfsd_close>: removed.
-	       Widened locking in <devfs_readlink> and <devfs_follow_link>.
-	       Replaced <devfsd_read> stack usage with <devfsd_ioctl> kmalloc.
-	       Simplified locking in <devfsd_ioctl> and fixed memory leak.
-  v0.106
-    20010709   Richard Gooch <rgooch@atnf.csiro.au>
-	       Removed broken devnum allocation and use <devfs_alloc_devnum>.
-	       Fixed old devnum leak by calling new <devfs_dealloc_devnum>.
-  v0.107
-    20010712   Richard Gooch <rgooch@atnf.csiro.au>
-	       Fixed bug in <devfs_setup> which could hang boot process.
-  v0.108
-    20010730   Richard Gooch <rgooch@atnf.csiro.au>
-	       Added DEVFSD_NOTIFY_DELETE event.
-    20010801   Richard Gooch <rgooch@atnf.csiro.au>
-	       Removed #include <asm/segment.h>.
-  v0.109
-    20010807   Richard Gooch <rgooch@atnf.csiro.au>
-	       Fixed inode table races by removing it and using
-	       inode->u.generic_ip instead.
-	       Moved <devfs_read_inode> into <get_vfs_inode>.
-	       Moved <devfs_write_inode> into <devfs_notify_change>.
-  v0.110
-    20010808   Richard Gooch <rgooch@atnf.csiro.au>
-	       Fixed race in <devfs_do_symlink> for uni-processor.
-  v0.111
-    20010818   Richard Gooch <rgooch@atnf.csiro.au>
-	       Removed remnant of multi-mount support in <devfs_mknod>.
-               Removed unused DEVFS_FL_SHOW_UNREG flag.
-  v0.112
-    20010820   Richard Gooch <rgooch@atnf.csiro.au>
-	       Removed nlink field from struct devfs_inode.
-  v0.113
-    20010823   Richard Gooch <rgooch@atnf.csiro.au>
-	       Replaced BKL with global rwsem to protect symlink data (quick
-	       and dirty hack).
-  v0.114
-    20010827   Richard Gooch <rgooch@atnf.csiro.au>
-	       Replaced global rwsem for symlink with per-link refcount.
-  v0.115
-    20010919   Richard Gooch <rgooch@atnf.csiro.au>
-	       Set inode->i_mapping->a_ops for block nodes in <get_vfs_inode>.
-  v0.116
-    20011008   Richard Gooch <rgooch@atnf.csiro.au>
-	       Fixed overrun in <devfs_link> by removing function (not needed).
-    20011009   Richard Gooch <rgooch@atnf.csiro.au>
-	       Fixed buffer underrun in <try_modload>.
-    20011029   Richard Gooch <rgooch@atnf.csiro.au>
-	       Fixed race in <devfsd_ioctl> when setting event mask.
-    20011114   Richard Gooch <rgooch@atnf.csiro.au>
-	       First release of new locking code.
-  v1.0
-    20011117   Richard Gooch <rgooch@atnf.csiro.au>
-	       Discard temporary buffer, now use "%s" for dentry names.
-    20011118   Richard Gooch <rgooch@atnf.csiro.au>
-	       Don't generate path in <try_modload>: use fake entry instead.
-	       Use "existing" directory in <_devfs_make_parent_for_leaf>.
-    20011122   Richard Gooch <rgooch@atnf.csiro.au>
-	       Use slab cache rather than fixed buffer for devfsd events.
-  v1.1
-    20011125   Richard Gooch <rgooch@atnf.csiro.au>
-	       Send DEVFSD_NOTIFY_REGISTERED events in <devfs_mk_dir>.
-    20011127   Richard Gooch <rgooch@atnf.csiro.au>
-	       Fixed locking bug in <devfs_d_revalidate_wait> due to typo.
-	       Do not send CREATE, CHANGE, ASYNC_OPEN or DELETE events from
-	       devfsd or children.
-  v1.2
-    20011202   Richard Gooch <rgooch@atnf.csiro.au>
-	       Fixed bug in <devfsd_read>: was dereferencing freed pointer.
-  v1.3
-    20011203   Richard Gooch <rgooch@atnf.csiro.au>
-	       Fixed bug in <devfsd_close>: was dereferencing freed pointer.
-	       Added process group check for devfsd privileges.
-  v1.4
-    20011204   Richard Gooch <rgooch@atnf.csiro.au>
-	       Use SLAB_ATOMIC in <devfsd_notify_de> from <devfs_d_delete>.
-  v1.5
-    20011211   Richard Gooch <rgooch@atnf.csiro.au>
-	       Return old entry in <devfs_mk_dir> for 2.4.x kernels.
-    20011212   Richard Gooch <rgooch@atnf.csiro.au>
-	       Increment refcount on module in <check_disc_changed>.
-    20011215   Richard Gooch <rgooch@atnf.csiro.au>
-	       Created <devfs_get_handle> and exported <devfs_put>.
-	       Increment refcount on module in <devfs_get_ops>.
-	       Created <devfs_put_ops>.
-  v1.6
-    20011216   Richard Gooch <rgooch@atnf.csiro.au>
-	       Added poisoning to <devfs_put>.
-	       Improved debugging messages.
-  v1.7
-    20011221   Richard Gooch <rgooch@atnf.csiro.au>
-	       Corrected (made useful) debugging message in <unregister>.
-	       Moved <kmem_cache_create> in <mount_devfs_fs> to <init_devfs_fs>
-    20011224   Richard Gooch <rgooch@atnf.csiro.au>
-	       Added magic number to guard against scribbling drivers.
-    20011226   Richard Gooch <rgooch@atnf.csiro.au>
-	       Only return old entry in <devfs_mk_dir> if a directory.
-	       Defined macros for error and debug messages.
-  v1.8
-    20020113   Richard Gooch <rgooch@atnf.csiro.au>
-	       Fixed (rare, old) race in <devfs_lookup>.
-  v1.9
-    20020120   Richard Gooch <rgooch@atnf.csiro.au>
-	       Fixed deadlock bug in <devfs_d_revalidate_wait>.
-	       Tag VFS deletable in <devfs_mk_symlink> if handle ignored.
-  v1.10
-    20020129   Richard Gooch <rgooch@atnf.csiro.au>
-	       Added KERN_* to remaining messages.
-	       Cleaned up declaration of <stat_read>.
-  v1.11
-    20020219   Richard Gooch <rgooch@atnf.csiro.au>
-	       Changed <devfs_rmdir> to allow later additions if not yet empty.
-  v1.12
-    20020406   Richard Gooch <rgooch@atnf.csiro.au>
-	       Removed silently introduced calls to lock_kernel() and
-	       unlock_kernel() due to recent VFS locking changes. BKL isn't
-	       required in devfs.
-  v1.13
-    20020428   Richard Gooch <rgooch@atnf.csiro.au>
-	       Removed 2.4.x compatibility code.
-  v1.14
-    20020510   Richard Gooch <rgooch@atnf.csiro.au>
-	       Added BKL to <devfs_open> because drivers still need it.
-  v1.15
-    20020512   Richard Gooch <rgooch@atnf.csiro.au>
-	       Protected <scan_dir_for_removable> and <get_removable_partition>
-	       from changing directory contents.
-  v1.16
-    20020514   Richard Gooch <rgooch@atnf.csiro.au>
-	       Minor cleanup of <scan_dir_for_removable>.
-  v1.17
-    20020721   Richard Gooch <rgooch@atnf.csiro.au>
-	       Switched to ISO C structure field initialisers.
-	       Switch to set_current_state() and move before add_wait_queue().
-    20020722   Richard Gooch <rgooch@atnf.csiro.au>
-	       Fixed devfs entry leak in <devfs_readdir> when *readdir fails.
-  v1.18
-    20020725   Richard Gooch <rgooch@atnf.csiro.au>
-	       Created <devfs_find_and_unregister>.
-  v1.19
-    20020728   Richard Gooch <rgooch@atnf.csiro.au>
-	       Removed deprecated <devfs_find_handle>.
-  v1.20
-    20020820   Richard Gooch <rgooch@atnf.csiro.au>
-	       Fixed module unload race in <devfs_open>.
-  v1.21
-    20021013   Richard Gooch <rgooch@atnf.csiro.au>
-	       Removed DEVFS_ FL_AUTO_OWNER.
-	       Switched lingering structure field initialiser to ISO C.
-	       Added locking when updating FCB flags.
-  v1.22
-*/
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/time.h>
-#include <linux/tty.h>
-#include <linux/timer.h>
-#include <linux/config.h>
-#include <linux/kernel.h>
-#include <linux/wait.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-#include <linux/ioport.h>
-#include <linux/delay.h>
-#include <linux/ctype.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/devfs_fs.h>
-#include <linux/devfs_fs_kernel.h>
-#include <linux/smp_lock.h>
-#include <linux/smp.h>
-#include <linux/rwsem.h>
-#include <linux/sched.h>
-#include <linux/namei.h>
-#include <linux/bitops.h>
-
-#include <asm/uaccess.h>
-#include <asm/io.h>
-#include <asm/processor.h>
-#include <asm/system.h>
-#include <asm/pgtable.h>
-#include <asm/atomic.h>
-
-#define DEVFS_VERSION            "2004-01-31"
-
-#define DEVFS_NAME "devfs"
-
-#define FIRST_INODE 1
-
-#define STRING_LENGTH 256
-#define FAKE_BLOCK_SIZE 1024
-#define POISON_PTR ( *(void **) poison_array )
-#define MAGIC_VALUE 0x327db823
-
-#ifndef TRUE
-#  define TRUE 1
-#  define FALSE 0
-#endif
-
-#define MODE_DIR (S_IFDIR | S_IWUSR | S_IRUGO | S_IXUGO)
-
-#define DEBUG_NONE         0x0000000
-#define DEBUG_MODULE_LOAD  0x0000001
-#define DEBUG_REGISTER     0x0000002
-#define DEBUG_UNREGISTER   0x0000004
-#define DEBUG_FREE         0x0000008
-#define DEBUG_SET_FLAGS    0x0000010
-#define DEBUG_S_READ       0x0000100	/*  Break  */
-#define DEBUG_I_LOOKUP     0x0001000	/*  Break  */
-#define DEBUG_I_CREATE     0x0002000
-#define DEBUG_I_GET        0x0004000
-#define DEBUG_I_CHANGE     0x0008000
-#define DEBUG_I_UNLINK     0x0010000
-#define DEBUG_I_RLINK      0x0020000
-#define DEBUG_I_FLINK      0x0040000
-#define DEBUG_I_MKNOD      0x0080000
-#define DEBUG_F_READDIR    0x0100000	/*  Break  */
-#define DEBUG_D_DELETE     0x1000000	/*  Break  */
-#define DEBUG_D_RELEASE    0x2000000
-#define DEBUG_D_IPUT       0x4000000
-#define DEBUG_ALL          0xfffffff
-#define DEBUG_DISABLED     DEBUG_NONE
-
-#define OPTION_NONE             0x00
-#define OPTION_MOUNT            0x01
-
-#define PRINTK(format, args...) \
-   {printk (KERN_ERR "%s" format, __FUNCTION__ , ## args);}
-
-#define OOPS(format, args...) \
-   {printk (KERN_CRIT "%s" format, __FUNCTION__ , ## args); \
-    printk ("Forcing Oops\n"); \
-    BUG();}
-
-#ifdef CONFIG_DEVFS_DEBUG
-#  define VERIFY_ENTRY(de) \
-   {if ((de) && (de)->magic_number != MAGIC_VALUE) \
-        OOPS ("(%p): bad magic value: %x\n", (de), (de)->magic_number);}
-#  define WRITE_ENTRY_MAGIC(de,magic) (de)->magic_number = (magic)
-#  define DPRINTK(flag, format, args...) \
-   {if (devfs_debug & flag) \
-	printk (KERN_INFO "%s" format, __FUNCTION__ , ## args);}
-#else
-#  define VERIFY_ENTRY(de)
-#  define WRITE_ENTRY_MAGIC(de,magic)
-#  define DPRINTK(flag, format, args...)
-#endif
-
-typedef struct devfs_entry *devfs_handle_t;
-
-struct directory_type {
-	rwlock_t lock;		/*  Lock for searching(R)/updating(W)   */
-	struct devfs_entry *first;
-	struct devfs_entry *last;
-	unsigned char no_more_additions:1;
-};
-
-struct symlink_type {
-	unsigned int length;	/*  Not including the NULL-termimator       */
-	char *linkname;		/*  This is NULL-terminated                 */
-};
-
-struct devfs_inode {		/*  This structure is for "persistent" inode storage  */
-	struct dentry *dentry;
-	struct timespec atime;
-	struct timespec mtime;
-	struct timespec ctime;
-	unsigned int ino;	/*  Inode number as seen in the VFS         */
-	uid_t uid;
-	gid_t gid;
-};
-
-struct devfs_entry {
-#ifdef CONFIG_DEVFS_DEBUG
-	unsigned int magic_number;
-#endif
-	void *info;
-	atomic_t refcount;	/*  When this drops to zero, it's unused    */
-	union {
-		struct directory_type dir;
-		dev_t dev;
-		struct symlink_type symlink;
-		const char *name;	/*  Only used for (mode == 0)               */
-	} u;
-	struct devfs_entry *prev;	/*  Previous entry in the parent directory  */
-	struct devfs_entry *next;	/*  Next entry in the parent directory      */
-	struct devfs_entry *parent;	/*  The parent directory                    */
-	struct devfs_inode inode;
-	umode_t mode;
-	unsigned short namelen;	/*  I think 64k+ filenames are a way off... */
-	unsigned char vfs:1;	/*  Whether the VFS may delete the entry   */
-	char name[1];		/*  This is just a dummy: the allocated array
-				   is bigger. This is NULL-terminated      */
-};
-
-/*  The root of the device tree  */
-static struct devfs_entry *root_entry;
-
-struct devfsd_buf_entry {
-	struct devfs_entry *de;	/*  The name is generated with this         */
-	unsigned short type;	/*  The type of event                       */
-	umode_t mode;
-	uid_t uid;
-	gid_t gid;
-	struct devfsd_buf_entry *next;
-};
-
-struct fs_info {		/*  This structure is for the mounted devfs  */
-	struct super_block *sb;
-	spinlock_t devfsd_buffer_lock;	/*  Lock when inserting/deleting events  */
-	struct devfsd_buf_entry *devfsd_first_event;
-	struct devfsd_buf_entry *devfsd_last_event;
-	volatile int devfsd_sleeping;
-	volatile struct task_struct *devfsd_task;
-	volatile pid_t devfsd_pgrp;
-	volatile struct file *devfsd_file;
-	struct devfsd_notify_struct *devfsd_info;
-	volatile unsigned long devfsd_event_mask;
-	atomic_t devfsd_overrun_count;
-	wait_queue_head_t devfsd_wait_queue;	/*  Wake devfsd on input       */
-	wait_queue_head_t revalidate_wait_queue;	/*  Wake when devfsd sleeps    */
-};
-
-static struct fs_info fs_info = {.devfsd_buffer_lock = SPIN_LOCK_UNLOCKED };
-static kmem_cache_t *devfsd_buf_cache;
-#ifdef CONFIG_DEVFS_DEBUG
-static unsigned int devfs_debug_init __initdata = DEBUG_NONE;
-static unsigned int devfs_debug = DEBUG_NONE;
-static DEFINE_SPINLOCK(stat_lock);
-static unsigned int stat_num_entries;
-static unsigned int stat_num_bytes;
-#endif
-static unsigned char poison_array[8] =
-    { 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a };
-
-#ifdef CONFIG_DEVFS_MOUNT
-static unsigned int boot_options = OPTION_MOUNT;
-#else
-static unsigned int boot_options = OPTION_NONE;
-#endif
-
-/*  Forward function declarations  */
-static devfs_handle_t _devfs_walk_path(struct devfs_entry *dir,
-				       const char *name, int namelen,
-				       int traverse_symlink);
-static ssize_t devfsd_read(struct file *file, char __user *buf, size_t len,
-			   loff_t * ppos);
-static int devfsd_ioctl(struct inode *inode, struct file *file,
-			unsigned int cmd, unsigned long arg);
-static int devfsd_close(struct inode *inode, struct file *file);
-#ifdef CONFIG_DEVFS_DEBUG
-static ssize_t stat_read(struct file *file, char __user *buf, size_t len,
-			 loff_t * ppos);
-static const struct file_operations stat_fops = {
-	.open = nonseekable_open,
-	.read = stat_read,
-};
-#endif
-
-/*  Devfs daemon file operations  */
-static const struct file_operations devfsd_fops = {
-	.open = nonseekable_open,
-	.read = devfsd_read,
-	.ioctl = devfsd_ioctl,
-	.release = devfsd_close,
-};
-
-/*  Support functions follow  */
-
-/**
- *	devfs_get - Get a reference to a devfs entry.
- *	@de:  The devfs entry.
- */
-
-static struct devfs_entry *devfs_get(struct devfs_entry *de)
-{
-	VERIFY_ENTRY(de);
-	if (de)
-		atomic_inc(&de->refcount);
-	return de;
-}				/*  End Function devfs_get  */
-
-/**
- *	devfs_put - Put (release) a reference to a devfs entry.
- *	@de:  The handle to the devfs entry.
- */
-
-static void devfs_put(devfs_handle_t de)
-{
-	if (!de)
-		return;
-	VERIFY_ENTRY(de);
-	if (de->info == POISON_PTR)
-		OOPS("(%p): poisoned pointer\n", de);
-	if (!atomic_dec_and_test(&de->refcount))
-		return;
-	if (de == root_entry)
-		OOPS("(%p): root entry being freed\n", de);
-	DPRINTK(DEBUG_FREE, "(%s): de: %p, parent: %p \"%s\"\n",
-		de->name, de, de->parent,
-		de->parent ? de->parent->name : "no parent");
-	if (S_ISLNK(de->mode))
-		kfree(de->u.symlink.linkname);
-	WRITE_ENTRY_MAGIC(de, 0);
-#ifdef CONFIG_DEVFS_DEBUG
-	spin_lock(&stat_lock);
-	--stat_num_entries;
-	stat_num_bytes -= sizeof *de + de->namelen;
-	if (S_ISLNK(de->mode))
-		stat_num_bytes -= de->u.symlink.length + 1;
-	spin_unlock(&stat_lock);
-#endif
-	de->info = POISON_PTR;
-	kfree(de);
-}				/*  End Function devfs_put  */
-
-/**
- *	_devfs_search_dir - Search for a devfs entry in a directory.
- *	@dir:  The directory to search.
- *	@name:  The name of the entry to search for.
- *	@namelen:  The number of characters in @name.
- *
- *  Search for a devfs entry in a directory and returns a pointer to the entry
- *   on success, else %NULL. The directory must be locked already.
- *   An implicit devfs_get() is performed on the returned entry.
- */
-
-static struct devfs_entry *_devfs_search_dir(struct devfs_entry *dir,
-					     const char *name,
-					     unsigned int namelen)
-{
-	struct devfs_entry *curr;
-
-	if (!S_ISDIR(dir->mode)) {
-		PRINTK("(%s): not a directory\n", dir->name);
-		return NULL;
-	}
-	for (curr = dir->u.dir.first; curr != NULL; curr = curr->next) {
-		if (curr->namelen != namelen)
-			continue;
-		if (memcmp(curr->name, name, namelen) == 0)
-			break;
-		/*  Not found: try the next one  */
-	}
-	return devfs_get(curr);
-}				/*  End Function _devfs_search_dir  */
-
-/**
- *	_devfs_alloc_entry - Allocate a devfs entry.
- *	@name:     the name of the entry
- *	@namelen:  the number of characters in @name
- *      @mode:     the mode for the entry
- *
- *  Allocate a devfs entry and returns a pointer to the entry on success, else
- *   %NULL.
- */
-
-static struct devfs_entry *_devfs_alloc_entry(const char *name,
-					      unsigned int namelen,
-					      umode_t mode)
-{
-	struct devfs_entry *new;
-	static unsigned long inode_counter = FIRST_INODE;
-	static DEFINE_SPINLOCK(counter_lock);
-
-	if (name && (namelen < 1))
-		namelen = strlen(name);
-	if ((new = kmalloc(sizeof *new + namelen, GFP_KERNEL)) == NULL)
-		return NULL;
-	memset(new, 0, sizeof *new + namelen);	/*  Will set '\0' on name  */
-	new->mode = mode;
-	if (S_ISDIR(mode))
-		rwlock_init(&new->u.dir.lock);
-	atomic_set(&new->refcount, 1);
-	spin_lock(&counter_lock);
-	new->inode.ino = inode_counter++;
-	spin_unlock(&counter_lock);
-	if (name)
-		memcpy(new->name, name, namelen);
-	new->namelen = namelen;
-	WRITE_ENTRY_MAGIC(new, MAGIC_VALUE);
-#ifdef CONFIG_DEVFS_DEBUG
-	spin_lock(&stat_lock);
-	++stat_num_entries;
-	stat_num_bytes += sizeof *new + namelen;
-	spin_unlock(&stat_lock);
-#endif
-	return new;
-}				/*  End Function _devfs_alloc_entry  */
-
-/**
- *	_devfs_append_entry - Append a devfs entry to a directory's child list.
- *	@dir:  The directory to add to.
- *	@de:  The devfs entry to append.
- *	@old_de: If an existing entry exists, it will be written here. This may
- *		 be %NULL. An implicit devfs_get() is performed on this entry.
- *
- *  Append a devfs entry to a directory's list of children, checking first to
- *   see if an entry of the same name exists. The directory will be locked.
- *   The value 0 is returned on success, else a negative error code.
- *   On failure, an implicit devfs_put() is performed on %de.
- */
-
-static int _devfs_append_entry(devfs_handle_t dir, devfs_handle_t de,
-			       devfs_handle_t * old_de)
-{
-	int retval;
-
-	if (old_de)
-		*old_de = NULL;
-	if (!S_ISDIR(dir->mode)) {
-		PRINTK("(%s): dir: \"%s\" is not a directory\n", de->name,
-		       dir->name);
-		devfs_put(de);
-		return -ENOTDIR;
-	}
-	write_lock(&dir->u.dir.lock);
-	if (dir->u.dir.no_more_additions)
-		retval = -ENOENT;
-	else {
-		struct devfs_entry *old;
-
-		old = _devfs_search_dir(dir, de->name, de->namelen);
-		if (old_de)
-			*old_de = old;
-		else
-			devfs_put(old);
-		if (old == NULL) {
-			de->parent = dir;
-			de->prev = dir->u.dir.last;
-			/*  Append to the directory's list of children  */
-			if (dir->u.dir.first == NULL)
-				dir->u.dir.first = de;
-			else
-				dir->u.dir.last->next = de;
-			dir->u.dir.last = de;
-			retval = 0;
-		} else
-			retval = -EEXIST;
-	}
-	write_unlock(&dir->u.dir.lock);
-	if (retval)
-		devfs_put(de);
-	return retval;
-}				/*  End Function _devfs_append_entry  */
-
-/**
- *	_devfs_get_root_entry - Get the root devfs entry.
- *
- *	Returns the root devfs entry on success, else %NULL.
- *
- *	TODO it must be called asynchronously due to the fact
- *	that devfs is initialized relatively late. Proper way
- *	is to remove module_init from init_devfs_fs and manually
- *	call it early enough during system init
- */
-
-static struct devfs_entry *_devfs_get_root_entry(void)
-{
-	struct devfs_entry *new;
-	static DEFINE_SPINLOCK(root_lock);
-
-	if (root_entry)
-		return root_entry;
-
-	new = _devfs_alloc_entry(NULL, 0, MODE_DIR);
-	if (new == NULL)
-		return NULL;
-
-	spin_lock(&root_lock);
-	if (root_entry) {
-		spin_unlock(&root_lock);
-		devfs_put(new);
-		return root_entry;
-	}
-	root_entry = new;
-	spin_unlock(&root_lock);
-
-	return root_entry;
-}				/*  End Function _devfs_get_root_entry  */
-
-/**
- *	_devfs_descend - Descend down a tree using the next component name.
- *	@dir:  The directory to search.
- *	@name:  The component name to search for.
- *	@namelen:  The length of %name.
- *	@next_pos:  The position of the next '/' or '\0' is written here.
- *
- *  Descend into a directory, searching for a component. This function forms
- *   the core of a tree-walking algorithm. The directory will be locked.
- *   The devfs entry corresponding to the component is returned. If there is
- *   no matching entry, %NULL is returned.
- *   An implicit devfs_get() is performed on the returned entry.
- */
-
-static struct devfs_entry *_devfs_descend(struct devfs_entry *dir,
-					  const char *name, int namelen,
-					  int *next_pos)
-{
-	const char *stop, *ptr;
-	struct devfs_entry *entry;
-
-	if ((namelen >= 3) && (strncmp(name, "../", 3) == 0)) {	/*  Special-case going to parent directory  */
-		*next_pos = 3;
-		return devfs_get(dir->parent);
-	}
-	stop = name + namelen;
-	/*  Search for a possible '/'  */
-	for (ptr = name; (ptr < stop) && (*ptr != '/'); ++ptr) ;
-	*next_pos = ptr - name;
-	read_lock(&dir->u.dir.lock);
-	entry = _devfs_search_dir(dir, name, *next_pos);
-	read_unlock(&dir->u.dir.lock);
-	return entry;
-}				/*  End Function _devfs_descend  */
-
-static devfs_handle_t _devfs_make_parent_for_leaf(struct devfs_entry *dir,
-						  const char *name,
-						  int namelen, int *leaf_pos)
-{
-	int next_pos = 0;
-
-	if (dir == NULL)
-		dir = _devfs_get_root_entry();
-	if (dir == NULL)
-		return NULL;
-	devfs_get(dir);
-	/*  Search for possible trailing component and ignore it  */
-	for (--namelen; (namelen > 0) && (name[namelen] != '/'); --namelen) ;
-	*leaf_pos = (name[namelen] == '/') ? (namelen + 1) : 0;
-	for (; namelen > 0; name += next_pos, namelen -= next_pos) {
-		struct devfs_entry *de, *old = NULL;
-
-		if ((de =
-		     _devfs_descend(dir, name, namelen, &next_pos)) == NULL) {
-			de = _devfs_alloc_entry(name, next_pos, MODE_DIR);
-			devfs_get(de);
-			if (!de || _devfs_append_entry(dir, de, &old)) {
-				devfs_put(de);
-				if (!old || !S_ISDIR(old->mode)) {
-					devfs_put(old);
-					devfs_put(dir);
-					return NULL;
-				}
-				de = old;	/*  Use the existing directory  */
-			}
-		}
-		if (de == dir->parent) {
-			devfs_put(dir);
-			devfs_put(de);
-			return NULL;
-		}
-		devfs_put(dir);
-		dir = de;
-		if (name[next_pos] == '/')
-			++next_pos;
-	}
-	return dir;
-}				/*  End Function _devfs_make_parent_for_leaf  */
-
-static devfs_handle_t _devfs_prepare_leaf(devfs_handle_t * dir,
-					  const char *name, umode_t mode)
-{
-	int namelen, leaf_pos;
-	struct devfs_entry *de;
-
-	namelen = strlen(name);
-	if ((*dir = _devfs_make_parent_for_leaf(*dir, name, namelen,
-						&leaf_pos)) == NULL) {
-		PRINTK("(%s): could not create parent path\n", name);
-		return NULL;
-	}
-	if ((de = _devfs_alloc_entry(name + leaf_pos, namelen - leaf_pos, mode))
-	    == NULL) {
-		PRINTK("(%s): could not allocate entry\n", name);
-		devfs_put(*dir);
-		return NULL;
-	}
-	return de;
-}				/*  End Function _devfs_prepare_leaf  */
-
-static devfs_handle_t _devfs_walk_path(struct devfs_entry *dir,
-				       const char *name, int namelen,
-				       int traverse_symlink)
-{
-	int next_pos = 0;
-
-	if (dir == NULL)
-		dir = _devfs_get_root_entry();
-	if (dir == NULL)
-		return NULL;
-	devfs_get(dir);
-	for (; namelen > 0; name += next_pos, namelen -= next_pos) {
-		struct devfs_entry *de, *link;
-
-		if (!S_ISDIR(dir->mode)) {
-			devfs_put(dir);
-			return NULL;
-		}
-
-		if ((de =
-		     _devfs_descend(dir, name, namelen, &next_pos)) == NULL) {
-			devfs_put(dir);
-			return NULL;
-		}
-		if (S_ISLNK(de->mode) && traverse_symlink) {	/*  Need to follow the link: this is a stack chomper  */
-			/* FIXME what if it puts outside of mounted tree? */
-			link = _devfs_walk_path(dir, de->u.symlink.linkname,
-						de->u.symlink.length, TRUE);
-			devfs_put(de);
-			if (!link) {
-				devfs_put(dir);
-				return NULL;
-			}
-			de = link;
-		}
-		devfs_put(dir);
-		dir = de;
-		if (name[next_pos] == '/')
-			++next_pos;
-	}
-	return dir;
-}				/*  End Function _devfs_walk_path  */
-
-/**
- *	_devfs_find_entry - Find a devfs entry.
- *	@dir: The handle to the parent devfs directory entry. If this is %NULL the
- *		name is relative to the root of the devfs.
- *	@name: The name of the entry. This may be %NULL.
- *	@traverse_symlink: If %TRUE then symbolic links are traversed.
- *
- *	Returns the devfs_entry pointer on success, else %NULL. An implicit
- *	devfs_get() is performed.
- */
-
-static struct devfs_entry *_devfs_find_entry(devfs_handle_t dir,
-					     const char *name,
-					     int traverse_symlink)
-{
-	unsigned int namelen = strlen(name);
-
-	if (name[0] == '/') {
-		/*  Skip leading pathname component  */
-		if (namelen < 2) {
-			PRINTK("(%s): too short\n", name);
-			return NULL;
-		}
-		for (++name, --namelen; (*name != '/') && (namelen > 0);
-		     ++name, --namelen) ;
-		if (namelen < 2) {
-			PRINTK("(%s): too short\n", name);
-			return NULL;
-		}
-		++name;
-		--namelen;
-	}
-	return _devfs_walk_path(dir, name, namelen, traverse_symlink);
-}				/*  End Function _devfs_find_entry  */
-
-static struct devfs_entry *get_devfs_entry_from_vfs_inode(struct inode *inode)
-{
-	if (inode == NULL)
-		return NULL;
-	VERIFY_ENTRY((struct devfs_entry *)inode->u.generic_ip);
-	return inode->u.generic_ip;
-}				/*  End Function get_devfs_entry_from_vfs_inode  */
-
-/**
- *	free_dentry - Free the dentry for a device entry and invalidate inode.
- *	@de: The entry.
- *
- *	This must only be called after the entry has been unhooked from its
- *	 parent directory.
- */
-
-static void free_dentry(struct devfs_entry *de)
-{
-	struct dentry *dentry = de->inode.dentry;
-
-	if (!dentry)
-		return;
-	spin_lock(&dcache_lock);
-	dget_locked(dentry);
-	spin_unlock(&dcache_lock);
-	/*  Forcefully remove the inode  */
-	if (dentry->d_inode != NULL)
-		dentry->d_inode->i_nlink = 0;
-	d_drop(dentry);
-	dput(dentry);
-}				/*  End Function free_dentry  */
-
-/**
- *	is_devfsd_or_child - Test if the current process is devfsd or one of its children.
- *	@fs_info: The filesystem information.
- *
- *	Returns %TRUE if devfsd or child, else %FALSE.
- */
-
-static int is_devfsd_or_child(struct fs_info *fs_info)
-{
-	struct task_struct *p = current;
-
-	if (p == fs_info->devfsd_task)
-		return (TRUE);
-	if (process_group(p) == fs_info->devfsd_pgrp)
-		return (TRUE);
-	read_lock(&tasklist_lock);
-	for (; p != &init_task; p = p->real_parent) {
-		if (p == fs_info->devfsd_task) {
-			read_unlock(&tasklist_lock);
-			return (TRUE);
-		}
-	}
-	read_unlock(&tasklist_lock);
-	return (FALSE);
-}				/*  End Function is_devfsd_or_child  */
-
-/**
- *	devfsd_queue_empty - Test if devfsd has work pending in its event queue.
- *	@fs_info: The filesystem information.
- *
- *	Returns %TRUE if the queue is empty, else %FALSE.
- */
-
-static inline int devfsd_queue_empty(struct fs_info *fs_info)
-{
-	return (fs_info->devfsd_last_event) ? FALSE : TRUE;
-}				/*  End Function devfsd_queue_empty  */
-
-/**
- *	wait_for_devfsd_finished - Wait for devfsd to finish processing its event queue.
- *	@fs_info: The filesystem information.
- *
- *	Returns %TRUE if no more waiting will be required, else %FALSE.
- */
-
-static int wait_for_devfsd_finished(struct fs_info *fs_info)
-{
-	DECLARE_WAITQUEUE(wait, current);
-
-	if (fs_info->devfsd_task == NULL)
-		return (TRUE);
-	if (devfsd_queue_empty(fs_info) && fs_info->devfsd_sleeping)
-		return TRUE;
-	if (is_devfsd_or_child(fs_info))
-		return (FALSE);
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	add_wait_queue(&fs_info->revalidate_wait_queue, &wait);
-	if (!devfsd_queue_empty(fs_info) || !fs_info->devfsd_sleeping)
-		if (fs_info->devfsd_task)
-			schedule();
-	remove_wait_queue(&fs_info->revalidate_wait_queue, &wait);
-	__set_current_state(TASK_RUNNING);
-	return (TRUE);
-}				/*  End Function wait_for_devfsd_finished  */
-
-/**
- *	devfsd_notify_de - Notify the devfsd daemon of a change.
- *	@de: The devfs entry that has changed. This and all parent entries will
- *            have their reference counts incremented if the event was queued.
- *	@type: The type of change.
- *	@mode: The mode of the entry.
- *	@uid: The user ID.
- *	@gid: The group ID.
- *	@fs_info: The filesystem info.
- *
- *	Returns %TRUE if an event was queued and devfsd woken up, else %FALSE.
- */
-
-static int devfsd_notify_de(struct devfs_entry *de,
-			    unsigned short type, umode_t mode,
-			    uid_t uid, gid_t gid, struct fs_info *fs_info)
-{
-	struct devfsd_buf_entry *entry;
-	struct devfs_entry *curr;
-
-	if (!(fs_info->devfsd_event_mask & (1 << type)))
-		return (FALSE);
-	if ((entry = kmem_cache_alloc(devfsd_buf_cache, SLAB_KERNEL)) == NULL) {
-		atomic_inc(&fs_info->devfsd_overrun_count);
-		return (FALSE);
-	}
-	for (curr = de; curr != NULL; curr = curr->parent)
-		devfs_get(curr);
-	entry->de = de;
-	entry->type = type;
-	entry->mode = mode;
-	entry->uid = uid;
-	entry->gid = gid;
-	entry->next = NULL;
-	spin_lock(&fs_info->devfsd_buffer_lock);
-	if (!fs_info->devfsd_first_event)
-		fs_info->devfsd_first_event = entry;
-	if (fs_info->devfsd_last_event)
-		fs_info->devfsd_last_event->next = entry;
-	fs_info->devfsd_last_event = entry;
-	spin_unlock(&fs_info->devfsd_buffer_lock);
-	wake_up_interruptible(&fs_info->devfsd_wait_queue);
-	return (TRUE);
-}				/*  End Function devfsd_notify_de  */
-
-/**
- *	devfsd_notify - Notify the devfsd daemon of a change.
- *	@de: The devfs entry that has changed.
- *	@type: The type of change event.
- *	@wait: If TRUE, the function waits for the daemon to finish processing
- *		the event.
- */
-
-static void devfsd_notify(struct devfs_entry *de, unsigned short type)
-{
-	devfsd_notify_de(de, type, de->mode, current->euid,
-			 current->egid, &fs_info);
-}
-
-static int devfs_mk_dev(dev_t dev, umode_t mode, const char *fmt, va_list args)
-{
-	struct devfs_entry *dir = NULL, *de;
-	char buf[64];
-	int error, n;
-
-	n = vsnprintf(buf, sizeof(buf), fmt, args);
-	if (n >= sizeof(buf) || !buf[0]) {
-		printk(KERN_WARNING "%s: invalid format string %s\n",
-		       __FUNCTION__, fmt);
-		return -EINVAL;
-	}
-
-	de = _devfs_prepare_leaf(&dir, buf, mode);
-	if (!de) {
-		printk(KERN_WARNING "%s: could not prepare leaf for %s\n",
-		       __FUNCTION__, buf);
-		return -ENOMEM;	/* could be more accurate... */
-	}
-
-	de->u.dev = dev;
-
-	error = _devfs_append_entry(dir, de, NULL);
-	if (error) {
-		printk(KERN_WARNING "%s: could not append to parent for %s\n",
-		       __FUNCTION__, buf);
-		goto out;
-	}
-
-	devfsd_notify(de, DEVFSD_NOTIFY_REGISTERED);
-      out:
-	devfs_put(dir);
-	return error;
-}
-
-int devfs_mk_bdev(dev_t dev, umode_t mode, const char *fmt, ...)
-{
-	va_list args;
-
-	if (!S_ISBLK(mode)) {
-		printk(KERN_WARNING "%s: invalide mode (%u) for %s\n",
-		       __FUNCTION__, mode, fmt);
-		return -EINVAL;
-	}
-
-	va_start(args, fmt);
-	return devfs_mk_dev(dev, mode, fmt, args);
-}
-
-EXPORT_SYMBOL(devfs_mk_bdev);
-
-int devfs_mk_cdev(dev_t dev, umode_t mode, const char *fmt, ...)
-{
-	va_list args;
-
-	if (!S_ISCHR(mode)) {
-		printk(KERN_WARNING "%s: invalide mode (%u) for %s\n",
-		       __FUNCTION__, mode, fmt);
-		return -EINVAL;
-	}
-
-	va_start(args, fmt);
-	return devfs_mk_dev(dev, mode, fmt, args);
-}
-
-EXPORT_SYMBOL(devfs_mk_cdev);
-
-/**
- *	_devfs_unhook - Unhook a device entry from its parents list
- *	@de: The entry to unhook.
- *
- *	Returns %TRUE if the entry was unhooked, else %FALSE if it was
- *		previously unhooked.
- *	The caller must have a write lock on the parent directory.
- */
-
-static int _devfs_unhook(struct devfs_entry *de)
-{
-	struct devfs_entry *parent;
-
-	if (!de || (de->prev == de))
-		return FALSE;
-	parent = de->parent;
-	if (de->prev == NULL)
-		parent->u.dir.first = de->next;
-	else
-		de->prev->next = de->next;
-	if (de->next == NULL)
-		parent->u.dir.last = de->prev;
-	else
-		de->next->prev = de->prev;
-	de->prev = de;		/*  Indicate we're unhooked                      */
-	de->next = NULL;	/*  Force early termination for <devfs_readdir>  */
-	return TRUE;
-}				/*  End Function _devfs_unhook  */
-
-/**
- *	_devfs_unregister - Unregister a device entry from its parent.
- *	@dir: The parent directory.
- *	@de: The entry to unregister.
- *
- *	The caller must have a write lock on the parent directory, which is
- *	unlocked by this function.
- */
-
-static void _devfs_unregister(struct devfs_entry *dir, struct devfs_entry *de)
-{
-	int unhooked = _devfs_unhook(de);
-
-	write_unlock(&dir->u.dir.lock);
-	if (!unhooked)
-		return;
-	devfs_get(dir);
-	devfsd_notify(de, DEVFSD_NOTIFY_UNREGISTERED);
-	free_dentry(de);
-	devfs_put(dir);
-	if (!S_ISDIR(de->mode))
-		return;
-	while (TRUE) {		/*  Recursively unregister: this is a stack chomper  */
-		struct devfs_entry *child;
-
-		write_lock(&de->u.dir.lock);
-		de->u.dir.no_more_additions = TRUE;
-		child = de->u.dir.first;
-		VERIFY_ENTRY(child);
-		_devfs_unregister(de, child);
-		if (!child)
-			break;
-		DPRINTK(DEBUG_UNREGISTER, "(%s): child: %p  refcount: %d\n",
-			child->name, child, atomic_read(&child->refcount));
-		devfs_put(child);
-	}
-}				/*  End Function _devfs_unregister  */
-
-static int devfs_do_symlink(devfs_handle_t dir, const char *name,
-			    const char *link, devfs_handle_t * handle)
-{
-	int err;
-	unsigned int linklength;
-	char *newlink;
-	struct devfs_entry *de;
-
-	if (handle != NULL)
-		*handle = NULL;
-	if (name == NULL) {
-		PRINTK("(): NULL name pointer\n");
-		return -EINVAL;
-	}
-	if (link == NULL) {
-		PRINTK("(%s): NULL link pointer\n", name);
-		return -EINVAL;
-	}
-	linklength = strlen(link);
-	if ((newlink = kmalloc(linklength + 1, GFP_KERNEL)) == NULL)
-		return -ENOMEM;
-	memcpy(newlink, link, linklength);
-	newlink[linklength] = '\0';
-	if ((de = _devfs_prepare_leaf(&dir, name, S_IFLNK | S_IRUGO | S_IXUGO))
-	    == NULL) {
-		PRINTK("(%s): could not prepare leaf\n", name);
-		kfree(newlink);
-		return -ENOTDIR;
-	}
-	de->info = NULL;
-	de->u.symlink.linkname = newlink;
-	de->u.symlink.length = linklength;
-	if ((err = _devfs_append_entry(dir, de, NULL)) != 0) {
-		PRINTK("(%s): could not append to parent, err: %d\n", name,
-		       err);
-		devfs_put(dir);
-		return err;
-	}
-	devfs_put(dir);
-#ifdef CONFIG_DEVFS_DEBUG
-	spin_lock(&stat_lock);
-	stat_num_bytes += linklength + 1;
-	spin_unlock(&stat_lock);
-#endif
-	if (handle != NULL)
-		*handle = de;
-	return 0;
-}				/*  End Function devfs_do_symlink  */
-
-/**
- *	devfs_mk_symlink Create a symbolic link in the devfs namespace.
- *	@from: The name of the entry.
- *	@to: Name of the destination
- *
- *	Returns 0 on success, else a negative error code is returned.
- */
-
-int devfs_mk_symlink(const char *from, const char *to)
-{
-	devfs_handle_t de;
-	int err;
-
-	err = devfs_do_symlink(NULL, from, to, &de);
-	if (!err) {
-		de->vfs = TRUE;
-		devfsd_notify(de, DEVFSD_NOTIFY_REGISTERED);
-	}
-
-	return err;
-}
-
-/**
- *	devfs_mk_dir - Create a directory in the devfs namespace.
- *		new name is relative to the root of the devfs.
- *	@fmt: The name of the entry.
- *
- *	Use of this function is optional. The devfs_register() function
- *	will automatically create intermediate directories as needed. This function
- *	is provided for efficiency reasons, as it provides a handle to a directory.
- *	On failure %NULL is returned.
- */
-
-int devfs_mk_dir(const char *fmt, ...)
-{
-	struct devfs_entry *dir = NULL, *de = NULL, *old;
-	char buf[64];
-	va_list args;
-	int error, n;
-
-	va_start(args, fmt);
-	n = vsnprintf(buf, 64, fmt, args);
-	if (n >= 64 || !buf[0]) {
-		printk(KERN_WARNING "%s: invalid argument.", __FUNCTION__);
-		return -EINVAL;
-	}
-
-	de = _devfs_prepare_leaf(&dir, buf, MODE_DIR);
-	if (!de) {
-		PRINTK("(%s): could not prepare leaf\n", buf);
-		return -EINVAL;
-	}
-
-	error = _devfs_append_entry(dir, de, &old);
-	if (error == -EEXIST && S_ISDIR(old->mode)) {
-		/*
-		 * devfs_mk_dir() of an already-existing directory will
-		 * return success.
-		 */
-		error = 0;
-		goto out_put;
-	} else if (error) {
-		PRINTK("(%s): could not append to dir: %p \"%s\"\n",
-		       buf, dir, dir->name);
-		devfs_put(old);
-		goto out_put;
-	}
-
-	devfsd_notify(de, DEVFSD_NOTIFY_REGISTERED);
-
-      out_put:
-	devfs_put(dir);
-	return error;
-}
-
-void devfs_remove(const char *fmt, ...)
-{
-	char buf[64];
-	va_list args;
-	int n;
-
-	va_start(args, fmt);
-	n = vsnprintf(buf, sizeof(buf), fmt, args);
-	if (n < sizeof(buf) && buf[0]) {
-		devfs_handle_t de = _devfs_find_entry(NULL, buf, 0);
-
-		if (!de) {
-			printk(KERN_ERR "%s: %s not found, cannot remove\n",
-			       __FUNCTION__, buf);
-			dump_stack();
-			return;
-		}
-
-		write_lock(&de->parent->u.dir.lock);
-		_devfs_unregister(de->parent, de);
-		devfs_put(de);
-		devfs_put(de);
-	}
-}
-
-/**
- *	devfs_generate_path - Generate a pathname for an entry, relative to the devfs root.
- *	@de: The devfs entry.
- *	@path: The buffer to write the pathname to. The pathname and '\0'
- *		terminator will be written at the end of the buffer.
- *	@buflen: The length of the buffer.
- *
- *	Returns the offset in the buffer where the pathname starts on success,
- *	else a negative error code.
- */
-
-static int devfs_generate_path(devfs_handle_t de, char *path, int buflen)
-{
-	int pos;
-#define NAMEOF(de) ( (de)->mode ? (de)->name : (de)->u.name )
-
-	if (de == NULL)
-		return -EINVAL;
-	VERIFY_ENTRY(de);
-	if (de->namelen >= buflen)
-		return -ENAMETOOLONG;	/*  Must be first       */
-	path[buflen - 1] = '\0';
-	if (de->parent == NULL)
-		return buflen - 1;	/*  Don't prepend root  */
-	pos = buflen - de->namelen - 1;
-	memcpy(path + pos, NAMEOF(de), de->namelen);
-	for (de = de->parent; de->parent != NULL; de = de->parent) {
-		if (pos - de->namelen - 1 < 0)
-			return -ENAMETOOLONG;
-		path[--pos] = '/';
-		pos -= de->namelen;
-		memcpy(path + pos, NAMEOF(de), de->namelen);
-	}
-	return pos;
-}				/*  End Function devfs_generate_path  */
-
-/**
- *	devfs_setup - Process kernel boot options.
- *	@str: The boot options after the "devfs=".
- */
-
-static int __init devfs_setup(char *str)
-{
-	static struct {
-		char *name;
-		unsigned int mask;
-		unsigned int *opt;
-	} devfs_options_tab[] __initdata = {
-#ifdef CONFIG_DEVFS_DEBUG
-		{
-		"dall", DEBUG_ALL, &devfs_debug_init}, {
-		"dmod", DEBUG_MODULE_LOAD, &devfs_debug_init}, {
-		"dreg", DEBUG_REGISTER, &devfs_debug_init}, {
-		"dunreg", DEBUG_UNREGISTER, &devfs_debug_init}, {
-		"dfree", DEBUG_FREE, &devfs_debug_init}, {
-		"diget", DEBUG_I_GET, &devfs_debug_init}, {
-		"dchange", DEBUG_SET_FLAGS, &devfs_debug_init}, {
-		"dsread", DEBUG_S_READ, &devfs_debug_init}, {
-		"dichange", DEBUG_I_CHANGE, &devfs_debug_init}, {
-		"dimknod", DEBUG_I_MKNOD, &devfs_debug_init}, {
-		"dilookup", DEBUG_I_LOOKUP, &devfs_debug_init}, {
-		"diunlink", DEBUG_I_UNLINK, &devfs_debug_init},
-#endif				/*  CONFIG_DEVFS_DEBUG  */
-		{
-		"mount", OPTION_MOUNT, &boot_options}, {
-		NULL, 0, NULL}
-	};
-
-	while ((*str != '\0') && !isspace(*str)) {
-		int i, found = 0, invert = 0;
-
-		if (strncmp(str, "no", 2) == 0) {
-			invert = 1;
-			str += 2;
-		}
-		for (i = 0; devfs_options_tab[i].name != NULL; i++) {
-			int len = strlen(devfs_options_tab[i].name);
-
-			if (strncmp(str, devfs_options_tab[i].name, len) == 0) {
-				if (invert)
-					*devfs_options_tab[i].opt &=
-					    ~devfs_options_tab[i].mask;
-				else
-					*devfs_options_tab[i].opt |=
-					    devfs_options_tab[i].mask;
-				str += len;
-				found = 1;
-				break;
-			}
-		}
-		if (!found)
-			return 0;	/*  No match         */
-		if (*str != ',')
-			return 0;	/*  No more options  */
-		++str;
-	}
-	return 1;
-}				/*  End Function devfs_setup  */
-
-__setup("devfs=", devfs_setup);
-
-EXPORT_SYMBOL(devfs_mk_dir);
-EXPORT_SYMBOL(devfs_remove);
-
-/**
- *	try_modload - Notify devfsd of an inode lookup by a non-devfsd process.
- *	@parent: The parent devfs entry.
- *	@fs_info: The filesystem info.
- *	@name: The device name.
- *	@namelen: The number of characters in @name.
- *	@buf: A working area that will be used. This must not go out of scope
- *            until devfsd is idle again.
- *
- *	Returns 0 on success (event was queued), else a negative error code.
- */
-
-static int try_modload(struct devfs_entry *parent, struct fs_info *fs_info,
-		       const char *name, unsigned namelen,
-		       struct devfs_entry *buf)
-{
-	if (!(fs_info->devfsd_event_mask & (1 << DEVFSD_NOTIFY_LOOKUP)))
-		return -ENOENT;
-	if (is_devfsd_or_child(fs_info))
-		return -ENOENT;
-	memset(buf, 0, sizeof *buf);
-	atomic_set(&buf->refcount, 1);
-	buf->parent = parent;
-	buf->namelen = namelen;
-	buf->u.name = name;
-	WRITE_ENTRY_MAGIC(buf, MAGIC_VALUE);
-	if (!devfsd_notify_de(buf, DEVFSD_NOTIFY_LOOKUP, 0,
-			      current->euid, current->egid, fs_info))
-		return -ENOENT;
-	/*  Possible success: event has been queued  */
-	return 0;
-}				/*  End Function try_modload  */
-
-/*  Superblock operations follow  */
-
-static struct inode_operations devfs_iops;
-static struct inode_operations devfs_dir_iops;
-static const struct file_operations devfs_fops;
-static const struct file_operations devfs_dir_fops;
-static struct inode_operations devfs_symlink_iops;
-
-static int devfs_notify_change(struct dentry *dentry, struct iattr *iattr)
-{
-	int retval;
-	struct devfs_entry *de;
-	struct inode *inode = dentry->d_inode;
-	struct fs_info *fs_info = inode->i_sb->s_fs_info;
-
-	de = get_devfs_entry_from_vfs_inode(inode);
-	if (de == NULL)
-		return -ENODEV;
-	retval = inode_change_ok(inode, iattr);
-	if (retval != 0)
-		return retval;
-	retval = inode_setattr(inode, iattr);
-	if (retval != 0)
-		return retval;
-	DPRINTK(DEBUG_I_CHANGE, "(%d): VFS inode: %p  devfs_entry: %p\n",
-		(int)inode->i_ino, inode, de);
-	DPRINTK(DEBUG_I_CHANGE, "():   mode: 0%o  uid: %d  gid: %d\n",
-		(int)inode->i_mode, (int)inode->i_uid, (int)inode->i_gid);
-	/*  Inode is not on hash chains, thus must save permissions here rather
-	   than in a write_inode() method  */
-	de->mode = inode->i_mode;
-	de->inode.uid = inode->i_uid;
-	de->inode.gid = inode->i_gid;
-	de->inode.atime = inode->i_atime;
-	de->inode.mtime = inode->i_mtime;
-	de->inode.ctime = inode->i_ctime;
-	if ((iattr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) &&
-	    !is_devfsd_or_child(fs_info))
-		devfsd_notify_de(de, DEVFSD_NOTIFY_CHANGE, inode->i_mode,
-				 inode->i_uid, inode->i_gid, fs_info);
-	return 0;
-}				/*  End Function devfs_notify_change  */
-
-static struct super_operations devfs_sops = {
-	.drop_inode = generic_delete_inode,
-	.statfs = simple_statfs,
-};
-
-/**
- *	_devfs_get_vfs_inode - Get a VFS inode.
- *	@sb: The super block.
- *	@de: The devfs inode.
- *	@dentry: The dentry to register with the devfs inode.
- *
- *	Returns the inode on success, else %NULL. An implicit devfs_get() is
- *       performed if the inode is created.
- */
-
-static struct inode *_devfs_get_vfs_inode(struct super_block *sb,
-					  struct devfs_entry *de,
-					  struct dentry *dentry)
-{
-	struct inode *inode;
-
-	if (de->prev == de)
-		return NULL;	/*  Quick check to see if unhooked  */
-	if ((inode = new_inode(sb)) == NULL) {
-		PRINTK("(%s): new_inode() failed, de: %p\n", de->name, de);
-		return NULL;
-	}
-	if (de->parent) {
-		read_lock(&de->parent->u.dir.lock);
-		if (de->prev != de)
-			de->inode.dentry = dentry;	/*      Not unhooked  */
-		read_unlock(&de->parent->u.dir.lock);
-	} else
-		de->inode.dentry = dentry;	/*  Root: no locking needed  */
-	if (de->inode.dentry != dentry) {	/*  Must have been unhooked  */
-		iput(inode);
-		return NULL;
-	}
-	/* FIXME where is devfs_put? */
-	inode->u.generic_ip = devfs_get(de);
-	inode->i_ino = de->inode.ino;
-	DPRINTK(DEBUG_I_GET, "(%d): VFS inode: %p  devfs_entry: %p\n",
-		(int)inode->i_ino, inode, de);
-	inode->i_blocks = 0;
-	inode->i_blksize = FAKE_BLOCK_SIZE;
-	inode->i_op = &devfs_iops;
-	inode->i_mode = de->mode;
-	if (S_ISDIR(de->mode)) {
-		inode->i_op = &devfs_dir_iops;
-		inode->i_fop = &devfs_dir_fops;
-	} else if (S_ISLNK(de->mode)) {
-		inode->i_op = &devfs_symlink_iops;
-		inode->i_size = de->u.symlink.length;
-	} else if (S_ISCHR(de->mode) || S_ISBLK(de->mode)) {
-		init_special_inode(inode, de->mode, de->u.dev);
-	} else if (S_ISFIFO(de->mode) || S_ISSOCK(de->mode)) {
-		init_special_inode(inode, de->mode, 0);
-	} else {
-		PRINTK("(%s): unknown mode %o de: %p\n",
-		       de->name, de->mode, de);
-		iput(inode);
-		devfs_put(de);
-		return NULL;
-	}
-
-	inode->i_uid = de->inode.uid;
-	inode->i_gid = de->inode.gid;
-	inode->i_atime = de->inode.atime;
-	inode->i_mtime = de->inode.mtime;
-	inode->i_ctime = de->inode.ctime;
-	DPRINTK(DEBUG_I_GET, "():   mode: 0%o  uid: %d  gid: %d\n",
-		(int)inode->i_mode, (int)inode->i_uid, (int)inode->i_gid);
-	return inode;
-}				/*  End Function _devfs_get_vfs_inode  */
-
-/*  File operations for device entries follow  */
-
-static int devfs_readdir(struct file *file, void *dirent, filldir_t filldir)
-{
-	int err, count;
-	int stored = 0;
-	struct fs_info *fs_info;
-	struct devfs_entry *parent, *de, *next = NULL;
-	struct inode *inode = file->f_dentry->d_inode;
-
-	fs_info = inode->i_sb->s_fs_info;
-	parent = get_devfs_entry_from_vfs_inode(file->f_dentry->d_inode);
-	if ((long)file->f_pos < 0)
-		return -EINVAL;
-	DPRINTK(DEBUG_F_READDIR, "(%s): fs_info: %p  pos: %ld\n",
-		parent->name, fs_info, (long)file->f_pos);
-	switch ((long)file->f_pos) {
-	case 0:
-		err = (*filldir) (dirent, "..", 2, file->f_pos,
-				  parent_ino(file->f_dentry), DT_DIR);
-		if (err == -EINVAL)
-			break;
-		if (err < 0)
-			return err;
-		file->f_pos++;
-		++stored;
-		/*  Fall through  */
-	case 1:
-		err =
-		    (*filldir) (dirent, ".", 1, file->f_pos, inode->i_ino,
-				DT_DIR);
-		if (err == -EINVAL)
-			break;
-		if (err < 0)
-			return err;
-		file->f_pos++;
-		++stored;
-		/*  Fall through  */
-	default:
-		/*  Skip entries  */
-		count = file->f_pos - 2;
-		read_lock(&parent->u.dir.lock);
-		for (de = parent->u.dir.first; de && (count > 0); de = de->next)
-			--count;
-		devfs_get(de);
-		read_unlock(&parent->u.dir.lock);
-		/*  Now add all remaining entries  */
-		while (de) {
-			err = (*filldir) (dirent, de->name, de->namelen,
-					  file->f_pos, de->inode.ino,
-					  de->mode >> 12);
-			if (err < 0)
-				devfs_put(de);
-			else {
-				file->f_pos++;
-				++stored;
-			}
-			if (err == -EINVAL)
-				break;
-			if (err < 0)
-				return err;
-			read_lock(&parent->u.dir.lock);
-			next = devfs_get(de->next);
-			read_unlock(&parent->u.dir.lock);
-			devfs_put(de);
-			de = next;
-		}
-		break;
-	}
-	return stored;
-}				/*  End Function devfs_readdir  */
-
-/* Open devfs specific special files */
-static int devfs_open(struct inode *inode, struct file *file)
-{
-	int err;
-	int minor = MINOR(inode->i_rdev);
-	struct file_operations *old_fops, *new_fops;
-
-	switch (minor) {
-	case 0:		/* /dev/.devfsd */
-		new_fops = fops_get(&devfsd_fops);
-		break;
-#ifdef CONFIG_DEVFS_DEBUG
-	case 1:		/* /dev/.stat */
-		new_fops = fops_get(&stat_fops);
-		break;
-#endif
-	default:
-		return -ENODEV;
-	}
-
-	if (new_fops == NULL)
-		return -ENODEV;
-	old_fops = file->f_op;
-	file->f_op = new_fops;
-	err = new_fops->open ? new_fops->open(inode, file) : 0;
-	if (err) {
-		file->f_op = old_fops;
-		fops_put(new_fops);
-	} else
-		fops_put(old_fops);
-	return err;
-}				/*  End Function devfs_open  */
-
-static const struct file_operations devfs_fops = {
-	.open = devfs_open,
-};
-
-static const struct file_operations devfs_dir_fops = {
-	.read = generic_read_dir,
-	.readdir = devfs_readdir,
-};
-
-/*  Dentry operations for device entries follow  */
-
-/**
- *	devfs_d_release - Callback for when a dentry is freed.
- *	@dentry: The dentry.
- */
-
-static void devfs_d_release(struct dentry *dentry)
-{
-	DPRINTK(DEBUG_D_RELEASE, "(%p): inode: %p\n", dentry, dentry->d_inode);
-}				/*  End Function devfs_d_release  */
-
-/**
- *	devfs_d_iput - Callback for when a dentry loses its inode.
- *	@dentry: The dentry.
- *	@inode:	The inode.
- */
-
-static void devfs_d_iput(struct dentry *dentry, struct inode *inode)
-{
-	struct devfs_entry *de;
-
-	de = get_devfs_entry_from_vfs_inode(inode);
-	DPRINTK(DEBUG_D_IPUT,
-		"(%s): dentry: %p inode: %p de: %p de->dentry: %p\n", de->name,
-		dentry, inode, de, de->inode.dentry);
-	if (de->inode.dentry && (de->inode.dentry != dentry))
-		OOPS("(%s): de: %p dentry: %p de->dentry: %p\n",
-		     de->name, de, dentry, de->inode.dentry);
-	de->inode.dentry = NULL;
-	iput(inode);
-	devfs_put(de);
-}				/*  End Function devfs_d_iput  */
-
-static int devfs_d_delete(struct dentry *dentry);
-
-static struct dentry_operations devfs_dops = {
-	.d_delete = devfs_d_delete,
-	.d_release = devfs_d_release,
-	.d_iput = devfs_d_iput,
-};
-
-static int devfs_d_revalidate_wait(struct dentry *dentry, struct nameidata *);
-
-static struct dentry_operations devfs_wait_dops = {
-	.d_delete = devfs_d_delete,
-	.d_release = devfs_d_release,
-	.d_iput = devfs_d_iput,
-	.d_revalidate = devfs_d_revalidate_wait,
-};
-
-/**
- *	devfs_d_delete - Callback for when all files for a dentry are closed.
- *	@dentry: The dentry.
- */
-
-static int devfs_d_delete(struct dentry *dentry)
-{
-	struct inode *inode = dentry->d_inode;
-
-	if (dentry->d_op == &devfs_wait_dops)
-		dentry->d_op = &devfs_dops;
-	/*  Unhash dentry if negative (has no inode)  */
-	if (inode == NULL) {
-		DPRINTK(DEBUG_D_DELETE, "(%p): dropping negative dentry\n",
-			dentry);
-		return 1;
-	}
-	return 0;
-}				/*  End Function devfs_d_delete  */
-
-struct devfs_lookup_struct {
-	devfs_handle_t de;
-	wait_queue_head_t wait_queue;
-};
-
-/* XXX: this doesn't handle the case where we got a negative dentry
-        but a devfs entry has been registered in the meanwhile */
-static int devfs_d_revalidate_wait(struct dentry *dentry, struct nameidata *nd)
-{
-	struct inode *dir = dentry->d_parent->d_inode;
-	struct fs_info *fs_info = dir->i_sb->s_fs_info;
-	devfs_handle_t parent = get_devfs_entry_from_vfs_inode(dir);
-	struct devfs_lookup_struct *lookup_info = dentry->d_fsdata;
-	DECLARE_WAITQUEUE(wait, current);
-	int need_lock;
-
-	/*
-	 * FIXME HACK
-	 *
-	 * make sure that
-	 *   d_instantiate always runs under lock
-	 *   we release i_mutex lock before going to sleep
-	 *
-	 * unfortunately sometimes d_revalidate is called with
-	 * and sometimes without i_mutex lock held. The following checks
-	 * attempt to deduce when we need to add (and drop resp.) lock
-	 * here. This relies on current (2.6.2) calling coventions:
-	 *
-	 *   lookup_hash is always run under i_mutex and is passing NULL
-	 *   as nd
-	 *
-	 *   open(...,O_CREATE,...) calls _lookup_hash under i_mutex
-	 *   and sets flags to LOOKUP_OPEN|LOOKUP_CREATE
-	 *
-	 *   all other invocations of ->d_revalidate seem to happen
-	 *   outside of i_mutex
-	 */
-	need_lock = nd &&
-	    (!(nd->flags & LOOKUP_CREATE) || (nd->flags & LOOKUP_PARENT));
-
-	if (need_lock)
-		mutex_lock(&dir->i_mutex);
-
-	if (is_devfsd_or_child(fs_info)) {
-		devfs_handle_t de = lookup_info->de;
-		struct inode *inode;
-
-		DPRINTK(DEBUG_I_LOOKUP,
-			"(%s): dentry: %p inode: %p de: %p by: \"%s\"\n",
-			dentry->d_name.name, dentry, dentry->d_inode, de,
-			current->comm);
-		if (dentry->d_inode)
-			goto out;
-		if (de == NULL) {
-			read_lock(&parent->u.dir.lock);
-			de = _devfs_search_dir(parent, dentry->d_name.name,
-					       dentry->d_name.len);
-			read_unlock(&parent->u.dir.lock);
-			if (de == NULL)
-				goto out;
-			lookup_info->de = de;
-		}
-		/*  Create an inode, now that the driver information is available  */
-		inode = _devfs_get_vfs_inode(dir->i_sb, de, dentry);
-		if (!inode)
-			goto out;
-		DPRINTK(DEBUG_I_LOOKUP,
-			"(%s): new VFS inode(%u): %p de: %p by: \"%s\"\n",
-			de->name, de->inode.ino, inode, de, current->comm);
-		d_instantiate(dentry, inode);
-		goto out;
-	}
-	if (lookup_info == NULL)
-		goto out;	/*  Early termination  */
-	read_lock(&parent->u.dir.lock);
-	if (dentry->d_fsdata) {
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		add_wait_queue(&lookup_info->wait_queue, &wait);
-		read_unlock(&parent->u.dir.lock);
-		/* at this point it is always (hopefully) locked */
-		mutex_unlock(&dir->i_mutex);
-		schedule();
-		mutex_lock(&dir->i_mutex);
-		/*
-		 * This does not need nor should remove wait from wait_queue.
-		 * Wait queue head is never reused - nothing is ever added to it
-		 * after all waiters have been waked up and head itself disappears
-		 * very soon after it. Moreover it is local variable on stack that
-		 * is likely to have already disappeared so any reference to it
-		 * at this point is buggy.
-		 */
-
-	} else
-		read_unlock(&parent->u.dir.lock);
-
-      out:
-	if (need_lock)
-		mutex_unlock(&dir->i_mutex);
-	return 1;
-}				/*  End Function devfs_d_revalidate_wait  */
-
-/*  Inode operations for device entries follow  */
-
-static struct dentry *devfs_lookup(struct inode *dir, struct dentry *dentry,
-				   struct nameidata *nd)
-{
-	struct devfs_entry tmp;	/*  Must stay in scope until devfsd idle again  */
-	struct devfs_lookup_struct lookup_info;
-	struct fs_info *fs_info = dir->i_sb->s_fs_info;
-	struct devfs_entry *parent, *de;
-	struct inode *inode;
-	struct dentry *retval = NULL;
-
-	/*  Set up the dentry operations before anything else, to ensure cleaning
-	   up on any error  */
-	dentry->d_op = &devfs_dops;
-	/*  First try to get the devfs entry for this directory  */
-	parent = get_devfs_entry_from_vfs_inode(dir);
-	DPRINTK(DEBUG_I_LOOKUP, "(%s): dentry: %p parent: %p by: \"%s\"\n",
-		dentry->d_name.name, dentry, parent, current->comm);
-	if (parent == NULL)
-		return ERR_PTR(-ENOENT);
-	read_lock(&parent->u.dir.lock);
-	de = _devfs_search_dir(parent, dentry->d_name.name, dentry->d_name.len);
-	read_unlock(&parent->u.dir.lock);
-	lookup_info.de = de;
-	init_waitqueue_head(&lookup_info.wait_queue);
-	dentry->d_fsdata = &lookup_info;
-	if (de == NULL) {	/*  Try with devfsd. For any kind of failure, leave a negative dentry
-				   so someone else can deal with it (in the case where the sysadmin
-				   does a mknod()). It's important to do this before hashing the
-				   dentry, so that the devfsd queue is filled before revalidates
-				   can start  */
-		if (try_modload(parent, fs_info, dentry->d_name.name, dentry->d_name.len, &tmp) < 0) {	/*  Lookup event was not queued to devfsd  */
-			d_add(dentry, NULL);
-			return NULL;
-		}
-	}
-	dentry->d_op = &devfs_wait_dops;
-	d_add(dentry, NULL);	/*  Open the floodgates  */
-	/*  Unlock directory semaphore, which will release any waiters. They
-	   will get the hashed dentry, and may be forced to wait for
-	   revalidation  */
-	mutex_unlock(&dir->i_mutex);
-	wait_for_devfsd_finished(fs_info);	/*  If I'm not devfsd, must wait  */
-	mutex_lock(&dir->i_mutex);	/*  Grab it again because them's the rules  */
-	de = lookup_info.de;
-	/*  If someone else has been so kind as to make the inode, we go home
-	   early  */
-	if (dentry->d_inode)
-		goto out;
-	if (de == NULL) {
-		read_lock(&parent->u.dir.lock);
-		de = _devfs_search_dir(parent, dentry->d_name.name,
-				       dentry->d_name.len);
-		read_unlock(&parent->u.dir.lock);
-		if (de == NULL)
-			goto out;
-		/*  OK, there's an entry now, but no VFS inode yet  */
-	}
-	/*  Create an inode, now that the driver information is available  */
-	inode = _devfs_get_vfs_inode(dir->i_sb, de, dentry);
-	if (!inode) {
-		retval = ERR_PTR(-ENOMEM);
-		goto out;
-	}
-	DPRINTK(DEBUG_I_LOOKUP,
-		"(%s): new VFS inode(%u): %p de: %p by: \"%s\"\n", de->name,
-		de->inode.ino, inode, de, current->comm);
-	d_instantiate(dentry, inode);
-      out:
-	write_lock(&parent->u.dir.lock);
-	dentry->d_op = &devfs_dops;
-	dentry->d_fsdata = NULL;
-	wake_up(&lookup_info.wait_queue);
-	write_unlock(&parent->u.dir.lock);
-	devfs_put(de);
-	return retval;
-}				/*  End Function devfs_lookup  */
-
-static int devfs_unlink(struct inode *dir, struct dentry *dentry)
-{
-	int unhooked;
-	struct devfs_entry *de;
-	struct inode *inode = dentry->d_inode;
-	struct fs_info *fs_info = dir->i_sb->s_fs_info;
-
-	de = get_devfs_entry_from_vfs_inode(inode);
-	DPRINTK(DEBUG_I_UNLINK, "(%s): de: %p\n", dentry->d_name.name, de);
-	if (de == NULL)
-		return -ENOENT;
-	if (!de->vfs)
-		return -EPERM;
-	write_lock(&de->parent->u.dir.lock);
-	unhooked = _devfs_unhook(de);
-	write_unlock(&de->parent->u.dir.lock);
-	if (!unhooked)
-		return -ENOENT;
-	if (!is_devfsd_or_child(fs_info))
-		devfsd_notify_de(de, DEVFSD_NOTIFY_DELETE, inode->i_mode,
-				 inode->i_uid, inode->i_gid, fs_info);
-	free_dentry(de);
-	devfs_put(de);
-	return 0;
-}				/*  End Function devfs_unlink  */
-
-static int devfs_symlink(struct inode *dir, struct dentry *dentry,
-			 const char *symname)
-{
-	int err;
-	struct fs_info *fs_info = dir->i_sb->s_fs_info;
-	struct devfs_entry *parent, *de;
-	struct inode *inode;
-
-	/*  First try to get the devfs entry for this directory  */
-	parent = get_devfs_entry_from_vfs_inode(dir);
-	if (parent == NULL)
-		return -ENOENT;
-	err = devfs_do_symlink(parent, dentry->d_name.name, symname, &de);
-	DPRINTK(DEBUG_DISABLED, "(%s): errcode from <devfs_do_symlink>: %d\n",
-		dentry->d_name.name, err);
-	if (err < 0)
-		return err;
-	de->vfs = TRUE;
-	de->inode.uid = current->euid;
-	de->inode.gid = current->egid;
-	de->inode.atime = CURRENT_TIME;
-	de->inode.mtime = CURRENT_TIME;
-	de->inode.ctime = CURRENT_TIME;
-	if ((inode = _devfs_get_vfs_inode(dir->i_sb, de, dentry)) == NULL)
-		return -ENOMEM;
-	DPRINTK(DEBUG_DISABLED, "(%s): new VFS inode(%u): %p  dentry: %p\n",
-		dentry->d_name.name, de->inode.ino, inode, dentry);
-	d_instantiate(dentry, inode);
-	if (!is_devfsd_or_child(fs_info))
-		devfsd_notify_de(de, DEVFSD_NOTIFY_CREATE, inode->i_mode,
-				 inode->i_uid, inode->i_gid, fs_info);
-	return 0;
-}				/*  End Function devfs_symlink  */
-
-static int devfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
-	int err;
-	struct fs_info *fs_info = dir->i_sb->s_fs_info;
-	struct devfs_entry *parent, *de;
-	struct inode *inode;
-
-	mode = (mode & ~S_IFMT) | S_IFDIR;	/*  VFS doesn't pass S_IFMT part  */
-	parent = get_devfs_entry_from_vfs_inode(dir);
-	if (parent == NULL)
-		return -ENOENT;
-	de = _devfs_alloc_entry(dentry->d_name.name, dentry->d_name.len, mode);
-	if (!de)
-		return -ENOMEM;
-	de->vfs = TRUE;
-	if ((err = _devfs_append_entry(parent, de, NULL)) != 0)
-		return err;
-	de->inode.uid = current->euid;
-	de->inode.gid = current->egid;
-	de->inode.atime = CURRENT_TIME;
-	de->inode.mtime = CURRENT_TIME;
-	de->inode.ctime = CURRENT_TIME;
-	if ((inode = _devfs_get_vfs_inode(dir->i_sb, de, dentry)) == NULL)
-		return -ENOMEM;
-	DPRINTK(DEBUG_DISABLED, "(%s): new VFS inode(%u): %p  dentry: %p\n",
-		dentry->d_name.name, de->inode.ino, inode, dentry);
-	d_instantiate(dentry, inode);
-	if (!is_devfsd_or_child(fs_info))
-		devfsd_notify_de(de, DEVFSD_NOTIFY_CREATE, inode->i_mode,
-				 inode->i_uid, inode->i_gid, fs_info);
-	return 0;
-}				/*  End Function devfs_mkdir  */
-
-static int devfs_rmdir(struct inode *dir, struct dentry *dentry)
-{
-	int err = 0;
-	struct devfs_entry *de;
-	struct fs_info *fs_info = dir->i_sb->s_fs_info;
-	struct inode *inode = dentry->d_inode;
-
-	if (dir->i_sb->s_fs_info != inode->i_sb->s_fs_info)
-		return -EINVAL;
-	de = get_devfs_entry_from_vfs_inode(inode);
-	if (de == NULL)
-		return -ENOENT;
-	if (!S_ISDIR(de->mode))
-		return -ENOTDIR;
-	if (!de->vfs)
-		return -EPERM;
-	/*  First ensure the directory is empty and will stay that way  */
-	write_lock(&de->u.dir.lock);
-	if (de->u.dir.first)
-		err = -ENOTEMPTY;
-	else
-		de->u.dir.no_more_additions = TRUE;
-	write_unlock(&de->u.dir.lock);
-	if (err)
-		return err;
-	/*  Now unhook the directory from its parent  */
-	write_lock(&de->parent->u.dir.lock);
-	if (!_devfs_unhook(de))
-		err = -ENOENT;
-	write_unlock(&de->parent->u.dir.lock);
-	if (err)
-		return err;
-	if (!is_devfsd_or_child(fs_info))
-		devfsd_notify_de(de, DEVFSD_NOTIFY_DELETE, inode->i_mode,
-				 inode->i_uid, inode->i_gid, fs_info);
-	free_dentry(de);
-	devfs_put(de);
-	return 0;
-}				/*  End Function devfs_rmdir  */
-
-static int devfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
-		       dev_t rdev)
-{
-	int err;
-	struct fs_info *fs_info = dir->i_sb->s_fs_info;
-	struct devfs_entry *parent, *de;
-	struct inode *inode;
-
-	DPRINTK(DEBUG_I_MKNOD, "(%s): mode: 0%o  dev: %u:%u\n",
-		dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev));
-	parent = get_devfs_entry_from_vfs_inode(dir);
-	if (parent == NULL)
-		return -ENOENT;
-	de = _devfs_alloc_entry(dentry->d_name.name, dentry->d_name.len, mode);
-	if (!de)
-		return -ENOMEM;
-	de->vfs = TRUE;
-	if (S_ISCHR(mode) || S_ISBLK(mode))
-		de->u.dev = rdev;
-	if ((err = _devfs_append_entry(parent, de, NULL)) != 0)
-		return err;
-	de->inode.uid = current->euid;
-	de->inode.gid = current->egid;
-	de->inode.atime = CURRENT_TIME;
-	de->inode.mtime = CURRENT_TIME;
-	de->inode.ctime = CURRENT_TIME;
-	if ((inode = _devfs_get_vfs_inode(dir->i_sb, de, dentry)) == NULL)
-		return -ENOMEM;
-	DPRINTK(DEBUG_I_MKNOD, ":   new VFS inode(%u): %p  dentry: %p\n",
-		de->inode.ino, inode, dentry);
-	d_instantiate(dentry, inode);
-	if (!is_devfsd_or_child(fs_info))
-		devfsd_notify_de(de, DEVFSD_NOTIFY_CREATE, inode->i_mode,
-				 inode->i_uid, inode->i_gid, fs_info);
-	return 0;
-}				/*  End Function devfs_mknod  */
-
-static void *devfs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	struct devfs_entry *p = get_devfs_entry_from_vfs_inode(dentry->d_inode);
-	nd_set_link(nd, p ? p->u.symlink.linkname : ERR_PTR(-ENODEV));
-	return NULL;
-}				/*  End Function devfs_follow_link  */
-
-static struct inode_operations devfs_iops = {
-	.setattr = devfs_notify_change,
-};
-
-static struct inode_operations devfs_dir_iops = {
-	.lookup = devfs_lookup,
-	.unlink = devfs_unlink,
-	.symlink = devfs_symlink,
-	.mkdir = devfs_mkdir,
-	.rmdir = devfs_rmdir,
-	.mknod = devfs_mknod,
-	.setattr = devfs_notify_change,
-};
-
-static struct inode_operations devfs_symlink_iops = {
-	.readlink = generic_readlink,
-	.follow_link = devfs_follow_link,
-	.setattr = devfs_notify_change,
-};
-
-static int devfs_fill_super(struct super_block *sb, void *data, int silent)
-{
-	struct inode *root_inode = NULL;
-
-	if (_devfs_get_root_entry() == NULL)
-		goto out_no_root;
-	atomic_set(&fs_info.devfsd_overrun_count, 0);
-	init_waitqueue_head(&fs_info.devfsd_wait_queue);
-	init_waitqueue_head(&fs_info.revalidate_wait_queue);
-	fs_info.sb = sb;
-	sb->s_fs_info = &fs_info;
-	sb->s_blocksize = 1024;
-	sb->s_blocksize_bits = 10;
-	sb->s_magic = DEVFS_SUPER_MAGIC;
-	sb->s_op = &devfs_sops;
-	sb->s_time_gran = 1;
-	if ((root_inode = _devfs_get_vfs_inode(sb, root_entry, NULL)) == NULL)
-		goto out_no_root;
-	sb->s_root = d_alloc_root(root_inode);
-	if (!sb->s_root)
-		goto out_no_root;
-	DPRINTK(DEBUG_S_READ, "(): made devfs ptr: %p\n", sb->s_fs_info);
-	return 0;
-
-      out_no_root:
-	PRINTK("(): get root inode failed\n");
-	if (root_inode)
-		iput(root_inode);
-	return -EINVAL;
-}				/*  End Function devfs_fill_super  */
-
-static int devfs_get_sb(struct file_system_type *fs_type,
-			int flags, const char *dev_name,
-			void *data, struct vfsmount *mnt)
-{
-	return get_sb_single(fs_type, flags, data, devfs_fill_super, mnt);
-}
-
-static struct file_system_type devfs_fs_type = {
-	.name = DEVFS_NAME,
-	.get_sb = devfs_get_sb,
-	.kill_sb = kill_anon_super,
-};
-
-/*  File operations for devfsd follow  */
-
-static ssize_t devfsd_read(struct file *file, char __user *buf, size_t len,
-			   loff_t * ppos)
-{
-	int done = FALSE;
-	int ival;
-	loff_t pos, devname_offset, tlen, rpos;
-	devfs_handle_t de;
-	struct devfsd_buf_entry *entry;
-	struct fs_info *fs_info = file->f_dentry->d_inode->i_sb->s_fs_info;
-	struct devfsd_notify_struct *info = fs_info->devfsd_info;
-	DECLARE_WAITQUEUE(wait, current);
-
-	/*  Verify the task has grabbed the queue  */
-	if (fs_info->devfsd_task != current)
-		return -EPERM;
-	info->major = 0;
-	info->minor = 0;
-	/*  Block for a new entry  */
-	set_current_state(TASK_INTERRUPTIBLE);
-	add_wait_queue(&fs_info->devfsd_wait_queue, &wait);
-	while (devfsd_queue_empty(fs_info)) {
-		fs_info->devfsd_sleeping = TRUE;
-		wake_up(&fs_info->revalidate_wait_queue);
-		schedule();
-		fs_info->devfsd_sleeping = FALSE;
-		if (signal_pending(current)) {
-			remove_wait_queue(&fs_info->devfsd_wait_queue, &wait);
-			__set_current_state(TASK_RUNNING);
-			return -EINTR;
-		}
-		set_current_state(TASK_INTERRUPTIBLE);
-	}
-	remove_wait_queue(&fs_info->devfsd_wait_queue, &wait);
-	__set_current_state(TASK_RUNNING);
-	/*  Now play with the data  */
-	ival = atomic_read(&fs_info->devfsd_overrun_count);
-	info->overrun_count = ival;
-	entry = fs_info->devfsd_first_event;
-	info->type = entry->type;
-	info->mode = entry->mode;
-	info->uid = entry->uid;
-	info->gid = entry->gid;
-	de = entry->de;
-	if (S_ISCHR(de->mode) || S_ISBLK(de->mode)) {
-		info->major = MAJOR(de->u.dev);
-		info->minor = MINOR(de->u.dev);
-	}
-	pos = devfs_generate_path(de, info->devname, DEVFS_PATHLEN);
-	if (pos < 0)
-		return pos;
-	info->namelen = DEVFS_PATHLEN - pos - 1;
-	if (info->mode == 0)
-		info->mode = de->mode;
-	devname_offset = info->devname - (char *)info;
-	rpos = *ppos;
-	if (rpos < devname_offset) {
-		/*  Copy parts of the header  */
-		tlen = devname_offset - rpos;
-		if (tlen > len)
-			tlen = len;
-		if (copy_to_user(buf, (char *)info + rpos, tlen)) {
-			return -EFAULT;
-		}
-		rpos += tlen;
-		buf += tlen;
-		len -= tlen;
-	}
-	if ((rpos >= devname_offset) && (len > 0)) {
-		/*  Copy the name  */
-		tlen = info->namelen + 1;
-		if (tlen > len)
-			tlen = len;
-		else
-			done = TRUE;
-		if (copy_to_user
-		    (buf, info->devname + pos + rpos - devname_offset, tlen)) {
-			return -EFAULT;
-		}
-		rpos += tlen;
-	}
-	tlen = rpos - *ppos;
-	if (done) {
-		devfs_handle_t parent;
-
-		spin_lock(&fs_info->devfsd_buffer_lock);
-		fs_info->devfsd_first_event = entry->next;
-		if (entry->next == NULL)
-			fs_info->devfsd_last_event = NULL;
-		spin_unlock(&fs_info->devfsd_buffer_lock);
-		for (; de != NULL; de = parent) {
-			parent = de->parent;
-			devfs_put(de);
-		}
-		kmem_cache_free(devfsd_buf_cache, entry);
-		if (ival > 0)
-			atomic_sub(ival, &fs_info->devfsd_overrun_count);
-		*ppos = 0;
-	} else
-		*ppos = rpos;
-	return tlen;
-}				/*  End Function devfsd_read  */
-
-static int devfsd_ioctl(struct inode *inode, struct file *file,
-			unsigned int cmd, unsigned long arg)
-{
-	int ival;
-	struct fs_info *fs_info = inode->i_sb->s_fs_info;
-
-	switch (cmd) {
-	case DEVFSDIOC_GET_PROTO_REV:
-		ival = DEVFSD_PROTOCOL_REVISION_KERNEL;
-		if (copy_to_user((void __user *)arg, &ival, sizeof ival))
-			return -EFAULT;
-		break;
-	case DEVFSDIOC_SET_EVENT_MASK:
-		/*  Ensure only one reader has access to the queue. This scheme will
-		   work even if the global kernel lock were to be removed, because it
-		   doesn't matter who gets in first, as long as only one gets it  */
-		if (fs_info->devfsd_task == NULL) {
-			static DEFINE_SPINLOCK(lock);
-
-			if (!spin_trylock(&lock))
-				return -EBUSY;
-			if (fs_info->devfsd_task != NULL) {	/*  We lost the race...  */
-				spin_unlock(&lock);
-				return -EBUSY;
-			}
-			fs_info->devfsd_task = current;
-			spin_unlock(&lock);
-			fs_info->devfsd_pgrp =
-			    (process_group(current) ==
-			     current->pid) ? process_group(current) : 0;
-			fs_info->devfsd_file = file;
-			fs_info->devfsd_info =
-			    kmalloc(sizeof *fs_info->devfsd_info, GFP_KERNEL);
-			if (!fs_info->devfsd_info) {
-				devfsd_close(inode, file);
-				return -ENOMEM;
-			}
-		} else if (fs_info->devfsd_task != current)
-			return -EBUSY;
-		fs_info->devfsd_event_mask = arg;	/*  Let the masses come forth  */
-		break;
-	case DEVFSDIOC_RELEASE_EVENT_QUEUE:
-		if (fs_info->devfsd_file != file)
-			return -EPERM;
-		return devfsd_close(inode, file);
-		/*break; */
-#ifdef CONFIG_DEVFS_DEBUG
-	case DEVFSDIOC_SET_DEBUG_MASK:
-		if (copy_from_user(&ival, (void __user *)arg, sizeof ival))
-			return -EFAULT;
-		devfs_debug = ival;
-		break;
-#endif
-	default:
-		return -ENOIOCTLCMD;
-	}
-	return 0;
-}				/*  End Function devfsd_ioctl  */
-
-static int devfsd_close(struct inode *inode, struct file *file)
-{
-	struct devfsd_buf_entry *entry, *next;
-	struct fs_info *fs_info = inode->i_sb->s_fs_info;
-
-	if (fs_info->devfsd_file != file)
-		return 0;
-	fs_info->devfsd_event_mask = 0;
-	fs_info->devfsd_file = NULL;
-	spin_lock(&fs_info->devfsd_buffer_lock);
-	entry = fs_info->devfsd_first_event;
-	fs_info->devfsd_first_event = NULL;
-	fs_info->devfsd_last_event = NULL;
-	kfree(fs_info->devfsd_info);
-	fs_info->devfsd_info = NULL;
-	spin_unlock(&fs_info->devfsd_buffer_lock);
-	fs_info->devfsd_pgrp = 0;
-	fs_info->devfsd_task = NULL;
-	wake_up(&fs_info->revalidate_wait_queue);
-	for (; entry; entry = next) {
-		next = entry->next;
-		kmem_cache_free(devfsd_buf_cache, entry);
-	}
-	return 0;
-}				/*  End Function devfsd_close  */
-
-#ifdef CONFIG_DEVFS_DEBUG
-static ssize_t stat_read(struct file *file, char __user *buf, size_t len,
-			 loff_t * ppos)
-{
-	ssize_t num;
-	char txt[80];
-
-	num = sprintf(txt, "Number of entries: %u  number of bytes: %u\n",
-		      stat_num_entries, stat_num_bytes) + 1;
-	if (*ppos >= num)
-		return 0;
-	if (*ppos + len > num)
-		len = num - *ppos;
-	if (copy_to_user(buf, txt + *ppos, len))
-		return -EFAULT;
-	*ppos += len;
-	return len;
-}				/*  End Function stat_read  */
-#endif
-
-static int __init init_devfs_fs(void)
-{
-	int err;
-	int major;
-	struct devfs_entry *devfsd;
-#ifdef CONFIG_DEVFS_DEBUG
-	struct devfs_entry *stat;
-#endif
-
-	if (_devfs_get_root_entry() == NULL)
-		return -ENOMEM;
-
-	printk(KERN_INFO "%s: %s Richard Gooch (rgooch@atnf.csiro.au)\n",
-	       DEVFS_NAME, DEVFS_VERSION);
-	devfsd_buf_cache = kmem_cache_create("devfsd_event",
-					     sizeof(struct devfsd_buf_entry),
-					     0, 0, NULL, NULL);
-	if (!devfsd_buf_cache)
-		OOPS("(): unable to allocate event slab\n");
-#ifdef CONFIG_DEVFS_DEBUG
-	devfs_debug = devfs_debug_init;
-	printk(KERN_INFO "%s: devfs_debug: 0x%0x\n", DEVFS_NAME, devfs_debug);
-#endif
-	printk(KERN_INFO "%s: boot_options: 0x%0x\n", DEVFS_NAME, boot_options);
-
-	/* register special device for devfsd communication */
-	major = register_chrdev(0, "devfs", &devfs_fops);
-	if (major < 0)
-		return major;
-
-	/*  And create the entry for ".devfsd"  */
-	devfsd = _devfs_alloc_entry(".devfsd", 0, S_IFCHR | S_IRUSR | S_IWUSR);
-	if (devfsd == NULL)
-		return -ENOMEM;
-	devfsd->u.dev = MKDEV(major, 0);
-	_devfs_append_entry(root_entry, devfsd, NULL);
-
-#ifdef CONFIG_DEVFS_DEBUG
-	stat = _devfs_alloc_entry(".stat", 0, S_IFCHR | S_IRUGO);
-	if (stat == NULL)
-		return -ENOMEM;
-	stat->u.dev = MKDEV(major, 1);
-	_devfs_append_entry(root_entry, stat, NULL);
-#endif
-
-	err = register_filesystem(&devfs_fs_type);
-	return err;
-}				/*  End Function init_devfs_fs  */
-
-void __init mount_devfs_fs(void)
-{
-	int err;
-
-	if (!(boot_options & OPTION_MOUNT))
-		return;
-	err = do_mount("none", "/dev", "devfs", 0, NULL);
-	if (err == 0)
-		printk(KERN_INFO "Mounted devfs on /dev\n");
-	else
-		PRINTK("(): unable to mount devfs, err: %d\n", err);
-}				/*  End Function mount_devfs_fs  */
-
-module_init(init_devfs_fs)
diff --git a/fs/devfs/util.c b/fs/devfs/util.c
deleted file mode 100644
index db06d388c9a..00000000000
--- a/fs/devfs/util.c
+++ /dev/null
@@ -1,97 +0,0 @@
-/*  devfs (Device FileSystem) utilities.
-
-    Copyright (C) 1999-2002  Richard Gooch
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Library General Public
-    License as published by the Free Software Foundation; either
-    version 2 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Library General Public License for more details.
-
-    You should have received a copy of the GNU Library General Public
-    License along with this library; if not, write to the Free
-    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-    Richard Gooch may be reached by email at  rgooch@atnf.csiro.au
-    The postal address is:
-      Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia.
-
-    ChangeLog
-
-    19991031   Richard Gooch <rgooch@atnf.csiro.au>
-               Created.
-    19991103   Richard Gooch <rgooch@atnf.csiro.au>
-               Created <_devfs_convert_name> and supported SCSI and IDE CD-ROMs
-    20000203   Richard Gooch <rgooch@atnf.csiro.au>
-               Changed operations pointer type to void *.
-    20000621   Richard Gooch <rgooch@atnf.csiro.au>
-               Changed interface to <devfs_register_series>.
-    20000622   Richard Gooch <rgooch@atnf.csiro.au>
-               Took account of interface change to <devfs_mk_symlink>.
-               Took account of interface change to <devfs_mk_dir>.
-    20010519   Richard Gooch <rgooch@atnf.csiro.au>
-               Documentation cleanup.
-    20010709   Richard Gooch <rgooch@atnf.csiro.au>
-               Created <devfs_*alloc_major> and <devfs_*alloc_devnum>.
-    20010710   Richard Gooch <rgooch@atnf.csiro.au>
-               Created <devfs_*alloc_unique_number>.
-    20010730   Richard Gooch <rgooch@atnf.csiro.au>
-               Documentation typo fix.
-    20010806   Richard Gooch <rgooch@atnf.csiro.au>
-               Made <block_semaphore> and <char_semaphore> private.
-    20010813   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed bug in <devfs_alloc_unique_number>: limited to 128 numbers
-    20010818   Richard Gooch <rgooch@atnf.csiro.au>
-               Updated major masks up to Linus' "no new majors" proclamation.
-	       Block: were 126 now 122 free, char: were 26 now 19 free.
-    20020324   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed bug in <devfs_alloc_unique_number>: was clearing beyond
-	       bitfield.
-    20020326   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed bitfield data type for <devfs_*alloc_devnum>.
-               Made major bitfield type and initialiser 64 bit safe.
-    20020413   Richard Gooch <rgooch@atnf.csiro.au>
-               Fixed shift warning on 64 bit machines.
-    20020428   Richard Gooch <rgooch@atnf.csiro.au>
-               Copied and used macro for error messages from fs/devfs/base.c 
-    20021013   Richard Gooch <rgooch@atnf.csiro.au>
-               Documentation fix.
-    20030101   Adam J. Richter <adam@yggdrasil.com>
-               Eliminate DEVFS_SPECIAL_{CHR,BLK}.  Use mode_t instead.
-    20030106   Christoph Hellwig <hch@infradead.org>
-               Rewrite devfs_{,de}alloc_devnum to look like C code.
-*/
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/devfs_fs_kernel.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/genhd.h>
-#include <linux/bitops.h>
-
-int devfs_register_tape(const char *name)
-{
-	char tname[32], dest[64];
-	static unsigned int tape_counter;
-	unsigned int n = tape_counter++;
-
-	sprintf(dest, "../%s", name);
-	sprintf(tname, "tapes/tape%u", n);
-	devfs_mk_symlink(tname, dest);
-
-	return n;
-}
-
-EXPORT_SYMBOL(devfs_register_tape);
-
-void devfs_unregister_tape(int num)
-{
-	if (num >= 0)
-		devfs_remove("tapes/tape%u", num);
-}
-
-EXPORT_SYMBOL(devfs_unregister_tape);
-- 
cgit v1.2.3


From a29641883f57f36424e3219ae9ff48dd6cd34de0 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 20 Jun 2005 21:15:16 -0700
Subject: [PATCH] devfs: Remove devfs from the partition code

This patch removes the devfs code from the fs/partitions/ directory.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/partitions/Makefile |   1 -
 fs/partitions/check.c  |  26 ++--------
 fs/partitions/devfs.c  | 130 -------------------------------------------------
 fs/partitions/devfs.h  |  10 ----
 4 files changed, 5 insertions(+), 162 deletions(-)
 delete mode 100644 fs/partitions/devfs.c
 delete mode 100644 fs/partitions/devfs.h

(limited to 'fs')

diff --git a/fs/partitions/Makefile b/fs/partitions/Makefile
index 42c7d3878ed..d713ce6b3e1 100644
--- a/fs/partitions/Makefile
+++ b/fs/partitions/Makefile
@@ -4,7 +4,6 @@
 
 obj-y := check.o
 
-obj-$(CONFIG_DEVFS_FS) += devfs.o
 obj-$(CONFIG_ACORN_PARTITION) += acorn.o
 obj-$(CONFIG_AMIGA_PARTITION) += amiga.o
 obj-$(CONFIG_ATARI_PARTITION) += atari.o
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 2ef313a96b6..2ab7701eb2f 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -21,7 +21,6 @@
 #include <linux/devfs_fs_kernel.h>
 
 #include "check.h"
-#include "devfs.h"
 
 #include "acorn.h"
 #include "amiga.h"
@@ -161,18 +160,11 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
 	if (!state)
 		return NULL;
 
-#ifdef CONFIG_DEVFS_FS
-	if (hd->devfs_name[0] != '\0') {
-		printk(KERN_INFO " /dev/%s:", hd->devfs_name);
+	disk_name(hd, 0, state->name);
+	printk(KERN_INFO " %s:", state->name);
+	if (isdigit(state->name[strlen(state->name)-1]))
 		sprintf(state->name, "p");
-	}
-#endif
-	else {
-		disk_name(hd, 0, state->name);
-		printk(KERN_INFO " %s:", state->name);
-		if (isdigit(state->name[strlen(state->name)-1]))
-			sprintf(state->name, "p");
-	}
+
 	state->limit = hd->minors;
 	i = res = 0;
 	while (!res && check_part[i]) {
@@ -423,14 +415,8 @@ void register_disk(struct gendisk *disk)
  	disk_sysfs_add_subdirs(disk);
 
 	/* No minors to use for partitions */
-	if (disk->minors == 1) {
-		if (disk->devfs_name[0] != '\0')
-			devfs_add_disk(disk);
+	if (disk->minors == 1)
 		goto exit;
-	}
-
-	/* always add handle for the whole disk */
-	devfs_add_partitioned(disk);
 
 	/* No such device (e.g., media were just removed) */
 	if (!get_capacity(disk))
@@ -538,8 +524,6 @@ void del_gendisk(struct gendisk *disk)
 	disk_stat_set_all(disk, 0);
 	disk->stamp = 0;
 
-	devfs_remove_disk(disk);
-
 	kobject_uevent(&disk->kobj, KOBJ_REMOVE);
 	if (disk->holder_dir)
 		kobject_unregister(disk->holder_dir);
diff --git a/fs/partitions/devfs.c b/fs/partitions/devfs.c
deleted file mode 100644
index 3f0a780c9ce..00000000000
--- a/fs/partitions/devfs.c
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * This tries to keep block devices away from devfs as much as possible.
- */
-#include <linux/fs.h>
-#include <linux/devfs_fs_kernel.h>
-#include <linux/vmalloc.h>
-#include <linux/genhd.h>
-#include <linux/bitops.h>
-#include <linux/mutex.h>
-
-
-struct unique_numspace {
-	u32		  num_free;          /*  Num free in bits       */
-	u32		  length;            /*  Array length in bytes  */
-	unsigned long	  *bits;
-	struct semaphore  mutex;
-};
-
-static DEFINE_MUTEX(numspace_mutex);
-
-static int expand_numspace(struct unique_numspace *s)
-{
-	u32 length;
-	void *bits;
-
-	if (s->length < 16)
-		length = 16;
-	else
-		length = s->length << 1;
-
-	bits = vmalloc(length);
-	if (!bits)
-		return -ENOMEM;
-	if (s->bits) {
-		memcpy(bits, s->bits, s->length);
-		vfree(s->bits);
-	}
-		
-	s->num_free = (length - s->length) << 3;
-	s->bits = bits;
-	memset(bits + s->length, 0, length - s->length);
-	s->length = length;
-
-	return 0;
-}
-
-static int alloc_unique_number(struct unique_numspace *s)
-{
-	int rval = 0;
-
-	mutex_lock(&numspace_mutex);
-	if (s->num_free < 1)
-		rval = expand_numspace(s);
-	if (!rval) {
-		rval = find_first_zero_bit(s->bits, s->length << 3);
-		--s->num_free;
-		__set_bit(rval, s->bits);
-	}
-	mutex_unlock(&numspace_mutex);
-
-	return rval;
-}
-
-static void dealloc_unique_number(struct unique_numspace *s, int number)
-{
-	int old_val;
-
-	if (number >= 0) {
-		mutex_lock(&numspace_mutex);
-		old_val = __test_and_clear_bit(number, s->bits);
-		if (old_val)
-			++s->num_free;
-		mutex_unlock(&numspace_mutex);
-	}
-}
-
-static struct unique_numspace disc_numspace;
-static struct unique_numspace cdrom_numspace;
-
-void devfs_add_partitioned(struct gendisk *disk)
-{
-	char dirname[64], symlink[16];
-
-	devfs_mk_dir(disk->devfs_name);
-	devfs_mk_bdev(MKDEV(disk->major, disk->first_minor),
-			S_IFBLK|S_IRUSR|S_IWUSR,
-			"%s/disc", disk->devfs_name);
-
-	disk->number = alloc_unique_number(&disc_numspace);
-
-	sprintf(symlink, "discs/disc%d", disk->number);
-	sprintf(dirname, "../%s", disk->devfs_name);
-	devfs_mk_symlink(symlink, dirname);
-
-}
-
-void devfs_add_disk(struct gendisk *disk)
-{
-	devfs_mk_bdev(MKDEV(disk->major, disk->first_minor),
-			(disk->flags & GENHD_FL_CD) ?
-				S_IFBLK|S_IRUGO|S_IWUGO :
-				S_IFBLK|S_IRUSR|S_IWUSR,
-			"%s", disk->devfs_name);
-
-	if (disk->flags & GENHD_FL_CD) {
-		char dirname[64], symlink[16];
-
-		disk->number = alloc_unique_number(&cdrom_numspace);
-
-		sprintf(symlink, "cdroms/cdrom%d", disk->number);
-		sprintf(dirname, "../%s", disk->devfs_name);
-		devfs_mk_symlink(symlink, dirname);
-	}
-}
-
-void devfs_remove_disk(struct gendisk *disk)
-{
-	if (disk->minors != 1) {
-		devfs_remove("discs/disc%d", disk->number);
-		dealloc_unique_number(&disc_numspace, disk->number);
-		devfs_remove("%s/disc", disk->devfs_name);
-	}
-	if (disk->flags & GENHD_FL_CD) {
-		devfs_remove("cdroms/cdrom%d", disk->number);
-		dealloc_unique_number(&cdrom_numspace, disk->number);
-	}
-	devfs_remove(disk->devfs_name);
-}
-
-
diff --git a/fs/partitions/devfs.h b/fs/partitions/devfs.h
deleted file mode 100644
index 176118b4e49..00000000000
--- a/fs/partitions/devfs.h
+++ /dev/null
@@ -1,10 +0,0 @@
-
-#ifdef CONFIG_DEVFS_FS
-void devfs_add_disk(struct gendisk *dev);
-void devfs_add_partitioned(struct gendisk *dev);
-void devfs_remove_disk(struct gendisk *dev);
-#else
-# define devfs_add_disk(disk)			do { } while (0)
-# define devfs_add_partitioned(disk)		do { } while (0)
-# define devfs_remove_disk(disk)		do { } while (0)
-#endif
-- 
cgit v1.2.3


From 95dc112a5770dc670a1b45a3d9ee346fdd2b2697 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 20 Jun 2005 21:15:16 -0700
Subject: [PATCH] devfs: Remove devfs_mk_dir() function from the kernel tree

Removes the devfs_mk_dir() function and all callers of it.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/coda/psdev.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 7caee8d8ea3..a193823f905 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -365,7 +365,6 @@ static int init_coda_psdev(void)
 		err = PTR_ERR(coda_psdev_class);
 		goto out_chrdev;
 	}		
-	devfs_mk_dir ("coda");
 	for (i = 0; i < MAX_CODADEVS; i++) {
 		class_device_create(coda_psdev_class, NULL,
 				MKDEV(CODA_PSDEV_MAJOR,i), NULL, "cfs%d", i);
-- 
cgit v1.2.3


From 1a715c5cf917326a285533d1116d725f5f2593c2 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 20 Jun 2005 21:15:16 -0700
Subject: [PATCH] devfs: Remove devfs_mk_bdev() function from the kernel tree

Removes the devfs_mk_bdev() function and all callers of it.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/partitions/check.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 2ab7701eb2f..d7a5078b3d7 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -342,10 +342,6 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len)
 	p->nr_sects = len;
 	p->partno = part;
 
-	devfs_mk_bdev(MKDEV(disk->major, disk->first_minor + part),
-			S_IFBLK|S_IRUSR|S_IWUSR,
-			"%s/part%d", disk->devfs_name, part);
-
 	if (isdigit(disk->kobj.name[strlen(disk->kobj.name)-1]))
 		snprintf(p->kobj.name,KOBJ_NAME_LEN,"%sp%d",disk->kobj.name,part);
 	else
-- 
cgit v1.2.3


From 7c69ef79741910883d5543caafa06aca3ebadbd1 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 20 Jun 2005 21:15:16 -0700
Subject: [PATCH] devfs: Remove devfs_mk_cdev() function from the kernel tree

Removes the devfs_mk_cdev() function and all callers of it.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/coda/psdev.c | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index a193823f905..f478222cb5c 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -365,21 +365,12 @@ static int init_coda_psdev(void)
 		err = PTR_ERR(coda_psdev_class);
 		goto out_chrdev;
 	}		
-	for (i = 0; i < MAX_CODADEVS; i++) {
+	for (i = 0; i < MAX_CODADEVS; i++)
 		class_device_create(coda_psdev_class, NULL,
 				MKDEV(CODA_PSDEV_MAJOR,i), NULL, "cfs%d", i);
-		err = devfs_mk_cdev(MKDEV(CODA_PSDEV_MAJOR, i),
-				S_IFCHR|S_IRUSR|S_IWUSR, "coda/%d", i);
-		if (err)
-			goto out_class;
-	}
 	coda_sysctl_init();
 	goto out;
 
-out_class:
-	for (i = 0; i < MAX_CODADEVS; i++) 
-		class_device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i));
-	class_destroy(coda_psdev_class);
 out_chrdev:
 	unregister_chrdev(CODA_PSDEV_MAJOR, "coda");
 out:
-- 
cgit v1.2.3


From 8ab5e4c15b53e147c08031a959d9f776823dbe73 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 20 Jun 2005 21:15:16 -0700
Subject: [PATCH] devfs: Remove devfs_remove() function from the kernel tree

Removes the devfs_remove() function and all callers of it.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/coda/psdev.c       | 10 ++--------
 fs/partitions/check.c |  1 -
 2 files changed, 2 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index f478222cb5c..aaf6462a942 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -409,12 +409,9 @@ static int __init init_coda(void)
 	}
 	return 0;
 out:
-	for (i = 0; i < MAX_CODADEVS; i++) {
+	for (i = 0; i < MAX_CODADEVS; i++)
 		class_device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i));
-		devfs_remove("coda/%d", i);
-	}
 	class_destroy(coda_psdev_class);
-	devfs_remove("coda");
 	unregister_chrdev(CODA_PSDEV_MAJOR, "coda");
 	coda_sysctl_clean();
 out1:
@@ -431,12 +428,9 @@ static void __exit exit_coda(void)
         if ( err != 0 ) {
                 printk("coda: failed to unregister filesystem\n");
         }
-	for (i = 0; i < MAX_CODADEVS; i++) {
+	for (i = 0; i < MAX_CODADEVS; i++)
 		class_device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i));
-		devfs_remove("coda/%d", i);
-	}
 	class_destroy(coda_psdev_class);
-	devfs_remove("coda");
 	unregister_chrdev(CODA_PSDEV_MAJOR, "coda");
 	coda_sysctl_clean();
 	coda_destroy_inodecache();
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index d7a5078b3d7..202f50acea8 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -320,7 +320,6 @@ void delete_partition(struct gendisk *disk, int part)
 	p->nr_sects = 0;
 	p->ios[0] = p->ios[1] = 0;
 	p->sectors[0] = p->sectors[1] = 0;
-	devfs_remove("%s/part%d", disk->devfs_name, part);
 	sysfs_remove_link(&p->kobj, "subsystem");
 	if (p->holder_dir)
 		kobject_unregister(p->holder_dir);
-- 
cgit v1.2.3


From ff23eca3e8f613034e0d20ff86f6a89b62f5a14e Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 20 Jun 2005 21:15:16 -0700
Subject: [PATCH] devfs: Remove the devfs_fs_kernel.h file from the tree

Also fixes up all files that #include it.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/block_dev.c        | 1 -
 fs/char_dev.c         | 1 -
 fs/coda/psdev.c       | 1 -
 fs/partitions/check.c | 1 -
 4 files changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 028d9fb9c2d..ddb305eebf9 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -12,7 +12,6 @@
 #include <linux/slab.h>
 #include <linux/kmod.h>
 #include <linux/major.h>
-#include <linux/devfs_fs_kernel.h>
 #include <linux/smp_lock.h>
 #include <linux/highmem.h>
 #include <linux/blkdev.h>
diff --git a/fs/char_dev.c b/fs/char_dev.c
index f3418f7a6e9..97986635b64 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -14,7 +14,6 @@
 #include <linux/errno.h>
 #include <linux/module.h>
 #include <linux/smp_lock.h>
-#include <linux/devfs_fs_kernel.h>
 #include <linux/seq_file.h>
 
 #include <linux/kobject.h>
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index aaf6462a942..803aacf0d49 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -28,7 +28,6 @@
 #include <linux/delay.h>
 #include <linux/skbuff.h>
 #include <linux/proc_fs.h>
-#include <linux/devfs_fs_kernel.h>
 #include <linux/vmalloc.h>
 #include <linux/fs.h>
 #include <linux/file.h>
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 202f50acea8..839634026eb 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -18,7 +18,6 @@
 #include <linux/fs.h>
 #include <linux/kmod.h>
 #include <linux/ctype.h>
-#include <linux/devfs_fs_kernel.h>
 
 #include "check.h"
 
-- 
cgit v1.2.3


From 65c491d833a06fd0d1383297590772c75d28155c Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Mon, 6 Mar 2006 15:36:17 -0800
Subject: ocfs2: move lockres qstr next to hlist_node structure

Gains us a bit of performance on loads which heavily hit the lockres hash.
Patch suggested by Daniel Phillips <phillips@google.com>.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmcommon.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 88cc43df18f..1c05d485e01 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -216,6 +216,7 @@ struct dlm_lock_resource
 	/* WARNING: Please see the comment in dlm_init_lockres before
 	 * adding fields here. */
 	struct hlist_node hash_node;
+	struct qstr lockname;
 	struct kref      refs;
 
 	/* please keep these next 3 in this order
@@ -238,7 +239,6 @@ struct dlm_lock_resource
 	wait_queue_head_t wq;
 	u8  owner;              //node which owns the lock resource, or unknown
 	u16 state;
-	struct qstr lockname;
 	char lvb[DLM_LVB_LEN];
 };
 
-- 
cgit v1.2.3


From a3d3329159ea76bae0b3b8680691a1c3ecf5801f Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Thu, 9 Mar 2006 17:55:56 -0800
Subject: ocfs2: calculate lockid hash values outside of the spinlock

Fixes a performance bug - pointed out by Andrew.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmcommon.h   |  6 +++++-
 fs/ocfs2/dlm/dlmdomain.c   | 12 +++++-------
 fs/ocfs2/dlm/dlmmaster.c   | 26 ++++++++++++++++----------
 fs/ocfs2/dlm/dlmrecovery.c |  5 ++++-
 4 files changed, 30 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 1c05d485e01..3b675368762 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -39,6 +39,9 @@
 
 #define DLM_HASH_BUCKETS     (PAGE_SIZE / sizeof(struct hlist_head))
 
+/* Intended to make it easier for us to switch out hash functions */
+#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l)
+
 enum dlm_ast_type {
 	DLM_AST = 0,
 	DLM_BAST,
@@ -694,7 +697,8 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm,
 			  struct dlm_lock_resource *res);
 struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
 						const char *name,
-						unsigned int len);
+						unsigned int len,
+						unsigned int hash);
 struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
 					      const char *name,
 					      unsigned int len);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 8f3a9e3106f..a818fde2447 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -90,7 +90,6 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm,
 	assert_spin_locked(&dlm->spinlock);
 
 	q = &res->lockname;
-	q->hash = full_name_hash(q->name, q->len);
 	bucket = &(dlm->lockres_hash[q->hash % DLM_HASH_BUCKETS]);
 
 	/* get a reference for our hashtable */
@@ -100,10 +99,10 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm,
 }
 
 struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
-					 const char *name,
-					 unsigned int len)
+						const char *name,
+						unsigned int len,
+						unsigned int hash)
 {
-	unsigned int hash;
 	struct hlist_node *iter;
 	struct dlm_lock_resource *tmpres=NULL;
 	struct hlist_head *bucket;
@@ -112,8 +111,6 @@ struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
 
 	assert_spin_locked(&dlm->spinlock);
 
-	hash = full_name_hash(name, len);
-
 	bucket = &(dlm->lockres_hash[hash % DLM_HASH_BUCKETS]);
 
 	/* check for pre-existing lock */
@@ -135,9 +132,10 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
 				    unsigned int len)
 {
 	struct dlm_lock_resource *res;
+	unsigned int hash = dlm_lockid_hash(name, len);
 
 	spin_lock(&dlm->spinlock);
-	res = __dlm_lookup_lockres(dlm, name, len);
+	res = __dlm_lookup_lockres(dlm, name, len, hash);
 	spin_unlock(&dlm->spinlock);
 	return res;
 }
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 940be4c13b1..953aa8421be 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -603,7 +603,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
 	memcpy(qname, name, namelen);
 
 	res->lockname.len = namelen;
-	res->lockname.hash = full_name_hash(name, namelen);
+	res->lockname.hash = dlm_lockid_hash(name, namelen);
 
 	init_waitqueue_head(&res->wq);
 	spin_lock_init(&res->spinlock);
@@ -677,19 +677,20 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
 	int blocked = 0;
 	int ret, nodenum;
 	struct dlm_node_iter iter;
-	unsigned int namelen;
+	unsigned int namelen, hash;
 	int tries = 0;
 	int bit, wait_on_recovery = 0;
 
 	BUG_ON(!lockid);
 
 	namelen = strlen(lockid);
+	hash = dlm_lockid_hash(lockid, namelen);
 
 	mlog(0, "get lockres %s (len %d)\n", lockid, namelen);
 
 lookup:
 	spin_lock(&dlm->spinlock);
-	tmpres = __dlm_lookup_lockres(dlm, lockid, namelen);
+	tmpres = __dlm_lookup_lockres(dlm, lockid, namelen, hash);
 	if (tmpres) {
 		spin_unlock(&dlm->spinlock);
 		mlog(0, "found in hash!\n");
@@ -1316,7 +1317,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data)
 	struct dlm_master_request *request = (struct dlm_master_request *) msg->buf;
 	struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL;
 	char *name;
-	unsigned int namelen;
+	unsigned int namelen, hash;
 	int found, ret;
 	int set_maybe;
 	int dispatch_assert = 0;
@@ -1331,6 +1332,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data)
 
 	name = request->name;
 	namelen = request->namelen;
+	hash = dlm_lockid_hash(name, namelen);
 
 	if (namelen > DLM_LOCKID_NAME_MAX) {
 		response = DLM_IVBUFLEN;
@@ -1339,7 +1341,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data)
 
 way_up_top:
 	spin_lock(&dlm->spinlock);
-	res = __dlm_lookup_lockres(dlm, name, namelen);
+	res = __dlm_lookup_lockres(dlm, name, namelen, hash);
 	if (res) {
 		spin_unlock(&dlm->spinlock);
 
@@ -1612,7 +1614,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
 	struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf;
 	struct dlm_lock_resource *res = NULL;
 	char *name;
-	unsigned int namelen;
+	unsigned int namelen, hash;
 	u32 flags;
 	int master_request = 0;
 	int ret = 0;
@@ -1622,6 +1624,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
 
 	name = assert->name;
 	namelen = assert->namelen;
+	hash = dlm_lockid_hash(name, namelen);
 	flags = be32_to_cpu(assert->flags);
 
 	if (namelen > DLM_LOCKID_NAME_MAX) {
@@ -1670,7 +1673,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
 
 	/* ok everything checks out with the MLE
 	 * now check to see if there is a lockres */
-	res = __dlm_lookup_lockres(dlm, name, namelen);
+	res = __dlm_lookup_lockres(dlm, name, namelen, hash);
 	if (res) {
 		spin_lock(&res->spinlock);
 		if (res->state & DLM_LOCK_RES_RECOVERING)  {
@@ -2462,7 +2465,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
 	struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf;
 	struct dlm_master_list_entry *mle = NULL, *oldmle = NULL;
 	const char *name;
-	unsigned int namelen;
+	unsigned int namelen, hash;
 	int ret = 0;
 
 	if (!dlm_grab(dlm))
@@ -2470,6 +2473,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
 
 	name = migrate->name;
 	namelen = migrate->namelen;
+	hash = dlm_lockid_hash(name, namelen);
 
 	/* preallocate.. if this fails, abort */
 	mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
@@ -2482,7 +2486,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
 
 	/* check for pre-existing lock */
 	spin_lock(&dlm->spinlock);
-	res = __dlm_lookup_lockres(dlm, name, namelen);
+	res = __dlm_lookup_lockres(dlm, name, namelen, hash);
 	spin_lock(&dlm->master_lock);
 
 	if (res) {
@@ -2601,6 +2605,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
 	struct list_head *iter, *iter2;
 	struct dlm_master_list_entry *mle;
 	struct dlm_lock_resource *res;
+	unsigned int hash;
 
 	mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
 top:
@@ -2684,8 +2689,9 @@ top:
 		     mle->master, mle->new_master);
 		/* if there is a lockres associated with this
 	 	 * mle, find it and set its owner to UNKNOWN */
+		hash = dlm_lockid_hash(mle->u.name.name, mle->u.name.len);
 		res = __dlm_lookup_lockres(dlm, mle->u.name.name,
-					mle->u.name.len);
+					   mle->u.name.len, hash);
 		if (res) {
 			/* unfortunately if we hit this rare case, our
 		 	 * lock ordering is messed.  we need to drop
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 9962190e741..4f3d482a729 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1404,6 +1404,7 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data)
 	struct dlm_ctxt *dlm = data;
 	struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf;
 	struct dlm_lock_resource *res = NULL;
+	unsigned int hash;
 	int master = DLM_LOCK_RES_OWNER_UNKNOWN;
 	u32 flags = DLM_ASSERT_MASTER_REQUERY;
 
@@ -1413,8 +1414,10 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data)
 		return master;
 	}
 
+	hash = dlm_lockid_hash(req->name, req->namelen);
+
 	spin_lock(&dlm->spinlock);
-	res = __dlm_lookup_lockres(dlm, req->name, req->namelen);
+	res = __dlm_lookup_lockres(dlm, req->name, req->namelen, hash);
 	if (res) {
 		spin_lock(&res->spinlock);
 		master = res->owner;
-- 
cgit v1.2.3


From 4198985f7ae119a23f83503a692dd822bd574080 Mon Sep 17 00:00:00 2001
From: Daniel Phillips <phillips@google.com>
Date: Fri, 10 Mar 2006 13:31:47 -0800
Subject: [PATCH] Clean up ocfs2 hash probe and make it faster

Signed-Off-By: Daniel Phillips <phillips@google.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmdomain.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index a818fde2447..595b1c7cbd1 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -103,28 +103,27 @@ struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
 						unsigned int len,
 						unsigned int hash)
 {
-	struct hlist_node *iter;
-	struct dlm_lock_resource *tmpres=NULL;
 	struct hlist_head *bucket;
+	struct hlist_node *list;
 
 	mlog_entry("%.*s\n", len, name);
 
 	assert_spin_locked(&dlm->spinlock);
 
-	bucket = &(dlm->lockres_hash[hash % DLM_HASH_BUCKETS]);
-
-	/* check for pre-existing lock */
-	hlist_for_each(iter, bucket) {
-		tmpres = hlist_entry(iter, struct dlm_lock_resource, hash_node);
-		if (tmpres->lockname.len == len &&
-		    memcmp(tmpres->lockname.name, name, len) == 0) {
-			dlm_lockres_get(tmpres);
-			break;
-		}
-
-		tmpres = NULL;
+	bucket = dlm->lockres_hash + full_name_hash(name, len) % DLM_HASH_BUCKETS;
+	hlist_for_each(list, bucket) {
+		struct dlm_lock_resource *res = hlist_entry(list,
+			struct dlm_lock_resource, hash_node);
+		if (res->lockname.name[0] != name[0])
+			continue;
+		if (unlikely(res->lockname.len != len))
+			continue;
+		if (memcmp(res->lockname.name + 1, name + 1, len - 1))
+			continue;
+		dlm_lockres_get(res);
+		return res;
 	}
-	return tmpres;
+	return NULL;
 }
 
 struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
-- 
cgit v1.2.3


From 95c4f581d6551de55cf5b8693db98b01ce07021b Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Fri, 10 Mar 2006 13:44:00 -0800
Subject: ocfs2: inline dlm_lockres_get()

It's called on every lookup so this might help performance a bit.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmcommon.h | 7 ++++++-
 fs/ocfs2/dlm/dlmmaster.c | 5 -----
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 3b675368762..87612819c13 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -690,7 +690,12 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
 			    struct dlm_lock_resource *res);
 void dlm_purge_lockres(struct dlm_ctxt *dlm,
 		       struct dlm_lock_resource *lockres);
-void dlm_lockres_get(struct dlm_lock_resource *res);
+static inline void dlm_lockres_get(struct dlm_lock_resource *res)
+{
+	/* This is called on every lookup, so it might be worth
+	 * inlining. */
+	kref_get(&res->refs);
+}
 void dlm_lockres_put(struct dlm_lock_resource *res);
 void __dlm_unhash_lockres(struct dlm_lock_resource *res);
 void __dlm_insert_lockres(struct dlm_ctxt *dlm,
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 953aa8421be..f1fbf2f4e5d 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -579,11 +579,6 @@ static void dlm_lockres_release(struct kref *kref)
 	kfree(res);
 }
 
-void dlm_lockres_get(struct dlm_lock_resource *res)
-{
-	kref_get(&res->refs);
-}
-
 void dlm_lockres_put(struct dlm_lock_resource *res)
 {
 	kref_put(&res->refs, dlm_lockres_release);
-- 
cgit v1.2.3


From 03d864c02c3ea803b1718940ac6953a257182d7a Mon Sep 17 00:00:00 2001
From: Daniel Phillips <phillips@google.com>
Date: Fri, 10 Mar 2006 18:08:16 -0800
Subject: ocfs2: allocate lockres hash pages in an array

This allows us to have a hash table greater than a single page which greatly
improves dlm performance on some tests.

Signed-off-by: Daniel Phillips <phillips@google.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmcommon.h   | 12 ++++++++++--
 fs/ocfs2/dlm/dlmdebug.c    |  2 +-
 fs/ocfs2/dlm/dlmdomain.c   | 41 +++++++++++++++++++++++++++++++++--------
 fs/ocfs2/dlm/dlmrecovery.c |  4 ++--
 4 files changed, 46 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 87612819c13..0378ddbc8a8 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -37,7 +37,10 @@
 #define DLM_THREAD_SHUFFLE_INTERVAL    5     // flush everything every 5 passes
 #define DLM_THREAD_MS                  200   // flush at least every 200 ms
 
-#define DLM_HASH_BUCKETS     (PAGE_SIZE / sizeof(struct hlist_head))
+#define DLM_HASH_SIZE		(1 << 14)
+#define DLM_HASH_PAGES		(DLM_HASH_SIZE / PAGE_SIZE)
+#define DLM_BUCKETS_PER_PAGE	(PAGE_SIZE / sizeof(struct hlist_head))
+#define DLM_HASH_BUCKETS	(DLM_HASH_PAGES * DLM_BUCKETS_PER_PAGE)
 
 /* Intended to make it easier for us to switch out hash functions */
 #define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l)
@@ -88,7 +91,7 @@ enum dlm_ctxt_state {
 struct dlm_ctxt
 {
 	struct list_head list;
-	struct hlist_head *lockres_hash;
+	struct hlist_head **lockres_hash;
 	struct list_head dirty_list;
 	struct list_head purge_list;
 	struct list_head pending_asts;
@@ -135,6 +138,11 @@ struct dlm_ctxt
 	struct list_head	dlm_eviction_callbacks;
 };
 
+static inline struct hlist_head *dlm_lockres_hash(struct dlm_ctxt *dlm, unsigned i)
+{
+	return dlm->lockres_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + (i % DLM_BUCKETS_PER_PAGE);
+}
+
 /* these keventd work queue items are for less-frequently
  * called functions that cannot be directly called from the
  * net message handlers for some reason, usually because
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index c7eae5d3324..e119e4ddf93 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -136,7 +136,7 @@ void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
 
 	spin_lock(&dlm->spinlock);
 	for (i=0; i<DLM_HASH_BUCKETS; i++) {
-		bucket = &(dlm->lockres_hash[i]);
+		bucket = dlm_lockres_hash(dlm, i);
 		hlist_for_each_entry(res, iter, bucket, hash_node)
 			dlm_print_one_lock_resource(res);
 	}
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 595b1c7cbd1..80b8cce9cf3 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -49,6 +49,30 @@
 #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
 #include "cluster/masklog.h"
 
+static void dlm_free_pagevec(void **vec, int pages)
+{
+	while (pages--)
+		free_page((unsigned long)vec[pages]);
+	kfree(vec);
+}
+
+static void **dlm_alloc_pagevec(int pages)
+{
+	void **vec = kmalloc(pages * sizeof(void *), GFP_KERNEL);
+	int i;
+
+	if (!vec)
+		return NULL;
+
+	for (i = 0; i < pages; i++)
+		if (!(vec[i] = (void *)__get_free_page(GFP_KERNEL)))
+			goto out_free;
+	return vec;
+out_free:
+	dlm_free_pagevec(vec, i);
+	return NULL;
+}
+
 /*
  *
  * spinlock lock ordering: if multiple locks are needed, obey this ordering:
@@ -90,7 +114,7 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm,
 	assert_spin_locked(&dlm->spinlock);
 
 	q = &res->lockname;
-	bucket = &(dlm->lockres_hash[q->hash % DLM_HASH_BUCKETS]);
+	bucket = dlm_lockres_hash(dlm, q->hash);
 
 	/* get a reference for our hashtable */
 	dlm_lockres_get(res);
@@ -110,7 +134,8 @@ struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
 
 	assert_spin_locked(&dlm->spinlock);
 
-	bucket = dlm->lockres_hash + full_name_hash(name, len) % DLM_HASH_BUCKETS;
+	bucket = dlm_lockres_hash(dlm, hash);
+
 	hlist_for_each(list, bucket) {
 		struct dlm_lock_resource *res = hlist_entry(list,
 			struct dlm_lock_resource, hash_node);
@@ -191,7 +216,7 @@ static int dlm_wait_on_domain_helper(const char *domain)
 static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
 {
 	if (dlm->lockres_hash)
-		free_page((unsigned long) dlm->lockres_hash);
+		dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
 
 	if (dlm->name)
 		kfree(dlm->name);
@@ -301,8 +326,8 @@ static void dlm_migrate_all_locks(struct dlm_ctxt *dlm)
 restart:
 	spin_lock(&dlm->spinlock);
 	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
-		while (!hlist_empty(&dlm->lockres_hash[i])) {
-			res = hlist_entry(dlm->lockres_hash[i].first,
+		while (!hlist_empty(dlm_lockres_hash(dlm, i))) {
+			res = hlist_entry(dlm_lockres_hash(dlm, i)->first,
 					  struct dlm_lock_resource, hash_node);
 			/* need reference when manually grabbing lockres */
 			dlm_lockres_get(res);
@@ -1188,7 +1213,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 		goto leave;
 	}
 
-	dlm->lockres_hash = (struct hlist_head *) __get_free_page(GFP_KERNEL);
+	dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(DLM_HASH_PAGES);
 	if (!dlm->lockres_hash) {
 		mlog_errno(-ENOMEM);
 		kfree(dlm->name);
@@ -1197,8 +1222,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 		goto leave;
 	}
 
-	for (i=0; i<DLM_HASH_BUCKETS; i++)
-		INIT_HLIST_HEAD(&dlm->lockres_hash[i]);
+	for (i = 0; i < DLM_HASH_BUCKETS; i++)
+		INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i));
 
 	strcpy(dlm->name, domain);
 	dlm->key = key;
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 4f3d482a729..bf2a02eca62 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1719,7 +1719,7 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
 	 * the RECOVERING state and set the owner
 	 * if necessary */
 	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
-		bucket = &(dlm->lockres_hash[i]);
+		bucket = dlm_lockres_hash(dlm, i);
 		hlist_for_each_entry(res, hash_iter, bucket, hash_node) {
 			if (res->state & DLM_LOCK_RES_RECOVERING) {
 				if (res->owner == dead_node) {
@@ -1884,7 +1884,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
 	 *    need to be fired as a result.
 	 */
 	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
-		bucket = &(dlm->lockres_hash[i]);
+		bucket = dlm_lockres_hash(dlm, i);
 		hlist_for_each_entry(res, iter, bucket, hash_node) {
  			/* always prune any $RECOVERY entries for dead nodes,
  			 * otherwise hangs can occur during later recovery */
-- 
cgit v1.2.3


From c8f33b6e86af74ee7b800f57cac7b3c8559318fe Mon Sep 17 00:00:00 2001
From: Joel Becker <Joel.Becker@oracle.com>
Date: Thu, 16 Mar 2006 17:40:37 -0800
Subject: [PATCH] ocfs2: Alloc at least a page for the DLM hash

The OCFS2 DLM allocates a number of pages for a hash to lookup locks.
There was a bug where a PAGE_SIZE bigger than the hash size (eg, 64K
pages) would result in zero pages allocated.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmcommon.h | 8 ++++++--
 fs/ocfs2/dlm/dlmdomain.c | 3 +++
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 0378ddbc8a8..4fc1be3a3fa 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -37,8 +37,12 @@
 #define DLM_THREAD_SHUFFLE_INTERVAL    5     // flush everything every 5 passes
 #define DLM_THREAD_MS                  200   // flush at least every 200 ms
 
-#define DLM_HASH_SIZE		(1 << 14)
-#define DLM_HASH_PAGES		(DLM_HASH_SIZE / PAGE_SIZE)
+#define DLM_HASH_SIZE_DEFAULT	(1 << 14)
+#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE
+# define DLM_HASH_PAGES		1
+#else
+# define DLM_HASH_PAGES		(DLM_HASH_SIZE_DEFAULT / PAGE_SIZE)
+#endif
 #define DLM_BUCKETS_PER_PAGE	(PAGE_SIZE / sizeof(struct hlist_head))
 #define DLM_HASH_BUCKETS	(DLM_HASH_PAGES * DLM_BUCKETS_PER_PAGE)
 
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 80b8cce9cf3..a074ec6f193 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -67,6 +67,9 @@ static void **dlm_alloc_pagevec(int pages)
 	for (i = 0; i < pages; i++)
 		if (!(vec[i] = (void *)__get_free_page(GFP_KERNEL)))
 			goto out_free;
+
+	mlog(0, "Allocated DLM hash pagevec; %d pages (%lu expected), %Zd buckets per page\n",
+	     pages, DLM_HASH_PAGES, DLM_BUCKETS_PER_PAGE);
 	return vec;
 out_free:
 	dlm_free_pagevec(vec, i);
-- 
cgit v1.2.3


From 685f1adb3872d904e08e22fab699f34432d5068a Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Thu, 23 Mar 2006 11:23:29 -0800
Subject: ocfs2: silence a compile warning in dlm_alloc_pagevec()

Reported by Andrew Morton.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmdomain.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index a074ec6f193..3511930a34e 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -68,8 +68,8 @@ static void **dlm_alloc_pagevec(int pages)
 		if (!(vec[i] = (void *)__get_free_page(GFP_KERNEL)))
 			goto out_free;
 
-	mlog(0, "Allocated DLM hash pagevec; %d pages (%lu expected), %Zd buckets per page\n",
-	     pages, DLM_HASH_PAGES, DLM_BUCKETS_PER_PAGE);
+	mlog(0, "Allocated DLM hash pagevec; %d pages (%lu expected), %lu buckets per page\n",
+	     pages, DLM_HASH_PAGES, (unsigned long)DLM_BUCKETS_PER_PAGE);
 	return vec;
 out_free:
 	dlm_free_pagevec(vec, i);
-- 
cgit v1.2.3


From 8d79d088e88198d5456861ee9e6a8226dcd08799 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Thu, 27 Apr 2006 17:58:23 -0700
Subject: ocfs2: add a small delay after a failed migration

Otherwise we risk starving other threads.

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmthread.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 44d3b57ae8a..22bb58a514d 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -39,6 +39,7 @@
 #include <linux/inet.h>
 #include <linux/timer.h>
 #include <linux/kthread.h>
+#include <linux/delay.h>
 
 
 #include "cluster/heartbeat.h"
@@ -165,6 +166,7 @@ again:
 	} else if (ret < 0) {
 		mlog(ML_NOTICE, "lockres %.*s: migrate failed, retrying\n",
 		     lockres->lockname.len, lockres->lockname.name);
+		msleep(100);
 		goto again;
 	}
 
@@ -640,8 +642,9 @@ static int dlm_thread(void *data)
 			 * spinlock and do NOT have the dlm lock.
 			 * safe to reserve/queue asts and run the lists. */
 
-			mlog(0, "calling dlm_shuffle_lists with dlm=%p, "
-			     "res=%p\n", dlm, res);
+			mlog(0, "calling dlm_shuffle_lists with dlm=%s, "
+			     "res=%.*s\n", dlm->name,
+			     res->lockname.len, res->lockname.name);
 
 			/* called while holding lockres lock */
 			dlm_shuffle_lists(dlm, res);
-- 
cgit v1.2.3


From 2580a580e029f9a59a66cd230b1fd7e2d9ee339d Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Thu, 27 Apr 2006 17:59:46 -0700
Subject: ocfs2: recheck lockres master before sending an unlock request.

Recovery may have happened and it may now be mastered locally.

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmunlock.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index ac89c509daf..9b40d60e3b4 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -317,6 +317,16 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
 	size_t veclen = 1;
 
 	mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
+			
+	if (owner == dlm->node_num) {
+		/* ended up trying to contact ourself.  this means
+		 * that the lockres had been remote but became local
+		 * via a migration.  just retry it, now as local */
+		mlog(0, "%s:%.*s: this node became the master due to a "
+		     "migration, re-evaluate now\n", dlm->name,
+		     res->lockname.len, res->lockname.name);
+		return DLM_FORWARD;
+	}
 
 	memset(&unlock, 0, sizeof(unlock));
 	unlock.node_idx = dlm->node_num;
-- 
cgit v1.2.3


From aba9aac78817d88aa2b223f1aedf1e9815ae97b8 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Thu, 27 Apr 2006 18:00:21 -0700
Subject: ocfs2: fix inverted logic in dlm_is_node_dead

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmrecovery.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index bf2a02eca62..b568bb6617d 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -267,7 +267,7 @@ int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node)
 {
 	int dead;
 	spin_lock(&dlm->spinlock);
-	dead = test_bit(node, dlm->domain_map);
+	dead = !test_bit(node, dlm->domain_map);
 	spin_unlock(&dlm->spinlock);
 	return dead;
 }
-- 
cgit v1.2.3


From 8bc674cb4834fb25206b7f7f5e37fe571aa76b34 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Thu, 27 Apr 2006 18:02:10 -0700
Subject: ocfs2: Fix empty lvb check

The check for an empty lvb should check the entire buffer not just the first
byte.

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmcommon.h   |  9 +++++++++
 fs/ocfs2/dlm/dlmrecovery.c | 12 +++++++-----
 2 files changed, 16 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 4fc1be3a3fa..bf873919b00 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -315,6 +315,15 @@ enum dlm_lockres_list {
 	DLM_BLOCKED_LIST
 };
 
+static inline int dlm_lvb_is_empty(char *lvb)
+{
+	int i;
+	for (i=0; i<DLM_LVB_LEN; i++)
+		if (lvb[i])
+			return 0;
+	return 1;
+}
+
 static inline struct list_head *
 dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
 {
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index b568bb6617d..8a3f0173ff5 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1021,8 +1021,9 @@ static int dlm_add_lock_to_array(struct dlm_lock *lock,
 		    ml->type == LKM_PRMODE) {
 			/* if it is already set, this had better be a PR
 			 * and it has to match */
-			if (mres->lvb[0] && (ml->type == LKM_EXMODE ||
-			    memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) {
+			if (!dlm_lvb_is_empty(mres->lvb) &&
+			    (ml->type == LKM_EXMODE ||
+			     memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) {
 				mlog(ML_ERROR, "mismatched lvbs!\n");
 				__dlm_print_one_lock_resource(lock->lockres);
 				BUG();
@@ -1554,7 +1555,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 		lksb->flags |= (ml->flags &
 				(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
 			
-		if (mres->lvb[0]) {
+		if (!dlm_lvb_is_empty(mres->lvb)) {
 			if (lksb->flags & DLM_LKSB_PUT_LVB) {
 				/* other node was trying to update
 				 * lvb when node died.  recreate the
@@ -1565,8 +1566,9 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 				 * most recent valid lvb info */
 				BUG_ON(ml->type != LKM_EXMODE &&
 				       ml->type != LKM_PRMODE);
-				if (res->lvb[0] && (ml->type == LKM_EXMODE ||
-				    memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) {
+				if (!dlm_lvb_is_empty(res->lvb) &&
+				    (ml->type == LKM_EXMODE ||
+				     memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) {
 					mlog(ML_ERROR, "received bad lvb!\n");
 					__dlm_print_one_lock_resource(res);
 					BUG();
-- 
cgit v1.2.3


From ab27eb6f47092923a92f7c164dcf9be3b76f3944 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Thu, 27 Apr 2006 18:03:49 -0700
Subject: ocfs2: Better tracking for recovery state changes

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmrecovery.c | 37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 8a3f0173ff5..ee0860bb673 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -115,12 +115,31 @@ static u64 dlm_get_next_mig_cookie(void)
 	return c;
 }
 
+static inline void dlm_set_reco_dead_node(struct dlm_ctxt *dlm,
+					  u8 dead_node)
+{
+	assert_spin_locked(&dlm->spinlock);
+	if (dlm->reco.dead_node != dead_node)
+		mlog(0, "%s: changing dead_node from %u to %u\n",
+		     dlm->name, dlm->reco.dead_node, dead_node);
+	dlm->reco.dead_node = dead_node;
+}
+
+static inline void dlm_set_reco_master(struct dlm_ctxt *dlm,
+				       u8 master)
+{
+	assert_spin_locked(&dlm->spinlock);
+	mlog(0, "%s: changing new_master from %u to %u\n",
+	     dlm->name, dlm->reco.new_master, master);
+	dlm->reco.new_master = master;
+}
+
 static inline void dlm_reset_recovery(struct dlm_ctxt *dlm)
 {
 	spin_lock(&dlm->spinlock);
 	clear_bit(dlm->reco.dead_node, dlm->recovery_map);
-	dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
-	dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
+	dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
+	dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM);
 	spin_unlock(&dlm->spinlock);
 }
 
@@ -341,7 +360,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
 		mlog(0, "new master %u died while recovering %u!\n",
 		     dlm->reco.new_master, dlm->reco.dead_node);
 		/* unset the new_master, leave dead_node */
-		dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
+		dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM);
 	}
 
 	/* select a target to recover */
@@ -350,14 +369,14 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
 
 		bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0);
 		if (bit >= O2NM_MAX_NODES || bit < 0)
-			dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
+			dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
 		else
-			dlm->reco.dead_node = bit;
+			dlm_set_reco_dead_node(dlm, bit);
 	} else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) {
 		/* BUG? */
 		mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n",
 		     dlm->reco.dead_node);
-		dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
+		dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
 	}
 
 	if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
@@ -2089,7 +2108,7 @@ again:
 
 			/* set the new_master to this node */
 			spin_lock(&dlm->spinlock);
-			dlm->reco.new_master = dlm->node_num;
+			dlm_set_reco_master(dlm, dlm->node_num);
 			spin_unlock(&dlm->spinlock);
 		}
 
@@ -2254,8 +2273,8 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
 		     "node %u changing it to %u\n", dlm->name, 
 		     dlm->reco.dead_node, br->node_idx, br->dead_node);
 	}
-	dlm->reco.new_master = br->node_idx;
-	dlm->reco.dead_node = br->dead_node;
+	dlm_set_reco_master(dlm, br->node_idx);
+	dlm_set_reco_dead_node(dlm, br->dead_node);
 	if (!test_bit(br->dead_node, dlm->recovery_map)) {
 		mlog(0, "recovery master %u sees %u as dead, but this "
 		     "node has not yet.  marking %u as dead\n",
-- 
cgit v1.2.3


From c3187ce5e335cf8e06391236cc1ad7d1b1e193ed Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Thu, 27 Apr 2006 18:05:41 -0700
Subject: ocfs2: only recover one dead node at a time

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmrecovery.c | 38 +++++++++++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index ee0860bb673..39488763728 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -710,6 +710,14 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data)
 	if (!dlm_grab(dlm))
 		return -EINVAL;
 
+	if (lr->dead_node != dlm->reco.dead_node) {
+		mlog(ML_ERROR, "%s: node %u sent dead_node=%u, but local "
+		     "dead_node is %u\n", dlm->name, lr->node_idx,
+		     lr->dead_node, dlm->reco.dead_node);
+		/* this is a hack */
+		dlm_put(dlm);
+		return -ENOMEM;
+	}
 	BUG_ON(lr->dead_node != dlm->reco.dead_node);
 
 	item = kcalloc(1, sizeof(*item), GFP_KERNEL);
@@ -1504,7 +1512,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 	struct dlm_lock *newlock = NULL;
 	struct dlm_lockstatus *lksb = NULL;
 	int ret = 0;
-	int i;
+	int i, bad;
 	struct list_head *iter;
 	struct dlm_lock *lock = NULL;
 
@@ -1613,9 +1621,33 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 		 * relative to each other, but clearly *not*
 		 * preserved relative to locks from other nodes.
 		 */
+		bad = 0;
 		spin_lock(&res->spinlock);
-		dlm_lock_get(newlock);
-		list_add_tail(&newlock->list, queue);
+		list_for_each_entry(lock, queue, list) {
+			if (lock->ml.cookie == ml->cookie) {
+				u64 c = lock->ml.cookie;
+				mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already "
+				     "exists on this lockres!\n", dlm->name,
+				     res->lockname.len, res->lockname.name,
+				     dlm_get_lock_cookie_node(c),
+				     dlm_get_lock_cookie_seq(c));
+
+				mlog(ML_NOTICE, "sent lock: type=%d, conv=%d, "
+				     "node=%u, cookie=%u:%llu, queue=%d\n",
+	      			     ml->type, ml->convert_type, ml->node,
+				     dlm_get_lock_cookie_node(ml->cookie),
+				     dlm_get_lock_cookie_seq(ml->cookie),
+				     ml->list);
+
+				__dlm_print_one_lock_resource(res);
+				bad = 1;
+				break;
+			}
+		}
+		if (!bad) {
+			dlm_lock_get(newlock);
+			list_add_tail(&newlock->list, queue);
+		}
 		spin_unlock(&res->spinlock);
 	}
 	mlog(0, "done running all the locks\n");
-- 
cgit v1.2.3


From 29c0fa0f56f20b4512f65b0f3e55bc8af50485b7 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Thu, 27 Apr 2006 18:06:58 -0700
Subject: ocfs2: handle network errors during recovery

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmrecovery.c | 53 +++++++++++++++++++++++++++++++---------------
 1 file changed, 36 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 39488763728..59c8976915a 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -757,6 +757,7 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
 	struct list_head *iter;
 	int ret;
 	u8 dead_node, reco_master;
+	int skip_all_done = 0;
 
 	dlm = item->dlm;
 	dead_node = item->u.ral.dead_node;
@@ -793,12 +794,18 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
 	dlm_move_reco_locks_to_list(dlm, &resources, dead_node);
 
 	/* now we can begin blasting lockreses without the dlm lock */
+
+	/* any errors returned will be due to the new_master dying,
+	 * the dlm_reco_thread should detect this */
 	list_for_each(iter, &resources) {
 		res = list_entry (iter, struct dlm_lock_resource, recovering);
 		ret = dlm_send_one_lockres(dlm, res, mres, reco_master,
 				   	DLM_MRES_RECOVERY);
-		if (ret < 0)
+		if (ret < 0) {
 			mlog_errno(ret);
+			skip_all_done = 1;
+			break;
+		}
 	}
 
 	/* move the resources back to the list */
@@ -806,9 +813,12 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
 	list_splice_init(&resources, &dlm->reco.resources);
 	spin_unlock(&dlm->spinlock);
 
-	ret = dlm_send_all_done_msg(dlm, dead_node, reco_master);
-	if (ret < 0)
-		mlog_errno(ret);
+	if (!skip_all_done) {
+		ret = dlm_send_all_done_msg(dlm, dead_node, reco_master);
+		if (ret < 0) {
+			mlog_errno(ret);
+		}
+	}
 
 	free_page((unsigned long)data);
 }
@@ -828,8 +838,14 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
 
 	ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
 				 sizeof(done_msg), send_to, &tmpret);
-	/* negative status is ignored by the caller */
-	if (ret >= 0)
+	if (ret < 0) {
+		if (!dlm_is_host_down(ret)) {
+			mlog_errno(ret);
+			mlog(ML_ERROR, "%s: unknown error sending data-done "
+			     "to %u\n", dlm->name, send_to);
+			BUG();
+		}
+	} else
 		ret = tmpret;
 	return ret;
 }
@@ -1109,22 +1125,25 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
 			 * we must send it immediately. */
 			ret = dlm_send_mig_lockres_msg(dlm, mres, send_to,
 						       res, total_locks);
-			if (ret < 0) {
-				// TODO
-				mlog(ML_ERROR, "dlm_send_mig_lockres_msg "
-				     "returned %d, TODO\n", ret);
-				BUG();
-			}
+			if (ret < 0)
+				goto error;
 		}
 	}
 	/* flush any remaining locks */
 	ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks);
-	if (ret < 0) {
-		// TODO
-		mlog(ML_ERROR, "dlm_send_mig_lockres_msg returned %d, "
-		     "TODO\n", ret);
+	if (ret < 0)
+		goto error;
+	return ret;
+
+error:
+	mlog(ML_ERROR, "%s: dlm_send_mig_lockres_msg returned %d\n",
+	     dlm->name, ret);
+	if (!dlm_is_host_down(ret))
 		BUG();
-	}
+	mlog(0, "%s: node %u went down while sending %s "
+	     "lockres %.*s\n", dlm->name, send_to,
+	     flags & DLM_MRES_RECOVERY ?  "recovery" : "migration",
+	     res->lockname.len, res->lockname.name);
 	return ret;
 }
 
-- 
cgit v1.2.3


From d6dea6e9732f5319b723e14f7adbc5f99cd69aec Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Thu, 27 Apr 2006 18:08:51 -0700
Subject: ocfs2: clean up recovery related messages

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmrecovery.c | 102 +++++++++++++++++++++++++++++++++++++++------
 1 file changed, 90 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 59c8976915a..81bd2400e22 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -239,6 +239,52 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm)
  *
  */
 
+static void dlm_print_reco_node_status(struct dlm_ctxt *dlm)
+{
+	struct dlm_reco_node_data *ndata;
+	struct dlm_lock_resource *res;
+
+	mlog(ML_NOTICE, "%s(%d): recovery info, state=%s, dead=%u, master=%u\n",
+	     dlm->name, dlm->dlm_reco_thread_task->pid,
+	     dlm->reco.state & DLM_RECO_STATE_ACTIVE ? "ACTIVE" : "inactive",
+	     dlm->reco.dead_node, dlm->reco.new_master);
+
+	list_for_each_entry(ndata, &dlm->reco.node_data, list) {
+		char *st = "unknown";
+		switch (ndata->state) {
+			case DLM_RECO_NODE_DATA_INIT:
+				st = "init";
+				break;
+			case DLM_RECO_NODE_DATA_REQUESTING:
+				st = "requesting";
+				break;
+			case DLM_RECO_NODE_DATA_DEAD:
+				st = "dead";
+				break;
+			case DLM_RECO_NODE_DATA_RECEIVING:
+				st = "receiving";
+				break;
+			case DLM_RECO_NODE_DATA_REQUESTED:
+				st = "requested";
+				break;
+			case DLM_RECO_NODE_DATA_DONE:
+				st = "done";
+				break;
+			case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+				st = "finalize-sent";
+				break;
+			default:
+				st = "bad";
+				break;
+		}
+		mlog(ML_NOTICE, "%s: reco state, node %u, state=%s\n",
+		     dlm->name, ndata->node_num, st);
+	}
+	list_for_each_entry(res, &dlm->reco.resources, recovering) {
+		mlog(ML_NOTICE, "%s: lockres %.*s on recovering list\n",
+		     dlm->name, res->lockname.len, res->lockname.name);
+	}
+}
 
 #define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000)
 
@@ -385,7 +431,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
 		/* return to main thread loop and sleep. */
 		return 0;
 	}
-	mlog(0, "recovery thread found node %u in the recovery map!\n",
+	mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n",
+	     dlm->name, dlm->dlm_reco_thread_task->pid,
 	     dlm->reco.dead_node);
 	spin_unlock(&dlm->spinlock);
 
@@ -408,8 +455,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
 		}
 		mlog(0, "another node will master this recovery session.\n");
 	}
-	mlog(0, "dlm=%s, new_master=%u, this node=%u, dead_node=%u\n",
-	     dlm->name, dlm->reco.new_master,
+	mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n",
+	     dlm->name, dlm->dlm_reco_thread_task->pid, dlm->reco.new_master,
 	     dlm->node_num, dlm->reco.dead_node);
 
 	/* it is safe to start everything back up here
@@ -421,7 +468,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
 	return 0;
 
 master_here:
-	mlog(0, "mastering recovery of %s:%u here(this=%u)!\n",
+	mlog(0, "(%d) mastering recovery of %s:%u here(this=%u)!\n",
+	     dlm->dlm_reco_thread_task->pid,
 	     dlm->name, dlm->reco.dead_node, dlm->node_num);
 
 	status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
@@ -563,11 +611,19 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 					goto leave;
 				case DLM_RECO_NODE_DATA_RECEIVING:
 				case DLM_RECO_NODE_DATA_REQUESTED:
+					mlog(0, "%s: node %u still in state %s\n",
+					     dlm->name, ndata->node_num,
+					     ndata->state==DLM_RECO_NODE_DATA_RECEIVING ?
+					     "receiving" : "requested");
 					all_nodes_done = 0;
 					break;
 				case DLM_RECO_NODE_DATA_DONE:
+					mlog(0, "%s: node %u state is done\n",
+					     dlm->name, ndata->node_num);
 					break;
 				case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+					mlog(0, "%s: node %u state is finalize\n",
+					     dlm->name, ndata->node_num);
 					break;
 			}
 		}
@@ -714,6 +770,7 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data)
 		mlog(ML_ERROR, "%s: node %u sent dead_node=%u, but local "
 		     "dead_node is %u\n", dlm->name, lr->node_idx,
 		     lr->dead_node, dlm->reco.dead_node);
+		dlm_print_reco_node_status(dlm);
 		/* this is a hack */
 		dlm_put(dlm);
 		return -ENOMEM;
@@ -764,6 +821,9 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
 	reco_master = item->u.ral.reco_master;
 	mres = (struct dlm_migratable_lockres *)data;
 
+	mlog(0, "%s: recovery worker started, dead=%u, master=%u\n",
+	     dlm->name, dead_node, reco_master);
+
 	if (dead_node != dlm->reco.dead_node ||
 	    reco_master != dlm->reco.new_master) {
 		/* show extra debug info if the recovery state is messed */
@@ -802,7 +862,9 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
 		ret = dlm_send_one_lockres(dlm, res, mres, reco_master,
 				   	DLM_MRES_RECOVERY);
 		if (ret < 0) {
-			mlog_errno(ret);
+			mlog(ML_ERROR, "%s: node %u went down while sending "
+			     "recovery state for dead node %u, ret=%d\n", dlm->name,
+			     reco_master, dead_node, ret);
 			skip_all_done = 1;
 			break;
 		}
@@ -816,7 +878,9 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
 	if (!skip_all_done) {
 		ret = dlm_send_all_done_msg(dlm, dead_node, reco_master);
 		if (ret < 0) {
-			mlog_errno(ret);
+			mlog(ML_ERROR, "%s: node %u went down while sending "
+			     "recovery all-done for dead node %u, ret=%d\n",
+			     dlm->name, reco_master, dead_node, ret);
 		}
 	}
 
@@ -865,7 +929,11 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data)
 	mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, "
 	     "node_idx=%u, this node=%u\n", done->dead_node,
 	     dlm->reco.dead_node, done->node_idx, dlm->node_num);
-	BUG_ON(done->dead_node != dlm->reco.dead_node);
+
+	mlog_bug_on_msg((done->dead_node != dlm->reco.dead_node),
+			"Got DATA DONE: dead_node=%u, reco.dead_node=%u, "
+			"node_idx=%u, this node=%u\n", done->dead_node,
+			dlm->reco.dead_node, done->node_idx, dlm->node_num);
 
 	spin_lock(&dlm_reco_state_lock);
 	list_for_each(iter, &dlm->reco.node_data) {
@@ -2228,7 +2296,7 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
 
 	mlog_entry("%u\n", dead_node);
 
-	mlog(0, "dead node is %u\n", dead_node);
+	mlog(0, "%s: dead node is %u\n", dlm->name, dead_node);
 
 	spin_lock(&dlm->spinlock);
 	dlm_node_iter_init(dlm->domain_map, &iter);
@@ -2301,8 +2369,9 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
 	if (!dlm_grab(dlm))
 		return 0;
 
-	mlog(0, "node %u wants to recover node %u\n",
-		  br->node_idx, br->dead_node);
+	mlog(0, "%s: node %u wants to recover node %u (%u:%u)\n",
+	     dlm->name, br->node_idx, br->dead_node,
+	     dlm->reco.dead_node, dlm->reco.new_master);
 
 	dlm_fire_domain_eviction_callbacks(dlm, br->dead_node);
 
@@ -2344,6 +2413,11 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
 	spin_unlock(&dlm->spinlock);
 
 	dlm_kick_recovery_thread(dlm);
+
+	mlog(0, "%s: recovery started by node %u, for %u (%u:%u)\n",
+	     dlm->name, br->node_idx, br->dead_node,
+	     dlm->reco.dead_node, dlm->reco.new_master);
+
 	dlm_put(dlm);
 	return 0;
 }
@@ -2401,8 +2475,9 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data)
 	if (!dlm_grab(dlm))
 		return 0;
 
-	mlog(0, "node %u finalizing recovery of node %u\n",
-	     fr->node_idx, fr->dead_node);
+	mlog(0, "%s: node %u finalizing recovery of node %u (%u:%u)\n",
+	     dlm->name, fr->node_idx, fr->dead_node,
+	     dlm->reco.dead_node, dlm->reco.new_master);
 
 	spin_lock(&dlm->spinlock);
 
@@ -2426,6 +2501,9 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data)
 	dlm_reset_recovery(dlm);
 
 	dlm_kick_recovery_thread(dlm);
+	mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n",
+	     dlm->name, fr->node_idx, dlm->reco.dead_node, dlm->reco.new_master);
+
 	dlm_put(dlm);
 	return 0;
 }
-- 
cgit v1.2.3


From 958837197e6415009cba0f31bbb5aacdb936ef09 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Thu, 27 Apr 2006 18:47:41 -0700
Subject: ocfs2: better mle debugging

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmmaster.c | 33 ++++++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index f1fbf2f4e5d..d1c85f10c63 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -130,15 +130,30 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
 #if 0
 /* Code here is included but defined out as it aids debugging */
 
+#define dlm_print_nodemap(m)  _dlm_print_nodemap(m,#m)
+void _dlm_print_nodemap(unsigned long *map, const char *mapname)
+{
+	int i;
+	printk("%s=[ ", mapname);
+	for (i=0; i<O2NM_MAX_NODES; i++)
+		if (test_bit(i, map))
+			printk("%d ", i);
+	printk("]");
+}
+
 void dlm_print_one_mle(struct dlm_master_list_entry *mle)
 {
-	int i = 0, refs;
+	int refs;
 	char *type;
 	char attached;
 	u8 master;
 	unsigned int namelen;
 	const char *name;
 	struct kref *k;
+	unsigned long *maybe = mle->maybe_map,
+		      *vote = mle->vote_map,
+		      *resp = mle->response_map,
+		      *node = mle->node_map;
 
 	k = &mle->mle_refs;
 	if (mle->type == DLM_MLE_BLOCK)
@@ -159,9 +174,18 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle)
 		name = mle->u.res->lockname.name;
 	}
 
-	mlog(ML_NOTICE, "  #%3d: %3s  %3d  %3u   %3u %c    (%d)%.*s\n",
-		  i, type, refs, master, mle->new_master, attached,
-		  namelen, namelen, name);
+	mlog(ML_NOTICE, "%.*s: %3s refs=%3d mas=%3u new=%3u evt=%c inuse=%d ",
+		  namelen, name, type, refs, master, mle->new_master, attached,
+		  mle->inuse);
+	dlm_print_nodemap(maybe);
+	printk(", ");
+	dlm_print_nodemap(vote);
+	printk(", ");
+	dlm_print_nodemap(resp);
+	printk(", ");
+	dlm_print_nodemap(node);
+	printk(", ");
+	printk("\n");
 }
 
 static void dlm_dump_mles(struct dlm_ctxt *dlm)
@@ -170,7 +194,6 @@ static void dlm_dump_mles(struct dlm_ctxt *dlm)
 	struct list_head *iter;
 	
 	mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name);
-	mlog(ML_NOTICE, "  ####: type refs owner new events? lockname nodemap votemap respmap maybemap\n");
 	spin_lock(&dlm->master_lock);
 	list_for_each(iter, &dlm->master_list) {
 		mle = list_entry(iter, struct dlm_master_list_entry, list);
-- 
cgit v1.2.3


From a2bf04774bf4aa0a75036c1e92e3d2fd1cce2aff Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Thu, 27 Apr 2006 18:51:26 -0700
Subject: ocfs2: mle ref counting fixes

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmmaster.c | 109 ++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 90 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index d1c85f10c63..19399446aa8 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -74,6 +74,7 @@ struct dlm_master_list_entry
 	wait_queue_head_t wq;
 	atomic_t woken;
 	struct kref mle_refs;
+	int inuse;
 	unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
 	unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
 	unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
@@ -337,6 +338,31 @@ static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
 	spin_unlock(&dlm->spinlock);
 }
 
+static void dlm_get_mle_inuse(struct dlm_master_list_entry *mle)
+{
+	struct dlm_ctxt *dlm;
+	dlm = mle->dlm;
+
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&dlm->master_lock);
+	mle->inuse++;
+	kref_get(&mle->mle_refs);
+}
+
+static void dlm_put_mle_inuse(struct dlm_master_list_entry *mle)
+{
+	struct dlm_ctxt *dlm;
+	dlm = mle->dlm;
+
+	spin_lock(&dlm->spinlock);
+	spin_lock(&dlm->master_lock);
+	mle->inuse--;
+	__dlm_put_mle(mle);
+	spin_unlock(&dlm->master_lock);
+	spin_unlock(&dlm->spinlock);
+
+}
+
 /* remove from list and free */
 static void __dlm_put_mle(struct dlm_master_list_entry *mle)
 {
@@ -390,6 +416,7 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
 	memset(mle->response_map, 0, sizeof(mle->response_map));
 	mle->master = O2NM_MAX_NODES;
 	mle->new_master = O2NM_MAX_NODES;
+	mle->inuse = 0;
 
 	if (mle->type == DLM_MLE_MASTER) {
 		BUG_ON(!res);
@@ -809,7 +836,7 @@ lookup:
 	 * if so, the creator of the BLOCK may try to put the last
 	 * ref at this time in the assert master handler, so we
 	 * need an extra one to keep from a bad ptr deref. */
-	dlm_get_mle(mle);
+	dlm_get_mle_inuse(mle);
 	spin_unlock(&dlm->master_lock);
 	spin_unlock(&dlm->spinlock);
 
@@ -899,7 +926,7 @@ wait:
 	dlm_mle_detach_hb_events(dlm, mle);
 	dlm_put_mle(mle);
 	/* put the extra ref */
-	dlm_put_mle(mle);
+	dlm_put_mle_inuse(mle);
 
 wake_waiters:
 	spin_lock(&res->spinlock);
@@ -1753,6 +1780,7 @@ ok:
 	if (mle) {
 		int extra_ref = 0;
 		int nn = -1;
+		int rr, err = 0;
 		
 		spin_lock(&mle->spinlock);
 		if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION)
@@ -1772,27 +1800,64 @@ ok:
 		wake_up(&mle->wq);
 		spin_unlock(&mle->spinlock);
 
-		if (mle->type == DLM_MLE_MIGRATION && res) {
-			mlog(0, "finishing off migration of lockres %.*s, "
-			     "from %u to %u\n",
-			       res->lockname.len, res->lockname.name,
-			       dlm->node_num, mle->new_master);
+		if (res) {
 			spin_lock(&res->spinlock);
-			res->state &= ~DLM_LOCK_RES_MIGRATING;
-			dlm_change_lockres_owner(dlm, res, mle->new_master);
-			BUG_ON(res->state & DLM_LOCK_RES_DIRTY);
+			if (mle->type == DLM_MLE_MIGRATION) {
+				mlog(0, "finishing off migration of lockres %.*s, "
+			     		"from %u to %u\n",
+			       		res->lockname.len, res->lockname.name,
+			       		dlm->node_num, mle->new_master);
+				res->state &= ~DLM_LOCK_RES_MIGRATING;
+				dlm_change_lockres_owner(dlm, res, mle->new_master);
+				BUG_ON(res->state & DLM_LOCK_RES_DIRTY);
+			} else {
+				dlm_change_lockres_owner(dlm, res, mle->master);
+			}
 			spin_unlock(&res->spinlock);
 		}
-		/* master is known, detach if not already detached */
-		dlm_mle_detach_hb_events(dlm, mle);
-		dlm_put_mle(mle);
-		
+
+		/* master is known, detach if not already detached.
+		 * ensures that only one assert_master call will happen
+		 * on this mle. */
+		spin_lock(&dlm->spinlock);
+		spin_lock(&dlm->master_lock);
+
+		rr = atomic_read(&mle->mle_refs.refcount);
+		if (mle->inuse > 0) {
+			if (extra_ref && rr < 3)
+				err = 1;
+			else if (!extra_ref && rr < 2)
+				err = 1;
+		} else {
+			if (extra_ref && rr < 2)
+				err = 1;
+			else if (!extra_ref && rr < 1)
+				err = 1;
+		}
+		if (err) {
+			mlog(ML_ERROR, "%s:%.*s: got assert master from %u "
+			     "that will mess up this node, refs=%d, extra=%d, "
+			     "inuse=%d\n", dlm->name, namelen, name,
+			     assert->node_idx, rr, extra_ref, mle->inuse);
+			dlm_print_one_mle(mle);
+		}
+		list_del_init(&mle->list);
+		__dlm_mle_detach_hb_events(dlm, mle);
+		__dlm_put_mle(mle);
 		if (extra_ref) {
 			/* the assert master message now balances the extra
 		 	 * ref given by the master / migration request message.
 		 	 * if this is the last put, it will be removed
 		 	 * from the list. */
-			dlm_put_mle(mle);
+			__dlm_put_mle(mle);
+		}
+		spin_unlock(&dlm->master_lock);
+		spin_unlock(&dlm->spinlock);
+	} else if (res) {
+		if (res->owner != assert->node_idx) {
+			mlog(0, "assert_master from %u, but current "
+			     "owner is %u (%.*s), no mle\n", assert->node_idx,
+			     res->owner, namelen, name);
 		}
 	}
 
@@ -2138,7 +2203,7 @@ fail:
 	 * take both dlm->spinlock and dlm->master_lock */
 	spin_lock(&dlm->spinlock);
 	spin_lock(&dlm->master_lock);
-	dlm_get_mle(mle);
+	dlm_get_mle_inuse(mle);
 	spin_unlock(&dlm->master_lock);
 	spin_unlock(&dlm->spinlock);
 
@@ -2155,7 +2220,10 @@ fail:
 		/* migration failed, detach and clean up mle */
 		dlm_mle_detach_hb_events(dlm, mle);
 		dlm_put_mle(mle);
-		dlm_put_mle(mle);
+		dlm_put_mle_inuse(mle);
+		spin_lock(&res->spinlock);
+		res->state &= ~DLM_LOCK_RES_MIGRATING;
+		spin_unlock(&res->spinlock);
 		goto leave;
 	}
 
@@ -2196,7 +2264,10 @@ fail:
 			/* migration failed, detach and clean up mle */
 			dlm_mle_detach_hb_events(dlm, mle);
 			dlm_put_mle(mle);
-			dlm_put_mle(mle);
+			dlm_put_mle_inuse(mle);
+			spin_lock(&res->spinlock);
+			res->state &= ~DLM_LOCK_RES_MIGRATING;
+			spin_unlock(&res->spinlock);
 			goto leave;
 		}
 		/* TODO: if node died: stop, clean up, return error */
@@ -2212,7 +2283,7 @@ fail:
 
 	/* master is known, detach if not already detached */
 	dlm_mle_detach_hb_events(dlm, mle);
-	dlm_put_mle(mle);
+	dlm_put_mle_inuse(mle);
 	ret = 0;
 
 	dlm_lockres_calc_usage(dlm, res);
-- 
cgit v1.2.3


From da01ad05528bf6f6bcb286329b14225a71713325 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Thu, 27 Apr 2006 18:53:04 -0700
Subject: ocfs2: detach mle from heartbeat events

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmmaster.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 19399446aa8..d7a51691e7d 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2673,6 +2673,7 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
 			/* remove it from the list so that only one
 			 * mle will be found */
 			list_del_init(&tmp->list);
+			__dlm_mle_detach_hb_events(dlm, mle);
 		}
 		spin_unlock(&tmp->spinlock);
 	}
@@ -2767,6 +2768,7 @@ top:
 
 		/* remove from the list early.  NOTE: unlinking
 		 * list_head while in list_for_each_safe */
+		__dlm_mle_detach_hb_events(dlm, mle);
 		spin_lock(&mle->spinlock);
 		list_del_init(&mle->list);
 		atomic_set(&mle->woken, 1);
-- 
cgit v1.2.3


From 41b8c8a101ba77f59d9a4b3cac6c846cb8a34840 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Thu, 27 Apr 2006 19:00:26 -0700
Subject: ocfs2: properly initialize the mle structure

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmmaster.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index d7a51691e7d..376283e98c1 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1512,15 +1512,12 @@ way_up_top:
 				mlog_errno(-ENOMEM);
 				goto send_response;
 			}
-			spin_lock(&dlm->spinlock);
-			dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL,
-					 name, namelen);
-			spin_unlock(&dlm->spinlock);
 			goto way_up_top;
 		}
 
 		// mlog(0, "this is second time thru, already allocated, "
 		// "add the block.\n");
+		dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen);
 		set_bit(request->node_idx, mle->maybe_map);
 		list_add(&mle->list, &dlm->master_list);
 		response = DLM_MASTER_RESP_NO;
-- 
cgit v1.2.3


From 2d1a868c563f07c07c681836d273d69efb7c5ad8 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Thu, 27 Apr 2006 19:01:35 -0700
Subject: ocfs2: take mle reference during migration

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmmaster.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 376283e98c1..0b7e29e916e 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1710,6 +1710,23 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
 				     assert->node_idx);
 			}
 		}
+		if (mle->type == DLM_MLE_MIGRATION) {
+			if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) {
+				mlog(0, "%s:%.*s: got cleanup assert"
+				     " from %u for migration\n",
+				     dlm->name, namelen, name,
+				     assert->node_idx);
+			} else if (!(flags & DLM_ASSERT_MASTER_FINISH_MIGRATION)) {
+				mlog(0, "%s:%.*s: got unrelated assert"
+				     " from %u for migration, ignoring\n",
+				     dlm->name, namelen, name,
+				     assert->node_idx);
+				__dlm_put_mle(mle);
+				spin_unlock(&dlm->master_lock);
+				spin_unlock(&dlm->spinlock);
+				goto done;
+			}	
+		}
 	}
 	spin_unlock(&dlm->master_lock);
 
-- 
cgit v1.2.3


From dc2ed195dda848c8e4b24f3f0e1952fa74f74f6b Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Thu, 27 Apr 2006 19:03:18 -0700
Subject: ocfs2: allow for an assert message during lock mastery

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmmaster.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 0b7e29e916e..1b28d3eedac 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1741,7 +1741,8 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
 			goto kill;
 		}
 		if (!mle) {
-			if (res->owner != assert->node_idx) {
+			if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN &&
+			    res->owner != assert->node_idx) {
 				mlog(ML_ERROR, "assert_master from "
 					  "%u, but current owner is "
 					  "%u! (%.*s)\n",
-- 
cgit v1.2.3


From aa85235427992b8d3040297d9174d69dd1d8a675 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Thu, 27 Apr 2006 19:04:49 -0700
Subject: ocfs2: mle ref count debugging

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmmaster.c | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 1b28d3eedac..9c5ef405bb3 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -371,9 +371,14 @@ static void __dlm_put_mle(struct dlm_master_list_entry *mle)
 
 	assert_spin_locked(&dlm->spinlock);
 	assert_spin_locked(&dlm->master_lock);
-	BUG_ON(!atomic_read(&mle->mle_refs.refcount));
-
-	kref_put(&mle->mle_refs, dlm_mle_release);
+	if (!atomic_read(&mle->mle_refs.refcount)) {
+		/* this may or may not crash, but who cares.
+		 * it's a BUG. */
+		mlog(ML_ERROR, "bad mle: %p\n", mle);
+		dlm_print_one_mle(mle);
+		BUG();
+	} else
+		kref_put(&mle->mle_refs, dlm_mle_release);
 }
 
 
@@ -1008,6 +1013,12 @@ recheck:
 		     "rechecking now\n", dlm->name, res->lockname.len,
 		     res->lockname.name);
 		goto recheck;
+	} else {
+		if (!voting_done) {
+			mlog(0, "map not changed and voting not done "
+			     "for %s:%.*s\n", dlm->name, res->lockname.len,
+			     res->lockname.name);
+		}
 	}
 
 	if (m != O2NM_MAX_NODES) {
@@ -1691,7 +1702,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
 		if (bit >= O2NM_MAX_NODES) {
 			/* not necessarily an error, though less likely.
 			 * could be master just re-asserting. */
-			mlog(ML_ERROR, "no bits set in the maybe_map, but %u "
+			mlog(0, "no bits set in the maybe_map, but %u "
 			     "is asserting! (%.*s)\n", assert->node_idx,
 			     namelen, name);
 		} else if (bit != assert->node_idx) {
@@ -1703,7 +1714,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
 				 * number winning the mastery will respond
 				 * YES to mastery requests, but this node
 				 * had no way of knowing.  let it pass. */
-				mlog(ML_ERROR, "%u is the lowest node, "
+				mlog(0, "%u is the lowest node, "
 				     "%u is asserting. (%.*s)  %u must "
 				     "have begun after %u won.\n", bit,
 				     assert->node_idx, namelen, name, bit,
@@ -2268,8 +2279,8 @@ fail:
 			/* avoid hang during shutdown when migrating lockres 
 			 * to a node which also goes down */
 			if (dlm_is_node_dead(dlm, target)) {
-				mlog(0, "%s:%.*s: expected migration target %u "
-				     "is no longer up.  restarting.\n",
+				mlog(0, "%s:%.*s: expected migration "
+				     "target %u is no longer up, restarting\n",
 				     dlm->name, res->lockname.len,
 				     res->lockname.name, target);
 				ret = -ERESTARTSYS;
@@ -2790,8 +2801,8 @@ top:
 		spin_unlock(&mle->spinlock);
 		wake_up(&mle->wq);
 
-		mlog(0, "node %u died during migration from "
-		     "%u to %u!\n", dead_node,
+		mlog(0, "%s: node %u died during migration from "
+		     "%u to %u!\n", dlm->name, dead_node,
 		     mle->master, mle->new_master);
 		/* if there is a lockres associated with this
 	 	 * mle, find it and set its owner to UNKNOWN */
-- 
cgit v1.2.3


From c0a8520c7333dd62624683772f31864c7f9c46d9 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Thu, 27 Apr 2006 19:07:45 -0700
Subject: ocfs2: do LVB puts in place

Don't wait until the AST will be fired to do the LVB copy into the lock
resource.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmast.c     | 12 +++++++-----
 fs/ocfs2/dlm/dlmconvert.c |  3 +++
 2 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 87ee29cad50..42775e2bbe2 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -197,12 +197,14 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
 				  lock->ml.node == dlm->node_num ? "master" :
 				  "remote");
 			memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN);
-		} else if (lksb->flags & DLM_LKSB_PUT_LVB) {
-			mlog(0, "setting lvb from lockres for %s node\n",
-				  lock->ml.node == dlm->node_num ? "master" :
-				  "remote");
-			memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN);
 		}
+		/* Do nothing for lvb put requests - they should be done in
+ 		 * place when the lock is downconverted - otherwise we risk
+ 		 * racing gets and puts which could result in old lvb data
+ 		 * being propagated. We leave the put flag set and clear it
+ 		 * here. In the future we might want to clear it at the time
+ 		 * the put is actually done.
+		 */
 		spin_unlock(&res->spinlock);
 	}
 
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 70888b31e75..90cbaaff33a 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -214,6 +214,9 @@ grant:
 	if (lock->ml.node == dlm->node_num)
 		mlog(0, "doing in-place convert for nonlocal lock\n");
 	lock->ml.type = type;
+	if (lock->lksb->flags & DLM_LKSB_PUT_LVB)
+		memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN);
+
 	status = DLM_NORMAL;
 	*call_ast = 1;
 	goto unlock_exit;
-- 
cgit v1.2.3


From a7f90d83ea8dc8b0999ab7c1c0539af9a6ed69d2 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Thu, 27 Apr 2006 19:24:21 -0700
Subject: ocfs2: dump lockres info before we BUG() on a bad reference

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmmaster.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 9c5ef405bb3..427c0af0d21 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -619,6 +619,28 @@ static void dlm_lockres_release(struct kref *kref)
 	mlog(0, "destroying lockres %.*s\n", res->lockname.len,
 	     res->lockname.name);
 
+	if (!hlist_unhashed(&res->hash_node) ||
+	    !list_empty(&res->granted) ||
+	    !list_empty(&res->converting) ||
+	    !list_empty(&res->blocked) ||
+	    !list_empty(&res->dirty) ||
+	    !list_empty(&res->recovering) ||
+	    !list_empty(&res->purge)) {
+		mlog(ML_ERROR,
+		     "Going to BUG for resource %.*s."
+		     "  We're on a list! [%c%c%c%c%c%c%c]\n",
+		     res->lockname.len, res->lockname.name,
+		     !hlist_unhashed(&res->hash_node) ? 'H' : ' ',
+		     !list_empty(&res->granted) ? 'G' : ' ',
+		     !list_empty(&res->converting) ? 'C' : ' ',
+		     !list_empty(&res->blocked) ? 'B' : ' ',
+		     !list_empty(&res->dirty) ? 'D' : ' ',
+		     !list_empty(&res->recovering) ? 'R' : ' ',
+		     !list_empty(&res->purge) ? 'P' : ' ');
+
+		dlm_print_one_lock_resource(res);
+	}
+
 	/* By the time we're ready to blow this guy away, we shouldn't
 	 * be on any lists. */
 	BUG_ON(!hlist_unhashed(&res->hash_node));
-- 
cgit v1.2.3


From a9ee4c8a67b962db0208addf0e32935aa571af6b Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Thu, 27 Apr 2006 19:26:15 -0700
Subject: ocfs2: better error handling during assert master message

handle errors during lock assert master by either killing self or other node

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmmaster.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 427c0af0d21..81ceee2e0d5 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1633,6 +1633,8 @@ again:
 	dlm_node_iter_init(nodemap, &iter);
 	while ((to = dlm_node_iter_next(&iter)) >= 0) {
 		int r = 0;
+		struct dlm_master_list_entry *mle = NULL;
+
 		mlog(0, "sending assert master to %d (%.*s)\n", to,
 		     namelen, lockname);
 		memset(&assert, 0, sizeof(assert));
@@ -1657,7 +1659,15 @@ again:
 			/* ok, something horribly messed.  kill thyself. */
 			mlog(ML_ERROR,"during assert master of %.*s to %u, "
 			     "got %d.\n", namelen, lockname, to, r);
-			dlm_dump_lock_resources(dlm);
+			spin_lock(&dlm->spinlock);
+			spin_lock(&dlm->master_lock);
+			if (dlm_find_mle(dlm, &mle, (char *)lockname,
+					 namelen)) {
+				dlm_print_one_mle(mle);
+				__dlm_put_mle(mle);
+			}
+			spin_unlock(&dlm->master_lock);
+			spin_unlock(&dlm->spinlock);
 			BUG();
 		} else if (r == EAGAIN) {
 			mlog(0, "%.*s: node %u create mles on other "
@@ -1922,12 +1932,12 @@ done:
 
 kill:
 	/* kill the caller! */
+	mlog(ML_ERROR, "Bad message received from another node.  Dumping state "
+	     "and killing the other node now!  This node is OK and can continue.\n");
+	__dlm_print_one_lock_resource(res);
 	spin_unlock(&res->spinlock);
 	spin_unlock(&dlm->spinlock);
 	dlm_lockres_put(res);
-	mlog(ML_ERROR, "Bad message received from another node.  Dumping state "
-	     "and killing the other node now!  This node is OK and can continue.\n");
-	dlm_dump_lock_resources(dlm);
 	dlm_put(dlm);
 	return -EINVAL;
 }
-- 
cgit v1.2.3


From 69d72b066cc5971318d9e29e34289b74cf8a9d22 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Mon, 1 May 2006 10:57:51 -0700
Subject: ocfs2: dlm recovery / lockres reference count fix

Take a reference on lockres structures while they are on the recovery list.

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmcommon.h   |  1 +
 fs/ocfs2/dlm/dlmrecovery.c | 16 +++++++++++++---
 fs/ocfs2/dlm/dlmthread.c   |  2 +-
 3 files changed, 15 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index bf873919b00..9e052445b0b 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -849,6 +849,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm,
 			   u8 dead_node);
 int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock);
 
+int __dlm_lockres_unused(struct dlm_lock_resource *res);
 
 static inline const char * dlm_lock_mode_name(int mode)
 {
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 81bd2400e22..6ee8b324712 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1758,8 +1758,14 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
 	struct dlm_lock *lock;
 
 	res->state |= DLM_LOCK_RES_RECOVERING;
-	if (!list_empty(&res->recovering))
+	if (!list_empty(&res->recovering)) {
+		mlog(0,
+		     "Recovering res %s:%.*s, is already on recovery list!\n",
+		     dlm->name, res->lockname.len, res->lockname.name);
 		list_del_init(&res->recovering);
+	}
+	/* We need to hold a reference while on the recovery list */
+	dlm_lockres_get(res);
 	list_add_tail(&res->recovering, &dlm->reco.resources);
 
 	/* find any pending locks and put them back on proper list */
@@ -1848,9 +1854,11 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
 			spin_lock(&res->spinlock);
 			dlm_change_lockres_owner(dlm, res, new_master);
 			res->state &= ~DLM_LOCK_RES_RECOVERING;
-			__dlm_dirty_lockres(dlm, res);
+			if (!__dlm_lockres_unused(res))
+				__dlm_dirty_lockres(dlm, res);
 			spin_unlock(&res->spinlock);
 			wake_up(&res->wq);
+			dlm_lockres_put(res);
 		}
 	}
 
@@ -1883,11 +1891,13 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
 					     dlm->name, res->lockname.len,
 					     res->lockname.name, res->owner);
 					list_del_init(&res->recovering);
+					dlm_lockres_put(res);
 				}
 				spin_lock(&res->spinlock);
 				dlm_change_lockres_owner(dlm, res, new_master);
 				res->state &= ~DLM_LOCK_RES_RECOVERING;
-				__dlm_dirty_lockres(dlm, res);
+				if (!__dlm_lockres_unused(res))
+					__dlm_dirty_lockres(dlm, res);
 				spin_unlock(&res->spinlock);
 				wake_up(&res->wq);
 			}
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 22bb58a514d..71302b90daf 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -81,7 +81,7 @@ repeat:
 }
 
 
-static int __dlm_lockres_unused(struct dlm_lock_resource *res)
+int __dlm_lockres_unused(struct dlm_lock_resource *res)
 {
 	if (list_empty(&res->granted) &&
 	    list_empty(&res->converting) &&
-- 
cgit v1.2.3


From 466d1a4591c4e1bc3affd5c0cf3df5ad20338fb9 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Mon, 1 May 2006 11:11:13 -0700
Subject: ocfs2: make dlm recovery finalization 2 stage

Makes it easier for the recovery process to deal with node death.

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmcommon.h   |   6 ++-
 fs/ocfs2/dlm/dlmrecovery.c | 112 ++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 99 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 9e052445b0b..78eccd0951e 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -71,7 +71,8 @@ static inline int dlm_is_recovery_lock(const char *lock_name, int name_len)
 	return 0;
 }
 
-#define DLM_RECO_STATE_ACTIVE  0x0001
+#define DLM_RECO_STATE_ACTIVE    0x0001
+#define DLM_RECO_STATE_FINALIZE  0x0002
 
 struct dlm_recovery_ctxt
 {
@@ -633,7 +634,8 @@ struct dlm_finalize_reco
 {
 	u8 node_idx;
 	u8 dead_node;
-	__be16 pad1;
+	u8 flags;
+	u8 pad1;
 	__be32 pad2;
 };
 
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 6ee8b324712..19123ce8b30 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -134,12 +134,18 @@ static inline void dlm_set_reco_master(struct dlm_ctxt *dlm,
 	dlm->reco.new_master = master;
 }
 
-static inline void dlm_reset_recovery(struct dlm_ctxt *dlm)
+static inline void __dlm_reset_recovery(struct dlm_ctxt *dlm)
 {
-	spin_lock(&dlm->spinlock);
+	assert_spin_locked(&dlm->spinlock);
 	clear_bit(dlm->reco.dead_node, dlm->recovery_map);
 	dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
 	dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM);
+}
+
+static inline void dlm_reset_recovery(struct dlm_ctxt *dlm)
+{
+	spin_lock(&dlm->spinlock);
+	__dlm_reset_recovery(dlm);
 	spin_unlock(&dlm->spinlock);
 }
 
@@ -2074,6 +2080,20 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
 {
 	assert_spin_locked(&dlm->spinlock);
 
+	if (dlm->reco.new_master == idx) {
+		mlog(0, "%s: recovery master %d just died\n",
+		     dlm->name, idx);
+		if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) {
+			/* finalize1 was reached, so it is safe to clear
+			 * the new_master and dead_node.  that recovery
+			 * is complete. */
+			mlog(0, "%s: dead master %d had reached "
+			     "finalize1 state, clearing\n", dlm->name, idx);
+			dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+			__dlm_reset_recovery(dlm);
+		}
+	}
+
 	/* check to see if the node is already considered dead */
 	if (!test_bit(idx, dlm->live_nodes_map)) {
 		mlog(0, "for domain %s, node %d is already dead. "
@@ -2364,6 +2384,14 @@ retry:
 			 * another ENOMEM */
 			msleep(100);
 			goto retry;
+		} else if (ret == EAGAIN) {
+			mlog(0, "%s: trying to start recovery of node "
+			     "%u, but node %u is waiting for last recovery "
+			     "to complete, backoff for a bit\n", dlm->name,
+			     dead_node, nodenum);
+			/* TODO Look into replacing msleep with cond_resched() */
+			msleep(100);
+			goto retry;
 		}
 	}
 
@@ -2379,6 +2407,17 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
 	if (!dlm_grab(dlm))
 		return 0;
 
+	spin_lock(&dlm->spinlock);
+	if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) {
+		mlog(0, "%s: node %u wants to recover node %u (%u:%u) "
+		     "but this node is in finalize state, waiting on finalize2\n",
+		     dlm->name, br->node_idx, br->dead_node,
+		     dlm->reco.dead_node, dlm->reco.new_master);
+		spin_unlock(&dlm->spinlock);
+		return EAGAIN;
+	}
+	spin_unlock(&dlm->spinlock);
+
 	mlog(0, "%s: node %u wants to recover node %u (%u:%u)\n",
 	     dlm->name, br->node_idx, br->dead_node,
 	     dlm->reco.dead_node, dlm->reco.new_master);
@@ -2432,6 +2471,7 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
 	return 0;
 }
 
+#define DLM_FINALIZE_STAGE2  0x01
 static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm)
 {
 	int ret = 0;
@@ -2439,25 +2479,31 @@ static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm)
 	struct dlm_node_iter iter;
 	int nodenum;
 	int status;
+	int stage = 1;
 
-	mlog(0, "finishing recovery for node %s:%u\n",
-	     dlm->name, dlm->reco.dead_node);
+	mlog(0, "finishing recovery for node %s:%u, "
+	     "stage %d\n", dlm->name, dlm->reco.dead_node, stage);
 
 	spin_lock(&dlm->spinlock);
 	dlm_node_iter_init(dlm->domain_map, &iter);
 	spin_unlock(&dlm->spinlock);
 
+stage2:
 	memset(&fr, 0, sizeof(fr));
 	fr.node_idx = dlm->node_num;
 	fr.dead_node = dlm->reco.dead_node;
+	if (stage == 2)
+		fr.flags |= DLM_FINALIZE_STAGE2;
 
 	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
 		if (nodenum == dlm->node_num)
 			continue;
 		ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key,
 					 &fr, sizeof(fr), nodenum, &status);
-		if (ret >= 0) {
+		if (ret >= 0)
 			ret = status;
+		if (ret < 0) {
+			mlog_errno(ret);
 			if (dlm_is_host_down(ret)) {
 				/* this has no effect on this recovery 
 				 * session, so set the status to zero to 
@@ -2466,12 +2512,15 @@ static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm)
 				     "node finished recovery.\n", nodenum);
 				ret = 0;
 			}
-		}
-		if (ret < 0) {
-			mlog_errno(ret);
 			break;
 		}
 	}
+	if (stage == 1) {
+		/* reset the node_iter back to the top and send finalize2 */
+		iter.curnode = -1;
+		stage = 2;
+		goto stage2;
+	}
 
 	return ret;
 }
@@ -2480,15 +2529,19 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data)
 {
 	struct dlm_ctxt *dlm = data;
 	struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf;
+	int stage = 1;
 
 	/* ok to return 0, domain has gone away */
 	if (!dlm_grab(dlm))
 		return 0;
 
-	mlog(0, "%s: node %u finalizing recovery of node %u (%u:%u)\n",
-	     dlm->name, fr->node_idx, fr->dead_node,
-	     dlm->reco.dead_node, dlm->reco.new_master);
+	if (fr->flags & DLM_FINALIZE_STAGE2)
+		stage = 2;
 
+	mlog(0, "%s: node %u finalizing recovery stage%d of "
+	     "node %u (%u:%u)\n", dlm->name, fr->node_idx, stage,
+	     fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master);
+ 
 	spin_lock(&dlm->spinlock);
 
 	if (dlm->reco.new_master != fr->node_idx) {
@@ -2504,13 +2557,38 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data)
 		BUG();
 	}
 
-	dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx);
-
-	spin_unlock(&dlm->spinlock);
-
-	dlm_reset_recovery(dlm);
+	switch (stage) {
+		case 1:
+			dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx);
+			if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) {
+				mlog(ML_ERROR, "%s: received finalize1 from "
+				     "new master %u for dead node %u, but "
+				     "this node has already received it!\n",
+				     dlm->name, fr->node_idx, fr->dead_node);
+				dlm_print_reco_node_status(dlm);
+				BUG();
+			}
+			dlm->reco.state |= DLM_RECO_STATE_FINALIZE;
+			spin_unlock(&dlm->spinlock);
+			break;
+		case 2:
+			if (!(dlm->reco.state & DLM_RECO_STATE_FINALIZE)) {
+				mlog(ML_ERROR, "%s: received finalize2 from "
+				     "new master %u for dead node %u, but "
+				     "this node did not have finalize1!\n",
+				     dlm->name, fr->node_idx, fr->dead_node);
+				dlm_print_reco_node_status(dlm);
+				BUG();
+			}
+			dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+			spin_unlock(&dlm->spinlock);
+			dlm_reset_recovery(dlm);
+			dlm_kick_recovery_thread(dlm);
+			break;
+		default:
+			BUG();
+	}
 
-	dlm_kick_recovery_thread(dlm);
 	mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n",
 	     dlm->name, fr->node_idx, dlm->reco.dead_node, dlm->reco.new_master);
 
-- 
cgit v1.2.3


From 343e26a4007d14c2154c1d13d1209797dce5c535 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Mon, 1 May 2006 11:15:04 -0700
Subject: ocfs2: dump mismatching migrated lvbs before BUG()

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmrecovery.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 19123ce8b30..de68e498bfa 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1689,8 +1689,19 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 				if (!dlm_lvb_is_empty(res->lvb) &&
 				    (ml->type == LKM_EXMODE ||
 				     memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) {
-					mlog(ML_ERROR, "received bad lvb!\n");
-					__dlm_print_one_lock_resource(res);
+					int i;
+					mlog(ML_ERROR, "%s:%.*s: received bad "
+					     "lvb! type=%d\n", dlm->name,
+					     res->lockname.len,
+					     res->lockname.name, ml->type);
+					printk("lockres lvb=[");
+					for (i=0; i<DLM_LVB_LEN; i++)
+						printk("%02x", res->lvb[i]);
+					printk("]\nmigrated lvb=[");
+					for (i=0; i<DLM_LVB_LEN; i++)
+						printk("%02x", mres->lvb[i]);
+					printk("]\n");
+					dlm_print_one_lock_resource(res);
 					BUG();
 				}
 				memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
-- 
cgit v1.2.3


From 8b2198097ae6a5b54ed92345989ec343070f916b Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Mon, 1 May 2006 11:16:45 -0700
Subject: ocfs2: purge lockres' sooner

Immediately purge a lockress that the local node is not the master of.

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmthread.c | 37 +++++++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 71302b90daf..76526ea95bb 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -54,6 +54,8 @@
 #include "cluster/masklog.h"
 
 static int dlm_thread(void *data);
+static void dlm_purge_lockres_now(struct dlm_ctxt *dlm,
+				  struct dlm_lock_resource *lockres);
 
 static void dlm_flush_asts(struct dlm_ctxt *dlm);
 
@@ -111,10 +113,23 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
 			res->last_used = jiffies;
 			list_add_tail(&res->purge, &dlm->purge_list);
 			dlm->purge_count++;
+
+			/* if this node is not the owner, there is
+			 * no way to keep track of who the owner could be.
+			 * unhash it to avoid serious problems. */
+			if (res->owner != dlm->node_num) {
+				mlog(0, "%s:%.*s: doing immediate "
+				     "purge of lockres owned by %u\n",
+				     dlm->name, res->lockname.len,
+				     res->lockname.name, res->owner);
+
+				dlm_purge_lockres_now(dlm, res);
+			}
 		}
 	} else if (!list_empty(&res->purge)) {
-		mlog(0, "removing lockres %.*s from purge list\n",
-		     res->lockname.len, res->lockname.name);
+		mlog(0, "removing lockres %.*s from purge list, "
+		     "owner=%u\n", res->lockname.len, res->lockname.name,
+		     res->owner);
 
 		list_del_init(&res->purge);
 		dlm->purge_count--;
@@ -180,6 +195,24 @@ finish:
 	__dlm_unhash_lockres(lockres);
 }
 
+/* make an unused lockres go away immediately.
+ * as soon as the dlm spinlock is dropped, this lockres
+ * will not be found. kfree still happens on last put. */
+static void dlm_purge_lockres_now(struct dlm_ctxt *dlm,
+				  struct dlm_lock_resource *lockres)
+{
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&lockres->spinlock);
+
+	BUG_ON(!__dlm_lockres_unused(lockres));
+
+	if (!list_empty(&lockres->purge)) {
+		list_del_init(&lockres->purge);
+		dlm->purge_count--;
+	}
+	__dlm_unhash_lockres(lockres);
+}
+
 static void dlm_run_purge_list(struct dlm_ctxt *dlm,
 			       int purge_now)
 {
-- 
cgit v1.2.3


From 588e00902b06528c476eed38019dba4a087def01 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Mon, 1 May 2006 11:22:06 -0700
Subject: ocfs2: do not send master requests to localhost

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmmaster.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 81ceee2e0d5..e5d7271d503 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -994,12 +994,14 @@ recheck:
 		spin_unlock(&res->spinlock);
 		/* this will cause the master to re-assert across
 		 * the whole cluster, freeing up mles */
-		ret = dlm_do_master_request(mle, res->owner);
-		if (ret < 0) {
-			/* give recovery a chance to run */
-			mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret);
-			msleep(500);
-			goto recheck;
+		if (res->owner != dlm->node_num) {
+			ret = dlm_do_master_request(mle, res->owner);
+			if (ret < 0) {
+				/* give recovery a chance to run */
+				mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret);
+				msleep(500);
+				goto recheck;
+			}
 		}
 		ret = 0;
 		goto leave;
-- 
cgit v1.2.3


From ccd8b1f916bc5e4b2156f03ccd3546be7f65f6b3 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Mon, 1 May 2006 11:32:14 -0700
Subject: ocfs2: update lvb immediately during recovery

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmrecovery.c | 44 ++++++++++++++++++++++++++------------------
 1 file changed, 26 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index de68e498bfa..86199f66eb5 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1674,40 +1674,48 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 		}
 		lksb->flags |= (ml->flags &
 				(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
-			
+
+		if (ml->type == LKM_NLMODE)
+			goto skip_lvb;
+
 		if (!dlm_lvb_is_empty(mres->lvb)) {
 			if (lksb->flags & DLM_LKSB_PUT_LVB) {
 				/* other node was trying to update
 				 * lvb when node died.  recreate the
 				 * lksb with the updated lvb. */
 				memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN);
+				/* the lock resource lvb update must happen
+				 * NOW, before the spinlock is dropped.
+				 * we no longer wait for the AST to update
+				 * the lvb. */
+				memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
 			} else {
 				/* otherwise, the node is sending its 
 				 * most recent valid lvb info */
 				BUG_ON(ml->type != LKM_EXMODE &&
 				       ml->type != LKM_PRMODE);
 				if (!dlm_lvb_is_empty(res->lvb) &&
-				    (ml->type == LKM_EXMODE ||
-				     memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) {
-					int i;
-					mlog(ML_ERROR, "%s:%.*s: received bad "
-					     "lvb! type=%d\n", dlm->name,
-					     res->lockname.len,
-					     res->lockname.name, ml->type);
-					printk("lockres lvb=[");
-					for (i=0; i<DLM_LVB_LEN; i++)
-						printk("%02x", res->lvb[i]);
-					printk("]\nmigrated lvb=[");
-					for (i=0; i<DLM_LVB_LEN; i++)
-						printk("%02x", mres->lvb[i]);
-					printk("]\n");
-					dlm_print_one_lock_resource(res);
-					BUG();
+ 				    (ml->type == LKM_EXMODE ||
+ 				     memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) {
+ 					int i;
+ 					mlog(ML_ERROR, "%s:%.*s: received bad "
+ 					     "lvb! type=%d\n", dlm->name,
+ 					     res->lockname.len,
+ 					     res->lockname.name, ml->type);
+ 					printk("lockres lvb=[");
+ 					for (i=0; i<DLM_LVB_LEN; i++)
+ 						printk("%02x", res->lvb[i]);
+ 					printk("]\nmigrated lvb=[");
+ 					for (i=0; i<DLM_LVB_LEN; i++)
+ 						printk("%02x", mres->lvb[i]);
+ 					printk("]\n");
+ 					dlm_print_one_lock_resource(res);
+ 					BUG();
 				}
 				memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
 			}
 		}
-
+skip_lvb:
 
 		/* NOTE:
 		 * wrt lock queue ordering and recovery:
-- 
cgit v1.2.3


From e4eb03681a8313168d99c2f93175331a898a2c16 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Mon, 1 May 2006 11:46:59 -0700
Subject: ocfs2: gracefully handle stale create_lock messages.

This is an error on the sending side, so gracefully error out on the
receiving end.

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmlock.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 55cda25ae11..57576208b79 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -280,6 +280,14 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
 	if (tmpret >= 0) {
 		// successfully sent and received
 		ret = status;  // this is already a dlm_status
+		if (ret == DLM_RECOVERING) {
+			mlog(ML_ERROR, "%s:%.*s: BUG.  this is a stale lockres "
+			     "no longer owned by %u.  that node is coming back "
+			     "up currently.\n", dlm->name, create.namelen,
+			     create.name, res->owner);
+			dlm_print_one_lock_resource(res);
+			BUG();
+		}
 	} else {
 		mlog_errno(tmpret);
 		if (dlm_is_host_down(tmpret)) {
@@ -428,11 +436,16 @@ int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data)
 	if (!dlm_grab(dlm))
 		return DLM_REJECTED;
 
-	mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
-			"Domain %s not fully joined!\n", dlm->name);
-
 	name = create->name;
 	namelen = create->namelen;
+	status = DLM_RECOVERING;
+	if (!dlm_domain_fully_joined(dlm)) {
+		mlog(ML_ERROR, "Domain %s not fully joined, but node %u is "
+		     "sending a create_lock message for lock %.*s!\n",
+		     dlm->name, create->node_idx, namelen, name);
+		dlm_error(status);
+		goto leave;
+	}
 
 	status = DLM_IVBUFLEN;
 	if (namelen > DLM_LOCKID_NAME_MAX) {
-- 
cgit v1.2.3


From e7e69eb38946ebef86e27442d01514fcf9c854ee Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Mon, 1 May 2006 11:49:52 -0700
Subject: ocfs2: teach dlm_restart_lock_mastery() to wait on recovery

Change behavior of dlm_restart_lock_mastery() when a node goes down.  Dump
all responses that have been collected and start over.

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmmaster.c | 100 +++++++++++++++++++++--------------------------
 1 file changed, 44 insertions(+), 56 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index e5d7271d503..915283fb48c 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -867,6 +867,7 @@ lookup:
 	spin_unlock(&dlm->master_lock);
 	spin_unlock(&dlm->spinlock);
 
+redo_request:
 	while (wait_on_recovery) {
 		/* any cluster changes that occurred after dropping the
 		 * dlm spinlock would be detectable be a change on the mle,
@@ -904,7 +905,6 @@ lookup:
 	if (blocked)
 		goto wait;
 
-redo_request:
 	ret = -EINVAL;
 	dlm_node_iter_init(mle->vote_map, &iter);
 	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
@@ -929,6 +929,7 @@ wait:
 	/* keep going until the response map includes all nodes */
 	ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
 	if (ret < 0) {
+		wait_on_recovery = 1;
 		mlog(0, "%s:%.*s: node map changed, redo the "
 		     "master request now, blocked=%d\n",
 		     dlm->name, res->lockname.len,
@@ -1210,18 +1211,6 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
 			set_bit(node, mle->vote_map);
 		} else {
 			mlog(ML_ERROR, "node down! %d\n", node);
-
-			/* if the node wasn't involved in mastery skip it,
-			 * but clear it out from the maps so that it will
-			 * not affect mastery of this lockres */
-			clear_bit(node, mle->response_map);
-			clear_bit(node, mle->vote_map);
-			if (!test_bit(node, mle->maybe_map))
-				goto next;
-
-			/* if we're already blocked on lock mastery, and the
-			 * dead node wasn't the expected master, or there is
-			 * another node in the maybe_map, keep waiting */
 			if (blocked) {
 				int lowest = find_next_bit(mle->maybe_map,
 						       O2NM_MAX_NODES, 0);
@@ -1229,54 +1218,53 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
 				/* act like it was never there */
 				clear_bit(node, mle->maybe_map);
 
-			       	if (node != lowest)
-					goto next;
-
-				mlog(ML_ERROR, "expected master %u died while "
-				     "this node was blocked waiting on it!\n",
-				     node);
-				lowest = find_next_bit(mle->maybe_map,
-						       O2NM_MAX_NODES,
-						       lowest+1);
-				if (lowest < O2NM_MAX_NODES) {
-					mlog(0, "still blocked. waiting "
-					     "on %u now\n", lowest);
-					goto next;
+			       	if (node == lowest) {
+					mlog(0, "expected master %u died"
+					    " while this node was blocked "
+					    "waiting on it!\n", node);
+					lowest = find_next_bit(mle->maybe_map,
+						       	O2NM_MAX_NODES,
+						       	lowest+1);
+					if (lowest < O2NM_MAX_NODES) {
+						mlog(0, "%s:%.*s:still "
+						     "blocked. waiting on %u "
+						     "now\n", dlm->name,
+						     res->lockname.len,
+						     res->lockname.name,
+						     lowest);
+					} else {
+						/* mle is an MLE_BLOCK, but
+						 * there is now nothing left to
+						 * block on.  we need to return
+						 * all the way back out and try
+						 * again with an MLE_MASTER.
+						 * dlm_do_local_recovery_cleanup
+						 * has already run, so the mle
+						 * refcount is ok */
+						mlog(0, "%s:%.*s: no "
+						     "longer blocking. try to "
+						     "master this here\n",
+						     dlm->name,
+						     res->lockname.len,
+						     res->lockname.name);
+						mle->type = DLM_MLE_MASTER;
+						mle->u.res = res;
+					}
 				}
-
-				/* mle is an MLE_BLOCK, but there is now
-				 * nothing left to block on.  we need to return
-				 * all the way back out and try again with
-				 * an MLE_MASTER. dlm_do_local_recovery_cleanup
-				 * has already run, so the mle refcount is ok */
-				mlog(0, "no longer blocking. we can "
-				     "try to master this here\n");
-				mle->type = DLM_MLE_MASTER;
-				memset(mle->maybe_map, 0,
-				       sizeof(mle->maybe_map));
-				memset(mle->response_map, 0,
-				       sizeof(mle->maybe_map));
-				memcpy(mle->vote_map, mle->node_map,
-				       sizeof(mle->node_map));
-				mle->u.res = res;
-				set_bit(dlm->node_num, mle->maybe_map);
-
-				ret = -EAGAIN;
-				goto next;
 			}
 
-			clear_bit(node, mle->maybe_map);
-			if (node > dlm->node_num)
-				goto next;
-
-			mlog(0, "dead node in map!\n");
-			/* yuck. go back and re-contact all nodes
-			 * in the vote_map, removing this node. */
-			memset(mle->response_map, 0,
-			       sizeof(mle->response_map));
+			/* now blank out everything, as if we had never
+			 * contacted anyone */
+			memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
+			memset(mle->response_map, 0, sizeof(mle->response_map));
+			/* reset the vote_map to the current node_map */
+			memcpy(mle->vote_map, mle->node_map,
+			       sizeof(mle->node_map));
+			/* put myself into the maybe map */
+			if (mle->type != DLM_MLE_BLOCK)
+				set_bit(dlm->node_num, mle->maybe_map);
 		}
 		ret = -EAGAIN;
-next:
 		node = dlm_bitmap_diff_iter_next(&bdi, &sc);
 	}
 	return ret;
-- 
cgit v1.2.3


From 6ff06a93916b3f95e83c346f7530cf2f5c68ae0c Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Mon, 1 May 2006 11:51:45 -0700
Subject: ocfs2: give the dlm dirty list a reference on the lockres

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmcommon.h | 14 +++++++++++---
 fs/ocfs2/dlm/dlmthread.c |  6 ++++++
 2 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 78eccd0951e..829cc394880 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -235,18 +235,26 @@ struct dlm_lock_resource
 	struct qstr lockname;
 	struct kref      refs;
 
-	/* please keep these next 3 in this order
-	 * some funcs want to iterate over all lists */
+	/*
+	 * Please keep granted, converting, and blocked in this order,
+	 * as some funcs want to iterate over all lists.
+	 *
+	 * All four lists are protected by the hash's reference.
+	 */
 	struct list_head granted;
 	struct list_head converting;
 	struct list_head blocked;
+	struct list_head purge;
 
+	/*
+	 * These two lists require you to hold an additional reference
+	 * while they are on the list.
+	 */
 	struct list_head dirty;
 	struct list_head recovering; // dlm_recovery_ctxt.resources list
 
 	/* unused lock resources have their last_used stamped and are
 	 * put on a list for the dlm thread to run. */
-	struct list_head purge;
 	unsigned long    last_used;
 
 	unsigned migration_pending:1;
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 76526ea95bb..610dc76a851 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -455,6 +455,8 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
 	/* don't shuffle secondary queues */
 	if ((res->owner == dlm->node_num) &&
 	    !(res->state & DLM_LOCK_RES_DIRTY)) {
+		/* ref for dirty_list */
+		dlm_lockres_get(res);
 		list_add_tail(&res->dirty, &dlm->dirty_list);
 		res->state |= DLM_LOCK_RES_DIRTY;
 	}
@@ -639,6 +641,8 @@ static int dlm_thread(void *data)
 			list_del_init(&res->dirty);
 			spin_unlock(&res->spinlock);
 			spin_unlock(&dlm->spinlock);
+			/* Drop dirty_list ref */
+			dlm_lockres_put(res);
 
 		 	/* lockres can be re-dirtied/re-added to the
 			 * dirty_list in this gap, but that is ok */
@@ -691,6 +695,8 @@ in_progress:
 			/* if the lock was in-progress, stick
 			 * it on the back of the list */
 			if (delay) {
+				/* ref for dirty_list */
+				dlm_lockres_get(res);
 				spin_lock(&res->spinlock);
 				list_add_tail(&res->dirty, &dlm->dirty_list);
 				res->state |= DLM_LOCK_RES_DIRTY;
-- 
cgit v1.2.3


From f42a100b2272bc5cb44fb2aa03526b436b1d6833 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Mon, 1 May 2006 11:53:33 -0700
Subject: ocfs2: have dlm_pre_master_reco_lockres() ignore dead nodes

Recovery will spin in dlm_pre_master_reco_lockres if we do not ignore
timed-out network responses from dead nodes.

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmmaster.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 915283fb48c..2e371e06894 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2050,6 +2050,7 @@ static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
 				BUG();
 			/* host is down, so answer for that node would be
 			 * DLM_LOCK_RES_OWNER_UNKNOWN.  continue. */
+			ret = 0;
 		}
 
 		if (master != DLM_LOCK_RES_OWNER_UNKNOWN) {
-- 
cgit v1.2.3


From aa087b84977173395c0e3a1e0c1773314958f277 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Mon, 1 May 2006 12:02:07 -0700
Subject: ocfs2: increase backoff before waiting for recovery

When mastering non-recovery lock resources, additional time was frequently
needed to allow the disk heartbeat to catch up with the network timeout. the
recovery lock resource is time critical and avoids this path.

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmmaster.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 2e371e06894..21081bcfa09 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -886,7 +886,7 @@ redo_request:
 		} 
 
 		dlm_kick_recovery_thread(dlm);
-		msleep(100);
+		msleep(1000);
 		dlm_wait_for_recovery(dlm);
 
 		spin_lock(&dlm->spinlock);
-- 
cgit v1.2.3


From 2abaf97e62e51fdd09d5a46703b3b680f24bdd8b Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Mon, 1 May 2006 13:27:10 -0700
Subject: ocfs2: do not unconditionally purge the lockres in dlmlock_remote()

In dlmlock_remote(), do not call purge_lockres until the lock resource
actually changes. otherwise, the mastery info on the lockres will go away
underneath the caller.

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmlock.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 57576208b79..675123c3085 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -201,6 +201,7 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
 				      struct dlm_lock *lock, int flags)
 {
 	enum dlm_status status = DLM_DENIED;
+	int lockres_changed = 1;
 
 	mlog_entry("type=%d\n", lock->ml.type);
 	mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len,
@@ -230,6 +231,10 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
 			dlm_error(status);
 		dlm_revert_pending_lock(res, lock);
 		dlm_lock_put(lock);
+		/* do NOT call calc_usage, as this would unhash the remote
+		 * lockres before we ever get to use it.  treat as if we
+		 * never made any change to the lockres. */
+		lockres_changed = 0;
 	} else if (dlm_is_recovery_lock(res->lockname.name, 
 					res->lockname.len)) {
 		/* special case for the $RECOVERY lock.
@@ -243,7 +248,8 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
 	}
 	spin_unlock(&res->spinlock);
 
-	dlm_lockres_calc_usage(dlm, res);
+	if (lockres_changed)
+		dlm_lockres_calc_usage(dlm, res);
 
 	wake_up(&res->wq);
 	return status;
-- 
cgit v1.2.3


From c87a9ae7059f718bf1bb87a702fcbef535e32111 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Mon, 1 May 2006 13:30:49 -0700
Subject: ocfs2: temporarily disable automatic lock migration

Now we never change the owner of a lock resource until unmount or node
death. This will be re-enabled once some issues in the algorithm used have
been resolved.

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmlock.c   | 14 +++++++++-----
 fs/ocfs2/dlm/dlmthread.c | 14 ++++++++++++++
 2 files changed, 23 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 675123c3085..0ff93487494 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -227,14 +227,18 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
 	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
 	lock->lock_pending = 0;
 	if (status != DLM_NORMAL) {
-		if (status != DLM_NOTQUEUED)
+		if (status != DLM_NOTQUEUED) {
+			/*
+			 * DO NOT call calc_usage, as this would unhash
+			 * the remote lockres before we ever get to use
+			 * it.  treat as if we never made any change to
+			 * the lockres.
+			 */
+			lockres_changed = 0;
 			dlm_error(status);
+		}
 		dlm_revert_pending_lock(res, lock);
 		dlm_lock_put(lock);
-		/* do NOT call calc_usage, as this would unhash the remote
-		 * lockres before we ever get to use it.  treat as if we
-		 * never made any change to the lockres. */
-		lockres_changed = 0;
 	} else if (dlm_is_recovery_lock(res->lockname.name, 
 					res->lockname.len)) {
 		/* special case for the $RECOVERY lock.
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 610dc76a851..c1c10fd1a5a 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -106,6 +106,20 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
 	assert_spin_locked(&res->spinlock);
 
 	if (__dlm_lockres_unused(res)){
+		/* For now, just keep any resource we master */
+		if (res->owner == dlm->node_num)
+		{
+			if (!list_empty(&res->purge)) {
+				mlog(0, "we master %s:%.*s, but it is on "
+				     "the purge list.  Removing\n",
+				     dlm->name, res->lockname.len,
+				     res->lockname.name);
+				list_del_init(&res->purge);
+				dlm->purge_count--;
+			}
+			return;
+		}
+
 		if (list_empty(&res->purge)) {
 			mlog(0, "putting lockres %.*s from purge list\n",
 			     res->lockname.len, res->lockname.name);
-- 
cgit v1.2.3


From 36407488b1cbc4d84bc2bd14e03f3f9b768090d9 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Mon, 1 May 2006 13:32:27 -0700
Subject: ocfs2: pending mastery asserts and migrations should block each other

Use the existing structure for blocking migrations when ASTs are pending to
achieve the same result. If we can catch the assert before it goes on the
wire, just cancel it and let the migration continue.

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmmaster.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 21081bcfa09..78ba77bf3a7 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2000,6 +2000,23 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
 		}
 	}
 
+	/*
+	 * If we're migrating this lock to someone else, we are no
+	 * longer allowed to assert out own mastery.  OTOH, we need to
+	 * prevent migration from starting while we're still asserting
+	 * our dominance.  The reserved ast delays migration.
+	 */
+	spin_lock(&res->spinlock);
+	if (res->state & DLM_LOCK_RES_MIGRATING) {
+		mlog(0, "Someone asked us to assert mastery, but we're "
+		     "in the middle of migration.  Skipping assert, "
+		     "the new master will handle that.\n");
+		spin_unlock(&res->spinlock);
+		goto put;
+	} else
+		__dlm_lockres_reserve_ast(res);
+	spin_unlock(&res->spinlock);
+
 	/* this call now finishes out the nodemap
 	 * even if one or more nodes die */
 	mlog(0, "worker about to master %.*s here, this=%u\n",
@@ -2012,6 +2029,10 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
 		mlog_errno(ret);
 	}
 
+	/* Ok, we've asserted ourselves.  Let's let migration start. */
+	dlm_lockres_release_ast(dlm, res);
+
+put:
 	dlm_lockres_put(res);
 
 	mlog(0, "finished with dlm_assert_master_worker\n");
-- 
cgit v1.2.3


From c8df412e1c746dd21094966d04b3a79aad0f4d08 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Mon, 1 May 2006 13:47:50 -0700
Subject: ocfs2: special case recovery lock in dlmlock_remote()

If the previous master of the recovery lock dies, let calc_usage take it
down completely and let the caller completely redo the dlmlock() call.
Otherwise, there will never be an opportunity to re-master the lockres and
recovery wont be able to progress.

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmlock.c     | 33 +++++++++++++++++++++++----------
 fs/ocfs2/dlm/dlmrecovery.c |  4 ++++
 2 files changed, 27 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 0ff93487494..20b38dc1873 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -227,7 +227,16 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
 	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
 	lock->lock_pending = 0;
 	if (status != DLM_NORMAL) {
-		if (status != DLM_NOTQUEUED) {
+		if (status == DLM_RECOVERING &&
+		    dlm_is_recovery_lock(res->lockname.name,
+					 res->lockname.len)) {
+			/* recovery lock was mastered by dead node.
+			 * we need to have calc_usage shoot down this
+			 * lockres and completely remaster it. */
+			mlog(0, "%s: recovery lock was owned by "
+			     "dead node %u, remaster it now.\n",
+			     dlm->name, res->owner);
+		} else if (status != DLM_NOTQUEUED) {
 			/*
 			 * DO NOT call calc_usage, as this would unhash
 			 * the remote lockres before we ever get to use
@@ -691,18 +700,22 @@ retry_lock:
 			msleep(100);
 			/* no waiting for dlm_reco_thread */
 			if (recovery) {
-				if (status == DLM_RECOVERING) {
-					mlog(0, "%s: got RECOVERING "
-					     "for $REOCVERY lock, master "
-					     "was %u\n", dlm->name, 
-					     res->owner);
-					dlm_wait_for_node_death(dlm, res->owner, 
-							DLM_NODE_DEATH_WAIT_MAX);
-				}
+				if (status != DLM_RECOVERING)
+					goto retry_lock;
+
+				mlog(0, "%s: got RECOVERING "
+				     "for $RECOVERY lock, master "
+				     "was %u\n", dlm->name,
+				     res->owner);
+				/* wait to see the node go down, then
+				 * drop down and allow the lockres to
+				 * get cleaned up.  need to remaster. */
+				dlm_wait_for_node_death(dlm, res->owner,
+						DLM_NODE_DEATH_WAIT_MAX);
 			} else {
 				dlm_wait_for_recovery(dlm);
+				goto retry_lock;
 			}
-			goto retry_lock;
 		}
 
 		if (status != DLM_NORMAL) {
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 86199f66eb5..00209f4a291 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2314,6 +2314,10 @@ again:
 		mlog(0, "%s: reco master %u is ready to recover %u\n",
 		     dlm->name, dlm->reco.new_master, dlm->reco.dead_node);
 		status = -EEXIST;
+	} else if (ret == DLM_RECOVERING) {
+		mlog(0, "dlm=%s dlmlock says master node died (this=%u)\n",
+		     dlm->name, dlm->node_num);
+		goto again;
 	} else {
 		struct dlm_lock_resource *res;
 
-- 
cgit v1.2.3


From 6a41321121ee2af33b8ac55c87657603df480b25 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Mon, 1 May 2006 13:49:20 -0700
Subject: ocfs2: dlm_remaster_locks() should never exit without completing

We cannot restart recovery. Once we begin to recover a node, keep the state
of the recovery intact and follow through, regardless of any other node
deaths that may occur.

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmrecovery.c | 116 ++++++++++++++++++++++++---------------------
 1 file changed, 62 insertions(+), 54 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 00209f4a291..22a0b055cfc 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -480,6 +480,7 @@ master_here:
 
 	status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
 	if (status < 0) {
+		/* we should never hit this anymore */
 		mlog(ML_ERROR, "error %d remastering locks for node %u, "
 		     "retrying.\n", status, dlm->reco.dead_node);
 		/* yield a bit to allow any final network messages
@@ -506,9 +507,16 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 	int destroy = 0;
 	int pass = 0;
 
-	status = dlm_init_recovery_area(dlm, dead_node);
-	if (status < 0)
-		goto leave;
+	do {
+		/* we have become recovery master.  there is no escaping
+		 * this, so just keep trying until we get it. */
+		status = dlm_init_recovery_area(dlm, dead_node);
+		if (status < 0) {
+			mlog(ML_ERROR, "%s: failed to alloc recovery area, "
+			     "retrying\n", dlm->name);
+			msleep(1000);
+		}
+	} while (status != 0);
 
 	/* safe to access the node data list without a lock, since this
 	 * process is the only one to change the list */
@@ -525,16 +533,36 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 			continue;
 		}
 
-		status = dlm_request_all_locks(dlm, ndata->node_num, dead_node);
-		if (status < 0) {
-			mlog_errno(status);
-			if (dlm_is_host_down(status))
-				ndata->state = DLM_RECO_NODE_DATA_DEAD;
-			else {
-				destroy = 1;
-				goto leave;
+		do {
+			status = dlm_request_all_locks(dlm, ndata->node_num,
+						       dead_node);
+			if (status < 0) {
+				mlog_errno(status);
+				if (dlm_is_host_down(status)) {
+					/* node died, ignore it for recovery */
+					status = 0;
+					ndata->state = DLM_RECO_NODE_DATA_DEAD;
+					/* wait for the domain map to catch up
+					 * with the network state. */
+					wait_event_timeout(dlm->dlm_reco_thread_wq,
+							   dlm_is_node_dead(dlm,
+								ndata->node_num),
+							   msecs_to_jiffies(1000));
+					mlog(0, "waited 1 sec for %u, "
+					     "dead? %s\n", ndata->node_num,
+					     dlm_is_node_dead(dlm, ndata->node_num) ?
+					     "yes" : "no");
+				} else {
+					/* -ENOMEM on the other node */
+					mlog(0, "%s: node %u returned "
+					     "%d during recovery, retrying "
+					     "after a short wait\n",
+					     dlm->name, ndata->node_num,
+					     status);
+					msleep(100);
+				}
 			}
-		}
+		} while (status != 0);
 
 		switch (ndata->state) {
 			case DLM_RECO_NODE_DATA_INIT:
@@ -546,10 +574,9 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 				mlog(0, "node %u died after requesting "
 				     "recovery info for node %u\n",
 				     ndata->node_num, dead_node);
-				// start all over
-				destroy = 1;
-				status = -EAGAIN;
-				goto leave;
+				/* fine.  don't need this node's info.
+				 * continue without it. */
+				break;
 			case DLM_RECO_NODE_DATA_REQUESTING:
 				ndata->state = DLM_RECO_NODE_DATA_REQUESTED;
 				mlog(0, "now receiving recovery data from "
@@ -593,28 +620,12 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 					BUG();
 					break;
 				case DLM_RECO_NODE_DATA_DEAD:
-					mlog(ML_NOTICE, "node %u died after "
+					mlog(0, "node %u died after "
 					     "requesting recovery info for "
 					     "node %u\n", ndata->node_num,
 					     dead_node);
 					spin_unlock(&dlm_reco_state_lock);
-					// start all over
-					destroy = 1;
-					status = -EAGAIN;
-					/* instead of spinning like crazy here,
-					 * wait for the domain map to catch up
-					 * with the network state.  otherwise this
-					 * can be hit hundreds of times before
-					 * the node is really seen as dead. */
-					wait_event_timeout(dlm->dlm_reco_thread_wq,
-							   dlm_is_node_dead(dlm,
-								ndata->node_num),
-							   msecs_to_jiffies(1000));
-					mlog(0, "waited 1 sec for %u, "
-					     "dead? %s\n", ndata->node_num,
-					     dlm_is_node_dead(dlm, ndata->node_num) ?
-					     "yes" : "no");
-					goto leave;
+					break;
 				case DLM_RECO_NODE_DATA_RECEIVING:
 				case DLM_RECO_NODE_DATA_REQUESTED:
 					mlog(0, "%s: node %u still in state %s\n",
@@ -659,7 +670,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 			     jiffies, dlm->reco.dead_node,
 			     dlm->node_num, dlm->reco.new_master);
 			destroy = 1;
-			status = ret;
+			status = 0;
 			/* rescan everything marked dirty along the way */
 			dlm_kick_thread(dlm, NULL);
 			break;
@@ -672,7 +683,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 
 	}
 
-leave:
 	if (destroy)
 		dlm_destroy_recovery_area(dlm, dead_node);
 
@@ -832,24 +842,22 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
 
 	if (dead_node != dlm->reco.dead_node ||
 	    reco_master != dlm->reco.new_master) {
-		/* show extra debug info if the recovery state is messed */
-		mlog(ML_ERROR, "%s: bad reco state: reco(dead=%u, master=%u), "
-		     "request(dead=%u, master=%u)\n",
-		     dlm->name, dlm->reco.dead_node, dlm->reco.new_master,
-		     dead_node, reco_master);
-		mlog(ML_ERROR, "%s: name=%.*s master=%u locks=%u/%u flags=%u "
-		     "entry[0]={c=%u:%llu,l=%u,f=%u,t=%d,ct=%d,hb=%d,n=%u}\n",
-		     dlm->name, mres->lockname_len, mres->lockname, mres->master,
-		     mres->num_locks, mres->total_locks, mres->flags,
-		     dlm_get_lock_cookie_node(mres->ml[0].cookie),
-		     dlm_get_lock_cookie_seq(mres->ml[0].cookie),
-		     mres->ml[0].list, mres->ml[0].flags,
-		     mres->ml[0].type, mres->ml[0].convert_type,
-		     mres->ml[0].highest_blocked, mres->ml[0].node);
-		BUG();
+		/* worker could have been created before the recovery master
+		 * died.  if so, do not continue, but do not error. */
+		if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
+			mlog(ML_NOTICE, "%s: will not send recovery state, "
+			     "recovery master %u died, thread=(dead=%u,mas=%u)"
+			     " current=(dead=%u,mas=%u)\n", dlm->name,
+			     reco_master, dead_node, reco_master,
+			     dlm->reco.dead_node, dlm->reco.new_master);
+		} else {
+			mlog(ML_NOTICE, "%s: reco state invalid: reco(dead=%u, "
+			     "master=%u), request(dead=%u, master=%u)\n",
+			     dlm->name, dlm->reco.dead_node,
+			     dlm->reco.new_master, dead_node, reco_master);
+		}
+		goto leave;
 	}
-	BUG_ON(dead_node != dlm->reco.dead_node);
-	BUG_ON(reco_master != dlm->reco.new_master);
 
 	/* lock resources should have already been moved to the
  	 * dlm->reco.resources list.  now move items from that list
@@ -889,7 +897,7 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
 			     dlm->name, reco_master, dead_node, ret);
 		}
 	}
-
+leave:
 	free_page((unsigned long)data);
 }
 
-- 
cgit v1.2.3


From 67a187412baa84dfff2d423961d86663b7fc7d3c Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Mon, 1 May 2006 13:50:12 -0700
Subject: ocfs2: remove unneccesary spin_unlock() in dlm_remaster_locks()

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmrecovery.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 22a0b055cfc..4e0aada7327 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -624,7 +624,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 					     "requesting recovery info for "
 					     "node %u\n", ndata->node_num,
 					     dead_node);
-					spin_unlock(&dlm_reco_state_lock);
 					break;
 				case DLM_RECO_NODE_DATA_RECEIVING:
 				case DLM_RECO_NODE_DATA_REQUESTED:
-- 
cgit v1.2.3


From c27069e6cfa242a3b84eb3442934c6fe51ee9066 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Mon, 1 May 2006 13:51:49 -0700
Subject: ocfs2: continue recovery when a dead node is encountered

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmrecovery.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 4e0aada7327..c699a450b0f 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2541,6 +2541,7 @@ stage2:
 				mlog(ML_ERROR, "node %u went down after this "
 				     "node finished recovery.\n", nodenum);
 				ret = 0;
+				continue;
 			}
 			break;
 		}
-- 
cgit v1.2.3


From b7084ab538ac2bd71ce494cf1cbbea9fe9db2c07 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Mon, 1 May 2006 13:54:07 -0700
Subject: ocfs2: wait for recovery when starting lock mastery

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmcommon.h   |  1 +
 fs/ocfs2/dlm/dlmmaster.c   |  3 +++
 fs/ocfs2/dlm/dlmrecovery.c | 30 ++++++++++++++++++++++++++++++
 3 files changed, 34 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 829cc394880..9bea5c6ef9b 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -710,6 +710,7 @@ void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
 void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
 int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node);
 int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
+int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout);
 
 void dlm_put(struct dlm_ctxt *dlm);
 struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 78ba77bf3a7..ed1601d1831 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -899,6 +899,9 @@ redo_request:
 		} else
 			wait_on_recovery = 0;
 		spin_unlock(&dlm->spinlock);
+
+		if (wait_on_recovery)
+			dlm_wait_for_node_recovery(dlm, bit, 10000);
 	}
 
 	/* must wait for lock to be mastered elsewhere */
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index c699a450b0f..b03fecab299 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -343,6 +343,18 @@ int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node)
 	return dead;
 }
 
+/* returns true if node is no longer in the domain
+ * could be dead or just not joined */
+int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node)
+{
+	int recovered;
+	spin_lock(&dlm->spinlock);
+	recovered = !test_bit(node, dlm->recovery_map);
+	spin_unlock(&dlm->spinlock);
+	return recovered;
+}
+
+
 int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
 {
 	if (timeout) {
@@ -361,6 +373,24 @@ int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
 	return 0;
 }
 
+int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout)
+{
+	if (timeout) {
+		mlog(0, "%s: waiting %dms for notification of "
+		     "recovery of node %u\n", dlm->name, timeout, node);
+		wait_event_timeout(dlm->dlm_reco_thread_wq,
+			   dlm_is_node_recovered(dlm, node),
+			   msecs_to_jiffies(timeout));
+	} else {
+		mlog(0, "%s: waiting indefinitely for notification "
+		     "of recovery of node %u\n", dlm->name, node);
+		wait_event(dlm->dlm_reco_thread_wq,
+			   dlm_is_node_recovered(dlm, node));
+	}
+	/* for now, return 0 */
+	return 0;
+}
+
 /* callers of the top-level api calls (dlmlock/dlmunlock) should
  * block on the dlm->reco.event when recovery is in progress.
  * the dlm recovery thread will set this state when it begins
-- 
cgit v1.2.3


From ad8100e0d20e0123def9f83c040b68c96c8638f0 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Mon, 1 May 2006 14:25:21 -0700
Subject: ocfs2: use GFP_NOFS in some dlm operations

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmfs.c       |  6 +++---
 fs/ocfs2/dlm/dlmlock.c     |  4 ++--
 fs/ocfs2/dlm/dlmmaster.c   | 16 ++++++++--------
 fs/ocfs2/dlm/dlmrecovery.c | 10 +++++-----
 fs/ocfs2/dlm/userdlm.c     |  2 +-
 5 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 7273d9fa6ba..033ad170123 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -116,7 +116,7 @@ static int dlmfs_file_open(struct inode *inode,
 	 * doesn't make sense for LVB writes. */
 	file->f_flags &= ~O_APPEND;
 
-	fp = kmalloc(sizeof(*fp), GFP_KERNEL);
+	fp = kmalloc(sizeof(*fp), GFP_NOFS);
 	if (!fp) {
 		status = -ENOMEM;
 		goto bail;
@@ -196,7 +196,7 @@ static ssize_t dlmfs_file_read(struct file *filp,
 	else
 		readlen = count - *ppos;
 
-	lvb_buf = kmalloc(readlen, GFP_KERNEL);
+	lvb_buf = kmalloc(readlen, GFP_NOFS);
 	if (!lvb_buf)
 		return -ENOMEM;
 
@@ -240,7 +240,7 @@ static ssize_t dlmfs_file_write(struct file *filp,
 	else
 		writelen = count - *ppos;
 
-	lvb_buf = kmalloc(writelen, GFP_KERNEL);
+	lvb_buf = kmalloc(writelen, GFP_NOFS);
 	if (!lvb_buf)
 		return -ENOMEM;
 
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 20b38dc1873..e3cab0e4850 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -408,13 +408,13 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
 	struct dlm_lock *lock;
 	int kernel_allocated = 0;
 
-	lock = kcalloc(1, sizeof(*lock), GFP_KERNEL);
+	lock = kcalloc(1, sizeof(*lock), GFP_NOFS);
 	if (!lock)
 		return NULL;
 
 	if (!lksb) {
 		/* zero memory only if kernel-allocated */
-		lksb = kcalloc(1, sizeof(*lksb), GFP_KERNEL);
+		lksb = kcalloc(1, sizeof(*lksb), GFP_NOFS);
 		if (!lksb) {
 			kfree(lock);
 			return NULL;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index ed1601d1831..b780e159dad 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -709,11 +709,11 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
 {
 	struct dlm_lock_resource *res;
 
-	res = kmalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
+	res = kmalloc(sizeof(struct dlm_lock_resource), GFP_NOFS);
 	if (!res)
 		return NULL;
 
-	res->lockname.name = kmalloc(namelen, GFP_KERNEL);
+	res->lockname.name = kmalloc(namelen, GFP_NOFS);
 	if (!res->lockname.name) {
 		kfree(res);
 		return NULL;
@@ -777,7 +777,7 @@ lookup:
 		mlog(0, "allocating a new resource\n");
 		/* nothing found and we need to allocate one. */
 		alloc_mle = (struct dlm_master_list_entry *)
-			kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL);
+			kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
 		if (!alloc_mle)
 			goto leave;
 		res = dlm_new_lockres(dlm, lockid, namelen);
@@ -1532,7 +1532,7 @@ way_up_top:
 			spin_unlock(&dlm->spinlock);
 
 			mle = (struct dlm_master_list_entry *)
-				kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL);
+				kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
 			if (!mle) {
 				response = DLM_MASTER_RESP_ERROR;
 				mlog_errno(-ENOMEM);
@@ -1940,7 +1940,7 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
 			       int ignore_higher, u8 request_from, u32 flags)
 {
 	struct dlm_work_item *item;
-	item = kcalloc(1, sizeof(*item), GFP_KERNEL);
+	item = kcalloc(1, sizeof(*item), GFP_NOFS);
 	if (!item)
 		return -ENOMEM;
 
@@ -2175,14 +2175,14 @@ int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
 	 */
 
 	ret = -ENOMEM;
-	mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_KERNEL);
+	mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS);
 	if (!mres) {
 		mlog_errno(ret);
 		goto leave;
 	}
 
 	mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
-								GFP_KERNEL);
+								GFP_NOFS);
 	if (!mle) {
 		mlog_errno(ret);
 		goto leave;
@@ -2639,7 +2639,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
 
 	/* preallocate.. if this fails, abort */
 	mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
-							 GFP_KERNEL);
+							 GFP_NOFS);
 
 	if (!mle) {
 		ret = -ENOMEM;
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index b03fecab299..f76b498a174 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -737,7 +737,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
 		}
 		BUG_ON(num == dead_node);
 
-		ndata = kcalloc(1, sizeof(*ndata), GFP_KERNEL);
+		ndata = kcalloc(1, sizeof(*ndata), GFP_NOFS);
 		if (!ndata) {
 			dlm_destroy_recovery_area(dlm, dead_node);
 			return -ENOMEM;
@@ -822,14 +822,14 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data)
 	}
 	BUG_ON(lr->dead_node != dlm->reco.dead_node);
 
-	item = kcalloc(1, sizeof(*item), GFP_KERNEL);
+	item = kcalloc(1, sizeof(*item), GFP_NOFS);
 	if (!item) {
 		dlm_put(dlm);
 		return -ENOMEM;
 	}
 
 	/* this will get freed by dlm_request_all_locks_worker */
-	buf = (char *) __get_free_page(GFP_KERNEL);
+	buf = (char *) __get_free_page(GFP_NOFS);
 	if (!buf) {
 		kfree(item);
 		dlm_put(dlm);
@@ -1302,8 +1302,8 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
 		mlog(0, "all done flag.  all lockres data received!\n");
 
 	ret = -ENOMEM;
-	buf = kmalloc(be16_to_cpu(msg->data_len), GFP_KERNEL);
-	item = kcalloc(1, sizeof(*item), GFP_KERNEL);
+	buf = kmalloc(be16_to_cpu(msg->data_len), GFP_NOFS);
+	item = kcalloc(1, sizeof(*item), GFP_NOFS);
 	if (!buf || !item)
 		goto leave;
 
diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c
index 74ca4e5f976..e641b084b34 100644
--- a/fs/ocfs2/dlm/userdlm.c
+++ b/fs/ocfs2/dlm/userdlm.c
@@ -672,7 +672,7 @@ struct dlm_ctxt *user_dlm_register_context(struct qstr *name)
 	u32 dlm_key;
 	char *domain;
 
-	domain = kmalloc(name->len + 1, GFP_KERNEL);
+	domain = kmalloc(name->len + 1, GFP_NOFS);
 	if (!domain) {
 		mlog_errno(-ENOMEM);
 		return ERR_PTR(-ENOMEM);
-- 
cgit v1.2.3


From f85cd47a5825b77a146bad6870b2ddcf08415c13 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Mon, 1 May 2006 14:27:41 -0700
Subject: ocfs2: use cond_resched() in dlm_thread()

yield() does not yield.  cond_resched() does.

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmthread.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index c1c10fd1a5a..0c822f3ffb0 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -731,7 +731,7 @@ in_progress:
 
 		/* yield and continue right away if there is more work to do */
 		if (!n) {
-			yield();
+			cond_resched();
 			continue;
 		}
 
-- 
cgit v1.2.3


From b220532a71adf65d45c3aa8a284bfa7ec57957bd Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Mon, 1 May 2006 14:29:28 -0700
Subject: ocfs2: retry operations when a lock is marked in recovery

Before checking for a nonexistent lock, make sure the lockres is not marked
RECOVERING. The caller will just retry and the state should be fixed up when
recovery completes.

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmconvert.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 90cbaaff33a..2f7ad526128 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -464,6 +464,12 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
 	}
 
 	spin_lock(&res->spinlock);
+	status = __dlm_lockres_state_to_status(res);
+	if (status != DLM_NORMAL) {
+		spin_unlock(&res->spinlock);
+		dlm_error(status);
+		goto leave;
+	}
 	list_for_each(iter, &res->granted) {
 		lock = list_entry(iter, struct dlm_lock, list);
 		if (lock->ml.cookie == cnv->cookie &&
@@ -473,6 +479,20 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
 		}
 		lock = NULL;
 	}
+	if (!lock) {
+		__dlm_print_one_lock_resource(res);
+		list_for_each(iter, &res->granted) {
+			lock = list_entry(iter, struct dlm_lock, list);
+			if (lock->ml.node == cnv->node_idx) {
+				mlog(0, "There is something here "
+				     "for node %u, lock->ml.cookie=%llu, "
+				     "cnv->cookie=%llu\n", cnv->node_idx,
+				     lock->ml.cookie, cnv->cookie);
+				break;
+			}
+		}
+		lock = NULL;
+	}
 	spin_unlock(&res->spinlock);
 	if (!lock) {
 		status = DLM_IVLOCKID;
-- 
cgit v1.2.3


From 44a7f1d063bbe45773353903f36d9d88fb73d82a Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Mon, 1 May 2006 14:29:59 -0700
Subject: ocfs2: mlog in dlm_convert_lock_handler() should be ML_ERROR

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmconvert.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 2f7ad526128..b24fa537a76 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -484,7 +484,7 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
 		list_for_each(iter, &res->granted) {
 			lock = list_entry(iter, struct dlm_lock, list);
 			if (lock->ml.node == cnv->node_idx) {
-				mlog(0, "There is something here "
+				mlog(ML_ERROR, "There is something here "
 				     "for node %u, lock->ml.cookie=%llu, "
 				     "cnv->cookie=%llu\n", cnv->node_idx,
 				     lock->ml.cookie, cnv->cookie);
-- 
cgit v1.2.3


From 56a7c104bc91b4a5f970d7372ebb04eebc633c68 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Mon, 1 May 2006 14:30:39 -0700
Subject: ocfs2: display message before waiting for recovery to complete

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmrecovery.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index f76b498a174..08336aec13d 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -409,6 +409,13 @@ static int dlm_in_recovery(struct dlm_ctxt *dlm)
 
 void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
 {
+	if (dlm_in_recovery(dlm)) {
+		mlog(ML_NOTICE, "%s: reco thread %d in recovery: "
+		     "state=%d, master=%u, dead=%u\n",
+		     dlm->name, dlm->dlm_reco_thread_task->pid,
+		     dlm->reco.state, dlm->reco.new_master,
+		     dlm->reco.dead_node);
+	}
 	wait_event(dlm->reco.event, !dlm_in_recovery(dlm));
 }
 
-- 
cgit v1.2.3


From 3b3b84a892d37ba336391e411eb5f8b013b9a669 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Mon, 1 May 2006 14:31:37 -0700
Subject: ocfs2: tune down some noisy messages during dlm recovery

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmmaster.c   | 11 ++++++-----
 fs/ocfs2/dlm/dlmrecovery.c |  2 +-
 2 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index b780e159dad..f186e090a10 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1639,13 +1639,13 @@ again:
 		tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
 					    &assert, sizeof(assert), to, &r);
 		if (tmpret < 0) {
-			mlog(ML_ERROR, "assert_master returned %d!\n", tmpret);
+			mlog(0, "assert_master returned %d!\n", tmpret);
 			if (!dlm_is_host_down(tmpret)) {
-				mlog(ML_ERROR, "unhandled error!\n");
+				mlog(ML_ERROR, "unhandled error=%d!\n", tmpret);
 				BUG();
 			}
 			/* a node died.  finish out the rest of the nodes. */
-			mlog(ML_ERROR, "link to %d went down!\n", to);
+			mlog(0, "link to %d went down!\n", to);
 			/* any nonzero status return will do */
 			ret = tmpret;
 		} else if (r < 0) {
@@ -2029,7 +2029,8 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
 				   nodemap, flags);
 	if (ret < 0) {
 		/* no need to restart, we are done */
-		mlog_errno(ret);
+		if (!dlm_is_host_down(ret))
+			mlog_errno(ret);
 	}
 
 	/* Ok, we've asserted ourselves.  Let's let migration start. */
@@ -2808,7 +2809,7 @@ top:
 				 * may result in the mle being unlinked and
 				 * freed, but there may still be a process
 				 * waiting in the dlmlock path which is fine. */
-				mlog(ML_ERROR, "node %u was expected master\n",
+				mlog(0, "node %u was expected master\n",
 				     dead_node);
 				atomic_set(&mle->woken, 1);
 				spin_unlock(&mle->spinlock);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 08336aec13d..f0549e9f002 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -410,7 +410,7 @@ static int dlm_in_recovery(struct dlm_ctxt *dlm)
 void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
 {
 	if (dlm_in_recovery(dlm)) {
-		mlog(ML_NOTICE, "%s: reco thread %d in recovery: "
+		mlog(0, "%s: reco thread %d in recovery: "
 		     "state=%d, master=%u, dead=%u\n",
 		     dlm->name, dlm->dlm_reco_thread_task->pid,
 		     dlm->reco.state, dlm->reco.new_master,
-- 
cgit v1.2.3


From 495ac96e638cb0ad33baa7113531d742bfb328d4 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Mon, 1 May 2006 14:34:08 -0700
Subject: ocfs2: fix incorrect error returns

Use DLM_REJECTED instead of DLM_RECOVERING.

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmlock.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index e3cab0e4850..d6f89577e25 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -299,7 +299,7 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
 	if (tmpret >= 0) {
 		// successfully sent and received
 		ret = status;  // this is already a dlm_status
-		if (ret == DLM_RECOVERING) {
+		if (ret == DLM_REJECTED) {
 			mlog(ML_ERROR, "%s:%.*s: BUG.  this is a stale lockres "
 			     "no longer owned by %u.  that node is coming back "
 			     "up currently.\n", dlm->name, create.namelen,
@@ -457,7 +457,7 @@ int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data)
 
 	name = create->name;
 	namelen = create->namelen;
-	status = DLM_RECOVERING;
+	status = DLM_REJECTED;
 	if (!dlm_domain_fully_joined(dlm)) {
 		mlog(ML_ERROR, "Domain %s not fully joined, but node %u is "
 		     "sending a create_lock message for lock %.*s!\n",
-- 
cgit v1.2.3


From 3156d267016627fe427a6b0d4ed8a9678557e91e Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Mon, 1 May 2006 14:39:29 -0700
Subject: ocfs2: move dlm work to a private work queue

The work that is done can block for long periods of time and so is not
appropriate for keventd.

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmcommon.h   |  1 +
 fs/ocfs2/dlm/dlmdomain.c   | 19 +++++++++++++++++++
 fs/ocfs2/dlm/dlmmaster.c   |  2 +-
 fs/ocfs2/dlm/dlmrecovery.c | 13 +++++++++++--
 4 files changed, 32 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 9bea5c6ef9b..9bdc9cf6599 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -131,6 +131,7 @@ struct dlm_ctxt
 	struct o2hb_callback_func dlm_hb_down;
 	struct task_struct *dlm_thread_task;
 	struct task_struct *dlm_reco_thread_task;
+	struct workqueue_struct *dlm_worker;
 	wait_queue_head_t dlm_thread_wq;
 	wait_queue_head_t dlm_reco_thread_wq;
 	wait_queue_head_t ast_wq;
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 3511930a34e..c3462ea694e 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -303,11 +303,21 @@ int dlm_domain_fully_joined(struct dlm_ctxt *dlm)
 	return ret;
 }
 
+static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm)
+{
+	if (dlm->dlm_worker) {
+		flush_workqueue(dlm->dlm_worker);
+		destroy_workqueue(dlm->dlm_worker);
+		dlm->dlm_worker = NULL;
+	}
+}
+
 static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
 {
 	dlm_unregister_domain_handlers(dlm);
 	dlm_complete_thread(dlm);
 	dlm_complete_recovery_thread(dlm);
+	dlm_destroy_dlm_worker(dlm);
 
 	/* We've left the domain. Now we can take ourselves out of the
 	 * list and allow the kref stuff to help us free the
@@ -1151,6 +1161,13 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
 		goto bail;
 	}
 
+	dlm->dlm_worker = create_singlethread_workqueue("dlm_wq");
+	if (!dlm->dlm_worker) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
 	do {
 		unsigned int backoff;
 		status = dlm_try_to_join_domain(dlm);
@@ -1191,6 +1208,7 @@ bail:
 		dlm_unregister_domain_handlers(dlm);
 		dlm_complete_thread(dlm);
 		dlm_complete_recovery_thread(dlm);
+		dlm_destroy_dlm_worker(dlm);
 	}
 
 	return status;
@@ -1256,6 +1274,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 
 	dlm->dlm_thread_task = NULL;
 	dlm->dlm_reco_thread_task = NULL;
+	dlm->dlm_worker = NULL;
 	init_waitqueue_head(&dlm->dlm_thread_wq);
 	init_waitqueue_head(&dlm->dlm_reco_thread_wq);
 	init_waitqueue_head(&dlm->reco.event);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index f186e090a10..733d9a52887 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1962,7 +1962,7 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
 	list_add_tail(&item->list, &dlm->work_list);
 	spin_unlock(&dlm->work_lock);
 
-	schedule_work(&dlm->dispatched_work);
+	queue_work(dlm->dlm_worker, &dlm->dispatched_work);
 	return 0;
 }
 
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index f0549e9f002..22e6a5b6958 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -157,11 +157,20 @@ void dlm_dispatch_work(void *data)
 	struct list_head *iter, *iter2;
 	struct dlm_work_item *item;
 	dlm_workfunc_t *workfunc;
+	int tot=0;
+
+	if (!dlm_joined(dlm))
+		return;
 
 	spin_lock(&dlm->work_lock);
 	list_splice_init(&dlm->work_list, &tmp_list);
 	spin_unlock(&dlm->work_lock);
 
+	list_for_each_safe(iter, iter2, &tmp_list) {
+		tot++;
+	}
+	mlog(0, "%s: work thread has %d work items\n", dlm->name, tot);
+
 	list_for_each_safe(iter, iter2, &tmp_list) {
 		item = list_entry(iter, struct dlm_work_item, list);
 		workfunc = item->func;
@@ -851,7 +860,7 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data)
 	spin_lock(&dlm->work_lock);
 	list_add_tail(&item->list, &dlm->work_list);
 	spin_unlock(&dlm->work_lock);
-	schedule_work(&dlm->dispatched_work);
+	queue_work(dlm->dlm_worker, &dlm->dispatched_work);
 
 	dlm_put(dlm);
 	return 0;
@@ -1401,7 +1410,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
 	spin_lock(&dlm->work_lock);
 	list_add_tail(&item->list, &dlm->work_list);
 	spin_unlock(&dlm->work_lock);
-	schedule_work(&dlm->dispatched_work);
+	queue_work(dlm->dlm_worker, &dlm->dispatched_work);
 
 leave:
 	dlm_put(dlm);
-- 
cgit v1.2.3


From 0032abd674a606c3cf2d9961c8119f5d72c411d0 Mon Sep 17 00:00:00 2001
From: Kurt Hackel <kurt.hackel@oracle.com>
Date: Mon, 1 May 2006 14:39:57 -0700
Subject: ocfs2: remove whitespace in dlmunlock.c

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmunlock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 9b40d60e3b4..b0c3134f4f7 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -317,7 +317,7 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
 	size_t veclen = 1;
 
 	mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
-			
+
 	if (owner == dlm->node_num) {
 		/* ended up trying to contact ourself.  this means
 		 * that the lockres had been remote but became local
-- 
cgit v1.2.3


From 8a9343fa24d8d3fcb189bed2b7afcf4b8a8c1c8d Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Mon, 1 May 2006 14:55:10 -0700
Subject: ocfs2: dlm_print_one_mle() needs to be defined

Fixes compile breakage.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmmaster.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 733d9a52887..1b38c558f2a 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -128,11 +128,8 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
 	return 1;
 }
 
-#if 0
-/* Code here is included but defined out as it aids debugging */
-
 #define dlm_print_nodemap(m)  _dlm_print_nodemap(m,#m)
-void _dlm_print_nodemap(unsigned long *map, const char *mapname)
+static void _dlm_print_nodemap(unsigned long *map, const char *mapname)
 {
 	int i;
 	printk("%s=[ ", mapname);
@@ -142,7 +139,7 @@ void _dlm_print_nodemap(unsigned long *map, const char *mapname)
 	printk("]");
 }
 
-void dlm_print_one_mle(struct dlm_master_list_entry *mle)
+static void dlm_print_one_mle(struct dlm_master_list_entry *mle)
 {
 	int refs;
 	char *type;
@@ -189,6 +186,9 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle)
 	printk("\n");
 }
 
+#if 0
+/* Code here is included but defined out as it aids debugging */
+
 static void dlm_dump_mles(struct dlm_ctxt *dlm)
 {
 	struct dlm_master_list_entry *mle;
@@ -943,7 +943,7 @@ wait:
 			     dlm->name, res->lockname.len, 
 			     res->lockname.name, blocked);
 			dlm_print_one_lock_resource(res);
-			/* dlm_print_one_mle(mle); */
+			dlm_print_one_mle(mle);
 			tries = 0;
 		}
 		goto redo_request;
-- 
cgit v1.2.3


From 43dee336c903fae15783b90983dfdedd2c7ffefc Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Mon, 1 May 2006 14:56:57 -0700
Subject: ocfs2: fix compiler warnings in dlm_convert_lock_handler()

We need to cast to unsigned long long.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmconvert.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index b24fa537a76..c764dc8e40a 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -487,7 +487,8 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
 				mlog(ML_ERROR, "There is something here "
 				     "for node %u, lock->ml.cookie=%llu, "
 				     "cnv->cookie=%llu\n", cnv->node_idx,
-				     lock->ml.cookie, cnv->cookie);
+				     (unsigned long long)lock->ml.cookie,
+				     (unsigned long long)cnv->cookie);
 				break;
 			}
 		}
-- 
cgit v1.2.3


From 3fb5a9891dbb553dda96783dbc0dc4e77cbb2529 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Tue, 16 May 2006 17:26:41 +0200
Subject: [PATCH] fs/ocfs2/dlm/: cleanups

This patch #if 0's the no longer used dlm_dump_lock_resources().

Since this makes dlmdebug.h empty, this patch also removes this header.

Additionally, the needlessly global dlm_is_node_recovered() is made
static.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmdebug.c    |  4 ++--
 fs/ocfs2/dlm/dlmdebug.h    | 30 ------------------------------
 fs/ocfs2/dlm/dlmdomain.c   |  1 -
 fs/ocfs2/dlm/dlmmaster.c   |  1 -
 fs/ocfs2/dlm/dlmrecovery.c |  2 +-
 5 files changed, 3 insertions(+), 35 deletions(-)
 delete mode 100644 fs/ocfs2/dlm/dlmdebug.h

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index e119e4ddf93..3f6c8d88f7a 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -37,10 +37,8 @@
 
 #include "dlmapi.h"
 #include "dlmcommon.h"
-#include "dlmdebug.h"
 
 #include "dlmdomain.h"
-#include "dlmdebug.h"
 
 #define MLOG_MASK_PREFIX ML_DLM
 #include "cluster/masklog.h"
@@ -120,6 +118,7 @@ void dlm_print_one_lock(struct dlm_lock *lockid)
 }
 EXPORT_SYMBOL_GPL(dlm_print_one_lock);
 
+#if 0
 void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
 {
 	struct dlm_lock_resource *res;
@@ -142,6 +141,7 @@ void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
 	}
 	spin_unlock(&dlm->spinlock);
 }
+#endif  /*  0  */
 
 static const char *dlm_errnames[] = {
 	[DLM_NORMAL] =			"DLM_NORMAL",
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
deleted file mode 100644
index 6858510c3cc..00000000000
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmdebug.h
- *
- * Copyright (C) 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
- */
-
-#ifndef DLMDEBUG_H
-#define DLMDEBUG_H
-
-void dlm_dump_lock_resources(struct dlm_ctxt *dlm);
-
-#endif
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index c3462ea694e..ba27c5c5e95 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -41,7 +41,6 @@
 #include "dlmapi.h"
 #include "dlmcommon.h"
 
-#include "dlmdebug.h"
 #include "dlmdomain.h"
 
 #include "dlmver.h"
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 1b38c558f2a..1b8346dd057 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -47,7 +47,6 @@
 
 #include "dlmapi.h"
 #include "dlmcommon.h"
-#include "dlmdebug.h"
 #include "dlmdomain.h"
 
 #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 22e6a5b6958..da399013516 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -354,7 +354,7 @@ int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node)
 
 /* returns true if node is no longer in the domain
  * could be dead or just not joined */
-int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node)
+static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node)
 {
 	int recovered;
 	spin_lock(&dlm->spinlock);
-- 
cgit v1.2.3


From b71d300c8bc0f3617eef4a9c96aebbadeb259177 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 27 Jun 2006 12:45:17 +1000
Subject: [XFS] link(2) on directory is banned in VFS.

SGI-PV: 904196
SGI-Modid: xfs-linux-melb:xfs-kern:26293a

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/xfs_vnodeops.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 00a6b7dc24a..23cfa583772 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -2603,8 +2603,7 @@ xfs_link(
 	vn_trace_entry(src_vp, __FUNCTION__, (inst_t *)__return_address);
 
 	target_namelen = VNAMELEN(dentry);
-	if (VN_ISDIR(src_vp))
-		return XFS_ERROR(EPERM);
+	ASSERT(!VN_ISDIR(src_vp));
 
 	sip = xfs_vtoi(src_vp);
 	tdp = XFS_BHVTOI(target_dir_bdp);
@@ -2699,9 +2698,8 @@ xfs_link(
 	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
 
 	error = xfs_bumplink(tp, sip);
-	if (error) {
+	if (error)
 		goto abort_return;
-	}
 
 	/*
 	 * If this is a synchronous mount, make sure that the
@@ -2719,9 +2717,8 @@ xfs_link(
 	}
 
 	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
-	if (error) {
+	if (error)
 		goto std_return;
-	}
 
 	/* Fall through to std_return with error = 0. */
 std_return:
@@ -2742,6 +2739,8 @@ std_return:
 	xfs_trans_cancel(tp, cancel_flags);
 	goto std_return;
 }
+
+
 /*
  * xfs_mkdir
  *
-- 
cgit v1.2.3


From 71306f3b880539fb4c579fbd16da552ebb10f29b Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 27 Jun 2006 14:10:29 +1000
Subject: [XFS] * There is trivial "inode => vnode => inode" conversion, but
 only flags and   mode of final inode are looked at. Pass original inode
 instead. * Two occurences of bhv_vnode_t go out.

SGI-PV: 904196
SGI-Modid: xfs-linux-melb:xfs-kern:26298a

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/linux-2.6/xfs_vnode.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index 35c6a01963a..c42b3221b20 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -93,7 +93,7 @@ typedef enum {
  */
 static inline struct bhv_vnode *vn_from_inode(struct inode *inode)
 {
-	return (bhv_vnode_t *)list_entry(inode, bhv_vnode_t, v_inode);
+	return container_of(inode, bhv_vnode_t, v_inode);
 }
 static inline struct inode *vn_to_inode(struct bhv_vnode *vnode)
 {
-- 
cgit v1.2.3


From b3bbed1d08e925a12ebbe6326d17554902b3d9ab Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 27 Jun 2006 16:12:15 +1000
Subject: [XFS] remove unused behaviour lock - shrink XFS vnode as a side
 effect.

SGI-PV: 904196
SGI-Modid: xfs-linux-melb:xfs-kern:26299a

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/xfs_behavior.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_behavior.h b/fs/xfs/xfs_behavior.h
index 1d8ff103201..6e6e56fb352 100644
--- a/fs/xfs/xfs_behavior.h
+++ b/fs/xfs/xfs_behavior.h
@@ -78,15 +78,12 @@
  *
  */
 
-struct bhv_head_lock;
-
 /*
  * Behavior head.  Head of the chain of behaviors.
  * Contained within each virtualized object data structure.
  */
 typedef struct bhv_head {
 	struct bhv_desc *bh_first;	/* first behavior in chain */
-	struct bhv_head_lock *bh_lockp;	/* pointer to lock info struct */
 } bhv_head_t;
 
 /*
-- 
cgit v1.2.3


From 1998764e5a1700ffad2cab29b7b2a175364e5b70 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 27 Jun 2006 16:12:40 +1000
Subject: [XFS] Reduce size of xfs_trans_t structure. * remove ->t_forw,
 ->t_back -- unused * ->t_ag_freeblks_delta, ->t_ag_flist_delta,
 ->t_ag_btree_delta are debugging aid -- wrap them in everyone's favourite
 way.    As a result, cut "xfs_trans" slab object size from 592 to 572 bytes
 here.

SGI-PV: 904196
SGI-Modid: xfs-linux-melb:xfs-kern:26319a

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/xfs_trans.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index cb65c3a603f..9dc88b38060 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -338,8 +338,6 @@ typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *);
 typedef struct xfs_trans {
 	unsigned int		t_magic;	/* magic number */
 	xfs_log_callback_t	t_logcb;	/* log callback struct */
-	struct xfs_trans	*t_forw;	/* async list pointers */
-	struct xfs_trans	*t_back;	/* async list pointers */
 	unsigned int		t_type;		/* transaction type */
 	unsigned int		t_log_res;	/* amt of log space resvd */
 	unsigned int		t_log_count;	/* count for perm log res */
@@ -364,9 +362,11 @@ typedef struct xfs_trans {
 	long			t_res_fdblocks_delta; /* on-disk only chg */
 	long			t_frextents_delta;/* superblock freextents chg*/
 	long			t_res_frextents_delta; /* on-disk only chg */
+#ifdef DEBUG
 	long			t_ag_freeblks_delta; /* debugging counter */
 	long			t_ag_flist_delta; /* debugging counter */
 	long			t_ag_btree_delta; /* debugging counter */
+#endif
 	long			t_dblocks_delta;/* superblock dblocks change */
 	long			t_agcount_delta;/* superblock agcount change */
 	long			t_imaxpct_delta;/* superblock imaxpct change */
-- 
cgit v1.2.3


From ebe1090549a7821faac09c467fc80b2245d0d30e Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Tue, 27 Jun 2006 16:13:02 +1000
Subject: [XFS] Remove a couple of no-longer-used macros.

SGI-PV: 904196
SGI-Modid: xfs-linux-melb:xfs-kern:26339a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/linux-2.6/xfs_linux.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index aa26ab906c8..028eb17ec2e 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -140,9 +140,7 @@ BUFFER_FNS(PrivateStart, unwritten);
 #define current_pid()		(current->pid)
 #define current_fsuid(cred)	(current->fsuid)
 #define current_fsgid(cred)	(current->fsgid)
-#define current_set_flags(f)	(current->flags |= (f))
 #define current_test_flags(f)	(current->flags & (f))
-#define current_clear_flags(f)	(current->flags & ~(f))
 #define current_set_flags_nested(sp, f)		\
 		(*(sp) = current->flags, current->flags |= (f))
 #define current_clear_flags_nested(sp, f)	\
-- 
cgit v1.2.3


From 05a3332885dd143496694bcecff223339880d7c9 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 27 Jun 2006 16:13:29 +1000
Subject: [XFS] Remove redundant directory checks from inode link operation.

SGI-PV: 904196
SGI-Modid: xfs-linux-melb:xfs-kern:26343a

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/linux-2.6/xfs_iops.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 12810baeb5d..b3b46457f15 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -419,9 +419,6 @@ xfs_vn_link(
 	int		error;
 
 	ip = old_dentry->d_inode;	/* inode being linked to */
-	if (S_ISDIR(ip->i_mode))
-		return -EPERM;
-
 	tdvp = vn_from_inode(dir);
 	vp = vn_from_inode(ip);
 
-- 
cgit v1.2.3


From 97dfd70c8958a634157e0b35711cca01ff6f959b Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Tue, 27 Jun 2006 16:13:46 +1000
Subject: [XFS] Remove a race condition where a linked inode could BUG_ON in
 d_instantiate, due to fast transaction committal removing the last remaining
 reference before we were all done.

SGI-PV: 953287
SGI-Modid: xfs-linux-melb:xfs-kern:26347a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/linux-2.6/xfs_iops.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index b3b46457f15..d9180020de6 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -422,10 +422,12 @@ xfs_vn_link(
 	tdvp = vn_from_inode(dir);
 	vp = vn_from_inode(ip);
 
+	VN_HOLD(vp);
 	error = bhv_vop_link(tdvp, vp, dentry, NULL);
-	if (likely(!error)) {
+	if (unlikely(error)) {
+		VN_RELE(vp);
+	} else {
 		VMODIFY(tdvp);
-		VN_HOLD(vp);
 		xfs_validate_fields(ip, &vattr);
 		d_instantiate(dentry, ip);
 	}
-- 
cgit v1.2.3


From 750d1151a6c95ef9b9a188bb7cff6b80ee30da17 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 27 Jun 2006 06:28:30 +0000
Subject: [CIFS] Fix allocation of buffers for new session setup routine to
 allow longer user and domain names and allow passing sec options on mount

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES    |  3 ++-
 fs/cifs/README     |  5 ++++-
 fs/cifs/cifsglob.h |  1 +
 fs/cifs/cifssmb.c  | 18 +++++++++++++-----
 fs/cifs/connect.c  | 23 ++++++++++++++---------
 fs/cifs/sess.c     | 44 ++++++++++++++++++++++++--------------------
 6 files changed, 58 insertions(+), 36 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 79a202b8f66..a61d17ed182 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -7,7 +7,8 @@ so we can do search (ls etc.) to OS/2.  Do not send NTCreateX
 or recent levels of FindFirst unless server says it supports NT SMBs
 (instead use legacy equivalents from LANMAN dialect). Fix to allow
 NTLMv2 authentication support (now can use stronger password hashing
-on mount if corresponding /proc/fs/cifs/SecurityFlags is set (0x4004)
+on mount if corresponding /proc/fs/cifs/SecurityFlags is set (0x4004).
+Allow override of global cifs security flags on mount via "sec=" option(s).
 
 Version 1.43
 ------------
diff --git a/fs/cifs/README b/fs/cifs/README
index 46c2cfa5acf..7986d0d97ac 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -443,7 +443,10 @@ A partial list of the supported mount options follows:
 		SFU does).  In the future the bottom 9 bits of the mode
 		mode also will be emulated using queries of the security
 		descriptor (ACL).
-sec		Security mode.  Allowed values are:
+ sign           Must use packet signing (helps avoid unwanted data modification
+		by intermediate systems in the route).  Note that signing
+		does not work with lanman or plaintext authentication.
+ sec            Security mode.  Allowed values are:
 			none	attempt to connection as a null user (no name)
 			krb5    Use Kerberos version 5 authentication
 			krb5i   Use Kerberos authentication and packet signing
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 87453a6bcaf..6d7cf5f3bc0 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -186,6 +186,7 @@ struct cifsSesInfo {
 	struct TCP_Server_Info *server;	/* pointer to server info */
 	atomic_t inUse; /* # of mounts (tree connections) on this ses */
 	enum statusEnum status;
+	unsigned overrideSecFlg;  /* if non-zero override global sec flags */
 	__u16 ipc_tid;		/* special tid for connection to IPC share */
 	__u16 flags;
 	char *serverOS;		/* name of operating system underlying server */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 38f83db2076..de405bfb67d 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -396,6 +396,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	int i;
 	struct TCP_Server_Info * server;
 	u16 count;
+	unsigned int secFlags;
 
 	if(ses->server)
 		server = ses->server;
@@ -407,9 +408,16 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 		      (void **) &pSMB, (void **) &pSMBr);
 	if (rc)
 		return rc;
+
+	/* if any of auth flags (ie not sign or seal) are overriden use them */
+	if(ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
+		secFlags = ses->overrideSecFlg;
+	else /* if override flags set only sign/seal OR them with global auth */
+		secFlags = extended_security | ses->overrideSecFlg;
+
 	pSMB->hdr.Mid = GetNextMid(server);
 	pSMB->hdr.Flags2 |= SMBFLG2_UNICODE;
-	if((extended_security & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
+	if((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
 		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
 	
 	count = 0;
@@ -439,8 +447,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			&& (pSMBr->DialectIndex == LANMAN_PROT)) {
 		struct lanman_neg_rsp * rsp = (struct lanman_neg_rsp *)pSMBr;
 
-		if((extended_security & CIFSSEC_MAY_LANMAN) || 
-			(extended_security & CIFSSEC_MAY_PLNTXT))
+		if((secFlags & CIFSSEC_MAY_LANMAN) || 
+			(secFlags & CIFSSEC_MAY_PLNTXT))
 			server->secType = LANMAN;
 		else {
 			cERROR(1, ("mount failed weak security disabled"
@@ -498,12 +506,12 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 
 	if((server->secMode & SECMODE_PW_ENCRYPT) == 0)
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
-		if ((extended_security & CIFSSEC_MAY_PLNTXT) == 0)
+		if ((secFlags & CIFSSEC_MAY_PLNTXT) == 0)
 #endif /* CIFS_WEAK_PW_HASH */
 			cERROR(1,("Server requests plain text password"
 				  " but client support disabled"));
 
-	if(extended_security & CIFSSEC_MUST_NTLMV2)
+	if(secFlags & CIFSSEC_MUST_NTLMV2)
 		server->secType = NTLMv2;
 	else
 		server->secType = NTLM;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index c0f98ddea88..876eb9ef85f 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -915,32 +915,32 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
 				cERROR(1,("no security value specified"));
                                 continue;
                         } else if (strnicmp(value, "krb5i", 5) == 0) {
-				vol->secFlg = CIFSSEC_MAY_KRB5 | 
+				vol->secFlg |= CIFSSEC_MAY_KRB5 | 
 					CIFSSEC_MUST_SIGN;
 			} else if (strnicmp(value, "krb5p", 5) == 0) {
-				/* vol->secFlg = CIFSSEC_MUST_SEAL | 
+				/* vol->secFlg |= CIFSSEC_MUST_SEAL | 
 					CIFSSEC_MAY_KRB5; */ 
 				cERROR(1,("Krb5 cifs privacy not supported"));
 				return 1;
 			} else if (strnicmp(value, "krb5", 4) == 0) {
-				vol->secFlg = CIFSSEC_MAY_KRB5;
+				vol->secFlg |= CIFSSEC_MAY_KRB5;
 			} else if (strnicmp(value, "ntlmv2i", 7) == 0) {
-				vol->secFlg = CIFSSEC_MAY_NTLMV2 |
+				vol->secFlg |= CIFSSEC_MAY_NTLMV2 |
 					CIFSSEC_MUST_SIGN;
 			} else if (strnicmp(value, "ntlmv2", 6) == 0) {
-				vol->secFlg = CIFSSEC_MAY_NTLMV2;
+				vol->secFlg |= CIFSSEC_MAY_NTLMV2;
 			} else if (strnicmp(value, "ntlmi", 5) == 0) {
-				vol->secFlg = CIFSSEC_MAY_NTLM |
+				vol->secFlg |= CIFSSEC_MAY_NTLM |
 					CIFSSEC_MUST_SIGN;
 			} else if (strnicmp(value, "ntlm", 4) == 0) {
 				/* ntlm is default so can be turned off too */
-				vol->secFlg = CIFSSEC_MAY_NTLM;
+				vol->secFlg |= CIFSSEC_MAY_NTLM;
 			} else if (strnicmp(value, "nontlm", 6) == 0) {
 				/* BB is there a better way to do this? */
-				vol->secFlg = CIFSSEC_MAY_NTLMV2;
+				vol->secFlg |= CIFSSEC_MAY_NTLMV2;
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 			} else if (strnicmp(value, "lanman", 6) == 0) {
-                                vol->secFlg = CIFSSEC_MAY_LANMAN;
+                                vol->secFlg |= CIFSSEC_MAY_LANMAN;
 #endif
 			} else if (strnicmp(value, "none", 4) == 0) {
 				vol->nullauth = 1;
@@ -1173,6 +1173,10 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
 			vol->no_psx_acl = 0;
 		} else if (strnicmp(data, "noacl",5) == 0) {
 			vol->no_psx_acl = 1;
+		} else if (strnicmp(data, "sign",4) == 0) {
+			vol->secFlg |= CIFSSEC_MUST_SIGN;
+/*		} else if (strnicmp(data, "seal",4) == 0) {
+			vol->secFlg |= CIFSSEC_MUST_SEAL; */
 		} else if (strnicmp(data, "direct",6) == 0) {
 			vol->direct_io = 1;
 		} else if (strnicmp(data, "forcedirectio",13) == 0) {
@@ -1776,6 +1780,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 						volume_info.domainname);
 			}
 			pSesInfo->linux_uid = volume_info.linux_uid;
+			pSesInfo->overrideSecFlg = volume_info.secFlg;
 			down(&pSesInfo->sesSem);
 			/* BB FIXME need to pass vol->secFlgs BB */
 			rc = cifs_setup_session(xid,pSesInfo, cifs_sb->local_nls);
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 70e32a81c21..7737edd1baf 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -138,7 +138,7 @@ static void ascii_ssetup_strings(char ** pbcc_area, struct cifsSesInfo *ses,
                 strncpy(bcc_ptr, ses->userName, 300);
         }
 	/* BB improve check for overflow */
-        bcc_ptr += strnlen(ses->userName, 200);
+        bcc_ptr += strnlen(ses->userName, 300);
 	*bcc_ptr = 0;
         bcc_ptr++; /* account for null termination */
 
@@ -313,11 +313,12 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 	int wct;
 	struct smb_hdr *smb_buf;
 	char *bcc_ptr;
+	char *str_area;
 	SESSION_SETUP_ANDX *pSMB;
 	__u32 capabilities;
 	int count;
 	int resp_buf_type = 0;
-	struct kvec iov[2];  /* BB split variable length info into 2nd iovec */
+	struct kvec iov[2];
 	enum securityEnum type;
 	__u16 action;
 	int bytes_remaining;
@@ -351,7 +352,18 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 	pSMB = (SESSION_SETUP_ANDX *)smb_buf;
 
 	capabilities = cifs_ssetup_hdr(ses, pSMB);
-	bcc_ptr = pByteArea(smb_buf);
+
+	/* we will send the SMB in two pieces,
+	a fixed length beginning part, and a
+	second part which will include the strings
+	and rest of bcc area, in order to avoid having
+	to do a large buffer 17K allocation */
+        iov[0].iov_base = (char *)pSMB;
+        iov[0].iov_len = smb_buf->smb_buf_length + 4;
+
+	/* 2000 big enough to fit max user, domain, NOS name etc. */
+	str_area = kmalloc(2000, GFP_KERNEL);
+	bcc_ptr = str_area;
 
 	if(type == LANMAN) {
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
@@ -365,10 +377,10 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 
 		calc_lanman_hash(ses, lnm_session_key);
 
-#ifdef CONFIG_CIFS_DEBUG2
+/* #ifdef CONFIG_CIFS_DEBUG2
 		cifs_dump_mem("cryptkey: ",ses->server->cryptKey,
 			CIFS_SESS_KEY_SIZE);
-#endif
+#endif */
 		memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_SESS_KEY_SIZE);
 		bcc_ptr += CIFS_SESS_KEY_SIZE;
 
@@ -377,7 +389,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 		changed to do higher than lanman dialect and
 		we reconnected would we ever calc signing_key? */
 
-		cERROR(1,("Negotiating LANMAN setting up strings"));
+		cFYI(1,("Negotiating LANMAN setting up strings"));
 		/* Unicode not allowed for LANMAN dialects */
 		ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
 #endif    
@@ -396,7 +408,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 
 		if(first_time) /* should this be moved into common code 
 				  with similar ntlmv2 path? */
-			cifs_calculate_mac_key( ses->server->mac_signing_key,
+			cifs_calculate_mac_key(ses->server->mac_signing_key,
 				ntlm_session_key, ses->password);
 		/* copy session key */
 
@@ -454,23 +466,14 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 		/* BB set password lengths */
 	}
 
-	count = (long) bcc_ptr - (long) pByteArea(smb_buf);
+	count = (long) bcc_ptr - (long) str_area;
 	smb_buf->smb_buf_length += count;
 
-	/* if we switch to small buffers, count will need to be fewer
-	   than 383 (strings less than 335 bytes) */
-
 	BCC_LE(smb_buf) = cpu_to_le16(count);
 
-
-	/* BB FIXME check for other non ntlm code paths */
-
-	/* BB check is this too big for a small smb? */
-
-	iov[0].iov_base = (char *)pSMB;
-	iov[0].iov_len = smb_buf->smb_buf_length + 4;
-
-	rc = SendReceive2(xid, ses, iov, 1 /* num_iovecs */, &resp_buf_type, 0);
+	iov[1].iov_base = str_area;
+	iov[1].iov_len = count; 
+	rc = SendReceive2(xid, ses, iov, 2 /* num_iovecs */, &resp_buf_type, 0);
 	/* SMB request buf freed in SendReceive2 */
 
 	cFYI(1,("ssetup rc from sendrecv2 is %d",rc));
@@ -515,6 +518,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 		rc = decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,nls_cp);
 	
 ssetup_exit:
+	kfree(str_area);
 	if(resp_buf_type == CIFS_SMALL_BUFFER) {
 		cFYI(1,("ssetup freeing small buf %p", iov[0].iov_base));
 		cifs_small_buf_release(iov[0].iov_base);
-- 
cgit v1.2.3


From c9f700f840bd481b3e01fcad1ba8da01794a6726 Mon Sep 17 00:00:00 2001
From: KaiGai Kohei <kaigai@ak.jp.nec.com>
Date: Sun, 11 Jun 2006 10:35:15 +0900
Subject: [JFFS2][XATTR] using 'delete marker' for xdatum/xref deletion

- When xdatum is removed, a new xdatum with 'delete marker' is
  written. (version==0xffffffff means 'delete marker')
- When xref is removed, a new xref with 'delete marker' is written.
  (odd-numbered xseqno means 'delete marker')

- delete_xattr_(datum/xref)_delay() are new deletion functions
  are added. We can only use them if we can detect the target
  obsolete xdatum/xref as a orphan or errir one.
  (e.g when inode deletion, or detecting crc error)

[1/3] jffs2-xattr-v6-01-delete_marker.patch

Signed-off-by: KaiGai Kohei <kaigai@ak.jp.nec.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/erase.c       |  19 +-
 fs/jffs2/gc.c          |   5 +-
 fs/jffs2/jffs2_fs_sb.h |   3 +
 fs/jffs2/malloc.c      |   2 +
 fs/jffs2/nodemgmt.c    |  21 +-
 fs/jffs2/scan.c        |  57 ++--
 fs/jffs2/summary.c     |  39 +--
 fs/jffs2/xattr.c       | 736 +++++++++++++++++++++++++++++++------------------
 fs/jffs2/xattr.h       |  40 ++-
 9 files changed, 583 insertions(+), 339 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index 1862e8bc101..1644e340885 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -230,7 +230,6 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c,
 			   at the end of the linked list. Stash it and continue
 			   from the beginning of the list */
 			ic = (struct jffs2_inode_cache *)(*prev);
-			BUG_ON(ic->class != RAWNODE_CLASS_INODE_CACHE);
 			prev = &ic->nodes;
 			continue;
 		}
@@ -254,7 +253,8 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c,
 
 	/* PARANOIA */
 	if (!ic) {
-		printk(KERN_WARNING "inode_cache not found in remove_node_refs()!!\n");
+		JFFS2_WARNING("inode_cache/xattr_datum/xattr_ref"
+			      " not found in remove_node_refs()!!\n");
 		return;
 	}
 
@@ -279,8 +279,19 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c,
 		printk("\n");
 	});
 
-	if (ic->nodes == (void *)ic && ic->nlink == 0)
-		jffs2_del_ino_cache(c, ic);
+	switch (ic->class) {
+#ifdef CONFIG_JFFS2_FS_XATTR
+		case RAWNODE_CLASS_XATTR_DATUM:
+			jffs2_release_xattr_datum(c, (struct jffs2_xattr_datum *)ic);
+			break;
+		case RAWNODE_CLASS_XATTR_REF:
+			jffs2_release_xattr_ref(c, (struct jffs2_xattr_ref *)ic);
+			break;
+#endif
+		default:
+			if (ic->nodes == (void *)ic && ic->nlink == 0)
+				jffs2_del_ino_cache(c, ic);
+	}
 }
 
 void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 477c526d638..f59b147661c 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -275,13 +275,12 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
 	 * We can decide whether this node is inode or xattr by ic->class.     */
 	if (ic->class == RAWNODE_CLASS_XATTR_DATUM
 	    || ic->class == RAWNODE_CLASS_XATTR_REF) {
-		BUG_ON(raw->next_in_ino != (void *)ic);
 		spin_unlock(&c->erase_completion_lock);
 
 		if (ic->class == RAWNODE_CLASS_XATTR_DATUM) {
-			ret = jffs2_garbage_collect_xattr_datum(c, (struct jffs2_xattr_datum *)ic);
+			ret = jffs2_garbage_collect_xattr_datum(c, (struct jffs2_xattr_datum *)ic, raw);
 		} else {
-			ret = jffs2_garbage_collect_xattr_ref(c, (struct jffs2_xattr_ref *)ic);
+			ret = jffs2_garbage_collect_xattr_ref(c, (struct jffs2_xattr_ref *)ic, raw);
 		}
 		goto release_sem;
 	}
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 935fec1b120..b98594992ee 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -119,8 +119,11 @@ struct jffs2_sb_info {
 #ifdef CONFIG_JFFS2_FS_XATTR
 #define XATTRINDEX_HASHSIZE	(57)
 	uint32_t highest_xid;
+	uint32_t highest_xseqno;
 	struct list_head xattrindex[XATTRINDEX_HASHSIZE];
 	struct list_head xattr_unchecked;
+	struct list_head xattr_dead_list;
+	struct jffs2_xattr_ref *xref_dead_list;
 	struct jffs2_xattr_ref *xref_temp;
 	struct rw_semaphore xattr_sem;
 	uint32_t xdatum_mem_usage;
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index 4889d0700c0..8310c95478e 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -291,6 +291,7 @@ struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void)
 
 	memset(xd, 0, sizeof(struct jffs2_xattr_datum));
 	xd->class = RAWNODE_CLASS_XATTR_DATUM;
+	xd->node = (void *)xd;
 	INIT_LIST_HEAD(&xd->xindex);
 	return xd;
 }
@@ -309,6 +310,7 @@ struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void)
 
 	memset(ref, 0, sizeof(struct jffs2_xattr_ref));
 	ref->class = RAWNODE_CLASS_XATTR_REF;
+	ref->node = (void *)ref;
 	return ref;
 }
 
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 8bedfd2ff68..01594a2256e 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -684,19 +684,26 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 		spin_lock(&c->erase_completion_lock);
 
 		ic = jffs2_raw_ref_to_ic(ref);
-		/* It seems we should never call jffs2_mark_node_obsolete() for
-		   XATTR nodes.... yet. Make sure we notice if/when we change
-		   that :) */
-		BUG_ON(ic->class != RAWNODE_CLASS_INODE_CACHE);
 		for (p = &ic->nodes; (*p) != ref; p = &((*p)->next_in_ino))
 			;
 
 		*p = ref->next_in_ino;
 		ref->next_in_ino = NULL;
 
-		if (ic->nodes == (void *)ic && ic->nlink == 0)
-			jffs2_del_ino_cache(c, ic);
-
+		switch (ic->class) {
+#ifdef CONFIG_JFFS2_FS_XATTR
+			case RAWNODE_CLASS_XATTR_DATUM:
+				jffs2_release_xattr_datum(c, (struct jffs2_xattr_datum *)ic);
+				break;
+			case RAWNODE_CLASS_XATTR_REF:
+				jffs2_release_xattr_ref(c, (struct jffs2_xattr_ref *)ic);
+				break;
+#endif
+			default:
+				if (ic->nodes == (void *)ic && ic->nlink == 0)
+					jffs2_del_ino_cache(c, ic);
+				break;
+		}
 		spin_unlock(&c->erase_completion_lock);
 	}
 
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 61618080b86..79638f56c5e 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -317,20 +317,25 @@ static int jffs2_scan_xattr_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
 				 struct jffs2_summary *s)
 {
 	struct jffs2_xattr_datum *xd;
-	uint32_t totlen, crc;
+	uint32_t xid, version, totlen, crc;
 	int err;
 
 	crc = crc32(0, rx, sizeof(struct jffs2_raw_xattr) - 4);
 	if (crc != je32_to_cpu(rx->node_crc)) {
-		if (je32_to_cpu(rx->node_crc) != 0xffffffff)
-			JFFS2_WARNING("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
-				      ofs, je32_to_cpu(rx->node_crc), crc);
+		JFFS2_WARNING("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
+			      ofs, je32_to_cpu(rx->node_crc), crc);
 		if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(rx->totlen))))
 			return err;
 		return 0;
 	}
 
-	totlen = PAD(sizeof(*rx) + rx->name_len + 1 + je16_to_cpu(rx->value_len));
+	xid = je32_to_cpu(rx->xid);
+	version = je32_to_cpu(rx->version);
+
+	totlen = sizeof(struct jffs2_raw_xattr);
+	if (version != XDATUM_DELETE_MARKER)
+		totlen += rx->name_len + 1 + je16_to_cpu(rx->value_len);
+	totlen = PAD(totlen);
 	if (totlen != je32_to_cpu(rx->totlen)) {
 		JFFS2_WARNING("node length mismatch at %#08x, read=%u, calc=%u\n",
 			      ofs, je32_to_cpu(rx->totlen), totlen);
@@ -339,22 +344,24 @@ static int jffs2_scan_xattr_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
 		return 0;
 	}
 
-	xd = jffs2_setup_xattr_datum(c, je32_to_cpu(rx->xid), je32_to_cpu(rx->version));
-	if (IS_ERR(xd)) {
-		if (PTR_ERR(xd) == -EEXIST) {
-			if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rx->totlen)))))
-				return err;
-			return 0;
-		}
+	xd = jffs2_setup_xattr_datum(c, xid, version);
+	if (IS_ERR(xd))
 		return PTR_ERR(xd);
-	}
-	xd->xprefix = rx->xprefix;
-	xd->name_len = rx->name_len;
-	xd->value_len = je16_to_cpu(rx->value_len);
-	xd->data_crc = je32_to_cpu(rx->data_crc);
 
-	xd->node = jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, totlen, NULL);
-	/* FIXME */ xd->node->next_in_ino = (void *)xd;
+	if (xd->version > version) {
+		struct jffs2_raw_node_ref *raw
+			= jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, totlen, NULL);
+		raw->next_in_ino = xd->node->next_in_ino;
+		xd->node->next_in_ino = raw;
+	} else {
+		xd->version = version;
+		xd->xprefix = rx->xprefix;
+		xd->name_len = rx->name_len;
+		xd->value_len = je16_to_cpu(rx->value_len);
+		xd->data_crc = je32_to_cpu(rx->data_crc);
+
+		jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, totlen, (void *)xd);
+	}
 
 	if (jffs2_sum_active())
 		jffs2_sum_add_xattr_mem(s, rx, ofs - jeb->offset);
@@ -373,9 +380,8 @@ static int jffs2_scan_xref_node(struct jffs2_sb_info *c, struct jffs2_eraseblock
 
 	crc = crc32(0, rr, sizeof(*rr) - 4);
 	if (crc != je32_to_cpu(rr->node_crc)) {
-		if (je32_to_cpu(rr->node_crc) != 0xffffffff)
-			JFFS2_WARNING("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
-				      ofs, je32_to_cpu(rr->node_crc), crc);
+		JFFS2_WARNING("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
+			      ofs, je32_to_cpu(rr->node_crc), crc);
 		if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rr->totlen)))))
 			return err;
 		return 0;
@@ -395,6 +401,7 @@ static int jffs2_scan_xref_node(struct jffs2_sb_info *c, struct jffs2_eraseblock
 		return -ENOMEM;
 
 	/* BEFORE jffs2_build_xattr_subsystem() called, 
+	 * and AFTER xattr_ref is marked as a dead xref,
 	 * ref->xid is used to store 32bit xid, xd is not used
 	 * ref->ino is used to store 32bit inode-number, ic is not used
 	 * Thoes variables are declared as union, thus using those
@@ -404,11 +411,13 @@ static int jffs2_scan_xref_node(struct jffs2_sb_info *c, struct jffs2_eraseblock
 	 */
 	ref->ino = je32_to_cpu(rr->ino);
 	ref->xid = je32_to_cpu(rr->xid);
+	ref->xseqno = je32_to_cpu(rr->xseqno);
+	if (ref->xseqno > c->highest_xseqno)
+		c->highest_xseqno = (ref->xseqno & ~XREF_DELETE_MARKER);
 	ref->next = c->xref_temp;
 	c->xref_temp = ref;
 
-	ref->node = jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, PAD(je32_to_cpu(rr->totlen)), NULL);
-	/* FIXME */ ref->node->next_in_ino = (void *)ref;
+	jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, PAD(je32_to_cpu(rr->totlen)), (void *)ref);
 
 	if (jffs2_sum_active())
 		jffs2_sum_add_xref_mem(s, rr, ofs - jeb->offset);
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 0b02fc79e4d..c430f1d217e 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -310,8 +310,6 @@ int jffs2_sum_add_kvec(struct jffs2_sb_info *c, const struct kvec *invecs,
 #ifdef CONFIG_JFFS2_FS_XATTR
 		case JFFS2_NODETYPE_XATTR: {
 			struct jffs2_sum_xattr_mem *temp;
-			if (je32_to_cpu(node->x.version) == 0xffffffff)
-				return 0;
 			temp = kmalloc(sizeof(struct jffs2_sum_xattr_mem), GFP_KERNEL);
 			if (!temp)
 				goto no_mem;
@@ -327,10 +325,6 @@ int jffs2_sum_add_kvec(struct jffs2_sb_info *c, const struct kvec *invecs,
 		}
 		case JFFS2_NODETYPE_XREF: {
 			struct jffs2_sum_xref_mem *temp;
-
-			if (je32_to_cpu(node->r.ino) == 0xffffffff
-			    && je32_to_cpu(node->r.xid) == 0xffffffff)
-				return 0;
 			temp = kmalloc(sizeof(struct jffs2_sum_xref_mem), GFP_KERNEL);
 			if (!temp)
 				goto no_mem;
@@ -483,22 +477,20 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 
 				xd = jffs2_setup_xattr_datum(c, je32_to_cpu(spx->xid),
 								je32_to_cpu(spx->version));
-				if (IS_ERR(xd)) {
-					if (PTR_ERR(xd) == -EEXIST) {
-						/* a newer version of xd exists */
-						if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(spx->totlen))))
-							return err;
-						sp += JFFS2_SUMMARY_XATTR_SIZE;
-						break;
-					}
-					JFFS2_NOTICE("allocation of xattr_datum failed\n");
+				if (IS_ERR(xd))
 					return PTR_ERR(xd);
+				if (xd->version > je32_to_cpu(spx->version)) {
+					/* node is not the newest one */
+					struct jffs2_raw_node_ref *raw
+						= sum_link_node_ref(c, jeb, je32_to_cpu(spx->offset) | REF_UNCHECKED,
+								    PAD(je32_to_cpu(spx->totlen)), NULL);
+					raw->next_in_ino = xd->node->next_in_ino;
+					xd->node->next_in_ino = raw;
+				} else {
+					xd->version = je32_to_cpu(spx->version);
+					sum_link_node_ref(c, jeb, je32_to_cpu(spx->offset) | REF_UNCHECKED,
+							  PAD(je32_to_cpu(spx->totlen)), (void *)xd);
 				}
-
-				xd->node = sum_link_node_ref(c, jeb, je32_to_cpu(spx->offset) | REF_UNCHECKED,
-							     PAD(je32_to_cpu(spx->totlen)), NULL);
-				/* FIXME */ xd->node->next_in_ino = (void *)xd;
-
 				*pseudo_random += je32_to_cpu(spx->xid);
 				sp += JFFS2_SUMMARY_XATTR_SIZE;
 
@@ -519,14 +511,11 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
 					JFFS2_NOTICE("allocation of xattr_datum failed\n");
 					return -ENOMEM;
 				}
-				ref->ino = 0xfffffffe;
-				ref->xid = 0xfffffffd;
 				ref->next = c->xref_temp;
 				c->xref_temp = ref;
 
-				ref->node = sum_link_node_ref(c, jeb, je32_to_cpu(spr->offset) | REF_UNCHECKED,
-							      PAD(sizeof(struct jffs2_raw_xref)), NULL);
-				/* FIXME */ ref->node->next_in_ino = (void *)ref;
+				sum_link_node_ref(c, jeb, je32_to_cpu(spr->offset) | REF_UNCHECKED,
+						  PAD(sizeof(struct jffs2_raw_xref)), (void *)ref);
 
 				*pseudo_random += ref->node->flash_offset;
 				sp += JFFS2_SUMMARY_XREF_SIZE;
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 2d82e250be3..03871ab7c26 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -23,18 +23,15 @@
  * xattr_datum_hashkey(xprefix, xname, xvalue, xsize)
  *   is used to calcurate xdatum hashkey. The reminder of hashkey into XATTRINDEX_HASHSIZE is
  *   the index of the xattr name/value pair cache (c->xattrindex).
+ * is_xattr_datum_unchecked(c, xd)
+ *   returns 1, if xdatum contains any unchecked raw nodes. if all raw nodes are not
+ *   unchecked, it returns 0.
  * unload_xattr_datum(c, xd)
  *   is used to release xattr name/value pair and detach from c->xattrindex.
  * reclaim_xattr_datum(c)
  *   is used to reclaim xattr name/value pairs on the xattr name/value pair cache when
  *   memory usage by cache is over c->xdatum_mem_threshold. Currentry, this threshold 
  *   is hard coded as 32KiB.
- * delete_xattr_datum_node(c, xd)
- *   is used to delete a jffs2 node is dominated by xdatum. When EBS(Erase Block Summary) is
- *   enabled, it overwrites the obsolete node by myself.
- * delete_xattr_datum(c, xd)
- *   is used to delete jffs2_xattr_datum object. It must be called with 0-value of reference
- *   counter. (It means how many jffs2_xattr_ref object refers this xdatum.)
  * do_verify_xattr_datum(c, xd)
  *   is used to load the xdatum informations without name/value pair from the medium.
  *   It's necessary once, because those informations are not collected during mounting
@@ -53,8 +50,13 @@
  *   is used to write xdatum to medium. xd->version will be incremented.
  * create_xattr_datum(c, xprefix, xname, xvalue, xsize)
  *   is used to create new xdatum and write to medium.
+ * delete_xattr_datum_delay(c, xd)
+ *   is used to delete a xdatum without 'delete marker'. It has a possibility to detect
+ *   orphan xdatum on next mounting.
+ * delete_xattr_datum(c, xd)
+ *   is used to delete a xdatum with 'delete marker'. Calling jffs2_reserve_space() is
+ *   necessary before this function.
  * -------------------------------------------------- */
-
 static uint32_t xattr_datum_hashkey(int xprefix, const char *xname, const char *xvalue, int xsize)
 {
 	int name_len = strlen(xname);
@@ -62,6 +64,22 @@ static uint32_t xattr_datum_hashkey(int xprefix, const char *xname, const char *
 	return crc32(xprefix, xname, name_len) ^ crc32(xprefix, xvalue, xsize);
 }
 
+static int is_xattr_datum_unchecked(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	struct jffs2_raw_node_ref *raw;
+	int rc = 0;
+
+	spin_lock(&c->erase_completion_lock);
+	for (raw=xd->node; raw != (void *)xd; raw=raw->next_in_ino) {
+		if (ref_flags(raw) == REF_UNCHECKED) {
+			rc = 1;
+			break;
+		}
+	}
+	spin_unlock(&c->erase_completion_lock);
+	return rc;
+}
+
 static void unload_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
 {
 	/* must be called under down_write(xattr_sem) */
@@ -107,80 +125,39 @@ static void reclaim_xattr_datum(struct jffs2_sb_info *c)
 		     before, c->xdatum_mem_usage, before - c->xdatum_mem_usage);
 }
 
-static void delete_xattr_datum_node(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
-{
-	/* must be called under down_write(xattr_sem) */
-	struct jffs2_raw_xattr rx;
-	size_t length;
-	int rc;
-
-	if (!xd->node) {
-		JFFS2_WARNING("xdatum (xid=%u) is removed twice.\n", xd->xid);
-		return;
-	}
-	if (jffs2_sum_active()) {
-		memset(&rx, 0xff, sizeof(struct jffs2_raw_xattr));
-		rc = jffs2_flash_read(c, ref_offset(xd->node),
-				      sizeof(struct jffs2_unknown_node),
-				      &length, (char *)&rx);
-		if (rc || length != sizeof(struct jffs2_unknown_node)) {
-			JFFS2_ERROR("jffs2_flash_read()=%d, req=%zu, read=%zu at %#08x\n",
-				    rc, sizeof(struct jffs2_unknown_node),
-				    length, ref_offset(xd->node));
-		}
-		rc = jffs2_flash_write(c, ref_offset(xd->node), sizeof(rx),
-				       &length, (char *)&rx);
-		if (rc || length != sizeof(struct jffs2_raw_xattr)) {
-			JFFS2_ERROR("jffs2_flash_write()=%d, req=%zu, wrote=%zu ar %#08x\n",
-				    rc, sizeof(rx), length, ref_offset(xd->node));
-		}
-	}
-	spin_lock(&c->erase_completion_lock);
-	xd->node->next_in_ino = NULL;
-	spin_unlock(&c->erase_completion_lock);
-	jffs2_mark_node_obsolete(c, xd->node);
-	xd->node = NULL;
-}
-
-static void delete_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
-{
-	/* must be called under down_write(xattr_sem) */
-	BUG_ON(xd->refcnt);
-
-	unload_xattr_datum(c, xd);
-	if (xd->node) {
-		delete_xattr_datum_node(c, xd);
-		xd->node = NULL;
-	}
-	jffs2_free_xattr_datum(xd);
-}
-
 static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
 {
 	/* must be called under down_write(xattr_sem) */
 	struct jffs2_eraseblock *jeb;
+	struct jffs2_raw_node_ref *raw;
 	struct jffs2_raw_xattr rx;
 	size_t readlen;
-	uint32_t crc, totlen;
+	uint32_t crc, offset, totlen;
 	int rc;
 
-	BUG_ON(!xd->node);
-	BUG_ON(ref_flags(xd->node) != REF_UNCHECKED);
+	spin_lock(&c->erase_completion_lock);
+	offset = ref_offset(xd->node);
+	if (ref_flags(xd->node) == REF_PRISTINE)
+		goto complete;
+	spin_unlock(&c->erase_completion_lock);
 
-	rc = jffs2_flash_read(c, ref_offset(xd->node), sizeof(rx), &readlen, (char *)&rx);
+	rc = jffs2_flash_read(c, offset, sizeof(rx), &readlen, (char *)&rx);
 	if (rc || readlen != sizeof(rx)) {
 		JFFS2_WARNING("jffs2_flash_read()=%d, req=%zu, read=%zu at %#08x\n",
-			      rc, sizeof(rx), readlen, ref_offset(xd->node));
+			      rc, sizeof(rx), readlen, offset);
 		return rc ? rc : -EIO;
 	}
 	crc = crc32(0, &rx, sizeof(rx) - 4);
 	if (crc != je32_to_cpu(rx.node_crc)) {
-		if (je32_to_cpu(rx.node_crc) != 0xffffffff)
-			JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
-				    ref_offset(xd->node), je32_to_cpu(rx.hdr_crc), crc);
+		JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
+			    offset, je32_to_cpu(rx.hdr_crc), crc);
+		xd->flags |= JFFS2_XFLAGS_INVALID;
 		return EIO;
 	}
-	totlen = PAD(sizeof(rx) + rx.name_len + 1 + je16_to_cpu(rx.value_len));
+	totlen = sizeof(rx);
+	if (xd->version != XDATUM_DELETE_MARKER)
+		totlen += rx.name_len + 1 + je16_to_cpu(rx.value_len);
+	totlen = PAD(totlen);
 	if (je16_to_cpu(rx.magic) != JFFS2_MAGIC_BITMASK
 	    || je16_to_cpu(rx.nodetype) != JFFS2_NODETYPE_XATTR
 	    || je32_to_cpu(rx.totlen) != totlen
@@ -188,11 +165,12 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat
 	    || je32_to_cpu(rx.version) != xd->version) {
 		JFFS2_ERROR("inconsistent xdatum at %#08x, magic=%#04x/%#04x, "
 			    "nodetype=%#04x/%#04x, totlen=%u/%u, xid=%u/%u, version=%u/%u\n",
-			    ref_offset(xd->node), je16_to_cpu(rx.magic), JFFS2_MAGIC_BITMASK,
+			    offset, je16_to_cpu(rx.magic), JFFS2_MAGIC_BITMASK,
 			    je16_to_cpu(rx.nodetype), JFFS2_NODETYPE_XATTR,
 			    je32_to_cpu(rx.totlen), totlen,
 			    je32_to_cpu(rx.xid), xd->xid,
 			    je32_to_cpu(rx.version), xd->version);
+		xd->flags |= JFFS2_XFLAGS_INVALID;
 		return EIO;
 	}
 	xd->xprefix = rx.xprefix;
@@ -200,14 +178,17 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat
 	xd->value_len = je16_to_cpu(rx.value_len);
 	xd->data_crc = je32_to_cpu(rx.data_crc);
 
-	/* This JFFS2_NODETYPE_XATTR node is checked */
-	jeb = &c->blocks[ref_offset(xd->node) / c->sector_size];
-	totlen = PAD(je32_to_cpu(rx.totlen));
-
 	spin_lock(&c->erase_completion_lock);
-	c->unchecked_size -= totlen; c->used_size += totlen;
-	jeb->unchecked_size -= totlen; jeb->used_size += totlen;
-	xd->node->flash_offset = ref_offset(xd->node) | REF_PRISTINE;
+ complete:
+	for (raw=xd->node; raw != (void *)xd; raw=raw->next_in_ino) {
+		jeb = &c->blocks[ref_offset(raw) / c->sector_size];
+		totlen = PAD(ref_totlen(c, jeb, raw));
+		if (ref_flags(raw) == REF_UNCHECKED) {
+			c->unchecked_size -= totlen; c->used_size += totlen;
+			jeb->unchecked_size -= totlen; jeb->used_size += totlen;
+		}
+		raw->flash_offset = ref_offset(raw) | ((xd->node==raw) ? REF_PRISTINE : REF_NORMAL);
+	}
 	spin_unlock(&c->erase_completion_lock);
 
 	/* unchecked xdatum is chained with c->xattr_unchecked */
@@ -227,7 +208,6 @@ static int do_load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum
 	uint32_t crc, length;
 	int i, ret, retry = 0;
 
-	BUG_ON(!xd->node);
 	BUG_ON(ref_flags(xd->node) != REF_PRISTINE);
 	BUG_ON(!list_empty(&xd->xindex));
  retry:
@@ -253,6 +233,7 @@ static int do_load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum
 			      " at %#08x, read: 0x%08x calculated: 0x%08x\n",
 			      ref_offset(xd->node), xd->data_crc, crc);
 		kfree(data);
+		xd->flags |= JFFS2_XFLAGS_INVALID;
 		return EIO;
 	}
 
@@ -286,16 +267,13 @@ static int load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x
 	 * rc > 0 : Unrecoverable error, this node should be deleted.
 	 */
 	int rc = 0;
-	BUG_ON(xd->xname);
-	if (!xd->node)
+
+	if (xd->xname)
+		return 0;
+	if (xd->flags & JFFS2_XFLAGS_INVALID)
 		return EIO;
-	if (unlikely(ref_flags(xd->node) != REF_PRISTINE)) {
+	if (unlikely(is_xattr_datum_unchecked(c, xd)))
 		rc = do_verify_xattr_datum(c, xd);
-		if (rc > 0) {
-			list_del_init(&xd->xindex);
-			delete_xattr_datum_node(c, xd);
-		}
-	}
 	if (!rc)
 		rc = do_load_xattr_datum(c, xd);
 	return rc;
@@ -304,36 +282,43 @@ static int load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x
 static int save_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
 {
 	/* must be called under down_write(xattr_sem) */
-	struct jffs2_raw_node_ref *raw;
 	struct jffs2_raw_xattr rx;
 	struct kvec vecs[2];
 	size_t length;
-	int rc, totlen;
+	int rc, totlen, nvecs = 1;
 	uint32_t phys_ofs = write_ofs(c);
 
-	BUG_ON(!xd->xname);
+	BUG_ON(is_xattr_datum_dead(xd) || (xd->flags & JFFS2_XFLAGS_INVALID)
+	       ? !!xd->xname : !xd->xname);
 
 	vecs[0].iov_base = &rx;
-	vecs[0].iov_len = PAD(sizeof(rx));
-	vecs[1].iov_base = xd->xname;
-	vecs[1].iov_len = xd->name_len + 1 + xd->value_len;
-	totlen = vecs[0].iov_len + vecs[1].iov_len;
-
+	vecs[0].iov_len = totlen = sizeof(rx);
+	if (!is_xattr_datum_dead(xd) && !(xd->flags & JFFS2_XFLAGS_INVALID)) {
+		nvecs++;
+		vecs[1].iov_base = xd->xname;
+		vecs[1].iov_len = xd->name_len + 1 + xd->value_len;
+		totlen += vecs[1].iov_len;
+	}
 	/* Setup raw-xattr */
+	memset(&rx, 0, sizeof(rx));
 	rx.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
 	rx.nodetype = cpu_to_je16(JFFS2_NODETYPE_XATTR);
 	rx.totlen = cpu_to_je32(PAD(totlen));
 	rx.hdr_crc = cpu_to_je32(crc32(0, &rx, sizeof(struct jffs2_unknown_node) - 4));
 
 	rx.xid = cpu_to_je32(xd->xid);
-	rx.version = cpu_to_je32(++xd->version);
-	rx.xprefix = xd->xprefix;
-	rx.name_len = xd->name_len;
-	rx.value_len = cpu_to_je16(xd->value_len);
-	rx.data_crc = cpu_to_je32(crc32(0, vecs[1].iov_base, vecs[1].iov_len));
+	if (!is_xattr_datum_dead(xd) && !(xd->flags & JFFS2_XFLAGS_INVALID)) {
+		rx.version = cpu_to_je32(++xd->version);
+		rx.xprefix = xd->xprefix;
+		rx.name_len = xd->name_len;
+		rx.value_len = cpu_to_je16(xd->value_len);
+		rx.data_crc = cpu_to_je32(crc32(0, vecs[1].iov_base, vecs[1].iov_len));
+	} else {
+		rx.version = cpu_to_je32(XDATUM_DELETE_MARKER);
+	}
 	rx.node_crc = cpu_to_je32(crc32(0, &rx, sizeof(struct jffs2_raw_xattr) - 4));
 
-	rc = jffs2_flash_writev(c, vecs, 2, phys_ofs, &length, 0);
+	rc = jffs2_flash_writev(c, vecs, nvecs, phys_ofs, &length, 0);
 	if (rc || totlen != length) {
 		JFFS2_WARNING("jffs2_flash_writev()=%d, req=%u, wrote=%zu, at %#08x\n",
 			      rc, totlen, length, phys_ofs);
@@ -343,14 +328,8 @@ static int save_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x
 
 		return rc;
 	}
-
 	/* success */
-	raw = jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, PAD(totlen), NULL);
-	/* FIXME */ raw->next_in_ino = (void *)xd;
-
-	if (xd->node)
-		delete_xattr_datum_node(c, xd);
-	xd->node = raw;
+	jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, PAD(totlen), (void *)xd);
 
 	dbg_xattr("success on saving xdatum (xid=%u, version=%u, xprefix=%u, xname='%s')\n",
 		  xd->xid, xd->version, xd->xprefix, xd->xname);
@@ -426,6 +405,39 @@ static struct jffs2_xattr_datum *create_xattr_datum(struct jffs2_sb_info *c,
 	return xd;
 }
 
+static void delete_xattr_datum_delay(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* must be called under down_write(xattr_sem) */
+	BUG_ON(xd->refcnt);
+
+	unload_xattr_datum(c, xd);
+	set_xattr_datum_dead(xd);
+	spin_lock(&c->erase_completion_lock);
+	list_add(&xd->xindex, &c->xattr_dead_list);
+	spin_unlock(&c->erase_completion_lock);
+	JFFS2_NOTICE("xdatum(xid=%u) was removed without delete marker. "
+		     "An orphan xdatum may be detected on next mounting.\n", xd->xid);
+}
+
+static void delete_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* must be called under jffs2_reserve_space() and down_write(xattr_sem) */
+	int rc;
+	BUG_ON(xd->refcnt);
+
+	unload_xattr_datum(c, xd);
+	set_xattr_datum_dead(xd);	
+	rc = save_xattr_datum(c, xd);
+	if (rc) {
+		JFFS2_NOTICE("xdatum(xid=%u) was removed without delete marker. "
+			     "An orphan xdatum may be detected on next mounting.\n",
+			     xd->xid);
+	}
+	spin_lock(&c->erase_completion_lock);
+	list_add(&xd->xindex, &c->xattr_dead_list);
+	spin_unlock(&c->erase_completion_lock);
+}
+
 /* -------- xref related functions ------------------
  * verify_xattr_ref(c, ref)
  *   is used to load xref information from medium. Because summary data does not
@@ -450,25 +462,29 @@ static struct jffs2_xattr_datum *create_xattr_datum(struct jffs2_sb_info *c,
 static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
 {
 	struct jffs2_eraseblock *jeb;
+	struct jffs2_raw_node_ref *raw;
 	struct jffs2_raw_xref rr;
 	size_t readlen;
-	uint32_t crc, totlen;
+	uint32_t crc, offset, totlen;
 	int rc;
 
-	BUG_ON(ref_flags(ref->node) != REF_UNCHECKED);
+	spin_lock(&c->erase_completion_lock);
+	if (ref_flags(ref->node) != REF_UNCHECKED)
+		goto complete;
+	offset = ref_offset(ref->node);
+	spin_unlock(&c->erase_completion_lock);
 
-	rc = jffs2_flash_read(c, ref_offset(ref->node), sizeof(rr), &readlen, (char *)&rr);
+	rc = jffs2_flash_read(c, offset, sizeof(rr), &readlen, (char *)&rr);
 	if (rc || sizeof(rr) != readlen) {
 		JFFS2_WARNING("jffs2_flash_read()=%d, req=%zu, read=%zu, at %#08x\n",
-			      rc, sizeof(rr), readlen, ref_offset(ref->node));
+			      rc, sizeof(rr), readlen, offset);
 		return rc ? rc : -EIO;
 	}
 	/* obsolete node */
 	crc = crc32(0, &rr, sizeof(rr) - 4);
 	if (crc != je32_to_cpu(rr.node_crc)) {
-		if (je32_to_cpu(rr.node_crc) != 0xffffffff)
-			JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
-				    ref_offset(ref->node), je32_to_cpu(rr.node_crc), crc);
+		JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n",
+			    offset, je32_to_cpu(rr.node_crc), crc);
 		return EIO;
 	}
 	if (je16_to_cpu(rr.magic) != JFFS2_MAGIC_BITMASK
@@ -476,22 +492,28 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref
 	    || je32_to_cpu(rr.totlen) != PAD(sizeof(rr))) {
 		JFFS2_ERROR("inconsistent xref at %#08x, magic=%#04x/%#04x, "
 			    "nodetype=%#04x/%#04x, totlen=%u/%zu\n",
-			    ref_offset(ref->node), je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK,
+			    offset, je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK,
 			    je16_to_cpu(rr.nodetype), JFFS2_NODETYPE_XREF,
 			    je32_to_cpu(rr.totlen), PAD(sizeof(rr)));
 		return EIO;
 	}
 	ref->ino = je32_to_cpu(rr.ino);
 	ref->xid = je32_to_cpu(rr.xid);
-
-	/* fixup superblock/eraseblock info */
-	jeb = &c->blocks[ref_offset(ref->node) / c->sector_size];
-	totlen = PAD(sizeof(rr));
+	ref->xseqno = je32_to_cpu(rr.xseqno);
+	if (ref->xseqno > c->highest_xseqno)
+		c->highest_xseqno = (ref->xseqno & ~XREF_DELETE_MARKER);
 
 	spin_lock(&c->erase_completion_lock);
-	c->unchecked_size -= totlen; c->used_size += totlen;
-	jeb->unchecked_size -= totlen; jeb->used_size += totlen;
-	ref->node->flash_offset = ref_offset(ref->node) | REF_PRISTINE;
+ complete:
+	for (raw=ref->node; raw != (void *)ref; raw=raw->next_in_ino) {
+		jeb = &c->blocks[ref_offset(raw) / c->sector_size];
+		totlen = PAD(ref_totlen(c, jeb, raw));
+		if (ref_flags(raw) == REF_UNCHECKED) {
+			c->unchecked_size -= totlen; c->used_size += totlen;
+			jeb->unchecked_size -= totlen; jeb->used_size += totlen;
+		}
+		raw->flash_offset = ref_offset(raw) | ((ref->node==raw) ? REF_PRISTINE : REF_NORMAL);
+	}
 	spin_unlock(&c->erase_completion_lock);
 
 	dbg_xattr("success on verifying xref (ino=%u, xid=%u) at %#08x\n",
@@ -499,58 +521,12 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref
 	return 0;
 }
 
-static void delete_xattr_ref_node(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
-{
-	struct jffs2_raw_xref rr;
-	size_t length;
-	int rc;
-
-	if (jffs2_sum_active()) {
-		memset(&rr, 0xff, sizeof(rr));
-		rc = jffs2_flash_read(c, ref_offset(ref->node),
-				      sizeof(struct jffs2_unknown_node),
-				      &length, (char *)&rr);
-		if (rc || length != sizeof(struct jffs2_unknown_node)) {
-			JFFS2_ERROR("jffs2_flash_read()=%d, req=%zu, read=%zu at %#08x\n",
-				    rc, sizeof(struct jffs2_unknown_node),
-				    length, ref_offset(ref->node));
-		}
-		rc = jffs2_flash_write(c, ref_offset(ref->node), sizeof(rr),
-				       &length, (char *)&rr);
-		if (rc || length != sizeof(struct jffs2_raw_xref)) {
-			JFFS2_ERROR("jffs2_flash_write()=%d, req=%zu, wrote=%zu at %#08x\n",
-				    rc, sizeof(rr), length, ref_offset(ref->node));
-		}
-	}
-	spin_lock(&c->erase_completion_lock);
-	ref->node->next_in_ino = NULL;
-	spin_unlock(&c->erase_completion_lock);
-	jffs2_mark_node_obsolete(c, ref->node);
-	ref->node = NULL;
-}
-
-static void delete_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
-{
-	/* must be called under down_write(xattr_sem) */
-	struct jffs2_xattr_datum *xd;
-
-	BUG_ON(!ref->node);
-	delete_xattr_ref_node(c, ref);
-
-	xd = ref->xd;
-	xd->refcnt--;
-	if (!xd->refcnt)
-		delete_xattr_datum(c, xd);
-	jffs2_free_xattr_ref(ref);
-}
-
 static int save_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
 {
 	/* must be called under down_write(xattr_sem) */
-	struct jffs2_raw_node_ref *raw;
 	struct jffs2_raw_xref rr;
 	size_t length;
-	uint32_t phys_ofs = write_ofs(c);
+	uint32_t xseqno, phys_ofs = write_ofs(c);
 	int ret;
 
 	rr.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
@@ -558,8 +534,16 @@ static int save_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
 	rr.totlen = cpu_to_je32(PAD(sizeof(rr)));
 	rr.hdr_crc = cpu_to_je32(crc32(0, &rr, sizeof(struct jffs2_unknown_node) - 4));
 
-	rr.ino = cpu_to_je32(ref->ic->ino);
-	rr.xid = cpu_to_je32(ref->xd->xid);
+	xseqno = (c->highest_xseqno += 2);
+	if (is_xattr_ref_dead(ref)) {
+		xseqno |= XREF_DELETE_MARKER;
+		rr.ino = cpu_to_je32(ref->ino);
+		rr.xid = cpu_to_je32(ref->xid);
+	} else {
+		rr.ino = cpu_to_je32(ref->ic->ino);
+		rr.xid = cpu_to_je32(ref->xd->xid);
+	}
+	rr.xseqno = cpu_to_je32(xseqno);
 	rr.node_crc = cpu_to_je32(crc32(0, &rr, sizeof(rr) - 4));
 
 	ret = jffs2_flash_write(c, phys_ofs, sizeof(rr), &length, (char *)&rr);
@@ -572,12 +556,9 @@ static int save_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
 
 		return ret;
 	}
-
-	raw = jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, PAD(sizeof(rr)), NULL);
-	/* FIXME */ raw->next_in_ino = (void *)ref;
-	if (ref->node)
-		delete_xattr_ref_node(c, ref);
-	ref->node = raw;
+	/* success */
+	ref->xseqno = xseqno;
+	jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, PAD(sizeof(rr)), (void *)ref);
 
 	dbg_xattr("success on saving xref (ino=%u, xid=%u)\n", ref->ic->ino, ref->xd->xid);
 
@@ -610,22 +591,116 @@ static struct jffs2_xattr_ref *create_xattr_ref(struct jffs2_sb_info *c, struct
 	return ref; /* success */
 }
 
+static void delete_xattr_ref_delay(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
+{
+	/* must be called under down_write(xattr_sem) */
+	struct jffs2_xattr_datum *xd;
+
+	set_xattr_ref_dead(ref);
+	xd = ref->xd;
+	ref->ino = ref->ic->ino;
+	ref->xid = ref->xd->xid;
+	spin_lock(&c->erase_completion_lock);
+	ref->next = c->xref_dead_list;
+	c->xref_dead_list = ref;
+	spin_unlock(&c->erase_completion_lock);
+
+	JFFS2_NOTICE("xref(ino=%u, xid=%u) was removed without delete marker. "
+		     "An orphan xref may be detected on next mounting.\n",
+		     ref->ino, ref->xid);
+
+	if (!--xd->refcnt)
+		delete_xattr_datum_delay(c, xd);
+}
+
+static int delete_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref, int enforce)
+{
+	/* must be called under jffs2_reserve_space() and down_write(xattr_sem) */
+	struct jffs2_inode_cache *ic;
+	struct jffs2_xattr_datum *xd;
+	uint32_t length;
+	int rc;
+
+	set_xattr_ref_dead(ref);
+	ic = ref->ic;
+	xd = ref->xd;
+	ref->ino = ic->ino;
+	ref->xid = xd->xid;
+	rc = save_xattr_ref(c, ref);
+	if (rc) {
+		if (!enforce) {
+			clr_xattr_ref_dead(ref);
+			ref->ic = ic;
+			ref->xd = xd;
+			return rc;
+		}
+		JFFS2_WARNING("could not write delete marker of xref(ino=%u, xid=%u). "
+			      "An orphan xref may be detected on next mounting.\n",
+			      ref->ic->ino, ref->xd->xid);
+	}
+	spin_lock(&c->erase_completion_lock);
+	ref->next = c->xref_dead_list;
+	c->xref_dead_list = ref;
+	spin_unlock(&c->erase_completion_lock);
+
+	xd->refcnt--;
+	if (xd->refcnt)
+		return 0;
+
+	/* delete xdatum */
+	unload_xattr_datum(c, xd);
+	up_write(&c->xattr_sem);
+	jffs2_complete_reservation(c);
+
+	rc = jffs2_reserve_space(c, PAD(sizeof(struct jffs2_raw_xattr)), &length,
+				 ALLOC_DELETION, JFFS2_SUMMARY_XATTR_SIZE);
+	if (rc) {
+		down(&c->alloc_sem);
+		down_write(&c->xattr_sem);
+		delete_xattr_datum_delay(c, xd);
+	} else {
+		down_write(&c->xattr_sem);
+		delete_xattr_datum(c, xd);
+	}
+	return 0;
+}
+
 void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
 {
 	/* It's called from jffs2_clear_inode() on inode removing.
 	   When an inode with XATTR is removed, those XATTRs must be removed. */
-	struct jffs2_xattr_ref *ref, *_ref;
+	struct jffs2_xattr_ref *ref;
+	uint32_t length;
+	int rc, retry;
 
 	if (!ic || ic->nlink > 0)
 		return;
 
+	down_read(&c->xattr_sem);
+	if (!ic->xref) {
+		up_read(&c->xattr_sem);
+		return;
+	}
+	up_read(&c->xattr_sem);
+ retry:
+	rc = jffs2_reserve_space(c, PAD(sizeof(struct jffs2_raw_xref)), &length,
+				 ALLOC_DELETION, JFFS2_SUMMARY_XREF_SIZE);
 	down_write(&c->xattr_sem);
-	for (ref = ic->xref; ref; ref = _ref) {
-		_ref = ref->next;
-		delete_xattr_ref(c, ref);
+	if (ic->xref) {
+		ref = ic->xref;
+		ic->xref = ref->next;
+		if (rc) {
+			delete_xattr_ref_delay(c, ref);
+		} else {
+			delete_xattr_ref(c, ref, 1);
+		}
 	}
-	ic->xref = NULL;
+	retry = ic->xref ? 1 : 0;
 	up_write(&c->xattr_sem);
+	if (!rc)
+		jffs2_complete_reservation(c);
+	if (retry)
+		goto retry;
 }
 
 void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
@@ -655,7 +730,7 @@ static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cac
 	 * duplicate name/value pairs. If duplicate name/value pair would be found,
 	 * one will be removed.
 	 */
-	struct jffs2_xattr_ref *ref, *cmp, **pref;
+	struct jffs2_xattr_ref *ref, *cmp, **pref, **pcmp;
 	int rc = 0;
 
 	if (likely(ic->flags & INO_FLAGS_XATTR_CHECKED))
@@ -668,27 +743,32 @@ static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cac
 			rc = load_xattr_datum(c, ref->xd);
 			if (unlikely(rc > 0)) {
 				*pref = ref->next;
-				delete_xattr_ref(c, ref);
+				delete_xattr_ref_delay(c, ref);
 				goto retry;
 			} else if (unlikely(rc < 0))
 				goto out;
 		}
-		for (cmp=ref->next, pref=&ref->next; cmp; pref=&cmp->next, cmp=cmp->next) {
+		for (cmp=ref->next, pcmp=&ref->next; cmp; pcmp=&cmp->next, cmp=cmp->next) {
 			if (!cmp->xd->xname) {
 				ref->xd->flags |= JFFS2_XFLAGS_BIND;
 				rc = load_xattr_datum(c, cmp->xd);
 				ref->xd->flags &= ~JFFS2_XFLAGS_BIND;
 				if (unlikely(rc > 0)) {
-					*pref = cmp->next;
-					delete_xattr_ref(c, cmp);
+					*pcmp = cmp->next;
+					delete_xattr_ref_delay(c, cmp);
 					goto retry;
 				} else if (unlikely(rc < 0))
 					goto out;
 			}
 			if (ref->xd->xprefix == cmp->xd->xprefix
 			    && !strcmp(ref->xd->xname, cmp->xd->xname)) {
-				*pref = cmp->next;
-				delete_xattr_ref(c, cmp);
+				if (ref->xseqno > cmp->xseqno) {
+					*pcmp = cmp->next;
+					delete_xattr_ref_delay(c, cmp);
+				} else {
+					*pref = ref->next;
+					delete_xattr_ref_delay(c, ref);
+				}
 				goto retry;
 			}
 		}
@@ -719,9 +799,13 @@ void jffs2_init_xattr_subsystem(struct jffs2_sb_info *c)
 	for (i=0; i < XATTRINDEX_HASHSIZE; i++)
 		INIT_LIST_HEAD(&c->xattrindex[i]);
 	INIT_LIST_HEAD(&c->xattr_unchecked);
+	INIT_LIST_HEAD(&c->xattr_dead_list);
+	c->xref_dead_list = NULL;
 	c->xref_temp = NULL;
 
 	init_rwsem(&c->xattr_sem);
+	c->highest_xid = 0;
+	c->highest_xseqno = 0;
 	c->xdatum_mem_usage = 0;
 	c->xdatum_mem_threshold = 32 * 1024;	/* Default 32KB */
 }
@@ -751,7 +835,11 @@ void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c)
 		_ref = ref->next;
 		jffs2_free_xattr_ref(ref);
 	}
-	c->xref_temp = NULL;
+
+	for (ref=c->xref_dead_list; ref; ref = _ref) {
+		_ref = ref->next;
+		jffs2_free_xattr_ref(ref);
+	}
 
 	for (i=0; i < XATTRINDEX_HASHSIZE; i++) {
 		list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) {
@@ -761,64 +849,120 @@ void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c)
 			jffs2_free_xattr_datum(xd);
 		}
 	}
+
+	list_for_each_entry_safe(xd, _xd, &c->xattr_dead_list, xindex) {
+		list_del(&xd->xindex);
+		jffs2_free_xattr_datum(xd);
+	}
 }
 
+#define XREF_TMPHASH_SIZE	(128)
 void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c)
 {
 	struct jffs2_xattr_ref *ref, *_ref;
+	struct jffs2_xattr_ref *xref_tmphash[XREF_TMPHASH_SIZE];
 	struct jffs2_xattr_datum *xd, *_xd;
 	struct jffs2_inode_cache *ic;
-	int i, xdatum_count =0, xdatum_unchecked_count = 0, xref_count = 0;
+	struct jffs2_raw_node_ref *raw;
+	int i, xdatum_count = 0, xdatum_unchecked_count = 0, xref_count = 0;
 
 	BUG_ON(!(c->flags & JFFS2_SB_FLAG_BUILDING));
+	/* Phase.1 : Drop dead xdatum */
+	for (i=0; i < XATTRINDEX_HASHSIZE; i++) {
+		list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) {
+			BUG_ON(xd->node == (void *)xd);
+			if (is_xattr_datum_dead(xd)) {
+				list_del_init(&xd->xindex);
+				list_add(&xd->xindex, &c->xattr_unchecked);
+			}
+		}
+	}
 
-	/* Phase.1 */
+	/* Phase.2 : Merge same xref */
+	for (i=0; i < XREF_TMPHASH_SIZE; i++)
+		xref_tmphash[i] = NULL;
 	for (ref=c->xref_temp; ref; ref=_ref) {
+		struct jffs2_xattr_ref *tmp;
+
 		_ref = ref->next;
-		/* checking REF_UNCHECKED nodes */
 		if (ref_flags(ref->node) != REF_PRISTINE) {
 			if (verify_xattr_ref(c, ref)) {
-				delete_xattr_ref_node(c, ref);
+				BUG_ON(ref->node->next_in_ino != (void *)ref);
+				ref->node->next_in_ino = NULL;
+				jffs2_mark_node_obsolete(c, ref->node);
 				jffs2_free_xattr_ref(ref);
 				continue;
 			}
 		}
-		/* At this point, ref->xid and ref->ino contain XID and inode number.
-		   ref->xd and ref->ic are not valid yet. */
-		xd = jffs2_find_xattr_datum(c, ref->xid);
-		ic = jffs2_get_ino_cache(c, ref->ino);
-		if (!xd || !ic) {
-			if (ref_flags(ref->node) != REF_UNCHECKED)
-				JFFS2_WARNING("xref(ino=%u, xid=%u) is orphan. \n",
-					      ref->ino, ref->xid);
-			delete_xattr_ref_node(c, ref);
+
+		i = (ref->ino ^ ref->xid) % XREF_TMPHASH_SIZE;
+		for (tmp=xref_tmphash[i]; tmp; tmp=tmp->next) {
+			if (tmp->ino == ref->ino && tmp->xid == ref->xid)
+				break;
+		}
+		if (tmp) {
+			raw = ref->node;
+			if (ref->xseqno > tmp->xseqno) {
+				tmp->xseqno = ref->xseqno;
+				raw->next_in_ino = tmp->node;
+				tmp->node = raw;
+			} else {
+				raw->next_in_ino = tmp->node->next_in_ino;
+				tmp->node->next_in_ino = raw;
+			}
 			jffs2_free_xattr_ref(ref);
 			continue;
+		} else {
+			ref->next = xref_tmphash[i];
+			xref_tmphash[i] = ref;
 		}
-		ref->xd = xd;
-		ref->ic = ic;
-		xd->refcnt++;
-		ref->next = ic->xref;
-		ic->xref = ref;
-		xref_count++;
 	}
 	c->xref_temp = NULL;
-	/* After this, ref->xid/ino are NEVER used. */
 
-	/* Phase.2 */
+	/* Phase.3 : Bind xref with inode_cache and xattr_datum */
+	for (i=0; i < XREF_TMPHASH_SIZE; i++) {
+		for (ref=xref_tmphash[i]; ref; ref=_ref) {
+			_ref = ref->next;
+			if (is_xattr_ref_dead(ref)) {
+				ref->next = c->xref_dead_list;
+				c->xref_dead_list = ref;
+				continue;
+			}
+			/* At this point, ref->xid and ref->ino contain XID and inode number.
+			   ref->xd and ref->ic are not valid yet. */
+			xd = jffs2_find_xattr_datum(c, ref->xid);
+			ic = jffs2_get_ino_cache(c, ref->ino);
+			if (!xd || !ic) {
+				JFFS2_WARNING("xref(ino=%u, xid=%u, xseqno=%u) is orphan. \n",
+					      ref->ino, ref->xid, ref->xseqno);
+				set_xattr_ref_dead(ref);
+				ref->next = c->xref_dead_list;
+				c->xref_dead_list = ref;
+				continue;
+			}
+			ref->xd = xd;
+			ref->ic = ic;
+			xd->refcnt++;
+			ref->next = ic->xref;
+			ic->xref = ref;
+			xref_count++;
+		}
+	}
+
+	/* Phase.4 : Link unchecked xdatum to xattr_unchecked list */
 	for (i=0; i < XATTRINDEX_HASHSIZE; i++) {
 		list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) {
 			list_del_init(&xd->xindex);
 			if (!xd->refcnt) {
-				if (ref_flags(xd->node) != REF_UNCHECKED)
-					JFFS2_WARNING("orphan xdatum(xid=%u, version=%u) at %#08x\n",
-						      xd->xid, xd->version, ref_offset(xd->node));
-				delete_xattr_datum(c, xd);
+				JFFS2_WARNING("orphan xdatum(xid=%u, version=%u)\n",
+					      xd->xid, xd->version);
+				set_xattr_datum_dead(xd);
+				list_add(&xd->xindex, &c->xattr_unchecked);
 				continue;
 			}
-			if (ref_flags(xd->node) != REF_PRISTINE) {
-				dbg_xattr("unchecked xdatum(xid=%u) at %#08x\n",
-					  xd->xid, ref_offset(xd->node));
+			if (is_xattr_datum_unchecked(c, xd)) {
+				dbg_xattr("unchecked xdatum(xid=%u, version=%u)\n",
+					  xd->xid, xd->version);
 				list_add(&xd->xindex, &c->xattr_unchecked);
 				xdatum_unchecked_count++;
 			}
@@ -833,28 +977,18 @@ void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c)
 struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c,
 						  uint32_t xid, uint32_t version)
 {
-	struct jffs2_xattr_datum *xd, *_xd;
+	struct jffs2_xattr_datum *xd;
 
-	_xd = jffs2_find_xattr_datum(c, xid);
-	if (_xd) {
-		dbg_xattr("duplicate xdatum (xid=%u, version=%u/%u) at %#08x\n",
-			  xid, version, _xd->version, ref_offset(_xd->node));
-		if (version < _xd->version)
-			return ERR_PTR(-EEXIST);
-	}
-	xd = jffs2_alloc_xattr_datum();
-	if (!xd)
-		return ERR_PTR(-ENOMEM);
-	xd->xid = xid;
-	xd->version = version;
-	if (xd->xid > c->highest_xid)
-		c->highest_xid = xd->xid;
-	list_add_tail(&xd->xindex, &c->xattrindex[xid % XATTRINDEX_HASHSIZE]);
-
-	if (_xd) {
-		list_del_init(&_xd->xindex);
-		delete_xattr_datum_node(c, _xd);
-		jffs2_free_xattr_datum(_xd);
+	xd = jffs2_find_xattr_datum(c, xid);
+	if (!xd) {
+		xd = jffs2_alloc_xattr_datum();
+		if (!xd)
+			return ERR_PTR(-ENOMEM);
+		xd->xid = xid;
+		xd->version = version;
+		if (xd->xid > c->highest_xid)
+			c->highest_xid = xd->xid;
+		list_add_tail(&xd->xindex, &c->xattrindex[xid % XATTRINDEX_HASHSIZE]);
 	}
 	return xd;
 }
@@ -945,7 +1079,7 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
 				rc = load_xattr_datum(c, xd);
 				if (unlikely(rc > 0)) {
 					*pref = ref->next;
-					delete_xattr_ref(c, ref);
+					delete_xattr_ref_delay(c, ref);
 					goto retry;
 				} else if (unlikely(rc < 0))
 					goto out;
@@ -1006,7 +1140,7 @@ int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname,
 				rc = load_xattr_datum(c, xd);
 				if (unlikely(rc > 0)) {
 					*pref = ref->next;
-					delete_xattr_ref(c, ref);
+					delete_xattr_ref_delay(c, ref);
 					goto retry;
 				} else if (unlikely(rc < 0)) {
 					goto out;
@@ -1069,7 +1203,7 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
 			rc = load_xattr_datum(c, xd);
 			if (unlikely(rc > 0)) {
 				*pref = ref->next;
-				delete_xattr_ref(c, ref);
+				delete_xattr_ref_delay(c, ref);
 				goto retry;
 			} else if (unlikely(rc < 0))
 				goto out;
@@ -1081,8 +1215,7 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
 			}
 			if (!buffer) {
 				*pref = ref->next;
-				delete_xattr_ref(c, ref);
-				rc = 0;
+				rc = delete_xattr_ref(c, ref, 0);
 				goto out;
 			}
 			goto found;
@@ -1094,7 +1227,7 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
 		goto out;
 	}
 	if (!buffer) {
-		rc = -EINVAL;
+		rc = -ENODATA;
 		goto out;
 	}
  found:
@@ -1110,16 +1243,15 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
 	request = PAD(sizeof(struct jffs2_raw_xref));
 	rc = jffs2_reserve_space(c, request, &length,
 				 ALLOC_NORMAL, JFFS2_SUMMARY_XREF_SIZE);
+	down_write(&c->xattr_sem);
 	if (rc) {
 		JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, request);
-		down_write(&c->xattr_sem);
 		xd->refcnt--;
 		if (!xd->refcnt)
-			delete_xattr_datum(c, xd);
+			delete_xattr_datum_delay(c, xd);
 		up_write(&c->xattr_sem);
 		return rc;
 	}
-	down_write(&c->xattr_sem);
 	if (ref)
 		*pref = ref->next;
 	newref = create_xattr_ref(c, ic, xd);
@@ -1131,9 +1263,21 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
 		rc = PTR_ERR(newref);
 		xd->refcnt--;
 		if (!xd->refcnt)
-			delete_xattr_datum(c, xd);
+			delete_xattr_datum_delay(c, xd);
 	} else if (ref) {
-		delete_xattr_ref(c, ref);
+		up_write(&c->xattr_sem);
+		jffs2_complete_reservation(c);
+
+		rc = jffs2_reserve_space(c, request, &length,
+					 ALLOC_DELETION, JFFS2_SUMMARY_XREF_SIZE);
+		down_write(&c->xattr_sem);
+		if (rc) {
+			JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, request);
+			delete_xattr_ref_delay(c, ref);
+			up_write(&c->xattr_sem);
+			return 0;
+		}
+		delete_xattr_ref(c, ref, 1);
 	}
  out:
 	up_write(&c->xattr_sem);
@@ -1142,38 +1286,37 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
 }
 
 /* -------- garbage collector functions -------------
- * jffs2_garbage_collect_xattr_datum(c, xd)
+ * jffs2_garbage_collect_xattr_datum(c, xd, raw)
  *   is used to move xdatum into new node.
- * jffs2_garbage_collect_xattr_ref(c, ref)
+ * jffs2_garbage_collect_xattr_ref(c, ref, raw)
  *   is used to move xref into new node.
  * jffs2_verify_xattr(c)
  *   is used to call do_verify_xattr_datum() before garbage collecting.
  * -------------------------------------------------- */
-int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd,
+				      struct jffs2_raw_node_ref *raw)
 {
 	uint32_t totlen, length, old_ofs;
-	int rc = -EINVAL;
+	int rc = 0;
 
 	down_write(&c->xattr_sem);
-	BUG_ON(!xd->node);
+	if (xd->node != raw)
+		goto out;
+	if (is_xattr_datum_dead(xd) && (raw->next_in_ino == (void *)xd))
+		goto out;
 
 	old_ofs = ref_offset(xd->node);
 	totlen = ref_totlen(c, c->gcblock, xd->node);
-	if (totlen < sizeof(struct jffs2_raw_xattr))
-		goto out;
 
-	if (!xd->xname) {
+	if (!is_xattr_datum_dead(xd)) {
 		rc = load_xattr_datum(c, xd);
-		if (unlikely(rc > 0)) {
-			delete_xattr_datum_node(c, xd);
-			rc = 0;
-			goto out;
-		} else if (unlikely(rc < 0))
+		if (unlikely(rc < 0))
 			goto out;
 	}
+
 	rc = jffs2_reserve_space_gc(c, totlen, &length, JFFS2_SUMMARY_XATTR_SIZE);
-	if (rc || length < totlen) {
-		JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, totlen);
+	if (rc) {
+		JFFS2_WARNING("jffs2_reserve_space_gc()=%d, request=%u\n", rc, totlen);
 		rc = rc ? rc : -EBADFD;
 		goto out;
 	}
@@ -1182,27 +1325,33 @@ int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xatt
 		dbg_xattr("xdatum (xid=%u, version=%u) GC'ed from %#08x to %08x\n",
 			  xd->xid, xd->version, old_ofs, ref_offset(xd->node));
  out:
+	if (!rc)
+		jffs2_mark_node_obsolete(c, raw);
 	up_write(&c->xattr_sem);
 	return rc;
 }
 
 
-int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
+int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref,
+				    struct jffs2_raw_node_ref *raw)
 {
 	uint32_t totlen, length, old_ofs;
-	int rc = -EINVAL;
+	int rc = 0;
 
 	down_write(&c->xattr_sem);
 	BUG_ON(!ref->node);
 
+	if (ref->node != raw)
+		goto out;
+	if (is_xattr_ref_dead(ref) && (raw->next_in_ino == (void *)ref))
+		goto out;
+
 	old_ofs = ref_offset(ref->node);
 	totlen = ref_totlen(c, c->gcblock, ref->node);
-	if (totlen != sizeof(struct jffs2_raw_xref))
-		goto out;
 
 	rc = jffs2_reserve_space_gc(c, totlen, &length, JFFS2_SUMMARY_XREF_SIZE);
-	if (rc || length < totlen) {
-		JFFS2_WARNING("%s: jffs2_reserve_space() = %d, request = %u\n",
+	if (rc) {
+		JFFS2_WARNING("%s: jffs2_reserve_space_gc() = %d, request = %u\n",
 			      __FUNCTION__, rc, totlen);
 		rc = rc ? rc : -EBADFD;
 		goto out;
@@ -1212,6 +1361,8 @@ int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_
 		dbg_xattr("xref (ino=%u, xid=%u) GC'ed from %#08x to %08x\n",
 			  ref->ic->ino, ref->xd->xid, old_ofs, ref_offset(ref->node));
  out:
+	if (!rc)
+		jffs2_mark_node_obsolete(c, raw);
 	up_write(&c->xattr_sem);
 	return rc;
 }
@@ -1219,20 +1370,59 @@ int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_
 int jffs2_verify_xattr(struct jffs2_sb_info *c)
 {
 	struct jffs2_xattr_datum *xd, *_xd;
+	struct jffs2_eraseblock *jeb;
+	struct jffs2_raw_node_ref *raw;
+	uint32_t totlen;
 	int rc;
 
 	down_write(&c->xattr_sem);
 	list_for_each_entry_safe(xd, _xd, &c->xattr_unchecked, xindex) {
 		rc = do_verify_xattr_datum(c, xd);
-		if (rc == 0) {
-			list_del_init(&xd->xindex);
-			break;
-		} else if (rc > 0) {
-			list_del_init(&xd->xindex);
-			delete_xattr_datum_node(c, xd);
+		if (rc < 0)
+			continue;
+		list_del_init(&xd->xindex);
+		spin_lock(&c->erase_completion_lock);
+		for (raw=xd->node; raw != (void *)xd; raw=raw->next_in_ino) {
+			if (ref_flags(raw) != REF_UNCHECKED)
+				continue;
+			jeb = &c->blocks[ref_offset(raw) / c->sector_size];
+			totlen = PAD(ref_totlen(c, jeb, raw));
+			c->unchecked_size -= totlen; c->used_size += totlen;
+			jeb->unchecked_size -= totlen; jeb->used_size += totlen;
+			raw->flash_offset = ref_offset(raw)
+				| ((xd->node == (void *)raw) ? REF_PRISTINE : REF_NORMAL);
 		}
+		if (is_xattr_datum_dead(xd))
+			list_add(&xd->xindex, &c->xattr_dead_list);
+		spin_unlock(&c->erase_completion_lock);
 	}
 	up_write(&c->xattr_sem);
-
 	return list_empty(&c->xattr_unchecked) ? 1 : 0;
 }
+
+void jffs2_release_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+{
+	/* must be called under spin_lock(&c->erase_completion_lock) */
+	if (xd->node != (void *)xd)
+		return;
+
+	list_del(&xd->xindex);
+	jffs2_free_xattr_datum(xd);
+}
+
+void jffs2_release_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
+{
+	/* must be called under spin_lock(&c->erase_completion_lock) */
+	struct jffs2_xattr_ref *tmp, **ptmp;
+
+	if (ref->node != (void *)ref)
+		return;
+
+	for (tmp=c->xref_dead_list, ptmp=&c->xref_dead_list; tmp; ptmp=&tmp->next, tmp=tmp->next) {
+		if (ref == tmp) {
+			*ptmp = tmp->next;
+			jffs2_free_xattr_ref(ref);
+			break;
+		}
+	}
+}
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index 2c199856c58..06ab7b88021 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -16,6 +16,7 @@
 
 #define JFFS2_XFLAGS_HOT	(0x01)	/* This datum is HOT */
 #define JFFS2_XFLAGS_BIND	(0x02)	/* This datum is not reclaimed */
+#define JFFS2_XFLAGS_INVALID	(0x80)	/* This datum contains crc error */
 
 struct jffs2_xattr_datum
 {
@@ -23,7 +24,7 @@ struct jffs2_xattr_datum
 	struct jffs2_raw_node_ref *node;
 	uint8_t class;
 	uint8_t flags;
-	uint16_t xprefix;			/* see JFFS2_XATTR_PREFIX_* */
+	uint16_t xprefix;		/* see JFFS2_XATTR_PREFIX_* */
 
 	struct list_head xindex;	/* chained from c->xattrindex[n] */
 	uint32_t refcnt;		/* # of xattr_ref refers this */
@@ -47,6 +48,7 @@ struct jffs2_xattr_ref
 	uint8_t flags;		/* Currently unused */
 	u16 unused;
 
+	uint32_t xseqno;
 	union {
 		struct jffs2_inode_cache *ic;	/* reference to jffs2_inode_cache */
 		uint32_t ino;			/* only used in scanning/building  */
@@ -58,6 +60,34 @@ struct jffs2_xattr_ref
 	struct jffs2_xattr_ref *next;		/* chained from ic->xref_list */
 };
 
+#define XDATUM_DELETE_MARKER	(0xffffffff)
+#define XREF_DELETE_MARKER	(0x00000001)
+static inline int is_xattr_datum_dead(struct jffs2_xattr_datum *xd)
+{
+	return (xd->version == XDATUM_DELETE_MARKER);
+}
+
+static inline void set_xattr_datum_dead(struct jffs2_xattr_datum *xd)
+{
+	xd->version = XDATUM_DELETE_MARKER;
+}
+
+static inline int is_xattr_ref_dead(struct jffs2_xattr_ref *ref)
+{
+	return ((ref->xseqno & XREF_DELETE_MARKER) != 0);
+}
+
+static inline void set_xattr_ref_dead(struct jffs2_xattr_ref *ref)
+{
+	ref->xseqno |= XREF_DELETE_MARKER;
+}
+
+static inline void clr_xattr_ref_dead(struct jffs2_xattr_ref *ref)
+{
+	ref->xseqno &= ~XREF_DELETE_MARKER;
+}
+
+
 #ifdef CONFIG_JFFS2_FS_XATTR
 
 extern void jffs2_init_xattr_subsystem(struct jffs2_sb_info *c);
@@ -70,9 +100,13 @@ extern struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c
 extern void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic);
 extern void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic);
 
-extern int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd);
-extern int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref);
+extern int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd,
+					     struct jffs2_raw_node_ref *raw);
+extern int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref,
+					   struct jffs2_raw_node_ref *raw);
 extern int jffs2_verify_xattr(struct jffs2_sb_info *c);
+extern void jffs2_release_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd);
+extern void jffs2_release_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref);
 
 extern int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname,
 			     char *buffer, size_t size);
-- 
cgit v1.2.3


From a1ae76e96a18edf4b8a4d4e102762060c26d1c6a Mon Sep 17 00:00:00 2001
From: KaiGai Kohei <kaigai@ak.jp.nec.com>
Date: Sun, 11 Jun 2006 10:45:22 +0900
Subject: [JFFS2][XATTR] Fix ACL bug when updating null xattr by null ACL.

This patch enable to handle the case when updating null xattr
by null ACL.

When we try to set NULL into NULL xattr, xattr subsystem returns
-ENODATA. This patch enables to handle this error code.

[2/3] jffs2-xattr-v6-02-fix_posixacl_bug.patch

Signed-off-by: KaiGai Kohei <kaigai@ak.jp.nec.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/acl.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 320dd48b834..9c2077e7e08 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -267,6 +267,8 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 	}
 
 	rc = do_jffs2_setxattr(inode, xprefix, "", value, size, 0);
+	if (!value && rc == -ENODATA)
+		rc = 0;
 	if (value)
 		kfree(value);
 	if (!rc) {
-- 
cgit v1.2.3


From 8a13695cbe4e8311b363f9bd25162904b984ca74 Mon Sep 17 00:00:00 2001
From: KaiGai Kohei <kaigai@ak.jp.nec.com>
Date: Sat, 24 Jun 2006 09:14:13 +0900
Subject: [JFFS2][XATTR] rid unnecessary writing of delete marker.

In the followinf situation, an explicit delete marker is not
necessary, because we can certainlly detect those obsolete
xattr_datum or xattr_ref on next mounting.

- When to delete xattr_datum node.
- When to delete xattr_ref node on removing inode.
- When to delete xattr_ref node on updating xattr.

This patch rids writing delete marker in those situations.

Signed-off-by: KaiGai Kohei <kaigai@ak.jp.nec.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/scan.c  |   6 +-
 fs/jffs2/xattr.c | 303 +++++++++++++++++++------------------------------------
 fs/jffs2/xattr.h |  23 +----
 3 files changed, 105 insertions(+), 227 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 79638f56c5e..2bfdc33752d 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -332,10 +332,8 @@ static int jffs2_scan_xattr_node(struct jffs2_sb_info *c, struct jffs2_erasebloc
 	xid = je32_to_cpu(rx->xid);
 	version = je32_to_cpu(rx->version);
 
-	totlen = sizeof(struct jffs2_raw_xattr);
-	if (version != XDATUM_DELETE_MARKER)
-		totlen += rx->name_len + 1 + je16_to_cpu(rx->value_len);
-	totlen = PAD(totlen);
+	totlen = PAD(sizeof(struct jffs2_raw_xattr)
+			+ rx->name_len + 1 + je16_to_cpu(rx->value_len));
 	if (totlen != je32_to_cpu(rx->totlen)) {
 		JFFS2_WARNING("node length mismatch at %#08x, read=%u, calc=%u\n",
 			      ofs, je32_to_cpu(rx->totlen), totlen);
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 03871ab7c26..7622f79cb5b 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -50,12 +50,9 @@
  *   is used to write xdatum to medium. xd->version will be incremented.
  * create_xattr_datum(c, xprefix, xname, xvalue, xsize)
  *   is used to create new xdatum and write to medium.
- * delete_xattr_datum_delay(c, xd)
- *   is used to delete a xdatum without 'delete marker'. It has a possibility to detect
- *   orphan xdatum on next mounting.
  * delete_xattr_datum(c, xd)
- *   is used to delete a xdatum with 'delete marker'. Calling jffs2_reserve_space() is
- *   necessary before this function.
+ *   is used to delete a xdatum. It marks xd JFFS2_XFLAGS_DEAD, and allows
+ *   GC to reclaim those physical nodes.
  * -------------------------------------------------- */
 static uint32_t xattr_datum_hashkey(int xprefix, const char *xname, const char *xvalue, int xsize)
 {
@@ -154,10 +151,7 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat
 		xd->flags |= JFFS2_XFLAGS_INVALID;
 		return EIO;
 	}
-	totlen = sizeof(rx);
-	if (xd->version != XDATUM_DELETE_MARKER)
-		totlen += rx.name_len + 1 + je16_to_cpu(rx.value_len);
-	totlen = PAD(totlen);
+	totlen = PAD(sizeof(rx) + rx.name_len + 1 + je16_to_cpu(rx.value_len));
 	if (je16_to_cpu(rx.magic) != JFFS2_MAGIC_BITMASK
 	    || je16_to_cpu(rx.nodetype) != JFFS2_NODETYPE_XATTR
 	    || je32_to_cpu(rx.totlen) != totlen
@@ -268,6 +262,7 @@ static int load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x
 	 */
 	int rc = 0;
 
+	BUG_ON(xd->flags & JFFS2_XFLAGS_DEAD);
 	if (xd->xname)
 		return 0;
 	if (xd->flags & JFFS2_XFLAGS_INVALID)
@@ -285,20 +280,18 @@ static int save_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x
 	struct jffs2_raw_xattr rx;
 	struct kvec vecs[2];
 	size_t length;
-	int rc, totlen, nvecs = 1;
+	int rc, totlen;
 	uint32_t phys_ofs = write_ofs(c);
 
-	BUG_ON(is_xattr_datum_dead(xd) || (xd->flags & JFFS2_XFLAGS_INVALID)
-	       ? !!xd->xname : !xd->xname);
+	BUG_ON(!xd->xname);
+	BUG_ON(xd->flags & (JFFS2_XFLAGS_DEAD|JFFS2_XFLAGS_INVALID));
 
 	vecs[0].iov_base = &rx;
-	vecs[0].iov_len = totlen = sizeof(rx);
-	if (!is_xattr_datum_dead(xd) && !(xd->flags & JFFS2_XFLAGS_INVALID)) {
-		nvecs++;
-		vecs[1].iov_base = xd->xname;
-		vecs[1].iov_len = xd->name_len + 1 + xd->value_len;
-		totlen += vecs[1].iov_len;
-	}
+	vecs[0].iov_len = sizeof(rx);
+	vecs[1].iov_base = xd->xname;
+	vecs[1].iov_len = xd->name_len + 1 + xd->value_len;
+	totlen = vecs[0].iov_len + vecs[1].iov_len;
+
 	/* Setup raw-xattr */
 	memset(&rx, 0, sizeof(rx));
 	rx.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
@@ -307,18 +300,14 @@ static int save_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x
 	rx.hdr_crc = cpu_to_je32(crc32(0, &rx, sizeof(struct jffs2_unknown_node) - 4));
 
 	rx.xid = cpu_to_je32(xd->xid);
-	if (!is_xattr_datum_dead(xd) && !(xd->flags & JFFS2_XFLAGS_INVALID)) {
-		rx.version = cpu_to_je32(++xd->version);
-		rx.xprefix = xd->xprefix;
-		rx.name_len = xd->name_len;
-		rx.value_len = cpu_to_je16(xd->value_len);
-		rx.data_crc = cpu_to_je32(crc32(0, vecs[1].iov_base, vecs[1].iov_len));
-	} else {
-		rx.version = cpu_to_je32(XDATUM_DELETE_MARKER);
-	}
+	rx.version = cpu_to_je32(++xd->version);
+	rx.xprefix = xd->xprefix;
+	rx.name_len = xd->name_len;
+	rx.value_len = cpu_to_je16(xd->value_len);
+	rx.data_crc = cpu_to_je32(crc32(0, vecs[1].iov_base, vecs[1].iov_len));
 	rx.node_crc = cpu_to_je32(crc32(0, &rx, sizeof(struct jffs2_raw_xattr) - 4));
 
-	rc = jffs2_flash_writev(c, vecs, nvecs, phys_ofs, &length, 0);
+	rc = jffs2_flash_writev(c, vecs, 2, phys_ofs, &length, 0);
 	if (rc || totlen != length) {
 		JFFS2_WARNING("jffs2_flash_writev()=%d, req=%u, wrote=%zu, at %#08x\n",
 			      rc, totlen, length, phys_ofs);
@@ -405,53 +394,36 @@ static struct jffs2_xattr_datum *create_xattr_datum(struct jffs2_sb_info *c,
 	return xd;
 }
 
-static void delete_xattr_datum_delay(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
+static void delete_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
 {
 	/* must be called under down_write(xattr_sem) */
 	BUG_ON(xd->refcnt);
 
 	unload_xattr_datum(c, xd);
-	set_xattr_datum_dead(xd);
+	xd->flags |= JFFS2_XFLAGS_DEAD;
 	spin_lock(&c->erase_completion_lock);
-	list_add(&xd->xindex, &c->xattr_dead_list);
-	spin_unlock(&c->erase_completion_lock);
-	JFFS2_NOTICE("xdatum(xid=%u) was removed without delete marker. "
-		     "An orphan xdatum may be detected on next mounting.\n", xd->xid);
-}
-
-static void delete_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
-{
-	/* must be called under jffs2_reserve_space() and down_write(xattr_sem) */
-	int rc;
-	BUG_ON(xd->refcnt);
-
-	unload_xattr_datum(c, xd);
-	set_xattr_datum_dead(xd);	
-	rc = save_xattr_datum(c, xd);
-	if (rc) {
-		JFFS2_NOTICE("xdatum(xid=%u) was removed without delete marker. "
-			     "An orphan xdatum may be detected on next mounting.\n",
-			     xd->xid);
+	if (xd->node == (void *)xd) {
+		BUG_ON(!(xd->flags & JFFS2_XFLAGS_INVALID));
+		jffs2_free_xattr_datum(xd);
+	} else {
+		list_add(&xd->xindex, &c->xattr_dead_list);
 	}
-	spin_lock(&c->erase_completion_lock);
-	list_add(&xd->xindex, &c->xattr_dead_list);
 	spin_unlock(&c->erase_completion_lock);
+	dbg_xattr("xdatum(xid=%u, version=%u) was removed.\n", xd->xid, xd->version);
 }
 
 /* -------- xref related functions ------------------
  * verify_xattr_ref(c, ref)
  *   is used to load xref information from medium. Because summary data does not
  *   contain xid/ino, it's necessary to verify once while mounting process.
- * delete_xattr_ref_node(c, ref)
- *   is used to delete a jffs2 node is dominated by xref. When EBS is enabled,
- *   it overwrites the obsolete node by myself. 
- * delete_xattr_ref(c, ref)
- *   is used to delete jffs2_xattr_ref object. If the reference counter of xdatum
- *   is refered by this xref become 0, delete_xattr_datum() is called later.
  * save_xattr_ref(c, ref)
- *   is used to write xref to medium.
+ *   is used to write xref to medium. If delete marker is marked, it write
+ *   a delete marker of xref into medium.
  * create_xattr_ref(c, ic, xd)
  *   is used to create a new xref and write to medium.
+ * delete_xattr_ref(c, ref)
+ *   is used to delete jffs2_xattr_ref. It marks xref XREF_DELETE_MARKER,
+ *   and allows GC to reclaim those physical nodes.
  * jffs2_xattr_delete_inode(c, ic)
  *   is called to remove xrefs related to obsolete inode when inode is unlinked.
  * jffs2_xattr_free_inode(c, ic)
@@ -591,13 +563,13 @@ static struct jffs2_xattr_ref *create_xattr_ref(struct jffs2_sb_info *c, struct
 	return ref; /* success */
 }
 
-static void delete_xattr_ref_delay(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
+static void delete_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref)
 {
 	/* must be called under down_write(xattr_sem) */
 	struct jffs2_xattr_datum *xd;
 
-	set_xattr_ref_dead(ref);
 	xd = ref->xd;
+	ref->xseqno |= XREF_DELETE_MARKER;
 	ref->ino = ref->ic->ino;
 	ref->xid = ref->xd->xid;
 	spin_lock(&c->erase_completion_lock);
@@ -605,102 +577,29 @@ static void delete_xattr_ref_delay(struct jffs2_sb_info *c, struct jffs2_xattr_r
 	c->xref_dead_list = ref;
 	spin_unlock(&c->erase_completion_lock);
 
-	JFFS2_NOTICE("xref(ino=%u, xid=%u) was removed without delete marker. "
-		     "An orphan xref may be detected on next mounting.\n",
-		     ref->ino, ref->xid);
+	dbg_xattr("xref(ino=%u, xid=%u, xseqno=%u) was removed.\n",
+		  ref->ino, ref->xid, ref->xseqno);
 
 	if (!--xd->refcnt)
-		delete_xattr_datum_delay(c, xd);
-}
-
-static int delete_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref, int enforce)
-{
-	/* must be called under jffs2_reserve_space() and down_write(xattr_sem) */
-	struct jffs2_inode_cache *ic;
-	struct jffs2_xattr_datum *xd;
-	uint32_t length;
-	int rc;
-
-	set_xattr_ref_dead(ref);
-	ic = ref->ic;
-	xd = ref->xd;
-	ref->ino = ic->ino;
-	ref->xid = xd->xid;
-	rc = save_xattr_ref(c, ref);
-	if (rc) {
-		if (!enforce) {
-			clr_xattr_ref_dead(ref);
-			ref->ic = ic;
-			ref->xd = xd;
-			return rc;
-		}
-		JFFS2_WARNING("could not write delete marker of xref(ino=%u, xid=%u). "
-			      "An orphan xref may be detected on next mounting.\n",
-			      ref->ic->ino, ref->xd->xid);
-	}
-	spin_lock(&c->erase_completion_lock);
-	ref->next = c->xref_dead_list;
-	c->xref_dead_list = ref;
-	spin_unlock(&c->erase_completion_lock);
-
-	xd->refcnt--;
-	if (xd->refcnt)
-		return 0;
-
-	/* delete xdatum */
-	unload_xattr_datum(c, xd);
-	up_write(&c->xattr_sem);
-	jffs2_complete_reservation(c);
-
-	rc = jffs2_reserve_space(c, PAD(sizeof(struct jffs2_raw_xattr)), &length,
-				 ALLOC_DELETION, JFFS2_SUMMARY_XATTR_SIZE);
-	if (rc) {
-		down(&c->alloc_sem);
-		down_write(&c->xattr_sem);
-		delete_xattr_datum_delay(c, xd);
-	} else {
-		down_write(&c->xattr_sem);
 		delete_xattr_datum(c, xd);
-	}
-	return 0;
 }
 
 void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
 {
 	/* It's called from jffs2_clear_inode() on inode removing.
 	   When an inode with XATTR is removed, those XATTRs must be removed. */
-	struct jffs2_xattr_ref *ref;
-	uint32_t length;
-	int rc, retry;
+	struct jffs2_xattr_ref *ref, *_ref;
 
 	if (!ic || ic->nlink > 0)
 		return;
 
-	down_read(&c->xattr_sem);
-	if (!ic->xref) {
-		up_read(&c->xattr_sem);
-		return;
-	}
-	up_read(&c->xattr_sem);
- retry:
-	rc = jffs2_reserve_space(c, PAD(sizeof(struct jffs2_raw_xref)), &length,
-				 ALLOC_DELETION, JFFS2_SUMMARY_XREF_SIZE);
 	down_write(&c->xattr_sem);
-	if (ic->xref) {
-		ref = ic->xref;
-		ic->xref = ref->next;
-		if (rc) {
-			delete_xattr_ref_delay(c, ref);
-		} else {
-			delete_xattr_ref(c, ref, 1);
-		}
+	for (ref = ic->xref; ref; ref = _ref) {
+		_ref = ref->next;
+		delete_xattr_ref(c, ref);
 	}
-	retry = ic->xref ? 1 : 0;
+	ic->xref = NULL;
 	up_write(&c->xattr_sem);
-	if (!rc)
-		jffs2_complete_reservation(c);
-	if (retry)
-		goto retry;
 }
 
 void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
@@ -743,7 +642,7 @@ static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cac
 			rc = load_xattr_datum(c, ref->xd);
 			if (unlikely(rc > 0)) {
 				*pref = ref->next;
-				delete_xattr_ref_delay(c, ref);
+				delete_xattr_ref(c, ref);
 				goto retry;
 			} else if (unlikely(rc < 0))
 				goto out;
@@ -755,7 +654,7 @@ static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cac
 				ref->xd->flags &= ~JFFS2_XFLAGS_BIND;
 				if (unlikely(rc > 0)) {
 					*pcmp = cmp->next;
-					delete_xattr_ref_delay(c, cmp);
+					delete_xattr_ref(c, cmp);
 					goto retry;
 				} else if (unlikely(rc < 0))
 					goto out;
@@ -764,10 +663,10 @@ static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cac
 			    && !strcmp(ref->xd->xname, cmp->xd->xname)) {
 				if (ref->xseqno > cmp->xseqno) {
 					*pcmp = cmp->next;
-					delete_xattr_ref_delay(c, cmp);
+					delete_xattr_ref(c, cmp);
 				} else {
 					*pref = ref->next;
-					delete_xattr_ref_delay(c, ref);
+					delete_xattr_ref(c, ref);
 				}
 				goto retry;
 			}
@@ -865,20 +764,11 @@ void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c)
 	struct jffs2_inode_cache *ic;
 	struct jffs2_raw_node_ref *raw;
 	int i, xdatum_count = 0, xdatum_unchecked_count = 0, xref_count = 0;
+	int xdatum_orphan_count = 0, xref_orphan_count = 0, xref_dead_count = 0;
 
 	BUG_ON(!(c->flags & JFFS2_SB_FLAG_BUILDING));
-	/* Phase.1 : Drop dead xdatum */
-	for (i=0; i < XATTRINDEX_HASHSIZE; i++) {
-		list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) {
-			BUG_ON(xd->node == (void *)xd);
-			if (is_xattr_datum_dead(xd)) {
-				list_del_init(&xd->xindex);
-				list_add(&xd->xindex, &c->xattr_unchecked);
-			}
-		}
-	}
 
-	/* Phase.2 : Merge same xref */
+	/* Phase.1 : Merge same xref */
 	for (i=0; i < XREF_TMPHASH_SIZE; i++)
 		xref_tmphash[i] = NULL;
 	for (ref=c->xref_temp; ref; ref=_ref) {
@@ -919,13 +809,15 @@ void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c)
 	}
 	c->xref_temp = NULL;
 
-	/* Phase.3 : Bind xref with inode_cache and xattr_datum */
+	/* Phase.2 : Bind xref with inode_cache and xattr_datum */
 	for (i=0; i < XREF_TMPHASH_SIZE; i++) {
 		for (ref=xref_tmphash[i]; ref; ref=_ref) {
+			xref_count++;
 			_ref = ref->next;
 			if (is_xattr_ref_dead(ref)) {
 				ref->next = c->xref_dead_list;
 				c->xref_dead_list = ref;
+				xref_dead_count++;
 				continue;
 			}
 			/* At this point, ref->xid and ref->ino contain XID and inode number.
@@ -933,11 +825,12 @@ void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c)
 			xd = jffs2_find_xattr_datum(c, ref->xid);
 			ic = jffs2_get_ino_cache(c, ref->ino);
 			if (!xd || !ic) {
-				JFFS2_WARNING("xref(ino=%u, xid=%u, xseqno=%u) is orphan. \n",
-					      ref->ino, ref->xid, ref->xseqno);
-				set_xattr_ref_dead(ref);
+				dbg_xattr("xref(ino=%u, xid=%u, xseqno=%u) is orphan.\n",
+					  ref->ino, ref->xid, ref->xseqno);
+				ref->xseqno |= XREF_DELETE_MARKER;
 				ref->next = c->xref_dead_list;
 				c->xref_dead_list = ref;
+				xref_orphan_count++;
 				continue;
 			}
 			ref->xd = xd;
@@ -945,19 +838,20 @@ void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c)
 			xd->refcnt++;
 			ref->next = ic->xref;
 			ic->xref = ref;
-			xref_count++;
 		}
 	}
 
-	/* Phase.4 : Link unchecked xdatum to xattr_unchecked list */
+	/* Phase.3 : Link unchecked xdatum to xattr_unchecked list */
 	for (i=0; i < XATTRINDEX_HASHSIZE; i++) {
 		list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) {
+			xdatum_count++;
 			list_del_init(&xd->xindex);
 			if (!xd->refcnt) {
-				JFFS2_WARNING("orphan xdatum(xid=%u, version=%u)\n",
-					      xd->xid, xd->version);
-				set_xattr_datum_dead(xd);
+				dbg_xattr("xdatum(xid=%u, version=%u) is orphan.\n",
+					  xd->xid, xd->version);
+				xd->flags |= JFFS2_XFLAGS_DEAD;
 				list_add(&xd->xindex, &c->xattr_unchecked);
+				xdatum_orphan_count++;
 				continue;
 			}
 			if (is_xattr_datum_unchecked(c, xd)) {
@@ -966,12 +860,14 @@ void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c)
 				list_add(&xd->xindex, &c->xattr_unchecked);
 				xdatum_unchecked_count++;
 			}
-			xdatum_count++;
 		}
 	}
 	/* build complete */
-	JFFS2_NOTICE("complete building xattr subsystem, %u of xdatum (%u unchecked) and "
-		     "%u of xref found.\n", xdatum_count, xdatum_unchecked_count, xref_count);
+	JFFS2_NOTICE("complete building xattr subsystem, %u of xdatum"
+		     " (%u unchecked, %u orphan) and "
+		     "%u of xref (%u dead, %u orphan) found.\n",
+		     xdatum_count, xdatum_unchecked_count, xdatum_orphan_count,
+		     xref_count, xref_dead_count, xref_orphan_count);
 }
 
 struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c,
@@ -1079,7 +975,7 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
 				rc = load_xattr_datum(c, xd);
 				if (unlikely(rc > 0)) {
 					*pref = ref->next;
-					delete_xattr_ref_delay(c, ref);
+					delete_xattr_ref(c, ref);
 					goto retry;
 				} else if (unlikely(rc < 0))
 					goto out;
@@ -1140,7 +1036,7 @@ int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname,
 				rc = load_xattr_datum(c, xd);
 				if (unlikely(rc > 0)) {
 					*pref = ref->next;
-					delete_xattr_ref_delay(c, ref);
+					delete_xattr_ref(c, ref);
 					goto retry;
 				} else if (unlikely(rc < 0)) {
 					goto out;
@@ -1203,7 +1099,7 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
 			rc = load_xattr_datum(c, xd);
 			if (unlikely(rc > 0)) {
 				*pref = ref->next;
-				delete_xattr_ref_delay(c, ref);
+				delete_xattr_ref(c, ref);
 				goto retry;
 			} else if (unlikely(rc < 0))
 				goto out;
@@ -1214,8 +1110,23 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
 				goto out;
 			}
 			if (!buffer) {
-				*pref = ref->next;
-				rc = delete_xattr_ref(c, ref, 0);
+				ref->ino = ic->ino;
+				ref->xid = xd->xid;
+				ref->xseqno |= XREF_DELETE_MARKER;
+				rc = save_xattr_ref(c, ref);
+				if (!rc) {
+					*pref = ref->next;
+					spin_lock(&c->erase_completion_lock);
+					ref->next = c->xref_dead_list;
+					c->xref_dead_list = ref;
+					spin_unlock(&c->erase_completion_lock);
+					if (!--xd->refcnt)
+						delete_xattr_datum(c, xd);
+				} else {
+					ref->ic = ic;
+					ref->xd = xd;
+					ref->xseqno &= ~XREF_DELETE_MARKER;
+				}
 				goto out;
 			}
 			goto found;
@@ -1248,7 +1159,7 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
 		JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, request);
 		xd->refcnt--;
 		if (!xd->refcnt)
-			delete_xattr_datum_delay(c, xd);
+			delete_xattr_datum(c, xd);
 		up_write(&c->xattr_sem);
 		return rc;
 	}
@@ -1263,21 +1174,9 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
 		rc = PTR_ERR(newref);
 		xd->refcnt--;
 		if (!xd->refcnt)
-			delete_xattr_datum_delay(c, xd);
+			delete_xattr_datum(c, xd);
 	} else if (ref) {
-		up_write(&c->xattr_sem);
-		jffs2_complete_reservation(c);
-
-		rc = jffs2_reserve_space(c, request, &length,
-					 ALLOC_DELETION, JFFS2_SUMMARY_XREF_SIZE);
-		down_write(&c->xattr_sem);
-		if (rc) {
-			JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, request);
-			delete_xattr_ref_delay(c, ref);
-			up_write(&c->xattr_sem);
-			return 0;
-		}
-		delete_xattr_ref(c, ref, 1);
+		delete_xattr_ref(c, ref);
 	}
  out:
 	up_write(&c->xattr_sem);
@@ -1292,6 +1191,10 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
  *   is used to move xref into new node.
  * jffs2_verify_xattr(c)
  *   is used to call do_verify_xattr_datum() before garbage collecting.
+ * jffs2_release_xattr_datum(c, xd)
+ *   is used to release an in-memory object of xdatum.
+ * jffs2_release_xattr_ref(c, ref)
+ *   is used to release an in-memory object of xref.
  * -------------------------------------------------- */
 int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd,
 				      struct jffs2_raw_node_ref *raw)
@@ -1302,18 +1205,17 @@ int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xatt
 	down_write(&c->xattr_sem);
 	if (xd->node != raw)
 		goto out;
-	if (is_xattr_datum_dead(xd) && (raw->next_in_ino == (void *)xd))
+	if (xd->flags & (JFFS2_XFLAGS_DEAD|JFFS2_XFLAGS_INVALID))
 		goto out;
 
-	old_ofs = ref_offset(xd->node);
-	totlen = ref_totlen(c, c->gcblock, xd->node);
-
-	if (!is_xattr_datum_dead(xd)) {
-		rc = load_xattr_datum(c, xd);
-		if (unlikely(rc < 0))
-			goto out;
+	rc = load_xattr_datum(c, xd);
+	if (unlikely(rc)) {
+		rc = (rc > 0) ? 0 : rc;
+		goto out;
 	}
-
+	old_ofs = ref_offset(xd->node);
+	totlen = PAD(sizeof(struct jffs2_raw_xattr)
+			+ xd->name_len + 1 + xd->value_len);
 	rc = jffs2_reserve_space_gc(c, totlen, &length, JFFS2_SUMMARY_XATTR_SIZE);
 	if (rc) {
 		JFFS2_WARNING("jffs2_reserve_space_gc()=%d, request=%u\n", rc, totlen);
@@ -1331,7 +1233,6 @@ int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xatt
 	return rc;
 }
 
-
 int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref,
 				    struct jffs2_raw_node_ref *raw)
 {
@@ -1392,7 +1293,7 @@ int jffs2_verify_xattr(struct jffs2_sb_info *c)
 			raw->flash_offset = ref_offset(raw)
 				| ((xd->node == (void *)raw) ? REF_PRISTINE : REF_NORMAL);
 		}
-		if (is_xattr_datum_dead(xd))
+		if (xd->flags & JFFS2_XFLAGS_DEAD)
 			list_add(&xd->xindex, &c->xattr_dead_list);
 		spin_unlock(&c->erase_completion_lock);
 	}
@@ -1403,7 +1304,7 @@ int jffs2_verify_xattr(struct jffs2_sb_info *c)
 void jffs2_release_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
 {
 	/* must be called under spin_lock(&c->erase_completion_lock) */
-	if (xd->node != (void *)xd)
+	if (xd->refcnt > 0 || xd->node != (void *)xd)
 		return;
 
 	list_del(&xd->xindex);
@@ -1421,8 +1322,8 @@ void jffs2_release_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *re
 	for (tmp=c->xref_dead_list, ptmp=&c->xref_dead_list; tmp; ptmp=&tmp->next, tmp=tmp->next) {
 		if (ref == tmp) {
 			*ptmp = tmp->next;
-			jffs2_free_xattr_ref(ref);
 			break;
 		}
 	}
+	jffs2_free_xattr_ref(ref);
 }
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index 06ab7b88021..4a10abc6d91 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -16,6 +16,7 @@
 
 #define JFFS2_XFLAGS_HOT	(0x01)	/* This datum is HOT */
 #define JFFS2_XFLAGS_BIND	(0x02)	/* This datum is not reclaimed */
+#define JFFS2_XFLAGS_DEAD	(0x40)	/* This datum is already dead */
 #define JFFS2_XFLAGS_INVALID	(0x80)	/* This datum contains crc error */
 
 struct jffs2_xattr_datum
@@ -60,34 +61,12 @@ struct jffs2_xattr_ref
 	struct jffs2_xattr_ref *next;		/* chained from ic->xref_list */
 };
 
-#define XDATUM_DELETE_MARKER	(0xffffffff)
 #define XREF_DELETE_MARKER	(0x00000001)
-static inline int is_xattr_datum_dead(struct jffs2_xattr_datum *xd)
-{
-	return (xd->version == XDATUM_DELETE_MARKER);
-}
-
-static inline void set_xattr_datum_dead(struct jffs2_xattr_datum *xd)
-{
-	xd->version = XDATUM_DELETE_MARKER;
-}
-
 static inline int is_xattr_ref_dead(struct jffs2_xattr_ref *ref)
 {
 	return ((ref->xseqno & XREF_DELETE_MARKER) != 0);
 }
 
-static inline void set_xattr_ref_dead(struct jffs2_xattr_ref *ref)
-{
-	ref->xseqno |= XREF_DELETE_MARKER;
-}
-
-static inline void clr_xattr_ref_dead(struct jffs2_xattr_ref *ref)
-{
-	ref->xseqno &= ~XREF_DELETE_MARKER;
-}
-
-
 #ifdef CONFIG_JFFS2_FS_XATTR
 
 extern void jffs2_init_xattr_subsystem(struct jffs2_sb_info *c);
-- 
cgit v1.2.3


From 355ed4e141203fd7266ef9d90d57be0c61bd1aa4 Mon Sep 17 00:00:00 2001
From: KaiGai Kohei <kaigai@ak.jp.nec.com>
Date: Sat, 24 Jun 2006 09:15:36 +0900
Subject: [JFFS2][XATTR] Fix memory leak with jffs2_xattr_ref

If xattr_ref is associated with an orphan inode_cache
on filesystem mounting, those xattr_refs are not
released even if this inode_cache is released.

This patch enables to call jffs2_xattr_delete_inode()
for such a irregular inode_cachde too.

Signed-off-by: KaiGai Kohei <kaigai@ak.jp.nec.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/fs.c        | 2 --
 fs/jffs2/gc.c        | 1 +
 fs/jffs2/nodelist.c  | 3 +++
 fs/jffs2/readinode.c | 1 +
 4 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 2900ec3ec3a..97caa77d60c 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -227,8 +227,6 @@ void jffs2_clear_inode (struct inode *inode)
 	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
 
 	D1(printk(KERN_DEBUG "jffs2_clear_inode(): ino #%lu mode %o\n", inode->i_ino, inode->i_mode));
-
-	jffs2_xattr_delete_inode(c, f->inocache);
 	jffs2_do_clear_inode(c, f);
 }
 
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index f59b147661c..daff3341ff9 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -165,6 +165,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c)
 			D1(printk(KERN_DEBUG "Skipping check of ino #%d with nlink zero\n",
 				  ic->ino));
 			spin_unlock(&c->inocache_lock);
+			jffs2_xattr_delete_inode(c, ic);
 			continue;
 		}
 		switch(ic->state) {
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 927dfe42ba7..7675b33396c 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -906,6 +906,9 @@ void jffs2_del_ino_cache(struct jffs2_sb_info *c, struct jffs2_inode_cache *old)
 {
 	struct jffs2_inode_cache **prev;
 
+#ifdef CONFIG_JFFS2_FS_XATTR
+	BUG_ON(old->xref);
+#endif
 	dbg_inocache("del %p (ino #%u)\n", old, old->ino);
 	spin_lock(&c->inocache_lock);
 
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 5fec012b02e..cc1899268c4 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -968,6 +968,7 @@ void jffs2_do_clear_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f)
 	struct jffs2_full_dirent *fd, *fds;
 	int deleted;
 
+	jffs2_xattr_delete_inode(c, f->inocache);
 	down(&f->sem);
 	deleted = f->inocache && !f->inocache->nlink;
 
-- 
cgit v1.2.3


From 2c887e2359f6e7217cdaa17994ca94ef328b658f Mon Sep 17 00:00:00 2001
From: KaiGai Kohei <kaigai@ak.jp.nec.com>
Date: Sat, 24 Jun 2006 09:16:50 +0900
Subject: [JFFS2][XATTR] Re-define xd->refcnt as atomic_t

In jffs2_release_xattr_datum(), it refers xd->refcnt to ensure
whether releasing xd is allowed or not.
But we can't hold xattr_sem since this function is called under
spin_lock(&c->erase_completion_lock). Thus we have to refer it
without any locking.

This patch redefine xd->refcnt as atomic_t. It enables to refer
xd->refcnt without any locking.

Signed-off-by: KaiGai Kohei <kaigai@ak.jp.nec.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/xattr.c | 25 +++++++++++--------------
 fs/jffs2/xattr.h |  2 +-
 2 files changed, 12 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 7622f79cb5b..18e66dbf23b 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -345,7 +345,7 @@ static struct jffs2_xattr_datum *create_xattr_datum(struct jffs2_sb_info *c,
 		    && xd->value_len==xsize
 		    && !strcmp(xd->xname, xname)
 		    && !memcmp(xd->xvalue, xvalue, xsize)) {
-			xd->refcnt++;
+			atomic_inc(&xd->refcnt);
 			return xd;
 		}
 	}
@@ -365,7 +365,7 @@ static struct jffs2_xattr_datum *create_xattr_datum(struct jffs2_sb_info *c,
 	strcpy(data, xname);
 	memcpy(data + name_len + 1, xvalue, xsize);
 
-	xd->refcnt = 1;
+	atomic_set(&xd->refcnt, 1);
 	xd->xid = ++c->highest_xid;
 	xd->flags |= JFFS2_XFLAGS_HOT;
 	xd->xprefix = xprefix;
@@ -397,7 +397,7 @@ static struct jffs2_xattr_datum *create_xattr_datum(struct jffs2_sb_info *c,
 static void delete_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
 {
 	/* must be called under down_write(xattr_sem) */
-	BUG_ON(xd->refcnt);
+	BUG_ON(atomic_read(&xd->refcnt));
 
 	unload_xattr_datum(c, xd);
 	xd->flags |= JFFS2_XFLAGS_DEAD;
@@ -580,7 +580,7 @@ static void delete_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *re
 	dbg_xattr("xref(ino=%u, xid=%u, xseqno=%u) was removed.\n",
 		  ref->ino, ref->xid, ref->xseqno);
 
-	if (!--xd->refcnt)
+	if (atomic_dec_and_test(&xd->refcnt))
 		delete_xattr_datum(c, xd);
 }
 
@@ -612,8 +612,7 @@ void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *i
 	for (ref = ic->xref; ref; ref = _ref) {
 		_ref = ref->next;
 		xd = ref->xd;
-		xd->refcnt--;
-		if (!xd->refcnt) {
+		if (atomic_dec_and_test(&xd->refcnt)) {
 			unload_xattr_datum(c, xd);
 			jffs2_free_xattr_datum(xd);
 		}
@@ -835,7 +834,7 @@ void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c)
 			}
 			ref->xd = xd;
 			ref->ic = ic;
-			xd->refcnt++;
+			atomic_inc(&xd->refcnt);
 			ref->next = ic->xref;
 			ic->xref = ref;
 		}
@@ -846,7 +845,7 @@ void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c)
 		list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) {
 			xdatum_count++;
 			list_del_init(&xd->xindex);
-			if (!xd->refcnt) {
+			if (!atomic_read(&xd->refcnt)) {
 				dbg_xattr("xdatum(xid=%u, version=%u) is orphan.\n",
 					  xd->xid, xd->version);
 				xd->flags |= JFFS2_XFLAGS_DEAD;
@@ -1120,7 +1119,7 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
 					ref->next = c->xref_dead_list;
 					c->xref_dead_list = ref;
 					spin_unlock(&c->erase_completion_lock);
-					if (!--xd->refcnt)
+					if (atomic_dec_and_test(&xd->refcnt))
 						delete_xattr_datum(c, xd);
 				} else {
 					ref->ic = ic;
@@ -1157,8 +1156,7 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
 	down_write(&c->xattr_sem);
 	if (rc) {
 		JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, request);
-		xd->refcnt--;
-		if (!xd->refcnt)
+		if (atomic_dec_and_test(&xd->refcnt))
 			delete_xattr_datum(c, xd);
 		up_write(&c->xattr_sem);
 		return rc;
@@ -1172,8 +1170,7 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
 			ic->xref = ref;
 		}
 		rc = PTR_ERR(newref);
-		xd->refcnt--;
-		if (!xd->refcnt)
+		if (atomic_dec_and_test(&xd->refcnt))
 			delete_xattr_datum(c, xd);
 	} else if (ref) {
 		delete_xattr_ref(c, ref);
@@ -1304,7 +1301,7 @@ int jffs2_verify_xattr(struct jffs2_sb_info *c)
 void jffs2_release_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd)
 {
 	/* must be called under spin_lock(&c->erase_completion_lock) */
-	if (xd->refcnt > 0 || xd->node != (void *)xd)
+	if (atomic_read(&xd->refcnt) || xd->node != (void *)xd)
 		return;
 
 	list_del(&xd->xindex);
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index 4a10abc6d91..06a5c69dcf8 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -28,7 +28,7 @@ struct jffs2_xattr_datum
 	uint16_t xprefix;		/* see JFFS2_XATTR_PREFIX_* */
 
 	struct list_head xindex;	/* chained from c->xattrindex[n] */
-	uint32_t refcnt;		/* # of xattr_ref refers this */
+	atomic_t refcnt;		/* # of xattr_ref refers this */
 	uint32_t xid;
 	uint32_t version;
 
-- 
cgit v1.2.3


From 332a6b99c587161a972eaa1b00ae0e0dc49fbbfd Mon Sep 17 00:00:00 2001
From: KaiGai Kohei <kaigai@ak.jp.nec.com>
Date: Sat, 24 Jun 2006 09:17:42 +0900
Subject: [JFFS2][XATTR] Fix wrong copyright

summary.c was modified at 2006.

Signed-off-by: KaiGai Kohei <kaigai@ak.jp.nec.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/jffs2/summary.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index c430f1d217e..1deb92acd2b 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -5,7 +5,7 @@
  *                     Zoltan Sogor <weth@inf.u-szeged.hu>,
  *                     Patrik Kluba <pajko@halom.u-szeged.hu>,
  *                     University of Szeged, Hungary
- *               2005  KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *               2006  KaiGai Kohei <kaigai@ak.jp.nec.com>
  *
  * For licensing information, see the file 'LICENCE' in this directory.
  *
-- 
cgit v1.2.3


From 04510dee3c21ac208351cc6de7b70a5459ab3029 Mon Sep 17 00:00:00 2001
From: KaiGai Kohei <kaigai@ak.jp.nec.com>
Date: Sat, 24 Jun 2006 09:21:13 +0900
Subject: [JFFS2][XATTR] coexistence between xattr and write buffering support.

Drop '&& !JFFS2_FS_WRITEBUFFER' from fs/Kconfig.
The series of previous patches enables to use those
functionality at same time.

Signed-off-by: KaiGai Kohei <kaigai@ak.jp.nec.com>
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
---
 fs/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 1cdc043922d..ee5effac1ad 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1116,7 +1116,7 @@ config JFFS2_SUMMARY
 
 config JFFS2_FS_XATTR
 	bool "JFFS2 XATTR support (EXPERIMENTAL)"
-	depends on JFFS2_FS && EXPERIMENTAL && !JFFS2_FS_WRITEBUFFER
+	depends on JFFS2_FS && EXPERIMENTAL
 	default n
 	help
 	  Extended attributes are name:value pairs associated with inodes by
-- 
cgit v1.2.3


From 0223cf0b10bdb3b557d8884b1a957cc64be843c3 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 27 Jun 2006 19:50:57 +0000
Subject: [CIFS] Fix alignment of unicode strings in previous patch

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/sess.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 7737edd1baf..b7d49c03985 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -83,11 +83,11 @@ static void unicode_ssetup_strings(char ** pbcc_area, struct cifsSesInfo *ses,
 	/* BB FIXME add check that strings total less
 	than 335 or will need to send them as arrays */
 
-	/* align unicode strings, must be word aligned */
-	if ((long) bcc_ptr % 2)	{
+	/* unicode strings, must be word aligned before the call */
+/*	if ((long) bcc_ptr % 2)	{
 		*bcc_ptr = 0;
 		bcc_ptr++;
-	}
+	} */
 	/* copy user */
 	if(ses->userName == NULL) {
 		/* BB what about null user mounts - check that we do this BB */
@@ -416,9 +416,14 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 		bcc_ptr += CIFS_SESS_KEY_SIZE;
 		memcpy(bcc_ptr, (char *)ntlm_session_key,CIFS_SESS_KEY_SIZE);
 		bcc_ptr += CIFS_SESS_KEY_SIZE;
-		if(ses->capabilities & CAP_UNICODE)
+		if(ses->capabilities & CAP_UNICODE) {
+			/* unicode strings must be word aligned */
+			if (iov[0].iov_len % 2) {
+				*bcc_ptr = 0;
+				bcc_ptr++;		
+			}	
 			unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
-		else
+		} else
 			ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
 	} else if (type == NTLMv2) {
 		char * v2_sess_key = 
@@ -455,9 +460,12 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 		memcpy(bcc_ptr, (char *)v2_sess_key, sizeof(struct ntlmv2_resp));
 		bcc_ptr += sizeof(struct ntlmv2_resp);
 		kfree(v2_sess_key);
-		if(ses->capabilities & CAP_UNICODE)
+		if(ses->capabilities & CAP_UNICODE) {
+			if(iov[0].iov_len % 2) {
+				*bcc_ptr = 0;
+			}	bcc_ptr++;
 			unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
-		else
+		} else
 			ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
 	} else /* NTLMSSP or SPNEGO */ {
 		pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
-- 
cgit v1.2.3


From 266bee88699ddbde42ab303bbc426a105cc49809 Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Tue, 27 Jun 2006 12:59:15 -0700
Subject: [PATCH] fix static linking of NFS

Builds on ARM report link problems with common configurations like
statically linked NFS (for nfsroot).  The symptom is that __init
section code references __exit section code; that won't work since
the exit sections are discarded (since they can never be called).

The best fix for these particular cases would be an "__init_or_exit"
section annotation.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Acked-by: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfs/direct.c   | 2 +-
 fs/nfs/inode.c    | 2 +-
 fs/nfs/internal.h | 8 ++++----
 fs/nfs/pagelist.c | 2 +-
 fs/nfs/read.c     | 2 +-
 fs/nfs/write.c    | 2 +-
 6 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 402005c35ab..8ca9707be6c 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -909,7 +909,7 @@ int __init nfs_init_directcache(void)
  * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
  *
  */
-void __exit nfs_destroy_directcache(void)
+void nfs_destroy_directcache(void)
 {
 	if (kmem_cache_destroy(nfs_direct_cachep))
 		printk(KERN_INFO "nfs_direct_cache: not all structures were freed\n");
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 51bc88b662f..c5b916605fb 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1132,7 +1132,7 @@ static int __init nfs_init_inodecache(void)
 	return 0;
 }
 
-static void __exit nfs_destroy_inodecache(void)
+static void nfs_destroy_inodecache(void)
 {
 	if (kmem_cache_destroy(nfs_inode_cachep))
 		printk(KERN_INFO "nfs_inode_cache: not all structures were freed\n");
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index bd2815e2dec..4fe51c1292b 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -31,15 +31,15 @@ extern struct svc_version nfs4_callback_version1;
 
 /* pagelist.c */
 extern int __init nfs_init_nfspagecache(void);
-extern void __exit nfs_destroy_nfspagecache(void);
+extern void nfs_destroy_nfspagecache(void);
 extern int __init nfs_init_readpagecache(void);
-extern void __exit nfs_destroy_readpagecache(void);
+extern void nfs_destroy_readpagecache(void);
 extern int __init nfs_init_writepagecache(void);
-extern void __exit nfs_destroy_writepagecache(void);
+extern void nfs_destroy_writepagecache(void);
 
 #ifdef CONFIG_NFS_DIRECTIO
 extern int __init nfs_init_directcache(void);
-extern void __exit nfs_destroy_directcache(void);
+extern void nfs_destroy_directcache(void);
 #else
 #define nfs_init_directcache() (0)
 #define nfs_destroy_directcache() do {} while(0)
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index ef9429643eb..d89f6fb3b3a 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -390,7 +390,7 @@ int __init nfs_init_nfspagecache(void)
 	return 0;
 }
 
-void __exit nfs_destroy_nfspagecache(void)
+void nfs_destroy_nfspagecache(void)
 {
 	if (kmem_cache_destroy(nfs_page_cachep))
 		printk(KERN_INFO "nfs_page: not all structures were freed\n");
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 41c2ffee24f..32cf3773af0 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -711,7 +711,7 @@ int __init nfs_init_readpagecache(void)
 	return 0;
 }
 
-void __exit nfs_destroy_readpagecache(void)
+void nfs_destroy_readpagecache(void)
 {
 	mempool_destroy(nfs_rdata_mempool);
 	if (kmem_cache_destroy(nfs_rdata_cachep))
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index b383fdd3a15..8fccb9cb173 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1551,7 +1551,7 @@ int __init nfs_init_writepagecache(void)
 	return 0;
 }
 
-void __exit nfs_destroy_writepagecache(void)
+void nfs_destroy_writepagecache(void)
 {
 	mempool_destroy(nfs_commit_mempool);
 	mempool_destroy(nfs_wdata_mempool);
-- 
cgit v1.2.3


From 73024cf11522c0233228453984b2a2b37885e336 Mon Sep 17 00:00:00 2001
From: Eric Sesterhenn <snakebyte@gmx.de>
Date: Wed, 28 Jun 2006 08:42:26 +1000
Subject: [XFS] Fix realtime subvolume expansion, a porting bug b0rked it. 
 Coverity made me look at this code (bug id #344). We only return with
 XFS_ERROR(EINVAL) if mp->m_rtdev_targp is valid and pass it otherwise to
 xfs_read_buf() where some function calls later it gets dereferenced by an
 assert.

SGI-PV: 954266
SGI-Modid: xfs-linux-melb:xfs-kern:26363a

Signed-off-by: Eric Sesterhenn <snakebyte@gmx.de>
Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/xfs_rtalloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 0c1e42b037e..5a0b678956e 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1929,7 +1929,7 @@ xfs_growfs_rt(
 	/*
 	 * Initial error checking.
 	 */
-	if (mp->m_rtdev_targp || mp->m_rbmip == NULL ||
+	if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL ||
 	    (nrblocks = in->newblocks) <= sbp->sb_rblocks ||
 	    (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize)))
 		return XFS_ERROR(EINVAL);
-- 
cgit v1.2.3


From f40c562855294bf4e7268274d7461dc32c1e6b25 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Wed, 28 Jun 2006 00:13:38 +0000
Subject: [CIFS] Fix authentication choice so we do not force NTLMv2 unless the
 user specifies it is required or turns of ntlm

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifssmb.c | 10 +++++++---
 fs/cifs/sess.c    |  3 ++-
 2 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index de405bfb67d..19678c575df 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -415,6 +415,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	else /* if override flags set only sign/seal OR them with global auth */
 		secFlags = extended_security | ses->overrideSecFlg;
 
+	cFYI(1,("secFlags 0x%x",secFlags));
+
 	pSMB->hdr.Mid = GetNextMid(server);
 	pSMB->hdr.Flags2 |= SMBFLG2_UNICODE;
 	if((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
@@ -511,11 +513,13 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			cERROR(1,("Server requests plain text password"
 				  " but client support disabled"));
 
-	if(secFlags & CIFSSEC_MUST_NTLMV2)
+	if((secFlags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2)
 		server->secType = NTLMv2;
-	else
+	else if(secFlags & CIFSSEC_MAY_NTLM)
 		server->secType = NTLM;
-	/* else krb5 ... */
+	else if(secFlags & CIFSSEC_MAY_NTLMV2)
+		server->secType = NTLMv2;
+	/* else krb5 ... any others ... */
 
 	/* one byte, so no need to convert this or EncryptionKeyLen from
 	   little endian */
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index b7d49c03985..7202d534ef0 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -323,11 +323,12 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 	__u16 action;
 	int bytes_remaining;
 
-	cFYI(1,("new sess setup"));
 	if(ses == NULL)
 		return -EINVAL;
 
 	type = ses->server->secType;
+
+	cFYI(1,("sess setup type %d",type));
 	if(type == LANMAN) {
 #ifndef CONFIG_CIFS_WEAK_PW_HASH
 		/* LANMAN and plaintext are less secure and off by default.
-- 
cgit v1.2.3


From 6fdf8ccc09fd764a9cce11006aa3fca53ac1c895 Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Wed, 28 Jun 2006 10:13:52 +1000
Subject: [XFS] Rework code snippets slightly to remove remaining recent-gcc
 warnings.

SGI-PV: 904196
SGI-Modid: xfs-linux-melb:xfs-kern:26364a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/xfs_inode.c       | 4 ++--
 fs/xfs/xfs_log_recover.c | 2 ++
 fs/xfs/xfs_mount.c       | 3 ++-
 3 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 5fa0adb7e17..86c1bf0bba9 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1961,9 +1961,9 @@ xfs_iunlink_remove(
 	xfs_agino_t	agino;
 	xfs_agino_t	next_agino;
 	xfs_buf_t	*last_ibp;
-	xfs_dinode_t	*last_dip;
+	xfs_dinode_t	*last_dip = NULL;
 	short		bucket_index;
-	int		offset, last_offset;
+	int		offset, last_offset = 0;
 	int		error;
 	int		agi_ok;
 
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 55b4237c215..3cb678e3a13 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -990,6 +990,8 @@ xlog_find_zeroed(
 	xfs_daddr_t     num_scan_bblks;
 	int	        error, log_bbnum = log->l_logBBsize;
 
+	*blk_no = 0;
+
 	/* check totally zeroed log */
 	bp = xlog_get_bp(log, 1);
 	if (!bp)
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 10dbf203c62..22d3a1c7547 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -2026,7 +2026,7 @@ xfs_icsb_balance_counter(
 	xfs_sb_field_t  field,
 	int		flags)
 {
-	uint64_t	count, resid = 0;
+	uint64_t	count, resid;
 	int		weight = num_online_cpus();
 	int		s;
 
@@ -2058,6 +2058,7 @@ xfs_icsb_balance_counter(
 		break;
 	default:
 		BUG();
+		count = resid = 0;	/* quiet, gcc */
 		break;
 	}
 
-- 
cgit v1.2.3


From e6e5494cb23d1933735ee47cc674ffe1c4afed6f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 27 Jun 2006 02:53:50 -0700
Subject: [PATCH] vdso: randomize the i386 vDSO by moving it into a vma

Move the i386 VDSO down into a vma and thus randomize it.

Besides the security implications, this feature also helps debuggers, which
can COW a vma-backed VDSO just like a normal DSO and can thus do
single-stepping and other debugging features.

It's good for hypervisors (Xen, VMWare) too, which typically live in the same
high-mapped address space as the VDSO, hence whenever the VDSO is used, they
get lots of guest pagefaults and have to fix such guest accesses up - which
slows things down instead of speeding things up (the primary purpose of the
VDSO).

There's a new CONFIG_COMPAT_VDSO (default=y) option, which provides support
for older glibcs that still rely on a prelinked high-mapped VDSO.  Newer
distributions (using glibc 2.3.3 or later) can turn this option off.  Turning
it off is also recommended for security reasons: attackers cannot use the
predictable high-mapped VDSO page as syscall trampoline anymore.

There is a new vdso=[0|1] boot option as well, and a runtime
/proc/sys/vm/vdso_enabled sysctl switch, that allows the VDSO to be turned
on/off.

(This version of the VDSO-randomization patch also has working ELF
coredumping, the previous patch crashed in the coredumping code.)

This code is a combined work of the exec-shield VDSO randomization
code and Gerd Hoffmann's hypervisor-centric VDSO patch. Rusty Russell
started this patch and i completed it.

[akpm@osdl.org: cleanups]
[akpm@osdl.org: compile fix]
[akpm@osdl.org: compile fix 2]
[akpm@osdl.org: compile fix 3]
[akpm@osdl.org: revernt MAXMEM change]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Cc: Gerd Hoffmann <kraxel@suse.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/task_mmu.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 0137ec4c136..0a163a4f776 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -122,6 +122,11 @@ struct mem_size_stats
 	unsigned long private_dirty;
 };
 
+__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
+{
+	return NULL;
+}
+
 static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss)
 {
 	struct proc_maps_private *priv = m->private;
@@ -158,22 +163,23 @@ static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats
 		pad_len_spaces(m, len);
 		seq_path(m, file->f_vfsmnt, file->f_dentry, "\n");
 	} else {
-		if (mm) {
-			if (vma->vm_start <= mm->start_brk &&
+		const char *name = arch_vma_name(vma);
+		if (!name) {
+			if (mm) {
+				if (vma->vm_start <= mm->start_brk &&
 						vma->vm_end >= mm->brk) {
-				pad_len_spaces(m, len);
-				seq_puts(m, "[heap]");
-			} else {
-				if (vma->vm_start <= mm->start_stack &&
-					vma->vm_end >= mm->start_stack) {
-
-					pad_len_spaces(m, len);
-					seq_puts(m, "[stack]");
+					name = "[heap]";
+				} else if (vma->vm_start <= mm->start_stack &&
+					   vma->vm_end >= mm->start_stack) {
+					name = "[stack]";
 				}
+			} else {
+				name = "[vdso]";
 			}
-		} else {
+		}
+		if (name) {
 			pad_len_spaces(m, len);
-			seq_puts(m, "[vdso]");
+			seq_puts(m, name);
 		}
 	}
 	seq_putc(m, '\n');
-- 
cgit v1.2.3


From c9cf55285e87ac423c45d9efca750d3f50234d10 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Tue, 27 Jun 2006 02:53:52 -0700
Subject: [PATCH] add poison.h and patch primary users

Localize poison values into one header file for better documentation and
easier/quicker debugging and so that the same values won't be used for
multiple purposes.

Use these constants in core arch., mm, driver, and fs code.

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Acked-by: Matt Mackall <mpm@selenic.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/jbd/journal.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 7f96b5cb678..8c9b28dff11 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -34,6 +34,7 @@
 #include <linux/suspend.h>
 #include <linux/pagemap.h>
 #include <linux/kthread.h>
+#include <linux/poison.h>
 #include <linux/proc_fs.h>
 
 #include <asm/uaccess.h>
@@ -1675,7 +1676,7 @@ static void journal_free_journal_head(struct journal_head *jh)
 {
 #ifdef CONFIG_JBD_DEBUG
 	atomic_dec(&nr_journal_heads);
-	memset(jh, 0x5b, sizeof(*jh));
+	memset(jh, JBD_POISON_FREE, sizeof(*jh));
 #endif
 	kmem_cache_free(journal_head_cache, jh);
 }
-- 
cgit v1.2.3


From b6cd0b772dcc5dc9b4c03d53946474dee399fa72 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Tue, 27 Jun 2006 02:53:54 -0700
Subject: [PATCH] fs/buffer.c: cleanups

- add a proper prototype for the following global function:
  - buffer_init()

- make the following needlessly global function static:
  - end_buffer_async_write()

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/buffer.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 373bb6292bd..f23bb647db4 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -564,7 +564,7 @@ still_busy:
  * Completion handler for block_write_full_page() - pages which are unlocked
  * during I/O, and which have PageWriteback cleared upon I/O completion.
  */
-void end_buffer_async_write(struct buffer_head *bh, int uptodate)
+static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 {
 	char b[BDEVNAME_SIZE];
 	unsigned long flags;
@@ -3166,7 +3166,6 @@ EXPORT_SYMBOL(block_sync_page);
 EXPORT_SYMBOL(block_truncate_page);
 EXPORT_SYMBOL(block_write_full_page);
 EXPORT_SYMBOL(cont_prepare_write);
-EXPORT_SYMBOL(end_buffer_async_write);
 EXPORT_SYMBOL(end_buffer_read_sync);
 EXPORT_SYMBOL(end_buffer_write_sync);
 EXPORT_SYMBOL(file_fsync);
-- 
cgit v1.2.3


From 34af946a22724c4e2b204957f2b24b22a0fb121c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 27 Jun 2006 02:53:55 -0700
Subject: [PATCH] spin/rwlock init cleanups

locking init cleanups:

 - convert " = SPIN_LOCK_UNLOCKED" to spin_lock_init() or DEFINE_SPINLOCK()
 - convert rwlocks in a similar manner

this patch was generated automatically.

Motivation:

 - cleanliness
 - lockdep needs control of lock initialization, which the open-coded
   variants do not give
 - it's also useful for -rt and for lock debugging in general

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/nfs4state.c          | 2 +-
 fs/ocfs2/cluster/heartbeat.c | 2 +-
 fs/ocfs2/cluster/tcp.c       | 2 +-
 fs/ocfs2/dlm/dlmdomain.c     | 2 +-
 fs/ocfs2/dlm/dlmlock.c       | 2 +-
 fs/ocfs2/dlm/dlmrecovery.c   | 4 ++--
 fs/ocfs2/dlmglue.c           | 2 +-
 fs/ocfs2/journal.c           | 2 +-
 8 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 1630b5670dc..7c7d01672d3 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -123,7 +123,7 @@ static void release_stateid(struct nfs4_stateid *stp, int flags);
  */
 
 /* recall_lock protects the del_recall_lru */
-static spinlock_t recall_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(recall_lock);
 static struct list_head del_recall_lru;
 
 static void
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 21f38accd03..1d26cfcd9f8 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -54,7 +54,7 @@ static DECLARE_RWSEM(o2hb_callback_sem);
  * multiple hb threads are watching multiple regions.  A node is live
  * whenever any of the threads sees activity from the node in its region.
  */
-static spinlock_t o2hb_live_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(o2hb_live_lock);
 static struct list_head o2hb_live_slots[O2NM_MAX_NODES];
 static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
 static LIST_HEAD(o2hb_node_events);
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 0f60cc0d398..1591eb37a72 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -108,7 +108,7 @@
 	    ##args);							\
 } while (0)
 
-static rwlock_t o2net_handler_lock = RW_LOCK_UNLOCKED;
+static DEFINE_RWLOCK(o2net_handler_lock);
 static struct rb_root o2net_handler_tree = RB_ROOT;
 
 static struct o2net_node o2net_nodes[O2NM_MAX_NODES];
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index ba27c5c5e95..b8c23f7ba67 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -88,7 +88,7 @@ out_free:
  *
  */
 
-spinlock_t dlm_domain_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(dlm_domain_lock);
 LIST_HEAD(dlm_domains);
 static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events);
 
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index d6f89577e25..5ca57ec650c 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -53,7 +53,7 @@
 #define MLOG_MASK_PREFIX ML_DLM
 #include "cluster/masklog.h"
 
-static spinlock_t dlm_cookie_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(dlm_cookie_lock);
 static u64 dlm_next_cookie = 1;
 
 static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index da399013516..29b2845f370 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -98,8 +98,8 @@ static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data);
 
 static u64 dlm_get_next_mig_cookie(void);
 
-static spinlock_t dlm_reco_state_lock = SPIN_LOCK_UNLOCKED;
-static spinlock_t dlm_mig_cookie_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(dlm_reco_state_lock);
+static DEFINE_SPINLOCK(dlm_mig_cookie_lock);
 static u64 dlm_mig_cookie = 1;
 
 static u64 dlm_get_next_mig_cookie(void)
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 64cd52860c8..4acd37286bd 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -242,7 +242,7 @@ static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
 	mlog_exit_void();
 }
 
-static spinlock_t ocfs2_dlm_tracking_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock);
 
 static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res,
 				       struct ocfs2_dlm_debug *dlm_debug)
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 3fe8781c22c..910a601b2e9 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -49,7 +49,7 @@
 
 #include "buffer_head_io.h"
 
-spinlock_t trans_inc_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(trans_inc_lock);
 
 static int ocfs2_force_read_journal(struct inode *inode);
 static int ocfs2_recover_node(struct ocfs2_super *osb,
-- 
cgit v1.2.3


From 05f225dc8729d3e5703b8c34e750fdca67bcefe6 Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Tue, 27 Jun 2006 02:53:59 -0700
Subject: [PATCH] ufs: ufs_read_inode cleanup

Add missed ufsi->i_dir_start_lookup initialization in ufs_read_inode in
UFS2 case.  Also it cleans ufs_read_inode function to prevent such kind of
situation in the future: it move depend on UFS type parts of code into
separate functions and leaves in ufs_read_inode only generic code.  It
cleans code and avoids duplication.

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ufs/inode.c | 111 +++++++++++++++++++++++++++++----------------------------
 1 file changed, 57 insertions(+), 54 deletions(-)

(limited to 'fs')

diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index f2dbdf5a876..259bd196099 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -605,39 +605,12 @@ static void ufs_set_inode_ops(struct inode *inode)
 				   ufs_get_inode_dev(inode->i_sb, UFS_I(inode)));
 }
 
-void ufs_read_inode (struct inode * inode)
+static void ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
 {
 	struct ufs_inode_info *ufsi = UFS_I(inode);
-	struct super_block * sb;
-	struct ufs_sb_private_info * uspi;
-	struct ufs_inode * ufs_inode;	
-	struct ufs2_inode *ufs2_inode;
-	struct buffer_head * bh;
+	struct super_block *sb = inode->i_sb;
 	mode_t mode;
 	unsigned i;
-	unsigned flags;
-	
-	UFSD("ENTER, ino %lu\n", inode->i_ino);
-	
-	sb = inode->i_sb;
-	uspi = UFS_SB(sb)->s_uspi;
-	flags = UFS_SB(sb)->s_flags;
-
-	if (inode->i_ino < UFS_ROOTINO || 
-	    inode->i_ino > (uspi->s_ncg * uspi->s_ipg)) {
-		ufs_warning (sb, "ufs_read_inode", "bad inode number (%lu)\n", inode->i_ino);
-		goto bad_inode;
-	}
-	
-	bh = sb_bread(sb, uspi->s_sbbase + ufs_inotofsba(inode->i_ino));
-	if (!bh) {
-		ufs_warning (sb, "ufs_read_inode", "unable to read inode %lu\n", inode->i_ino);
-		goto bad_inode;
-	}
-	if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
-		goto ufs2_inode;
-
-	ufs_inode = (struct ufs_inode *) (bh->b_data + sizeof(struct ufs_inode) * ufs_inotofsbo(inode->i_ino));
 
 	/*
 	 * Copy data to the in-core inode.
@@ -661,14 +634,11 @@ void ufs_read_inode (struct inode * inode)
 	inode->i_atime.tv_nsec = 0;
 	inode->i_ctime.tv_nsec = 0;
 	inode->i_blocks = fs32_to_cpu(sb, ufs_inode->ui_blocks);
-	inode->i_blksize = PAGE_SIZE;   /* This is the optimal IO size (for stat) */
-	inode->i_version++;
 	ufsi->i_flags = fs32_to_cpu(sb, ufs_inode->ui_flags);
 	ufsi->i_gen = fs32_to_cpu(sb, ufs_inode->ui_gen);
 	ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow);
 	ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag);
-	ufsi->i_lastfrag = (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift;
-	ufsi->i_dir_start_lookup = 0;
+
 	
 	if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) {
 		for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++)
@@ -677,24 +647,16 @@ void ufs_read_inode (struct inode * inode)
 		for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++)
 			ufsi->i_u1.i_symlink[i] = ufs_inode->ui_u2.ui_symlink[i];
 	}
-	ufsi->i_osync = 0;
-
-	ufs_set_inode_ops(inode);
-
-	brelse (bh);
-
-	UFSD("EXIT\n");
-	return;
+}
 
-bad_inode:
-	make_bad_inode(inode);
-	return;
+static void ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
+{
+	struct ufs_inode_info *ufsi = UFS_I(inode);
+	struct super_block *sb = inode->i_sb;
+	mode_t mode;
+	unsigned i;
 
-ufs2_inode :
 	UFSD("Reading ufs2 inode, ino %lu\n", inode->i_ino);
-
-	ufs2_inode = (struct ufs2_inode *)(bh->b_data + sizeof(struct ufs2_inode) * ufs_inotofsbo(inode->i_ino));
-
 	/*
 	 * Copy data to the in-core inode.
 	 */
@@ -717,26 +679,64 @@ ufs2_inode :
 	inode->i_atime.tv_nsec = 0;
 	inode->i_ctime.tv_nsec = 0;
 	inode->i_blocks = fs64_to_cpu(sb, ufs2_inode->ui_blocks);
-	inode->i_blksize = PAGE_SIZE; /*This is the optimal IO size(for stat)*/
-
-	inode->i_version++;
 	ufsi->i_flags = fs32_to_cpu(sb, ufs2_inode->ui_flags);
 	ufsi->i_gen = fs32_to_cpu(sb, ufs2_inode->ui_gen);
 	/*
 	ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow);
 	ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag);
 	*/
-	ufsi->i_lastfrag= (inode->i_size + uspi->s_fsize- 1) >> uspi->s_fshift;
 
 	if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) {
 		for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++)
 			ufsi->i_u1.u2_i_data[i] =
 				ufs2_inode->ui_u2.ui_addr.ui_db[i];
-	}
-	else {
+	} else {
 		for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++)
 			ufsi->i_u1.i_symlink[i] = ufs2_inode->ui_u2.ui_symlink[i];
 	}
+}
+
+void ufs_read_inode(struct inode * inode)
+{
+	struct ufs_inode_info *ufsi = UFS_I(inode);
+	struct super_block * sb;
+	struct ufs_sb_private_info * uspi;
+	struct buffer_head * bh;
+
+	UFSD("ENTER, ino %lu\n", inode->i_ino);
+
+	sb = inode->i_sb;
+	uspi = UFS_SB(sb)->s_uspi;
+
+	if (inode->i_ino < UFS_ROOTINO ||
+	    inode->i_ino > (uspi->s_ncg * uspi->s_ipg)) {
+		ufs_warning(sb, "ufs_read_inode", "bad inode number (%lu)\n",
+			    inode->i_ino);
+		goto bad_inode;
+	}
+
+	bh = sb_bread(sb, uspi->s_sbbase + ufs_inotofsba(inode->i_ino));
+	if (!bh) {
+		ufs_warning(sb, "ufs_read_inode", "unable to read inode %lu\n",
+			    inode->i_ino);
+		goto bad_inode;
+	}
+	if ((UFS_SB(sb)->s_flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
+		struct ufs2_inode *ufs2_inode = (struct ufs2_inode *)bh->b_data;
+
+		ufs2_read_inode(inode,
+				ufs2_inode + ufs_inotofsbo(inode->i_ino));
+	} else {
+		struct ufs_inode *ufs_inode = (struct ufs_inode *)bh->b_data;
+
+		ufs1_read_inode(inode, ufs_inode + ufs_inotofsbo(inode->i_ino));
+	}
+
+	inode->i_blksize = PAGE_SIZE;/*This is the optimal IO size (for stat)*/
+	inode->i_version++;
+	ufsi->i_lastfrag =
+		(inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift;
+	ufsi->i_dir_start_lookup = 0;
 	ufsi->i_osync = 0;
 
 	ufs_set_inode_ops(inode);
@@ -745,6 +745,9 @@ ufs2_inode :
 
 	UFSD("EXIT\n");
 	return;
+
+bad_inode:
+	make_bad_inode(inode);
 }
 
 static int ufs_update_inode(struct inode * inode, int do_sync)
-- 
cgit v1.2.3


From 5a67e4c5b6faaccf31740a07d93704166405d880 Mon Sep 17 00:00:00 2001
From: Chandra Seetharaman <sekharan@us.ibm.com>
Date: Tue, 27 Jun 2006 02:54:11 -0700
Subject: [PATCH] cpu hotplug: use hotplug version of cpu notifier in
 appropriate places

Make use the of newly defined hotplug version of cpu_notifier functionality
wherever appropriate.

Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Cc: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/xfs/xfs_mount.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 10dbf203c62..ed7579beb6b 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1721,15 +1721,14 @@ xfs_mount_log_sbunit(
  * is present to prevent thrashing).
  */
 
+#ifdef CONFIG_HOTPLUG_CPU
 /*
  * hot-plug CPU notifier support.
  *
- * We cannot use the hotcpu_register() function because it does
- * not allow notifier instances. We need a notifier per filesystem
- * as we need to be able to identify the filesystem to balance
- * the counters out. This is achieved by having a notifier block
- * embedded in the xfs_mount_t and doing pointer magic to get the
- * mount pointer from the notifier block address.
+ * We need a notifier per filesystem as we need to be able to identify
+ * the filesystem to balance the counters out. This is achieved by
+ * having a notifier block embedded in the xfs_mount_t and doing pointer
+ * magic to get the mount pointer from the notifier block address.
  */
 STATIC int
 xfs_icsb_cpu_notify(
@@ -1779,6 +1778,7 @@ xfs_icsb_cpu_notify(
 
 	return NOTIFY_OK;
 }
+#endif /* CONFIG_HOTPLUG_CPU */
 
 int
 xfs_icsb_init_counters(
@@ -1791,9 +1791,11 @@ xfs_icsb_init_counters(
 	if (mp->m_sb_cnts == NULL)
 		return -ENOMEM;
 
+#ifdef CONFIG_HOTPLUG_CPU
 	mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify;
 	mp->m_icsb_notifier.priority = 0;
-	register_cpu_notifier(&mp->m_icsb_notifier);
+	register_hotcpu_notifier(&mp->m_icsb_notifier);
+#endif /* CONFIG_HOTPLUG_CPU */
 
 	for_each_online_cpu(i) {
 		cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
@@ -1812,7 +1814,7 @@ xfs_icsb_destroy_counters(
 	xfs_mount_t	*mp)
 {
 	if (mp->m_sb_cnts) {
-		unregister_cpu_notifier(&mp->m_icsb_notifier);
+		unregister_hotcpu_notifier(&mp->m_icsb_notifier);
 		free_percpu(mp->m_sb_cnts);
 	}
 }
-- 
cgit v1.2.3


From 4ad98457aa545bc4d03d417da86325507aa586ec Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jesper.juhl@gmail.com>
Date: Tue, 27 Jun 2006 02:55:04 -0700
Subject: [PATCH] Remove redundant NULL checks before [kv]free - in fs/

Remove redundant NULL checks before kfree for fs/

Signed-off-by: Jesper Juhl <jesper.juhl@gmail.com>
Acked-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ocfs2/vote.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
index ee42765a855..cf70fe2075b 100644
--- a/fs/ocfs2/vote.c
+++ b/fs/ocfs2/vote.c
@@ -988,9 +988,7 @@ int ocfs2_request_mount_vote(struct ocfs2_super *osb)
 	}
 
 bail:
-	if (request)
-		kfree(request);
-
+	kfree(request);
 	return status;
 }
 
@@ -1021,9 +1019,7 @@ int ocfs2_request_umount_vote(struct ocfs2_super *osb)
 	}
 
 bail:
-	if (request)
-		kfree(request);
-
+	kfree(request);
 	return status;
 }
 
-- 
cgit v1.2.3


From 5493a0fcba8a94baa5c1d80aca21d6ae4b3b573e Mon Sep 17 00:00:00 2001
From: Nathan Scott <nathans@sgi.com>
Date: Wed, 28 Jun 2006 11:17:28 +1000
Subject: [XFS] Fixup whitespace damage in log_write, remove final warning.

SGI-PV: 904196
SGI-Modid: xfs-linux-melb:xfs-kern:26366a

Signed-off-by: Nathan Scott <nathans@sgi.com>
---
 fs/xfs/xfs_log.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index d8f5d4cbe8b..e730328636c 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1740,10 +1740,10 @@ xlog_write(xfs_mount_t *	mp,
 	   xlog_in_core_t	**commit_iclog,
 	   uint			flags)
 {
-    xlog_t	     *log    = mp->m_log;
+    xlog_t	     *log = mp->m_log;
     xlog_ticket_t    *ticket = (xlog_ticket_t *)tic;
+    xlog_in_core_t   *iclog = NULL;  /* ptr to current in-core log */
     xlog_op_header_t *logop_head;    /* ptr to log operation header */
-    xlog_in_core_t   *iclog;	     /* ptr to current in-core log */
     __psint_t	     ptr;	     /* copy address into data region */
     int		     len;	     /* # xlog_write() bytes 2 still copy */
     int		     index;	     /* region index currently copying */
-- 
cgit v1.2.3


From f5e54d6e53a20cef45af7499e86164f0e0d16bb2 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 28 Jun 2006 04:26:44 -0700
Subject: [PATCH] mark address_space_operations const

Same as with already do with the file operations: keep them in .rodata and
prevents people from doing runtime patching.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Steven French <sfrench@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/v9fs_vfs.h            | 2 +-
 fs/9p/vfs_addr.c            | 2 +-
 fs/adfs/inode.c             | 2 +-
 fs/affs/affs.h              | 6 +++---
 fs/affs/file.c              | 4 ++--
 fs/affs/symlink.c           | 2 +-
 fs/afs/file.c               | 2 +-
 fs/afs/internal.h           | 2 +-
 fs/befs/linuxvfs.c          | 2 +-
 fs/bfs/bfs.h                | 2 +-
 fs/bfs/file.c               | 2 +-
 fs/block_dev.c              | 2 +-
 fs/buffer.c                 | 2 +-
 fs/cifs/cifsfs.h            | 4 ++--
 fs/cifs/file.c              | 4 ++--
 fs/coda/symlink.c           | 2 +-
 fs/configfs/inode.c         | 2 +-
 fs/cramfs/inode.c           | 4 ++--
 fs/efs/inode.c              | 2 +-
 fs/efs/symlink.c            | 2 +-
 fs/ext2/ext2.h              | 6 +++---
 fs/ext2/inode.c             | 6 +++---
 fs/ext3/inode.c             | 6 +++---
 fs/fat/inode.c              | 2 +-
 fs/freevxfs/vxfs_immed.c    | 2 +-
 fs/freevxfs/vxfs_inode.c    | 6 +++---
 fs/freevxfs/vxfs_subr.c     | 2 +-
 fs/fuse/file.c              | 2 +-
 fs/hfs/hfs_fs.h             | 4 ++--
 fs/hfs/inode.c              | 4 ++--
 fs/hfsplus/hfsplus_fs.h     | 4 ++--
 fs/hfsplus/inode.c          | 4 ++--
 fs/hostfs/hostfs_kern.c     | 6 +++---
 fs/hpfs/file.c              | 2 +-
 fs/hpfs/hpfs_fn.h           | 4 ++--
 fs/hpfs/namei.c             | 2 +-
 fs/hugetlbfs/inode.c        | 4 ++--
 fs/inode.c                  | 2 +-
 fs/isofs/compress.c         | 2 +-
 fs/isofs/inode.c            | 2 +-
 fs/isofs/isofs.h            | 2 +-
 fs/isofs/rock.c             | 2 +-
 fs/isofs/zisofs.h           | 2 +-
 fs/jffs/inode-v23.c         | 4 ++--
 fs/jffs2/file.c             | 2 +-
 fs/jffs2/os-linux.h         | 2 +-
 fs/jfs/inode.c              | 2 +-
 fs/jfs/jfs_inode.h          | 2 +-
 fs/jfs/jfs_metapage.c       | 2 +-
 fs/jfs/jfs_metapage.h       | 2 +-
 fs/minix/inode.c            | 2 +-
 fs/ncpfs/inode.c            | 2 +-
 fs/ncpfs/symlink.c          | 2 +-
 fs/nfs/file.c               | 2 +-
 fs/ntfs/aops.c              | 4 ++--
 fs/ntfs/ntfs.h              | 4 ++--
 fs/ocfs2/aops.c             | 2 +-
 fs/ocfs2/inode.h            | 2 +-
 fs/qnx4/inode.c             | 2 +-
 fs/ramfs/file-mmu.c         | 2 +-
 fs/ramfs/file-nommu.c       | 2 +-
 fs/ramfs/internal.h         | 2 +-
 fs/reiserfs/inode.c         | 2 +-
 fs/romfs/inode.c            | 2 +-
 fs/smbfs/file.c             | 2 +-
 fs/smbfs/proto.h            | 2 +-
 fs/sysfs/inode.c            | 2 +-
 fs/sysv/itree.c             | 2 +-
 fs/sysv/sysv.h              | 2 +-
 fs/udf/file.c               | 2 +-
 fs/udf/inode.c              | 2 +-
 fs/udf/symlink.c            | 2 +-
 fs/udf/udfdecl.h            | 6 +++---
 fs/ufs/inode.c              | 2 +-
 fs/xfs/linux-2.6/xfs_aops.c | 2 +-
 fs/xfs/linux-2.6/xfs_aops.h | 2 +-
 fs/xfs/linux-2.6/xfs_buf.c  | 2 +-
 77 files changed, 104 insertions(+), 104 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index f867b8d3e97..450b0c1b385 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -38,7 +38,7 @@
  */
 
 extern struct file_system_type v9fs_fs_type;
-extern struct address_space_operations v9fs_addr_operations;
+extern const struct address_space_operations v9fs_addr_operations;
 extern const struct file_operations v9fs_file_operations;
 extern const struct file_operations v9fs_dir_operations;
 extern struct dentry_operations v9fs_dentry_operations;
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index efda46fb64d..d4f0aa3c87f 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -103,6 +103,6 @@ UnmapAndUnlock:
 	return retval;
 }
 
-struct address_space_operations v9fs_addr_operations = {
+const struct address_space_operations v9fs_addr_operations = {
       .readpage = v9fs_vfs_readpage,
 };
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index a02802a3079..534f3eecc98 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -72,7 +72,7 @@ static sector_t _adfs_bmap(struct address_space *mapping, sector_t block)
 	return generic_block_bmap(mapping, block, adfs_get_block);
 }
 
-static struct address_space_operations adfs_aops = {
+static const struct address_space_operations adfs_aops = {
 	.readpage	= adfs_readpage,
 	.writepage	= adfs_writepage,
 	.sync_page	= block_sync_page,
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index a43a876742b..0ddd4cc0d1a 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -195,9 +195,9 @@ extern struct inode_operations   affs_symlink_inode_operations;
 extern const struct file_operations	 affs_file_operations;
 extern const struct file_operations	 affs_file_operations_ofs;
 extern const struct file_operations	 affs_dir_operations;
-extern struct address_space_operations	 affs_symlink_aops;
-extern struct address_space_operations	 affs_aops;
-extern struct address_space_operations	 affs_aops_ofs;
+extern const struct address_space_operations	 affs_symlink_aops;
+extern const struct address_space_operations	 affs_aops;
+extern const struct address_space_operations	 affs_aops_ofs;
 
 extern struct dentry_operations	 affs_dentry_operations;
 extern struct dentry_operations	 affs_dentry_operations_intl;
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 7076262af39..3de8590e4f6 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -406,7 +406,7 @@ static sector_t _affs_bmap(struct address_space *mapping, sector_t block)
 {
 	return generic_block_bmap(mapping,block,affs_get_block);
 }
-struct address_space_operations affs_aops = {
+const struct address_space_operations affs_aops = {
 	.readpage = affs_readpage,
 	.writepage = affs_writepage,
 	.sync_page = block_sync_page,
@@ -759,7 +759,7 @@ out:
 	goto done;
 }
 
-struct address_space_operations affs_aops_ofs = {
+const struct address_space_operations affs_aops_ofs = {
 	.readpage = affs_readpage_ofs,
 	//.writepage = affs_writepage_ofs,
 	//.sync_page = affs_sync_page_ofs,
diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c
index 426f0f094f2..f802256a593 100644
--- a/fs/affs/symlink.c
+++ b/fs/affs/symlink.c
@@ -66,7 +66,7 @@ fail:
 	return err;
 }
 
-struct address_space_operations affs_symlink_aops = {
+const struct address_space_operations affs_symlink_aops = {
 	.readpage	= affs_symlink_readpage,
 };
 
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 7bb716887e2..67d6634101f 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -35,7 +35,7 @@ struct inode_operations afs_file_inode_operations = {
 	.getattr	= afs_inode_getattr,
 };
 
-struct address_space_operations afs_fs_aops = {
+const struct address_space_operations afs_fs_aops = {
 	.readpage	= afs_file_readpage,
 	.sync_page	= block_sync_page,
 	.set_page_dirty	= __set_page_dirty_nobuffers,
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 72febdf9a35..e88b3b65ae4 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -69,7 +69,7 @@ extern const struct file_operations afs_dir_file_operations;
 /*
  * file.c
  */
-extern struct address_space_operations afs_fs_aops;
+extern const struct address_space_operations afs_fs_aops;
 extern struct inode_operations afs_file_inode_operations;
 
 #ifdef AFS_CACHING_SUPPORT
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 08201fab26c..a83e889a97c 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -73,7 +73,7 @@ static struct inode_operations befs_dir_inode_operations = {
 	.lookup		= befs_lookup,
 };
 
-static struct address_space_operations befs_aops = {
+static const struct address_space_operations befs_aops = {
 	.readpage	= befs_readpage,
 	.sync_page	= block_sync_page,
 	.bmap		= befs_bmap,
diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h
index 9d791004b21..31973bbbf05 100644
--- a/fs/bfs/bfs.h
+++ b/fs/bfs/bfs.h
@@ -50,7 +50,7 @@ static inline struct bfs_inode_info *BFS_I(struct inode *inode)
 /* file.c */
 extern struct inode_operations bfs_file_inops;
 extern const struct file_operations bfs_file_operations;
-extern struct address_space_operations bfs_aops;
+extern const struct address_space_operations bfs_aops;
 
 /* dir.c */
 extern struct inode_operations bfs_dir_inops;
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index d83cd74a2e4..3d5aca28a0a 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -153,7 +153,7 @@ static sector_t bfs_bmap(struct address_space *mapping, sector_t block)
 	return generic_block_bmap(mapping, block, bfs_get_block);
 }
 
-struct address_space_operations bfs_aops = {
+const struct address_space_operations bfs_aops = {
 	.readpage	= bfs_readpage,
 	.writepage	= bfs_writepage,
 	.sync_page	= block_sync_page,
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 028d9fb9c2d..7f7600e2381 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1095,7 +1095,7 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	return blkdev_ioctl(file->f_mapping->host, file, cmd, arg);
 }
 
-struct address_space_operations def_blk_aops = {
+const struct address_space_operations def_blk_aops = {
 	.readpage	= blkdev_readpage,
 	.writepage	= blkdev_writepage,
 	.sync_page	= block_sync_page,
diff --git a/fs/buffer.c b/fs/buffer.c
index f23bb647db4..e9994722f4a 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2598,7 +2598,7 @@ int nobh_truncate_page(struct address_space *mapping, loff_t from)
 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
 	unsigned to;
 	struct page *page;
-	struct address_space_operations *a_ops = mapping->a_ops;
+	const struct address_space_operations *a_ops = mapping->a_ops;
 	char *kaddr;
 	int ret = 0;
 
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index a6384d83fde..8f75c6f2470 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -32,8 +32,8 @@
 #define TRUE 1
 #endif
 
-extern struct address_space_operations cifs_addr_ops;
-extern struct address_space_operations cifs_addr_ops_smallbuf;
+extern const struct address_space_operations cifs_addr_ops;
+extern const struct address_space_operations cifs_addr_ops_smallbuf;
 
 /* Functions related to super block operations */
 extern struct super_operations cifs_super_ops;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e9c1573f6aa..5861eb42e62 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1942,7 +1942,7 @@ static int cifs_prepare_write(struct file *file, struct page *page,
 	return 0;
 }
 
-struct address_space_operations cifs_addr_ops = {
+const struct address_space_operations cifs_addr_ops = {
 	.readpage = cifs_readpage,
 	.readpages = cifs_readpages,
 	.writepage = cifs_writepage,
@@ -1959,7 +1959,7 @@ struct address_space_operations cifs_addr_ops = {
  * contain the header plus one complete page of data.  Otherwise, we need
  * to leave cifs_readpages out of the address space operations.
  */
-struct address_space_operations cifs_addr_ops_smallbuf = {
+const struct address_space_operations cifs_addr_ops_smallbuf = {
 	.readpage = cifs_readpage,
 	.writepage = cifs_writepage,
 	.writepages = cifs_writepages,
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index b35e5bbd9c9..76e00a65a75 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -50,6 +50,6 @@ fail:
 	return error;
 }
 
-struct address_space_operations coda_symlink_aops = {
+const struct address_space_operations coda_symlink_aops = {
 	.readpage	= coda_symlink_filler,
 };
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index c153bd9534c..e14488ca641 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -38,7 +38,7 @@
 
 extern struct super_block * configfs_sb;
 
-static struct address_space_operations configfs_aops = {
+static const struct address_space_operations configfs_aops = {
 	.readpage	= simple_readpage,
 	.prepare_write	= simple_prepare_write,
 	.commit_write	= simple_commit_write
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index c45d7386080..223c0431042 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -30,7 +30,7 @@
 static struct super_operations cramfs_ops;
 static struct inode_operations cramfs_dir_inode_operations;
 static const struct file_operations cramfs_directory_operations;
-static struct address_space_operations cramfs_aops;
+static const struct address_space_operations cramfs_aops;
 
 static DEFINE_MUTEX(read_mutex);
 
@@ -501,7 +501,7 @@ static int cramfs_readpage(struct file *file, struct page * page)
 	return 0;
 }
 
-static struct address_space_operations cramfs_aops = {
+static const struct address_space_operations cramfs_aops = {
 	.readpage = cramfs_readpage
 };
 
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index 180607f9314..174696f9bf1 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -21,7 +21,7 @@ static sector_t _efs_bmap(struct address_space *mapping, sector_t block)
 {
 	return generic_block_bmap(mapping,block,efs_get_block);
 }
-static struct address_space_operations efs_aops = {
+static const struct address_space_operations efs_aops = {
 	.readpage = efs_readpage,
 	.sync_page = block_sync_page,
 	.bmap = _efs_bmap
diff --git a/fs/efs/symlink.c b/fs/efs/symlink.c
index 3d9a350e3e7..e249cf733a6 100644
--- a/fs/efs/symlink.c
+++ b/fs/efs/symlink.c
@@ -53,6 +53,6 @@ fail:
 	return err;
 }
 
-struct address_space_operations efs_symlink_aops = {
+const struct address_space_operations efs_symlink_aops = {
 	.readpage	= efs_symlink_readpage
 };
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 9f74a62be55..e65a019fc7a 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -162,9 +162,9 @@ extern const struct file_operations ext2_file_operations;
 extern const struct file_operations ext2_xip_file_operations;
 
 /* inode.c */
-extern struct address_space_operations ext2_aops;
-extern struct address_space_operations ext2_aops_xip;
-extern struct address_space_operations ext2_nobh_aops;
+extern const struct address_space_operations ext2_aops;
+extern const struct address_space_operations ext2_aops_xip;
+extern const struct address_space_operations ext2_nobh_aops;
 
 /* namei.c */
 extern struct inode_operations ext2_dir_inode_operations;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 04af9c45dce..fb4d3220eb8 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -684,7 +684,7 @@ ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
 	return mpage_writepages(mapping, wbc, ext2_get_block);
 }
 
-struct address_space_operations ext2_aops = {
+const struct address_space_operations ext2_aops = {
 	.readpage		= ext2_readpage,
 	.readpages		= ext2_readpages,
 	.writepage		= ext2_writepage,
@@ -697,12 +697,12 @@ struct address_space_operations ext2_aops = {
 	.migratepage		= buffer_migrate_page,
 };
 
-struct address_space_operations ext2_aops_xip = {
+const struct address_space_operations ext2_aops_xip = {
 	.bmap			= ext2_bmap,
 	.get_xip_page		= ext2_get_xip_page,
 };
 
-struct address_space_operations ext2_nobh_aops = {
+const struct address_space_operations ext2_nobh_aops = {
 	.readpage		= ext2_readpage,
 	.readpages		= ext2_readpages,
 	.writepage		= ext2_nobh_writepage,
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 0321e1b9034..f804d5e9d60 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1698,7 +1698,7 @@ static int ext3_journalled_set_page_dirty(struct page *page)
 	return __set_page_dirty_nobuffers(page);
 }
 
-static struct address_space_operations ext3_ordered_aops = {
+static const struct address_space_operations ext3_ordered_aops = {
 	.readpage	= ext3_readpage,
 	.readpages	= ext3_readpages,
 	.writepage	= ext3_ordered_writepage,
@@ -1712,7 +1712,7 @@ static struct address_space_operations ext3_ordered_aops = {
 	.migratepage	= buffer_migrate_page,
 };
 
-static struct address_space_operations ext3_writeback_aops = {
+static const struct address_space_operations ext3_writeback_aops = {
 	.readpage	= ext3_readpage,
 	.readpages	= ext3_readpages,
 	.writepage	= ext3_writeback_writepage,
@@ -1726,7 +1726,7 @@ static struct address_space_operations ext3_writeback_aops = {
 	.migratepage	= buffer_migrate_page,
 };
 
-static struct address_space_operations ext3_journalled_aops = {
+static const struct address_space_operations ext3_journalled_aops = {
 	.readpage	= ext3_readpage,
 	.readpages	= ext3_readpages,
 	.writepage	= ext3_journalled_writepage,
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 7c35d582ec1..31b7174176b 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -196,7 +196,7 @@ static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
 	return generic_block_bmap(mapping, block, fat_get_block);
 }
 
-static struct address_space_operations fat_aops = {
+static const struct address_space_operations fat_aops = {
 	.readpage	= fat_readpage,
 	.readpages	= fat_readpages,
 	.writepage	= fat_writepage,
diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c
index 6f5df1700e9..4e25f3fbed8 100644
--- a/fs/freevxfs/vxfs_immed.c
+++ b/fs/freevxfs/vxfs_immed.c
@@ -56,7 +56,7 @@ struct inode_operations vxfs_immed_symlink_iops = {
 /*
  * Adress space operations for immed files and directories.
  */
-struct address_space_operations vxfs_immed_aops = {
+const struct address_space_operations vxfs_immed_aops = {
 	.readpage =		vxfs_immed_readpage,
 };
 
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index f544aae9169..ca6a3971477 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -41,8 +41,8 @@
 #include "vxfs_extern.h"
 
 
-extern struct address_space_operations vxfs_aops;
-extern struct address_space_operations vxfs_immed_aops;
+extern const struct address_space_operations vxfs_aops;
+extern const struct address_space_operations vxfs_immed_aops;
 
 extern struct inode_operations vxfs_immed_symlink_iops;
 
@@ -295,7 +295,7 @@ vxfs_read_inode(struct inode *ip)
 {
 	struct super_block		*sbp = ip->i_sb;
 	struct vxfs_inode_info		*vip;
-	struct address_space_operations	*aops;
+	const struct address_space_operations	*aops;
 	ino_t				ino = ip->i_ino;
 
 	if (!(vip = __vxfs_iget(ino, VXFS_SBI(sbp)->vsi_ilist)))
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index c1be118fc06..decac62efe5 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -42,7 +42,7 @@
 static int		vxfs_readpage(struct file *, struct page *);
 static sector_t		vxfs_bmap(struct address_space *, sector_t);
 
-struct address_space_operations vxfs_aops = {
+const struct address_space_operations vxfs_aops = {
 	.readpage =		vxfs_readpage,
 	.bmap =			vxfs_bmap,
 	.sync_page =		block_sync_page,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 28aa81eae2c..63614ed1633 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -770,7 +770,7 @@ static const struct file_operations fuse_direct_io_file_operations = {
 	/* no mmap and sendfile */
 };
 
-static struct address_space_operations fuse_file_aops  = {
+static const struct address_space_operations fuse_file_aops  = {
 	.readpage	= fuse_readpage,
 	.prepare_write	= fuse_prepare_write,
 	.commit_write	= fuse_commit_write,
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 3ed8663a8db..735332dfd1b 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -182,8 +182,8 @@ extern void hfs_file_truncate(struct inode *);
 extern int hfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
 
 /* inode.c */
-extern struct address_space_operations hfs_aops;
-extern struct address_space_operations hfs_btree_aops;
+extern const struct address_space_operations hfs_aops;
+extern const struct address_space_operations hfs_btree_aops;
 
 extern struct inode *hfs_new_inode(struct inode *, struct qstr *, int);
 extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 2d4ced22201..315cf44a90b 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -114,7 +114,7 @@ static int hfs_writepages(struct address_space *mapping,
 	return mpage_writepages(mapping, wbc, hfs_get_block);
 }
 
-struct address_space_operations hfs_btree_aops = {
+const struct address_space_operations hfs_btree_aops = {
 	.readpage	= hfs_readpage,
 	.writepage	= hfs_writepage,
 	.sync_page	= block_sync_page,
@@ -124,7 +124,7 @@ struct address_space_operations hfs_btree_aops = {
 	.releasepage	= hfs_releasepage,
 };
 
-struct address_space_operations hfs_aops = {
+const struct address_space_operations hfs_aops = {
 	.readpage	= hfs_readpage,
 	.writepage	= hfs_writepage,
 	.sync_page	= block_sync_page,
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 7ae393637a0..8a1ca5ef7ad 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -323,8 +323,8 @@ int hfsplus_file_extend(struct inode *);
 void hfsplus_file_truncate(struct inode *);
 
 /* inode.c */
-extern struct address_space_operations hfsplus_aops;
-extern struct address_space_operations hfsplus_btree_aops;
+extern const struct address_space_operations hfsplus_aops;
+extern const struct address_space_operations hfsplus_btree_aops;
 
 void hfsplus_inode_read_fork(struct inode *, struct hfsplus_fork_raw *);
 void hfsplus_inode_write_fork(struct inode *, struct hfsplus_fork_raw *);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index acf66dba3e0..924ecdef809 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -109,7 +109,7 @@ static int hfsplus_writepages(struct address_space *mapping,
 	return mpage_writepages(mapping, wbc, hfsplus_get_block);
 }
 
-struct address_space_operations hfsplus_btree_aops = {
+const struct address_space_operations hfsplus_btree_aops = {
 	.readpage	= hfsplus_readpage,
 	.writepage	= hfsplus_writepage,
 	.sync_page	= block_sync_page,
@@ -119,7 +119,7 @@ struct address_space_operations hfsplus_btree_aops = {
 	.releasepage	= hfsplus_releasepage,
 };
 
-struct address_space_operations hfsplus_aops = {
+const struct address_space_operations hfsplus_aops = {
 	.readpage	= hfsplus_readpage,
 	.writepage	= hfsplus_writepage,
 	.sync_page	= block_sync_page,
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 8e0d37743e7..b82e3d9c879 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -54,7 +54,7 @@ static int append = 0;
 
 static struct inode_operations hostfs_iops;
 static struct inode_operations hostfs_dir_iops;
-static struct address_space_operations hostfs_link_aops;
+static const struct address_space_operations hostfs_link_aops;
 
 #ifndef MODULE
 static int __init hostfs_args(char *options, int *add)
@@ -518,7 +518,7 @@ int hostfs_commit_write(struct file *file, struct page *page, unsigned from,
 	return(err);
 }
 
-static struct address_space_operations hostfs_aops = {
+static const struct address_space_operations hostfs_aops = {
 	.writepage 	= hostfs_writepage,
 	.readpage	= hostfs_readpage,
 	.set_page_dirty = __set_page_dirty_nobuffers,
@@ -935,7 +935,7 @@ int hostfs_link_readpage(struct file *file, struct page *page)
 	return(err);
 }
 
-static struct address_space_operations hostfs_link_aops = {
+static const struct address_space_operations hostfs_link_aops = {
 	.readpage	= hostfs_link_readpage,
 };
 
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index d3b9fffe45a..d9eb19b7b8a 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -99,7 +99,7 @@ static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block)
 {
 	return generic_block_bmap(mapping,block,hpfs_get_block);
 }
-struct address_space_operations hpfs_aops = {
+const struct address_space_operations hpfs_aops = {
 	.readpage = hpfs_readpage,
 	.writepage = hpfs_writepage,
 	.sync_page = block_sync_page,
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 29b7a3e5517..f687d54ed44 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -268,7 +268,7 @@ void hpfs_set_ea(struct inode *, struct fnode *, char *, char *, int);
 int hpfs_file_fsync(struct file *, struct dentry *, int);
 extern const struct file_operations hpfs_file_ops;
 extern struct inode_operations hpfs_file_iops;
-extern struct address_space_operations hpfs_aops;
+extern const struct address_space_operations hpfs_aops;
 
 /* inode.c */
 
@@ -304,7 +304,7 @@ void hpfs_decide_conv(struct inode *, unsigned char *, unsigned);
 /* namei.c */
 
 extern struct inode_operations hpfs_dir_iops;
-extern struct address_space_operations hpfs_symlink_aops;
+extern const struct address_space_operations hpfs_symlink_aops;
 
 static inline struct hpfs_inode_info *hpfs_i(struct inode *inode)
 {
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index a03abb12c61..59e7dc182a0 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -538,7 +538,7 @@ fail:
 	return err;
 }
 
-struct address_space_operations hpfs_symlink_aops = {
+const struct address_space_operations hpfs_symlink_aops = {
 	.readpage	= hpfs_symlink_readpage
 };
 	
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index e6410d8edd0..6449cb69796 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -34,7 +34,7 @@
 #define HUGETLBFS_MAGIC	0x958458f6
 
 static struct super_operations hugetlbfs_ops;
-static struct address_space_operations hugetlbfs_aops;
+static const struct address_space_operations hugetlbfs_aops;
 const struct file_operations hugetlbfs_file_operations;
 static struct inode_operations hugetlbfs_dir_inode_operations;
 static struct inode_operations hugetlbfs_inode_operations;
@@ -547,7 +547,7 @@ static void hugetlbfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
 }
 
-static struct address_space_operations hugetlbfs_aops = {
+static const struct address_space_operations hugetlbfs_aops = {
 	.readpage	= hugetlbfs_readpage,
 	.prepare_write	= hugetlbfs_prepare_write,
 	.commit_write	= hugetlbfs_commit_write,
diff --git a/fs/inode.c b/fs/inode.c
index 3a2446a27d2..f42961eb983 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -102,7 +102,7 @@ static kmem_cache_t * inode_cachep __read_mostly;
 
 static struct inode *alloc_inode(struct super_block *sb)
 {
-	static struct address_space_operations empty_aops;
+	static const struct address_space_operations empty_aops;
 	static struct inode_operations empty_iops;
 	static const struct file_operations empty_fops;
 	struct inode *inode;
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index 4917315db73..3a39158cca9 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -312,7 +312,7 @@ eio:
 	return err;
 }
 
-struct address_space_operations zisofs_aops = {
+const struct address_space_operations zisofs_aops = {
 	.readpage = zisofs_readpage,
 	/* No sync_page operation supported? */
 	/* No bmap operation supported */
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 3f9c8ba1fa1..bb11c7fb401 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1054,7 +1054,7 @@ static sector_t _isofs_bmap(struct address_space *mapping, sector_t block)
 	return generic_block_bmap(mapping,block,isofs_get_block);
 }
 
-static struct address_space_operations isofs_aops = {
+static const struct address_space_operations isofs_aops = {
 	.readpage = isofs_readpage,
 	.sync_page = block_sync_page,
 	.bmap = _isofs_bmap
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index b87ba066f5e..e6308c8b573 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -176,5 +176,5 @@ isofs_normalize_block_and_offset(struct iso_directory_record* de,
 
 extern struct inode_operations isofs_dir_inode_operations;
 extern const struct file_operations isofs_dir_operations;
-extern struct address_space_operations isofs_symlink_aops;
+extern const struct address_space_operations isofs_symlink_aops;
 extern struct export_operations isofs_export_ops;
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 4326cb47f8f..f3a1db3098d 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -754,6 +754,6 @@ error:
 	return -EIO;
 }
 
-struct address_space_operations isofs_symlink_aops = {
+const struct address_space_operations isofs_symlink_aops = {
 	.readpage = rock_ridge_symlink_readpage
 };
diff --git a/fs/isofs/zisofs.h b/fs/isofs/zisofs.h
index d78485d101c..27379570915 100644
--- a/fs/isofs/zisofs.h
+++ b/fs/isofs/zisofs.h
@@ -15,7 +15,7 @@
  */
 
 #ifdef CONFIG_ZISOFS
-extern struct address_space_operations zisofs_aops;
+extern const struct address_space_operations zisofs_aops;
 extern int __init zisofs_init(void);
 extern void zisofs_cleanup(void);
 #endif
diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c
index 9e46ea6da75..93068697a9b 100644
--- a/fs/jffs/inode-v23.c
+++ b/fs/jffs/inode-v23.c
@@ -59,7 +59,7 @@ static const struct file_operations jffs_file_operations;
 static struct inode_operations jffs_file_inode_operations;
 static const struct file_operations jffs_dir_operations;
 static struct inode_operations jffs_dir_inode_operations;
-static struct address_space_operations jffs_address_operations;
+static const struct address_space_operations jffs_address_operations;
 
 kmem_cache_t     *node_cache = NULL;
 kmem_cache_t     *fm_cache = NULL;
@@ -1614,7 +1614,7 @@ jffs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 } /* jffs_ioctl()  */
 
 
-static struct address_space_operations jffs_address_operations = {
+static const struct address_space_operations jffs_address_operations = {
 	.readpage	= jffs_readpage,
 	.prepare_write	= jffs_prepare_write,
 	.commit_write	= jffs_commit_write,
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index bb8844f40e4..3ed6e3e120b 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -62,7 +62,7 @@ struct inode_operations jffs2_file_inode_operations =
 	.removexattr =	jffs2_removexattr
 };
 
-struct address_space_operations jffs2_file_address_operations =
+const struct address_space_operations jffs2_file_address_operations =
 {
 	.readpage =	jffs2_readpage,
 	.prepare_write =jffs2_prepare_write,
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 6b522356540..9f41fc01a37 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -158,7 +158,7 @@ extern struct inode_operations jffs2_dir_inode_operations;
 /* file.c */
 extern const struct file_operations jffs2_file_operations;
 extern struct inode_operations jffs2_file_inode_operations;
-extern struct address_space_operations jffs2_file_address_operations;
+extern const struct address_space_operations jffs2_file_address_operations;
 int jffs2_fsync(struct file *, struct dentry *, int);
 int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg);
 
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 04eb78f1252..43e3f566aad 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -305,7 +305,7 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
 				offset, nr_segs, jfs_get_block, NULL);
 }
 
-struct address_space_operations jfs_aops = {
+const struct address_space_operations jfs_aops = {
 	.readpage	= jfs_readpage,
 	.readpages	= jfs_readpages,
 	.writepage	= jfs_writepage,
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index c3007267446..b5c7da6190d 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -33,7 +33,7 @@ extern void jfs_free_zero_link(struct inode *);
 extern struct dentry *jfs_get_parent(struct dentry *dentry);
 extern void jfs_set_inode_flags(struct inode *);
 
-extern struct address_space_operations jfs_aops;
+extern const struct address_space_operations jfs_aops;
 extern struct inode_operations jfs_dir_inode_operations;
 extern const struct file_operations jfs_dir_operations;
 extern struct inode_operations jfs_file_inode_operations;
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 7f6e8803970..e1e0a6e6ebd 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -577,7 +577,7 @@ static void metapage_invalidatepage(struct page *page, unsigned long offset)
 	metapage_releasepage(page, 0);
 }
 
-struct address_space_operations jfs_metapage_aops = {
+const struct address_space_operations jfs_metapage_aops = {
 	.readpage	= metapage_readpage,
 	.writepage	= metapage_writepage,
 	.sync_page	= block_sync_page,
diff --git a/fs/jfs/jfs_metapage.h b/fs/jfs/jfs_metapage.h
index f0b7d3282b0..d17a3290f5a 100644
--- a/fs/jfs/jfs_metapage.h
+++ b/fs/jfs/jfs_metapage.h
@@ -139,7 +139,7 @@ static inline void metapage_homeok(struct metapage *mp)
 	put_metapage(mp);
 }
 
-extern struct address_space_operations jfs_metapage_aops;
+extern const struct address_space_operations jfs_metapage_aops;
 
 /*
  * This routines invalidate all pages for an extent.
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index a6fb509b734..9ea91c5eeb7 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -335,7 +335,7 @@ static sector_t minix_bmap(struct address_space *mapping, sector_t block)
 {
 	return generic_block_bmap(mapping,block,minix_get_block);
 }
-static struct address_space_operations minix_aops = {
+static const struct address_space_operations minix_aops = {
 	.readpage = minix_readpage,
 	.writepage = minix_writepage,
 	.sync_page = block_sync_page,
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 90d2ea28f33..6c51c119846 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -105,7 +105,7 @@ static struct super_operations ncp_sops =
 
 extern struct dentry_operations ncp_root_dentry_operations;
 #if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS)
-extern struct address_space_operations ncp_symlink_aops;
+extern const struct address_space_operations ncp_symlink_aops;
 extern int ncp_symlink(struct inode*, struct dentry*, const char*);
 #endif
 
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index e935f1b34bc..f76b1392a01 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -99,7 +99,7 @@ fail:
 /*
  * symlinks can't do much...
  */
-struct address_space_operations ncp_symlink_aops = {
+const struct address_space_operations ncp_symlink_aops = {
 	.readpage	= ncp_symlink_readpage,
 };
 	
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index add28913883..cc2b874ad5a 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -315,7 +315,7 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
 	return !nfs_wb_page(page->mapping->host, page);
 }
 
-struct address_space_operations nfs_file_aops = {
+const struct address_space_operations nfs_file_aops = {
 	.readpage = nfs_readpage,
 	.readpages = nfs_readpages,
 	.set_page_dirty = __set_page_dirty_nobuffers,
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 580412d330c..bc579bfdfbd 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1544,7 +1544,7 @@ err_out:
 /**
  * ntfs_aops - general address space operations for inodes and attributes
  */
-struct address_space_operations ntfs_aops = {
+const struct address_space_operations ntfs_aops = {
 	.readpage	= ntfs_readpage,	/* Fill page with data. */
 	.sync_page	= block_sync_page,	/* Currently, just unplugs the
 						   disk request queue. */
@@ -1560,7 +1560,7 @@ struct address_space_operations ntfs_aops = {
  * ntfs_mst_aops - general address space operations for mst protecteed inodes
  *		   and attributes
  */
-struct address_space_operations ntfs_mst_aops = {
+const struct address_space_operations ntfs_mst_aops = {
 	.readpage	= ntfs_readpage,	/* Fill page with data. */
 	.sync_page	= block_sync_page,	/* Currently, just unplugs the
 						   disk request queue. */
diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h
index bf7b3d7c093..ddd3d503097 100644
--- a/fs/ntfs/ntfs.h
+++ b/fs/ntfs/ntfs.h
@@ -57,8 +57,8 @@ extern struct kmem_cache *ntfs_attr_ctx_cache;
 extern struct kmem_cache *ntfs_index_ctx_cache;
 
 /* The various operations structs defined throughout the driver files. */
-extern struct address_space_operations ntfs_aops;
-extern struct address_space_operations ntfs_mst_aops;
+extern const struct address_space_operations ntfs_aops;
+extern const struct address_space_operations ntfs_mst_aops;
 
 extern const struct  file_operations ntfs_file_ops;
 extern struct inode_operations ntfs_file_inode_ops;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 47152bf9a7f..cca71317b6d 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -666,7 +666,7 @@ out:
 	return ret;
 }
 
-struct address_space_operations ocfs2_aops = {
+const struct address_space_operations ocfs2_aops = {
 	.readpage	= ocfs2_readpage,
 	.writepage	= ocfs2_writepage,
 	.prepare_write	= ocfs2_prepare_write,
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 84c50796128..35140f6cf84 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -114,7 +114,7 @@ static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
 
 extern kmem_cache_t *ocfs2_inode_cache;
 
-extern struct address_space_operations ocfs2_aops;
+extern const struct address_space_operations ocfs2_aops;
 
 struct buffer_head *ocfs2_bread(struct inode *inode, int block,
 				int *err, int reada);
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 2f24c46f72a..8bc182a8874 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -450,7 +450,7 @@ static sector_t qnx4_bmap(struct address_space *mapping, sector_t block)
 {
 	return generic_block_bmap(mapping,block,qnx4_get_block);
 }
-static struct address_space_operations qnx4_aops = {
+static const struct address_space_operations qnx4_aops = {
 	.readpage	= qnx4_readpage,
 	.writepage	= qnx4_writepage,
 	.sync_page	= block_sync_page,
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 00a933eb820..86f14cacf64 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -26,7 +26,7 @@
 
 #include <linux/fs.h>
 
-struct address_space_operations ramfs_aops = {
+const struct address_space_operations ramfs_aops = {
 	.readpage	= simple_readpage,
 	.prepare_write	= simple_prepare_write,
 	.commit_write	= simple_commit_write
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index f443a84b98a..99fffc9e1bf 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -27,7 +27,7 @@
 
 static int ramfs_nommu_setattr(struct dentry *, struct iattr *);
 
-struct address_space_operations ramfs_aops = {
+const struct address_space_operations ramfs_aops = {
 	.readpage		= simple_readpage,
 	.prepare_write		= simple_prepare_write,
 	.commit_write		= simple_commit_write
diff --git a/fs/ramfs/internal.h b/fs/ramfs/internal.h
index 313237631b4..c2bb58e7465 100644
--- a/fs/ramfs/internal.h
+++ b/fs/ramfs/internal.h
@@ -10,6 +10,6 @@
  */
 
 
-extern struct address_space_operations ramfs_aops;
+extern const struct address_space_operations ramfs_aops;
 extern const struct file_operations ramfs_file_operations;
 extern struct inode_operations ramfs_file_inode_operations;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 9857e50f85e..a24858a632f 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2996,7 +2996,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
 	return error;
 }
 
-struct address_space_operations reiserfs_address_space_operations = {
+const struct address_space_operations reiserfs_address_space_operations = {
 	.writepage = reiserfs_writepage,
 	.readpage = reiserfs_readpage,
 	.readpages = reiserfs_readpages,
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index 283fbc6b8ee..22eed61ebf6 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -459,7 +459,7 @@ err_out:
 
 /* Mapping from our types to the kernel */
 
-static struct address_space_operations romfs_aops = {
+static const struct address_space_operations romfs_aops = {
 	.readpage = romfs_readpage
 };
 
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index ed9a24d19d7..dae67048bab 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -306,7 +306,7 @@ static int smb_commit_write(struct file *file, struct page *page,
 	return status;
 }
 
-struct address_space_operations smb_file_aops = {
+const struct address_space_operations smb_file_aops = {
 	.readpage = smb_readpage,
 	.writepage = smb_writepage,
 	.prepare_write = smb_prepare_write,
diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h
index 972ed7dad38..34fb462b237 100644
--- a/fs/smbfs/proto.h
+++ b/fs/smbfs/proto.h
@@ -63,7 +63,7 @@ extern int smb_revalidate_inode(struct dentry *dentry);
 extern int smb_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
 extern int smb_notify_change(struct dentry *dentry, struct iattr *attr);
 /* file.c */
-extern struct address_space_operations smb_file_aops;
+extern const struct address_space_operations smb_file_aops;
 extern const struct file_operations smb_file_operations;
 extern struct inode_operations smb_file_inode_operations;
 /* ioctl.c */
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index f0b347bd12c..5e0e31cc46f 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -16,7 +16,7 @@
 
 extern struct super_block * sysfs_sb;
 
-static struct address_space_operations sysfs_aops = {
+static const struct address_space_operations sysfs_aops = {
 	.readpage	= simple_readpage,
 	.prepare_write	= simple_prepare_write,
 	.commit_write	= simple_commit_write
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 86f5f8d43d0..f2bcccd1d6f 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -465,7 +465,7 @@ static sector_t sysv_bmap(struct address_space *mapping, sector_t block)
 {
 	return generic_block_bmap(mapping,block,get_block);
 }
-struct address_space_operations sysv_aops = {
+const struct address_space_operations sysv_aops = {
 	.readpage = sysv_readpage,
 	.writepage = sysv_writepage,
 	.sync_page = block_sync_page,
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 393a480e4de..9dcc8212093 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -161,7 +161,7 @@ extern struct inode_operations sysv_dir_inode_operations;
 extern struct inode_operations sysv_fast_symlink_inode_operations;
 extern const struct file_operations sysv_file_operations;
 extern const struct file_operations sysv_dir_operations;
-extern struct address_space_operations sysv_aops;
+extern const struct address_space_operations sysv_aops;
 extern struct super_operations sysv_sops;
 extern struct dentry_operations sysv_dentry_operations;
 
diff --git a/fs/udf/file.c b/fs/udf/file.c
index e34b00e303f..a59e5f33daf 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -95,7 +95,7 @@ static int udf_adinicb_commit_write(struct file *file, struct page *page, unsign
 	return 0;
 }
 
-struct address_space_operations udf_adinicb_aops = {
+const struct address_space_operations udf_adinicb_aops = {
 	.readpage		= udf_adinicb_readpage,
 	.writepage		= udf_adinicb_writepage,
 	.sync_page		= block_sync_page,
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 2983afd5e7f..605f5111b6d 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -132,7 +132,7 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block)
 	return generic_block_bmap(mapping,block,udf_get_block);
 }
 
-struct address_space_operations udf_aops = {
+const struct address_space_operations udf_aops = {
 	.readpage		= udf_readpage,
 	.writepage		= udf_writepage,
 	.sync_page		= block_sync_page,
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 674bb40edc8..ba068a78656 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -113,6 +113,6 @@ out:
 /*
  * symlinks can't do much...
  */
-struct address_space_operations udf_symlink_aops = {
+const struct address_space_operations udf_symlink_aops = {
 	.readpage		= udf_symlink_filler,
 };
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 023e19ba5a2..2f992387cc9 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -47,9 +47,9 @@ extern struct inode_operations udf_dir_inode_operations;
 extern const struct file_operations udf_dir_operations;
 extern struct inode_operations udf_file_inode_operations;
 extern const struct file_operations udf_file_operations;
-extern struct address_space_operations udf_aops;
-extern struct address_space_operations udf_adinicb_aops;
-extern struct address_space_operations udf_symlink_aops;
+extern const struct address_space_operations udf_aops;
+extern const struct address_space_operations udf_adinicb_aops;
+extern const struct address_space_operations udf_symlink_aops;
 
 struct udf_fileident_bh
 {
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 259bd196099..8e1f90e4204 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -574,7 +574,7 @@ static sector_t ufs_bmap(struct address_space *mapping, sector_t block)
 {
 	return generic_block_bmap(mapping,block,ufs_getfrag_block);
 }
-struct address_space_operations ufs_aops = {
+const struct address_space_operations ufs_aops = {
 	.readpage = ufs_readpage,
 	.writepage = ufs_writepage,
 	.sync_page = block_sync_page,
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 3e807b828e2..c40f81ba9b1 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1454,7 +1454,7 @@ xfs_vm_invalidatepage(
 	block_invalidatepage(page, offset);
 }
 
-struct address_space_operations xfs_address_space_operations = {
+const struct address_space_operations xfs_address_space_operations = {
 	.readpage		= xfs_vm_readpage,
 	.readpages		= xfs_vm_readpages,
 	.writepage		= xfs_vm_writepage,
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 706d8c781b8..2244e516b66 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -40,7 +40,7 @@ typedef struct xfs_ioend {
 	struct work_struct	io_work;	/* xfsdatad work queue */
 } xfs_ioend_t;
 
-extern struct address_space_operations xfs_address_space_operations;
+extern const struct address_space_operations xfs_address_space_operations;
 extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
 
 #endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 26fed0756f0..2af528dcfb0 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1520,7 +1520,7 @@ xfs_mapping_buftarg(
 	struct backing_dev_info	*bdi;
 	struct inode		*inode;
 	struct address_space	*mapping;
-	static struct address_space_operations mapping_aops = {
+	static const struct address_space_operations mapping_aops = {
 		.sync_page = block_sync_page,
 		.migratepage = fail_migrate_page,
 	};
-- 
cgit v1.2.3


From 94374e7cc369b972855cebd13ba942f4eb1be1ac Mon Sep 17 00:00:00 2001
From: Latchesar Ionkov <lucho@ionkov.net>
Date: Wed, 28 Jun 2006 04:26:50 -0700
Subject: [PATCH] v9fs: return the correct error when interrupted by signal

If a signal interrupts the user process, v9fs sends a flush request to the
file server and waits for its response.  It error code is incorrectly set
to the error code of the flush message instead of ERESTARTSYS.  The patch
sets the error code to the correct value.

Signed-off-by: Latchesar Ionkov <lucho@ionkov.net>
Cc: Eric Van Hensbergen <ericvh@ericvh.myip.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/mux.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/9p/mux.c b/fs/9p/mux.c
index 12e1baa4508..8d45ed66883 100644
--- a/fs/9p/mux.c
+++ b/fs/9p/mux.c
@@ -932,6 +932,8 @@ v9fs_mux_rpc(struct v9fs_mux_data *m, struct v9fs_fcall *tc,
 					r.rcall || r.err);
 			} while (!r.rcall && !r.err && err==-ERESTARTSYS &&
 				m->trans->status==Connected && !m->err);
+
+			err = -ERESTARTSYS;
 		}
 		sigpending = 1;
 	}
-- 
cgit v1.2.3


From 9d7fa40098253a6768cfc3ffbbd5988ba852d364 Mon Sep 17 00:00:00 2001
From: Latchesar Ionkov <lucho@ionkov.net>
Date: Wed, 28 Jun 2006 04:26:51 -0700
Subject: [PATCH] v9fs: fix fid check in v9fs_create

Fix an incorrect check whether a fid was allocated in v9fs_create and if it
should be freed on error.

Signed-off-by: Latchesar Ionkov <lucho@ionkov.net>
Cc: Eric Van Hensbergen <ericvh@ericvh.myip.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/9p/vfs_inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 5c6bdf82146..2f580a197b8 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -300,7 +300,7 @@ clunk_fid:
 	fid = V9FS_NOFID;
 
 put_fid:
-	if (fid >= 0)
+	if (fid != V9FS_NOFID)
 		v9fs_put_idpool(fid, &v9ses->fidpool);
 
 	kfree(fcall);
-- 
cgit v1.2.3


From 7256d819e43f89af6ba30047936c96c683436941 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Thu, 29 Jun 2006 02:24:29 -0700
Subject: [PATCH] ufs: printk() fix

fs/ufs/inode.c: In function `ufs_frag_map':
fs/ufs/inode.c:101: warning: long long unsigned int format, u64 arg (arg 4)
fs/ufs/inode.c: In function `ufs_getfrag_block':
fs/ufs/inode.c:432: warning: long long unsigned int format, u64 arg (arg 2)

Cc: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ufs/inode.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 8e1f90e4204..488b5ff48af 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -98,7 +98,9 @@ static u64 ufs_frag_map(struct inode *inode, sector_t frag)
 	u64 temp = 0L;
 
 	UFSD(": frag = %llu  depth = %d\n", (unsigned long long)frag, depth);
-	UFSD(": uspi->s_fpbshift = %d ,uspi->s_apbmask = %x, mask=%llx\n",uspi->s_fpbshift,uspi->s_apbmask,mask);
+	UFSD(": uspi->s_fpbshift = %d ,uspi->s_apbmask = %x, mask=%llx\n",
+		uspi->s_fpbshift, uspi->s_apbmask,
+		(unsigned long long)mask);
 
 	if (depth == 0)
 		return 0;
@@ -429,7 +431,7 @@ int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head
 	
 	if (!create) {
 		phys64 = ufs_frag_map(inode, fragment);
-		UFSD("phys64 = %llu \n",phys64);
+		UFSD("phys64 = %llu\n", (unsigned long long)phys64);
 		if (phys64)
 			map_bh(bh_result, sb, phys64);
 		return 0;
-- 
cgit v1.2.3


From e7515d065d09f6450c996a8fa206ad66569e183c Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 10 Mar 2006 11:42:30 -0800
Subject: configfs: Clear up a few extra spaces where there should be TABs.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/configfs/dir.c     | 6 +++---
 fs/configfs/symlink.c | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 207f8006fd6..df025453dd9 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -211,7 +211,7 @@ static void remove_dir(struct dentry * d)
 	struct configfs_dirent * sd;
 
 	sd = d->d_fsdata;
- 	list_del_init(&sd->s_sibling);
+	list_del_init(&sd->s_sibling);
 	configfs_put(sd);
 	if (d->d_inode)
 		simple_rmdir(parent->d_inode,d);
@@ -330,7 +330,7 @@ static int configfs_detach_prep(struct dentry *dentry)
 
 			ret = configfs_detach_prep(sd->s_dentry);
 			if (!ret)
-			       	continue;
+				continue;
 		} else
 			ret = -ENOTEMPTY;
 
@@ -931,7 +931,7 @@ int configfs_rename_dir(struct config_item * item, const char *new_name)
 
 	new_dentry = lookup_one_len(new_name, parent, strlen(new_name));
 	if (!IS_ERR(new_dentry)) {
-  		if (!new_dentry->d_inode) {
+		if (!new_dentry->d_inode) {
 			error = config_item_set_name(item, "%s", new_name);
 			if (!error) {
 				d_add(new_dentry, NULL);
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index e5512e295cf..fb65e0800a8 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -66,7 +66,7 @@ static void fill_item_path(struct config_item * item, char * buffer, int length)
 }
 
 static int create_link(struct config_item *parent_item,
- 		       struct config_item *item,
+		       struct config_item *item,
 		       struct dentry *dentry)
 {
 	struct configfs_dirent *target_sd = item->ci_dentry->d_fsdata;
-- 
cgit v1.2.3


From 2b388c67906ee8cd3bf1a600a7023cd0807d414f Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 10 May 2006 18:28:59 -0700
Subject: ocfs2: Compile-time disabling of ocfs2 debugging output.

Give gcc the chance to compile out the debug logging code in ocfs2.
This saves some size at the expense of being able to debug the code.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/Kconfig                 | 10 ++++++++++
 fs/ocfs2/cluster/masklog.h | 22 +++++++++++++++++++++-
 fs/ocfs2/dir.c             |  6 ++----
 fs/ocfs2/mmap.c            |  4 ++--
 4 files changed, 35 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 6dc8cfd6d80..be870db242b 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -356,6 +356,16 @@ config OCFS2_FS
 	          - POSIX ACLs
 	          - readpages / writepages (not user visible)
 
+config OCFS2_DEBUG_MASKLOG
+	bool "OCFS2 logging support"
+	depends on OCFS2_FS
+	default y
+	help
+	  The ocfs2 filesystem has an extensive logging system.  The system
+	  allows selection of events to log via files in /sys/o2cb/logmask/.
+	  This option will enlarge your kernel, but it allows debugging of
+	  ocfs2 filesystem issues.
+
 config MINIX_FS
 	tristate "Minix fs support"
 	help
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 73edad78253..a42628ba9dd 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -123,6 +123,17 @@
 #define MLOG_MASK_PREFIX 0
 #endif
 
+/*
+ * When logging is disabled, force the bit test to 0 for anything other
+ * than errors and notices, allowing gcc to remove the code completely.
+ * When enabled, allow all masks.
+ */
+#if defined(CONFIG_OCFS2_DEBUG_MASKLOG)
+#define ML_ALLOWED_BITS ~0
+#else
+#define ML_ALLOWED_BITS (ML_ERROR|ML_NOTICE)
+#endif
+
 #define MLOG_MAX_BITS 64
 
 struct mlog_bits {
@@ -187,7 +198,8 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
 
 #define mlog(mask, fmt, args...) do {					\
 	u64 __m = MLOG_MASK_PREFIX | (mask);				\
-	if (__mlog_test_u64(__m, mlog_and_bits) &&			\
+	if ((__m & ML_ALLOWED_BITS) &&					\
+	    __mlog_test_u64(__m, mlog_and_bits) &&			\
 	    !__mlog_test_u64(__m, mlog_not_bits)) {			\
 		if (__m & ML_ERROR)					\
 			__mlog_printk(KERN_ERR, "ERROR: "fmt , ##args);	\
@@ -204,6 +216,7 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
 		mlog(ML_ERROR, "status = %lld\n", (long long)_st);	\
 } while (0)
 
+#if defined(CONFIG_OCFS2_DEBUG_MASKLOG)
 #define mlog_entry(fmt, args...) do {					\
 	mlog(ML_ENTRY, "ENTRY:" fmt , ##args);				\
 } while (0)
@@ -247,6 +260,13 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
 #define mlog_exit_void() do {						\
 	mlog(ML_EXIT, "EXIT\n");					\
 } while (0)
+#else
+#define mlog_entry(...)  do { } while (0)
+#define mlog_entry_void(...)  do { } while (0)
+#define mlog_exit(...)  do { } while (0)
+#define mlog_exit_ptr(...)  do { } while (0)
+#define mlog_exit_void(...)  do { } while (0)
+#endif  /* defined(CONFIG_OCFS2_DEBUG_MASKLOG) */
 
 #define mlog_bug_on_msg(cond, fmt, args...) do {			\
 	if (cond) {							\
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index ae47f450792..3d494d1a5f3 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -213,11 +213,9 @@ int ocfs2_find_files_on_disk(const char *name,
 			     struct ocfs2_dir_entry **dirent)
 {
 	int status = -ENOENT;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-	mlog_entry("(osb=%p, parent=%llu, name='%.*s', blkno=%p, inode=%p)\n",
-		   osb, (unsigned long long)OCFS2_I(inode)->ip_blkno,
-		   namelen, name, blkno, inode);
+	mlog_entry("(name=%.*s, blkno=%p, inode=%p, dirent_bh=%p, dirent=%p)\n",
+		   namelen, name, blkno, inode, dirent_bh, dirent);
 
 	*dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent);
 	if (!*dirent_bh || !*dirent) {
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 843cf9ddefe..83934e33e5b 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -46,12 +46,12 @@ static struct page *ocfs2_nopage(struct vm_area_struct * area,
 				 unsigned long address,
 				 int *type)
 {
-	struct inode *inode = area->vm_file->f_dentry->d_inode;
 	struct page *page = NOPAGE_SIGBUS;
 	sigset_t blocked, oldset;
 	int ret;
 
-	mlog_entry("(inode %lu, address %lu)\n", inode->i_ino, address);
+	mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address,
+		   type);
 
 	/* The best way to deal with signals in this path is
 	 * to block them upfront, rather than allowing the
-- 
cgit v1.2.3


From 4ba63adce06bc7549e1dd36344123dbaccdaa52f Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Sun, 26 Mar 2006 14:25:52 +0200
Subject: ocfs2: OCFS2_FS must depend on SYSFS

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index be870db242b..53f5c6d6112 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -326,7 +326,7 @@ source "fs/xfs/Kconfig"
 
 config OCFS2_FS
 	tristate "OCFS2 file system support (EXPERIMENTAL)"
-	depends on NET && EXPERIMENTAL
+	depends on NET && SYSFS && EXPERIMENTAL
 	select CONFIGFS_FS
 	select JBD
 	select CRC32
-- 
cgit v1.2.3


From 0db638f44e7db9732d9c5704ca837f57ce061f42 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Tue, 9 May 2006 15:09:35 -0700
Subject: ocfs2: warn the user on a dead timeout mismatch

Print a warning to the user when a node with a different dead count joins
the region.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/cluster/heartbeat.c       | 20 ++++++++++++++++++++
 fs/ocfs2/cluster/ocfs2_heartbeat.h |  1 +
 2 files changed, 21 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 1d26cfcd9f8..504595d6cf6 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -517,6 +517,7 @@ static inline void o2hb_prepare_block(struct o2hb_region *reg,
 	hb_block->hb_seq = cpu_to_le64(cputime);
 	hb_block->hb_node = node_num;
 	hb_block->hb_generation = cpu_to_le64(generation);
+	hb_block->hb_dead_ms = cpu_to_le32(o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS);
 
 	/* This step must always happen last! */
 	hb_block->hb_cksum = cpu_to_le32(o2hb_compute_block_crc_le(reg,
@@ -645,6 +646,8 @@ static int o2hb_check_slot(struct o2hb_region *reg,
 	struct o2nm_node *node;
 	struct o2hb_disk_heartbeat_block *hb_block = reg->hr_tmp_block;
 	u64 cputime;
+	unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS;
+	unsigned int slot_dead_ms;
 
 	memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes);
 
@@ -733,6 +736,23 @@ fire_callbacks:
 			      &o2hb_live_slots[slot->ds_node_num]);
 
 		slot->ds_equal_samples = 0;
+
+		/* We want to be sure that all nodes agree on the
+		 * number of milliseconds before a node will be
+		 * considered dead. The self-fencing timeout is
+		 * computed from this value, and a discrepancy might
+		 * result in heartbeat calling a node dead when it
+		 * hasn't self-fenced yet. */
+		slot_dead_ms = le32_to_cpu(hb_block->hb_dead_ms);
+		if (slot_dead_ms && slot_dead_ms != dead_ms) {
+			/* TODO: Perhaps we can fail the region here. */
+			mlog(ML_ERROR, "Node %d on device %s has a dead count "
+			     "of %u ms, but our count is %u ms.\n"
+			     "Please double check your configuration values "
+			     "for 'O2CB_HEARTBEAT_THRESHOLD'\n",
+			     slot->ds_node_num, reg->hr_dev_name, slot_dead_ms,
+			     dead_ms);
+		}
 		goto out;
 	}
 
diff --git a/fs/ocfs2/cluster/ocfs2_heartbeat.h b/fs/ocfs2/cluster/ocfs2_heartbeat.h
index 94096069cb4..3f4151da970 100644
--- a/fs/ocfs2/cluster/ocfs2_heartbeat.h
+++ b/fs/ocfs2/cluster/ocfs2_heartbeat.h
@@ -32,6 +32,7 @@ struct o2hb_disk_heartbeat_block {
 	__u8  hb_pad1[3];
 	__le32 hb_cksum;
 	__le64 hb_generation;
+	__le32 hb_dead_ms;
 };
 
 #endif /* _OCFS2_HEARTBEAT_H */
-- 
cgit v1.2.3


From 8169cae5a13b9f8ae53edc183825d20b4f4daeeb Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Fri, 31 Mar 2006 16:53:55 +0200
Subject: [PATCH] fs/ocfs2/dlm/dlmrecovery.c: make dlm_lockres_master_requery()
 static

dlm_lockres_master_requery() became global without any external usage.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlm/dlmcommon.h   | 2 --
 fs/ocfs2/dlm/dlmrecovery.c | 8 ++++++--
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 9bdc9cf6599..14530ee7e11 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -822,8 +822,6 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data);
 int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data);
 int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
 			  u8 nodenum, u8 *real_master);
-int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
-			       struct dlm_lock_resource *res, u8 *real_master);
 
 
 int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 29b2845f370..594745fab0b 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -95,6 +95,9 @@ static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st);
 static void dlm_request_all_locks_worker(struct dlm_work_item *item,
 					 void *data);
 static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data);
+static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res,
+				      u8 *real_master);
 
 static u64 dlm_get_next_mig_cookie(void);
 
@@ -1484,8 +1487,9 @@ leave:
 
 
-int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
-			       struct dlm_lock_resource *res, u8 *real_master)
+static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
+				      struct dlm_lock_resource *res,
+				      u8 *real_master)
 {
 	struct dlm_node_iter iter;
 	int nodenum;
-- 
cgit v1.2.3


From a43db30c7c614c08851a97476aeb317ca2e14475 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 27 Apr 2006 16:36:14 -0700
Subject: ocfs2: silence -EEXIST from ocfs2_extent_map_insert/lookup

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/extent_map.c | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 1a5c69071df..fcd4475d1f8 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -298,7 +298,7 @@ static int ocfs2_extent_map_find_leaf(struct inode *inode,
 
 		ret = ocfs2_extent_map_insert(inode, rec,
 					      le16_to_cpu(el->l_tree_depth));
-		if (ret) {
+		if (ret && (ret != -EEXIST)) {
 			mlog_errno(ret);
 			goto out_free;
 		}
@@ -427,6 +427,11 @@ static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
 /*
  * Simple rule: on any return code other than -EAGAIN, anything left
  * in the insert_context will be freed.
+ *
+ * Simple rule #2: A return code of -EEXIST from this function or
+ * its calls to ocfs2_extent_map_insert_entry() signifies that another
+ * thread beat us to the insert.  It is not an actual error, but it
+ * tells the caller we have no more work to do.
  */
 static int ocfs2_extent_map_try_insert(struct inode *inode,
 				       struct ocfs2_extent_rec *rec,
@@ -448,22 +453,32 @@ static int ocfs2_extent_map_try_insert(struct inode *inode,
 		goto out_unlock;
 	}
 
+	/* Since insert_entry failed, the map MUST have old_ent */
 	old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
-					  le32_to_cpu(rec->e_clusters), NULL,
-					  NULL);
+					  le32_to_cpu(rec->e_clusters),
+					  NULL, NULL);
 
 	BUG_ON(!old_ent);
 
-	ret = -EEXIST;
-	if (old_ent->e_tree_depth < tree_depth)
+	if (old_ent->e_tree_depth < tree_depth) {
+		/* Another thread beat us to the lower tree_depth */
+		ret = -EEXIST;
 		goto out_unlock;
+	}
 
 	if (old_ent->e_tree_depth == tree_depth) {
+		/*
+		 * Another thread beat us to this tree_depth.
+		 * Let's make sure we agree with that thread (the
+		 * extent_rec should be identical).
+		 */
 		if (!memcmp(rec, &old_ent->e_rec,
 			    sizeof(struct ocfs2_extent_rec)))
 			ret = 0;
+		else
+			/* FIXME: Should this be ESRCH/EBADR??? */
+			ret = -EEXIST;
 
-		/* FIXME: Should this be ESRCH/EBADR??? */
 		goto out_unlock;
 	}
 
@@ -599,7 +614,7 @@ static int ocfs2_extent_map_insert(struct inode *inode,
 						  tree_depth, &ctxt);
 	} while (ret == -EAGAIN);
 
-	if (ret < 0)
+	if ((ret < 0) && (ret != -EEXIST))
 		mlog_errno(ret);
 
 	if (ctxt.left_ent)
-- 
cgit v1.2.3


From 781ee3e2b1ea41d56ed86ae1c85fc40b7f330205 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Thu, 27 Apr 2006 16:41:31 -0700
Subject: ocfs2: Cleanup message prints

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/cluster/tcp.c   | 14 +++++++-------
 fs/ocfs2/dlm/dlmdomain.c |  9 ++++++---
 fs/ocfs2/super.c         | 15 +++++++--------
 3 files changed, 20 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 1591eb37a72..b650efa8c8b 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -396,8 +396,8 @@ static void o2net_set_nn_state(struct o2net_node *nn,
 	}
 
 	if (was_valid && !valid) {
-		mlog(ML_NOTICE, "no longer connected to " SC_NODEF_FMT "\n",
-		     SC_NODEF_ARGS(old_sc));
+		printk(KERN_INFO "o2net: no longer connected to "
+		       SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc));
 		o2net_complete_nodes_nsw(nn);
 	}
 
@@ -409,10 +409,10 @@ static void o2net_set_nn_state(struct o2net_node *nn,
 		 * the only way to start connecting again is to down
 		 * heartbeat and bring it back up. */
 		cancel_delayed_work(&nn->nn_connect_expired);
-		mlog(ML_NOTICE, "%s " SC_NODEF_FMT "\n", 
-		     o2nm_this_node() > sc->sc_node->nd_num ?
-		     	"connected to" : "accepted connection from",
-		     SC_NODEF_ARGS(sc));
+		printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n",
+		       o2nm_this_node() > sc->sc_node->nd_num ?
+		       		"connected to" : "accepted connection from",
+		       SC_NODEF_ARGS(sc));
 	}
 
 	/* trigger the connecting worker func as long as we're not valid,
@@ -1280,7 +1280,7 @@ static void o2net_idle_timer(unsigned long data)
 
 	do_gettimeofday(&now);
 
-	mlog(ML_NOTICE, "connection to " SC_NODEF_FMT " has been idle for 10 "
+	printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for 10 "
 	     "seconds, shutting it down.\n", SC_NODEF_ARGS(sc));
 	mlog(ML_NOTICE, "here are some times that might help debug the "
 	     "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv "
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index b8c23f7ba67..8d1065f8b3b 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -408,12 +408,13 @@ static void __dlm_print_nodes(struct dlm_ctxt *dlm)
 
 	assert_spin_locked(&dlm->spinlock);
 
-	mlog(ML_NOTICE, "Nodes in my domain (\"%s\"):\n", dlm->name);
+	printk(KERN_INFO "ocfs2_dlm: Nodes in domain (\"%s\"): ", dlm->name);
 
 	while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
 				     node + 1)) < O2NM_MAX_NODES) {
-		mlog(ML_NOTICE, " node %d\n", node);
+		printk("%d ", node);
 	}
+	printk("\n");
 }
 
 static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data)
@@ -429,7 +430,7 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data)
 
 	node = exit_msg->node_idx;
 
-	mlog(0, "Node %u leaves domain %s\n", node, dlm->name);
+	printk(KERN_INFO "ocfs2_dlm: Node %u leaves domain %s\n", node, dlm->name);
 
 	spin_lock(&dlm->spinlock);
 	clear_bit(node, dlm->domain_map);
@@ -678,6 +679,8 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data)
 		set_bit(assert->node_idx, dlm->domain_map);
 		__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
 
+		printk(KERN_INFO "ocfs2_dlm: Node %u joins domain %s\n",
+		       assert->node_idx, dlm->name);
 		__dlm_print_nodes(dlm);
 
 		/* notify anything attached to the heartbeat events */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index cdf73393f09..4e214ecb588 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -642,10 +642,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 
 	ocfs2_complete_mount_recovery(osb);
 
-	printk("ocfs2: Mounting device (%u,%u) on (node %d, slot %d) with %s "
-	       "data mode.\n",
-	       MAJOR(sb->s_dev), MINOR(sb->s_dev), osb->node_num,
-	       osb->slot_num,
+	printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %d, slot %d) "
+	       "with %s data mode.\n",
+	       osb->dev_str, osb->node_num, osb->slot_num,
 	       osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" :
 	       "ordered");
 
@@ -1020,7 +1019,7 @@ static int ocfs2_fill_local_node_info(struct ocfs2_super *osb)
 		goto bail;
 	}
 
-	mlog(ML_NOTICE, "I am node %d\n", osb->node_num);
+	mlog(0, "I am node %d\n", osb->node_num);
 
 	status = 0;
 bail:
@@ -1191,8 +1190,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 
 	atomic_set(&osb->vol_state, VOLUME_DISMOUNTED);
 
-	printk("ocfs2: Unmounting device (%u,%u) on (node %d)\n",
-	       MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev), osb->node_num);
+	printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %d)\n",
+	       osb->dev_str, osb->node_num);
 
 	ocfs2_delete_osb(osb);
 	kfree(osb);
@@ -1327,7 +1326,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 		status = -EINVAL;
 		goto bail;
 	}
-	mlog(ML_NOTICE, "max_slots for this device: %u\n", osb->max_slots);
+	mlog(0, "max_slots for this device: %u\n", osb->max_slots);
 
 	init_waitqueue_head(&osb->osb_wipe_event);
 	osb->osb_orphan_wipes = kcalloc(osb->max_slots,
-- 
cgit v1.2.3


From d426721cf10824391fd38bd42f38357ace2b1c08 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Thu, 27 Apr 2006 16:44:13 -0700
Subject: ocfs2: silence ENOENT during lookup of broken links

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/symlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 0c8a1294ec9..c0f68aa6c17 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -154,7 +154,7 @@ static void *ocfs2_follow_link(struct dentry *dentry,
 	}
 
 	status = vfs_follow_link(nd, link);
-	if (status)
+	if (status && status != -ENOENT)
 		mlog_errno(status);
 bail:
 	if (page) {
-- 
cgit v1.2.3


From e7607ab3daeeaea50b3b5aebe8dfa29a1dfb8311 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Thu, 27 Apr 2006 17:53:22 -0700
Subject: ocfs2: silence a debug print

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/slot_map.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 871627961d6..aa6f5aadedc 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -264,7 +264,7 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
 	osb->slot_num = slot;
 	spin_unlock(&si->si_lock);
 
-	mlog(ML_NOTICE, "taking node slot %d\n", osb->slot_num);
+	mlog(0, "taking node slot %d\n", osb->slot_num);
 
 	status = ocfs2_update_disk_slots(osb, si);
 	if (status < 0)
-- 
cgit v1.2.3


From a75a6e4c3ada10b15e26f6d12f72c03efde266e0 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Thu, 4 May 2006 11:49:22 -0700
Subject: ocfs2: fix init of uuid_net_key

ocfs2_initialize_super() should be copying from the beginning of the uuid.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 4e214ecb588..3aa9f18527f 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1417,7 +1417,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 		goto bail;
 	}
 
-	memcpy(&uuid_net_key, &osb->uuid[i], sizeof(osb->net_key));
+	memcpy(&uuid_net_key, osb->uuid, sizeof(uuid_net_key));
 	osb->net_key = le32_to_cpu(uuid_net_key);
 
 	strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
-- 
cgit v1.2.3


From 784270435b001164054e803421a624ef1098519d Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Thu, 4 May 2006 12:03:26 -0700
Subject: ocfs2: clean up some osb fields

Get rid of osb->uuid, osb->proc_sub_dir, and osb->osb_id. Those fields were
unused, or could easily be removed. As a result, we also no longer need
MAX_OSB_ID or ocfs2_globals_lock.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dlmglue.c |  3 +--
 fs/ocfs2/journal.c |  5 ++---
 fs/ocfs2/ocfs2.h   |  4 ----
 fs/ocfs2/super.c   | 34 +---------------------------------
 4 files changed, 4 insertions(+), 42 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 4acd37286bd..762eb1fbb34 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2071,8 +2071,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
 	}
 
 	/* launch vote thread */
-	osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote-%d",
-				     osb->osb_id);
+	osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote");
 	if (IS_ERR(osb->vote_task)) {
 		status = PTR_ERR(osb->vote_task);
 		osb->vote_task = NULL;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 910a601b2e9..f92bf1dd379 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -784,8 +784,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal)
 	}
 
 	/* Launch the commit thread */
-	osb->commit_task = kthread_run(ocfs2_commit_thread, osb, "ocfs2cmt-%d",
-				       osb->osb_id);
+	osb->commit_task = kthread_run(ocfs2_commit_thread, osb, "ocfs2cmt");
 	if (IS_ERR(osb->commit_task)) {
 		status = PTR_ERR(osb->commit_task);
 		osb->commit_task = NULL;
@@ -1118,7 +1117,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
 		goto out;
 
 	osb->recovery_thread_task =  kthread_run(__ocfs2_recovery_thread, osb,
-						 "ocfs2rec-%d", osb->osb_id);
+						 "ocfs2rec");
 	if (IS_ERR(osb->recovery_thread_task)) {
 		mlog_errno((int)PTR_ERR(osb->recovery_thread_task));
 		osb->recovery_thread_task = NULL;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index da1093039c0..cd4a6f253d1 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -184,7 +184,6 @@ struct ocfs2_journal;
 struct ocfs2_journal_handle;
 struct ocfs2_super
 {
-	u32 osb_id;		/* id used by the proc interface */
 	struct task_struct *commit_task;
 	struct super_block *sb;
 	struct inode *root_inode;
@@ -222,13 +221,11 @@ struct ocfs2_super
 	unsigned long s_mount_opt;
 
 	u16 max_slots;
-	u16 num_nodes;
 	s16 node_num;
 	s16 slot_num;
 	int s_sectsize_bits;
 	int s_clustersize;
 	int s_clustersize_bits;
-	struct proc_dir_entry *proc_sub_dir; /* points to /proc/fs/ocfs2/<maj_min> */
 
 	atomic_t vol_state;
 	struct mutex recovery_lock;
@@ -294,7 +291,6 @@ struct ocfs2_super
 };
 
 #define OCFS2_SB(sb)	    ((struct ocfs2_super *)(sb)->s_fs_info)
-#define OCFS2_MAX_OSB_ID             65536
 
 static inline int ocfs2_should_order_data(struct inode *inode)
 {
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 3aa9f18527f..382706a67ff 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -68,13 +68,6 @@
 
 #include "buffer_head_io.h"
 
-/*
- * Globals
- */
-static spinlock_t ocfs2_globals_lock = SPIN_LOCK_UNLOCKED;
-
-static u32 osb_id;             /* Keeps track of next available OSB Id */
-
 static kmem_cache_t *ocfs2_inode_cachep = NULL;
 
 kmem_cache_t *ocfs2_lock_cache = NULL;
@@ -799,10 +792,6 @@ static int __init ocfs2_init(void)
 		goto leave;
 	}
 
-	spin_lock(&ocfs2_globals_lock);
-	osb_id = 0;
-	spin_unlock(&ocfs2_globals_lock);
-
 	ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
 	if (!ocfs2_debugfs_root) {
 		status = -EFAULT;
@@ -1211,8 +1200,6 @@ static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uu
 	if (osb->uuid_str == NULL)
 		return -ENOMEM;
 
-	memcpy(osb->uuid, uuid, OCFS2_VOL_UUID_LEN);
-
 	for (i = 0, ptr = osb->uuid_str; i < OCFS2_VOL_UUID_LEN; i++) {
 		/* print with null */
 		ret = snprintf(ptr, 3, "%02X", uuid[i]);
@@ -1310,13 +1297,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
 		goto bail;
 	}
 
-	osb->uuid = kmalloc(OCFS2_VOL_UUID_LEN, GFP_KERNEL);
-	if (!osb->uuid) {
-		mlog(ML_ERROR, "unable to alloc uuid\n");
-		status = -ENOMEM;
-		goto bail;
-	}
-
 	di = (struct ocfs2_dinode *)bh->b_data;
 
 	osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
@@ -1417,7 +1397,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 		goto bail;
 	}
 
-	memcpy(&uuid_net_key, osb->uuid, sizeof(uuid_net_key));
+	memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key));
 	osb->net_key = le32_to_cpu(uuid_net_key);
 
 	strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
@@ -1483,18 +1463,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
 		goto bail;
 	}
 
-	/*  Link this osb onto the global linked list of all osb structures. */
-	/*  The Global Link List is mainted for the whole driver . */
-	spin_lock(&ocfs2_globals_lock);
-	osb->osb_id = osb_id;
-	if (osb_id < OCFS2_MAX_OSB_ID)
-		osb_id++;
-	else {
-		mlog(ML_ERROR, "Too many volumes mounted\n");
-		status = -ENOMEM;
-	}
-	spin_unlock(&ocfs2_globals_lock);
-
 bail:
 	mlog_exit(status);
 	return status;
-- 
cgit v1.2.3


From 184d7d20d352c7374f70ebca7468dc8cd5cc618a Mon Sep 17 00:00:00 2001
From: Florin Malita <fmalita@gmail.com>
Date: Sat, 3 Jun 2006 19:30:10 -0400
Subject: ocfs2: remove redundant NULL checks in ocfs2_direct_IO_get_blocks()

Signed-off-by: Florin Malita <fmalita@gmail.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/aops.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index cca71317b6d..f1d1c342ce0 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -558,16 +558,9 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 	u64 vbo_max; /* file offset, max_blocks from iblock */
 	u64 p_blkno;
 	int contig_blocks;
-	unsigned char blocksize_bits;
+	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
 	unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
 
-	if (!inode || !bh_result) {
-		mlog(ML_ERROR, "inode or bh_result is null\n");
-		return -EIO;
-	}
-
-	blocksize_bits = inode->i_sb->s_blocksize_bits;
-
 	/* This function won't even be called if the request isn't all
 	 * nicely aligned and of the right size, so there's no need
 	 * for us to check any of that. */
-- 
cgit v1.2.3


From 0418726bb5c7b5a70c7e7e82e860d5979d0c78cf Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Fri, 30 Jun 2006 18:23:04 +0200
Subject: typo fixes: aquire -> acquire

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: Mauro Carvalho Chehab <mchehab@infradead.org>
---
 fs/befs/linuxvfs.c  | 38 +++++++++++++++++++-------------------
 fs/cifs/file.c      |  2 +-
 fs/jfs/jfs_txnmgr.c |  2 +-
 3 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index a83e889a97c..fcaeead9696 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -325,7 +325,7 @@ befs_read_inode(struct inode *inode)
 	if (!bh) {
 		befs_error(sb, "unable to read inode block - "
 			   "inode = %lu", inode->i_ino);
-		goto unaquire_none;
+		goto unacquire_none;
 	}
 
 	raw_inode = (befs_inode *) bh->b_data;
@@ -334,7 +334,7 @@ befs_read_inode(struct inode *inode)
 
 	if (befs_check_inode(sb, raw_inode, inode->i_ino) != BEFS_OK) {
 		befs_error(sb, "Bad inode: %lu", inode->i_ino);
-		goto unaquire_bh;
+		goto unacquire_bh;
 	}
 
 	inode->i_mode = (umode_t) fs32_to_cpu(sb, raw_inode->mode);
@@ -402,17 +402,17 @@ befs_read_inode(struct inode *inode)
 		befs_error(sb, "Inode %lu is not a regular file, "
 			   "directory or symlink. THAT IS WRONG! BeFS has no "
 			   "on disk special files", inode->i_ino);
-		goto unaquire_bh;
+		goto unacquire_bh;
 	}
 
 	brelse(bh);
 	befs_debug(sb, "<--- befs_read_inode()");
 	return;
 
-      unaquire_bh:
+      unacquire_bh:
 	brelse(bh);
 
-      unaquire_none:
+      unacquire_none:
 	make_bad_inode(inode);
 	befs_debug(sb, "<--- befs_read_inode() - Bad inode");
 	return;
@@ -761,14 +761,14 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
 		printk(KERN_ERR
 		       "BeFS(%s): Unable to allocate memory for private "
 		       "portion of superblock. Bailing.\n", sb->s_id);
-		goto unaquire_none;
+		goto unacquire_none;
 	}
 	befs_sb = BEFS_SB(sb);
 	memset(befs_sb, 0, sizeof(befs_sb_info));
 
 	if (!parse_options((char *) data, &befs_sb->mount_opts)) {
 		befs_error(sb, "cannot parse mount options");
-		goto unaquire_priv_sbp;
+		goto unacquire_priv_sbp;
 	}
 
 	befs_debug(sb, "---> befs_fill_super()");
@@ -794,7 +794,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
 
 	if (!(bh = sb_bread(sb, sb_block))) {
 		befs_error(sb, "unable to read superblock");
-		goto unaquire_priv_sbp;
+		goto unacquire_priv_sbp;
 	}
 
 	/* account for offset of super block on x86 */
@@ -809,20 +809,20 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	if (befs_load_sb(sb, disk_sb) != BEFS_OK)
-		goto unaquire_bh;
+		goto unacquire_bh;
 
 	befs_dump_super_block(sb, disk_sb);
 
 	brelse(bh);
 
 	if (befs_check_sb(sb) != BEFS_OK)
-		goto unaquire_priv_sbp;
+		goto unacquire_priv_sbp;
 
 	if( befs_sb->num_blocks > ~((sector_t)0) ) {
 		befs_error(sb, "blocks count: %Lu "
 			"is larger than the host can use",
 			befs_sb->num_blocks);
-		goto unaquire_priv_sbp;
+		goto unacquire_priv_sbp;
 	}
 
 	/*
@@ -838,7 +838,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
 	if (!sb->s_root) {
 		iput(root);
 		befs_error(sb, "get root inode failed");
-		goto unaquire_priv_sbp;
+		goto unacquire_priv_sbp;
 	}
 
 	/* load nls library */
@@ -860,13 +860,13 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
 
 	return 0;
 /*****************/
-      unaquire_bh:
+      unacquire_bh:
 	brelse(bh);
 
-      unaquire_priv_sbp:
+      unacquire_priv_sbp:
 	kfree(sb->s_fs_info);
 
-      unaquire_none:
+      unacquire_none:
 	sb->s_fs_info = NULL;
 	return -EINVAL;
 }
@@ -925,18 +925,18 @@ init_befs_fs(void)
 
 	err = befs_init_inodecache();
 	if (err)
-		goto unaquire_none;
+		goto unacquire_none;
 
 	err = register_filesystem(&befs_fs_type);
 	if (err)
-		goto unaquire_inodecache;
+		goto unacquire_inodecache;
 
 	return 0;
 
-unaquire_inodecache:
+unacquire_inodecache:
 	befs_destroy_inodecache();
 
-unaquire_none:
+unacquire_none:
 	return err;
 }
 
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 5861eb42e62..944d2b9e092 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -324,7 +324,7 @@ out:
 	return rc;
 }
 
-/* Try to reaquire byte range locks that were released when session */
+/* Try to reacquire byte range locks that were released when session */
 /* to server was lost */
 static int cifs_relock_file(struct cifsFileInfo *cifsFile)
 {
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index ac3d66948e8..10c46231ce1 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -842,7 +842,7 @@ struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp,
 	TXN_UNLOCK();
 	release_metapage(mp);
 	TXN_LOCK();
-	xtid = tlck->tid;	/* reaquire after dropping TXN_LOCK */
+	xtid = tlck->tid;	/* reacquire after dropping TXN_LOCK */
 
 	jfs_info("txLock: in waitLock, tid = %d, xtid = %d, lid = %d",
 		 tid, xtid, lid);
-- 
cgit v1.2.3


From 779cbf0bbc75629ecffd480d2f261af42ea80bbe Mon Sep 17 00:00:00 2001
From: Paul Collins <paul@ondioline.org>
Date: Fri, 30 Jun 2006 18:50:03 +0200
Subject: v9fs: do not include linux/version.h

I noticed that part of v9fs was being rebuilt when version.h changed.

Signed-off-by: Paul Collins <paul@ondioline.org>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
---
 fs/9p/vfs_addr.c | 1 -
 fs/9p/vfs_file.c | 1 -
 2 files changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index d4f0aa3c87f..9dfd259a70b 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -31,7 +31,6 @@
 #include <linux/string.h>
 #include <linux/smp_lock.h>
 #include <linux/inet.h>
-#include <linux/version.h>
 #include <linux/pagemap.h>
 #include <linux/idr.h>
 
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 1a8e46084f0..c3c47eda757 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -31,7 +31,6 @@
 #include <linux/string.h>
 #include <linux/smp_lock.h>
 #include <linux/inet.h>
-#include <linux/version.h>
 #include <linux/list.h>
 #include <asm/uaccess.h>
 #include <linux/idr.h>
-- 
cgit v1.2.3


From 6ab3d5624e172c553004ecc862bfeac16d9d68b7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rn=20Engel?= <joern@wohnheim.fh-wedel.de>
Date: Fri, 30 Jun 2006 19:25:36 +0200
Subject: Remove obsolete #include <linux/config.h>
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Jörn Engel <joern@wohnheim.fh-wedel.de>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
---
 fs/9p/conv.c                   | 1 -
 fs/9p/error.c                  | 1 -
 fs/9p/fcall.c                  | 1 -
 fs/9p/fcprint.c                | 1 -
 fs/9p/fid.c                    | 1 -
 fs/9p/mux.c                    | 1 -
 fs/9p/trans_fd.c               | 1 -
 fs/9p/v9fs.c                   | 1 -
 fs/9p/vfs_super.c              | 1 -
 fs/adfs/dir.c                  | 1 -
 fs/binfmt_flat.c               | 1 -
 fs/binfmt_som.c                | 1 -
 fs/block_dev.c                 | 1 -
 fs/buffer.c                    | 1 -
 fs/char_dev.c                  | 1 -
 fs/cifs/asn1.c                 | 1 -
 fs/coda/sysctl.c               | 1 -
 fs/compat_ioctl.c              | 1 -
 fs/dcache.c                    | 1 -
 fs/dcookies.c                  | 1 -
 fs/debugfs/file.c              | 1 -
 fs/debugfs/inode.c             | 1 -
 fs/exec.c                      | 1 -
 fs/ext2/balloc.c               | 1 -
 fs/ext2/ialloc.c               | 1 -
 fs/ext2/super.c                | 1 -
 fs/ext2/xattr.h                | 1 -
 fs/ext3/balloc.c               | 1 -
 fs/ext3/resize.c               | 1 -
 fs/ext3/super.c                | 1 -
 fs/ext3/xattr.h                | 1 -
 fs/file_table.c                | 1 -
 fs/hfs/super.c                 | 1 -
 fs/hfsplus/super.c             | 1 -
 fs/inode.c                     | 1 -
 fs/ioctl.c                     | 1 -
 fs/isofs/compress.c            | 1 -
 fs/isofs/dir.c                 | 1 -
 fs/isofs/inode.c               | 1 -
 fs/jffs/intrep.c               | 1 -
 fs/jffs/jffs_fm.h              | 1 -
 fs/jffs2/compr_zlib.c          | 1 -
 fs/jffs2/debug.h               | 1 -
 fs/jffs2/fs.c                  | 1 -
 fs/jffs2/nodelist.h            | 1 -
 fs/jffs2/super.c               | 1 -
 fs/jfs/super.c                 | 1 -
 fs/lockd/clntproc.c            | 1 -
 fs/lockd/svc.c                 | 1 -
 fs/lockd/svclock.c             | 1 -
 fs/lockd/svcproc.c             | 1 -
 fs/lockd/svcsubs.c             | 1 -
 fs/lockd/xdr.c                 | 1 -
 fs/namespace.c                 | 1 -
 fs/ncpfs/dir.c                 | 1 -
 fs/ncpfs/inode.c               | 1 -
 fs/ncpfs/ioctl.c               | 1 -
 fs/ncpfs/ncplib_kernel.c       | 1 -
 fs/ncpfs/ncplib_kernel.h       | 1 -
 fs/ncpfs/ncpsign_kernel.c      | 1 -
 fs/ncpfs/sock.c                | 1 -
 fs/ncpfs/symlink.c             | 1 -
 fs/nfs/callback.c              | 1 -
 fs/nfs/callback_proc.c         | 1 -
 fs/nfs/callback_xdr.c          | 1 -
 fs/nfs/delegation.c            | 1 -
 fs/nfs/direct.c                | 1 -
 fs/nfs/inode.c                 | 1 -
 fs/nfs/nfs4state.c             | 1 -
 fs/nfs/pagelist.c              | 1 -
 fs/nfs/read.c                  | 1 -
 fs/nfs/sysctl.c                | 1 -
 fs/nfs/write.c                 | 1 -
 fs/nfsctl.c                    | 1 -
 fs/nfsd/nfs4callback.c         | 1 -
 fs/nfsd/nfs4idmap.c            | 1 -
 fs/nfsd/nfsctl.c               | 1 -
 fs/nfsd/nfssvc.c               | 1 -
 fs/nfsd/vfs.c                  | 1 -
 fs/nls/nls_base.c              | 1 -
 fs/ntfs/sysctl.h               | 1 -
 fs/partitions/acorn.c          | 1 -
 fs/partitions/efi.c            | 1 -
 fs/partitions/efi.h            | 1 -
 fs/partitions/ibm.c            | 1 -
 fs/partitions/mac.c            | 1 -
 fs/partitions/msdos.c          | 1 -
 fs/proc/array.c                | 1 -
 fs/proc/base.c                 | 1 -
 fs/proc/kcore.c                | 1 -
 fs/proc/proc_misc.c            | 1 -
 fs/proc/root.c                 | 1 -
 fs/proc/vmcore.c               | 1 -
 fs/qnx4/bitmap.c               | 1 -
 fs/qnx4/dir.c                  | 1 -
 fs/qnx4/fsync.c                | 1 -
 fs/qnx4/inode.c                | 1 -
 fs/qnx4/namei.c                | 1 -
 fs/qnx4/truncate.c             | 1 -
 fs/reiserfs/bitmap.c           | 1 -
 fs/reiserfs/dir.c              | 1 -
 fs/reiserfs/do_balan.c         | 1 -
 fs/reiserfs/fix_node.c         | 1 -
 fs/reiserfs/ibalance.c         | 1 -
 fs/reiserfs/inode.c            | 1 -
 fs/reiserfs/journal.c          | 1 -
 fs/reiserfs/lbalance.c         | 1 -
 fs/reiserfs/namei.c            | 1 -
 fs/reiserfs/objectid.c         | 1 -
 fs/reiserfs/prints.c           | 1 -
 fs/reiserfs/procfs.c           | 1 -
 fs/reiserfs/stree.c            | 1 -
 fs/reiserfs/super.c            | 1 -
 fs/reiserfs/tail_conversion.c  | 1 -
 fs/smbfs/inode.c               | 1 -
 fs/smbfs/smbiod.c              | 1 -
 fs/stat.c                      | 1 -
 fs/super.c                     | 1 -
 fs/udf/super.c                 | 1 -
 fs/udf/udfdecl.h               | 1 -
 fs/ufs/super.c                 | 1 -
 fs/xfs/linux-2.6/xfs_buf.h     | 1 -
 fs/xfs/linux-2.6/xfs_ioctl32.c | 1 -
 fs/xfs/linux-2.6/xfs_linux.h   | 1 -
 124 files changed, 124 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/conv.c b/fs/9p/conv.c
index a767e05b60b..1e898144eb7 100644
--- a/fs/9p/conv.c
+++ b/fs/9p/conv.c
@@ -24,7 +24,6 @@
  *
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
diff --git a/fs/9p/error.c b/fs/9p/error.c
index 981fe8ecd78..ae91555c155 100644
--- a/fs/9p/error.c
+++ b/fs/9p/error.c
@@ -27,7 +27,6 @@
  *
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 
 #include <linux/list.h>
diff --git a/fs/9p/fcall.c b/fs/9p/fcall.c
index 6f2617820a4..8556097fcda 100644
--- a/fs/9p/fcall.c
+++ b/fs/9p/fcall.c
@@ -24,7 +24,6 @@
  *
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
diff --git a/fs/9p/fcprint.c b/fs/9p/fcprint.c
index 583e827baeb..34b96114a28 100644
--- a/fs/9p/fcprint.c
+++ b/fs/9p/fcprint.c
@@ -21,7 +21,6 @@
  *  Boston, MA  02111-1301  USA
  *
  */
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index b7608af07ce..70492ccb438 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -20,7 +20,6 @@
  *
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
diff --git a/fs/9p/mux.c b/fs/9p/mux.c
index 8d45ed66883..90a79c78454 100644
--- a/fs/9p/mux.c
+++ b/fs/9p/mux.c
@@ -23,7 +23,6 @@
  *
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
diff --git a/fs/9p/trans_fd.c b/fs/9p/trans_fd.c
index 94e0a7fd9fc..34d43355beb 100644
--- a/fs/9p/trans_fd.c
+++ b/fs/9p/trans_fd.c
@@ -25,7 +25,6 @@
  *
  */
 
-#include <linux/config.h>
 #include <linux/in.h>
 #include <linux/module.h>
 #include <linux/net.h>
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index d37416eb579..22f7ccd58d3 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -23,7 +23,6 @@
  *
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 8b15bb22cac..63320d4e15d 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -25,7 +25,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 7b075fc397d..d3c7905b2dd 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -9,7 +9,6 @@
  *
  *  Common directory handling for ADFS
  */
-#include <linux/config.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
 #include <linux/adfs_fs.h>
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index c94d52eafd1..a62fd4018a2 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -16,7 +16,6 @@
  */
 
 #include <linux/module.h>
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index 00a91dc25d1..32b5d625ce9 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -32,7 +32,6 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
-#include <linux/config.h>
 
 #include <linux/elf.h>
 
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 909cb0595b4..9633a490dab 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -5,7 +5,6 @@
  *  Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
  */
 
-#include <linux/config.h>
 #include <linux/init.h>
 #include <linux/mm.h>
 #include <linux/fcntl.h>
diff --git a/fs/buffer.c b/fs/buffer.c
index e9994722f4a..bf22bb56a08 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -18,7 +18,6 @@
  * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
  */
 
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/syscalls.h>
 #include <linux/fs.h>
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 97986635b64..a4cbc6706ef 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -4,7 +4,6 @@
  *  Copyright (C) 1991, 1992  Linus Torvalds
  */
 
-#include <linux/config.h>
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index 031cdf29325..2e75883b7f5 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -17,7 +17,6 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index f0b10757288..1c82e9a7d7c 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -11,7 +11,6 @@
  *
  */
 
-#include <linux/config.h>
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/sysctl.h>
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index d8d50a70c58..4063a939697 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -10,7 +10,6 @@
  * ioctls.
  */
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/compat.h>
 #include <linux/kernel.h>
diff --git a/fs/dcache.c b/fs/dcache.c
index 48b44a714b3..c6e3535be19 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -14,7 +14,6 @@
  * the dcache entry is deleted or garbage collected.
  */
 
-#include <linux/config.h>
 #include <linux/syscalls.h>
 #include <linux/string.h>
 #include <linux/mm.h>
diff --git a/fs/dcookies.c b/fs/dcookies.c
index 8749339bf4f..0c4b0674854 100644
--- a/fs/dcookies.c
+++ b/fs/dcookies.c
@@ -12,7 +12,6 @@
  * to the pair and can be looked up from userspace.
  */
 
-#include <linux/config.h>
 #include <linux/syscalls.h>
 #include <linux/module.h>
 #include <linux/slab.h>
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 66a505422e5..39640fd0345 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -13,7 +13,6 @@
  *
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 6fa1e04f841..e8ae3042b80 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -16,7 +16,6 @@
 /* uncomment to get debug messages from the debug filesystem, ah the irony. */
 /* #define DEBUG */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/mount.h>
diff --git a/fs/exec.c b/fs/exec.c
index c8494f513ea..8344ba73a2a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -22,7 +22,6 @@
  * formats. 
  */
 
-#include <linux/config.h>
 #include <linux/slab.h>
 #include <linux/file.h>
 #include <linux/mman.h>
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 433a213a8bd..d4870432ecf 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -11,7 +11,6 @@
  *        David S. Miller (davem@caip.rutgers.edu), 1995
  */
 
-#include <linux/config.h>
 #include "ext2.h"
 #include <linux/quotaops.h>
 #include <linux/sched.h>
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 308c252568c..de85c61c58c 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -12,7 +12,6 @@
  *        David S. Miller (davem@caip.rutgers.edu), 1995
  */
 
-#include <linux/config.h>
 #include <linux/quotaops.h>
 #include <linux/sched.h>
 #include <linux/backing-dev.h>
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index d4233b2e643..9f43879d6d6 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -16,7 +16,6 @@
  *        David S. Miller (davem@caip.rutgers.edu), 1995
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/fs.h>
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index 67cfeb66e89..bf8175b2ced 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -6,7 +6,6 @@
   (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
 */
 
-#include <linux/config.h>
 #include <linux/init.h>
 #include <linux/xattr.h>
 
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 96172e89ddc..a504a40d6d2 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -11,7 +11,6 @@
  *        David S. Miller (davem@caip.rutgers.edu), 1995
  */
 
-#include <linux/config.h>
 #include <linux/time.h>
 #include <linux/capability.h>
 #include <linux/fs.h>
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index dfd811895d8..5e1337fd878 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -8,7 +8,6 @@
  * This could probably be made into a module, because it is not often in use.
  */
 
-#include <linux/config.h>
 
 #define EXT3FS_DEBUG
 
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index b7483360a2d..f2dd7133661 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -16,7 +16,6 @@
  *        David S. Miller (davem@caip.rutgers.edu), 1995
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/fs.h>
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
index 2ceae38f3d4..6b1ae1c6182 100644
--- a/fs/ext3/xattr.h
+++ b/fs/ext3/xattr.h
@@ -6,7 +6,6 @@
   (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
 */
 
-#include <linux/config.h>
 #include <linux/xattr.h>
 
 /* Magic value in attribute blocks */
diff --git a/fs/file_table.c b/fs/file_table.c
index 506d5307108..0131ba06e1e 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -5,7 +5,6 @@
  *  Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
  */
 
-#include <linux/config.h>
 #include <linux/string.h>
 #include <linux/slab.h>
 #include <linux/file.h>
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index d9227bf14e8..34937ee83ab 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -12,7 +12,6 @@
  * Based on the minix file system code, (C) 1991, 1992 by Linus Torvalds
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/blkdev.h>
 #include <linux/mount.h>
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 0a92fa2336a..d279d5924f2 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -7,7 +7,6 @@
  *
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/pagemap.h>
diff --git a/fs/inode.c b/fs/inode.c
index f42961eb983..e53796a6d88 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -4,7 +4,6 @@
  * (C) 1997 Linus Torvalds
  */
 
-#include <linux/config.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/dcache.h>
diff --git a/fs/ioctl.c b/fs/ioctl.c
index f8aeec3ca10..4b7660b09ac 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -4,7 +4,6 @@
  *  Copyright (C) 1991, 1992  Linus Torvalds
  */
 
-#include <linux/config.h>
 #include <linux/syscalls.h>
 #include <linux/mm.h>
 #include <linux/smp_lock.h>
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index 3a39158cca9..731816332b1 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -16,7 +16,6 @@
  * Transparent decompression of files on an iso9660 filesystem
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/init.h>
 
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index 5440ea292c6..27e276987fd 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -10,7 +10,6 @@
  * 
  *  isofs directory handling functions
  */
-#include <linux/config.h>
 #include <linux/smp_lock.h>
 #include "isofs.h"
 
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index bb11c7fb401..14391361c88 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -11,7 +11,6 @@
  *	2004  Paul Serice - NFS Export Operations
  */
 
-#include <linux/config.h>
 #include <linux/init.h>
 #include <linux/module.h>
 
diff --git a/fs/jffs/intrep.c b/fs/jffs/intrep.c
index 5371a403130..9000f1effed 100644
--- a/fs/jffs/intrep.c
+++ b/fs/jffs/intrep.c
@@ -55,7 +55,6 @@
  *
  */
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/jffs.h>
diff --git a/fs/jffs/jffs_fm.h b/fs/jffs/jffs_fm.h
index c794d923df2..9ee6ad29eff 100644
--- a/fs/jffs/jffs_fm.h
+++ b/fs/jffs/jffs_fm.h
@@ -20,7 +20,6 @@
 #ifndef __LINUX_JFFS_FM_H__
 #define __LINUX_JFFS_FM_H__
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/jffs.h>
 #include <linux/mtd/mtd.h>
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index 5c63e0cdcf4..3681d0728ac 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -15,7 +15,6 @@
 #error "The userspace support got too messy and was removed. Update your mkfs.jffs2"
 #endif
 
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
diff --git a/fs/jffs2/debug.h b/fs/jffs2/debug.h
index 5fa494a792b..3daf3bca037 100644
--- a/fs/jffs2/debug.h
+++ b/fs/jffs2/debug.h
@@ -13,7 +13,6 @@
 #ifndef _JFFS2_DEBUG_H_
 #define _JFFS2_DEBUG_H_
 
-#include <linux/config.h>
 
 #ifndef CONFIG_JFFS2_FS_DEBUG
 #define CONFIG_JFFS2_FS_DEBUG 0
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 97caa77d60c..4780f82825d 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -12,7 +12,6 @@
  */
 
 #include <linux/capability.h>
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/fs.h>
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index b16c60bbcf6..f752baa8d39 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -14,7 +14,6 @@
 #ifndef __JFFS2_NODELIST_H__
 #define __JFFS2_NODELIST_H__
 
-#include <linux/config.h>
 #include <linux/fs.h>
 #include <linux/types.h>
 #include <linux/jffs2.h>
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 2378a662c25..68e3953419b 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -11,7 +11,6 @@
  *
  */
 
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/slab.h>
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 73d2aba084c..4f6cfebc82d 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -18,7 +18,6 @@
  */
 
 #include <linux/fs.h>
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/parser.h>
 #include <linux/completion.h>
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 4db62098d3f..5980c45998c 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -6,7 +6,6 @@
  * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de>
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/errno.h>
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index fd56c8872f3..9a991b52c64 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -12,7 +12,6 @@
  * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/sysctl.h>
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 3ef739120df..baf5ae51348 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -20,7 +20,6 @@
  * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de>
  */
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/kernel.h>
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index d210cf304e9..dbb66a3b5cd 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -7,7 +7,6 @@
  * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de>
  */
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/time.h>
 #include <linux/slab.h>
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index a570e5c8a93..2a4df9b3779 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -6,7 +6,6 @@
  * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de>
  */
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/time.h>
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index f22a3764461..033ea4ac2c3 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -6,7 +6,6 @@
  * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
  */
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/sched.h>
 #include <linux/utsname.h>
diff --git a/fs/namespace.c b/fs/namespace.c
index b3ed212ea41..fa7ed6a9fc2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -8,7 +8,6 @@
  * Heavily rewritten.
  */
 
-#include <linux/config.h>
 #include <linux/syscalls.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index f0860c602d8..b4ee89250e9 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -10,7 +10,6 @@
  *
  */
 
-#include <linux/config.h>
 
 #include <linux/time.h>
 #include <linux/errno.h>
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 6c51c119846..1ddf77b0b82 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -9,7 +9,6 @@
  *
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 
 #include <asm/system.h>
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index eb3813ad136..42039fe0653 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -7,7 +7,6 @@
  *
  */
 
-#include <linux/config.h>
 
 #include <asm/uaccess.h>
 #include <linux/capability.h>
diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c
index d9ebf6439f5..551e0bac7aa 100644
--- a/fs/ncpfs/ncplib_kernel.c
+++ b/fs/ncpfs/ncplib_kernel.c
@@ -10,7 +10,6 @@
  */
 
 
-#include <linux/config.h>
 
 #include "ncplib_kernel.h"
 
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index 799e5c2bec5..2441d1ab57d 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -12,7 +12,6 @@
 #ifndef _NCPLIB_H
 #define _NCPLIB_H
 
-#include <linux/config.h>
 
 #include <linux/fs.h>
 #include <linux/types.h>
diff --git a/fs/ncpfs/ncpsign_kernel.c b/fs/ncpfs/ncpsign_kernel.c
index a6ec90cd889..749a18d3359 100644
--- a/fs/ncpfs/ncpsign_kernel.c
+++ b/fs/ncpfs/ncpsign_kernel.c
@@ -5,7 +5,6 @@
  *
  */
 
-#include <linux/config.h>
 
 #ifdef CONFIG_NCPFS_PACKET_SIGNING
 
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index 8783eb7ec64..11c2b252ebe 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -8,7 +8,6 @@
  *
  */
 
-#include <linux/config.h>
 
 #include <linux/time.h>
 #include <linux/errno.h>
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index f76b1392a01..ca92c240663 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -20,7 +20,6 @@
  *
  */
 
-#include <linux/config.h>
 
 #include <asm/uaccess.h>
 
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index d53f8c6a9ec..fe0a6b8ac14 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -6,7 +6,6 @@
  * NFSv4 callback handling
  */
 
-#include <linux/config.h>
 #include <linux/completion.h>
 #include <linux/ip.h>
 #include <linux/module.h>
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 462cfceb50c..7719483ecdf 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -5,7 +5,6 @@
  *
  * NFSv4 callback procedures
  */
-#include <linux/config.h>
 #include <linux/nfs4.h>
 #include <linux/nfs_fs.h>
 #include "nfs4_fs.h"
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index c92991328d9..29f93219205 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -5,7 +5,6 @@
  *
  * NFSv4 callback encode/decode procedures
  */
-#include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/nfs4.h>
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index d3be923d4e4..9540a316c05 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -6,7 +6,6 @@
  * NFS file delegation management
  *
  */
-#include <linux/config.h>
 #include <linux/completion.h>
 #include <linux/kthread.h>
 #include <linux/module.h>
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 8ca9707be6c..4cdd1b499e3 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -38,7 +38,6 @@
  *
  */
 
-#include <linux/config.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/kernel.h>
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c5b916605fb..d349fb2245d 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -13,7 +13,6 @@
  *
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/init.h>
 
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 96e5b82c153..090a36b07a2 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -38,7 +38,6 @@
  * subsequent patch.
  */
 
-#include <linux/config.h>
 #include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/nfs_fs.h>
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index d89f6fb3b3a..d3ff7abcf54 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -9,7 +9,6 @@
  *
  */
 
-#include <linux/config.h>
 #include <linux/slab.h>
 #include <linux/file.h>
 #include <linux/sunrpc/clnt.h>
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 32cf3773af0..52bf634260a 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -15,7 +15,6 @@
  * within the RPC code when root squashing is suspected.
  */
 
-#include <linux/config.h>
 #include <linux/time.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index db61e51bb15..2fe3403c240 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -3,7 +3,6 @@
  *
  * Sysctl interface to NFS parameters
  */
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/linkage.h>
 #include <linux/ctype.h>
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 8fccb9cb173..1aadc138ec4 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -46,7 +46,6 @@
  * Copyright (C) 1996, 1997, Olaf Kirch <okir@monad.swb.de>
  */
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index a5a18d4aca4..c043136a82c 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -4,7 +4,6 @@
  *	This should eventually move to userland.
  *
  */
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/file.h>
 #include <linux/fs.h>
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index dbaf3f93f32..54b37b1d2e3 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -33,7 +33,6 @@
  *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/list.h>
 #include <linux/inet.h>
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 4b6aa60dfce..bea6b947811 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -34,7 +34,6 @@
  *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/init.h>
 
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index a1810e6a93e..7046ac9cf97 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -6,7 +6,6 @@
  * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 
 #include <linux/linkage.h>
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 3790727e5df..ec1decf29ba 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -8,7 +8,6 @@
  * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de>
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 
 #include <linux/time.h>
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 245eaa1fb59..da3ec74d8a4 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -16,7 +16,6 @@
  * Zerocpy NFS support (C) 2002 Hirokazu Takahashi <taka@valinux.co.jp>
  */
 
-#include <linux/config.h>
 #include <linux/string.h>
 #include <linux/time.h>
 #include <linux/errno.h>
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index a912debcd20..9de6b495f11 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -10,7 +10,6 @@
 
 #include <linux/module.h>
 #include <linux/string.h>
-#include <linux/config.h>
 #include <linux/nls.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
diff --git a/fs/ntfs/sysctl.h b/fs/ntfs/sysctl.h
index c8064cae8f1..beda5bf9640 100644
--- a/fs/ntfs/sysctl.h
+++ b/fs/ntfs/sysctl.h
@@ -24,7 +24,6 @@
 #ifndef _LINUX_NTFS_SYSCTL_H
 #define _LINUX_NTFS_SYSCTL_H
 
-#include <linux/config.h>
 
 #if defined(DEBUG) && defined(CONFIG_SYSCTL)
 
diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c
index c05085710fc..1bc9f372c7d 100644
--- a/fs/partitions/acorn.c
+++ b/fs/partitions/acorn.c
@@ -12,7 +12,6 @@
  *  every single manufacturer of SCSI and IDE cards created their own
  *  method.
  */
-#include <linux/config.h>
 #include <linux/buffer_head.h>
 #include <linux/adfs_fs.h>
 
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 0f5b017aeba..63730282ad8 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -91,7 +91,6 @@
  * - Code works, detects all the partitions.
  *
  ************************************************************/
-#include <linux/config.h>
 #include <linux/crc32.h>
 #include "check.h"
 #include "efi.h"
diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h
index c44fb056144..2cc89d0475b 100644
--- a/fs/partitions/efi.h
+++ b/fs/partitions/efi.h
@@ -26,7 +26,6 @@
 #define FS_PART_EFI_H_INCLUDED
 
 #include <linux/types.h>
-#include <linux/config.h>
 #include <linux/fs.h>
 #include <linux/genhd.h>
 #include <linux/kernel.h>
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index 830c55d86ab..d352a7381fe 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -6,7 +6,6 @@
  * (C) IBM Corporation, IBM Deutschland Entwicklung GmbH, 1999,2000
  */
 
-#include <linux/config.h>
 #include <linux/buffer_head.h>
 #include <linux/hdreg.h>
 #include <linux/slab.h>
diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c
index 813292f2121..c0871002d00 100644
--- a/fs/partitions/mac.c
+++ b/fs/partitions/mac.c
@@ -6,7 +6,6 @@
  *  Re-organised Feb 1998 Russell King
  */
 
-#include <linux/config.h>
 #include <linux/ctype.h>
 #include "check.h"
 #include "mac.h"
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 9935d254186..8f12587c312 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -19,7 +19,6 @@
  *  Re-organised Feb 1998 Russell King
  */
 
-#include <linux/config.h>
 
 #include "check.h"
 #include "msdos.h"
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 7a76ad57023..7495d3e2077 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -52,7 +52,6 @@
  *			 :  base.c too.
  */
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/time.h>
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6ba7785319d..243a94af042 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -49,7 +49,6 @@
 
 #include <asm/uaccess.h>
 
-#include <linux/config.h>
 #include <linux/errno.h>
 #include <linux/time.h>
 #include <linux/proc_fs.h>
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 17f6e8fa139..036d14d8362 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -9,7 +9,6 @@
  *	Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj Sarcar <kanoj@sgi.com>
  */
 
-#include <linux/config.h>
 #include <linux/mm.h>
 #include <linux/proc_fs.h>
 #include <linux/user.h>
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 5c10ea15742..b8d48852649 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -26,7 +26,6 @@
 #include <linux/mman.h>
 #include <linux/proc_fs.h>
 #include <linux/ioport.h>
-#include <linux/config.h>
 #include <linux/mm.h>
 #include <linux/mmzone.h>
 #include <linux/pagemap.h>
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 9995356ce73..8901c65caca 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -12,7 +12,6 @@
 #include <linux/time.h>
 #include <linux/proc_fs.h>
 #include <linux/stat.h>
-#include <linux/config.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/bitops.h>
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 20d4b2237fc..d96050728c4 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -7,7 +7,6 @@
  *
  */
 
-#include <linux/config.h>
 #include <linux/mm.h>
 #include <linux/proc_fs.h>
 #include <linux/user.h>
diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c
index 46efbf52cbe..8425cf6e962 100644
--- a/fs/qnx4/bitmap.c
+++ b/fs/qnx4/bitmap.c
@@ -13,7 +13,6 @@
  * 28-06-1998 by Frank Denis : qnx4_free_inode (to be fixed) .
  */
 
-#include <linux/config.h>
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/qnx4_fs.h>
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 9031948fefd..0d7103fa0df 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -11,7 +11,6 @@
  * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support.
  */
 
-#include <linux/config.h>
 #include <linux/string.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
diff --git a/fs/qnx4/fsync.c b/fs/qnx4/fsync.c
index df5bc75d541..aa3b19544be 100644
--- a/fs/qnx4/fsync.c
+++ b/fs/qnx4/fsync.c
@@ -10,7 +10,6 @@
  * 24-03-1998 by Richard Frowijn : first release.
  */
 
-#include <linux/config.h>
 #include <linux/errno.h>
 #include <linux/time.h>
 #include <linux/stat.h>
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 8bc182a8874..5a903491e69 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -12,7 +12,6 @@
  * 30-06-1998 by Frank Denis : first step to write inodes.
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/string.h>
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index 4af4951d7f5..c3d83f67154 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -12,7 +12,6 @@
  * 04-07-1998 by Frank Denis : first step for rmdir/unlink.
  */
 
-#include <linux/config.h>
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/qnx4_fs.h>
diff --git a/fs/qnx4/truncate.c b/fs/qnx4/truncate.c
index 86563ec01b3..6437c1c3d1d 100644
--- a/fs/qnx4/truncate.c
+++ b/fs/qnx4/truncate.c
@@ -10,7 +10,6 @@
  * 30-06-1998 by Frank DENIS : ugly filler.
  */
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index 909f71e9a30..4a7dbdee1b6 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -3,7 +3,6 @@
  */
 /* Reiserfs block (de)allocator, bitmap-based. */
 
-#include <linux/config.h>
 #include <linux/time.h>
 #include <linux/reiserfs_fs.h>
 #include <linux/errno.h>
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 973c819f803..9aabcc0ccd2 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -2,7 +2,6 @@
  * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
  */
 
-#include <linux/config.h>
 #include <linux/string.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index b2264ba3cc5..fba304e64de 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -15,7 +15,6 @@
  **
  **/
 
-#include <linux/config.h>
 #include <asm/uaccess.h>
 #include <linux/time.h>
 #include <linux/reiserfs_fs.h>
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 5600d3d60cf..6d0e554daa9 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -34,7 +34,6 @@
  ** 
  **/
 
-#include <linux/config.h>
 #include <linux/time.h>
 #include <linux/string.h>
 #include <linux/reiserfs_fs.h>
diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c
index 6c5a726fd34..de391a82b99 100644
--- a/fs/reiserfs/ibalance.c
+++ b/fs/reiserfs/ibalance.c
@@ -2,7 +2,6 @@
  * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
  */
 
-#include <linux/config.h>
 #include <asm/uaccess.h>
 #include <linux/string.h>
 #include <linux/time.h>
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index a24858a632f..477eaa2fcf6 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2,7 +2,6 @@
  * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
  */
 
-#include <linux/config.h>
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/reiserfs_fs.h>
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 49d1a53dbef..9b3672d6936 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -34,7 +34,6 @@
 **		        from within kupdate, it will ignore the immediate flag
 */
 
-#include <linux/config.h>
 #include <asm/uaccess.h>
 #include <asm/system.h>
 
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
index 2533c1f64ab..281f8061ac5 100644
--- a/fs/reiserfs/lbalance.c
+++ b/fs/reiserfs/lbalance.c
@@ -2,7 +2,6 @@
  * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
  */
 
-#include <linux/config.h>
 #include <asm/uaccess.h>
 #include <linux/string.h>
 #include <linux/time.h>
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 284f7852de8..c61710e49c6 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -11,7 +11,6 @@
  * NO WARRANTY
  */
 
-#include <linux/config.h>
 #include <linux/time.h>
 #include <linux/bitops.h>
 #include <linux/reiserfs_fs.h>
diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c
index f62590aa9c9..65feba4deb6 100644
--- a/fs/reiserfs/objectid.c
+++ b/fs/reiserfs/objectid.c
@@ -2,7 +2,6 @@
  * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
  */
 
-#include <linux/config.h>
 #include <linux/string.h>
 #include <linux/random.h>
 #include <linux/time.h>
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index 27bd3a1df2a..bc808a91eea 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -2,7 +2,6 @@
  * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
  */
 
-#include <linux/config.h>
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/reiserfs_fs.h>
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 731688e1cfe..5d8a8cfebc7 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -10,7 +10,6 @@
 
 /* $Id: procfs.c,v 1.1.8.2 2001/07/15 17:08:42 god Exp $ */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/time.h>
 #include <linux/seq_file.h>
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index d2b25e1ba6e..8b9b1312713 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -49,7 +49,6 @@
  * reiserfs_insert_item
  */
 
-#include <linux/config.h>
 #include <linux/time.h>
 #include <linux/string.h>
 #include <linux/pagemap.h>
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 00f1321e920..28eb3c88603 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -11,7 +11,6 @@
  * NO WARRANTY
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/vmalloc.h>
 #include <linux/time.h>
diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c
index 196e971c03c..36f108fc1cf 100644
--- a/fs/reiserfs/tail_conversion.c
+++ b/fs/reiserfs/tail_conversion.c
@@ -2,7 +2,6 @@
  * Copyright 1999 Hans Reiser, see reiserfs/README for licensing and copyright details
  */
 
-#include <linux/config.h>
 #include <linux/time.h>
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 506ff87c1d4..a1ed657c3c8 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -7,7 +7,6 @@
  *  Please add a note about your changes to smbfs in the ChangeLog file.
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/time.h>
 #include <linux/kernel.h>
diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c
index 24577e2c489..e6754044128 100644
--- a/fs/smbfs/smbiod.c
+++ b/fs/smbfs/smbiod.c
@@ -5,7 +5,6 @@
  *  Copyright (C) 2001, Urban Widmark
  */
 
-#include <linux/config.h>
 
 #include <linux/sched.h>
 #include <linux/kernel.h>
diff --git a/fs/stat.c b/fs/stat.c
index 0f282face32..3a44dcf97da 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -4,7 +4,6 @@
  *  Copyright (C) 1991, 1992  Linus Torvalds
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/errno.h>
diff --git a/fs/super.c b/fs/super.c
index 8a669f6f3f5..9b780c42d84 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -20,7 +20,6 @@
  *  Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/init.h>
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 44fe2cb0bbb..4df822c881b 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -40,7 +40,6 @@
 
 #include "udfdecl.h"    
 
-#include <linux/config.h>
 #include <linux/blkdev.h>
 #include <linux/slab.h>
 #include <linux/kernel.h>
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 2f992387cc9..1033b7cf293 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -6,7 +6,6 @@
 #include "osta_udf.h"
 
 #include <linux/fs.h>
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/udf_fs_i.h>
 #include <linux/udf_fs_sb.h>
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 74ef5e9bedf..19a99726e58 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -64,7 +64,6 @@
  */
 
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/bitops.h>
 
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 4dd6592d5a4..ceda3a2859d 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -18,7 +18,6 @@
 #ifndef __XFS_BUF_H__
 #define __XFS_BUF_H__
 
-#include <linux/config.h>
 #include <linux/list.h>
 #include <linux/types.h>
 #include <linux/spinlock.h>
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 601f01c92f7..270db0f3861 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -15,7 +15,6 @@
  * along with this program; if not, write the Free Software Foundation,
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
-#include <linux/config.h>
 #include <linux/compat.h>
 #include <linux/init.h>
 #include <linux/ioctl.h>
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 028eb17ec2e..8c021dc57d1 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -19,7 +19,6 @@
 #define __XFS_LINUX__
 
 #include <linux/types.h>
-#include <linux/config.h>
 
 /*
  * Some types are conditional depending on the target system.
-- 
cgit v1.2.3


From 65ba55f500a37272985d071c9bbb35256a2f7c14 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 30 Jun 2006 01:55:34 -0700
Subject: [PATCH] zoned vm counters: convert nr_mapped to per zone counter

nr_mapped is important because it allows a determination of how many pages of
a zone are not mapped, which would allow a more efficient means of determining
when we need to reclaim memory in a zone.

We take the nr_mapped field out of the page state structure and define a new
per zone counter named NR_FILE_MAPPED (the anonymous pages will be split off
from NR_MAPPED in the next patch).

We replace the use of nr_mapped in various kernel locations.  This avoids the
looping over all processors in try_to_free_pages(), writeback, reclaim (swap +
zone reclaim).

[akpm@osdl.org: bugfix]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/proc_misc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 5c10ea15742..bc7d9abca74 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -190,7 +190,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		K(i.freeswap),
 		K(ps.nr_dirty),
 		K(ps.nr_writeback),
-		K(ps.nr_mapped),
+		K(global_page_state(NR_FILE_MAPPED)),
 		K(ps.nr_slab),
 		K(allowed),
 		K(committed),
-- 
cgit v1.2.3


From 347ce434d57da80fd5809c0c836f206a50999c26 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 30 Jun 2006 01:55:35 -0700
Subject: [PATCH] zoned vm counters: conversion of nr_pagecache to per zone
 counter

Currently a single atomic variable is used to establish the size of the page
cache in the whole machine.  The zoned VM counters have the same method of
implementation as the nr_pagecache code but also allow the determination of
the pagecache size per zone.

Remove the special implementation for nr_pagecache and make it a zoned counter
named NR_FILE_PAGES.

Updates of the page cache counters are always performed with interrupts off.
We can therefore use the __ variant here.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/proc_misc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index bc7d9abca74..1af12fd77fe 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -142,7 +142,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 	allowed = ((totalram_pages - hugetlb_total_pages())
 		* sysctl_overcommit_ratio / 100) + total_swap_pages;
 
-	cached = get_page_cache_size() - total_swapcache_pages - i.bufferram;
+	cached = global_page_state(NR_FILE_PAGES) -
+			total_swapcache_pages - i.bufferram;
 	if (cached < 0)
 		cached = 0;
 
-- 
cgit v1.2.3


From f3dbd34460ff54962d3e3244b6bcb7f5295356e6 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 30 Jun 2006 01:55:36 -0700
Subject: [PATCH] zoned vm counters: split NR_ANON_PAGES off from
 NR_FILE_MAPPED

The current NR_FILE_MAPPED is used by zone reclaim and the dirty load
calculation as the number of mapped pagecache pages.  However, that is not
true.  NR_FILE_MAPPED includes the mapped anonymous pages.  This patch
separates those and therefore allows an accurate tracking of the anonymous
pages per zone.

It then becomes possible to determine the number of unmapped pages per zone
and we can avoid scanning for unmapped pages if there are none.

Also it may now be possible to determine the mapped/unmapped ratio in
get_dirty_limit.  Isnt the number of anonymous pages irrelevant in that
calculation?

Note that this will change the meaning of the number of mapped pages reported
in /proc/vmstat /proc/meminfo and in the per node statistics.  This may affect
user space tools that monitor these counters!  NR_FILE_MAPPED works like
NR_FILE_DIRTY.  It is only valid for pagecache pages.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/proc_misc.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 1af12fd77fe..ff809656ce3 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -168,6 +168,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		"SwapFree:     %8lu kB\n"
 		"Dirty:        %8lu kB\n"
 		"Writeback:    %8lu kB\n"
+		"AnonPages:    %8lu kB\n"
 		"Mapped:       %8lu kB\n"
 		"Slab:         %8lu kB\n"
 		"CommitLimit:  %8lu kB\n"
@@ -191,6 +192,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		K(i.freeswap),
 		K(ps.nr_dirty),
 		K(ps.nr_writeback),
+		K(global_page_state(NR_ANON_PAGES)),
 		K(global_page_state(NR_FILE_MAPPED)),
 		K(ps.nr_slab),
 		K(allowed),
-- 
cgit v1.2.3


From 9a865ffa34b6117a5e0b67640a084d8c2e198c93 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 30 Jun 2006 01:55:38 -0700
Subject: [PATCH] zoned vm counters: conversion of nr_slab to per zone counter

- Allows reclaim to access counter without looping over processor counts.

- Allows accurate statistics on how many pages are used in a zone by
  the slab. This may become useful to balance slab allocations over
  various zones.

[akpm@osdl.org: bugfix]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/proc_misc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index ff809656ce3..16aaf7187bb 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -194,7 +194,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		K(ps.nr_writeback),
 		K(global_page_state(NR_ANON_PAGES)),
 		K(global_page_state(NR_FILE_MAPPED)),
-		K(ps.nr_slab),
+		K(global_page_state(NR_SLAB)),
 		K(allowed),
 		K(committed),
 		K(ps.nr_page_table_pages),
-- 
cgit v1.2.3


From df849a1529c106f7460e51479ca78fe07b07dc8c Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 30 Jun 2006 01:55:38 -0700
Subject: [PATCH] zoned vm counters: conversion of nr_pagetables to per zone
 counter

Conversion of nr_page_table_pages to a per zone counter

[akpm@osdl.org: bugfix]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/proc_misc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 16aaf7187bb..0eae68f8421 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -171,9 +171,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		"AnonPages:    %8lu kB\n"
 		"Mapped:       %8lu kB\n"
 		"Slab:         %8lu kB\n"
+		"PageTables:   %8lu kB\n"
 		"CommitLimit:  %8lu kB\n"
 		"Committed_AS: %8lu kB\n"
-		"PageTables:   %8lu kB\n"
 		"VmallocTotal: %8lu kB\n"
 		"VmallocUsed:  %8lu kB\n"
 		"VmallocChunk: %8lu kB\n",
@@ -195,9 +195,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		K(global_page_state(NR_ANON_PAGES)),
 		K(global_page_state(NR_FILE_MAPPED)),
 		K(global_page_state(NR_SLAB)),
+		K(global_page_state(NR_PAGETABLE)),
 		K(allowed),
 		K(committed),
-		K(ps.nr_page_table_pages),
 		(unsigned long)VMALLOC_TOTAL >> 10,
 		vmi.used >> 10,
 		vmi.largest_chunk >> 10
-- 
cgit v1.2.3


From b1e7a8fd854d2f895730e82137400012b509650e Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 30 Jun 2006 01:55:39 -0700
Subject: [PATCH] zoned vm counters: conversion of nr_dirty to per zone counter

This makes nr_dirty a per zone counter.  Looping over all processors is
avoided during writeback state determination.

The counter aggregation for nr_dirty had to be undone in the NFS layer since
we summed up the page counts from multiple zones.  Someone more familiar with
NFS should probably review what I have done.

[akpm@osdl.org: bugfix]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/buffer.c         | 2 +-
 fs/fs-writeback.c   | 2 +-
 fs/nfs/pagelist.c   | 1 +
 fs/nfs/write.c      | 3 +--
 fs/proc/proc_misc.c | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index e9994722f4a..90e52e67720 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -852,7 +852,7 @@ int __set_page_dirty_buffers(struct page *page)
 		write_lock_irq(&mapping->tree_lock);
 		if (page->mapping) {	/* Race with truncate? */
 			if (mapping_cap_account_dirty(mapping))
-				inc_page_state(nr_dirty);
+				__inc_zone_page_state(page, NR_FILE_DIRTY);
 			radix_tree_tag_set(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 031b27a4bc9..e5ad1075684 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -464,7 +464,7 @@ void sync_inodes_sb(struct super_block *sb, int wait)
 		.range_start	= 0,
 		.range_end	= LLONG_MAX,
 	};
-	unsigned long nr_dirty = read_page_state(nr_dirty);
+	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
 	unsigned long nr_unstable = read_page_state(nr_unstable);
 
 	wbc.nr_to_write = nr_dirty + nr_unstable +
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index d89f6fb3b3a..26b1fe90937 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -315,6 +315,7 @@ nfs_scan_lock_dirty(struct nfs_inode *nfsi, struct list_head *dst,
 						req->wb_index, NFS_PAGE_TAG_DIRTY);
 				nfs_list_remove_request(req);
 				nfs_list_add_request(req, dst);
+				dec_zone_page_state(req->wb_page, NR_FILE_DIRTY);
 				res++;
 			}
 		}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 8fccb9cb173..a6d1ca513dd 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -497,7 +497,7 @@ nfs_mark_request_dirty(struct nfs_page *req)
 	nfs_list_add_request(req, &nfsi->dirty);
 	nfsi->ndirty++;
 	spin_unlock(&nfsi->req_lock);
-	inc_page_state(nr_dirty);
+	inc_zone_page_state(req->wb_page, NR_FILE_DIRTY);
 	mark_inode_dirty(inode);
 }
 
@@ -609,7 +609,6 @@ nfs_scan_dirty(struct inode *inode, struct list_head *dst, unsigned long idx_sta
 	if (nfsi->ndirty != 0) {
 		res = nfs_scan_lock_dirty(nfsi, dst, idx_start, npages);
 		nfsi->ndirty -= res;
-		sub_page_state(nr_dirty,res);
 		if ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty))
 			printk(KERN_ERR "NFS: desynchronized value of nfs_i.ndirty.\n");
 	}
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 0eae68f8421..e23717dec3c 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -190,7 +190,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		K(i.freeram-i.freehigh),
 		K(i.totalswap),
 		K(i.freeswap),
-		K(ps.nr_dirty),
+		K(global_page_state(NR_FILE_DIRTY)),
 		K(ps.nr_writeback),
 		K(global_page_state(NR_ANON_PAGES)),
 		K(global_page_state(NR_FILE_MAPPED)),
-- 
cgit v1.2.3


From ce866b34ae1b7f1ce60234cf65855886ac7e7d30 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 30 Jun 2006 01:55:40 -0700
Subject: [PATCH] zoned vm counters: conversion of nr_writeback to per zone
 counter

Conversion of nr_writeback to per zone counter.

This removes the last page_state counter from arch/i386/mm/pgtable.c so we
drop the page_state from there.

[akpm@osdl.org: bugfix]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/proc_misc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index e23717dec3c..48cb3586244 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -191,7 +191,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		K(i.totalswap),
 		K(i.freeswap),
 		K(global_page_state(NR_FILE_DIRTY)),
-		K(ps.nr_writeback),
+		K(global_page_state(NR_WRITEBACK)),
 		K(global_page_state(NR_ANON_PAGES)),
 		K(global_page_state(NR_FILE_MAPPED)),
 		K(global_page_state(NR_SLAB)),
-- 
cgit v1.2.3


From fd39fc8561be33065306bdac0e30414e1e8ac8e1 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 30 Jun 2006 01:55:40 -0700
Subject: [PATCH] zoned vm counters: conversion of nr_unstable to per zone
 counter

Conversion of nr_unstable to a per zone counter

We need to do some special modifications to the nfs code since there are
multiple cases of disposition and we need to have a page ref for proper
accounting.

This converts the last critical page state of the VM and therefore we need to
remove several functions that were depending on GET_PAGE_STATE_LAST in order
to make the kernel compile again.  We are only left with event type counters
in page state.

[akpm@osdl.org: bugfixes]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fs-writeback.c   | 2 +-
 fs/nfs/write.c      | 6 ++----
 fs/proc/proc_misc.c | 4 ++--
 3 files changed, 5 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e5ad1075684..892643dc9af 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -465,7 +465,7 @@ void sync_inodes_sb(struct super_block *sb, int wait)
 		.range_end	= LLONG_MAX,
 	};
 	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
-	unsigned long nr_unstable = read_page_state(nr_unstable);
+	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
 
 	wbc.nr_to_write = nr_dirty + nr_unstable +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused) +
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index a6d1ca513dd..f21e268705c 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -525,7 +525,7 @@ nfs_mark_request_commit(struct nfs_page *req)
 	nfs_list_add_request(req, &nfsi->commit);
 	nfsi->ncommit++;
 	spin_unlock(&nfsi->req_lock);
-	inc_page_state(nr_unstable);
+	inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
 	mark_inode_dirty(inode);
 }
 #endif
@@ -1393,7 +1393,6 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs_write_data	*data = calldata;
 	struct nfs_page		*req;
-	int res = 0;
 
         dprintk("NFS: %4d nfs_commit_done (status %d)\n",
                                 task->tk_pid, task->tk_status);
@@ -1405,6 +1404,7 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
 	while (!list_empty(&data->pages)) {
 		req = nfs_list_entry(data->pages.next);
 		nfs_list_remove_request(req);
+		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
 
 		dprintk("NFS: commit (%s/%Ld %d@%Ld)",
 			req->wb_context->dentry->d_inode->i_sb->s_id,
@@ -1431,9 +1431,7 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
 		nfs_mark_request_dirty(req);
 	next:
 		nfs_clear_page_writeback(req);
-		res++;
 	}
-	sub_page_state(nr_unstable,res);
 }
 
 static const struct rpc_call_ops nfs_commit_ops = {
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 48cb3586244..eae66b41a1a 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -120,7 +120,6 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 {
 	struct sysinfo i;
 	int len;
-	struct page_state ps;
 	unsigned long inactive;
 	unsigned long active;
 	unsigned long free;
@@ -129,7 +128,6 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 	struct vmalloc_info vmi;
 	long cached;
 
-	get_page_state(&ps);
 	get_zone_counts(&active, &inactive, &free);
 
 /*
@@ -172,6 +170,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		"Mapped:       %8lu kB\n"
 		"Slab:         %8lu kB\n"
 		"PageTables:   %8lu kB\n"
+		"NFS Unstable: %8lu kB\n"
 		"CommitLimit:  %8lu kB\n"
 		"Committed_AS: %8lu kB\n"
 		"VmallocTotal: %8lu kB\n"
@@ -196,6 +195,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		K(global_page_state(NR_FILE_MAPPED)),
 		K(global_page_state(NR_SLAB)),
 		K(global_page_state(NR_PAGETABLE)),
+		K(global_page_state(NR_UNSTABLE_NFS)),
 		K(allowed),
 		K(committed),
 		(unsigned long)VMALLOC_TOTAL >> 10,
-- 
cgit v1.2.3


From d2c5e30c9a1420902262aa923794d2ae4e0bc391 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 30 Jun 2006 01:55:41 -0700
Subject: [PATCH] zoned vm counters: conversion of nr_bounce to per zone
 counter

Conversion of nr_bounce to a per zone counter

nr_bounce is only used for proc output.  So it could be left as an event
counter.  However, the event counters may not be accurate and nr_bounce is
categorizing types of pages in a zone.  So we really need this to also be a
per zone counter.

[akpm@osdl.org: bugfix]
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/proc_misc.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index eae66b41a1a..6aa2aa779a0 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -171,6 +171,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		"Slab:         %8lu kB\n"
 		"PageTables:   %8lu kB\n"
 		"NFS Unstable: %8lu kB\n"
+		"Bounce:       %8lu kB\n"
 		"CommitLimit:  %8lu kB\n"
 		"Committed_AS: %8lu kB\n"
 		"VmallocTotal: %8lu kB\n"
@@ -196,6 +197,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		K(global_page_state(NR_SLAB)),
 		K(global_page_state(NR_PAGETABLE)),
 		K(global_page_state(NR_UNSTABLE_NFS)),
+		K(global_page_state(NR_BOUNCE)),
 		K(allowed),
 		K(committed),
 		(unsigned long)VMALLOC_TOTAL >> 10,
-- 
cgit v1.2.3


From f8891e5e1f93a128c3900f82035e8541357896a7 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Fri, 30 Jun 2006 01:55:45 -0700
Subject: [PATCH] Light weight event counters

The remaining counters in page_state after the zoned VM counter patches
have been applied are all just for show in /proc/vmstat.  They have no
essential function for the VM.

We use a simple increment of per cpu variables.  In order to avoid the most
severe races we disable preempt.  Preempt does not prevent the race between
an increment and an interrupt handler incrementing the same statistics
counter.  However, that race is exceedingly rare, we may only loose one
increment or so and there is no requirement (at least not in kernel) that
the vm event counters have to be accurate.

In the non preempt case this results in a simple increment for each
counter.  For many architectures this will be reduced by the compiler to a
single instruction.  This single instruction is atomic for i386 and x86_64.
 And therefore even the rare race condition in an interrupt is avoided for
both architectures in most cases.

The patchset also adds an off switch for embedded systems that allows a
building of linux kernels without these counters.

The implementation of these counters is through inline code that hopefully
results in only a single instruction increment instruction being emitted
(i386, x86_64) or in the increment being hidden though instruction
concurrency (EPIC architectures such as ia64 can get that done).

Benefits:
- VM event counter operations usually reduce to a single inline instruction
  on i386 and x86_64.
- No interrupt disable, only preempt disable for the preempt case.
  Preempt disable can also be avoided by moving the counter into a spinlock.
- Handling is similar to zoned VM counters.
- Simple and easily extendable.
- Can be omitted to reduce memory use for embedded use.

References:

RFC http://marc.theaimsgroup.com/?l=linux-kernel&m=113512330605497&w=2
RFC http://marc.theaimsgroup.com/?l=linux-kernel&m=114988082814934&w=2
local_t http://marc.theaimsgroup.com/?l=linux-kernel&m=114991748606690&w=2
V2 http://marc.theaimsgroup.com/?t=115014808400007&r=1&w=2
V3 http://marc.theaimsgroup.com/?l=linux-kernel&m=115024767022346&w=2
V4 http://marc.theaimsgroup.com/?l=linux-kernel&m=115047968808926&w=2

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/inode.c      | 9 ++++-----
 fs/ncpfs/mmap.c | 2 +-
 2 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index f42961eb983..14a6c4147e4 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -452,15 +452,14 @@ static void prune_icache(int nr_to_scan)
 		nr_pruned++;
 	}
 	inodes_stat.nr_unused -= nr_pruned;
+	if (current_is_kswapd())
+		__count_vm_events(KSWAPD_INODESTEAL, reap);
+	else
+		__count_vm_events(PGINODESTEAL, reap);
 	spin_unlock(&inode_lock);
 
 	dispose_list(&freeable);
 	mutex_unlock(&iprune_mutex);
-
-	if (current_is_kswapd())
-		mod_page_state(kswapd_inodesteal, reap);
-	else
-		mod_page_state(pginodesteal, reap);
 }
 
 /*
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 52d60c3d899..e7d5a3097fe 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -93,7 +93,7 @@ static struct page* ncp_file_mmap_nopage(struct vm_area_struct *area,
 	 */
 	if (type)
 		*type = VM_FAULT_MAJOR;
-	inc_page_state(pgmajfault);
+	count_vm_event(PGMAJFAULT);
 	return page;
 }
 
-- 
cgit v1.2.3


From a1836a42daf5ddfe9a891973734bd9a7d62eb504 Mon Sep 17 00:00:00 2001
From: David Quigley <dpquigl@tycho.nsa.gov>
Date: Fri, 30 Jun 2006 01:55:49 -0700
Subject: [PATCH] SELinux: Add security hook definition for getioprio and
 insert hooks

Add a new security hook definition for the sys_ioprio_get operation.  At
present, the SELinux hook function implementation for this hook is
identical to the getscheduler implementation but a separate hook is
introduced to allow this check to be specialized in the future if
necessary.

This patch also creates a helper function get_task_ioprio which handles the
access check in addition to retrieving the ioprio value for the task.

Signed-off-by: David Quigley <dpquigl@tycho.nsa.gov>
Acked-by: Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: James Morris <jmorris@namei.org>
Cc: Jens Axboe <axboe@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ioprio.c | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ioprio.c b/fs/ioprio.c
index 7fa76ed53c1..93aa5715f22 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -125,11 +125,24 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
 	return ret;
 }
 
+static int get_task_ioprio(struct task_struct *p)
+{
+	int ret;
+
+	ret = security_task_getioprio(p);
+	if (ret)
+		goto out;
+	ret = p->ioprio;
+out:
+	return ret;
+}
+
 asmlinkage long sys_ioprio_get(int which, int who)
 {
 	struct task_struct *g, *p;
 	struct user_struct *user;
 	int ret = -ESRCH;
+	int tmpio;
 
 	read_lock_irq(&tasklist_lock);
 	switch (which) {
@@ -139,16 +152,19 @@ asmlinkage long sys_ioprio_get(int which, int who)
 			else
 				p = find_task_by_pid(who);
 			if (p)
-				ret = p->ioprio;
+				ret = get_task_ioprio(p);
 			break;
 		case IOPRIO_WHO_PGRP:
 			if (!who)
 				who = process_group(current);
 			do_each_task_pid(who, PIDTYPE_PGID, p) {
+				tmpio = get_task_ioprio(p);
+				if (tmpio < 0)
+					continue;
 				if (ret == -ESRCH)
-					ret = p->ioprio;
+					ret = tmpio;
 				else
-					ret = ioprio_best(ret, p->ioprio);
+					ret = ioprio_best(ret, tmpio);
 			} while_each_task_pid(who, PIDTYPE_PGID, p);
 			break;
 		case IOPRIO_WHO_USER:
@@ -163,10 +179,13 @@ asmlinkage long sys_ioprio_get(int which, int who)
 			do_each_thread(g, p) {
 				if (p->uid != user->uid)
 					continue;
+				tmpio = get_task_ioprio(p);
+				if (tmpio < 0)
+					continue;
 				if (ret == -ESRCH)
-					ret = p->ioprio;
+					ret = tmpio;
 				else
-					ret = ioprio_best(ret, p->ioprio);
+					ret = ioprio_best(ret, tmpio);
 			} while_each_thread(g, p);
 
 			if (who)
-- 
cgit v1.2.3


From a56f39375a7bd183149c18286389e4a58ac9bc95 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 30 Jun 2006 01:56:10 -0700
Subject: [PATCH] knfsd: improve the test for cross-device-rename in nfsd

Just testing the i_sb isn't really enough, at least the vfsmnt must be the
same.  Thanks Al.

Cc: Al Viro <viro@ftp.linux.org.uk>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/vfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 245eaa1fb59..b653ede37e6 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1553,7 +1553,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 	tdir = tdentry->d_inode;
 
 	err = (rqstp->rq_vers == 2) ? nfserr_acces : nfserr_xdev;
-	if (fdir->i_sb != tdir->i_sb)
+	if (ffhp->fh_export != tfhp->fh_export)
 		goto out;
 
 	err = nfserr_perm;
-- 
cgit v1.2.3


From 4bdff8c0950afa6a3f2efec6cd15159d97fac1e6 Mon Sep 17 00:00:00 2001
From: Frank Filz <ffilzlnx@us.ibm.com>
Date: Fri, 30 Jun 2006 01:56:11 -0700
Subject: [PATCH] knfsd: fixing missing 'expkey' support for fsid type 3

Type '3' is used for the fsid in filehandles when the device number of the
device holding the filesystem has more than 8 bits in either major or minor.
Unfortunately expkey_parse doesn't recognise type 3.  Fix this.

(Slighty modified from Frank's original)

Signed-off-by: Frank Filz <ffilzlnx@us.ibm.com>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/export.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 3eec30000f3..01bc68c628a 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -126,7 +126,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
 	if (*ep)
 		goto out;
 	dprintk("found fsidtype %d\n", fsidtype);
-	if (fsidtype > 2)
+	if (key_len(fsidtype)==0) /* invalid type */
 		goto out;
 	if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
 		goto out;
-- 
cgit v1.2.3


From 4c9608b2f2f51e84eba72f4bf061d360259ce656 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 30 Jun 2006 01:56:11 -0700
Subject: [PATCH] knfsd: remove noise about filehandle being uptodate

There is a perfectly valid situation where fh_update gets called on an already
uptodate filehandle - in nfsd_create_v3 where a CREATE_UNCHECKED finds an
existing file and wants to just set the size.

We could possible optimise out the call in that case, but the only harm
involved is that fh_update prints a warning, so it is easier to remove the
warning.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/nfsfh.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 3f2ec2e6d06..e3f3bbea8ae 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -461,7 +461,7 @@ fh_update(struct svc_fh *fhp)
 	} else {
 		int size;
 		if (fhp->fh_handle.fh_fileid_type != 0)
-			goto out_uptodate;
+			goto out;
 		datap = fhp->fh_handle.fh_auth+
 			fhp->fh_handle.fh_size/4 -1;
 		size = (fhp->fh_maxsize - fhp->fh_handle.fh_size)/4;
@@ -481,10 +481,6 @@ out_negative:
 	printk(KERN_ERR "fh_update: %s/%s still negative!\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 	goto out;
-out_uptodate:
-	printk(KERN_ERR "fh_update: %s/%s already up-to-date!\n",
-		dentry->d_parent->d_name.name, dentry->d_name.name);
-	goto out;
 }
 
 /*
-- 
cgit v1.2.3


From 7e4053645a67097fa9bec2794d685b1d3928757a Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 30 Jun 2006 01:56:12 -0700
Subject: [PATCH] knfsd: ignore ref_fh when crossing a mountpoint

nfsd tries to return to a client the same sort of filehandle as was used by
the client.  This removes some filehandle aliasing issues and means that a
server upgrade followed by a downgrade will not confused clients not restarted
during that time.

However when crossing a mountpoint, the filehandle used for one filesystem
doesn't provide any useful information on what sort of filehandle should be
used on the other, and can provide misleading information.  So if the
reference filehandle is on a different filesystem to the one being generated,
ignore it.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/nfsfh.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index e3f3bbea8ae..ca96ede30a1 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -312,8 +312,8 @@ int
 fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, struct svc_fh *ref_fh)
 {
 	/* ref_fh is a reference file handle.
-	 * if it is non-null, then we should compose a filehandle which is
-	 * of the same version, where possible.
+	 * if it is non-null and for the same filesystem, then we should compose
+	 * a filehandle which is of the same version, where possible.
 	 * Currently, that means that if ref_fh->fh_handle.fh_version == 0xca
 	 * Then create a 32byte filehandle using nfs_fhbase_old
 	 *
@@ -332,7 +332,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, st
 		parent->d_name.name, dentry->d_name.name,
 		(inode ? inode->i_ino : 0));
 
-	if (ref_fh) {
+	if (ref_fh && ref_fh->fh_export == exp) {
 		ref_fh_version = ref_fh->fh_handle.fh_version;
 		if (ref_fh_version == 0xca)
 			ref_fh_fsid_type = 0;
-- 
cgit v1.2.3


From a8cddc5dfc1d03a91885ef27eb91418e665577ce Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Fri, 30 Jun 2006 01:56:13 -0700
Subject: [PATCH] knfsd: nfsd4: fix open_confirm locking

Fix an improper unlock in an error path.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/nfs4state.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 7c7d01672d3..21497321a52 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2252,8 +2252,9 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs
 			(int)current_fh->fh_dentry->d_name.len,
 			current_fh->fh_dentry->d_name.name);
 
-	if ((status = fh_verify(rqstp, current_fh, S_IFREG, 0)))
-		goto out;
+	status = fh_verify(rqstp, current_fh, S_IFREG, 0);
+	if (status)
+		return status;
 
 	nfs4_lock_state();
 
-- 
cgit v1.2.3


From 7fc90ec93a5eb71f4b08403baf5ba7176b3ec6b1 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Fri, 30 Jun 2006 01:56:14 -0700
Subject: [PATCH] knfsd: nfsd: call nfsd_setuser() on fh_compose(), fix nfsd4
 permissions problem

In the typical v2/v3 case the only new filehandles used as arguments to
operations are filehandles taken directly off the wire, which don't get
dentries until fh_verify() is called.

But in v4 the filehandles that are arguments to operations were often created
by previous operations (putrootfh, lookup, etc.) using fh_compose, which sets
the dentry in the filehandle without calling nfsd_setuser().

This also means that, for example, if filesystem B is mounted on filesystem A,
and filesystem A is exported without root-squashing, then a client can bypass
the rootsquashing on B using a compound that starts at a filehandle in A,
crosses into B using lookups, and then does stuff in B.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/nfsfh.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index ca96ede30a1..ecc439d2565 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -187,13 +187,6 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
 			goto out;
 		}
 
-		/* Set user creds for this exportpoint */
-		error = nfsd_setuser(rqstp, exp);
-		if (error) {
-			error = nfserrno(error);
-			goto out;
-		}
-
 		/*
 		 * Look up the dentry using the NFS file handle.
 		 */
@@ -251,6 +244,14 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
 	}
 	cache_get(&exp->h);
 
+	/* Set user creds for this exportpoint; necessary even in the "just
+	 * checking" case because this may be a filehandle that was created by
+	 * fh_compose, and that is about to be used in another nfsv4 compound
+	 * operation */
+	error = nfserrno(nfsd_setuser(rqstp, exp));
+	if (error)
+		goto out;
+
 	error = nfsd_mode_check(rqstp, dentry->d_inode->i_mode, type);
 	if (error)
 		goto out;
-- 
cgit v1.2.3


From 6e46d8a9ccbcd3273bdb6902ca2b6da62c253e73 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Fri, 30 Jun 2006 01:56:14 -0700
Subject: [PATCH] knfsd: nfsd4: remove superfluous grace period checks

We're checking nfs_in_grace here a few times when there isn't really any
reason to--bad_stateid is probably the more sensible return value anyway.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/nfs4state.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 21497321a52..4810577347c 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2070,16 +2070,12 @@ nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int fl
 	if (!stateid->si_fileid) { /* delegation stateid */
 		if(!(dp = find_delegation_stateid(ino, stateid))) {
 			dprintk("NFSD: delegation stateid not found\n");
-			if (nfs4_in_grace())
-				status = nfserr_grace;
 			goto out;
 		}
 		stidp = &dp->dl_stateid;
 	} else { /* open or lock stateid */
 		if (!(stp = find_stateid(stateid, flags))) {
 			dprintk("NFSD: open or lock stateid not found\n");
-			if (nfs4_in_grace())
-				status = nfserr_grace;
 			goto out;
 		}
 		if ((flags & CHECK_FH) && nfs4_check_fh(current_fh, stp))
-- 
cgit v1.2.3


From 270d56e536dcd37cc819a6adb51d918185411048 Mon Sep 17 00:00:00 2001
From: "David M. Richter" <richterd@citi.umich.edu>
Date: Fri, 30 Jun 2006 01:56:15 -0700
Subject: [PATCH] knfsd: nfsd: fix misplaced fh_unlock() in nfsd_link()

In the event that lookup_one_len() fails in nfsd_link(), fh_unlock() is
skipped and locks are held overlong.

Patch was tested on 2.6.17-rc2 by causing lookup_one_len() to fail and
verifying that fh_unlock() gets called appropriately.

Signed-off-by: David M. Richter <richterd@citi.umich.edu>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/vfs.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index b653ede37e6..f916b170e13 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1517,14 +1517,15 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
 			err = nfserrno(err);
 	}
 
-	fh_unlock(ffhp);
 	dput(dnew);
+out_unlock:
+	fh_unlock(ffhp);
 out:
 	return err;
 
 out_nfserr:
 	err = nfserrno(err);
-	goto out;
+	goto out_unlock;
 }
 
 /*
-- 
cgit v1.2.3


From ba5a6a19d83babe00be3711db3deee5c57587b8f Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Fri, 30 Jun 2006 01:56:16 -0700
Subject: [PATCH] knfsd: nfsd4: fix some open argument tests

These tests always returned true; clearly that wasn't what was intended.

In keeping with kernel style, make them functions instead of macros while
we're at it.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/nfs4state.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 4810577347c..591dc6ba6e1 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1237,8 +1237,15 @@ find_file(struct inode *ino)
 	return NULL;
 }
 
-#define TEST_ACCESS(x) ((x > 0 || x < 4)?1:0)
-#define TEST_DENY(x) ((x >= 0 || x < 5)?1:0)
+static int access_valid(u32 x)
+{
+	return (x > 0 && x < 4);
+}
+
+static int deny_valid(u32 x)
+{
+	return (x >= 0 && x < 5);
+}
 
 static void
 set_access(unsigned int *access, unsigned long bmap) {
@@ -1745,7 +1752,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	int status;
 
 	status = nfserr_inval;
-	if (!TEST_ACCESS(open->op_share_access) || !TEST_DENY(open->op_share_deny))
+	if (!access_valid(open->op_share_access)
+			|| !deny_valid(open->op_share_deny))
 		goto out;
 	/*
 	 * Lookup file; if found, lookup stateid and check open request,
@@ -2317,7 +2325,8 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct n
 			(int)current_fh->fh_dentry->d_name.len,
 			current_fh->fh_dentry->d_name.name);
 
-	if (!TEST_ACCESS(od->od_share_access) || !TEST_DENY(od->od_share_deny))
+	if (!access_valid(od->od_share_access)
+			|| !deny_valid(od->od_share_deny))
 		return nfserr_inval;
 
 	nfs4_lock_state();
-- 
cgit v1.2.3


From 9ecb6a08d84d0e795648d5add64f154bc406914b Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Fri, 30 Jun 2006 01:56:17 -0700
Subject: [PATCH] knfsd: nfsd4: fix open flag passing

Since nfsv4 actually keeps around the file descriptors it gets from open
(instead of just using them for a single read or write operation), we need to
make sure that we can do RDWR opens and not just RDONLY/WRONLY.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/nfs4state.c | 6 +++---
 fs/nfsd/vfs.c       | 5 ++++-
 2 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 591dc6ba6e1..9daa0b9feb8 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1790,10 +1790,10 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	} else {
 		/* Stateid was not found, this is a new OPEN */
 		int flags = 0;
+		if (open->op_share_access & NFS4_SHARE_ACCESS_READ)
+			flags |= MAY_READ;
 		if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
-			flags = MAY_WRITE;
-		else
-			flags = MAY_READ;
+			flags |= MAY_WRITE;
 		status = nfs4_new_open(rqstp, &stp, dp, current_fh, flags);
 		if (status)
 			goto out;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index f916b170e13..423e1ba0704 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -673,7 +673,10 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 		goto out_nfserr;
 
 	if (access & MAY_WRITE) {
-		flags = O_WRONLY|O_LARGEFILE;
+		if (access & MAY_READ)
+			flags = O_RDWR|O_LARGEFILE;
+		else
+			flags = O_WRONLY|O_LARGEFILE;
 
 		DQUOT_INIT(inode);
 	}
-- 
cgit v1.2.3


From 5c04c46aec16b3267d8fe03af886f2d41e448cd0 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Fri, 30 Jun 2006 01:56:19 -0700
Subject: [PATCH] knfsd: nfsd: mark rqstp to prevent use of sendfile in privacy
 case

Add a rq_sendfile_ok flag to svc_rqst which will be cleared in the privacy
case so that the wrapping code will get copies of the read data instead of
real page cache pages.  This makes life simpler when we encrypt the response.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/vfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 423e1ba0704..e170030d45d 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -837,7 +837,7 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	if (ra && ra->p_set)
 		file->f_ra = ra->p_ra;
 
-	if (file->f_op->sendfile) {
+	if (file->f_op->sendfile && rqstp->rq_sendfile_ok) {
 		svc_pushback_unused_pages(rqstp);
 		err = file->f_op->sendfile(file, &offset, *count,
 						 nfsd_read_actor, rqstp);
-- 
cgit v1.2.3


From 10e5dce07e6f8f9cea1b54161a888bb099484f88 Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Sat, 1 Jul 2006 04:36:24 -0700
Subject: [PATCH] ufs: truncate should allocate block for last byte

This patch fixes buggy behaviour of UFS
in such kind of scenario:
open(, O_TRUNC...)
ftruncate(, 1024)
ftruncate(, 0)

Such a scenario causes ufs_panic and remount read-only.  This happen
because of according to specification UFS should always allocate block for
last byte, and many parts of our implementation rely on this, but
`ufs_truncate' doesn't care about this.

To make possible return error code and to know about old size, this patch
removes `truncate' from ufs inode_operations and uses `setattr' method to
call ufs_truncate.

Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ufs/balloc.c   |  48 +-----------------
 fs/ufs/file.c     |   4 --
 fs/ufs/inode.c    |   7 ++-
 fs/ufs/truncate.c | 148 +++++++++++++++++++++++++++++++++++++++++++++++++-----
 fs/ufs/util.c     |  54 ++++++++++++++++++++
 fs/ufs/util.h     |   8 +++
 6 files changed, 204 insertions(+), 65 deletions(-)

(limited to 'fs')

diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 95b878e5c7a..b01804baa12 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -217,48 +217,6 @@ failed:
 	return;
 }
 
-static struct page *ufs_get_locked_page(struct address_space *mapping,
-				  unsigned long index)
-{
-	struct page *page;
-
-try_again:
-	page = find_lock_page(mapping, index);
-	if (!page) {
-		page = read_cache_page(mapping, index,
-				       (filler_t*)mapping->a_ops->readpage,
-				       NULL);
-		if (IS_ERR(page)) {
-			printk(KERN_ERR "ufs_change_blocknr: "
-			       "read_cache_page error: ino %lu, index: %lu\n",
-			       mapping->host->i_ino, index);
-			goto out;
-		}
-
-		lock_page(page);
-
-		if (!PageUptodate(page) || PageError(page)) {
-			unlock_page(page);
-			page_cache_release(page);
-
-			printk(KERN_ERR "ufs_change_blocknr: "
-			       "can not read page: ino %lu, index: %lu\n",
-			       mapping->host->i_ino, index);
-
-			page = ERR_PTR(-EIO);
-			goto out;
-		}
-	}
-
-	if (unlikely(!page->mapping || !page_has_buffers(page))) {
-		unlock_page(page);
-		page_cache_release(page);
-		goto try_again;/*we really need these buffers*/
-	}
-out:
-	return page;
-}
-
 /*
  * Modify inode page cache in such way:
  * have - blocks with b_blocknr equal to oldb...oldb+count-1
@@ -311,10 +269,8 @@ static void ufs_change_blocknr(struct inode *inode, unsigned int baseblk,
 
 		set_page_dirty(page);
 
-		if (likely(cur_index != index)) {
-			unlock_page(page);
-			page_cache_release(page);
-		}
+		if (likely(cur_index != index))
+			ufs_put_locked_page(page);
  	}
 	UFSD("EXIT\n");
 }
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 0e5001512a9..a9c6e5f04fa 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -60,7 +60,3 @@ const struct file_operations ufs_file_operations = {
 	.fsync		= ufs_sync_file,
 	.sendfile	= generic_file_sendfile,
 };
-
-struct inode_operations ufs_file_inode_operations = {
-	.truncate	= ufs_truncate,
-};
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 488b5ff48af..e7c8615beb6 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -843,14 +843,17 @@ int ufs_sync_inode (struct inode *inode)
 
 void ufs_delete_inode (struct inode * inode)
 {
+	loff_t old_i_size;
+
 	truncate_inode_pages(&inode->i_data, 0);
 	/*UFS_I(inode)->i_dtime = CURRENT_TIME;*/
 	lock_kernel();
 	mark_inode_dirty(inode);
 	ufs_update_inode(inode, IS_SYNC(inode));
+	old_i_size = inode->i_size;
 	inode->i_size = 0;
-	if (inode->i_blocks)
-		ufs_truncate (inode);
+	if (inode->i_blocks && ufs_truncate(inode, old_i_size))
+		ufs_warning(inode->i_sb, __FUNCTION__, "ufs_truncate failed\n");
 	ufs_free_inode (inode);
 	unlock_kernel();
 }
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 3c3b301f870..c9b55872079 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -369,24 +369,97 @@ static int ufs_trunc_tindirect (struct inode * inode)
 	UFSD("EXIT\n");
 	return retry;
 }
-		
-void ufs_truncate (struct inode * inode)
+
+static int ufs_alloc_lastblock(struct inode *inode)
 {
+	int err = 0;
+	struct address_space *mapping = inode->i_mapping;
+	struct ufs_sb_private_info *uspi = UFS_SB(inode->i_sb)->s_uspi;
 	struct ufs_inode_info *ufsi = UFS_I(inode);
-	struct super_block * sb;
-	struct ufs_sb_private_info * uspi;
-	int retry;
+	unsigned lastfrag, i, end;
+	struct page *lastpage;
+	struct buffer_head *bh;
+
+	lastfrag = (i_size_read(inode) + uspi->s_fsize - 1) >> uspi->s_fshift;
+
+	if (!lastfrag) {
+		ufsi->i_lastfrag = 0;
+		goto out;
+	}
+	lastfrag--;
+
+	lastpage = ufs_get_locked_page(mapping, lastfrag >>
+				       (PAGE_CACHE_SHIFT - inode->i_blkbits));
+       if (IS_ERR(lastpage)) {
+               err = -EIO;
+               goto out;
+       }
+
+       end = lastfrag & ((1 << (PAGE_CACHE_SHIFT - inode->i_blkbits)) - 1);
+       bh = page_buffers(lastpage);
+       for (i = 0; i < end; ++i)
+               bh = bh->b_this_page;
+
+       if (!buffer_mapped(bh)) {
+               err = ufs_getfrag_block(inode, lastfrag, bh, 1);
+
+               if (unlikely(err))
+                       goto out_unlock;
+
+               if (buffer_new(bh)) {
+                       clear_buffer_new(bh);
+                       unmap_underlying_metadata(bh->b_bdev,
+						 bh->b_blocknr);
+		       /*
+			* we do not zeroize fragment, because of
+			* if it maped to hole, it already contains zeroes
+			*/
+                       set_buffer_uptodate(bh);
+                       mark_buffer_dirty(bh);
+                       set_page_dirty(lastpage);
+               }
+       }
+out_unlock:
+       ufs_put_locked_page(lastpage);
+out:
+       return err;
+}
+
+int ufs_truncate(struct inode *inode, loff_t old_i_size)
+{
+	struct ufs_inode_info *ufsi = UFS_I(inode);
+	struct super_block *sb = inode->i_sb;
+	struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+	int retry, err = 0;
 	
 	UFSD("ENTER\n");
-	sb = inode->i_sb;
-	uspi = UFS_SB(sb)->s_uspi;
 
-	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)))
-		return;
+	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+	      S_ISLNK(inode->i_mode)))
+		return -EINVAL;
 	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
-		return;
+		return -EPERM;
+
+	if (inode->i_size > old_i_size) {
+		/*
+		 * if we expand file we should care about
+		 * allocation of block for last byte first of all
+		 */
+		err = ufs_alloc_lastblock(inode);
+
+		if (err) {
+			i_size_write(inode, old_i_size);
+			goto out;
+		}
+		/*
+		 * go away, because of we expand file, and we do not
+		 * need free blocks, and zeroizes page
+		 */
+		lock_kernel();
+		goto almost_end;
+	}
 
-	block_truncate_page(inode->i_mapping,	inode->i_size, ufs_getfrag_block);
+	block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block);
 
 	lock_kernel();
 	while (1) {
@@ -404,9 +477,58 @@ void ufs_truncate (struct inode * inode)
 		yield();
 	}
 
+	if (inode->i_size < old_i_size) {
+		/*
+		 * now we should have enough space
+		 * to allocate block for last byte
+		 */
+		err = ufs_alloc_lastblock(inode);
+		if (err)
+			/*
+			 * looks like all the same - we have no space,
+			 * but we truncate file already
+			 */
+			inode->i_size = (ufsi->i_lastfrag - 1) * uspi->s_fsize;
+	}
+almost_end:
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
-	ufsi->i_lastfrag = DIRECT_FRAGMENT;
 	unlock_kernel();
 	mark_inode_dirty(inode);
-	UFSD("EXIT\n");
+out:
+	UFSD("EXIT: err %d\n", err);
+	return err;
 }
+
+
+/*
+ * We don't define our `inode->i_op->truncate', and call it here,
+ * because of:
+ * - there is no way to know old size
+ * - there is no way inform user about error, if it happens in `truncate'
+ */
+static int ufs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	unsigned int ia_valid = attr->ia_valid;
+	int error;
+
+	error = inode_change_ok(inode, attr);
+	if (error)
+		return error;
+
+	if (ia_valid & ATTR_SIZE &&
+	    attr->ia_size != i_size_read(inode)) {
+		loff_t old_i_size = inode->i_size;
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+		error = ufs_truncate(inode, old_i_size);
+		if (error)
+			return error;
+	}
+	return inode_setattr(inode, attr);
+}
+
+struct inode_operations ufs_file_inode_operations = {
+	.setattr = ufs_setattr,
+};
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index a2f13f45708..337cf2c46d1 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -233,3 +233,57 @@ ufs_set_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi, dev_t dev
 	else
 		ufsi->i_u1.i_data[0] = fs32;
 }
+
+/**
+ * ufs_get_locked_page() - locate, pin and lock a pagecache page, if not exist
+ * read it from disk.
+ * @mapping: the address_space to search
+ * @index: the page index
+ *
+ * Locates the desired pagecache page, if not exist we'll read it,
+ * locks it, increments its reference
+ * count and returns its address.
+ *
+ */
+
+struct page *ufs_get_locked_page(struct address_space *mapping,
+				 pgoff_t index)
+{
+	struct page *page;
+
+try_again:
+	page = find_lock_page(mapping, index);
+	if (!page) {
+		page = read_cache_page(mapping, index,
+				       (filler_t*)mapping->a_ops->readpage,
+				       NULL);
+		if (IS_ERR(page)) {
+			printk(KERN_ERR "ufs_change_blocknr: "
+			       "read_cache_page error: ino %lu, index: %lu\n",
+			       mapping->host->i_ino, index);
+			goto out;
+		}
+
+		lock_page(page);
+
+		if (!PageUptodate(page) || PageError(page)) {
+			unlock_page(page);
+			page_cache_release(page);
+
+			printk(KERN_ERR "ufs_change_blocknr: "
+			       "can not read page: ino %lu, index: %lu\n",
+			       mapping->host->i_ino, index);
+
+			page = ERR_PTR(-EIO);
+			goto out;
+		}
+	}
+
+	if (unlikely(!page->mapping || !page_has_buffers(page))) {
+		unlock_page(page);
+		page_cache_release(page);
+		goto try_again;/*we really need these buffers*/
+	}
+out:
+	return page;
+}
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 406981fff5e..28fce6c239b 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -251,6 +251,14 @@ extern void _ubh_ubhcpymem_(struct ufs_sb_private_info *, unsigned char *, struc
 #define ubh_memcpyubh(ubh,mem,size) _ubh_memcpyubh_(uspi,ubh,mem,size)
 extern void _ubh_memcpyubh_(struct ufs_sb_private_info *, struct ufs_buffer_head *, unsigned char *, unsigned);
 
+/* This functions works with cache pages*/
+extern struct page *ufs_get_locked_page(struct address_space *mapping,
+					pgoff_t index);
+static inline void ufs_put_locked_page(struct page *page)
+{
+       unlock_page(page);
+       page_cache_release(page);
+}
 
 
 /*
-- 
cgit v1.2.3


From dd535a5965cef7551da34aef5cec47f46e97b6d9 Mon Sep 17 00:00:00 2001
From: Vladimir Saveliev <vs@namesys.com>
Date: Sat, 1 Jul 2006 04:36:32 -0700
Subject: [PATCH] reiserfs: update ctime and mtime on expanding truncate

Reiserfs does not update ctime and mtime on expanding truncate via
truncate().  This patch fixes it.

Signed-off-by: Vladimir Saveliev <vs@namesys.com>
Cc: Hans Reiser <reiser@namesys.com>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Cc: Chris Mason <mason@suse.com>
Cc: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/reiserfs/inode.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 477eaa2fcf6..12dfdcfbee3 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2932,6 +2932,11 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
 			}
 			if (error)
 				goto out;
+			/*
+			 * file size is changed, ctime and mtime are
+			 * to be updated
+			 */
+			attr->ia_valid |= (ATTR_MTIME | ATTR_CTIME);
 		}
 	}
 
-- 
cgit v1.2.3


From 4ebd9ab387b39c44165cd296a6637082a4f0f66a Mon Sep 17 00:00:00 2001
From: Dominik Hackl <dominik@hackl.dhs.org>
Date: Sun, 2 Jul 2006 17:29:26 +0200
Subject: [PATCH] nfs: non-procfs build fix

This fixes a bug in fs/nfs which makes it impossible to build nfs
without having procfs enabled.

Signed-off-by: Dominik Hackl <dominik@hackl.dhs.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfs/internal.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 4fe51c1292b..e4f4e5def0f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -81,9 +81,9 @@ extern struct file_system_type clone_nfs_fs_type;
 #ifdef CONFIG_NFS_V4
 extern struct file_system_type clone_nfs4_fs_type;
 #endif
-#ifdef CONFIG_PROC_FS
+
 extern struct rpc_stat nfs_rpcstat;
-#endif
+
 extern int __init register_nfs_fs(void);
 extern void __exit unregister_nfs_fs(void);
 
-- 
cgit v1.2.3