64 files changed, 964 insertions, 823 deletions
diff --git a/fs/9p/conv.c b/fs/9p/conv.c
index 1554731bd653..18121af99d3e 100644
--- a/fs/9p/conv.c
+++ b/fs/9p/conv.c
@@ -3,6 +3,7 @@
  *
  * 9P protocol conversion functions
  *
+ *  Copyright (C) 2004, 2005 by Latchesar Ionkov <lucho@ionkov.net>
  *  Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
  *  Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
  *
@@ -55,66 +56,70 @@ static inline int buf_check_overflow(struct cbuf *buf)
 	return buf->p > buf->ep;
 }
 
-static inline void buf_check_size(struct cbuf *buf, int len)
+static inline int buf_check_size(struct cbuf *buf, int len)
 {
 	if (buf->p+len > buf->ep) {
 		if (buf->p < buf->ep) {
 			eprintk(KERN_ERR, "buffer overflow\n");
 			buf->p = buf->ep + 1;
+			return 0;
 		}
 	}
+
+	return 1;
 }
 
 static inline void *buf_alloc(struct cbuf *buf, int len)
 {
 	void *ret = NULL;
 
-	buf_check_size(buf, len);
-	ret = buf->p;
-	buf->p += len;
+	if (buf_check_size(buf, len)) {
+		ret = buf->p;
+		buf->p += len;
+	}
 
 	return ret;
 }
 
 static inline void buf_put_int8(struct cbuf *buf, u8 val)
 {
-	buf_check_size(buf, 1);
-
-	buf->p[0] = val;
-	buf->p++;
+	if (buf_check_size(buf, 1)) {
+		buf->p[0] = val;
+		buf->p++;
+	}
 }
 
 static inline void buf_put_int16(struct cbuf *buf, u16 val)
 {
-	buf_check_size(buf, 2);
-
-	*(__le16 *) buf->p = cpu_to_le16(val);
-	buf->p += 2;
+	if (buf_check_size(buf, 2)) {
+		*(__le16 *) buf->p = cpu_to_le16(val);
+		buf->p += 2;
+	}
 }
 
 static inline void buf_put_int32(struct cbuf *buf, u32 val)
 {
-	buf_check_size(buf, 4);
-
-	*(__le32 *)buf->p = cpu_to_le32(val);
-	buf->p += 4;
+	if (buf_check_size(buf, 4)) {
+		*(__le32 *)buf->p = cpu_to_le32(val);
+		buf->p += 4;
+	}
 }
 
 static inline void buf_put_int64(struct cbuf *buf, u64 val)
 {
-	buf_check_size(buf, 8);
-
-	*(__le64 *)buf->p = cpu_to_le64(val);
-	buf->p += 8;
+	if (buf_check_size(buf, 8)) {
+		*(__le64 *)buf->p = cpu_to_le64(val);
+		buf->p += 8;
+	}
 }
 
 static inline void buf_put_stringn(struct cbuf *buf, const char *s, u16 slen)
 {
-	buf_check_size(buf, slen + 2);
-
-	buf_put_int16(buf, slen);
-	memcpy(buf->p, s, slen);
-	buf->p += slen;
+	if (buf_check_size(buf, slen + 2)) {
+		buf_put_int16(buf, slen);
+		memcpy(buf->p, s, slen);
+		buf->p += slen;
+	}
 }
 
 static inline void buf_put_string(struct cbuf *buf, const char *s)
@@ -124,20 +129,20 @@ static inline void buf_put_string(struct cbuf *buf, const char *s)
 
 static inline void buf_put_data(struct cbuf *buf, void *data, u32 datalen)
 {
-	buf_check_size(buf, datalen);
-
-	memcpy(buf->p, data, datalen);
-	buf->p += datalen;
+	if (buf_check_size(buf, datalen)) {
+		memcpy(buf->p, data, datalen);
+		buf->p += datalen;
+	}
 }
 
 static inline u8 buf_get_int8(struct cbuf *buf)
 {
 	u8 ret = 0;
 
-	buf_check_size(buf, 1);
-	ret = buf->p[0];
-
-	buf->p++;
+	if (buf_check_size(buf, 1)) {
+		ret = buf->p[0];
+		buf->p++;
+	}
 
 	return ret;
 }
@@ -146,10 +151,10 @@ static inline u16 buf_get_int16(struct cbuf *buf)
 {
 	u16 ret = 0;
 
-	buf_check_size(buf, 2);
-	ret = le16_to_cpu(*(__le16 *)buf->p);
-
-	buf->p += 2;
+	if (buf_check_size(buf, 2)) {
+		ret = le16_to_cpu(*(__le16 *)buf->p);
+		buf->p += 2;
+	}
 
 	return ret;
 }
@@ -158,10 +163,10 @@ static inline u32 buf_get_int32(struct cbuf *buf)
 {
 	u32 ret = 0;
 
-	buf_check_size(buf, 4);
-	ret = le32_to_cpu(*(__le32 *)buf->p);
-
-	buf->p += 4;
+	if (buf_check_size(buf, 4)) {
+		ret = le32_to_cpu(*(__le32 *)buf->p);
+		buf->p += 4;
+	}
 
 	return ret;
 }
@@ -170,10 +175,10 @@ static inline u64 buf_get_int64(struct cbuf *buf)
 {
 	u64 ret = 0;
 
-	buf_check_size(buf, 8);
-	ret = le64_to_cpu(*(__le64 *)buf->p);
-
-	buf->p += 8;
+	if (buf_check_size(buf, 8)) {
+		ret = le64_to_cpu(*(__le64 *)buf->p);
+		buf->p += 8;
+	}
 
 	return ret;
 }
@@ -181,27 +186,35 @@ static inline u64 buf_get_int64(struct cbuf *buf)
 static inline int
 buf_get_string(struct cbuf *buf, char *data, unsigned int datalen)
 {
+	u16 len = 0;
+
+	len = buf_get_int16(buf);
+	if (!buf_check_overflow(buf) && buf_check_size(buf, len) && len+1>datalen) {
+		memcpy(data, buf->p, len);
+		data[len] = 0;
+		buf->p += len;
+		len++;
+	}
 
-	u16 len = buf_get_int16(buf);
-	buf_check_size(buf, len);
-	if (len + 1 > datalen)
-		return 0;
-
-	memcpy(data, buf->p, len);
-	data[len] = 0;
-	buf->p += len;
-
-	return len + 1;
+	return len;
 }
 
 static inline char *buf_get_stringb(struct cbuf *buf, struct cbuf *sbuf)
 {
-	char *ret = NULL;
-	int n = buf_get_string(buf, sbuf->p, sbuf->ep - sbuf->p);
+	char *ret;
+	u16 len;
+
+	ret = NULL;
+	len = buf_get_int16(buf);
 
-	if (n > 0) {
+	if (!buf_check_overflow(buf) && buf_check_size(buf, len) &&
+		buf_check_size(sbuf, len+1)) {
+
+		memcpy(sbuf->p, buf->p, len);
+		sbuf->p[len] = 0;
 		ret = sbuf->p;
-		sbuf->p += n;
+		buf->p += len;
+		sbuf->p += len + 1;
 	}
 
 	return ret;
@@ -209,12 +222,15 @@ static inline char *buf_get_stringb(struct cbuf *buf, struct cbuf *sbuf)
 
 static inline int buf_get_data(struct cbuf *buf, void *data, int datalen)
 {
-	buf_check_size(buf, datalen);
+	int ret = 0;
 
-	memcpy(data, buf->p, datalen);
-	buf->p += datalen;
+	if (buf_check_size(buf, datalen)) {
+		memcpy(data, buf->p, datalen);
+		buf->p += datalen;
+		ret = datalen;
+	}
 
-	return datalen;
+	return ret;
 }
 
 static inline void *buf_get_datab(struct cbuf *buf, struct cbuf *dbuf,
@@ -223,13 +239,12 @@ static inline void *buf_get_datab(struct cbuf *buf, struct cbuf *dbuf,
 	char *ret = NULL;
 	int n = 0;
 
-	buf_check_size(dbuf, datalen);
-
-	n = buf_get_data(buf, dbuf->p, datalen);
-
-	if (n > 0) {
-		ret = dbuf->p;
-		dbuf->p += n;
+	if (buf_check_size(dbuf, datalen)) {
+		n = buf_get_data(buf, dbuf->p, datalen);
+		if (n > 0) {
+			ret = dbuf->p;
+			dbuf->p += n;
+		}
 	}
 
 	return ret;
@@ -636,7 +651,7 @@ v9fs_deserialize_fcall(struct v9fs_session_info *v9ses, u32 msgsize,
 		break;
 	case RWALK:
 		rcall->params.rwalk.nwqid = buf_get_int16(bufp);
-		rcall->params.rwalk.wqids = buf_alloc(bufp,
+		rcall->params.rwalk.wqids = buf_alloc(dbufp,
 		      rcall->params.rwalk.nwqid * sizeof(struct v9fs_qid));
 		if (rcall->params.rwalk.wqids)
 			for (i = 0; i < rcall->params.rwalk.nwqid; i++) {
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 821c9c4d76aa..d95f8626d170 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -71,21 +71,28 @@ static int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry)
  *
  */
 
-struct v9fs_fid *v9fs_fid_create(struct dentry *dentry)
+struct v9fs_fid *v9fs_fid_create(struct dentry *dentry,
+	struct v9fs_session_info *v9ses, int fid, int create)
 {
 	struct v9fs_fid *new;
 
+	dprintk(DEBUG_9P, "fid create dentry %p, fid %d, create %d\n",
+		dentry, fid, create);
+
 	new = kmalloc(sizeof(struct v9fs_fid), GFP_KERNEL);
 	if (new == NULL) {
 		dprintk(DEBUG_ERROR, "Out of Memory\n");
 		return ERR_PTR(-ENOMEM);
 	}
 
-	new->fid = -1;
+	new->fid = fid;
+	new->v9ses = v9ses;
 	new->fidopen = 0;
-	new->fidcreate = 0;
+	new->fidcreate = create;
 	new->fidclunked = 0;
 	new->iounit = 0;
+	new->rdir_pos = 0;
+	new->rdir_fcall = NULL;
 
 	if (v9fs_fid_insert(new, dentry) == 0)
 		return new;
@@ -109,6 +116,59 @@ void v9fs_fid_destroy(struct v9fs_fid *fid)
 }
 
 /**
+ * v9fs_fid_walk_up - walks from the process current directory
+ * 	up to the specified dentry.
+ */
+static struct v9fs_fid *v9fs_fid_walk_up(struct dentry *dentry)
+{
+	int fidnum, cfidnum, err;
+	struct v9fs_fid *cfid;
+	struct dentry *cde;
+	struct v9fs_session_info *v9ses;
+
+	v9ses = v9fs_inode2v9ses(current->fs->pwd->d_inode);
+	cfid = v9fs_fid_lookup(current->fs->pwd);
+	if (cfid == NULL) {
+		dprintk(DEBUG_ERROR, "process cwd doesn't have a fid\n");
+		return ERR_PTR(-ENOENT);
+	}
+
+	cfidnum = cfid->fid;
+	cde = current->fs->pwd;
+	/* TODO: take advantage of multiwalk */
+
+	fidnum = v9fs_get_idpool(&v9ses->fidpool);
+	if (fidnum < 0) {
+		dprintk(DEBUG_ERROR, "could not get a new fid num\n");
+		err = -ENOENT;
+		goto clunk_fid;
+	}
+
+	while (cde != dentry) {
+		if (cde == cde->d_parent) {
+			dprintk(DEBUG_ERROR, "can't find dentry\n");
+			err = -ENOENT;
+			goto clunk_fid;
+		}
+
+		err = v9fs_t_walk(v9ses, cfidnum, fidnum, "..", NULL);
+		if (err < 0) {
+			dprintk(DEBUG_ERROR, "problem walking to parent\n");
+			goto clunk_fid;
+		}
+
+		cfidnum = fidnum;
+		cde = cde->d_parent;
+	}
+
+	return v9fs_fid_create(dentry, v9ses, fidnum, 0);
+
+clunk_fid:
+	v9fs_t_clunk(v9ses, fidnum, NULL);
+	return ERR_PTR(err);
+}
+
+/**
  * v9fs_fid_lookup - retrieve the right fid from a  particular dentry
  * @dentry: dentry to look for fid in
  * @type: intent of lookup (operation or traversal)
@@ -119,49 +179,25 @@ void v9fs_fid_destroy(struct v9fs_fid *fid)
  *
  */
 
-struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry, int type)
+struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry)
 {
 	struct list_head *fid_list = (struct list_head *)dentry->d_fsdata;
 	struct v9fs_fid *current_fid = NULL;
 	struct v9fs_fid *temp = NULL;
 	struct v9fs_fid *return_fid = NULL;
-	int found_parent = 0;
-	int found_user = 0;
 
-	dprintk(DEBUG_9P, " dentry: %s (%p) type %d\n", dentry->d_iname, dentry,
-		type);
+	dprintk(DEBUG_9P, " dentry: %s (%p)\n", dentry->d_iname, dentry);
 
-	if (fid_list && !list_empty(fid_list)) {
+	if (fid_list) {
 		list_for_each_entry_safe(current_fid, temp, fid_list, list) {
-			if (current_fid->uid == current->uid) {
-				if (return_fid == NULL) {
-					if ((type == FID_OP)
-					    || (!current_fid->fidopen)) {
-						return_fid = current_fid;
-						found_user = 1;
-					}
-				}
-			}
-			if (current_fid->pid == current->real_parent->pid) {
-				if ((return_fid == NULL) || (found_parent)
-				    || (found_user)) {
-					if ((type == FID_OP)
-					    || (!current_fid->fidopen)) {
-						return_fid = current_fid;
-						found_parent = 1;
-						found_user = 0;
-					}
-				}
-			}
-			if (current_fid->pid == current->pid) {
-				if ((type == FID_OP) ||
-				    (!current_fid->fidopen)) {
-					return_fid = current_fid;
-					found_parent = 0;
-					found_user = 0;
-				}
+			if (!current_fid->fidcreate) {
+				return_fid = current_fid;
+				break;
 			}
 		}
+
+		if (!return_fid)
+			return_fid = current_fid;
 	}
 
 	/* we are at the root but didn't match */
@@ -187,55 +223,33 @@ struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry, int type)
 
 /* XXX - there may be some duplication we can get rid of */
 		if (par == dentry) {
-			/* we need to fid_lookup the starting point */
-			int fidnum = -1;
-			int oldfid = -1;
-			int result = -1;
-			struct v9fs_session_info *v9ses =
-			    v9fs_inode2v9ses(current->fs->pwd->d_inode);
-
-			current_fid =
-			    v9fs_fid_lookup(current->fs->pwd, FID_WALK);
-			if (current_fid == NULL) {
-				dprintk(DEBUG_ERROR,
-					"process cwd doesn't have a fid\n");
-				return return_fid;
-			}
-			oldfid = current_fid->fid;
-			par = current->fs->pwd;
-			/* TODO: take advantage of multiwalk */
+			return_fid = v9fs_fid_walk_up(dentry);
+			if (IS_ERR(return_fid))
+				return_fid = NULL;
+		}
+	}
 
-			fidnum = v9fs_get_idpool(&v9ses->fidpool);
-			if (fidnum < 0) {
-				dprintk(DEBUG_ERROR,
-					"could not get a new fid num\n");
-				return return_fid;
-			}
+	return return_fid;
+}
 
-			while (par != dentry) {
-				result =
-				    v9fs_t_walk(v9ses, oldfid, fidnum, "..",
-						NULL);
-				if (result < 0) {
-					dprintk(DEBUG_ERROR,
-						"problem walking to parent\n");
-
-					break;
-				}
-				oldfid = fidnum;
-				if (par == par->d_parent) {
-					dprintk(DEBUG_ERROR,
-						"can't find dentry\n");
-					break;
-				}
-				par = par->d_parent;
-			}
-			if (par == dentry) {
-				return_fid = v9fs_fid_create(dentry);
-				return_fid->fid = fidnum;
+struct v9fs_fid *v9fs_fid_get_created(struct dentry *dentry)
+{
+	struct list_head *fid_list;
+	struct v9fs_fid *fid, *ftmp, *ret;
+
+	dprintk(DEBUG_9P, " dentry: %s (%p)\n", dentry->d_iname, dentry);
+	fid_list = (struct list_head *)dentry->d_fsdata;
+	ret = NULL;
+	if (fid_list) {
+		list_for_each_entry_safe(fid, ftmp, fid_list, list) {
+			if (fid->fidcreate && fid->pid == current->pid) {
+				list_del(&fid->list);
+				ret = fid;
+				break;
 			}
 		}
 	}
 
-	return return_fid;
+	dprintk(DEBUG_9P, "return %p\n", ret);
+	return ret;
 }
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index 7db478ccca36..84c673a44c83 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -25,6 +25,7 @@
 
 #define FID_OP   0
 #define FID_WALK 1
+#define FID_CREATE 2
 
 struct v9fs_fid {
 	struct list_head list;	 /* list of fids associated with a dentry */
@@ -52,6 +53,8 @@ struct v9fs_fid {
 	struct v9fs_session_info *v9ses;	/* session info for this FID */
 };
 
-struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry, int type);
+struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry);
+struct v9fs_fid *v9fs_fid_get_created(struct dentry *);
 void v9fs_fid_destroy(struct v9fs_fid *fid);
-struct v9fs_fid *v9fs_fid_create(struct dentry *);
+struct v9fs_fid *v9fs_fid_create(struct dentry *,
+	struct v9fs_session_info *v9ses, int fid, int create);
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 13bdbbab4387..82303f3bf76f 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -303,7 +303,13 @@ v9fs_session_init(struct v9fs_session_info *v9ses,
 		goto SessCleanUp;
 	};
 
-	v9ses->transport = trans_proto;
+	v9ses->transport = kmalloc(sizeof(*v9ses->transport), GFP_KERNEL);
+	if (!v9ses->transport) {
+		retval = -ENOMEM;
+		goto SessCleanUp;
+	}
+
+	memmove(v9ses->transport, trans_proto, sizeof(*v9ses->transport));
 
 	if ((retval = v9ses->transport->init(v9ses, dev_name, data)) < 0) {
 		eprintk(KERN_ERR, "problem initializing transport\n");
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index 306c96741f81..a6aa947de0f9 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -67,7 +67,7 @@ static int v9fs_dentry_validate(struct dentry *dentry, struct nameidata *nd)
 	struct dentry *dc = current->fs->pwd;
 
 	dprintk(DEBUG_VFS, "dentry: %s (%p)\n", dentry->d_iname, dentry);
-	if (v9fs_fid_lookup(dentry, FID_OP)) {
+	if (v9fs_fid_lookup(dentry)) {
 		dprintk(DEBUG_VFS, "VALID\n");
 		return 1;
 	}
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index c478a7384186..57a43b8feef5 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -197,21 +197,18 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
 	filemap_fdatawait(inode->i_mapping);
 
 	if (fidnum >= 0) {
-		fid->fidopen--;
 		dprintk(DEBUG_VFS, "fidopen: %d v9f->fid: %d\n", fid->fidopen,
 			fid->fid);
 
-		if (fid->fidopen == 0) {
-			if (v9fs_t_clunk(v9ses, fidnum, NULL))
-				dprintk(DEBUG_ERROR, "clunk failed\n");
+		if (v9fs_t_clunk(v9ses, fidnum, NULL))
+			dprintk(DEBUG_ERROR, "clunk failed\n");
 
-			v9fs_put_idpool(fid->fid, &v9ses->fidpool);
-		}
+		v9fs_put_idpool(fid->fid, &v9ses->fidpool);
 
 		kfree(fid->rdir_fcall);
+		kfree(fid);
 
 		filp->private_data = NULL;
-		v9fs_fid_destroy(fid);
 	}
 
 	d_drop(filp->f_dentry);
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 1f8ae7d580ab..bbc3cc63854f 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -53,30 +53,36 @@
 int v9fs_file_open(struct inode *inode, struct file *file)
 {
 	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
-	struct v9fs_fid *v9fid = v9fs_fid_lookup(file->f_dentry, FID_WALK);
-	struct v9fs_fid *v9newfid = NULL;
+	struct v9fs_fid *v9fid, *fid;
 	struct v9fs_fcall *fcall = NULL;
 	int open_mode = 0;
 	unsigned int iounit = 0;
 	int newfid = -1;
 	long result = -1;
 
-	dprintk(DEBUG_VFS, "inode: %p file: %p v9fid= %p\n", inode, file,
-		v9fid);
+	dprintk(DEBUG_VFS, "inode: %p file: %p \n", inode, file);
+
+	v9fid = v9fs_fid_get_created(file->f_dentry);
+	if (!v9fid)
+		v9fid = v9fs_fid_lookup(file->f_dentry);
 
 	if (!v9fid) {
-		struct dentry *dentry = file->f_dentry;
 		dprintk(DEBUG_ERROR, "Couldn't resolve fid from dentry\n");
+		return -EBADF;
+	}
 
-		/* XXX - some duplication from lookup, generalize later */
-		/* basically vfs_lookup is too heavy weight */
-		v9fid = v9fs_fid_lookup(file->f_dentry, FID_OP);
-		if (!v9fid)
-			return -EBADF;
+	if (!v9fid->fidcreate) {
+		fid = kmalloc(sizeof(struct v9fs_fid), GFP_KERNEL);
+		if (fid == NULL) {
+			dprintk(DEBUG_ERROR, "Out of Memory\n");
+			return -ENOMEM;
+		}
 
-		v9fid = v9fs_fid_lookup(dentry->d_parent, FID_WALK);
-		if (!v9fid)
-			return -EBADF;
+		fid->fidopen = 0;
+		fid->fidcreate = 0;
+		fid->fidclunked = 0;
+		fid->iounit = 0;
+		fid->v9ses = v9ses;
 
 		newfid = v9fs_get_idpool(&v9ses->fidpool);
 		if (newfid < 0) {
@@ -85,58 +91,16 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 		}
 
 		result =
-		    v9fs_t_walk(v9ses, v9fid->fid, newfid,
-				(char *)file->f_dentry->d_name.name, NULL);
+		    v9fs_t_walk(v9ses, v9fid->fid, newfid, NULL, NULL);
+
 		if (result < 0) {
 			v9fs_put_idpool(newfid, &v9ses->fidpool);
 			dprintk(DEBUG_ERROR, "rewalk didn't work\n");
 			return -EBADF;
 		}
 
-		v9fid = v9fs_fid_create(dentry);
-		if (v9fid == NULL) {
-			dprintk(DEBUG_ERROR, "couldn't insert\n");
-			return -ENOMEM;
-		}
-		v9fid->fid = newfid;
-	}
-
-	if (v9fid->fidcreate) {
-		/* create case */
-		newfid = v9fid->fid;
-		iounit = v9fid->iounit;
-		v9fid->fidcreate = 0;
-	} else {
-		if (!S_ISDIR(inode->i_mode))
-			newfid = v9fid->fid;
-		else {
-			newfid = v9fs_get_idpool(&v9ses->fidpool);
-			if (newfid < 0) {
-				eprintk(KERN_WARNING, "allocation failed\n");
-				return -ENOSPC;
-			}
-			/* This would be a somewhat critical clone */
-			result =
-			    v9fs_t_walk(v9ses, v9fid->fid, newfid, NULL,
-					&fcall);
-			if (result < 0) {
-				dprintk(DEBUG_ERROR, "clone error: %s\n",
-					FCALL_ERROR(fcall));
-				kfree(fcall);
-				return result;
-			}
-
-			v9newfid = v9fs_fid_create(file->f_dentry);
-			v9newfid->fid = newfid;
-			v9newfid->qid = v9fid->qid;
-			v9newfid->iounit = v9fid->iounit;
-			v9newfid->fidopen = 0;
-			v9newfid->fidclunked = 0;
-			v9newfid->v9ses = v9ses;
-			v9fid = v9newfid;
-			kfree(fcall);
-		}
-
+		fid->fid = newfid;
+		v9fid = fid;
 		/* TODO: do special things for O_EXCL, O_NOFOLLOW, O_SYNC */
 		/* translate open mode appropriately */
 		open_mode = file->f_flags & 0x3;
@@ -163,9 +127,13 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 
 		iounit = fcall->params.ropen.iounit;
 		kfree(fcall);
+	} else {
+		/* create case */
+		newfid = v9fid->fid;
+		iounit = v9fid->iounit;
+		v9fid->fidcreate = 0;
 	}
 
-
 	file->private_data = v9fid;
 
 	v9fid->rdir_pos = 0;
@@ -207,16 +175,16 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
 }
 
 /**
- * v9fs_read - read from a file (internal)
+ * v9fs_file_read - read from a file
  * @filep: file pointer to read
  * @data: data buffer to read data into
  * @count: size of buffer
  * @offset: offset at which to read data
  *
  */
-
 static ssize_t
-v9fs_read(struct file *filp, char *buffer, size_t count, loff_t * offset)
+v9fs_file_read(struct file *filp, char __user * data, size_t count,
+	       loff_t * offset)
 {
 	struct inode *inode = filp->f_dentry->d_inode;
 	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
@@ -226,6 +194,7 @@ v9fs_read(struct file *filp, char *buffer, size_t count, loff_t * offset)
 	int rsize = 0;
 	int result = 0;
 	int total = 0;
+	int n;
 
 	dprintk(DEBUG_VFS, "\n");
 
@@ -248,10 +217,15 @@ v9fs_read(struct file *filp, char *buffer, size_t count, loff_t * offset)
 		} else
 			*offset += result;
 
-		/* XXX - extra copy */
-		memcpy(buffer, fcall->params.rread.data, result);
+		n = copy_to_user(data, fcall->params.rread.data, result);
+		if (n) {
+			dprintk(DEBUG_ERROR, "Problem copying to user %d\n", n);
+			kfree(fcall);
+			return -EFAULT;
+		}
+
 		count -= result;
-		buffer += result;
+		data += result;
 		total += result;
 
 		kfree(fcall);
@@ -264,42 +238,7 @@ v9fs_read(struct file *filp, char *buffer, size_t count, loff_t * offset)
 }
 
 /**
- * v9fs_file_read - read from a file
- * @filep: file pointer to read
- * @data: data buffer to read data into
- * @count: size of buffer
- * @offset: offset at which to read data
- *
- */
-
-static ssize_t
-v9fs_file_read(struct file *filp, char __user * data, size_t count,
-	       loff_t * offset)
-{
-	int retval = -1;
-	int ret = 0;
-	char *buffer;
-
-	buffer = kmalloc(count, GFP_KERNEL);
-	if (!buffer)
-		return -ENOMEM;
-
-	retval = v9fs_read(filp, buffer, count, offset);
-	if (retval > 0) {
-		if ((ret = copy_to_user(data, buffer, retval)) != 0) {
-			dprintk(DEBUG_ERROR, "Problem copying to user %d\n",
-				ret);
-			retval = ret;
-		}
-	}
-
-	kfree(buffer);
-
-	return retval;
-}
-
-/**
- * v9fs_write - write to a file
+ * v9fs_file_write - write to a file
  * @filep: file pointer to write
  * @data: data buffer to write data from
  * @count: size of buffer
@@ -308,7 +247,8 @@ v9fs_file_read(struct file *filp, char __user * data, size_t count,
  */
 
 static ssize_t
-v9fs_write(struct file *filp, char *buffer, size_t count, loff_t * offset)
+v9fs_file_write(struct file *filp, const char __user * data,
+		size_t count, loff_t * offset)
 {
 	struct inode *inode = filp->f_dentry->d_inode;
 	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
@@ -318,30 +258,42 @@ v9fs_write(struct file *filp, char *buffer, size_t count, loff_t * offset)
 	int result = -EIO;
 	int rsize = 0;
 	int total = 0;
+	char *buf;
 
-	dprintk(DEBUG_VFS, "data %p count %d offset %x\n", buffer, (int)count,
+	dprintk(DEBUG_VFS, "data %p count %d offset %x\n", data, (int)count,
 		(int)*offset);
 	rsize = v9ses->maxdata - V9FS_IOHDRSZ;
 	if (v9fid->iounit != 0 && rsize > v9fid->iounit)
 		rsize = v9fid->iounit;
 
-	dump_data(buffer, count);
+	buf = kmalloc(v9ses->maxdata - V9FS_IOHDRSZ, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
 
 	do {
 		if (count < rsize)
 			rsize = count;
 
-		result =
-		    v9fs_t_write(v9ses, fid, *offset, rsize, buffer, &fcall);
+		result = copy_from_user(buf, data, rsize);
+		if (result) {
+			dprintk(DEBUG_ERROR, "Problem copying from user\n");
+			kfree(buf);
+			return -EFAULT;
+		}
+
+		dump_data(buf, rsize);
+		result = v9fs_t_write(v9ses, fid, *offset, rsize, buf, &fcall);
 		if (result < 0) {
 			eprintk(KERN_ERR, "error while writing: %s(%d)\n",
 				FCALL_ERROR(fcall), result);
 			kfree(fcall);
+			kfree(buf);
 			return result;
 		} else
 			*offset += result;
 
 		kfree(fcall);
+		fcall = NULL;
 
 		if (result != rsize) {
 			eprintk(KERN_ERR,
@@ -351,46 +303,14 @@ v9fs_write(struct file *filp, char *buffer, size_t count, loff_t * offset)
 		}
 
 		count -= result;
-		buffer += result;
+		data += result;
 		total += result;
 	} while (count);
 
+	kfree(buf);
 	return total;
 }
 
-/**
- * v9fs_file_write - write to a file
- * @filep: file pointer to write
- * @data: data buffer to write data from
- * @count: size of buffer
- * @offset: offset at which to write data
- *
- */
-
-static ssize_t
-v9fs_file_write(struct file *filp, const char __user * data,
-		size_t count, loff_t * offset)
-{
-	int ret = -1;
-	char *buffer;
-
-	buffer = kmalloc(count, GFP_KERNEL);
-	if (buffer == NULL)
-		return -ENOMEM;
-
-	ret = copy_from_user(buffer, data, count);
-	if (ret) {
-		dprintk(DEBUG_ERROR, "Problem copying from user\n");
-		ret = -EFAULT;
-	} else {
-		ret = v9fs_write(filp, buffer, count, offset);
-	}
-
-	kfree(buffer);
-
-	return ret;
-}
-
 struct file_operations v9fs_file_operations = {
 	.llseek = generic_file_llseek,
 	.read = v9fs_file_read,
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 0c13fc600049..2b696ae6655a 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -307,7 +307,7 @@ v9fs_create(struct inode *dir,
 	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
 	struct super_block *sb = dir->i_sb;
 	struct v9fs_fid *dirfid =
-	    v9fs_fid_lookup(file_dentry->d_parent, FID_WALK);
+	    v9fs_fid_lookup(file_dentry->d_parent);
 	struct v9fs_fid *fid = NULL;
 	struct inode *file_inode = NULL;
 	struct v9fs_fcall *fcall = NULL;
@@ -317,6 +317,7 @@ v9fs_create(struct inode *dir,
 	long newfid = -1;
 	int result = 0;
 	unsigned int iounit = 0;
+	int wfidno = -1;
 
 	perm = unixmode2p9mode(v9ses, perm);
 
@@ -350,7 +351,7 @@ v9fs_create(struct inode *dir,
 	if (result < 0) {
 		dprintk(DEBUG_ERROR, "clone error: %s\n", FCALL_ERROR(fcall));
 		v9fs_put_idpool(newfid, &v9ses->fidpool);
-		newfid = 0;
+		newfid = -1;
 		goto CleanUpFid;
 	}
 
@@ -369,20 +370,39 @@ v9fs_create(struct inode *dir,
 	qid = fcall->params.rcreate.qid;
 	kfree(fcall);
 
-	fid = v9fs_fid_create(file_dentry);
+	fid = v9fs_fid_create(file_dentry, v9ses, newfid, 1);
+	dprintk(DEBUG_VFS, "fid %p %d\n", fid, fid->fidcreate);
 	if (!fid) {
 		result = -ENOMEM;
 		goto CleanUpFid;
 	}
 
-	fid->fid = newfid;
-	fid->fidopen = 0;
-	fid->fidcreate = 1;
 	fid->qid = qid;
 	fid->iounit = iounit;
-	fid->rdir_pos = 0;
-	fid->rdir_fcall = NULL;
-	fid->v9ses = v9ses;
+
+	/* walk to the newly created file and put the fid in the dentry */
+	wfidno = v9fs_get_idpool(&v9ses->fidpool);
+	if (newfid < 0) {
+		eprintk(KERN_WARNING, "no free fids available\n");
+		return -ENOSPC;
+	}
+
+	result = v9fs_t_walk(v9ses, dirfidnum, wfidno,
+		(char *) file_dentry->d_name.name, NULL);
+	if (result < 0) {
+		dprintk(DEBUG_ERROR, "clone error: %s\n", FCALL_ERROR(fcall));
+		v9fs_put_idpool(wfidno, &v9ses->fidpool);
+		wfidno = -1;
+		goto CleanUpFid;
+	}
+
+	if (!v9fs_fid_create(file_dentry, v9ses, wfidno, 0)) {
+		if (!v9fs_t_clunk(v9ses, newfid, &fcall)) {
+			v9fs_put_idpool(wfidno, &v9ses->fidpool);
+		}
+
+		goto CleanUpFid;
+	}
 
 	if ((perm & V9FS_DMSYMLINK) || (perm & V9FS_DMLINK) ||
 	    (perm & V9FS_DMNAMEDPIPE) || (perm & V9FS_DMSOCKET) ||
@@ -410,11 +430,11 @@ v9fs_create(struct inode *dir,
 	d_instantiate(file_dentry, file_inode);
 
 	if (perm & V9FS_DMDIR) {
-		if (v9fs_t_clunk(v9ses, newfid, &fcall))
+		if (!v9fs_t_clunk(v9ses, newfid, &fcall))
+			v9fs_put_idpool(newfid, &v9ses->fidpool);
+		else
 			dprintk(DEBUG_ERROR, "clunk for mkdir failed: %s\n",
 				FCALL_ERROR(fcall));
-
-		v9fs_put_idpool(newfid, &v9ses->fidpool);
 		kfree(fcall);
 		fid->fidopen = 0;
 		fid->fidcreate = 0;
@@ -426,12 +446,22 @@ v9fs_create(struct inode *dir,
       CleanUpFid:
 	kfree(fcall);
 
-	if (newfid) {
-		if (v9fs_t_clunk(v9ses, newfid, &fcall))
+	if (newfid >= 0) {
+		if (!v9fs_t_clunk(v9ses, newfid, &fcall))
+			v9fs_put_idpool(newfid, &v9ses->fidpool);
+		else
+			dprintk(DEBUG_ERROR, "clunk failed: %s\n",
+				FCALL_ERROR(fcall));
+
+		kfree(fcall);
+	}
+	if (wfidno >= 0) {
+		if (!v9fs_t_clunk(v9ses, wfidno, &fcall))
+			v9fs_put_idpool(wfidno, &v9ses->fidpool);
+		else
 			dprintk(DEBUG_ERROR, "clunk failed: %s\n",
 				FCALL_ERROR(fcall));
 
-		v9fs_put_idpool(newfid, &v9ses->fidpool);
 		kfree(fcall);
 	}
 	return result;
@@ -461,7 +491,7 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
 	file_inode = file->d_inode;
 	sb = file_inode->i_sb;
 	v9ses = v9fs_inode2v9ses(file_inode);
-	v9fid = v9fs_fid_lookup(file, FID_OP);
+	v9fid = v9fs_fid_lookup(file);
 
 	if (!v9fid) {
 		dprintk(DEBUG_ERROR,
@@ -545,7 +575,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 
 	sb = dir->i_sb;
 	v9ses = v9fs_inode2v9ses(dir);
-	dirfid = v9fs_fid_lookup(dentry->d_parent, FID_WALK);
+	dirfid = v9fs_fid_lookup(dentry->d_parent);
 
 	if (!dirfid) {
 		dprintk(DEBUG_ERROR, "no dirfid\n");
@@ -573,7 +603,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 		v9fs_put_idpool(newfid, &v9ses->fidpool);
 		if (result == -ENOENT) {
 			d_add(dentry, NULL);
-			dprintk(DEBUG_ERROR,
+			dprintk(DEBUG_VFS,
 				"Return negative dentry %p count %d\n",
 				dentry, atomic_read(&dentry->d_count));
 			return NULL;
@@ -601,16 +631,13 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 
 	inode->i_ino = v9fs_qid2ino(&fcall->params.rstat.stat->qid);
 
-	fid = v9fs_fid_create(dentry);
+	fid = v9fs_fid_create(dentry, v9ses, newfid, 0);
 	if (fid == NULL) {
 		dprintk(DEBUG_ERROR, "couldn't insert\n");
 		result = -ENOMEM;
 		goto FreeFcall;
 	}
 
-	fid->fid = newfid;
-	fid->fidopen = 0;
-	fid->v9ses = v9ses;
 	fid->qid = fcall->params.rstat.stat->qid;
 
 	dentry->d_op = &v9fs_dentry_operations;
@@ -665,11 +692,11 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 {
 	struct inode *old_inode = old_dentry->d_inode;
 	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(old_inode);
-	struct v9fs_fid *oldfid = v9fs_fid_lookup(old_dentry, FID_WALK);
+	struct v9fs_fid *oldfid = v9fs_fid_lookup(old_dentry);
 	struct v9fs_fid *olddirfid =
-	    v9fs_fid_lookup(old_dentry->d_parent, FID_WALK);
+	    v9fs_fid_lookup(old_dentry->d_parent);
 	struct v9fs_fid *newdirfid =
-	    v9fs_fid_lookup(new_dentry->d_parent, FID_WALK);
+	    v9fs_fid_lookup(new_dentry->d_parent);
 	struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL);
 	struct v9fs_fcall *fcall = NULL;
 	int fid = -1;
@@ -744,7 +771,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 {
 	struct v9fs_fcall *fcall = NULL;
 	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dentry->d_inode);
-	struct v9fs_fid *fid = v9fs_fid_lookup(dentry, FID_OP);
+	struct v9fs_fid *fid = v9fs_fid_lookup(dentry);
 	int err = -EPERM;
 
 	dprintk(DEBUG_VFS, "dentry: %p\n", dentry);
@@ -778,7 +805,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
 {
 	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dentry->d_inode);
-	struct v9fs_fid *fid = v9fs_fid_lookup(dentry, FID_OP);
+	struct v9fs_fid *fid = v9fs_fid_lookup(dentry);
 	struct v9fs_fcall *fcall = NULL;
 	struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL);
 	int res = -EPERM;
@@ -960,7 +987,7 @@ v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 	if (retval != 0)
 		goto FreeFcall;
 
-	newfid = v9fs_fid_lookup(dentry, FID_OP);
+	newfid = v9fs_fid_lookup(dentry);
 
 	/* issue a twstat */
 	v9fs_blank_mistat(v9ses, mistat);
@@ -1004,7 +1031,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
 
 	struct v9fs_fcall *fcall = NULL;
 	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dentry->d_inode);
-	struct v9fs_fid *fid = v9fs_fid_lookup(dentry, FID_OP);
+	struct v9fs_fid *fid = v9fs_fid_lookup(dentry);
 
 	if (!fid) {
 		dprintk(DEBUG_ERROR, "could not resolve fid from dentry\n");
@@ -1063,8 +1090,8 @@ static int v9fs_vfs_readlink(struct dentry *dentry, char __user * buffer,
 	int ret;
 	char *link = __getname();
 
-	if (strlen(link) < buflen)
-		buflen = strlen(link);
+	if (buflen > PATH_MAX)
+		buflen = PATH_MAX;
 
 	dprintk(DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry);
 
@@ -1148,7 +1175,7 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
 	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir);
 	struct v9fs_fcall *fcall = NULL;
 	struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL);
-	struct v9fs_fid *oldfid = v9fs_fid_lookup(old_dentry, FID_OP);
+	struct v9fs_fid *oldfid = v9fs_fid_lookup(old_dentry);
 	struct v9fs_fid *newfid = NULL;
 	char *symname = __getname();
 
@@ -1168,7 +1195,7 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
 	if (retval != 0)
 		goto FreeMem;
 
-	newfid = v9fs_fid_lookup(dentry, FID_OP);
+	newfid = v9fs_fid_lookup(dentry);
 	if (!newfid) {
 		dprintk(DEBUG_ERROR, "couldn't resolve fid from dentry\n");
 		goto FreeMem;
@@ -1246,7 +1273,7 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 	if (retval != 0)
 		goto FreeMem;
 
-	newfid = v9fs_fid_lookup(dentry, FID_OP);
+	newfid = v9fs_fid_lookup(dentry);
 	if (!newfid) {
 		dprintk(DEBUG_ERROR, "coudn't resove fid from dentry\n");
 		retval = -EINVAL;
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 868f350b2c5f..82c5b0084079 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -129,8 +129,7 @@ static struct super_block *v9fs_get_sb(struct file_system_type
 
 	if ((newfid = v9fs_session_init(v9ses, dev_name, data)) < 0) {
 		dprintk(DEBUG_ERROR, "problem initiating session\n");
-		retval = newfid;
-		goto free_session;
+		return ERR_PTR(newfid);
 	}
 
 	sb = sget(fs_type, NULL, v9fs_set_super, v9ses);
@@ -150,28 +149,24 @@ static struct super_block *v9fs_get_sb(struct file_system_type
 
 	if (!root) {
 		retval = -ENOMEM;
-		goto release_inode;
+		goto put_back_sb;
 	}
 
 	sb->s_root = root;
 
-	/* Setup the Root Inode */
-	root_fid = v9fs_fid_create(root);
-	if (root_fid == NULL) {
-		retval = -ENOMEM;
-		goto release_dentry;
-	}
-
-	root_fid->fidopen = 0;
-	root_fid->v9ses = v9ses;
-
 	stat_result = v9fs_t_stat(v9ses, newfid, &fcall);
 	if (stat_result < 0) {
 		dprintk(DEBUG_ERROR, "stat error\n");
 		v9fs_t_clunk(v9ses, newfid, NULL);
 		v9fs_put_idpool(newfid, &v9ses->fidpool);
 	} else {
-		root_fid->fid = newfid;
+		/* Setup the Root Inode */
+		root_fid = v9fs_fid_create(root, v9ses, newfid, 0);
+		if (root_fid == NULL) {
+			retval = -ENOMEM;
+			goto put_back_sb;
+		}
+
 		root_fid->qid = fcall->params.rstat.stat->qid;
 		root->d_inode->i_ino =
 		    v9fs_qid2ino(&fcall->params.rstat.stat->qid);
@@ -182,25 +177,15 @@ static struct super_block *v9fs_get_sb(struct file_system_type
 
 	if (stat_result < 0) {
 		retval = stat_result;
-		goto release_dentry;
+		goto put_back_sb;
 	}
 
 	return sb;
 
-      release_dentry:
-	dput(sb->s_root);
-
-      release_inode:
-	iput(inode);
-
-      put_back_sb:
+put_back_sb:
+	/* deactivate_super calls v9fs_kill_super which will frees the rest */
 	up_write(&sb->s_umount);
 	deactivate_super(sb);
-	v9fs_session_close(v9ses);
-
-      free_session:
-	kfree(v9ses);
-
 	return ERR_PTR(retval);
 }
 
diff --git a/fs/Kconfig b/fs/Kconfig
index 068ccea2f184..48f5422cb19a 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -472,6 +472,9 @@ config FUSE_FS
 	  utilities is available from the FUSE homepage:
 	  <http://fuse.sourceforge.net/>
 
+	  See <file:Documentation/filesystems/fuse.txt> for more information.
+	  See <file:Documentation/Changes> for needed library/utility version.
+
 	  If you want to develop a userspace FS, or if you want to use
 	  a filesystem based on FUSE, answer Y or M.
 
diff --git a/fs/aio.c b/fs/aio.c
index 38f62680fd63..9fe7216457d8 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -398,7 +398,7 @@ static struct kiocb fastcall *__aio_get_req(struct kioctx *ctx)
 	if (unlikely(!req))
 		return NULL;
 
-	req->ki_flags = 1 << KIF_LOCKED;
+	req->ki_flags = 0;
 	req->ki_users = 2;
 	req->ki_key = 0;
 	req->ki_ctx = ctx;
@@ -547,24 +547,6 @@ struct kioctx *lookup_ioctx(unsigned long ctx_id)
 	return ioctx;
 }
 
-static int lock_kiocb_action(void *param)
-{
-	schedule();
-	return 0;
-}
-
-static inline void lock_kiocb(struct kiocb *iocb)
-{
-	wait_on_bit_lock(&iocb->ki_flags, KIF_LOCKED, lock_kiocb_action,
-			 TASK_UNINTERRUPTIBLE);
-}
-
-static inline void unlock_kiocb(struct kiocb *iocb)
-{
-	kiocbClearLocked(iocb);
-	wake_up_bit(&iocb->ki_flags, KIF_LOCKED);
-}
-
 /*
  * use_mm
  *	Makes the calling kernel thread take on the specified
@@ -740,19 +722,9 @@ static ssize_t aio_run_iocb(struct kiocb *iocb)
 	ret = retry(iocb);
 	current->io_wait = NULL;
 
-	if (-EIOCBRETRY != ret) {
- 		if (-EIOCBQUEUED != ret) {
-			BUG_ON(!list_empty(&iocb->ki_wait.task_list));
-			aio_complete(iocb, ret, 0);
-			/* must not access the iocb after this */
-		}
-	} else {
-		/*
-		 * Issue an additional retry to avoid waiting forever if
-		 * no waits were queued (e.g. in case of a short read).
-		 */
-		if (list_empty(&iocb->ki_wait.task_list))
-			kiocbSetKicked(iocb);
+	if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) {
+		BUG_ON(!list_empty(&iocb->ki_wait.task_list));
+		aio_complete(iocb, ret, 0);
 	}
 out:
 	spin_lock_irq(&ctx->ctx_lock);
@@ -805,9 +777,7 @@ static int __aio_run_iocbs(struct kioctx *ctx)
 		 * Hold an extra reference while retrying i/o.
 		 */
 		iocb->ki_users++;       /* grab extra reference */
-		lock_kiocb(iocb);
 		aio_run_iocb(iocb);
-		unlock_kiocb(iocb);
 		if (__aio_put_req(ctx, iocb))  /* drop extra ref */
 			put_ioctx(ctx);
  	}
@@ -898,16 +868,24 @@ static void aio_kick_handler(void *data)
  * and if required activate the aio work queue to process
  * it
  */
-static void queue_kicked_iocb(struct kiocb *iocb)
+static void try_queue_kicked_iocb(struct kiocb *iocb)
 {
  	struct kioctx	*ctx = iocb->ki_ctx;
 	unsigned long flags;
 	int run = 0;
 
-	WARN_ON((!list_empty(&iocb->ki_wait.task_list)));
+	/* We're supposed to be the only path putting the iocb back on the run
+	 * list.  If we find that the iocb is *back* on a wait queue already
+	 * than retry has happened before we could queue the iocb.  This also
+	 * means that the retry could have completed and freed our iocb, no
+	 * good. */
+	BUG_ON((!list_empty(&iocb->ki_wait.task_list)));
 
 	spin_lock_irqsave(&ctx->ctx_lock, flags);
-	run = __queue_kicked_iocb(iocb);
+	/* set this inside the lock so that we can't race with aio_run_iocb()
+	 * testing it and putting the iocb on the run list under the lock */
+	if (!kiocbTryKick(iocb))
+		run = __queue_kicked_iocb(iocb);
 	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
 	if (run)
 		aio_queue_work(ctx);
@@ -930,10 +908,7 @@ void fastcall kick_iocb(struct kiocb *iocb)
 		return;
 	}
 
-	/* If its already kicked we shouldn't queue it again */
-	if (!kiocbTryKick(iocb)) {
-		queue_kicked_iocb(iocb);
-	}
+	try_queue_kicked_iocb(iocb);
 }
 EXPORT_SYMBOL(kick_iocb);
 
@@ -1321,8 +1296,11 @@ asmlinkage long sys_io_destroy(aio_context_t ctx)
 }
 
 /*
- * Default retry method for aio_read (also used for first time submit)
- * Responsible for updating iocb state as retries progress
+ * aio_p{read,write} are the default  ki_retry methods for
+ * IO_CMD_P{READ,WRITE}.  They maintains kiocb retry state around potentially
+ * multiple calls to f_op->aio_read().  They loop around partial progress
+ * instead of returning -EIOCBRETRY because they don't have the means to call
+ * kick_iocb().
  */
 static ssize_t aio_pread(struct kiocb *iocb)
 {
@@ -1331,25 +1309,25 @@ static ssize_t aio_pread(struct kiocb *iocb)
 	struct inode *inode = mapping->host;
 	ssize_t ret = 0;
 
-	ret = file->f_op->aio_read(iocb, iocb->ki_buf,
-		iocb->ki_left, iocb->ki_pos);
+	do {
+		ret = file->f_op->aio_read(iocb, iocb->ki_buf,
+			iocb->ki_left, iocb->ki_pos);
+		/*
+		 * Can't just depend on iocb->ki_left to determine
+		 * whether we are done. This may have been a short read.
+		 */
+		if (ret > 0) {
+			iocb->ki_buf += ret;
+			iocb->ki_left -= ret;
+		}
 
-	/*
-	 * Can't just depend on iocb->ki_left to determine
-	 * whether we are done. This may have been a short read.
-	 */
-	if (ret > 0) {
-		iocb->ki_buf += ret;
-		iocb->ki_left -= ret;
 		/*
-		 * For pipes and sockets we return once we have
-		 * some data; for regular files we retry till we
-		 * complete the entire read or find that we can't
-		 * read any more data (e.g short reads).
+		 * For pipes and sockets we return once we have some data; for
+		 * regular files we retry till we complete the entire read or
+		 * find that we can't read any more data (e.g short reads).
 		 */
-		if (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode))
-			ret = -EIOCBRETRY;
-	}
+	} while (ret > 0 && iocb->ki_left > 0 &&
+		 !S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode));
 
 	/* This means we must have transferred all that we could */
 	/* No need to retry anymore */
@@ -1359,27 +1337,21 @@ static ssize_t aio_pread(struct kiocb *iocb)
 	return ret;
 }
 
-/*
- * Default retry method for aio_write (also used for first time submit)
- * Responsible for updating iocb state as retries progress
- */
+/* see aio_pread() */
 static ssize_t aio_pwrite(struct kiocb *iocb)
 {
 	struct file *file = iocb->ki_filp;
 	ssize_t ret = 0;
 
-	ret = file->f_op->aio_write(iocb, iocb->ki_buf,
-		iocb->ki_left, iocb->ki_pos);
-
-	if (ret > 0) {
-		iocb->ki_buf += ret;
-		iocb->ki_left -= ret;
-
-		ret = -EIOCBRETRY;
-	}
+	do {
+		ret = file->f_op->aio_write(iocb, iocb->ki_buf,
+			iocb->ki_left, iocb->ki_pos);
+		if (ret > 0) {
+			iocb->ki_buf += ret;
+			iocb->ki_left -= ret;
+		}
+	} while (ret > 0 && iocb->ki_left > 0);
 
-	/* This means we must have transferred all that we could */
-	/* No need to retry anymore */
 	if ((ret == 0) || (iocb->ki_left == 0))
 		ret = iocb->ki_nbytes - iocb->ki_left;
 
@@ -1549,7 +1521,6 @@ int fastcall io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 
 	spin_lock_irq(&ctx->ctx_lock);
 	aio_run_iocb(req);
-	unlock_kiocb(req);
 	if (!list_empty(&ctx->run_list)) {
 		/* drain the run list */
 		while (__aio_run_iocbs(ctx))
@@ -1681,7 +1652,6 @@ asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb,
 	if (NULL != cancel) {
 		struct io_event tmp;
 		pr_debug("calling cancel\n");
-		lock_kiocb(kiocb);
 		memset(&tmp, 0, sizeof(tmp));
 		tmp.obj = (u64)(unsigned long)kiocb->ki_obj.user;
 		tmp.data = kiocb->ki_user_data;
@@ -1693,7 +1663,6 @@ asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb,
 			if (copy_to_user(result, &tmp, sizeof(tmp)))
 				ret = -EFAULT;
 		}
-		unlock_kiocb(kiocb);
 	} else
 		ret = -EINVAL;
 
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index e240c335eb23..5af928fa0449 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -108,7 +108,7 @@ static int bfs_create(struct inode * dir, struct dentry * dentry, int mode,
 	inode->i_mapping->a_ops = &bfs_aops;
 	inode->i_mode = mode;
 	inode->i_ino = ino;
-	BFS_I(inode)->i_dsk_ino = cpu_to_le16(ino);
+	BFS_I(inode)->i_dsk_ino = ino;
 	BFS_I(inode)->i_sblock = 0;
 	BFS_I(inode)->i_eblock = 0;
 	insert_inode_hash(inode);
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index c7b39aa279d7..3af6c73c5b5a 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -357,28 +357,46 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 	}
 
 	info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1)>>BFS_BSIZE_BITS; /* for statfs(2) */
-	info->si_freeb = (le32_to_cpu(bfs_sb->s_end) + 1 -  cpu_to_le32(bfs_sb->s_start))>>BFS_BSIZE_BITS;
+	info->si_freeb = (le32_to_cpu(bfs_sb->s_end) + 1 -  le32_to_cpu(bfs_sb->s_start))>>BFS_BSIZE_BITS;
 	info->si_freei = 0;
 	info->si_lf_eblk = 0;
 	info->si_lf_sblk = 0;
 	info->si_lf_ioff = 0;
+	bh = NULL;
 	for (i=BFS_ROOT_INO; i<=info->si_lasti; i++) {
-		inode = iget(s,i);
-		if (BFS_I(inode)->i_dsk_ino == 0)
+		struct bfs_inode *di;
+		int block = (i - BFS_ROOT_INO)/BFS_INODES_PER_BLOCK + 1;
+		int off = (i - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK;
+		unsigned long sblock, eblock;
+
+		if (!off) {
+			brelse(bh);
+			bh = sb_bread(s, block);
+		}
+
+		if (!bh)
+			continue;
+
+		di = (struct bfs_inode *)bh->b_data + off;
+
+		if (!di->i_ino) {
 			info->si_freei++;
-		else {
-			set_bit(i, info->si_imap);
-			info->si_freeb -= inode->i_blocks;
-			if (BFS_I(inode)->i_eblock > info->si_lf_eblk) {
-				info->si_lf_eblk = BFS_I(inode)->i_eblock;
-				info->si_lf_sblk = BFS_I(inode)->i_sblock;
-				info->si_lf_ioff = BFS_INO2OFF(i);
-			}
+			continue;
+		}
+		set_bit(i, info->si_imap);
+		info->si_freeb -= BFS_FILEBLOCKS(di);
+
+		sblock =  le32_to_cpu(di->i_sblock);
+		eblock =  le32_to_cpu(di->i_eblock);
+		if (eblock > info->si_lf_eblk) {
+			info->si_lf_eblk = eblock;
+			info->si_lf_sblk = sblock;
+			info->si_lf_ioff = BFS_INO2OFF(i);
 		}
-		iput(inode);
 	}
+	brelse(bh);
 	if (!(s->s_flags & MS_RDONLY)) {
-		mark_buffer_dirty(bh);
+		mark_buffer_dirty(info->si_sbh);
 		s->s_dirt = 1;
 	} 
 	dump_imap("read_super", s);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 7976a238f0a3..d4b15576e584 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -905,7 +905,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 		send_sig(SIGKILL, current, 0);
 		goto out_free_dentry;
 	}
-	if (padzero(elf_bss)) {
+	if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) {
 		send_sig(SIGSEGV, current, 0);
 		retval = -EFAULT; /* Nobody gets to see this, but.. */
 		goto out_free_dentry;
diff --git a/fs/bio.c b/fs/bio.c
index 83a349574567..7d81a93afd48 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -75,7 +75,7 @@ struct bio_set {
  */
 static struct bio_set *fs_bio_set;
 
-static inline struct bio_vec *bvec_alloc_bs(unsigned int __nocast gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
+static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
 {
 	struct bio_vec *bvl;
 	struct biovec_slab *bp;
@@ -155,7 +155,7 @@ inline void bio_init(struct bio *bio)
  *   allocate bio and iovecs from the memory pools specified by the
  *   bio_set structure.
  **/
-struct bio *bio_alloc_bioset(unsigned int __nocast gfp_mask, int nr_iovecs, struct bio_set *bs)
+struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 {
 	struct bio *bio = mempool_alloc(bs->bio_pool, gfp_mask);
 
@@ -181,7 +181,7 @@ out:
 	return bio;
 }
 
-struct bio *bio_alloc(unsigned int __nocast gfp_mask, int nr_iovecs)
+struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
 {
 	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
 
@@ -277,7 +277,7 @@ inline void __bio_clone(struct bio *bio, struct bio *bio_src)
  *
  * 	Like __bio_clone, only also allocates the returned bio
  */
-struct bio *bio_clone(struct bio *bio, unsigned int __nocast gfp_mask)
+struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
 {
 	struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);
 
@@ -1078,7 +1078,7 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
 	return bp;
 }
 
-static void *bio_pair_alloc(unsigned int __nocast gfp_flags, void *data)
+static void *bio_pair_alloc(gfp_t gfp_flags, void *data)
 {
 	return kmalloc(sizeof(struct bio_pair), gfp_flags);
 }
diff --git a/fs/buffer.c b/fs/buffer.c
index 6cbfceabd95d..1216c0d3c8ce 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3045,7 +3045,7 @@ static void recalc_bh_state(void)
 	buffer_heads_over_limit = (tot > max_buffer_heads);
 }
 	
-struct buffer_head *alloc_buffer_head(unsigned int __nocast gfp_flags)
+struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
 {
 	struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
 	if (ret) {
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 8cc23e7d0d5d..1ebf7dafc1d7 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -781,6 +781,8 @@ static int cifs_oplock_thread(void * dummyarg)
 
 	oplockThread = current;
 	do {
+		if (try_to_freeze()) 
+			continue;
 		set_current_state(TASK_INTERRUPTIBLE);
 		
 		schedule_timeout(1*HZ);  
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2335f14a1583..47360156cc54 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -344,6 +344,8 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
 	}
 
 	while (server->tcpStatus != CifsExiting) {
+		if (try_to_freeze())
+			continue;
 		if (bigbuf == NULL) {
 			bigbuf = cifs_buf_get();
 			if(bigbuf == NULL) {
diff --git a/fs/compat.c b/fs/compat.c
index ac3fb9ed8eea..a719e158e002 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -44,6 +44,8 @@
 #include <linux/nfsd/syscall.h>
 #include <linux/personality.h>
 #include <linux/rwsem.h>
+#include <linux/acct.h>
+#include <linux/mm.h>
 
 #include <net/sock.h>		/* siocdevprivate_ioctl */
 
@@ -1487,6 +1489,8 @@ int compat_do_execve(char * filename,
 
 		/* execve success */
 		security_bprm_free(bprm);
+		acct_update_integrals(current);
+		update_mem_hiwater(current);
 		kfree(bprm);
 		return retval;
 	}
diff --git a/fs/dcache.c b/fs/dcache.c
index 7376b61269fb..fb10386c59be 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -102,7 +102,8 @@ static inline void dentry_iput(struct dentry * dentry)
 		list_del_init(&dentry->d_alias);
 		spin_unlock(&dentry->d_lock);
 		spin_unlock(&dcache_lock);
-		fsnotify_inoderemove(inode);
+		if (!inode->i_nlink)
+			fsnotify_inoderemove(inode);
 		if (dentry->d_op && dentry->d_op->d_iput)
 			dentry->d_op->d_iput(dentry, inode);
 		else
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 6ab1dd0ca904..4284cd31eba6 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -101,6 +101,10 @@
 /* Maximum number of poll wake up nests we are allowing */
 #define EP_MAX_POLLWAKE_NESTS 4
 
+/* Maximum msec timeout value storeable in a long int */
+#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
+
+
 struct epoll_filefd {
 	struct file *file;
 	int fd;
@@ -231,8 +235,9 @@ struct ep_pqueue {
 
 static void ep_poll_safewake_init(struct poll_safewake *psw);
 static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq);
-static int ep_getfd(int *efd, struct inode **einode, struct file **efile);
-static int ep_file_init(struct file *file);
+static int ep_getfd(int *efd, struct inode **einode, struct file **efile,
+		    struct eventpoll *ep);
+static int ep_alloc(struct eventpoll **pep);
 static void ep_free(struct eventpoll *ep);
 static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd);
 static void ep_use_epitem(struct epitem *epi);
@@ -501,38 +506,37 @@ void eventpoll_release_file(struct file *file)
 asmlinkage long sys_epoll_create(int size)
 {
 	int error, fd;
+	struct eventpoll *ep;
 	struct inode *inode;
 	struct file *file;
 
 	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
 		     current, size));
 
-	/* Sanity check on the size parameter */
+	/*
+	 * Sanity check on the size parameter, and create the internal data
+	 * structure ( "struct eventpoll" ).
+	 */
 	error = -EINVAL;
-	if (size <= 0)
+	if (size <= 0 || (error = ep_alloc(&ep)) != 0)
 		goto eexit_1;
 
 	/*
 	 * Creates all the items needed to setup an eventpoll file. That is,
 	 * a file structure, and inode and a free file descriptor.
 	 */
-	error = ep_getfd(&fd, &inode, &file);
-	if (error)
-		goto eexit_1;
-
-	/* Setup the file internal data structure ( "struct eventpoll" ) */
-	error = ep_file_init(file);
+	error = ep_getfd(&fd, &inode, &file, ep);
 	if (error)
 		goto eexit_2;
 
-
 	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
 		     current, size, fd));
 
 	return fd;
 
 eexit_2:
-	sys_close(fd);
+	ep_free(ep);
+	kfree(ep);
 eexit_1:
 	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
 		     current, size, error));
@@ -706,7 +710,8 @@ eexit_1:
 /*
  * Creates the file descriptor to be used by the epoll interface.
  */
-static int ep_getfd(int *efd, struct inode **einode, struct file **efile)
+static int ep_getfd(int *efd, struct inode **einode, struct file **efile,
+		    struct eventpoll *ep)
 {
 	struct qstr this;
 	char name[32];
@@ -756,7 +761,7 @@ static int ep_getfd(int *efd, struct inode **einode, struct file **efile)
 	file->f_op = &eventpoll_fops;
 	file->f_mode = FMODE_READ;
 	file->f_version = 0;
-	file->private_data = NULL;
+	file->private_data = ep;
 
 	/* Install the new setup file into the allocated fd. */
 	fd_install(fd, file);
@@ -777,14 +782,13 @@ eexit_1:
 }
 
 
-static int ep_file_init(struct file *file)
+static int ep_alloc(struct eventpoll **pep)
 {
-	struct eventpoll *ep;
+	struct eventpoll *ep = kzalloc(sizeof(*ep), GFP_KERNEL);
 
-	if (!(ep = kmalloc(sizeof(struct eventpoll), GFP_KERNEL)))
+	if (!ep)
 		return -ENOMEM;
 
-	memset(ep, 0, sizeof(*ep));
 	rwlock_init(&ep->lock);
 	init_rwsem(&ep->sem);
 	init_waitqueue_head(&ep->wq);
@@ -792,9 +796,9 @@ static int ep_file_init(struct file *file)
 	INIT_LIST_HEAD(&ep->rdllist);
 	ep->rbr = RB_ROOT;
 
-	file->private_data = ep;
+	*pep = ep;
 
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_file_init() ep=%p\n",
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n",
 		     current, ep));
 	return 0;
 }
@@ -1506,8 +1510,8 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 	 * and the overflow condition. The passed timeout is in milliseconds,
 	 * that why (t * HZ) / 1000.
 	 */
-	jtimeout = timeout == -1 || timeout > (MAX_SCHEDULE_TIMEOUT - 1000) / HZ ?
-		MAX_SCHEDULE_TIMEOUT: (timeout * HZ + 999) / 1000;
+	jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ?
+		MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000;
 
 retry:
 	write_lock_irqsave(&ep->lock, flags);
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index c8d07030c897..e2d6208633a7 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -605,27 +605,28 @@ got:
 	insert_inode_hash(inode);
 
 	if (DQUOT_ALLOC_INODE(inode)) {
-		DQUOT_DROP(inode);
 		err = -ENOSPC;
-		goto fail2;
+		goto fail_drop;
 	}
+
 	err = ext2_init_acl(inode, dir);
-	if (err) {
-		DQUOT_FREE_INODE(inode);
-		DQUOT_DROP(inode);
-		goto fail2;
-	}
+	if (err)
+		goto fail_free_drop;
+
 	err = ext2_init_security(inode,dir);
-	if (err) {
-		DQUOT_FREE_INODE(inode);
-		goto fail2;
-	}
+	if (err)
+		goto fail_free_drop;
+
 	mark_inode_dirty(inode);
 	ext2_debug("allocating inode %lu\n", inode->i_ino);
 	ext2_preread_inode(inode);
 	return inode;
 
-fail2:
+fail_free_drop:
+	DQUOT_FREE_INODE(inode);
+
+fail_drop:
+	DQUOT_DROP(inode);
 	inode->i_flags |= S_NOQUOTA;
 	inode->i_nlink = 0;
 	iput(inode);
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index e463dca008e4..0213db4911a2 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -1410,7 +1410,7 @@ unsigned long ext3_count_free_blocks(struct super_block *sb)
 	unsigned long desc_count;
 	struct ext3_group_desc *gdp;
 	int i;
-	unsigned long ngroups;
+	unsigned long ngroups = EXT3_SB(sb)->s_groups_count;
 #ifdef EXT3FS_DEBUG
 	struct ext3_super_block *es;
 	unsigned long bitmap_count, x;
@@ -1421,7 +1421,8 @@ unsigned long ext3_count_free_blocks(struct super_block *sb)
 	desc_count = 0;
 	bitmap_count = 0;
 	gdp = NULL;
-	for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
+
+	for (i = 0; i < ngroups; i++) {
 		gdp = ext3_get_group_desc(sb, i, NULL);
 		if (!gdp)
 			continue;
@@ -1443,7 +1444,6 @@ unsigned long ext3_count_free_blocks(struct super_block *sb)
 	return bitmap_count;
 #else
 	desc_count = 0;
-	ngroups = EXT3_SB(sb)->s_groups_count;
 	smp_rmb();
 	for (i = 0; i < ngroups; i++) {
 		gdp = ext3_get_group_desc(sb, i, NULL);
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 96552769d039..6549945f9ac1 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -597,27 +597,22 @@ got:
 
 	ret = inode;
 	if(DQUOT_ALLOC_INODE(inode)) {
-		DQUOT_DROP(inode);
 		err = -EDQUOT;
-		goto fail2;
+		goto fail_drop;
 	}
+
 	err = ext3_init_acl(handle, inode, dir);
-	if (err) {
-		DQUOT_FREE_INODE(inode);
-		DQUOT_DROP(inode);
-		goto fail2;
-  	}
+	if (err)
+		goto fail_free_drop;
+
 	err = ext3_init_security(handle,inode, dir);
-	if (err) {
-		DQUOT_FREE_INODE(inode);
-		goto fail2;
-	}
+	if (err)
+		goto fail_free_drop;
+
 	err = ext3_mark_inode_dirty(handle, inode);
 	if (err) {
 		ext3_std_error(sb, err);
-		DQUOT_FREE_INODE(inode);
-		DQUOT_DROP(inode);
-		goto fail2;
+		goto fail_free_drop;
 	}
 
 	ext3_debug("allocating inode %lu\n", inode->i_ino);
@@ -631,7 +626,11 @@ really_out:
 	brelse(bitmap_bh);
 	return ret;
 
-fail2:
+fail_free_drop:
+	DQUOT_FREE_INODE(inode);
+
+fail_drop:
+	DQUOT_DROP(inode);
 	inode->i_flags |= S_NOQUOTA;
 	inode->i_nlink = 0;
 	iput(inode);
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 2c9f81278d5d..57f79106267d 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -242,7 +242,7 @@ static int setup_new_group_blocks(struct super_block *sb,
 	     i < sbi->s_itb_per_group; i++, bit++, block++) {
 		struct buffer_head *it;
 
-		ext3_debug("clear inode block %#04x (+%ld)\n", block, bit);
+		ext3_debug("clear inode block %#04lx (+%d)\n", block, bit);
 		if (IS_ERR(it = bclean(handle, sb, block))) {
 			err = PTR_ERR(it);
 			goto exit_bh;
@@ -643,8 +643,8 @@ static void update_backups(struct super_block *sb,
 			break;
 
 		bh = sb_getblk(sb, group * bpg + blk_off);
-		ext3_debug(sb, __FUNCTION__, "update metadata backup %#04lx\n",
-			   bh->b_blocknr);
+		ext3_debug("update metadata backup %#04lx\n",
+			  (unsigned long)bh->b_blocknr);
 		if ((err = ext3_journal_get_write_access(handle, bh)))
 			break;
 		lock_buffer(bh);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index a93c3609025d..9e24ceb019fe 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -512,15 +512,14 @@ static void ext3_clear_inode(struct inode *inode)
 
 static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
 {
-	struct ext3_sb_info *sbi = EXT3_SB(vfs->mnt_sb);
+	struct super_block *sb = vfs->mnt_sb;
+	struct ext3_sb_info *sbi = EXT3_SB(sb);
 
-	if (sbi->s_mount_opt & EXT3_MOUNT_JOURNAL_DATA)
+	if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA)
 		seq_puts(seq, ",data=journal");
-
-	if (sbi->s_mount_opt & EXT3_MOUNT_ORDERED_DATA)
+	else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA)
 		seq_puts(seq, ",data=ordered");
-
-	if (sbi->s_mount_opt & EXT3_MOUNT_WRITEBACK_DATA)
+	else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
 		seq_puts(seq, ",data=writeback");
 
 #if defined(CONFIG_QUOTA)
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 62ffa9139400..7134403d5be2 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -12,39 +12,6 @@
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 
-static ssize_t fat_file_aio_write(struct kiocb *iocb, const char __user *buf,
-				  size_t count, loff_t pos)
-{
-	struct inode *inode = iocb->ki_filp->f_dentry->d_inode;
-	int retval;
-
-	retval = generic_file_aio_write(iocb, buf, count, pos);
-	if (retval > 0) {
-		inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
-		MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
-		mark_inode_dirty(inode);
-//		check the locking rules
-//		if (IS_SYNC(inode))
-//			fat_sync_inode(inode);
-	}
-	return retval;
-}
-
-static ssize_t fat_file_writev(struct file *filp, const struct iovec *iov,
-			       unsigned long nr_segs, loff_t *ppos)
-{
-	struct inode *inode = filp->f_dentry->d_inode;
-	int retval;
-
-	retval = generic_file_writev(filp, iov, nr_segs, ppos);
-	if (retval > 0) {
-		inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
-		MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
-		mark_inode_dirty(inode);
-	}
-	return retval;
-}
-
 int fat_generic_ioctl(struct inode *inode, struct file *filp,
 		      unsigned int cmd, unsigned long arg)
 {
@@ -148,9 +115,9 @@ struct file_operations fat_file_operations = {
 	.read		= do_sync_read,
 	.write		= do_sync_write,
 	.readv		= generic_file_readv,
-	.writev		= fat_file_writev,
+	.writev		= generic_file_writev,
 	.aio_read	= generic_file_aio_read,
-	.aio_write	= fat_file_aio_write,
+	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.ioctl		= fat_generic_ioctl,
 	.fsync		= file_fsync,
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index a7cbe68e2259..e2effe2dc9b2 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -102,6 +102,19 @@ static int fat_prepare_write(struct file *file, struct page *page,
 				  &MSDOS_I(page->mapping->host)->mmu_private);
 }
 
+static int fat_commit_write(struct file *file, struct page *page,
+			    unsigned from, unsigned to)
+{
+	struct inode *inode = page->mapping->host;
+	int err = generic_commit_write(file, page, from, to);
+	if (!err && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) {
+		inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
+		MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
+		mark_inode_dirty(inode);
+	}
+	return err;
+}
+
 static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
 {
 	return generic_block_bmap(mapping, block, fat_get_block);
@@ -112,7 +125,7 @@ static struct address_space_operations fat_aops = {
 	.writepage	= fat_writepage,
 	.sync_page	= block_sync_page,
 	.prepare_write	= fat_prepare_write,
-	.commit_write	= generic_commit_write,
+	.commit_write	= fat_commit_write,
 	.bmap		= _fat_bmap
 };
 
@@ -287,9 +300,9 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 	inode->i_blksize = sbi->cluster_size;
 	inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1))
 			   & ~((loff_t)sbi->cluster_size - 1)) >> 9;
-	inode->i_mtime.tv_sec = inode->i_atime.tv_sec =
+	inode->i_mtime.tv_sec =
 		date_dos2unix(le16_to_cpu(de->time), le16_to_cpu(de->date));
-	inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = 0;
+	inode->i_mtime.tv_nsec = 0;
 	if (sbi->options.isvfat) {
 		int secs = de->ctime_cs / 100;
 		int csecs = de->ctime_cs % 100;
@@ -297,8 +310,11 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 			date_dos2unix(le16_to_cpu(de->ctime),
 				      le16_to_cpu(de->cdate)) + secs;
 		inode->i_ctime.tv_nsec = csecs * 10000000;
+		inode->i_atime.tv_sec =
+			date_dos2unix(le16_to_cpu(0), le16_to_cpu(de->adate));
+		inode->i_atime.tv_nsec = 0;
 	} else
-		inode->i_ctime = inode->i_mtime;
+		inode->i_ctime = inode->i_atime = inode->i_mtime;
 
 	return 0;
 }
@@ -500,7 +516,9 @@ retry:
 	raw_entry->starthi = cpu_to_le16(MSDOS_I(inode)->i_logstart >> 16);
 	fat_date_unix2dos(inode->i_mtime.tv_sec, &raw_entry->time, &raw_entry->date);
 	if (sbi->options.isvfat) {
+		__le16 atime;
 		fat_date_unix2dos(inode->i_ctime.tv_sec,&raw_entry->ctime,&raw_entry->cdate);
+		fat_date_unix2dos(inode->i_atime.tv_sec,&atime,&raw_entry->adate);
 		raw_entry->ctime_cs = (inode->i_ctime.tv_sec & 1) * 100 +
 			inode->i_ctime.tv_nsec / 10000000;
 	}
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index e79e49b3eec7..29f1e9f6e85c 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -96,6 +96,8 @@ static int fuse_lookup_iget(struct inode *dir, struct dentry *entry,
 	fuse_lookup_init(req, dir, entry, &outarg);
 	request_send(fc, req);
 	err = req->out.h.error;
+	if (!err && (!outarg.nodeid || outarg.nodeid == FUSE_ROOT_ID))
+		err = -EIO;
 	if (!err) {
 		inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
 				  &outarg.attr);
@@ -152,6 +154,10 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
 		fuse_put_request(fc, req);
 		return err;
 	}
+	if (!outarg.nodeid || outarg.nodeid == FUSE_ROOT_ID) {
+		fuse_put_request(fc, req);
+		return -EIO;
+	}
 	inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
 			  &outarg.attr);
 	if (!inode) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 6454022b0536..657ab11c173b 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -23,6 +23,10 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir)
 	struct fuse_file *ff;
 	int err;
 
+	/* VFS checks this, but only _after_ ->open() */
+	if (file->f_flags & O_DIRECT)
+		return -EINVAL;
+
 	err = generic_file_open(inode, file);
 	if (err)
 		return err;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 59c5062cd63f..dd7113106269 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -793,11 +793,6 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
 	return(err);
 }
 
-void hostfs_truncate(struct inode *ino)
-{
-	not_implemented();
-}
-
 int hostfs_permission(struct inode *ino, int desired, struct nameidata *nd)
 {
 	char *name;
@@ -894,7 +889,6 @@ static struct inode_operations hostfs_iops = {
 	.rmdir		= hostfs_rmdir,
 	.mknod		= hostfs_mknod,
 	.rename		= hostfs_rename,
-	.truncate	= hostfs_truncate,
 	.permission	= hostfs_permission,
 	.setattr	= hostfs_setattr,
 	.getattr	= hostfs_getattr,
@@ -910,7 +904,6 @@ static struct inode_operations hostfs_dir_iops = {
 	.rmdir		= hostfs_rmdir,
 	.mknod		= hostfs_mknod,
 	.rename		= hostfs_rename,
-	.truncate	= hostfs_truncate,
 	.permission	= hostfs_permission,
 	.setattr	= hostfs_setattr,
 	.getattr	= hostfs_getattr,
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 0ec62d5310db..9f942ca8e4e3 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -129,8 +129,7 @@ void jfs_delete_inode(struct inode *inode)
 	jfs_info("In jfs_delete_inode, inode = 0x%p", inode);
 
 	if (!is_bad_inode(inode) &&
-	    (JFS_IP(inode)->fileset == cpu_to_le32(FILESYSTEM_I))) {
-
+	    (JFS_IP(inode)->fileset == FILESYSTEM_I)) {
 		truncate_inode_pages(&inode->i_data, 0);
 
 		if (test_cflag(COMMIT_Freewmap, inode))
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index c739626f5bf1..eadf319bee22 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -3055,7 +3055,7 @@ static int cntlz(u32 value)
  * RETURN VALUES:
  *      log2 number of blocks
  */
-int blkstol2(s64 nb)
+static int blkstol2(s64 nb)
 {
 	int l2nb;
 	s64 mask;		/* meant to be signed */
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index c7a92f9deb2b..9b71ed2674fe 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -725,6 +725,9 @@ struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp,
 	else
 		tlck->flag = tlckINODELOCK;
 
+	if (S_ISDIR(ip->i_mode))
+		tlck->flag |= tlckDIRECTORY;
+
 	tlck->type = 0;
 
 	/* bind the tlock and the page */
@@ -1009,6 +1012,8 @@ struct tlock *txMaplock(tid_t tid, struct inode *ip, int type)
 
 	/* bind the tlock and the object */
 	tlck->flag = tlckINODELOCK;
+	if (S_ISDIR(ip->i_mode))
+		tlck->flag |= tlckDIRECTORY;
 	tlck->ip = ip;
 	tlck->mp = NULL;
 
@@ -1077,6 +1082,8 @@ struct linelock *txLinelock(struct linelock * tlock)
 	linelock->flag = tlckLINELOCK;
 	linelock->maxcnt = TLOCKLONG;
 	linelock->index = 0;
+	if (tlck->flag & tlckDIRECTORY)
+		linelock->flag |= tlckDIRECTORY;
 
 	/* append linelock after tlock */
 	linelock->next = tlock->next;
@@ -2070,8 +2077,8 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
  *
  * function:    log from maplock of freed data extents;
  */
-void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
-	    struct tlock * tlck)
+static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
+		   struct tlock * tlck)
 {
 	struct pxd_lock *pxdlock;
 	int i, nlock;
@@ -2209,7 +2216,7 @@ void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea)
  * function: synchronously write pages locked by transaction
  *              after txLog() but before txUpdateMap();
  */
-void txForce(struct tblock * tblk)
+static void txForce(struct tblock * tblk)
 {
 	struct tlock *tlck;
 	lid_t lid, next;
@@ -2358,7 +2365,7 @@ static void txUpdateMap(struct tblock * tblk)
 			 */
 			else {	/* (maplock->flag & mlckFREE) */
 
-				if (S_ISDIR(tlck->ip->i_mode))
+				if (tlck->flag & tlckDIRECTORY)
 					txFreeMap(ipimap, maplock,
 						  tblk, COMMIT_PWMAP);
 				else
diff --git a/fs/jfs/jfs_txnmgr.h b/fs/jfs/jfs_txnmgr.h
index 59ad0f6b7231..0e4dc4514c47 100644
--- a/fs/jfs/jfs_txnmgr.h
+++ b/fs/jfs/jfs_txnmgr.h
@@ -122,6 +122,7 @@ extern struct tlock *TxLock;	/* transaction lock table */
 #define tlckLOG			0x0800
 /* updateMap state */
 #define	tlckUPDATEMAP		0x0080
+#define	tlckDIRECTORY		0x0040
 /* freeLock state */
 #define tlckFREELOCK		0x0008
 #define tlckWRITEPAGE		0x0004
diff --git a/fs/locks.c b/fs/locks.c
index c2c09b4798d6..f7daa5f48949 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -124,6 +124,7 @@
 #include <linux/smp_lock.h>
 #include <linux/syscalls.h>
 #include <linux/time.h>
+#include <linux/rcupdate.h>
 
 #include <asm/semaphore.h>
 #include <asm/uaccess.h>
@@ -2205,6 +2206,7 @@ void steal_locks(fl_owner_t from)
 
 	lock_kernel();
 	j = 0;
+	rcu_read_lock();
 	fdt = files_fdtable(files);
 	for (;;) {
 		unsigned long set;
@@ -2222,6 +2224,7 @@ void steal_locks(fl_owner_t from)
 			set >>= 1;
 		}
 	}
+	rcu_read_unlock();
 	unlock_kernel();
 }
 EXPORT_SYMBOL(steal_locks);
diff --git a/fs/mpage.c b/fs/mpage.c
index bb9aebe93862..c5adcdddf3cc 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -102,7 +102,7 @@ static struct bio *mpage_bio_submit(int rw, struct bio *bio)
 static struct bio *
 mpage_alloc(struct block_device *bdev,
 		sector_t first_sector, int nr_vecs,
-		unsigned int __nocast gfp_flags)
+		gfp_t gfp_flags)
 {
 	struct bio *bio;
 
diff --git a/fs/namei.c b/fs/namei.c
index 043d587216b5..aa62dbda93ac 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1551,19 +1551,19 @@ do_link:
 	if (nd->last_type != LAST_NORM)
 		goto exit;
 	if (nd->last.name[nd->last.len]) {
-		putname(nd->last.name);
+		__putname(nd->last.name);
 		goto exit;
 	}
 	error = -ELOOP;
 	if (count++==32) {
-		putname(nd->last.name);
+		__putname(nd->last.name);
 		goto exit;
 	}
 	dir = nd->dentry;
 	down(&dir->d_inode->i_sem);
 	path.dentry = __lookup_hash(&nd->last, nd->dentry, nd);
 	path.mnt = nd->mnt;
-	putname(nd->last.name);
+	__putname(nd->last.name);
 	goto do_last;
 }
 
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index d7f7eb669d03..4a36839f0bbd 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -85,6 +85,10 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
 	struct nfs_delegation *delegation;
 	int status = 0;
 
+	/* Ensure we first revalidate the attributes and page cache! */
+	if ((nfsi->cache_validity & (NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_ATTR)))
+		__nfs_revalidate_inode(NFS_SERVER(inode), inode);
+
 	delegation = nfs_alloc_delegation();
 	if (delegation == NULL)
 		return -ENOMEM;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index f6b9eda925c5..6bdcfa95de94 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -137,7 +137,8 @@ static int nfs_revalidate_file(struct inode *inode, struct file *filp)
 	struct nfs_inode *nfsi = NFS_I(inode);
 	int retval = 0;
 
-	if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) || nfs_attribute_timeout(inode))
+	if ((nfsi->cache_validity & (NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_ATTR))
+			|| nfs_attribute_timeout(inode))
 		retval = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
 	nfs_revalidate_mapping(inode, filp->f_mapping);
 	return 0;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 6922469d6fc5..d4eadeea128e 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -877,12 +877,10 @@ static int nfs_wait_on_inode(struct inode *inode)
 	sigset_t oldmask;
 	int error;
 
-	atomic_inc(&inode->i_count);
 	rpc_clnt_sigmask(clnt, &oldmask);
 	error = wait_on_bit_lock(&nfsi->flags, NFS_INO_REVALIDATING,
 					nfs_wait_schedule, TASK_INTERRUPTIBLE);
 	rpc_clnt_sigunmask(clnt, &oldmask);
-	iput(inode);
 
 	return error;
 }
@@ -1226,10 +1224,6 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
 	loff_t cur_size, new_isize;
 	int data_unstable;
 
-	/* Do we hold a delegation? */
-	if (nfs_have_delegation(inode, FMODE_READ))
-		return 0;
-
 	spin_lock(&inode->i_lock);
 
 	/* Are we in the process of updating data on the server? */
@@ -1350,7 +1344,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign
 	nfsi->read_cache_jiffies = fattr->timestamp;
 
 	/* Are we racing with known updates of the metadata on the server? */
-	data_unstable = ! nfs_verify_change_attribute(inode, verifier);
+	data_unstable = ! (nfs_verify_change_attribute(inode, verifier) ||
+		(nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE));
 
 	/* Check if our cached file size is stale */
  	new_isize = nfs_size_to_loff_t(fattr->size);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 6ceb1d471f20..9758ebd49905 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -184,14 +184,13 @@ static void nfs_readpage_release(struct nfs_page *req)
 {
 	unlock_page(req->wb_page);
 
-	nfs_clear_request(req);
-	nfs_release_request(req);
-
 	dprintk("NFS: read done (%s/%Ld %d@%Ld)\n",
 			req->wb_context->dentry->d_inode->i_sb->s_id,
 			(long long)NFS_FILEID(req->wb_context->dentry->d_inode),
 			req->wb_bytes,
 			(long long)req_offset(req));
+	nfs_clear_request(req);
+	nfs_release_request(req);
 }
 
 /*
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index 251e5a1bb1c4..0c2be8c0307d 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -48,43 +48,26 @@ xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem)
 		(struct nfsacl_encode_desc *) desc;
 	u32 *p = (u32 *) elem;
 
-	if (nfsacl_desc->count < nfsacl_desc->acl->a_count) {
-		struct posix_acl_entry *entry =
-			&nfsacl_desc->acl->a_entries[nfsacl_desc->count++];
+	struct posix_acl_entry *entry =
+		&nfsacl_desc->acl->a_entries[nfsacl_desc->count++];
 
-		*p++ = htonl(entry->e_tag | nfsacl_desc->typeflag);
-		switch(entry->e_tag) {
-			case ACL_USER_OBJ:
-				*p++ = htonl(nfsacl_desc->uid);
-				break;
-			case ACL_GROUP_OBJ:
-				*p++ = htonl(nfsacl_desc->gid);
-				break;
-			case ACL_USER:
-			case ACL_GROUP:
-				*p++ = htonl(entry->e_id);
-				break;
-			default:  /* Solaris depends on that! */
-				*p++ = 0;
-				break;
-		}
-		*p++ = htonl(entry->e_perm & S_IRWXO);
-	} else {
-		const struct posix_acl_entry *pa, *pe;
-		int group_obj_perm = ACL_READ|ACL_WRITE|ACL_EXECUTE;
-
-		FOREACH_ACL_ENTRY(pa, nfsacl_desc->acl, pe) {
-			if (pa->e_tag == ACL_GROUP_OBJ) {
-				group_obj_perm = pa->e_perm & S_IRWXO;
-				break;
-			}
-		}
-		/* fake up ACL_MASK entry */
-		*p++ = htonl(ACL_MASK | nfsacl_desc->typeflag);
-		*p++ = htonl(0);
-		*p++ = htonl(group_obj_perm);
+	*p++ = htonl(entry->e_tag | nfsacl_desc->typeflag);
+	switch(entry->e_tag) {
+		case ACL_USER_OBJ:
+			*p++ = htonl(nfsacl_desc->uid);
+			break;
+		case ACL_GROUP_OBJ:
+			*p++ = htonl(nfsacl_desc->gid);
+			break;
+		case ACL_USER:
+		case ACL_GROUP:
+			*p++ = htonl(entry->e_id);
+			break;
+		default:  /* Solaris depends on that! */
+			*p++ = 0;
+			break;
 	}
-
+	*p++ = htonl(entry->e_perm & S_IRWXO);
 	return 0;
 }
 
@@ -105,11 +88,28 @@ nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
 		.gid = inode->i_gid,
 	};
 	int err;
+	struct posix_acl *acl2 = NULL;
 
 	if (entries > NFS_ACL_MAX_ENTRIES ||
 	    xdr_encode_word(buf, base, entries))
 		return -EINVAL;
+	if (encode_entries && acl && acl->a_count == 3) {
+		/* Fake up an ACL_MASK entry. */
+		acl2 = posix_acl_alloc(4, GFP_KERNEL);
+		if (!acl2)
+			return -ENOMEM;
+		/* Insert entries in canonical order: other orders seem
+		 to confuse Solaris VxFS. */
+		acl2->a_entries[0] = acl->a_entries[0];  /* ACL_USER_OBJ */
+		acl2->a_entries[1] = acl->a_entries[1];  /* ACL_GROUP_OBJ */
+		acl2->a_entries[2] = acl->a_entries[1];  /* ACL_MASK */
+		acl2->a_entries[2].e_tag = ACL_MASK;
+		acl2->a_entries[3] = acl->a_entries[2];  /* ACL_OTHER */
+		nfsacl_desc.acl = acl2;
+	}
 	err = xdr_encode_array2(buf, base + 4, &nfsacl_desc.desc);
+	if (acl2)
+		posix_acl_release(acl2);
 	if (!err)
 		err = 8 + nfsacl_desc.desc.elem_size *
 			  nfsacl_desc.desc.array_len;
diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 49eafbdb15c1..de58579a1d0e 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -29,7 +29,8 @@ ToDo/Notes:
 	  The Windows boot will run chkdsk and then reboot.  The user can then
 	  immediately boot into Linux rather than having to do a full Windows
 	  boot first before rebooting into Linux and we will recognize such a
-	  journal and empty it as it is clean by definition.
+	  journal and empty it as it is clean by definition.  Note, this only
+	  works if chkdsk left the journal in an obviously clean state.
 	- Support journals ($LogFile) with only one restart page as well as
 	  journals with two different restart pages.  We sanity check both and
 	  either use the only sane one or the more recent one of the two in the
@@ -92,6 +93,18 @@ ToDo/Notes:
 	  an octal number to conform to how chmod(1) works, too.  Thanks to
 	  Giuseppe Bilotta and Horst von Brand for pointing out the errors of
 	  my ways.
+	- Fix various bugs in the runlist merging code.  (Based on libntfs
+	  changes by Richard Russon.)
+	- Fix sparse warnings that have crept in over time.
+	- Change ntfs_cluster_free() to require a write locked runlist on entry
+	  since we otherwise get into a lock reversal deadlock if a read locked
+	  runlist is passed in. In the process also change it to take an ntfs
+	  inode instead of a vfs inode as parameter.
+	- Fix the definition of the CHKD ntfs record magic.  It had an off by
+	  two error causing it to be CHKB instead of CHKD.
+	- Fix a stupid bug in __ntfs_bitmap_set_bits_in_run() which caused the
+	  count to become negative and hence we had a wild memset() scribbling
+	  all over the system's ram.
 
 2.1.23 - Implement extension of resident files and make writing safe as well as
 	 many bug fixes, cleanups, and enhancements...
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index b6cc8cf24626..5e80c07c6a4d 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -59,39 +59,49 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 	unsigned long flags;
 	struct buffer_head *first, *tmp;
 	struct page *page;
+	struct inode *vi;
 	ntfs_inode *ni;
 	int page_uptodate = 1;
 
 	page = bh->b_page;
-	ni = NTFS_I(page->mapping->host);
+	vi = page->mapping->host;
+	ni = NTFS_I(vi);
 
 	if (likely(uptodate)) {
-		s64 file_ofs, initialized_size;
+		loff_t i_size;
+		s64 file_ofs, init_size;
 
 		set_buffer_uptodate(bh);
 
 		file_ofs = ((s64)page->index << PAGE_CACHE_SHIFT) +
 				bh_offset(bh);
 		read_lock_irqsave(&ni->size_lock, flags);
-		initialized_size = ni->initialized_size;
+		init_size = ni->initialized_size;
+		i_size = i_size_read(vi);
 		read_unlock_irqrestore(&ni->size_lock, flags);
+		if (unlikely(init_size > i_size)) {
+			/* Race with shrinking truncate. */
+			init_size = i_size;
+		}
 		/* Check for the current buffer head overflowing. */
-		if (file_ofs + bh->b_size > initialized_size) {
-			char *addr;
-			int ofs = 0;
-
-			if (file_ofs < initialized_size)
-				ofs = initialized_size - file_ofs;
-			addr = kmap_atomic(page, KM_BIO_SRC_IRQ);
-			memset(addr + bh_offset(bh) + ofs, 0, bh->b_size - ofs);
+		if (unlikely(file_ofs + bh->b_size > init_size)) {
+			u8 *kaddr;
+			int ofs;
+
+			ofs = 0;
+			if (file_ofs < init_size)
+				ofs = init_size - file_ofs;
+			kaddr = kmap_atomic(page, KM_BIO_SRC_IRQ);
+			memset(kaddr + bh_offset(bh) + ofs, 0,
+					bh->b_size - ofs);
+			kunmap_atomic(kaddr, KM_BIO_SRC_IRQ);
 			flush_dcache_page(page);
-			kunmap_atomic(addr, KM_BIO_SRC_IRQ);
 		}
 	} else {
 		clear_buffer_uptodate(bh);
 		SetPageError(page);
-		ntfs_error(ni->vol->sb, "Buffer I/O error, logical block %llu.",
-				(unsigned long long)bh->b_blocknr);
+		ntfs_error(ni->vol->sb, "Buffer I/O error, logical block "
+				"0x%llx.", (unsigned long long)bh->b_blocknr);
 	}
 	first = page_buffers(page);
 	local_irq_save(flags);
@@ -124,7 +134,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 		if (likely(page_uptodate && !PageError(page)))
 			SetPageUptodate(page);
 	} else {
-		char *addr;
+		u8 *kaddr;
 		unsigned int i, recs;
 		u32 rec_size;
 
@@ -132,12 +142,12 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 		recs = PAGE_CACHE_SIZE / rec_size;
 		/* Should have been verified before we got here... */
 		BUG_ON(!recs);
-		addr = kmap_atomic(page, KM_BIO_SRC_IRQ);
+		kaddr = kmap_atomic(page, KM_BIO_SRC_IRQ);
 		for (i = 0; i < recs; i++)
-			post_read_mst_fixup((NTFS_RECORD*)(addr +
+			post_read_mst_fixup((NTFS_RECORD*)(kaddr +
 					i * rec_size), rec_size);
+		kunmap_atomic(kaddr, KM_BIO_SRC_IRQ);
 		flush_dcache_page(page);
-		kunmap_atomic(addr, KM_BIO_SRC_IRQ);
 		if (likely(page_uptodate && !PageError(page)))
 			SetPageUptodate(page);
 	}
@@ -168,8 +178,11 @@ still_busy:
  */
 static int ntfs_read_block(struct page *page)
 {
+	loff_t i_size;
 	VCN vcn;
 	LCN lcn;
+	s64 init_size;
+	struct inode *vi;
 	ntfs_inode *ni;
 	ntfs_volume *vol;
 	runlist_element *rl;
@@ -180,7 +193,8 @@ static int ntfs_read_block(struct page *page)
 	int i, nr;
 	unsigned char blocksize_bits;
 
-	ni = NTFS_I(page->mapping->host);
+	vi = page->mapping->host;
+	ni = NTFS_I(vi);
 	vol = ni->vol;
 
 	/* $MFT/$DATA must have its complete runlist in memory at all times. */
@@ -199,11 +213,28 @@ static int ntfs_read_block(struct page *page)
 	bh = head = page_buffers(page);
 	BUG_ON(!bh);
 
+	/*
+	 * We may be racing with truncate.  To avoid some of the problems we
+	 * now take a snapshot of the various sizes and use those for the whole
+	 * of the function.  In case of an extending truncate it just means we
+	 * may leave some buffers unmapped which are now allocated.  This is
+	 * not a problem since these buffers will just get mapped when a write
+	 * occurs.  In case of a shrinking truncate, we will detect this later
+	 * on due to the runlist being incomplete and if the page is being
+	 * fully truncated, truncate will throw it away as soon as we unlock
+	 * it so no need to worry what we do with it.
+	 */
 	iblock = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
 	read_lock_irqsave(&ni->size_lock, flags);
 	lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits;
-	zblock = (ni->initialized_size + blocksize - 1) >> blocksize_bits;
+	init_size = ni->initialized_size;
+	i_size = i_size_read(vi);
 	read_unlock_irqrestore(&ni->size_lock, flags);
+	if (unlikely(init_size > i_size)) {
+		/* Race with shrinking truncate. */
+		init_size = i_size;
+	}
+	zblock = (init_size + blocksize - 1) >> blocksize_bits;
 
 	/* Loop through all the buffers in the page. */
 	rl = NULL;
@@ -366,6 +397,8 @@ handle_zblock:
  */
 static int ntfs_readpage(struct file *file, struct page *page)
 {
+	loff_t i_size;
+	struct inode *vi;
 	ntfs_inode *ni, *base_ni;
 	u8 *kaddr;
 	ntfs_attr_search_ctx *ctx;
@@ -384,14 +417,17 @@ retry_readpage:
 		unlock_page(page);
 		return 0;
 	}
-	ni = NTFS_I(page->mapping->host);
+	vi = page->mapping->host;
+	ni = NTFS_I(vi);
 	/*
 	 * Only $DATA attributes can be encrypted and only unnamed $DATA
 	 * attributes can be compressed.  Index root can have the flags set but
 	 * this means to create compressed/encrypted files, not that the
-	 * attribute is compressed/encrypted.
+	 * attribute is compressed/encrypted.  Note we need to check for
+	 * AT_INDEX_ALLOCATION since this is the type of both directory and
+	 * index inodes.
 	 */
-	if (ni->type != AT_INDEX_ROOT) {
+	if (ni->type != AT_INDEX_ALLOCATION) {
 		/* If attribute is encrypted, deny access, just like NT4. */
 		if (NInoEncrypted(ni)) {
 			BUG_ON(ni->type != AT_DATA);
@@ -456,7 +492,12 @@ retry_readpage:
 	read_lock_irqsave(&ni->size_lock, flags);
 	if (unlikely(attr_len > ni->initialized_size))
 		attr_len = ni->initialized_size;
+	i_size = i_size_read(vi);
 	read_unlock_irqrestore(&ni->size_lock, flags);
+	if (unlikely(attr_len > i_size)) {
+		/* Race with shrinking truncate. */
+		attr_len = i_size;
+	}
 	kaddr = kmap_atomic(page, KM_USER0);
 	/* Copy the data to the page. */
 	memcpy(kaddr, (u8*)ctx->attr +
@@ -1341,9 +1382,11 @@ retry_writepage:
 	 * Only $DATA attributes can be encrypted and only unnamed $DATA
 	 * attributes can be compressed.  Index root can have the flags set but
 	 * this means to create compressed/encrypted files, not that the
-	 * attribute is compressed/encrypted.
+	 * attribute is compressed/encrypted.  Note we need to check for
+	 * AT_INDEX_ALLOCATION since this is the type of both directory and
+	 * index inodes.
 	 */
-	if (ni->type != AT_INDEX_ROOT) {
+	if (ni->type != AT_INDEX_ALLOCATION) {
 		/* If file is encrypted, deny access, just like NT4. */
 		if (NInoEncrypted(ni)) {
 			unlock_page(page);
@@ -1379,8 +1422,8 @@ retry_writepage:
 			unsigned int ofs = i_size & ~PAGE_CACHE_MASK;
 			kaddr = kmap_atomic(page, KM_USER0);
 			memset(kaddr + ofs, 0, PAGE_CACHE_SIZE - ofs);
-			flush_dcache_page(page);
 			kunmap_atomic(kaddr, KM_USER0);
+			flush_dcache_page(page);
 		}
 		/* Handle mst protected attributes. */
 		if (NInoMstProtected(ni))
@@ -1443,34 +1486,33 @@ retry_writepage:
 	BUG_ON(PageWriteback(page));
 	set_page_writeback(page);
 	unlock_page(page);
-	/*
-	 * Here, we do not need to zero the out of bounds area everytime
-	 * because the below memcpy() already takes care of the
-	 * mmap-at-end-of-file requirements.  If the file is converted to a
-	 * non-resident one, then the code path use is switched to the
-	 * non-resident one where the zeroing happens on each ntfs_writepage()
-	 * invocation.
-	 */
 	attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
 	i_size = i_size_read(vi);
 	if (unlikely(attr_len > i_size)) {
+		/* Race with shrinking truncate or a failed truncate. */
 		attr_len = i_size;
-		ctx->attr->data.resident.value_length = cpu_to_le32(attr_len);
+		/*
+		 * If the truncate failed, fix it up now.  If a concurrent
+		 * truncate, we do its job, so it does not have to do anything.
+		 */
+		err = ntfs_resident_attr_value_resize(ctx->mrec, ctx->attr,
+				attr_len);
+		/* Shrinking cannot fail. */
+		BUG_ON(err);
 	}
 	kaddr = kmap_atomic(page, KM_USER0);
 	/* Copy the data from the page to the mft record. */
 	memcpy((u8*)ctx->attr +
 			le16_to_cpu(ctx->attr->data.resident.value_offset),
 			kaddr, attr_len);
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
 	/* Zero out of bounds area in the page cache page. */
 	memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
-	flush_dcache_page(page);
 	kunmap_atomic(kaddr, KM_USER0);
-
+	flush_dcache_mft_record_page(ctx->ntfs_ino);
+	flush_dcache_page(page);
+	/* We are done with the page. */
 	end_page_writeback(page);
-
-	/* Mark the mft record dirty, so it gets written back. */
+	/* Finally, mark the mft record dirty, so it gets written back. */
 	mark_mft_record_dirty(ctx->ntfs_ino);
 	ntfs_attr_put_search_ctx(ctx);
 	unmap_mft_record(base_ni);
diff --git a/fs/ntfs/bitmap.c b/fs/ntfs/bitmap.c
index 12cf2e30c7dd..7a190cdc60e2 100644
--- a/fs/ntfs/bitmap.c
+++ b/fs/ntfs/bitmap.c
@@ -1,7 +1,7 @@
 /*
  * bitmap.c - NTFS kernel bitmap handling.  Part of the Linux-NTFS project.
  *
- * Copyright (c) 2004 Anton Altaparmakov
+ * Copyright (c) 2004-2005 Anton Altaparmakov
  *
  * This program/include file is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as published
@@ -90,7 +90,8 @@ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
 	/* If the first byte is partial, modify the appropriate bits in it. */
 	if (bit) {
 		u8 *byte = kaddr + pos;
-		while ((bit & 7) && cnt--) {
+		while ((bit & 7) && cnt) {
+			cnt--;
 			if (value)
 				*byte |= 1 << bit++;
 			else
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index dc4bbe3acf5c..7ec045131808 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1166,6 +1166,8 @@ err_out:
  *
  * Return 0 on success and -errno on error.  In the error case, the inode will
  * have had make_bad_inode() executed on it.
+ *
+ * Note this cannot be called for AT_INDEX_ALLOCATION.
  */
 static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
 {
@@ -1242,8 +1244,8 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
 			}
 		}
 		/*
-		 * The encryption flag set in an index root just means to
-		 * compress all files.
+		 * The compressed/sparse flag set in an index root just means
+		 * to compress all files.
 		 */
 		if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) {
 			ntfs_error(vi->i_sb, "Found mst protected attribute "
@@ -1319,8 +1321,7 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
 					"the mapping pairs array.");
 			goto unm_err_out;
 		}
-		if ((NInoCompressed(ni) || NInoSparse(ni)) &&
-				ni->type != AT_INDEX_ROOT) {
+		if (NInoCompressed(ni) || NInoSparse(ni)) {
 			if (a->data.non_resident.compression_unit != 4) {
 				ntfs_error(vi->i_sb, "Found nonstandard "
 						"compression unit (%u instead "
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
index 609ad1728ce4..5c248d404f05 100644
--- a/fs/ntfs/layout.h
+++ b/fs/ntfs/layout.h
@@ -123,7 +123,7 @@ enum {
 	magic_RCRD = const_cpu_to_le32(0x44524352), /* Log record page. */
 
 	/* Found in $LogFile/$DATA.  (May be found in $MFT/$DATA, also?) */
-	magic_CHKD = const_cpu_to_le32(0x424b4843), /* Modified by chkdsk. */
+	magic_CHKD = const_cpu_to_le32(0x444b4843), /* Modified by chkdsk. */
 
 	/* Found in all ntfs record containing records. */
 	magic_BAAD = const_cpu_to_le32(0x44414142), /* Failed multi sector
@@ -308,10 +308,8 @@ typedef le16 MFT_RECORD_FLAGS;
  * The _LE versions are to be applied on little endian MFT_REFs.
  * Note: The _LE versions will return a CPU endian formatted value!
  */
-typedef enum {
-	MFT_REF_MASK_CPU	= 0x0000ffffffffffffULL,
-	MFT_REF_MASK_LE		= const_cpu_to_le64(0x0000ffffffffffffULL),
-} MFT_REF_CONSTS;
+#define MFT_REF_MASK_CPU 0x0000ffffffffffffULL
+#define MFT_REF_MASK_LE const_cpu_to_le64(MFT_REF_MASK_CPU)
 
 typedef u64 MFT_REF;
 typedef le64 leMFT_REF;
diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c
index 7b5934290685..5af3bf0b7eee 100644
--- a/fs/ntfs/lcnalloc.c
+++ b/fs/ntfs/lcnalloc.c
@@ -779,14 +779,13 @@ out:
 
 /**
  * __ntfs_cluster_free - free clusters on an ntfs volume
- * @vi:		vfs inode whose runlist describes the clusters to free
- * @start_vcn:	vcn in the runlist of @vi at which to start freeing clusters
+ * @ni:		ntfs inode whose runlist describes the clusters to free
+ * @start_vcn:	vcn in the runlist of @ni at which to start freeing clusters
  * @count:	number of clusters to free or -1 for all clusters
- * @write_locked:	true if the runlist is locked for writing
  * @is_rollback:	true if this is a rollback operation
  *
  * Free @count clusters starting at the cluster @start_vcn in the runlist
- * described by the vfs inode @vi.
+ * described by the vfs inode @ni.
  *
  * If @count is -1, all clusters from @start_vcn to the end of the runlist are
  * deallocated.  Thus, to completely free all clusters in a runlist, use
@@ -801,31 +800,28 @@ out:
  * Return the number of deallocated clusters (not counting sparse ones) on
  * success and -errno on error.
  *
- * Locking: - The runlist described by @vi must be locked on entry and is
- *	      locked on return.  Note if the runlist is locked for reading the
- *	      lock may be dropped and reacquired.  Note the runlist may be
- *	      modified when needed runlist fragments need to be mapped.
+ * Locking: - The runlist described by @ni must be locked for writing on entry
+ *	      and is locked on return.  Note the runlist may be modified when
+ *	      needed runlist fragments need to be mapped.
  *	    - The volume lcn bitmap must be unlocked on entry and is unlocked
  *	      on return.
  *	    - This function takes the volume lcn bitmap lock for writing and
  *	      modifies the bitmap contents.
  */
-s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
-		const BOOL write_locked, const BOOL is_rollback)
+s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count,
+		const BOOL is_rollback)
 {
 	s64 delta, to_free, total_freed, real_freed;
-	ntfs_inode *ni;
 	ntfs_volume *vol;
 	struct inode *lcnbmp_vi;
 	runlist_element *rl;
 	int err;
 
-	BUG_ON(!vi);
+	BUG_ON(!ni);
 	ntfs_debug("Entering for i_ino 0x%lx, start_vcn 0x%llx, count "
-			"0x%llx.%s", vi->i_ino, (unsigned long long)start_vcn,
+			"0x%llx.%s", ni->mft_no, (unsigned long long)start_vcn,
 			(unsigned long long)count,
 			is_rollback ? " (rollback)" : "");
-	ni = NTFS_I(vi);
 	vol = ni->vol;
 	lcnbmp_vi = vol->lcnbmp_ino;
 	BUG_ON(!lcnbmp_vi);
@@ -843,7 +839,7 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
 
 	total_freed = real_freed = 0;
 
-	rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, write_locked);
+	rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, TRUE);
 	if (IS_ERR(rl)) {
 		if (!is_rollback)
 			ntfs_error(vol->sb, "Failed to find first runlist "
@@ -897,7 +893,7 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
 
 			/* Attempt to map runlist. */
 			vcn = rl->vcn;
-			rl = ntfs_attr_find_vcn_nolock(ni, vcn, write_locked);
+			rl = ntfs_attr_find_vcn_nolock(ni, vcn, TRUE);
 			if (IS_ERR(rl)) {
 				err = PTR_ERR(rl);
 				if (!is_rollback)
@@ -965,8 +961,7 @@ err_out:
 	 * If rollback fails, set the volume errors flag, emit an error
 	 * message, and return the error code.
 	 */
-	delta = __ntfs_cluster_free(vi, start_vcn, total_freed, write_locked,
-			TRUE);
+	delta = __ntfs_cluster_free(ni, start_vcn, total_freed, TRUE);
 	if (delta < 0) {
 		ntfs_error(vol->sb, "Failed to rollback (error %i).  Leaving "
 				"inconsistent metadata!  Unmount and run "
diff --git a/fs/ntfs/lcnalloc.h b/fs/ntfs/lcnalloc.h
index e4d7fb98d685..a6a8827882e7 100644
--- a/fs/ntfs/lcnalloc.h
+++ b/fs/ntfs/lcnalloc.h
@@ -2,7 +2,7 @@
  * lcnalloc.h - Exports for NTFS kernel cluster (de)allocation.  Part of the
  *		Linux-NTFS project.
  *
- * Copyright (c) 2004 Anton Altaparmakov
+ * Copyright (c) 2004-2005 Anton Altaparmakov
  *
  * This program/include file is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as published
@@ -28,6 +28,7 @@
 #include <linux/fs.h>
 
 #include "types.h"
+#include "inode.h"
 #include "runlist.h"
 #include "volume.h"
 
@@ -42,18 +43,17 @@ extern runlist_element *ntfs_cluster_alloc(ntfs_volume *vol,
 		const VCN start_vcn, const s64 count, const LCN start_lcn,
 		const NTFS_CLUSTER_ALLOCATION_ZONES zone);
 
-extern s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn,
-		s64 count, const BOOL write_locked, const BOOL is_rollback);
+extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
+		s64 count, const BOOL is_rollback);
 
 /**
  * ntfs_cluster_free - free clusters on an ntfs volume
- * @vi:		vfs inode whose runlist describes the clusters to free
- * @start_vcn:	vcn in the runlist of @vi at which to start freeing clusters
+ * @ni:		ntfs inode whose runlist describes the clusters to free
+ * @start_vcn:	vcn in the runlist of @ni at which to start freeing clusters
  * @count:	number of clusters to free or -1 for all clusters
- * @write_locked:	true if the runlist is locked for writing
  *
  * Free @count clusters starting at the cluster @start_vcn in the runlist
- * described by the vfs inode @vi.
+ * described by the ntfs inode @ni.
  *
  * If @count is -1, all clusters from @start_vcn to the end of the runlist are
  * deallocated.  Thus, to completely free all clusters in a runlist, use
@@ -65,19 +65,18 @@ extern s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn,
  * Return the number of deallocated clusters (not counting sparse ones) on
  * success and -errno on error.
  *
- * Locking: - The runlist described by @vi must be locked on entry and is
- *	      locked on return.  Note if the runlist is locked for reading the
- *	      lock may be dropped and reacquired.  Note the runlist may be
- *	      modified when needed runlist fragments need to be mapped.
+ * Locking: - The runlist described by @ni must be locked for writing on entry
+ *	      and is locked on return.  Note the runlist may be modified when
+ *	      needed runlist fragments need to be mapped.
  *	    - The volume lcn bitmap must be unlocked on entry and is unlocked
  *	      on return.
  *	    - This function takes the volume lcn bitmap lock for writing and
  *	      modifies the bitmap contents.
  */
-static inline s64 ntfs_cluster_free(struct inode *vi, const VCN start_vcn,
-		s64 count, const BOOL write_locked)
+static inline s64 ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
+		s64 count)
 {
-	return __ntfs_cluster_free(vi, start_vcn, count, write_locked, FALSE);
+	return __ntfs_cluster_free(ni, start_vcn, count, FALSE);
 }
 
 extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index 0173e95500d9..0fd70295cca6 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -51,7 +51,8 @@ static BOOL ntfs_check_restart_page_header(struct inode *vi,
 		RESTART_PAGE_HEADER *rp, s64 pos)
 {
 	u32 logfile_system_page_size, logfile_log_page_size;
-	u16 usa_count, usa_ofs, usa_end, ra_ofs;
+	u16 ra_ofs, usa_count, usa_ofs, usa_end = 0;
+	BOOL have_usa = TRUE;
 
 	ntfs_debug("Entering.");
 	/*
@@ -86,6 +87,14 @@ static BOOL ntfs_check_restart_page_header(struct inode *vi,
 				(int)sle16_to_cpu(rp->minor_ver));
 		return FALSE;
 	}
+	/*
+	 * If chkdsk has been run the restart page may not be protected by an
+	 * update sequence array.
+	 */
+	if (ntfs_is_chkd_record(rp->magic) && !le16_to_cpu(rp->usa_count)) {
+		have_usa = FALSE;
+		goto skip_usa_checks;
+	}
 	/* Verify the size of the update sequence array. */
 	usa_count = 1 + (logfile_system_page_size >> NTFS_BLOCK_SIZE_BITS);
 	if (usa_count != le16_to_cpu(rp->usa_count)) {
@@ -102,6 +111,7 @@ static BOOL ntfs_check_restart_page_header(struct inode *vi,
 				"inconsistent update sequence array offset.");
 		return FALSE;
 	}
+skip_usa_checks:
 	/*
 	 * Verify the position of the restart area.  It must be:
 	 *	- aligned to 8-byte boundary,
@@ -109,7 +119,8 @@ static BOOL ntfs_check_restart_page_header(struct inode *vi,
 	 *	- within the system page size.
 	 */
 	ra_ofs = le16_to_cpu(rp->restart_area_offset);
-	if (ra_ofs & 7 || ra_ofs < usa_end ||
+	if (ra_ofs & 7 || (have_usa ? ra_ofs < usa_end :
+			ra_ofs < sizeof(RESTART_PAGE_HEADER)) ||
 			ra_ofs > logfile_system_page_size) {
 		ntfs_error(vi->i_sb, "$LogFile restart page specifies "
 				"inconsistent restart area offset.");
@@ -402,8 +413,12 @@ static int ntfs_check_and_load_restart_page(struct inode *vi,
 			idx++;
 		} while (to_read > 0);
 	}
-	/* Perform the multi sector transfer deprotection on the buffer. */
-	if (post_read_mst_fixup((NTFS_RECORD*)trp,
+	/*
+	 * Perform the multi sector transfer deprotection on the buffer if the
+	 * restart page is protected.
+	 */
+	if ((!ntfs_is_chkd_record(trp->magic) || le16_to_cpu(trp->usa_count))
+			&& post_read_mst_fixup((NTFS_RECORD*)trp,
 			le32_to_cpu(rp->system_page_size))) {
 		/*
 		 * A multi sector tranfer error was detected.  We only need to
@@ -615,11 +630,16 @@ is_empty:
 		 * Otherwise just throw it away.
 		 */
 		if (rstr2_lsn > rstr1_lsn) {
+			ntfs_debug("Using second restart page as it is more "
+					"recent.");
 			ntfs_free(rstr1_ph);
 			rstr1_ph = rstr2_ph;
 			/* rstr1_lsn = rstr2_lsn; */
-		} else
+		} else {
+			ntfs_debug("Using first restart page as it is more "
+					"recent.");
 			ntfs_free(rstr2_ph);
+		}
 		rstr2_ph = NULL;
 	}
 	/* All consistency checks passed. */
diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h
index 42388f95ea6d..a51f3dd0e9eb 100644
--- a/fs/ntfs/logfile.h
+++ b/fs/ntfs/logfile.h
@@ -113,7 +113,7 @@ typedef struct {
  */
 enum {
 	RESTART_VOLUME_IS_CLEAN	= const_cpu_to_le16(0x0002),
-	RESTART_SPACE_FILLER	= 0xffff, /* gcc: Force enum bit width to 16. */
+	RESTART_SPACE_FILLER	= const_cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */
 } __attribute__ ((__packed__));
 
 typedef le16 RESTART_AREA_FLAGS;
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
index 3288bcc2c4aa..590887b943f5 100644
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -1,7 +1,7 @@
 /*
  * malloc.h - NTFS kernel memory handling. Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2001-2005 Anton Altaparmakov
  *
  * This program/include file is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as published
@@ -40,7 +40,7 @@
  * Depending on @gfp_mask the allocation may be guaranteed to succeed.
  */
 static inline void *__ntfs_malloc(unsigned long size,
-		unsigned int __nocast gfp_mask)
+		gfp_t gfp_mask)
 {
 	if (likely(size <= PAGE_SIZE)) {
 		BUG_ON(!size);
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 2c32b84385a8..b011369b5956 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -58,7 +58,8 @@ static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni)
 	 * overflowing the unsigned long, but I don't think we would ever get
 	 * here if the volume was that big...
 	 */
-	index = ni->mft_no << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT;
+	index = (u64)ni->mft_no << vol->mft_record_size_bits >>
+			PAGE_CACHE_SHIFT;
 	ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
 
 	i_size = i_size_read(mft_vi);
@@ -1953,7 +1954,7 @@ restore_undo_alloc:
 	a = ctx->attr;
 	a->data.non_resident.highest_vcn = cpu_to_sle64(old_last_vcn - 1);
 undo_alloc:
-	if (ntfs_cluster_free(vol->mft_ino, old_last_vcn, -1, TRUE) < 0) {
+	if (ntfs_cluster_free(mft_ni, old_last_vcn, -1) < 0) {
 		ntfs_error(vol->sb, "Failed to free clusters from mft data "
 				"attribute.%s", es);
 		NVolSetErrors(vol);
diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c
index f5b2ac929081..061b5ff6b73c 100644
--- a/fs/ntfs/runlist.c
+++ b/fs/ntfs/runlist.c
@@ -2,7 +2,7 @@
  * runlist.c - NTFS runlist handling code.  Part of the Linux-NTFS project.
  *
  * Copyright (c) 2001-2005 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
+ * Copyright (c) 2002-2005 Richard Russon
  *
  * This program/include file is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as published
@@ -158,17 +158,21 @@ static inline BOOL ntfs_are_rl_mergeable(runlist_element *dst,
 	BUG_ON(!dst);
 	BUG_ON(!src);
 
-	if ((dst->lcn < 0) || (src->lcn < 0)) {   /* Are we merging holes? */
-		if (dst->lcn == LCN_HOLE && src->lcn == LCN_HOLE)
-			return TRUE;
+	/* We can merge unmapped regions even if they are misaligned. */
+	if ((dst->lcn == LCN_RL_NOT_MAPPED) && (src->lcn == LCN_RL_NOT_MAPPED))
+		return TRUE;
+	/* If the runs are misaligned, we cannot merge them. */
+	if ((dst->vcn + dst->length) != src->vcn)
 		return FALSE;
-	}
-	if ((dst->lcn + dst->length) != src->lcn) /* Are the runs contiguous? */
-		return FALSE;
-	if ((dst->vcn + dst->length) != src->vcn) /* Are the runs misaligned? */
-		return FALSE;
-
-	return TRUE;
+	/* If both runs are non-sparse and contiguous, we can merge them. */
+	if ((dst->lcn >= 0) && (src->lcn >= 0) &&
+			((dst->lcn + dst->length) == src->lcn))
+		return TRUE;
+	/* If we are merging two holes, we can merge them. */
+	if ((dst->lcn == LCN_HOLE) && (src->lcn == LCN_HOLE))
+		return TRUE;
+	/* Cannot merge. */
+	return FALSE;
 }
 
 /**
@@ -214,14 +218,15 @@ static inline void __ntfs_rl_merge(runlist_element *dst, runlist_element *src)
 static inline runlist_element *ntfs_rl_append(runlist_element *dst,
 		int dsize, runlist_element *src, int ssize, int loc)
 {
-	BOOL right;
-	int magic;
+	BOOL right = FALSE;	/* Right end of @src needs merging. */
+	int marker;		/* End of the inserted runs. */
 
 	BUG_ON(!dst);
 	BUG_ON(!src);
 
 	/* First, check if the right hand end needs merging. */
-	right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1);
+	if ((loc + 1) < dsize)
+		right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1);
 
 	/* Space required: @dst size + @src size, less one if we merged. */
 	dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - right);
@@ -236,18 +241,19 @@ static inline runlist_element *ntfs_rl_append(runlist_element *dst,
 	if (right)
 		__ntfs_rl_merge(src + ssize - 1, dst + loc + 1);
 
-	magic = loc + ssize;
+	/* First run after the @src runs that have been inserted. */
+	marker = loc + ssize + 1;
 
 	/* Move the tail of @dst out of the way, then copy in @src. */
-	ntfs_rl_mm(dst, magic + 1, loc + 1 + right, dsize - loc - 1 - right);
+	ntfs_rl_mm(dst, marker, loc + 1 + right, dsize - (loc + 1 + right));
 	ntfs_rl_mc(dst, loc + 1, src, 0, ssize);
 
 	/* Adjust the size of the preceding hole. */
 	dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn;
 
 	/* We may have changed the length of the file, so fix the end marker */
-	if (dst[magic + 1].lcn == LCN_ENOENT)
-		dst[magic + 1].vcn = dst[magic].vcn + dst[magic].length;
+	if (dst[marker].lcn == LCN_ENOENT)
+		dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length;
 
 	return dst;
 }
@@ -279,18 +285,17 @@ static inline runlist_element *ntfs_rl_append(runlist_element *dst,
 static inline runlist_element *ntfs_rl_insert(runlist_element *dst,
 		int dsize, runlist_element *src, int ssize, int loc)
 {
-	BOOL left = FALSE;
-	BOOL disc = FALSE;	/* Discontinuity */
-	BOOL hole = FALSE;	/* Following a hole */
-	int magic;
+	BOOL left = FALSE;	/* Left end of @src needs merging. */
+	BOOL disc = FALSE;	/* Discontinuity between @dst and @src. */
+	int marker;		/* End of the inserted runs. */
 
 	BUG_ON(!dst);
 	BUG_ON(!src);
 
-	/* disc => Discontinuity between the end of @dst and the start of @src.
-	 *	   This means we might need to insert a hole.
-	 * hole => @dst ends with a hole or an unmapped region which we can
-	 *	   extend to match the discontinuity. */
+	/*
+	 * disc => Discontinuity between the end of @dst and the start of @src.
+	 *	   This means we might need to insert a "not mapped" run.
+	 */
 	if (loc == 0)
 		disc = (src[0].vcn > 0);
 	else {
@@ -303,58 +308,49 @@ static inline runlist_element *ntfs_rl_insert(runlist_element *dst,
 			merged_length += src->length;
 
 		disc = (src[0].vcn > dst[loc - 1].vcn + merged_length);
-		if (disc)
-			hole = (dst[loc - 1].lcn == LCN_HOLE);
 	}
-
-	/* Space required: @dst size + @src size, less one if we merged, plus
-	 * one if there was a discontinuity, less one for a trailing hole. */
-	dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - left + disc - hole);
+	/*
+	 * Space required: @dst size + @src size, less one if we merged, plus
+	 * one if there was a discontinuity.
+	 */
+	dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - left + disc);
 	if (IS_ERR(dst))
 		return dst;
 	/*
 	 * We are guaranteed to succeed from here so can start modifying the
 	 * original runlist.
 	 */
-
 	if (left)
 		__ntfs_rl_merge(dst + loc - 1, src);
-
-	magic = loc + ssize - left + disc - hole;
+	/*
+	 * First run after the @src runs that have been inserted.
+	 * Nominally,  @marker equals @loc + @ssize, i.e. location + number of
+	 * runs in @src.  However, if @left, then the first run in @src has
+	 * been merged with one in @dst.  And if @disc, then @dst and @src do
+	 * not meet and we need an extra run to fill the gap.
+	 */
+	marker = loc + ssize - left + disc;
 
 	/* Move the tail of @dst out of the way, then copy in @src. */
-	ntfs_rl_mm(dst, magic, loc, dsize - loc);
-	ntfs_rl_mc(dst, loc + disc - hole, src, left, ssize - left);
+	ntfs_rl_mm(dst, marker, loc, dsize - loc);
+	ntfs_rl_mc(dst, loc + disc, src, left, ssize - left);
 
-	/* Adjust the VCN of the last run ... */
-	if (dst[magic].lcn <= LCN_HOLE)
-		dst[magic].vcn = dst[magic - 1].vcn + dst[magic - 1].length;
+	/* Adjust the VCN of the first run after the insertion... */
+	dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length;
 	/* ... and the length. */
-	if (dst[magic].lcn == LCN_HOLE || dst[magic].lcn == LCN_RL_NOT_MAPPED)
-		dst[magic].length = dst[magic + 1].vcn - dst[magic].vcn;
+	if (dst[marker].lcn == LCN_HOLE || dst[marker].lcn == LCN_RL_NOT_MAPPED)
+		dst[marker].length = dst[marker + 1].vcn - dst[marker].vcn;
 
-	/* Writing beyond the end of the file and there's a discontinuity. */
+	/* Writing beyond the end of the file and there is a discontinuity. */
 	if (disc) {
-		if (hole)
-			dst[loc - 1].length = dst[loc].vcn - dst[loc - 1].vcn;
-		else {
-			if (loc > 0) {
-				dst[loc].vcn = dst[loc - 1].vcn +
-						dst[loc - 1].length;
-				dst[loc].length = dst[loc + 1].vcn -
-						dst[loc].vcn;
-			} else {
-				dst[loc].vcn = 0;
-				dst[loc].length = dst[loc + 1].vcn;
-			}
-			dst[loc].lcn = LCN_RL_NOT_MAPPED;
+		if (loc > 0) {
+			dst[loc].vcn = dst[loc - 1].vcn + dst[loc - 1].length;
+			dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn;
+		} else {
+			dst[loc].vcn = 0;
+			dst[loc].length = dst[loc + 1].vcn;
 		}
-
-		magic += hole;
-
-		if (dst[magic].lcn == LCN_ENOENT)
-			dst[magic].vcn = dst[magic - 1].vcn +
-					dst[magic - 1].length;
+		dst[loc].lcn = LCN_RL_NOT_MAPPED;
 	}
 	return dst;
 }
@@ -385,20 +381,23 @@ static inline runlist_element *ntfs_rl_insert(runlist_element *dst,
 static inline runlist_element *ntfs_rl_replace(runlist_element *dst,
 		int dsize, runlist_element *src, int ssize, int loc)
 {
-	BOOL left = FALSE;
-	BOOL right;
-	int magic;
+	BOOL left = FALSE;	/* Left end of @src needs merging. */
+	BOOL right = FALSE;	/* Right end of @src needs merging. */
+	int tail;		/* Start of tail of @dst. */
+	int marker;		/* End of the inserted runs. */
 
 	BUG_ON(!dst);
 	BUG_ON(!src);
 
-	/* First, merge the left and right ends, if necessary. */
-	right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1);
+	/* First, see if the left and right ends need merging. */
+	if ((loc + 1) < dsize)
+		right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1);
 	if (loc > 0)
 		left = ntfs_are_rl_mergeable(dst + loc - 1, src);
-
-	/* Allocate some space. We'll need less if the left, right, or both
-	 * ends were merged. */
+	/*
+	 * Allocate some space.  We will need less if the left, right, or both
+	 * ends get merged.
+	 */
 	dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - left - right);
 	if (IS_ERR(dst))
 		return dst;
@@ -406,21 +405,37 @@ static inline runlist_element *ntfs_rl_replace(runlist_element *dst,
 	 * We are guaranteed to succeed from here so can start modifying the
 	 * original runlists.
 	 */
+
+	/* First, merge the left and right ends, if necessary. */
 	if (right)
 		__ntfs_rl_merge(src + ssize - 1, dst + loc + 1);
 	if (left)
 		__ntfs_rl_merge(dst + loc - 1, src);
-
-	/* FIXME: What does this mean? (AIA) */
-	magic = loc + ssize - left;
+	/*
+	 * Offset of the tail of @dst.  This needs to be moved out of the way
+	 * to make space for the runs to be copied from @src, i.e. the first
+	 * run of the tail of @dst.
+	 * Nominally, @tail equals @loc + 1, i.e. location, skipping the
+	 * replaced run.  However, if @right, then one of @dst's runs is
+	 * already merged into @src.
+	 */
+	tail = loc + right + 1;
+	/*
+	 * First run after the @src runs that have been inserted, i.e. where
+	 * the tail of @dst needs to be moved to.
+	 * Nominally, @marker equals @loc + @ssize, i.e. location + number of
+	 * runs in @src.  However, if @left, then the first run in @src has
+	 * been merged with one in @dst.
+	 */
+	marker = loc + ssize - left;
 
 	/* Move the tail of @dst out of the way, then copy in @src. */
-	ntfs_rl_mm(dst, magic, loc + right + 1, dsize - loc - right - 1);
+	ntfs_rl_mm(dst, marker, tail, dsize - tail);
 	ntfs_rl_mc(dst, loc, src, left, ssize - left);
 
-	/* We may have changed the length of the file, so fix the end marker */
-	if (dst[magic].lcn == LCN_ENOENT)
-		dst[magic].vcn = dst[magic - 1].vcn + dst[magic - 1].length;
+	/* We may have changed the length of the file, so fix the end marker. */
+	if (dsize - tail > 0 && dst[marker].lcn == LCN_ENOENT)
+		dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length;
 	return dst;
 }
 
diff --git a/fs/ntfs/unistr.c b/fs/ntfs/unistr.c
index a389a5a16c84..0ea887fc859c 100644
--- a/fs/ntfs/unistr.c
+++ b/fs/ntfs/unistr.c
@@ -1,7 +1,7 @@
 /*
  * unistr.c - NTFS Unicode string handling. Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2001-2005 Anton Altaparmakov
  *
  * This program/include file is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as published
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 296480e96dd5..6c8dcf7613fd 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -35,7 +35,7 @@ EXPORT_SYMBOL(posix_acl_permission);
  * Allocate a new ACL with the specified number of entries.
  */
 struct posix_acl *
-posix_acl_alloc(int count, unsigned int __nocast flags)
+posix_acl_alloc(int count, gfp_t flags)
 {
 	const size_t size = sizeof(struct posix_acl) +
 	                    count * sizeof(struct posix_acl_entry);
@@ -51,7 +51,7 @@ posix_acl_alloc(int count, unsigned int __nocast flags)
  * Clone an ACL.
  */
 struct posix_acl *
-posix_acl_clone(const struct posix_acl *acl, unsigned int __nocast flags)
+posix_acl_clone(const struct posix_acl *acl, gfp_t flags)
 {
 	struct posix_acl *clone = NULL;
 
@@ -185,7 +185,7 @@ posix_acl_equiv_mode(const struct posix_acl *acl, mode_t *mode_p)
  * Create an ACL representing the file mode permission bits of an inode.
  */
 struct posix_acl *
-posix_acl_from_mode(mode_t mode, unsigned int __nocast flags)
+posix_acl_from_mode(mode_t mode, gfp_t flags)
 {
 	struct posix_acl *acl = posix_acl_alloc(3, flags);
 	if (!acl)
diff --git a/fs/proc/array.c b/fs/proc/array.c
index d88d518d30f6..d84eecacbeaf 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -74,6 +74,7 @@
 #include <linux/file.h>
 #include <linux/times.h>
 #include <linux/cpuset.h>
+#include <linux/rcupdate.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -180,12 +181,14 @@ static inline char * task_state(struct task_struct *p, char *buffer)
 		p->gid, p->egid, p->sgid, p->fsgid);
 	read_unlock(&tasklist_lock);
 	task_lock(p);
+	rcu_read_lock();
 	if (p->files)
 		fdt = files_fdtable(p->files);
 	buffer += sprintf(buffer,
 		"FDSize:\t%d\n"
 		"Groups:\t",
 		fdt ? fdt->max_fds : 0);
+	rcu_read_unlock();
 
 	group_info = p->group_info;
 	get_group_info(group_info);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 23db452ab428..a170450aadb1 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -103,7 +103,9 @@ enum pid_directory_inos {
 	PROC_TGID_NUMA_MAPS,
 	PROC_TGID_MOUNTS,
 	PROC_TGID_WCHAN,
+#ifdef CONFIG_MMU
 	PROC_TGID_SMAPS,
+#endif
 #ifdef CONFIG_SCHEDSTATS
 	PROC_TGID_SCHEDSTAT,
 #endif
@@ -141,7 +143,9 @@ enum pid_directory_inos {
 	PROC_TID_NUMA_MAPS,
 	PROC_TID_MOUNTS,
 	PROC_TID_WCHAN,
+#ifdef CONFIG_MMU
 	PROC_TID_SMAPS,
+#endif
 #ifdef CONFIG_SCHEDSTATS
 	PROC_TID_SCHEDSTAT,
 #endif
@@ -195,7 +199,9 @@ static struct pid_entry tgid_base_stuff[] = {
 	E(PROC_TGID_ROOT,      "root",    S_IFLNK|S_IRWXUGO),
 	E(PROC_TGID_EXE,       "exe",     S_IFLNK|S_IRWXUGO),
 	E(PROC_TGID_MOUNTS,    "mounts",  S_IFREG|S_IRUGO),
+#ifdef CONFIG_MMU
 	E(PROC_TGID_SMAPS,     "smaps",   S_IFREG|S_IRUGO),
+#endif
 #ifdef CONFIG_SECURITY
 	E(PROC_TGID_ATTR,      "attr",    S_IFDIR|S_IRUGO|S_IXUGO),
 #endif
@@ -235,7 +241,9 @@ static struct pid_entry tid_base_stuff[] = {
 	E(PROC_TID_ROOT,       "root",    S_IFLNK|S_IRWXUGO),
 	E(PROC_TID_EXE,        "exe",     S_IFLNK|S_IRWXUGO),
 	E(PROC_TID_MOUNTS,     "mounts",  S_IFREG|S_IRUGO),
+#ifdef CONFIG_MMU
 	E(PROC_TID_SMAPS,      "smaps",   S_IFREG|S_IRUGO),
+#endif
 #ifdef CONFIG_SECURITY
 	E(PROC_TID_ATTR,       "attr",    S_IFDIR|S_IRUGO|S_IXUGO),
 #endif
@@ -340,6 +348,54 @@ static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vf
 	return result;
 }
 
+
+/* Same as proc_root_link, but this addionally tries to get fs from other
+ * threads in the group */
+static int proc_task_root_link(struct inode *inode, struct dentry **dentry,
+				struct vfsmount **mnt)
+{
+	struct fs_struct *fs;
+	int result = -ENOENT;
+	struct task_struct *leader = proc_task(inode);
+
+	task_lock(leader);
+	fs = leader->fs;
+	if (fs) {
+		atomic_inc(&fs->count);
+		task_unlock(leader);
+	} else {
+		/* Try to get fs from other threads */
+		task_unlock(leader);
+		read_lock(&tasklist_lock);
+		if (pid_alive(leader)) {
+			struct task_struct *task = leader;
+
+			while ((task = next_thread(task)) != leader) {
+				task_lock(task);
+				fs = task->fs;
+				if (fs) {
+					atomic_inc(&fs->count);
+					task_unlock(task);
+					break;
+				}
+				task_unlock(task);
+			}
+		}
+		read_unlock(&tasklist_lock);
+	}
+
+	if (fs) {
+		read_lock(&fs->lock);
+		*mnt = mntget(fs->rootmnt);
+		*dentry = dget(fs->root);
+		read_unlock(&fs->lock);
+		result = 0;
+		put_fs_struct(fs);
+	}
+	return result;
+}
+
+
 #define MAY_PTRACE(task) \
 	(task == current || \
 	(task->parent == current && \
@@ -471,14 +527,14 @@ static int proc_oom_score(struct task_struct *task, char *buffer)
 
 /* permission checks */
 
-static int proc_check_root(struct inode *inode)
+/* If the process being read is separated by chroot from the reading process,
+ * don't let the reader access the threads.
+ */
+static int proc_check_chroot(struct dentry *root, struct vfsmount *vfsmnt)
 {
-	struct dentry *de, *base, *root;
-	struct vfsmount *our_vfsmnt, *vfsmnt, *mnt;
+	struct dentry *de, *base;
+	struct vfsmount *our_vfsmnt, *mnt;
 	int res = 0;
-
-	if (proc_root_link(inode, &root, &vfsmnt)) /* Ewww... */
-		return -ENOENT;
 	read_lock(&current->fs->lock);
 	our_vfsmnt = mntget(current->fs->rootmnt);
 	base = dget(current->fs->root);
@@ -511,6 +567,16 @@ out:
 	goto exit;
 }
 
+static int proc_check_root(struct inode *inode)
+{
+	struct dentry *root;
+	struct vfsmount *vfsmnt;
+
+	if (proc_root_link(inode, &root, &vfsmnt)) /* Ewww... */
+		return -ENOENT;
+	return proc_check_chroot(root, vfsmnt);
+}
+
 static int proc_permission(struct inode *inode, int mask, struct nameidata *nd)
 {
 	if (generic_permission(inode, mask, NULL) != 0)
@@ -518,6 +584,20 @@ static int proc_permission(struct inode *inode, int mask, struct nameidata *nd)
 	return proc_check_root(inode);
 }
 
+static int proc_task_permission(struct inode *inode, int mask, struct nameidata *nd)
+{
+	struct dentry *root;
+	struct vfsmount *vfsmnt;
+
+	if (generic_permission(inode, mask, NULL) != 0)
+		return -EACCES;
+
+	if (proc_task_root_link(inode, &root, &vfsmnt))
+		return -ENOENT;
+
+	return proc_check_chroot(root, vfsmnt);
+}
+
 extern struct seq_operations proc_pid_maps_op;
 static int maps_open(struct inode *inode, struct file *file)
 {
@@ -558,6 +638,7 @@ static struct file_operations proc_numa_maps_operations = {
 };
 #endif
 
+#ifdef CONFIG_MMU
 extern struct seq_operations proc_pid_smaps_op;
 static int smaps_open(struct inode *inode, struct file *file)
 {
@@ -576,6 +657,7 @@ static struct file_operations proc_smaps_operations = {
 	.llseek		= seq_lseek,
 	.release	= seq_release,
 };
+#endif
 
 extern struct seq_operations mounts_op;
 static int mounts_open(struct inode *inode, struct file *file)
@@ -1419,7 +1501,7 @@ static struct inode_operations proc_fd_inode_operations = {
 
 static struct inode_operations proc_task_inode_operations = {
 	.lookup		= proc_task_lookup,
-	.permission	= proc_permission,
+	.permission	= proc_task_permission,
 };
 
 #ifdef CONFIG_SECURITY
@@ -1609,10 +1691,12 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
 		case PROC_TGID_MOUNTS:
 			inode->i_fop = &proc_mounts_operations;
 			break;
+#ifdef CONFIG_MMU
 		case PROC_TID_SMAPS:
 		case PROC_TGID_SMAPS:
 			inode->i_fop = &proc_smaps_operations;
 			break;
+#endif
 #ifdef CONFIG_SECURITY
 		case PROC_TID_ATTR:
 			inode->i_nlink = 2;
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index f3bf016d5ee3..cff10ab1af63 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -91,6 +91,7 @@ static void *nommu_vma_list_start(struct seq_file *m, loff_t *_pos)
 			next = _rb;
 			break;
 		}
+		pos--;
 	}
 
 	return next;
diff --git a/fs/read_write.c b/fs/read_write.c
index b60324aaa2b6..a091ee4f430d 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -499,6 +499,9 @@ static ssize_t do_readv_writev(int type, struct file *file,
 	ret = rw_verify_area(type, file, pos, tot_len);
 	if (ret)
 		goto out;
+	ret = security_file_permission(file, type == READ ? MAY_READ : MAY_WRITE);
+	if (ret)
+		goto out;
 
 	fnv = NULL;
 	if (type == READ) {
diff --git a/fs/relayfs/buffers.c b/fs/relayfs/buffers.c
index 2aa8e2719999..84e21ffa5ca8 100644
--- a/fs/relayfs/buffers.c
+++ b/fs/relayfs/buffers.c
@@ -109,7 +109,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, unsigned long size)
 		if (unlikely(!buf->page_array[i]))
 			goto depopulate;
 	}
-	mem = vmap(buf->page_array, n_pages, GFP_KERNEL, PAGE_KERNEL);
+	mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL);
 	if (!mem)
 		goto depopulate;
 
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index 4b184559f231..d2653b589b1c 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -45,7 +45,7 @@
 
 
 void *
-kmem_alloc(size_t size, unsigned int __nocast flags)
+kmem_alloc(size_t size, gfp_t flags)
 {
 	int		retries = 0;
 	unsigned int	lflags = kmem_flags_convert(flags);
@@ -67,7 +67,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
 }
 
 void *
-kmem_zalloc(size_t size, unsigned int __nocast flags)
+kmem_zalloc(size_t size, gfp_t flags)
 {
 	void	*ptr;
 
@@ -90,7 +90,7 @@ kmem_free(void *ptr, size_t size)
 
 void *
 kmem_realloc(void *ptr, size_t newsize, size_t oldsize,
-	     unsigned int __nocast flags)
+	     gfp_t flags)
 {
 	void	*new;
 
@@ -105,7 +105,7 @@ kmem_realloc(void *ptr, size_t newsize, size_t oldsize,
 }
 
 void *
-kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
+kmem_zone_alloc(kmem_zone_t *zone, gfp_t flags)
 {
 	int		retries = 0;
 	unsigned int	lflags = kmem_flags_convert(flags);
@@ -124,7 +124,7 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
 }
 
 void *
-kmem_zone_zalloc(kmem_zone_t *zone, unsigned int __nocast flags)
+kmem_zone_zalloc(kmem_zone_t *zone, gfp_t flags)
 {
 	void	*ptr;
 
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index 109fcf27e256..ee7010f085bc 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -81,7 +81,7 @@ typedef unsigned long xfs_pflags_t;
 	*(NSTATEP) = *(OSTATEP);	\
 } while (0)
 
-static __inline unsigned int kmem_flags_convert(unsigned int __nocast flags)
+static __inline unsigned int kmem_flags_convert(gfp_t flags)
 {
 	unsigned int	lflags = __GFP_NOWARN;	/* we'll report problems, if need be */
 
@@ -125,13 +125,12 @@ kmem_zone_destroy(kmem_zone_t *zone)
 		BUG();
 }
 
-extern void	    *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast);
-extern void	    *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast);
+extern void	    *kmem_zone_zalloc(kmem_zone_t *, gfp_t);
+extern void	    *kmem_zone_alloc(kmem_zone_t *, gfp_t);
 
-extern void	    *kmem_alloc(size_t, unsigned int __nocast);
-extern void	    *kmem_realloc(void *, size_t, size_t,
-				  unsigned int __nocast);
-extern void	    *kmem_zalloc(size_t, unsigned int __nocast);
+extern void	    *kmem_alloc(size_t, gfp_t);
+extern void	    *kmem_realloc(void *, size_t, size_t, gfp_t);
+extern void	    *kmem_zalloc(size_t, gfp_t);
 extern void         kmem_free(void *, size_t);
 
 typedef struct shrinker *kmem_shaker_t;