summaryrefslogtreecommitdiff
path: root/fs/nfs
diff options
context:
space:
mode:
authorTrond Myklebust <Trond.Myklebust@netapp.com>2007-04-02 19:29:52 -0400
committerTrond Myklebust <Trond.Myklebust@netapp.com>2007-04-30 22:17:06 -0700
commitc63c7b051395368573779c8309aa5c990dcf2f96 (patch)
treedb54090eef99349d15b95fcd8c2620a2403d8db8 /fs/nfs
parent8b09bee3083897e375bd0bf9d60f48daedfab3e0 (diff)
downloadkernel-common-c63c7b051395368573779c8309aa5c990dcf2f96.tar.gz
kernel-common-c63c7b051395368573779c8309aa5c990dcf2f96.tar.bz2
kernel-common-c63c7b051395368573779c8309aa5c990dcf2f96.zip
NFS: Fix a race when doing NFS write coalescing
Currently we do write coalescing in a very inefficient manner: one pass in generic_writepages() in order to lock the pages for writing, then one pass in nfs_flush_mapping() and/or nfs_sync_mapping_wait() in order to gather the locked pages for coalescing into RPC requests of size "wsize". In fact, it turns out there is actually a deadlock possible here since we only start I/O on the second pass. If the user signals the process while we're in nfs_sync_mapping_wait(), for instance, then we may exit before starting I/O on all the requests that have been queued up. Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Diffstat (limited to 'fs/nfs')
-rw-r--r--fs/nfs/pagelist.c92
-rw-r--r--fs/nfs/write.c149
2 files changed, 47 insertions, 194 deletions
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 094537ddd344..ea1a85df9ab1 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -17,7 +17,6 @@
#include <linux/nfs_page.h>
#include <linux/nfs_fs.h>
#include <linux/nfs_mount.h>
-#include <linux/writeback.h>
#define NFS_PARANOIA 1
@@ -354,25 +353,6 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
}
/**
- * nfs_pageio_add_list - Split coalesced requests out from a list.
- * @desc: destination io descriptor
- * @head: source list
- *
- * Moves a maximum of 'nmax' elements from one list to another.
- * The elements are checked to ensure that they form a contiguous set
- * of pages, and that the RPC credentials are the same.
- */
-void nfs_pageio_add_list(struct nfs_pageio_descriptor *desc,
- struct list_head *head)
-{
- while (!list_empty(head)) {
- struct nfs_page *req = nfs_list_entry(head->next);
- if (!nfs_pageio_add_request(desc, req))
- break;
- }
-}
-
-/**
* nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor
* @desc: pointer to io descriptor
*/
@@ -383,78 +363,6 @@ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
#define NFS_SCAN_MAXENTRIES 16
/**
- * nfs_scan_dirty - Scan the radix tree for dirty requests
- * @mapping: pointer to address space
- * @wbc: writeback_control structure
- * @dst: Destination list
- *
- * Moves elements from one of the inode request lists.
- * If the number of requests is set to 0, the entire address_space
- * starting at index idx_start, is scanned.
- * The requests are *not* checked to ensure that they form a contiguous set.
- * You must be holding the inode's req_lock when calling this function
- */
-long nfs_scan_dirty(struct address_space *mapping,
- struct writeback_control *wbc,
- struct list_head *dst)
-{
- struct nfs_inode *nfsi = NFS_I(mapping->host);
- struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
- struct nfs_page *req;
- pgoff_t idx_start, idx_end;
- long res = 0;
- int found, i;
-
- if (nfsi->ndirty == 0)
- return 0;
- if (wbc->range_cyclic) {
- idx_start = 0;
- idx_end = ULONG_MAX;
- } else if (wbc->range_end == 0) {
- idx_start = wbc->range_start >> PAGE_CACHE_SHIFT;
- idx_end = ULONG_MAX;
- } else {
- idx_start = wbc->range_start >> PAGE_CACHE_SHIFT;
- idx_end = wbc->range_end >> PAGE_CACHE_SHIFT;
- }
-
- for (;;) {
- unsigned int toscan = NFS_SCAN_MAXENTRIES;
-
- found = radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree,
- (void **)&pgvec[0], idx_start, toscan,
- NFS_PAGE_TAG_DIRTY);
-
- /* Did we make progress? */
- if (found <= 0)
- break;
-
- for (i = 0; i < found; i++) {
- req = pgvec[i];
- if (!wbc->range_cyclic && req->wb_index > idx_end)
- goto out;
-
- /* Try to lock request and mark it for writeback */
- if (!nfs_set_page_writeback_locked(req))
- goto next;
- radix_tree_tag_clear(&nfsi->nfs_page_tree,
- req->wb_index, NFS_PAGE_TAG_DIRTY);
- nfsi->ndirty--;
- nfs_list_remove_request(req);
- nfs_list_add_request(req, dst);
- res++;
- if (res == LONG_MAX)
- goto out;
-next:
- idx_start = req->wb_index + 1;
- }
- }
-out:
- WARN_ON ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty));
- return res;
-}
-
-/**
* nfs_scan_list - Scan a list for matching requests
* @nfsi: NFS inode
* @head: One of the NFS inode request lists
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index b6749967eead..6ce2d94e7b3f 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -38,7 +38,8 @@
static struct nfs_page * nfs_update_request(struct nfs_open_context*,
struct page *,
unsigned int, unsigned int);
-static long nfs_flush_mapping(struct address_space *mapping, struct writeback_control *wbc, int how);
+static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc,
+ struct inode *inode, int ioflags);
static const struct rpc_call_ops nfs_write_partial_ops;
static const struct rpc_call_ops nfs_write_full_ops;
static const struct rpc_call_ops nfs_commit_ops;
@@ -201,7 +202,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
static int wb_priority(struct writeback_control *wbc)
{
if (wbc->for_reclaim)
- return FLUSH_HIGHPRI;
+ return FLUSH_HIGHPRI | FLUSH_STABLE;
if (wbc->for_kupdate)
return FLUSH_LOWPRI;
return 0;
@@ -251,7 +252,8 @@ static void nfs_end_page_writeback(struct page *page)
* was not tagged.
* May also return an error if the user signalled nfs_wait_on_request().
*/
-static int nfs_page_mark_flush(struct page *page)
+static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
+ struct page *page)
{
struct nfs_page *req;
struct nfs_inode *nfsi = NFS_I(page->mapping->host);
@@ -273,6 +275,8 @@ static int nfs_page_mark_flush(struct page *page)
* request as dirty (in which case we don't care).
*/
spin_unlock(req_lock);
+ /* Prevent deadlock! */
+ nfs_pageio_complete(pgio);
ret = nfs_wait_on_request(req);
nfs_release_request(req);
if (ret != 0)
@@ -283,21 +287,18 @@ static int nfs_page_mark_flush(struct page *page)
/* This request is marked for commit */
spin_unlock(req_lock);
nfs_unlock_request(req);
+ nfs_pageio_complete(pgio);
return 1;
}
- if (nfs_set_page_writeback(page) == 0) {
- nfs_list_remove_request(req);
- /* add the request to the inode's dirty list. */
- radix_tree_tag_set(&nfsi->nfs_page_tree,
- req->wb_index, NFS_PAGE_TAG_DIRTY);
- nfs_list_add_request(req, &nfsi->dirty);
- nfsi->ndirty++;
- spin_unlock(req_lock);
- __mark_inode_dirty(page->mapping->host, I_DIRTY_PAGES);
- } else
+ if (nfs_set_page_writeback(page) != 0) {
spin_unlock(req_lock);
+ BUG();
+ }
+ radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
+ NFS_PAGE_TAG_WRITEBACK);
ret = test_bit(PG_NEED_FLUSH, &req->wb_flags);
- nfs_unlock_request(req);
+ spin_unlock(req_lock);
+ nfs_pageio_add_request(pgio, req);
return ret;
}
@@ -306,6 +307,7 @@ static int nfs_page_mark_flush(struct page *page)
*/
static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc)
{
+ struct nfs_pageio_descriptor mypgio, *pgio;
struct nfs_open_context *ctx;
struct inode *inode = page->mapping->host;
unsigned offset;
@@ -314,7 +316,14 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
- err = nfs_page_mark_flush(page);
+ if (wbc->for_writepages)
+ pgio = wbc->fs_private;
+ else {
+ nfs_pageio_init_write(&mypgio, inode, wb_priority(wbc));
+ pgio = &mypgio;
+ }
+
+ err = nfs_page_async_flush(pgio, page);
if (err <= 0)
goto out;
err = 0;
@@ -331,12 +340,12 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc
put_nfs_open_context(ctx);
if (err != 0)
goto out;
- err = nfs_page_mark_flush(page);
+ err = nfs_page_async_flush(pgio, page);
if (err > 0)
err = 0;
out:
if (!wbc->for_writepages)
- nfs_flush_mapping(page->mapping, wbc, FLUSH_STABLE|wb_priority(wbc));
+ nfs_pageio_complete(pgio);
return err;
}
@@ -352,20 +361,20 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc)
int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
+ struct nfs_pageio_descriptor pgio;
int err;
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
+ nfs_pageio_init_write(&pgio, inode, wb_priority(wbc));
+ wbc->fs_private = &pgio;
err = generic_writepages(mapping, wbc);
+ nfs_pageio_complete(&pgio);
if (err)
return err;
- err = nfs_flush_mapping(mapping, wbc, wb_priority(wbc));
- if (err < 0)
- goto out;
- nfs_add_stats(inode, NFSIOS_WRITEPAGES, err);
- err = 0;
-out:
- return err;
+ if (pgio.pg_error)
+ return pgio.pg_error;
+ return 0;
}
/*
@@ -536,18 +545,6 @@ static int nfs_wait_on_requests_locked(struct inode *inode, unsigned long idx_st
return res;
}
-static void nfs_cancel_dirty_list(struct list_head *head)
-{
- struct nfs_page *req;
- while(!list_empty(head)) {
- req = nfs_list_entry(head->next);
- nfs_list_remove_request(req);
- nfs_end_page_writeback(req->wb_page);
- nfs_inode_remove_request(req);
- nfs_clear_page_writeback(req);
- }
-}
-
static void nfs_cancel_commit_list(struct list_head *head)
{
struct nfs_page *req;
@@ -936,33 +933,15 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, size_t cou
return -ENOMEM;
}
-static int nfs_flush_list(struct inode *inode, struct list_head *head, int npages, int how)
+static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
+ struct inode *inode, int ioflags)
{
- struct nfs_pageio_descriptor desc;
- int wpages = NFS_SERVER(inode)->wpages;
int wsize = NFS_SERVER(inode)->wsize;
- /* For single writes, FLUSH_STABLE is more efficient */
- if (npages <= wpages && npages == NFS_I(inode)->npages
- && nfs_list_entry(head->next)->wb_bytes <= wsize)
- how |= FLUSH_STABLE;
-
if (wsize < PAGE_CACHE_SIZE)
- nfs_pageio_init(&desc, inode, nfs_flush_multi, wsize, how);
+ nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
else
- nfs_pageio_init(&desc, inode, nfs_flush_one, wsize, how);
- nfs_pageio_add_list(&desc, head);
- nfs_pageio_complete(&desc);
- if (desc.pg_error == 0)
- return 0;
- while (!list_empty(head)) {
- struct nfs_page *req = nfs_list_entry(head->next);
- nfs_list_remove_request(req);
- nfs_redirty_request(req);
- nfs_end_page_writeback(req->wb_page);
- nfs_clear_page_writeback(req);
- }
- return desc.pg_error;
+ nfs_pageio_init(pgio, inode, nfs_flush_one, wsize, ioflags);
}
/*
@@ -1286,31 +1265,7 @@ static const struct rpc_call_ops nfs_commit_ops = {
.rpc_call_done = nfs_commit_done,
.rpc_release = nfs_commit_release,
};
-#else
-static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how)
-{
- return 0;
-}
-#endif
-
-static long nfs_flush_mapping(struct address_space *mapping, struct writeback_control *wbc, int how)
-{
- struct nfs_inode *nfsi = NFS_I(mapping->host);
- LIST_HEAD(head);
- long res;
-
- spin_lock(&nfsi->req_lock);
- res = nfs_scan_dirty(mapping, wbc, &head);
- spin_unlock(&nfsi->req_lock);
- if (res) {
- int error = nfs_flush_list(mapping->host, &head, res, how);
- if (error < 0)
- return error;
- }
- return res;
-}
-#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
int nfs_commit_inode(struct inode *inode, int how)
{
struct nfs_inode *nfsi = NFS_I(inode);
@@ -1327,6 +1282,11 @@ int nfs_commit_inode(struct inode *inode, int how)
}
return res;
}
+#else
+static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how)
+{
+ return 0;
+}
#endif
long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how)
@@ -1360,19 +1320,6 @@ long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_contr
ret = nfs_wait_on_requests_locked(inode, idx_start, npages);
if (ret != 0)
continue;
- pages = nfs_scan_dirty(mapping, wbc, &head);
- if (pages != 0) {
- spin_unlock(&nfsi->req_lock);
- if (how & FLUSH_INVALIDATE) {
- nfs_cancel_dirty_list(&head);
- ret = pages;
- } else
- ret = nfs_flush_list(inode, &head, pages, how);
- spin_lock(&nfsi->req_lock);
- continue;
- }
- if (wbc->pages_skipped != 0)
- continue;
if (nocommit)
break;
pages = nfs_scan_commit(inode, &head, idx_start, npages);
@@ -1412,7 +1359,7 @@ int nfs_wb_all(struct inode *inode)
};
int ret;
- ret = generic_writepages(mapping, &wbc);
+ ret = nfs_writepages(mapping, &wbc);
if (ret < 0)
goto out;
ret = nfs_sync_mapping_wait(mapping, &wbc, 0);
@@ -1435,11 +1382,9 @@ int nfs_sync_mapping_range(struct address_space *mapping, loff_t range_start, lo
};
int ret;
- if (!(how & FLUSH_NOWRITEPAGE)) {
- ret = generic_writepages(mapping, &wbc);
- if (ret < 0)
- goto out;
- }
+ ret = nfs_writepages(mapping, &wbc);
+ if (ret < 0)
+ goto out;
ret = nfs_sync_mapping_wait(mapping, &wbc, how);
if (ret >= 0)
return 0;
@@ -1462,7 +1407,7 @@ int nfs_wb_page_priority(struct inode *inode, struct page *page, int how)
int ret;
BUG_ON(!PageLocked(page));
- if (!(how & FLUSH_NOWRITEPAGE) && clear_page_dirty_for_io(page)) {
+ if (clear_page_dirty_for_io(page)) {
ret = nfs_writepage_locked(page, &wbc);
if (ret < 0)
goto out;