diff options
author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
commit | 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch) | |
tree | 0bba044c4ce775e45a88a51686b5d9f90697ea9d /fs/xfs/linux-2.6 | |
download | kernel-common-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.tar.gz kernel-common-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.tar.bz2 kernel-common-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.zip |
Linux-2.6.12-rc2v2.6.12-rc2
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'fs/xfs/linux-2.6')
38 files changed, 12989 insertions, 0 deletions
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c new file mode 100644 index 000000000000..364ea8c386b1 --- /dev/null +++ b/fs/xfs/linux-2.6/kmem.c @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> +#include <linux/highmem.h> +#include <linux/swap.h> +#include <linux/blkdev.h> + +#include "time.h" +#include "kmem.h" + +#define MAX_VMALLOCS 6 +#define MAX_SLAB_SIZE 0x20000 + + +void * +kmem_alloc(size_t size, int flags) +{ + int retries = 0; + int lflags = kmem_flags_convert(flags); + void *ptr; + + do { + if (size < MAX_SLAB_SIZE || retries > MAX_VMALLOCS) + ptr = kmalloc(size, lflags); + else + ptr = __vmalloc(size, lflags, PAGE_KERNEL); + if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) + return ptr; + if (!(++retries % 100)) + printk(KERN_ERR "XFS: possible memory allocation " + "deadlock in %s (mode:0x%x)\n", + __FUNCTION__, lflags); + blk_congestion_wait(WRITE, HZ/50); + } while (1); +} + +void * +kmem_zalloc(size_t size, int flags) +{ + void *ptr; + + ptr = kmem_alloc(size, flags); + if (ptr) + memset((char *)ptr, 0, (int)size); + return ptr; +} + +void +kmem_free(void *ptr, size_t size) +{ + if (((unsigned long)ptr < VMALLOC_START) || + ((unsigned long)ptr >= VMALLOC_END)) { + kfree(ptr); + } else { + vfree(ptr); + } +} + +void * +kmem_realloc(void *ptr, size_t newsize, size_t oldsize, int flags) +{ + void *new; + + new = kmem_alloc(newsize, flags); + if (ptr) { + if (new) + memcpy(new, ptr, + ((oldsize < newsize) ? oldsize : newsize)); + kmem_free(ptr, oldsize); + } + return new; +} + +void * +kmem_zone_alloc(kmem_zone_t *zone, int flags) +{ + int retries = 0; + int lflags = kmem_flags_convert(flags); + void *ptr; + + do { + ptr = kmem_cache_alloc(zone, lflags); + if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) + return ptr; + if (!(++retries % 100)) + printk(KERN_ERR "XFS: possible memory allocation " + "deadlock in %s (mode:0x%x)\n", + __FUNCTION__, lflags); + blk_congestion_wait(WRITE, HZ/50); + } while (1); +} + +void * +kmem_zone_zalloc(kmem_zone_t *zone, int flags) +{ + void *ptr; + + ptr = kmem_zone_alloc(zone, flags); + if (ptr) + memset((char *)ptr, 0, kmem_cache_size(zone)); + return ptr; +} diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h new file mode 100644 index 000000000000..1397b669b059 --- /dev/null +++ b/fs/xfs/linux-2.6/kmem.h @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_KMEM_H__ +#define __XFS_SUPPORT_KMEM_H__ + +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/mm.h> + +/* + * memory management routines + */ +#define KM_SLEEP 0x0001 +#define KM_NOSLEEP 0x0002 +#define KM_NOFS 0x0004 +#define KM_MAYFAIL 0x0008 + +#define kmem_zone kmem_cache_s +#define kmem_zone_t kmem_cache_t + +typedef unsigned long xfs_pflags_t; + +#define PFLAGS_TEST_NOIO() (current->flags & PF_NOIO) +#define PFLAGS_TEST_FSTRANS() (current->flags & PF_FSTRANS) + +#define PFLAGS_SET_NOIO() do { \ + current->flags |= PF_NOIO; \ +} while (0) + +#define PFLAGS_CLEAR_NOIO() do { \ + current->flags &= ~PF_NOIO; \ +} while (0) + +/* these could be nested, so we save state */ +#define PFLAGS_SET_FSTRANS(STATEP) do { \ + *(STATEP) = current->flags; \ + current->flags |= PF_FSTRANS; \ +} while (0) + +#define PFLAGS_CLEAR_FSTRANS(STATEP) do { \ + *(STATEP) = current->flags; \ + current->flags &= ~PF_FSTRANS; \ +} while (0) + +/* Restore the PF_FSTRANS state to what was saved in STATEP */ +#define PFLAGS_RESTORE_FSTRANS(STATEP) do { \ + current->flags = ((current->flags & ~PF_FSTRANS) | \ + (*(STATEP) & PF_FSTRANS)); \ +} while (0) + +#define PFLAGS_DUP(OSTATEP, NSTATEP) do { \ + *(NSTATEP) = *(OSTATEP); \ +} while (0) + +static __inline unsigned int kmem_flags_convert(int flags) +{ + int lflags = __GFP_NOWARN; /* we'll report problems, if need be */ + +#ifdef DEBUG + if (unlikely(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL))) { + printk(KERN_WARNING + "XFS: memory allocation with wrong flags (%x)\n", flags); + BUG(); + } +#endif + + if (flags & KM_NOSLEEP) { + lflags |= GFP_ATOMIC; + } else { + lflags |= GFP_KERNEL; + + /* avoid recusive callbacks to filesystem during transactions */ + if (PFLAGS_TEST_FSTRANS() || (flags & KM_NOFS)) + lflags &= ~__GFP_FS; + } + + return lflags; +} + +static __inline kmem_zone_t * +kmem_zone_init(int size, char *zone_name) +{ + return kmem_cache_create(zone_name, size, 0, 0, NULL, NULL); +} + +static __inline void +kmem_zone_free(kmem_zone_t *zone, void *ptr) +{ + kmem_cache_free(zone, ptr); +} + +static __inline void +kmem_zone_destroy(kmem_zone_t *zone) +{ + if (zone && kmem_cache_destroy(zone)) + BUG(); +} + +extern void *kmem_zone_zalloc(kmem_zone_t *, int); +extern void *kmem_zone_alloc(kmem_zone_t *, int); + +extern void *kmem_alloc(size_t, int); +extern void *kmem_realloc(void *, size_t, size_t, int); +extern void *kmem_zalloc(size_t, int); +extern void kmem_free(void *, size_t); + +typedef struct shrinker *kmem_shaker_t; +typedef int (*kmem_shake_func_t)(int, unsigned int); + +static __inline kmem_shaker_t +kmem_shake_register(kmem_shake_func_t sfunc) +{ + return set_shrinker(DEFAULT_SEEKS, sfunc); +} + +static __inline void +kmem_shake_deregister(kmem_shaker_t shrinker) +{ + remove_shrinker(shrinker); +} + +static __inline int +kmem_shake_allow(unsigned int gfp_mask) +{ + return (gfp_mask & __GFP_WAIT); +} + +#endif /* __XFS_SUPPORT_KMEM_H__ */ diff --git a/fs/xfs/linux-2.6/mrlock.h b/fs/xfs/linux-2.6/mrlock.h new file mode 100644 index 000000000000..d2c11a098ff2 --- /dev/null +++ b/fs/xfs/linux-2.6/mrlock.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_MRLOCK_H__ +#define __XFS_SUPPORT_MRLOCK_H__ + +#include <linux/rwsem.h> + +enum { MR_NONE, MR_ACCESS, MR_UPDATE }; + +typedef struct { + struct rw_semaphore mr_lock; + int mr_writer; +} mrlock_t; + +#define mrinit(mrp, name) \ + ( (mrp)->mr_writer = 0, init_rwsem(&(mrp)->mr_lock) ) +#define mrlock_init(mrp, t,n,s) mrinit(mrp, n) +#define mrfree(mrp) do { } while (0) +#define mraccess(mrp) mraccessf(mrp, 0) +#define mrupdate(mrp) mrupdatef(mrp, 0) + +static inline void mraccessf(mrlock_t *mrp, int flags) +{ + down_read(&mrp->mr_lock); +} + +static inline void mrupdatef(mrlock_t *mrp, int flags) +{ + down_write(&mrp->mr_lock); + mrp->mr_writer = 1; +} + +static inline int mrtryaccess(mrlock_t *mrp) +{ + return down_read_trylock(&mrp->mr_lock); +} + +static inline int mrtryupdate(mrlock_t *mrp) +{ + if (!down_write_trylock(&mrp->mr_lock)) + return 0; + mrp->mr_writer = 1; + return 1; +} + +static inline void mrunlock(mrlock_t *mrp) +{ + if (mrp->mr_writer) { + mrp->mr_writer = 0; + up_write(&mrp->mr_lock); + } else { + up_read(&mrp->mr_lock); + } +} + +static inline void mrdemote(mrlock_t *mrp) +{ + mrp->mr_writer = 0; + downgrade_write(&mrp->mr_lock); +} + +#ifdef DEBUG +/* + * Debug-only routine, without some platform-specific asm code, we can + * now only answer requests regarding whether we hold the lock for write + * (reader state is outside our visibility, we only track writer state). + * Note: means !ismrlocked would give false positivies, so don't do that. + */ +static inline int ismrlocked(mrlock_t *mrp, int type) +{ + if (mrp && type == MR_UPDATE) + return mrp->mr_writer; + return 1; +} +#endif + +#endif /* __XFS_SUPPORT_MRLOCK_H__ */ diff --git a/fs/xfs/linux-2.6/mutex.h b/fs/xfs/linux-2.6/mutex.h new file mode 100644 index 000000000000..0b296bb944cb --- /dev/null +++ b/fs/xfs/linux-2.6/mutex.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_MUTEX_H__ +#define __XFS_SUPPORT_MUTEX_H__ + +#include <linux/spinlock.h> +#include <asm/semaphore.h> + +/* + * Map the mutex'es from IRIX to Linux semaphores. + * + * Destroy just simply initializes to -99 which should block all other + * callers. + */ +#define MUTEX_DEFAULT 0x0 +typedef struct semaphore mutex_t; + +#define mutex_init(lock, type, name) sema_init(lock, 1) +#define mutex_destroy(lock) sema_init(lock, -99) +#define mutex_lock(lock, num) down(lock) +#define mutex_trylock(lock) (down_trylock(lock) ? 0 : 1) +#define mutex_unlock(lock) up(lock) + +#endif /* __XFS_SUPPORT_MUTEX_H__ */ diff --git a/fs/xfs/linux-2.6/sema.h b/fs/xfs/linux-2.6/sema.h new file mode 100644 index 000000000000..30b67b4e1cbf --- /dev/null +++ b/fs/xfs/linux-2.6/sema.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_SEMA_H__ +#define __XFS_SUPPORT_SEMA_H__ + +#include <linux/time.h> +#include <linux/wait.h> +#include <asm/atomic.h> +#include <asm/semaphore.h> + +/* + * sema_t structure just maps to struct semaphore in Linux kernel. + */ + +typedef struct semaphore sema_t; + +#define init_sema(sp, val, c, d) sema_init(sp, val) +#define initsema(sp, val) sema_init(sp, val) +#define initnsema(sp, val, name) sema_init(sp, val) +#define psema(sp, b) down(sp) +#define vsema(sp) up(sp) +#define valusema(sp) (atomic_read(&(sp)->count)) +#define freesema(sema) + +/* + * Map cpsema (try to get the sema) to down_trylock. We need to switch + * the return values since cpsema returns 1 (acquired) 0 (failed) and + * down_trylock returns the reverse 0 (acquired) 1 (failed). + */ + +#define cpsema(sp) (down_trylock(sp) ? 0 : 1) + +/* + * Didn't do cvsema(sp). Not sure how to map this to up/down/... + * It does a vsema if the values is < 0 other wise nothing. + */ + +#endif /* __XFS_SUPPORT_SEMA_H__ */ diff --git a/fs/xfs/linux-2.6/spin.h b/fs/xfs/linux-2.6/spin.h new file mode 100644 index 000000000000..bcf60a0b8df0 --- /dev/null +++ b/fs/xfs/linux-2.6/spin.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_SPIN_H__ +#define __XFS_SUPPORT_SPIN_H__ + +#include <linux/sched.h> /* preempt needs this */ +#include <linux/spinlock.h> + +/* + * Map lock_t from IRIX to Linux spinlocks. + * + * We do not make use of lock_t from interrupt context, so we do not + * have to worry about disabling interrupts at all (unlike IRIX). + */ + +typedef spinlock_t lock_t; + +#define SPLDECL(s) unsigned long s + +#define spinlock_init(lock, name) spin_lock_init(lock) +#define spinlock_destroy(lock) +#define mutex_spinlock(lock) ({ spin_lock(lock); 0; }) +#define mutex_spinunlock(lock, s) do { spin_unlock(lock); (void)s; } while (0) +#define nested_spinlock(lock) spin_lock(lock) +#define nested_spinunlock(lock) spin_unlock(lock) + +#endif /* __XFS_SUPPORT_SPIN_H__ */ diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h new file mode 100644 index 000000000000..821d3167e05b --- /dev/null +++ b/fs/xfs/linux-2.6/sv.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_SV_H__ +#define __XFS_SUPPORT_SV_H__ + +#include <linux/wait.h> +#include <linux/sched.h> +#include <linux/spinlock.h> + +/* + * Synchronisation variables. + * + * (Parameters "pri", "svf" and "rts" are not implemented) + */ + +typedef struct sv_s { + wait_queue_head_t waiters; +} sv_t; + +#define SV_FIFO 0x0 /* sv_t is FIFO type */ +#define SV_LIFO 0x2 /* sv_t is LIFO type */ +#define SV_PRIO 0x4 /* sv_t is PRIO type */ +#define SV_KEYED 0x6 /* sv_t is KEYED type */ +#define SV_DEFAULT SV_FIFO + + +static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state, + unsigned long timeout) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue_exclusive(&sv->waiters, &wait); + __set_current_state(state); + spin_unlock(lock); + + schedule_timeout(timeout); + + remove_wait_queue(&sv->waiters, &wait); +} + +#define init_sv(sv,type,name,flag) \ + init_waitqueue_head(&(sv)->waiters) +#define sv_init(sv,flag,name) \ + init_waitqueue_head(&(sv)->waiters) +#define sv_destroy(sv) \ + /*NOTHING*/ +#define sv_wait(sv, pri, lock, s) \ + _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT) +#define sv_wait_sig(sv, pri, lock, s) \ + _sv_wait(sv, lock, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT) +#define sv_timedwait(sv, pri, lock, s, svf, ts, rts) \ + _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, timespec_to_jiffies(ts)) +#define sv_timedwait_sig(sv, pri, lock, s, svf, ts, rts) \ + _sv_wait(sv, lock, TASK_INTERRUPTIBLE, timespec_to_jiffies(ts)) +#define sv_signal(sv) \ + wake_up(&(sv)->waiters) +#define sv_broadcast(sv) \ + wake_up_all(&(sv)->waiters) + +#endif /* __XFS_SUPPORT_SV_H__ */ diff --git a/fs/xfs/linux-2.6/time.h b/fs/xfs/linux-2.6/time.h new file mode 100644 index 000000000000..6c6fd0faa8e1 --- /dev/null +++ b/fs/xfs/linux-2.6/time.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPPORT_TIME_H__ +#define __XFS_SUPPORT_TIME_H__ + +#include <linux/sched.h> +#include <linux/time.h> + +typedef struct timespec timespec_t; + +static inline void delay(long ticks) +{ + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(ticks); +} + +static inline void nanotime(struct timespec *tvp) +{ + *tvp = CURRENT_TIME; +} + +#endif /* __XFS_SUPPORT_TIME_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c new file mode 100644 index 000000000000..76a84758073a --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -0,0 +1,1275 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_sb.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_trans.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_alloc.h" +#include "xfs_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_error.h" +#include "xfs_rw.h" +#include "xfs_iomap.h" +#include <linux/mpage.h> +#include <linux/writeback.h> + +STATIC void xfs_count_page_state(struct page *, int *, int *, int *); +STATIC void xfs_convert_page(struct inode *, struct page *, xfs_iomap_t *, + struct writeback_control *wbc, void *, int, int); + +#if defined(XFS_RW_TRACE) +void +xfs_page_trace( + int tag, + struct inode *inode, + struct page *page, + int mask) +{ + xfs_inode_t *ip; + bhv_desc_t *bdp; + vnode_t *vp = LINVFS_GET_VP(inode); + loff_t isize = i_size_read(inode); + loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT; + int delalloc = -1, unmapped = -1, unwritten = -1; + + if (page_has_buffers(page)) + xfs_count_page_state(page, &delalloc, &unmapped, &unwritten); + + bdp = vn_bhv_lookup(VN_BHV_HEAD(vp), &xfs_vnodeops); + ip = XFS_BHVTOI(bdp); + if (!ip->i_rwtrace) + return; + + ktrace_enter(ip->i_rwtrace, + (void *)((unsigned long)tag), + (void *)ip, + (void *)inode, + (void *)page, + (void *)((unsigned long)mask), + (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)), + (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)), + (void *)((unsigned long)((isize >> 32) & 0xffffffff)), + (void *)((unsigned long)(isize & 0xffffffff)), + (void *)((unsigned long)((offset >> 32) & 0xffffffff)), + (void *)((unsigned long)(offset & 0xffffffff)), + (void *)((unsigned long)delalloc), + (void *)((unsigned long)unmapped), + (void *)((unsigned long)unwritten), + (void *)NULL, + (void *)NULL); +} +#else +#define xfs_page_trace(tag, inode, page, mask) +#endif + +void +linvfs_unwritten_done( + struct buffer_head *bh, + int uptodate) +{ + xfs_buf_t *pb = (xfs_buf_t *)bh->b_private; + + ASSERT(buffer_unwritten(bh)); + bh->b_end_io = NULL; + clear_buffer_unwritten(bh); + if (!uptodate) + pagebuf_ioerror(pb, EIO); + if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) { + pagebuf_iodone(pb, 1, 1); + } + end_buffer_async_write(bh, uptodate); +} + +/* + * Issue transactions to convert a buffer range from unwritten + * to written extents (buffered IO). + */ +STATIC void +linvfs_unwritten_convert( + xfs_buf_t *bp) +{ + vnode_t *vp = XFS_BUF_FSPRIVATE(bp, vnode_t *); + int error; + + BUG_ON(atomic_read(&bp->pb_hold) < 1); + VOP_BMAP(vp, XFS_BUF_OFFSET(bp), XFS_BUF_SIZE(bp), + BMAPI_UNWRITTEN, NULL, NULL, error); + XFS_BUF_SET_FSPRIVATE(bp, NULL); + XFS_BUF_CLR_IODONE_FUNC(bp); + XFS_BUF_UNDATAIO(bp); + iput(LINVFS_GET_IP(vp)); + pagebuf_iodone(bp, 0, 0); +} + +/* + * Issue transactions to convert a buffer range from unwritten + * to written extents (direct IO). + */ +STATIC void +linvfs_unwritten_convert_direct( + struct inode *inode, + loff_t offset, + ssize_t size, + void *private) +{ + ASSERT(!private || inode == (struct inode *)private); + + /* private indicates an unwritten extent lay beneath this IO */ + if (private && size > 0) { + vnode_t *vp = LINVFS_GET_VP(inode); + int error; + + VOP_BMAP(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL, error); + } +} + +STATIC int +xfs_map_blocks( + struct inode *inode, + loff_t offset, + ssize_t count, + xfs_iomap_t *mapp, + int flags) +{ + vnode_t *vp = LINVFS_GET_VP(inode); + int error, nmaps = 1; + + VOP_BMAP(vp, offset, count, flags, mapp, &nmaps, error); + if (!error && (flags & (BMAPI_WRITE|BMAPI_ALLOCATE))) + VMODIFY(vp); + return -error; +} + +/* + * Finds the corresponding mapping in block @map array of the + * given @offset within a @page. + */ +STATIC xfs_iomap_t * +xfs_offset_to_map( + struct page *page, + xfs_iomap_t *iomapp, + unsigned long offset) +{ + loff_t full_offset; /* offset from start of file */ + + ASSERT(offset < PAGE_CACHE_SIZE); + + full_offset = page->index; /* NB: using 64bit number */ + full_offset <<= PAGE_CACHE_SHIFT; /* offset from file start */ + full_offset += offset; /* offset from page start */ + + if (full_offset < iomapp->iomap_offset) + return NULL; + if (iomapp->iomap_offset + (iomapp->iomap_bsize -1) >= full_offset) + return iomapp; + return NULL; +} + +STATIC void +xfs_map_at_offset( + struct page *page, + struct buffer_head *bh, + unsigned long offset, + int block_bits, + xfs_iomap_t *iomapp) +{ + xfs_daddr_t bn; + loff_t delta; + int sector_shift; + + ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE)); + ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY)); + ASSERT(iomapp->iomap_bn != IOMAP_DADDR_NULL); + + delta = page->index; + delta <<= PAGE_CACHE_SHIFT; + delta += offset; + delta -= iomapp->iomap_offset; + delta >>= block_bits; + + sector_shift = block_bits - BBSHIFT; + bn = iomapp->iomap_bn >> sector_shift; + bn += delta; + BUG_ON(!bn && !(iomapp->iomap_flags & IOMAP_REALTIME)); + ASSERT((bn << sector_shift) >= iomapp->iomap_bn); + + lock_buffer(bh); + bh->b_blocknr = bn; + bh->b_bdev = iomapp->iomap_target->pbr_bdev; + set_buffer_mapped(bh); + clear_buffer_delay(bh); +} + +/* + * Look for a page at index which is unlocked and contains our + * unwritten extent flagged buffers at its head. Returns page + * locked and with an extra reference count, and length of the + * unwritten extent component on this page that we can write, + * in units of filesystem blocks. + */ +STATIC struct page * +xfs_probe_unwritten_page( + struct address_space *mapping, + pgoff_t index, + xfs_iomap_t *iomapp, + xfs_buf_t *pb, + unsigned long max_offset, + unsigned long *fsbs, + unsigned int bbits) +{ + struct page *page; + + page = find_trylock_page(mapping, index); + if (!page) + return NULL; + if (PageWriteback(page)) + goto out; + + if (page->mapping && page_has_buffers(page)) { + struct buffer_head *bh, *head; + unsigned long p_offset = 0; + + *fsbs = 0; + bh = head = page_buffers(page); + do { + if (!buffer_unwritten(bh) || !buffer_uptodate(bh)) + break; + if (!xfs_offset_to_map(page, iomapp, p_offset)) + break; + if (p_offset >= max_offset) + break; + xfs_map_at_offset(page, bh, p_offset, bbits, iomapp); + set_buffer_unwritten_io(bh); + bh->b_private = pb; + p_offset += bh->b_size; + (*fsbs)++; + } while ((bh = bh->b_this_page) != head); + + if (p_offset) + return page; + } + +out: + unlock_page(page); + return NULL; +} + +/* + * Look for a page at index which is unlocked and not mapped + * yet - clustering for mmap write case. + */ +STATIC unsigned int +xfs_probe_unmapped_page( + struct address_space *mapping, + pgoff_t index, + unsigned int pg_offset) +{ + struct page *page; + int ret = 0; + + page = find_trylock_page(mapping, index); + if (!page) + return 0; + if (PageWriteback(page)) + goto out; + + if (page->mapping && PageDirty(page)) { + if (page_has_buffers(page)) { + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + do { + if (buffer_mapped(bh) || !buffer_uptodate(bh)) + break; + ret += bh->b_size; + if (ret >= pg_offset) + break; + } while ((bh = bh->b_this_page) != head); + } else + ret = PAGE_CACHE_SIZE; + } + +out: + unlock_page(page); + return ret; +} + +STATIC unsigned int +xfs_probe_unmapped_cluster( + struct inode *inode, + struct page *startpage, + struct buffer_head *bh, + struct buffer_head *head) +{ + pgoff_t tindex, tlast, tloff; + unsigned int pg_offset, len, total = 0; + struct address_space *mapping = inode->i_mapping; + + /* First sum forwards in this page */ + do { + if (buffer_mapped(bh)) + break; + total += bh->b_size; + } while ((bh = bh->b_this_page) != head); + + /* If we reached the end of the page, sum forwards in + * following pages. + */ + if (bh == head) { + tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT; + /* Prune this back to avoid pathological behavior */ + tloff = min(tlast, startpage->index + 64); + for (tindex = startpage->index + 1; tindex < tloff; tindex++) { + len = xfs_probe_unmapped_page(mapping, tindex, + PAGE_CACHE_SIZE); + if (!len) + return total; + total += len; + } + if (tindex == tlast && + (pg_offset = i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) { + total += xfs_probe_unmapped_page(mapping, + tindex, pg_offset); + } + } + return total; +} + +/* + * Probe for a given page (index) in the inode and test if it is delayed + * and without unwritten buffers. Returns page locked and with an extra + * reference count. + */ +STATIC struct page * +xfs_probe_delalloc_page( + struct inode *inode, + pgoff_t index) +{ + struct page *page; + + page = find_trylock_page(inode->i_mapping, index); + if (!page) + return NULL; + if (PageWriteback(page)) + goto out; + + if (page->mapping && page_has_buffers(page)) { + struct buffer_head *bh, *head; + int acceptable = 0; + + bh = head = page_buffers(page); + do { + if (buffer_unwritten(bh)) { + acceptable = 0; + break; + } else if (buffer_delay(bh)) { + acceptable = 1; + } + } while ((bh = bh->b_this_page) != head); + + if (acceptable) + return page; + } + +out: + unlock_page(page); + return NULL; +} + +STATIC int +xfs_map_unwritten( + struct inode *inode, + struct page *start_page, + struct buffer_head *head, + struct buffer_head *curr, + unsigned long p_offset, + int block_bits, + xfs_iomap_t *iomapp, + struct writeback_control *wbc, + int startio, + int all_bh) +{ + struct buffer_head *bh = curr; + xfs_iomap_t *tmp; + xfs_buf_t *pb; + loff_t offset, size; + unsigned long nblocks = 0; + + offset = start_page->index; + offset <<= PAGE_CACHE_SHIFT; + offset += p_offset; + + /* get an "empty" pagebuf to manage IO completion + * Proper values will be set before returning */ + pb = pagebuf_lookup(iomapp->iomap_target, 0, 0, 0); + if (!pb) + return -EAGAIN; + + /* Take a reference to the inode to prevent it from + * being reclaimed while we have outstanding unwritten + * extent IO on it. + */ + if ((igrab(inode)) != inode) { + pagebuf_free(pb); + return -EAGAIN; + } + + /* Set the count to 1 initially, this will stop an I/O + * completion callout which happens before we have started + * all the I/O from calling pagebuf_iodone too early. + */ + atomic_set(&pb->pb_io_remaining, 1); + + /* First map forwards in the page consecutive buffers + * covering this unwritten extent + */ + do { + if (!buffer_unwritten(bh)) + break; + tmp = xfs_offset_to_map(start_page, iomapp, p_offset); + if (!tmp) + break; + xfs_map_at_offset(start_page, bh, p_offset, block_bits, iomapp); + set_buffer_unwritten_io(bh); + bh->b_private = pb; + p_offset += bh->b_size; + nblocks++; + } while ((bh = bh->b_this_page) != head); + + atomic_add(nblocks, &pb->pb_io_remaining); + + /* If we reached the end of the page, map forwards in any + * following pages which are also covered by this extent. + */ + if (bh == head) { + struct address_space *mapping = inode->i_mapping; + pgoff_t tindex, tloff, tlast; + unsigned long bs; + unsigned int pg_offset, bbits = inode->i_blkbits; + struct page *page; + + tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT; + tloff = (iomapp->iomap_offset + iomapp->iomap_bsize) >> PAGE_CACHE_SHIFT; + tloff = min(tlast, tloff); + for (tindex = start_page->index + 1; tindex < tloff; tindex++) { + page = xfs_probe_unwritten_page(mapping, + tindex, iomapp, pb, + PAGE_CACHE_SIZE, &bs, bbits); + if (!page) + break; + nblocks += bs; + atomic_add(bs, &pb->pb_io_remaining); + xfs_convert_page(inode, page, iomapp, wbc, pb, + startio, all_bh); + /* stop if converting the next page might add + * enough blocks that the corresponding byte + * count won't fit in our ulong page buf length */ + if (nblocks >= ((ULONG_MAX - PAGE_SIZE) >> block_bits)) + goto enough; + } + + if (tindex == tlast && + (pg_offset = (i_size_read(inode) & (PAGE_CACHE_SIZE - 1)))) { + page = xfs_probe_unwritten_page(mapping, + tindex, iomapp, pb, + pg_offset, &bs, bbits); + if (page) { + nblocks += bs; + atomic_add(bs, &pb->pb_io_remaining); + xfs_convert_page(inode, page, iomapp, wbc, pb, + startio, all_bh); + if (nblocks >= ((ULONG_MAX - PAGE_SIZE) >> block_bits)) + goto enough; + } + } + } + +enough: + size = nblocks; /* NB: using 64bit number here */ + size <<= block_bits; /* convert fsb's to byte range */ + + XFS_BUF_DATAIO(pb); + XFS_BUF_ASYNC(pb); + XFS_BUF_SET_SIZE(pb, size); + XFS_BUF_SET_COUNT(pb, size); + XFS_BUF_SET_OFFSET(pb, offset); + XFS_BUF_SET_FSPRIVATE(pb, LINVFS_GET_VP(inode)); + XFS_BUF_SET_IODONE_FUNC(pb, linvfs_unwritten_convert); + + if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) { + pagebuf_iodone(pb, 1, 1); + } + + return 0; +} + +STATIC void +xfs_submit_page( + struct page *page, + struct writeback_control *wbc, + struct buffer_head *bh_arr[], + int bh_count, + int probed_page, + int clear_dirty) +{ + struct buffer_head *bh; + int i; + + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + if (clear_dirty) + clear_page_dirty(page); + unlock_page(page); + + if (bh_count) { + for (i = 0; i < bh_count; i++) { + bh = bh_arr[i]; + mark_buffer_async_write(bh); + if (buffer_unwritten(bh)) + set_buffer_unwritten_io(bh); + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + } + + for (i = 0; i < bh_count; i++) + submit_bh(WRITE, bh_arr[i]); + + if (probed_page && clear_dirty) + wbc->nr_to_write--; /* Wrote an "extra" page */ + } else { + end_page_writeback(page); + wbc->pages_skipped++; /* We didn't write this page */ + } +} + +/* + * Allocate & map buffers for page given the extent map. Write it out. + * except for the original page of a writepage, this is called on + * delalloc/unwritten pages only, for the original page it is possible + * that the page has no mapping at all. + */ +STATIC void +xfs_convert_page( + struct inode *inode, + struct page *page, + xfs_iomap_t *iomapp, + struct writeback_control *wbc, + void *private, + int startio, + int all_bh) +{ + struct buffer_head *bh_arr[MAX_BUF_PER_PAGE], *bh, *head; + xfs_iomap_t *mp = iomapp, *tmp; + unsigned long end, offset; + pgoff_t end_index; + int i = 0, index = 0; + int bbits = inode->i_blkbits; + + end_index = i_size_read(inode) >> PAGE_CACHE_SHIFT; + if (page->index < end_index) { + end = PAGE_CACHE_SIZE; + } else { + end = i_size_read(inode) & (PAGE_CACHE_SIZE-1); + } + bh = head = page_buffers(page); + do { + offset = i << bbits; + if (offset >= end) + break; + if (!(PageUptodate(page) || buffer_uptodate(bh))) + continue; + if (buffer_mapped(bh) && all_bh && + !(buffer_unwritten(bh) || buffer_delay(bh))) { + if (startio) { + lock_buffer(bh); + bh_arr[index++] = bh; + } + continue; + } + tmp = xfs_offset_to_map(page, mp, offset); + if (!tmp) + continue; + ASSERT(!(tmp->iomap_flags & IOMAP_HOLE)); + ASSERT(!(tmp->iomap_flags & IOMAP_DELAY)); + + /* If this is a new unwritten extent buffer (i.e. one + * that we haven't passed in private data for, we must + * now map this buffer too. + */ + if (buffer_unwritten(bh) && !bh->b_end_io) { + ASSERT(tmp->iomap_flags & IOMAP_UNWRITTEN); + xfs_map_unwritten(inode, page, head, bh, offset, + bbits, tmp, wbc, startio, all_bh); + } else if (! (buffer_unwritten(bh) && buffer_locked(bh))) { + xfs_map_at_offset(page, bh, offset, bbits, tmp); + if (buffer_unwritten(bh)) { + set_buffer_unwritten_io(bh); + bh->b_private = private; + ASSERT(private); + } + } + if (startio) { + bh_arr[index++] = bh; + } else { + set_buffer_dirty(bh); + unlock_buffer(bh); + mark_buffer_dirty(bh); + } + } while (i++, (bh = bh->b_this_page) != head); + + if (startio) { + xfs_submit_page(page, wbc, bh_arr, index, 1, index == i); + } else { + unlock_page(page); + } +} + +/* + * Convert & write out a cluster of pages in the same extent as defined + * by mp and following the start page. + */ +STATIC void +xfs_cluster_write( + struct inode *inode, + pgoff_t tindex, + xfs_iomap_t *iomapp, + struct writeback_control *wbc, + int startio, + int all_bh, + pgoff_t tlast) +{ + struct page *page; + + for (; tindex <= tlast; tindex++) { + page = xfs_probe_delalloc_page(inode, tindex); + if (!page) + break; + xfs_convert_page(inode, page, iomapp, wbc, NULL, + startio, all_bh); + } +} + +/* + * Calling this without startio set means we are being asked to make a dirty + * page ready for freeing it's buffers. When called with startio set then + * we are coming from writepage. + * + * When called with startio set it is important that we write the WHOLE + * page if possible. + * The bh->b_state's cannot know if any of the blocks or which block for + * that matter are dirty due to mmap writes, and therefore bh uptodate is + * only vaild if the page itself isn't completely uptodate. Some layers + * may clear the page dirty flag prior to calling write page, under the + * assumption the entire page will be written out; by not writing out the + * whole page the page can be reused before all valid dirty data is + * written out. Note: in the case of a page that has been dirty'd by + * mapwrite and but partially setup by block_prepare_write the + * bh->b_states's will not agree and only ones setup by BPW/BCW will have + * valid state, thus the whole page must be written out thing. + */ + +STATIC int +xfs_page_state_convert( + struct inode *inode, + struct page *page, + struct writeback_control *wbc, + int startio, + int unmapped) /* also implies page uptodate */ +{ + struct buffer_head *bh_arr[MAX_BUF_PER_PAGE], *bh, *head; + xfs_iomap_t *iomp, iomap; + loff_t offset; + unsigned long p_offset = 0; + __uint64_t end_offset; + pgoff_t end_index, last_index, tlast; + int len, err, i, cnt = 0, uptodate = 1; + int flags = startio ? 0 : BMAPI_TRYLOCK; + int page_dirty, delalloc = 0; + + /* Is this page beyond the end of the file? */ + offset = i_size_read(inode); + end_index = offset >> PAGE_CACHE_SHIFT; + last_index = (offset - 1) >> PAGE_CACHE_SHIFT; + if (page->index >= end_index) { + if ((page->index >= end_index + 1) || + !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) { + err = -EIO; + goto error; + } + } + + offset = (loff_t)page->index << PAGE_CACHE_SHIFT; + end_offset = min_t(unsigned long long, + offset + PAGE_CACHE_SIZE, i_size_read(inode)); + + bh = head = page_buffers(page); + iomp = NULL; + + /* + * page_dirty is initially a count of buffers on the page and + * is decrememted as we move each into a cleanable state. + */ + len = bh->b_size; + page_dirty = PAGE_CACHE_SIZE / len; + + do { + if (offset >= end_offset) + break; + if (!buffer_uptodate(bh)) + uptodate = 0; + if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio) + continue; + + if (iomp) { + iomp = xfs_offset_to_map(page, &iomap, p_offset); + } + + /* + * First case, map an unwritten extent and prepare for + * extent state conversion transaction on completion. + */ + if (buffer_unwritten(bh)) { + if (!startio) + continue; + if (!iomp) { + err = xfs_map_blocks(inode, offset, len, &iomap, + BMAPI_READ|BMAPI_IGNSTATE); + if (err) { + goto error; + } + iomp = xfs_offset_to_map(page, &iomap, + p_offset); + } + if (iomp) { + if (!bh->b_end_io) { + err = xfs_map_unwritten(inode, page, + head, bh, p_offset, + inode->i_blkbits, iomp, + wbc, startio, unmapped); + if (err) { + goto error; + } + } else { + set_bit(BH_Lock, &bh->b_state); + } + BUG_ON(!buffer_locked(bh)); + bh_arr[cnt++] = bh; + page_dirty--; + } + /* + * Second case, allocate space for a delalloc buffer. + * We can return EAGAIN here in the release page case. + */ + } else if (buffer_delay(bh)) { + if (!iomp) { + delalloc = 1; + err = xfs_map_blocks(inode, offset, len, &iomap, + BMAPI_ALLOCATE | flags); + if (err) { + goto error; + } + iomp = xfs_offset_to_map(page, &iomap, + p_offset); + } + if (iomp) { + xfs_map_at_offset(page, bh, p_offset, + inode->i_blkbits, iomp); + if (startio) { + bh_arr[cnt++] = bh; + } else { + set_buffer_dirty(bh); + unlock_buffer(bh); + mark_buffer_dirty(bh); + } + page_dirty--; + } + } else if ((buffer_uptodate(bh) || PageUptodate(page)) && + (unmapped || startio)) { + + if (!buffer_mapped(bh)) { + int size; + + /* + * Getting here implies an unmapped buffer + * was found, and we are in a path where we + * need to write the whole page out. + */ + if (!iomp) { + size = xfs_probe_unmapped_cluster( + inode, page, bh, head); + err = xfs_map_blocks(inode, offset, + size, &iomap, + BMAPI_WRITE|BMAPI_MMAP); + if (err) { + goto error; + } + iomp = xfs_offset_to_map(page, &iomap, + p_offset); + } + if (iomp) { + xfs_map_at_offset(page, + bh, p_offset, + inode->i_blkbits, iomp); + if (startio) { + bh_arr[cnt++] = bh; + } else { + set_buffer_dirty(bh); + unlock_buffer(bh); + mark_buffer_dirty(bh); + } + page_dirty--; + } + } else if (startio) { + if (buffer_uptodate(bh) && + !test_and_set_bit(BH_Lock, &bh->b_state)) { + bh_arr[cnt++] = bh; + page_dirty--; + } + } + } + } while (offset += len, p_offset += len, + ((bh = bh->b_this_page) != head)); + + if (uptodate && bh == head) + SetPageUptodate(page); + + if (startio) + xfs_submit_page(page, wbc, bh_arr, cnt, 0, 1); + + if (iomp) { + tlast = (iomp->iomap_offset + iomp->iomap_bsize - 1) >> + PAGE_CACHE_SHIFT; + if (delalloc && (tlast > last_index)) + tlast = last_index; + xfs_cluster_write(inode, page->index + 1, iomp, wbc, + startio, unmapped, tlast); + } + + return page_dirty; + +error: + for (i = 0; i < cnt; i++) { + unlock_buffer(bh_arr[i]); + } + + /* + * If it's delalloc and we have nowhere to put it, + * throw it away, unless the lower layers told + * us to try again. + */ + if (err != -EAGAIN) { + if (!unmapped) { + block_invalidatepage(page, 0); + } + ClearPageUptodate(page); + } + return err; +} + +STATIC int +__linvfs_get_block( + struct inode *inode, + sector_t iblock, + unsigned long blocks, + struct buffer_head *bh_result, + int create, + int direct, + bmapi_flags_t flags) +{ + vnode_t *vp = LINVFS_GET_VP(inode); + xfs_iomap_t iomap; + int retpbbm = 1; + int error; + ssize_t size; + loff_t offset = (loff_t)iblock << inode->i_blkbits; + + if (blocks) + size = blocks << inode->i_blkbits; + else + size = 1 << inode->i_blkbits; + + VOP_BMAP(vp, offset, size, + create ? flags : BMAPI_READ, &iomap, &retpbbm, error); + if (error) + return -error; + + if (retpbbm == 0) + return 0; + + if (iomap.iomap_bn != IOMAP_DADDR_NULL) { + xfs_daddr_t bn; + loff_t delta; + + /* For unwritten extents do not report a disk address on + * the read case (treat as if we're reading into a hole). + */ + if (create || !(iomap.iomap_flags & IOMAP_UNWRITTEN)) { + delta = offset - iomap.iomap_offset; + delta >>= inode->i_blkbits; + + bn = iomap.iomap_bn >> (inode->i_blkbits - BBSHIFT); + bn += delta; + BUG_ON(!bn && !(iomap.iomap_flags & IOMAP_REALTIME)); + bh_result->b_blocknr = bn; + set_buffer_mapped(bh_result); + } + if (create && (iomap.iomap_flags & IOMAP_UNWRITTEN)) { + if (direct) + bh_result->b_private = inode; + set_buffer_unwritten(bh_result); + set_buffer_delay(bh_result); + } + } + + /* If this is a realtime file, data might be on a new device */ + bh_result->b_bdev = iomap.iomap_target->pbr_bdev; + + /* If we previously allocated a block out beyond eof and + * we are now coming back to use it then we will need to + * flag it as new even if it has a disk address. + */ + if (create && + ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || + (offset >= i_size_read(inode)) || (iomap.iomap_flags & IOMAP_NEW))) { + set_buffer_new(bh_result); + } + + if (iomap.iomap_flags & IOMAP_DELAY) { + BUG_ON(direct); + if (create) { + set_buffer_uptodate(bh_result); + set_buffer_mapped(bh_result); + set_buffer_delay(bh_result); + } + } + + if (blocks) { + bh_result->b_size = (ssize_t)min( + (loff_t)(iomap.iomap_bsize - iomap.iomap_delta), + (loff_t)(blocks << inode->i_blkbits)); + } + + return 0; +} + +int +linvfs_get_block( + struct inode *inode, + sector_t iblock, + struct buffer_head *bh_result, + int create) +{ + return __linvfs_get_block(inode, iblock, 0, bh_result, + create, 0, BMAPI_WRITE); +} + +STATIC int +linvfs_get_blocks_direct( + struct inode *inode, + sector_t iblock, + unsigned long max_blocks, + struct buffer_head *bh_result, + int create) +{ + return __linvfs_get_block(inode, iblock, max_blocks, bh_result, + create, 1, BMAPI_WRITE|BMAPI_DIRECT); +} + +STATIC ssize_t +linvfs_direct_IO( + int rw, + struct kiocb *iocb, + const struct iovec *iov, + loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + vnode_t *vp = LINVFS_GET_VP(inode); + xfs_iomap_t iomap; + int maps = 1; + int error; + + VOP_BMAP(vp, offset, 0, BMAPI_DEVICE, &iomap, &maps, error); + if (error) + return -error; + + return blockdev_direct_IO_own_locking(rw, iocb, inode, + iomap.iomap_target->pbr_bdev, + iov, offset, nr_segs, + linvfs_get_blocks_direct, + linvfs_unwritten_convert_direct); +} + + +STATIC sector_t +linvfs_bmap( + struct address_space *mapping, + sector_t block) +{ + struct inode *inode = (struct inode *)mapping->host; + vnode_t *vp = LINVFS_GET_VP(inode); + int error; + + vn_trace_entry(vp, "linvfs_bmap", (inst_t *)__return_address); + + VOP_RWLOCK(vp, VRWLOCK_READ); + VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1, 0, FI_REMAPF, error); + VOP_RWUNLOCK(vp, VRWLOCK_READ); + return generic_block_bmap(mapping, block, linvfs_get_block); +} + +STATIC int +linvfs_readpage( + struct file *unused, + struct page *page) +{ + return mpage_readpage(page, linvfs_get_block); +} + +STATIC int +linvfs_readpages( + struct file *unused, + struct address_space *mapping, + struct list_head *pages, + unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, linvfs_get_block); +} + +STATIC void +xfs_count_page_state( + struct page *page, + int *delalloc, + int *unmapped, + int *unwritten) +{ + struct buffer_head *bh, *head; + + *delalloc = *unmapped = *unwritten = 0; + + bh = head = page_buffers(page); + do { + if (buffer_uptodate(bh) && !buffer_mapped(bh)) + (*unmapped) = 1; + else if (buffer_unwritten(bh) && !buffer_delay(bh)) + clear_buffer_unwritten(bh); + else if (buffer_unwritten(bh)) + (*unwritten) = 1; + else if (buffer_delay(bh)) + (*delalloc) = 1; + } while ((bh = bh->b_this_page) != head); +} + + +/* + * writepage: Called from one of two places: + * + * 1. we are flushing a delalloc buffer head. + * + * 2. we are writing out a dirty page. Typically the page dirty + * state is cleared before we get here. In this case is it + * conceivable we have no buffer heads. + * + * For delalloc space on the page we need to allocate space and + * flush it. For unmapped buffer heads on the page we should + * allocate space if the page is uptodate. For any other dirty + * buffer heads on the page we should flush them. + * + * If we detect that a transaction would be required to flush + * the page, we have to check the process flags first, if we + * are already in a transaction or disk I/O during allocations + * is off, we need to fail the writepage and redirty the page. + */ + +STATIC int +linvfs_writepage( + struct page *page, + struct writeback_control *wbc) +{ + int error; + int need_trans; + int delalloc, unmapped, unwritten; + struct inode *inode = page->mapping->host; + + xfs_page_trace(XFS_WRITEPAGE_ENTER, inode, page, 0); + + /* + * We need a transaction if: + * 1. There are delalloc buffers on the page + * 2. The page is uptodate and we have unmapped buffers + * 3. The page is uptodate and we have no buffers + * 4. There are unwritten buffers on the page + */ + + if (!page_has_buffers(page)) { + unmapped = 1; + need_trans = 1; + } else { + xfs_count_page_state(page, &delalloc, &unmapped, &unwritten); + if (!PageUptodate(page)) + unmapped = 0; + need_trans = delalloc + unmapped + unwritten; + } + + /* + * If we need a transaction and the process flags say + * we are already in a transaction, or no IO is allowed + * then mark the page dirty again and leave the page + * as is. + */ + if (PFLAGS_TEST_FSTRANS() && need_trans) + goto out_fail; + + /* + * Delay hooking up buffer heads until we have + * made our go/no-go decision. + */ + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << inode->i_blkbits, 0); + + /* + * Convert delayed allocate, unwritten or unmapped space + * to real space and flush out to disk. + */ + error = xfs_page_state_convert(inode, page, wbc, 1, unmapped); + if (error == -EAGAIN) + goto out_fail; + if (unlikely(error < 0)) + goto out_unlock; + + return 0; + +out_fail: + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; +out_unlock: + unlock_page(page); + return error; +} + +/* + * Called to move a page into cleanable state - and from there + * to be released. Possibly the page is already clean. We always + * have buffer heads in this call. + * + * Returns 0 if the page is ok to release, 1 otherwise. + * + * Possible scenarios are: + * + * 1. We are being called to release a page which has been written + * to via regular I/O. buffer heads will be dirty and possibly + * delalloc. If no delalloc buffer heads in this case then we + * can just return zero. + * + * 2. We are called to release a page which has been written via + * mmap, all we need to do is ensure there is no delalloc + * state in the buffer heads, if not we can let the caller + * free them and we should come back later via writepage. + */ +STATIC int +linvfs_release_page( + struct page *page, + int gfp_mask) +{ + struct inode *inode = page->mapping->host; + int dirty, delalloc, unmapped, unwritten; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 1, + }; + + xfs_page_trace(XFS_RELEASEPAGE_ENTER, inode, page, gfp_mask); + + xfs_count_page_state(page, &delalloc, &unmapped, &unwritten); + if (!delalloc && !unwritten) + goto free_buffers; + + if (!(gfp_mask & __GFP_FS)) + return 0; + + /* If we are already inside a transaction or the thread cannot + * do I/O, we cannot release this page. + */ + if (PFLAGS_TEST_FSTRANS()) + return 0; + + /* + * Convert delalloc space to real space, do not flush the + * data out to disk, that will be done by the caller. + * Never need to allocate space here - we will always + * come back to writepage in that case. + */ + dirty = xfs_page_state_convert(inode, page, &wbc, 0, 0); + if (dirty == 0 && !unwritten) + goto free_buffers; + return 0; + +free_buffers: + return try_to_free_buffers(page); +} + +STATIC int +linvfs_prepare_write( + struct file *file, + struct page *page, + unsigned int from, + unsigned int to) +{ + return block_prepare_write(page, from, to, linvfs_get_block); +} + +struct address_space_operations linvfs_aops = { + .readpage = linvfs_readpage, + .readpages = linvfs_readpages, + .writepage = linvfs_writepage, + .sync_page = block_sync_page, + .releasepage = linvfs_release_page, + .prepare_write = linvfs_prepare_write, + .commit_write = generic_commit_write, + .bmap = linvfs_bmap, + .direct_IO = linvfs_direct_IO, +}; diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c new file mode 100644 index 000000000000..23e0eb67fc25 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -0,0 +1,1980 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * The xfs_buf.c code provides an abstract buffer cache model on top + * of the Linux page cache. Cached metadata blocks for a file system + * are hashed to the inode for the block device. xfs_buf.c assembles + * buffers (xfs_buf_t) on demand to aggregate such cached pages for I/O. + * + * Written by Steve Lord, Jim Mostek, Russell Cattelan + * and Rajagopal Ananthanarayanan ("ananth") at SGI. + * + */ + +#include <linux/stddef.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/pagemap.h> +#include <linux/init.h> +#include <linux/vmalloc.h> +#include <linux/bio.h> +#include <linux/sysctl.h> +#include <linux/proc_fs.h> +#include <linux/workqueue.h> +#include <linux/percpu.h> +#include <linux/blkdev.h> +#include <linux/hash.h> + +#include "xfs_linux.h" + +/* + * File wide globals + */ + +STATIC kmem_cache_t *pagebuf_cache; +STATIC kmem_shaker_t pagebuf_shake; +STATIC int pagebuf_daemon_wakeup(int, unsigned int); +STATIC void pagebuf_delwri_queue(xfs_buf_t *, int); +STATIC struct workqueue_struct *pagebuf_logio_workqueue; +STATIC struct workqueue_struct *pagebuf_dataio_workqueue; + +/* + * Pagebuf debugging + */ + +#ifdef PAGEBUF_TRACE +void +pagebuf_trace( + xfs_buf_t *pb, + char *id, + void *data, + void *ra) +{ + ktrace_enter(pagebuf_trace_buf, + pb, id, + (void *)(unsigned long)pb->pb_flags, + (void *)(unsigned long)pb->pb_hold.counter, + (void *)(unsigned long)pb->pb_sema.count.counter, + (void *)current, + data, ra, + (void *)(unsigned long)((pb->pb_file_offset>>32) & 0xffffffff), + (void *)(unsigned long)(pb->pb_file_offset & 0xffffffff), + (void *)(unsigned long)pb->pb_buffer_length, + NULL, NULL, NULL, NULL, NULL); +} +ktrace_t *pagebuf_trace_buf; +#define PAGEBUF_TRACE_SIZE 4096 +#define PB_TRACE(pb, id, data) \ + pagebuf_trace(pb, id, (void *)data, (void *)__builtin_return_address(0)) +#else +#define PB_TRACE(pb, id, data) do { } while (0) +#endif + +#ifdef PAGEBUF_LOCK_TRACKING +# define PB_SET_OWNER(pb) ((pb)->pb_last_holder = current->pid) +# define PB_CLEAR_OWNER(pb) ((pb)->pb_last_holder = -1) +# define PB_GET_OWNER(pb) ((pb)->pb_last_holder) +#else +# define PB_SET_OWNER(pb) do { } while (0) +# define PB_CLEAR_OWNER(pb) do { } while (0) +# define PB_GET_OWNER(pb) do { } while (0) +#endif + +/* + * Pagebuf allocation / freeing. + */ + +#define pb_to_gfp(flags) \ + ((((flags) & PBF_READ_AHEAD) ? __GFP_NORETRY : \ + ((flags) & PBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN) + +#define pb_to_km(flags) \ + (((flags) & PBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP) + + +#define pagebuf_allocate(flags) \ + kmem_zone_alloc(pagebuf_cache, pb_to_km(flags)) +#define pagebuf_deallocate(pb) \ + kmem_zone_free(pagebuf_cache, (pb)); + +/* + * Page Region interfaces. + * + * For pages in filesystems where the blocksize is smaller than the + * pagesize, we use the page->private field (long) to hold a bitmap + * of uptodate regions within the page. + * + * Each such region is "bytes per page / bits per long" bytes long. + * + * NBPPR == number-of-bytes-per-page-region + * BTOPR == bytes-to-page-region (rounded up) + * BTOPRT == bytes-to-page-region-truncated (rounded down) + */ +#if (BITS_PER_LONG == 32) +#define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */ +#elif (BITS_PER_LONG == 64) +#define PRSHIFT (PAGE_CACHE_SHIFT - 6) /* (64 == 1<<6) */ +#else +#error BITS_PER_LONG must be 32 or 64 +#endif +#define NBPPR (PAGE_CACHE_SIZE/BITS_PER_LONG) +#define BTOPR(b) (((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT) +#define BTOPRT(b) (((unsigned int)(b) >> PRSHIFT)) + +STATIC unsigned long +page_region_mask( + size_t offset, + size_t length) +{ + unsigned long mask; + int first, final; + + first = BTOPR(offset); + final = BTOPRT(offset + length - 1); + first = min(first, final); + + mask = ~0UL; + mask <<= BITS_PER_LONG - (final - first); + mask >>= BITS_PER_LONG - (final); + + ASSERT(offset + length <= PAGE_CACHE_SIZE); + ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0); + + return mask; +} + +STATIC inline void +set_page_region( + struct page *page, + size_t offset, + size_t length) +{ + page->private |= page_region_mask(offset, length); + if (page->private == ~0UL) + SetPageUptodate(page); +} + +STATIC inline int +test_page_region( + struct page *page, + size_t offset, + size_t length) +{ + unsigned long mask = page_region_mask(offset, length); + + return (mask && (page->private & mask) == mask); +} + +/* + * Mapping of multi-page buffers into contiguous virtual space + */ + +typedef struct a_list { + void *vm_addr; + struct a_list *next; +} a_list_t; + +STATIC a_list_t *as_free_head; +STATIC int as_list_len; +STATIC DEFINE_SPINLOCK(as_lock); + +/* + * Try to batch vunmaps because they are costly. + */ +STATIC void +free_address( + void *addr) +{ + a_list_t *aentry; + + aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC & ~__GFP_HIGH); + if (likely(aentry)) { + spin_lock(&as_lock); + aentry->next = as_free_head; + aentry->vm_addr = addr; + as_free_head = aentry; + as_list_len++; + spin_unlock(&as_lock); + } else { + vunmap(addr); + } +} + +STATIC void +purge_addresses(void) +{ + a_list_t *aentry, *old; + + if (as_free_head == NULL) + return; + + spin_lock(&as_lock); + aentry = as_free_head; + as_free_head = NULL; + as_list_len = 0; + spin_unlock(&as_lock); + + while ((old = aentry) != NULL) { + vunmap(aentry->vm_addr); + aentry = aentry->next; + kfree(old); + } +} + +/* + * Internal pagebuf object manipulation + */ + +STATIC void +_pagebuf_initialize( + xfs_buf_t *pb, + xfs_buftarg_t *target, + loff_t range_base, + size_t range_length, + page_buf_flags_t flags) +{ + /* + * We don't want certain flags to appear in pb->pb_flags. + */ + flags &= ~(PBF_LOCK|PBF_MAPPED|PBF_DONT_BLOCK|PBF_READ_AHEAD); + + memset(pb, 0, sizeof(xfs_buf_t)); + atomic_set(&pb->pb_hold, 1); + init_MUTEX_LOCKED(&pb->pb_iodonesema); + INIT_LIST_HEAD(&pb->pb_list); + INIT_LIST_HEAD(&pb->pb_hash_list); + init_MUTEX_LOCKED(&pb->pb_sema); /* held, no waiters */ + PB_SET_OWNER(pb); + pb->pb_target = target; + pb->pb_file_offset = range_base; + /* + * Set buffer_length and count_desired to the same value initially. + * I/O routines should use count_desired, which will be the same in + * most cases but may be reset (e.g. XFS recovery). + */ + pb->pb_buffer_length = pb->pb_count_desired = range_length; + pb->pb_flags = flags | PBF_NONE; + pb->pb_bn = XFS_BUF_DADDR_NULL; + atomic_set(&pb->pb_pin_count, 0); + init_waitqueue_head(&pb->pb_waiters); + + XFS_STATS_INC(pb_create); + PB_TRACE(pb, "initialize", target); +} + +/* + * Allocate a page array capable of holding a specified number + * of pages, and point the page buf at it. + */ +STATIC int +_pagebuf_get_pages( + xfs_buf_t *pb, + int page_count, + page_buf_flags_t flags) +{ + /* Make sure that we have a page list */ + if (pb->pb_pages == NULL) { + pb->pb_offset = page_buf_poff(pb->pb_file_offset); + pb->pb_page_count = page_count; + if (page_count <= PB_PAGES) { + pb->pb_pages = pb->pb_page_array; + } else { + pb->pb_pages = kmem_alloc(sizeof(struct page *) * + page_count, pb_to_km(flags)); + if (pb->pb_pages == NULL) + return -ENOMEM; + } + memset(pb->pb_pages, 0, sizeof(struct page *) * page_count); + } + return 0; +} + +/* + * Frees pb_pages if it was malloced. + */ +STATIC void +_pagebuf_free_pages( + xfs_buf_t *bp) +{ + if (bp->pb_pages != bp->pb_page_array) { + kmem_free(bp->pb_pages, + bp->pb_page_count * sizeof(struct page *)); + } +} + +/* + * Releases the specified buffer. + * + * The modification state of any associated pages is left unchanged. + * The buffer most not be on any hash - use pagebuf_rele instead for + * hashed and refcounted buffers + */ +void +pagebuf_free( + xfs_buf_t *bp) +{ + PB_TRACE(bp, "free", 0); + + ASSERT(list_empty(&bp->pb_hash_list)); + + if (bp->pb_flags & _PBF_PAGE_CACHE) { + uint i; + + if ((bp->pb_flags & PBF_MAPPED) && (bp->pb_page_count > 1)) + free_address(bp->pb_addr - bp->pb_offset); + + for (i = 0; i < bp->pb_page_count; i++) + page_cache_release(bp->pb_pages[i]); + _pagebuf_free_pages(bp); + } else if (bp->pb_flags & _PBF_KMEM_ALLOC) { + /* + * XXX(hch): bp->pb_count_desired might be incorrect (see + * pagebuf_associate_memory for details), but fortunately + * the Linux version of kmem_free ignores the len argument.. + */ + kmem_free(bp->pb_addr, bp->pb_count_desired); + _pagebuf_free_pages(bp); + } + + pagebuf_deallocate(bp); +} + +/* + * Finds all pages for buffer in question and builds it's page list. + */ +STATIC int +_pagebuf_lookup_pages( + xfs_buf_t *bp, + uint flags) +{ + struct address_space *mapping = bp->pb_target->pbr_mapping; + size_t blocksize = bp->pb_target->pbr_bsize; + size_t size = bp->pb_count_desired; + size_t nbytes, offset; + int gfp_mask = pb_to_gfp(flags); + unsigned short page_count, i; + pgoff_t first; + loff_t end; + int error; + + end = bp->pb_file_offset + bp->pb_buffer_length; + page_count = page_buf_btoc(end) - page_buf_btoct(bp->pb_file_offset); + + error = _pagebuf_get_pages(bp, page_count, flags); + if (unlikely(error)) + return error; + bp->pb_flags |= _PBF_PAGE_CACHE; + + offset = bp->pb_offset; + first = bp->pb_file_offset >> PAGE_CACHE_SHIFT; + + for (i = 0; i < bp->pb_page_count; i++) { + struct page *page; + uint retries = 0; + + retry: + page = find_or_create_page(mapping, first + i, gfp_mask); + if (unlikely(page == NULL)) { + if (flags & PBF_READ_AHEAD) { + bp->pb_page_count = i; + for (i = 0; i < bp->pb_page_count; i++) + unlock_page(bp->pb_pages[i]); + return -ENOMEM; + } + + /* + * This could deadlock. + * + * But until all the XFS lowlevel code is revamped to + * handle buffer allocation failures we can't do much. + */ + if (!(++retries % 100)) + printk(KERN_ERR + "XFS: possible memory allocation " + "deadlock in %s (mode:0x%x)\n", + __FUNCTION__, gfp_mask); + + XFS_STATS_INC(pb_page_retries); + pagebuf_daemon_wakeup(0, gfp_mask); + blk_congestion_wait(WRITE, HZ/50); + goto retry; + } + + XFS_STATS_INC(pb_page_found); + + nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset); + size -= nbytes; + + if (!PageUptodate(page)) { + page_count--; + if (blocksize >= PAGE_CACHE_SIZE) { + if (flags & PBF_READ) + bp->pb_locked = 1; + } else if (!PagePrivate(page)) { + if (test_page_region(page, offset, nbytes)) + page_count++; + } + } + + bp->pb_pages[i] = page; + offset = 0; + } + + if (!bp->pb_locked) { + for (i = 0; i < bp->pb_page_count; i++) + unlock_page(bp->pb_pages[i]); + } + + if (page_count) { + /* if we have any uptodate pages, mark that in the buffer */ + bp->pb_flags &= ~PBF_NONE; + + /* if some pages aren't uptodate, mark that in the buffer */ + if (page_count != bp->pb_page_count) + bp->pb_flags |= PBF_PARTIAL; + } + + PB_TRACE(bp, "lookup_pages", (long)page_count); + return error; +} + +/* + * Map buffer into kernel address-space if nessecary. + */ +STATIC int +_pagebuf_map_pages( + xfs_buf_t *bp, + uint flags) +{ + /* A single page buffer is always mappable */ + if (bp->pb_page_count == 1) { + bp->pb_addr = page_address(bp->pb_pages[0]) + bp->pb_offset; + bp->pb_flags |= PBF_MAPPED; + } else if (flags & PBF_MAPPED) { + if (as_list_len > 64) + purge_addresses(); + bp->pb_addr = vmap(bp->pb_pages, bp->pb_page_count, + VM_MAP, PAGE_KERNEL); + if (unlikely(bp->pb_addr == NULL)) + return -ENOMEM; + bp->pb_addr += bp->pb_offset; + bp->pb_flags |= PBF_MAPPED; + } + + return 0; +} + +/* + * Finding and Reading Buffers + */ + +/* + * _pagebuf_find + * + * Looks up, and creates if absent, a lockable buffer for + * a given range of an inode. The buffer is returned + * locked. If other overlapping buffers exist, they are + * released before the new buffer is created and locked, + * which may imply that this call will block until those buffers + * are unlocked. No I/O is implied by this call. + */ +xfs_buf_t * +_pagebuf_find( + xfs_buftarg_t *btp, /* block device target */ + loff_t ioff, /* starting offset of range */ + size_t isize, /* length of range */ + page_buf_flags_t flags, /* PBF_TRYLOCK */ + xfs_buf_t *new_pb)/* newly allocated buffer */ +{ + loff_t range_base; + size_t range_length; + xfs_bufhash_t *hash; + xfs_buf_t *pb, *n; + + range_base = (ioff << BBSHIFT); + range_length = (isize << BBSHIFT); + + /* Check for IOs smaller than the sector size / not sector aligned */ + ASSERT(!(range_length < (1 << btp->pbr_sshift))); + ASSERT(!(range_base & (loff_t)btp->pbr_smask)); + + hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)]; + + spin_lock(&hash->bh_lock); + + list_for_each_entry_safe(pb, n, &hash->bh_list, pb_hash_list) { + ASSERT(btp == pb->pb_target); + if (pb->pb_file_offset == range_base && + pb->pb_buffer_length == range_length) { + /* + * If we look at something bring it to the + * front of the list for next time. + */ + atomic_inc(&pb->pb_hold); + list_move(&pb->pb_hash_list, &hash->bh_list); + goto found; + } + } + + /* No match found */ + if (new_pb) { + _pagebuf_initialize(new_pb, btp, range_base, + range_length, flags); + new_pb->pb_hash = hash; + list_add(&new_pb->pb_hash_list, &hash->bh_list); + } else { + XFS_STATS_INC(pb_miss_locked); + } + + spin_unlock(&hash->bh_lock); + return new_pb; + +found: + spin_unlock(&hash->bh_lock); + + /* Attempt to get the semaphore without sleeping, + * if this does not work then we need to drop the + * spinlock and do a hard attempt on the semaphore. + */ + if (down_trylock(&pb->pb_sema)) { + if (!(flags & PBF_TRYLOCK)) { + /* wait for buffer ownership */ + PB_TRACE(pb, "get_lock", 0); + pagebuf_lock(pb); + XFS_STATS_INC(pb_get_locked_waited); + } else { + /* We asked for a trylock and failed, no need + * to look at file offset and length here, we + * know that this pagebuf at least overlaps our + * pagebuf and is locked, therefore our buffer + * either does not exist, or is this buffer + */ + + pagebuf_rele(pb); + XFS_STATS_INC(pb_busy_locked); + return (NULL); + } + } else { + /* trylock worked */ + PB_SET_OWNER(pb); + } + + if (pb->pb_flags & PBF_STALE) + pb->pb_flags &= PBF_MAPPED; + PB_TRACE(pb, "got_lock", 0); + XFS_STATS_INC(pb_get_locked); + return (pb); +} + +/* + * xfs_buf_get_flags assembles a buffer covering the specified range. + * + * Storage in memory for all portions of the buffer will be allocated, + * although backing storage may not be. + */ +xfs_buf_t * +xfs_buf_get_flags( /* allocate a buffer */ + xfs_buftarg_t *target,/* target for buffer */ + loff_t ioff, /* starting offset of range */ + size_t isize, /* length of range */ + page_buf_flags_t flags) /* PBF_TRYLOCK */ +{ + xfs_buf_t *pb, *new_pb; + int error = 0, i; + + new_pb = pagebuf_allocate(flags); + if (unlikely(!new_pb)) + return NULL; + + pb = _pagebuf_find(target, ioff, isize, flags, new_pb); + if (pb == new_pb) { + error = _pagebuf_lookup_pages(pb, flags); + if (error) + goto no_buffer; + } else { + pagebuf_deallocate(new_pb); + if (unlikely(pb == NULL)) + return NULL; + } + + for (i = 0; i < pb->pb_page_count; i++) + mark_page_accessed(pb->pb_pages[i]); + + if (!(pb->pb_flags & PBF_MAPPED)) { + error = _pagebuf_map_pages(pb, flags); + if (unlikely(error)) { + printk(KERN_WARNING "%s: failed to map pages\n", + __FUNCTION__); + goto no_buffer; + } + } + + XFS_STATS_INC(pb_get); + + /* + * Always fill in the block number now, the mapped cases can do + * their own overlay of this later. + */ + pb->pb_bn = ioff; + pb->pb_count_desired = pb->pb_buffer_length; + + PB_TRACE(pb, "get", (unsigned long)flags); + return pb; + + no_buffer: + if (flags & (PBF_LOCK | PBF_TRYLOCK)) + pagebuf_unlock(pb); + pagebuf_rele(pb); + return NULL; +} + +xfs_buf_t * +xfs_buf_read_flags( + xfs_buftarg_t *target, + loff_t ioff, + size_t isize, + page_buf_flags_t flags) +{ + xfs_buf_t *pb; + + flags |= PBF_READ; + + pb = xfs_buf_get_flags(target, ioff, isize, flags); + if (pb) { + if (PBF_NOT_DONE(pb)) { + PB_TRACE(pb, "read", (unsigned long)flags); + XFS_STATS_INC(pb_get_read); + pagebuf_iostart(pb, flags); + } else if (flags & PBF_ASYNC) { + PB_TRACE(pb, "read_async", (unsigned long)flags); + /* + * Read ahead call which is already satisfied, + * drop the buffer + */ + goto no_buffer; + } else { + PB_TRACE(pb, "read_done", (unsigned long)flags); + /* We do not want read in the flags */ + pb->pb_flags &= ~PBF_READ; + } + } + + return pb; + + no_buffer: + if (flags & (PBF_LOCK | PBF_TRYLOCK)) + pagebuf_unlock(pb); + pagebuf_rele(pb); + return NULL; +} + +/* + * Create a skeletal pagebuf (no pages associated with it). + */ +xfs_buf_t * +pagebuf_lookup( + xfs_buftarg_t *target, + loff_t ioff, + size_t isize, + page_buf_flags_t flags) +{ + xfs_buf_t *pb; + + pb = pagebuf_allocate(flags); + if (pb) { + _pagebuf_initialize(pb, target, ioff, isize, flags); + } + return pb; +} + +/* + * If we are not low on memory then do the readahead in a deadlock + * safe manner. + */ +void +pagebuf_readahead( + xfs_buftarg_t *target, + loff_t ioff, + size_t isize, + page_buf_flags_t flags) +{ + struct backing_dev_info *bdi; + + bdi = target->pbr_mapping->backing_dev_info; + if (bdi_read_congested(bdi)) + return; + + flags |= (PBF_TRYLOCK|PBF_ASYNC|PBF_READ_AHEAD); + xfs_buf_read_flags(target, ioff, isize, flags); +} + +xfs_buf_t * +pagebuf_get_empty( + size_t len, + xfs_buftarg_t *target) +{ + xfs_buf_t *pb; + + pb = pagebuf_allocate(0); + if (pb) + _pagebuf_initialize(pb, target, 0, len, 0); + return pb; +} + +static inline struct page * +mem_to_page( + void *addr) +{ + if (((unsigned long)addr < VMALLOC_START) || + ((unsigned long)addr >= VMALLOC_END)) { + return virt_to_page(addr); + } else { + return vmalloc_to_page(addr); + } +} + +int +pagebuf_associate_memory( + xfs_buf_t *pb, + void *mem, + size_t len) +{ + int rval; + int i = 0; + size_t ptr; + size_t end, end_cur; + off_t offset; + int page_count; + + page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT; + offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK); + if (offset && (len > PAGE_CACHE_SIZE)) + page_count++; + + /* Free any previous set of page pointers */ + if (pb->pb_pages) + _pagebuf_free_pages(pb); + + pb->pb_pages = NULL; + pb->pb_addr = mem; + + rval = _pagebuf_get_pages(pb, page_count, 0); + if (rval) + return rval; + + pb->pb_offset = offset; + ptr = (size_t) mem & PAGE_CACHE_MASK; + end = PAGE_CACHE_ALIGN((size_t) mem + len); + end_cur = end; + /* set up first page */ + pb->pb_pages[0] = mem_to_page(mem); + + ptr += PAGE_CACHE_SIZE; + pb->pb_page_count = ++i; + while (ptr < end) { + pb->pb_pages[i] = mem_to_page((void *)ptr); + pb->pb_page_count = ++i; + ptr += PAGE_CACHE_SIZE; + } + pb->pb_locked = 0; + + pb->pb_count_desired = pb->pb_buffer_length = len; + pb->pb_flags |= PBF_MAPPED; + + return 0; +} + +xfs_buf_t * +pagebuf_get_no_daddr( + size_t len, + xfs_buftarg_t *target) +{ + size_t malloc_len = len; + xfs_buf_t *bp; + void *data; + int error; + + bp = pagebuf_allocate(0); + if (unlikely(bp == NULL)) + goto fail; + _pagebuf_initialize(bp, target, 0, len, PBF_FORCEIO); + + try_again: + data = kmem_alloc(malloc_len, KM_SLEEP | KM_MAYFAIL); + if (unlikely(data == NULL)) + goto fail_free_buf; + + /* check whether alignment matches.. */ + if ((__psunsigned_t)data != + ((__psunsigned_t)data & ~target->pbr_smask)) { + /* .. else double the size and try again */ + kmem_free(data, malloc_len); + malloc_len <<= 1; + goto try_again; + } + + error = pagebuf_associate_memory(bp, data, len); + if (error) + goto fail_free_mem; + bp->pb_flags |= _PBF_KMEM_ALLOC; + + pagebuf_unlock(bp); + + PB_TRACE(bp, "no_daddr", data); + return bp; + fail_free_mem: + kmem_free(data, malloc_len); + fail_free_buf: + pagebuf_free(bp); + fail: + return NULL; +} + +/* + * pagebuf_hold + * + * Increment reference count on buffer, to hold the buffer concurrently + * with another thread which may release (free) the buffer asynchronously. + * + * Must hold the buffer already to call this function. + */ +void +pagebuf_hold( + xfs_buf_t *pb) +{ + atomic_inc(&pb->pb_hold); + PB_TRACE(pb, "hold", 0); +} + +/* + * pagebuf_rele + * + * pagebuf_rele releases a hold on the specified buffer. If the + * the hold count is 1, pagebuf_rele calls pagebuf_free. + */ +void +pagebuf_rele( + xfs_buf_t *pb) +{ + xfs_bufhash_t *hash = pb->pb_hash; + + PB_TRACE(pb, "rele", pb->pb_relse); + + /* + * pagebuf_lookup buffers are not hashed, not delayed write, + * and don't have their own release routines. Special case. + */ + if (unlikely(!hash)) { + ASSERT(!pb->pb_relse); + if (atomic_dec_and_test(&pb->pb_hold)) + xfs_buf_free(pb); + return; + } + + if (atomic_dec_and_lock(&pb->pb_hold, &hash->bh_lock)) { + int do_free = 1; + + if (pb->pb_relse) { + atomic_inc(&pb->pb_hold); + spin_unlock(&hash->bh_lock); + (*(pb->pb_relse)) (pb); + spin_lock(&hash->bh_lock); + do_free = 0; + } + + if (pb->pb_flags & PBF_DELWRI) { + pb->pb_flags |= PBF_ASYNC; + atomic_inc(&pb->pb_hold); + pagebuf_delwri_queue(pb, 0); + do_free = 0; + } else if (pb->pb_flags & PBF_FS_MANAGED) { + do_free = 0; + } + + if (do_free) { + list_del_init(&pb->pb_hash_list); + spin_unlock(&hash->bh_lock); + pagebuf_free(pb); + } else { + spin_unlock(&hash->bh_lock); + } + } +} + + +/* + * Mutual exclusion on buffers. Locking model: + * + * Buffers associated with inodes for which buffer locking + * is not enabled are not protected by semaphores, and are + * assumed to be exclusively owned by the caller. There is a + * spinlock in the buffer, used by the caller when concurrent + * access is possible. + */ + +/* + * pagebuf_cond_lock + * + * pagebuf_cond_lock locks a buffer object, if it is not already locked. + * Note that this in no way + * locks the underlying pages, so it is only useful for synchronizing + * concurrent use of page buffer objects, not for synchronizing independent + * access to the underlying pages. + */ +int +pagebuf_cond_lock( /* lock buffer, if not locked */ + /* returns -EBUSY if locked) */ + xfs_buf_t *pb) +{ + int locked; + + locked = down_trylock(&pb->pb_sema) == 0; + if (locked) { + PB_SET_OWNER(pb); + } + PB_TRACE(pb, "cond_lock", (long)locked); + return(locked ? 0 : -EBUSY); +} + +#if defined(DEBUG) || defined(XFS_BLI_TRACE) +/* + * pagebuf_lock_value + * + * Return lock value for a pagebuf + */ +int +pagebuf_lock_value( + xfs_buf_t *pb) +{ + return(atomic_read(&pb->pb_sema.count)); +} +#endif + +/* + * pagebuf_lock + * + * pagebuf_lock locks a buffer object. Note that this in no way + * locks the underlying pages, so it is only useful for synchronizing + * concurrent use of page buffer objects, not for synchronizing independent + * access to the underlying pages. + */ +int +pagebuf_lock( + xfs_buf_t *pb) +{ + PB_TRACE(pb, "lock", 0); + if (atomic_read(&pb->pb_io_remaining)) + blk_run_address_space(pb->pb_target->pbr_mapping); + down(&pb->pb_sema); + PB_SET_OWNER(pb); + PB_TRACE(pb, "locked", 0); + return 0; +} + +/* + * pagebuf_unlock + * + * pagebuf_unlock releases the lock on the buffer object created by + * pagebuf_lock or pagebuf_cond_lock (not any + * pinning of underlying pages created by pagebuf_pin). + */ +void +pagebuf_unlock( /* unlock buffer */ + xfs_buf_t *pb) /* buffer to unlock */ +{ + PB_CLEAR_OWNER(pb); + up(&pb->pb_sema); + PB_TRACE(pb, "unlock", 0); +} + + +/* + * Pinning Buffer Storage in Memory + */ + +/* + * pagebuf_pin + * + * pagebuf_pin locks all of the memory represented by a buffer in + * memory. Multiple calls to pagebuf_pin and pagebuf_unpin, for + * the same or different buffers affecting a given page, will + * properly count the number of outstanding "pin" requests. The + * buffer may be released after the pagebuf_pin and a different + * buffer used when calling pagebuf_unpin, if desired. + * pagebuf_pin should be used by the file system when it wants be + * assured that no attempt will be made to force the affected + * memory to disk. It does not assure that a given logical page + * will not be moved to a different physical page. + */ +void +pagebuf_pin( + xfs_buf_t *pb) +{ + atomic_inc(&pb->pb_pin_count); + PB_TRACE(pb, "pin", (long)pb->pb_pin_count.counter); +} + +/* + * pagebuf_unpin + * + * pagebuf_unpin reverses the locking of memory performed by + * pagebuf_pin. Note that both functions affected the logical + * pages associated with the buffer, not the buffer itself. + */ +void +pagebuf_unpin( + xfs_buf_t *pb) +{ + if (atomic_dec_and_test(&pb->pb_pin_count)) { + wake_up_all(&pb->pb_waiters); + } + PB_TRACE(pb, "unpin", (long)pb->pb_pin_count.counter); +} + +int +pagebuf_ispin( + xfs_buf_t *pb) +{ + return atomic_read(&pb->pb_pin_count); +} + +/* + * pagebuf_wait_unpin + * + * pagebuf_wait_unpin waits until all of the memory associated + * with the buffer is not longer locked in memory. It returns + * immediately if none of the affected pages are locked. + */ +static inline void +_pagebuf_wait_unpin( + xfs_buf_t *pb) +{ + DECLARE_WAITQUEUE (wait, current); + + if (atomic_read(&pb->pb_pin_count) == 0) + return; + + add_wait_queue(&pb->pb_waiters, &wait); + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (atomic_read(&pb->pb_pin_count) == 0) + break; + if (atomic_read(&pb->pb_io_remaining)) + blk_run_address_space(pb->pb_target->pbr_mapping); + schedule(); + } + remove_wait_queue(&pb->pb_waiters, &wait); + set_current_state(TASK_RUNNING); +} + +/* + * Buffer Utility Routines + */ + +/* + * pagebuf_iodone + * + * pagebuf_iodone marks a buffer for which I/O is in progress + * done with respect to that I/O. The pb_iodone routine, if + * present, will be called as a side-effect. + */ +STATIC void +pagebuf_iodone_work( + void *v) +{ + xfs_buf_t *bp = (xfs_buf_t *)v; + + if (bp->pb_iodone) + (*(bp->pb_iodone))(bp); + else if (bp->pb_flags & PBF_ASYNC) + xfs_buf_relse(bp); +} + +void +pagebuf_iodone( + xfs_buf_t *pb, + int dataio, + int schedule) +{ + pb->pb_flags &= ~(PBF_READ | PBF_WRITE); + if (pb->pb_error == 0) { + pb->pb_flags &= ~(PBF_PARTIAL | PBF_NONE); + } + + PB_TRACE(pb, "iodone", pb->pb_iodone); + + if ((pb->pb_iodone) || (pb->pb_flags & PBF_ASYNC)) { + if (schedule) { + INIT_WORK(&pb->pb_iodone_work, pagebuf_iodone_work, pb); + queue_work(dataio ? pagebuf_dataio_workqueue : + pagebuf_logio_workqueue, &pb->pb_iodone_work); + } else { + pagebuf_iodone_work(pb); + } + } else { + up(&pb->pb_iodonesema); + } +} + +/* + * pagebuf_ioerror + * + * pagebuf_ioerror sets the error code for a buffer. + */ +void +pagebuf_ioerror( /* mark/clear buffer error flag */ + xfs_buf_t *pb, /* buffer to mark */ + int error) /* error to store (0 if none) */ +{ + ASSERT(error >= 0 && error <= 0xffff); + pb->pb_error = (unsigned short)error; + PB_TRACE(pb, "ioerror", (unsigned long)error); +} + +/* + * pagebuf_iostart + * + * pagebuf_iostart initiates I/O on a buffer, based on the flags supplied. + * If necessary, it will arrange for any disk space allocation required, + * and it will break up the request if the block mappings require it. + * The pb_iodone routine in the buffer supplied will only be called + * when all of the subsidiary I/O requests, if any, have been completed. + * pagebuf_iostart calls the pagebuf_ioinitiate routine or + * pagebuf_iorequest, if the former routine is not defined, to start + * the I/O on a given low-level request. + */ +int +pagebuf_iostart( /* start I/O on a buffer */ + xfs_buf_t *pb, /* buffer to start */ + page_buf_flags_t flags) /* PBF_LOCK, PBF_ASYNC, PBF_READ, */ + /* PBF_WRITE, PBF_DELWRI, */ + /* PBF_DONT_BLOCK */ +{ + int status = 0; + + PB_TRACE(pb, "iostart", (unsigned long)flags); + + if (flags & PBF_DELWRI) { + pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC); + pb->pb_flags |= flags & (PBF_DELWRI | PBF_ASYNC); + pagebuf_delwri_queue(pb, 1); + return status; + } + + pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC | PBF_DELWRI | \ + PBF_READ_AHEAD | _PBF_RUN_QUEUES); + pb->pb_flags |= flags & (PBF_READ | PBF_WRITE | PBF_ASYNC | \ + PBF_READ_AHEAD | _PBF_RUN_QUEUES); + + BUG_ON(pb->pb_bn == XFS_BUF_DADDR_NULL); + + /* For writes allow an alternate strategy routine to precede + * the actual I/O request (which may not be issued at all in + * a shutdown situation, for example). + */ + status = (flags & PBF_WRITE) ? + pagebuf_iostrategy(pb) : pagebuf_iorequest(pb); + + /* Wait for I/O if we are not an async request. + * Note: async I/O request completion will release the buffer, + * and that can already be done by this point. So using the + * buffer pointer from here on, after async I/O, is invalid. + */ + if (!status && !(flags & PBF_ASYNC)) + status = pagebuf_iowait(pb); + + return status; +} + +/* + * Helper routine for pagebuf_iorequest + */ + +STATIC __inline__ int +_pagebuf_iolocked( + xfs_buf_t *pb) +{ + ASSERT(pb->pb_flags & (PBF_READ|PBF_WRITE)); + if (pb->pb_flags & PBF_READ) + return pb->pb_locked; + return 0; +} + +STATIC __inline__ void +_pagebuf_iodone( + xfs_buf_t *pb, + int schedule) +{ + if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) { + pb->pb_locked = 0; + pagebuf_iodone(pb, (pb->pb_flags & PBF_FS_DATAIOD), schedule); + } +} + +STATIC int +bio_end_io_pagebuf( + struct bio *bio, + unsigned int bytes_done, + int error) +{ + xfs_buf_t *pb = (xfs_buf_t *)bio->bi_private; + unsigned int i, blocksize = pb->pb_target->pbr_bsize; + struct bio_vec *bvec = bio->bi_io_vec; + + if (bio->bi_size) + return 1; + + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + pb->pb_error = EIO; + + for (i = 0; i < bio->bi_vcnt; i++, bvec++) { + struct page *page = bvec->bv_page; + + if (pb->pb_error) { + SetPageError(page); + } else if (blocksize == PAGE_CACHE_SIZE) { + SetPageUptodate(page); + } else if (!PagePrivate(page) && + (pb->pb_flags & _PBF_PAGE_CACHE)) { + set_page_region(page, bvec->bv_offset, bvec->bv_len); + } + + if (_pagebuf_iolocked(pb)) { + unlock_page(page); + } + } + + _pagebuf_iodone(pb, 1); + bio_put(bio); + return 0; +} + +STATIC void +_pagebuf_ioapply( + xfs_buf_t *pb) +{ + int i, rw, map_i, total_nr_pages, nr_pages; + struct bio *bio; + int offset = pb->pb_offset; + int size = pb->pb_count_desired; + sector_t sector = pb->pb_bn; + unsigned int blocksize = pb->pb_target->pbr_bsize; + int locking = _pagebuf_iolocked(pb); + + total_nr_pages = pb->pb_page_count; + map_i = 0; + + if (pb->pb_flags & _PBF_RUN_QUEUES) { + pb->pb_flags &= ~_PBF_RUN_QUEUES; + rw = (pb->pb_flags & PBF_READ) ? READ_SYNC : WRITE_SYNC; + } else { + rw = (pb->pb_flags & PBF_READ) ? READ : WRITE; + } + + /* Special code path for reading a sub page size pagebuf in -- + * we populate up the whole page, and hence the other metadata + * in the same page. This optimization is only valid when the + * filesystem block size and the page size are equal. + */ + if ((pb->pb_buffer_length < PAGE_CACHE_SIZE) && + (pb->pb_flags & PBF_READ) && locking && + (blocksize == PAGE_CACHE_SIZE)) { + bio = bio_alloc(GFP_NOIO, 1); + + bio->bi_bdev = pb->pb_target->pbr_bdev; + bio->bi_sector = sector - (offset >> BBSHIFT); + bio->bi_end_io = bio_end_io_pagebuf; + bio->bi_private = pb; + + bio_add_page(bio, pb->pb_pages[0], PAGE_CACHE_SIZE, 0); + size = 0; + + atomic_inc(&pb->pb_io_remaining); + + goto submit_io; + } + + /* Lock down the pages which we need to for the request */ + if (locking && (pb->pb_flags & PBF_WRITE) && (pb->pb_locked == 0)) { + for (i = 0; size; i++) { + int nbytes = PAGE_CACHE_SIZE - offset; + struct page *page = pb->pb_pages[i]; + + if (nbytes > size) + nbytes = size; + + lock_page(page); + + size -= nbytes; + offset = 0; + } + offset = pb->pb_offset; + size = pb->pb_count_desired; + } + +next_chunk: + atomic_inc(&pb->pb_io_remaining); + nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT); + if (nr_pages > total_nr_pages) + nr_pages = total_nr_pages; + + bio = bio_alloc(GFP_NOIO, nr_pages); + bio->bi_bdev = pb->pb_target->pbr_bdev; + bio->bi_sector = sector; + bio->bi_end_io = bio_end_io_pagebuf; + bio->bi_private = pb; + + for (; size && nr_pages; nr_pages--, map_i++) { + int nbytes = PAGE_CACHE_SIZE - offset; + + if (nbytes > size) + nbytes = size; + + if (bio_add_page(bio, pb->pb_pages[map_i], + nbytes, offset) < nbytes) + break; + + offset = 0; + sector += nbytes >> BBSHIFT; + size -= nbytes; + total_nr_pages--; + } + +submit_io: + if (likely(bio->bi_size)) { + submit_bio(rw, bio); + if (size) + goto next_chunk; + } else { + bio_put(bio); + pagebuf_ioerror(pb, EIO); + } +} + +/* + * pagebuf_iorequest -- the core I/O request routine. + */ +int +pagebuf_iorequest( /* start real I/O */ + xfs_buf_t *pb) /* buffer to convey to device */ +{ + PB_TRACE(pb, "iorequest", 0); + + if (pb->pb_flags & PBF_DELWRI) { + pagebuf_delwri_queue(pb, 1); + return 0; + } + + if (pb->pb_flags & PBF_WRITE) { + _pagebuf_wait_unpin(pb); + } + + pagebuf_hold(pb); + + /* Set the count to 1 initially, this will stop an I/O + * completion callout which happens before we have started + * all the I/O from calling pagebuf_iodone too early. + */ + atomic_set(&pb->pb_io_remaining, 1); + _pagebuf_ioapply(pb); + _pagebuf_iodone(pb, 0); + + pagebuf_rele(pb); + return 0; +} + +/* + * pagebuf_iowait + * + * pagebuf_iowait waits for I/O to complete on the buffer supplied. + * It returns immediately if no I/O is pending. In any case, it returns + * the error code, if any, or 0 if there is no error. + */ +int +pagebuf_iowait( + xfs_buf_t *pb) +{ + PB_TRACE(pb, "iowait", 0); + if (atomic_read(&pb->pb_io_remaining)) + blk_run_address_space(pb->pb_target->pbr_mapping); + down(&pb->pb_iodonesema); + PB_TRACE(pb, "iowaited", (long)pb->pb_error); + return pb->pb_error; +} + +caddr_t +pagebuf_offset( + xfs_buf_t *pb, + size_t offset) +{ + struct page *page; + + offset += pb->pb_offset; + + page = pb->pb_pages[offset >> PAGE_CACHE_SHIFT]; + return (caddr_t) page_address(page) + (offset & (PAGE_CACHE_SIZE - 1)); +} + +/* + * pagebuf_iomove + * + * Move data into or out of a buffer. + */ +void +pagebuf_iomove( + xfs_buf_t *pb, /* buffer to process */ + size_t boff, /* starting buffer offset */ + size_t bsize, /* length to copy */ + caddr_t data, /* data address */ + page_buf_rw_t mode) /* read/write flag */ +{ + size_t bend, cpoff, csize; + struct page *page; + + bend = boff + bsize; + while (boff < bend) { + page = pb->pb_pages[page_buf_btoct(boff + pb->pb_offset)]; + cpoff = page_buf_poff(boff + pb->pb_offset); + csize = min_t(size_t, + PAGE_CACHE_SIZE-cpoff, pb->pb_count_desired-boff); + + ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE)); + + switch (mode) { + case PBRW_ZERO: + memset(page_address(page) + cpoff, 0, csize); + break; + case PBRW_READ: + memcpy(data, page_address(page) + cpoff, csize); + break; + case PBRW_WRITE: + memcpy(page_address(page) + cpoff, data, csize); + } + + boff += csize; + data += csize; + } +} + +/* + * Handling of buftargs. + */ + +/* + * Wait for any bufs with callbacks that have been submitted but + * have not yet returned... walk the hash list for the target. + */ +void +xfs_wait_buftarg( + xfs_buftarg_t *btp) +{ + xfs_buf_t *bp, *n; + xfs_bufhash_t *hash; + uint i; + + for (i = 0; i < (1 << btp->bt_hashshift); i++) { + hash = &btp->bt_hash[i]; +again: + spin_lock(&hash->bh_lock); + list_for_each_entry_safe(bp, n, &hash->bh_list, pb_hash_list) { + ASSERT(btp == bp->pb_target); + if (!(bp->pb_flags & PBF_FS_MANAGED)) { + spin_unlock(&hash->bh_lock); + delay(100); + goto again; + } + } + spin_unlock(&hash->bh_lock); + } +} + +/* + * Allocate buffer hash table for a given target. + * For devices containing metadata (i.e. not the log/realtime devices) + * we need to allocate a much larger hash table. + */ +STATIC void +xfs_alloc_bufhash( + xfs_buftarg_t *btp, + int external) +{ + unsigned int i; + + btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */ + btp->bt_hashmask = (1 << btp->bt_hashshift) - 1; + btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) * + sizeof(xfs_bufhash_t), KM_SLEEP); + for (i = 0; i < (1 << btp->bt_hashshift); i++) { + spin_lock_init(&btp->bt_hash[i].bh_lock); + INIT_LIST_HEAD(&btp->bt_hash[i].bh_list); + } +} + +STATIC void +xfs_free_bufhash( + xfs_buftarg_t *btp) +{ + kmem_free(btp->bt_hash, + (1 << btp->bt_hashshift) * sizeof(xfs_bufhash_t)); + btp->bt_hash = NULL; +} + +void +xfs_free_buftarg( + xfs_buftarg_t *btp, + int external) +{ + xfs_flush_buftarg(btp, 1); + if (external) + xfs_blkdev_put(btp->pbr_bdev); + xfs_free_bufhash(btp); + iput(btp->pbr_mapping->host); + kmem_free(btp, sizeof(*btp)); +} + +void +xfs_incore_relse( + xfs_buftarg_t *btp, + int delwri_only, + int wait) +{ + invalidate_bdev(btp->pbr_bdev, 1); + truncate_inode_pages(btp->pbr_mapping, 0LL); +} + +STATIC int +xfs_setsize_buftarg_flags( + xfs_buftarg_t *btp, + unsigned int blocksize, + unsigned int sectorsize, + int verbose) +{ + btp->pbr_bsize = blocksize; + btp->pbr_sshift = ffs(sectorsize) - 1; + btp->pbr_smask = sectorsize - 1; + + if (set_blocksize(btp->pbr_bdev, sectorsize)) { + printk(KERN_WARNING + "XFS: Cannot set_blocksize to %u on device %s\n", + sectorsize, XFS_BUFTARG_NAME(btp)); + return EINVAL; + } + + if (verbose && + (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) { + printk(KERN_WARNING + "XFS: %u byte sectors in use on device %s. " + "This is suboptimal; %u or greater is ideal.\n", + sectorsize, XFS_BUFTARG_NAME(btp), + (unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG); + } + + return 0; +} + +/* +* When allocating the initial buffer target we have not yet +* read in the superblock, so don't know what sized sectors +* are being used is at this early stage. Play safe. +*/ +STATIC int +xfs_setsize_buftarg_early( + xfs_buftarg_t *btp, + struct block_device *bdev) +{ + return xfs_setsize_buftarg_flags(btp, + PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0); +} + +int +xfs_setsize_buftarg( + xfs_buftarg_t *btp, + unsigned int blocksize, + unsigned int sectorsize) +{ + return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1); +} + +STATIC int +xfs_mapping_buftarg( + xfs_buftarg_t *btp, + struct block_device *bdev) +{ + struct backing_dev_info *bdi; + struct inode *inode; + struct address_space *mapping; + static struct address_space_operations mapping_aops = { + .sync_page = block_sync_page, + }; + + inode = new_inode(bdev->bd_inode->i_sb); + if (!inode) { + printk(KERN_WARNING + "XFS: Cannot allocate mapping inode for device %s\n", + XFS_BUFTARG_NAME(btp)); + return ENOMEM; + } + inode->i_mode = S_IFBLK; + inode->i_bdev = bdev; + inode->i_rdev = bdev->bd_dev; + bdi = blk_get_backing_dev_info(bdev); + if (!bdi) + bdi = &default_backing_dev_info; + mapping = &inode->i_data; + mapping->a_ops = &mapping_aops; + mapping->backing_dev_info = bdi; + mapping_set_gfp_mask(mapping, GFP_NOFS); + btp->pbr_mapping = mapping; + return 0; +} + +xfs_buftarg_t * +xfs_alloc_buftarg( + struct block_device *bdev, + int external) +{ + xfs_buftarg_t *btp; + + btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); + + btp->pbr_dev = bdev->bd_dev; + btp->pbr_bdev = bdev; + if (xfs_setsize_buftarg_early(btp, bdev)) + goto error; + if (xfs_mapping_buftarg(btp, bdev)) + goto error; + xfs_alloc_bufhash(btp, external); + return btp; + +error: + kmem_free(btp, sizeof(*btp)); + return NULL; +} + + +/* + * Pagebuf delayed write buffer handling + */ + +STATIC LIST_HEAD(pbd_delwrite_queue); +STATIC DEFINE_SPINLOCK(pbd_delwrite_lock); + +STATIC void +pagebuf_delwri_queue( + xfs_buf_t *pb, + int unlock) +{ + PB_TRACE(pb, "delwri_q", (long)unlock); + ASSERT(pb->pb_flags & PBF_DELWRI); + + spin_lock(&pbd_delwrite_lock); + /* If already in the queue, dequeue and place at tail */ + if (!list_empty(&pb->pb_list)) { + if (unlock) { + atomic_dec(&pb->pb_hold); + } + list_del(&pb->pb_list); + } + + list_add_tail(&pb->pb_list, &pbd_delwrite_queue); + pb->pb_queuetime = jiffies; + spin_unlock(&pbd_delwrite_lock); + + if (unlock) + pagebuf_unlock(pb); +} + +void +pagebuf_delwri_dequeue( + xfs_buf_t *pb) +{ + int dequeued = 0; + + spin_lock(&pbd_delwrite_lock); + if ((pb->pb_flags & PBF_DELWRI) && !list_empty(&pb->pb_list)) { + list_del_init(&pb->pb_list); + dequeued = 1; + } + pb->pb_flags &= ~PBF_DELWRI; + spin_unlock(&pbd_delwrite_lock); + + if (dequeued) + pagebuf_rele(pb); + + PB_TRACE(pb, "delwri_dq", (long)dequeued); +} + +STATIC void +pagebuf_runall_queues( + struct workqueue_struct *queue) +{ + flush_workqueue(queue); +} + +/* Defines for pagebuf daemon */ +STATIC DECLARE_COMPLETION(pagebuf_daemon_done); +STATIC struct task_struct *pagebuf_daemon_task; +STATIC int pagebuf_daemon_active; +STATIC int force_flush; + + +STATIC int +pagebuf_daemon_wakeup( + int priority, + unsigned int mask) +{ + force_flush = 1; + barrier(); + wake_up_process(pagebuf_daemon_task); + return 0; +} + +STATIC int +pagebuf_daemon( + void *data) +{ + struct list_head tmp; + unsigned long age; + xfs_buftarg_t *target; + xfs_buf_t *pb, *n; + + /* Set up the thread */ + daemonize("xfsbufd"); + current->flags |= PF_MEMALLOC; + + pagebuf_daemon_task = current; + pagebuf_daemon_active = 1; + barrier(); + + INIT_LIST_HEAD(&tmp); + do { + try_to_freeze(PF_FREEZE); + + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout((xfs_buf_timer_centisecs * HZ) / 100); + + age = (xfs_buf_age_centisecs * HZ) / 100; + spin_lock(&pbd_delwrite_lock); + list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) { + PB_TRACE(pb, "walkq1", (long)pagebuf_ispin(pb)); + ASSERT(pb->pb_flags & PBF_DELWRI); + + if (!pagebuf_ispin(pb) && !pagebuf_cond_lock(pb)) { + if (!force_flush && + time_before(jiffies, + pb->pb_queuetime + age)) { + pagebuf_unlock(pb); + break; + } + + pb->pb_flags &= ~PBF_DELWRI; + pb->pb_flags |= PBF_WRITE; + list_move(&pb->pb_list, &tmp); + } + } + spin_unlock(&pbd_delwrite_lock); + + while (!list_empty(&tmp)) { + pb = list_entry(tmp.next, xfs_buf_t, pb_list); + target = pb->pb_target; + + list_del_init(&pb->pb_list); + pagebuf_iostrategy(pb); + + blk_run_address_space(target->pbr_mapping); + } + + if (as_list_len > 0) + purge_addresses(); + + force_flush = 0; + } while (pagebuf_daemon_active); + + complete_and_exit(&pagebuf_daemon_done, 0); +} + +/* + * Go through all incore buffers, and release buffers if they belong to + * the given device. This is used in filesystem error handling to + * preserve the consistency of its metadata. + */ +int +xfs_flush_buftarg( + xfs_buftarg_t *target, + int wait) +{ + struct list_head tmp; + xfs_buf_t *pb, *n; + int pincount = 0; + + pagebuf_runall_queues(pagebuf_dataio_workqueue); + pagebuf_runall_queues(pagebuf_logio_workqueue); + + INIT_LIST_HEAD(&tmp); + spin_lock(&pbd_delwrite_lock); + list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) { + + if (pb->pb_target != target) + continue; + + ASSERT(pb->pb_flags & PBF_DELWRI); + PB_TRACE(pb, "walkq2", (long)pagebuf_ispin(pb)); + if (pagebuf_ispin(pb)) { + pincount++; + continue; + } + + pb->pb_flags &= ~PBF_DELWRI; + pb->pb_flags |= PBF_WRITE; + list_move(&pb->pb_list, &tmp); + } + spin_unlock(&pbd_delwrite_lock); + + /* + * Dropped the delayed write list lock, now walk the temporary list + */ + list_for_each_entry_safe(pb, n, &tmp, pb_list) { + if (wait) + pb->pb_flags &= ~PBF_ASYNC; + else + list_del_init(&pb->pb_list); + + pagebuf_lock(pb); + pagebuf_iostrategy(pb); + } + + /* + * Remaining list items must be flushed before returning + */ + while (!list_empty(&tmp)) { + pb = list_entry(tmp.next, xfs_buf_t, pb_list); + + list_del_init(&pb->pb_list); + xfs_iowait(pb); + xfs_buf_relse(pb); + } + + if (wait) + blk_run_address_space(target->pbr_mapping); + + return pincount; +} + +STATIC int +pagebuf_daemon_start(void) +{ + int rval; + + pagebuf_logio_workqueue = create_workqueue("xfslogd"); + if (!pagebuf_logio_workqueue) + return -ENOMEM; + + pagebuf_dataio_workqueue = create_workqueue("xfsdatad"); + if (!pagebuf_dataio_workqueue) { + destroy_workqueue(pagebuf_logio_workqueue); + return -ENOMEM; + } + + rval = kernel_thread(pagebuf_daemon, NULL, CLONE_FS|CLONE_FILES); + if (rval < 0) { + destroy_workqueue(pagebuf_logio_workqueue); + destroy_workqueue(pagebuf_dataio_workqueue); + } + + return rval; +} + +/* + * pagebuf_daemon_stop + * + * Note: do not mark as __exit, it is called from pagebuf_terminate. + */ +STATIC void +pagebuf_daemon_stop(void) +{ + pagebuf_daemon_active = 0; + barrier(); + wait_for_completion(&pagebuf_daemon_done); + + destroy_workqueue(pagebuf_logio_workqueue); + destroy_workqueue(pagebuf_dataio_workqueue); +} + +/* + * Initialization and Termination + */ + +int __init +pagebuf_init(void) +{ + pagebuf_cache = kmem_cache_create("xfs_buf_t", sizeof(xfs_buf_t), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (pagebuf_cache == NULL) { + printk("XFS: couldn't init xfs_buf_t cache\n"); + pagebuf_terminate(); + return -ENOMEM; + } + +#ifdef PAGEBUF_TRACE + pagebuf_trace_buf = ktrace_alloc(PAGEBUF_TRACE_SIZE, KM_SLEEP); +#endif + + pagebuf_daemon_start(); + + pagebuf_shake = kmem_shake_register(pagebuf_daemon_wakeup); + if (pagebuf_shake == NULL) { + pagebuf_terminate(); + return -ENOMEM; + } + + return 0; +} + + +/* + * pagebuf_terminate. + * + * Note: do not mark as __exit, this is also called from the __init code. + */ +void +pagebuf_terminate(void) +{ + pagebuf_daemon_stop(); + +#ifdef PAGEBUF_TRACE + ktrace_free(pagebuf_trace_buf); +#endif + + kmem_zone_destroy(pagebuf_cache); + kmem_shake_deregister(pagebuf_shake); +} diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h new file mode 100644 index 000000000000..74deed8e6d90 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -0,0 +1,591 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * Written by Steve Lord, Jim Mostek, Russell Cattelan at SGI + */ + +#ifndef __XFS_BUF_H__ +#define __XFS_BUF_H__ + +#include <linux/config.h> +#include <linux/list.h> +#include <linux/types.h> +#include <linux/spinlock.h> +#include <asm/system.h> +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/uio.h> + +/* + * Base types + */ + +#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) + +#define page_buf_ctob(pp) ((pp) * PAGE_CACHE_SIZE) +#define page_buf_btoc(dd) (((dd) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) +#define page_buf_btoct(dd) ((dd) >> PAGE_CACHE_SHIFT) +#define page_buf_poff(aa) ((aa) & ~PAGE_CACHE_MASK) + +typedef enum page_buf_rw_e { + PBRW_READ = 1, /* transfer into target memory */ + PBRW_WRITE = 2, /* transfer from target memory */ + PBRW_ZERO = 3 /* Zero target memory */ +} page_buf_rw_t; + + +typedef enum page_buf_flags_e { /* pb_flags values */ + PBF_READ = (1 << 0), /* buffer intended for reading from device */ + PBF_WRITE = (1 << 1), /* buffer intended for writing to device */ + PBF_MAPPED = (1 << 2), /* buffer mapped (pb_addr valid) */ + PBF_PARTIAL = (1 << 3), /* buffer partially read */ + PBF_ASYNC = (1 << 4), /* initiator will not wait for completion */ + PBF_NONE = (1 << 5), /* buffer not read at all */ + PBF_DELWRI = (1 << 6), /* buffer has dirty pages */ + PBF_STALE = (1 << 7), /* buffer has been staled, do not find it */ + PBF_FS_MANAGED = (1 << 8), /* filesystem controls freeing memory */ + PBF_FS_DATAIOD = (1 << 9), /* schedule IO completion on fs datad */ + PBF_FORCEIO = (1 << 10), /* ignore any cache state */ + PBF_FLUSH = (1 << 11), /* flush disk write cache */ + PBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead */ + + /* flags used only as arguments to access routines */ + PBF_LOCK = (1 << 14), /* lock requested */ + PBF_TRYLOCK = (1 << 15), /* lock requested, but do not wait */ + PBF_DONT_BLOCK = (1 << 16), /* do not block in current thread */ + + /* flags used only internally */ + _PBF_PAGE_CACHE = (1 << 17),/* backed by pagecache */ + _PBF_KMEM_ALLOC = (1 << 18),/* backed by kmem_alloc() */ + _PBF_RUN_QUEUES = (1 << 19),/* run block device task queue */ +} page_buf_flags_t; + +#define PBF_UPDATE (PBF_READ | PBF_WRITE) +#define PBF_NOT_DONE(pb) (((pb)->pb_flags & (PBF_PARTIAL|PBF_NONE)) != 0) +#define PBF_DONE(pb) (((pb)->pb_flags & (PBF_PARTIAL|PBF_NONE)) == 0) + +typedef struct xfs_bufhash { + struct list_head bh_list; + spinlock_t bh_lock; +} xfs_bufhash_t; + +typedef struct xfs_buftarg { + dev_t pbr_dev; + struct block_device *pbr_bdev; + struct address_space *pbr_mapping; + unsigned int pbr_bsize; + unsigned int pbr_sshift; + size_t pbr_smask; + + /* per-device buffer hash table */ + uint bt_hashmask; + uint bt_hashshift; + xfs_bufhash_t *bt_hash; +} xfs_buftarg_t; + +/* + * xfs_buf_t: Buffer structure for page cache-based buffers + * + * This buffer structure is used by the page cache buffer management routines + * to refer to an assembly of pages forming a logical buffer. The actual I/O + * is performed with buffer_head structures, as required by drivers. + * + * The buffer structure is used on temporary basis only, and discarded when + * released. The real data storage is recorded in the page cache. Metadata is + * hashed to the block device on which the file system resides. + */ + +struct xfs_buf; + +/* call-back function on I/O completion */ +typedef void (*page_buf_iodone_t)(struct xfs_buf *); +/* call-back function on I/O completion */ +typedef void (*page_buf_relse_t)(struct xfs_buf *); +/* pre-write function */ +typedef int (*page_buf_bdstrat_t)(struct xfs_buf *); + +#define PB_PAGES 2 + +typedef struct xfs_buf { + struct semaphore pb_sema; /* semaphore for lockables */ + unsigned long pb_queuetime; /* time buffer was queued */ + atomic_t pb_pin_count; /* pin count */ + wait_queue_head_t pb_waiters; /* unpin waiters */ + struct list_head pb_list; + page_buf_flags_t pb_flags; /* status flags */ + struct list_head pb_hash_list; /* hash table list */ + xfs_bufhash_t *pb_hash; /* hash table list start */ + xfs_buftarg_t *pb_target; /* buffer target (device) */ + atomic_t pb_hold; /* reference count */ + xfs_daddr_t pb_bn; /* block number for I/O */ + loff_t pb_file_offset; /* offset in file */ + size_t pb_buffer_length; /* size of buffer in bytes */ + size_t pb_count_desired; /* desired transfer size */ + void *pb_addr; /* virtual address of buffer */ + struct work_struct pb_iodone_work; + atomic_t pb_io_remaining;/* #outstanding I/O requests */ + page_buf_iodone_t pb_iodone; /* I/O completion function */ + page_buf_relse_t pb_relse; /* releasing function */ + page_buf_bdstrat_t pb_strat; /* pre-write function */ + struct semaphore pb_iodonesema; /* Semaphore for I/O waiters */ + void *pb_fspriv; + void *pb_fspriv2; + void *pb_fspriv3; + unsigned short pb_error; /* error code on I/O */ + unsigned short pb_locked; /* page array is locked */ + unsigned int pb_page_count; /* size of page array */ + unsigned int pb_offset; /* page offset in first page */ + struct page **pb_pages; /* array of page pointers */ + struct page *pb_page_array[PB_PAGES]; /* inline pages */ +#ifdef PAGEBUF_LOCK_TRACKING + int pb_last_holder; +#endif +} xfs_buf_t; + + +/* Finding and Reading Buffers */ + +extern xfs_buf_t *_pagebuf_find( /* find buffer for block if */ + /* the block is in memory */ + xfs_buftarg_t *, /* inode for block */ + loff_t, /* starting offset of range */ + size_t, /* length of range */ + page_buf_flags_t, /* PBF_LOCK */ + xfs_buf_t *); /* newly allocated buffer */ + +#define xfs_incore(buftarg,blkno,len,lockit) \ + _pagebuf_find(buftarg, blkno ,len, lockit, NULL) + +extern xfs_buf_t *xfs_buf_get_flags( /* allocate a buffer */ + xfs_buftarg_t *, /* inode for buffer */ + loff_t, /* starting offset of range */ + size_t, /* length of range */ + page_buf_flags_t); /* PBF_LOCK, PBF_READ, */ + /* PBF_ASYNC */ + +#define xfs_buf_get(target, blkno, len, flags) \ + xfs_buf_get_flags((target), (blkno), (len), PBF_LOCK | PBF_MAPPED) + +extern xfs_buf_t *xfs_buf_read_flags( /* allocate and read a buffer */ + xfs_buftarg_t *, /* inode for buffer */ + loff_t, /* starting offset of range */ + size_t, /* length of range */ + page_buf_flags_t); /* PBF_LOCK, PBF_ASYNC */ + +#define xfs_buf_read(target, blkno, len, flags) \ + xfs_buf_read_flags((target), (blkno), (len), PBF_LOCK | PBF_MAPPED) + +extern xfs_buf_t *pagebuf_lookup( + xfs_buftarg_t *, + loff_t, /* starting offset of range */ + size_t, /* length of range */ + page_buf_flags_t); /* PBF_READ, PBF_WRITE, */ + /* PBF_FORCEIO, */ + +extern xfs_buf_t *pagebuf_get_empty( /* allocate pagebuf struct with */ + /* no memory or disk address */ + size_t len, + xfs_buftarg_t *); /* mount point "fake" inode */ + +extern xfs_buf_t *pagebuf_get_no_daddr(/* allocate pagebuf struct */ + /* without disk address */ + size_t len, + xfs_buftarg_t *); /* mount point "fake" inode */ + +extern int pagebuf_associate_memory( + xfs_buf_t *, + void *, + size_t); + +extern void pagebuf_hold( /* increment reference count */ + xfs_buf_t *); /* buffer to hold */ + +extern void pagebuf_readahead( /* read ahead into cache */ + xfs_buftarg_t *, /* target for buffer (or NULL) */ + loff_t, /* starting offset of range */ + size_t, /* length of range */ + page_buf_flags_t); /* additional read flags */ + +/* Releasing Buffers */ + +extern void pagebuf_free( /* deallocate a buffer */ + xfs_buf_t *); /* buffer to deallocate */ + +extern void pagebuf_rele( /* release hold on a buffer */ + xfs_buf_t *); /* buffer to release */ + +/* Locking and Unlocking Buffers */ + +extern int pagebuf_cond_lock( /* lock buffer, if not locked */ + /* (returns -EBUSY if locked) */ + xfs_buf_t *); /* buffer to lock */ + +extern int pagebuf_lock_value( /* return count on lock */ + xfs_buf_t *); /* buffer to check */ + +extern int pagebuf_lock( /* lock buffer */ + xfs_buf_t *); /* buffer to lock */ + +extern void pagebuf_unlock( /* unlock buffer */ + xfs_buf_t *); /* buffer to unlock */ + +/* Buffer Read and Write Routines */ + +extern void pagebuf_iodone( /* mark buffer I/O complete */ + xfs_buf_t *, /* buffer to mark */ + int, /* use data/log helper thread. */ + int); /* run completion locally, or in + * a helper thread. */ + +extern void pagebuf_ioerror( /* mark buffer in error (or not) */ + xfs_buf_t *, /* buffer to mark */ + int); /* error to store (0 if none) */ + +extern int pagebuf_iostart( /* start I/O on a buffer */ + xfs_buf_t *, /* buffer to start */ + page_buf_flags_t); /* PBF_LOCK, PBF_ASYNC, */ + /* PBF_READ, PBF_WRITE, */ + /* PBF_DELWRI */ + +extern int pagebuf_iorequest( /* start real I/O */ + xfs_buf_t *); /* buffer to convey to device */ + +extern int pagebuf_iowait( /* wait for buffer I/O done */ + xfs_buf_t *); /* buffer to wait on */ + +extern void pagebuf_iomove( /* move data in/out of pagebuf */ + xfs_buf_t *, /* buffer to manipulate */ + size_t, /* starting buffer offset */ + size_t, /* length in buffer */ + caddr_t, /* data pointer */ + page_buf_rw_t); /* direction */ + +static inline int pagebuf_iostrategy(xfs_buf_t *pb) +{ + return pb->pb_strat ? pb->pb_strat(pb) : pagebuf_iorequest(pb); +} + +static inline int pagebuf_geterror(xfs_buf_t *pb) +{ + return pb ? pb->pb_error : ENOMEM; +} + +/* Buffer Utility Routines */ + +extern caddr_t pagebuf_offset( /* pointer at offset in buffer */ + xfs_buf_t *, /* buffer to offset into */ + size_t); /* offset */ + +/* Pinning Buffer Storage in Memory */ + +extern void pagebuf_pin( /* pin buffer in memory */ + xfs_buf_t *); /* buffer to pin */ + +extern void pagebuf_unpin( /* unpin buffered data */ + xfs_buf_t *); /* buffer to unpin */ + +extern int pagebuf_ispin( /* check if buffer is pinned */ + xfs_buf_t *); /* buffer to check */ + +/* Delayed Write Buffer Routines */ + +extern void pagebuf_delwri_dequeue(xfs_buf_t *); + +/* Buffer Daemon Setup Routines */ + +extern int pagebuf_init(void); +extern void pagebuf_terminate(void); + + +#ifdef PAGEBUF_TRACE +extern ktrace_t *pagebuf_trace_buf; +extern void pagebuf_trace( + xfs_buf_t *, /* buffer being traced */ + char *, /* description of operation */ + void *, /* arbitrary diagnostic value */ + void *); /* return address */ +#else +# define pagebuf_trace(pb, id, ptr, ra) do { } while (0) +#endif + +#define pagebuf_target_name(target) \ + ({ char __b[BDEVNAME_SIZE]; bdevname((target)->pbr_bdev, __b); __b; }) + + + + + +/* These are just for xfs_syncsub... it sets an internal variable + * then passes it to VOP_FLUSH_PAGES or adds the flags to a newly gotten buf_t + */ +#define XFS_B_ASYNC PBF_ASYNC +#define XFS_B_DELWRI PBF_DELWRI +#define XFS_B_READ PBF_READ +#define XFS_B_WRITE PBF_WRITE +#define XFS_B_STALE PBF_STALE + +#define XFS_BUF_TRYLOCK PBF_TRYLOCK +#define XFS_INCORE_TRYLOCK PBF_TRYLOCK +#define XFS_BUF_LOCK PBF_LOCK +#define XFS_BUF_MAPPED PBF_MAPPED + +#define BUF_BUSY PBF_DONT_BLOCK + +#define XFS_BUF_BFLAGS(x) ((x)->pb_flags) +#define XFS_BUF_ZEROFLAGS(x) \ + ((x)->pb_flags &= ~(PBF_READ|PBF_WRITE|PBF_ASYNC|PBF_DELWRI)) + +#define XFS_BUF_STALE(x) ((x)->pb_flags |= XFS_B_STALE) +#define XFS_BUF_UNSTALE(x) ((x)->pb_flags &= ~XFS_B_STALE) +#define XFS_BUF_ISSTALE(x) ((x)->pb_flags & XFS_B_STALE) +#define XFS_BUF_SUPER_STALE(x) do { \ + XFS_BUF_STALE(x); \ + pagebuf_delwri_dequeue(x); \ + XFS_BUF_DONE(x); \ + } while (0) + +#define XFS_BUF_MANAGE PBF_FS_MANAGED +#define XFS_BUF_UNMANAGE(x) ((x)->pb_flags &= ~PBF_FS_MANAGED) + +#define XFS_BUF_DELAYWRITE(x) ((x)->pb_flags |= PBF_DELWRI) +#define XFS_BUF_UNDELAYWRITE(x) pagebuf_delwri_dequeue(x) +#define XFS_BUF_ISDELAYWRITE(x) ((x)->pb_flags & PBF_DELWRI) + +#define XFS_BUF_ERROR(x,no) pagebuf_ioerror(x,no) +#define XFS_BUF_GETERROR(x) pagebuf_geterror(x) +#define XFS_BUF_ISERROR(x) (pagebuf_geterror(x)?1:0) + +#define XFS_BUF_DONE(x) ((x)->pb_flags &= ~(PBF_PARTIAL|PBF_NONE)) +#define XFS_BUF_UNDONE(x) ((x)->pb_flags |= PBF_PARTIAL|PBF_NONE) +#define XFS_BUF_ISDONE(x) (!(PBF_NOT_DONE(x))) + +#define XFS_BUF_BUSY(x) ((x)->pb_flags |= PBF_FORCEIO) +#define XFS_BUF_UNBUSY(x) ((x)->pb_flags &= ~PBF_FORCEIO) +#define XFS_BUF_ISBUSY(x) (1) + +#define XFS_BUF_ASYNC(x) ((x)->pb_flags |= PBF_ASYNC) +#define XFS_BUF_UNASYNC(x) ((x)->pb_flags &= ~PBF_ASYNC) +#define XFS_BUF_ISASYNC(x) ((x)->pb_flags & PBF_ASYNC) + +#define XFS_BUF_FLUSH(x) ((x)->pb_flags |= PBF_FLUSH) +#define XFS_BUF_UNFLUSH(x) ((x)->pb_flags &= ~PBF_FLUSH) +#define XFS_BUF_ISFLUSH(x) ((x)->pb_flags & PBF_FLUSH) + +#define XFS_BUF_SHUT(x) printk("XFS_BUF_SHUT not implemented yet\n") +#define XFS_BUF_UNSHUT(x) printk("XFS_BUF_UNSHUT not implemented yet\n") +#define XFS_BUF_ISSHUT(x) (0) + +#define XFS_BUF_HOLD(x) pagebuf_hold(x) +#define XFS_BUF_READ(x) ((x)->pb_flags |= PBF_READ) +#define XFS_BUF_UNREAD(x) ((x)->pb_flags &= ~PBF_READ) +#define XFS_BUF_ISREAD(x) ((x)->pb_flags & PBF_READ) + +#define XFS_BUF_WRITE(x) ((x)->pb_flags |= PBF_WRITE) +#define XFS_BUF_UNWRITE(x) ((x)->pb_flags &= ~PBF_WRITE) +#define XFS_BUF_ISWRITE(x) ((x)->pb_flags & PBF_WRITE) + +#define XFS_BUF_ISUNINITIAL(x) (0) +#define XFS_BUF_UNUNINITIAL(x) (0) + +#define XFS_BUF_BP_ISMAPPED(bp) 1 + +#define XFS_BUF_DATAIO(x) ((x)->pb_flags |= PBF_FS_DATAIOD) +#define XFS_BUF_UNDATAIO(x) ((x)->pb_flags &= ~PBF_FS_DATAIOD) + +#define XFS_BUF_IODONE_FUNC(buf) (buf)->pb_iodone +#define XFS_BUF_SET_IODONE_FUNC(buf, func) \ + (buf)->pb_iodone = (func) +#define XFS_BUF_CLR_IODONE_FUNC(buf) \ + (buf)->pb_iodone = NULL +#define XFS_BUF_SET_BDSTRAT_FUNC(buf, func) \ + (buf)->pb_strat = (func) +#define XFS_BUF_CLR_BDSTRAT_FUNC(buf) \ + (buf)->pb_strat = NULL + +#define XFS_BUF_FSPRIVATE(buf, type) \ + ((type)(buf)->pb_fspriv) +#define XFS_BUF_SET_FSPRIVATE(buf, value) \ + (buf)->pb_fspriv = (void *)(value) +#define XFS_BUF_FSPRIVATE2(buf, type) \ + ((type)(buf)->pb_fspriv2) +#define XFS_BUF_SET_FSPRIVATE2(buf, value) \ + (buf)->pb_fspriv2 = (void *)(value) +#define XFS_BUF_FSPRIVATE3(buf, type) \ + ((type)(buf)->pb_fspriv3) +#define XFS_BUF_SET_FSPRIVATE3(buf, value) \ + (buf)->pb_fspriv3 = (void *)(value) +#define XFS_BUF_SET_START(buf) + +#define XFS_BUF_SET_BRELSE_FUNC(buf, value) \ + (buf)->pb_relse = (value) + +#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->pb_addr) + +extern inline xfs_caddr_t xfs_buf_offset(xfs_buf_t *bp, size_t offset) +{ + if (bp->pb_flags & PBF_MAPPED) + return XFS_BUF_PTR(bp) + offset; + return (xfs_caddr_t) pagebuf_offset(bp, offset); +} + +#define XFS_BUF_SET_PTR(bp, val, count) \ + pagebuf_associate_memory(bp, val, count) +#define XFS_BUF_ADDR(bp) ((bp)->pb_bn) +#define XFS_BUF_SET_ADDR(bp, blk) \ + ((bp)->pb_bn = (xfs_daddr_t)(blk)) +#define XFS_BUF_OFFSET(bp) ((bp)->pb_file_offset) +#define XFS_BUF_SET_OFFSET(bp, off) \ + ((bp)->pb_file_offset = (off)) +#define XFS_BUF_COUNT(bp) ((bp)->pb_count_desired) +#define XFS_BUF_SET_COUNT(bp, cnt) \ + ((bp)->pb_count_desired = (cnt)) +#define XFS_BUF_SIZE(bp) ((bp)->pb_buffer_length) +#define XFS_BUF_SET_SIZE(bp, cnt) \ + ((bp)->pb_buffer_length = (cnt)) +#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) +#define XFS_BUF_SET_VTYPE(bp, type) +#define XFS_BUF_SET_REF(bp, ref) + +#define XFS_BUF_ISPINNED(bp) pagebuf_ispin(bp) + +#define XFS_BUF_VALUSEMA(bp) pagebuf_lock_value(bp) +#define XFS_BUF_CPSEMA(bp) (pagebuf_cond_lock(bp) == 0) +#define XFS_BUF_VSEMA(bp) pagebuf_unlock(bp) +#define XFS_BUF_PSEMA(bp,x) pagebuf_lock(bp) +#define XFS_BUF_V_IODONESEMA(bp) up(&bp->pb_iodonesema); + +/* setup the buffer target from a buftarg structure */ +#define XFS_BUF_SET_TARGET(bp, target) \ + (bp)->pb_target = (target) +#define XFS_BUF_TARGET(bp) ((bp)->pb_target) +#define XFS_BUFTARG_NAME(target) \ + pagebuf_target_name(target) + +#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) +#define XFS_BUF_SET_VTYPE(bp, type) +#define XFS_BUF_SET_REF(bp, ref) + +static inline int xfs_bawrite(void *mp, xfs_buf_t *bp) +{ + bp->pb_fspriv3 = mp; + bp->pb_strat = xfs_bdstrat_cb; + pagebuf_delwri_dequeue(bp); + return pagebuf_iostart(bp, PBF_WRITE | PBF_ASYNC | _PBF_RUN_QUEUES); +} + +static inline void xfs_buf_relse(xfs_buf_t *bp) +{ + if (!bp->pb_relse) + pagebuf_unlock(bp); + pagebuf_rele(bp); +} + +#define xfs_bpin(bp) pagebuf_pin(bp) +#define xfs_bunpin(bp) pagebuf_unpin(bp) + +#define xfs_buftrace(id, bp) \ + pagebuf_trace(bp, id, NULL, (void *)__builtin_return_address(0)) + +#define xfs_biodone(pb) \ + pagebuf_iodone(pb, (pb->pb_flags & PBF_FS_DATAIOD), 0) + +#define xfs_biomove(pb, off, len, data, rw) \ + pagebuf_iomove((pb), (off), (len), (data), \ + ((rw) == XFS_B_WRITE) ? PBRW_WRITE : PBRW_READ) + +#define xfs_biozero(pb, off, len) \ + pagebuf_iomove((pb), (off), (len), NULL, PBRW_ZERO) + + +static inline int XFS_bwrite(xfs_buf_t *pb) +{ + int iowait = (pb->pb_flags & PBF_ASYNC) == 0; + int error = 0; + + if (!iowait) + pb->pb_flags |= _PBF_RUN_QUEUES; + + pagebuf_delwri_dequeue(pb); + pagebuf_iostrategy(pb); + if (iowait) { + error = pagebuf_iowait(pb); + xfs_buf_relse(pb); + } + return error; +} + +#define XFS_bdwrite(pb) \ + pagebuf_iostart(pb, PBF_DELWRI | PBF_ASYNC) + +static inline int xfs_bdwrite(void *mp, xfs_buf_t *bp) +{ + bp->pb_strat = xfs_bdstrat_cb; + bp->pb_fspriv3 = mp; + + return pagebuf_iostart(bp, PBF_DELWRI | PBF_ASYNC); +} + +#define XFS_bdstrat(bp) pagebuf_iorequest(bp) + +#define xfs_iowait(pb) pagebuf_iowait(pb) + +#define xfs_baread(target, rablkno, ralen) \ + pagebuf_readahead((target), (rablkno), (ralen), PBF_DONT_BLOCK) + +#define xfs_buf_get_empty(len, target) pagebuf_get_empty((len), (target)) +#define xfs_buf_get_noaddr(len, target) pagebuf_get_no_daddr((len), (target)) +#define xfs_buf_free(bp) pagebuf_free(bp) + + +/* + * Handling of buftargs. + */ + +extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int); +extern void xfs_free_buftarg(xfs_buftarg_t *, int); +extern void xfs_wait_buftarg(xfs_buftarg_t *); +extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); +extern void xfs_incore_relse(xfs_buftarg_t *, int, int); +extern int xfs_flush_buftarg(xfs_buftarg_t *, int); + +#define xfs_getsize_buftarg(buftarg) \ + block_size((buftarg)->pbr_bdev) +#define xfs_readonly_buftarg(buftarg) \ + bdev_read_only((buftarg)->pbr_bdev) +#define xfs_binval(buftarg) \ + xfs_flush_buftarg(buftarg, 1) +#define XFS_bflush(buftarg) \ + xfs_flush_buftarg(buftarg, 1) + +#endif /* __XFS_BUF_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h new file mode 100644 index 000000000000..00c45849d41a --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_cred.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_CRED_H__ +#define __XFS_CRED_H__ + +/* + * Credentials + */ +typedef struct cred { + /* EMPTY */ +} cred_t; + +extern struct cred *sys_cred; + +/* this is a hack.. (assums sys_cred is the only cred_t in the system) */ +static __inline int capable_cred(cred_t *cr, int cid) +{ + return (cr == sys_cred) ? 1 : capable(cid); +} + +#endif /* __XFS_CRED_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c new file mode 100644 index 000000000000..f372a1a5e168 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_export.c @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2004-2005 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_types.h" +#include "xfs_dmapi.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_dir.h" +#include "xfs_mount.h" +#include "xfs_export.h" + +/* + * XFS encode and decodes the fileid portion of NFS filehandles + * itself instead of letting the generic NFS code do it. This + * allows filesystems with 64 bit inode numbers to be exported. + * + * Note that a side effect is that xfs_vget() won't be passed a + * zero inode/generation pair under normal circumstances. As + * however a malicious client could send us such data, the check + * remains in that code. + */ + + +STATIC struct dentry * +linvfs_decode_fh( + struct super_block *sb, + __u32 *fh, + int fh_len, + int fileid_type, + int (*acceptable)( + void *context, + struct dentry *de), + void *context) +{ + xfs_fid2_t ifid; + xfs_fid2_t pfid; + void *parent = NULL; + int is64 = 0; + __u32 *p = fh; + +#if XFS_BIG_INUMS + is64 = (fileid_type & XFS_FILEID_TYPE_64FLAG); + fileid_type &= ~XFS_FILEID_TYPE_64FLAG; +#endif + + /* + * Note that we only accept fileids which are long enough + * rather than allow the parent generation number to default + * to zero. XFS considers zero a valid generation number not + * an invalid/wildcard value. There's little point printk'ing + * a warning here as we don't have the client information + * which would make such a warning useful. + */ + if (fileid_type > 2 || + fh_len < xfs_fileid_length((fileid_type == 2), is64)) + return NULL; + + p = xfs_fileid_decode_fid2(p, &ifid, is64); + + if (fileid_type == 2) { + p = xfs_fileid_decode_fid2(p, &pfid, is64); + parent = &pfid; + } + + fh = (__u32 *)&ifid; + return find_exported_dentry(sb, fh, parent, acceptable, context); +} + + +STATIC int +linvfs_encode_fh( + struct dentry *dentry, + __u32 *fh, + int *max_len, + int connectable) +{ + struct inode *inode = dentry->d_inode; + int type = 1; + __u32 *p = fh; + int len; + int is64 = 0; +#if XFS_BIG_INUMS + vfs_t *vfs = LINVFS_GET_VFS(inode->i_sb); + xfs_mount_t *mp = XFS_VFSTOM(vfs); + + if (!(mp->m_flags & XFS_MOUNT_32BITINOOPT)) { + /* filesystem may contain 64bit inode numbers */ + is64 = XFS_FILEID_TYPE_64FLAG; + } +#endif + + /* Directories don't need their parent encoded, they have ".." */ + if (S_ISDIR(inode->i_mode)) + connectable = 0; + + /* + * Only encode if there is enough space given. In practice + * this means we can't export a filesystem with 64bit inodes + * over NFSv2 with the subtree_check export option; the other + * seven combinations work. The real answer is "don't use v2". + */ + len = xfs_fileid_length(connectable, is64); + if (*max_len < len) + return 255; + *max_len = len; + + p = xfs_fileid_encode_inode(p, inode, is64); + if (connectable) { + spin_lock(&dentry->d_lock); + p = xfs_fileid_encode_inode(p, dentry->d_parent->d_inode, is64); + spin_unlock(&dentry->d_lock); + type = 2; + } + BUG_ON((p - fh) != len); + return type | is64; +} + +STATIC struct dentry * +linvfs_get_dentry( + struct super_block *sb, + void *data) +{ + vnode_t *vp; + struct inode *inode; + struct dentry *result; + vfs_t *vfsp = LINVFS_GET_VFS(sb); + int error; + + VFS_VGET(vfsp, &vp, (fid_t *)data, error); + if (error || vp == NULL) + return ERR_PTR(-ESTALE) ; + + inode = LINVFS_GET_IP(vp); + result = d_alloc_anon(inode); + if (!result) { + iput(inode); + return ERR_PTR(-ENOMEM); + } + return result; +} + +STATIC struct dentry * +linvfs_get_parent( + struct dentry *child) +{ + int error; + vnode_t *vp, *cvp; + struct dentry *parent; + struct dentry dotdot; + + dotdot.d_name.name = ".."; + dotdot.d_name.len = 2; + dotdot.d_inode = NULL; + + cvp = NULL; + vp = LINVFS_GET_VP(child->d_inode); + VOP_LOOKUP(vp, &dotdot, &cvp, 0, NULL, NULL, error); + if (unlikely(error)) + return ERR_PTR(-error); + + parent = d_alloc_anon(LINVFS_GET_IP(cvp)); + if (unlikely(!parent)) { + VN_RELE(cvp); + return ERR_PTR(-ENOMEM); + } + return parent; +} + +struct export_operations linvfs_export_ops = { + .decode_fh = linvfs_decode_fh, + .encode_fh = linvfs_encode_fh, + .get_parent = linvfs_get_parent, + .get_dentry = linvfs_get_dentry, +}; diff --git a/fs/xfs/linux-2.6/xfs_export.h b/fs/xfs/linux-2.6/xfs_export.h new file mode 100644 index 000000000000..60b2abac1c18 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_export.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2005 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_EXPORT_H__ +#define __XFS_EXPORT_H__ + +/* + * Common defines for code related to exporting XFS filesystems over NFS. + * + * The NFS fileid goes out on the wire as an array of + * 32bit unsigned ints in host order. There are 5 possible + * formats. + * + * (1) fileid_type=0x00 + * (no fileid data; handled by the generic code) + * + * (2) fileid_type=0x01 + * inode-num + * generation + * + * (3) fileid_type=0x02 + * inode-num + * generation + * parent-inode-num + * parent-generation + * + * (4) fileid_type=0x81 + * inode-num-lo32 + * inode-num-hi32 + * generation + * + * (5) fileid_type=0x82 + * inode-num-lo32 + * inode-num-hi32 + * generation + * parent-inode-num-lo32 + * parent-inode-num-hi32 + * parent-generation + * + * Note, the NFS filehandle also includes an fsid portion which + * may have an inode number in it. That number is hardcoded to + * 32bits and there is no way for XFS to intercept it. In + * practice this means when exporting an XFS filesytem with 64bit + * inodes you should either export the mountpoint (rather than + * a subdirectory) or use the "fsid" export option. + */ + +/* This flag goes on the wire. Don't play with it. */ +#define XFS_FILEID_TYPE_64FLAG 0x80 /* NFS fileid has 64bit inodes */ + +/* Calculate the length in u32 units of the fileid data */ +static inline int +xfs_fileid_length(int hasparent, int is64) +{ + return hasparent ? (is64 ? 6 : 4) : (is64 ? 3 : 2); +} + +/* + * Decode encoded inode information (either for the inode itself + * or the parent) into an xfs_fid2_t structure. Advances and + * returns the new data pointer + */ +static inline __u32 * +xfs_fileid_decode_fid2(__u32 *p, xfs_fid2_t *fid, int is64) +{ + fid->fid_len = sizeof(xfs_fid2_t) - sizeof(fid->fid_len); + fid->fid_pad = 0; + fid->fid_ino = *p++; +#if XFS_BIG_INUMS + if (is64) + fid->fid_ino |= (((__u64)(*p++)) << 32); +#endif + fid->fid_gen = *p++; + return p; +} + +/* + * Encode inode information (either for the inode itself or the + * parent) into a fileid buffer. Advances and returns the new + * data pointer. + */ +static inline __u32 * +xfs_fileid_encode_inode(__u32 *p, struct inode *inode, int is64) +{ + *p++ = (__u32)inode->i_ino; +#if XFS_BIG_INUMS + if (is64) + *p++ = (__u32)(inode->i_ino >> 32); +#endif + *p++ = inode->i_generation; + return p; +} + +#endif /* __XFS_EXPORT_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c new file mode 100644 index 000000000000..9f057a4a5b06 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -0,0 +1,573 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_sb.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_trans.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_alloc.h" +#include "xfs_btree.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_error.h" +#include "xfs_rw.h" +#include "xfs_ioctl32.h" + +#include <linux/dcache.h> +#include <linux/smp_lock.h> + +static struct vm_operations_struct linvfs_file_vm_ops; + + +STATIC inline ssize_t +__linvfs_read( + struct kiocb *iocb, + char __user *buf, + int ioflags, + size_t count, + loff_t pos) +{ + struct iovec iov = {buf, count}; + struct file *file = iocb->ki_filp; + vnode_t *vp = LINVFS_GET_VP(file->f_dentry->d_inode); + ssize_t rval; + + BUG_ON(iocb->ki_pos != pos); + + if (unlikely(file->f_flags & O_DIRECT)) + ioflags |= IO_ISDIRECT; + VOP_READ(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL, rval); + return rval; +} + + +STATIC ssize_t +linvfs_aio_read( + struct kiocb *iocb, + char __user *buf, + size_t count, + loff_t pos) +{ + return __linvfs_read(iocb, buf, IO_ISAIO, count, pos); +} + +STATIC ssize_t +linvfs_aio_read_invis( + struct kiocb *iocb, + char __user *buf, + size_t count, + loff_t pos) +{ + return __linvfs_read(iocb, buf, IO_ISAIO|IO_INVIS, count, pos); +} + + +STATIC inline ssize_t +__linvfs_write( + struct kiocb *iocb, + const char __user *buf, + int ioflags, + size_t count, + loff_t pos) +{ + struct iovec iov = {(void __user *)buf, count}; + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + vnode_t *vp = LINVFS_GET_VP(inode); + ssize_t rval; + + BUG_ON(iocb->ki_pos != pos); + if (unlikely(file->f_flags & O_DIRECT)) + ioflags |= IO_ISDIRECT; + + VOP_WRITE(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL, rval); + return rval; +} + + +STATIC ssize_t +linvfs_aio_write( + struct kiocb *iocb, + const char __user *buf, + size_t count, + loff_t pos) +{ + return __linvfs_write(iocb, buf, IO_ISAIO, count, pos); +} + +STATIC ssize_t +linvfs_aio_write_invis( + struct kiocb *iocb, + const char __user *buf, + size_t count, + loff_t pos) +{ + return __linvfs_write(iocb, buf, IO_ISAIO|IO_INVIS, count, pos); +} + + +STATIC inline ssize_t +__linvfs_readv( + struct file *file, + const struct iovec *iov, + int ioflags, + unsigned long nr_segs, + loff_t *ppos) +{ + struct inode *inode = file->f_mapping->host; + vnode_t *vp = LINVFS_GET_VP(inode); + struct kiocb kiocb; + ssize_t rval; + + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = *ppos; + + if (unlikely(file->f_flags & O_DIRECT)) + ioflags |= IO_ISDIRECT; + VOP_READ(vp, &kiocb, iov, nr_segs, &kiocb.ki_pos, ioflags, NULL, rval); + + *ppos = kiocb.ki_pos; + return rval; +} + +STATIC ssize_t +linvfs_readv( + struct file *file, + const struct iovec *iov, + unsigned long nr_segs, + loff_t *ppos) +{ + return __linvfs_readv(file, iov, 0, nr_segs, ppos); +} + +STATIC ssize_t +linvfs_readv_invis( + struct file *file, + const struct iovec *iov, + unsigned long nr_segs, + loff_t *ppos) +{ + return __linvfs_readv(file, iov, IO_INVIS, nr_segs, ppos); +} + + +STATIC inline ssize_t +__linvfs_writev( + struct file *file, + const struct iovec *iov, + int ioflags, + unsigned long nr_segs, + loff_t *ppos) +{ + struct inode *inode = file->f_mapping->host; + vnode_t *vp = LINVFS_GET_VP(inode); + struct kiocb kiocb; + ssize_t rval; + + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = *ppos; + if (unlikely(file->f_flags & O_DIRECT)) + ioflags |= IO_ISDIRECT; + + VOP_WRITE(vp, &kiocb, iov, nr_segs, &kiocb.ki_pos, ioflags, NULL, rval); + + *ppos = kiocb.ki_pos; + return rval; +} + + +STATIC ssize_t +linvfs_writev( + struct file *file, + const struct iovec *iov, + unsigned long nr_segs, + loff_t *ppos) +{ + return __linvfs_writev(file, iov, 0, nr_segs, ppos); +} + +STATIC ssize_t +linvfs_writev_invis( + struct file *file, + const struct iovec *iov, + unsigned long nr_segs, + loff_t *ppos) +{ + return __linvfs_writev(file, iov, IO_INVIS, nr_segs, ppos); +} + +STATIC ssize_t +linvfs_sendfile( + struct file *filp, + loff_t *ppos, + size_t count, + read_actor_t actor, + void *target) +{ + vnode_t *vp = LINVFS_GET_VP(filp->f_dentry->d_inode); + ssize_t rval; + + VOP_SENDFILE(vp, filp, ppos, 0, count, actor, target, NULL, rval); + return rval; +} + + +STATIC int +linvfs_open( + struct inode *inode, + struct file *filp) +{ + vnode_t *vp = LINVFS_GET_VP(inode); + int error; + + if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) + return -EFBIG; + + ASSERT(vp); + VOP_OPEN(vp, NULL, error); + return -error; +} + + +STATIC int +linvfs_release( + struct inode *inode, + struct file *filp) +{ + vnode_t *vp = LINVFS_GET_VP(inode); + int error = 0; + + if (vp) + VOP_RELEASE(vp, error); + return -error; +} + + +STATIC int +linvfs_fsync( + struct file *filp, + struct dentry *dentry, + int datasync) +{ + struct inode *inode = dentry->d_inode; + vnode_t *vp = LINVFS_GET_VP(inode); + int error; + int flags = FSYNC_WAIT; + + if (datasync) + flags |= FSYNC_DATA; + + ASSERT(vp); + VOP_FSYNC(vp, flags, NULL, (xfs_off_t)0, (xfs_off_t)-1, error); + return -error; +} + +/* + * linvfs_readdir maps to VOP_READDIR(). + * We need to build a uio, cred, ... + */ + +#define nextdp(dp) ((struct xfs_dirent *)((char *)(dp) + (dp)->d_reclen)) + +STATIC int +linvfs_readdir( + struct file *filp, + void *dirent, + filldir_t filldir) +{ + int error = 0; + vnode_t *vp; + uio_t uio; + iovec_t iov; + int eof = 0; + caddr_t read_buf; + int namelen, size = 0; + size_t rlen = PAGE_CACHE_SIZE; + xfs_off_t start_offset, curr_offset; + xfs_dirent_t *dbp = NULL; + + vp = LINVFS_GET_VP(filp->f_dentry->d_inode); + ASSERT(vp); + + /* Try fairly hard to get memory */ + do { + if ((read_buf = (caddr_t)kmalloc(rlen, GFP_KERNEL))) + break; + rlen >>= 1; + } while (rlen >= 1024); + + if (read_buf == NULL) + return -ENOMEM; + + uio.uio_iov = &iov; + uio.uio_segflg = UIO_SYSSPACE; + curr_offset = filp->f_pos; + if (filp->f_pos != 0x7fffffff) + uio.uio_offset = filp->f_pos; + else + uio.uio_offset = 0xffffffff; + + while (!eof) { + uio.uio_resid = iov.iov_len = rlen; + iov.iov_base = read_buf; + uio.uio_iovcnt = 1; + + start_offset = uio.uio_offset; + + VOP_READDIR(vp, &uio, NULL, &eof, error); + if ((uio.uio_offset == start_offset) || error) { + size = 0; + break; + } + + size = rlen - uio.uio_resid; + dbp = (xfs_dirent_t *)read_buf; + while (size > 0) { + namelen = strlen(dbp->d_name); + + if (filldir(dirent, dbp->d_name, namelen, + (loff_t) curr_offset & 0x7fffffff, + (ino_t) dbp->d_ino, + DT_UNKNOWN)) { + goto done; + } + size -= dbp->d_reclen; + curr_offset = (loff_t)dbp->d_off /* & 0x7fffffff */; + dbp = nextdp(dbp); + } + } +done: + if (!error) { + if (size == 0) + filp->f_pos = uio.uio_offset & 0x7fffffff; + else if (dbp) + filp->f_pos = curr_offset; + } + + kfree(read_buf); + return -error; +} + + +STATIC int +linvfs_file_mmap( + struct file *filp, + struct vm_area_struct *vma) +{ + struct inode *ip = filp->f_dentry->d_inode; + vnode_t *vp = LINVFS_GET_VP(ip); + vattr_t va = { .va_mask = XFS_AT_UPDATIME }; + int error; + + if (vp->v_vfsp->vfs_flag & VFS_DMI) { + xfs_mount_t *mp = XFS_VFSTOM(vp->v_vfsp); + + error = -XFS_SEND_MMAP(mp, vma, 0); + if (error) + return error; + } + + vma->vm_ops = &linvfs_file_vm_ops; + + VOP_SETATTR(vp, &va, XFS_AT_UPDATIME, NULL, error); + if (!error) + vn_revalidate(vp); /* update Linux inode flags */ + return 0; +} + + +STATIC long +linvfs_ioctl( + struct file *filp, + unsigned int cmd, + unsigned long arg) +{ + int error; + struct inode *inode = filp->f_dentry->d_inode; + vnode_t *vp = LINVFS_GET_VP(inode); + + VOP_IOCTL(vp, inode, filp, 0, cmd, (void __user *)arg, error); + VMODIFY(vp); + + /* NOTE: some of the ioctl's return positive #'s as a + * byte count indicating success, such as + * readlink_by_handle. So we don't "sign flip" + * like most other routines. This means true + * errors need to be returned as a negative value. + */ + return error; +} + +STATIC long +linvfs_ioctl_invis( + struct file *filp, + unsigned int cmd, + unsigned long arg) +{ + int error; + struct inode *inode = filp->f_dentry->d_inode; + vnode_t *vp = LINVFS_GET_VP(inode); + + ASSERT(vp); + VOP_IOCTL(vp, inode, filp, IO_INVIS, cmd, (void __user *)arg, error); + VMODIFY(vp); + + /* NOTE: some of the ioctl's return positive #'s as a + * byte count indicating success, such as + * readlink_by_handle. So we don't "sign flip" + * like most other routines. This means true + * errors need to be returned as a negative value. + */ + return error; +} + +#ifdef HAVE_VMOP_MPROTECT +STATIC int +linvfs_mprotect( + struct vm_area_struct *vma, + unsigned int newflags) +{ + vnode_t *vp = LINVFS_GET_VP(vma->vm_file->f_dentry->d_inode); + int error = 0; + + if (vp->v_vfsp->vfs_flag & VFS_DMI) { + if ((vma->vm_flags & VM_MAYSHARE) && + (newflags & VM_WRITE) && !(vma->vm_flags & VM_WRITE)) { + xfs_mount_t *mp = XFS_VFSTOM(vp->v_vfsp); + + error = XFS_SEND_MMAP(mp, vma, VM_WRITE); + } + } + return error; +} +#endif /* HAVE_VMOP_MPROTECT */ + +#ifdef HAVE_FOP_OPEN_EXEC +/* If the user is attempting to execute a file that is offline then + * we have to trigger a DMAPI READ event before the file is marked as busy + * otherwise the invisible I/O will not be able to write to the file to bring + * it back online. + */ +STATIC int +linvfs_open_exec( + struct inode *inode) +{ + vnode_t *vp = LINVFS_GET_VP(inode); + xfs_mount_t *mp = XFS_VFSTOM(vp->v_vfsp); + int error = 0; + bhv_desc_t *bdp; + xfs_inode_t *ip; + + if (vp->v_vfsp->vfs_flag & VFS_DMI) { + bdp = vn_bhv_lookup(VN_BHV_HEAD(vp), &xfs_vnodeops); + if (!bdp) { + error = -EINVAL; + goto open_exec_out; + } + ip = XFS_BHVTOI(bdp); + if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ)) { + error = -XFS_SEND_DATA(mp, DM_EVENT_READ, vp, + 0, 0, 0, NULL); + } + } +open_exec_out: + return error; +} +#endif /* HAVE_FOP_OPEN_EXEC */ + +struct file_operations linvfs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .readv = linvfs_readv, + .writev = linvfs_writev, + .aio_read = linvfs_aio_read, + .aio_write = linvfs_aio_write, + .sendfile = linvfs_sendfile, + .unlocked_ioctl = linvfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = xfs_compat_ioctl, +#endif + .mmap = linvfs_file_mmap, + .open = linvfs_open, + .release = linvfs_release, + .fsync = linvfs_fsync, +#ifdef HAVE_FOP_OPEN_EXEC + .open_exec = linvfs_open_exec, +#endif +}; + +struct file_operations linvfs_invis_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .readv = linvfs_readv_invis, + .writev = linvfs_writev_invis, + .aio_read = linvfs_aio_read_invis, + .aio_write = linvfs_aio_write_invis, + .sendfile = linvfs_sendfile, + .unlocked_ioctl = linvfs_ioctl_invis, +#ifdef CONFIG_COMPAT + .compat_ioctl = xfs_compat_invis_ioctl, +#endif + .mmap = linvfs_file_mmap, + .open = linvfs_open, + .release = linvfs_release, + .fsync = linvfs_fsync, +}; + + +struct file_operations linvfs_dir_operations = { + .read = generic_read_dir, + .readdir = linvfs_readdir, + .unlocked_ioctl = linvfs_ioctl, + .fsync = linvfs_fsync, +}; + +static struct vm_operations_struct linvfs_file_vm_ops = { + .nopage = filemap_nopage, + .populate = filemap_populate, +#ifdef HAVE_VMOP_MPROTECT + .mprotect = linvfs_mprotect, +#endif +}; diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c new file mode 100644 index 000000000000..05ebd30ec96f --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_fs_subr.c @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" + +/* + * Stub for no-op vnode operations that return error status. + */ +int +fs_noerr(void) +{ + return 0; +} + +/* + * Operation unsupported under this file system. + */ +int +fs_nosys(void) +{ + return ENOSYS; +} + +/* + * Stub for inactive, strategy, and read/write lock/unlock. Does nothing. + */ +/* ARGSUSED */ +void +fs_noval(void) +{ +} + +/* + * vnode pcache layer for vnode_tosspages. + * 'last' parameter unused but left in for IRIX compatibility + */ +void +fs_tosspages( + bhv_desc_t *bdp, + xfs_off_t first, + xfs_off_t last, + int fiopt) +{ + vnode_t *vp = BHV_TO_VNODE(bdp); + struct inode *ip = LINVFS_GET_IP(vp); + + if (VN_CACHED(vp)) + truncate_inode_pages(ip->i_mapping, first); +} + + +/* + * vnode pcache layer for vnode_flushinval_pages. + * 'last' parameter unused but left in for IRIX compatibility + */ +void +fs_flushinval_pages( + bhv_desc_t *bdp, + xfs_off_t first, + xfs_off_t last, + int fiopt) +{ + vnode_t *vp = BHV_TO_VNODE(bdp); + struct inode *ip = LINVFS_GET_IP(vp); + + if (VN_CACHED(vp)) { + filemap_fdatawrite(ip->i_mapping); + filemap_fdatawait(ip->i_mapping); + + truncate_inode_pages(ip->i_mapping, first); + } +} + +/* + * vnode pcache layer for vnode_flush_pages. + * 'last' parameter unused but left in for IRIX compatibility + */ +int +fs_flush_pages( + bhv_desc_t *bdp, + xfs_off_t first, + xfs_off_t last, + uint64_t flags, + int fiopt) +{ + vnode_t *vp = BHV_TO_VNODE(bdp); + struct inode *ip = LINVFS_GET_IP(vp); + + if (VN_CACHED(vp)) { + filemap_fdatawrite(ip->i_mapping); + filemap_fdatawait(ip->i_mapping); + } + + return 0; +} diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.h b/fs/xfs/linux-2.6/xfs_fs_subr.h new file mode 100644 index 000000000000..2db9ddbd4567 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_fs_subr.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2000, 2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUBR_H__ +#define __XFS_SUBR_H__ + +/* + * Utilities shared among file system implementations. + */ + +struct cred; + +extern int fs_noerr(void); +extern int fs_nosys(void); +extern void fs_noval(void); +extern void fs_tosspages(bhv_desc_t *, xfs_off_t, xfs_off_t, int); +extern void fs_flushinval_pages(bhv_desc_t *, xfs_off_t, xfs_off_t, int); +extern int fs_flush_pages(bhv_desc_t *, xfs_off_t, xfs_off_t, uint64_t, int); + +#endif /* __XFS_FS_SUBR_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c new file mode 100644 index 000000000000..a6da5b4fd240 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_globals.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * This file contains globals needed by XFS that were normally defined + * somewhere else in IRIX. + */ + +#include "xfs.h" +#include "xfs_cred.h" +#include "xfs_sysctl.h" + +/* + * System memory size - used to scale certain data structures in XFS. + */ +unsigned long xfs_physmem; + +/* + * Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n, + * other XFS code uses these values. Times are measured in centisecs (i.e. + * 100ths of a second). + */ +xfs_param_t xfs_params = { + /* MIN DFLT MAX */ + .restrict_chown = { 0, 1, 1 }, + .sgid_inherit = { 0, 0, 1 }, + .symlink_mode = { 0, 0, 1 }, + .panic_mask = { 0, 0, 127 }, + .error_level = { 0, 3, 11 }, + .syncd_timer = { 1*100, 30*100, 7200*100}, + .stats_clear = { 0, 0, 1 }, + .inherit_sync = { 0, 1, 1 }, + .inherit_nodump = { 0, 1, 1 }, + .inherit_noatim = { 0, 1, 1 }, + .xfs_buf_timer = { 100/2, 1*100, 30*100 }, + .xfs_buf_age = { 1*100, 15*100, 7200*100}, + .inherit_nosym = { 0, 0, 1 }, + .rotorstep = { 1, 1, 255 }, +}; + +/* + * Global system credential structure. + */ +cred_t sys_cred_val, *sys_cred = &sys_cred_val; + diff --git a/fs/xfs/linux-2.6/xfs_globals.h b/fs/xfs/linux-2.6/xfs_globals.h new file mode 100644 index 000000000000..e81e2f38a853 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_globals.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_GLOBALS_H__ +#define __XFS_GLOBALS_H__ + +/* + * This file declares globals needed by XFS that were normally defined + * somewhere else in IRIX. + */ + +extern uint64_t xfs_panic_mask; /* set to cause more panics */ +extern unsigned long xfs_physmem; +extern struct cred *sys_cred; + +#endif /* __XFS_GLOBALS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c new file mode 100644 index 000000000000..69809eef8a54 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -0,0 +1,1336 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" + +#include "xfs_fs.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_alloc.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bit.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_itable.h" +#include "xfs_rw.h" +#include "xfs_acl.h" +#include "xfs_cap.h" +#include "xfs_mac.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_utils.h" +#include "xfs_dfrag.h" +#include "xfs_fsops.h" + +#include <linux/dcache.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/pagemap.h> + +/* + * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to + * a file or fs handle. + * + * XFS_IOC_PATH_TO_FSHANDLE + * returns fs handle for a mount point or path within that mount point + * XFS_IOC_FD_TO_HANDLE + * returns full handle for a FD opened in user space + * XFS_IOC_PATH_TO_HANDLE + * returns full handle for a path + */ +STATIC int +xfs_find_handle( + unsigned int cmd, + void __user *arg) +{ + int hsize; + xfs_handle_t handle; + xfs_fsop_handlereq_t hreq; + struct inode *inode; + struct vnode *vp; + + if (copy_from_user(&hreq, arg, sizeof(hreq))) + return -XFS_ERROR(EFAULT); + + memset((char *)&handle, 0, sizeof(handle)); + + switch (cmd) { + case XFS_IOC_PATH_TO_FSHANDLE: + case XFS_IOC_PATH_TO_HANDLE: { + struct nameidata nd; + int error; + + error = user_path_walk_link((const char __user *)hreq.path, &nd); + if (error) + return error; + + ASSERT(nd.dentry); + ASSERT(nd.dentry->d_inode); + inode = igrab(nd.dentry->d_inode); + path_release(&nd); + break; + } + + case XFS_IOC_FD_TO_HANDLE: { + struct file *file; + + file = fget(hreq.fd); + if (!file) + return -EBADF; + + ASSERT(file->f_dentry); + ASSERT(file->f_dentry->d_inode); + inode = igrab(file->f_dentry->d_inode); + fput(file); + break; + } + + default: + ASSERT(0); + return -XFS_ERROR(EINVAL); + } + + if (inode->i_sb->s_magic != XFS_SB_MAGIC) { + /* we're not in XFS anymore, Toto */ + iput(inode); + return -XFS_ERROR(EINVAL); + } + + /* we need the vnode */ + vp = LINVFS_GET_VP(inode); + if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) { + iput(inode); + return -XFS_ERROR(EBADF); + } + + /* now we can grab the fsid */ + memcpy(&handle.ha_fsid, vp->v_vfsp->vfs_altfsid, sizeof(xfs_fsid_t)); + hsize = sizeof(xfs_fsid_t); + + if (cmd != XFS_IOC_PATH_TO_FSHANDLE) { + xfs_inode_t *ip; + bhv_desc_t *bhv; + int lock_mode; + + /* need to get access to the xfs_inode to read the generation */ + bhv = vn_bhv_lookup_unlocked(VN_BHV_HEAD(vp), &xfs_vnodeops); + ASSERT(bhv); + ip = XFS_BHVTOI(bhv); + ASSERT(ip); + lock_mode = xfs_ilock_map_shared(ip); + + /* fill in fid section of handle from inode */ + handle.ha_fid.xfs_fid_len = sizeof(xfs_fid_t) - + sizeof(handle.ha_fid.xfs_fid_len); + handle.ha_fid.xfs_fid_pad = 0; + handle.ha_fid.xfs_fid_gen = ip->i_d.di_gen; + handle.ha_fid.xfs_fid_ino = ip->i_ino; + + xfs_iunlock_map_shared(ip, lock_mode); + + hsize = XFS_HSIZE(handle); + } + + /* now copy our handle into the user buffer & write out the size */ + if (copy_to_user(hreq.ohandle, &handle, hsize) || + copy_to_user(hreq.ohandlen, &hsize, sizeof(__s32))) { + iput(inode); + return -XFS_ERROR(EFAULT); + } + + iput(inode); + return 0; +} + + +/* + * Convert userspace handle data into vnode (and inode). + * We [ab]use the fact that all the fsop_handlereq ioctl calls + * have a data structure argument whose first component is always + * a xfs_fsop_handlereq_t, so we can cast to and from this type. + * This allows us to optimise the copy_from_user calls and gives + * a handy, shared routine. + * + * If no error, caller must always VN_RELE the returned vp. + */ +STATIC int +xfs_vget_fsop_handlereq( + xfs_mount_t *mp, + struct inode *parinode, /* parent inode pointer */ + xfs_fsop_handlereq_t *hreq, + vnode_t **vp, + struct inode **inode) +{ + void __user *hanp; + size_t hlen; + xfs_fid_t *xfid; + xfs_handle_t *handlep; + xfs_handle_t handle; + xfs_inode_t *ip; + struct inode *inodep; + vnode_t *vpp; + xfs_ino_t ino; + __u32 igen; + int error; + + /* + * Only allow handle opens under a directory. + */ + if (!S_ISDIR(parinode->i_mode)) + return XFS_ERROR(ENOTDIR); + + hanp = hreq->ihandle; + hlen = hreq->ihandlen; + handlep = &handle; + + if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep)) + return XFS_ERROR(EINVAL); + if (copy_from_user(handlep, hanp, hlen)) + return XFS_ERROR(EFAULT); + if (hlen < sizeof(*handlep)) + memset(((char *)handlep) + hlen, 0, sizeof(*handlep) - hlen); + if (hlen > sizeof(handlep->ha_fsid)) { + if (handlep->ha_fid.xfs_fid_len != + (hlen - sizeof(handlep->ha_fsid) + - sizeof(handlep->ha_fid.xfs_fid_len)) + || handlep->ha_fid.xfs_fid_pad) + return XFS_ERROR(EINVAL); + } + + /* + * Crack the handle, obtain the inode # & generation # + */ + xfid = (struct xfs_fid *)&handlep->ha_fid; + if (xfid->xfs_fid_len == sizeof(*xfid) - sizeof(xfid->xfs_fid_len)) { + ino = xfid->xfs_fid_ino; + igen = xfid->xfs_fid_gen; + } else { + return XFS_ERROR(EINVAL); + } + + /* + * Get the XFS inode, building a vnode to go with it. + */ + error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0); + if (error) + return error; + if (ip == NULL) + return XFS_ERROR(EIO); + if (ip->i_d.di_mode == 0 || ip->i_d.di_gen != igen) { + xfs_iput_new(ip, XFS_ILOCK_SHARED); + return XFS_ERROR(ENOENT); + } + + vpp = XFS_ITOV(ip); + inodep = LINVFS_GET_IP(vpp); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + *vp = vpp; + *inode = inodep; + return 0; +} + +STATIC int +xfs_open_by_handle( + xfs_mount_t *mp, + void __user *arg, + struct file *parfilp, + struct inode *parinode) +{ + int error; + int new_fd; + int permflag; + struct file *filp; + struct inode *inode; + struct dentry *dentry; + vnode_t *vp; + xfs_fsop_handlereq_t hreq; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) + return -XFS_ERROR(EFAULT); + + error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &vp, &inode); + if (error) + return -error; + + /* Restrict xfs_open_by_handle to directories & regular files. */ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { + iput(inode); + return -XFS_ERROR(EINVAL); + } + +#if BITS_PER_LONG != 32 + hreq.oflags |= O_LARGEFILE; +#endif + /* Put open permission in namei format. */ + permflag = hreq.oflags; + if ((permflag+1) & O_ACCMODE) + permflag++; + if (permflag & O_TRUNC) + permflag |= 2; + + if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) && + (permflag & FMODE_WRITE) && IS_APPEND(inode)) { + iput(inode); + return -XFS_ERROR(EPERM); + } + + if ((permflag & FMODE_WRITE) && IS_IMMUTABLE(inode)) { + iput(inode); + return -XFS_ERROR(EACCES); + } + + /* Can't write directories. */ + if ( S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) { + iput(inode); + return -XFS_ERROR(EISDIR); + } + + if ((new_fd = get_unused_fd()) < 0) { + iput(inode); + return new_fd; + } + + dentry = d_alloc_anon(inode); + if (dentry == NULL) { + iput(inode); + put_unused_fd(new_fd); + return -XFS_ERROR(ENOMEM); + } + + /* Ensure umount returns EBUSY on umounts while this file is open. */ + mntget(parfilp->f_vfsmnt); + + /* Create file pointer. */ + filp = dentry_open(dentry, parfilp->f_vfsmnt, hreq.oflags); + if (IS_ERR(filp)) { + put_unused_fd(new_fd); + return -XFS_ERROR(-PTR_ERR(filp)); + } + if (inode->i_mode & S_IFREG) + filp->f_op = &linvfs_invis_file_operations; + + fd_install(new_fd, filp); + return new_fd; +} + +STATIC int +xfs_readlink_by_handle( + xfs_mount_t *mp, + void __user *arg, + struct file *parfilp, + struct inode *parinode) +{ + int error; + struct iovec aiov; + struct uio auio; + struct inode *inode; + xfs_fsop_handlereq_t hreq; + vnode_t *vp; + __u32 olen; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) + return -XFS_ERROR(EFAULT); + + error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &vp, &inode); + if (error) + return -error; + + /* Restrict this handle operation to symlinks only. */ + if (vp->v_type != VLNK) { + VN_RELE(vp); + return -XFS_ERROR(EINVAL); + } + + if (copy_from_user(&olen, hreq.ohandlen, sizeof(__u32))) { + VN_RELE(vp); + return -XFS_ERROR(EFAULT); + } + aiov.iov_len = olen; + aiov.iov_base = hreq.ohandle; + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_resid = olen; + + VOP_READLINK(vp, &auio, IO_INVIS, NULL, error); + + VN_RELE(vp); + return (olen - auio.uio_resid); +} + +STATIC int +xfs_fssetdm_by_handle( + xfs_mount_t *mp, + void __user *arg, + struct file *parfilp, + struct inode *parinode) +{ + int error; + struct fsdmidata fsd; + xfs_fsop_setdm_handlereq_t dmhreq; + struct inode *inode; + bhv_desc_t *bdp; + vnode_t *vp; + + if (!capable(CAP_MKNOD)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t))) + return -XFS_ERROR(EFAULT); + + error = xfs_vget_fsop_handlereq(mp, parinode, &dmhreq.hreq, &vp, &inode); + if (error) + return -error; + + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) { + VN_RELE(vp); + return -XFS_ERROR(EPERM); + } + + if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) { + VN_RELE(vp); + return -XFS_ERROR(EFAULT); + } + + bdp = bhv_base_unlocked(VN_BHV_HEAD(vp)); + error = xfs_set_dmattrs(bdp, fsd.fsd_dmevmask, fsd.fsd_dmstate, NULL); + + VN_RELE(vp); + if (error) + return -error; + return 0; +} + +STATIC int +xfs_attrlist_by_handle( + xfs_mount_t *mp, + void __user *arg, + struct file *parfilp, + struct inode *parinode) +{ + int error; + attrlist_cursor_kern_t *cursor; + xfs_fsop_attrlist_handlereq_t al_hreq; + struct inode *inode; + vnode_t *vp; + char *kbuf; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t))) + return -XFS_ERROR(EFAULT); + if (al_hreq.buflen > XATTR_LIST_MAX) + return -XFS_ERROR(EINVAL); + + error = xfs_vget_fsop_handlereq(mp, parinode, &al_hreq.hreq, + &vp, &inode); + if (error) + goto out; + + kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL); + if (!kbuf) + goto out_vn_rele; + + cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; + VOP_ATTR_LIST(vp, kbuf, al_hreq.buflen, al_hreq.flags, + cursor, NULL, error); + if (error) + goto out_kfree; + + if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen)) + error = -EFAULT; + + out_kfree: + kfree(kbuf); + out_vn_rele: + VN_RELE(vp); + out: + return -error; +} + +STATIC int +xfs_attrmulti_attr_get( + struct vnode *vp, + char *name, + char __user *ubuf, + __uint32_t *len, + __uint32_t flags) +{ + char *kbuf; + int error = EFAULT; + + if (*len > XATTR_SIZE_MAX) + return EINVAL; + kbuf = kmalloc(*len, GFP_KERNEL); + if (!kbuf) + return ENOMEM; + + VOP_ATTR_GET(vp, name, kbuf, len, flags, NULL, error); + if (error) + goto out_kfree; + + if (copy_to_user(ubuf, kbuf, *len)) + error = EFAULT; + + out_kfree: + kfree(kbuf); + return error; +} + +STATIC int +xfs_attrmulti_attr_set( + struct vnode *vp, + char *name, + const char __user *ubuf, + __uint32_t len, + __uint32_t flags) +{ + char *kbuf; + int error = EFAULT; + + if (IS_IMMUTABLE(&vp->v_inode) || IS_APPEND(&vp->v_inode)) + return EPERM; + if (len > XATTR_SIZE_MAX) + return EINVAL; + + kbuf = kmalloc(len, GFP_KERNEL); + if (!kbuf) + return ENOMEM; + + if (copy_from_user(kbuf, ubuf, len)) + goto out_kfree; + + VOP_ATTR_SET(vp, name, kbuf, len, flags, NULL, error); + + out_kfree: + kfree(kbuf); + return error; +} + +STATIC int +xfs_attrmulti_attr_remove( + struct vnode *vp, + char *name, + __uint32_t flags) +{ + int error; + + if (IS_IMMUTABLE(&vp->v_inode) || IS_APPEND(&vp->v_inode)) + return EPERM; + + VOP_ATTR_REMOVE(vp, name, flags, NULL, error); + return error; +} + +STATIC int +xfs_attrmulti_by_handle( + xfs_mount_t *mp, + void __user *arg, + struct file *parfilp, + struct inode *parinode) +{ + int error; + xfs_attr_multiop_t *ops; + xfs_fsop_attrmulti_handlereq_t am_hreq; + struct inode *inode; + vnode_t *vp; + unsigned int i, size; + char *attr_name; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) + return -XFS_ERROR(EFAULT); + + error = xfs_vget_fsop_handlereq(mp, parinode, &am_hreq.hreq, &vp, &inode); + if (error) + goto out; + + error = E2BIG; + size = am_hreq.opcount * sizeof(attr_multiop_t); + if (!size || size > 16 * PAGE_SIZE) + goto out_vn_rele; + + error = ENOMEM; + ops = kmalloc(size, GFP_KERNEL); + if (!ops) + goto out_vn_rele; + + error = EFAULT; + if (copy_from_user(ops, am_hreq.ops, size)) + goto out_kfree_ops; + + attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); + if (!attr_name) + goto out_kfree_ops; + + + error = 0; + for (i = 0; i < am_hreq.opcount; i++) { + ops[i].am_error = strncpy_from_user(attr_name, + ops[i].am_attrname, MAXNAMELEN); + if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) + error = -ERANGE; + if (ops[i].am_error < 0) + break; + + switch (ops[i].am_opcode) { + case ATTR_OP_GET: + ops[i].am_error = xfs_attrmulti_attr_get(vp, + attr_name, ops[i].am_attrvalue, + &ops[i].am_length, ops[i].am_flags); + break; + case ATTR_OP_SET: + ops[i].am_error = xfs_attrmulti_attr_set(vp, + attr_name, ops[i].am_attrvalue, + ops[i].am_length, ops[i].am_flags); + break; + case ATTR_OP_REMOVE: + ops[i].am_error = xfs_attrmulti_attr_remove(vp, + attr_name, ops[i].am_flags); + break; + default: + ops[i].am_error = EINVAL; + } + } + + if (copy_to_user(am_hreq.ops, ops, size)) + error = XFS_ERROR(EFAULT); + + kfree(attr_name); + out_kfree_ops: + kfree(ops); + out_vn_rele: + VN_RELE(vp); + out: + return -error; +} + +/* prototypes for a few of the stack-hungry cases that have + * their own functions. Functions are defined after their use + * so gcc doesn't get fancy and inline them with -03 */ + +STATIC int +xfs_ioc_space( + bhv_desc_t *bdp, + vnode_t *vp, + struct file *filp, + int flags, + unsigned int cmd, + void __user *arg); + +STATIC int +xfs_ioc_bulkstat( + xfs_mount_t *mp, + unsigned int cmd, + void __user *arg); + +STATIC int +xfs_ioc_fsgeometry_v1( + xfs_mount_t *mp, + void __user *arg); + +STATIC int +xfs_ioc_fsgeometry( + xfs_mount_t *mp, + void __user *arg); + +STATIC int +xfs_ioc_xattr( + vnode_t *vp, + xfs_inode_t *ip, + struct file *filp, + unsigned int cmd, + void __user *arg); + +STATIC int +xfs_ioc_getbmap( + bhv_desc_t *bdp, + struct file *filp, + int flags, + unsigned int cmd, + void __user *arg); + +STATIC int +xfs_ioc_getbmapx( + bhv_desc_t *bdp, + void __user *arg); + +int +xfs_ioctl( + bhv_desc_t *bdp, + struct inode *inode, + struct file *filp, + int ioflags, + unsigned int cmd, + void __user *arg) +{ + int error; + vnode_t *vp; + xfs_inode_t *ip; + xfs_mount_t *mp; + + vp = LINVFS_GET_VP(inode); + + vn_trace_entry(vp, "xfs_ioctl", (inst_t *)__return_address); + + ip = XFS_BHVTOI(bdp); + mp = ip->i_mount; + + switch (cmd) { + + case XFS_IOC_ALLOCSP: + case XFS_IOC_FREESP: + case XFS_IOC_RESVSP: + case XFS_IOC_UNRESVSP: + case XFS_IOC_ALLOCSP64: + case XFS_IOC_FREESP64: + case XFS_IOC_RESVSP64: + case XFS_IOC_UNRESVSP64: + /* + * Only allow the sys admin to reserve space unless + * unwritten extents are enabled. + */ + if (!XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb) && + !capable(CAP_SYS_ADMIN)) + return -EPERM; + + return xfs_ioc_space(bdp, vp, filp, ioflags, cmd, arg); + + case XFS_IOC_DIOINFO: { + struct dioattr da; + xfs_buftarg_t *target = + (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + + da.d_mem = da.d_miniosz = 1 << target->pbr_sshift; + /* The size dio will do in one go */ + da.d_maxiosz = 64 * PAGE_CACHE_SIZE; + + if (copy_to_user(arg, &da, sizeof(da))) + return -XFS_ERROR(EFAULT); + return 0; + } + + case XFS_IOC_FSBULKSTAT_SINGLE: + case XFS_IOC_FSBULKSTAT: + case XFS_IOC_FSINUMBERS: + return xfs_ioc_bulkstat(mp, cmd, arg); + + case XFS_IOC_FSGEOMETRY_V1: + return xfs_ioc_fsgeometry_v1(mp, arg); + + case XFS_IOC_FSGEOMETRY: + return xfs_ioc_fsgeometry(mp, arg); + + case XFS_IOC_GETVERSION: + case XFS_IOC_GETXFLAGS: + case XFS_IOC_SETXFLAGS: + case XFS_IOC_FSGETXATTR: + case XFS_IOC_FSSETXATTR: + case XFS_IOC_FSGETXATTRA: + return xfs_ioc_xattr(vp, ip, filp, cmd, arg); + + case XFS_IOC_FSSETDM: { + struct fsdmidata dmi; + + if (copy_from_user(&dmi, arg, sizeof(dmi))) + return -XFS_ERROR(EFAULT); + + error = xfs_set_dmattrs(bdp, dmi.fsd_dmevmask, dmi.fsd_dmstate, + NULL); + return -error; + } + + case XFS_IOC_GETBMAP: + case XFS_IOC_GETBMAPA: + return xfs_ioc_getbmap(bdp, filp, ioflags, cmd, arg); + + case XFS_IOC_GETBMAPX: + return xfs_ioc_getbmapx(bdp, arg); + + case XFS_IOC_FD_TO_HANDLE: + case XFS_IOC_PATH_TO_HANDLE: + case XFS_IOC_PATH_TO_FSHANDLE: + return xfs_find_handle(cmd, arg); + + case XFS_IOC_OPEN_BY_HANDLE: + return xfs_open_by_handle(mp, arg, filp, inode); + + case XFS_IOC_FSSETDM_BY_HANDLE: + return xfs_fssetdm_by_handle(mp, arg, filp, inode); + + case XFS_IOC_READLINK_BY_HANDLE: + return xfs_readlink_by_handle(mp, arg, filp, inode); + + case XFS_IOC_ATTRLIST_BY_HANDLE: + return xfs_attrlist_by_handle(mp, arg, filp, inode); + + case XFS_IOC_ATTRMULTI_BY_HANDLE: + return xfs_attrmulti_by_handle(mp, arg, filp, inode); + + case XFS_IOC_SWAPEXT: { + error = xfs_swapext((struct xfs_swapext __user *)arg); + return -error; + } + + case XFS_IOC_FSCOUNTS: { + xfs_fsop_counts_t out; + + error = xfs_fs_counts(mp, &out); + if (error) + return -error; + + if (copy_to_user(arg, &out, sizeof(out))) + return -XFS_ERROR(EFAULT); + return 0; + } + + case XFS_IOC_SET_RESBLKS: { + xfs_fsop_resblks_t inout; + __uint64_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&inout, arg, sizeof(inout))) + return -XFS_ERROR(EFAULT); + + /* input parameter is passed in resblks field of structure */ + in = inout.resblks; + error = xfs_reserve_blocks(mp, &in, &inout); + if (error) + return -error; + + if (copy_to_user(arg, &inout, sizeof(inout))) + return -XFS_ERROR(EFAULT); + return 0; + } + + case XFS_IOC_GET_RESBLKS: { + xfs_fsop_resblks_t out; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + error = xfs_reserve_blocks(mp, NULL, &out); + if (error) + return -error; + + if (copy_to_user(arg, &out, sizeof(out))) + return -XFS_ERROR(EFAULT); + + return 0; + } + + case XFS_IOC_FSGROWFSDATA: { + xfs_growfs_data_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&in, arg, sizeof(in))) + return -XFS_ERROR(EFAULT); + + error = xfs_growfs_data(mp, &in); + return -error; + } + + case XFS_IOC_FSGROWFSLOG: { + xfs_growfs_log_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&in, arg, sizeof(in))) + return -XFS_ERROR(EFAULT); + + error = xfs_growfs_log(mp, &in); + return -error; + } + + case XFS_IOC_FSGROWFSRT: { + xfs_growfs_rt_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&in, arg, sizeof(in))) + return -XFS_ERROR(EFAULT); + + error = xfs_growfs_rt(mp, &in); + return -error; + } + + case XFS_IOC_FREEZE: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (inode->i_sb->s_frozen == SB_UNFROZEN) + freeze_bdev(inode->i_sb->s_bdev); + return 0; + + case XFS_IOC_THAW: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (inode->i_sb->s_frozen != SB_UNFROZEN) + thaw_bdev(inode->i_sb->s_bdev, inode->i_sb); + return 0; + + case XFS_IOC_GOINGDOWN: { + __uint32_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(in, (__uint32_t __user *)arg)) + return -XFS_ERROR(EFAULT); + + error = xfs_fs_goingdown(mp, in); + return -error; + } + + case XFS_IOC_ERROR_INJECTION: { + xfs_error_injection_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&in, arg, sizeof(in))) + return -XFS_ERROR(EFAULT); + + error = xfs_errortag_add(in.errtag, mp); + return -error; + } + + case XFS_IOC_ERROR_CLEARALL: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + error = xfs_errortag_clearall(mp); + return -error; + + default: + return -ENOTTY; + } +} + +STATIC int +xfs_ioc_space( + bhv_desc_t *bdp, + vnode_t *vp, + struct file *filp, + int ioflags, + unsigned int cmd, + void __user *arg) +{ + xfs_flock64_t bf; + int attr_flags = 0; + int error; + + if (vp->v_inode.i_flags & (S_IMMUTABLE|S_APPEND)) + return -XFS_ERROR(EPERM); + + if (!(filp->f_flags & FMODE_WRITE)) + return -XFS_ERROR(EBADF); + + if (vp->v_type != VREG) + return -XFS_ERROR(EINVAL); + + if (copy_from_user(&bf, arg, sizeof(bf))) + return -XFS_ERROR(EFAULT); + + if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) + attr_flags |= ATTR_NONBLOCK; + if (ioflags & IO_INVIS) + attr_flags |= ATTR_DMI; + + error = xfs_change_file_space(bdp, cmd, &bf, filp->f_pos, + NULL, attr_flags); + return -error; +} + +STATIC int +xfs_ioc_bulkstat( + xfs_mount_t *mp, + unsigned int cmd, + void __user *arg) +{ + xfs_fsop_bulkreq_t bulkreq; + int count; /* # of records returned */ + xfs_ino_t inlast; /* last inode number */ + int done; + int error; + + /* done = 1 if there are more stats to get and if bulkstat */ + /* should be called again (unused here, but used in dmapi) */ + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + if (copy_from_user(&bulkreq, arg, sizeof(xfs_fsop_bulkreq_t))) + return -XFS_ERROR(EFAULT); + + if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64))) + return -XFS_ERROR(EFAULT); + + if ((count = bulkreq.icount) <= 0) + return -XFS_ERROR(EINVAL); + + if (cmd == XFS_IOC_FSINUMBERS) + error = xfs_inumbers(mp, &inlast, &count, + bulkreq.ubuffer); + else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE) + error = xfs_bulkstat_single(mp, &inlast, + bulkreq.ubuffer, &done); + else { /* XFS_IOC_FSBULKSTAT */ + if (count == 1 && inlast != 0) { + inlast++; + error = xfs_bulkstat_single(mp, &inlast, + bulkreq.ubuffer, &done); + } else { + error = xfs_bulkstat(mp, &inlast, &count, + (bulkstat_one_pf)xfs_bulkstat_one, NULL, + sizeof(xfs_bstat_t), bulkreq.ubuffer, + BULKSTAT_FG_QUICK, &done); + } + } + + if (error) + return -error; + + if (bulkreq.ocount != NULL) { + if (copy_to_user(bulkreq.lastip, &inlast, + sizeof(xfs_ino_t))) + return -XFS_ERROR(EFAULT); + + if (copy_to_user(bulkreq.ocount, &count, sizeof(count))) + return -XFS_ERROR(EFAULT); + } + + return 0; +} + +STATIC int +xfs_ioc_fsgeometry_v1( + xfs_mount_t *mp, + void __user *arg) +{ + xfs_fsop_geom_v1_t fsgeo; + int error; + + error = xfs_fs_geometry(mp, (xfs_fsop_geom_t *)&fsgeo, 3); + if (error) + return -error; + + if (copy_to_user(arg, &fsgeo, sizeof(fsgeo))) + return -XFS_ERROR(EFAULT); + return 0; +} + +STATIC int +xfs_ioc_fsgeometry( + xfs_mount_t *mp, + void __user *arg) +{ + xfs_fsop_geom_t fsgeo; + int error; + + error = xfs_fs_geometry(mp, &fsgeo, 4); + if (error) + return -error; + + if (copy_to_user(arg, &fsgeo, sizeof(fsgeo))) + return -XFS_ERROR(EFAULT); + return 0; +} + +/* + * Linux extended inode flags interface. + */ +#define LINUX_XFLAG_SYNC 0x00000008 /* Synchronous updates */ +#define LINUX_XFLAG_IMMUTABLE 0x00000010 /* Immutable file */ +#define LINUX_XFLAG_APPEND 0x00000020 /* writes to file may only append */ +#define LINUX_XFLAG_NODUMP 0x00000040 /* do not dump file */ +#define LINUX_XFLAG_NOATIME 0x00000080 /* do not update atime */ + +STATIC unsigned int +xfs_merge_ioc_xflags( + unsigned int flags, + unsigned int start) +{ + unsigned int xflags = start; + + if (flags & LINUX_XFLAG_IMMUTABLE) + xflags |= XFS_XFLAG_IMMUTABLE; + else + xflags &= ~XFS_XFLAG_IMMUTABLE; + if (flags & LINUX_XFLAG_APPEND) + xflags |= XFS_XFLAG_APPEND; + else + xflags &= ~XFS_XFLAG_APPEND; + if (flags & LINUX_XFLAG_SYNC) + xflags |= XFS_XFLAG_SYNC; + else + xflags &= ~XFS_XFLAG_SYNC; + if (flags & LINUX_XFLAG_NOATIME) + xflags |= XFS_XFLAG_NOATIME; + else + xflags &= ~XFS_XFLAG_NOATIME; + if (flags & LINUX_XFLAG_NODUMP) + xflags |= XFS_XFLAG_NODUMP; + else + xflags &= ~XFS_XFLAG_NODUMP; + + return xflags; +} + +STATIC unsigned int +xfs_di2lxflags( + __uint16_t di_flags) +{ + unsigned int flags = 0; + + if (di_flags & XFS_DIFLAG_IMMUTABLE) + flags |= LINUX_XFLAG_IMMUTABLE; + if (di_flags & XFS_DIFLAG_APPEND) + flags |= LINUX_XFLAG_APPEND; + if (di_flags & XFS_DIFLAG_SYNC) + flags |= LINUX_XFLAG_SYNC; + if (di_flags & XFS_DIFLAG_NOATIME) + flags |= LINUX_XFLAG_NOATIME; + if (di_flags & XFS_DIFLAG_NODUMP) + flags |= LINUX_XFLAG_NODUMP; + return flags; +} + +STATIC int +xfs_ioc_xattr( + vnode_t *vp, + xfs_inode_t *ip, + struct file *filp, + unsigned int cmd, + void __user *arg) +{ + struct fsxattr fa; + vattr_t va; + int error; + int attr_flags; + unsigned int flags; + + switch (cmd) { + case XFS_IOC_FSGETXATTR: { + va.va_mask = XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_NEXTENTS; + VOP_GETATTR(vp, &va, 0, NULL, error); + if (error) + return -error; + + fa.fsx_xflags = va.va_xflags; + fa.fsx_extsize = va.va_extsize; + fa.fsx_nextents = va.va_nextents; + + if (copy_to_user(arg, &fa, sizeof(fa))) + return -XFS_ERROR(EFAULT); + return 0; + } + + case XFS_IOC_FSSETXATTR: { + if (copy_from_user(&fa, arg, sizeof(fa))) + return -XFS_ERROR(EFAULT); + + attr_flags = 0; + if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) + attr_flags |= ATTR_NONBLOCK; + + va.va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE; + va.va_xflags = fa.fsx_xflags; + va.va_extsize = fa.fsx_extsize; + + VOP_SETATTR(vp, &va, attr_flags, NULL, error); + if (!error) + vn_revalidate(vp); /* update Linux inode flags */ + return -error; + } + + case XFS_IOC_FSGETXATTRA: { + va.va_mask = XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_ANEXTENTS; + VOP_GETATTR(vp, &va, 0, NULL, error); + if (error) + return -error; + + fa.fsx_xflags = va.va_xflags; + fa.fsx_extsize = va.va_extsize; + fa.fsx_nextents = va.va_anextents; + + if (copy_to_user(arg, &fa, sizeof(fa))) + return -XFS_ERROR(EFAULT); + return 0; + } + + case XFS_IOC_GETXFLAGS: { + flags = xfs_di2lxflags(ip->i_d.di_flags); + if (copy_to_user(arg, &flags, sizeof(flags))) + return -XFS_ERROR(EFAULT); + return 0; + } + + case XFS_IOC_SETXFLAGS: { + if (copy_from_user(&flags, arg, sizeof(flags))) + return -XFS_ERROR(EFAULT); + + if (flags & ~(LINUX_XFLAG_IMMUTABLE | LINUX_XFLAG_APPEND | \ + LINUX_XFLAG_NOATIME | LINUX_XFLAG_NODUMP | \ + LINUX_XFLAG_SYNC)) + return -XFS_ERROR(EOPNOTSUPP); + + attr_flags = 0; + if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) + attr_flags |= ATTR_NONBLOCK; + + va.va_mask = XFS_AT_XFLAGS; + va.va_xflags = xfs_merge_ioc_xflags(flags, + xfs_ip2xflags(ip)); + + VOP_SETATTR(vp, &va, attr_flags, NULL, error); + if (!error) + vn_revalidate(vp); /* update Linux inode flags */ + return -error; + } + + case XFS_IOC_GETVERSION: { + flags = LINVFS_GET_IP(vp)->i_generation; + if (copy_to_user(arg, &flags, sizeof(flags))) + return -XFS_ERROR(EFAULT); + return 0; + } + + default: + return -ENOTTY; + } +} + +STATIC int +xfs_ioc_getbmap( + bhv_desc_t *bdp, + struct file *filp, + int ioflags, + unsigned int cmd, + void __user *arg) +{ + struct getbmap bm; + int iflags; + int error; + + if (copy_from_user(&bm, arg, sizeof(bm))) + return -XFS_ERROR(EFAULT); + + if (bm.bmv_count < 2) + return -XFS_ERROR(EINVAL); + + iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0); + if (ioflags & IO_INVIS) + iflags |= BMV_IF_NO_DMAPI_READ; + + error = xfs_getbmap(bdp, &bm, (struct getbmap __user *)arg+1, iflags); + if (error) + return -error; + + if (copy_to_user(arg, &bm, sizeof(bm))) + return -XFS_ERROR(EFAULT); + return 0; +} + +STATIC int +xfs_ioc_getbmapx( + bhv_desc_t *bdp, + void __user *arg) +{ + struct getbmapx bmx; + struct getbmap bm; + int iflags; + int error; + + if (copy_from_user(&bmx, arg, sizeof(bmx))) + return -XFS_ERROR(EFAULT); + + if (bmx.bmv_count < 2) + return -XFS_ERROR(EINVAL); + + /* + * Map input getbmapx structure to a getbmap + * structure for xfs_getbmap. + */ + GETBMAP_CONVERT(bmx, bm); + + iflags = bmx.bmv_iflags; + + if (iflags & (~BMV_IF_VALID)) + return -XFS_ERROR(EINVAL); + + iflags |= BMV_IF_EXTENDED; + + error = xfs_getbmap(bdp, &bm, (struct getbmapx __user *)arg+1, iflags); + if (error) + return -error; + + GETBMAP_CONVERT(bm, bmx); + + if (copy_to_user(arg, &bmx, sizeof(bmx))) + return -XFS_ERROR(EFAULT); + + return 0; +} diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c new file mode 100644 index 000000000000..7a12c83184f5 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include <linux/config.h> +#include <linux/compat.h> +#include <linux/init.h> +#include <linux/ioctl.h> +#include <linux/ioctl32.h> +#include <linux/syscalls.h> +#include <linux/types.h> +#include <linux/fs.h> +#include <asm/uaccess.h> + +#include "xfs.h" +#include "xfs_types.h" +#include "xfs_fs.h" +#include "xfs_vfs.h" +#include "xfs_vnode.h" +#include "xfs_dfrag.h" + +#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) +#define BROKEN_X86_ALIGNMENT +#else + +typedef struct xfs_fsop_bulkreq32 { + compat_uptr_t lastip; /* last inode # pointer */ + __s32 icount; /* count of entries in buffer */ + compat_uptr_t ubuffer; /* user buffer for inode desc. */ + __s32 ocount; /* output count pointer */ +} xfs_fsop_bulkreq32_t; + +static unsigned long +xfs_ioctl32_bulkstat(unsigned long arg) +{ + xfs_fsop_bulkreq32_t __user *p32 = (void __user *)arg; + xfs_fsop_bulkreq_t __user *p = compat_alloc_user_space(sizeof(*p)); + u32 addr; + + if (get_user(addr, &p32->lastip) || + put_user(compat_ptr(addr), &p->lastip) || + copy_in_user(&p->icount, &p32->icount, sizeof(s32)) || + get_user(addr, &p32->ubuffer) || + put_user(compat_ptr(addr), &p->ubuffer) || + get_user(addr, &p32->ocount) || + put_user(compat_ptr(addr), &p->ocount)) + return -EFAULT; + + return (unsigned long)p; +} +#endif + +static long +__xfs_compat_ioctl(int mode, struct file *f, unsigned cmd, unsigned long arg) +{ + int error; + struct inode *inode = f->f_dentry->d_inode; + vnode_t *vp = LINVFS_GET_VP(inode); + + switch (cmd) { + case XFS_IOC_DIOINFO: + case XFS_IOC_FSGEOMETRY_V1: + case XFS_IOC_FSGEOMETRY: + case XFS_IOC_GETVERSION: + case XFS_IOC_GETXFLAGS: + case XFS_IOC_SETXFLAGS: + case XFS_IOC_FSGETXATTR: + case XFS_IOC_FSSETXATTR: + case XFS_IOC_FSGETXATTRA: + case XFS_IOC_FSSETDM: + case XFS_IOC_GETBMAP: + case XFS_IOC_GETBMAPA: + case XFS_IOC_GETBMAPX: +/* not handled + case XFS_IOC_FD_TO_HANDLE: + case XFS_IOC_PATH_TO_HANDLE: + case XFS_IOC_PATH_TO_HANDLE: + case XFS_IOC_PATH_TO_FSHANDLE: + case XFS_IOC_OPEN_BY_HANDLE: + case XFS_IOC_FSSETDM_BY_HANDLE: + case XFS_IOC_READLINK_BY_HANDLE: + case XFS_IOC_ATTRLIST_BY_HANDLE: + case XFS_IOC_ATTRMULTI_BY_HANDLE: +*/ + case XFS_IOC_FSCOUNTS: + case XFS_IOC_SET_RESBLKS: + case XFS_IOC_GET_RESBLKS: + case XFS_IOC_FSGROWFSDATA: + case XFS_IOC_FSGROWFSLOG: + case XFS_IOC_FSGROWFSRT: + case XFS_IOC_FREEZE: + case XFS_IOC_THAW: + case XFS_IOC_GOINGDOWN: + case XFS_IOC_ERROR_INJECTION: + case XFS_IOC_ERROR_CLEARALL: + break; + +#ifndef BROKEN_X86_ALIGNMENT + /* xfs_flock_t and xfs_bstat_t have wrong u32 vs u64 alignment */ + case XFS_IOC_ALLOCSP: + case XFS_IOC_FREESP: + case XFS_IOC_RESVSP: + case XFS_IOC_UNRESVSP: + case XFS_IOC_ALLOCSP64: + case XFS_IOC_FREESP64: + case XFS_IOC_RESVSP64: + case XFS_IOC_UNRESVSP64: + case XFS_IOC_SWAPEXT: + break; + + case XFS_IOC_FSBULKSTAT_SINGLE: + case XFS_IOC_FSBULKSTAT: + case XFS_IOC_FSINUMBERS: + arg = xfs_ioctl32_bulkstat(arg); + break; +#endif + default: + return -ENOIOCTLCMD; + } + + VOP_IOCTL(vp, inode, f, mode, cmd, (void __user *)arg, error); + VMODIFY(vp); + + return error; +} + +long xfs_compat_ioctl(struct file *f, unsigned cmd, unsigned long arg) +{ + return __xfs_compat_ioctl(0, f, cmd, arg); +} + +long xfs_compat_invis_ioctl(struct file *f, unsigned cmd, unsigned long arg) +{ + return __xfs_compat_ioctl(IO_INVIS, f, cmd, arg); +} diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h new file mode 100644 index 000000000000..779f69a48116 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_ioctl32.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +long xfs_compat_ioctl(struct file *f, unsigned cmd, unsigned long arg); +long xfs_compat_invis_ioctl(struct file *f, unsigned cmd, unsigned long arg); diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c new file mode 100644 index 000000000000..407e99359391 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -0,0 +1,680 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_alloc.h" +#include "xfs_dmapi.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bit.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_itable.h" +#include "xfs_rw.h" +#include "xfs_acl.h" +#include "xfs_cap.h" +#include "xfs_mac.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_utils.h" + +#include <linux/xattr.h> +#include <linux/namei.h> + + +/* + * Pull the link count and size up from the xfs inode to the linux inode + */ +STATIC void +validate_fields( + struct inode *ip) +{ + vnode_t *vp = LINVFS_GET_VP(ip); + vattr_t va; + int error; + + va.va_mask = XFS_AT_NLINK|XFS_AT_SIZE|XFS_AT_NBLOCKS; + VOP_GETATTR(vp, &va, ATTR_LAZY, NULL, error); + if (likely(!error)) { + ip->i_nlink = va.va_nlink; + ip->i_blocks = va.va_nblocks; + + /* we're under i_sem so i_size can't change under us */ + if (i_size_read(ip) != va.va_size) + i_size_write(ip, va.va_size); + } +} + +/* + * Determine whether a process has a valid fs_struct (kernel daemons + * like knfsd don't have an fs_struct). + * + * XXX(hch): nfsd is broken, better fix it instead. + */ +STATIC inline int +has_fs_struct(struct task_struct *task) +{ + return (task->fs != init_task.fs); +} + +STATIC int +linvfs_mknod( + struct inode *dir, + struct dentry *dentry, + int mode, + dev_t rdev) +{ + struct inode *ip; + vattr_t va; + vnode_t *vp = NULL, *dvp = LINVFS_GET_VP(dir); + xfs_acl_t *default_acl = NULL; + attrexists_t test_default_acl = _ACL_DEFAULT_EXISTS; + int error; + + /* + * Irix uses Missed'em'V split, but doesn't want to see + * the upper 5 bits of (14bit) major. + */ + if (!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff) + return -EINVAL; + + if (test_default_acl && test_default_acl(dvp)) { + if (!_ACL_ALLOC(default_acl)) + return -ENOMEM; + if (!_ACL_GET_DEFAULT(dvp, default_acl)) { + _ACL_FREE(default_acl); + default_acl = NULL; + } + } + + if (IS_POSIXACL(dir) && !default_acl && has_fs_struct(current)) + mode &= ~current->fs->umask; + + memset(&va, 0, sizeof(va)); + va.va_mask = XFS_AT_TYPE|XFS_AT_MODE; + va.va_type = IFTOVT(mode); + va.va_mode = mode; + + switch (mode & S_IFMT) { + case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: + va.va_rdev = sysv_encode_dev(rdev); + va.va_mask |= XFS_AT_RDEV; + /*FALLTHROUGH*/ + case S_IFREG: + VOP_CREATE(dvp, dentry, &va, &vp, NULL, error); + break; + case S_IFDIR: + VOP_MKDIR(dvp, dentry, &va, &vp, NULL, error); + break; + default: + error = EINVAL; + break; + } + + if (default_acl) { + if (!error) { + error = _ACL_INHERIT(vp, &va, default_acl); + if (!error) { + VMODIFY(vp); + } else { + struct dentry teardown = {}; + int err2; + + /* Oh, the horror. + * If we can't add the ACL we must back out. + * ENOSPC can hit here, among other things. + */ + teardown.d_inode = ip = LINVFS_GET_IP(vp); + teardown.d_name = dentry->d_name; + + vn_mark_bad(vp); + + if (S_ISDIR(mode)) + VOP_RMDIR(dvp, &teardown, NULL, err2); + else + VOP_REMOVE(dvp, &teardown, NULL, err2); + VN_RELE(vp); + } + } + _ACL_FREE(default_acl); + } + + if (!error) { + ASSERT(vp); + ip = LINVFS_GET_IP(vp); + + if (S_ISCHR(mode) || S_ISBLK(mode)) + ip->i_rdev = rdev; + else if (S_ISDIR(mode)) + validate_fields(ip); + d_instantiate(dentry, ip); + validate_fields(dir); + } + return -error; +} + +STATIC int +linvfs_create( + struct inode *dir, + struct dentry *dentry, + int mode, + struct nameidata *nd) +{ + return linvfs_mknod(dir, dentry, mode, 0); +} + +STATIC int +linvfs_mkdir( + struct inode *dir, + struct dentry *dentry, + int mode) +{ + return linvfs_mknod(dir, dentry, mode|S_IFDIR, 0); +} + +STATIC struct dentry * +linvfs_lookup( + struct inode *dir, + struct dentry *dentry, + struct nameidata *nd) +{ + struct vnode *vp = LINVFS_GET_VP(dir), *cvp; + int error; + + if (dentry->d_name.len >= MAXNAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + VOP_LOOKUP(vp, dentry, &cvp, 0, NULL, NULL, error); + if (error) { + if (unlikely(error != ENOENT)) + return ERR_PTR(-error); + d_add(dentry, NULL); + return NULL; + } + + return d_splice_alias(LINVFS_GET_IP(cvp), dentry); +} + +STATIC int +linvfs_link( + struct dentry *old_dentry, + struct inode *dir, + struct dentry *dentry) +{ + struct inode *ip; /* inode of guy being linked to */ + vnode_t *tdvp; /* target directory for new name/link */ + vnode_t *vp; /* vp of name being linked */ + int error; + + ip = old_dentry->d_inode; /* inode being linked to */ + if (S_ISDIR(ip->i_mode)) + return -EPERM; + + tdvp = LINVFS_GET_VP(dir); + vp = LINVFS_GET_VP(ip); + + VOP_LINK(tdvp, vp, dentry, NULL, error); + if (!error) { + VMODIFY(tdvp); + VN_HOLD(vp); + validate_fields(ip); + d_instantiate(dentry, ip); + } + return -error; +} + +STATIC int +linvfs_unlink( + struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode; + vnode_t *dvp; /* directory containing name to remove */ + int error; + + inode = dentry->d_inode; + dvp = LINVFS_GET_VP(dir); + + VOP_REMOVE(dvp, dentry, NULL, error); + if (!error) { + validate_fields(dir); /* For size only */ + validate_fields(inode); + } + + return -error; +} + +STATIC int +linvfs_symlink( + struct inode *dir, + struct dentry *dentry, + const char *symname) +{ + struct inode *ip; + vattr_t va; + vnode_t *dvp; /* directory containing name of symlink */ + vnode_t *cvp; /* used to lookup symlink to put in dentry */ + int error; + + dvp = LINVFS_GET_VP(dir); + cvp = NULL; + + memset(&va, 0, sizeof(va)); + va.va_type = VLNK; + va.va_mode = irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO; + va.va_mask = XFS_AT_TYPE|XFS_AT_MODE; + + error = 0; + VOP_SYMLINK(dvp, dentry, &va, (char *)symname, &cvp, NULL, error); + if (!error && cvp) { + ASSERT(cvp->v_type == VLNK); + ip = LINVFS_GET_IP(cvp); + d_instantiate(dentry, ip); + validate_fields(dir); + validate_fields(ip); /* size needs update */ + } + return -error; +} + +STATIC int +linvfs_rmdir( + struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + vnode_t *dvp = LINVFS_GET_VP(dir); + int error; + + VOP_RMDIR(dvp, dentry, NULL, error); + if (!error) { + validate_fields(inode); + validate_fields(dir); + } + return -error; +} + +STATIC int +linvfs_rename( + struct inode *odir, + struct dentry *odentry, + struct inode *ndir, + struct dentry *ndentry) +{ + struct inode *new_inode = ndentry->d_inode; + vnode_t *fvp; /* from directory */ + vnode_t *tvp; /* target directory */ + int error; + + fvp = LINVFS_GET_VP(odir); + tvp = LINVFS_GET_VP(ndir); + + VOP_RENAME(fvp, odentry, tvp, ndentry, NULL, error); + if (error) + return -error; + + if (new_inode) + validate_fields(new_inode); + + validate_fields(odir); + if (ndir != odir) + validate_fields(ndir); + return 0; +} + +/* + * careful here - this function can get called recursively, so + * we need to be very careful about how much stack we use. + * uio is kmalloced for this reason... + */ +STATIC int +linvfs_follow_link( + struct dentry *dentry, + struct nameidata *nd) +{ + vnode_t *vp; + uio_t *uio; + iovec_t iov; + int error; + char *link; + + ASSERT(dentry); + ASSERT(nd); + + link = (char *)kmalloc(MAXNAMELEN+1, GFP_KERNEL); + if (!link) { + nd_set_link(nd, ERR_PTR(-ENOMEM)); + return 0; + } + + uio = (uio_t *)kmalloc(sizeof(uio_t), GFP_KERNEL); + if (!uio) { + kfree(link); + nd_set_link(nd, ERR_PTR(-ENOMEM)); + return 0; + } + + vp = LINVFS_GET_VP(dentry->d_inode); + + iov.iov_base = link; + iov.iov_len = MAXNAMELEN; + + uio->uio_iov = &iov; + uio->uio_offset = 0; + uio->uio_segflg = UIO_SYSSPACE; + uio->uio_resid = MAXNAMELEN; + uio->uio_iovcnt = 1; + + VOP_READLINK(vp, uio, 0, NULL, error); + if (error) { + kfree(link); + link = ERR_PTR(-error); + } else { + link[MAXNAMELEN - uio->uio_resid] = '\0'; + } + kfree(uio); + + nd_set_link(nd, link); + return 0; +} + +static void linvfs_put_link(struct dentry *dentry, struct nameidata *nd) +{ + char *s = nd_get_link(nd); + if (!IS_ERR(s)) + kfree(s); +} + +#ifdef CONFIG_XFS_POSIX_ACL +STATIC int +linvfs_permission( + struct inode *inode, + int mode, + struct nameidata *nd) +{ + vnode_t *vp = LINVFS_GET_VP(inode); + int error; + + mode <<= 6; /* convert from linux to vnode access bits */ + VOP_ACCESS(vp, mode, NULL, error); + return -error; +} +#else +#define linvfs_permission NULL +#endif + +STATIC int +linvfs_getattr( + struct vfsmount *mnt, + struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + vnode_t *vp = LINVFS_GET_VP(inode); + int error = 0; + + if (unlikely(vp->v_flag & VMODIFIED)) + error = vn_revalidate(vp); + if (!error) + generic_fillattr(inode, stat); + return 0; +} + +STATIC int +linvfs_setattr( + struct dentry *dentry, + struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + unsigned int ia_valid = attr->ia_valid; + vnode_t *vp = LINVFS_GET_VP(inode); + vattr_t vattr; + int flags = 0; + int error; + + memset(&vattr, 0, sizeof(vattr_t)); + if (ia_valid & ATTR_UID) { + vattr.va_mask |= XFS_AT_UID; + vattr.va_uid = attr->ia_uid; + } + if (ia_valid & ATTR_GID) { + vattr.va_mask |= XFS_AT_GID; + vattr.va_gid = attr->ia_gid; + } + if (ia_valid & ATTR_SIZE) { + vattr.va_mask |= XFS_AT_SIZE; + vattr.va_size = attr->ia_size; + } + if (ia_valid & ATTR_ATIME) { + vattr.va_mask |= XFS_AT_ATIME; + vattr.va_atime = attr->ia_atime; + } + if (ia_valid & ATTR_MTIME) { + vattr.va_mask |= XFS_AT_MTIME; + vattr.va_mtime = attr->ia_mtime; + } + if (ia_valid & ATTR_CTIME) { + vattr.va_mask |= XFS_AT_CTIME; + vattr.va_ctime = attr->ia_ctime; + } + if (ia_valid & ATTR_MODE) { + vattr.va_mask |= XFS_AT_MODE; + vattr.va_mode = attr->ia_mode; + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + inode->i_mode &= ~S_ISGID; + } + + if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET)) + flags |= ATTR_UTIME; +#ifdef ATTR_NO_BLOCK + if ((ia_valid & ATTR_NO_BLOCK)) + flags |= ATTR_NONBLOCK; +#endif + + VOP_SETATTR(vp, &vattr, flags, NULL, error); + if (error) + return -error; + vn_revalidate(vp); + return error; +} + +STATIC void +linvfs_truncate( + struct inode *inode) +{ + block_truncate_page(inode->i_mapping, inode->i_size, linvfs_get_block); +} + +STATIC int +linvfs_setxattr( + struct dentry *dentry, + const char *name, + const void *data, + size_t size, + int flags) +{ + vnode_t *vp = LINVFS_GET_VP(dentry->d_inode); + char *attr = (char *)name; + attrnames_t *namesp; + int xflags = 0; + int error; + + namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT); + if (!namesp) + return -EOPNOTSUPP; + attr += namesp->attr_namelen; + error = namesp->attr_capable(vp, NULL); + if (error) + return error; + + /* Convert Linux syscall to XFS internal ATTR flags */ + if (flags & XATTR_CREATE) + xflags |= ATTR_CREATE; + if (flags & XATTR_REPLACE) + xflags |= ATTR_REPLACE; + xflags |= namesp->attr_flag; + return namesp->attr_set(vp, attr, (void *)data, size, xflags); +} + +STATIC ssize_t +linvfs_getxattr( + struct dentry *dentry, + const char *name, + void *data, + size_t size) +{ + vnode_t *vp = LINVFS_GET_VP(dentry->d_inode); + char *attr = (char *)name; + attrnames_t *namesp; + int xflags = 0; + ssize_t error; + + namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT); + if (!namesp) + return -EOPNOTSUPP; + attr += namesp->attr_namelen; + error = namesp->attr_capable(vp, NULL); + if (error) + return error; + + /* Convert Linux syscall to XFS internal ATTR flags */ + if (!size) { + xflags |= ATTR_KERNOVAL; + data = NULL; + } + xflags |= namesp->attr_flag; + return namesp->attr_get(vp, attr, (void *)data, size, xflags); +} + +STATIC ssize_t +linvfs_listxattr( + struct dentry *dentry, + char *data, + size_t size) +{ + vnode_t *vp = LINVFS_GET_VP(dentry->d_inode); + int error, xflags = ATTR_KERNAMELS; + ssize_t result; + + if (!size) + xflags |= ATTR_KERNOVAL; + xflags |= capable(CAP_SYS_ADMIN) ? ATTR_KERNFULLS : ATTR_KERNORMALS; + + error = attr_generic_list(vp, data, size, xflags, &result); + if (error < 0) + return error; + return result; +} + +STATIC int +linvfs_removexattr( + struct dentry *dentry, + const char *name) +{ + vnode_t *vp = LINVFS_GET_VP(dentry->d_inode); + char *attr = (char *)name; + attrnames_t *namesp; + int xflags = 0; + int error; + + namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT); + if (!namesp) + return -EOPNOTSUPP; + attr += namesp->attr_namelen; + error = namesp->attr_capable(vp, NULL); + if (error) + return error; + xflags |= namesp->attr_flag; + return namesp->attr_remove(vp, attr, xflags); +} + + +struct inode_operations linvfs_file_inode_operations = { + .permission = linvfs_permission, + .truncate = linvfs_truncate, + .getattr = linvfs_getattr, + .setattr = linvfs_setattr, + .setxattr = linvfs_setxattr, + .getxattr = linvfs_getxattr, + .listxattr = linvfs_listxattr, + .removexattr = linvfs_removexattr, +}; + +struct inode_operations linvfs_dir_inode_operations = { + .create = linvfs_create, + .lookup = linvfs_lookup, + .link = linvfs_link, + .unlink = linvfs_unlink, + .symlink = linvfs_symlink, + .mkdir = linvfs_mkdir, + .rmdir = linvfs_rmdir, + .mknod = linvfs_mknod, + .rename = linvfs_rename, + .permission = linvfs_permission, + .getattr = linvfs_getattr, + .setattr = linvfs_setattr, + .setxattr = linvfs_setxattr, + .getxattr = linvfs_getxattr, + .listxattr = linvfs_listxattr, + .removexattr = linvfs_removexattr, +}; + +struct inode_operations linvfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = linvfs_follow_link, + .put_link = linvfs_put_link, + .permission = linvfs_permission, + .getattr = linvfs_getattr, + .setattr = linvfs_setattr, + .setxattr = linvfs_setxattr, + .getxattr = linvfs_getxattr, + .listxattr = linvfs_listxattr, + .removexattr = linvfs_removexattr, +}; diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/linux-2.6/xfs_iops.h new file mode 100644 index 000000000000..6a69a62c36b0 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_iops.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_IOPS_H__ +#define __XFS_IOPS_H__ + +extern struct inode_operations linvfs_file_inode_operations; +extern struct inode_operations linvfs_dir_inode_operations; +extern struct inode_operations linvfs_symlink_inode_operations; + +extern struct file_operations linvfs_file_operations; +extern struct file_operations linvfs_invis_file_operations; +extern struct file_operations linvfs_dir_operations; + +extern struct address_space_operations linvfs_aops; + +extern int linvfs_get_block(struct inode *, sector_t, struct buffer_head *, int); +extern void linvfs_unwritten_done(struct buffer_head *, int); + +extern int xfs_ioctl(struct bhv_desc *, struct inode *, struct file *, + int, unsigned int, void __user *); + +#endif /* __XFS_IOPS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h new file mode 100644 index 000000000000..71bb41019a12 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -0,0 +1,374 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_LINUX__ +#define __XFS_LINUX__ + +#include <linux/types.h> +#include <linux/config.h> + +/* + * Some types are conditional depending on the target system. + * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits. + * XFS_BIG_INUMS needs the VFS inode number to be 64 bits, as well + * as requiring XFS_BIG_BLKNOS to be set. + */ +#if defined(CONFIG_LBD) || (BITS_PER_LONG == 64) +# define XFS_BIG_BLKNOS 1 +# if BITS_PER_LONG == 64 +# define XFS_BIG_INUMS 1 +# else +# define XFS_BIG_INUMS 0 +# endif +#else +# define XFS_BIG_BLKNOS 0 +# define XFS_BIG_INUMS 0 +#endif + +#include <xfs_types.h> +#include <xfs_arch.h> + +#include <kmem.h> +#include <mrlock.h> +#include <spin.h> +#include <sv.h> +#include <mutex.h> +#include <sema.h> +#include <time.h> + +#include <support/qsort.h> +#include <support/ktrace.h> +#include <support/debug.h> +#include <support/move.h> +#include <support/uuid.h> + +#include <linux/mm.h> +#include <linux/kernel.h> +#include <linux/blkdev.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/file.h> +#include <linux/swap.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/bitops.h> +#include <linux/major.h> +#include <linux/pagemap.h> +#include <linux/vfs.h> +#include <linux/seq_file.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/proc_fs.h> +#include <linux/version.h> +#include <linux/sort.h> + +#include <asm/page.h> +#include <asm/div64.h> +#include <asm/param.h> +#include <asm/uaccess.h> +#include <asm/byteorder.h> +#include <asm/unaligned.h> + +#include <xfs_behavior.h> +#include <xfs_vfs.h> +#include <xfs_cred.h> +#include <xfs_vnode.h> +#include <xfs_stats.h> +#include <xfs_sysctl.h> +#include <xfs_iops.h> +#include <xfs_super.h> +#include <xfs_globals.h> +#include <xfs_fs_subr.h> +#include <xfs_lrw.h> +#include <xfs_buf.h> + +/* + * Feature macros (disable/enable) + */ +#undef HAVE_REFCACHE /* reference cache not needed for NFS in 2.6 */ +#define HAVE_SENDFILE /* sendfile(2) exists in 2.6, but not in 2.4 */ + +/* + * State flag for unwritten extent buffers. + * + * We need to be able to distinguish between these and delayed + * allocate buffers within XFS. The generic IO path code does + * not need to distinguish - we use the BH_Delay flag for both + * delalloc and these ondisk-uninitialised buffers. + */ +BUFFER_FNS(PrivateStart, unwritten); +static inline void set_buffer_unwritten_io(struct buffer_head *bh) +{ + bh->b_end_io = linvfs_unwritten_done; +} + +#define restricted_chown xfs_params.restrict_chown.val +#define irix_sgid_inherit xfs_params.sgid_inherit.val +#define irix_symlink_mode xfs_params.symlink_mode.val +#define xfs_panic_mask xfs_params.panic_mask.val +#define xfs_error_level xfs_params.error_level.val +#define xfs_syncd_centisecs xfs_params.syncd_timer.val +#define xfs_stats_clear xfs_params.stats_clear.val +#define xfs_inherit_sync xfs_params.inherit_sync.val +#define xfs_inherit_nodump xfs_params.inherit_nodump.val +#define xfs_inherit_noatime xfs_params.inherit_noatim.val +#define xfs_buf_timer_centisecs xfs_params.xfs_buf_timer.val +#define xfs_buf_age_centisecs xfs_params.xfs_buf_age.val +#define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val +#define xfs_rotorstep xfs_params.rotorstep.val + +#ifndef __smp_processor_id +#define __smp_processor_id() smp_processor_id() +#endif +#define current_cpu() __smp_processor_id() +#define current_pid() (current->pid) +#define current_fsuid(cred) (current->fsuid) +#define current_fsgid(cred) (current->fsgid) + +#define NBPP PAGE_SIZE +#define DPPSHFT (PAGE_SHIFT - 9) +#define NDPP (1 << (PAGE_SHIFT - 9)) +#define dtop(DD) (((DD) + NDPP - 1) >> DPPSHFT) +#define dtopt(DD) ((DD) >> DPPSHFT) +#define dpoff(DD) ((DD) & (NDPP-1)) + +#define NBBY 8 /* number of bits per byte */ +#define NBPC PAGE_SIZE /* Number of bytes per click */ +#define BPCSHIFT PAGE_SHIFT /* LOG2(NBPC) if exact */ + +/* + * Size of block device i/o is parameterized here. + * Currently the system supports page-sized i/o. + */ +#define BLKDEV_IOSHIFT BPCSHIFT +#define BLKDEV_IOSIZE (1<<BLKDEV_IOSHIFT) +/* number of BB's per block device block */ +#define BLKDEV_BB BTOBB(BLKDEV_IOSIZE) + +/* bytes to clicks */ +#define btoc(x) (((__psunsigned_t)(x)+(NBPC-1))>>BPCSHIFT) +#define btoct(x) ((__psunsigned_t)(x)>>BPCSHIFT) +#define btoc64(x) (((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT) +#define btoct64(x) ((__uint64_t)(x)>>BPCSHIFT) +#define io_btoc(x) (((__psunsigned_t)(x)+(IO_NBPC-1))>>IO_BPCSHIFT) +#define io_btoct(x) ((__psunsigned_t)(x)>>IO_BPCSHIFT) + +/* off_t bytes to clicks */ +#define offtoc(x) (((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT) +#define offtoct(x) ((xfs_off_t)(x)>>BPCSHIFT) + +/* clicks to off_t bytes */ +#define ctooff(x) ((xfs_off_t)(x)<<BPCSHIFT) + +/* clicks to bytes */ +#define ctob(x) ((__psunsigned_t)(x)<<BPCSHIFT) +#define btoct(x) ((__psunsigned_t)(x)>>BPCSHIFT) +#define ctob64(x) ((__uint64_t)(x)<<BPCSHIFT) +#define io_ctob(x) ((__psunsigned_t)(x)<<IO_BPCSHIFT) + +/* bytes to clicks */ +#define btoc(x) (((__psunsigned_t)(x)+(NBPC-1))>>BPCSHIFT) + +#ifndef CELL_CAPABLE +#define FSC_NOTIFY_NAME_CHANGED(vp) +#endif + +#ifndef ENOATTR +#define ENOATTR ENODATA /* Attribute not found */ +#endif + +/* Note: EWRONGFS never visible outside the kernel */ +#define EWRONGFS EINVAL /* Mount with wrong filesystem type */ + +/* + * XXX EFSCORRUPTED needs a real value in errno.h. asm-i386/errno.h won't + * return codes out of its known range in errno. + * XXX Also note: needs to be < 1000 and fairly unique on Linux (mustn't + * conflict with any code we use already or any code a driver may use) + * XXX Some options (currently we do #2): + * 1/ New error code ["Filesystem is corrupted", _after_ glibc updated] + * 2/ 990 ["Unknown error 990"] + * 3/ EUCLEAN ["Structure needs cleaning"] + * 4/ Convert EFSCORRUPTED to EIO [just prior to return into userspace] + */ +#define EFSCORRUPTED 990 /* Filesystem is corrupted */ + +#define SYNCHRONIZE() barrier() +#define __return_address __builtin_return_address(0) + +/* + * IRIX (BSD) quotactl makes use of separate commands for user/group, + * whereas on Linux the syscall encodes this information into the cmd + * field (see the QCMD macro in quota.h). These macros help keep the + * code portable - they are not visible from the syscall interface. + */ +#define Q_XSETGQLIM XQM_CMD(0x8) /* set groups disk limits */ +#define Q_XGETGQUOTA XQM_CMD(0x9) /* get groups disk limits */ + +/* IRIX uses a dynamic sizing algorithm (ndquot = 200 + numprocs*2) */ +/* we may well need to fine-tune this if it ever becomes an issue. */ +#define DQUOT_MAX_HEURISTIC 1024 /* NR_DQUOTS */ +#define ndquot DQUOT_MAX_HEURISTIC + +/* IRIX uses the current size of the name cache to guess a good value */ +/* - this isn't the same but is a good enough starting point for now. */ +#define DQUOT_HASH_HEURISTIC files_stat.nr_files + +/* IRIX inodes maintain the project ID also, zero this field on Linux */ +#define DEFAULT_PROJID 0 +#define dfltprid DEFAULT_PROJID + +#define MAXPATHLEN 1024 + +#define MIN(a,b) (min(a,b)) +#define MAX(a,b) (max(a,b)) +#define howmany(x, y) (((x)+((y)-1))/(y)) +#define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) + +#define xfs_stack_trace() dump_stack() + +#define xfs_itruncate_data(ip, off) \ + (-vmtruncate(LINVFS_GET_IP(XFS_ITOV(ip)), (off))) + + +/* Move the kernel do_div definition off to one side */ + +#if defined __i386__ +/* For ia32 we need to pull some tricks to get past various versions + * of the compiler which do not like us using do_div in the middle + * of large functions. + */ +static inline __u32 xfs_do_div(void *a, __u32 b, int n) +{ + __u32 mod; + + switch (n) { + case 4: + mod = *(__u32 *)a % b; + *(__u32 *)a = *(__u32 *)a / b; + return mod; + case 8: + { + unsigned long __upper, __low, __high, __mod; + __u64 c = *(__u64 *)a; + __upper = __high = c >> 32; + __low = c; + if (__high) { + __upper = __high % (b); + __high = __high / (b); + } + asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper)); + asm("":"=A" (c):"a" (__low),"d" (__high)); + *(__u64 *)a = c; + return __mod; + } + } + + /* NOTREACHED */ + return 0; +} + +/* Side effect free 64 bit mod operation */ +static inline __u32 xfs_do_mod(void *a, __u32 b, int n) +{ + switch (n) { + case 4: + return *(__u32 *)a % b; + case 8: + { + unsigned long __upper, __low, __high, __mod; + __u64 c = *(__u64 *)a; + __upper = __high = c >> 32; + __low = c; + if (__high) { + __upper = __high % (b); + __high = __high / (b); + } + asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper)); + asm("":"=A" (c):"a" (__low),"d" (__high)); + return __mod; + } + } + + /* NOTREACHED */ + return 0; +} +#else +static inline __u32 xfs_do_div(void *a, __u32 b, int n) +{ + __u32 mod; + + switch (n) { + case 4: + mod = *(__u32 *)a % b; + *(__u32 *)a = *(__u32 *)a / b; + return mod; + case 8: + mod = do_div(*(__u64 *)a, b); + return mod; + } + + /* NOTREACHED */ + return 0; +} + +/* Side effect free 64 bit mod operation */ +static inline __u32 xfs_do_mod(void *a, __u32 b, int n) +{ + switch (n) { + case 4: + return *(__u32 *)a % b; + case 8: + { + __u64 c = *(__u64 *)a; + return do_div(c, b); + } + } + + /* NOTREACHED */ + return 0; +} +#endif + +#undef do_div +#define do_div(a, b) xfs_do_div(&(a), (b), sizeof(a)) +#define do_mod(a, b) xfs_do_mod(&(a), (b), sizeof(a)) + +static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y) +{ + x += y - 1; + do_div(x, y); + return(x * y); +} + +#define qsort(a, n, s, cmp) sort(a, n, s, cmp, NULL) + +#endif /* __XFS_LINUX__ */ diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c new file mode 100644 index 000000000000..ff145fd0d1a4 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -0,0 +1,1082 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +/* + * fs/xfs/linux/xfs_lrw.c (Linux Read Write stuff) + * + */ + +#include "xfs.h" + +#include "xfs_fs.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_alloc.h" +#include "xfs_dmapi.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bit.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_itable.h" +#include "xfs_rw.h" +#include "xfs_acl.h" +#include "xfs_cap.h" +#include "xfs_mac.h" +#include "xfs_attr.h" +#include "xfs_inode_item.h" +#include "xfs_buf_item.h" +#include "xfs_utils.h" +#include "xfs_iomap.h" + +#include <linux/capability.h> +#include <linux/writeback.h> + + +#if defined(XFS_RW_TRACE) +void +xfs_rw_enter_trace( + int tag, + xfs_iocore_t *io, + void *data, + size_t segs, + loff_t offset, + int ioflags) +{ + xfs_inode_t *ip = XFS_IO_INODE(io); + + if (ip->i_rwtrace == NULL) + return; + ktrace_enter(ip->i_rwtrace, + (void *)(unsigned long)tag, + (void *)ip, + (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)), + (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)), + (void *)data, + (void *)((unsigned long)segs), + (void *)((unsigned long)((offset >> 32) & 0xffffffff)), + (void *)((unsigned long)(offset & 0xffffffff)), + (void *)((unsigned long)ioflags), + (void *)((unsigned long)((io->io_new_size >> 32) & 0xffffffff)), + (void *)((unsigned long)(io->io_new_size & 0xffffffff)), + (void *)NULL, + (void *)NULL, + (void *)NULL, + (void *)NULL, + (void *)NULL); +} + +void +xfs_inval_cached_trace( + xfs_iocore_t *io, + xfs_off_t offset, + xfs_off_t len, + xfs_off_t first, + xfs_off_t last) +{ + xfs_inode_t *ip = XFS_IO_INODE(io); + + if (ip->i_rwtrace == NULL) + return; + ktrace_enter(ip->i_rwtrace, + (void *)(__psint_t)XFS_INVAL_CACHED, + (void *)ip, + (void *)((unsigned long)((offset >> 32) & 0xffffffff)), + (void *)((unsigned long)(offset & 0xffffffff)), + (void *)((unsigned long)((len >> 32) & 0xffffffff)), + (void *)((unsigned long)(len & 0xffffffff)), + (void *)((unsigned long)((first >> 32) & 0xffffffff)), + (void *)((unsigned long)(first & 0xffffffff)), + (void *)((unsigned long)((last >> 32) & 0xffffffff)), + (void *)((unsigned long)(last & 0xffffffff)), + (void *)NULL, + (void *)NULL, + (void *)NULL, + (void *)NULL, + (void *)NULL, + (void *)NULL); +} +#endif + +/* + * xfs_iozero + * + * xfs_iozero clears the specified range of buffer supplied, + * and marks all the affected blocks as valid and modified. If + * an affected block is not allocated, it will be allocated. If + * an affected block is not completely overwritten, and is not + * valid before the operation, it will be read from disk before + * being partially zeroed. + */ +STATIC int +xfs_iozero( + struct inode *ip, /* inode */ + loff_t pos, /* offset in file */ + size_t count, /* size of data to zero */ + loff_t end_size) /* max file size to set */ +{ + unsigned bytes; + struct page *page; + struct address_space *mapping; + char *kaddr; + int status; + + mapping = ip->i_mapping; + do { + unsigned long index, offset; + + offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ + index = pos >> PAGE_CACHE_SHIFT; + bytes = PAGE_CACHE_SIZE - offset; + if (bytes > count) + bytes = count; + + status = -ENOMEM; + page = grab_cache_page(mapping, index); + if (!page) + break; + + kaddr = kmap(page); + status = mapping->a_ops->prepare_write(NULL, page, offset, + offset + bytes); + if (status) { + goto unlock; + } + + memset((void *) (kaddr + offset), 0, bytes); + flush_dcache_page(page); + status = mapping->a_ops->commit_write(NULL, page, offset, + offset + bytes); + if (!status) { + pos += bytes; + count -= bytes; + if (pos > i_size_read(ip)) + i_size_write(ip, pos < end_size ? pos : end_size); + } + +unlock: + kunmap(page); + unlock_page(page); + page_cache_release(page); + if (status) + break; + } while (count); + + return (-status); +} + +/* + * xfs_inval_cached_pages + * + * This routine is responsible for keeping direct I/O and buffered I/O + * somewhat coherent. From here we make sure that we're at least + * temporarily holding the inode I/O lock exclusively and then call + * the page cache to flush and invalidate any cached pages. If there + * are no cached pages this routine will be very quick. + */ +void +xfs_inval_cached_pages( + vnode_t *vp, + xfs_iocore_t *io, + xfs_off_t offset, + int write, + int relock) +{ + if (VN_CACHED(vp)) { + xfs_inval_cached_trace(io, offset, -1, ctooff(offtoct(offset)), -1); + VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(offset)), -1, FI_REMAPF_LOCKED); + } + +} + +ssize_t /* bytes read, or (-) error */ +xfs_read( + bhv_desc_t *bdp, + struct kiocb *iocb, + const struct iovec *iovp, + unsigned int segs, + loff_t *offset, + int ioflags, + cred_t *credp) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + size_t size = 0; + ssize_t ret; + xfs_fsize_t n; + xfs_inode_t *ip; + xfs_mount_t *mp; + vnode_t *vp; + unsigned long seg; + + ip = XFS_BHVTOI(bdp); + vp = BHV_TO_VNODE(bdp); + mp = ip->i_mount; + + XFS_STATS_INC(xs_read_calls); + + /* START copy & waste from filemap.c */ + for (seg = 0; seg < segs; seg++) { + const struct iovec *iv = &iovp[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. + */ + size += iv->iov_len; + if (unlikely((ssize_t)(size|iv->iov_len) < 0)) + return XFS_ERROR(-EINVAL); + } + /* END copy & waste from filemap.c */ + + if (unlikely(ioflags & IO_ISDIRECT)) { + xfs_buftarg_t *target = + (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + if ((*offset & target->pbr_smask) || + (size & target->pbr_smask)) { + if (*offset == ip->i_d.di_size) { + return (0); + } + return -XFS_ERROR(EINVAL); + } + } + + n = XFS_MAXIOFFSET(mp) - *offset; + if ((n <= 0) || (size == 0)) + return 0; + + if (n < size) + size = n; + + if (XFS_FORCED_SHUTDOWN(mp)) { + return -EIO; + } + + if (unlikely(ioflags & IO_ISDIRECT)) + down(&inode->i_sem); + xfs_ilock(ip, XFS_IOLOCK_SHARED); + + if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) && + !(ioflags & IO_INVIS)) { + vrwlock_t locktype = VRWLOCK_READ; + + ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, + BHV_TO_VNODE(bdp), *offset, size, + FILP_DELAY_FLAG(file), &locktype); + if (ret) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + goto unlock_isem; + } + } + + xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore, + (void *)iovp, segs, *offset, ioflags); + ret = __generic_file_aio_read(iocb, iovp, segs, offset); + if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO)) + ret = wait_on_sync_kiocb(iocb); + if (ret > 0) + XFS_STATS_ADD(xs_read_bytes, ret); + + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + + if (likely(!(ioflags & IO_INVIS))) + xfs_ichgtime(ip, XFS_ICHGTIME_ACC); + +unlock_isem: + if (unlikely(ioflags & IO_ISDIRECT)) + up(&inode->i_sem); + return ret; +} + +ssize_t +xfs_sendfile( + bhv_desc_t *bdp, + struct file *filp, + loff_t *offset, + int ioflags, + size_t count, + read_actor_t actor, + void *target, + cred_t *credp) +{ + ssize_t ret; + xfs_fsize_t n; + xfs_inode_t *ip; + xfs_mount_t *mp; + vnode_t *vp; + + ip = XFS_BHVTOI(bdp); + vp = BHV_TO_VNODE(bdp); + mp = ip->i_mount; + + XFS_STATS_INC(xs_read_calls); + + n = XFS_MAXIOFFSET(mp) - *offset; + if ((n <= 0) || (count == 0)) + return 0; + + if (n < count) + count = n; + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + xfs_ilock(ip, XFS_IOLOCK_SHARED); + + if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) && + (!(ioflags & IO_INVIS))) { + vrwlock_t locktype = VRWLOCK_READ; + int error; + + error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), *offset, count, + FILP_DELAY_FLAG(filp), &locktype); + if (error) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return -error; + } + } + xfs_rw_enter_trace(XFS_SENDFILE_ENTER, &ip->i_iocore, + (void *)(unsigned long)target, count, *offset, ioflags); + ret = generic_file_sendfile(filp, offset, count, actor, target); + + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + + if (ret > 0) + XFS_STATS_ADD(xs_read_bytes, ret); + + if (likely(!(ioflags & IO_INVIS))) + xfs_ichgtime(ip, XFS_ICHGTIME_ACC); + + return ret; +} + +/* + * This routine is called to handle zeroing any space in the last + * block of the file that is beyond the EOF. We do this since the + * size is being increased without writing anything to that block + * and we don't want anyone to read the garbage on the disk. + */ +STATIC int /* error (positive) */ +xfs_zero_last_block( + struct inode *ip, + xfs_iocore_t *io, + xfs_off_t offset, + xfs_fsize_t isize, + xfs_fsize_t end_size) +{ + xfs_fileoff_t last_fsb; + xfs_mount_t *mp; + int nimaps; + int zero_offset; + int zero_len; + int isize_fsb_offset; + int error = 0; + xfs_bmbt_irec_t imap; + loff_t loff; + size_t lsize; + + ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0); + ASSERT(offset > isize); + + mp = io->io_mount; + + isize_fsb_offset = XFS_B_FSB_OFFSET(mp, isize); + if (isize_fsb_offset == 0) { + /* + * There are no extra bytes in the last block on disk to + * zero, so return. + */ + return 0; + } + + last_fsb = XFS_B_TO_FSBT(mp, isize); + nimaps = 1; + error = XFS_BMAPI(mp, NULL, io, last_fsb, 1, 0, NULL, 0, &imap, + &nimaps, NULL); + if (error) { + return error; + } + ASSERT(nimaps > 0); + /* + * If the block underlying isize is just a hole, then there + * is nothing to zero. + */ + if (imap.br_startblock == HOLESTARTBLOCK) { + return 0; + } + /* + * Zero the part of the last block beyond the EOF, and write it + * out sync. We need to drop the ilock while we do this so we + * don't deadlock when the buffer cache calls back to us. + */ + XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL| XFS_EXTSIZE_RD); + loff = XFS_FSB_TO_B(mp, last_fsb); + lsize = XFS_FSB_TO_B(mp, 1); + + zero_offset = isize_fsb_offset; + zero_len = mp->m_sb.sb_blocksize - isize_fsb_offset; + + error = xfs_iozero(ip, loff + zero_offset, zero_len, end_size); + + XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); + ASSERT(error >= 0); + return error; +} + +/* + * Zero any on disk space between the current EOF and the new, + * larger EOF. This handles the normal case of zeroing the remainder + * of the last block in the file and the unusual case of zeroing blocks + * out beyond the size of the file. This second case only happens + * with fixed size extents and when the system crashes before the inode + * size was updated but after blocks were allocated. If fill is set, + * then any holes in the range are filled and zeroed. If not, the holes + * are left alone as holes. + */ + +int /* error (positive) */ +xfs_zero_eof( + vnode_t *vp, + xfs_iocore_t *io, + xfs_off_t offset, /* starting I/O offset */ + xfs_fsize_t isize, /* current inode size */ + xfs_fsize_t end_size) /* terminal inode size */ +{ + struct inode *ip = LINVFS_GET_IP(vp); + xfs_fileoff_t start_zero_fsb; + xfs_fileoff_t end_zero_fsb; + xfs_fileoff_t prev_zero_fsb; + xfs_fileoff_t zero_count_fsb; + xfs_fileoff_t last_fsb; + xfs_extlen_t buf_len_fsb; + xfs_extlen_t prev_zero_count; + xfs_mount_t *mp; + int nimaps; + int error = 0; + xfs_bmbt_irec_t imap; + loff_t loff; + size_t lsize; + + ASSERT(ismrlocked(io->io_lock, MR_UPDATE)); + ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); + + mp = io->io_mount; + + /* + * First handle zeroing the block on which isize resides. + * We only zero a part of that block so it is handled specially. + */ + error = xfs_zero_last_block(ip, io, offset, isize, end_size); + if (error) { + ASSERT(ismrlocked(io->io_lock, MR_UPDATE)); + ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); + return error; + } + + /* + * Calculate the range between the new size and the old + * where blocks needing to be zeroed may exist. To get the + * block where the last byte in the file currently resides, + * we need to subtract one from the size and truncate back + * to a block boundary. We subtract 1 in case the size is + * exactly on a block boundary. + */ + last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1; + start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); + end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1); + ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb); + if (last_fsb == end_zero_fsb) { + /* + * The size was only incremented on its last block. + * We took care of that above, so just return. + */ + return 0; + } + + ASSERT(start_zero_fsb <= end_zero_fsb); + prev_zero_fsb = NULLFILEOFF; + prev_zero_count = 0; + while (start_zero_fsb <= end_zero_fsb) { + nimaps = 1; + zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; + error = XFS_BMAPI(mp, NULL, io, start_zero_fsb, zero_count_fsb, + 0, NULL, 0, &imap, &nimaps, NULL); + if (error) { + ASSERT(ismrlocked(io->io_lock, MR_UPDATE)); + ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); + return error; + } + ASSERT(nimaps > 0); + + if (imap.br_state == XFS_EXT_UNWRITTEN || + imap.br_startblock == HOLESTARTBLOCK) { + /* + * This loop handles initializing pages that were + * partially initialized by the code below this + * loop. It basically zeroes the part of the page + * that sits on a hole and sets the page as P_HOLE + * and calls remapf if it is a mapped file. + */ + prev_zero_fsb = NULLFILEOFF; + prev_zero_count = 0; + start_zero_fsb = imap.br_startoff + + imap.br_blockcount; + ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); + continue; + } + + /* + * There are blocks in the range requested. + * Zero them a single write at a time. We actually + * don't zero the entire range returned if it is + * too big and simply loop around to get the rest. + * That is not the most efficient thing to do, but it + * is simple and this path should not be exercised often. + */ + buf_len_fsb = XFS_FILBLKS_MIN(imap.br_blockcount, + mp->m_writeio_blocks << 8); + /* + * Drop the inode lock while we're doing the I/O. + * We'll still have the iolock to protect us. + */ + XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); + + loff = XFS_FSB_TO_B(mp, start_zero_fsb); + lsize = XFS_FSB_TO_B(mp, buf_len_fsb); + + error = xfs_iozero(ip, loff, lsize, end_size); + + if (error) { + goto out_lock; + } + + prev_zero_fsb = start_zero_fsb; + prev_zero_count = buf_len_fsb; + start_zero_fsb = imap.br_startoff + buf_len_fsb; + ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); + + XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); + } + + return 0; + +out_lock: + + XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); + ASSERT(error >= 0); + return error; +} + +ssize_t /* bytes written, or (-) error */ +xfs_write( + bhv_desc_t *bdp, + struct kiocb *iocb, + const struct iovec *iovp, + unsigned int nsegs, + loff_t *offset, + int ioflags, + cred_t *credp) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + unsigned long segs = nsegs; + xfs_inode_t *xip; + xfs_mount_t *mp; + ssize_t ret = 0, error = 0; + xfs_fsize_t isize, new_size; + xfs_iocore_t *io; + vnode_t *vp; + unsigned long seg; + int iolock; + int eventsent = 0; + vrwlock_t locktype; + size_t ocount = 0, count; + loff_t pos; + int need_isem = 1, need_flush = 0; + + XFS_STATS_INC(xs_write_calls); + + vp = BHV_TO_VNODE(bdp); + xip = XFS_BHVTOI(bdp); + + for (seg = 0; seg < segs; seg++) { + const struct iovec *iv = &iovp[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. + */ + ocount += iv->iov_len; + if (unlikely((ssize_t)(ocount|iv->iov_len) < 0)) + return -EINVAL; + if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) + continue; + if (seg == 0) + return -EFAULT; + segs = seg; + ocount -= iv->iov_len; /* This segment is no good */ + break; + } + + count = ocount; + pos = *offset; + + if (count == 0) + return 0; + + io = &xip->i_iocore; + mp = io->io_mount; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + fs_check_frozen(vp->v_vfsp, SB_FREEZE_WRITE); + + if (ioflags & IO_ISDIRECT) { + xfs_buftarg_t *target = + (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + + if ((pos & target->pbr_smask) || (count & target->pbr_smask)) + return XFS_ERROR(-EINVAL); + + if (!VN_CACHED(vp) && pos < i_size_read(inode)) + need_isem = 0; + + if (VN_CACHED(vp)) + need_flush = 1; + } + +relock: + if (need_isem) { + iolock = XFS_IOLOCK_EXCL; + locktype = VRWLOCK_WRITE; + + down(&inode->i_sem); + } else { + iolock = XFS_IOLOCK_SHARED; + locktype = VRWLOCK_WRITE_DIRECT; + } + + xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); + + isize = i_size_read(inode); + + if (file->f_flags & O_APPEND) + *offset = isize; + +start: + error = -generic_write_checks(file, &pos, &count, + S_ISBLK(inode->i_mode)); + if (error) { + xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); + goto out_unlock_isem; + } + + new_size = pos + count; + if (new_size > isize) + io->io_new_size = new_size; + + if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) && + !(ioflags & IO_INVIS) && !eventsent)) { + loff_t savedsize = pos; + int dmflags = FILP_DELAY_FLAG(file); + + if (need_isem) + dmflags |= DM_FLAGS_ISEM; + + xfs_iunlock(xip, XFS_ILOCK_EXCL); + error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp, + pos, count, + dmflags, &locktype); + if (error) { + xfs_iunlock(xip, iolock); + goto out_unlock_isem; + } + xfs_ilock(xip, XFS_ILOCK_EXCL); + eventsent = 1; + + /* + * The iolock was dropped and reaquired in XFS_SEND_DATA + * so we have to recheck the size when appending. + * We will only "goto start;" once, since having sent the + * event prevents another call to XFS_SEND_DATA, which is + * what allows the size to change in the first place. + */ + if ((file->f_flags & O_APPEND) && savedsize != isize) { + pos = isize = xip->i_d.di_size; + goto start; + } + } + + /* + * On Linux, generic_file_write updates the times even if + * no data is copied in so long as the write had a size. + * + * We must update xfs' times since revalidate will overcopy xfs. + */ + if (!(ioflags & IO_INVIS)) { + xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + inode_update_time(inode, 1); + } + + /* + * If the offset is beyond the size of the file, we have a couple + * of things to do. First, if there is already space allocated + * we need to either create holes or zero the disk or ... + * + * If there is a page where the previous size lands, we need + * to zero it out up to the new size. + */ + + if (pos > isize) { + error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos, + isize, pos + count); + if (error) { + xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); + goto out_unlock_isem; + } + } + xfs_iunlock(xip, XFS_ILOCK_EXCL); + + /* + * If we're writing the file then make sure to clear the + * setuid and setgid bits if the process is not being run + * by root. This keeps people from modifying setuid and + * setgid binaries. + */ + + if (((xip->i_d.di_mode & S_ISUID) || + ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) == + (S_ISGID | S_IXGRP))) && + !capable(CAP_FSETID)) { + error = xfs_write_clear_setuid(xip); + if (likely(!error)) + error = -remove_suid(file->f_dentry); + if (unlikely(error)) { + xfs_iunlock(xip, iolock); + goto out_unlock_isem; + } + } + +retry: + /* We can write back this queue in page reclaim */ + current->backing_dev_info = mapping->backing_dev_info; + + if ((ioflags & IO_ISDIRECT)) { + if (need_flush) { + xfs_inval_cached_trace(io, pos, -1, + ctooff(offtoct(pos)), -1); + VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(pos)), + -1, FI_REMAPF_LOCKED); + } + + if (need_isem) { + /* demote the lock now the cached pages are gone */ + XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL); + up(&inode->i_sem); + + iolock = XFS_IOLOCK_SHARED; + locktype = VRWLOCK_WRITE_DIRECT; + need_isem = 0; + } + + xfs_rw_enter_trace(XFS_DIOWR_ENTER, io, (void *)iovp, segs, + *offset, ioflags); + ret = generic_file_direct_write(iocb, iovp, + &segs, pos, offset, count, ocount); + + /* + * direct-io write to a hole: fall through to buffered I/O + * for completing the rest of the request. + */ + if (ret >= 0 && ret != count) { + XFS_STATS_ADD(xs_write_bytes, ret); + + pos += ret; + count -= ret; + + need_isem = 1; + ioflags &= ~IO_ISDIRECT; + xfs_iunlock(xip, iolock); + goto relock; + } + } else { + xfs_rw_enter_trace(XFS_WRITE_ENTER, io, (void *)iovp, segs, + *offset, ioflags); + ret = generic_file_buffered_write(iocb, iovp, segs, + pos, offset, count, ret); + } + + current->backing_dev_info = NULL; + + if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO)) + ret = wait_on_sync_kiocb(iocb); + + if ((ret == -ENOSPC) && + DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) && + !(ioflags & IO_INVIS)) { + + xfs_rwunlock(bdp, locktype); + error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp, + DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL, + 0, 0, 0); /* Delay flag intentionally unused */ + if (error) + goto out_unlock_isem; + xfs_rwlock(bdp, locktype); + pos = xip->i_d.di_size; + ret = 0; + goto retry; + } + + if (*offset > xip->i_d.di_size) { + xfs_ilock(xip, XFS_ILOCK_EXCL); + if (*offset > xip->i_d.di_size) { + xip->i_d.di_size = *offset; + i_size_write(inode, *offset); + xip->i_update_core = 1; + xip->i_update_size = 1; + } + xfs_iunlock(xip, XFS_ILOCK_EXCL); + } + + error = -ret; + if (ret <= 0) + goto out_unlock_internal; + + XFS_STATS_ADD(xs_write_bytes, ret); + + /* Handle various SYNC-type writes */ + if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { + /* + * If we're treating this as O_DSYNC and we have not updated the + * size, force the log. + */ + if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) && + !(xip->i_update_size)) { + xfs_inode_log_item_t *iip = xip->i_itemp; + + /* + * If an allocation transaction occurred + * without extending the size, then we have to force + * the log up the proper point to ensure that the + * allocation is permanent. We can't count on + * the fact that buffered writes lock out direct I/O + * writes - the direct I/O write could have extended + * the size nontransactionally, then finished before + * we started. xfs_write_file will think that the file + * didn't grow but the update isn't safe unless the + * size change is logged. + * + * Force the log if we've committed a transaction + * against the inode or if someone else has and + * the commit record hasn't gone to disk (e.g. + * the inode is pinned). This guarantees that + * all changes affecting the inode are permanent + * when we return. + */ + if (iip && iip->ili_last_lsn) { + xfs_log_force(mp, iip->ili_last_lsn, + XFS_LOG_FORCE | XFS_LOG_SYNC); + } else if (xfs_ipincount(xip) > 0) { + xfs_log_force(mp, (xfs_lsn_t)0, + XFS_LOG_FORCE | XFS_LOG_SYNC); + } + + } else { + xfs_trans_t *tp; + + /* + * O_SYNC or O_DSYNC _with_ a size update are handled + * the same way. + * + * If the write was synchronous then we need to make + * sure that the inode modification time is permanent. + * We'll have updated the timestamp above, so here + * we use a synchronous transaction to log the inode. + * It's not fast, but it's necessary. + * + * If this a dsync write and the size got changed + * non-transactionally, then we need to ensure that + * the size change gets logged in a synchronous + * transaction. + */ + + tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC); + if ((error = xfs_trans_reserve(tp, 0, + XFS_SWRITE_LOG_RES(mp), + 0, 0, 0))) { + /* Transaction reserve failed */ + xfs_trans_cancel(tp, 0); + } else { + /* Transaction reserve successful */ + xfs_ilock(xip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL); + xfs_trans_ihold(tp, xip); + xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE); + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0, NULL); + xfs_iunlock(xip, XFS_ILOCK_EXCL); + } + if (error) + goto out_unlock_internal; + } + + xfs_rwunlock(bdp, locktype); + if (need_isem) + up(&inode->i_sem); + + error = sync_page_range(inode, mapping, pos, ret); + if (!error) + error = ret; + return error; + } + + out_unlock_internal: + xfs_rwunlock(bdp, locktype); + out_unlock_isem: + if (need_isem) + up(&inode->i_sem); + return -error; +} + +/* + * All xfs metadata buffers except log state machine buffers + * get this attached as their b_bdstrat callback function. + * This is so that we can catch a buffer + * after prematurely unpinning it to forcibly shutdown the filesystem. + */ +int +xfs_bdstrat_cb(struct xfs_buf *bp) +{ + xfs_mount_t *mp; + + mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *); + if (!XFS_FORCED_SHUTDOWN(mp)) { + pagebuf_iorequest(bp); + return 0; + } else { + xfs_buftrace("XFS__BDSTRAT IOERROR", bp); + /* + * Metadata write that didn't get logged but + * written delayed anyway. These aren't associated + * with a transaction, and can be ignored. + */ + if (XFS_BUF_IODONE_FUNC(bp) == NULL && + (XFS_BUF_ISREAD(bp)) == 0) + return (xfs_bioerror_relse(bp)); + else + return (xfs_bioerror(bp)); + } +} + + +int +xfs_bmap(bhv_desc_t *bdp, + xfs_off_t offset, + ssize_t count, + int flags, + xfs_iomap_t *iomapp, + int *niomaps) +{ + xfs_inode_t *ip = XFS_BHVTOI(bdp); + xfs_iocore_t *io = &ip->i_iocore; + + ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); + ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) == + ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0)); + + return xfs_iomap(io, offset, count, flags, iomapp, niomaps); +} + +/* + * Wrapper around bdstrat so that we can stop data + * from going to disk in case we are shutting down the filesystem. + * Typically user data goes thru this path; one of the exceptions + * is the superblock. + */ +int +xfsbdstrat( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + ASSERT(mp); + if (!XFS_FORCED_SHUTDOWN(mp)) { + /* Grio redirection would go here + * if (XFS_BUF_IS_GRIO(bp)) { + */ + + pagebuf_iorequest(bp); + return 0; + } + + xfs_buftrace("XFSBDSTRAT IOERROR", bp); + return (xfs_bioerror_relse(bp)); +} + +/* + * If the underlying (data/log/rt) device is readonly, there are some + * operations that cannot proceed. + */ +int +xfs_dev_is_read_only( + xfs_mount_t *mp, + char *message) +{ + if (xfs_readonly_buftarg(mp->m_ddev_targp) || + xfs_readonly_buftarg(mp->m_logdev_targp) || + (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { + cmn_err(CE_NOTE, + "XFS: %s required on read-only device.", message); + cmn_err(CE_NOTE, + "XFS: write access unavailable, cannot proceed."); + return EROFS; + } + return 0; +} diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h new file mode 100644 index 000000000000..d723e35254a0 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_lrw.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_LRW_H__ +#define __XFS_LRW_H__ + +struct vnode; +struct bhv_desc; +struct xfs_mount; +struct xfs_iocore; +struct xfs_inode; +struct xfs_bmbt_irec; +struct xfs_buf; +struct xfs_iomap; + +#if defined(XFS_RW_TRACE) +/* + * Defines for the trace mechanisms in xfs_lrw.c. + */ +#define XFS_RW_KTRACE_SIZE 128 + +#define XFS_READ_ENTER 1 +#define XFS_WRITE_ENTER 2 +#define XFS_IOMAP_READ_ENTER 3 +#define XFS_IOMAP_WRITE_ENTER 4 +#define XFS_IOMAP_READ_MAP 5 +#define XFS_IOMAP_WRITE_MAP 6 +#define XFS_IOMAP_WRITE_NOSPACE 7 +#define XFS_ITRUNC_START 8 +#define XFS_ITRUNC_FINISH1 9 +#define XFS_ITRUNC_FINISH2 10 +#define XFS_CTRUNC1 11 +#define XFS_CTRUNC2 12 +#define XFS_CTRUNC3 13 +#define XFS_CTRUNC4 14 +#define XFS_CTRUNC5 15 +#define XFS_CTRUNC6 16 +#define XFS_BUNMAPI 17 +#define XFS_INVAL_CACHED 18 +#define XFS_DIORD_ENTER 19 +#define XFS_DIOWR_ENTER 20 +#define XFS_SENDFILE_ENTER 21 +#define XFS_WRITEPAGE_ENTER 22 +#define XFS_RELEASEPAGE_ENTER 23 +#define XFS_IOMAP_ALLOC_ENTER 24 +#define XFS_IOMAP_ALLOC_MAP 25 +#define XFS_IOMAP_UNWRITTEN 26 +extern void xfs_rw_enter_trace(int, struct xfs_iocore *, + void *, size_t, loff_t, int); +extern void xfs_inval_cached_trace(struct xfs_iocore *, + xfs_off_t, xfs_off_t, xfs_off_t, xfs_off_t); +#else +#define xfs_rw_enter_trace(tag, io, data, size, offset, ioflags) +#define xfs_inval_cached_trace(io, offset, len, first, last) +#endif + +/* + * Maximum count of bmaps used by read and write paths. + */ +#define XFS_MAX_RW_NBMAPS 4 + +extern int xfs_bmap(struct bhv_desc *, xfs_off_t, ssize_t, int, + struct xfs_iomap *, int *); +extern int xfsbdstrat(struct xfs_mount *, struct xfs_buf *); +extern int xfs_bdstrat_cb(struct xfs_buf *); + +extern int xfs_zero_eof(struct vnode *, struct xfs_iocore *, xfs_off_t, + xfs_fsize_t, xfs_fsize_t); +extern void xfs_inval_cached_pages(struct vnode *, struct xfs_iocore *, + xfs_off_t, int, int); +extern ssize_t xfs_read(struct bhv_desc *, struct kiocb *, + const struct iovec *, unsigned int, + loff_t *, int, struct cred *); +extern ssize_t xfs_write(struct bhv_desc *, struct kiocb *, + const struct iovec *, unsigned int, + loff_t *, int, struct cred *); +extern ssize_t xfs_sendfile(struct bhv_desc *, struct file *, + loff_t *, int, size_t, read_actor_t, + void *, struct cred *); + +extern int xfs_dev_is_read_only(struct xfs_mount *, char *); + +#define XFS_FSB_TO_DB_IO(io,fsb) \ + (((io)->io_flags & XFS_IOCORE_RT) ? \ + XFS_FSB_TO_BB((io)->io_mount, (fsb)) : \ + XFS_FSB_TO_DADDR((io)->io_mount, (fsb))) + +#endif /* __XFS_LRW_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c new file mode 100644 index 000000000000..aaf5ddba47f3 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_stats.c @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include <linux/proc_fs.h> + +DEFINE_PER_CPU(struct xfsstats, xfsstats); + +STATIC int +xfs_read_xfsstats( + char *buffer, + char **start, + off_t offset, + int count, + int *eof, + void *data) +{ + int c, i, j, len, val; + __uint64_t xs_xstrat_bytes = 0; + __uint64_t xs_write_bytes = 0; + __uint64_t xs_read_bytes = 0; + + static struct xstats_entry { + char *desc; + int endpoint; + } xstats[] = { + { "extent_alloc", XFSSTAT_END_EXTENT_ALLOC }, + { "abt", XFSSTAT_END_ALLOC_BTREE }, + { "blk_map", XFSSTAT_END_BLOCK_MAPPING }, + { "bmbt", XFSSTAT_END_BLOCK_MAP_BTREE }, + { "dir", XFSSTAT_END_DIRECTORY_OPS }, + { "trans", XFSSTAT_END_TRANSACTIONS }, + { "ig", XFSSTAT_END_INODE_OPS }, + { "log", XFSSTAT_END_LOG_OPS }, + { "push_ail", XFSSTAT_END_TAIL_PUSHING }, + { "xstrat", XFSSTAT_END_WRITE_CONVERT }, + { "rw", XFSSTAT_END_READ_WRITE_OPS }, + { "attr", XFSSTAT_END_ATTRIBUTE_OPS }, + { "icluster", XFSSTAT_END_INODE_CLUSTER }, + { "vnodes", XFSSTAT_END_VNODE_OPS }, + { "buf", XFSSTAT_END_BUF }, + }; + + /* Loop over all stats groups */ + for (i=j=len = 0; i < sizeof(xstats)/sizeof(struct xstats_entry); i++) { + len += sprintf(buffer + len, xstats[i].desc); + /* inner loop does each group */ + while (j < xstats[i].endpoint) { + val = 0; + /* sum over all cpus */ + for (c = 0; c < NR_CPUS; c++) { + if (!cpu_possible(c)) continue; + val += *(((__u32*)&per_cpu(xfsstats, c) + j)); + } + len += sprintf(buffer + len, " %u", val); + j++; + } + buffer[len++] = '\n'; + } + /* extra precision counters */ + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_possible(i)) continue; + xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes; + xs_write_bytes += per_cpu(xfsstats, i).xs_write_bytes; + xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes; + } + + len += sprintf(buffer + len, "xpc %Lu %Lu %Lu\n", + xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); + len += sprintf(buffer + len, "debug %u\n", +#if defined(DEBUG) + 1); +#else + 0); +#endif + + if (offset >= len) { + *start = buffer; + *eof = 1; + return 0; + } + *start = buffer + offset; + if ((len -= offset) > count) + return count; + *eof = 1; + + return len; +} + +void +xfs_init_procfs(void) +{ + if (!proc_mkdir("fs/xfs", NULL)) + return; + create_proc_read_entry("fs/xfs/stat", 0, NULL, xfs_read_xfsstats, NULL); +} + +void +xfs_cleanup_procfs(void) +{ + remove_proc_entry("fs/xfs/stat", NULL); + remove_proc_entry("fs/xfs", NULL); +} diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/linux-2.6/xfs_stats.h new file mode 100644 index 000000000000..3f756a6c3eb0 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_stats.h @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_STATS_H__ +#define __XFS_STATS_H__ + + +#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF) + +#include <linux/percpu.h> + +/* + * XFS global statistics + */ +struct xfsstats { +# define XFSSTAT_END_EXTENT_ALLOC 4 + __uint32_t xs_allocx; + __uint32_t xs_allocb; + __uint32_t xs_freex; + __uint32_t xs_freeb; +# define XFSSTAT_END_ALLOC_BTREE (XFSSTAT_END_EXTENT_ALLOC+4) + __uint32_t xs_abt_lookup; + __uint32_t xs_abt_compare; + __uint32_t xs_abt_insrec; + __uint32_t xs_abt_delrec; +# define XFSSTAT_END_BLOCK_MAPPING (XFSSTAT_END_ALLOC_BTREE+7) + __uint32_t xs_blk_mapr; + __uint32_t xs_blk_mapw; + __uint32_t xs_blk_unmap; + __uint32_t xs_add_exlist; + __uint32_t xs_del_exlist; + __uint32_t xs_look_exlist; + __uint32_t xs_cmp_exlist; +# define XFSSTAT_END_BLOCK_MAP_BTREE (XFSSTAT_END_BLOCK_MAPPING+4) + __uint32_t xs_bmbt_lookup; + __uint32_t xs_bmbt_compare; + __uint32_t xs_bmbt_insrec; + __uint32_t xs_bmbt_delrec; +# define XFSSTAT_END_DIRECTORY_OPS (XFSSTAT_END_BLOCK_MAP_BTREE+4) + __uint32_t xs_dir_lookup; + __uint32_t xs_dir_create; + __uint32_t xs_dir_remove; + __uint32_t xs_dir_getdents; +# define XFSSTAT_END_TRANSACTIONS (XFSSTAT_END_DIRECTORY_OPS+3) + __uint32_t xs_trans_sync; + __uint32_t xs_trans_async; + __uint32_t xs_trans_empty; +# define XFSSTAT_END_INODE_OPS (XFSSTAT_END_TRANSACTIONS+7) + __uint32_t xs_ig_attempts; + __uint32_t xs_ig_found; + __uint32_t xs_ig_frecycle; + __uint32_t xs_ig_missed; + __uint32_t xs_ig_dup; + __uint32_t xs_ig_reclaims; + __uint32_t xs_ig_attrchg; +# define XFSSTAT_END_LOG_OPS (XFSSTAT_END_INODE_OPS+5) + __uint32_t xs_log_writes; + __uint32_t xs_log_blocks; + __uint32_t xs_log_noiclogs; + __uint32_t xs_log_force; + __uint32_t xs_log_force_sleep; +# define XFSSTAT_END_TAIL_PUSHING (XFSSTAT_END_LOG_OPS+10) + __uint32_t xs_try_logspace; + __uint32_t xs_sleep_logspace; + __uint32_t xs_push_ail; + __uint32_t xs_push_ail_success; + __uint32_t xs_push_ail_pushbuf; + __uint32_t xs_push_ail_pinned; + __uint32_t xs_push_ail_locked; + __uint32_t xs_push_ail_flushing; + __uint32_t xs_push_ail_restarts; + __uint32_t xs_push_ail_flush; +# define XFSSTAT_END_WRITE_CONVERT (XFSSTAT_END_TAIL_PUSHING+2) + __uint32_t xs_xstrat_quick; + __uint32_t xs_xstrat_split; +# define XFSSTAT_END_READ_WRITE_OPS (XFSSTAT_END_WRITE_CONVERT+2) + __uint32_t xs_write_calls; + __uint32_t xs_read_calls; +# define XFSSTAT_END_ATTRIBUTE_OPS (XFSSTAT_END_READ_WRITE_OPS+4) + __uint32_t xs_attr_get; + __uint32_t xs_attr_set; + __uint32_t xs_attr_remove; + __uint32_t xs_attr_list; +# define XFSSTAT_END_INODE_CLUSTER (XFSSTAT_END_ATTRIBUTE_OPS+3) + __uint32_t xs_iflush_count; + __uint32_t xs_icluster_flushcnt; + __uint32_t xs_icluster_flushinode; +# define XFSSTAT_END_VNODE_OPS (XFSSTAT_END_INODE_CLUSTER+8) + __uint32_t vn_active; /* # vnodes not on free lists */ + __uint32_t vn_alloc; /* # times vn_alloc called */ + __uint32_t vn_get; /* # times vn_get called */ + __uint32_t vn_hold; /* # times vn_hold called */ + __uint32_t vn_rele; /* # times vn_rele called */ + __uint32_t vn_reclaim; /* # times vn_reclaim called */ + __uint32_t vn_remove; /* # times vn_remove called */ + __uint32_t vn_free; /* # times vn_free called */ +#define XFSSTAT_END_BUF (XFSSTAT_END_VNODE_OPS+9) + __uint32_t pb_get; + __uint32_t pb_create; + __uint32_t pb_get_locked; + __uint32_t pb_get_locked_waited; + __uint32_t pb_busy_locked; + __uint32_t pb_miss_locked; + __uint32_t pb_page_retries; + __uint32_t pb_page_found; + __uint32_t pb_get_read; +/* Extra precision counters */ + __uint64_t xs_xstrat_bytes; + __uint64_t xs_write_bytes; + __uint64_t xs_read_bytes; +}; + +DECLARE_PER_CPU(struct xfsstats, xfsstats); + +/* + * We don't disable preempt, not too worried about poking the + * wrong CPU's stat for now (also aggregated before reporting). + */ +#define XFS_STATS_INC(v) (per_cpu(xfsstats, current_cpu()).v++) +#define XFS_STATS_DEC(v) (per_cpu(xfsstats, current_cpu()).v--) +#define XFS_STATS_ADD(v, inc) (per_cpu(xfsstats, current_cpu()).v += (inc)) + +extern void xfs_init_procfs(void); +extern void xfs_cleanup_procfs(void); + + +#else /* !CONFIG_PROC_FS */ + +# define XFS_STATS_INC(count) +# define XFS_STATS_DEC(count) +# define XFS_STATS_ADD(count, inc) + +static __inline void xfs_init_procfs(void) { }; +static __inline void xfs_cleanup_procfs(void) { }; + +#endif /* !CONFIG_PROC_FS */ + +#endif /* __XFS_STATS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c new file mode 100644 index 000000000000..53dc658cafa6 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -0,0 +1,912 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" + +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_clnt.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_alloc.h" +#include "xfs_dmapi.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_alloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_attr_sf.h" +#include "xfs_dir_sf.h" +#include "xfs_dir2_sf.h" +#include "xfs_dinode.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bit.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_itable.h" +#include "xfs_rw.h" +#include "xfs_acl.h" +#include "xfs_cap.h" +#include "xfs_mac.h" +#include "xfs_attr.h" +#include "xfs_buf_item.h" +#include "xfs_utils.h" +#include "xfs_version.h" +#include "xfs_ioctl32.h" + +#include <linux/namei.h> +#include <linux/init.h> +#include <linux/mount.h> +#include <linux/writeback.h> + +STATIC struct quotactl_ops linvfs_qops; +STATIC struct super_operations linvfs_sops; +STATIC kmem_zone_t *linvfs_inode_zone; + +STATIC struct xfs_mount_args * +xfs_args_allocate( + struct super_block *sb) +{ + struct xfs_mount_args *args; + + args = kmem_zalloc(sizeof(struct xfs_mount_args), KM_SLEEP); + args->logbufs = args->logbufsize = -1; + strncpy(args->fsname, sb->s_id, MAXNAMELEN); + + /* Copy the already-parsed mount(2) flags we're interested in */ + if (sb->s_flags & MS_NOATIME) + args->flags |= XFSMNT_NOATIME; + if (sb->s_flags & MS_DIRSYNC) + args->flags |= XFSMNT_DIRSYNC; + if (sb->s_flags & MS_SYNCHRONOUS) + args->flags |= XFSMNT_WSYNC; + + /* Default to 32 bit inodes on Linux all the time */ + args->flags |= XFSMNT_32BITINODES; + + return args; +} + +__uint64_t +xfs_max_file_offset( + unsigned int blockshift) +{ + unsigned int pagefactor = 1; + unsigned int bitshift = BITS_PER_LONG - 1; + + /* Figure out maximum filesize, on Linux this can depend on + * the filesystem blocksize (on 32 bit platforms). + * __block_prepare_write does this in an [unsigned] long... + * page->index << (PAGE_CACHE_SHIFT - bbits) + * So, for page sized blocks (4K on 32 bit platforms), + * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is + * (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) + * but for smaller blocksizes it is less (bbits = log2 bsize). + * Note1: get_block_t takes a long (implicit cast from above) + * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch + * can optionally convert the [unsigned] long from above into + * an [unsigned] long long. + */ + +#if BITS_PER_LONG == 32 +# if defined(CONFIG_LBD) + ASSERT(sizeof(sector_t) == 8); + pagefactor = PAGE_CACHE_SIZE; + bitshift = BITS_PER_LONG; +# else + pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift); +# endif +#endif + + return (((__uint64_t)pagefactor) << bitshift) - 1; +} + +STATIC __inline__ void +xfs_set_inodeops( + struct inode *inode) +{ + vnode_t *vp = LINVFS_GET_VP(inode); + + if (vp->v_type == VNON) { + vn_mark_bad(vp); + } else if (S_ISREG(inode->i_mode)) { + inode->i_op = &linvfs_file_inode_operations; + inode->i_fop = &linvfs_file_operations; + inode->i_mapping->a_ops = &linvfs_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &linvfs_dir_inode_operations; + inode->i_fop = &linvfs_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &linvfs_symlink_inode_operations; + if (inode->i_blocks) + inode->i_mapping->a_ops = &linvfs_aops; + } else { + inode->i_op = &linvfs_file_inode_operations; + init_special_inode(inode, inode->i_mode, inode->i_rdev); + } +} + +STATIC __inline__ void +xfs_revalidate_inode( + xfs_mount_t *mp, + vnode_t *vp, + xfs_inode_t *ip) +{ + struct inode *inode = LINVFS_GET_IP(vp); + + inode->i_mode = (ip->i_d.di_mode & MODEMASK) | VTTOIF(vp->v_type); + inode->i_nlink = ip->i_d.di_nlink; + inode->i_uid = ip->i_d.di_uid; + inode->i_gid = ip->i_d.di_gid; + if (((1 << vp->v_type) & ((1<<VBLK) | (1<<VCHR))) == 0) { + inode->i_rdev = 0; + } else { + xfs_dev_t dev = ip->i_df.if_u2.if_rdev; + inode->i_rdev = MKDEV(sysv_major(dev) & 0x1ff, sysv_minor(dev)); + } + inode->i_blksize = PAGE_CACHE_SIZE; + inode->i_generation = ip->i_d.di_gen; + i_size_write(inode, ip->i_d.di_size); + inode->i_blocks = + XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks); + inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec; + inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec; + inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec; + inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; + inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec; + inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; + if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; + if (ip->i_d.di_flags & XFS_DIFLAG_SYNC) + inode->i_flags |= S_SYNC; + else + inode->i_flags &= ~S_SYNC; + if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME) + inode->i_flags |= S_NOATIME; + else + inode->i_flags &= ~S_NOATIME; + vp->v_flag &= ~VMODIFIED; +} + +void +xfs_initialize_vnode( + bhv_desc_t *bdp, + vnode_t *vp, + bhv_desc_t *inode_bhv, + int unlock) +{ + xfs_inode_t *ip = XFS_BHVTOI(inode_bhv); + struct inode *inode = LINVFS_GET_IP(vp); + + if (!inode_bhv->bd_vobj) { + vp->v_vfsp = bhvtovfs(bdp); + bhv_desc_init(inode_bhv, ip, vp, &xfs_vnodeops); + bhv_insert(VN_BHV_HEAD(vp), inode_bhv); + } + + /* + * We need to set the ops vectors, and unlock the inode, but if + * we have been called during the new inode create process, it is + * too early to fill in the Linux inode. We will get called a + * second time once the inode is properly set up, and then we can + * finish our work. + */ + if (ip->i_d.di_mode != 0 && unlock && (inode->i_state & I_NEW)) { + vp->v_type = IFTOVT(ip->i_d.di_mode); + xfs_revalidate_inode(XFS_BHVTOM(bdp), vp, ip); + xfs_set_inodeops(inode); + + ip->i_flags &= ~XFS_INEW; + barrier(); + + unlock_new_inode(inode); + } +} + +int +xfs_blkdev_get( + xfs_mount_t *mp, + const char *name, + struct block_device **bdevp) +{ + int error = 0; + + *bdevp = open_bdev_excl(name, 0, mp); + if (IS_ERR(*bdevp)) { + error = PTR_ERR(*bdevp); + printk("XFS: Invalid device [%s], error=%d\n", name, error); + } + + return -error; +} + +void +xfs_blkdev_put( + struct block_device *bdev) +{ + if (bdev) + close_bdev_excl(bdev); +} + + +STATIC struct inode * +linvfs_alloc_inode( + struct super_block *sb) +{ + vnode_t *vp; + + vp = (vnode_t *)kmem_cache_alloc(linvfs_inode_zone, + kmem_flags_convert(KM_SLEEP)); + if (!vp) + return NULL; + return LINVFS_GET_IP(vp); +} + +STATIC void +linvfs_destroy_inode( + struct inode *inode) +{ + kmem_cache_free(linvfs_inode_zone, LINVFS_GET_VP(inode)); +} + +STATIC void +init_once( + void *data, + kmem_cache_t *cachep, + unsigned long flags) +{ + vnode_t *vp = (vnode_t *)data; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) + inode_init_once(LINVFS_GET_IP(vp)); +} + +STATIC int +init_inodecache( void ) +{ + linvfs_inode_zone = kmem_cache_create("linvfs_icache", + sizeof(vnode_t), 0, SLAB_RECLAIM_ACCOUNT, + init_once, NULL); + if (linvfs_inode_zone == NULL) + return -ENOMEM; + return 0; +} + +STATIC void +destroy_inodecache( void ) +{ + if (kmem_cache_destroy(linvfs_inode_zone)) + printk(KERN_WARNING "%s: cache still in use!\n", __FUNCTION__); +} + +/* + * Attempt to flush the inode, this will actually fail + * if the inode is pinned, but we dirty the inode again + * at the point when it is unpinned after a log write, + * since this is when the inode itself becomes flushable. + */ +STATIC int +linvfs_write_inode( + struct inode *inode, + int sync) +{ + vnode_t *vp = LINVFS_GET_VP(inode); + int error = 0, flags = FLUSH_INODE; + + if (vp) { + vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); + if (sync) + flags |= FLUSH_SYNC; + VOP_IFLUSH(vp, flags, error); + if (error == EAGAIN) { + if (sync) + VOP_IFLUSH(vp, flags | FLUSH_LOG, error); + else + error = 0; + } + } + + return -error; +} + +STATIC void +linvfs_clear_inode( + struct inode *inode) +{ + vnode_t *vp = LINVFS_GET_VP(inode); + + if (vp) { + vn_rele(vp); + vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); + /* + * Do all our cleanup, and remove this vnode. + */ + vn_remove(vp); + } +} + + +/* + * Enqueue a work item to be picked up by the vfs xfssyncd thread. + * Doing this has two advantages: + * - It saves on stack space, which is tight in certain situations + * - It can be used (with care) as a mechanism to avoid deadlocks. + * Flushing while allocating in a full filesystem requires both. + */ +STATIC void +xfs_syncd_queue_work( + struct vfs *vfs, + void *data, + void (*syncer)(vfs_t *, void *)) +{ + vfs_sync_work_t *work; + + work = kmem_alloc(sizeof(struct vfs_sync_work), KM_SLEEP); + INIT_LIST_HEAD(&work->w_list); + work->w_syncer = syncer; + work->w_data = data; + work->w_vfs = vfs; + spin_lock(&vfs->vfs_sync_lock); + list_add_tail(&work->w_list, &vfs->vfs_sync_list); + spin_unlock(&vfs->vfs_sync_lock); + wake_up_process(vfs->vfs_sync_task); +} + +/* + * Flush delayed allocate data, attempting to free up reserved space + * from existing allocations. At this point a new allocation attempt + * has failed with ENOSPC and we are in the process of scratching our + * heads, looking about for more room... + */ +STATIC void +xfs_flush_inode_work( + vfs_t *vfs, + void *inode) +{ + filemap_flush(((struct inode *)inode)->i_mapping); + iput((struct inode *)inode); +} + +void +xfs_flush_inode( + xfs_inode_t *ip) +{ + struct inode *inode = LINVFS_GET_IP(XFS_ITOV(ip)); + struct vfs *vfs = XFS_MTOVFS(ip->i_mount); + + igrab(inode); + xfs_syncd_queue_work(vfs, inode, xfs_flush_inode_work); + delay(HZ/2); +} + +/* + * This is the "bigger hammer" version of xfs_flush_inode_work... + * (IOW, "If at first you don't succeed, use a Bigger Hammer"). + */ +STATIC void +xfs_flush_device_work( + vfs_t *vfs, + void *inode) +{ + sync_blockdev(vfs->vfs_super->s_bdev); + iput((struct inode *)inode); +} + +void +xfs_flush_device( + xfs_inode_t *ip) +{ + struct inode *inode = LINVFS_GET_IP(XFS_ITOV(ip)); + struct vfs *vfs = XFS_MTOVFS(ip->i_mount); + + igrab(inode); + xfs_syncd_queue_work(vfs, inode, xfs_flush_device_work); + delay(HZ/2); + xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC); +} + +#define SYNCD_FLAGS (SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR) +STATIC void +vfs_sync_worker( + vfs_t *vfsp, + void *unused) +{ + int error; + + if (!(vfsp->vfs_flag & VFS_RDONLY)) + VFS_SYNC(vfsp, SYNCD_FLAGS, NULL, error); + vfsp->vfs_sync_seq++; + wmb(); + wake_up(&vfsp->vfs_wait_single_sync_task); +} + +STATIC int +xfssyncd( + void *arg) +{ + long timeleft; + vfs_t *vfsp = (vfs_t *) arg; + struct list_head tmp; + struct vfs_sync_work *work, *n; + + daemonize("xfssyncd"); + + vfsp->vfs_sync_work.w_vfs = vfsp; + vfsp->vfs_sync_work.w_syncer = vfs_sync_worker; + vfsp->vfs_sync_task = current; + wmb(); + wake_up(&vfsp->vfs_wait_sync_task); + + INIT_LIST_HEAD(&tmp); + timeleft = (xfs_syncd_centisecs * HZ) / 100; + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + timeleft = schedule_timeout(timeleft); + /* swsusp */ + try_to_freeze(PF_FREEZE); + if (vfsp->vfs_flag & VFS_UMOUNT) + break; + + spin_lock(&vfsp->vfs_sync_lock); + /* + * We can get woken by laptop mode, to do a sync - + * that's the (only!) case where the list would be + * empty with time remaining. + */ + if (!timeleft || list_empty(&vfsp->vfs_sync_list)) { + if (!timeleft) + timeleft = (xfs_syncd_centisecs * HZ) / 100; + INIT_LIST_HEAD(&vfsp->vfs_sync_work.w_list); + list_add_tail(&vfsp->vfs_sync_work.w_list, + &vfsp->vfs_sync_list); + } + list_for_each_entry_safe(work, n, &vfsp->vfs_sync_list, w_list) + list_move(&work->w_list, &tmp); + spin_unlock(&vfsp->vfs_sync_lock); + + list_for_each_entry_safe(work, n, &tmp, w_list) { + (*work->w_syncer)(vfsp, work->w_data); + list_del(&work->w_list); + if (work == &vfsp->vfs_sync_work) + continue; + kmem_free(work, sizeof(struct vfs_sync_work)); + } + } + + vfsp->vfs_sync_task = NULL; + wmb(); + wake_up(&vfsp->vfs_wait_sync_task); + + return 0; +} + +STATIC int +linvfs_start_syncd( + vfs_t *vfsp) +{ + int pid; + + pid = kernel_thread(xfssyncd, (void *) vfsp, + CLONE_VM | CLONE_FS | CLONE_FILES); + if (pid < 0) + return -pid; + wait_event(vfsp->vfs_wait_sync_task, vfsp->vfs_sync_task); + return 0; +} + +STATIC void +linvfs_stop_syncd( + vfs_t *vfsp) +{ + vfsp->vfs_flag |= VFS_UMOUNT; + wmb(); + + wake_up_process(vfsp->vfs_sync_task); + wait_event(vfsp->vfs_wait_sync_task, !vfsp->vfs_sync_task); +} + +STATIC void +linvfs_put_super( + struct super_block *sb) +{ + vfs_t *vfsp = LINVFS_GET_VFS(sb); + int error; + + linvfs_stop_syncd(vfsp); + VFS_SYNC(vfsp, SYNC_ATTR|SYNC_DELWRI, NULL, error); + if (!error) + VFS_UNMOUNT(vfsp, 0, NULL, error); + if (error) { + printk("XFS unmount got error %d\n", error); + printk("%s: vfsp/0x%p left dangling!\n", __FUNCTION__, vfsp); + return; + } + + vfs_deallocate(vfsp); +} + +STATIC void +linvfs_write_super( + struct super_block *sb) +{ + vfs_t *vfsp = LINVFS_GET_VFS(sb); + int error; + + if (sb->s_flags & MS_RDONLY) { + sb->s_dirt = 0; /* paranoia */ + return; + } + /* Push the log and superblock a little */ + VFS_SYNC(vfsp, SYNC_FSDATA, NULL, error); + sb->s_dirt = 0; +} + +STATIC int +linvfs_sync_super( + struct super_block *sb, + int wait) +{ + vfs_t *vfsp = LINVFS_GET_VFS(sb); + int error; + int flags = SYNC_FSDATA; + + if (wait) + flags |= SYNC_WAIT; + + VFS_SYNC(vfsp, flags, NULL, error); + sb->s_dirt = 0; + + if (unlikely(laptop_mode)) { + int prev_sync_seq = vfsp->vfs_sync_seq; + + /* + * The disk must be active because we're syncing. + * We schedule xfssyncd now (now that the disk is + * active) instead of later (when it might not be). + */ + wake_up_process(vfsp->vfs_sync_task); + /* + * We have to wait for the sync iteration to complete. + * If we don't, the disk activity caused by the sync + * will come after the sync is completed, and that + * triggers another sync from laptop mode. + */ + wait_event(vfsp->vfs_wait_single_sync_task, + vfsp->vfs_sync_seq != prev_sync_seq); + } + + return -error; +} + +STATIC int +linvfs_statfs( + struct super_block *sb, + struct kstatfs *statp) +{ + vfs_t *vfsp = LINVFS_GET_VFS(sb); + int error; + + VFS_STATVFS(vfsp, statp, NULL, error); + return -error; +} + +STATIC int +linvfs_remount( + struct super_block *sb, + int *flags, + char *options) +{ + vfs_t *vfsp = LINVFS_GET_VFS(sb); + struct xfs_mount_args *args = xfs_args_allocate(sb); + int error; + + VFS_PARSEARGS(vfsp, options, args, 1, error); + if (!error) + VFS_MNTUPDATE(vfsp, flags, args, error); + kmem_free(args, sizeof(*args)); + return -error; +} + +STATIC void +linvfs_freeze_fs( + struct super_block *sb) +{ + VFS_FREEZE(LINVFS_GET_VFS(sb)); +} + +STATIC int +linvfs_show_options( + struct seq_file *m, + struct vfsmount *mnt) +{ + struct vfs *vfsp = LINVFS_GET_VFS(mnt->mnt_sb); + int error; + + VFS_SHOWARGS(vfsp, m, error); + return error; +} + +STATIC int +linvfs_getxstate( + struct super_block *sb, + struct fs_quota_stat *fqs) +{ + struct vfs *vfsp = LINVFS_GET_VFS(sb); + int error; + + VFS_QUOTACTL(vfsp, Q_XGETQSTAT, 0, (caddr_t)fqs, error); + return -error; +} + +STATIC int +linvfs_setxstate( + struct super_block *sb, + unsigned int flags, + int op) +{ + struct vfs *vfsp = LINVFS_GET_VFS(sb); + int error; + + VFS_QUOTACTL(vfsp, op, 0, (caddr_t)&flags, error); + return -error; +} + +STATIC int +linvfs_getxquota( + struct super_block *sb, + int type, + qid_t id, + struct fs_disk_quota *fdq) +{ + struct vfs *vfsp = LINVFS_GET_VFS(sb); + int error, getmode; + + getmode = (type == GRPQUOTA) ? Q_XGETGQUOTA : Q_XGETQUOTA; + VFS_QUOTACTL(vfsp, getmode, id, (caddr_t)fdq, error); + return -error; +} + +STATIC int +linvfs_setxquota( + struct super_block *sb, + int type, + qid_t id, + struct fs_disk_quota *fdq) +{ + struct vfs *vfsp = LINVFS_GET_VFS(sb); + int error, setmode; + + setmode = (type == GRPQUOTA) ? Q_XSETGQLIM : Q_XSETQLIM; + VFS_QUOTACTL(vfsp, setmode, id, (caddr_t)fdq, error); + return -error; +} + +STATIC int +linvfs_fill_super( + struct super_block *sb, + void *data, + int silent) +{ + vnode_t *rootvp; + struct vfs *vfsp = vfs_allocate(); + struct xfs_mount_args *args = xfs_args_allocate(sb); + struct kstatfs statvfs; + int error, error2; + + vfsp->vfs_super = sb; + LINVFS_SET_VFS(sb, vfsp); + if (sb->s_flags & MS_RDONLY) + vfsp->vfs_flag |= VFS_RDONLY; + bhv_insert_all_vfsops(vfsp); + + VFS_PARSEARGS(vfsp, (char *)data, args, 0, error); + if (error) { + bhv_remove_all_vfsops(vfsp, 1); + goto fail_vfsop; + } + + sb_min_blocksize(sb, BBSIZE); +#ifdef CONFIG_XFS_EXPORT + sb->s_export_op = &linvfs_export_ops; +#endif + sb->s_qcop = &linvfs_qops; + sb->s_op = &linvfs_sops; + + VFS_MOUNT(vfsp, args, NULL, error); + if (error) { + bhv_remove_all_vfsops(vfsp, 1); + goto fail_vfsop; + } + + VFS_STATVFS(vfsp, &statvfs, NULL, error); + if (error) + goto fail_unmount; + + sb->s_dirt = 1; + sb->s_magic = statvfs.f_type; + sb->s_blocksize = statvfs.f_bsize; + sb->s_blocksize_bits = ffs(statvfs.f_bsize) - 1; + sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits); + sb->s_time_gran = 1; + set_posix_acl_flag(sb); + + VFS_ROOT(vfsp, &rootvp, error); + if (error) + goto fail_unmount; + + sb->s_root = d_alloc_root(LINVFS_GET_IP(rootvp)); + if (!sb->s_root) { + error = ENOMEM; + goto fail_vnrele; + } + if (is_bad_inode(sb->s_root->d_inode)) { + error = EINVAL; + goto fail_vnrele; + } + if ((error = linvfs_start_syncd(vfsp))) + goto fail_vnrele; + vn_trace_exit(rootvp, __FUNCTION__, (inst_t *)__return_address); + + kmem_free(args, sizeof(*args)); + return 0; + +fail_vnrele: + if (sb->s_root) { + dput(sb->s_root); + sb->s_root = NULL; + } else { + VN_RELE(rootvp); + } + +fail_unmount: + VFS_UNMOUNT(vfsp, 0, NULL, error2); + +fail_vfsop: + vfs_deallocate(vfsp); + kmem_free(args, sizeof(*args)); + return -error; +} + +STATIC struct super_block * +linvfs_get_sb( + struct file_system_type *fs_type, + int flags, + const char *dev_name, + void *data) +{ + return get_sb_bdev(fs_type, flags, dev_name, data, linvfs_fill_super); +} + +STATIC struct super_operations linvfs_sops = { + .alloc_inode = linvfs_alloc_inode, + .destroy_inode = linvfs_destroy_inode, + .write_inode = linvfs_write_inode, + .clear_inode = linvfs_clear_inode, + .put_super = linvfs_put_super, + .write_super = linvfs_write_super, + .sync_fs = linvfs_sync_super, + .write_super_lockfs = linvfs_freeze_fs, + .statfs = linvfs_statfs, + .remount_fs = linvfs_remount, + .show_options = linvfs_show_options, +}; + +STATIC struct quotactl_ops linvfs_qops = { + .get_xstate = linvfs_getxstate, + .set_xstate = linvfs_setxstate, + .get_xquota = linvfs_getxquota, + .set_xquota = linvfs_setxquota, +}; + +STATIC struct file_system_type xfs_fs_type = { + .owner = THIS_MODULE, + .name = "xfs", + .get_sb = linvfs_get_sb, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + + +STATIC int __init +init_xfs_fs( void ) +{ + int error; + struct sysinfo si; + static char message[] __initdata = KERN_INFO \ + XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled\n"; + + printk(message); + + si_meminfo(&si); + xfs_physmem = si.totalram; + + ktrace_init(64); + + error = init_inodecache(); + if (error < 0) + goto undo_inodecache; + + error = pagebuf_init(); + if (error < 0) + goto undo_pagebuf; + + vn_init(); + xfs_init(); + uuid_init(); + vfs_initquota(); + + error = register_filesystem(&xfs_fs_type); + if (error) + goto undo_register; + XFS_DM_INIT(&xfs_fs_type); + return 0; + +undo_register: + pagebuf_terminate(); + +undo_pagebuf: + destroy_inodecache(); + +undo_inodecache: + return error; +} + +STATIC void __exit +exit_xfs_fs( void ) +{ + vfs_exitquota(); + XFS_DM_EXIT(&xfs_fs_type); + unregister_filesystem(&xfs_fs_type); + xfs_cleanup(); + pagebuf_terminate(); + destroy_inodecache(); + ktrace_uninit(); +} + +module_init(init_xfs_fs); +module_exit(exit_xfs_fs); + +MODULE_AUTHOR("Silicon Graphics, Inc."); +MODULE_DESCRIPTION(XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled"); +MODULE_LICENSE("GPL"); diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h new file mode 100644 index 000000000000..ec7e0035c731 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_super.h @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_SUPER_H__ +#define __XFS_SUPER_H__ + +#ifdef CONFIG_XFS_DMAPI +# define vfs_insertdmapi(vfs) vfs_insertops(vfsp, &xfs_dmops) +# define vfs_initdmapi() dmapi_init() +# define vfs_exitdmapi() dmapi_uninit() +#else +# define vfs_insertdmapi(vfs) do { } while (0) +# define vfs_initdmapi() do { } while (0) +# define vfs_exitdmapi() do { } while (0) +#endif + +#ifdef CONFIG_XFS_QUOTA +# define vfs_insertquota(vfs) vfs_insertops(vfsp, &xfs_qmops) +extern void xfs_qm_init(void); +extern void xfs_qm_exit(void); +# define vfs_initquota() xfs_qm_init() +# define vfs_exitquota() xfs_qm_exit() +#else +# define vfs_insertquota(vfs) do { } while (0) +# define vfs_initquota() do { } while (0) +# define vfs_exitquota() do { } while (0) +#endif + +#ifdef CONFIG_XFS_POSIX_ACL +# define XFS_ACL_STRING "ACLs, " +# define set_posix_acl_flag(sb) ((sb)->s_flags |= MS_POSIXACL) +#else +# define XFS_ACL_STRING +# define set_posix_acl_flag(sb) do { } while (0) +#endif + +#ifdef CONFIG_XFS_SECURITY +# define XFS_SECURITY_STRING "security attributes, " +# define ENOSECURITY 0 +#else +# define XFS_SECURITY_STRING +# define ENOSECURITY EOPNOTSUPP +#endif + +#ifdef CONFIG_XFS_RT +# define XFS_REALTIME_STRING "realtime, " +#else +# define XFS_REALTIME_STRING +#endif + +#if XFS_BIG_BLKNOS +# if XFS_BIG_INUMS +# define XFS_BIGFS_STRING "large block/inode numbers, " +# else +# define XFS_BIGFS_STRING "large block numbers, " +# endif +#else +# define XFS_BIGFS_STRING +#endif + +#ifdef CONFIG_XFS_TRACE +# define XFS_TRACE_STRING "tracing, " +#else +# define XFS_TRACE_STRING +#endif + +#ifdef CONFIG_XFS_DMAPI +# define XFS_DMAPI_STRING "dmapi support, " +#else +# define XFS_DMAPI_STRING +#endif + +#ifdef DEBUG +# define XFS_DBG_STRING "debug" +#else +# define XFS_DBG_STRING "no debug" +#endif + +#define XFS_BUILD_OPTIONS XFS_ACL_STRING \ + XFS_SECURITY_STRING \ + XFS_REALTIME_STRING \ + XFS_BIGFS_STRING \ + XFS_TRACE_STRING \ + XFS_DMAPI_STRING \ + XFS_DBG_STRING /* DBG must be last */ + +#define LINVFS_GET_VFS(s) \ + (vfs_t *)((s)->s_fs_info) +#define LINVFS_SET_VFS(s, vfsp) \ + ((s)->s_fs_info = vfsp) + +struct xfs_inode; +struct xfs_mount; +struct xfs_buftarg; +struct block_device; + +extern __uint64_t xfs_max_file_offset(unsigned int); + +extern void xfs_initialize_vnode(bhv_desc_t *, vnode_t *, bhv_desc_t *, int); + +extern void xfs_flush_inode(struct xfs_inode *); +extern void xfs_flush_device(struct xfs_inode *); + +extern int xfs_blkdev_get(struct xfs_mount *, const char *, + struct block_device **); +extern void xfs_blkdev_put(struct block_device *); + +extern struct export_operations linvfs_export_ops; + +#endif /* __XFS_SUPER_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c new file mode 100644 index 000000000000..0dc010356f4d --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_sysctl.c @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2001-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_rw.h" +#include <linux/sysctl.h> +#include <linux/proc_fs.h> + + +static struct ctl_table_header *xfs_table_header; + + +#ifdef CONFIG_PROC_FS +STATIC int +xfs_stats_clear_proc_handler( + ctl_table *ctl, + int write, + struct file *filp, + void __user *buffer, + size_t *lenp, + loff_t *ppos) +{ + int c, ret, *valp = ctl->data; + __uint32_t vn_active; + + ret = proc_dointvec_minmax(ctl, write, filp, buffer, lenp, ppos); + + if (!ret && write && *valp) { + printk("XFS Clearing xfsstats\n"); + for (c = 0; c < NR_CPUS; c++) { + if (!cpu_possible(c)) continue; + preempt_disable(); + /* save vn_active, it's a universal truth! */ + vn_active = per_cpu(xfsstats, c).vn_active; + memset(&per_cpu(xfsstats, c), 0, + sizeof(struct xfsstats)); + per_cpu(xfsstats, c).vn_active = vn_active; + preempt_enable(); + } + xfs_stats_clear = 0; + } + + return ret; +} +#endif /* CONFIG_PROC_FS */ + +STATIC ctl_table xfs_table[] = { + {XFS_RESTRICT_CHOWN, "restrict_chown", &xfs_params.restrict_chown.val, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, + &xfs_params.restrict_chown.min, &xfs_params.restrict_chown.max}, + + {XFS_SGID_INHERIT, "irix_sgid_inherit", &xfs_params.sgid_inherit.val, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, + &xfs_params.sgid_inherit.min, &xfs_params.sgid_inherit.max}, + + {XFS_SYMLINK_MODE, "irix_symlink_mode", &xfs_params.symlink_mode.val, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, + &xfs_params.symlink_mode.min, &xfs_params.symlink_mode.max}, + + {XFS_PANIC_MASK, "panic_mask", &xfs_params.panic_mask.val, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, + &xfs_params.panic_mask.min, &xfs_params.panic_mask.max}, + + {XFS_ERRLEVEL, "error_level", &xfs_params.error_level.val, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, + &xfs_params.error_level.min, &xfs_params.error_level.max}, + + {XFS_SYNCD_TIMER, "xfssyncd_centisecs", &xfs_params.syncd_timer.val, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, + &xfs_params.syncd_timer.min, &xfs_params.syncd_timer.max}, + + {XFS_INHERIT_SYNC, "inherit_sync", &xfs_params.inherit_sync.val, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, + &xfs_params.inherit_sync.min, &xfs_params.inherit_sync.max}, + + {XFS_INHERIT_NODUMP, "inherit_nodump", &xfs_params.inherit_nodump.val, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, + &xfs_params.inherit_nodump.min, &xfs_params.inherit_nodump.max}, + + {XFS_INHERIT_NOATIME, "inherit_noatime", &xfs_params.inherit_noatim.val, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, + &xfs_params.inherit_noatim.min, &xfs_params.inherit_noatim.max}, + + {XFS_BUF_TIMER, "xfsbufd_centisecs", &xfs_params.xfs_buf_timer.val, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, + &xfs_params.xfs_buf_timer.min, &xfs_params.xfs_buf_timer.max}, + + {XFS_BUF_AGE, "age_buffer_centisecs", &xfs_params.xfs_buf_age.val, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, + &xfs_params.xfs_buf_age.min, &xfs_params.xfs_buf_age.max}, + + {XFS_INHERIT_NOSYM, "inherit_nosymlinks", &xfs_params.inherit_nosym.val, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, + &xfs_params.inherit_nosym.min, &xfs_params.inherit_nosym.max}, + + {XFS_ROTORSTEP, "rotorstep", &xfs_params.rotorstep.val, + sizeof(int), 0644, NULL, &proc_dointvec_minmax, + &sysctl_intvec, NULL, + &xfs_params.rotorstep.min, &xfs_params.rotorstep.max}, + + /* please keep this the last entry */ +#ifdef CONFIG_PROC_FS + {XFS_STATS_CLEAR, "stats_clear", &xfs_params.stats_clear.val, + sizeof(int), 0644, NULL, &xfs_stats_clear_proc_handler, + &sysctl_intvec, NULL, + &xfs_params.stats_clear.min, &xfs_params.stats_clear.max}, +#endif /* CONFIG_PROC_FS */ + + {0} +}; + +STATIC ctl_table xfs_dir_table[] = { + {FS_XFS, "xfs", NULL, 0, 0555, xfs_table}, + {0} +}; + +STATIC ctl_table xfs_root_table[] = { + {CTL_FS, "fs", NULL, 0, 0555, xfs_dir_table}, + {0} +}; + +void +xfs_sysctl_register(void) +{ + xfs_table_header = register_sysctl_table(xfs_root_table, 1); +} + +void +xfs_sysctl_unregister(void) +{ + if (xfs_table_header) + unregister_sysctl_table(xfs_table_header); +} diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/linux-2.6/xfs_sysctl.h new file mode 100644 index 000000000000..a39a95020a58 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_sysctl.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2001-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#ifndef __XFS_SYSCTL_H__ +#define __XFS_SYSCTL_H__ + +#include <linux/sysctl.h> + +/* + * Tunable xfs parameters + */ + +typedef struct xfs_sysctl_val { + int min; + int val; + int max; +} xfs_sysctl_val_t; + +typedef struct xfs_param { + xfs_sysctl_val_t restrict_chown;/* Root/non-root can give away files.*/ + xfs_sysctl_val_t sgid_inherit; /* Inherit S_ISGID if process' GID is + * not a member of parent dir GID. */ + xfs_sysctl_val_t symlink_mode; /* Link creat mode affected by umask */ + xfs_sysctl_val_t panic_mask; /* bitmask to cause panic on errors. */ + xfs_sysctl_val_t error_level; /* Degree of reporting for problems */ + xfs_sysctl_val_t syncd_timer; /* Interval between xfssyncd wakeups */ + xfs_sysctl_val_t stats_clear; /* Reset all XFS statistics to zero. */ + xfs_sysctl_val_t inherit_sync; /* Inherit the "sync" inode flag. */ + xfs_sysctl_val_t inherit_nodump;/* Inherit the "nodump" inode flag. */ + xfs_sysctl_val_t inherit_noatim;/* Inherit the "noatime" inode flag. */ + xfs_sysctl_val_t xfs_buf_timer; /* Interval between xfsbufd wakeups. */ + xfs_sysctl_val_t xfs_buf_age; /* Metadata buffer age before flush. */ + xfs_sysctl_val_t inherit_nosym; /* Inherit the "nosymlinks" flag. */ + xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */ +} xfs_param_t; + +/* + * xfs_error_level: + * + * How much error reporting will be done when internal problems are + * encountered. These problems normally return an EFSCORRUPTED to their + * caller, with no other information reported. + * + * 0 No error reports + * 1 Report EFSCORRUPTED errors that will cause a filesystem shutdown + * 5 Report all EFSCORRUPTED errors (all of the above errors, plus any + * additional errors that are known to not cause shutdowns) + * + * xfs_panic_mask bit 0x8 turns the error reports into panics + */ + +enum { + /* XFS_REFCACHE_SIZE = 1 */ + /* XFS_REFCACHE_PURGE = 2 */ + XFS_RESTRICT_CHOWN = 3, + XFS_SGID_INHERIT = 4, + XFS_SYMLINK_MODE = 5, + XFS_PANIC_MASK = 6, + XFS_ERRLEVEL = 7, + XFS_SYNCD_TIMER = 8, + /* XFS_PROBE_DMAPI = 9 */ + /* XFS_PROBE_IOOPS = 10 */ + /* XFS_PROBE_QUOTA = 11 */ + XFS_STATS_CLEAR = 12, + XFS_INHERIT_SYNC = 13, + XFS_INHERIT_NODUMP = 14, + XFS_INHERIT_NOATIME = 15, + XFS_BUF_TIMER = 16, + XFS_BUF_AGE = 17, + /* XFS_IO_BYPASS = 18 */ + XFS_INHERIT_NOSYM = 19, + XFS_ROTORSTEP = 20, +}; + +extern xfs_param_t xfs_params; + +#ifdef CONFIG_SYSCTL +extern void xfs_sysctl_register(void); +extern void xfs_sysctl_unregister(void); +#else +# define xfs_sysctl_register() do { } while (0) +# define xfs_sysctl_unregister() do { } while (0) +#endif /* CONFIG_SYSCTL */ + +#endif /* __XFS_SYSCTL_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_version.h b/fs/xfs/linux-2.6/xfs_version.h new file mode 100644 index 000000000000..96f96394417e --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_version.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +/* + * Dummy file that can contain a timestamp to put into the + * XFS init string, to help users keep track of what they're + * running + */ + +#ifndef __XFS_VERSION_H__ +#define __XFS_VERSION_H__ + +#define XFS_VERSION_STRING "SGI XFS" + +#endif /* __XFS_VERSION_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_vfs.c b/fs/xfs/linux-2.6/xfs_vfs.c new file mode 100644 index 000000000000..669c61644959 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_vfs.c @@ -0,0 +1,330 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_macros.h" +#include "xfs_inum.h" +#include "xfs_log.h" +#include "xfs_clnt.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_dir.h" +#include "xfs_dir2.h" +#include "xfs_imap.h" +#include "xfs_alloc.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" +#include "xfs_quota.h" + +int +vfs_mount( + struct bhv_desc *bdp, + struct xfs_mount_args *args, + struct cred *cr) +{ + struct bhv_desc *next = bdp; + + ASSERT(next); + while (! (bhvtovfsops(next))->vfs_mount) + next = BHV_NEXT(next); + return ((*bhvtovfsops(next)->vfs_mount)(next, args, cr)); +} + +int +vfs_parseargs( + struct bhv_desc *bdp, + char *s, + struct xfs_mount_args *args, + int f) +{ + struct bhv_desc *next = bdp; + + ASSERT(next); + while (! (bhvtovfsops(next))->vfs_parseargs) + next = BHV_NEXT(next); + return ((*bhvtovfsops(next)->vfs_parseargs)(next, s, args, f)); +} + +int +vfs_showargs( + struct bhv_desc *bdp, + struct seq_file *m) +{ + struct bhv_desc *next = bdp; + + ASSERT(next); + while (! (bhvtovfsops(next))->vfs_showargs) + next = BHV_NEXT(next); + return ((*bhvtovfsops(next)->vfs_showargs)(next, m)); +} + +int +vfs_unmount( + struct bhv_desc *bdp, + int fl, + struct cred *cr) +{ + struct bhv_desc *next = bdp; + + ASSERT(next); + while (! (bhvtovfsops(next))->vfs_unmount) + next = BHV_NEXT(next); + return ((*bhvtovfsops(next)->vfs_unmount)(next, fl, cr)); +} + +int +vfs_mntupdate( + struct bhv_desc *bdp, + int *fl, + struct xfs_mount_args *args) +{ + struct bhv_desc *next = bdp; + + ASSERT(next); + while (! (bhvtovfsops(next))->vfs_mntupdate) + next = BHV_NEXT(next); + return ((*bhvtovfsops(next)->vfs_mntupdate)(next, fl, args)); +} + +int +vfs_root( + struct bhv_desc *bdp, + struct vnode **vpp) +{ + struct bhv_desc *next = bdp; + + ASSERT(next); + while (! (bhvtovfsops(next))->vfs_root) + next = BHV_NEXT(next); + return ((*bhvtovfsops(next)->vfs_root)(next, vpp)); +} + +int +vfs_statvfs( + struct bhv_desc *bdp, + xfs_statfs_t *sp, + struct vnode *vp) +{ + struct bhv_desc *next = bdp; + + ASSERT(next); + while (! (bhvtovfsops(next))->vfs_statvfs) + next = BHV_NEXT(next); + return ((*bhvtovfsops(next)->vfs_statvfs)(next, sp, vp)); +} + +int +vfs_sync( + struct bhv_desc *bdp, + int fl, + struct cred *cr) +{ + struct bhv_desc *next = bdp; + + ASSERT(next); + while (! (bhvtovfsops(next))->vfs_sync) + next = BHV_NEXT(next); + return ((*bhvtovfsops(next)->vfs_sync)(next, fl, cr)); +} + +int +vfs_vget( + struct bhv_desc *bdp, + struct vnode **vpp, + struct fid *fidp) +{ + struct bhv_desc *next = bdp; + + ASSERT(next); + while (! (bhvtovfsops(next))->vfs_vget) + next = BHV_NEXT(next); + return ((*bhvtovfsops(next)->vfs_vget)(next, vpp, fidp)); +} + +int +vfs_dmapiops( + struct bhv_desc *bdp, + caddr_t addr) +{ + struct bhv_desc *next = bdp; + + ASSERT(next); + while (! (bhvtovfsops(next))->vfs_dmapiops) + next = BHV_NEXT(next); + return ((*bhvtovfsops(next)->vfs_dmapiops)(next, addr)); +} + +int +vfs_quotactl( + struct bhv_desc *bdp, + int cmd, + int id, + caddr_t addr) +{ + struct bhv_desc *next = bdp; + + ASSERT(next); + while (! (bhvtovfsops(next))->vfs_quotactl) + next = BHV_NEXT(next); + return ((*bhvtovfsops(next)->vfs_quotactl)(next, cmd, id, addr)); +} + +void +vfs_init_vnode( + struct bhv_desc *bdp, + struct vnode *vp, + struct bhv_desc *bp, + int unlock) +{ + struct bhv_desc *next = bdp; + + ASSERT(next); + while (! (bhvtovfsops(next))->vfs_init_vnode) + next = BHV_NEXT(next); + ((*bhvtovfsops(next)->vfs_init_vnode)(next, vp, bp, unlock)); +} + +void +vfs_force_shutdown( + struct bhv_desc *bdp, + int fl, + char *file, + int line) +{ + struct bhv_desc *next = bdp; + + ASSERT(next); + while (! (bhvtovfsops(next))->vfs_force_shutdown) + next = BHV_NEXT(next); + ((*bhvtovfsops(next)->vfs_force_shutdown)(next, fl, file, line)); +} + +void +vfs_freeze( + struct bhv_desc *bdp) +{ + struct bhv_desc *next = bdp; + + ASSERT(next); + while (! (bhvtovfsops(next))->vfs_freeze) + next = BHV_NEXT(next); + ((*bhvtovfsops(next)->vfs_freeze)(next)); +} + +vfs_t * +vfs_allocate( void ) +{ + struct vfs *vfsp; + + vfsp = kmem_zalloc(sizeof(vfs_t), KM_SLEEP); + bhv_head_init(VFS_BHVHEAD(vfsp), "vfs"); + INIT_LIST_HEAD(&vfsp->vfs_sync_list); + spin_lock_init(&vfsp->vfs_sync_lock); + init_waitqueue_head(&vfsp->vfs_wait_sync_task); + init_waitqueue_head(&vfsp->vfs_wait_single_sync_task); + return vfsp; +} + +void +vfs_deallocate( + struct vfs *vfsp) +{ + bhv_head_destroy(VFS_BHVHEAD(vfsp)); + kmem_free(vfsp, sizeof(vfs_t)); +} + +void +vfs_insertops( + struct vfs *vfsp, + struct bhv_vfsops *vfsops) +{ + struct bhv_desc *bdp; + + bdp = kmem_alloc(sizeof(struct bhv_desc), KM_SLEEP); + bhv_desc_init(bdp, NULL, vfsp, vfsops); + bhv_insert(&vfsp->vfs_bh, bdp); +} + +void +vfs_insertbhv( + struct vfs *vfsp, + struct bhv_desc *bdp, + struct vfsops *vfsops, + void *mount) +{ + bhv_desc_init(bdp, mount, vfsp, vfsops); + bhv_insert_initial(&vfsp->vfs_bh, bdp); +} + +void +bhv_remove_vfsops( + struct vfs *vfsp, + int pos) +{ + struct bhv_desc *bhv; + + bhv = bhv_lookup_range(&vfsp->vfs_bh, pos, pos); + if (!bhv) + return; + bhv_remove(&vfsp->vfs_bh, bhv); + kmem_free(bhv, sizeof(*bhv)); +} + +void +bhv_remove_all_vfsops( + struct vfs *vfsp, + int freebase) +{ + struct xfs_mount *mp; + + bhv_remove_vfsops(vfsp, VFS_POSITION_QM); + bhv_remove_vfsops(vfsp, VFS_POSITION_DM); + if (!freebase) + return; + mp = XFS_BHVTOM(bhv_lookup(VFS_BHVHEAD(vfsp), &xfs_vfsops)); + VFS_REMOVEBHV(vfsp, &mp->m_bhv); + xfs_mount_free(mp, 0); +} + +void +bhv_insert_all_vfsops( + struct vfs *vfsp) +{ + struct xfs_mount *mp; + + mp = xfs_mount_init(); + vfs_insertbhv(vfsp, &mp->m_bhv, &xfs_vfsops, mp); + vfs_insertdmapi(vfsp); + vfs_insertquota(vfsp); +} diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h new file mode 100644 index 000000000000..76493991578f --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_vfs.h @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ +#ifndef __XFS_VFS_H__ +#define __XFS_VFS_H__ + +#include <linux/vfs.h> +#include "xfs_fs.h" + +struct fid; +struct vfs; +struct cred; +struct vnode; +struct kstatfs; +struct seq_file; +struct super_block; +struct xfs_mount_args; + +typedef struct kstatfs xfs_statfs_t; + +typedef struct vfs_sync_work { + struct list_head w_list; + struct vfs *w_vfs; + void *w_data; /* syncer routine argument */ + void (*w_syncer)(struct vfs *, void *); +} vfs_sync_work_t; + +typedef struct vfs { + u_int vfs_flag; /* flags */ + xfs_fsid_t vfs_fsid; /* file system ID */ + xfs_fsid_t *vfs_altfsid; /* An ID fixed for life of FS */ + bhv_head_t vfs_bh; /* head of vfs behavior chain */ + struct super_block *vfs_super; /* generic superblock pointer */ + struct task_struct *vfs_sync_task; /* generalised sync thread */ + vfs_sync_work_t vfs_sync_work; /* work item for VFS_SYNC */ + struct list_head vfs_sync_list; /* sync thread work item list */ + spinlock_t vfs_sync_lock; /* work item list lock */ + int vfs_sync_seq; /* sync thread generation no. */ + wait_queue_head_t vfs_wait_single_sync_task; + wait_queue_head_t vfs_wait_sync_task; +} vfs_t; + +#define vfs_fbhv vfs_bh.bh_first /* 1st on vfs behavior chain */ + +#define bhvtovfs(bdp) ( (struct vfs *)BHV_VOBJ(bdp) ) +#define bhvtovfsops(bdp) ( (struct vfsops *)BHV_OPS(bdp) ) +#define VFS_BHVHEAD(vfs) ( &(vfs)->vfs_bh ) +#define VFS_REMOVEBHV(vfs, bdp) ( bhv_remove(VFS_BHVHEAD(vfs), bdp) ) + +#define VFS_POSITION_BASE BHV_POSITION_BASE /* chain bottom */ +#define VFS_POSITION_TOP BHV_POSITION_TOP /* chain top */ +#define VFS_POSITION_INVALID BHV_POSITION_INVALID /* invalid pos. num */ + +typedef enum { + VFS_BHV_UNKNOWN, /* not specified */ + VFS_BHV_XFS, /* xfs */ + VFS_BHV_DM, /* data migration */ + VFS_BHV_QM, /* quota manager */ + VFS_BHV_IO, /* IO path */ + VFS_BHV_END /* housekeeping end-of-range */ +} vfs_bhv_t; + +#define VFS_POSITION_XFS (BHV_POSITION_BASE) +#define VFS_POSITION_DM (VFS_POSITION_BASE+10) +#define VFS_POSITION_QM (VFS_POSITION_BASE+20) +#define VFS_POSITION_IO (VFS_POSITION_BASE+30) + +#define VFS_RDONLY 0x0001 /* read-only vfs */ +#define VFS_GRPID 0x0002 /* group-ID assigned from directory */ +#define VFS_DMI 0x0004 /* filesystem has the DMI enabled */ +#define VFS_UMOUNT 0x0008 /* unmount in progress */ +#define VFS_END 0x0008 /* max flag */ + +#define SYNC_ATTR 0x0001 /* sync attributes */ +#define SYNC_CLOSE 0x0002 /* close file system down */ +#define SYNC_DELWRI 0x0004 /* look at delayed writes */ +#define SYNC_WAIT 0x0008 /* wait for i/o to complete */ +#define SYNC_BDFLUSH 0x0010 /* BDFLUSH is calling -- don't block */ +#define SYNC_FSDATA 0x0020 /* flush fs data (e.g. superblocks) */ +#define SYNC_REFCACHE 0x0040 /* prune some of the nfs ref cache */ +#define SYNC_REMOUNT 0x0080 /* remount readonly, no dummy LRs */ + +typedef int (*vfs_mount_t)(bhv_desc_t *, + struct xfs_mount_args *, struct cred *); +typedef int (*vfs_parseargs_t)(bhv_desc_t *, char *, + struct xfs_mount_args *, int); +typedef int (*vfs_showargs_t)(bhv_desc_t *, struct seq_file *); +typedef int (*vfs_unmount_t)(bhv_desc_t *, int, struct cred *); +typedef int (*vfs_mntupdate_t)(bhv_desc_t *, int *, + struct xfs_mount_args *); +typedef int (*vfs_root_t)(bhv_desc_t *, struct vnode **); +typedef int (*vfs_statvfs_t)(bhv_desc_t *, xfs_statfs_t *, struct vnode *); +typedef int (*vfs_sync_t)(bhv_desc_t *, int, struct cred *); +typedef int (*vfs_vget_t)(bhv_desc_t *, struct vnode **, struct fid *); +typedef int (*vfs_dmapiops_t)(bhv_desc_t *, caddr_t); +typedef int (*vfs_quotactl_t)(bhv_desc_t *, int, int, caddr_t); +typedef void (*vfs_init_vnode_t)(bhv_desc_t *, + struct vnode *, bhv_desc_t *, int); +typedef void (*vfs_force_shutdown_t)(bhv_desc_t *, int, char *, int); +typedef void (*vfs_freeze_t)(bhv_desc_t *); + +typedef struct vfsops { + bhv_position_t vf_position; /* behavior chain position */ + vfs_mount_t vfs_mount; /* mount file system */ + vfs_parseargs_t vfs_parseargs; /* parse mount options */ + vfs_showargs_t vfs_showargs; /* unparse mount options */ + vfs_unmount_t vfs_unmount; /* unmount file system */ + vfs_mntupdate_t vfs_mntupdate; /* update file system options */ + vfs_root_t vfs_root; /* get root vnode */ + vfs_statvfs_t vfs_statvfs; /* file system statistics */ + vfs_sync_t vfs_sync; /* flush files */ + vfs_vget_t vfs_vget; /* get vnode from fid */ + vfs_dmapiops_t vfs_dmapiops; /* data migration */ + vfs_quotactl_t vfs_quotactl; /* disk quota */ + vfs_init_vnode_t vfs_init_vnode; /* initialize a new vnode */ + vfs_force_shutdown_t vfs_force_shutdown; /* crash and burn */ + vfs_freeze_t vfs_freeze; /* freeze fs for snapshot */ +} vfsops_t; + +/* + * VFS's. Operates on vfs structure pointers (starts at bhv head). + */ +#define VHEAD(v) ((v)->vfs_fbhv) +#define VFS_MOUNT(v, ma,cr, rv) ((rv) = vfs_mount(VHEAD(v), ma,cr)) +#define VFS_PARSEARGS(v, o,ma,f, rv) ((rv) = vfs_parseargs(VHEAD(v), o,ma,f)) +#define VFS_SHOWARGS(v, m, rv) ((rv) = vfs_showargs(VHEAD(v), m)) +#define VFS_UNMOUNT(v, f, cr, rv) ((rv) = vfs_unmount(VHEAD(v), f,cr)) +#define VFS_MNTUPDATE(v, fl, args, rv) ((rv) = vfs_mntupdate(VHEAD(v), fl, args)) +#define VFS_ROOT(v, vpp, rv) ((rv) = vfs_root(VHEAD(v), vpp)) +#define VFS_STATVFS(v, sp,vp, rv) ((rv) = vfs_statvfs(VHEAD(v), sp,vp)) +#define VFS_SYNC(v, flag,cr, rv) ((rv) = vfs_sync(VHEAD(v), flag,cr)) +#define VFS_VGET(v, vpp,fidp, rv) ((rv) = vfs_vget(VHEAD(v), vpp,fidp)) +#define VFS_DMAPIOPS(v, p, rv) ((rv) = vfs_dmapiops(VHEAD(v), p)) +#define VFS_QUOTACTL(v, c,id,p, rv) ((rv) = vfs_quotactl(VHEAD(v), c,id,p)) +#define VFS_INIT_VNODE(v, vp,b,ul) ( vfs_init_vnode(VHEAD(v), vp,b,ul) ) +#define VFS_FORCE_SHUTDOWN(v, fl,f,l) ( vfs_force_shutdown(VHEAD(v), fl,f,l) ) +#define VFS_FREEZE(v) ( vfs_freeze(VHEAD(v)) ) + +/* + * PVFS's. Operates on behavior descriptor pointers. + */ +#define PVFS_MOUNT(b, ma,cr, rv) ((rv) = vfs_mount(b, ma,cr)) +#define PVFS_PARSEARGS(b, o,ma,f, rv) ((rv) = vfs_parseargs(b, o,ma,f)) +#define PVFS_SHOWARGS(b, m, rv) ((rv) = vfs_showargs(b, m)) +#define PVFS_UNMOUNT(b, f,cr, rv) ((rv) = vfs_unmount(b, f,cr)) +#define PVFS_MNTUPDATE(b, fl, args, rv) ((rv) = vfs_mntupdate(b, fl, args)) +#define PVFS_ROOT(b, vpp, rv) ((rv) = vfs_root(b, vpp)) +#define PVFS_STATVFS(b, sp,vp, rv) ((rv) = vfs_statvfs(b, sp,vp)) +#define PVFS_SYNC(b, flag,cr, rv) ((rv) = vfs_sync(b, flag,cr)) +#define PVFS_VGET(b, vpp,fidp, rv) ((rv) = vfs_vget(b, vpp,fidp)) +#define PVFS_DMAPIOPS(b, p, rv) ((rv) = vfs_dmapiops(b, p)) +#define PVFS_QUOTACTL(b, c,id,p, rv) ((rv) = vfs_quotactl(b, c,id,p)) +#define PVFS_INIT_VNODE(b, vp,b2,ul) ( vfs_init_vnode(b, vp,b2,ul) ) +#define PVFS_FORCE_SHUTDOWN(b, fl,f,l) ( vfs_force_shutdown(b, fl,f,l) ) +#define PVFS_FREEZE(b) ( vfs_freeze(b) ) + +extern int vfs_mount(bhv_desc_t *, struct xfs_mount_args *, struct cred *); +extern int vfs_parseargs(bhv_desc_t *, char *, struct xfs_mount_args *, int); +extern int vfs_showargs(bhv_desc_t *, struct seq_file *); +extern int vfs_unmount(bhv_desc_t *, int, struct cred *); +extern int vfs_mntupdate(bhv_desc_t *, int *, struct xfs_mount_args *); +extern int vfs_root(bhv_desc_t *, struct vnode **); +extern int vfs_statvfs(bhv_desc_t *, xfs_statfs_t *, struct vnode *); +extern int vfs_sync(bhv_desc_t *, int, struct cred *); +extern int vfs_vget(bhv_desc_t *, struct vnode **, struct fid *); +extern int vfs_dmapiops(bhv_desc_t *, caddr_t); +extern int vfs_quotactl(bhv_desc_t *, int, int, caddr_t); +extern void vfs_init_vnode(bhv_desc_t *, struct vnode *, bhv_desc_t *, int); +extern void vfs_force_shutdown(bhv_desc_t *, int, char *, int); +extern void vfs_freeze(bhv_desc_t *); + +typedef struct bhv_vfsops { + struct vfsops bhv_common; + void * bhv_custom; +} bhv_vfsops_t; + +#define vfs_bhv_lookup(v, id) ( bhv_lookup_range(&(v)->vfs_bh, (id), (id)) ) +#define vfs_bhv_custom(b) ( ((bhv_vfsops_t *)BHV_OPS(b))->bhv_custom ) +#define vfs_bhv_set_custom(b,o) ( (b)->bhv_custom = (void *)(o)) +#define vfs_bhv_clr_custom(b) ( (b)->bhv_custom = NULL ) + +extern vfs_t *vfs_allocate(void); +extern void vfs_deallocate(vfs_t *); +extern void vfs_insertops(vfs_t *, bhv_vfsops_t *); +extern void vfs_insertbhv(vfs_t *, bhv_desc_t *, vfsops_t *, void *); + +extern void bhv_insert_all_vfsops(struct vfs *); +extern void bhv_remove_all_vfsops(struct vfs *, int); +extern void bhv_remove_vfsops(struct vfs *, int); + +#define fs_frozen(vfsp) ((vfsp)->vfs_super->s_frozen) +#define fs_check_frozen(vfsp, level) \ + vfs_check_frozen(vfsp->vfs_super, level); + +#endif /* __XFS_VFS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c new file mode 100644 index 000000000000..849c61c74f3c --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_vnode.c @@ -0,0 +1,455 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + */ + +#include "xfs.h" + + +uint64_t vn_generation; /* vnode generation number */ +DEFINE_SPINLOCK(vnumber_lock); + +/* + * Dedicated vnode inactive/reclaim sync semaphores. + * Prime number of hash buckets since address is used as the key. + */ +#define NVSYNC 37 +#define vptosync(v) (&vsync[((unsigned long)v) % NVSYNC]) +sv_t vsync[NVSYNC]; + +/* + * Translate stat(2) file types to vnode types and vice versa. + * Aware of numeric order of S_IFMT and vnode type values. + */ +enum vtype iftovt_tab[] = { + VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, + VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON +}; + +u_short vttoif_tab[] = { + 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO, 0, S_IFSOCK +}; + + +void +vn_init(void) +{ + register sv_t *svp; + register int i; + + for (svp = vsync, i = 0; i < NVSYNC; i++, svp++) + init_sv(svp, SV_DEFAULT, "vsy", i); +} + +/* + * Clean a vnode of filesystem-specific data and prepare it for reuse. + */ +STATIC int +vn_reclaim( + struct vnode *vp) +{ + int error; + + XFS_STATS_INC(vn_reclaim); + vn_trace_entry(vp, "vn_reclaim", (inst_t *)__return_address); + + /* + * Only make the VOP_RECLAIM call if there are behaviors + * to call. + */ + if (vp->v_fbhv) { + VOP_RECLAIM(vp, error); + if (error) + return -error; + } + ASSERT(vp->v_fbhv == NULL); + + VN_LOCK(vp); + vp->v_flag &= (VRECLM|VWAIT); + VN_UNLOCK(vp, 0); + + vp->v_type = VNON; + vp->v_fbhv = NULL; + +#ifdef XFS_VNODE_TRACE + ktrace_free(vp->v_trace); + vp->v_trace = NULL; +#endif + + return 0; +} + +STATIC void +vn_wakeup( + struct vnode *vp) +{ + VN_LOCK(vp); + if (vp->v_flag & VWAIT) + sv_broadcast(vptosync(vp)); + vp->v_flag &= ~(VRECLM|VWAIT|VMODIFIED); + VN_UNLOCK(vp, 0); +} + +int +vn_wait( + struct vnode *vp) +{ + VN_LOCK(vp); + if (vp->v_flag & (VINACT | VRECLM)) { + vp->v_flag |= VWAIT; + sv_wait(vptosync(vp), PINOD, &vp->v_lock, 0); + return 1; + } + VN_UNLOCK(vp, 0); + return 0; +} + +struct vnode * +vn_initialize( + struct inode *inode) +{ + struct vnode *vp = LINVFS_GET_VP(inode); + + XFS_STATS_INC(vn_active); + XFS_STATS_INC(vn_alloc); + + vp->v_flag = VMODIFIED; + spinlock_init(&vp->v_lock, "v_lock"); + + spin_lock(&vnumber_lock); + if (!++vn_generation) /* v_number shouldn't be zero */ + vn_generation++; + vp->v_number = vn_generation; + spin_unlock(&vnumber_lock); + + ASSERT(VN_CACHED(vp) == 0); + + /* Initialize the first behavior and the behavior chain head. */ + vn_bhv_head_init(VN_BHV_HEAD(vp), "vnode"); + +#ifdef XFS_VNODE_TRACE + vp->v_trace = ktrace_alloc(VNODE_TRACE_SIZE, KM_SLEEP); + printk("Allocated VNODE_TRACE at 0x%p\n", vp->v_trace); +#endif /* XFS_VNODE_TRACE */ + + vn_trace_exit(vp, "vn_initialize", (inst_t *)__return_address); + return vp; +} + +/* + * Get a reference on a vnode. + */ +vnode_t * +vn_get( + struct vnode *vp, + vmap_t *vmap) +{ + struct inode *inode; + + XFS_STATS_INC(vn_get); + inode = LINVFS_GET_IP(vp); + if (inode->i_state & I_FREEING) + return NULL; + + inode = ilookup(vmap->v_vfsp->vfs_super, vmap->v_ino); + if (!inode) /* Inode not present */ + return NULL; + + vn_trace_exit(vp, "vn_get", (inst_t *)__return_address); + + return vp; +} + +/* + * Revalidate the Linux inode from the vattr. + * Note: i_size _not_ updated; we must hold the inode + * semaphore when doing that - callers responsibility. + */ +void +vn_revalidate_core( + struct vnode *vp, + vattr_t *vap) +{ + struct inode *inode = LINVFS_GET_IP(vp); + + inode->i_mode = VTTOIF(vap->va_type) | vap->va_mode; + inode->i_nlink = vap->va_nlink; + inode->i_uid = vap->va_uid; + inode->i_gid = vap->va_gid; + inode->i_blocks = vap->va_nblocks; + inode->i_mtime = vap->va_mtime; + inode->i_ctime = vap->va_ctime; + inode->i_atime = vap->va_atime; + if (vap->va_xflags & XFS_XFLAG_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + if (vap->va_xflags & XFS_XFLAG_APPEND) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; + if (vap->va_xflags & XFS_XFLAG_SYNC) + inode->i_flags |= S_SYNC; + else + inode->i_flags &= ~S_SYNC; + if (vap->va_xflags & XFS_XFLAG_NOATIME) + inode->i_flags |= S_NOATIME; + else + inode->i_flags &= ~S_NOATIME; +} + +/* + * Revalidate the Linux inode from the vnode. + */ +int +vn_revalidate( + struct vnode *vp) +{ + vattr_t va; + int error; + + vn_trace_entry(vp, "vn_revalidate", (inst_t *)__return_address); + ASSERT(vp->v_fbhv != NULL); + + va.va_mask = XFS_AT_STAT|XFS_AT_XFLAGS; + VOP_GETATTR(vp, &va, 0, NULL, error); + if (!error) { + vn_revalidate_core(vp, &va); + VUNMODIFY(vp); + } + return -error; +} + +/* + * purge a vnode from the cache + * At this point the vnode is guaranteed to have no references (vn_count == 0) + * The caller has to make sure that there are no ways someone could + * get a handle (via vn_get) on the vnode (usually done via a mount/vfs lock). + */ +void +vn_purge( + struct vnode *vp, + vmap_t *vmap) +{ + vn_trace_entry(vp, "vn_purge", (inst_t *)__return_address); + +again: + /* + * Check whether vp has already been reclaimed since our caller + * sampled its version while holding a filesystem cache lock that + * its VOP_RECLAIM function acquires. + */ + VN_LOCK(vp); + if (vp->v_number != vmap->v_number) { + VN_UNLOCK(vp, 0); + return; + } + + /* + * If vp is being reclaimed or inactivated, wait until it is inert, + * then proceed. Can't assume that vnode is actually reclaimed + * just because the reclaimed flag is asserted -- a vn_alloc + * reclaim can fail. + */ + if (vp->v_flag & (VINACT | VRECLM)) { + ASSERT(vn_count(vp) == 0); + vp->v_flag |= VWAIT; + sv_wait(vptosync(vp), PINOD, &vp->v_lock, 0); + goto again; + } + + /* + * Another process could have raced in and gotten this vnode... + */ + if (vn_count(vp) > 0) { + VN_UNLOCK(vp, 0); + return; + } + + XFS_STATS_DEC(vn_active); + vp->v_flag |= VRECLM; + VN_UNLOCK(vp, 0); + + /* + * Call VOP_RECLAIM and clean vp. The FSYNC_INVAL flag tells + * vp's filesystem to flush and invalidate all cached resources. + * When vn_reclaim returns, vp should have no private data, + * either in a system cache or attached to v_data. + */ + if (vn_reclaim(vp) != 0) + panic("vn_purge: cannot reclaim"); + + /* + * Wakeup anyone waiting for vp to be reclaimed. + */ + vn_wakeup(vp); +} + +/* + * Add a reference to a referenced vnode. + */ +struct vnode * +vn_hold( + struct vnode *vp) +{ + struct inode *inode; + + XFS_STATS_INC(vn_hold); + + VN_LOCK(vp); + inode = igrab(LINVFS_GET_IP(vp)); + ASSERT(inode); + VN_UNLOCK(vp, 0); + + return vp; +} + +/* + * Call VOP_INACTIVE on last reference. + */ +void +vn_rele( + struct vnode *vp) +{ + int vcnt; + int cache; + + XFS_STATS_INC(vn_rele); + + VN_LOCK(vp); + + vn_trace_entry(vp, "vn_rele", (inst_t *)__return_address); + vcnt = vn_count(vp); + + /* + * Since we always get called from put_inode we know + * that i_count won't be decremented after we + * return. + */ + if (!vcnt) { + /* + * As soon as we turn this on, noone can find us in vn_get + * until we turn off VINACT or VRECLM + */ + vp->v_flag |= VINACT; + VN_UNLOCK(vp, 0); + + /* + * Do not make the VOP_INACTIVE call if there + * are no behaviors attached to the vnode to call. + */ + if (vp->v_fbhv) + VOP_INACTIVE(vp, NULL, cache); + + VN_LOCK(vp); + if (vp->v_flag & VWAIT) + sv_broadcast(vptosync(vp)); + + vp->v_flag &= ~(VINACT|VWAIT|VRECLM|VMODIFIED); + } + + VN_UNLOCK(vp, 0); + + vn_trace_exit(vp, "vn_rele", (inst_t *)__return_address); +} + +/* + * Finish the removal of a vnode. + */ +void +vn_remove( + struct vnode *vp) +{ + vmap_t vmap; + + /* Make sure we don't do this to the same vnode twice */ + if (!(vp->v_fbhv)) + return; + + XFS_STATS_INC(vn_remove); + vn_trace_exit(vp, "vn_remove", (inst_t *)__return_address); + + /* + * After the following purge the vnode + * will no longer exist. + */ + VMAP(vp, vmap); + vn_purge(vp, &vmap); +} + + +#ifdef XFS_VNODE_TRACE + +#define KTRACE_ENTER(vp, vk, s, line, ra) \ + ktrace_enter( (vp)->v_trace, \ +/* 0 */ (void *)(__psint_t)(vk), \ +/* 1 */ (void *)(s), \ +/* 2 */ (void *)(__psint_t) line, \ +/* 3 */ (void *)(vn_count(vp)), \ +/* 4 */ (void *)(ra), \ +/* 5 */ (void *)(__psunsigned_t)(vp)->v_flag, \ +/* 6 */ (void *)(__psint_t)current_cpu(), \ +/* 7 */ (void *)(__psint_t)current_pid(), \ +/* 8 */ (void *)__return_address, \ +/* 9 */ 0, 0, 0, 0, 0, 0, 0) + +/* + * Vnode tracing code. + */ +void +vn_trace_entry(vnode_t *vp, char *func, inst_t *ra) +{ + KTRACE_ENTER(vp, VNODE_KTRACE_ENTRY, func, 0, ra); +} + +void +vn_trace_exit(vnode_t *vp, char *func, inst_t *ra) +{ + KTRACE_ENTER(vp, VNODE_KTRACE_EXIT, func, 0, ra); +} + +void +vn_trace_hold(vnode_t *vp, char *file, int line, inst_t *ra) +{ + KTRACE_ENTER(vp, VNODE_KTRACE_HOLD, file, line, ra); +} + +void +vn_trace_ref(vnode_t *vp, char *file, int line, inst_t *ra) +{ + KTRACE_ENTER(vp, VNODE_KTRACE_REF, file, line, ra); +} + +void +vn_trace_rele(vnode_t *vp, char *file, int line, inst_t *ra) +{ + KTRACE_ENTER(vp, VNODE_KTRACE_RELE, file, line, ra); +} +#endif /* XFS_VNODE_TRACE */ diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h new file mode 100644 index 000000000000..da76c1f1e11c --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -0,0 +1,666 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * Further, this software is distributed without any warranty that it is + * free of the rightful claim of any third person regarding infringement + * or the like. Any license provided herein, whether implied or + * otherwise, applies only to this software file. Patent licenses, if + * any, provided herein do not apply to combinations of this program with + * other software, or any other product whatsoever. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy, + * Mountain View, CA 94043, or: + * + * http://www.sgi.com + * + * For further information regarding this notice, see: + * + * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + * + * Portions Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#ifndef __XFS_VNODE_H__ +#define __XFS_VNODE_H__ + +struct uio; +struct file; +struct vattr; +struct xfs_iomap; +struct attrlist_cursor_kern; + +/* + * Vnode types. VNON means no type. + */ +enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VFIFO, VBAD, VSOCK }; + +typedef xfs_ino_t vnumber_t; +typedef struct dentry vname_t; +typedef bhv_head_t vn_bhv_head_t; + +/* + * MP locking protocols: + * v_flag, v_vfsp VN_LOCK/VN_UNLOCK + * v_type read-only or fs-dependent + */ +typedef struct vnode { + __u32 v_flag; /* vnode flags (see below) */ + enum vtype v_type; /* vnode type */ + struct vfs *v_vfsp; /* ptr to containing VFS */ + vnumber_t v_number; /* in-core vnode number */ + vn_bhv_head_t v_bh; /* behavior head */ + spinlock_t v_lock; /* VN_LOCK/VN_UNLOCK */ + struct inode v_inode; /* Linux inode */ +#ifdef XFS_VNODE_TRACE + struct ktrace *v_trace; /* trace header structure */ +#endif +} vnode_t; + +#define v_fbhv v_bh.bh_first /* first behavior */ +#define v_fops v_bh.bh_first->bd_ops /* first behavior ops */ + +#define VNODE_POSITION_BASE BHV_POSITION_BASE /* chain bottom */ +#define VNODE_POSITION_TOP BHV_POSITION_TOP /* chain top */ +#define VNODE_POSITION_INVALID BHV_POSITION_INVALID /* invalid pos. num */ + +typedef enum { + VN_BHV_UNKNOWN, /* not specified */ + VN_BHV_XFS, /* xfs */ + VN_BHV_DM, /* data migration */ + VN_BHV_QM, /* quota manager */ + VN_BHV_IO, /* IO path */ + VN_BHV_END /* housekeeping end-of-range */ +} vn_bhv_t; + +#define VNODE_POSITION_XFS (VNODE_POSITION_BASE) +#define VNODE_POSITION_DM (VNODE_POSITION_BASE+10) +#define VNODE_POSITION_QM (VNODE_POSITION_BASE+20) +#define VNODE_POSITION_IO (VNODE_POSITION_BASE+30) + +/* + * Macros for dealing with the behavior descriptor inside of the vnode. + */ +#define BHV_TO_VNODE(bdp) ((vnode_t *)BHV_VOBJ(bdp)) +#define BHV_TO_VNODE_NULL(bdp) ((vnode_t *)BHV_VOBJNULL(bdp)) + +#define VN_BHV_HEAD(vp) ((bhv_head_t *)(&((vp)->v_bh))) +#define vn_bhv_head_init(bhp,name) bhv_head_init(bhp,name) +#define vn_bhv_remove(bhp,bdp) bhv_remove(bhp,bdp) +#define vn_bhv_lookup(bhp,ops) bhv_lookup(bhp,ops) +#define vn_bhv_lookup_unlocked(bhp,ops) bhv_lookup_unlocked(bhp,ops) + +/* + * Vnode to Linux inode mapping. + */ +#define LINVFS_GET_VP(inode) ((vnode_t *)list_entry(inode, vnode_t, v_inode)) +#define LINVFS_GET_IP(vp) (&(vp)->v_inode) + +/* + * Convert between vnode types and inode formats (since POSIX.1 + * defines mode word of stat structure in terms of inode formats). + */ +extern enum vtype iftovt_tab[]; +extern u_short vttoif_tab[]; +#define IFTOVT(mode) (iftovt_tab[((mode) & S_IFMT) >> 12]) +#define VTTOIF(indx) (vttoif_tab[(int)(indx)]) +#define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode)) + + +/* + * Vnode flags. + */ +#define VINACT 0x1 /* vnode is being inactivated */ +#define VRECLM 0x2 /* vnode is being reclaimed */ +#define VWAIT 0x4 /* waiting for VINACT/VRECLM to end */ +#define VMODIFIED 0x8 /* XFS inode state possibly differs */ + /* to the Linux inode state. */ + +/* + * Values for the VOP_RWLOCK and VOP_RWUNLOCK flags parameter. + */ +typedef enum vrwlock { + VRWLOCK_NONE, + VRWLOCK_READ, + VRWLOCK_WRITE, + VRWLOCK_WRITE_DIRECT, + VRWLOCK_TRY_READ, + VRWLOCK_TRY_WRITE +} vrwlock_t; + +/* + * Return values for VOP_INACTIVE. A return value of + * VN_INACTIVE_NOCACHE implies that the file system behavior + * has disassociated its state and bhv_desc_t from the vnode. + */ +#define VN_INACTIVE_CACHE 0 +#define VN_INACTIVE_NOCACHE 1 + +/* + * Values for the cmd code given to VOP_VNODE_CHANGE. + */ +typedef enum vchange { + VCHANGE_FLAGS_FRLOCKS = 0, + VCHANGE_FLAGS_ENF_LOCKING = 1, + VCHANGE_FLAGS_TRUNCATED = 2, + VCHANGE_FLAGS_PAGE_DIRTY = 3, + VCHANGE_FLAGS_IOEXCL_COUNT = 4 +} vchange_t; + + +typedef int (*vop_open_t)(bhv_desc_t *, struct cred *); +typedef ssize_t (*vop_read_t)(bhv_desc_t *, struct kiocb *, + const struct iovec *, unsigned int, + loff_t *, int, struct cred *); +typedef ssize_t (*vop_write_t)(bhv_desc_t *, struct kiocb *, + const struct iovec *, unsigned int, + loff_t *, int, struct cred *); +typedef ssize_t (*vop_sendfile_t)(bhv_desc_t *, struct file *, + loff_t *, int, size_t, read_actor_t, + void *, struct cred *); +typedef int (*vop_ioctl_t)(bhv_desc_t *, struct inode *, struct file *, + int, unsigned int, void __user *); +typedef int (*vop_getattr_t)(bhv_desc_t *, struct vattr *, int, + struct cred *); +typedef int (*vop_setattr_t)(bhv_desc_t *, struct vattr *, int, + struct cred *); +typedef int (*vop_access_t)(bhv_desc_t *, int, struct cred *); +typedef int (*vop_lookup_t)(bhv_desc_t *, vname_t *, vnode_t **, + int, vnode_t *, struct cred *); +typedef int (*vop_create_t)(bhv_desc_t *, vname_t *, struct vattr *, + vnode_t **, struct cred *); +typedef int (*vop_remove_t)(bhv_desc_t *, vname_t *, struct cred *); +typedef int (*vop_link_t)(bhv_desc_t *, vnode_t *, vname_t *, + struct cred *); +typedef int (*vop_rename_t)(bhv_desc_t *, vname_t *, vnode_t *, vname_t *, + struct cred *); +typedef int (*vop_mkdir_t)(bhv_desc_t *, vname_t *, struct vattr *, + vnode_t **, struct cred *); +typedef int (*vop_rmdir_t)(bhv_desc_t *, vname_t *, struct cred *); +typedef int (*vop_readdir_t)(bhv_desc_t *, struct uio *, struct cred *, + int *); +typedef int (*vop_symlink_t)(bhv_desc_t *, vname_t *, struct vattr *, + char *, vnode_t **, struct cred *); +typedef int (*vop_readlink_t)(bhv_desc_t *, struct uio *, int, + struct cred *); +typedef int (*vop_fsync_t)(bhv_desc_t *, int, struct cred *, + xfs_off_t, xfs_off_t); +typedef int (*vop_inactive_t)(bhv_desc_t *, struct cred *); +typedef int (*vop_fid2_t)(bhv_desc_t *, struct fid *); +typedef int (*vop_release_t)(bhv_desc_t *); +typedef int (*vop_rwlock_t)(bhv_desc_t *, vrwlock_t); +typedef void (*vop_rwunlock_t)(bhv_desc_t *, vrwlock_t); +typedef int (*vop_bmap_t)(bhv_desc_t *, xfs_off_t, ssize_t, int, + struct xfs_iomap *, int *); +typedef int (*vop_reclaim_t)(bhv_desc_t *); +typedef int (*vop_attr_get_t)(bhv_desc_t *, char *, char *, int *, int, + struct cred *); +typedef int (*vop_attr_set_t)(bhv_desc_t *, char *, char *, int, int, + struct cred *); +typedef int (*vop_attr_remove_t)(bhv_desc_t *, char *, int, struct cred *); +typedef int (*vop_attr_list_t)(bhv_desc_t *, char *, int, int, + struct attrlist_cursor_kern *, struct cred *); +typedef void (*vop_link_removed_t)(bhv_desc_t *, vnode_t *, int); +typedef void (*vop_vnode_change_t)(bhv_desc_t *, vchange_t, __psint_t); +typedef void (*vop_ptossvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int); +typedef void (*vop_pflushinvalvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int); +typedef int (*vop_pflushvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, + uint64_t, int); +typedef int (*vop_iflush_t)(bhv_desc_t *, int); + + +typedef struct vnodeops { + bhv_position_t vn_position; /* position within behavior chain */ + vop_open_t vop_open; + vop_read_t vop_read; + vop_write_t vop_write; + vop_sendfile_t vop_sendfile; + vop_ioctl_t vop_ioctl; + vop_getattr_t vop_getattr; + vop_setattr_t vop_setattr; + vop_access_t vop_access; + vop_lookup_t vop_lookup; + vop_create_t vop_create; + vop_remove_t vop_remove; + vop_link_t vop_link; + vop_rename_t vop_rename; + vop_mkdir_t vop_mkdir; + vop_rmdir_t vop_rmdir; + vop_readdir_t vop_readdir; + vop_symlink_t vop_symlink; + vop_readlink_t vop_readlink; + vop_fsync_t vop_fsync; + vop_inactive_t vop_inactive; + vop_fid2_t vop_fid2; + vop_rwlock_t vop_rwlock; + vop_rwunlock_t vop_rwunlock; + vop_bmap_t vop_bmap; + vop_reclaim_t vop_reclaim; + vop_attr_get_t vop_attr_get; + vop_attr_set_t vop_attr_set; + vop_attr_remove_t vop_attr_remove; + vop_attr_list_t vop_attr_list; + vop_link_removed_t vop_link_removed; + vop_vnode_change_t vop_vnode_change; + vop_ptossvp_t vop_tosspages; + vop_pflushinvalvp_t vop_flushinval_pages; + vop_pflushvp_t vop_flush_pages; + vop_release_t vop_release; + vop_iflush_t vop_iflush; +} vnodeops_t; + +/* + * VOP's. + */ +#define _VOP_(op, vp) (*((vnodeops_t *)(vp)->v_fops)->op) + +#define VOP_READ(vp,file,iov,segs,offset,ioflags,cr,rv) \ + rv = _VOP_(vop_read, vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr) +#define VOP_WRITE(vp,file,iov,segs,offset,ioflags,cr,rv) \ + rv = _VOP_(vop_write, vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr) +#define VOP_SENDFILE(vp,f,off,ioflags,cnt,act,targ,cr,rv) \ + rv = _VOP_(vop_sendfile, vp)((vp)->v_fbhv,f,off,ioflags,cnt,act,targ,cr) +#define VOP_BMAP(vp,of,sz,rw,b,n,rv) \ + rv = _VOP_(vop_bmap, vp)((vp)->v_fbhv,of,sz,rw,b,n) +#define VOP_OPEN(vp, cr, rv) \ + rv = _VOP_(vop_open, vp)((vp)->v_fbhv, cr) +#define VOP_GETATTR(vp, vap, f, cr, rv) \ + rv = _VOP_(vop_getattr, vp)((vp)->v_fbhv, vap, f, cr) +#define VOP_SETATTR(vp, vap, f, cr, rv) \ + rv = _VOP_(vop_setattr, vp)((vp)->v_fbhv, vap, f, cr) +#define VOP_ACCESS(vp, mode, cr, rv) \ + rv = _VOP_(vop_access, vp)((vp)->v_fbhv, mode, cr) +#define VOP_LOOKUP(vp,d,vpp,f,rdir,cr,rv) \ + rv = _VOP_(vop_lookup, vp)((vp)->v_fbhv,d,vpp,f,rdir,cr) +#define VOP_CREATE(dvp,d,vap,vpp,cr,rv) \ + rv = _VOP_(vop_create, dvp)((dvp)->v_fbhv,d,vap,vpp,cr) +#define VOP_REMOVE(dvp,d,cr,rv) \ + rv = _VOP_(vop_remove, dvp)((dvp)->v_fbhv,d,cr) +#define VOP_LINK(tdvp,fvp,d,cr,rv) \ + rv = _VOP_(vop_link, tdvp)((tdvp)->v_fbhv,fvp,d,cr) +#define VOP_RENAME(fvp,fnm,tdvp,tnm,cr,rv) \ + rv = _VOP_(vop_rename, fvp)((fvp)->v_fbhv,fnm,tdvp,tnm,cr) +#define VOP_MKDIR(dp,d,vap,vpp,cr,rv) \ + rv = _VOP_(vop_mkdir, dp)((dp)->v_fbhv,d,vap,vpp,cr) +#define VOP_RMDIR(dp,d,cr,rv) \ + rv = _VOP_(vop_rmdir, dp)((dp)->v_fbhv,d,cr) +#define VOP_READDIR(vp,uiop,cr,eofp,rv) \ + rv = _VOP_(vop_readdir, vp)((vp)->v_fbhv,uiop,cr,eofp) +#define VOP_SYMLINK(dvp,d,vap,tnm,vpp,cr,rv) \ + rv = _VOP_(vop_symlink, dvp) ((dvp)->v_fbhv,d,vap,tnm,vpp,cr) +#define VOP_READLINK(vp,uiop,fl,cr,rv) \ + rv = _VOP_(vop_readlink, vp)((vp)->v_fbhv,uiop,fl,cr) +#define VOP_FSYNC(vp,f,cr,b,e,rv) \ + rv = _VOP_(vop_fsync, vp)((vp)->v_fbhv,f,cr,b,e) +#define VOP_INACTIVE(vp, cr, rv) \ + rv = _VOP_(vop_inactive, vp)((vp)->v_fbhv, cr) +#define VOP_RELEASE(vp, rv) \ + rv = _VOP_(vop_release, vp)((vp)->v_fbhv) +#define VOP_FID2(vp, fidp, rv) \ + rv = _VOP_(vop_fid2, vp)((vp)->v_fbhv, fidp) +#define VOP_RWLOCK(vp,i) \ + (void)_VOP_(vop_rwlock, vp)((vp)->v_fbhv, i) +#define VOP_RWLOCK_TRY(vp,i) \ + _VOP_(vop_rwlock, vp)((vp)->v_fbhv, i) +#define VOP_RWUNLOCK(vp,i) \ + (void)_VOP_(vop_rwunlock, vp)((vp)->v_fbhv, i) +#define VOP_FRLOCK(vp,c,fl,flags,offset,fr,rv) \ + rv = _VOP_(vop_frlock, vp)((vp)->v_fbhv,c,fl,flags,offset,fr) +#define VOP_RECLAIM(vp, rv) \ + rv = _VOP_(vop_reclaim, vp)((vp)->v_fbhv) +#define VOP_ATTR_GET(vp, name, val, vallenp, fl, cred, rv) \ + rv = _VOP_(vop_attr_get, vp)((vp)->v_fbhv,name,val,vallenp,fl,cred) +#define VOP_ATTR_SET(vp, name, val, vallen, fl, cred, rv) \ + rv = _VOP_(vop_attr_set, vp)((vp)->v_fbhv,name,val,vallen,fl,cred) +#define VOP_ATTR_REMOVE(vp, name, flags, cred, rv) \ + rv = _VOP_(vop_attr_remove, vp)((vp)->v_fbhv,name,flags,cred) +#define VOP_ATTR_LIST(vp, buf, buflen, fl, cursor, cred, rv) \ + rv = _VOP_(vop_attr_list, vp)((vp)->v_fbhv,buf,buflen,fl,cursor,cred) +#define VOP_LINK_REMOVED(vp, dvp, linkzero) \ + (void)_VOP_(vop_link_removed, vp)((vp)->v_fbhv, dvp, linkzero) +#define VOP_VNODE_CHANGE(vp, cmd, val) \ + (void)_VOP_(vop_vnode_change, vp)((vp)->v_fbhv,cmd,val) +/* + * These are page cache functions that now go thru VOPs. + * 'last' parameter is unused and left in for IRIX compatibility + */ +#define VOP_TOSS_PAGES(vp, first, last, fiopt) \ + _VOP_(vop_tosspages, vp)((vp)->v_fbhv,first, last, fiopt) +/* + * 'last' parameter is unused and left in for IRIX compatibility + */ +#define VOP_FLUSHINVAL_PAGES(vp, first, last, fiopt) \ + _VOP_(vop_flushinval_pages, vp)((vp)->v_fbhv,first,last,fiopt) +/* + * 'last' parameter is unused and left in for IRIX compatibility + */ +#define VOP_FLUSH_PAGES(vp, first, last, flags, fiopt, rv) \ + rv = _VOP_(vop_flush_pages, vp)((vp)->v_fbhv,first,last,flags,fiopt) +#define VOP_IOCTL(vp, inode, filp, fl, cmd, arg, rv) \ + rv = _VOP_(vop_ioctl, vp)((vp)->v_fbhv,inode,filp,fl,cmd,arg) +#define VOP_IFLUSH(vp, flags, rv) \ + rv = _VOP_(vop_iflush, vp)((vp)->v_fbhv, flags) + +/* + * Flags for read/write calls - same values as IRIX + */ +#define IO_ISAIO 0x00001 /* don't wait for completion */ +#define IO_ISDIRECT 0x00004 /* bypass page cache */ +#define IO_INVIS 0x00020 /* don't update inode timestamps */ + +/* + * Flags for VOP_IFLUSH call + */ +#define FLUSH_SYNC 1 /* wait for flush to complete */ +#define FLUSH_INODE 2 /* flush the inode itself */ +#define FLUSH_LOG 4 /* force the last log entry for + * this inode out to disk */ + +/* + * Flush/Invalidate options for VOP_TOSS_PAGES, VOP_FLUSHINVAL_PAGES and + * VOP_FLUSH_PAGES. + */ +#define FI_NONE 0 /* none */ +#define FI_REMAPF 1 /* Do a remapf prior to the operation */ +#define FI_REMAPF_LOCKED 2 /* Do a remapf prior to the operation. + Prevent VM access to the pages until + the operation completes. */ + +/* + * Vnode attributes. va_mask indicates those attributes the caller + * wants to set or extract. + */ +typedef struct vattr { + int va_mask; /* bit-mask of attributes present */ + enum vtype va_type; /* vnode type (for create) */ + mode_t va_mode; /* file access mode and type */ + nlink_t va_nlink; /* number of references to file */ + uid_t va_uid; /* owner user id */ + gid_t va_gid; /* owner group id */ + xfs_ino_t va_nodeid; /* file id */ + xfs_off_t va_size; /* file size in bytes */ + u_long va_blocksize; /* blocksize preferred for i/o */ + struct timespec va_atime; /* time of last access */ + struct timespec va_mtime; /* time of last modification */ + struct timespec va_ctime; /* time file changed */ + u_int va_gen; /* generation number of file */ + xfs_dev_t va_rdev; /* device the special file represents */ + __int64_t va_nblocks; /* number of blocks allocated */ + u_long va_xflags; /* random extended file flags */ + u_long va_extsize; /* file extent size */ + u_long va_nextents; /* number of extents in file */ + u_long va_anextents; /* number of attr extents in file */ + int va_projid; /* project id */ +} vattr_t; + +/* + * setattr or getattr attributes + */ +#define XFS_AT_TYPE 0x00000001 +#define XFS_AT_MODE 0x00000002 +#define XFS_AT_UID 0x00000004 +#define XFS_AT_GID 0x00000008 +#define XFS_AT_FSID 0x00000010 +#define XFS_AT_NODEID 0x00000020 +#define XFS_AT_NLINK 0x00000040 +#define XFS_AT_SIZE 0x00000080 +#define XFS_AT_ATIME 0x00000100 +#define XFS_AT_MTIME 0x00000200 +#define XFS_AT_CTIME 0x00000400 +#define XFS_AT_RDEV 0x00000800 +#define XFS_AT_BLKSIZE 0x00001000 +#define XFS_AT_NBLOCKS 0x00002000 +#define XFS_AT_VCODE 0x00004000 +#define XFS_AT_MAC 0x00008000 +#define XFS_AT_UPDATIME 0x00010000 +#define XFS_AT_UPDMTIME 0x00020000 +#define XFS_AT_UPDCTIME 0x00040000 +#define XFS_AT_ACL 0x00080000 +#define XFS_AT_CAP 0x00100000 +#define XFS_AT_INF 0x00200000 +#define XFS_AT_XFLAGS 0x00400000 +#define XFS_AT_EXTSIZE 0x00800000 +#define XFS_AT_NEXTENTS 0x01000000 +#define XFS_AT_ANEXTENTS 0x02000000 +#define XFS_AT_PROJID 0x04000000 +#define XFS_AT_SIZE_NOPERM 0x08000000 +#define XFS_AT_GENCOUNT 0x10000000 + +#define XFS_AT_ALL (XFS_AT_TYPE|XFS_AT_MODE|XFS_AT_UID|XFS_AT_GID|\ + XFS_AT_FSID|XFS_AT_NODEID|XFS_AT_NLINK|XFS_AT_SIZE|\ + XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME|XFS_AT_RDEV|\ + XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_VCODE|XFS_AT_MAC|\ + XFS_AT_ACL|XFS_AT_CAP|XFS_AT_INF|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|\ + XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|XFS_AT_PROJID|XFS_AT_GENCOUNT) + +#define XFS_AT_STAT (XFS_AT_TYPE|XFS_AT_MODE|XFS_AT_UID|XFS_AT_GID|\ + XFS_AT_FSID|XFS_AT_NODEID|XFS_AT_NLINK|XFS_AT_SIZE|\ + XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME|XFS_AT_RDEV|\ + XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_PROJID) + +#define XFS_AT_TIMES (XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME) + +#define XFS_AT_UPDTIMES (XFS_AT_UPDATIME|XFS_AT_UPDMTIME|XFS_AT_UPDCTIME) + +#define XFS_AT_NOSET (XFS_AT_NLINK|XFS_AT_RDEV|XFS_AT_FSID|XFS_AT_NODEID|\ + XFS_AT_TYPE|XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_VCODE|\ + XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|XFS_AT_GENCOUNT) + +/* + * Modes. + */ +#define VSUID S_ISUID /* set user id on execution */ +#define VSGID S_ISGID /* set group id on execution */ +#define VSVTX S_ISVTX /* save swapped text even after use */ +#define VREAD S_IRUSR /* read, write, execute permissions */ +#define VWRITE S_IWUSR +#define VEXEC S_IXUSR + +#define MODEMASK S_IALLUGO /* mode bits plus permission bits */ + +/* + * Check whether mandatory file locking is enabled. + */ +#define MANDLOCK(vp, mode) \ + ((vp)->v_type == VREG && ((mode) & (VSGID|(VEXEC>>3))) == VSGID) + +extern void vn_init(void); +extern int vn_wait(struct vnode *); +extern vnode_t *vn_initialize(struct inode *); + +/* + * Acquiring and invalidating vnodes: + * + * if (vn_get(vp, version, 0)) + * ...; + * vn_purge(vp, version); + * + * vn_get and vn_purge must be called with vmap_t arguments, sampled + * while a lock that the vnode's VOP_RECLAIM function acquires is + * held, to ensure that the vnode sampled with the lock held isn't + * recycled (VOP_RECLAIMed) or deallocated between the release of the lock + * and the subsequent vn_get or vn_purge. + */ + +/* + * vnode_map structures _must_ match vn_epoch and vnode structure sizes. + */ +typedef struct vnode_map { + vfs_t *v_vfsp; + vnumber_t v_number; /* in-core vnode number */ + xfs_ino_t v_ino; /* inode # */ +} vmap_t; + +#define VMAP(vp, vmap) {(vmap).v_vfsp = (vp)->v_vfsp, \ + (vmap).v_number = (vp)->v_number, \ + (vmap).v_ino = (vp)->v_inode.i_ino; } + +extern void vn_purge(struct vnode *, vmap_t *); +extern vnode_t *vn_get(struct vnode *, vmap_t *); +extern int vn_revalidate(struct vnode *); +extern void vn_revalidate_core(struct vnode *, vattr_t *); +extern void vn_remove(struct vnode *); + +static inline int vn_count(struct vnode *vp) +{ + return atomic_read(&LINVFS_GET_IP(vp)->i_count); +} + +/* + * Vnode reference counting functions (and macros for compatibility). + */ +extern vnode_t *vn_hold(struct vnode *); +extern void vn_rele(struct vnode *); + +#if defined(XFS_VNODE_TRACE) +#define VN_HOLD(vp) \ + ((void)vn_hold(vp), \ + vn_trace_hold(vp, __FILE__, __LINE__, (inst_t *)__return_address)) +#define VN_RELE(vp) \ + (vn_trace_rele(vp, __FILE__, __LINE__, (inst_t *)__return_address), \ + iput(LINVFS_GET_IP(vp))) +#else +#define VN_HOLD(vp) ((void)vn_hold(vp)) +#define VN_RELE(vp) (iput(LINVFS_GET_IP(vp))) +#endif + +/* + * Vname handling macros. + */ +#define VNAME(dentry) ((char *) (dentry)->d_name.name) +#define VNAMELEN(dentry) ((dentry)->d_name.len) +#define VNAME_TO_VNODE(dentry) (LINVFS_GET_VP((dentry)->d_inode)) + +/* + * Vnode spinlock manipulation. + */ +#define VN_LOCK(vp) mutex_spinlock(&(vp)->v_lock) +#define VN_UNLOCK(vp, s) mutex_spinunlock(&(vp)->v_lock, s) +#define VN_FLAGSET(vp,b) vn_flagset(vp,b) +#define VN_FLAGCLR(vp,b) vn_flagclr(vp,b) + +static __inline__ void vn_flagset(struct vnode *vp, uint flag) +{ + spin_lock(&vp->v_lock); + vp->v_flag |= flag; + spin_unlock(&vp->v_lock); +} + +static __inline__ void vn_flagclr(struct vnode *vp, uint flag) +{ + spin_lock(&vp->v_lock); + vp->v_flag &= ~flag; + spin_unlock(&vp->v_lock); +} + +/* + * Update modify/access/change times on the vnode + */ +#define VN_MTIMESET(vp, tvp) (LINVFS_GET_IP(vp)->i_mtime = *(tvp)) +#define VN_ATIMESET(vp, tvp) (LINVFS_GET_IP(vp)->i_atime = *(tvp)) +#define VN_CTIMESET(vp, tvp) (LINVFS_GET_IP(vp)->i_ctime = *(tvp)) + +/* + * Dealing with bad inodes + */ +static inline void vn_mark_bad(struct vnode *vp) +{ + make_bad_inode(LINVFS_GET_IP(vp)); +} + +static inline int VN_BAD(struct vnode *vp) +{ + return is_bad_inode(LINVFS_GET_IP(vp)); +} + +/* + * Some useful predicates. + */ +#define VN_MAPPED(vp) mapping_mapped(LINVFS_GET_IP(vp)->i_mapping) +#define VN_CACHED(vp) (LINVFS_GET_IP(vp)->i_mapping->nrpages) +#define VN_DIRTY(vp) mapping_tagged(LINVFS_GET_IP(vp)->i_mapping, \ + PAGECACHE_TAG_DIRTY) +#define VMODIFY(vp) VN_FLAGSET(vp, VMODIFIED) +#define VUNMODIFY(vp) VN_FLAGCLR(vp, VMODIFIED) + +/* + * Flags to VOP_SETATTR/VOP_GETATTR. + */ +#define ATTR_UTIME 0x01 /* non-default utime(2) request */ +#define ATTR_DMI 0x08 /* invocation from a DMI function */ +#define ATTR_LAZY 0x80 /* set/get attributes lazily */ +#define ATTR_NONBLOCK 0x100 /* return EAGAIN if operation would block */ + +/* + * Flags to VOP_FSYNC and VOP_RECLAIM. + */ +#define FSYNC_NOWAIT 0 /* asynchronous flush */ +#define FSYNC_WAIT 0x1 /* synchronous fsync or forced reclaim */ +#define FSYNC_INVAL 0x2 /* flush and invalidate cached data */ +#define FSYNC_DATA 0x4 /* synchronous fsync of data only */ + +/* + * Tracking vnode activity. + */ +#if defined(XFS_VNODE_TRACE) + +#define VNODE_TRACE_SIZE 16 /* number of trace entries */ +#define VNODE_KTRACE_ENTRY 1 +#define VNODE_KTRACE_EXIT 2 +#define VNODE_KTRACE_HOLD 3 +#define VNODE_KTRACE_REF 4 +#define VNODE_KTRACE_RELE 5 + +extern void vn_trace_entry(struct vnode *, char *, inst_t *); +extern void vn_trace_exit(struct vnode *, char *, inst_t *); +extern void vn_trace_hold(struct vnode *, char *, int, inst_t *); +extern void vn_trace_ref(struct vnode *, char *, int, inst_t *); +extern void vn_trace_rele(struct vnode *, char *, int, inst_t *); + +#define VN_TRACE(vp) \ + vn_trace_ref(vp, __FILE__, __LINE__, (inst_t *)__return_address) +#else +#define vn_trace_entry(a,b,c) +#define vn_trace_exit(a,b,c) +#define vn_trace_hold(a,b,c,d) +#define vn_trace_ref(a,b,c,d) +#define vn_trace_rele(a,b,c,d) +#define VN_TRACE(vp) +#endif + +#endif /* __XFS_VNODE_H__ */ |