diff options
Diffstat (limited to 'lib/device/dev-io.c')
-rw-r--r-- | lib/device/dev-io.c | 634 |
1 files changed, 175 insertions, 459 deletions
diff --git a/lib/device/dev-io.c b/lib/device/dev-io.c index 3bb9d65..5875e75 100644 --- a/lib/device/dev-io.c +++ b/lib/device/dev-io.c @@ -10,16 +10,13 @@ * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software Foundation, - * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#include "lib.h" -#include "lvm-types.h" -#include "device.h" -#include "metadata.h" -#include "lvmcache.h" -#include "memlock.h" -#include "locking.h" +#include "lib/misc/lib.h" +#include "lib/device/device.h" +#include "lib/metadata/metadata.h" +#include "lib/mm/memlock.h" #include <limits.h> #include <sys/stat.h> @@ -27,7 +24,7 @@ #include <unistd.h> #include <sys/ioctl.h> -#ifdef linux +#ifdef __linux__ # define u64 uint64_t /* Missing without __KERNEL__ */ # undef WNOHANG /* Avoid redefinition */ # undef WUNTRACED /* Avoid redefinition */ @@ -54,186 +51,23 @@ # endif #endif -static DM_LIST_INIT(_open_devices); +static unsigned _dev_size_seqno = 1; -/*----------------------------------------------------------------- - * The standard io loop that keeps submitting an io until it's - * all gone. - *---------------------------------------------------------------*/ -static int _io(struct device_area *where, char *buffer, int should_write) -{ - int fd = dev_fd(where->dev); - ssize_t n = 0; - size_t total = 0; - - if (fd < 0) { - log_error("Attempt to read an unopened device (%s).", - dev_name(where->dev)); - return 0; - } - - /* - * Skip all writes in test mode. - */ - if (should_write && test_mode()) - return 1; - - if (where->size > SSIZE_MAX) { - log_error("Read size too large: %" PRIu64, where->size); - return 0; - } - - if (lseek(fd, (off_t) where->start, SEEK_SET) < 0) { - log_error("%s: lseek %" PRIu64 " failed: %s", - dev_name(where->dev), (uint64_t) where->start, - strerror(errno)); - return 0; - } - - while (total < (size_t) where->size) { - do - n = should_write ? - write(fd, buffer, (size_t) where->size - total) : - read(fd, buffer, (size_t) where->size - total); - while ((n < 0) && ((errno == EINTR) || (errno == EAGAIN))); - - if (n < 0) - log_error_once("%s: %s failed after %" PRIu64 " of %" PRIu64 - " at %" PRIu64 ": %s", dev_name(where->dev), - should_write ? "write" : "read", - (uint64_t) total, - (uint64_t) where->size, - (uint64_t) where->start, strerror(errno)); - - if (n <= 0) - break; - - total += n; - buffer += n; - } - - return (total == (size_t) where->size); -} - -/*----------------------------------------------------------------- - * LVM2 uses O_DIRECT when performing metadata io, which requires - * block size aligned accesses. If any io is not aligned we have - * to perform the io via a bounce buffer, obviously this is quite - * inefficient. - *---------------------------------------------------------------*/ - -/* - * Get the sector size from an _open_ device. - */ -static int _get_block_size(struct device *dev, unsigned int *size) +static int _dev_get_size_file(struct device *dev, uint64_t *size) { const char *name = dev_name(dev); + struct stat info; - if (dev->block_size == -1) { - if (ioctl(dev_fd(dev), BLKBSZGET, &dev->block_size) < 0) { - log_sys_error("ioctl BLKBSZGET", name); - return 0; - } - log_debug("%s: block size is %u bytes", name, dev->block_size); - } - - *size = (unsigned int) dev->block_size; - - return 1; -} - -/* - * Widens a region to be an aligned region. - */ -static void _widen_region(unsigned int block_size, struct device_area *region, - struct device_area *result) -{ - uint64_t mask = block_size - 1, delta; - memcpy(result, region, sizeof(*result)); - - /* adjust the start */ - delta = result->start & mask; - if (delta) { - result->start -= delta; - result->size += delta; - } - - /* adjust the end */ - delta = (result->start + result->size) & mask; - if (delta) - result->size += block_size - delta; -} - -static int _aligned_io(struct device_area *where, char *buffer, - int should_write) -{ - char *bounce, *bounce_buf; - unsigned int block_size = 0; - uintptr_t mask; - struct device_area widened; - int r = 0; - - if (!(where->dev->flags & DEV_REGULAR) && - !_get_block_size(where->dev, &block_size)) + if (dm_list_empty(&dev->aliases)) return_0; - if (!block_size) - block_size = lvm_getpagesize(); - - _widen_region(block_size, where, &widened); - - /* Do we need to use a bounce buffer? */ - mask = block_size - 1; - if (!memcmp(where, &widened, sizeof(widened)) && - !((uintptr_t) buffer & mask)) - return _io(where, buffer, should_write); - - /* Allocate a bounce buffer with an extra block */ - if (!(bounce_buf = bounce = dm_malloc((size_t) widened.size + block_size))) { - log_error("Bounce buffer malloc failed"); - return 0; - } - - /* - * Realign start of bounce buffer (using the extra sector) - */ - if (((uintptr_t) bounce) & mask) - bounce = (char *) ((((uintptr_t) bounce) + mask) & ~mask); - - /* channel the io through the bounce buffer */ - if (!_io(&widened, bounce, 0)) { - if (!should_write) - goto_out; - /* FIXME pre-extend the file */ - memset(bounce, '\n', widened.size); - } - - if (should_write) { - memcpy(bounce + (where->start - widened.start), buffer, - (size_t) where->size); - - /* ... then we write */ - if (!(r = _io(&widened, bounce, 1))) - stack; - - goto out; + if (dev->size_seqno == _dev_size_seqno) { + log_very_verbose("%s: using cached size %" PRIu64 " sectors", + name, dev->size); + *size = dev->size; + return 1; } - memcpy(buffer, bounce + (where->start - widened.start), - (size_t) where->size); - - r = 1; - -out: - dm_free(bounce_buf); - return r; -} - -static int _dev_get_size_file(const struct device *dev, uint64_t *size) -{ - const char *name = dev_name(dev); - struct stat info; - if (stat(name, &info)) { log_sys_error("stat", name); return 0; @@ -241,53 +75,74 @@ static int _dev_get_size_file(const struct device *dev, uint64_t *size) *size = info.st_size; *size >>= SECTOR_SHIFT; /* Convert to sectors */ + dev->size = *size; + dev->size_seqno = _dev_size_seqno; log_very_verbose("%s: size is %" PRIu64 " sectors", name, *size); return 1; } -static int _dev_get_size_dev(const struct device *dev, uint64_t *size) +static int _dev_get_size_dev(struct device *dev, uint64_t *size) { - int fd; const char *name = dev_name(dev); + int fd = dev->bcache_fd; + int do_close = 0; - if ((fd = open(name, O_RDONLY)) < 0) { - log_sys_error("open", name); - return 0; + if (dm_list_empty(&dev->aliases)) + return_0; + + if (dev->size_seqno == _dev_size_seqno) { + log_very_verbose("%s: using cached size %" PRIu64 " sectors", + name, dev->size); + *size = dev->size; + return 1; + } + + if (fd <= 0) { + if (!dev_open_readonly_quiet(dev)) + return_0; + fd = dev_fd(dev); + do_close = 1; } if (ioctl(fd, BLKGETSIZE64, size) < 0) { - log_sys_error("ioctl BLKGETSIZE64", name); - if (close(fd)) - log_sys_error("close", name); + log_warn("WARNING: %s: ioctl BLKGETSIZE64 %s", name, strerror(errno)); + if (do_close && !dev_close_immediate(dev)) + stack; return 0; } *size >>= BLKSIZE_SHIFT; /* Convert to sectors */ - if (close(fd)) - log_sys_error("close", name); + dev->size = *size; + dev->size_seqno = _dev_size_seqno; log_very_verbose("%s: size is %" PRIu64 " sectors", name, *size); + if (do_close && !dev_close_immediate(dev)) + stack; + return 1; } static int _dev_read_ahead_dev(struct device *dev, uint32_t *read_ahead) { - long read_ahead_long; + long read_ahead_long = 0; if (dev->read_ahead != -1) { *read_ahead = (uint32_t) dev->read_ahead; return 1; } - if (!dev_open_readonly(dev)) - return_0; + if (!dev_open_readonly_quiet(dev)) { + log_warn("WARNING: Failed to open %s to get readahead %s.", + dev_name(dev), strerror(errno)); + return 0; + } if (ioctl(dev->fd, BLKRAGET, &read_ahead_long) < 0) { - log_sys_error("ioctl BLKRAGET", dev_name(dev)); - if (!dev_close(dev)) + log_warn("WARNING: %s: ioctl BLKRAGET %s.", dev_name(dev), strerror(errno)); + if (!dev_close_immediate(dev)) stack; return 0; } @@ -298,7 +153,7 @@ static int _dev_read_ahead_dev(struct device *dev, uint32_t *read_ahead) log_very_verbose("%s: read_ahead is %u sectors", dev_name(dev), *read_ahead); - if (!dev_close(dev)) + if (!dev_close_immediate(dev)) stack; return 1; @@ -314,18 +169,74 @@ static int _dev_discard_blocks(struct device *dev, uint64_t offset_bytes, uint64 discard_range[0] = offset_bytes; discard_range[1] = size_bytes; - log_debug("Discarding %" PRIu64 " bytes offset %" PRIu64 " bytes on %s.", - size_bytes, offset_bytes, dev_name(dev)); - if (ioctl(dev->fd, BLKDISCARD, &discard_range) < 0) { - log_error("%s: BLKDISCARD ioctl at offset %" PRIu64 " size %" PRIu64 " failed: %s.", + log_debug_devs("Discarding %" PRIu64 " bytes offset %" PRIu64 " bytes on %s. %s", + size_bytes, offset_bytes, dev_name(dev), + test_mode() ? " (test mode - suppressed)" : ""); + + if (!test_mode() && ioctl(dev->fd, BLKDISCARD, &discard_range) < 0) { + log_warn("WARNING: %s: ioctl BLKDISCARD at offset %" PRIu64 " size %" PRIu64 " failed: %s.", dev_name(dev), offset_bytes, size_bytes, strerror(errno)); - if (!dev_close(dev)) + if (!dev_close_immediate(dev)) stack; /* It doesn't matter if discard failed, so return success. */ return 1; } - if (!dev_close(dev)) + if (!dev_close_immediate(dev)) + stack; + + return 1; +} + +int dev_get_direct_block_sizes(struct device *dev, unsigned int *physical_block_size, + unsigned int *logical_block_size) +{ + int fd = dev->bcache_fd; + int do_close = 0; + unsigned int pbs = 0; + unsigned int lbs = 0; + + if (dev->physical_block_size || dev->logical_block_size) { + *physical_block_size = dev->physical_block_size; + *logical_block_size = dev->logical_block_size; + return 1; + } + + if (fd <= 0) { + if (!dev_open_readonly_quiet(dev)) + return 0; + fd = dev_fd(dev); + do_close = 1; + } + +#ifdef BLKPBSZGET /* not defined before kernel version 2.6.32 (e.g. rhel5) */ + /* + * BLKPBSZGET from kernel comment for blk_queue_physical_block_size: + * "the lowest possible sector size that the hardware can operate on + * without reverting to read-modify-write operations" + */ + if (ioctl(fd, BLKPBSZGET, &pbs)) { + stack; + pbs = 0; + } +#endif + + /* + * BLKSSZGET from kernel comment for blk_queue_logical_block_size: + * "the lowest possible block size that the storage device can address." + */ + if (ioctl(fd, BLKSSZGET, &lbs)) { + stack; + lbs = 0; + } + + dev->physical_block_size = pbs; + dev->logical_block_size = lbs; + + *physical_block_size = pbs; + *logical_block_size = lbs; + + if (do_close && !dev_close_immediate(dev)) stack; return 1; @@ -334,16 +245,20 @@ static int _dev_discard_blocks(struct device *dev, uint64_t offset_bytes, uint64 /*----------------------------------------------------------------- * Public functions *---------------------------------------------------------------*/ +void dev_size_seqno_inc(void) +{ + _dev_size_seqno++; +} -int dev_get_size(const struct device *dev, uint64_t *size) +int dev_get_size(struct device *dev, uint64_t *size) { if (!dev) return 0; if ((dev->flags & DEV_REGULAR)) return _dev_get_size_file(dev, size); - else - return _dev_get_size_dev(dev, size); + + return _dev_get_size_dev(dev, size); } int dev_get_read_ahead(struct device *dev, uint32_t *read_ahead) @@ -370,36 +285,6 @@ int dev_discard_blocks(struct device *dev, uint64_t offset_bytes, uint64_t size_ return _dev_discard_blocks(dev, offset_bytes, size_bytes); } -/* FIXME Unused -int dev_get_sectsize(struct device *dev, uint32_t *size) -{ - int fd; - int s; - const char *name = dev_name(dev); - - if ((fd = open(name, O_RDONLY)) < 0) { - log_sys_error("open", name); - return 0; - } - - if (ioctl(fd, BLKSSZGET, &s) < 0) { - log_sys_error("ioctl BLKSSZGET", name); - if (close(fd)) - log_sys_error("close", name); - return 0; - } - - if (close(fd)) - log_sys_error("close", name); - - *size = (uint32_t) s; - - log_very_verbose("%s: sector size is %" PRIu32 " bytes", name, *size); - - return 1; -} -*/ - void dev_flush(struct device *dev) { if (!(dev->flags & DEV_REGULAR) && ioctl(dev->fd, BLKFLSBUF, 0) >= 0) @@ -423,6 +308,13 @@ int dev_open_flags(struct device *dev, int flags, int direct, int quiet) if ((flags & O_EXCL)) need_excl = 1; + if (dm_list_empty(&dev->aliases)) { + /* shouldn't happen */ + log_print_unless_silent("Cannot open device %d:%d with no valid paths.", (int)MAJOR(dev->dev), (int)MINOR(dev->dev)); + return 0; + } + name = dev_name(dev); + if (dev->fd >= 0) { if (((dev->flags & DEV_OPENED_RW) || !need_rw) && ((dev->flags & DEV_OPENED_EXCL) || !need_excl)) { @@ -430,24 +322,22 @@ int dev_open_flags(struct device *dev, int flags, int direct, int quiet) return 1; } - if (dev->open_count && !need_excl) { - log_debug("%s already opened read-only. Upgrading " - "to read-write.", dev_name(dev)); - dev->open_count++; - } + if (dev->open_count && !need_excl) + log_debug_devs("%s: Already opened read-only. Upgrading " + "to read-write.", name); - dev_close_immediate(dev); + /* dev_close_immediate will decrement this */ + dev->open_count++; + + if (!dev_close_immediate(dev)) + return_0; + // FIXME: dev with DEV_ALLOCED is released + // but code is referencing it } if (critical_section()) /* FIXME Make this log_error */ - log_verbose("dev_open(%s) called while suspended", - dev_name(dev)); - - if (dev->flags & DEV_REGULAR) - name = dev_name(dev); - else if (!(name = dev_name_confirmed(dev, quiet))) - return_0; + log_verbose("dev_open(%s) called while suspended", name); #ifdef O_DIRECT_SUPPORT if (direct) { @@ -461,35 +351,49 @@ int dev_open_flags(struct device *dev, int flags, int direct, int quiet) #ifdef O_NOATIME /* Don't update atime on device inodes */ - if (!(dev->flags & DEV_REGULAR)) + if (!(dev->flags & DEV_REGULAR) && !(dev->flags & DEV_NOT_O_NOATIME)) flags |= O_NOATIME; #endif if ((dev->fd = open(name, flags, 0777)) < 0) { +#ifdef O_NOATIME + if ((errno == EPERM) && (flags & O_NOATIME)) { + flags &= ~O_NOATIME; + dev->flags |= DEV_NOT_O_NOATIME; + if ((dev->fd = open(name, flags, 0777)) >= 0) { + log_debug_devs("%s: Not using O_NOATIME", name); + goto opened; + } + } +#endif + #ifdef O_DIRECT_SUPPORT if (direct && !(dev->flags & DEV_O_DIRECT_TESTED)) { flags &= ~O_DIRECT; if ((dev->fd = open(name, flags, 0777)) >= 0) { dev->flags &= ~DEV_O_DIRECT; - log_debug("%s: Not using O_DIRECT", name); + log_debug_devs("%s: Not using O_DIRECT", name); goto opened; } } #endif if (quiet) - log_sys_debug("open", name); + log_debug("Failed to open device path %s (%d).", name, errno); else - log_sys_error("open", name); + log_error("Failed to open device path %s (%d).", name, errno); + + dev->flags |= DEV_OPEN_FAILURE; return 0; } -#ifdef O_DIRECT_SUPPORT +#if defined(O_NOATIME) || defined(O_DIRECT_SUPPORT) opened: +#endif +#ifdef O_DIRECT_SUPPORT if (direct) dev->flags |= DEV_O_DIRECT_TESTED; #endif dev->open_count++; - dev->flags &= ~DEV_ACCESSED_W; if (need_rw) dev->flags |= DEV_OPENED_RW; @@ -504,7 +408,8 @@ int dev_open_flags(struct device *dev, int flags, int direct, int quiet) if (!(dev->flags & DEV_REGULAR) && ((fstat(dev->fd, &buf) < 0) || (buf.st_rdev != dev->dev))) { log_error("%s: fstat failed: Has device name changed?", name); - dev_close_immediate(dev); + if (!dev_close_immediate(dev)) + stack; return 0; } @@ -516,13 +421,14 @@ int dev_open_flags(struct device *dev, int flags, int direct, int quiet) if ((flags & O_CREAT) && !(flags & O_TRUNC)) dev->end = lseek(dev->fd, (off_t) 0, SEEK_END); - dm_list_add(&_open_devices, &dev->open_list); - - log_debug("Opened %s %s%s%s", dev_name(dev), - dev->flags & DEV_OPENED_RW ? "RW" : "RO", - dev->flags & DEV_OPENED_EXCL ? " O_EXCL" : "", - dev->flags & DEV_O_DIRECT ? " O_DIRECT" : ""); + if (!quiet) { + log_debug_devs("Opened %s %s%s%s", name, + dev->flags & DEV_OPENED_RW ? "RW" : "RO", + dev->flags & DEV_OPENED_EXCL ? " O_EXCL" : "", + dev->flags & DEV_O_DIRECT ? " O_DIRECT" : ""); + } + dev->flags &= ~DEV_OPEN_FAILURE; return 1; } @@ -551,63 +457,34 @@ int dev_open_readonly_quiet(struct device *dev) return dev_open_flags(dev, O_RDONLY, 1, 1); } -int dev_test_excl(struct device *dev) -{ - int flags; - int r; - - flags = vg_write_lock_held() ? O_RDWR : O_RDONLY; - flags |= O_EXCL; - - r = dev_open_flags(dev, flags, 1, 1); - if (r) - dev_close_immediate(dev); - - return r; -} - static void _close(struct device *dev) { if (close(dev->fd)) - log_sys_error("close", dev_name(dev)); + log_sys_debug("close", dev_name(dev)); dev->fd = -1; - dev->block_size = -1; - dm_list_del(&dev->open_list); - log_debug("Closed %s", dev_name(dev)); + log_debug_devs("Closed %s", dev_name(dev)); - if (dev->flags & DEV_ALLOCED) { - dm_free((void *) dm_list_item(dev->aliases.n, struct str_list)-> - str); - dm_free(dev->aliases.n); - dm_free(dev); - } + if (dev->flags & DEV_ALLOCED) + dev_destroy_file(dev); } static int _dev_close(struct device *dev, int immediate) { - if (dev->fd < 0) { log_error("Attempt to close device '%s' " "which is not open.", dev_name(dev)); return 0; } -#ifndef O_DIRECT_SUPPORT - if (dev->flags & DEV_ACCESSED_W) - dev_flush(dev); -#endif - if (dev->open_count > 0) dev->open_count--; if (immediate && dev->open_count) - log_debug("%s: Immediate close attempt while still referenced", - dev_name(dev)); + log_debug_devs("%s: Immediate close attempt while still referenced", + dev_name(dev)); - /* Close unless device is known to belong to a locked VG */ - if (immediate || - (dev->open_count < 1 && !lvmcache_pvid_is_locked(dev->pvid))) + if (immediate || (dev->open_count < 1)) _close(dev); return 1; @@ -622,164 +499,3 @@ int dev_close_immediate(struct device *dev) { return _dev_close(dev, 1); } - -void dev_close_all(void) -{ - struct dm_list *doh, *doht; - struct device *dev; - - dm_list_iterate_safe(doh, doht, &_open_devices) { - dev = dm_list_struct_base(doh, struct device, open_list); - if (dev->open_count < 1) - _close(dev); - } -} - -static inline int _dev_is_valid(struct device *dev) -{ - return (dev->max_error_count == NO_DEV_ERROR_COUNT_LIMIT || - dev->error_count < dev->max_error_count); -} - -static void _dev_inc_error_count(struct device *dev) -{ - if (++dev->error_count == dev->max_error_count) - log_warn("WARNING: Error counts reached a limit of %d. " - "Device %s was disabled", - dev->max_error_count, dev_name(dev)); -} - -int dev_read(struct device *dev, uint64_t offset, size_t len, void *buffer) -{ - struct device_area where; - int ret; - - if (!dev->open_count) - return_0; - - if (!_dev_is_valid(dev)) - return 0; - - where.dev = dev; - where.start = offset; - where.size = len; - - // fprintf(stderr, "READ: %s, %lld, %d\n", dev_name(dev), offset, len); - - ret = _aligned_io(&where, buffer, 0); - if (!ret) - _dev_inc_error_count(dev); - - return ret; -} - -/* - * Read from 'dev' into 'buf', possibly in 2 distinct regions, denoted - * by (offset,len) and (offset2,len2). Thus, the total size of - * 'buf' should be len+len2. - */ -int dev_read_circular(struct device *dev, uint64_t offset, size_t len, - uint64_t offset2, size_t len2, char *buf) -{ - if (!dev_read(dev, offset, len, buf)) { - log_error("Read from %s failed", dev_name(dev)); - return 0; - } - - /* - * The second region is optional, and allows for - * a circular buffer on the device. - */ - if (!len2) - return 1; - - if (!dev_read(dev, offset2, len2, buf + len)) { - log_error("Circular read from %s failed", - dev_name(dev)); - return 0; - } - - return 1; -} - -/* FIXME If O_DIRECT can't extend file, dev_extend first; dev_truncate after. - * But fails if concurrent processes writing - */ - -/* FIXME pre-extend the file */ -int dev_append(struct device *dev, size_t len, char *buffer) -{ - int r; - - if (!dev->open_count) - return_0; - - r = dev_write(dev, dev->end, len, buffer); - dev->end += (uint64_t) len; - -#ifndef O_DIRECT_SUPPORT - dev_flush(dev); -#endif - return r; -} - -int dev_write(struct device *dev, uint64_t offset, size_t len, void *buffer) -{ - struct device_area where; - int ret; - - if (!dev->open_count) - return_0; - - if (!_dev_is_valid(dev)) - return 0; - - where.dev = dev; - where.start = offset; - where.size = len; - - dev->flags |= DEV_ACCESSED_W; - - ret = _aligned_io(&where, buffer, 1); - if (!ret) - _dev_inc_error_count(dev); - - return ret; -} - -int dev_set(struct device *dev, uint64_t offset, size_t len, int value) -{ - size_t s; - char buffer[4096] __attribute__((aligned(8))); - - if (!dev_open(dev)) - return_0; - - if ((offset % SECTOR_SIZE) || (len % SECTOR_SIZE)) - log_debug("Wiping %s at %" PRIu64 " length %" PRIsize_t, - dev_name(dev), offset, len); - else - log_debug("Wiping %s at sector %" PRIu64 " length %" PRIsize_t - " sectors", dev_name(dev), offset >> SECTOR_SHIFT, - len >> SECTOR_SHIFT); - - memset(buffer, value, sizeof(buffer)); - while (1) { - s = len > sizeof(buffer) ? sizeof(buffer) : len; - if (!dev_write(dev, offset, s, buffer)) - break; - - len -= s; - if (!len) - break; - - offset += s; - } - - dev->flags |= DEV_ACCESSED_W; - - if (!dev_close(dev)) - stack; - - return (len == 0); -} |