Diffstat (limited to 'lib/metadata/lv_manip.c')
-rw-r--r-- | lib/metadata/lv_manip.c | 8176
1 file changed, 6691 insertions, 1485 deletions
diff --git a/lib/metadata/lv_manip.c b/lib/metadata/lv_manip.c index d469fe8..9bec8b5 100644 --- a/lib/metadata/lv_manip.c +++ b/lib/metadata/lv_manip.c @@ -1,6 +1,6 @@ /* * Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved. - * Copyright (C) 2004-2012 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2018 Red Hat, Inc. All rights reserved. * * This file is part of LVM2. * @@ -10,23 +10,33 @@ * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software Foundation, - * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#include "lib.h" -#include "metadata.h" -#include "locking.h" +#include "lib/misc/lib.h" +#include "lib/metadata/metadata.h" +#include "lib/locking/locking.h" #include "pv_map.h" -#include "lvm-string.h" -#include "toolcontext.h" -#include "lv_alloc.h" -#include "pv_alloc.h" -#include "display.h" -#include "segtype.h" -#include "archiver.h" -#include "activate.h" -#include "str_list.h" -#include "defaults.h" +#include "lib/misc/lvm-string.h" +#include "lib/commands/toolcontext.h" +#include "lib/metadata/lv_alloc.h" +#include "lib/metadata/pv_alloc.h" +#include "lib/display/display.h" +#include "lib/metadata/segtype.h" +#include "lib/activate/activate.h" +#include "lib/datastruct/str_list.h" +#include "lib/config/defaults.h" +#include "lib/misc/lvm-exec.h" +#include "lib/mm/memlock.h" +#include "lib/locking/lvmlockd.h" +#include "lib/label/label.h" +#include "lib/misc/lvm-signal.h" +#include "lib/device/filesystem.h" + +#ifdef HAVE_BLKZEROOUT +#include <sys/ioctl.h> +#include <linux/fs.h> +#endif typedef enum { PREFERRED, @@ -46,6 +56,10 @@ typedef enum { #define A_CLING_BY_TAGS 0x08 /* Must match tags against existing segment */ #define A_CAN_SPLIT 0x10 +#define A_AREA_COUNT_MATCHES 0x20 /* Existing lvseg has same number of areas as new segment */ + +#define A_POSITIONAL_FILL 0x40 /* Slots are positional and filled using PREFERRED */ +#define A_PARTITION_BY_TAGS 0x80 /* No allocated area may share any tag with any other */ /* * Constant parameters during a single allocation attempt. @@ -61,10 +75,12 @@ struct alloc_parms { * Holds varying state of each allocation attempt. 
*/ struct alloc_state { + const struct alloc_parms *alloc_parms; struct pv_area_used *areas; uint32_t areas_size; uint32_t log_area_count_still_needed; /* Number of areas still needing to be allocated for the log */ uint32_t allocated; /* Total number of extents allocated so far */ + uint32_t num_positional_areas; /* Number of parallel allocations that must be contiguous/cling */ }; struct lv_names { @@ -72,6 +88,773 @@ struct lv_names { const char *new; }; +enum { + LV_TYPE_UNKNOWN, + LV_TYPE_NONE, + LV_TYPE_PUBLIC, + LV_TYPE_PRIVATE, + LV_TYPE_HISTORY, + LV_TYPE_LINEAR, + LV_TYPE_STRIPED, + LV_TYPE_MIRROR, + LV_TYPE_RAID, + LV_TYPE_THIN, + LV_TYPE_CACHE, + LV_TYPE_SPARSE, + LV_TYPE_ORIGIN, + LV_TYPE_THINORIGIN, + LV_TYPE_MULTITHINORIGIN, + LV_TYPE_THICKORIGIN, + LV_TYPE_MULTITHICKORIGIN, + LV_TYPE_CACHEORIGIN, + LV_TYPE_EXTTHINORIGIN, + LV_TYPE_MULTIEXTTHINORIGIN, + LV_TYPE_SNAPSHOT, + LV_TYPE_THINSNAPSHOT, + LV_TYPE_THICKSNAPSHOT, + LV_TYPE_PVMOVE, + LV_TYPE_IMAGE, + LV_TYPE_LOG, + LV_TYPE_METADATA, + LV_TYPE_POOL, + LV_TYPE_DATA, + LV_TYPE_SPARE, + LV_TYPE_VDO, + LV_TYPE_VIRTUAL, + LV_TYPE_RAID0, + LV_TYPE_RAID0_META, + LV_TYPE_RAID1, + LV_TYPE_RAID10, + LV_TYPE_RAID4, + LV_TYPE_RAID5, + LV_TYPE_RAID5_N, + LV_TYPE_RAID5_LA, + LV_TYPE_RAID5_RA, + LV_TYPE_RAID5_LS, + LV_TYPE_RAID5_RS, + LV_TYPE_RAID6, + LV_TYPE_RAID6_ZR, + LV_TYPE_RAID6_NR, + LV_TYPE_RAID6_NC, + LV_TYPE_LOCKD, + LV_TYPE_SANLOCK, + LV_TYPE_CACHEVOL, + LV_TYPE_WRITECACHE, + LV_TYPE_WRITECACHEORIGIN, + LV_TYPE_INTEGRITY, + LV_TYPE_INTEGRITYORIGIN +}; + +static const char *_lv_type_names[] = { + [LV_TYPE_UNKNOWN] = "unknown", + [LV_TYPE_NONE] = "none", + [LV_TYPE_PUBLIC] = "public", + [LV_TYPE_PRIVATE] = "private", + [LV_TYPE_HISTORY] = "history", + [LV_TYPE_LINEAR] = "linear", + [LV_TYPE_STRIPED] = "striped", + [LV_TYPE_MIRROR] = "mirror", + [LV_TYPE_RAID] = "raid", + [LV_TYPE_THIN] = "thin", + [LV_TYPE_CACHE] = "cache", + [LV_TYPE_SPARSE] = "sparse", + [LV_TYPE_ORIGIN] = "origin", + [LV_TYPE_THINORIGIN] = "thinorigin", + [LV_TYPE_MULTITHINORIGIN] = "multithinorigin", + [LV_TYPE_THICKORIGIN] = "thickorigin", + [LV_TYPE_MULTITHICKORIGIN] = "multithickorigin", + [LV_TYPE_CACHEORIGIN] = "cacheorigin", + [LV_TYPE_EXTTHINORIGIN] = "extthinorigin", + [LV_TYPE_MULTIEXTTHINORIGIN] = "multiextthinorigin", + [LV_TYPE_SNAPSHOT] = "snapshot", + [LV_TYPE_THINSNAPSHOT] = "thinsnapshot", + [LV_TYPE_THICKSNAPSHOT] = "thicksnapshot", + [LV_TYPE_PVMOVE] = "pvmove", + [LV_TYPE_IMAGE] = "image", + [LV_TYPE_LOG] = "log", + [LV_TYPE_METADATA] = "metadata", + [LV_TYPE_POOL] = "pool", + [LV_TYPE_DATA] = "data", + [LV_TYPE_SPARE] = "spare", + [LV_TYPE_VDO] = "vdo", + [LV_TYPE_VIRTUAL] = "virtual", + [LV_TYPE_RAID0] = SEG_TYPE_NAME_RAID0, + [LV_TYPE_RAID0_META] = SEG_TYPE_NAME_RAID0_META, + [LV_TYPE_RAID1] = SEG_TYPE_NAME_RAID1, + [LV_TYPE_RAID10] = SEG_TYPE_NAME_RAID10, + [LV_TYPE_RAID4] = SEG_TYPE_NAME_RAID4, + [LV_TYPE_RAID5] = SEG_TYPE_NAME_RAID5, + [LV_TYPE_RAID5_N] = SEG_TYPE_NAME_RAID5_N, + [LV_TYPE_RAID5_LA] = SEG_TYPE_NAME_RAID5_LA, + [LV_TYPE_RAID5_RA] = SEG_TYPE_NAME_RAID5_RA, + [LV_TYPE_RAID5_LS] = SEG_TYPE_NAME_RAID5_LS, + [LV_TYPE_RAID5_RS] = SEG_TYPE_NAME_RAID5_RS, + [LV_TYPE_RAID6] = SEG_TYPE_NAME_RAID6, + [LV_TYPE_RAID6_ZR] = SEG_TYPE_NAME_RAID6_ZR, + [LV_TYPE_RAID6_NR] = SEG_TYPE_NAME_RAID6_NR, + [LV_TYPE_RAID6_NC] = SEG_TYPE_NAME_RAID6_NC, + [LV_TYPE_LOCKD] = "lockd", + [LV_TYPE_SANLOCK] = "sanlock", + [LV_TYPE_CACHEVOL] = "cachevol", + [LV_TYPE_WRITECACHE] = "writecache", + [LV_TYPE_WRITECACHEORIGIN] = "writecacheorigin", + 
[LV_TYPE_INTEGRITY] = "integrity", + [LV_TYPE_INTEGRITYORIGIN] = "integrityorigin", +}; + +static int _lv_layout_and_role_mirror(struct dm_pool *mem, + const struct logical_volume *lv, + struct dm_list *layout, + struct dm_list *role, + int *public_lv) +{ + int top_level = 0; + + /* non-top-level LVs */ + if (lv_is_mirror_image(lv)) { + if (!str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_MIRROR]) || + !str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_IMAGE])) + goto_bad; + } else if (lv_is_mirror_log(lv)) { + if (!str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_MIRROR]) || + !str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_LOG])) + goto_bad; + if (lv_is_mirrored(lv) && + !str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_MIRROR])) + goto_bad; + } else if (lv_is_pvmove(lv)) { + if (!str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_PVMOVE]) || + !str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_MIRROR])) + goto_bad; + } else + top_level = 1; + + + if (!top_level) { + *public_lv = 0; + return 1; + } + + /* top-level LVs */ + if (!str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_MIRROR])) + goto_bad; + + return 1; +bad: + return 0; +} + +static int _lv_layout_and_role_raid(struct dm_pool *mem, + const struct logical_volume *lv, + struct dm_list *layout, + struct dm_list *role, + int *public_lv) +{ + int top_level = 0; + const struct segment_type *segtype; + + /* non-top-level LVs */ + if (lv_is_raid_image(lv)) { + if (!str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_RAID]) || + !str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_IMAGE])) + goto_bad; + } else if (lv_is_raid_metadata(lv)) { + if (!str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_RAID]) || + !str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_METADATA])) + goto_bad; + } else if (lv_is_pvmove(lv)) { + if (!str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_PVMOVE]) || + !str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_RAID])) + goto_bad; + } else + top_level = 1; + + if (!top_level) { + *public_lv = 0; + return 1; + } + + /* top-level LVs */ + if (!str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_RAID])) + goto_bad; + + segtype = first_seg(lv)->segtype; + + if (segtype_is_raid0(segtype)) { + if (!str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_RAID0])) + goto_bad; + } else if (segtype_is_raid1(segtype)) { + if (!str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_RAID1])) + goto_bad; + } else if (segtype_is_raid10(segtype)) { + if (!str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_RAID10])) + goto_bad; + } else if (segtype_is_raid4(segtype)) { + if (!str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_RAID4])) + goto_bad; + } else if (segtype_is_any_raid5(segtype)) { + if (!str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_RAID5])) + goto_bad; + + if (segtype_is_raid5_la(segtype)) { + if (!str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_RAID5_LA])) + goto_bad; + } else if (segtype_is_raid5_ra(segtype)) { + if (!str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_RAID5_RA])) + goto_bad; + } else if (segtype_is_raid5_ls(segtype)) { + if (!str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_RAID5_LS])) + goto_bad; + } else if (segtype_is_raid5_rs(segtype)) { + if (!str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_RAID5_RS])) + goto_bad; + } + } else if 
(segtype_is_any_raid6(segtype)) { + if (!str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_RAID6])) + goto_bad; + + if (segtype_is_raid6_zr(segtype)) { + if (!str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_RAID6_ZR])) + goto_bad; + } else if (segtype_is_raid6_nr(segtype)) { + if (!str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_RAID6_NR])) + goto_bad; + } else if (segtype_is_raid6_nc(segtype)) { + if (!str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_RAID6_NC])) + goto_bad; + } + } + + return 1; +bad: + return 0; +} + +static int _lv_layout_and_role_thin(struct dm_pool *mem, + const struct logical_volume *lv, + struct dm_list *layout, + struct dm_list *role, + int *public_lv) +{ + int top_level = 0; + unsigned snap_count; + + /* non-top-level LVs */ + if (lv_is_thin_pool_metadata(lv)) { + if (!str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_THIN]) || + !str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_POOL]) || + !str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_METADATA])) + goto_bad; + } else if (lv_is_thin_pool_data(lv)) { + if (!str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_THIN]) || + !str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_POOL]) || + !str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_DATA])) + goto_bad; + } else + top_level = 1; + + if (!top_level) { + *public_lv = 0; + return 1; + } + + /* top-level LVs */ + if (lv_is_thin_volume(lv)) { + if (!str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_THIN]) || + !str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_SPARSE])) + goto_bad; + if (lv_is_thin_origin(lv, &snap_count)) { + if (!str_list_add(mem, role, _lv_type_names[LV_TYPE_ORIGIN]) || + !str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_THINORIGIN])) + goto_bad; + if (snap_count > 1 && + !str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_MULTITHINORIGIN])) + goto_bad; + } + if (lv_is_thin_snapshot(lv)) + if (!str_list_add(mem, role, _lv_type_names[LV_TYPE_SNAPSHOT]) || + !str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_THINSNAPSHOT])) + goto_bad; + } else if (lv_is_thin_pool(lv)) { + if (!str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_THIN]) || + !str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_POOL])) + goto_bad; + *public_lv = 0; + } + + if (lv_is_external_origin(lv)) { + if (!str_list_add(mem, role, _lv_type_names[LV_TYPE_ORIGIN]) || + !str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_EXTTHINORIGIN])) + goto_bad; + if (lv->external_count > 1 && + !str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_MULTIEXTTHINORIGIN])) + goto_bad; + } + + return 1; +bad: + return 0; +} + +static int _lv_layout_and_role_cache(struct dm_pool *mem, + const struct logical_volume *lv, + struct dm_list *layout, + struct dm_list *role, + int *public_lv) +{ + int top_level = 0; + + /* non-top-level LVs */ + if (lv_is_cache_pool_metadata(lv)) { + if (!str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_CACHE]) || + !str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_POOL]) || + !str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_METADATA])) + goto_bad; + } else if (lv_is_cache_pool_data(lv)) { + if (!str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_CACHE]) || + !str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_POOL]) || + !str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_DATA])) + goto_bad; + if (lv_is_cache(lv) 
&& + !str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_CACHE])) + goto_bad; + } else if (lv_is_cache_origin(lv)) { + if (!str_list_add(mem, role, _lv_type_names[LV_TYPE_CACHE]) || + !str_list_add(mem, role, _lv_type_names[LV_TYPE_ORIGIN]) || + !str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_CACHEORIGIN])) + goto_bad; + if (lv_is_cache(lv) && + !str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_CACHE])) + goto_bad; + } else if (lv_is_writecache_origin(lv)) { + if (!str_list_add(mem, role, _lv_type_names[LV_TYPE_WRITECACHE]) || + !str_list_add(mem, role, _lv_type_names[LV_TYPE_ORIGIN]) || + !str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_WRITECACHEORIGIN])) + goto_bad; + if (lv_is_writecache(lv) && + !str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_WRITECACHE])) + goto_bad; + } else + top_level = 1; + + if (!top_level) { + *public_lv = 0; + return 1; + } + + /* top-level LVs */ + if (lv_is_cache(lv) && + !str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_CACHE])) + goto_bad; + else if (lv_is_writecache(lv) && + !str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_WRITECACHE])) + goto_bad; + else if (lv_is_writecache_cachevol(lv)) { + if (!str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_WRITECACHE]) || + !str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_CACHEVOL])) + goto_bad; + *public_lv = 0; + } else if (lv_is_cache_vol(lv)) { + if (!str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_CACHE]) || + !str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_CACHEVOL])) + goto_bad; + *public_lv = 0; + } else if (lv_is_cache_pool(lv)) { + if (!str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_CACHE]) || + !str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_POOL])) + goto_bad; + *public_lv = 0; + } + + return 1; +bad: + return 0; +} + +static int _lv_layout_and_role_integrity(struct dm_pool *mem, + const struct logical_volume *lv, + struct dm_list *layout, + struct dm_list *role, + int *public_lv) +{ + int top_level = 0; + + /* non-top-level LVs */ + if (lv_is_integrity_metadata(lv)) { + if (!str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_INTEGRITY]) || + !str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_METADATA])) + goto_bad; + } else if (lv_is_integrity_origin(lv)) { + if (!str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_INTEGRITY]) || + !str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_ORIGIN]) || + !str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_INTEGRITYORIGIN])) + goto_bad; + } else + top_level = 1; + + if (!top_level) { + *public_lv = 0; + return 1; + } + + /* top-level LVs */ + if (lv_is_integrity(lv)) { + if (!str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_INTEGRITY])) + goto_bad; + } + + return 1; +bad: + return 0; +} + +static int _lv_layout_and_role_thick_origin_snapshot(struct dm_pool *mem, + const struct logical_volume *lv, + struct dm_list *layout, + struct dm_list *role, + int *public_lv) +{ + if (lv_is_origin(lv)) { + if (!str_list_add(mem, role, _lv_type_names[LV_TYPE_ORIGIN]) || + !str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_THICKORIGIN])) + goto_bad; + /* + * Thin volumes are also marked with virtual flag, but we don't show "virtual" + * layout for thin LVs as they have their own keyword for layout - "thin"! + * So rule thin LVs out here! 
+ */ + if (lv_is_virtual(lv) && !lv_is_thin_volume(lv)) { + if (!str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_VIRTUAL])) + goto_bad; + *public_lv = 0; + } + if (lv->origin_count > 1 && + !str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_MULTITHICKORIGIN])) + goto_bad; + } else if (lv_is_cow(lv)) { + if (!str_list_add(mem, role, _lv_type_names[LV_TYPE_SNAPSHOT]) || + !str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_THICKSNAPSHOT])) + goto_bad; + } + + return 1; +bad: + return 0; +} + +static int _lv_layout_and_role_vdo(struct dm_pool *mem, + const struct logical_volume *lv, + struct dm_list *layout, + struct dm_list *role, + int *public_lv) +{ + int top_level = 0; + + /* non-top-level LVs */ + if (lv_is_vdo_pool(lv)) { + if (!str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_VDO]) || + !str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_POOL])) + goto_bad; + } else if (lv_is_vdo_pool_data(lv)) { + if (!str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_VDO]) || + !str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_POOL]) || + !str_list_add_no_dup_check(mem, role, _lv_type_names[LV_TYPE_DATA])) + goto_bad; + } else + top_level = 1; + + if (!top_level) { + *public_lv = 0; + return 1; + } + + /* top-level LVs */ + if (lv_is_vdo(lv)) { + if (!str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_VDO]) || + !str_list_add_no_dup_check(mem, layout, _lv_type_names[LV_TYPE_SPARSE])) + goto_bad; + } + + return 1; +bad: + return 0; +} + +int lv_layout_and_role(struct dm_pool *mem, const struct logical_volume *lv, + struct dm_list **layout, struct dm_list **role) { + int linear, striped; + struct lv_segment *seg; + int public_lv = 1; + + *layout = *role = NULL; + + if (!(*layout = str_list_create(mem))) { + log_error("LV layout list allocation failed"); + return 0; + } + + if (!(*role = str_list_create(mem))) { + log_error("LV role list allocation failed"); + goto bad; + } + + if (lv_is_historical(lv)) { + if (!str_list_add_no_dup_check(mem, *layout, _lv_type_names[LV_TYPE_NONE]) || + !str_list_add_no_dup_check(mem, *role, _lv_type_names[LV_TYPE_HISTORY])) + goto_bad; + } + + /* Mirrors and related */ + if ((lv_is_mirror_type(lv) || lv_is_pvmove(lv)) && + !_lv_layout_and_role_mirror(mem, lv, *layout, *role, &public_lv)) + goto_bad; + + /* RAIDs and related */ + if (lv_is_raid_type(lv) && + !_lv_layout_and_role_raid(mem, lv, *layout, *role, &public_lv)) + goto_bad; + + /* Thins and related */ + if ((lv_is_thin_type(lv) || lv_is_external_origin(lv)) && + !_lv_layout_and_role_thin(mem, lv, *layout, *role, &public_lv)) + goto_bad; + + /* Caches and related */ + if ((lv_is_cache_type(lv) || lv_is_cache_origin(lv) || lv_is_writecache(lv) || lv_is_writecache_origin(lv)) && + !_lv_layout_and_role_cache(mem, lv, *layout, *role, &public_lv)) + goto_bad; + + /* Integrity related */ + if ((lv_is_integrity(lv) || lv_is_integrity_origin(lv) || lv_is_integrity_metadata(lv)) && + !_lv_layout_and_role_integrity(mem, lv, *layout, *role, &public_lv)) + goto_bad; + + /* VDO and related */ + if (lv_is_vdo_type(lv) && + !_lv_layout_and_role_vdo(mem, lv, *layout, *role, &public_lv)) + goto_bad; + + /* Pool-specific */ + if (lv_is_pool_metadata_spare(lv)) { + if (!str_list_add_no_dup_check(mem, *role, _lv_type_names[LV_TYPE_POOL]) || + !str_list_add_no_dup_check(mem, *role, _lv_type_names[LV_TYPE_SPARE])) + goto_bad; + public_lv = 0; + } + + /* Old-style origins/snapshots, virtual origins */ + if 
(!_lv_layout_and_role_thick_origin_snapshot(mem, lv, *layout, *role, &public_lv)) + goto_bad; + + if (lv_is_lockd_sanlock_lv(lv)) { + if (!str_list_add_no_dup_check(mem, *role, _lv_type_names[LV_TYPE_LOCKD]) || + !str_list_add_no_dup_check(mem, *role, _lv_type_names[LV_TYPE_SANLOCK])) + goto_bad; + public_lv = 0; + } + + /* + * If layout not yet determined, it must be either + * linear or striped or mixture of these two. + */ + if (dm_list_empty(*layout)) { + linear = striped = 0; + dm_list_iterate_items(seg, &lv->segments) { + if (seg_is_linear(seg)) + linear = 1; + else if (seg_is_striped(seg)) + striped = 1; + else { + /* + * This should not happen but if it does + * we'll see that there's "unknown" layout + * present. This means we forgot to detect + * the role above and we need add proper + * detection for such role! + */ + log_warn(INTERNAL_ERROR "WARNING: Failed to properly detect " + "layout and role for LV %s/%s.", + lv->vg->name, lv->name); + } + } + + if (linear && + !str_list_add_no_dup_check(mem, *layout, _lv_type_names[LV_TYPE_LINEAR])) + goto_bad; + + if (striped && + !str_list_add_no_dup_check(mem, *layout, _lv_type_names[LV_TYPE_STRIPED])) + goto_bad; + + if (!linear && !striped && + !str_list_add_no_dup_check(mem, *layout, _lv_type_names[LV_TYPE_UNKNOWN])) + goto_bad; + } + + /* finally, add either 'public' or 'private' role to the LV */ + if (public_lv) { + if (!str_list_add_h_no_dup_check(mem, *role, _lv_type_names[LV_TYPE_PUBLIC])) + goto_bad; + } else { + if (!str_list_add_h_no_dup_check(mem, *role, _lv_type_names[LV_TYPE_PRIVATE])) + goto_bad; + } + + return 1; +bad: + dm_pool_free(mem, *layout); + + return 0; +} +struct dm_list_and_mempool { + struct dm_list *list; + struct dm_pool *mem; +}; +static int _get_pv_list_for_lv(struct logical_volume *lv, void *data) +{ + int dup_found; + uint32_t s; + struct pv_list *pvl; + struct lv_segment *seg; + struct dm_list *pvs = ((struct dm_list_and_mempool *)data)->list; + struct dm_pool *mem = ((struct dm_list_and_mempool *)data)->mem; + + dm_list_iterate_items(seg, &lv->segments) { + for (s = 0; s < seg->area_count; s++) { + dup_found = 0; + + if (seg_type(seg, s) != AREA_PV) + continue; + + /* do not add duplicates */ + dm_list_iterate_items(pvl, pvs) + if (pvl->pv == seg_pv(seg, s)) + dup_found = 1; + + if (dup_found) + continue; + + if (!(pvl = dm_pool_zalloc(mem, sizeof(*pvl)))) { + log_error("Failed to allocate memory"); + return 0; + } + + pvl->pv = seg_pv(seg, s); + log_debug_metadata(" %s/%s uses %s", lv->vg->name, + lv->name, pv_dev_name(pvl->pv)); + + dm_list_add(pvs, &pvl->list); + } + } + + return 1; +} + +/* + * get_pv_list_for_lv + * @mem - mempool to allocate the list from. + * @lv + * @pvs - The list to add pv_list items to. + * + * 'pvs' is filled with 'pv_list' items for PVs that compose the LV. + * If the 'pvs' list already has items in it, duplicates will not be + * added. So, it is safe to repeatedly call this function for different + * LVs and build up a list of PVs for them all. + * + * Memory to create the list is obtained from the mempool provided. 
+ * + * Returns: 1 on success, 0 on error + */ +int get_pv_list_for_lv(struct dm_pool *mem, + struct logical_volume *lv, struct dm_list *pvs) +{ + struct dm_list_and_mempool context = { pvs, mem }; + + log_debug_metadata("Generating list of PVs that %s/%s uses:", + lv->vg->name, lv->name); + + if (!_get_pv_list_for_lv(lv, &context)) + return_0; + + return for_each_sub_lv(lv, &_get_pv_list_for_lv, &context); +} + +/* + * get_default_region_size + * @cmd + * + * 'mirror_region_size' and 'raid_region_size' are effectively the same thing. + * However, "raid" is more inclusive than "mirror", so the name has been + * changed. This function checks for the old setting and warns the user if + * it is being overridden by the new setting (i.e. warn if both settings are + * present). + * + * Note that the config files give defaults in kiB terms, but we + * return the value in terms of sectors. + * + * Returns: default region_size in sectors + */ +static int _get_default_region_size(struct cmd_context *cmd) +{ + int mrs, rrs; + + /* + * 'mirror_region_size' is the old setting. It is overridden + * by the new setting, 'raid_region_size'. + */ + mrs = 2 * find_config_tree_int(cmd, activation_mirror_region_size_CFG, NULL); + rrs = 2 * find_config_tree_int(cmd, activation_raid_region_size_CFG, NULL); + + if (!mrs && !rrs) + return DEFAULT_RAID_REGION_SIZE * 2; + + if (!mrs) + return rrs; + + if (!rrs) + return mrs; + + if (mrs != rrs) + log_verbose("Overriding default 'mirror_region_size' setting" + " with 'raid_region_size' setting of %u kiB", + rrs / 2); + + return rrs; +} + +static int _round_down_pow2(int r) +{ + /* Set all bits to the right of the leftmost set bit */ + r |= (r >> 1); + r |= (r >> 2); + r |= (r >> 4); + r |= (r >> 8); + r |= (r >> 16); + + /* Pull out the leftmost set bit */ + return r & ~(r >> 1); +} + +uint32_t get_default_region_size(struct cmd_context *cmd) +{ + int pagesize = lvm_getpagesize(); + int region_size = _get_default_region_size(cmd); + + if (!is_power_of_2(region_size)) { + region_size = _round_down_pow2(region_size); + log_verbose("Reducing region size to %u kiB (power of 2).", + region_size / 2); + } + + if (region_size % (pagesize >> SECTOR_SHIFT)) { + region_size = DEFAULT_RAID_REGION_SIZE * 2; + log_verbose("Using default region size %u kiB (multiple of page size).", + region_size / 2); + } + + return (uint32_t) region_size; +} + int add_seg_to_segs_using_this_lv(struct logical_volume *lv, struct lv_segment *seg) { @@ -84,11 +867,11 @@ int add_seg_to_segs_using_this_lv(struct logical_volume *lv, } } - log_very_verbose("Adding %s:%" PRIu32 " as an user of %s", - seg->lv->name, seg->le, lv->name); + log_very_verbose("Adding %s:" FMTu32 " as an user of %s.", + display_lvname(seg->lv), seg->le, display_lvname(lv)); if (!(sl = dm_pool_zalloc(lv->vg->vgmem, sizeof(*sl)))) { - log_error("Failed to allocate segment list"); + log_error("Failed to allocate segment list."); return 0; } @@ -110,14 +893,16 @@ int remove_seg_from_segs_using_this_lv(struct logical_volume *lv, if (sl->count > 1) sl->count--; else { - log_very_verbose("%s:%" PRIu32 " is no longer a user " - "of %s", seg->lv->name, seg->le, - lv->name); + log_very_verbose("%s:" FMTu32 " is no longer a user of %s.", + display_lvname(seg->lv), seg->le, + display_lvname(lv)); dm_list_del(&sl->list); } return 1; } + log_error(INTERNAL_ERROR "Segment %s:" FMTu32 " is not a user of %s.", + display_lvname(seg->lv), seg->le, display_lvname(lv)); return 0; } @@ -128,28 +913,35 @@ int remove_seg_from_segs_using_this_lv(struct 
logical_volume *lv, * * In general, walk through lv->segs_using_this_lv. */ -struct lv_segment *get_only_segment_using_this_lv(struct logical_volume *lv) +struct lv_segment *get_only_segment_using_this_lv(const struct logical_volume *lv) { struct seg_list *sl; - if (dm_list_size(&lv->segs_using_this_lv) != 1) { - log_error("%s is expected to have only one segment using it, " - "while it has %d", lv->name, - dm_list_size(&lv->segs_using_this_lv)); + if (!lv) { + log_error(INTERNAL_ERROR "get_only_segment_using_this_lv() called with NULL LV."); return NULL; } - dm_list_iterate_items(sl, &lv->segs_using_this_lv) - break; /* first item */ + dm_list_iterate_items(sl, &lv->segs_using_this_lv) { + /* Needs to be he only item in list */ + if (!dm_list_end(&lv->segs_using_this_lv, &sl->list)) + break; - if (sl->count != 1) { - log_error("%s is expected to have only one segment using it, " - "while %s:%" PRIu32 " uses it %d times", - lv->name, sl->seg->lv->name, sl->seg->le, sl->count); - return NULL; + if (sl->count != 1) { + log_error("%s is expected to have only one segment using it, " + "while %s:" FMTu32 " uses it %d times.", + display_lvname(lv), display_lvname(sl->seg->lv), + sl->seg->le, sl->count); + return NULL; + } + + return sl->seg; } - return sl->seg; + log_error("%s is expected to have only one segment using it, while it has %d.", + display_lvname(lv), dm_list_size(&lv->segs_using_this_lv)); + + return NULL; } /* @@ -199,18 +991,59 @@ uint32_t find_free_lvnum(struct logical_volume *lv) return i; } +dm_percent_t copy_percent(const struct logical_volume *lv) +{ + uint32_t numerator = 0u, denominator = 0u; + struct lv_segment *seg; + + dm_list_iterate_items(seg, &lv->segments) { + denominator += seg->area_len; + + /* FIXME Generalise name of 'extents_copied' field */ + if (((seg_is_raid(seg) && !seg_is_any_raid0(seg)) || seg_is_mirrored(seg)) && + (seg->area_count > 1)) + numerator += seg->extents_copied; + else + numerator += seg->area_len; + } + + return denominator ? dm_make_percent(numerator, denominator) : DM_PERCENT_100; +} + +/* Round up extents to next stripe boundary for number of stripes */ +static uint32_t _round_to_stripe_boundary(struct volume_group *vg, uint32_t extents, + uint32_t stripes, int extend) +{ + uint32_t size_rest, new_extents = extents; + + if (!stripes) + return extents; + + /* Round up extents to stripe divisible amount */ + if ((size_rest = extents % stripes)) { + new_extents += extend ? stripes - size_rest : -size_rest; + log_print_unless_silent("Rounding size %s (%u extents) %s to stripe boundary size %s (%u extents).", + display_size(vg->cmd, (uint64_t) extents * vg->extent_size), extents, + new_extents < extents ? "down" : "up", + display_size(vg->cmd, (uint64_t) new_extents * vg->extent_size), new_extents); + } + + return new_extents; +} + /* * All lv_segments get created here. 
*/ struct lv_segment *alloc_lv_segment(const struct segment_type *segtype, struct logical_volume *lv, uint32_t le, uint32_t len, + uint32_t reshape_len, uint64_t status, uint32_t stripe_size, struct logical_volume *log_lv, - struct logical_volume *thin_pool_lv, uint32_t area_count, uint32_t area_len, + uint32_t data_copies, uint32_t chunk_size, uint32_t region_size, uint32_t extents_copied, @@ -233,7 +1066,7 @@ struct lv_segment *alloc_lv_segment(const struct segment_type *segtype, return_NULL; } - if (segtype_is_raid(segtype) && + if (segtype_is_raid_with_meta(segtype) && !(seg->meta_areas = dm_pool_zalloc(mem, areas_sz))) { dm_pool_free(mem, seg); /* frees everything alloced since seg */ return_NULL; @@ -243,65 +1076,70 @@ struct lv_segment *alloc_lv_segment(const struct segment_type *segtype, seg->lv = lv; seg->le = le; seg->len = len; + seg->reshape_len = reshape_len; seg->status = status; seg->stripe_size = stripe_size; seg->area_count = area_count; seg->area_len = area_len; + seg->data_copies = data_copies ? : lv_raid_data_copies(segtype, area_count); seg->chunk_size = chunk_size; seg->region_size = region_size; seg->extents_copied = extents_copied; seg->pvmove_source_seg = pvmove_source_seg; dm_list_init(&seg->tags); + dm_list_init(&seg->origin_list); dm_list_init(&seg->thin_messages); - if (thin_pool_lv) { - /* If this thin volume, thin snapshot is being created */ - if (lv_is_thin_volume(thin_pool_lv)) { - seg->transaction_id = first_seg(first_seg(thin_pool_lv)->pool_lv)->transaction_id; - if (!attach_pool_lv(seg, first_seg(thin_pool_lv)->pool_lv, thin_pool_lv)) - return_NULL; - } else { - seg->transaction_id = first_seg(thin_pool_lv)->transaction_id; - if (!attach_pool_lv(seg, thin_pool_lv, NULL)) - return_NULL; - } - } - if (log_lv && !attach_mirror_log(seg, log_lv)) return_NULL; + if (segtype_is_mirror(segtype)) + lv->status |= MIRROR; + + if (segtype_is_mirrored(segtype)) + lv->status |= MIRRORED; + return seg; } -struct lv_segment *alloc_snapshot_seg(struct logical_volume *lv, - uint64_t status, uint32_t old_le_count) +/* + * Temporary helper to return number of data copies for + * RAID segment @seg until seg->data_copies got added + */ +static uint32_t _raid_data_copies(struct lv_segment *seg) { - struct lv_segment *seg; - const struct segment_type *segtype; + /* + * FIXME: needs to change once more than 2 are supported. + * I.e. use seg->data_copies then + */ + if (seg_is_raid10(seg)) + return 2; - segtype = get_segtype_from_string(lv->vg->cmd, "snapshot"); - if (!segtype) { - log_error("Failed to find snapshot segtype"); - return NULL; - } + if (seg_is_raid1(seg)) + return seg->area_count; - if (!(seg = alloc_lv_segment(segtype, lv, old_le_count, - lv->le_count - old_le_count, status, 0, - NULL, NULL, 0, lv->le_count - old_le_count, - 0, 0, 0, NULL))) { - log_error("Couldn't allocate new snapshot segment."); - return NULL; - } + return seg->segtype->parity_devs + 1; +} - dm_list_add(&lv->segments, &seg->list); - lv->status |= VIRTUAL; +/* Data image count for RAID segment @seg */ +static uint32_t _raid_stripes_count(struct lv_segment *seg) +{ + /* + * FIXME: raid10 needs to change once more than + * 2 data_copies and odd # of legs supported. 
+ */ + if (seg_is_raid10(seg)) + return seg->area_count / _raid_data_copies(seg); - return seg; + return seg->area_count - seg->segtype->parity_devs; } static int _release_and_discard_lv_segment_area(struct lv_segment *seg, uint32_t s, uint32_t area_reduction, int with_discard) { + struct lv_segment *cache_seg; + struct logical_volume *lv = seg_lv(seg, s); + if (seg_type(seg, s) == AREA_UNASSIGNED) return 1; @@ -318,55 +1156,92 @@ static int _release_and_discard_lv_segment_area(struct lv_segment *seg, uint32_t return 1; } - if ((seg_lv(seg, s)->status & MIRROR_IMAGE) || - (seg_lv(seg, s)->status & THIN_POOL_DATA)) { - if (!lv_reduce(seg_lv(seg, s), area_reduction)) + if (lv_is_mirror_image(lv) || + lv_is_thin_pool_data(lv) || + lv_is_vdo_pool_data(lv) || + lv_is_cache_pool_data(lv)) { + if (!lv_reduce(lv, area_reduction)) return_0; /* FIXME: any upper level reporting */ return 1; } - if (seg_lv(seg, s)->status & RAID_IMAGE) { - /* - * FIXME: Use lv_reduce not lv_remove - * We use lv_remove for now, because I haven't figured out - * why lv_reduce won't remove the LV. - lv_reduce(seg_lv(seg, s), area_reduction); - */ - if (area_reduction != seg->area_len) { - log_error("Unable to reduce RAID LV - operation not implemented."); + if (seg_is_cache_pool(seg) && + !dm_list_empty(&seg->lv->segs_using_this_lv)) { + if (!(cache_seg = get_only_segment_using_this_lv(seg->lv))) return_0; - } else { - if (!lv_remove(seg_lv(seg, s))) { - log_error("Failed to remove RAID image %s", - seg_lv(seg, s)->name); - return 0; - } - } - /* Remove metadata area if image has been removed */ - if (area_reduction == seg->area_len) { - if (!lv_reduce(seg_metalv(seg, s), - seg_metalv(seg, s)->le_count)) { - log_error("Failed to remove RAID meta-device %s", - seg_metalv(seg, s)->name); + if (!lv_cache_remove(cache_seg->lv)) + return_0; + } + + if (lv_is_raid_image(lv)) { + /* Calculate the amount of extents to reduce per rmeta/rimage LV */ + uint32_t rimage_extents; + struct lv_segment *seg1 = first_seg(lv); + + /* FIXME: avoid extra seg_is_*() conditionals here */ + rimage_extents = raid_rimage_extents(seg1->segtype, area_reduction, + seg_is_any_raid0(seg) ? 0 : _raid_stripes_count(seg), + seg_is_raid10(seg) ? 
1 :_raid_data_copies(seg)); + if (!rimage_extents) + return 0; + + if (seg->meta_areas) { + uint32_t meta_area_reduction; + struct logical_volume *mlv; + struct volume_group *vg = lv->vg; + + if (seg_metatype(seg, s) != AREA_LV || + !(mlv = seg_metalv(seg, s))) return 0; - } + + meta_area_reduction = raid_rmeta_extents_delta(vg->cmd, lv->le_count, lv->le_count - rimage_extents, + seg->region_size, vg->extent_size); + /* Limit for raid0_meta not having region size set */ + if (meta_area_reduction > mlv->le_count || + !(lv->le_count - rimage_extents)) + meta_area_reduction = mlv->le_count; + + if (meta_area_reduction && + !lv_reduce(mlv, meta_area_reduction)) + return_0; /* FIXME: any upper level reporting */ } + + if (!lv_reduce(lv, rimage_extents)) + return_0; /* FIXME: any upper level reporting */ + return 1; } if (area_reduction == seg->area_len) { - log_very_verbose("Remove %s:%" PRIu32 "[%" PRIu32 "] from " - "the top of LV %s:%" PRIu32, - seg->lv->name, seg->le, s, - seg_lv(seg, s)->name, seg_le(seg, s)); + log_very_verbose("Remove %s:" FMTu32 "[" FMTu32 "] from " + "the top of LV %s:" FMTu32 ".", + display_lvname(seg->lv), seg->le, s, + display_lvname(lv), seg_le(seg, s)); + + if (!remove_seg_from_segs_using_this_lv(lv, seg)) + return_0; - remove_seg_from_segs_using_this_lv(seg_lv(seg, s), seg); seg_lv(seg, s) = NULL; seg_le(seg, s) = 0; seg_type(seg, s) = AREA_UNASSIGNED; } + /* When removed last VDO user automatically removes VDO pool */ + if (lv_is_vdo_pool(lv) && dm_list_empty(&(lv->segs_using_this_lv))) { + struct volume_group *vg = lv->vg; + + if (!lv_remove(lv)) /* FIXME: any upper level reporting */ + return_0; + + if (vg_is_shared(vg)) { + if (!lockd_lv_name(vg->cmd, vg, lv->name, &lv->lvid.id[1], lv->lock_args, "un", LDLV_PERSISTENT)) + log_error("Failed to unlock vdo pool in lvmlockd."); + lockd_free_lv(vg->cmd, vg, lv->name, &lv->lvid.id[1], lv->lock_args); + } + return 1; + } + return 1; } @@ -451,14 +1326,21 @@ int set_lv_segment_area_lv(struct lv_segment *seg, uint32_t area_num, struct logical_volume *lv, uint32_t le, uint64_t status) { - log_very_verbose("Stack %s:%" PRIu32 "[%" PRIu32 "] on LV %s:%" PRIu32, - seg->lv->name, seg->le, area_num, lv->name, le); + log_very_verbose("Stack %s:" FMTu32 "[" FMTu32 "] on LV %s:" FMTu32 ".", + display_lvname(seg->lv), seg->le, area_num, + display_lvname(lv), le); - if (status & RAID_META) { + if (area_num >= seg->area_count) { + log_error(INTERNAL_ERROR "Try to set to high area number (%u >= %u) for LV %s.", + area_num, seg->area_count, display_lvname(seg->lv)); + return 0; + } + lv->status |= status; + if (lv_is_raid_metadata(lv)) { seg->meta_areas[area_num].type = AREA_LV; seg_metalv(seg, area_num) = lv; if (le) { - log_error(INTERNAL_ERROR "Meta le != 0"); + log_error(INTERNAL_ERROR "Meta le != 0."); return 0; } seg_metale(seg, area_num) = 0; @@ -467,7 +1349,6 @@ int set_lv_segment_area_lv(struct lv_segment *seg, uint32_t area_num, seg_lv(seg, area_num) = lv; seg_le(seg, area_num) = le; } - lv->status |= status; if (!add_seg_to_segs_using_this_lv(lv, seg)) return_0; @@ -478,17 +1359,19 @@ int set_lv_segment_area_lv(struct lv_segment *seg, uint32_t area_num, /* * Prepare for adding parallel areas to an existing segment. 
*/ -static int _lv_segment_add_areas(struct logical_volume *lv, - struct lv_segment *seg, - uint32_t new_area_count) +int add_lv_segment_areas(struct lv_segment *seg, uint32_t new_area_count) { struct lv_segment_area *newareas; uint32_t areas_sz = new_area_count * sizeof(*newareas); - if (!(newareas = dm_pool_zalloc(lv->vg->cmd->mem, areas_sz))) - return_0; + if (!(newareas = dm_pool_zalloc(seg->lv->vg->vgmem, areas_sz))) { + log_error("Failed to allocate widened LV segment for %s.", + display_lvname(seg->lv)); + return 0; + } - memcpy(newareas, seg->areas, seg->area_count * sizeof(*seg->areas)); + if (seg->area_count) + memcpy(newareas, seg->areas, seg->area_count * sizeof(*seg->areas)); seg->areas = newareas; seg->area_count = new_area_count; @@ -496,22 +1379,68 @@ static int _lv_segment_add_areas(struct logical_volume *lv, return 1; } +static uint32_t _calc_area_multiple(const struct segment_type *segtype, + const uint32_t area_count, + const uint32_t stripes) +{ + if (!area_count) + return 1; + + /* Striped */ + if (segtype_is_striped(segtype)) + return area_count; + + /* Parity RAID (e.g. RAID 4/5/6) */ + if (segtype_is_raid(segtype) && segtype->parity_devs) { + /* + * As articulated in _alloc_init, we can tell by + * the area_count whether a replacement drive is + * being allocated; and if this is the case, then + * there is no area_multiple that should be used. + */ + if (area_count <= segtype->parity_devs) + return 1; + + return area_count - segtype->parity_devs; + } + + /* + * RAID10 - only has 2-way mirror right now. + * If we are to move beyond 2-way RAID10, then + * the 'stripes' argument will always need to + * be given. + */ + if (segtype_is_raid10(segtype)) { + if (!stripes) + return area_count / 2; + return stripes; + } + + /* Mirrored stripes */ + if (stripes) + return stripes; + + /* Mirrored */ + return 1; +} + /* * Reduce the size of an lv_segment. New size can be zero. */ static int _lv_segment_reduce(struct lv_segment *seg, uint32_t reduction) { uint32_t area_reduction, s; + uint32_t areas = (seg->area_count / (seg_is_raid10(seg) ? seg->data_copies : 1)) - seg->segtype->parity_devs; /* Caller must ensure exact divisibility */ - if (seg_is_striped(seg)) { - if (reduction % seg->area_count) { + if (seg_is_striped(seg) || seg_is_striped_raid(seg)) { + if (reduction % areas) { log_error("Segment extent reduction %" PRIu32 " not divisible by #stripes %" PRIu32, reduction, seg->area_count); return 0; } - area_reduction = (reduction / seg->area_count); + area_reduction = reduction / areas; } else area_reduction = reduction; @@ -520,7 +1449,77 @@ static int _lv_segment_reduce(struct lv_segment *seg, uint32_t reduction) return_0; seg->len -= reduction; - seg->area_len -= area_reduction; + + if (seg_is_raid(seg)) + seg->area_len = seg->len; + else + seg->area_len -= area_reduction; + + return 1; +} + +/* Find the bottommost resizable LV in the stack. + * It does not matter which LV is used in this stack for cmdline tool. */ +static struct logical_volume *_get_resizable_layer_lv(struct logical_volume *lv) +{ + while (lv_is_cache(lv) || /* _corig */ + lv_is_integrity(lv) || + lv_is_thin_pool(lv) || /* _tdata */ + lv_is_vdo_pool(lv) || /* _vdata */ + lv_is_writecache(lv)) /* _worigin */ + lv = seg_lv(first_seg(lv), 0); /* component-level down */ + + return lv; +} + +/* Check if LV is component of resizable LV. + * When resize changes size of LV this also changes the size whole stack upward. + * Support syntax suggar - so user can pick any LV in stack for resize. 
*/ +static int _is_layered_lv(struct logical_volume *lv) +{ + return (lv_is_cache_origin(lv) || + lv_is_integrity_origin(lv) || + lv_is_thin_pool_data(lv) || + lv_is_vdo_pool_data(lv) || + lv_is_writecache_origin(lv)); +} + +/* Find the topmost LV in the stack - usually such LV is visible. */ +static struct logical_volume *_get_top_layer_lv(struct logical_volume *lv) +{ + struct lv_segment *seg; + + while (_is_layered_lv(lv)) { + if (!(seg = get_only_segment_using_this_lv(lv))) { + log_error(INTERNAL_ERROR "No single component user of logical volume %s.", + display_lvname(lv)); + return NULL; + } + lv = seg->lv; /* component-level up */ + } + + return lv; +} + + +/* Handles also stacking */ +static int _setup_lv_size(struct logical_volume *lv, uint32_t extents) +{ + struct lv_segment *seg; + + lv->le_count = extents; + lv->size = (uint64_t) extents * lv->vg->extent_size; + + while (lv->size && _is_layered_lv(lv)) { + if (!(seg = get_only_segment_using_this_lv(lv))) + return_0; + + seg->lv->le_count = + seg->len = + seg->area_len = lv->le_count; + seg->lv->size = lv->size; + lv = seg->lv; + } return 1; } @@ -530,15 +1529,42 @@ static int _lv_segment_reduce(struct lv_segment *seg, uint32_t reduction) */ static int _lv_reduce(struct logical_volume *lv, uint32_t extents, int delete) { - struct lv_segment *seg; + struct lv_segment *seg = NULL; uint32_t count = extents; uint32_t reduction; + struct logical_volume *pool_lv; + struct logical_volume *external_lv = NULL; + int is_raid10 = 0; + uint32_t data_copies = 0; + struct lv_list *lvl; + int is_last_pool = lv_is_pool(lv); + + if (!dm_list_empty(&lv->segments)) { + seg = first_seg(lv); + is_raid10 = seg_is_any_raid10(seg) && seg->reshape_len; + data_copies = seg->data_copies; + } + + if (lv_is_merging_origin(lv)) { + log_debug_metadata("Dropping snapshot merge of %s to removed origin %s.", + find_snapshot(lv)->lv->name, lv->name); + clear_snapshot_merge(lv); + } dm_list_iterate_back_items(seg, &lv->segments) { if (!count) break; + if (seg->external_lv) + external_lv = seg->external_lv; + if (seg->len <= count) { + if (seg->merge_lv) { + log_debug_metadata("Dropping snapshot merge of removed %s to origin %s.", + seg->lv->name, seg->merge_lv->name); + clear_snapshot_merge(seg->merge_lv); + } + /* remove this segment completely */ /* FIXME Check this is safe */ if (seg->log_lv && !lv_remove(seg->log_lv)) @@ -547,9 +1573,51 @@ static int _lv_reduce(struct logical_volume *lv, uint32_t extents, int delete) if (seg->metadata_lv && !lv_remove(seg->metadata_lv)) return_0; - if (seg->pool_lv) { + /* Remove cache origin only when removing (not on lv_empty()) */ + if (delete && seg_is_cache(seg)) { + if (lv_is_pending_delete(seg->lv)) { + /* Just dropping reference on origin when pending delete */ + if (!remove_seg_from_segs_using_this_lv(seg_lv(seg, 0), seg)) + return_0; + seg_lv(seg, 0) = NULL; + seg_le(seg, 0) = 0; + seg_type(seg, 0) = AREA_UNASSIGNED; + if (seg->pool_lv && !detach_pool_lv(seg)) + return_0; + } else if (!lv_remove(seg_lv(seg, 0))) + return_0; + } + + if (delete && seg_is_integrity(seg)) { + /* Remove integrity origin in addition to integrity layer. */ + if (!lv_remove(seg_lv(seg, 0))) + return_0; + /* Remove integrity metadata. 
*/ + if (seg->integrity_meta_dev && !lv_remove(seg->integrity_meta_dev)) + return_0; + } + + if ((pool_lv = seg->pool_lv)) { if (!detach_pool_lv(seg)) return_0; + /* When removing cached LV, remove pool as well */ + if (seg_is_cache(seg) && !lv_remove(pool_lv)) + return_0; + } + + if (seg_is_thin_pool(seg)) { + /* For some segtypes the size may differ between the segment size and its layered LV + * i.e. thin-pool and tdata. + * + * This can get useful, when we will support multiple commits + * while resizing a stacked LV. + */ + if (seg->len != seg_lv(seg, 0)->le_count) { + seg->len = seg_lv(seg, 0)->le_count; + /* FIXME: ATM capture as error as it should not happen. */ + log_debug(INTERNAL_ERROR "Pool size mismatched data size for %s", + display_lvname(seg->lv)); + } } dm_list_del(&seg->list); @@ -562,12 +1630,24 @@ static int _lv_reduce(struct logical_volume *lv, uint32_t extents, int delete) count -= reduction; } - lv->le_count -= extents; - lv->size = (uint64_t) lv->le_count * lv->vg->extent_size; + if (!_setup_lv_size(lv, lv->le_count - extents * (is_raid10 ? data_copies : 1))) + return_0; + + if ((seg = first_seg(lv))) { + if (is_raid10) + seg->len = seg->area_len = lv->le_count; + + seg->extents_copied = seg->len; + } if (!delete) return 1; + if (lv == lv->vg->pool_metadata_spare_lv) { + lv->status &= ~POOL_METADATA_SPARE; + lv->vg->pool_metadata_spare_lv = NULL; + } + /* Remove the LV if it is now empty */ if (!lv->le_count && !unlink_lv_from_vg(lv)) return_0; @@ -575,6 +1655,34 @@ static int _lv_reduce(struct logical_volume *lv, uint32_t extents, int delete) !lv->vg->fid->fmt->ops->lv_setup(lv->vg->fid, lv)) return_0; + /* Removal of last user enforces refresh */ + if (external_lv && !lv_is_external_origin(external_lv) && + lv_is_active(external_lv) && + !lv_update_and_reload(external_lv)) + return_0; + + /* When removing last pool, automatically drop the spare volume */ + if (is_last_pool && lv->vg->pool_metadata_spare_lv) { + /* TODO: maybe use a list of pools or a counter to avoid linear search through VG */ + dm_list_iterate_items(lvl, &lv->vg->lvs) + if (lv_is_thin_type(lvl->lv) || + lv_is_cache_type(lvl->lv)) { + is_last_pool = 0; + break; + } + + if (is_last_pool) { + /* This is purely internal LV volume, no question */ + if (!deactivate_lv(lv->vg->cmd, lv->vg->pool_metadata_spare_lv)) { + log_error("Unable to deactivate spare logical volume %s.", + display_lvname(lv->vg->pool_metadata_spare_lv)); + return 0; + } + if (!lv_remove(lv->vg->pool_metadata_spare_lv)) + return_0; + } + } + return 1; } @@ -592,6 +1700,10 @@ int lv_empty(struct logical_volume *lv) int replace_lv_with_error_segment(struct logical_volume *lv) { uint32_t len = lv->le_count; + struct segment_type *segtype; + + if (!(segtype = get_segtype_from_string(lv->vg->cmd, SEG_TYPE_NAME_ERROR))) + return_0; if (len && !lv_empty(lv)) return_0; @@ -605,29 +1717,149 @@ int replace_lv_with_error_segment(struct logical_volume *lv) * an error segment, we should also clear any flags * that suggest it is anything other than "error". */ - lv->status &= ~(MIRRORED|PVMOVE); + /* FIXME Check for other flags that need removing */ + lv->status &= ~(MIRROR|MIRRORED|PVMOVE|LOCKED); - /* FIXME: Should we bug if we find a log_lv attached? */ + /* FIXME Check for any attached LVs that will become orphans e.g. 
mirror logs */ - if (!lv_add_virtual_segment(lv, 0, len, get_segtype_from_string(lv->vg->cmd, "error"), NULL)) + if (!lv_add_virtual_segment(lv, 0, len, segtype)) return_0; return 1; } +static int _lv_refresh_suspend_resume(const struct logical_volume *lv) +{ + struct cmd_context *cmd = lv->vg->cmd; + int r = 1; + + if (!cmd->partial_activation && lv_is_partial(lv)) { + log_error("Refusing refresh of partial LV %s." + " Use '--activationmode partial' to override.", + display_lvname(lv)); + return 0; + } + + if (!suspend_lv(cmd, lv)) { + log_error("Failed to suspend %s.", display_lvname(lv)); + r = 0; + } + + if (!resume_lv(cmd, lv)) { + log_error("Failed to reactivate %s.", display_lvname(lv)); + r = 0; + } + + return r; +} + +int lv_refresh_suspend_resume(const struct logical_volume *lv) +{ + if (!_lv_refresh_suspend_resume(lv)) + return 0; + + /* + * Remove any transiently activated error + * devices which arean't used any more. + */ + if (lv_is_raid(lv) && !lv_deactivate_any_missing_subdevs(lv)) { + log_error("Failed to remove temporary SubLVs from %s", display_lvname(lv)); + return 0; + } + + return 1; +} + /* * Remove given number of extents from LV. */ int lv_reduce(struct logical_volume *lv, uint32_t extents) { + struct lv_segment *seg = first_seg(lv); + + /* Ensure stripe boundary extents on RAID LVs */ + if (lv_is_raid(lv) && extents != lv->le_count) + extents =_round_to_stripe_boundary(lv->vg, extents, + seg_is_raid1(seg) ? 0 : _raid_stripes_count(seg), 0); + + if ((extents == lv->le_count) && lv_is_component(lv) && lv_is_active(lv)) { + /* When LV is removed, make sure it is inactive */ + log_error(INTERNAL_ERROR "Removing still active LV %s.", display_lvname(lv)); + return 0; + } + return _lv_reduce(lv, extents, 1); } +int historical_glv_remove(struct generic_logical_volume *glv) +{ + struct generic_logical_volume *origin_glv; + struct glv_list *glvl, *user_glvl; + struct historical_logical_volume *hlv; + int reconnected; + + if (!glv || !glv->is_historical) + return_0; + + hlv = glv->historical; + + if (!(glv = find_historical_glv(hlv->vg, hlv->name, 0, &glvl))) { + if (!(find_historical_glv(hlv->vg, hlv->name, 1, NULL))) { + log_error(INTERNAL_ERROR "historical_glv_remove: historical LV %s/-%s not found ", + hlv->vg->name, hlv->name); + return 0; + } + + log_verbose("Historical LV %s/-%s already on removed list ", + hlv->vg->name, hlv->name); + return 1; + } + + if ((origin_glv = hlv->indirect_origin) && + !remove_glv_from_indirect_glvs(origin_glv, glv)) + return_0; + + dm_list_iterate_items(user_glvl, &hlv->indirect_glvs) { + reconnected = 0; + if ((origin_glv && !origin_glv->is_historical) && !user_glvl->glv->is_historical) + log_verbose("Removing historical connection between %s and %s.", + origin_glv->live->name, user_glvl->glv->live->name); + else if (hlv->vg->cmd->record_historical_lvs) { + if (!add_glv_to_indirect_glvs(hlv->vg->vgmem, origin_glv, user_glvl->glv)) + return_0; + reconnected = 1; + } + + if (!reconnected) { + /* + * Break ancestry chain if we're removing historical LV and tracking + * historical LVs is switched off either via: + * - "metadata/record_lvs_history=0" config + * - "--nohistory" cmd line option + * + * Also, break the chain if we're unable to store such connection at all + * because we're removing the very last historical LV that was in between + * live LVs - pure live LVs can't store any indirect origin relation in + * metadata - we need at least one historical LV to do that! 
+ */ + if (user_glvl->glv->is_historical) + user_glvl->glv->historical->indirect_origin = NULL; + else + first_seg(user_glvl->glv->live)->indirect_origin = NULL; + } + } + + dm_list_move(&hlv->vg->removed_historical_lvs, &glvl->list); + return 1; +} + /* * Completely remove an LV. */ int lv_remove(struct logical_volume *lv) { + if (lv_is_historical(lv)) + return historical_glv_remove(lv->this_glv); if (!lv_reduce(lv, lv->le_count)) return_0; @@ -654,9 +1886,10 @@ struct alloc_handle { struct dm_pool *mem; alloc_policy_t alloc; /* Overall policy */ + int approx_alloc; /* get as much as possible up to new_extents */ uint32_t new_extents; /* Number of new extents required */ uint32_t area_count; /* Number of parallel areas */ - uint32_t parity_count; /* Adds to area_count, but not area_multiple */ + uint32_t parity_count; /* Adds to area_count, but not area_multiple */ uint32_t area_multiple; /* seg->len = area_len * area_multiple */ uint32_t log_area_count; /* Number of parallel logs */ uint32_t metadata_area_count; /* Number of parallel metadata areas */ @@ -673,8 +1906,12 @@ struct alloc_handle { * that is new_extents + log_len and then split that between two * allocated areas when found. 'alloc_and_split_meta' indicates * that this is the desired dynamic. + * + * This same idea is used by cache LVs to get the metadata device + * and data device allocated together. */ unsigned alloc_and_split_meta; + unsigned split_metadata_is_allocated; /* Metadata has been allocated */ const struct dm_config_node *cling_tag_list_cn; @@ -684,57 +1921,18 @@ struct alloc_handle { * Contains area_count lists of areas allocated to data stripes * followed by log_area_count lists of areas allocated to log stripes. */ - struct dm_list alloced_areas[0]; + struct dm_list alloced_areas[]; }; -static uint32_t _calc_area_multiple(const struct segment_type *segtype, - const uint32_t area_count, const uint32_t stripes) -{ - if (!area_count) - return 1; - - /* Striped */ - if (segtype_is_striped(segtype)) - return area_count; - - /* Parity RAID (e.g. RAID 4/5/6) */ - if (segtype_is_raid(segtype) && segtype->parity_devs) { - /* - * As articulated in _alloc_init, we can tell by - * the area_count whether a replacement drive is - * being allocated; and if this is the case, then - * there is no area_multiple that should be used. - */ - if (area_count <= segtype->parity_devs) - return 1; - return area_count - segtype->parity_devs; - } - - /* RAID10 - only has 2-way mirror right now */ - if (!strcmp(segtype->name, "raid10")) { - // FIXME: I'd like the 'stripes' arg always given - if (!stripes) - return area_count / 2; - return stripes; - } - - /* Mirrored stripes */ - if (stripes) - return stripes; - - /* Mirrored */ - return 1; -} - /* * Returns log device size in extents, algorithm from kernel code */ #define BYTE_SHIFT 3 -static uint32_t mirror_log_extents(uint32_t region_size, uint32_t pe_size, uint32_t area_len) +static uint32_t _mirror_log_extents(uint32_t region_size, uint32_t pe_size, uint32_t area_len) { - size_t area_size, bitset_size, log_size, region_count; + uint64_t area_size, region_count, bitset_size, log_size; - area_size = (size_t)area_len * pe_size; + area_size = (uint64_t) area_len * pe_size; region_count = dm_div_up(area_size, region_size); /* Work out how many "unsigned long"s we need to hold the bitset. 
*/ @@ -746,186 +1944,30 @@ static uint32_t mirror_log_extents(uint32_t region_size, uint32_t pe_size, uint3 log_size >>= SECTOR_SHIFT; log_size = dm_div_up(log_size, pe_size); - /* - * Kernel requires a mirror to be at least 1 region large. So, - * if our mirror log is itself a mirror, it must be at least - * 1 region large. This restriction may not be necessary for - * non-mirrored logs, but we apply the rule anyway. - * - * (The other option is to make the region size of the log - * mirror smaller than the mirror it is acting as a log for, - * but that really complicates things. It's much easier to - * keep the region_size the same for both.) - */ - return (log_size > (region_size / pe_size)) ? log_size : - (region_size / pe_size); -} - -/* - * Preparation for a specific allocation attempt - * stripes and mirrors refer to the parallel areas used for data. - * If log_area_count > 1 it is always mirrored (not striped). - */ -static struct alloc_handle *_alloc_init(struct cmd_context *cmd, - struct dm_pool *mem, - const struct segment_type *segtype, - alloc_policy_t alloc, - uint32_t new_extents, - uint32_t mirrors, - uint32_t stripes, - uint32_t metadata_area_count, - uint32_t extent_size, - uint32_t region_size, - struct dm_list *parallel_areas) -{ - struct alloc_handle *ah; - uint32_t s, area_count, alloc_count, parity_count; - size_t size = 0; - - /* FIXME Caller should ensure this */ - if (mirrors && !stripes) - stripes = 1; - - if (segtype_is_virtual(segtype)) - area_count = 0; - else if (mirrors > 1) - area_count = mirrors * stripes; - else - area_count = stripes; - - size = sizeof(*ah); - - /* - * It is a requirement that RAID 4/5/6 are created with a number of - * stripes that is greater than the number of parity devices. (e.g - * RAID4/5 must have at least 2 stripes and RAID6 must have at least - * 3.) It is also a constraint that, when replacing individual devices - * in a RAID 4/5/6 array, no more devices can be replaced than - * there are parity devices. (Otherwise, there would not be enough - * redundancy to maintain the array.) Understanding these two - * constraints allows us to infer whether the caller of this function - * is intending to allocate an entire array or just replacement - * component devices. In the former case, we must account for the - * necessary parity_count. In the later case, we do not need to - * account for the extra parity devices because the array already - * exists and they only want replacement drives. - */ - parity_count = (area_count <= segtype->parity_devs) ? 
0 : - segtype->parity_devs; - alloc_count = area_count + parity_count; - if (segtype_is_raid(segtype) && metadata_area_count) - /* RAID has a meta area for each device */ - alloc_count *= 2; - else - /* mirrors specify their exact log count */ - alloc_count += metadata_area_count; - - size += sizeof(ah->alloced_areas[0]) * alloc_count; - - if (!(ah = dm_pool_zalloc(mem, size))) { - log_error("allocation handle allocation failed"); - return NULL; - } - - ah->cmd = cmd; - - if (segtype_is_virtual(segtype)) - return ah; - - if (!(area_count + metadata_area_count)) { - log_error(INTERNAL_ERROR "_alloc_init called for non-virtual segment with no disk space."); - return NULL; - } - - if (!(ah->mem = dm_pool_create("allocation", 1024))) { - log_error("allocation pool creation failed"); - return NULL; - } - - if (mirrors || stripes) - ah->new_extents = new_extents; - else - ah->new_extents = 0; - ah->area_count = area_count; - ah->parity_count = parity_count; - ah->region_size = region_size; - ah->alloc = alloc; - - /* - * For the purposes of allocation, area_count and parity_count are - * kept separately. However, the 'area_count' field in an - * lv_segment includes both; and this is what '_calc_area_multiple' - * is calculated from. So, we must pass in the total count to get - * a correct area_multiple. - */ - ah->area_multiple = _calc_area_multiple(segtype, area_count + parity_count, stripes); - ah->mirror_logs_separate = find_config_tree_bool(cmd, "allocation/mirror_logs_require_separate_pvs", - DEFAULT_MIRROR_LOGS_REQUIRE_SEPARATE_PVS); - - if (segtype_is_raid(segtype)) { - if (metadata_area_count) { - if (metadata_area_count != area_count) - log_error(INTERNAL_ERROR - "Bad metadata_area_count"); - ah->metadata_area_count = area_count; - ah->alloc_and_split_meta = 1; - - ah->log_len = RAID_METADATA_AREA_LEN; - - /* - * We need 'log_len' extents for each - * RAID device's metadata_area - */ - ah->new_extents += (ah->log_len * ah->area_multiple); - } else { - ah->log_area_count = 0; - ah->log_len = 0; - } - } else if (segtype_is_thin_pool(segtype)) { - ah->log_area_count = metadata_area_count; - /* thin_pool uses region_size to pass metadata size in extents */ - ah->log_len = ah->region_size; - ah->region_size = 0; - ah->mirror_logs_separate = - find_config_tree_bool(cmd, "allocation/thin_pool_metadata_require_separate_pvs", - DEFAULT_THIN_POOL_METADATA_REQUIRE_SEPARATE_PVS); - } else { - ah->log_area_count = metadata_area_count; - ah->log_len = !metadata_area_count ? 0 : - mirror_log_extents(ah->region_size, extent_size, - new_extents / ah->area_multiple); + if (log_size > UINT32_MAX) { + log_error("Log size needs too many extents "FMTu64" with region size of %u sectors.", + log_size, region_size); + log_size = UINT32_MAX; + /* VG likely will not have enough free space for this allocation -> error */ } - for (s = 0; s < alloc_count; s++) - dm_list_init(&ah->alloced_areas[s]); - - ah->parallel_areas = parallel_areas; - - ah->cling_tag_list_cn = find_config_tree_node(cmd, "allocation/cling_tag_list"); - - ah->maximise_cling = find_config_tree_bool(cmd, "allocation/maximise_cling", DEFAULT_MAXIMISE_CLING); - - return ah; -} - -void alloc_destroy(struct alloc_handle *ah) -{ - if (ah->mem) - dm_pool_destroy(ah->mem); + return (uint32_t) log_size; } /* Is there enough total space or should we give up immediately? 
*/ static int _sufficient_pes_free(struct alloc_handle *ah, struct dm_list *pvms, - uint32_t allocated, uint32_t extents_still_needed) + uint32_t allocated, uint32_t log_still_needed, + uint32_t extents_still_needed) { uint32_t area_extents_needed = (extents_still_needed - allocated) * ah->area_count / ah->area_multiple; uint32_t parity_extents_needed = (extents_still_needed - allocated) * ah->parity_count / ah->area_multiple; - uint32_t metadata_extents_needed = ah->metadata_area_count * RAID_METADATA_AREA_LEN; /* One each */ - uint32_t total_extents_needed = area_extents_needed + parity_extents_needed + metadata_extents_needed; + uint32_t metadata_extents_needed = (ah->alloc_and_split_meta ? 0 : ah->metadata_area_count * RAID_METADATA_AREA_LEN) + + (log_still_needed ? ah->log_len : 0); /* One each */ + uint64_t total_extents_needed = (uint64_t)area_extents_needed + parity_extents_needed + metadata_extents_needed; uint32_t free_pes = pv_maps_size(pvms); if (total_extents_needed > free_pes) { - log_error("Insufficient free space: %" PRIu32 " extents needed," + log_error("Insufficient free space: %" PRIu64 " extents needed," " but only %" PRIu32 " available", total_extents_needed, free_pes); return 0; @@ -948,7 +1990,9 @@ static uint32_t _stripes_per_mimage(struct lv_segment *seg) return 1; } -static void _init_alloc_parms(struct alloc_handle *ah, struct alloc_parms *alloc_parms, alloc_policy_t alloc, +static void _init_alloc_parms(struct alloc_handle *ah, + struct alloc_parms *alloc_parms, + alloc_policy_t alloc, struct lv_segment *prev_lvseg, unsigned can_split, uint32_t allocated, uint32_t extents_still_needed) { @@ -957,78 +2001,56 @@ static void _init_alloc_parms(struct alloc_handle *ah, struct alloc_parms *alloc alloc_parms->flags = 0; alloc_parms->extents_still_needed = extents_still_needed; - /* Are there any preceding segments we must follow on from? */ - if (alloc_parms->prev_lvseg) { - if (alloc_parms->alloc == ALLOC_CONTIGUOUS) + /* + * Only attempt contiguous/cling allocation to previous segment + * areas if the number of areas matches. + */ + if (alloc_parms->prev_lvseg && + ((ah->area_count + ah->parity_count) == prev_lvseg->area_count)) { + alloc_parms->flags |= A_AREA_COUNT_MATCHES; + + /* Are there any preceding segments we must follow on from? */ + if (alloc_parms->alloc == ALLOC_CONTIGUOUS) { alloc_parms->flags |= A_CONTIGUOUS_TO_LVSEG; - else if ((alloc_parms->alloc == ALLOC_CLING) || (alloc_parms->alloc == ALLOC_CLING_BY_TAGS)) + alloc_parms->flags |= A_POSITIONAL_FILL; + } else if ((alloc_parms->alloc == ALLOC_CLING) || + (alloc_parms->alloc == ALLOC_CLING_BY_TAGS)) { alloc_parms->flags |= A_CLING_TO_LVSEG; + alloc_parms->flags |= A_POSITIONAL_FILL; + } } else /* - * A cling allocation that follows a successful contiguous allocation - * must use the same PVs (or else fail). + * A cling allocation that follows a successful contiguous + * allocation must use the same PVs (or else fail). 
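The reworked `_sufficient_pes_free()` above is only a cheap up-front feasibility check: it now sums the data, parity and still-needed log/metadata extents in a 64-bit total before comparing against the free extents reported by the PV maps. A worked example with purely illustrative numbers (extending a 2-way mirror that still needs its 1-extent log):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* Hypothetical request: 100 more extents of a 2-way mirror,
         * with a 1-extent mirror log still to be allocated.
         */
        uint32_t extents_still_needed = 100, allocated = 0;
        uint32_t area_count = 2, parity_count = 0, area_multiple = 1;
        uint32_t log_len = 1, log_still_needed = 1;

        uint32_t area_extents = (extents_still_needed - allocated) * area_count / area_multiple;
        uint32_t parity_extents = (extents_still_needed - allocated) * parity_count / area_multiple;
        uint64_t total = (uint64_t) area_extents + parity_extents +
                         (log_still_needed ? log_len : 0);

        /* 200 + 0 + 1 = 201 free extents must exist somewhere in the VG,
         * otherwise the allocation is rejected immediately.
         */
        printf("total extents needed: %" PRIu64 "\n", total);
        return 0;
}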
*/ - if ((alloc_parms->alloc == ALLOC_CLING) || (alloc_parms->alloc == ALLOC_CLING_BY_TAGS)) + if ((alloc_parms->alloc == ALLOC_CLING) || + (alloc_parms->alloc == ALLOC_CLING_BY_TAGS)) { alloc_parms->flags |= A_CLING_TO_ALLOCED; + alloc_parms->flags |= A_POSITIONAL_FILL; + } if (alloc_parms->alloc == ALLOC_CLING_BY_TAGS) alloc_parms->flags |= A_CLING_BY_TAGS; + if (!(alloc_parms->alloc & A_POSITIONAL_FILL) && + (alloc_parms->alloc == ALLOC_CONTIGUOUS) && + ah->cling_tag_list_cn) + alloc_parms->flags |= A_PARTITION_BY_TAGS; + /* - * For normal allocations, if any extents have already been found + * For normal allocations, if any extents have already been found * for allocation, prefer to place further extents on the same disks as * have already been used. */ - if (ah->maximise_cling && alloc_parms->alloc == ALLOC_NORMAL && allocated != alloc_parms->extents_still_needed) + if (ah->maximise_cling && + (alloc_parms->alloc == ALLOC_NORMAL) && + (allocated != alloc_parms->extents_still_needed)) alloc_parms->flags |= A_CLING_TO_ALLOCED; if (can_split) alloc_parms->flags |= A_CAN_SPLIT; } -static int _log_parallel_areas(struct dm_pool *mem, struct dm_list *parallel_areas) -{ - struct seg_pvs *spvs; - struct pv_list *pvl; - char *pvnames; - - if (!parallel_areas) - return 1; - - dm_list_iterate_items(spvs, parallel_areas) { - if (!dm_pool_begin_object(mem, 256)) { - log_error("dm_pool_begin_object failed"); - return 0; - } - - dm_list_iterate_items(pvl, &spvs->pvs) { - if (!dm_pool_grow_object(mem, pv_dev_name(pvl->pv), strlen(pv_dev_name(pvl->pv)))) { - log_error("dm_pool_grow_object failed"); - dm_pool_abandon_object(mem); - return 0; - } - if (!dm_pool_grow_object(mem, " ", 1)) { - log_error("dm_pool_grow_object failed"); - dm_pool_abandon_object(mem); - return 0; - } - } - - if (!dm_pool_grow_object(mem, "\0", 1)) { - log_error("dm_pool_grow_object failed"); - dm_pool_abandon_object(mem); - return 0; - } - - pvnames = dm_pool_end_object(mem); - log_debug("Parallel PVs at LE %" PRIu32 " length %" PRIu32 ": %s", - spvs->le, spvs->len, pvnames); - dm_pool_free(mem, pvnames); - } - - return 1; -} - static int _setup_alloced_segment(struct logical_volume *lv, uint64_t status, uint32_t area_count, uint32_t stripe_size, @@ -1040,12 +2062,12 @@ static int _setup_alloced_segment(struct logical_volume *lv, uint64_t status, struct lv_segment *seg; area_multiple = _calc_area_multiple(segtype, area_count, 0); + extents = aa[0].len * area_multiple; - if (!(seg = alloc_lv_segment(segtype, lv, lv->le_count, - aa[0].len * area_multiple, - status, stripe_size, NULL, NULL, + if (!(seg = alloc_lv_segment(segtype, lv, lv->le_count, extents, 0, + status, stripe_size, NULL, area_count, - aa[0].len, 0u, region_size, 0u, NULL))) { + aa[0].len, 0, 0u, region_size, 0u, NULL))) { log_error("Couldn't allocate new LV segment."); return 0; } @@ -1057,11 +2079,9 @@ static int _setup_alloced_segment(struct logical_volume *lv, uint64_t status, dm_list_add(&lv->segments, &seg->list); extents = aa[0].len * area_multiple; - lv->le_count += extents; - lv->size += (uint64_t) extents *lv->vg->extent_size; - if (segtype_is_mirrored(segtype)) - lv->status |= MIRRORED; + if (!_setup_lv_size(lv, lv->le_count + extents)) + return_0; return 1; } @@ -1095,16 +2115,15 @@ static int _alloc_parallel_area(struct alloc_handle *ah, uint32_t max_to_allocat struct alloc_state *alloc_state, uint32_t ix_log_offset) { uint32_t area_len, len; - uint32_t s; + uint32_t s, smeta; uint32_t ix_log_skip = 0; /* How many areas to skip in middle of array to 
reach log areas */ uint32_t total_area_count; struct alloced_area *aa; struct pv_area *pva; - total_area_count = ah->area_count + alloc_state->log_area_count_still_needed; - total_area_count += ah->parity_count; + total_area_count = ah->area_count + ah->parity_count + alloc_state->log_area_count_still_needed; if (!total_area_count) { - log_error(INTERNAL_ERROR "_alloc_parallel_area called without any allocation to do."); + log_warn(INTERNAL_ERROR "_alloc_parallel_area called without any allocation to do."); return 1; } @@ -1115,7 +2134,7 @@ static int _alloc_parallel_area(struct alloc_handle *ah, uint32_t max_to_allocat if (area_len > alloc_state->areas[s].used) area_len = alloc_state->areas[s].used; - len = (ah->alloc_and_split_meta) ? total_area_count * 2 : total_area_count; + len = (ah->alloc_and_split_meta && !ah->split_metadata_is_allocated) ? total_area_count * 2 : total_area_count; len *= sizeof(*aa); if (!(aa = dm_pool_alloc(ah->mem, len))) { log_error("alloced_area allocation failed"); @@ -1135,7 +2154,7 @@ static int _alloc_parallel_area(struct alloc_handle *ah, uint32_t max_to_allocat } pva = alloc_state->areas[s + ix_log_skip].pva; - if (ah->alloc_and_split_meta) { + if (ah->alloc_and_split_meta && !ah->split_metadata_is_allocated) { /* * The metadata area goes at the front of the allocated * space for now, but could easily go at the end (or @@ -1145,23 +2164,25 @@ static int _alloc_parallel_area(struct alloc_handle *ah, uint32_t max_to_allocat * allocation, we store the images at the beginning * of the areas array and the metadata at the end. */ - s += ah->area_count + ah->parity_count; - aa[s].pv = pva->map->pv; - aa[s].pe = pva->start; - aa[s].len = ah->log_len; - - log_debug("Allocating parallel metadata area %" PRIu32 - " on %s start PE %" PRIu32 - " length %" PRIu32 ".", - (s - (ah->area_count + ah->parity_count)), - pv_dev_name(aa[s].pv), aa[s].pe, - ah->log_len); - - consume_pv_area(pva, ah->log_len); - dm_list_add(&ah->alloced_areas[s], &aa[s].list); - s -= ah->area_count + ah->parity_count; - } - aa[s].len = (ah->alloc_and_split_meta) ? len - ah->log_len : len; + smeta = s + ah->area_count + ah->parity_count; + aa[smeta].pv = pva->map->pv; + aa[smeta].pe = pva->start; + aa[smeta].len = ah->log_len; + if (aa[smeta].len > pva->count) { + log_error("Metadata does not fit on a single PV."); + return 0; + } + log_debug_alloc("Allocating parallel metadata area %" PRIu32 + " on %s start PE %" PRIu32 + " length %" PRIu32 ".", + (smeta - (ah->area_count + ah->parity_count)), + pv_dev_name(aa[smeta].pv), aa[smeta].pe, + aa[smeta].len); + + consume_pv_area(pva, aa[smeta].len); + dm_list_add(&ah->alloced_areas[smeta], &aa[smeta].list); + } + aa[s].len = (ah->alloc_and_split_meta && !ah->split_metadata_is_allocated) ? 
len - ah->log_len : len; /* Skip empty allocations */ if (!aa[s].len) continue; @@ -1169,9 +2190,9 @@ static int _alloc_parallel_area(struct alloc_handle *ah, uint32_t max_to_allocat aa[s].pv = pva->map->pv; aa[s].pe = pva->start; - log_debug("Allocating parallel area %" PRIu32 - " on %s start PE %" PRIu32 " length %" PRIu32 ".", - s, pv_dev_name(aa[s].pv), aa[s].pe, aa[s].len); + log_debug_alloc("Allocating parallel area %" PRIu32 + " on %s start PE %" PRIu32 " length %" PRIu32 ".", + s, pv_dev_name(aa[s].pv), aa[s].pe, aa[s].len); consume_pv_area(pva, aa[s].len); @@ -1179,7 +2200,8 @@ static int _alloc_parallel_area(struct alloc_handle *ah, uint32_t max_to_allocat } /* Only need to alloc metadata from the first batch */ - ah->alloc_and_split_meta = 0; + if (ah->alloc_and_split_meta) + ah->split_metadata_is_allocated = 1; ah->total_area_len += area_len; @@ -1194,6 +2216,7 @@ static int _alloc_parallel_area(struct alloc_handle *ah, uint32_t max_to_allocat * reduced to cover only the first. * fn should return 0 on error, 1 to continue scanning or >1 to terminate without error. * In the last case, this function passes on the return code. + * FIXME I think some callers are expecting this to check all PV segments used by an LV. */ static int _for_each_pv(struct cmd_context *cmd, struct logical_volume *lv, uint32_t le, uint32_t len, struct lv_segment *seg, @@ -1227,7 +2250,7 @@ static int _for_each_pv(struct cmd_context *cmd, struct logical_volume *lv, *max_seg_len = remaining_seg_len; area_multiple = _calc_area_multiple(seg->segtype, seg->area_count, 0); - area_len = remaining_seg_len / area_multiple ? : 1; + area_len = (remaining_seg_len / area_multiple) ? : 1; /* For striped mirrors, all the areas are counted, through the mirror layer */ if (top_level_area_index == -1) @@ -1263,7 +2286,20 @@ static int _for_each_pv(struct cmd_context *cmd, struct logical_volume *lv, return r; } - /* FIXME Add snapshot cow LVs etc. */ + /* FIXME Add snapshot cow, thin meta etc. */ + +/* + if (!only_single_area_segments && !max_areas && seg_is_raid(seg)) { + for (s = first_area; s < seg->area_count; s++) { + if (seg_metalv(seg, s)) + if (!(r = _for_each_pv(cmd, seg_metalv(seg, s), 0, seg_metalv(seg, s)->le_count, NULL, + NULL, 0, 0, 0, 0, fn, data))) + stack; + if (r != 1) + return r; + } + } +*/ return 1; } @@ -1276,7 +2312,7 @@ static int _comp_area(const void *l, const void *r) if (lhs->used < rhs->used) return 1; - else if (lhs->used > rhs->used) + if (lhs->used > rhs->used) return -1; return 0; @@ -1288,9 +2324,9 @@ static int _comp_area(const void *l, const void *r) struct pv_match { int (*condition)(struct pv_match *pvmatch, struct pv_segment *pvseg, struct pv_area *pva); - struct pv_area_used *areas; + struct alloc_handle *ah; + struct alloc_state *alloc_state; struct pv_area *pva; - uint32_t areas_size; const struct dm_config_node *cling_tag_list_cn; int s; /* Area index of match */ }; @@ -1307,69 +2343,235 @@ static int _is_same_pv(struct pv_match *pvmatch __attribute((unused)), struct pv } /* - * Does PV area have a tag listed in allocation/cling_tag_list that - * matches a tag of the PV of the existing segment? + * Does PV area have a tag listed in allocation/cling_tag_list that + * matches EITHER a tag of the PV of the existing segment OR a tag in pv_tags? + * If mem is set, then instead we append a list of matching tags for printing to the object there. 
*/ -static int _pvs_have_matching_tag(const struct dm_config_node *cling_tag_list_cn, struct physical_volume *pv1, struct physical_volume *pv2) +static int _match_pv_tags(const struct dm_config_node *cling_tag_list_cn, + struct physical_volume *pv1, uint32_t pv1_start_pe, uint32_t area_num, + struct physical_volume *pv2, struct dm_list *pv_tags, unsigned validate_only, + struct dm_pool *mem, unsigned parallel_pv) { const struct dm_config_value *cv; const char *str; const char *tag_matched; + struct dm_list *tags_to_match = mem ? NULL : pv_tags ? : ((pv2) ? &pv2->tags : NULL); + struct dm_str_list *sl; + unsigned first_tag = 1; for (cv = cling_tag_list_cn->v; cv; cv = cv->next) { if (cv->type != DM_CFG_STRING) { - log_error("Ignoring invalid string in config file entry " - "allocation/cling_tag_list"); + if (validate_only) + log_warn("WARNING: Ignoring invalid string in config file entry " + "allocation/cling_tag_list"); continue; } str = cv->v.str; if (!*str) { - log_error("Ignoring empty string in config file entry " - "allocation/cling_tag_list"); + if (validate_only) + log_warn("WARNING: Ignoring empty string in config file entry " + "allocation/cling_tag_list"); continue; } if (*str != '@') { - log_error("Ignoring string not starting with @ in config file entry " - "allocation/cling_tag_list: %s", str); + if (validate_only) + log_warn("WARNING: Ignoring string not starting with @ in config file entry " + "allocation/cling_tag_list: %s", str); continue; } str++; if (!*str) { - log_error("Ignoring empty tag in config file entry " - "allocation/cling_tag_list"); + if (validate_only) + log_warn("WARNING: Ignoring empty tag in config file entry " + "allocation/cling_tag_list"); continue; } + if (validate_only) + continue; + /* Wildcard matches any tag against any tag. */ if (!strcmp(str, "*")) { - if (!str_list_match_list(&pv1->tags, &pv2->tags, &tag_matched)) + if (mem) { + dm_list_iterate_items(sl, &pv1->tags) { + if (!first_tag && !dm_pool_grow_object(mem, ",", 0)) { + log_error("PV tags string extension failed."); + return 0; + } + first_tag = 0; + if (!dm_pool_grow_object(mem, sl->str, 0)) { + log_error("PV tags string extension failed."); + return 0; + } + } continue; - else { - log_debug("Matched allocation PV tag %s on existing %s with free space on %s.", - tag_matched, pv_dev_name(pv1), pv_dev_name(pv2)); - return 1; } + + if (tags_to_match && !str_list_match_list(&pv1->tags, tags_to_match, &tag_matched)) + continue; + + if (!pv_tags) { + if (parallel_pv) + log_debug_alloc("Not using free space on %s: Matched allocation PV tag %s on existing parallel PV %s.", + pv_dev_name(pv1), tag_matched, pv2 ? pv_dev_name(pv2) : "-"); + else + log_debug_alloc("Matched allocation PV tag %s on existing %s with free space on %s.", + tag_matched, pv_dev_name(pv1), pv2 ? 
pv_dev_name(pv2) : "-"); + } else + log_debug_alloc("Eliminating allocation area %" PRIu32 " at PV %s start PE %" PRIu32 + " from consideration: PV tag %s already used.", + area_num, pv_dev_name(pv1), pv1_start_pe, tag_matched); + return 1; } if (!str_list_match_item(&pv1->tags, str) || - !str_list_match_item(&pv2->tags, str)) + (tags_to_match && !str_list_match_item(tags_to_match, str))) + continue; + + if (mem) { + if (!first_tag && !dm_pool_grow_object(mem, ",", 0)) { + log_error("PV tags string extension failed."); + return 0; + } + first_tag = 0; + if (!dm_pool_grow_object(mem, str, 0)) { + log_error("PV tags string extension failed."); + return 0; + } continue; - else { - log_debug("Matched allocation PV tag %s on existing %s with free space on %s.", - str, pv_dev_name(pv1), pv_dev_name(pv2)); - return 1; } + + if (!pv_tags) { + if (parallel_pv) + log_debug_alloc("Not using free space on %s: Matched allocation PV tag %s on existing parallel PV %s.", + pv2 ? pv_dev_name(pv2) : "-", str, pv_dev_name(pv1)); + else + log_debug_alloc("Matched allocation PV tag %s on existing %s with free space on %s.", + str, pv_dev_name(pv1), pv2 ? pv_dev_name(pv2) : "-"); + } else + log_debug_alloc("Eliminating allocation area %" PRIu32 " at PV %s start PE %" PRIu32 + " from consideration: PV tag %s already used.", + area_num, pv_dev_name(pv1), pv1_start_pe, str); + + return 1; } + if (mem) + return 1; + return 0; } +static int _validate_tag_list(const struct dm_config_node *cling_tag_list_cn) +{ + return _match_pv_tags(cling_tag_list_cn, NULL, 0, 0, NULL, NULL, 1, NULL, 0); +} + +static int _tags_list_str(struct dm_pool *mem, struct physical_volume *pv1, const struct dm_config_node *cling_tag_list_cn) +{ + if (!_match_pv_tags(cling_tag_list_cn, pv1, 0, 0, NULL, NULL, 0, mem, 0)) { + dm_pool_abandon_object(mem); + return_0; + } + + return 1; +} + +/* + * Does PV area have a tag listed in allocation/cling_tag_list that + * matches a tag in the pv_tags list? + */ +static int _pv_has_matching_tag(const struct dm_config_node *cling_tag_list_cn, + struct physical_volume *pv1, uint32_t pv1_start_pe, uint32_t area_num, + struct dm_list *pv_tags) +{ + return _match_pv_tags(cling_tag_list_cn, pv1, pv1_start_pe, area_num, NULL, pv_tags, 0, NULL, 0); +} + +/* + * Does PV area have a tag listed in allocation/cling_tag_list that + * matches a tag of the PV of the existing segment? 
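The tag-matching helpers above are driven by the allocation/cling_tag_list configuration node that the allocation handle reads: each entry must start with '@', and the wildcard entry "@*" matches any tag shared by both PVs. A sketch of an lvm.conf excerpt, with hypothetical site tags:

allocation {
        # Hypothetical tags; a free area only "clings" to a PV that shares
        # one of these tags with a PV already used by the LV.
        cling_tag_list = [ "@site_a", "@site_b" ]

        # Or match on any tag common to both PVs:
        # cling_tag_list = [ "@*" ]
}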
+ */ +static int _pvs_have_matching_tag(const struct dm_config_node *cling_tag_list_cn, + struct physical_volume *pv1, struct physical_volume *pv2, + unsigned parallel_pv) +{ + return _match_pv_tags(cling_tag_list_cn, pv1, 0, 0, pv2, NULL, 0, NULL, parallel_pv); +} + static int _has_matching_pv_tag(struct pv_match *pvmatch, struct pv_segment *pvseg, struct pv_area *pva) { - return _pvs_have_matching_tag(pvmatch->cling_tag_list_cn, pvseg->pv, pva->map->pv); + return _pvs_have_matching_tag(pvmatch->cling_tag_list_cn, pvseg->pv, pva->map->pv, 0); +} + +static int _log_parallel_areas(struct dm_pool *mem, struct dm_list *parallel_areas, + const struct dm_config_node *cling_tag_list_cn) +{ + struct seg_pvs *spvs; + struct pv_list *pvl; + char *pvnames; + unsigned first; + + if (!parallel_areas) + return 1; + + dm_list_iterate_items(spvs, parallel_areas) { + first = 1; + + if (!dm_pool_begin_object(mem, 256)) { + log_error("dm_pool_begin_object failed"); + return 0; + } + + dm_list_iterate_items(pvl, &spvs->pvs) { + if (!first && !dm_pool_grow_object(mem, " ", 1)) { + log_error("dm_pool_grow_object failed"); + dm_pool_abandon_object(mem); + return 0; + } + + if (!dm_pool_grow_object(mem, pv_dev_name(pvl->pv), strlen(pv_dev_name(pvl->pv)))) { + log_error("dm_pool_grow_object failed"); + dm_pool_abandon_object(mem); + return 0; + } + + if (cling_tag_list_cn) { + if (!dm_pool_grow_object(mem, "(", 1)) { + log_error("dm_pool_grow_object failed"); + dm_pool_abandon_object(mem); + return 0; + } + if (!_tags_list_str(mem, pvl->pv, cling_tag_list_cn)) { + dm_pool_abandon_object(mem); + return_0; + } + if (!dm_pool_grow_object(mem, ")", 1)) { + log_error("dm_pool_grow_object failed"); + dm_pool_abandon_object(mem); + return 0; + } + } + + first = 0; + } + + if (!dm_pool_grow_object(mem, "\0", 1)) { + log_error("dm_pool_grow_object failed"); + dm_pool_abandon_object(mem); + return 0; + } + + pvnames = dm_pool_end_object(mem); + log_debug_alloc("Parallel PVs at LE %" PRIu32 " length %" PRIu32 ": %s", + spvs->le, spvs->len, pvnames); + dm_pool_free(mem, pvnames); + } + + return 1; } /* @@ -1386,17 +2588,65 @@ static int _is_contiguous(struct pv_match *pvmatch __attribute((unused)), struct return 1; } -static void _reserve_area(struct pv_area_used *area_used, struct pv_area *pva, uint32_t required, - uint32_t ix_pva, uint32_t unreserved) +static int _reserve_area(struct alloc_handle *ah, struct alloc_state *alloc_state, struct pv_area *pva, + uint32_t required, uint32_t ix_pva, uint32_t unreserved) { - log_debug("%s allocation area %" PRIu32 " %s %s start PE %" PRIu32 - " length %" PRIu32 " leaving %" PRIu32 ".", - area_used->pva ? "Changing " : "Considering", - ix_pva - 1, area_used->pva ? "to" : "as", - dev_name(pva->map->pv->dev), pva->start, required, unreserved); + struct pv_area_used *area_used = &alloc_state->areas[ix_pva]; + const char *pv_tag_list = NULL; + + if (ah->cling_tag_list_cn) { + if (!dm_pool_begin_object(ah->mem, 256)) { + log_error("PV tags string allocation failed."); + return 0; + } else if (!_tags_list_str(ah->mem, pva->map->pv, ah->cling_tag_list_cn)) + dm_pool_abandon_object(ah->mem); + else if (!dm_pool_grow_object(ah->mem, "\0", 1)) { + dm_pool_abandon_object(ah->mem); + log_error("PV tags string extension failed."); + return 0; + } else + pv_tag_list = dm_pool_end_object(ah->mem); + } + + log_debug_alloc("%s allocation area %" PRIu32 " %s %s start PE %" PRIu32 + " length %" PRIu32 " leaving %" PRIu32 "%s%s.", + area_used->pva ? 
"Changing " : "Considering", + ix_pva, area_used->pva ? "to" : "as", + dev_name(pva->map->pv->dev), pva->start, required, unreserved, + pv_tag_list ? " with PV tags: " : "", + pv_tag_list ? : ""); + + if (pv_tag_list) + dm_pool_free(ah->mem, (void *)pv_tag_list); area_used->pva = pva; area_used->used = required; + + return 1; +} + +static int _reserve_required_area(struct alloc_handle *ah, struct alloc_state *alloc_state, struct pv_area *pva, + uint32_t required, uint32_t ix_pva, uint32_t unreserved) +{ + uint32_t s; + struct pv_area_used *new_state; + + /* Expand areas array if needed after an area was split. */ + if (ix_pva >= alloc_state->areas_size) { + alloc_state->areas_size *= 2; + if (!(new_state = realloc(alloc_state->areas, sizeof(*alloc_state->areas) * (alloc_state->areas_size)))) { + log_error("Memory reallocation for parallel areas failed."); + return 0; + } + alloc_state->areas = new_state; + for (s = alloc_state->areas_size / 2; s < alloc_state->areas_size; s++) + alloc_state->areas[s].pva = NULL; + } + + if (!_reserve_area(ah, alloc_state, pva, required, ix_pva, unreserved)) + return_0; + + return 1; } static int _is_condition(struct cmd_context *cmd __attribute__((unused)), @@ -1404,21 +2654,28 @@ static int _is_condition(struct cmd_context *cmd __attribute__((unused)), void *data) { struct pv_match *pvmatch = data; + int positional = pvmatch->alloc_state->alloc_parms->flags & A_POSITIONAL_FILL; - if (pvmatch->areas[s].pva) + if (positional && pvmatch->alloc_state->areas[s].pva) return 1; /* Area already assigned */ if (!pvmatch->condition(pvmatch, pvseg, pvmatch->pva)) return 1; /* Continue */ - if (s >= pvmatch->areas_size) + if (positional && (s >= pvmatch->alloc_state->num_positional_areas)) + return 1; + + /* FIXME The previous test should make this one redundant. */ + if (positional && (s >= pvmatch->alloc_state->areas_size)) return 1; /* * Only used for cling and contiguous policies (which only make one allocation per PV) * so it's safe to say all the available space is used. */ - _reserve_area(&pvmatch->areas[s], pvmatch->pva, pvmatch->pva->count, s + 1, 0); + if (positional && + !_reserve_required_area(pvmatch->ah, pvmatch->alloc_state, pvmatch->pva, pvmatch->pva->count, s, 0)) + return_0; return 2; /* Finished */ } @@ -1435,9 +2692,9 @@ static int _check_cling(struct alloc_handle *ah, int r; uint32_t le, len; + pvmatch.ah = ah; pvmatch.condition = cling_tag_list_cn ? _has_matching_pv_tag : _is_same_pv; - pvmatch.areas = alloc_state->areas; - pvmatch.areas_size = alloc_state->areas_size; + pvmatch.alloc_state = alloc_state; pvmatch.pva = pva; pvmatch.cling_tag_list_cn = cling_tag_list_cn; @@ -1466,21 +2723,21 @@ static int _check_cling(struct alloc_handle *ah, /* * Is pva contiguous to any existing areas or on the same PV? 
*/ -static int _check_contiguous(struct cmd_context *cmd, +static int _check_contiguous(struct alloc_handle *ah, struct lv_segment *prev_lvseg, struct pv_area *pva, struct alloc_state *alloc_state) { struct pv_match pvmatch; int r; + pvmatch.ah = ah; pvmatch.condition = _is_contiguous; - pvmatch.areas = alloc_state->areas; - pvmatch.areas_size = alloc_state->areas_size; + pvmatch.alloc_state = alloc_state; pvmatch.pva = pva; pvmatch.cling_tag_list_cn = NULL; /* FIXME Cope with stacks by flattening */ - if (!(r = _for_each_pv(cmd, prev_lvseg->lv, + if (!(r = _for_each_pv(ah->cmd, prev_lvseg->lv, prev_lvseg->le + prev_lvseg->len - 1, 1, NULL, NULL, 0, 0, -1, 1, _is_condition, &pvmatch))) @@ -1500,6 +2757,7 @@ static int _check_cling_to_alloced(struct alloc_handle *ah, const struct dm_conf { unsigned s; struct alloced_area *aa; + int positional = alloc_state->alloc_parms->flags & A_POSITIONAL_FILL; /* * Ignore log areas. They are always allocated whole as part of the @@ -1509,12 +2767,14 @@ static int _check_cling_to_alloced(struct alloc_handle *ah, const struct dm_conf return 0; for (s = 0; s < ah->area_count; s++) { - if (alloc_state->areas[s].pva) + if (positional && alloc_state->areas[s].pva) continue; /* Area already assigned */ dm_list_iterate_items(aa, &ah->alloced_areas[s]) { if ((!cling_tag_list_cn && (pva->map->pv == aa[0].pv)) || - (cling_tag_list_cn && _pvs_have_matching_tag(cling_tag_list_cn, pva->map->pv, aa[0].pv))) { - _reserve_area(&alloc_state->areas[s], pva, pva->count, s + 1, 0); + (cling_tag_list_cn && _pvs_have_matching_tag(cling_tag_list_cn, pva->map->pv, aa[0].pv, 0))) { + if (positional && + !_reserve_required_area(ah, alloc_state, pva, pva->count, s, 0)) + return_0; return 1; } } @@ -1523,13 +2783,20 @@ static int _check_cling_to_alloced(struct alloc_handle *ah, const struct dm_conf return 0; } -static int _pv_is_parallel(struct physical_volume *pv, struct dm_list *parallel_pvs) +static int _pv_is_parallel(struct physical_volume *pv, struct dm_list *parallel_pvs, const struct dm_config_node *cling_tag_list_cn) { struct pv_list *pvl; - dm_list_iterate_items(pvl, parallel_pvs) - if (pv == pvl->pv) + dm_list_iterate_items(pvl, parallel_pvs) { + if (pv == pvl->pv) { + log_debug_alloc("Not using free space on existing parallel PV %s.", + pv_dev_name(pvl->pv)); + return 1; + } + if (cling_tag_list_cn && _pvs_have_matching_tag(cling_tag_list_cn, pvl->pv, pv, 1)) return 1; + } + return 0; } @@ -1539,9 +2806,10 @@ static int _pv_is_parallel(struct physical_volume *pv, struct dm_list *parallel_ * alloc_state->areas may get modified. */ static area_use_t _check_pva(struct alloc_handle *ah, struct pv_area *pva, uint32_t still_needed, - const struct alloc_parms *alloc_parms, struct alloc_state *alloc_state, + struct alloc_state *alloc_state, unsigned already_found_one, unsigned iteration_count, unsigned log_iteration_count) { + const struct alloc_parms *alloc_parms = alloc_state->alloc_parms; unsigned s; /* Skip fully-reserved areas (which are not currently removed from the list). */ @@ -1560,34 +2828,36 @@ static area_use_t _check_pva(struct alloc_handle *ah, struct pv_area *pva, uint3 /* If maximise_cling is set, perform several checks, otherwise perform exactly one. */ if (!iteration_count && !log_iteration_count && alloc_parms->flags & (A_CONTIGUOUS_TO_LVSEG | A_CLING_TO_LVSEG | A_CLING_TO_ALLOCED)) { /* Contiguous? 
*/ - if (((alloc_parms->flags & A_CONTIGUOUS_TO_LVSEG) || (ah->maximise_cling && alloc_parms->prev_lvseg)) && - _check_contiguous(ah->cmd, alloc_parms->prev_lvseg, pva, alloc_state)) - return PREFERRED; - + if (((alloc_parms->flags & A_CONTIGUOUS_TO_LVSEG) || + (ah->maximise_cling && (alloc_parms->flags & A_AREA_COUNT_MATCHES))) && + _check_contiguous(ah, alloc_parms->prev_lvseg, pva, alloc_state)) + goto found; + /* Try next area on same PV if looking for contiguous space */ if (alloc_parms->flags & A_CONTIGUOUS_TO_LVSEG) return NEXT_AREA; /* Cling to prev_lvseg? */ - if (((alloc_parms->flags & A_CLING_TO_LVSEG) || (ah->maximise_cling && alloc_parms->prev_lvseg)) && + if (((alloc_parms->flags & A_CLING_TO_LVSEG) || + (ah->maximise_cling && (alloc_parms->flags & A_AREA_COUNT_MATCHES))) && _check_cling(ah, NULL, alloc_parms->prev_lvseg, pva, alloc_state)) /* If this PV is suitable, use this first area */ - return PREFERRED; + goto found; /* Cling_to_alloced? */ if ((alloc_parms->flags & A_CLING_TO_ALLOCED) && _check_cling_to_alloced(ah, NULL, pva, alloc_state)) - return PREFERRED; + goto found; /* Cling_by_tags? */ if (!(alloc_parms->flags & A_CLING_BY_TAGS) || !ah->cling_tag_list_cn) return NEXT_PV; - if (alloc_parms->prev_lvseg) { + if ((alloc_parms->flags & A_AREA_COUNT_MATCHES)) { if (_check_cling(ah, ah->cling_tag_list_cn, alloc_parms->prev_lvseg, pva, alloc_state)) - return PREFERRED; + goto found; } else if (_check_cling_to_alloced(ah, ah->cling_tag_list_cn, pva, alloc_state)) - return PREFERRED; + goto found; /* All areas on this PV give same result so pointless checking more */ return NEXT_PV; @@ -1601,6 +2871,10 @@ static area_use_t _check_pva(struct alloc_handle *ah, struct pv_area *pva, uint3 (already_found_one && alloc_parms->alloc != ALLOC_ANYWHERE))) return NEXT_PV; +found: + if (alloc_parms->flags & A_POSITIONAL_FILL) + return PREFERRED; + return USE_AREA; } @@ -1613,12 +2887,12 @@ static uint32_t _calc_required_extents(struct alloc_handle *ah, struct pv_area * uint32_t required = max_to_allocate / ah->area_multiple; /* - * Update amount unreserved - effectively splitting an area + * Update amount unreserved - effectively splitting an area * into two or more parts. If the whole stripe doesn't fit, * reduce amount we're looking for. */ if (alloc == ALLOC_ANYWHERE) { - if (ix_pva - 1 >= ah->area_count) + if (ix_pva >= ah->area_count + ah->parity_count) required = ah->log_len; } else if (required < ah->log_len) required = ah->log_len; @@ -1634,33 +2908,12 @@ static uint32_t _calc_required_extents(struct alloc_handle *ah, struct pv_area * return required; } -static int _reserve_required_area(struct alloc_handle *ah, uint32_t max_to_allocate, - unsigned ix_pva, struct pv_area *pva, - struct alloc_state *alloc_state, alloc_policy_t alloc) -{ - uint32_t required = _calc_required_extents(ah, pva, ix_pva, max_to_allocate, alloc); - uint32_t s; - - /* Expand areas array if needed after an area was split. 
*/ - if (ix_pva > alloc_state->areas_size) { - alloc_state->areas_size *= 2; - if (!(alloc_state->areas = dm_realloc(alloc_state->areas, sizeof(*alloc_state->areas) * (alloc_state->areas_size)))) { - log_error("Memory reallocation for parallel areas failed."); - return 0; - } - for (s = alloc_state->areas_size / 2; s < alloc_state->areas_size; s++) - alloc_state->areas[s].pva = NULL; - } - - _reserve_area(&alloc_state->areas[ix_pva - 1], pva, required, ix_pva, pva->unreserved); - - return 1; -} - static void _clear_areas(struct alloc_state *alloc_state) { uint32_t s; + alloc_state->num_positional_areas = 0; + for (s = 0; s < alloc_state->areas_size; s++) alloc_state->areas[s].pva = NULL; } @@ -1679,48 +2932,89 @@ static void _reset_unreserved(struct dm_list *pvms) } static void _report_needed_allocation_space(struct alloc_handle *ah, - struct alloc_state *alloc_state) + struct alloc_state *alloc_state, + struct dm_list *pvms) { const char *metadata_type; uint32_t parallel_areas_count, parallel_area_size; uint32_t metadata_count, metadata_size; - parallel_area_size = (ah->new_extents - alloc_state->allocated) / ah->area_multiple - - ((ah->alloc_and_split_meta) ? ah->log_len : 0); + parallel_area_size = ah->new_extents - alloc_state->allocated; + parallel_area_size /= ah->area_multiple; + parallel_area_size -= (ah->alloc_and_split_meta && !ah->split_metadata_is_allocated) ? ah->log_len : 0; parallel_areas_count = ah->area_count + ah->parity_count; metadata_size = ah->log_len; if (ah->alloc_and_split_meta) { - metadata_type = "RAID metadata area"; + metadata_type = "metadata area"; metadata_count = parallel_areas_count; + if (ah->split_metadata_is_allocated) + metadata_size = 0; } else { metadata_type = "mirror log"; metadata_count = alloc_state->log_area_count_still_needed; } - log_debug("Still need %" PRIu32 " total extents:", - parallel_area_size * parallel_areas_count + metadata_size * metadata_count); - log_debug(" %" PRIu32 " (%" PRIu32 " data/%" PRIu32 - " parity) parallel areas of %" PRIu32 " extents each", - parallel_areas_count, ah->area_count, ah->parity_count, parallel_area_size); - log_debug(" %" PRIu32 " %ss of %" PRIu32 " extents each", - metadata_count, metadata_type, metadata_size); + log_debug_alloc("Still need %s%" PRIu32 " total extents from %" PRIu32 " remaining (%" PRIu32 " positional slots):", + ah->approx_alloc ? "up to " : "", + parallel_area_size * parallel_areas_count + metadata_size * metadata_count, pv_maps_size(pvms), + alloc_state->num_positional_areas); + log_debug_alloc(" %" PRIu32 " (%" PRIu32 " data/%" PRIu32 + " parity) parallel areas of %" PRIu32 " extents each", + parallel_areas_count, ah->area_count, ah->parity_count, parallel_area_size); + log_debug_alloc(" %" PRIu32 " %s%s of %" PRIu32 " extents each", + metadata_count, metadata_type, + (metadata_count == 1) ? "" : "s", + metadata_size); } + +/* Work through the array, removing any entries with tags already used by previous areas. 
*/ +static int _limit_to_one_area_per_tag(struct alloc_handle *ah, struct alloc_state *alloc_state, + uint32_t ix_log_offset, unsigned *ix) +{ + uint32_t s = 0, u = 0; + DM_LIST_INIT(pv_tags); + + while (s < alloc_state->areas_size && alloc_state->areas[s].pva) { + /* Start again with an empty tag list when we reach the log devices */ + if (u == ix_log_offset) + dm_list_init(&pv_tags); + if (!_pv_has_matching_tag(ah->cling_tag_list_cn, alloc_state->areas[s].pva->map->pv, alloc_state->areas[s].pva->start, s, &pv_tags)) { + /* The comparison fn will ignore any non-cling tags so just add everything */ + if (!str_list_add_list(ah->mem, &pv_tags, &alloc_state->areas[s].pva->map->pv->tags)) + return_0; + + if (s != u) + alloc_state->areas[u] = alloc_state->areas[s]; + + u++; + } else + (*ix)--; /* One area removed */ + + s++; + } + + if (u < alloc_state->areas_size) + alloc_state->areas[u].pva = NULL; + + return 1; +} + /* * Returns 1 regardless of whether any space was found, except on error. */ -static int _find_some_parallel_space(struct alloc_handle *ah, const struct alloc_parms *alloc_parms, +static int _find_some_parallel_space(struct alloc_handle *ah, struct dm_list *pvms, struct alloc_state *alloc_state, struct dm_list *parallel_pvs, uint32_t max_to_allocate) { + const struct alloc_parms *alloc_parms = alloc_state->alloc_parms; unsigned ix = 0; unsigned last_ix; struct pv_map *pvm; struct pv_area *pva; unsigned preferred_count = 0; unsigned already_found_one; - unsigned ix_offset = 0; /* Offset for non-preferred allocations */ unsigned ix_log_offset; /* Offset to start of areas to use for log */ unsigned too_small_for_log_count; /* How many too small for log? */ unsigned iteration_count = 0; /* cling_to_alloced may need 2 iterations */ @@ -1728,30 +3022,38 @@ static int _find_some_parallel_space(struct alloc_handle *ah, const struct alloc struct alloced_area *aa; uint32_t s; uint32_t devices_needed = ah->area_count + ah->parity_count; + uint32_t required; - /* ix_offset holds the number of parallel allocations that must be contiguous/cling */ - /* At most one of A_CONTIGUOUS_TO_LVSEG, A_CLING_TO_LVSEG or A_CLING_TO_ALLOCED may be set */ - if (alloc_parms->flags & (A_CONTIGUOUS_TO_LVSEG | A_CLING_TO_LVSEG)) - ix_offset = _stripes_per_mimage(alloc_parms->prev_lvseg) * alloc_parms->prev_lvseg->area_count; + _clear_areas(alloc_state); + _reset_unreserved(pvms); - if (alloc_parms->flags & A_CLING_TO_ALLOCED) - ix_offset = ah->area_count; + /* num_positional_areas holds the number of parallel allocations that must be contiguous/cling */ + /* These appear first in the array, so it is also the offset to the non-preferred allocations */ + /* At most one of A_CONTIGUOUS_TO_LVSEG, A_CLING_TO_LVSEG or A_CLING_TO_ALLOCED may be set */ + if (!(alloc_parms->flags & A_POSITIONAL_FILL)) + alloc_state->num_positional_areas = 0; + else if (alloc_parms->flags & (A_CONTIGUOUS_TO_LVSEG | A_CLING_TO_LVSEG)) + alloc_state->num_positional_areas = _stripes_per_mimage(alloc_parms->prev_lvseg) * alloc_parms->prev_lvseg->area_count; + else if (alloc_parms->flags & A_CLING_TO_ALLOCED) + alloc_state->num_positional_areas = ah->area_count; if (alloc_parms->alloc == ALLOC_NORMAL || (alloc_parms->flags & A_CLING_TO_ALLOCED)) - log_debug("Cling_to_allocated is %sset", - alloc_parms->flags & A_CLING_TO_ALLOCED ? "" : "not "); + log_debug_alloc("Cling_to_allocated is %sset", + alloc_parms->flags & A_CLING_TO_ALLOCED ? 
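`_limit_to_one_area_per_tag()` above keeps only the first candidate area per cling tag (restarting the tag list when it reaches the log slots) and compacts the array in place with a read index `s` and a write index `u`. A generic, simplified sketch of that compaction pattern, keeping the first occurrence of each value as a stand-in for "first area per tag":

#include <stdio.h>

static unsigned keep_first_occurrence(int *vals, unsigned n)
{
        unsigned s, u = 0;

        for (s = 0; s < n; s++) {
                int seen = 0;
                for (unsigned i = 0; i < u; i++)
                        if (vals[i] == vals[s])
                                seen = 1;
                if (!seen)
                        vals[u++] = vals[s];    /* copy forward, advance write index */
        }
        return u;                               /* new length after compaction */
}

int main(void)
{
        int tags[] = { 1, 2, 1, 3, 2 };         /* stand-ins for PV tags */
        unsigned n = keep_first_occurrence(tags, 5);

        for (unsigned i = 0; i < n; i++)
                printf("%d ", tags[i]);         /* prints: 1 2 3 */
        printf("\n");
        return 0;
}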
"" : "not "); - _clear_areas(alloc_state); - _reset_unreserved(pvms); + if (alloc_parms->flags & A_POSITIONAL_FILL) + log_debug_alloc("%u preferred area(s) to be filled positionally.", alloc_state->num_positional_areas); + else + log_debug_alloc("Areas to be sorted and filled sequentially."); - _report_needed_allocation_space(ah, alloc_state); + _report_needed_allocation_space(ah, alloc_state, pvms); /* ix holds the number of areas found on other PVs */ do { if (log_iteration_count) { - log_debug("Found %u areas for %" PRIu32 " parallel areas and %" PRIu32 " log areas so far.", ix, devices_needed, alloc_state->log_area_count_still_needed); + log_debug_alloc("Found %u areas for %" PRIu32 " parallel areas and %" PRIu32 " log areas so far.", ix, devices_needed, alloc_state->log_area_count_still_needed); } else if (iteration_count) - log_debug("Filled %u out of %u preferred areas so far.", preferred_count, ix_offset); + log_debug_alloc("Filled %u out of %u preferred areas so far.", preferred_count, alloc_state->num_positional_areas); /* * Provide for escape from the loop if no progress is made. @@ -1783,16 +3085,16 @@ static int _find_some_parallel_space(struct alloc_handle *ah, const struct alloc /* FIXME Split into log and non-log parallel_pvs and only check the log ones if log_iteration? */ /* (I've temporatily disabled the check.) */ /* Avoid PVs used by existing parallel areas */ - if (!log_iteration_count && parallel_pvs && _pv_is_parallel(pvm->pv, parallel_pvs)) + if (!log_iteration_count && parallel_pvs && _pv_is_parallel(pvm->pv, parallel_pvs, ah->cling_tag_list_cn)) goto next_pv; /* - * Avoid PVs already set aside for log. + * Avoid PVs already set aside for log. * We only reach here if there were enough PVs for the main areas but * not enough for the logs. */ if (log_iteration_count) { - for (s = devices_needed; s < ix + ix_offset; s++) + for (s = devices_needed; s < ix + alloc_state->num_positional_areas; s++) if (alloc_state->areas[s].pva && alloc_state->areas[s].pva->map->pv == pvm->pv) goto next_pv; /* On a second pass, avoid PVs already used in an uncommitted area */ @@ -1806,11 +3108,16 @@ static int _find_some_parallel_space(struct alloc_handle *ah, const struct alloc /* First area in each list is the largest */ dm_list_iterate_items(pva, &pvm->areas) { /* - * There are two types of allocations, which can't be mixed at present. + * There are two types of allocations, which can't be mixed at present: + * * PREFERRED are stored immediately in a specific parallel slot. + * This is only used if the A_POSITIONAL_FILL flag is set. + * This requires the number of slots to match, so if comparing with + * prev_lvseg then A_AREA_COUNT_MATCHES must be set. + * * USE_AREA are stored for later, then sorted and chosen from. 
*/ - switch(_check_pva(ah, pva, max_to_allocate, alloc_parms, + switch(_check_pva(ah, pva, max_to_allocate, alloc_state, already_found_one, iteration_count, log_iteration_count)) { case PREFERRED: @@ -1835,8 +3142,8 @@ static int _find_some_parallel_space(struct alloc_handle *ah, const struct alloc } /* Reserve required amount of pva */ - if (!_reserve_required_area(ah, max_to_allocate, ix + ix_offset, - pva, alloc_state, alloc_parms->alloc)) + required = _calc_required_extents(ah, pva, ix + alloc_state->num_positional_areas - 1, max_to_allocate, alloc_parms->alloc); + if (!_reserve_required_area(ah, alloc_state, pva, required, ix + alloc_state->num_positional_areas - 1, pva->unreserved)) return_0; } @@ -1847,22 +3154,23 @@ static int _find_some_parallel_space(struct alloc_handle *ah, const struct alloc /* With cling and contiguous we stop if we found a match for *all* the areas */ /* FIXME Rename these variables! */ if ((alloc_parms->alloc == ALLOC_ANYWHERE && - ix + ix_offset >= devices_needed + alloc_state->log_area_count_still_needed) || - (preferred_count == ix_offset && - (ix_offset == devices_needed + alloc_state->log_area_count_still_needed))) + ix + alloc_state->num_positional_areas >= devices_needed + alloc_state->log_area_count_still_needed) || + (preferred_count == alloc_state->num_positional_areas && + (alloc_state->num_positional_areas == devices_needed + alloc_state->log_area_count_still_needed))) break; } } while ((alloc_parms->alloc == ALLOC_ANYWHERE && last_ix != ix && ix < devices_needed + alloc_state->log_area_count_still_needed) || /* With cling_to_alloced and normal, if there were gaps in the preferred areas, have a second iteration */ (alloc_parms->alloc == ALLOC_NORMAL && preferred_count && - (preferred_count < ix_offset || alloc_state->log_area_count_still_needed) && + (preferred_count < alloc_state->num_positional_areas || alloc_state->log_area_count_still_needed) && (alloc_parms->flags & A_CLING_TO_ALLOCED) && !iteration_count++) || /* Extra iteration needed to fill log areas on PVs already used? 
*/ - (alloc_parms->alloc == ALLOC_NORMAL && preferred_count == ix_offset && !ah->mirror_logs_separate && + (alloc_parms->alloc == ALLOC_NORMAL && preferred_count == alloc_state->num_positional_areas && !ah->mirror_logs_separate && (ix + preferred_count >= devices_needed) && (ix + preferred_count < devices_needed + alloc_state->log_area_count_still_needed) && !log_iteration_count++)); - if (preferred_count < ix_offset && !(alloc_parms->flags & A_CLING_TO_ALLOCED)) + /* Non-zero ix means at least one USE_AREA was returned */ + if (preferred_count < alloc_state->num_positional_areas && !(alloc_parms->flags & A_CLING_TO_ALLOCED) && !ix) return 1; if (ix + preferred_count < devices_needed + alloc_state->log_area_count_still_needed) @@ -1871,26 +3179,26 @@ static int _find_some_parallel_space(struct alloc_handle *ah, const struct alloc /* Sort the areas so we allocate from the biggest */ if (log_iteration_count) { if (ix > devices_needed + 1) { - log_debug("Sorting %u log areas", ix - devices_needed); + log_debug_alloc("Sorting %u log areas", ix - devices_needed); qsort(alloc_state->areas + devices_needed, ix - devices_needed, sizeof(*alloc_state->areas), _comp_area); } } else if (ix > 1) { - log_debug("Sorting %u areas", ix); - qsort(alloc_state->areas + ix_offset, ix, sizeof(*alloc_state->areas), + log_debug_alloc("Sorting %u areas", ix); + qsort(alloc_state->areas + alloc_state->num_positional_areas, ix, sizeof(*alloc_state->areas), _comp_area); } - /* If there are gaps in our preferred areas, fill then from the sorted part of the array */ - if (preferred_count && preferred_count != ix_offset) { + /* If there are gaps in our preferred areas, fill them from the sorted part of the array */ + if (preferred_count && preferred_count != alloc_state->num_positional_areas) { for (s = 0; s < devices_needed; s++) if (!alloc_state->areas[s].pva) { - alloc_state->areas[s].pva = alloc_state->areas[ix_offset].pva; - alloc_state->areas[s].used = alloc_state->areas[ix_offset].used; - alloc_state->areas[ix_offset++].pva = NULL; + alloc_state->areas[s].pva = alloc_state->areas[alloc_state->num_positional_areas].pva; + alloc_state->areas[s].used = alloc_state->areas[alloc_state->num_positional_areas].used; + alloc_state->areas[alloc_state->num_positional_areas++].pva = NULL; } } - + /* * First time around, if there's a log, allocate it on the * smallest device that has space for it. @@ -1901,19 +3209,60 @@ static int _find_some_parallel_space(struct alloc_handle *ah, const struct alloc /* FIXME This logic is due to its heritage and can be simplified! */ if (alloc_state->log_area_count_still_needed) { /* How many areas are too small for the log? */ - while (too_small_for_log_count < ix_offset + ix && - (*(alloc_state->areas + ix_offset + ix - 1 - + while (too_small_for_log_count < alloc_state->num_positional_areas + ix && + (*(alloc_state->areas + alloc_state->num_positional_areas + ix - 1 - too_small_for_log_count)).used < ah->log_len) too_small_for_log_count++; - ix_log_offset = ix_offset + ix - too_small_for_log_count - ah->log_area_count; + if (ah->mirror_logs_separate && + too_small_for_log_count && + (too_small_for_log_count >= devices_needed)) + return 1; + if ((alloc_state->num_positional_areas + ix) < (too_small_for_log_count + ah->log_area_count)) + return 1; + ix_log_offset = alloc_state->num_positional_areas + ix - (too_small_for_log_count + ah->log_area_count); } - if (ix + ix_offset < devices_needed + - (alloc_state->log_area_count_still_needed ? 
alloc_state->log_area_count_still_needed + - too_small_for_log_count : 0)) + if (ix + alloc_state->num_positional_areas < devices_needed) return 1; /* + * FIXME We should change the code to do separate calls for the log allocation + * and the data allocation so that _limit_to_one_area_per_tag doesn't have to guess + * where the split is going to occur. + */ + + /* + * This code covers the initial allocation - after that there is something to 'cling' to + * and we shouldn't get this far. + * alloc_state->num_positional_areas is assumed to be 0 with A_PARTITION_BY_TAGS. + * + * FIXME Consider a second attempt with A_PARTITION_BY_TAGS if, for example, the largest area + * had all the tags set, but other areas don't. + */ + if ((alloc_parms->flags & A_PARTITION_BY_TAGS) && !alloc_state->num_positional_areas) { + if (!_limit_to_one_area_per_tag(ah, alloc_state, ix_log_offset, &ix)) + return_0; + + /* Recalculate log position because we might have removed some areas from consideration */ + if (alloc_state->log_area_count_still_needed) { + /* How many areas are too small for the log? */ + too_small_for_log_count = 0; + while (too_small_for_log_count < ix && + (*(alloc_state->areas + ix - 1 - too_small_for_log_count)).pva && + (*(alloc_state->areas + ix - 1 - too_small_for_log_count)).used < ah->log_len) + too_small_for_log_count++; + if (ix < too_small_for_log_count + ah->log_area_count) + return 1; + ix_log_offset = ix - too_small_for_log_count - ah->log_area_count; + } + + if (ix < devices_needed + + (alloc_state->log_area_count_still_needed ? alloc_state->log_area_count_still_needed + + too_small_for_log_count : 0)) + return 1; + } + + /* * Finally add the space identified to the list of areas to be used. */ if (!_alloc_parallel_area(ah, max_to_allocate, alloc_state, ix_log_offset)) @@ -1928,7 +3277,7 @@ static int _find_some_parallel_space(struct alloc_handle *ah, const struct alloc } /* - * Choose sets of parallel areas to use, respecting any constraints + * Choose sets of parallel areas to use, respecting any constraints * supplied in alloc_parms. */ static int _find_max_parallel_space_for_one_policy(struct alloc_handle *ah, struct alloc_parms *alloc_parms, @@ -1941,6 +3290,8 @@ static int _find_max_parallel_space_for_one_policy(struct alloc_handle *ah, stru struct seg_pvs *spvs; struct dm_list *parallel_pvs; + alloc_state->alloc_parms = alloc_parms; + /* FIXME This algorithm needs a lot of cleaning up! */ /* FIXME anywhere doesn't find all space yet */ do { @@ -1965,11 +3316,11 @@ static int _find_max_parallel_space_for_one_policy(struct alloc_handle *ah, stru * data together will be split, we must adjust * the comparison accordingly. */ - if (ah->alloc_and_split_meta) + if (ah->alloc_and_split_meta && !ah->split_metadata_is_allocated) max_tmp -= ah->log_len; if (max_tmp > (spvs->le + spvs->len) * ah->area_multiple) { max_to_allocate = (spvs->le + spvs->len) * ah->area_multiple - alloc_state->allocated; - max_to_allocate += ah->alloc_and_split_meta ? ah->log_len : 0; + max_to_allocate += (ah->alloc_and_split_meta && !ah->split_metadata_is_allocated) ? 
ah->log_len : 0; } parallel_pvs = &spvs->pvs; break; @@ -1978,10 +3329,13 @@ static int _find_max_parallel_space_for_one_policy(struct alloc_handle *ah, stru old_allocated = alloc_state->allocated; - if (!_find_some_parallel_space(ah, alloc_parms, pvms, alloc_state, parallel_pvs, max_to_allocate)) + if (!_find_some_parallel_space(ah, pvms, alloc_state, parallel_pvs, max_to_allocate)) return_0; /* + * For ALLOC_CLING, if the number of areas matches and maximise_cling is + * set we allow two passes, first with A_POSITIONAL_FILL then without. + * * If we didn't allocate anything this time with ALLOC_NORMAL and had * A_CLING_TO_ALLOCED set, try again without it. * @@ -1990,14 +3344,17 @@ static int _find_max_parallel_space_for_one_policy(struct alloc_handle *ah, stru * remain on the same disks where possible. */ if (old_allocated == alloc_state->allocated) { - if ((alloc_parms->alloc == ALLOC_NORMAL) && (alloc_parms->flags & A_CLING_TO_ALLOCED)) + if (ah->maximise_cling && ((alloc_parms->alloc == ALLOC_CLING) || (alloc_parms->alloc == ALLOC_CLING_BY_TAGS)) && + (alloc_parms->flags & A_CLING_TO_LVSEG) && (alloc_parms->flags & A_POSITIONAL_FILL)) + alloc_parms->flags &= ~A_POSITIONAL_FILL; + else if ((alloc_parms->alloc == ALLOC_NORMAL) && (alloc_parms->flags & A_CLING_TO_ALLOCED)) alloc_parms->flags &= ~A_CLING_TO_ALLOCED; else break; /* Give up */ } else if (ah->maximise_cling && alloc_parms->alloc == ALLOC_NORMAL && !(alloc_parms->flags & A_CLING_TO_ALLOCED)) alloc_parms->flags |= A_CLING_TO_ALLOCED; - } while ((alloc_parms->alloc != ALLOC_CONTIGUOUS) && alloc_state->allocated != alloc_parms->extents_still_needed && (alloc_parms->flags & A_CAN_SPLIT)); + } while ((alloc_parms->alloc != ALLOC_CONTIGUOUS) && alloc_state->allocated != alloc_parms->extents_still_needed && (alloc_parms->flags & A_CAN_SPLIT) && (!ah->approx_alloc || pv_maps_size(pvms))); return 1; } @@ -2024,13 +3381,13 @@ static int _allocate(struct alloc_handle *ah, alloc_state.allocated = lv ? lv->le_count : 0; if (alloc_state.allocated >= ah->new_extents && !ah->log_area_count) { - log_error("_allocate called with no work to do!"); + log_warn("_allocate called with no work to do!"); return 1; } if (ah->area_multiple > 1 && (ah->new_extents - alloc_state.allocated) % ah->area_multiple) { - log_error("Number of extents requested (%d) needs to be divisible by %d.", + log_error("Number of extents requested (" FMTu32 ") needs to be divisible by " FMTu32 ".", ah->new_extents - alloc_state.allocated, ah->area_multiple); return 0; @@ -2041,16 +3398,15 @@ static int _allocate(struct alloc_handle *ah, if (ah->alloc == ALLOC_CONTIGUOUS) can_split = 0; - if (lv && !dm_list_empty(&lv->segments)) - prev_lvseg = dm_list_item(dm_list_last(&lv->segments), - struct lv_segment); + if (lv) + prev_lvseg = last_seg(lv); /* * Build the sets of available areas on the pv's. 
*/ if (!(pvms = create_pv_maps(ah->mem, vg, allocatable_pvs))) return_0; - if (!_log_parallel_areas(ah->mem, ah->parallel_areas)) + if (!_log_parallel_areas(ah->mem, ah->parallel_areas, ah->cling_tag_list_cn)) stack; alloc_state.areas_size = dm_list_size(pvms); @@ -2071,7 +3427,7 @@ static int _allocate(struct alloc_handle *ah, alloc_state.areas_size += _stripes_per_mimage(prev_lvseg) * prev_lvseg->area_count; /* Allocate an array of pv_areas to hold the largest space on each PV */ - if (!(alloc_state.areas = dm_malloc(sizeof(*alloc_state.areas) * alloc_state.areas_size))) { + if (!(alloc_state.areas = malloc(sizeof(*alloc_state.areas) * alloc_state.areas_size))) { log_error("Couldn't allocate areas array."); return 0; } @@ -2089,9 +3445,11 @@ static int _allocate(struct alloc_handle *ah, if (alloc == ALLOC_CLING_BY_TAGS && !ah->cling_tag_list_cn) continue; old_allocated = alloc_state.allocated; - log_debug("Trying allocation using %s policy.", get_alloc_string(alloc)); + log_debug_alloc("Trying allocation using %s policy.", get_alloc_string(alloc)); - if (!_sufficient_pes_free(ah, pvms, alloc_state.allocated, ah->new_extents)) + if (!ah->approx_alloc && !_sufficient_pes_free(ah, pvms, alloc_state.allocated, + alloc_state.log_area_count_still_needed, + ah->new_extents)) goto_out; _init_alloc_parms(ah, &alloc_parms, alloc, prev_lvseg, @@ -2101,19 +3459,45 @@ static int _allocate(struct alloc_handle *ah, if (!_find_max_parallel_space_for_one_policy(ah, &alloc_parms, pvms, &alloc_state)) goto_out; - if ((alloc_state.allocated == ah->new_extents && !alloc_state.log_area_count_still_needed) || + /* As a workaround, if only the log is missing now, fall through and try later policies up to normal. */ + /* FIXME Change the core algorithm so the log extents cling to parallel LVs instead of avoiding them. */ + if (alloc_state.allocated == ah->new_extents && + alloc_state.log_area_count_still_needed && + ah->alloc < ALLOC_NORMAL) { + ah->alloc = ALLOC_NORMAL; + continue; + } + + if ((alloc_state.allocated == ah->new_extents && + !alloc_state.log_area_count_still_needed) || (!can_split && (alloc_state.allocated != old_allocated))) break; } if (alloc_state.allocated != ah->new_extents) { - log_error("Insufficient suitable %sallocatable extents " - "for logical volume %s: %u more required", - can_split ? "" : "contiguous ", - lv ? lv->name : "", - (ah->new_extents - alloc_state.allocated) * ah->area_count - / ah->area_multiple); - goto out; + if (!ah->approx_alloc) { + log_error("Insufficient suitable %sallocatable extents " + "for logical volume %s: %u more required", + can_split ? "" : "contiguous ", + lv ? lv->name : "", + (ah->new_extents - alloc_state.allocated) * + ah->area_count / ah->area_multiple); + goto out; + } + if (!alloc_state.allocated) { + log_error("Insufficient suitable %sallocatable extents " + "found for logical volume %s.", + can_split ? "" : "contiguous ", + lv ? lv->name : ""); + goto out; + } + log_verbose("Found fewer %sallocatable extents " + "for logical volume %s than requested: using %" PRIu32 " extents (reduced by %u).", + can_split ? "" : "contiguous ", + lv ? 
lv->name : "", + alloc_state.allocated, + (ah->new_extents - alloc_state.allocated) * ah->area_count / ah->area_multiple); + ah->new_extents = alloc_state.allocated; } if (alloc_state.log_area_count_still_needed) { @@ -2126,61 +3510,268 @@ static int _allocate(struct alloc_handle *ah, r = 1; out: - dm_free(alloc_state.areas); + free(alloc_state.areas); return r; } -int lv_add_virtual_segment(struct logical_volume *lv, uint64_t status, - uint32_t extents, const struct segment_type *segtype, - const char *thin_pool_name) +/* + * FIXME: Add proper allocation function for VDO segment on top + * of VDO pool with virtual size. + * + * Note: ATM lvm2 can't resize VDO device so it can add only a single segment. + */ +static int _lv_add_vdo_segment(struct logical_volume *lv, uint64_t status, + uint32_t extents, const struct segment_type *segtype) { struct lv_segment *seg; - struct logical_volume *thin_pool_lv = NULL; - struct lv_list *lvl; - uint32_t size; - if (thin_pool_name) { - if (!(lvl = find_lv_in_vg(lv->vg, thin_pool_name))) { - log_error("Unable to find existing pool LV %s in VG %s.", - thin_pool_name, lv->vg->name); + if (!dm_list_empty(&lv->segments) && + (seg = last_seg(lv)) && (seg->segtype == segtype)) { + seg->area_len += extents; + seg->len += extents; + } else { + if (!(seg = alloc_lv_segment(segtype, lv, lv->le_count, extents, 0, + status, 0, NULL, 1, + extents, 0, 0, 0, 0, NULL))) { + log_error("Couldn't allocate new %s segment.", segtype->name); return 0; } - thin_pool_lv = lvl->lv; - size = first_seg(thin_pool_lv)->chunk_size; - if (lv->vg->extent_size < size) { - /* Align extents on chunk boundary size */ - size = ((uint64_t)lv->vg->extent_size * extents + size - 1) / - size * size / lv->vg->extent_size; - if (size != extents) { - log_print_unless_silent("Rounding size (%d extents) up to chunk boundary " - "size (%d extents).", extents, size); - extents = size; - } - } + lv->status |= LV_VDO; + dm_list_add(&lv->segments, &seg->list); } + lv->le_count += extents; + lv->size += (uint64_t) extents * lv->vg->extent_size; + + if (seg_lv(seg, 0) && + !update_vdo_pool_virtual_size(first_seg(seg_lv(seg, 0)))) + return_0; + + return 1; +} + +int lv_add_virtual_segment(struct logical_volume *lv, uint64_t status, + uint32_t extents, const struct segment_type *segtype) +{ + struct lv_segment *seg; + + if (segtype_is_vdo(segtype)) + return _lv_add_vdo_segment(lv, 0u, extents, segtype); + if (!dm_list_empty(&lv->segments) && (seg = last_seg(lv)) && (seg->segtype == segtype)) { seg->area_len += extents; seg->len += extents; } else { - if (!(seg = alloc_lv_segment(segtype, lv, lv->le_count, extents, - status, 0, NULL, thin_pool_lv, 0, - extents, 0, 0, 0, NULL))) { - log_error("Couldn't allocate new zero segment."); + if (!(seg = alloc_lv_segment(segtype, lv, lv->le_count, extents, 0, + status, 0, NULL, 0, + extents, 0, 0, 0, 0, NULL))) { + log_error("Couldn't allocate new %s segment.", segtype->name); return 0; } lv->status |= VIRTUAL; dm_list_add(&lv->segments, &seg->list); } - lv->le_count += extents; - lv->size += (uint64_t) extents *lv->vg->extent_size; + if (!_setup_lv_size(lv, lv->le_count + extents)) + return_0; return 1; } /* + * Preparation for a specific allocation attempt + * stripes and mirrors refer to the parallel areas used for data. + * If log_area_count > 1 it is always mirrored (not striped). 
+ */ +static struct alloc_handle *_alloc_init(struct cmd_context *cmd, + const struct segment_type *segtype, + alloc_policy_t alloc, int approx_alloc, + uint32_t existing_extents, + uint32_t new_extents, + uint32_t mirrors, + uint32_t stripes, + uint32_t metadata_area_count, + uint32_t extent_size, + uint32_t region_size, + struct dm_list *parallel_areas) +{ + struct dm_pool *mem; + struct alloc_handle *ah; + uint32_t s, area_count, alloc_count, parity_count, total_extents; + size_t size = 0; + + if (segtype_is_virtual(segtype)) { + log_error(INTERNAL_ERROR "_alloc_init called for virtual segment."); + return NULL; + } + + /* FIXME Caller should ensure this */ + if (mirrors && !stripes) + stripes = 1; + + if (mirrors > 1) + area_count = mirrors * stripes; + else + area_count = stripes; + + if (!(area_count + metadata_area_count)) { + log_error(INTERNAL_ERROR "_alloc_init called for non-virtual segment with no disk space."); + return NULL; + } + + size = sizeof(*ah); + + /* + * It is a requirement that RAID 4/5/6 are created with a number of + * stripes that is greater than the number of parity devices. (e.g + * RAID4/5 must have at least 2 stripes and RAID6 must have at least + * 3.) It is also a constraint that, when replacing individual devices + * in a RAID 4/5/6 array, no more devices can be replaced than + * there are parity devices. (Otherwise, there would not be enough + * redundancy to maintain the array.) Understanding these two + * constraints allows us to infer whether the caller of this function + * is intending to allocate an entire array or just replacement + * component devices. In the former case, we must account for the + * necessary parity_count. In the later case, we do not need to + * account for the extra parity devices because the array already + * exists and they only want replacement drives. + */ + parity_count = (area_count <= segtype->parity_devs) ? 0 : segtype->parity_devs; + alloc_count = area_count + parity_count; + if (segtype_is_raid(segtype) && metadata_area_count) + /* RAID has a meta area for each device */ + alloc_count *= 2; + else + /* mirrors specify their exact log count */ + alloc_count += metadata_area_count; + + size += sizeof(ah->alloced_areas[0]) * alloc_count; + + if (!(mem = dm_pool_create("allocation", 1024))) { + log_error("allocation pool creation failed"); + return NULL; + } + + if (!(ah = dm_pool_zalloc(mem, size))) { + log_error("allocation handle allocation failed"); + dm_pool_destroy(mem); + return NULL; + } + + ah->cmd = cmd; + ah->mem = mem; + ah->area_count = area_count; + ah->parity_count = parity_count; + ah->region_size = region_size; + ah->alloc = alloc; + + /* + * For the purposes of allocation, area_count and parity_count are + * kept separately. However, the 'area_count' field in an + * lv_segment includes both; and this is what '_calc_area_multiple' + * is calculated from. So, we must pass in the total count to get + * a correct area_multiple. + */ + ah->area_multiple = _calc_area_multiple(segtype, area_count + parity_count, stripes); + //FIXME: s/mirror_logs_separate/metadata_separate/ so it can be used by others? 
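The arithmetic above determines how many parallel areas one allocation request must fill: the data images, the parity images (only when a whole array is being created rather than individual devices replaced), and, for RAID with metadata, one metadata area per image. As a rough illustration only — this sketch is not part of the patch and all input values below are hypothetical — the same accounting can be reproduced standalone:

/*
 * Illustration only (not from the patch): how _alloc_init() sizes the
 * set of areas to allocate.  The input values are made-up examples.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t stripes = 3, mirrors = 1;       /* e.g. one raid5 creation request */
	uint32_t parity_devs = 1;                /* raid5: one parity device */
	uint32_t metadata_area_count = 4;        /* one rmeta per data/parity image */
	int segtype_is_raid = 1;
	uint32_t area_count, parity_count, alloc_count;

	area_count = (mirrors > 1) ? mirrors * stripes : stripes;

	/* A request for no more areas than there are parity devices is a
	 * replacement-device allocation, so no extra parity images are added. */
	parity_count = (area_count <= parity_devs) ? 0 : parity_devs;

	alloc_count = area_count + parity_count;
	if (segtype_is_raid && metadata_area_count)
		alloc_count *= 2;                /* one metadata area per image */
	else
		alloc_count += metadata_area_count;

	printf("parallel areas to allocate: %" PRIu32 "\n", alloc_count);
	return 0;
}

For this hypothetical 3-stripe raid5 request the result is 8 areas (3 data + 1 parity, doubled for the per-image metadata), matching the doubling applied above when segtype_is_raid() and metadata_area_count are both set.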
+ ah->mirror_logs_separate = find_config_tree_bool(cmd, allocation_mirror_logs_require_separate_pvs_CFG, NULL); + + if (mirrors || stripes) + total_extents = new_extents; + else + total_extents = 0; + + if (segtype_is_raid(segtype)) { + if (metadata_area_count) { + uint32_t cur_rimage_extents, new_rimage_extents; + + if (metadata_area_count != area_count) + log_error(INTERNAL_ERROR + "Bad metadata_area_count"); + + /* Calculate log_len (i.e. length of each rmeta device) for RAID */ + cur_rimage_extents = raid_rimage_extents(segtype, existing_extents, stripes, mirrors); + new_rimage_extents = raid_rimage_extents(segtype, existing_extents + new_extents, stripes, mirrors), + ah->log_len = raid_rmeta_extents_delta(cmd, cur_rimage_extents, new_rimage_extents, + region_size, extent_size); + ah->metadata_area_count = metadata_area_count; + ah->alloc_and_split_meta = !!ah->log_len; + /* + * We need 'log_len' extents for each + * RAID device's metadata_area + */ + total_extents += ah->log_len * (segtype_is_raid1(segtype) ? 1 : ah->area_multiple); + } else { + ah->log_area_count = 0; + ah->log_len = 0; + } + } else if (segtype_is_thin_pool(segtype)) { + /* + * thin_pool uses ah->region_size to + * pass metadata size in extents + */ + ah->log_len = ah->region_size; + ah->log_area_count = metadata_area_count; + ah->region_size = 0; + ah->mirror_logs_separate = + find_config_tree_bool(cmd, allocation_thin_pool_metadata_require_separate_pvs_CFG, NULL); + } else if (segtype_is_cache_pool(segtype)) { + /* + * Like thin_pool, cache_pool uses ah->region_size to + * pass metadata size in extents + */ + ah->log_len = ah->region_size; + /* use metadata_area_count, not log_area_count */ + ah->metadata_area_count = metadata_area_count; + ah->region_size = 0; + ah->mirror_logs_separate = + find_config_tree_bool(cmd, allocation_cache_pool_metadata_require_separate_pvs_CFG, NULL); + if (!ah->mirror_logs_separate) { + ah->alloc_and_split_meta = 1; + total_extents += ah->log_len; + } + } else { + ah->log_area_count = metadata_area_count; + ah->log_len = !metadata_area_count ? 0 : + _mirror_log_extents(ah->region_size, extent_size, + (existing_extents + new_extents) / ah->area_multiple); + } + + if (total_extents || existing_extents) + log_debug("Adjusted allocation request to " FMTu32 " logical extents. Existing size " FMTu32 ". New size " FMTu32 ".", + total_extents, existing_extents, total_extents + existing_extents); + if (ah->log_len) + log_debug("Mirror log of " FMTu32 " extents of size " FMTu32 " sectors needed for region size %s.", + ah->log_len, extent_size, display_size(cmd, (uint64_t)ah->region_size)); + + if (mirrors || stripes) + total_extents += existing_extents; + + ah->new_extents = total_extents; + + for (s = 0; s < alloc_count; s++) + dm_list_init(&ah->alloced_areas[s]); + + ah->parallel_areas = parallel_areas; + + if ((ah->cling_tag_list_cn = find_config_tree_array(cmd, allocation_cling_tag_list_CFG, NULL))) + (void) _validate_tag_list(ah->cling_tag_list_cn); + + ah->maximise_cling = find_config_tree_bool(cmd, allocation_maximise_cling_CFG, NULL); + + ah->approx_alloc = approx_alloc; + + return ah; +} + +void alloc_destroy(struct alloc_handle *ah) +{ + if (ah) + dm_pool_destroy(ah->mem); +} + +/* * Entry point for all extent allocations. 
*/ struct alloc_handle *allocate_extents(struct volume_group *vg, @@ -2190,11 +3781,10 @@ struct alloc_handle *allocate_extents(struct volume_group *vg, uint32_t mirrors, uint32_t log_count, uint32_t region_size, uint32_t extents, struct dm_list *allocatable_pvs, - alloc_policy_t alloc, + alloc_policy_t alloc, int approx_alloc, struct dm_list *parallel_areas) { struct alloc_handle *ah; - uint32_t new_extents; if (segtype_is_virtual(segtype)) { log_error("allocate_extents does not handle virtual segments"); @@ -2219,9 +3809,8 @@ struct alloc_handle *allocate_extents(struct volume_group *vg, if (alloc >= ALLOC_INHERIT) alloc = vg->alloc; - new_extents = (lv ? lv->le_count : 0) + extents; - if (!(ah = _alloc_init(vg->cmd, vg->cmd->mem, segtype, alloc, - new_extents, mirrors, stripes, log_count, + if (!(ah = _alloc_init(vg->cmd, segtype, alloc, approx_alloc, + lv ? lv->le_count : 0, extents, mirrors, stripes, log_count, vg->extent_size, region_size, parallel_areas))) return_NULL; @@ -2255,7 +3844,7 @@ int lv_add_segment(struct alloc_handle *ah, return 0; } - if ((status & MIRROR_LOG) && dm_list_size(&lv->segments)) { + if ((status & MIRROR_LOG) && !dm_list_empty(&lv->segments)) { log_error("Log segments can only be added to an empty LV"); return 0; } @@ -2266,7 +3855,7 @@ int lv_add_segment(struct alloc_handle *ah, region_size)) return_0; - if ((segtype->flags & SEG_CAN_SPLIT) && !lv_merge_segments(lv)) { + if (segtype_can_split(segtype) && !lv_merge_segments(lv)) { log_error("Couldn't merge segments after extending " "logical volume."); return 0; @@ -2302,14 +3891,14 @@ static struct lv_segment *_convert_seg_to_mirror(struct lv_segment *seg, return NULL; } - if (!(newseg = alloc_lv_segment(get_segtype_from_string(seg->lv->vg->cmd, "mirror"), - seg->lv, seg->le, seg->len, + if (!(newseg = alloc_lv_segment(get_segtype_from_string(seg->lv->vg->cmd, SEG_TYPE_NAME_MIRROR), + seg->lv, seg->le, seg->len, 0, seg->status, seg->stripe_size, - log_lv, NULL, - seg->area_count, seg->area_len, + log_lv, + seg->area_count, seg->area_len, 0, seg->chunk_size, region_size, seg->extents_copied, NULL))) { - log_error("Couldn't allocate converted LV segment"); + log_error("Couldn't allocate converted LV segment."); return NULL; } @@ -2328,6 +3917,116 @@ static struct lv_segment *_convert_seg_to_mirror(struct lv_segment *seg, /* * Add new areas to mirrored segments */ +int lv_add_segmented_mirror_image(struct alloc_handle *ah, + struct logical_volume *lv, uint32_t le, + uint32_t region_size) +{ + char *image_name; + struct alloced_area *aa; + struct lv_segment *seg, *new_seg; + uint32_t current_le = le; + uint32_t s; + struct segment_type *segtype; + struct logical_volume *orig_lv, *copy_lv; + + if (!lv_is_pvmove(lv)) { + log_error(INTERNAL_ERROR + "Non-pvmove LV, %s, passed as argument.", + display_lvname(lv)); + return 0; + } + + if (seg_type(first_seg(lv), 0) != AREA_PV) { + log_error(INTERNAL_ERROR + "Bad segment type for first segment area."); + return 0; + } + + /* + * If the allocator provided two or more PV allocations for any + * single segment of the original LV, that LV segment must be + * split up to match. 
+ */ + dm_list_iterate_items(aa, &ah->alloced_areas[0]) { + if (!(seg = find_seg_by_le(lv, current_le))) { + log_error("Failed to find segment for %s extent " FMTu32 ".", + display_lvname(lv), current_le); + return 0; + } + + /* Allocator assures aa[0].len <= seg->area_len */ + if (aa[0].len < seg->area_len) { + if (!lv_split_segment(lv, seg->le + aa[0].len)) { + log_error("Failed to split segment at %s " + "extent " FMTu32 ".", + display_lvname(lv), le); + return 0; + } + } + current_le += seg->area_len; + } + + current_le = le; + + if (!insert_layer_for_lv(lv->vg->cmd, lv, PVMOVE, "_mimage_0")) { + log_error("Failed to build pvmove LV-type mirror %s.", + display_lvname(lv)); + return 0; + } + orig_lv = seg_lv(first_seg(lv), 0); + if (!(image_name = dm_pool_strdup(lv->vg->vgmem, orig_lv->name))) + return_0; + image_name[strlen(image_name) - 1] = '1'; + + if (!(copy_lv = lv_create_empty(image_name, NULL, + orig_lv->status, + ALLOC_INHERIT, lv->vg))) + return_0; + + if (!lv_add_mirror_lvs(lv, &copy_lv, 1, MIRROR_IMAGE, region_size)) + return_0; + + if (!(segtype = get_segtype_from_string(lv->vg->cmd, SEG_TYPE_NAME_STRIPED))) + return_0; + + dm_list_iterate_items(aa, &ah->alloced_areas[0]) { + if (!(seg = find_seg_by_le(orig_lv, current_le))) { + log_error("Failed to find segment for %s extent " FMTu32 ".", + display_lvname(lv), current_le); + return 0; + } + + if (!(new_seg = alloc_lv_segment(segtype, copy_lv, + seg->le, seg->len, 0, PVMOVE, 0, + NULL, 1, seg->len, 0, + 0, 0, 0, NULL))) + return_0; + + for (s = 0; s < ah->area_count; s++) { + if (!set_lv_segment_area_pv(new_seg, s, + aa[s].pv, aa[s].pe)) + return_0; + } + + dm_list_add(&copy_lv->segments, &new_seg->list); + + current_le += seg->area_len; + copy_lv->le_count += seg->area_len; + } + lv->status |= MIRRORED; + + /* FIXME: add log */ + + if (lv->vg->fid->fmt->ops->lv_setup && + !lv->vg->fid->fmt->ops->lv_setup(lv->vg->fid, lv)) + return_0; + + return 1; +} + +/* + * Add new areas to mirrored segments + */ int lv_add_mirror_areas(struct alloc_handle *ah, struct logical_volume *lv, uint32_t le, uint32_t region_size) @@ -2339,16 +4038,16 @@ int lv_add_mirror_areas(struct alloc_handle *ah, dm_list_iterate_items(aa, &ah->alloced_areas[0]) { if (!(seg = find_seg_by_le(lv, current_le))) { - log_error("Failed to find segment for %s extent %" - PRIu32, lv->name, current_le); + log_error("Failed to find segment for %s extent " FMTu32 ".", + display_lvname(lv), current_le); return 0; } /* Allocator assures aa[0].len <= seg->area_len */ if (aa[0].len < seg->area_len) { if (!lv_split_segment(lv, seg->le + aa[0].len)) { - log_error("Failed to split segment at %s " - "extent %" PRIu32, lv->name, le); + log_error("Failed to split segment at %s extent " FMTu32 ".", + display_lvname(lv), le); return 0; } } @@ -2360,7 +4059,7 @@ int lv_add_mirror_areas, old_area_count = seg->area_count; new_area_count = old_area_count + ah->area_count; - if (!_lv_segment_add_areas(lv, seg, new_area_count)) + if (!add_lv_segment_areas(seg, new_area_count)) return_0; for (s = 0; s < ah->area_count; s++) { @@ -2389,36 +4088,31 @@ int lv_add_mirror_lvs(struct logical_volume *lv, uint32_t num_extra_areas, uint64_t status, uint32_t region_size) { - struct lv_segment *seg; - uint32_t old_area_count, new_area_count; uint32_t m; + uint32_t old_area_count, new_area_count; struct segment_type *mirror_segtype; - - seg = first_seg(lv); + struct lv_segment *seg = first_seg(lv); if (dm_list_size(&lv->segments) != 1 || seg_type(seg, 0) != AREA_LV) { - 
log_error("Mirror layer must be inserted before adding mirrors"); + log_error(INTERNAL_ERROR "Mirror layer must be inserted before adding mirrors."); return 0; } - mirror_segtype = get_segtype_from_string(lv->vg->cmd, "mirror"); + mirror_segtype = get_segtype_from_string(lv->vg->cmd, SEG_TYPE_NAME_MIRROR); if (seg->segtype != mirror_segtype) if (!(seg = _convert_seg_to_mirror(seg, region_size, NULL))) return_0; if (region_size && region_size != seg->region_size) { - log_error("Conflicting region_size"); + log_error("Conflicting region_size %u != %u.", region_size, seg->region_size); return 0; } old_area_count = seg->area_count; new_area_count = old_area_count + num_extra_areas; - if (!_lv_segment_add_areas(lv, seg, new_area_count)) { - log_error("Failed to allocate widened LV segment for %s.", - lv->name); - return 0; - } + if (!add_lv_segment_areas(seg, new_area_count)) + return_0; for (m = 0; m < old_area_count; m++) seg_lv(seg, m)->status |= status; @@ -2460,8 +4154,7 @@ int lv_add_log_segment(struct alloc_handle *ah, uint32_t first_area, { return lv_add_segment(ah, ah->area_count + first_area, 1, log_lv, - get_segtype_from_string(log_lv->vg->cmd, - "striped"), + get_segtype_from_string(log_lv->vg->cmd, SEG_TYPE_NAME_STRIPED), 0, status, 0); } @@ -2474,8 +4167,7 @@ static int _lv_insert_empty_sublvs(struct logical_volume *lv, uint32_t i; uint64_t sub_lv_status = 0; const char *layer_name; - size_t len = strlen(lv->name) + 32; - char img_name[len]; + char img_name[NAME_LEN]; struct lv_segment *mapseg; if (lv->le_count || !dm_list_empty(&lv->segments)) { @@ -2498,10 +4190,11 @@ static int _lv_insert_empty_sublvs(struct logical_volume *lv, /* * First, create our top-level segment for our top-level LV */ - if (!(mapseg = alloc_lv_segment(segtype, lv, 0, 0, lv->status, - stripe_size, NULL, NULL, - devices, 0, 0, region_size, 0, NULL))) { - log_error("Failed to create mapping segment for %s", lv->name); + if (!(mapseg = alloc_lv_segment(segtype, lv, 0, 0, 0, lv->status, + stripe_size, NULL, + devices, 0, 0, 0, region_size, 0, NULL))) { + log_error("Failed to create mapping segment for %s.", + display_lvname(lv)); return 0; } @@ -2511,80 +4204,122 @@ static int _lv_insert_empty_sublvs(struct logical_volume *lv, for (i = 0; i < devices; i++) { /* Data LVs */ if (devices > 1) { - if (dm_snprintf(img_name, len, "%s_%s_%u", + if (dm_snprintf(img_name, sizeof(img_name), "%s_%s_%u", lv->name, layer_name, i) < 0) - return_0; + goto_bad; } else { - if (dm_snprintf(img_name, len, "%s_%s", + if (dm_snprintf(img_name, sizeof(img_name), "%s_%s", lv->name, layer_name) < 0) - return_0; + goto_bad; } /* FIXME Should use ALLOC_INHERIT here and inherit from parent LV */ if (!(sub_lv = lv_create_empty(img_name, NULL, - LVM_READ | LVM_WRITE, - lv->alloc, lv->vg))) + LVM_READ | LVM_WRITE, + lv->alloc, lv->vg))) return_0; if (!set_lv_segment_area_lv(mapseg, i, sub_lv, 0, sub_lv_status)) return_0; /* Metadata LVs for raid */ - if (segtype_is_raid(segtype)) { - if (dm_snprintf(img_name, len, "%s_rmeta_%u", lv->name, i) < 0) + if (segtype_is_raid_with_meta(segtype)) { + if (dm_snprintf(img_name, sizeof(img_name), "%s_rmeta_%u", + lv->name, i) < 0) + goto_bad; + /* FIXME Should use ALLOC_INHERIT here and inherit from parent LV */ + if (!(sub_lv = lv_create_empty(img_name, NULL, + LVM_READ | LVM_WRITE, + lv->alloc, lv->vg))) return_0; - } else - continue; - - /* FIXME Should use ALLOC_INHERIT here and inherit from parent LV */ - if (!(sub_lv = lv_create_empty(img_name, NULL, - LVM_READ | LVM_WRITE, - lv->alloc, 
lv->vg))) - return_0; - if (!set_lv_segment_area_lv(mapseg, i, sub_lv, 0, RAID_META)) + if (!set_lv_segment_area_lv(mapseg, i, sub_lv, 0, RAID_META)) return_0; + } } dm_list_add(&lv->segments, &mapseg->list); return 1; + +bad: + log_error("Failed to create sub LV name for LV %s.", + display_lvname(lv)); + + return 0; +} + +/* Add all rmeta SubLVs for @seg to @lvs and return allocated @lvl to free by caller. */ +static struct lv_list *_raid_list_metalvs(struct lv_segment *seg, struct dm_list *lvs) +{ + uint32_t s; + struct lv_list *lvl; + + dm_list_init(lvs); + + if (!(lvl = dm_pool_alloc(seg->lv->vg->vgmem, sizeof(*lvl) * seg->area_count))) + return_NULL; + + for (s = 0; s < seg->area_count; s++) { + lvl[s].lv = seg_metalv(seg, s); + dm_list_add(lvs, &lvl[s].list); + } + + return lvl; } static int _lv_extend_layered_lv(struct alloc_handle *ah, struct logical_volume *lv, uint32_t extents, uint32_t first_area, - uint32_t stripes, uint32_t stripe_size) + uint32_t mirrors, uint32_t stripes, uint32_t stripe_size) { + struct logical_volume *sub_lvs[DEFAULT_RAID_MAX_IMAGES]; const struct segment_type *segtype; - struct logical_volume *sub_lv, *meta_lv; - struct lv_segment *seg; + struct logical_volume *meta_lv, *sub_lv; + struct lv_segment *seg = first_seg(lv); + struct lv_segment *sub_lv_seg; uint32_t fa, s; int clear_metadata = 0; + int integrity_sub_lvs = 0; + uint32_t area_multiple = 1; - segtype = get_segtype_from_string(lv->vg->cmd, "striped"); + if (!(segtype = get_segtype_from_string(lv->vg->cmd, SEG_TYPE_NAME_STRIPED))) + return_0; /* * The component devices of a "striped" LV all go in the same * LV. However, RAID has an LV for each device - making the * 'stripes' and 'stripe_size' parameters meaningless. */ - if (seg_is_raid(first_seg(lv))) { + if (seg_is_raid(seg)) { stripes = 1; stripe_size = 0; + if (seg_is_any_raid0(seg)) + area_multiple = seg->area_count; + } + + for (s = 0; s < seg->area_count; s++) { + sub_lv = seg_lv(seg, s); + sub_lv_seg = sub_lv ? first_seg(sub_lv) : NULL; + + if (sub_lv_seg && seg_is_integrity(sub_lv_seg)) { + sub_lvs[s] = seg_lv(sub_lv_seg, 0); + integrity_sub_lvs = 1; + } else + sub_lvs[s] = sub_lv; } - seg = first_seg(lv); for (fa = first_area, s = 0; s < seg->area_count; s++) { - if (is_temporary_mirror_layer(seg_lv(seg, s))) { - if (!_lv_extend_layered_lv(ah, seg_lv(seg, s), extents, - fa, stripes, stripe_size)) + sub_lv = sub_lvs[s]; + + if (is_temporary_mirror_layer(sub_lv)) { + if (!_lv_extend_layered_lv(ah, sub_lv, extents / area_multiple, + fa, mirrors, stripes, stripe_size)) return_0; - fa += lv_mirror_count(seg_lv(seg, s)); + fa += lv_mirror_count(sub_lv); continue; } - sub_lv = seg_lv(seg, s); if (!lv_add_segment(ah, fa, stripes, sub_lv, segtype, stripe_size, sub_lv->status, 0)) { log_error("Aborting. Failed to extend %s in %s.", @@ -2592,8 +4327,10 @@ static int _lv_extend_layered_lv(struct alloc_handle *ah, return 0; } + last_seg(lv)->data_copies = mirrors; + /* Extend metadata LVs only on initial creation */ - if (seg_is_raid(seg) && !lv->le_count) { + if (seg_is_raid_with_meta(seg) && !lv->le_count) { if (!seg->meta_areas) { log_error("No meta_areas for RAID type"); return 0; @@ -2608,115 +4345,224 @@ static int _lv_extend_layered_lv(struct alloc_handle *ah, return 0; } lv_set_visible(meta_lv); + + /* + * Copy any tags from the new LV to the metadata LV so + * it can be activated temporarily. 
+ */ + if (!str_list_dup(meta_lv->vg->vgmem, &meta_lv->tags, &lv->tags)) { + log_error("Failed to copy tags onto LV %s to clear metadata.", display_lvname(meta_lv)); + return 0; + } + clear_metadata = 1; } fa += stripes; } - if (clear_metadata) { - /* - * We must clear the metadata areas upon creation. - */ - if (!vg_write(lv->vg) || !vg_commit(lv->vg)) - return_0; - + /* + * In raid+integrity, the lv_iorig raid images have been extended above. + * Now propagate the new lv_iorig sizes up to the integrity LV layers + * that are referencing the lv_iorig. + */ + if (integrity_sub_lvs) { for (s = 0; s < seg->area_count; s++) { - meta_lv = seg_metalv(seg, s); + struct logical_volume *lv_image; + struct logical_volume *lv_iorig; + struct lv_segment *seg_image; - if (test_mode()) { - lv_set_hidden(meta_lv); - continue; - } + lv_image = seg_lv(seg, s); + seg_image = first_seg(lv_image); - if (!activate_lv(meta_lv->vg->cmd, meta_lv)) { - log_error("Failed to activate %s/%s for clearing", - meta_lv->vg->name, meta_lv->name); + if (!seg_image->integrity_meta_dev) { + log_error("1"); return 0; } - log_verbose("Clearing metadata area of %s/%s", - meta_lv->vg->name, meta_lv->name); - /* - * Rather than wiping meta_lv->size, we can simply - * wipe '1' to remove the superblock of any previous - * RAID devices. It is much quicker. - */ - if (!set_lv(meta_lv->vg->cmd, meta_lv, 1, 0)) { - log_error("Failed to zero %s/%s", - meta_lv->vg->name, meta_lv->name); + if (!(lv_iorig = seg_lv(seg_image, 0))) { + log_error("2"); return 0; } - if (!deactivate_lv(meta_lv->vg->cmd, meta_lv)) { - log_error("Failed to deactivate %s/%s", - meta_lv->vg->name, meta_lv->name); + /* new size in sectors */ + lv_image->size = lv_iorig->size; + seg_image->integrity_data_sectors = lv_iorig->size; + /* new size in extents */ + lv_image->le_count = lv_iorig->le_count; + seg_image->len = lv_iorig->le_count; + seg_image->area_len = lv_iorig->le_count; + } + } + + seg->len += extents; + if (seg_is_raid(seg)) + seg->area_len = seg->len; + else + seg->area_len += extents / area_multiple; + + if (!_setup_lv_size(lv, lv->le_count + extents)) + return_0; + + if (clear_metadata) { + struct volume_group *vg = lv->vg; + + /* + * We must clear the metadata areas upon creation. + */ + + /* + * Declare the new RaidLV as temporary to avoid visible SubLV + * failures on activation until after we wiped them so that + * we can avoid activating crashed, potentially partially + * wiped RaidLVs. + */ + lv->status |= LV_ACTIVATION_SKIP; + + if (test_mode()) { + /* FIXME VG is not in a fully-consistent state here and should not be committed! 
*/ + if (!vg_write(vg) || !vg_commit(vg)) + return_0; + + log_verbose("Test mode: Skipping wiping of metadata areas."); + } else { + struct dm_list meta_lvs; + struct lv_list *lvl; + + if (!(lvl = _raid_list_metalvs(seg, &meta_lvs))) return 0; + + /* Wipe lv list committing metadata */ + if (!activate_and_wipe_lvlist(&meta_lvs, 1)) { + /* If we failed clearing rmeta SubLVs, try removing the new RaidLV */ + if (!lv_remove(lv)) + log_error("Failed to remove LV"); + else if (!vg_write(vg) || !vg_commit(vg)) + log_error("Failed to commit VG %s", vg->name); + return_0; } - lv_set_hidden(meta_lv); + + dm_pool_free(vg->vgmem, lvl); } + + for (s = 0; s < seg->area_count; s++) + lv_set_hidden(seg_metalv(seg, s)); + + lv->status &= ~LV_ACTIVATION_SKIP; } - seg->area_len += extents; - seg->len += extents; - lv->le_count += extents; - lv->size += (uint64_t) extents * lv->vg->extent_size; + return 1; +} - /* - * The MD bitmap is limited to being able to track 2^21 regions. - * The region_size must be adjusted to meet that criteria. - */ - while (seg_is_raid(seg) && (seg->region_size < (lv->size / (1 << 21)))) { - seg->region_size *= 2; - log_very_verbose("Forced to adjust RAID region_size to %uS", - seg->region_size); +/* Check either RAID images and metas are being allocated redundantly. */ +static int _lv_raid_redundant(struct logical_volume *lv, + struct dm_list *allocatable_pvs, int meta) +{ + uint32_t nlvs, s; + struct lv_segment *seg = first_seg(lv); + struct pv_list *pvl; + + if (meta && !seg->meta_areas) + return 1; + + dm_list_iterate_items(pvl, allocatable_pvs) { + nlvs = 0; + + for (s = 0; s < seg->area_count; s++) { + struct logical_volume *slv = meta ? seg_metalv(seg, s) : seg_lv(seg, s); + + if (slv && lv_is_on_pv(slv, pvl->pv) && nlvs++) { + log_error("LV %s using PV %s is not redundant.", + display_lvname(slv), dev_name(pvl->pv->dev)); + return 0; + } + } } return 1; } +/* Check both RAID images and metas are being allocated redundantly. */ +static int _lv_raid_redundant_allocation(struct logical_volume *lv, struct dm_list *allocatable_pvs) +{ + return _lv_raid_redundant(lv, allocatable_pvs, 0) && + _lv_raid_redundant(lv, allocatable_pvs, 1); +} + /* * Entry point for single-step LV allocation + extension. + * Extents is the number of logical extents to append to the LV unless + * approx_alloc is set when it is an upper limit for the total number of + * extents to use from the VG. + * + * FIXME The approx_alloc raid/stripe conversion should be performed + * before calling this function. */ int lv_extend(struct logical_volume *lv, const struct segment_type *segtype, uint32_t stripes, uint32_t stripe_size, uint32_t mirrors, uint32_t region_size, - uint32_t extents, const char *thin_pool_name, - struct dm_list *allocatable_pvs, alloc_policy_t alloc) + uint32_t extents, + struct dm_list *allocatable_pvs, alloc_policy_t alloc, + int approx_alloc) { int r = 1; int log_count = 0; struct alloc_handle *ah; uint32_t sub_lv_count; + uint32_t old_extents; + uint32_t new_extents; /* Total logical size after extension. */ + uint64_t raid_size; - log_very_verbose("Extending segment type, %s", segtype->name); + log_very_verbose("Adding segment of type %s to LV %s.", segtype->name, lv->name); if (segtype_is_virtual(segtype)) - return lv_add_virtual_segment(lv, 0u, extents, segtype, thin_pool_name); - - if (!lv->le_count && segtype_is_thin_pool(segtype)) { - /* Thin pool allocation treats its metadata device like a mirror log. 
*/ - /* FIXME Allow pool and data on same device with NORMAL */ - /* FIXME Support striped metadata pool */ - log_count = 1; - } else if (segtype_is_raid(segtype) && !lv->le_count) - log_count = mirrors * stripes; + return lv_add_virtual_segment(lv, 0u, extents, segtype); + + if (!lv->le_count) { + if (segtype_is_pool(segtype)) + /* + * Pool allocations treat the metadata device like a mirror log. + */ + /* FIXME Support striped metadata pool */ + log_count = 1; + else if (segtype_is_raid0_meta(segtype)) + /* Extend raid0 metadata LVs too */ + log_count = stripes; + else if (segtype_is_raid_with_meta(segtype)) + log_count = mirrors * stripes; + } /* FIXME log_count should be 1 for mirrors */ + if (segtype_is_raid(segtype) && !segtype_is_any_raid0(segtype)) { + raid_size = ((uint64_t) lv->le_count + extents) * lv->vg->extent_size; + + /* + * The MD bitmap is limited to being able to track 2^21 regions. + * The region_size must be adjusted to meet that criteria + * unless raid0/raid0_meta, which doesn't have a bitmap. + */ + + region_size = raid_ensure_min_region_size(lv, raid_size, region_size); + + if (first_seg(lv)) + first_seg(lv)->region_size = region_size; + + } + if (!(ah = allocate_extents(lv->vg, lv, segtype, stripes, mirrors, log_count, region_size, extents, - allocatable_pvs, alloc, NULL))) + allocatable_pvs, alloc, approx_alloc, NULL))) return_0; - if (segtype_is_thin_pool(segtype)) { - if (!lv->le_count) { - if (!(r = extend_pool(lv, segtype, ah, stripes, stripe_size))) - stack; - } else if (!(r = _lv_extend_layered_lv(ah, lv, extents, 0, - stripes, stripe_size))) + new_extents = ah->new_extents; + if (segtype_is_raid_with_meta(segtype)) + new_extents -= ah->log_len * ah->area_multiple; + + if (segtype_is_pool(segtype)) { + if (!(r = create_pool(lv, segtype, ah, stripes, stripe_size))) stack; - } else if (!segtype_is_mirrored(segtype) && !segtype_is_raid(segtype)) { + } else if (!segtype_is_mirror(segtype) && !segtype_is_raid(segtype)) { if (!(r = lv_add_segment(ah, 0, ah->area_count, lv, segtype, stripe_size, 0u, 0))) stack; @@ -2732,6 +4578,8 @@ int lv_extend(struct logical_volume *lv, else sub_lv_count = mirrors; + old_extents = lv->le_count; + if (!lv->le_count && !(r = _lv_insert_empty_sublvs(lv, segtype, stripe_size, region_size, sub_lv_count))) { @@ -2739,48 +4587,64 @@ int lv_extend(struct logical_volume *lv, goto out; } - if (!(r = _lv_extend_layered_lv(ah, lv, extents, 0, - stripes, stripe_size))) + if (!(r = _lv_extend_layered_lv(ah, lv, new_extents - lv->le_count, 0, + mirrors, stripes, stripe_size))) goto_out; + if (segtype_is_raid(segtype) && + alloc != ALLOC_ANYWHERE && + !(r = _lv_raid_redundant_allocation(lv, allocatable_pvs))) { + log_error("Insufficient suitable allocatable extents for logical volume %s", display_lvname(lv)); + if (!lv_remove(lv) || !vg_write(lv->vg) || !vg_commit(lv->vg)) + return_0; + goto out; + } + + if (lv_raid_has_integrity(lv)) { + if (!lv_extend_integrity_in_raid(lv, allocatable_pvs)) { + r = 0; + goto_out; + } + } + /* * If we are expanding an existing mirror, we can skip the * resync of the extension if the LV is currently in-sync * and the LV has the LV_NOTSYNCED flag set. */ - if ((lv->le_count != extents) && + if (old_extents && segtype_is_mirrored(segtype) && - (lv->status & LV_NOTSYNCED)) { - percent_t sync_percent = PERCENT_INVALID; + (lv_is_not_synced(lv))) { + dm_percent_t sync_percent = DM_PERCENT_INVALID; if (!lv_is_active(lv)) { - log_error("%s/%s is not active." 
- " Unable to get sync percent.", - lv->vg->name, lv->name); + log_error("Unable to read sync percent while LV %s " + "is not locally active.", display_lvname(lv)); /* FIXME Support --force */ if (yes_no_prompt("Do full resync of extended " - "portion of %s/%s? [y/n]: ", - lv->vg->name, lv->name) == 'y') - goto out; - r = 0; + "portion of %s? [y/n]: ", + display_lvname(lv)) == 'n') { + r = 0; + goto_out; + } goto out; } if (!(r = lv_mirror_percent(lv->vg->cmd, lv, 0, &sync_percent, NULL))) { - log_error("Failed to get sync percent for %s/%s", - lv->vg->name, lv->name); + log_error("Failed to get sync percent for %s.", + display_lvname(lv)); goto out; - } else if (sync_percent == PERCENT_100) { + } else if (lv_is_not_synced(lv) || + sync_percent == DM_PERCENT_100) { log_verbose("Skipping initial resync for " - "extended portion of %s/%s", - lv->vg->name, lv->name); + "extended portion of %s", + display_lvname(lv)); init_mirror_in_sync(1); lv->status |= LV_NOTSYNCED; } else { - log_error("%s/%s cannot be extended while" - " it is recovering.", - lv->vg->name, lv->name); + log_error("LV %s cannot be extended while it " + "is recovering.", display_lvname(lv)); r = 0; goto out; } @@ -2795,19 +4659,21 @@ out: /* * Minimal LV renaming function. * Metadata transaction should be made by caller. - * Assumes new_name is allocated from cmd->mem pool. + * Assumes new_name is allocated from lv->vgmem pool. */ static int _rename_single_lv(struct logical_volume *lv, char *new_name) { struct volume_group *vg = lv->vg; + int historical; - if (find_lv_in_vg(vg, new_name)) { - log_error("Logical volume \"%s\" already exists in " - "volume group \"%s\"", new_name, vg->name); + if (lv_name_is_used_in_vg(vg, new_name, &historical)) { + log_error("%sLogical Volume \"%s\" already exists in " + "volume group \"%s\"", historical ? "historical " : "", + new_name, vg->name); return 0; } - if (lv->status & LOCKED) { + if (lv_is_locked(lv)) { log_error("Cannot rename locked LV %s", lv->name); return 0; } @@ -2821,8 +4687,7 @@ static int _rename_single_lv(struct logical_volume *lv, char *new_name) * Rename sub LV. * 'lv_name_old' and 'lv_name_new' are old and new names of the main LV. */ -static int _rename_sub_lv(struct cmd_context *cmd, - struct logical_volume *lv, +static int _rename_sub_lv(struct logical_volume *lv, const char *lv_name_old, const char *lv_name_new) { const char *suffix; @@ -2849,7 +4714,7 @@ static int _rename_sub_lv(struct cmd_context *cmd, * a new name for main LV is "lvol1" */ len = strlen(lv_name_new) + strlen(suffix) + 1; - new_name = dm_pool_alloc(cmd->mem, len); + new_name = dm_pool_alloc(lv->vg->vgmem, len); if (!new_name) { log_error("Failed to allocate space for new name"); return 0; @@ -2859,73 +4724,100 @@ static int _rename_sub_lv(struct cmd_context *cmd, return 0; } + if (!validate_name(new_name)) { + log_error("Cannot rename \"%s\". 
New logical volume name \"%s\" is invalid.", + lv->name, new_name); + return 0; + } + /* Rename it */ return _rename_single_lv(lv, new_name); } /* Callback for for_each_sub_lv */ -static int _rename_cb(struct cmd_context *cmd, struct logical_volume *lv, - void *data) +static int _rename_cb(struct logical_volume *lv, void *data) { struct lv_names *lv_names = (struct lv_names *) data; - return _rename_sub_lv(cmd, lv, lv_names->old, lv_names->new); + return _rename_sub_lv(lv, lv_names->old, lv_names->new); +} + +static int _rename_skip_pools_externals_cb(struct logical_volume *lv, void *data) +{ + if (lv_is_pool(lv) || + lv_is_vdo_pool(lv) || + lv_is_cache_vol(lv) || + lv_is_external_origin(lv)) + return -1; /* and skip subLVs */ + + return _rename_cb(lv, data); } /* * Loop down sub LVs and call fn for each. * fn is responsible to log necessary information on failure. + * Return value '0' stops whole traversal. + * Return value '-1' stops subtree traversal. */ -int for_each_sub_lv(struct cmd_context *cmd, struct logical_volume *lv, - int (*fn)(struct cmd_context *cmd, - struct logical_volume *lv, void *data), - void *data) +static int _for_each_sub_lv(struct logical_volume *lv, int level, + int (*fn)(struct logical_volume *lv, void *data), + void *data) { struct logical_volume *org; struct lv_segment *seg; uint32_t s; + int r; - if (lv_is_cow(lv) && lv_is_virtual_origin(org = origin_from_cow(lv))) { - if (!fn(cmd, org, data)) + if (!lv) + return 1; + + if (level++) { + if (!(r = fn(lv, data))) return_0; - if (!for_each_sub_lv(cmd, org, fn, data)) + if (r == -1) + return 1; + /* Only r != -1 continues with for_each_sub_lv()... */ + } + + if (lv_is_cow(lv) && lv_is_virtual_origin(org = origin_from_cow(lv))) { + if (!_for_each_sub_lv(org, level, fn, data)) return_0; } dm_list_iterate_items(seg, &lv->segments) { - if (seg->log_lv) { - if (!fn(cmd, seg->log_lv, data)) - return_0; - if (!for_each_sub_lv(cmd, seg->log_lv, fn, data)) - return_0; - } + if (!_for_each_sub_lv(seg->external_lv, level, fn, data)) + return_0; - if (seg->metadata_lv) { - if (!fn(cmd, seg->metadata_lv, data)) - return_0; - if (!for_each_sub_lv(cmd, seg->metadata_lv, fn, data)) - return_0; - } + if (!_for_each_sub_lv(seg->log_lv, level, fn, data)) + return_0; + + if (!_for_each_sub_lv(seg->metadata_lv, level, fn, data)) + return_0; + + if (!_for_each_sub_lv(seg->pool_lv, level, fn, data)) + return_0; + + if (!_for_each_sub_lv(seg->writecache, level, fn, data)) + return_0; + + if (!_for_each_sub_lv(seg->integrity_meta_dev, level, fn, data)) + return_0; for (s = 0; s < seg->area_count; s++) { if (seg_type(seg, s) != AREA_LV) continue; - if (!fn(cmd, seg_lv(seg, s), data)) - return_0; - if (!for_each_sub_lv(cmd, seg_lv(seg, s), fn, data)) + if (!_for_each_sub_lv(seg_lv(seg, s), level, fn, data)) return_0; } - if (!seg_is_raid(seg)) + if (!seg_is_raid_with_meta(seg)) continue; /* RAID has meta_areas */ for (s = 0; s < seg->area_count; s++) { - if (seg_metatype(seg, s) != AREA_LV) + if ((seg_metatype(seg, s) != AREA_LV) || !seg_metalv(seg, s)) continue; - if (!fn(cmd, seg_metalv(seg, s), data)) - return_0; - if (!for_each_sub_lv(cmd, seg_metalv(seg, s), fn, data)) + if (!_for_each_sub_lv(seg_metalv(seg, s), level, fn, data)) return_0; } } @@ -2933,6 +4825,12 @@ int for_each_sub_lv(struct cmd_context *cmd, struct logical_volume *lv, return 1; } +int for_each_sub_lv(struct logical_volume *lv, + int (*fn)(struct logical_volume *lv, void *data), + void *data) +{ + return _for_each_sub_lv(lv, 0, fn, data); +} /* * Core of LV renaming 
routine. @@ -2942,90 +4840,2318 @@ int lv_rename_update(struct cmd_context *cmd, struct logical_volume *lv, const char *new_name, int update_mda) { struct volume_group *vg = lv->vg; - struct lv_names lv_names; - DM_LIST_INIT(lvs_changed); - struct lv_list lvl, lvl2, *lvlp; - int r = 0; + struct lv_names lv_names = { .old = lv->name }; + int old_lv_is_historical = lv_is_historical(lv); + int historical; + unsigned attrs; + const struct segment_type *segtype; - /* rename is not allowed on sub LVs */ - if (!lv_is_visible(lv)) { + /* + * rename is not allowed on sub LVs except for pools + * (thin pool is 'visible', but cache may not) + */ + if (!lv_is_pool(lv) && + !lv_is_vdo_pool(lv) && + !lv_is_visible(lv)) { log_error("Cannot rename internal LV \"%s\".", lv->name); return 0; } - if (find_lv_in_vg(vg, new_name)) { - log_error("Logical volume \"%s\" already exists in " - "volume group \"%s\"", new_name, vg->name); + if (lv_name_is_used_in_vg(vg, new_name, &historical)) { + log_error("%sLogical Volume \"%s\" already exists in " + "volume group \"%s\"", historical ? "Historical " : "", + new_name, vg->name); return 0; } - if (lv->status & LOCKED) { + if (lv_is_locked(lv)) { log_error("Cannot rename locked LV %s", lv->name); return 0; } - if (update_mda && !archive(vg)) - return 0; + if (lv_is_vdo_pool(lv) && lv_is_active(lv_lock_holder(lv))) { + segtype = first_seg(lv)->segtype; + if (!segtype->ops->target_present || + !segtype->ops->target_present(lv->vg->cmd, NULL, &attrs) || + !(attrs & VDO_FEATURE_ONLINE_RENAME)) { + log_error("Cannot rename active VDOPOOL volume %s, " + "VDO target feature support is missing.", + display_lvname(lv)); + return 0; + } + } + + if (old_lv_is_historical) { + /* + * Historical LVs have neither sub LVs nor any + * devices to reload, so just update metadata. + */ + lv->this_glv->historical->name = lv->name = new_name; + if (update_mda && + (!vg_write(vg) || !vg_commit(vg))) + return_0; + } else { + if (!(lv_names.new = dm_pool_strdup(cmd->mem, new_name))) { + log_error("Failed to allocate space for new name."); + return 0; + } + + /* rename sub LVs */ + if (!for_each_sub_lv(lv, _rename_skip_pools_externals_cb, (void *) &lv_names)) + return_0; + + /* rename main LV */ + lv->name = lv_names.new; + + if (lv_is_cow(lv)) + lv = origin_from_cow(lv); + + if (update_mda && !lv_update_and_reload((struct logical_volume *)lv_lock_holder(lv))) + return_0; + } + + return 1; +} + +/* + * Rename LV to new name, if name is occupies, lvol% is generated. + * VG must be locked by caller. + */ +int lv_uniq_rename_update(struct cmd_context *cmd, struct logical_volume *lv, + const char *new_name, int update_mda) +{ + char uniq_name[NAME_LEN]; - /* rename sub LVs */ - lv_names.old = lv->name; - lv_names.new = new_name; - if (!for_each_sub_lv(cmd, lv, _rename_cb, (void *) &lv_names)) + /* If the name is in use, generate new lvol%d */ + if (lv_name_is_used_in_vg(lv->vg, new_name, NULL)) { + if (!generate_lv_name(lv->vg, "lvol%d", uniq_name, sizeof(uniq_name))) { + log_error("Failed to generate unique name for unused logical volume."); + return 0; + } + new_name = uniq_name; + } + + if (!lv_rename_update(cmd, lv, new_name, 0)) + return_0; + + return 1; +} + +/* + * Core of LV renaming routine. + * VG must be locked by caller. 
+ */ +int lv_rename(struct cmd_context *cmd, struct logical_volume *lv, + const char *new_name) +{ + return lv_rename_update(cmd, lv, new_name, 1); +} + +/* + * Core lv resize code + */ + +#define SIZE_BUF 128 + +/* TODO: unify stripe size validation across source code */ +static int _validate_stripesize(const struct volume_group *vg, + struct lvresize_params *lp) +{ + if (lp->stripe_size > (STRIPE_SIZE_LIMIT * 2)) { + log_error("Stripe size cannot be larger than %s.", + display_size(vg->cmd, (uint64_t) STRIPE_SIZE_LIMIT)); return 0; + } - /* rename main LV */ - if (!(lv->name = dm_pool_strdup(cmd->mem, new_name))) { - log_error("Failed to allocate space for new name"); + if (lp->stripe_size > vg->extent_size) { + log_print_unless_silent("Reducing stripe size %s to maximum, " + "physical extent size %s.", + display_size(vg->cmd, lp->stripe_size), + display_size(vg->cmd, vg->extent_size)); + lp->stripe_size = vg->extent_size; + } + + if (!is_power_of_2(lp->stripe_size)) { + log_error("Stripe size must be power of 2."); return 0; } - lvl.lv = lv; - dm_list_add(&lvs_changed, &lvl.list); + return 1; +} + +static int _lv_reduce_confirmation(struct logical_volume *lv, + struct lvresize_params *lp) +{ + const struct volume_group *vg = lv->vg; + struct lvinfo info = { 0 }; - /* rename active virtual origin too */ - if (lv_is_cow(lv) && lv_is_virtual_origin(lvl2.lv = origin_from_cow(lv))) - dm_list_add_h(&lvs_changed, &lvl2.list); + if (!lv_info(vg->cmd, lv, 0, &info, 1, 0) && driver_version(NULL, 0)) { + log_error("lv_info failed: aborting."); + return 0; + } - if (!update_mda) + if (!info.exists) return 1; - log_verbose("Writing out updated volume group"); - if (!vg_write(vg)) + log_warn("WARNING: Reducing active%s logical volume to %s.", + info.open_count ? " and open" : "", + display_size(vg->cmd, (uint64_t) lp->extents * vg->extent_size)); + + log_warn("THIS MAY DESTROY YOUR DATA (filesystem etc.)"); + + if (!lp->force && !lp->yes) { + if (yes_no_prompt("Do you really want to reduce %s? [y/n]: ", + display_lvname(lv)) == 'n') { + log_error("Logical volume %s NOT reduced.", + display_lvname(lv)); + return 0; + } + } + + return 1; +} + +enum fsadm_cmd_e { FSADM_CMD_CHECK, FSADM_CMD_RESIZE }; + +#define FSADM_CMD_MAX_ARGS 10 +#define FSADM_CHECK_FAILS_FOR_MOUNTED 3 /* shell exist status code */ + +/* + * fsadm --dry-run --verbose --force check lv_path + * fsadm --dry-run --verbose --force resize lv_path size + */ +static int _fsadm_cmd(enum fsadm_cmd_e fcmd, + struct logical_volume *lv, + uint32_t extents, + int yes, + int force, + int *status) +{ + struct volume_group *vg = lv->vg; + struct cmd_context *cmd = vg->cmd; + char lv_path[PATH_MAX]; + char size_buf[SIZE_BUF]; + unsigned i = 1; + const char *argv[FSADM_CMD_MAX_ARGS] = { + find_config_tree_str(cmd, global_fsadm_executable_CFG, NULL) + }; + + if (!argv[0] || !*argv[0]) { + log_error("Cannot use misconfigured fsadm executable to resize %s.", display_lvname(lv)); return 0; + } - if (!suspend_lvs(cmd, &lvs_changed, vg)) - goto_out; + if (test_mode()) + argv[i++] = "--dry-run"; + + if (verbose_level() >= _LOG_NOTICE) + argv[i++] = "--verbose"; + + if (yes) + argv[i++] = "--yes"; + + if (force) + argv[i++] = "--force"; + + argv[i++] = (fcmd == FSADM_CMD_RESIZE) ? 
"resize" : "check"; + + if (status) + *status = -1; + + if (dm_snprintf(lv_path, sizeof(lv_path), "%s%s/%s", cmd->dev_dir, + vg->name, lv->name) < 0) { + log_error("Couldn't create LV path for %s.", display_lvname(lv)); + return 0; + } + + argv[i++] = lv_path; + + if (fcmd == FSADM_CMD_RESIZE) { + if (dm_snprintf(size_buf, sizeof(size_buf), FMTu64 "K", + (uint64_t) extents * (vg->extent_size / 2)) < 0) { + log_error("Couldn't generate new LV size string."); + return 0; + } + + argv[i++] = size_buf; + } + + return exec_cmd(cmd, argv, status, 1); +} + +static uint32_t _adjust_amount(dm_percent_t percent, int policy_threshold, int policy_amount) +{ + if (!((50 * DM_PERCENT_1) < percent && percent <= DM_PERCENT_100) || + percent <= (policy_threshold * DM_PERCENT_1)) + return 0; /* nothing to do */ + /* + * Evaluate the minimal amount needed to get bellow threshold. + * Keep using DM_PERCENT_1 units for better precision. + * Round-up to needed percentage value + */ + policy_threshold *= (DM_PERCENT_1 / 100); + percent = (percent + policy_threshold - 1) / policy_threshold - 100; + + /* Use it if current policy amount is smaller */ + return (policy_amount < percent) ? (uint32_t) percent : (uint32_t) policy_amount; +} + +/* "amount" here is percent */ +int lv_extend_policy_calculate_percent(struct logical_volume *lv, + uint32_t *amount, uint32_t *meta_amount) +{ + struct cmd_context *cmd = lv->vg->cmd; + dm_percent_t percent; + dm_percent_t min_threshold; + int policy_threshold, policy_amount; + struct lv_status_thin_pool *thin_pool_status; + + *amount = *meta_amount = 0; + + if (lv_is_thin_pool(lv)) { + policy_threshold = + find_config_tree_int(cmd, activation_thin_pool_autoextend_threshold_CFG, + lv_config_profile(lv)); + policy_amount = + find_config_tree_int(cmd, activation_thin_pool_autoextend_percent_CFG, + lv_config_profile(lv)); + if (policy_threshold < 50) { + log_warn("WARNING: Thin pool autoextend threshold %d%% is set below " + "minimum supported 50%%.", policy_threshold); + policy_threshold = 50; + } + } else if (lv_is_vdo_pool(lv)) { + policy_threshold = + find_config_tree_int(cmd, activation_vdo_pool_autoextend_threshold_CFG, + lv_config_profile(lv)); + policy_amount = + find_config_tree_int(cmd, activation_vdo_pool_autoextend_percent_CFG, + lv_config_profile(lv)); + if (policy_threshold < 50) { + log_warn("WARNING: VDO pool autoextend threshold %d%% is set below " + "minimum supported 50%%.", policy_threshold); + policy_threshold = 50; + } + } else { + policy_threshold = + find_config_tree_int(cmd, activation_snapshot_autoextend_threshold_CFG, NULL); + policy_amount = + find_config_tree_int(cmd, activation_snapshot_autoextend_percent_CFG, NULL); + if (policy_threshold < 50) { + log_warn("WARNING: Snapshot autoextend threshold %d%% is set bellow " + "minimal supported value 50%%.", policy_threshold); + policy_threshold = 50; + } + } + + if (policy_threshold >= 100) { + log_debug("lvextend policy disabled by threshold 100"); + return 1; /* nothing to do */ + } + + if (!policy_amount) { + log_error("Can't extend %s with %s autoextend percent set to 0%%.", + display_lvname(lv), lvseg_name(first_seg(lv))); + return 0; + } + + if (lv_is_thin_pool(lv)) { + if (!lv_thin_pool_status(lv, 0, &thin_pool_status)) + goto_bad; + + /* Resize below the minimal usable value */ + min_threshold = thin_pool_metadata_min_threshold(first_seg(lv)) / DM_PERCENT_1; + *meta_amount = _adjust_amount(thin_pool_status->metadata_usage, + (min_threshold < policy_threshold) ? 
+ min_threshold : policy_threshold, policy_amount); + if (*meta_amount) + /* Compensate possible extra space consumption by kernel on resize */ + (*meta_amount)++; + percent = thin_pool_status->data_usage; + dm_pool_destroy(thin_pool_status->mem); + } else if (lv_is_vdo_pool(lv)) { + if (!lv_vdo_pool_percent(lv, &percent)) + goto_bad; + } else if (!lv_snapshot_percent(lv, &percent)) + goto_bad; + else if (!lv_is_active(lv)) { + bad: + log_error("Can't read state of locally inactive LV %s.", + display_lvname(lv)); + return 0; + } + + *amount = _adjust_amount(percent, policy_threshold, policy_amount); + + log_debug("lvextend policy calculated percentages main %u meta %u from threshold %d percent %d", + *amount, *meta_amount, policy_threshold, policy_amount); + return 1; +} + +static uint32_t _lvseg_get_stripes(struct lv_segment *seg, uint32_t *stripesize) +{ + uint32_t s; + struct lv_segment *seg_get, *seg_image, *seg_iorig; + struct logical_volume *lv_image, *lv_iorig; + + /* If segment mirrored, check if images are striped */ + if (seg_is_mirrored(seg)) { + for (s = 0; s < seg->area_count; s++) { + if (seg_type(seg, s) != AREA_LV) + continue; - if (!(r = vg_commit(vg))) + lv_image = seg_lv(seg, s); + seg_image = first_seg(lv_image); + seg_get = NULL; + + if (seg_is_integrity(seg_image)) { + /* Get stripe values from the iorig layer. */ + lv_iorig = seg_lv(seg_image, 0); + seg_iorig = first_seg(lv_iorig); + seg_get = seg_iorig; + } else { + /* Get stripe values from the image layer. */ + seg_get = seg_image; + } + + if (seg_get && seg_is_striped(seg_get)) { + seg = seg_get; + break; + } + } + } + + if (seg_is_striped(seg)) { + *stripesize = seg->stripe_size; + return seg->area_count; + } + + if (seg_is_raid(seg)) { + *stripesize = seg->stripe_size; + return _raid_stripes_count(seg); + } + + *stripesize = 0; + return 0; +} + +static int _lvresize_adjust_size(struct volume_group *vg, + uint64_t size, sign_t sign, + uint32_t *extents) +{ + uint32_t extent_size = vg->extent_size; + uint32_t adjust; + + /* + * First adjust to an exact multiple of extent size. + * When changing to an absolute size, we round that size up. + * When extending by a relative amount we round that amount up. + * When reducing by a relative amount we remove at most that amount. + */ + if ((adjust = (size % extent_size))) { + if (sign != SIGN_MINUS) /* not reducing */ + size += extent_size; + + size -= adjust; + log_print_unless_silent("Rounding size to boundary between physical extents: %s.", + display_size(vg->cmd, size)); + } + + if (!(*extents = extents_from_size(vg->cmd, size, extent_size))) + return_0; + + return 1; +} + +/* + * If percent options were used, convert them into actual numbers of extents. + * FIXME: fix cases where lp->extents is initially used as a percentage, + * and is then rewritten to be a number of extents (simply save the percent + * value elsewhere.) 
+ */ +static int _lvresize_extents_from_percent(const struct logical_volume *lv, + struct lvresize_params *lp) +{ + const struct volume_group *vg = lv->vg; + uint32_t pv_extent_count; + uint32_t old_extents = lp->extents; + + log_debug("lvresize_extents_from_percent type %d extents %u percent_value %u", + lp->percent, lp->extents, lp->percent_value); + + switch (lp->percent) { + case PERCENT_VG: + /* rewrites lp->extents from percentage to extents */ + lp->extents = percent_of_extents(lp->extents, vg->extent_count, + (lp->sign != SIGN_MINUS)); + if ((lp->sign == SIGN_NONE) && (lp->extents > (lv->le_count + vg->free_count))) { + lp->extents = lv->le_count + vg->free_count; + log_print_unless_silent("Reducing %u%%VG to remaining free space %s in VG.", + old_extents, + display_size(vg->cmd, (uint64_t)vg->extent_size * lp->extents)); + } + break; + case PERCENT_FREE: + /* rewrites lp->extents from percentage to extents */ + lp->extents = percent_of_extents(lp->extents, vg->free_count, + (lp->sign != SIGN_MINUS)); + break; + case PERCENT_LV: + if (lp->extents) { + /* rewrites lp->extents from percentage to extents */ + lp->extents = percent_of_extents(lp->extents, lv->le_count, + (lp->sign != SIGN_MINUS)); + } else if (lp->percent_value) { + old_extents = lp->percent_value; + lp->extents = percent_of_extents(lp->percent_value, lv->le_count, + (lp->sign != SIGN_MINUS)); + } + break; + case PERCENT_PVS: + if (lp->pvh != &vg->pvs) { + pv_extent_count = pv_list_extents_free(lp->pvh); + if (lp->extents) { + /* rewrites lp->extents from percentage to extents */ + lp->extents = percent_of_extents(lp->extents, pv_extent_count, + (lp->sign != SIGN_MINUS)); + } else if (lp->percent_value) { + /* lvresize has PVs args and no size of exents options */ + old_extents = lp->percent_value; + lp->extents = percent_of_extents(lp->percent_value, pv_extent_count, + (lp->sign != SIGN_MINUS)); + } + } else { + if (lp->extents) { + /* rewrites lp->extents from percentage to extents */ + lp->extents = percent_of_extents(lp->extents, vg->extent_count, + (lp->sign != SIGN_MINUS)); + } else if (lp->percent_value) { + old_extents = lp->percent_value; + lp->extents = percent_of_extents(lp->percent_value, vg->extent_count, + (lp->sign != SIGN_MINUS)); + } + } + break; + case PERCENT_ORIGIN: + if (!lv_is_cow(lv)) { + log_error("Specified LV does not have an origin LV."); + return 0; + } + lp->extents = percent_of_extents(lp->extents, origin_from_cow(lv)->le_count, + (lp->sign != SIGN_MINUS)); + break; + case PERCENT_NONE: + return 1; /* Nothing to do */ + default: + log_error(INTERNAL_ERROR "Unsupported percent type %u.", lp->percent); + return 0; + } + + if (lp->percent == PERCENT_VG || lp->percent == PERCENT_FREE || lp->percent == PERCENT_PVS) + lp->extents_are_pes = 1; + + if (lp->sign == SIGN_NONE && (lp->percent == PERCENT_VG || lp->percent == PERCENT_FREE || lp->percent == PERCENT_PVS)) + lp->approx_alloc = 1; + + if (lp->sign == SIGN_PLUS && lp->percent == PERCENT_FREE) + lp->approx_alloc = 1; + + log_verbose("Converted %" PRIu32 "%%%s into %s%" PRIu32 " %s extents.", old_extents, get_percent_string(lp->percent), + lp->approx_alloc ? "at most " : "", lp->extents, lp->extents_are_pes ? 
"physical" : "logical"); + + return 1; +} + +static int _add_pes(struct logical_volume *lv, void *data) +{ + uint32_t *pe_total = data; + struct lv_segment *seg; + uint32_t s; + + dm_list_iterate_items(seg, &lv->segments) { + for (s = 0; s < seg->area_count; s++) { + if (seg_type(seg, s) != AREA_PV) + continue; + + *pe_total += seg_pvseg(seg, s)->len; + } + } + + return 1; +} + +static uint32_t _lv_pe_count(struct logical_volume *lv) +{ + uint32_t pe_total = 0; + + /* Top-level LV first */ + if (!_add_pes(lv, &pe_total)) + stack; + + /* Any sub-LVs */ + if (!for_each_sub_lv(lv, _add_pes, &pe_total)) stack; + return pe_total; +} + +/* FIXME Avoid having variables like lp->extents mean different things at different places */ +static int _lvresize_adjust_extents(struct logical_volume *lv, + struct lvresize_params *lp, + int *matches_existing) +{ + struct volume_group *vg = lv->vg; + struct cmd_context *cmd = vg->cmd; + uint32_t logical_extents_used = 0; + uint32_t physical_extents_used = 0; + uint32_t seg_stripes = 0, seg_stripesize = 0; + uint32_t seg_mirrors = 0; + struct lv_segment *seg, *seg_last; + uint32_t sz, str; + uint32_t seg_logical_extents; + uint32_t seg_physical_extents; + uint32_t area_multiple; + uint32_t stripes_extents; + uint32_t size_rest; + uint32_t existing_logical_extents = lv->le_count; + uint32_t existing_physical_extents, saved_existing_physical_extents; + uint32_t existing_extents; + uint32_t seg_size = 0; + uint32_t new_extents; + uint64_t max_metadata_size; + thin_crop_metadata_t crop; + int reducing = 0; + + seg_last = last_seg(lv); + + if (!lp->segtype) + /* Use segment type of last segment */ + lp->segtype = seg_last->segtype; + else if (lp->segtype != seg_last->segtype) { + /* Support newseg error or zero with lastseg striped + * and newseg striped with lastseg error or zero */ + if ((segtype_is_error(lp->segtype) || segtype_is_zero(lp->segtype) || + segtype_is_striped(lp->segtype)) && + (segtype_is_striped(seg_last->segtype) || + segtype_is_error(seg_last->segtype) || segtype_is_zero(seg_last->segtype))) { + if (!lp->stripes) + lp->stripes = 1; + } else { + log_error("VolumeType does not match (%s).", lp->segtype->name); + return 0; + } + /* FIXME Support more LVs with mixed segment types */ + log_print_unless_silent("Logical volume %s is using mixing segment types %s and %s.", + display_lvname(lv), seg_last->segtype->name, lp->segtype->name); + } + + /* For virtual devices, just pretend the physical size matches. */ + existing_physical_extents = saved_existing_physical_extents = _lv_pe_count(lv); + if (!existing_physical_extents) { + existing_physical_extents = lv->le_count; + lp->extents_are_pes = 0; + } + + existing_extents = (lp->extents_are_pes) + ? existing_physical_extents : existing_logical_extents; + + /* Initial decision on whether we are extending or reducing */ + if (lp->sign == SIGN_MINUS || + (lp->sign == SIGN_NONE && (lp->extents < existing_extents))) + reducing = 1; + + /* If extending, find properties of last segment */ + if (!reducing) { + seg_mirrors = seg_is_mirrored(seg_last) ? 
lv_mirror_count(lv) : 0; + + if (!lp->mirrors && seg_mirrors) { + log_print_unless_silent("Extending %" PRIu32 " mirror images.", seg_mirrors); + lp->mirrors = seg_mirrors; + } else if ((lp->mirrors || seg_mirrors) && (lp->mirrors != seg_mirrors)) { + log_error("Cannot vary number of mirrors in LV yet."); + return 0; + } + + if (seg_is_raid10(seg_last)) { + if (!seg_mirrors) { + log_error(INTERNAL_ERROR "Missing mirror segments for %s.", + display_lvname(lv)); + return 0; + } + /* FIXME Warn if command line values are being overridden? */ + lp->stripes = seg_last->area_count / seg_mirrors; + lp->stripe_size = seg_last->stripe_size; + } else if (!(lp->stripes == 1 || (lp->stripes > 1 && lp->stripe_size))) { + /* If extending, find stripes, stripesize & size of last segment */ + /* FIXME Don't assume mirror seg will always be AREA_LV */ + /* FIXME We will need to support resize for metadata LV as well, + * and data LV could be any type (i.e. mirror)) */ + dm_list_iterate_items(seg, seg_mirrors ? &seg_lv(seg_last, 0)->segments : &lv->segments) { + /* Allow through "striped" and RAID 4/5/6/10 */ + if (!seg_is_striped(seg) && + (!seg_is_raid(seg) || seg_is_mirrored(seg)) && + !seg_is_raid10(seg)) + continue; + + sz = seg->stripe_size; + str = seg->area_count - lp->segtype->parity_devs; + + if ((seg_stripesize && seg_stripesize != sz && + sz && !lp->stripe_size) || + (seg_stripes && seg_stripes != str && !lp->stripes)) { + log_error("Please specify number of " + "stripes (-i) and stripesize (-I)"); + return 0; + } + + seg_stripesize = sz; + seg_stripes = str; + } + + if (!lp->stripes) + lp->stripes = seg_stripes; + else if (seg_is_raid(first_seg(lv)) && + (lp->stripes != seg_stripes)) { + log_error("Unable to extend \"%s\" segment type with different number of stripes.", + lvseg_name(first_seg(lv))); + return 0; + } + + if (!lp->stripe_size && lp->stripes > 1) { + if (seg_stripesize) { + log_print_unless_silent("Using stripesize of last segment %s", + display_size(cmd, (uint64_t) seg_stripesize)); + lp->stripe_size = seg_stripesize; + } else { + lp->stripe_size = + find_config_tree_int(cmd, metadata_stripesize_CFG, NULL) * 2; + log_print_unless_silent("Using default stripesize %s", + display_size(cmd, (uint64_t) lp->stripe_size)); + } + } + } + + if (lp->stripes > 1 && !lp->stripe_size) { + log_error("Stripesize for striped segment should not be 0!"); + return 0; + } + + /* Determine the amount to extend by */ + if (lp->sign == SIGN_PLUS) + seg_size = lp->extents; + else + seg_size = lp->extents - existing_extents; + + if (lv_is_vdo_pool_data(lv)) { + if (!(seg = get_only_segment_using_this_lv(lv))) + return_0; + /* Min growth is defined this way: max(1 slab, 128M + 128K (recovery journal + slab summary)) */ + new_extents = max(seg->vdo_params.slab_size_mb * 1024, UINT32_C(128 * 1024 + 128)); + new_extents *= (1024 >> SECTOR_SHIFT); /* minimal growth (~128MiB..32GiB) in sectors */ + + if (new_extents > vg->extent_size) { + /* Minimal growth in extent size units */ + new_extents = (new_extents + vg->extent_size - 1) / vg->extent_size; + + if (new_extents > seg_size) { + /* Notify user about extra increase of extension */ + log_print_unless_silent("Increasing incremention size from %s to %s to fit new VDO slab.", + display_size(cmd, (uint64_t)seg_size * vg->extent_size), + display_size(cmd, (uint64_t)new_extents * vg->extent_size)); + seg_size = new_extents; + } + } + } + + /* Convert PEs to LEs */ + if (lp->extents_are_pes && !seg_is_striped(seg_last) && !seg_is_virtual(seg_last)) { + 
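+		/* Example (derived from the arithmetic below): for a 2-way
+		 * mirror (area_count 2, no parity, area_multiple 1) a request
+		 * for 10 physical extents becomes 10 * 1 / 2 = 5 logical
+		 * extents; the second step then rounds down to a whole
+		 * multiple of area_multiple so raid segments keep complete
+		 * stripes. */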
area_multiple = _calc_area_multiple(seg_last->segtype, seg_last->area_count, 0); + seg_size = seg_size * area_multiple / (seg_last->area_count - seg_last->segtype->parity_devs); + seg_size = (seg_size / area_multiple) * area_multiple; + } + + if (seg_size >= (MAX_EXTENT_COUNT - existing_logical_extents)) { + log_error("Unable to extend %s by %u logical extents: exceeds limit (%u).", + display_lvname(lv), seg_size, MAX_EXTENT_COUNT); + return 0; + } + + lp->extents = existing_logical_extents + seg_size; + + /* Don't allow a cow to grow larger than necessary. */ + if (lv_is_cow(lv)) { + logical_extents_used = cow_max_extents(origin_from_cow(lv), find_snapshot(lv)->chunk_size); + if (logical_extents_used < lp->extents) { + log_print_unless_silent("Reached maximum COW size %s (%" PRIu32 " extents).", + display_size(vg->cmd, (uint64_t) vg->extent_size * logical_extents_used), + logical_extents_used); + lp->extents = logical_extents_used; // CHANGES lp->extents + seg_size = lp->extents - existing_logical_extents; // Recalculate + if (lp->extents == existing_logical_extents) { + /* Signal that normal resizing is not required */ + lp->size_changed = 1; + return 1; + } + } + } else if (lv_is_thin_pool_metadata(lv)) { + if (!(seg = get_only_segment_using_this_lv(lv))) + return_0; + + max_metadata_size = get_thin_pool_max_metadata_size(cmd, lv_config_profile(lv), &crop); + + if (((uint64_t)lp->extents * vg->extent_size) > max_metadata_size) { + lp->extents = (max_metadata_size + vg->extent_size - 1) / vg->extent_size; + log_print_unless_silent("Reached maximum pool metadata size %s (%" PRIu32 " extents).", + display_size(vg->cmd, max_metadata_size), lp->extents); + } + + if (existing_logical_extents >= lp->extents) + lp->extents = existing_logical_extents; + + crop = get_thin_pool_crop_metadata(cmd, crop, (uint64_t)lp->extents * vg->extent_size); + + if (seg->crop_metadata != crop) { + seg->crop_metadata = crop; + seg->lv->status |= LV_CROP_METADATA; + /* Crop change require reload even if there no size change */ + lp->size_changed = 1; + log_print_unless_silent("Thin pool will use metadata without cropping."); + } + + if (!(seg_size = lp->extents - existing_logical_extents)) + return 1; /* No change in metadata size */ + } + } else { + /* If reducing, find stripes, stripesize & size of last segment */ + + if (lp->sign == SIGN_MINUS) { + if (lp->extents >= existing_extents) { + log_error("Unable to reduce %s below 1 extent.", + display_lvname(lv)); + return 0; + } + new_extents = existing_extents - lp->extents; + } else + new_extents = lp->extents; + + dm_list_iterate_items(seg, &lv->segments) { + seg_logical_extents = seg->len; + seg_physical_extents = seg->area_len * seg->area_count; /* FIXME Also metadata, cow etc. */ + + /* Check for underlying stripe sizes */ + seg_stripes = _lvseg_get_stripes(seg, &seg_stripesize); + + if (seg_is_mirrored(seg)) + seg_mirrors = lv_mirror_count(seg->lv); + else + seg_mirrors = 0; + + /* Have we reached the final segment of the new LV? 
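+			 * i.e. does the requested new size end inside this
+			 * segment?  If so, the final lp->extents value is set
+			 * below and the scan stops.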
*/ + if (lp->extents_are_pes) { + if (new_extents <= physical_extents_used + seg_physical_extents) { + seg_size = new_extents - physical_extents_used; + if (seg_mirrors) + seg_size /= seg_mirrors; + lp->extents = logical_extents_used + seg_size; + break; + } + } else if (new_extents <= logical_extents_used + seg_logical_extents) { + seg_size = new_extents - logical_extents_used; + lp->extents = new_extents; + break; + } + + logical_extents_used += seg_logical_extents; + physical_extents_used += seg_physical_extents; + } + + lp->stripe_size = seg_stripesize; + lp->stripes = seg_stripes; + lp->mirrors = seg_mirrors; + } + + /* At this point, lp->extents should hold the correct NEW logical size required. */ + + if (!lp->extents) { + log_error("New size of 0 not permitted."); + return 0; + } + + if ((lp->extents == existing_logical_extents) && !lp->use_policies) { + log_print_unless_silent("New size (%d extents) matches existing size (%d extents).", + lp->extents, existing_logical_extents); + if (lp->resize == LV_ANY) + lp->resize = LV_EXTEND; /* lets pretend zero size extension */ + *matches_existing = 1; + return 1; + } + + /* Perform any rounding to produce complete stripes. */ + if (lp->stripes > 1) { + if (lp->stripe_size < STRIPE_SIZE_MIN) { + log_error("Invalid stripe size %s.", + display_size(cmd, (uint64_t) lp->stripe_size)); + return 0; + } + + /* Segment size in extents must be divisible by stripes */ + stripes_extents = lp->stripes; + if (lp->stripe_size > vg->extent_size) + /* Strip size is bigger then extent size needs more extents */ + stripes_extents *= (lp->stripe_size / vg->extent_size); + + size_rest = seg_size % stripes_extents; + /* Round toward the original size. */ + if (size_rest && + ((lp->extents < existing_logical_extents) || + !lp->percent || + (vg->free_count >= (lp->extents - existing_logical_extents - size_rest + + stripes_extents)))) { + log_print_unless_silent("Rounding size (%d extents) up to stripe " + "boundary size for segment (%d extents).", + lp->extents, + lp->extents - size_rest + stripes_extents); + lp->extents = lp->extents - size_rest + stripes_extents; + } else if (size_rest) { + log_print_unless_silent("Rounding size (%d extents) down to stripe " + "boundary size for segment (%d extents)", + lp->extents, lp->extents - size_rest); + lp->extents = lp->extents - size_rest; + } + } + + /* Final sanity checking */ + if (lp->extents < existing_logical_extents) { + if (lp->resize == LV_EXTEND) { + log_error("New size given (%d extents) not larger " + "than existing size (%d extents)", + lp->extents, existing_logical_extents); + return 0; + } + lp->resize = LV_REDUCE; + } else if (lp->extents > existing_logical_extents) { + if (lp->resize == LV_REDUCE) { + log_error("New size given (%d extents) not less than " + "existing size (%d extents)", lp->extents, + existing_logical_extents); + return 0; + } + lp->resize = LV_EXTEND; + } else if ((lp->extents == existing_logical_extents) && !lp->use_policies) { + log_print_unless_silent("New size (%d extents) matches existing size (%d extents)", + lp->extents, existing_logical_extents); + if (lp->resize == LV_ANY) + lp->resize = LV_EXTEND; + *matches_existing = 1; + return 1; + } + /* - * FIXME: resume LVs in reverse order to prevent memory - * lock imbalance when resuming virtual snapshot origin - * (resume of snapshot resumes origin too) + * Has the user specified that they would like the additional + * extents of a mirror not to have an initial sync? 
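+	 * (When lp->nosync is set, LV_NOTSYNCED is applied so the newly
+	 * added mirror extents are not resynchronised.)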
*/ - dm_list_iterate_back_items(lvlp, &lvs_changed) - if (!resume_lv(cmd, lvlp->lv)) - stack; -out: - backup(vg); - return r; + if ((lp->extents > existing_logical_extents)) { + if (seg_is_mirrored(first_seg(lv)) && lp->nosync) + lv->status |= LV_NOTSYNCED; + } + + log_debug("New size for %s: %" PRIu32 ". Existing logical extents: %" PRIu32 " / physical extents: %" PRIu32 ".", + display_lvname(lv), lp->extents, existing_logical_extents, saved_existing_physical_extents); + + return 1; +} + +static int _lv_reduce_vdo_discard(struct cmd_context *cmd, + struct logical_volume *lv, + struct lvresize_params *lp) +{ + char name[PATH_MAX]; + struct device *dev; + struct volume_group *vg = lv->vg; + + /* FIXME: stop using dev-cache and struct device here, dev-cache + should only be used for scanning headers/metadata to find PVs. */ + + if (dm_snprintf(name, sizeof(name), "%s%s/%s", cmd->dev_dir, + vg->name, lv->name) < 0) { + log_error("Name too long - device not discarded (%s)", lv->name); + return 0; + } + + if (!(dev = dev_cache_get(cmd, name, NULL))) { + log_error("%s: not found: device not discarded.", name); + return 0; + } + + if (!dev_discard_max_bytes(cmd->dev_types, dev) || + !dev_discard_granularity(cmd->dev_types, dev)) { + log_error("%s: max bytes and granularity query fails.", name); + dev_destroy_file(dev); + return 0; + } + + log_warn("WARNING: %s: Discarding %s at offset " FMTu64 ", please wait...", + name, display_size(cmd, (uint64_t)(lv->le_count - lp->extents) * vg->extent_size), + ((uint64_t)lp->extents * vg->extent_size) << SECTOR_SHIFT); + + if (!dev_discard_blocks(dev, ((uint64_t)lp->extents * vg->extent_size) << SECTOR_SHIFT, + ((uint64_t)(lv->le_count - lp->extents) * vg->extent_size) << SECTOR_SHIFT)) { + log_error("%s: discarding failed.", name); + dev_destroy_file(dev); + return 0; + } + + dev_destroy_file(dev); + return 1; +} + +static int _lv_resize_check_type(struct logical_volume *lv, + struct lvresize_params *lp) +{ + struct lv_segment *seg; + + if (lv_is_origin(lv)) { + if (lp->resize == LV_REDUCE) { + log_error("Snapshot origin volumes cannot be reduced in size yet."); + return 0; + } + + if (lv_is_active(lv)) { + log_error("Snapshot origin volumes can be resized " + "only while inactive: try lvchange -an."); + return 0; + } + } + + if (lv_is_raid_image(lv) || lv_is_raid_metadata(lv)) { + log_error("Cannot resize a RAID %s directly for %s", + lv_is_raid_image(lv) ? "image" : "metadata area", + display_lvname(lv)); + return 0; + } + + seg = first_seg(lv); + if ((seg_is_raid4(seg) || seg_is_any_raid5(seg)) && seg->area_count < 3) { + log_error("Cannot resize %s LV %s. 
Convert to more stripes first.", + lvseg_name(seg), display_lvname(lv)); + return 0; + } + + if (lp->resize == LV_REDUCE) { + if (lv_is_thin_pool_data(lv)) { + log_error("Thin pool volumes %s cannot be reduced in size yet.", + display_lvname(lv)); + return 0; + } + if (lv_is_thin_pool_metadata(lv)) { + log_error("Thin pool metadata volumes cannot be reduced."); + return 0; + } + if (lv_is_vdo_pool_data(lv)) { + log_error("Cannot reduce VDO pool data volume %s.", + display_lvname(lv)); + return 0; + } + if (lv_is_writecache(lv)) { + /* TODO: detect kernel with support for reduction */ + log_error("Reduce not yet allowed on LVs with writecache attached."); + return 0; + } + if (lv_is_raid(lv)) { + unsigned attrs = 0; + const struct segment_type *segtype = first_seg(lv)->segtype; + + if (!segtype->ops->target_present || + !segtype->ops->target_present(lv->vg->cmd, NULL, &attrs) || + !(attrs & RAID_FEATURE_SHRINK)) { + log_error("RAID module does not support shrinking."); + return 0; + } + } + if (lv_is_integrity(lv) || lv_raid_has_integrity(lv)) { + log_error("Cannot reduce LV with integrity."); + return 0; + } + } else if (lp->resize == LV_EXTEND) { + if (lv_is_thin_pool_metadata(lv) && + (!(seg = find_pool_seg(first_seg(lv))) || + !thin_pool_feature_supported(seg->lv, THIN_FEATURE_METADATA_RESIZE))) { + log_error("Support for online metadata resize of %s not detected.", + display_lvname(lv)); + return 0; + } + + /* Validate thin target supports bigger size of thin volume then external origin */ + if (lv_is_thin_volume(lv) && first_seg(lv)->external_lv && + (lp->extents > first_seg(lv)->external_lv->le_count) && + !thin_pool_feature_supported(first_seg(lv)->pool_lv, THIN_FEATURE_EXTERNAL_ORIGIN_EXTEND)) { + log_error("Thin target does not support external origin smaller then thin volume."); + return 0; + } + } + + /* Prevent resizing on out-of-sync reshapable raid */ + if (first_seg(lv)->reshape_len && !lv_raid_in_sync(lv)) { + log_error("Can't resize reshaping LV %s.", display_lvname(lv)); + return 0; + } + + if ((lp->resize == LV_REDUCE) && (lp->pvh != &lv->vg->pvs)) + log_print_unless_silent("Ignoring PVs on command line when reducing."); + + return 1; +} + +static int _lv_resize_volume(struct logical_volume *lv, + struct lvresize_params *lp, + struct dm_list *pvh) +{ + struct volume_group *vg = lv->vg; + struct cmd_context *cmd = vg->cmd; + uint32_t old_extents; + alloc_policy_t alloc = lp->alloc ? : lv->alloc; + + old_extents = lv->le_count; + log_verbose("%sing logical volume %s to %s%s", + (lp->resize == LV_REDUCE) ? "Reduc" : "Extend", + display_lvname(lv), lp->approx_alloc ? 
"up to " : "", + display_size(cmd, (uint64_t) lp->extents * vg->extent_size)); + + if (lp->resize == LV_REDUCE) { + if (!lv_reduce(lv, lv->le_count - lp->extents)) + return_0; + } else if ((lp->extents > lv->le_count) && /* Ensure we extend */ + !lv_extend(lv, lp->segtype, + lp->stripes, lp->stripe_size, + lp->mirrors, first_seg(lv)->region_size, + lp->extents - lv->le_count, + pvh, alloc, lp->approx_alloc)) + return_0; + + if (old_extents == lv->le_count) + log_print_unless_silent("Size of logical volume %s unchanged from %s (%" PRIu32 " extents).", + display_lvname(lv), + display_size(cmd, (uint64_t) old_extents * vg->extent_size), old_extents); + else { + lp->size_changed = 1; + log_print_unless_silent("Size of logical volume %s changed from %s (%" PRIu32 " extents) to %s (%" PRIu32 " extents).", + display_lvname(lv), + display_size(cmd, (uint64_t) old_extents * vg->extent_size), old_extents, + display_size(cmd, (uint64_t) lv->le_count * vg->extent_size), lv->le_count); + } + + return 1; +} + +static int _lv_resize_adjust_size(struct logical_volume *lv, + struct lvresize_params *lp, + int *matches_existing) +{ + /* Resolve extents from size */ + if (lp->size) { + if (!_lvresize_adjust_size(lv->vg, lp->size, lp->sign, &lp->extents)) + return_0; + } + + /* set lp->extents based on lp->percent_value */ + else if (lp->percent_value) { + if (!_lvresize_extents_from_percent(lv, lp)) + return_0; + } + + /* rewrites lp->extents from percentage to extents */ + else if (lp->extents && (lp->percent != PERCENT_NONE)) { + if (!_lvresize_extents_from_percent(lv, lp)) + return_0; + } + + /* Ensure stripe boundary extents! */ + if (!lp->percent && lv_is_raid(lv)) + lp->extents =_round_to_stripe_boundary(lv->vg, lp->extents, + seg_is_raid1(first_seg(lv)) ? 0 : _raid_stripes_count(first_seg(lv)), + lp->resize == LV_REDUCE ? 0 : 1); + + if (!_lvresize_adjust_extents(lv, lp, matches_existing)) + return_0; + + return 1; +} + +/* Set thin pool metadata properties, we can't use those from command line */ +static void _setup_params_for_extend_metadata(struct logical_volume *lv, + struct lvresize_params *lp) +{ + struct lv_segment *mseg = last_seg(lv); + + lp->alloc = lv->alloc; + lp->percent = PERCENT_NONE; + lp->segtype = mseg->segtype; + lp->mirrors = seg_is_mirrored(mseg) ? lv_mirror_count(lv) : 0; + lp->fsopt[0] = '\0'; + lp->stripes = lp->mirrors ? 
mseg->area_count / lp->mirrors : 0; + lp->stripe_size = mseg->stripe_size; +} + + +static int _lv_resize_check_used(struct logical_volume *lv) +{ + if (!lv) { + log_error(INTERNAL_ERROR "LV is not specified."); + return 0; + } + + if (lv_is_locked(lv)) { + log_error("Can't resize locked logical volume %s.", display_lvname(lv)); + return 0; + } + + if (lv_is_converting(lv)) { + log_error("Can't resize logical volume %s while lvconvert in progress.", display_lvname(lv)); + return 0; + } + + if (lv_component_is_active(lv)) { + log_error("Cannot resize logical volume %s with active component LV(s).", display_lvname(lv)); + return 0; + } + + if (lv_is_raid_with_tracking(lv)) { + log_error("Cannot resize logical volume %s while it is tracking a split image.", display_lvname(lv)); + return 0; + } + + if (lv_is_vdo(lv) && !lv_is_active(lv)) { + log_error("Cannot resize inactive VDO logical volume %s.", display_lvname(lv)); + return 0; + } + + if (lv_is_vdo_pool(lv) && !lv_is_active(lv_lock_holder(lv))) { + log_error("Cannot resize inactive VDO POOL volume %s.", display_lvname(lv)); + return 0; + } + + if (lv_is_external_origin(lv)) { + /* + * Since external-origin can be activated read-only, + * there is no way to use extended areas. + */ + log_error("Cannot resize external origin logical volume %s.", + display_lvname(lv)); + return 0; + } + + return 1; } /* - * Core of LV renaming routine. - * VG must be locked by caller. + * --fs checksize: check fs size and allow the lv to reduce if the fs is not + * using the affected space, i.e. the fs does not need to be + * resized. fail the command without reducing the fs or lv if + * the fs is using the affected space. + * + * --fs resize --fsmode manage: resize the fs, mounting/unmounting the fs + * as needed, but avoiding mounting/unmounted when possible. + * + * --fs resize --fsmode nochange: resize the fs without changing the current + * mount/unmount state. fail the command without reducing the + * fs or lv if the fs resize would require mounting or unmounting. + * + * --fs resize --fsmode offline: resize the fs only while it's unmounted + * unmounting the fs if needed. fail the commandn without + * reducing the fs or lv if the fs resize would require having + * the fs mounted. + * + * --fs resize_fsadm: old method using fsadm script to do everything */ -int lv_rename(struct cmd_context *cmd, struct logical_volume *lv, - const char *new_name) +static int _fs_reduce_allow(struct cmd_context *cmd, struct logical_volume *lv, + struct lvresize_params *lp, uint64_t newsize_bytes_lv, + uint64_t newsize_bytes_fs, struct fs_info *fsi) { - return lv_rename_update(cmd, lv, new_name, 1); + const char *fs_reduce_cmd = ""; + const char *cmp_desc = ""; + int equal = 0, smaller = 0, larger = 0; + int is_ext_fstype = 0; + int confirm_mount_change = 0; + + /* + * Allow reducing the LV for other fs types if the fs is not using + * space that's being reduced. + */ + if (!strcmp(fsi->fstype, "ext2") || + !strcmp(fsi->fstype, "ext3") || + !strcmp(fsi->fstype, "ext4") || + !strcmp(fsi->fstype, "xfs")) { + log_debug("Found fs %s last_byte %llu newsize_bytes_fs %llu", + fsi->fstype, + (unsigned long long)fsi->fs_last_byte, + (unsigned long long)newsize_bytes_fs); + if (!strncmp(fsi->fstype, "ext", 3)) { + is_ext_fstype = 1; + fs_reduce_cmd = " resize2fs"; + } + } + + if (!fsi->mounted) + log_print_unless_silent("File system %s%s found on %s.", + fsi->fstype, fsi->needs_crypt ? 
"+crypto_LUKS" : "", + display_lvname(lv)); + else + log_print_unless_silent("File system %s%s found on %s mounted at %s.", + fsi->fstype, fsi->needs_crypt ? "+crypto_LUKS" : "", + display_lvname(lv), fsi->mount_dir); + + if (!fsi->fs_last_byte) { + if (!strcmp(fsi->fstype, "reiserfs")) { + log_error("File system reduce for reiserfs requires --fs resize_fsadm."); + return 0; + } + log_error("File system device usage is not available from libblkid."); + return 0; + } + + if ((equal = (fsi->fs_last_byte == newsize_bytes_fs))) + cmp_desc = "equal to"; + else if ((smaller = (fsi->fs_last_byte < newsize_bytes_fs))) + cmp_desc = "smaller than"; + else if ((larger = (fsi->fs_last_byte > newsize_bytes_fs))) + cmp_desc = "larger than"; + + log_print_unless_silent("File system size (%s) is %s the requested size (%s).", + display_size(cmd, fsi->fs_last_byte/512), cmp_desc, + display_size(cmd, newsize_bytes_fs/512)); + + /* + * FS reduce is not needed, it's not using the affected space. + */ + if (smaller || equal) { + log_print_unless_silent("File system reduce is not needed, skipping."); + fsi->needs_reduce = 0; + return 1; + } + + /* + * FS reduce is required, but checksize does not allow it. + */ + if (!strcmp(lp->fsopt, "checksize")) { + if (is_ext_fstype) + log_error("File system reduce is required (see resize2fs or --resizefs.)"); + else + log_error("File system reduce is required and not supported (%s).", fsi->fstype); + return 0; + } + + /* + * FS reduce required, ext* supports it, xfs does not. + */ + if (is_ext_fstype) { + log_print_unless_silent("File system reduce is required using resize2fs."); + } else if (!strcmp(fsi->fstype, "reiserfs")) { + log_error("File system reduce for reiserfs requires --fs resize_fsadm."); + return 0; + } else { + log_error("File system reduce is required and not supported (%s).", fsi->fstype); + return 0; + } + + /* + * Set fstype-specific requirements for running fs resize command. + * ext2,3,4 require the fs to be unmounted to shrink with resize2fs, + * and they require e2fsck to be run first, unless resize2fs -f is used. + */ + if (is_ext_fstype) { + /* it's traditional to run fsck before shrink */ + if (!lp->nofsck) + fsi->needs_fsck = 1; + + /* ext2,3,4 require fs to be unmounted to shrink */ + if (fsi->mounted) + fsi->needs_unmount = 1; + + fsi->needs_reduce = 1; + } else { + /* + * Shouldn't reach here since no other fs types get this far. + * A future fs supporting shrink may require the fs to be + * mounted or unmounted to run the fs shrink command. + * set fsi->needs_unmount or fs->needs_mount according to + * the fs-specific shrink command's requirement. + */ + log_error("File system %s: fs reduce not implemented.", fsi->fstype); + return 0; + } + + /* + * FS reduce may require mounting or unmounting, check the fsopt value + * from the user, and the current mount state to decide if fs resize + * can be done. 
+ */ + if (!strcmp(lp->fsopt, "resize") && !strcmp(lp->fsmode, "nochange")) { + /* can't mount|unmount to run fs resize */ + if (fsi->needs_mount) { + log_error("File system needs to be mounted to reduce fs (see --fsmode)."); + return 0; + } + if (fsi->needs_unmount) { + log_error("File system needs to be unmounted to reduce fs (see --fsmode)."); + return 0; + } + } else if (!strcmp(lp->fsopt, "resize") && !strcmp(lp->fsmode, "offline")) { + /* we can unmount if needed to run fs resize */ + if (fsi->needs_mount) { + log_error("File system needs to be mounted to reduce fs (see --fsmode)."); + return 0; + } + } else if (!strcmp(lp->fsopt, "resize") && !strcmp(lp->fsmode, "manage")) { + /* we can mount|unmount as needed to run fs resize */ + /* confirm mount change unless --fsmode manage is set explicitly */ + + if (fsi->needs_mount || fsi->needs_unmount) + confirm_mount_change = 1; + + if (lp->user_set_fsmode) + confirm_mount_change = 0; + } else { + log_error("Unknown file system resize options: --fs %s --fsmode %s", lp->fsopt, lp->fsmode); + return 0; + } + + /* + * If future file systems can be reduced while mounted, then suppress + * needs_fsck here if the fs is already mounted. + */ + + if (fsi->needs_unmount) + log_print_unless_silent("File system unmount is needed for reduce."); + if (fsi->needs_fsck) + log_print_unless_silent("File system fsck will be run before reduce."); + if (fsi->needs_mount) + log_print_unless_silent("File system mount is needed for reduce."); + if (fsi->needs_crypt) + log_print_unless_silent("cryptsetup resize is needed for reduce."); + + /* + * Use a confirmation prompt because mount|unmount is needed, and + * no specific --fsmode was set (i.e. the user did not give specific + * direction about how to handle mounting|unmounting with --fsmode.) + */ + if (!lp->yes && confirm_mount_change) { + if (yes_no_prompt("Continue with %s file system reduce steps:%s%s%s%s%s? [y/n]:", + fsi->fstype, + fsi->needs_unmount ? " unmount," : "", + fsi->needs_fsck ? " fsck," : "", + fsi->needs_mount ? " mount," : "", + fsi->needs_crypt ? " cryptsetup," : "", + fsi->needs_reduce ? fs_reduce_cmd : "") == 'n') { + log_error("File system not reduced."); + return 0; + } + } + + return 1; +} + +static int _fs_extend_allow(struct cmd_context *cmd, struct logical_volume *lv, + struct lvresize_params *lp, struct fs_info *fsi) +{ + const char *fs_extend_cmd = ""; + int is_ext_fstype = 0; + int confirm_mount_change = 0; + + if (!strcmp(fsi->fstype, "ext2") || + !strcmp(fsi->fstype, "ext3") || + !strcmp(fsi->fstype, "ext4") || + !strcmp(fsi->fstype, "xfs")) { + log_debug("Found fs %s last_byte %llu", + fsi->fstype, (unsigned long long)fsi->fs_last_byte); + if (!strncmp(fsi->fstype, "ext", 3)) + is_ext_fstype = 1; + } else if (!strcmp(fsi->fstype, "reiserfs")) { + log_error("File system extend for reiserfs requires --fs resize_fsadm."); + return 0; + } else { + log_error("File system extend is not supported (%s).", fsi->fstype); + return 0; + } + + if (!fsi->mounted) + log_print_unless_silent("File system %s%s found on %s.", + fsi->fstype, fsi->needs_crypt ? "+crypto_LUKS" : "", + display_lvname(lv)); + else + log_print_unless_silent("File system %s%s found on %s mounted at %s.", + fsi->fstype, fsi->needs_crypt ? "+crypto_LUKS" : "", + display_lvname(lv), fsi->mount_dir); + + /* + * FS extend may require mounting or unmounting, check the fsopt value + * from the user, and the current mount state to decide if fs extend + * can be done. 
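+	 * ext2/3/4 can be extended while mounted or unmounted (fsck is run
+	 * first when it is unmounted), while xfs_growfs only works on a
+	 * mounted fs, so "--fs resize --fsmode offline" is always refused
+	 * for xfs.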
+ */ + + if (is_ext_fstype) { + fs_extend_cmd = " resize2fs"; + + /* + * ext* can be extended while it's mounted or unmounted. If + * the fs is unmounted, it's traditional to run fsck before + * running the fs extend. + * + * --fs resize --fsmode nochange: don't change mount condition. + * if mounted: fs_extend + * if unmounted: fsck, fs_extend + * + * --fs resize --fsmode offline: extend offline, so unmount first if mounted. + * if mounted: unmount, fsck, fs_extend + * if unmounted: fsck, fs_extend + * + * --fs resize --fsmode manage: do any mount or unmount that's necessary, + * avoiding unnecessary mounting/unmounting. + * if mounted: fs_extend + * if unmounted: fsck, fs_extend + */ + if (!strcmp(lp->fsopt, "resize") && !strcmp(lp->fsmode, "nochange")) { + if (fsi->mounted) + fsi->needs_extend = 1; + else if (fsi->unmounted) { + fsi->needs_fsck = 1; + fsi->needs_extend = 1; + } + } else if (!strcmp(lp->fsopt, "resize") && !strcmp(lp->fsmode, "offline")) { + if (fsi->mounted) { + fsi->needs_unmount = 1; + fsi->needs_fsck = 1; + fsi->needs_extend = 1; + } else if (fsi->unmounted) { + fsi->needs_fsck = 1; + fsi->needs_extend = 1; + } + } else if (!strcmp(lp->fsopt, "resize") && !strcmp(lp->fsmode, "manage")) { + if (fsi->mounted) + fsi->needs_extend = 1; + else if (fsi->unmounted) { + fsi->needs_fsck = 1; + fsi->needs_extend = 1; + } + } + + if (lp->nofsck) + fsi->needs_fsck = 0; + + } else if (!strcmp(fsi->fstype, "xfs")) { + fs_extend_cmd = " xfs_growfs"; + + /* + * xfs must be mounted to extend. + * + * --fs resize --fsmode nochange: don't change mount condition. + * if mounted: fs_extend + * if unmounted: fail + * + * --fs resize --fsmode offline: extend offline, so unmount first if mounted. + * if mounted: fail + * if unmounted: fail + * + * --fs resize --fsmode manage: do any mount or unmount that's necessary, + * avoiding unnecessary mounting/unmounting. + * if mounted: fs_extend + * if unmounted: mount, fs_extend + */ + if (!strcmp(lp->fsopt, "resize") && !strcmp(lp->fsmode, "nochange")) { + if (fsi->mounted) + fsi->needs_extend = 1; + else if (fsi->unmounted) { + log_error("File system must be mounted to extend (see --fsmode)."); + return 0; + } + } else if (!strcmp(lp->fsopt, "resize") && !strcmp(lp->fsmode, "offline")) { + log_error("File system must be mounted to extend (see --fsmode)."); + return 0; + } else if (!strcmp(lp->fsopt, "resize") && !strcmp(lp->fsmode, "manage")) { + if (fsi->mounted) + fsi->needs_extend = 1; + else if (fsi->unmounted) { + fsi->needs_mount = 1; + fsi->needs_extend = 1; + } + } + + } else { + /* shouldn't reach here */ + log_error("File system type %s not handled.", fsi->fstype); + return 0; + } + + /* + * Skip needs_fsck if the fs is mounted and we can extend the fs while + * it's mounted. + */ + if (fsi->mounted && !fsi->needs_unmount && fsi->needs_fsck) { + log_print_unless_silent("File system fsck skipped for extending mounted fs."); + fsi->needs_fsck = 0; + } + + if (fsi->needs_unmount) + log_print_unless_silent("File system unmount is needed for extend."); + if (fsi->needs_fsck) + log_print_unless_silent("File system fsck will be run before extend."); + if (fsi->needs_mount) + log_print_unless_silent("File system mount is needed for extend."); + if (fsi->needs_crypt) + log_print_unless_silent("cryptsetup resize is needed for extend."); + + /* + * Use a confirmation prompt when mount|unmount is needed if + * the user did not give specific direction about how to handle + * mounting|unmounting with --fsmode. 
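+	 * e.g. extending an unmounted xfs with "--fs resize" needs the fs
+	 * mounted first; without an explicit --fsmode the user is asked
+	 * before the mount is done, and --yes skips the question.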
+ */ + if (!strcmp(lp->fsopt, "resize") && !lp->user_set_fsmode && + (fsi->needs_mount || fsi->needs_unmount)) + confirm_mount_change = 1; + + if (!lp->yes && confirm_mount_change) { + if (yes_no_prompt("Continue with %s file system extend steps:%s%s%s%s%s? [y/n]:", + fsi->fstype, + fsi->needs_unmount ? " unmount," : "", + fsi->needs_fsck ? " fsck," : "", + fsi->needs_mount ? " mount," : "", + fsi->needs_crypt ? " cryptsetup," : "", + fsi->needs_extend ? fs_extend_cmd : "") == 'n') { + log_error("File system not extended."); + return 0; + } + } + + return 1; +} + +static int _fs_reduce(struct cmd_context *cmd, struct logical_volume *lv, + struct lvresize_params *lp) +{ + struct fs_info fsinfo; + struct fs_info fsinfo2; + uint64_t newsize_bytes_lv; + uint64_t newsize_bytes_fs; + int ret = 0; + + memset(&fsinfo, 0, sizeof(fsinfo)); + memset(&fsinfo2, 0, sizeof(fsinfo)); + + if (!fs_get_info(cmd, lv, &fsinfo, 1)) + goto_out; + + if (fsinfo.nofs) { + ret = 1; + goto_out; + } + + /* extent_size units is SECTOR_SIZE (512) */ + newsize_bytes_lv = lp->extents * lv->vg->extent_size * SECTOR_SIZE; + newsize_bytes_fs = newsize_bytes_lv; + + /* + * If needs_crypt, then newsize_bytes passed to fs_reduce_script() and + * crypt_resize_script() needs to be decreased by the offset of crypt + * data on the LV (usually the size of the LUKS header which is usually + * 2MB for LUKS1 and 16MB for LUKS2.) + */ + if (fsinfo.needs_crypt) { + newsize_bytes_fs -= fsinfo.crypt_offset_bytes; + log_print_unless_silent("File system size %llub is adjusted for crypt data offset %ub.", + (unsigned long long)newsize_bytes_fs, fsinfo.crypt_offset_bytes); + } + + /* + * Based on the --fs command option, the fs type, the last block used, + * and the mount state, determine if LV reduce is allowed. If not + * returns 0 and lvreduce should fail. If allowed, returns 1 and sets + * fsinfo.needs_* for any steps that are required to reduce the LV. + */ + if (!_fs_reduce_allow(cmd, lv, lp, newsize_bytes_lv, newsize_bytes_fs, &fsinfo)) + goto_out; + + /* + * Uncommon special case in which the FS does not need to be shrunk, + * but the crypt dev over the LV should be shrunk to correspond with + * the LV size, so that the FS does not see an incorrect device size. + */ + if (!fsinfo.needs_reduce && fsinfo.needs_crypt) { + /* Check if the crypt device is already sufficiently reduced. */ + if (fsinfo.crypt_dev_size_bytes <= newsize_bytes_fs) { + log_print_unless_silent("crypt device is already reduced to %llu bytes.", + (unsigned long long)fsinfo.crypt_dev_size_bytes); + ret = 1; + goto out; + } + if (!strcmp(lp->fsopt, "checksize")) { + log_error("crypt reduce is required (see --resizefs or cryptsetup resize.)"); + ret = 0; + goto out; + } + if (test_mode()) { + ret = 1; + goto_out; + } + ret = crypt_resize_script(cmd, lv, &fsinfo, newsize_bytes_fs); + goto out; + } + + /* + * fs reduce is not needed to reduce the LV. 
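+	 * (_fs_reduce_allow() cleared needs_reduce because the fs already
+	 * ends at or below the new size, so only the LV itself is reduced.)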
+ */ + if (!fsinfo.needs_reduce) { + ret = 1; + goto_out; + } + + if (test_mode()) { + if (fsinfo.needs_unmount) + log_print_unless_silent("Skip unmount in test mode."); + if (fsinfo.needs_fsck) + log_print_unless_silent("Skip fsck in test mode."); + if (fsinfo.needs_mount) + log_print_unless_silent("Skip mount in test mode."); + if (fsinfo.needs_crypt) + log_print_unless_silent("Skip cryptsetup in test mode."); + log_print_unless_silent("Skip fs reduce in test mode."); + ret = 1; + goto out; + } + + /* + * mounting, unmounting, fsck, and shrink command can all take a long + * time to run, and this lvm command should not block other lvm + * commands from running during that time, so release the vg lock + * around the long-running steps, and reacquire after. + */ + unlock_vg(cmd, lv->vg, lv->vg->name); + + if (!fs_reduce_script(cmd, lv, &fsinfo, newsize_bytes_fs, lp->fsmode)) + goto_out; + + if (!lock_vol(cmd, lv->vg->name, LCK_VG_WRITE, NULL)) { + log_error("Failed to lock VG, cannot reduce LV."); + ret = 0; + goto out; + } + + /* + * Check that the vg wasn't changed while it was unlocked. + * (can_use_one_scan: check just one mda in the vg for changes) + */ + cmd->can_use_one_scan = 1; + if (scan_text_mismatch(cmd, lv->vg->name, NULL)) { + log_print_unless_silent("VG was changed during fs operations, restarting."); + lp->vg_changed_error = 1; + ret = 0; + goto out; + } + + /* + * Re-check the fs last block which should now be less than the + * requested (reduced) LV size. + */ + if (!fs_get_info(cmd, lv, &fsinfo2, 0)) + goto_out; + + if (fsinfo.fs_last_byte && (fsinfo2.fs_last_byte > newsize_bytes_fs)) { + log_error("File system last byte %llu is greater than new size %llu bytes.", + (unsigned long long)fsinfo2.fs_last_byte, + (unsigned long long)newsize_bytes_fs); + goto_out; + } + + ret = 1; + out: + return ret; +} + +static int _fs_extend(struct cmd_context *cmd, struct logical_volume *lv, + struct lvresize_params *lp) +{ + struct fs_info fsinfo; + uint64_t newsize_bytes_lv; + uint64_t newsize_bytes_fs; + int ret = 0; + + memset(&fsinfo, 0, sizeof(fsinfo)); + + if (!fs_get_info(cmd, lv, &fsinfo, 1)) + goto_out; + + if (fsinfo.nofs) { + ret = 1; + goto_out; + } + + /* + * Note: here in the case of extend, newsize_bytes_lv/newsize_bytes_fs + * are only calculated and used for log messages. The extend commands + * do not use these values, they just extend to the new LV size that + * is visible to them. + */ + + /* extent_size units is SECTOR_SIZE (512) */ + newsize_bytes_lv = lp->extents * lv->vg->extent_size * SECTOR_SIZE; + newsize_bytes_fs = newsize_bytes_lv; + if (fsinfo.needs_crypt) { + newsize_bytes_fs -= fsinfo.crypt_offset_bytes; + log_print_unless_silent("File system size %llub is adjusted for crypt data offset %ub.", + (unsigned long long)newsize_bytes_fs, fsinfo.crypt_offset_bytes); + } + + /* + * Decide if fs should be extended based on the --fs option, + * the fs type and the mount state. 
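+	 * _fs_extend_allow() sets fsinfo.needs_* for any unmount, fsck,
+	 * mount, cryptsetup or extend step that is required, or fails if
+	 * the requested --fs/--fsmode combination cannot be satisfied.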
+ */ + if (!_fs_extend_allow(cmd, lv, lp, &fsinfo)) + goto_out; + + /* + * fs extend is not needed + */ + if (!fsinfo.needs_extend) { + ret = 1; + goto_out; + } + + if (test_mode()) { + if (fsinfo.needs_unmount) + log_print_unless_silent("Skip unmount in test mode."); + if (fsinfo.needs_fsck) + log_print_unless_silent("Skip fsck in test mode."); + if (fsinfo.needs_mount) + log_print_unless_silent("Skip mount in test mode."); + if (fsinfo.needs_crypt) + log_print_unless_silent("Skip cryptsetup in test mode."); + log_print_unless_silent("Skip fs extend in test mode."); + ret = 1; + goto out; + } + + /* + * mounting, unmounting and extend command can all take a long + * time to run, and this lvm command should not block other lvm + * commands from running during that time, so release the vg + * lock around the long-running steps. + */ + unlock_vg(cmd, lv->vg, lv->vg->name); + + if (!fs_extend_script(cmd, lv, &fsinfo, newsize_bytes_fs, lp->fsmode)) + goto_out; + + ret = 1; + out: + return ret; +} + +int lv_resize(struct cmd_context *cmd, struct logical_volume *lv, + struct lvresize_params *lp) +{ + struct lvresize_params lp_meta; + struct volume_group *vg = lv->vg; + struct lv_segment *seg = first_seg(lv); + struct logical_volume *lv_top = NULL; + struct logical_volume *lv_main = NULL; + struct logical_volume *lv_meta = NULL; + struct logical_volume *lv_main_layer = NULL; + struct logical_volume *lv_meta_layer = NULL; + int main_size_matches = 0; + int meta_size_matches = 0; + int is_extend = (lp->resize == LV_EXTEND); + int is_reduce = (lp->resize == LV_REDUCE); + int is_active = 0; + int activated = 0; + int activated_checksize = 0; + int status; + int ret = 0; + + memset(&lp_meta, 0, sizeof(lp_meta)); + + /* + * Some checks apply to the LV command arg (don't require top/bottom + * LVs in a stack), and don't require knowing if the command is doing + * extend or reduce (determined later). + */ + + if (lp->stripe_size && !_validate_stripesize(vg, lp)) + return_0; + + /* + * The only types of !visible/internal/non-top LVs that can be directly + * resized via the command arg. Other internal LVs are resized + * indirectly when resizing a top LV. + */ + if (!lv_is_visible(lv) && + !lv_is_thin_pool_data(lv) && + !lv_is_thin_pool_metadata(lv) && + !lv_is_vdo_pool_data(lv) && + !lv_is_lockd_sanlock_lv(lv)) { + log_error("Can't resize internal logical volume %s.", display_lvname(lv)); + return 0; + } + + /* + * Figure out which LVs are going to be extended, and set params + * to the requested extents/size for each. Some LVs are extended + * only by extending an underlying LV. Extending some top level + * LVs results in extending multiple underlying LVs. + * + * lv_top is the top level LV in stack. + * lv_main is the main LV to be resized. + * lv_meta is always a thin pool metadata LV. + * + * lv_main_layer/lv_meta_layer may be LV types (like cache) that are + * layered over the main/meta LVs. These layer LVs are skipped over + * by get_resizable_layer_lv() which finds the bottom-most layer + * which is originally resized. The layer LVs are resized indirectly + * as a result of the lower data-holding LVs being resized. 
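+	 * For example, when the LV being resized has a cache attached,
+	 * lv_main_layer is the cache LV and lv_main is the underlying
+	 * data LV that actually gains or loses extents.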
+ * + * In the simplest case there is no layering/stacking, and + * lv == lv_main == lv_main_layer == lv_top + */ + + if (cmd->command_enum == lvextend_policy_CMD) { + /* lvextend --use-policies may extend main or meta or both */ + lv_top = lv; + if (lv_is_thin_pool(lv)) { + if (lp->policy_percent_main) { + lv_main = seg_lv(first_seg(lv), 0); /* thin pool data */ + lp->percent_value = lp->policy_percent_main; + } + if (lp->policy_percent_meta) { + lv_meta = first_seg(lv)->metadata_lv; /* thin pool metadata */ + _setup_params_for_extend_metadata(lv_meta, &lp_meta); + /* override setup function which isn't right for policy use */ + lp_meta.percent = PERCENT_LV; + lp_meta.sign = SIGN_PLUS; + lp_meta.percent_value = lp->policy_percent_meta; + lp_meta.pvh = lp->pvh; + } + } else if (lv_is_vdo_pool(lv)) { + lv_main = seg_lv(first_seg(lv), 0); /* vdo pool data */ + lp->percent_value = lp->policy_percent_main; + } else if (lv_is_cow(lv)) { + lv_main = lv; + lp->percent_value = lp->policy_percent_main; + } else + return_0; + + } else if ((cmd->command_enum == lvextend_pool_metadata_CMD) || + (cmd->command_enum == lvresize_pool_metadata_CMD)) { + /* lvresize|lvextend --poolmetadatasize, extends only thin pool metadata */ + if (lv_is_thin_pool(lv)) { + lv_top = lv; + lv_meta = first_seg(lv)->metadata_lv; /* thin pool metadata */ + } else if (lv_is_thin_pool_metadata(lv)) { + lv_top = _get_top_layer_lv(lv); /* thin pool LV */ + lv_meta = lv; + } else { + log_error("--poolmetadatasize can be used only with thin pools."); + return 0; + } + lp_meta = *lp; + _setup_params_for_extend_metadata(lv_meta, &lp_meta); + lp_meta.size = lp->poolmetadata_size; + lp_meta.sign = lp->poolmetadata_sign; + lp->poolmetadata_size = 0; + lp->poolmetadata_sign = 0; + + } else if (lv_is_thin_pool(lv) && lp->poolmetadata_size) { + /* extend both thin pool data and metadata */ + lv_top = lv; + lv_main = seg_lv(first_seg(lv), 0); /* thin pool data */ + lv_meta = first_seg(lv)->metadata_lv; /* thin pool metadata */ + lp_meta = *lp; + _setup_params_for_extend_metadata(lv_meta, &lp_meta); + lp_meta.size = lp->poolmetadata_size; + lp_meta.sign = lp->poolmetadata_sign; + lp->poolmetadata_size = 0; + lp->poolmetadata_sign = 0; + + } else if (lv_is_thin_pool_metadata(lv)) { + /* extend only thin pool metadata */ + lv_top = _get_top_layer_lv(lv); /* thin pool LV */ + lv_meta = lv; + lp_meta = *lp; + _setup_params_for_extend_metadata(lv_meta, &lp_meta); + if (lp->poolmetadata_size) { + lp_meta.size = lp->poolmetadata_size; + lp_meta.size = lp->poolmetadata_sign; + lp->poolmetadata_size = 0; + lp->poolmetadata_sign = 0; + } + /* else lp_meta.extents|size from lp->extents|size above */ + + } else if (lv_is_thin_pool(lv)) { + /* extend thin pool data and possibly metadata */ + lv_top = lv; + lv_main = seg_lv(first_seg(lv), 0); + /* Do not set lv_meta to the thin pool metadata here. + See below "Possibly enable lv_meta extend". */ + } + + /* + * None of the special cases above (selecting which LVs to extend + * depending on options set and type of LV) have applied, so this + * is the standard case. + */ + if (!lv_main && !lv_meta) { + lv_top = _get_top_layer_lv(lv); + lv_main_layer = lv; + lv_main = _get_resizable_layer_lv(lv_main_layer); + } else { + lv_main_layer = lv_main; + lv_meta_layer = lv_meta; + if (lv_main) + lv_main = _get_resizable_layer_lv(lv_main_layer); + if (lv_meta) + lv_meta = _get_resizable_layer_lv(lv_meta_layer); + } + /* Clear layer variables if no layer exists. 
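+	 * (i.e. _get_resizable_layer_lv() returned the same LV, so there is
+	 * no separate layer LV above the one being resized.)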
*/ + if (lv_main_layer == lv_main) + lv_main_layer = NULL; + if (lv_meta_layer == lv_meta) + lv_meta_layer = NULL; + + /* + * LVs to work with are now determined: + * lv_top is always set, it is not used to resize, but is used + * to reload dm devices for the lv. + * If lv_main is set, it is resized. + * If lv_meta is set, it is resized. + * If lv_meta is not set, it may be set below and resized. + */ + + if (!_lv_resize_check_used(lv_top)) + return_0; + if (lv_main && (lv_main != lv_top) && !_lv_resize_check_used(lv_main)) + return_0; + + /* + * Set a new size for lv_main. + */ + if (lv_main) { + /* sets lp extents and lp resize */ + if (!_lv_resize_adjust_size(lv_main, lp, &main_size_matches)) + return_0; + /* sanity check the result of adjust_size */ + if (lp->extents == 0) + return_0; + /* adjust_size resolves LV_ANY to EXTEND|REDUCE */ + if (lp->resize == LV_ANY) + return_0; + if (is_extend && (lp->resize != LV_EXTEND)) + return_0; + if (is_reduce && (lp->resize != LV_REDUCE)) + return_0; + is_extend = (lp->resize == LV_EXTEND); + is_reduce = (lp->resize == LV_REDUCE); + + if (!_lv_resize_check_type(lv_main, lp)) + return_0; + } + + /* + * Possibly enable lv_meta extend if not already enabled. If lv_meta + * for a thin pool is not already being extended, and user requested + * extending the thin pool, then we may need to automatically include + * extending lv_meta in addition to lv_main (data), so that the + * metadata size is sufficient for the extended data size. + * + * If specific PVs were named to extend, this is taken to mean that + * only the thin pool data should be extended (using those PVs), and + * the thin pool metadata should not be automatically extended (since + * it would likely want to be extended using different PVs.) + */ + if (lv_is_thin_pool(lv_top) && is_extend && lv_main && !lv_meta && (&vg->pvs == lp->pvh)) { + struct lv_segment *tpseg = first_seg(lv_top); + uint64_t meta_size = estimate_thin_pool_metadata_size(lp->extents, vg->extent_size, tpseg->chunk_size); + if (meta_size > tpseg->metadata_lv->size) { + log_verbose("Extending thin pool metadata to %llu for larger data", (unsigned long long)meta_size); + lv_meta = tpseg->metadata_lv; + lp_meta = *lp; + _setup_params_for_extend_metadata(lv_meta, &lp_meta); + lp_meta.size = meta_size; + lp_meta.sign = SIGN_NONE; + /* meta may have a layer over it */ + lv_meta_layer = lv_meta; + lv_meta = _get_resizable_layer_lv(lv_meta_layer); + if (lv_meta == lv_meta_layer) + lv_meta_layer = NULL; + } + } + + /* + * Set a new size for lv_meta (extend only.) + */ + if (lv_meta) { + /* sets lp extents and lp resize */ + if (!_lv_resize_adjust_size(lv_meta, &lp_meta, &meta_size_matches)) + return_0; + /* sanity check the result of adjust_size */ + if (lp_meta.extents == 0) + return_0; + /* adjust_size resolves lp_meta.resize to EXTEND|REDUCE */ + /* _lv_resize_check_type errors if resize is EXTEND for thin meta */ + if (!_lv_resize_check_type(lv_meta, &lp_meta)) + return_0; + } + + /* + * No resizing is needed. + */ + if ((main_size_matches && meta_size_matches) || + (main_size_matches && !lv_meta) || + (meta_size_matches && !lv_main)) { + log_error("No size change."); + return 0; + } + + /* + * If the LV is locked due to being active, this lock call is a no-op. + * Otherwise, this acquires a transient lock on the lv (not PERSISTENT) + */ + if (!lockd_lv_resize(cmd, lv_top, "ex", 0, lp)) + return_0; + + /* + * Active 'hidden' -tpool can be waiting for resize, but the pool LV + * itself might be inactive. 
Here plain suspend/resume would not work. + * So active temporarily pool LV (with on disk metadata) then use + * suspend and resume and deactivate pool LV, instead of searching for + * an active thin volume. + * + * FIXME: why are thin pools activated where other LV types return + * error if inactive? + */ + if (lv_is_thin_pool(lv_top) && !lv_is_active(lv_top)) { + if (!activation()) { + log_error("Cannot activate to resize %s without using device-mapper kernel driver.", + display_lvname(lv_top)); + return 0; + } + if (!activate_lv(cmd, lv_top)) { + log_error("Failed to activate %s.", display_lvname(lv_top)); + return 0; + } + if (!sync_local_dev_names(cmd)) + stack; + activated = 1; + } + + /* + * Disable fsopt checksize for lvextend. + */ + if (is_extend && !strcmp(lp->fsopt, "checksize")) + lp->fsopt[0] = '\0'; + + /* + * Disable fsopt if LV type cannot hold a file system. + */ + if (lp->fsopt[0] && + !(lv_is_linear(lv) || lv_is_striped(lv) || lv_is_raid(lv) || + lv_is_mirror(lv) || lv_is_thin_volume(lv) || lv_is_vdo(lv) || + lv_is_cache(lv) || lv_is_writecache(lv))) { + log_print_unless_silent("Ignoring fs resizing options for LV type %s.", + seg ? seg->segtype->name : "unknown"); + lp->fsopt[0] = '\0'; + } + + /* + * Using an option to resize the fs has always/traditionally required + * the LV to already be active, so keep that behavior. Reducing an + * inactive LV will activate the LV to look for a fs that would be + * damaged. + */ + is_active = lv_is_active(lv_top); + + if (is_reduce && !is_active && !strcmp(lp->fsopt, "checksize")) { + if (!lp->user_set_fs) { + log_error("The LV must be active to safely reduce (see --fs options.)"); + goto out; + } + lv_top->status |= LV_TEMPORARY; + if (!activate_lv(cmd, lv_top)) { + log_error("Failed to activate %s to check for fs.", display_lvname(lv_top)); + goto out; + } + lv_top->status &= ~LV_TEMPORARY; + if (!sync_local_dev_names(cmd)) + stack; + activated_checksize = 1; + + } else if (lp->fsopt[0] && !is_active) { + log_error("Logical volume %s must be active for file system %s.", + display_lvname(lv_top), lp->fsopt); + goto out; + } + + /* + * Return an error without resizing the LV if the user requested + * a file system resize when no file system exists on the LV. + * (fs checksize does not require a fs to exist.) + */ + if (lp->fsopt[0] && strcmp(lp->fsopt, "checksize") && lp->user_set_fs) { + char lv_path[PATH_MAX]; + char fstype[FSTYPE_MAX]; + int nofs = 0; + + if (dm_snprintf(lv_path, sizeof(lv_path), "%s%s/%s", cmd->dev_dir, + lv_top->vg->name, lv_top->name) < 0) { + log_error("Couldn't create LV path for %s.", display_lvname(lv_top)); + goto out; + } + if (!fs_block_size_and_type(lv_path, NULL, fstype, &nofs) || nofs) { + log_error("File system not found for --resizefs or --fs options."); + goto out; + } + if (!strcmp(fstype, "crypto_LUKS") && !lv_crypt_is_active(cmd, lv_path)) { + log_error("LUKS dm-crypt device must be active for fs resize."); + goto out; + } + /* FS utils will fail if LVs were renamed while mounted. */ + if (fs_mount_state_is_misnamed(cmd, lv_top, lv_path, fstype)) + goto_out; + } + + /* + * Warn and confirm if checksize has been disabled for reduce. + */ + if (is_reduce && !lp->fsopt[0] && !_lv_reduce_confirmation(lv_top, lp)) + goto_out; + + /* Part of old approach to fs handling using fsadm. 
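+	 * With --fs resize_fsadm the external fsadm script performs the fs
+	 * check here and the fs resize further below, instead of the
+	 * built-in fs handling.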
*/ + if (!strcmp(lp->fsopt, "resize_fsadm") && !lp->nofsck && + !_fsadm_cmd(FSADM_CMD_CHECK, lv_top, 0, lp->yes, lp->force, &status)) { + if (status != FSADM_CHECK_FAILS_FOR_MOUNTED) { + log_error("Filesystem check failed."); + goto out; + } + } + + if (is_reduce && lp->fsopt[0]) { + if (!strcmp(lp->fsopt, "resize_fsadm")) { + /* Old approach to fs handling using fsadm. */ + if (!_fsadm_cmd(FSADM_CMD_RESIZE, lv_top, lp->extents, lp->yes, lp->force, NULL)) { + log_error("Filesystem resize failed."); + goto out; + } + } else { + /* New approach to fs handling using fs info. */ + if (!_fs_reduce(cmd, lv_top, lp)) + goto_out; + } + + if (activated_checksize && !deactivate_lv(cmd, lv_top)) + log_warn("Problem deactivating %s.", display_lvname(lv_top)); + } + + /* + * Send DISCARD/TRIM to reduced area of VDO volumes + * TODO: enable thin and provide + * TODO2: we need polling method + */ + if (is_reduce && lv_is_vdo(lv_top) && !_lv_reduce_vdo_discard(cmd, lv_top, lp)) + goto_out; + + /* + * Remove any striped raid reshape space for LV resizing (not common). + */ + if (lv_meta && first_seg(lv_meta)->reshape_len && !lv_raid_free_reshape_space(lv_meta)) + goto_out; + if (lv_main && first_seg(lv_main)->reshape_len && !lv_raid_free_reshape_space(lv_main)) + goto_out; + + /* + * The core of the actual lv resizing. + * Allocate or free extents in the VG, adjust LV segments to reflect + * new requested size, write VG metadata, reload the dm device stack + * (reload from the top LV.) Do lv_meta first. + * When extending lv_meta, also extend (or create) the pool's spare + * meta lv to match the size of lv_meta (only do this when the + * command is not limited to allocating from specific PVs.) + */ + + if (!lv_meta) + goto do_main; + if (!_lv_resize_volume(lv_meta, &lp_meta, lp->pvh)) + goto_out; + if (!lp_meta.size_changed) + goto do_main; + if ((&vg->pvs == lp->pvh) && !handle_pool_metadata_spare(vg, 0, lp->pvh, 1)) + stack; + if (!lv_update_and_reload(lv_top)) + goto_out; + log_debug("Resized thin pool metadata %s to %u extents.", display_lvname(lv_meta), lp_meta.extents); + + do_main: + + if (!lv_main) + goto end_main; + if (!_lv_resize_volume(lv_main, lp, lp->pvh)) + goto_out; + if (!lp->size_changed) + goto_out; + if (!lv_update_and_reload(lv_top)) + goto_out; + log_debug("Resized %s to %u extents.", display_lvname(lv_main), lp->extents); + + end_main: + + /* + * other maintenance: + * - update lvm pool metadata (drop messages). + * - print warnings about overprovisioning. + * - stop monitoring cow snapshot larger than origin + */ + if (lv_is_thin_pool(lv_top)) { + if (!update_thin_pool_lv(lv_top, 1)) + goto_out; + } + if (lv_is_thin_type(lv_top) && is_extend) + thin_pool_check_overprovisioning(lv_top); + + if (lv_main && lv_is_cow_covering_origin(lv_main)) { + if (!monitor_dev_for_events(cmd, lv_main, 0, 0)) + stack; + } + + if (is_extend && lp->fsopt[0]) { + if (!strcmp(lp->fsopt, "resize_fsadm")) { + /* Old approach to fs handling using fsadm. */ + if (!_fsadm_cmd(FSADM_CMD_RESIZE, lv_top, lp->extents, lp->yes, lp->force, NULL)) { + log_error("File system extend error."); + lp->extend_fs_error = 1; + goto out; + } + } else { + /* New approach to fs handling using fs info. 
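+	 * (_fs_extend() examines the fs with libblkid and performs any
+	 * unmount, fsck, mount or cryptsetup steps needed before growing
+	 * the fs.)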
*/ + if (!_fs_extend(cmd, lv_top, lp)) { + log_error("File system extend error."); + lp->extend_fs_error = 1; + goto out; + } + } + } + + ret = 1; + + out: + if (activated || activated_checksize) { + if (!sync_local_dev_names(cmd)) + stack; + if (!deactivate_lv(cmd, lv_top)) + log_warn("Problem deactivating %s.", display_lvname(lv_top)); + } + + return ret; } char *generate_lv_name(struct volume_group *vg, const char *format, char *buffer, size_t len) { struct lv_list *lvl; + struct glv_list *glvl; int high = -1, i; dm_list_iterate_items(lvl, &vg->lvs) { @@ -3036,26 +7162,114 @@ char *generate_lv_name(struct volume_group *vg, const char *format, high = i; } + dm_list_iterate_items(glvl, &vg->historical_lvs) { + if (sscanf(glvl->glv->historical->name, format, &i) != 1) + continue; + + if (i > high) + high = i; + } + if (dm_snprintf(buffer, len, format, high + 1) < 0) return NULL; return buffer; } -int vg_max_lv_reached(struct volume_group *vg) +struct generic_logical_volume *get_or_create_glv(struct dm_pool*mem, struct logical_volume *lv, int *glv_created) { - if (!vg->max_lv) - return 0; + struct generic_logical_volume *glv; + + if (!(glv = lv->this_glv)) { + if (!(glv = dm_pool_zalloc(mem, sizeof(struct generic_logical_volume)))) { + log_error("Failed to allocate generic logical volume structure."); + return NULL; + } + glv->live = lv; + lv->this_glv = glv; + if (glv_created) + *glv_created = 1; + } else if (glv_created) + *glv_created = 0; + + return glv; +} + +struct glv_list *get_or_create_glvl(struct dm_pool *mem, struct logical_volume *lv, int *glv_created) +{ + struct glv_list *glvl; + + if (!(glvl = dm_pool_zalloc(mem, sizeof(struct glv_list)))) { + log_error("Failed to allocate generic logical volume list item."); + return NULL; + } - if (vg->max_lv > vg_visible_lvs(vg)) + if (!(glvl->glv = get_or_create_glv(mem, lv, glv_created))) { + dm_pool_free(mem, glvl); + return_NULL; + } + + return glvl; +} + +int add_glv_to_indirect_glvs(struct dm_pool *mem, + struct generic_logical_volume *origin_glv, + struct generic_logical_volume *glv) +{ + struct glv_list *glvl; + + if (!(glvl = dm_pool_zalloc(mem, sizeof(struct glv_list)))) { + log_error("Failed to allocate generic volume list item " + "for indirect glv %s", glv->is_historical ? glv->historical->name + : glv->live->name); return 0; + } - log_verbose("Maximum number of logical volumes (%u) reached " - "in volume group %s", vg->max_lv, vg->name); + glvl->glv = glv; + + if (glv->is_historical) + glv->historical->indirect_origin = origin_glv; + else + first_seg(glv->live)->indirect_origin = origin_glv; + + if (origin_glv) { + if (origin_glv->is_historical) + dm_list_add(&origin_glv->historical->indirect_glvs, &glvl->list); + else + dm_list_add(&origin_glv->live->indirect_glvs, &glvl->list); + } return 1; } +int remove_glv_from_indirect_glvs(struct generic_logical_volume *origin_glv, + struct generic_logical_volume *glv) +{ + struct glv_list *glvl, *tglvl; + struct dm_list *list = origin_glv->is_historical ? &origin_glv->historical->indirect_glvs + : &origin_glv->live->indirect_glvs; + + dm_list_iterate_items_safe(glvl, tglvl, list) { + if (glvl->glv != glv) + continue; + + dm_list_del(&glvl->list); + + if (glvl->glv->is_historical) + glvl->glv->historical->indirect_origin = NULL; + else + first_seg(glvl->glv->live)->indirect_origin = NULL; + + return 1; + } + + log_error(INTERNAL_ERROR "%s logical volume %s is not a user of %s.", + glv->is_historical ? "historical" : "Live", + glv->is_historical ? 
glv->historical->name : glv->live->name, + origin_glv->is_historical ? origin_glv->historical->name : origin_glv->live->name); + return 0; +} + struct logical_volume *alloc_lv(struct dm_pool *mem) { struct logical_volume *lv; @@ -3065,12 +7279,11 @@ struct logical_volume *alloc_lv(struct dm_pool *mem) return NULL; } - lv->snapshot = NULL; dm_list_init(&lv->snapshot_segs); dm_list_init(&lv->segments); dm_list_init(&lv->tags); dm_list_init(&lv->segs_using_this_lv); - dm_list_init(&lv->rsites); + dm_list_init(&lv->indirect_glvs); return lv; } @@ -3087,6 +7300,7 @@ struct logical_volume *lv_create_empty(const char *name, struct format_instance *fi = vg->fid; struct logical_volume *lv; char dname[NAME_LEN]; + int historical; if (vg_max_lv_reached(vg)) stack; @@ -3096,9 +7310,12 @@ struct logical_volume *lv_create_empty(const char *name, log_error("Failed to generate unique name for the new " "logical volume"); return NULL; - } else if (find_lv_in_vg(vg, name)) { + } + + if (lv_name_is_used_in_vg(vg, name, &historical)) { log_error("Unable to create LV %s in Volume Group %s: " - "name already in use.", name, vg->name); + "name already in use%s.", name, vg->name, + historical ? " by historical LV" : ""); return NULL; } @@ -3126,10 +7343,13 @@ struct logical_volume *lv_create_empty(const char *name, if (!lv_set_creation(lv, NULL, 0)) goto_bad; - + if (fi->fmt->ops->lv_setup && !fi->fmt->ops->lv_setup(fi, lv)) goto_bad; - + + if (vg->fid->fmt->features & FMT_CONFIG_PROFILE) + lv->profile = vg->cmd->profile_params->global_metadata_profile; + return lv; bad: dm_pool_free(vg->vgmem, lv); @@ -3144,9 +7364,9 @@ static int _add_pvs(struct cmd_context *cmd, struct pv_segment *peg, /* Don't add again if it's already on list. */ if (find_pv_in_pv_list(&spvs->pvs, peg->pv)) - return 1; + return 1; - if (!(pvl = dm_pool_alloc(cmd->mem, sizeof(*pvl)))) { + if (!(pvl = dm_pool_zalloc(cmd->mem, sizeof(*pvl)))) { log_error("pv_list allocation failed"); return 0; } @@ -3159,20 +7379,35 @@ static int _add_pvs(struct cmd_context *cmd, struct pv_segment *peg, } /* - * Construct dm_list of segments of LVs showing which PVs they use. - * For pvmove we use the *parent* LV so we can pick up stripes & existing mirrors etc. + * build_parallel_areas_from_lv + * @lv + * @use_pvmove_parent_lv + * @create_single_list + * + * For each segment in an LV, create a list of PVs used by the segment. + * Thus, the returned list is really a list of segments (seg_pvs) + * containing a list of PVs that are in use by that segment. + * + * use_pvmove_parent_lv: For pvmove we use the *parent* LV so we can + * pick up stripes & existing mirrors etc. + * create_single_list : Instead of creating a list of segments that + * each contain a list of PVs, return a list + * containing just one segment (i.e. seg_pvs) + * that contains a list of all the PVs used by + * the entire LV and all it's segments. 
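 *
 * Minimal usage sketch (hypothetical caller, not part of this patch):
 * build the single-list form and log every PV the LV touches:
 *
 *     struct dm_list *pa;
 *     struct seg_pvs *spvs;
 *     struct pv_list *pvl;
 *
 *     if (!(pa = build_parallel_areas_from_lv(lv, 0, 1)))
 *             return_0;
 *
 *     dm_list_iterate_items(spvs, pa)
 *             dm_list_iterate_items(pvl, &spvs->pvs)
 *                     log_debug("%s uses %s.", display_lvname(lv),
 *                               pv_dev_name(pvl->pv));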
*/ struct dm_list *build_parallel_areas_from_lv(struct logical_volume *lv, - unsigned use_pvmove_parent_lv) + unsigned use_pvmove_parent_lv, + unsigned create_single_list) { struct cmd_context *cmd = lv->vg->cmd; struct dm_list *parallel_areas; - struct seg_pvs *spvs; + struct seg_pvs *spvs = NULL; uint32_t current_le = 0; uint32_t raid_multiple; struct lv_segment *seg = first_seg(lv); - if (!(parallel_areas = dm_pool_alloc(cmd->mem, sizeof(*parallel_areas)))) { + if (!(parallel_areas = dm_pool_alloc(lv->vg->vgmem, sizeof(*parallel_areas)))) { log_error("parallel_areas allocation failed"); return NULL; } @@ -3180,19 +7415,20 @@ struct dm_list *build_parallel_areas_from_lv(struct logical_volume *lv, dm_list_init(parallel_areas); do { - if (!(spvs = dm_pool_zalloc(cmd->mem, sizeof(*spvs)))) { - log_error("allocation failed"); - return NULL; - } - - dm_list_init(&spvs->pvs); + if (!spvs || !create_single_list) { + if (!(spvs = dm_pool_zalloc(lv->vg->vgmem, sizeof(*spvs)))) { + log_error("allocation failed"); + return NULL; + } + dm_list_init(&spvs->pvs); + dm_list_add(parallel_areas, &spvs->list); + } spvs->le = current_le; spvs->len = lv->le_count - current_le; - dm_list_add(parallel_areas, &spvs->list); - - if (use_pvmove_parent_lv && !(seg = find_seg_by_le(lv, current_le))) { + if (use_pvmove_parent_lv && + !(seg = find_seg_by_le(lv, current_le))) { log_error("Failed to find segment for %s extent %" PRIu32, lv->name, current_le); return 0; @@ -3213,38 +7449,18 @@ struct dm_list *build_parallel_areas_from_lv(struct logical_volume *lv, seg->area_count - seg->segtype->parity_devs : 1; } while ((current_le * raid_multiple) < lv->le_count); - /* FIXME Merge adjacent segments with identical PV lists (avoids need for contiguous allocation attempts between successful allocations) */ - - return parallel_areas; -} - -int link_lv_to_vg(struct volume_group *vg, struct logical_volume *lv) -{ - struct lv_list *lvl; - - if (vg_max_lv_reached(vg)) - stack; - - if (!(lvl = dm_pool_zalloc(vg->vgmem, sizeof(*lvl)))) - return_0; - - lvl->lv = lv; - lv->vg = vg; - dm_list_add(&vg->lvs, &lvl->list); - - return 1; -} - -int unlink_lv_from_vg(struct logical_volume *lv) -{ - struct lv_list *lvl; - - if (!(lvl = find_lv_in_vg(lv->vg, lv->name))) - return_0; + if (create_single_list) { + spvs->le = 0; + spvs->len = lv->le_count; + } - dm_list_del(&lvl->list); + /* + * FIXME: Merge adjacent segments with identical PV lists + * (avoids need for contiguous allocation attempts between + * successful allocations) + */ - return 1; + return parallel_areas; } void lv_set_visible(struct logical_volume *lv) @@ -3254,7 +7470,7 @@ void lv_set_visible(struct logical_volume *lv) lv->status |= VISIBLE_LV; - log_debug("LV %s in VG %s is now visible.", lv->name, lv->vg->name); + log_debug_metadata("LV %s in VG %s is now visible.", lv->name, lv->vg->name); } void lv_set_hidden(struct logical_volume *lv) @@ -3264,19 +7480,79 @@ void lv_set_hidden(struct logical_volume *lv) lv->status &= ~VISIBLE_LV; - log_debug("LV %s in VG %s is now hidden.", lv->name, lv->vg->name); + log_debug_metadata("LV %s in VG %s is now hidden.", lv->name, lv->vg->name); +} + +static int _lv_remove_check_in_use(struct logical_volume *lv, force_t force) +{ + struct volume_group *vg = lv->vg; + const char *volume_type = ""; + char buffer[50 + NAME_LEN * 2] = ""; + int active; + int issue_discards = + (vg->cmd->current_settings.issue_discards && + !lv_is_thin_volume(lv) && + !lv_is_vdo(lv) && + !lv_is_virtual_origin(lv)) ? 
1 : 0; + + switch (lv_check_not_in_use(lv, 1)) { + case 2: /* Not active, prompt when discarding real LVs */ + if (!issue_discards || + lv_is_historical(lv)) + return 1; + active = 0; + break; + case 1: /* Active, not in use, prompt when visible */ + if (!lv_is_visible(lv) || + lv_is_pending_delete(lv)) + return 1; + active = 1; + break; + default: /* Active, in use, can't remove */ + return_0; + } + + if (force == PROMPT) { + if (vg->needs_write_and_commit && (!vg_write(vg) || !vg_commit(vg))) + return_0; + + if (lv_is_origin(lv)) { + volume_type = " origin"; + (void) dm_snprintf(buffer, sizeof(buffer), " with %u snapshots(s)", + lv->origin_count); + } else if (lv_is_merging_origin(lv)) { + volume_type = " merging origin"; + (void) dm_snprintf(buffer, sizeof(buffer), " with snapshot %s", + display_lvname(find_snapshot(lv)->lv)); + } + + if (yes_no_prompt("Do you really want to remove%s%s%s%s " + "logical volume %s%s? [y/n]: ", + issue_discards ? " and DISCARD" : "", + active ? " active" : "", + vg_is_clustered(vg) ? " clustered" : "", + volume_type, display_lvname(lv), + buffer) == 'n') { + lv->to_remove = 0; + log_error("Logical volume %s not removed.", display_lvname(lv)); + return 0; + } + } + + return 1; } int lv_remove_single(struct cmd_context *cmd, struct logical_volume *lv, - const force_t force) + force_t force, int suppress_remove_message) { struct volume_group *vg; - struct lvinfo info; - struct logical_volume *format1_origin = NULL; - int format1_reload_required = 0; - int visible; + int visible, historical; struct logical_volume *pool_lv = NULL; - int ask_discard; + struct logical_volume *lock_lv = lv; + struct lv_segment *cache_seg = NULL; + struct seg_list *sl; + struct lv_segment *seg = first_seg(lv); + char msg[NAME_LEN + 300], *msg_dup; vg = lv->vg; @@ -3284,227 +7560,450 @@ int lv_remove_single(struct cmd_context *cmd, struct logical_volume *lv, return_0; if (lv_is_origin(lv)) { - log_error("Can't remove logical volume \"%s\" under snapshot", - lv->name); + log_error("Can't remove logical volume %s under snapshot.", + display_lvname(lv)); return 0; } - if (lv->status & MIRROR_IMAGE) { - log_error("Can't remove logical volume %s used by a mirror", - lv->name); + if (lv_is_external_origin(lv)) { + log_error("Can't remove external origin logical volume %s.", + display_lvname(lv)); return 0; } - if (lv->status & MIRROR_LOG) { - log_error("Can't remove logical volume %s used as mirror log", - lv->name); + if (lv_is_mirror_image(lv)) { + log_error("Can't remove logical volume %s used by a mirror.", + display_lvname(lv)); return 0; } - if (lv->status & (RAID_META | RAID_IMAGE)) { - log_error("Can't remove logical volume %s used as RAID device", - lv->name); + if (lv_is_mirror_log(lv)) { + log_error("Can't remove logical volume %s used as mirror log.", + display_lvname(lv)); return 0; } - if (lv_is_thin_pool_data(lv) || lv_is_thin_pool_metadata(lv)) { - log_error("Can't remove logical volume %s used by a thin pool.", - lv->name); + if (lv_is_raid_metadata(lv) || lv_is_raid_image(lv)) { + log_error("Can't remove logical volume %s used as RAID device.", + display_lvname(lv)); return 0; - } else if (lv_is_thin_volume(lv)) - pool_lv = first_seg(lv)->pool_lv; + } - if (lv->status & LOCKED) { - log_error("Can't remove locked LV %s", lv->name); + if (lv_is_thin_pool_data(lv) || lv_is_thin_pool_metadata(lv) || + lv_is_cache_pool_data(lv) || lv_is_cache_pool_metadata(lv)) { + log_error("Can't remove logical volume %s used by a pool.", + display_lvname(lv)); return 0; } - /* FIXME Ensure 
not referred to by another existing LVs */ - ask_discard = find_config_tree_bool(cmd, - "devices/issue_discards", DEFAULT_ISSUE_DISCARDS); + if (lv_is_thin_volume(lv)) { + if (!(pool_lv = first_seg(lv)->pool_lv)) { + log_error(INTERNAL_ERROR "Thin LV %s without pool.", + display_lvname(lv)); + return 0; + } + lock_lv = pool_lv; + if (pool_lv->to_remove) + /* Thin pool is to be removed so skip updating it when possible */ + pool_lv = NULL; + } - if (lv_info(cmd, lv, 0, &info, 1, 0)) { - if (!lv_check_not_in_use(cmd, lv, &info)) + if (lv_is_locked(lv)) { + log_error("Can't remove locked logical volume %s.", display_lvname(lv)); + return 0; + } + + if (!lockd_lv(cmd, lock_lv, "ex", LDLV_PERSISTENT)) + return_0; + + if (!lv_is_cache_vol(lv)) { + if (!_lv_remove_check_in_use(lv, force)) return_0; + } + + /* if thin pool data lv is writecache, then detach and remove the writecache */ + if (lv_is_thin_pool(lv)) { + struct logical_volume *data_lv = data_lv_from_thin_pool(lv); - if ((force == PROMPT) && - lv_is_visible(lv) && - lv_is_active(lv)) { - if (yes_no_prompt("Do you really want to remove%s active " - "%slogical volume %s? [y/n]: ", - ask_discard ? " and DISCARD" : "", - vg_is_clustered(vg) ? "clustered " : "", - lv->name) == 'n') { - log_error("Logical volume %s not removed", lv->name); + if (data_lv && lv_is_writecache(data_lv)) { + struct logical_volume *cachevol_lv = first_seg(data_lv)->writecache; + + if (!lv_detach_writecache_cachevol(data_lv, 1)) { + log_error("Failed to detach writecache from %s", display_lvname(data_lv)); + return 0; + } + + if (!lv_remove_single(cmd, cachevol_lv, force, 1)) { + log_error("Failed to remove cachevol %s.", display_lvname(cachevol_lv)); return 0; - } else { - ask_discard = 0; } } } - if ((force == PROMPT) && ask_discard && - yes_no_prompt("Do you really want to remove and DISCARD " - "logical volume %s? [y/n]: ", - lv->name) == 'n') { - log_error("Logical volume %s not removed", lv->name); - return 0; + if (lv_is_writecache(lv)) { + struct logical_volume *cachevol_lv = first_seg(lv)->writecache; + + if (!deactivate_lv(cmd, lv)) { + log_error("Failed to deactivate LV %s", display_lvname(lv)); + return 0; + } + + if (!lv_detach_writecache_cachevol(lv, 1)) { + log_error("Failed to detach writecache from %s", display_lvname(lv)); + return 0; + } + + if (!lv_remove_single(cmd, cachevol_lv, force, suppress_remove_message)) { + log_error("Failed to remove cachevol %s.", display_lvname(cachevol_lv)); + return 0; + } } - if (!archive(vg)) - return 0; - if (lv_is_cow(lv)) { - /* Old format1 code */ - if (!(lv->vg->fid->fmt->features & FMT_MDAS)) - format1_origin = origin_from_cow(lv); + /* Used cache pool, COW or historical LV cannot be activated */ + if (!lv_is_used_cache_pool(lv) && + !lv_is_cache_vol(lv) && + !lv_is_cow(lv) && !lv_is_historical(lv) && + !deactivate_lv_with_sub_lv(lv)) + /* FIXME Review and fix the snapshot error paths! 
*/ + return_0; + + /* Special case removing a striped raid LV with allocated reshape space */ + if (seg && seg->reshape_len) { + if (!(seg->segtype = get_segtype_from_string(cmd, SEG_TYPE_NAME_STRIPED))) + return_0; + lv->le_count = seg->len = seg->area_len = seg_lv(seg, 0)->le_count * seg->area_count; + } + + /* Clear thin pool stacked messages */ + if (pool_lv && thin_pool_has_message(first_seg(pool_lv), lv, 0) && + !update_thin_pool_lv(pool_lv, 1)) { + if (force < DONT_PROMPT_OVERRIDE) { + log_error("Failed to update pool %s.", display_lvname(pool_lv)); + return 0; + } + log_print_unless_silent("Ignoring update failure of pool %s.", + display_lvname(pool_lv)); + pool_lv = NULL; /* Do not retry */ + } + + /* When referenced by the LV with pending delete flag, remove this deleted LV first */ + dm_list_iterate_items(sl, &lv->segs_using_this_lv) + if (lv_is_pending_delete(sl->seg->lv) && !lv_remove(sl->seg->lv)) { + log_error("Error releasing logical volume %s with pending delete.", + display_lvname(sl->seg->lv)); + return 0; + } - log_verbose("Removing snapshot %s", lv->name); + if (lv_is_cow(lv)) { + log_verbose("Removing snapshot volume %s.", display_lvname(lv)); /* vg_remove_snapshot() will preload origin/former snapshots */ if (!vg_remove_snapshot(lv)) return_0; + + if (!deactivate_lv(cmd, lv)) { + /* FIXME Review and fix the snapshot error paths! */ + log_error("Unable to deactivate logical volume %s.", + display_lvname(lv)); + return 0; + } } - /* FIXME Review and fix the snapshot error paths! */ - if (!deactivate_lv(cmd, lv)) { - log_error("Unable to deactivate logical volume \"%s\"", - lv->name); - return 0; + if (lv_is_cache_vol(lv)) { + if ((cache_seg = get_only_segment_using_this_lv(lv))) { + /* When used with cache, lvremove on cachevol also removes the cache! */ + if (seg_is_cache(cache_seg)) { + if (!lv_cache_remove(cache_seg->lv)) + return_0; + } else if (seg_is_writecache(cache_seg)) { + log_error("Detach cachevol before removing."); + return 0; + } + } } - /* Clear thin pool stacked messages */ - if (pool_lv && !pool_has_message(first_seg(pool_lv), lv, 0) && - !update_pool_lv(pool_lv, 1)) { - log_error("Failed to update thin pool %s.", pool_lv->name); - return 0; + if (lv_is_used_cache_pool(lv)) { + /* Cache pool removal drops cache layer + * If the cache pool is not linked, we can simply remove it. */ + if (!(cache_seg = get_only_segment_using_this_lv(lv))) + return_0; + /* TODO: polling */ + if (!lv_cache_remove(cache_seg->lv)) + return_0; } visible = lv_is_visible(lv); + historical = lv_is_historical(lv); - log_verbose("Releasing logical volume \"%s\"", lv->name); + log_verbose("Releasing %slogical volume \"%s\"", + historical ? "historical " : "", + historical ? lv->this_glv->historical->name : lv->name); if (!lv_remove(lv)) { - log_error("Error releasing logical volume \"%s\"", lv->name); + log_error("Error releasing %slogical volume \"%s\"", + historical ? "historical ": "", + historical ? lv->this_glv->historical->name : lv->name); return 0; } - /* - * Old format1 code: If no snapshots left reload without -real. 
- */ - if (format1_origin && !lv_is_origin(format1_origin)) { - log_warn("WARNING: Support for snapshots with old LVM1-style metadata is deprecated."); - log_warn("WARNING: Please use lvconvert to update to lvm2 metadata at your convenience."); - format1_reload_required = 1; - } - - /* store it on disks */ - if (!vg_write(vg)) - return_0; - - /* format1 */ - if (format1_reload_required && !suspend_lv(cmd, format1_origin)) - log_error("Failed to refresh %s without snapshot.", format1_origin->name); - - if (!vg_commit(vg)) + if (!pool_lv && (!strcmp(cmd->name, "lvremove") || !strcmp(cmd->name, "vgremove"))) { + /* With lvremove & vgremove try to postpone commit after last such LV */ + vg->needs_write_and_commit = 1; + log_debug_metadata("Postponing write and commit."); + } else if (!vg_write(vg) || !vg_commit(vg)) /* store it on disks */ return_0; - - /* format1 */ - if (format1_reload_required && !resume_lv(cmd, format1_origin)) { - log_error("Failed to resume %s.", format1_origin->name); - return 0; - } /* Release unneeded blocks in thin pool */ /* TODO: defer when multiple LVs relased at once */ - if (pool_lv && !update_pool_lv(pool_lv, 1)) { - log_error("Failed to update thin pool %s.", pool_lv->name); - return 0; + if (pool_lv && !update_thin_pool_lv(pool_lv, 1)) { + if (force < DONT_PROMPT_OVERRIDE) { + log_error("Failed to update thin pool %s.", display_lvname(pool_lv)); + return 0; + } + log_print_unless_silent("Ignoring update failure of pool %s.", + display_lvname(pool_lv)); + } + + if (!lockd_lv(cmd, lv, "un", LDLV_PERSISTENT)) + log_warn("WARNING: Failed to unlock %s.", display_lvname(lv)); + lockd_free_lv(cmd, vg, lv->name, &lv->lvid.id[1], lv->lock_args); + + if (!suppress_remove_message && (visible || historical)) { + (void) dm_snprintf(msg, sizeof(msg), + "%sogical volume \"%s\" successfully removed.", + historical ? "Historical l" : "L", + historical ? lv->this_glv->historical->name : lv->name); + if (!vg->needs_write_and_commit) + log_print_unless_silent("%s", msg); + /* Keep print message for later display with next vg_write() and vg_commit() */ + else if (!(msg_dup = dm_pool_strdup(vg->vgmem, msg)) || + !str_list_add_no_dup_check(vg->vgmem, &vg->msg_list, msg_dup)) + return_0; } - backup(vg); + return 1; +} - if (visible) - log_print_unless_silent("Logical volume \"%s\" successfully removed", lv->name); +static int _lv_remove_segs_using_this_lv(struct cmd_context *cmd, struct logical_volume *lv, + const force_t force, unsigned level, + const char *lv_type) +{ + struct seg_list *sl; + + if ((force == PROMPT) && + yes_no_prompt("Removing %s %s will remove %u dependent volume(s). " + "Proceed? 
[y/n]: ", lv_type, display_lvname(lv), + dm_list_size(&lv->segs_using_this_lv)) == 'n') { + lv->to_remove = 0; + log_error("Logical volume %s not removed.", display_lvname(lv)); + return 0; + } + + /* + * Not using _safe iterator here - since we may delete whole subtree + * (similar as process_each_lv_in_vg()) + * the code is roughly equivalent to this: + * + * while (!dm_list_empty(&lv->segs_using_this_lv)) + * dm_list_iterate_items(sl, &lv->segs_using_this_lv) + * break; + */ + dm_list_iterate_items(sl, &lv->segs_using_this_lv) + if (!lv_remove_with_dependencies(cmd, sl->seg->lv, + force, level + 1)) + return_0; return 1; } - /* * remove LVs with its dependencies - LV leaf nodes should be removed first */ int lv_remove_with_dependencies(struct cmd_context *cmd, struct logical_volume *lv, const force_t force, unsigned level) { - percent_t snap_percent; + dm_percent_t snap_percent; struct dm_list *snh, *snht; - struct seg_list *sl, *tsl; struct lvinfo info; + struct lv_list *lvl; + struct logical_volume *origin; - if (lv_is_cow(lv)) { + /* Make aware users of this LV, it's going to be removed, so they + * can skip any updates of itself */ + lv->to_remove = 1; + + if (!level && lv_is_cow(lv)) { /* * A merging snapshot cannot be removed directly unless * it has been invalidated or failed merge removal is requested. */ - if (lv_is_merging_cow(lv) && !level) { + if (lv_is_merging_cow(lv)) { if (lv_info(lv->vg->cmd, lv, 0, &info, 1, 0) && info.exists && info.live_table) { if (!lv_snapshot_percent(lv, &snap_percent)) { - log_error("Failed to obtain merging snapshot progress percentage for logical volume %s.", - lv->name); + log_error("Failed to obtain merging snapshot progress " + "percentage for logical volume %s.", + display_lvname(lv)); return 0; } - if ((snap_percent != PERCENT_INVALID) && - (snap_percent != PERCENT_MERGE_FAILED)) { - log_error("Can't remove merging snapshot logical volume \"%s\"", - lv->name); - return 0; - } else if ((snap_percent == PERCENT_MERGE_FAILED) && - (force == PROMPT) && - yes_no_prompt("Removing snapshot \"%s\" that failed to merge may leave origin \"%s\" inconsistent. " - "Proceed? [y/n]: ", lv->name, origin_from_cow(lv)->name) == 'n') { - log_error("Logical volume %s not removed.", lv->name); + + if ((snap_percent != DM_PERCENT_INVALID) && + (snap_percent != LVM_PERCENT_MERGE_FAILED)) { + log_error("Can't remove merging snapshot logical volume %s.", + display_lvname(lv)); return 0; } + + if ((snap_percent == LVM_PERCENT_MERGE_FAILED) && + (force == PROMPT) && + yes_no_prompt("Removing snapshot %s that failed to merge " + "may leave origin %s inconsistent. Proceed? [y/n]: ", + display_lvname(lv), + display_lvname(origin_from_cow(lv))) == 'n') + goto no_remove; } - } + } else if (lv_is_virtual_origin(origin = origin_from_cow(lv))) + /* If this is a sparse device, remove its origin too. */ + /* Stacking is not supported */ + lv = origin; } if (lv_is_origin(lv)) { /* Remove snapshot LVs first */ - if ((force == PROMPT) && - /* Active snapshot already needs to confirm each active LV */ - !lv_is_active(lv) && - yes_no_prompt("Removing origin %s will also remove %u " - "snapshots(s). Proceed? 
[y/n]: ", - lv->name, lv->origin_count) == 'n') { - log_error("Logical volume %s not removed.", lv->name); - return 0; - } + if (!_lv_remove_check_in_use(lv, force)) + return_0; + + if (!deactivate_lv(cmd, lv)) + goto no_remove; + + log_verbose("Removing origin logical volume %s with %u snapshots(s).", + display_lvname(lv), lv->origin_count); dm_list_iterate_safe(snh, snht, &lv->snapshot_segs) if (!lv_remove_with_dependencies(cmd, dm_list_struct_base(snh, struct lv_segment, origin_list)->cow, force, level + 1)) return_0; + } else if (lv_is_merging_origin(lv)) { + /* Removing thin merging origin requires to remove its merging snapshot first */ + if (!_lv_remove_check_in_use(lv, force)) + return_0; + + if (!deactivate_lv(cmd, lv)) + goto no_remove; + + log_verbose("Removing merging origin logical volume %s.", display_lvname(lv)); + + if (!lv_remove_with_dependencies(cmd, find_snapshot(lv)->lv, + force, level + 1)) + return_0; + } + + if (!level && lv_is_merging_thin_snapshot(lv)) { + /* Merged snapshot LV is no longer available for the user */ + log_error("Unable to remove %s, volume is merged to %s.", + display_lvname(lv), display_lvname(first_seg(lv)->merge_lv)); + return 0; + } + + if (lv_is_cache_origin(lv) || lv_is_writecache_origin(lv)) { + if (!_lv_remove_segs_using_this_lv(cmd, lv, force, level, "cache origin")) + return_0; + /* Removal of cache LV also removes caching origin */ + return 1; } - if (lv_is_used_thin_pool(lv)) { - /* Remove thin LVs first */ - if ((force == PROMPT) && - yes_no_prompt("Removing pool %s will also remove %u " - "thin volume(s). OK? [y/n]: ", lv->name, - /* Note: Snaphosts not included */ - dm_list_size(&lv->segs_using_this_lv)) == 'n') { - log_error("Logical volume %s not removed.", lv->name); + if (lv_is_external_origin(lv) && + !_lv_remove_segs_using_this_lv(cmd, lv, force, level, "external origin")) + return_0; + + if (lv_is_used_thin_pool(lv) && + !_lv_remove_segs_using_this_lv(cmd, lv, force, level, "pool")) + return_0; + + if (lv_is_vdo_pool(lv)) { + if (!_lv_remove_segs_using_this_lv(cmd, lv, force, level, "VDO pool")) + return_0; + /* Last user removes VDO pool itself, lv no longer exists */ + return 1; + } + + if (lv_is_cache_pool(lv) && !lv_is_used_cache_pool(lv)) { + if (!deactivate_lv(cmd, first_seg(lv)->metadata_lv) || + !deactivate_lv(cmd, seg_lv(first_seg(lv),0))) { + log_error("Unable to fully deactivate unused cache-pool %s.", + display_lvname(lv)); return 0; } + } - dm_list_iterate_items_safe(sl, tsl, &lv->segs_using_this_lv) - if (!lv_remove_with_dependencies(cmd, sl->seg->lv, - force, level + 1)) - return_0; + if (lv_is_pool_metadata_spare(lv) && + (force == PROMPT)) { + dm_list_iterate_items(lvl, &lv->vg->lvs) + if (lv_is_pool_metadata(lvl->lv)) { + if (yes_no_prompt("Removal of pool metadata spare logical volume " + "%s disables automatic recovery attempts " + "after damage to a thin or cache pool. " + "Proceed? [y/n]: ", display_lvname(lv)) == 'n') + goto no_remove; + break; + } + } + + return lv_remove_single(cmd, lv, force, 0); + +no_remove: + log_error("Logical volume %s not removed.", display_lvname(lv)); + + return 0; +} + +static int _lv_update_and_reload(struct logical_volume *lv, int origin_only) +{ + struct volume_group *vg = lv->vg; + int r = 0; + const struct logical_volume *lock_lv = lv_lock_holder(lv); + + log_very_verbose("Updating logical volume %s on disk(s)%s.", + display_lvname(lock_lv), origin_only ? 
" (origin only)": ""); + if (!vg_write(vg)) + return_0; + + if (origin_only && (lock_lv != lv)) { + log_debug_activation("Dropping origin_only for %s as lock holds %s", + display_lvname(lv), display_lvname(lock_lv)); + origin_only = 0; } - return lv_remove_single(cmd, lv, force); + if (!(origin_only ? suspend_lv_origin(vg->cmd, lock_lv) : suspend_lv(vg->cmd, lock_lv))) { + log_error("Failed to suspend logical volume %s.", + display_lvname(lock_lv)); + vg_revert(vg); + if (!revert_lv(vg->cmd, lock_lv)) + log_error("Failed to revert logical volume %s.", + display_lvname(lock_lv)); + return 0; + } else if (!(r = vg_commit(vg))) + stack; /* !vg_commit() has implict vg_revert() */ + + log_very_verbose("Updating logical volume %s in kernel.", + display_lvname(lock_lv)); + + if (!(origin_only ? resume_lv_origin(vg->cmd, lock_lv) : resume_lv(vg->cmd, lock_lv))) { + log_error("Problem reactivating logical volume %s.", + display_lvname(lock_lv)); + r = 0; + } + + return r; +} + +int lv_update_and_reload(struct logical_volume *lv) +{ + return _lv_update_and_reload(lv, 0); +} + +int lv_update_and_reload_origin(struct logical_volume *lv) +{ + return _lv_update_and_reload(lv, 1); } /* @@ -3534,16 +8033,16 @@ static int _split_parent_area(struct lv_segment *seg, uint32_t s, while (parent_area_len > 0) { /* Find the layer segment pointed at */ if (!(spvs = _find_seg_pvs_by_le(layer_seg_pvs, layer_le))) { - log_error("layer segment for %s:%" PRIu32 " not found", - seg->lv->name, parent_le); + log_error("layer segment for %s:" FMTu32 " not found.", + display_lvname(seg->lv), parent_le); return 0; } if (spvs->le != layer_le) { log_error("Incompatible layer boundary: " - "%s:%" PRIu32 "[%" PRIu32 "] on %s:%" PRIu32, - seg->lv->name, parent_le, s, - seg_lv(seg, s)->name, layer_le); + "%s:" FMTu32 "[" FMTu32 "] on %s:" FMTu32 ".", + display_lvname(seg->lv), parent_le, s, + display_lvname(seg_lv(seg, s)), layer_le); return 0; } @@ -3572,7 +8071,7 @@ int split_parent_segments_for_layer(struct cmd_context *cmd, uint32_t s; struct dm_list *parallel_areas; - if (!(parallel_areas = build_parallel_areas_from_lv(layer_lv, 0))) + if (!(parallel_areas = build_parallel_areas_from_lv(layer_lv, 0, 0))) return_0; /* Loop through all LVs except itself */ @@ -3643,7 +8142,7 @@ int remove_layers_for_segments(struct cmd_context *cmd, log_error("Layer boundary mismatch: " "%s:%" PRIu32 "-%" PRIu32 " on " "%s:%" PRIu32 " / " - "%" PRIu32 "-%" PRIu32 " / ", + FMTu32 "-" FMTu32 " / ", lv->name, seg->le, seg->area_len, layer_lv->name, seg_le(seg, s), lseg->le, lseg->area_len); @@ -3655,7 +8154,7 @@ int remove_layers_for_segments(struct cmd_context *cmd, /* Replace mirror with error segment */ if (!(lseg->segtype = - get_segtype_from_string(lv->vg->cmd, "error"))) { + get_segtype_from_string(lv->vg->cmd, SEG_TYPE_NAME_ERROR))) { log_error("Missing error segtype"); return 0; } @@ -3702,6 +8201,14 @@ int remove_layers_for_segments_all(struct cmd_context *cmd, if (!lv_empty(layer_lv)) return_0; + /* Assumes only used by PVMOVE ATM when unlocking LVs */ + dm_list_iterate_items(lvl, lvs_changed) { + /* FIXME Assumes only one pvmove at a time! 
*/ + lvl->lv->status &= ~LOCKED; + if (!lv_merge_segments(lvl->lv)) + return_0; + } + return 1; } @@ -3709,6 +8216,17 @@ int move_lv_segments(struct logical_volume *lv_to, struct logical_volume *lv_from, uint64_t set_status, uint64_t reset_status) { + const uint64_t MOVE_BITS = (CACHE | + CACHE_POOL | + INTEGRITY | + LV_CACHE_VOL | + LV_VDO | + LV_VDO_POOL | + MIRROR | + RAID | + THIN_POOL | + THIN_VOLUME | + WRITECACHE); struct lv_segment *seg; dm_list_iterate_items(seg, &lv_to->segments) @@ -3726,6 +8244,16 @@ int move_lv_segments(struct logical_volume *lv_to, seg->status |= set_status; } + /* + * Move LV status bits for selected types with their segments + * i.e. when inserting layer to cache LV, we move raid segments + * to a new place, thus 'raid' LV property now belongs to this LV. + * + * Bits should match to those which appears after read from disk. + */ + lv_to->status |= lv_from->status & MOVE_BITS; + lv_from->status &= ~MOVE_BITS; + lv_to->le_count = lv_from->le_count; lv_to->size = lv_from->size; @@ -3739,41 +8267,98 @@ int move_lv_segments(struct logical_volume *lv_to, int remove_layer_from_lv(struct logical_volume *lv, struct logical_volume *layer_lv) { - struct logical_volume *parent; + static const char _suffixes[][8] = { "_tdata", "_cdata", "_corig", "_wcorig", "_vdata" }; + struct logical_volume *parent_lv; struct lv_segment *parent_seg; struct segment_type *segtype; + struct lv_names lv_names; + unsigned r; log_very_verbose("Removing layer %s for %s", layer_lv->name, lv->name); if (!(parent_seg = get_only_segment_using_this_lv(layer_lv))) { log_error("Failed to find layer %s in %s", - layer_lv->name, lv->name); + layer_lv->name, lv->name); + return 0; + } + parent_lv = parent_seg->lv; + if (parent_lv != lv) { + log_error(INTERNAL_ERROR "Wrong layer %s in %s", + layer_lv->name, lv->name); return 0; } - parent = parent_seg->lv; /* * Before removal, the layer should be cleaned up, * i.e. additional segments and areas should have been removed. */ - if (dm_list_size(&parent->segments) != 1 || - parent_seg->area_count != 1 || - seg_type(parent_seg, 0) != AREA_LV || - layer_lv != seg_lv(parent_seg, 0) || - parent->le_count != layer_lv->le_count) - return_0; + /* FIXME: + * These are all INTERNAL_ERROR, but ATM there is + * some internal API problem and this code is wrongle + * executed with certain mirror manipulations. + * So we need to fix mirror code first, then switch... 
+ */ + if (dm_list_size(&parent_lv->segments) != 1) { + log_error("Invalid %d segments in %s, expected only 1.", + dm_list_size(&parent_lv->segments), + display_lvname(parent_lv)); + return 0; + } + + if (parent_seg->area_count != 1) { + log_error("Invalid %d area count(s) in %s, expected only 1.", + parent_seg->area_count, display_lvname(parent_lv)); + return 0; + } + + if (seg_type(parent_seg, 0) != AREA_LV) { + log_error("Invalid seg_type %d in %s, expected LV.", + seg_type(parent_seg, 0), display_lvname(parent_lv)); + return 0; + } + + if (layer_lv != seg_lv(parent_seg, 0)) { + log_error("Layer doesn't match segment in %s.", + display_lvname(parent_lv)); + return 0; + } + + if (parent_lv->le_count != layer_lv->le_count) { + log_error("Inconsistent extent count (%u != %u) of layer %s.", + parent_lv->le_count, layer_lv->le_count, + display_lvname(parent_lv)); + return 0; + } - if (!lv_empty(parent)) + if (!lv_empty(parent_lv)) return_0; - if (!move_lv_segments(parent, layer_lv, 0, 0)) + if (!move_lv_segments(parent_lv, layer_lv, 0, 0)) return_0; /* Replace the empty layer with error segment */ - segtype = get_segtype_from_string(lv->vg->cmd, "error"); - if (!lv_add_virtual_segment(layer_lv, 0, parent->le_count, segtype, NULL)) + if (!(segtype = get_segtype_from_string(lv->vg->cmd, SEG_TYPE_NAME_ERROR))) + return_0; + if (!lv_add_virtual_segment(layer_lv, 0, parent_lv->le_count, segtype)) return_0; + /* + * recuresively rename sub LVs + * currently supported only for thin data layer + * FIXME: without strcmp it breaks mirrors.... + */ + if (!strstr(layer_lv->name, "_mimage")) { + for (r = 0; r < DM_ARRAY_SIZE(_suffixes); ++r) { + if (strstr(layer_lv->name, _suffixes[r]) == 0) { + lv_names.old = layer_lv->name; + lv_names.new = parent_lv->name; + if (!for_each_sub_lv(parent_lv, _rename_skip_pools_externals_cb, (void *) &lv_names)) + return_0; + break; + } + } + } + return 1; } @@ -3788,45 +8373,36 @@ struct logical_volume *insert_layer_for_lv(struct cmd_context *cmd, uint64_t status, const char *layer_suffix) { + static const char _suffixes[][10] = { "_tdata", "_cdata", "_corig", "_wcorig", "_vdata", "_tpool%d" }; int r; - char *name; - size_t len; - struct str_list *sl; + char name[NAME_LEN]; + struct dm_str_list *sl; struct logical_volume *layer_lv; struct segment_type *segtype; struct lv_segment *mapseg; struct lv_names lv_names; - unsigned exclusive = 0; + unsigned i; /* create an empty layer LV */ - len = strlen(lv_where->name) + 32; - if (!(name = alloca(len))) { - log_error("layer name allocation failed. " - "Remove new LV and retry."); - return NULL; - } - - if (dm_snprintf(name, len, "%s%s", lv_where->name, layer_suffix) < 0) { - log_error("layer name allocation failed. " - "Remove new LV and retry."); + if (dm_snprintf(name, sizeof(name), "%s%s", lv_where->name, layer_suffix) < 0) { + log_error("Layered name is too long. 
Please use shorter LV name."); return NULL; } - if (!(layer_lv = lv_create_empty(name, NULL, LVM_READ | LVM_WRITE, + if (!(layer_lv = lv_create_empty(name, NULL, + /* Preserve read-only flag */ + LVM_READ | (lv_where->status & LVM_WRITE), ALLOC_INHERIT, lv_where->vg))) { log_error("Creation of layer LV failed"); return NULL; } - if (lv_is_active_exclusive_locally(lv_where)) - exclusive = 1; - - if (lv_is_active(lv_where) && strstr(name, "_mimagetmp")) { + if (lv_is_active(lv_where) && strstr(name, MIRROR_SYNC_LAYER)) { log_very_verbose("Creating transient LV %s for mirror conversion in VG %s.", name, lv_where->vg->name); - segtype = get_segtype_from_string(cmd, "error"); + segtype = get_segtype_from_string(cmd, SEG_TYPE_NAME_ERROR); - if (!lv_add_virtual_segment(layer_lv, 0, lv_where->le_count, segtype, NULL)) { + if (!lv_add_virtual_segment(layer_lv, 0, lv_where->le_count, segtype)) { log_error("Creation of transient LV %s for mirror conversion in VG %s failed.", name, lv_where->vg->name); return NULL; } @@ -3846,14 +8422,10 @@ struct logical_volume *insert_layer_for_lv(struct cmd_context *cmd, if (!vg_commit(lv_where->vg)) { log_error("Failed to commit intermediate VG %s metadata for mirror conversion.", lv_where->vg->name); - vg_revert(lv_where->vg); return NULL; } - if (exclusive) - r = activate_lv_excl(cmd, layer_lv); - else - r = activate_lv(cmd, layer_lv); + r = activate_lv(cmd, layer_lv); if (!r) { log_error("Failed to resume transient LV" @@ -3865,7 +8437,6 @@ struct logical_volume *insert_layer_for_lv(struct cmd_context *cmd, /* Remove the temporary tags */ dm_list_iterate_items(sl, &lv_where->tags) str_list_del(&layer_lv->tags, sl->str); - } log_very_verbose("Inserting layer %s for %s", @@ -3874,12 +8445,12 @@ struct logical_volume *insert_layer_for_lv(struct cmd_context *cmd, if (!move_lv_segments(layer_lv, lv_where, 0, 0)) return_NULL; - if (!(segtype = get_segtype_from_string(cmd, "striped"))) + if (!(segtype = get_segtype_from_string(cmd, SEG_TYPE_NAME_STRIPED))) return_NULL; /* allocate a new linear segment */ - if (!(mapseg = alloc_lv_segment(segtype, lv_where, 0, layer_lv->le_count, - status, 0, NULL, NULL, 1, layer_lv->le_count, + if (!(mapseg = alloc_lv_segment(segtype, lv_where, 0, layer_lv->le_count, 0, + status, 0, NULL, 1, layer_lv->le_count, 0, 0, 0, 0, NULL))) return_NULL; @@ -3892,17 +8463,22 @@ struct logical_volume *insert_layer_for_lv(struct cmd_context *cmd, lv_where->le_count = layer_lv->le_count; lv_where->size = (uint64_t) lv_where->le_count * lv_where->vg->extent_size; + if (lv_where->vg->fid->fmt->features & FMT_CONFIG_PROFILE) + lv_where->profile = lv_where->vg->cmd->profile_params->global_metadata_profile; + /* * recuresively rename sub LVs * currently supported only for thin data layer * FIXME: without strcmp it breaks mirrors.... 
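 *
 * Illustrative example (hypothetical names, not part of this patch):
 * inserting a "_tdata" layer under an LV called "pool" creates
 * "pool_tdata", so lv_names below becomes
 * { .old = "pool", .new = "pool_tdata" } and the callback run by
 * for_each_sub_lv() swaps that prefix in each sub LV name.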
*/ - if (strcmp(layer_suffix, "_tdata") == 0) { - lv_names.old = lv_where->name; - lv_names.new = layer_lv->name; - if (!for_each_sub_lv(cmd, layer_lv, _rename_cb, (void *) &lv_names)) - return 0; - } + for (i = 0; i < DM_ARRAY_SIZE(_suffixes); ++i) + if (strcmp(layer_suffix, _suffixes[i]) == 0) { + lv_names.old = lv_where->name; + lv_names.new = layer_lv->name; + if (!for_each_sub_lv(layer_lv, _rename_skip_pools_externals_cb, (void *) &lv_names)) + return_NULL; + break; + } return layer_lv; } @@ -3922,7 +8498,7 @@ static int _extend_layer_lv_for_segment(struct logical_volume *layer_lv, if (seg_type(seg, s) != AREA_PV && seg_type(seg, s) != AREA_LV) return_0; - if (!(segtype = get_segtype_from_string(layer_lv->vg->cmd, "striped"))) + if (!(segtype = get_segtype_from_string(layer_lv->vg->cmd, SEG_TYPE_NAME_STRIPED))) return_0; /* FIXME Incomplete message? Needs more context */ @@ -3933,8 +8509,8 @@ static int _extend_layer_lv_for_segment(struct logical_volume *layer_lv, /* allocate a new segment */ if (!(mapseg = alloc_lv_segment(segtype, layer_lv, layer_lv->le_count, - seg->area_len, status, 0, - NULL, NULL, 1, seg->area_len, 0, 0, 0, seg))) + seg->area_len, 0, status, 0, + NULL, 1, seg->area_len, 0, 0, 0, 0, seg))) return_0; /* map the new segment to the original underlying are */ @@ -3980,10 +8556,10 @@ static int _match_seg_area_to_pe_range(struct lv_segment *seg, uint32_t s, continue; /* FIXME Missing context in this message - add LV/seg details */ - log_debug("Matched PE range %s:%" PRIu32 "-%" PRIu32 " against " - "%s %" PRIu32 " len %" PRIu32, dev_name(pvl->pv->dev), - per->start, per_end, dev_name(seg_dev(seg, s)), - seg_pe(seg, s), seg->area_len); + log_debug_alloc("Matched PE range %s:%" PRIu32 "-%" PRIu32 " against " + "%s %" PRIu32 " len %" PRIu32, dev_name(pvl->pv->dev), + per->start, per_end, dev_name(seg_dev(seg, s)), + seg_pe(seg, s), seg->area_len); return 1; } @@ -4066,14 +8642,25 @@ int insert_layer_for_segments_on_pv(struct cmd_context *cmd, struct lv_list *lvl; int lv_used = 0; uint32_t s; + struct logical_volume *holder = (struct logical_volume *) lv_lock_holder(lv_where); log_very_verbose("Inserting layer %s for segments of %s on %s", layer_lv->name, lv_where->name, pvl ? 
pv_dev_name(pvl->pv) : "any"); + /* Temporarily hide layer_lv from vg->lvs list + * so the lv_split_segment() passes vg_validate() + * since here layer_lv has empty segment list */ + if (!(lvl = find_lv_in_vg(lv_where->vg, layer_lv->name))) + return_0; + dm_list_del(&lvl->list); + if (!_align_segment_boundary_to_pe_range(lv_where, pvl)) return_0; + /* Put back layer_lv in vg->lv */ + dm_list_add(&lv_where->vg->lvs, &lvl->list); + /* Work through all segments on the supplied PV */ dm_list_iterate_items(seg, &lv_where->segments) { for (s = 0; s < seg->area_count; s++) { @@ -4082,13 +8669,23 @@ int insert_layer_for_segments_on_pv(struct cmd_context *cmd, /* First time, add LV to list of LVs affected */ if (!lv_used && lvs_changed) { - if (!(lvl = dm_pool_alloc(cmd->mem, sizeof(*lvl)))) { - log_error("lv_list alloc failed"); - return 0; + /* First check if LV is listed already */ + dm_list_iterate_items(lvl, lvs_changed) + if (lvl->lv == holder) { + lv_used = 1; + break; + } + + if (!lv_used) { + if (!(lvl = dm_pool_alloc(cmd->mem, sizeof(*lvl)))) { + log_error("lv_list alloc failed."); + return 0; + } + + lvl->lv = holder; + dm_list_add(lvs_changed, &lvl->list); + lv_used = 1; } - lvl->lv = lv_where; - dm_list_add(lvs_changed, &lvl->list); - lv_used = 1; } if (!_extend_layer_lv_for_segment(layer_lv, seg, s, @@ -4108,11 +8705,29 @@ int insert_layer_for_segments_on_pv(struct cmd_context *cmd, /* * Initialize the LV with 'value'. */ -int set_lv(struct cmd_context *cmd, struct logical_volume *lv, - uint64_t sectors, int value) +int wipe_lv(struct logical_volume *lv, struct wipe_params wp) { struct device *dev; - char *name; + char name[PATH_MAX]; + uint64_t zero_sectors; + int zero_metadata; + + if (!wp.do_zero && !wp.do_wipe_signatures && !wp.is_metadata) + /* nothing to do */ + return 1; + + if (!lv_is_active(lv)) { + log_error("Volume %s is not active locally (volume_list activation filter?).", + display_lvname(lv)); + return 0; + } + + /* Wait until devices are available */ + if (!sync_local_dev_names(lv->vg->cmd)) { + log_error("Failed to sync local devices before wiping volume %s.", + display_lvname(lv)); + return 0; + } /* * FIXME: @@ -4121,44 +8736,198 @@ int set_lv(struct cmd_context *cmd, struct logical_volume *lv, * <ejt_> k, I'll drop a fixme to that effect * (I know the device is at least 4k, but not 32k) */ - if (!(name = dm_pool_alloc(cmd->mem, PATH_MAX))) { - log_error("Name allocation failed - device not cleared"); + if (dm_snprintf(name, sizeof(name), "%s%s/%s", lv->vg->cmd->dev_dir, + lv->vg->name, lv->name) < 0) { + log_error("Name too long - device not cleared (%s)", lv->name); return 0; } - if (dm_snprintf(name, PATH_MAX, "%s%s/%s", cmd->dev_dir, - lv->vg->name, lv->name) < 0) { - log_error("Name too long - device not cleared (%s)", lv->name); + if (!(dev = dev_cache_get(lv->vg->cmd, name, NULL))) { + log_error("%s: not found: device not cleared", name); return 0; } - sync_local_dev_names(cmd); /* Wait until devices are available */ + if (!label_scan_open_rw(dev)) { + log_error("Failed to open %s for wiping and zeroing.", display_lvname(lv)); + return 0; + } - log_verbose("Clearing start of logical volume \"%s\"", lv->name); + sigint_allow(); + if (wp.do_wipe_signatures) { + log_verbose("Wiping known signatures on logical volume %s.", + display_lvname(lv)); + if (!wipe_known_signatures(lv->vg->cmd, dev, name, 0, + TYPE_DM_SNAPSHOT_COW, + wp.yes, wp.force, NULL)) { + sigint_restore(); + label_scan_invalidate(dev); + log_error("%s logical volume %s.", + sigint_caught() ? 
+ "Interrupted initialization of" : "Failed to wipe signatures on", + display_lvname(lv)); + return 0; + } + } - if (!(dev = dev_cache_get(name, NULL))) { - log_error("%s: not found: device not cleared", name); - return 0; + if (wp.do_zero || wp.is_metadata) { + zero_metadata = !wp.is_metadata ? 0 : + find_config_tree_bool(lv->vg->cmd, allocation_zero_metadata_CFG, NULL); + if (zero_metadata) { + log_debug("Metadata logical volume %s will be fully zeroed.", + display_lvname(lv)); + zero_sectors = lv->size; + wp.zero_value = 0; + } else { + if (wp.is_metadata) /* Verbosely notify metadata will not be fully zeroed */ + log_verbose("Metadata logical volume %s not fully zeroed and may contain stale data.", + display_lvname(lv)); + zero_sectors = UINT64_C(4096) >> SECTOR_SHIFT; + if (wp.zero_sectors > zero_sectors) + zero_sectors = wp.zero_sectors; + + if (zero_sectors > lv->size) + zero_sectors = lv->size; + } + + log_verbose("Initializing %s of logical volume %s with value %d.", + display_size(lv->vg->cmd, zero_sectors), + display_lvname(lv), wp.zero_value); + +#ifdef HAVE_BLKZEROOUT + if (!test_mode() && !wp.zero_value && (zero_sectors > 16)) { + /* TODO: maybe integrate with bcache_zero_set() */ + const uint64_t end = zero_sectors << SECTOR_SHIFT; + uint64_t range[2] = { 0, 1024 * 1024 }; /* zeroing with 1M steps (for better ^C support) */ + for (/* empty */ ; range[0] < end; range[0] += range[1]) { + if ((range[0] + range[1]) > end) + range[1] = end - range[0]; + + if (ioctl(dev->bcache_fd, BLKZEROOUT, &range)) { + if (errno == EINVAL) + goto retry_with_dev_set; /* Kernel without support for BLKZEROOUT */ + log_sys_debug("ioctl", "BLKZEROOUT"); + sigint_restore(); + label_scan_invalidate(dev); + log_error("%s logical volume %s at position " FMTu64 " and size " FMTu64 ".", + sigint_caught() ? "Interrupted initialization of" : "Failed to initialize", + display_lvname(lv), range[0], range[1]); + return 0; + } + } + } else +retry_with_dev_set: +#endif + if (!dev_set_bytes(dev, UINT64_C(0), (size_t) zero_sectors << SECTOR_SHIFT, wp.zero_value)) { + sigint_restore(); + log_error("%s logical volume %s with value %d and size %s.", + sigint_caught() ? "Interrupted initialization" : "Failed to initialize", + display_lvname(lv), wp.zero_value, + display_size(lv->vg->cmd, zero_sectors)); + return 0; + } } + sigint_restore(); + + label_scan_invalidate(dev); - if (!dev_open_quiet(dev)) + lv->status &= ~LV_NOSCAN; + + return 1; +} + +/* + * Optionally makes on-disk metadata changes if @commit + * + * If LV is active: + * wipe any signatures and clear first sector of LVs listed on @lv_list + * otherwise: + * activate, wipe (as above), deactivate + * + * Returns: 1 on success, 0 on failure + */ +int activate_and_wipe_lvlist(struct dm_list *lv_list, int commit) +{ + struct lv_list *lvl; + struct volume_group *vg = NULL; + unsigned i = 0, sz = dm_list_size(lv_list); + char *was_active; + int r = 1; + + if (!sz) { + log_debug_metadata(INTERNAL_ERROR "Empty list of LVs given for wiping."); + return 1; + } + + dm_list_iterate_items(lvl, lv_list) { + if (!lv_is_visible(lvl->lv)) { + log_error(INTERNAL_ERROR + "LVs must be set visible before wiping."); + return 0; + } + vg = lvl->lv->vg; + } + + if (test_mode()) + return 1; + + /* + * FIXME: only vg_[write|commit] if LVs are not already written + * as visible in the LVM metadata (which is never the case yet). 
+ */ + if (commit && + (!vg || !vg_write(vg) || !vg_commit(vg))) return_0; - if (!sectors) - sectors = UINT64_C(4096) >> SECTOR_SHIFT; + was_active = alloca(sz); - if (sectors > lv->size) - sectors = lv->size; + dm_list_iterate_items(lvl, lv_list) + if (!(was_active[i++] = lv_is_active(lvl->lv))) { + lvl->lv->status |= LV_TEMPORARY; + if (!activate_lv(vg->cmd, lvl->lv)) { + log_error("Failed to activate localy %s for wiping.", + display_lvname(lvl->lv)); + r = 0; + goto out; + } + lvl->lv->status &= ~LV_TEMPORARY; + } - if (!dev_set(dev, UINT64_C(0), (size_t) sectors << SECTOR_SHIFT, value)) - stack; + dm_list_iterate_items(lvl, lv_list) { + /* Wipe any know signatures */ + if (!wipe_lv(lvl->lv, (struct wipe_params) { .do_zero = 1 /* TODO: is_metadata = 1 */ })) { + r = 0; + goto_out; + } + } +out: + /* TODO: deactivation is only needed with clustered locking + * in normal case we should keep device active + */ + sz = 0; + dm_list_iterate_items(lvl, lv_list) + if ((i > sz) && !was_active[sz++] && + !deactivate_lv(vg->cmd, lvl->lv)) { + log_error("Failed to deactivate %s.", display_lvname(lvl->lv)); + r = 0; /* Continue deactivating as many as possible. */ + } - dev_flush(dev); + if (!sync_local_dev_names(vg->cmd)) + log_debug("Failed to sync local device names after deactivation of wiped volumes."); - if (!dev_close_immediate(dev)) - stack; + return r; +} - return 1; +/* Wipe logical volume @lv, optionally with @commit of metadata */ +int activate_and_wipe_lv(struct logical_volume *lv, int commit) +{ + struct dm_list lv_list; + struct lv_list lvl; + + lvl.lv = lv; + dm_list_init(&lv_list); + dm_list_add(&lv_list, &lvl.list); + + return activate_and_wipe_lvlist(&lv_list, commit); } static struct logical_volume *_create_virtual_origin(struct cmd_context *cmd, @@ -4168,19 +8937,16 @@ static struct logical_volume *_create_virtual_origin(struct cmd_context *cmd, uint64_t voriginextents) { const struct segment_type *segtype; - size_t len; - char *vorigin_name; + char vorigin_name[NAME_LEN]; struct logical_volume *lv; - if (!(segtype = get_segtype_from_string(cmd, "zero"))) { + if (!(segtype = get_segtype_from_string(cmd, SEG_TYPE_NAME_ZERO))) { log_error("Zero segment type for virtual origin not found"); return NULL; } - len = strlen(lv_name) + 32; - if (!(vorigin_name = alloca(len)) || - dm_snprintf(vorigin_name, len, "%s_vorigin", lv_name) < 0) { - log_error("Virtual origin name allocation failed."); + if (dm_snprintf(vorigin_name, sizeof(vorigin_name), "%s_vorigin", lv_name) < 0) { + log_error("Virtual origin name is too long."); return NULL; } @@ -4189,289 +8955,446 @@ static struct logical_volume *_create_virtual_origin(struct cmd_context *cmd, return_NULL; if (!lv_extend(lv, segtype, 1, 0, 1, 0, voriginextents, - NULL, NULL, ALLOC_INHERIT)) + NULL, ALLOC_INHERIT, 0)) return_NULL; - /* store vg on disk(s) */ - if (!vg_write(vg) || !vg_commit(vg)) - return_NULL; + return lv; +} + +/* + * Automatically set ACTIVATION_SKIP flag for the LV supplied - this + * is default behaviour. If override_default is set, then override + * the default behaviour and add/clear the flag based on 'add_skip' arg + * supplied instead. 
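 *
 * Usage sketch (hypothetical caller; 'override' stands for whatever flag
 * the caller derives from its own options):
 *
 *     lv_set_activation_skip(lv, 0, 0);        (apply the default policy)
 *     if (lv_activation_skip(lv, CHANGE_AY, override))
 *             return 1;                        (activation is skipped)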
+ */ +void lv_set_activation_skip(struct logical_volume *lv, int override_default, + int add_skip) +{ + int skip = 0; - backup(vg); + /* override default behaviour */ + if (override_default) + skip = add_skip; + /* default behaviour */ + else if (lv->vg->cmd->auto_set_activation_skip) { + /* skip activation for thin snapshots by default */ + if (lv_is_thin_volume(lv) && first_seg(lv)->origin) + skip = 1; + } - return lv; + if (skip) + lv->status |= LV_ACTIVATION_SKIP; + else + lv->status &= ~LV_ACTIVATION_SKIP; } -/* Thin notes: - * If lp->thin OR lp->activate is AY*, activate the pool if not already active. - * If lp->thin, create thin LV within the pool - as a snapshot if lp->snapshot. - * If lp->activate is AY*, activate it. - * If lp->activate was AN* and the pool was originally inactive, deactivate it. +/* + * Get indication whether the LV should be skipped during activation + * based on the ACTIVATION_SKIP flag (deactivation is never skipped!). + * If 'override_lv_skip_flag' is set, then override it based on the value + * of the 'skip' arg supplied instead. */ -static struct logical_volume *_lv_create_an_lv(struct volume_group *vg, struct lvcreate_params *lp, - const char *new_lv_name) +int lv_activation_skip(struct logical_volume *lv, activation_change_t activate, + int override_lv_skip_flag) { - struct cmd_context *cmd = vg->cmd; - uint32_t size_rest; - uint64_t status = UINT64_C(0); - struct logical_volume *lv, *org = NULL; - struct logical_volume *pool_lv; - struct lv_list *lvl; - int origin_active = 0; - struct lvinfo info; + if (!(lv->status & LV_ACTIVATION_SKIP) || + !is_change_activating(activate) || /* Do not skip deactivation */ + override_lv_skip_flag) + return 0; - if (new_lv_name && find_lv_in_vg(vg, new_lv_name)) { - log_error("Logical volume \"%s\" already exists in " - "volume group \"%s\"", new_lv_name, lp->vg_name); - return NULL; + log_verbose("ACTIVATION_SKIP flag set for LV %s/%s, skipping activation.", + lv->vg->name, lv->name); + return 1; +} + +static int _should_wipe_lv(struct lvcreate_params *lp, + struct logical_volume *lv, int warn) +{ + /* Unzeroable segment */ + if (seg_cannot_be_zeroed(first_seg(lv))) + return 0; + + /* Thin snapshot need not to be zeroed */ + /* Thin pool with zeroing doesn't need zeroing or wiping */ + if (lv_is_thin_volume(lv) && + (first_seg(lv)->origin || + first_seg(first_seg(lv)->pool_lv)->zero_new_blocks)) + return 0; + + /* VDO LV do not need to be zeroed */ + if (lv_is_vdo(lv)) + return 0; + + if (warn && (lv_passes_readonly_filter(lv))) { + log_warn("WARNING: Read-only activated logical volume %s not zeroed.", + display_lvname(lv)); + return 0; } + /* Cannot zero read-only volume */ + if ((lv->status & LVM_WRITE) && + (lp->zero || lp->wipe_signatures)) + return 1; + + if (warn && (!lp->zero || !(lv->status & LVM_WRITE))) + log_warn("WARNING: Logical volume %s not zeroed.", + display_lvname(lv)); + if (warn && (!lp->wipe_signatures || !(lv->status & LVM_WRITE))) + log_verbose("Signature wiping on logical volume %s not requested.", + display_lvname(lv)); + + return 0; +} + +/* Check if VG metadata supports needed features */ +static int _vg_check_features(struct volume_group *vg, + struct lvcreate_params *lp) +{ + uint32_t features = vg->fid->fmt->features; + if (vg_max_lv_reached(vg)) { log_error("Maximum number of logical volumes (%u) reached " "in volume group %s", vg->max_lv, vg->name); - return NULL; + return 0; } - if ((segtype_is_mirrored(lp->segtype) || - segtype_is_raid(lp->segtype) || segtype_is_thin(lp->segtype)) && 
- !(vg->fid->fmt->features & FMT_SEGMENTS)) { + if (!(features & FMT_SEGMENTS) && + (seg_is_cache(lp) || + seg_is_cache_pool(lp) || + seg_is_mirror(lp) || + seg_is_raid(lp) || + seg_is_thin(lp))) { log_error("Metadata does not support %s segments.", lp->segtype->name); - return NULL; + return 0; + } + + if (!(features & FMT_TAGS) && !dm_list_empty(&lp->tags)) { + log_error("Volume group %s does not support tags.", vg->name); + return 0; } - if (lp->read_ahead != DM_READ_AHEAD_AUTO && + if ((features & FMT_RESTRICTED_READAHEAD) && + lp->read_ahead != DM_READ_AHEAD_AUTO && lp->read_ahead != DM_READ_AHEAD_NONE && - (vg->fid->fmt->features & FMT_RESTRICTED_READAHEAD) && (lp->read_ahead < 2 || lp->read_ahead > 120)) { log_error("Metadata only supports readahead values between 2 and 120."); - return NULL; - } - - if (lp->stripe_size > vg->extent_size) { - log_error("Reducing requested stripe size %s to maximum, " - "physical extent size %s", - display_size(cmd, (uint64_t) lp->stripe_size), - display_size(cmd, (uint64_t) vg->extent_size)); - lp->stripe_size = vg->extent_size; + return 0; } /* Need to check the vg's format to verify this - the cmd format isn't setup properly yet */ - if (lp->stripes > 1 && - !(vg->fid->fmt->features & FMT_UNLIMITED_STRIPESIZE) && - (lp->stripe_size > STRIPE_SIZE_MAX)) { - log_error("Stripe size may not exceed %s", - display_size(cmd, (uint64_t) STRIPE_SIZE_MAX)); - return NULL; + if (!(features & FMT_UNLIMITED_STRIPESIZE) && + (lp->stripes > 1) && (lp->stripe_size > STRIPE_SIZE_MAX)) { + log_error("Stripe size may not exceed %s.", + display_size(vg->cmd, (uint64_t) STRIPE_SIZE_MAX)); + return 0; } - if ((size_rest = lp->extents % lp->stripes)) { - log_print_unless_silent("Rounding size (%d extents) up to stripe boundary " - "size (%d extents)", lp->extents, - lp->extents - size_rest + lp->stripes); - lp->extents = lp->extents - size_rest + lp->stripes; - } + return 1; +} - /* Does LV need to be zeroed? Thin handles this as a per-pool in-kernel setting. */ - if (lp->zero && !segtype_is_thin(lp->segtype) && !activation()) { - log_error("Can't wipe start of new LV without using " - "device-mapper kernel driver"); +/* Thin notes: + * If lp->thin OR lp->activate is AY*, activate the pool if not already active. + * If lp->thin, create thin LV within the pool - as a snapshot if lp->snapshot. + * If lp->activate is AY*, activate it. + * If lp->activate is AN* and the pool was originally not active, deactivate it. + */ +static struct logical_volume *_lv_create_an_lv(struct volume_group *vg, + struct lvcreate_params *lp, + const char *new_lv_name) +{ + struct cmd_context *cmd = vg->cmd; + uint32_t size; + uint64_t status = lp->permission | VISIBLE_LV; + const struct segment_type *create_segtype = lp->segtype; + struct logical_volume *lv, *origin_lv = NULL; + struct logical_volume *pool_lv = NULL; + struct logical_volume *tmp_lv; + struct lv_segment *seg = NULL, *pool_seg; + int thin_pool_was_active = -1; /* not scanned, inactive, active */ + int historical; + uint64_t transaction_id; + int ret; + + if (new_lv_name && lv_name_is_used_in_vg(vg, new_lv_name, &historical)) { + log_error("%sLogical Volume \"%s\" already exists in " + "volume group \"%s\"", historical ? 
"historical " : "", + new_lv_name, vg->name); return NULL; } - status |= lp->permission | VISIBLE_LV; + if (!_vg_check_features(vg, lp)) + return_NULL; - if (lp->snapshot && lp->thin) { - if (!(org = find_lv(vg, lp->origin))) { - log_error("Couldn't find origin volume '%s'.", - lp->origin); + if (!activation()) { + if (seg_is_cache(lp) || + seg_is_mirror(lp) || + (seg_is_raid(lp) && !seg_is_raid0(lp)) || + seg_is_thin(lp) || + seg_is_vdo(lp) || + lp->snapshot) { + /* + * FIXME: For thin pool add some code to allow delayed + * initialization of empty thin pool volume. + * i.e. using some LV flag, fake message,... + * and testing for metadata pool header signature? + */ + log_error("Can't create %s without using " + "device-mapper kernel driver.", + lp->segtype->name); return NULL; } - - if (org->status & LOCKED) { - log_error("Snapshots of locked devices are not supported."); - return NULL; + /* Does LV need to be zeroed? */ + if (lp->zero) { + log_warn("WARNING: Skipping zeroing and wipping, compiled without activation support."); + lp->zero = 0; + lp->wipe_signatures = 0; } + } - lp->voriginextents = org->le_count; - } else if (lp->snapshot) { - if (!activation()) { - log_error("Can't create snapshot without using " - "device-mapper kernel driver"); + if (lp->stripe_size > vg->extent_size) { + if (seg_is_raid(lp) && (vg->extent_size < STRIPE_SIZE_MIN)) { + /* + * FIXME: RAID will simply fail to load the table if + * this is the case, but we should probably + * honor the stripe minimum for regular stripe + * volumes as well. Avoiding doing that now + * only to minimize the change. + */ + log_error("The extent size in volume group %s is too " + "small to support striped RAID volumes.", + vg->name); return NULL; } - /* Must zero cow */ - status |= LVM_WRITE; - - if (lp->voriginsize) - origin_active = 1; - else { - - if (!(org = find_lv(vg, lp->origin))) { - log_error("Couldn't find origin volume '%s'.", - lp->origin); - return NULL; - } - if (lv_is_virtual_origin(org)) { - log_error("Can't share virtual origins. " - "Use --virtualsize."); - return NULL; - } - if (lv_is_cow(org)) { - log_error("Snapshots of snapshots are not " - "supported yet."); - return NULL; - } - if (org->status & LOCKED) { - log_error("Snapshots of locked devices are not " - "supported yet"); - return NULL; - } - if (lv_is_merging_origin(org)) { - log_error("Snapshots of an origin that has a " - "merging snapshot is not supported"); - return NULL; - } - - if (lv_is_thin_type(org) && !lv_is_thin_volume(org)) { - log_error("Snapshots of thin pool %sdevices " - "are not supported.", - lv_is_thin_pool_data(org) ? "data " : - lv_is_thin_pool_metadata(org) ? 
- "metadata " : ""); - return NULL; - } - - if (lv_is_mirror_type(org) && - !seg_is_raid(first_seg(org))) { - log_warn("WARNING: Snapshots of mirrors can deadlock under rare device failures."); - log_warn("WARNING: Consider using the raid1 mirror type to avoid this."); - log_warn("WARNING: See global/mirror_segtype_default in lvm.conf."); - } - - if (!lv_info(cmd, org, 0, &info, 0, 0)) { - log_error("Check for existence of active snapshot " - "origin '%s' failed.", org->name); - return NULL; - } - origin_active = info.exists; - - if (vg_is_clustered(vg) && - !lv_is_active_exclusive_locally(org)) { - log_error("%s must be active exclusively to" - " create snapshot", org->name); - return NULL; - } - } - } - - if (!seg_is_thin_volume(lp) && !lp->extents) { - log_error("Unable to create new logical volume with no extents"); - return NULL; + log_print_unless_silent("Reducing requested stripe size %s to maximum, " + "physical extent size %s.", + display_size(cmd, (uint64_t) lp->stripe_size), + display_size(cmd, (uint64_t) vg->extent_size)); + lp->stripe_size = vg->extent_size; } - if (seg_is_thin_pool(lp) && - ((uint64_t)lp->extents * vg->extent_size < lp->chunk_size)) { - log_error("Unable to create thin pool smaller than 1 chunk."); - return NULL; - } + lp->extents = _round_to_stripe_boundary(vg, lp->extents, lp->stripes, 1); - if (lp->snapshot && !lp->thin && ((uint64_t)lp->extents * vg->extent_size < 2 * lp->chunk_size)) { - log_error("Unable to create a snapshot smaller than 2 chunks."); + if (!lp->extents && !seg_is_virtual(lp)) { + log_error(INTERNAL_ERROR "Unable to create new logical volume with no extents."); return NULL; } - if (!seg_is_virtual(lp) && - vg->free_count < lp->extents) { - log_error("Volume group \"%s\" has insufficient free space " - "(%u extents): %u required.", - vg->name, vg->free_count, lp->extents); + if ((seg_is_pool(lp) || seg_is_cache(lp)) && + ((uint64_t)lp->extents * vg->extent_size < lp->chunk_size)) { + log_error("Unable to create %s smaller than 1 chunk.", + lp->segtype->name); return NULL; } - if (lp->stripes > dm_list_size(lp->pvh) && lp->alloc != ALLOC_ANYWHERE) { + if ((lp->alloc != ALLOC_ANYWHERE) && (lp->stripes > dm_list_size(lp->pvh))) { log_error("Number of stripes (%u) must not exceed " "number of physical volumes (%d)", lp->stripes, dm_list_size(lp->pvh)); return NULL; } - if (!activation() && - (seg_is_mirrored(lp) || - seg_is_raid(lp) || - seg_is_thin_pool(lp))) { - /* - * FIXME: For thin pool add some code to allow delayed - * initialization of empty thin pool volume. - * i.e. using some LV flag, fake message,... - * and testing for metadata pool header signature? - */ - log_error("Can't create %s without using " - "device-mapper kernel driver.", - segtype_is_raid(lp->segtype) ? lp->segtype->name : - segtype_is_mirrored(lp->segtype) ? 
"mirror" : - "thin pool volume"); - return NULL; - } - - /* The snapshot segment gets created later */ - if (lp->snapshot && !lp->thin && - !(lp->segtype = get_segtype_from_string(cmd, "striped"))) - return_NULL; - - if (!archive(vg)) - return_NULL; + if (seg_is_pool(lp)) + status |= LVM_WRITE; /* Pool is always writable */ + else if (seg_is_cache(lp) || seg_is_thin_volume(lp) || seg_is_vdo(lp)) { + /* Resolve pool volume */ + if (!lp->pool_name) { + /* Should be already checked */ + log_error(INTERNAL_ERROR "Cannot create %s volume without %s pool.", + lp->segtype->name, lp->segtype->name); + return NULL; + } - if (!dm_list_empty(&lp->tags)) { - if (!(vg->fid->fmt->features & FMT_TAGS)) { - log_error("Volume group %s does not support tags", - vg->name); + if (!(pool_lv = find_lv(vg, lp->pool_name))) { + log_error("Couldn't find volume %s in Volume group %s.", + lp->pool_name, vg->name); return NULL; } - } - if (seg_is_thin_volume(lp) && - ((lp->activate == CHANGE_AY) || - (lp->activate == CHANGE_AE) || - (lp->activate == CHANGE_ALY))) { - /* Ensure all stacked messages are submitted */ - if (!(lvl = find_lv_in_vg(vg, lp->pool))) { - log_error("Unable to find existing pool LV %s in VG %s.", - lp->pool, vg->name); + if (lv_is_locked(pool_lv)) { + log_error("Cannot use locked pool volume %s.", + display_lvname(pool_lv)); return NULL; } - if (!update_pool_lv(lvl->lv, 1)) + + if (seg_is_thin_volume(lp)) { + /* Validate volume size to to aling on chunk for small extents */ + size = first_seg(pool_lv)->chunk_size; + if (size > vg->extent_size) { + /* Align extents on chunk boundary size */ + size = ((uint64_t)vg->extent_size * lp->extents + size - 1) / + size * size / vg->extent_size; + if (size != lp->extents) { + log_print_unless_silent("Rounding size (%d extents) up to chunk boundary " + "size (%d extents).", lp->extents, size); + lp->extents = size; + } + } + + thin_pool_was_active = lv_is_active(pool_lv); + if (lv_is_new_thin_pool(pool_lv)) { + if (!check_new_thin_pool(pool_lv)) + return_NULL; + /* New pool is now inactive */ + } else { + if (!activate_lv(cmd, pool_lv)) { + log_error("Aborting. Failed to locally activate thin pool %s.", + display_lvname(pool_lv)); + return NULL; + } + if (!thin_pool_below_threshold(first_seg(pool_lv))) { + log_error("Cannot create new thin volume, free space in " + "thin pool %s reached threshold.", + display_lvname(pool_lv)); + return NULL; + } + } + } + + if (seg_is_cache(lp) && + !wipe_cache_pool(pool_lv)) return_NULL; } - if (vg_is_clustered(vg) && segtype_is_raid(lp->segtype)) { - /* - * FIXME: - * We could allow a RAID LV to be created as long as it - * is activated exclusively. Any subsequent activations - * would have to be enforced as exclusive also. 
- * - * For now, we disallow the existence of RAID LVs in a - * cluster VG - */ - log_error("Unable to create a %s logical volume in a cluster.", - lp->segtype->name); + /* Resolve origin volume */ + if (lp->origin_name && + !(origin_lv = find_lv(vg, lp->origin_name))) { + log_error("Origin volume %s not found in Volume group %s.", + lp->origin_name, vg->name); return NULL; } - if (segtype_is_mirrored(lp->segtype) || segtype_is_raid(lp->segtype)) { + if (origin_lv && seg_is_cache_pool(lp)) { + /* Converting existing origin and creating cache pool */ + if (!validate_lv_cache_create_origin(origin_lv)) + return_NULL; + + if (origin_lv->size < lp->chunk_size) { + log_error("Caching of origin cache volume smaller than chunk size is unsupported."); + return NULL; + } + } else if (seg_is_cache(lp)) { + if (!pool_lv) { + log_error(INTERNAL_ERROR "Pool LV for cache is missing."); + return NULL; + } + if (!lv_is_cache_pool(pool_lv)) { + log_error("Logical volume %s is not a cache pool.", + display_lvname(pool_lv)); + return NULL; + } + /* Create cache origin for cache pool */ + /* FIXME Eventually support raid/mirrors with -m */ + if (!(create_segtype = get_segtype_from_string(vg->cmd, SEG_TYPE_NAME_STRIPED))) + return_0; + + } else if (seg_is_integrity(lp)) { + if (!(create_segtype = get_segtype_from_string(vg->cmd, SEG_TYPE_NAME_STRIPED))) + return_0; + + } else if (seg_is_mirrored(lp) || (seg_is_raid(lp) && !seg_is_any_raid0(lp))) { + if (!(lp->region_size = adjusted_mirror_region_size(vg->cmd, + vg->extent_size, + lp->extents, + lp->region_size, 0, + vg_is_clustered(vg)))) + return_NULL; + + /* FIXME This will not pass cluster lock! */ init_mirror_in_sync(lp->nosync); if (lp->nosync) { - log_warn("WARNING: New %s won't be synchronised. " + log_warn("WARNING: New %s won't be synchronized. " "Don't read what you didn't write!", lp->segtype->name); status |= LV_NOTSYNCED; } + } else if (pool_lv && seg_is_thin_volume(lp)) { + if (!lv_is_thin_pool(pool_lv)) { + log_error("Logical volume %s is not a thin pool.", + display_lvname(pool_lv)); + return NULL; + } + + if (origin_lv) { + if (lv_is_locked(origin_lv)) { + log_error("Snapshots of locked devices are not supported."); + return NULL; + } + + lp->virtual_extents = origin_lv->le_count; + + /* + * Check if using 'external origin' or the 'normal' snapshot + * within the same thin pool + */ + if (first_seg(origin_lv)->pool_lv != pool_lv) { + if (!thin_pool_supports_external_origin(first_seg(pool_lv), origin_lv)) + return_NULL; + if (origin_lv->status & LVM_WRITE) { + log_error("Cannot use writable LV as the external origin."); + return NULL; /* FIXME conversion for inactive */ + } + if (lv_is_active(origin_lv) && !lv_is_external_origin(origin_lv)) { + log_error("Cannot use active LV for the external origin."); + return NULL; /* We can't be sure device is read-only */ + } + } + } + } else if (lp->snapshot) { + if (!lp->virtual_extents) { + if (!origin_lv) { + log_error("Couldn't find origin volume '%s'.", + lp->origin_name); + return NULL; + } + if (lv_is_virtual_origin(origin_lv)) { + log_error("Can't share virtual origins. 
" + "Use --virtualsize."); + return NULL; + } + + if (!validate_snapshot_origin(origin_lv)) + return_0; + } + + if (!cow_has_min_chunks(vg, lp->extents, lp->chunk_size)) + return_NULL; + + /* The snapshot segment gets created later */ + if (!(create_segtype = get_segtype_from_string(cmd, SEG_TYPE_NAME_STRIPED))) + return_NULL; + + /* Must zero cow */ + status |= LVM_WRITE; + lp->zero = 1; + lp->wipe_signatures = 0; + } else if (seg_is_vdo_pool(lp)) { + if (!lp->virtual_extents) + log_verbose("Virtual size matching available free logical size in VDO pool."); + + if (!(create_segtype = get_segtype_from_string(vg->cmd, SEG_TYPE_NAME_STRIPED))) + return_NULL; + + /* Must zero and format data area */ + status |= LVM_WRITE; + lp->zero = 1; + } + + if (!segtype_is_virtual(create_segtype) && !lp->approx_alloc && + (vg->free_count < lp->extents)) { + log_error("Volume group \"%s\" has insufficient free space " + "(%u extents): %u required.", + vg->name, vg->free_count, lp->extents); + return NULL; + } - lp->region_size = adjusted_mirror_region_size(vg->extent_size, - lp->extents, - lp->region_size); + if (pool_lv && segtype_is_thin_volume(create_segtype)) { + /* Ensure all stacked messages are submitted */ + if ((thin_pool_is_active(pool_lv) || is_change_activating(lp->activate)) && + !update_thin_pool_lv(pool_lv, 1)) + return_NULL; } if (!(lv = lv_create_empty(new_lv_name ? : "lvol%d", NULL, @@ -4479,53 +9402,113 @@ static struct logical_volume *_lv_create_an_lv(struct volume_group *vg, struct l return_NULL; if (lp->read_ahead != lv->read_ahead) { - log_verbose("Setting read ahead sectors"); lv->read_ahead = lp->read_ahead; + log_debug_metadata("Setting read ahead sectors %u.", lv->read_ahead); } - if (!seg_is_thin_pool(lp) && lp->minor >= 0) { + if (!segtype_is_pool(create_segtype) && + !segtype_is_vdo_pool(create_segtype) && + lp->minor >= 0) { lv->major = lp->major; lv->minor = lp->minor; lv->status |= FIXED_MINOR; - log_verbose("Setting device number to (%d, %d)", lv->major, - lv->minor); + log_debug_metadata("Setting device number to (%d, %d).", + lv->major, lv->minor); } + /* + * The specific LV may not use a lock. lockd_init_lv() sets + * lv->lock_args to NULL if this LV does not use its own lock. + */ + + if (!lockd_init_lv(vg->cmd, vg, lv, lp)) + return_NULL; + dm_list_splice(&lv->tags, &lp->tags); - if (!lv_extend(lv, lp->segtype, + if (!lv_extend(lv, create_segtype, lp->stripes, lp->stripe_size, lp->mirrors, - seg_is_thin_pool(lp) ? lp->poolmetadataextents : lp->region_size, - seg_is_thin_volume(lp) ? lp->voriginextents : lp->extents, - seg_is_thin_volume(lp) ? (org ? org->name : lp->pool) : NULL, lp->pvh, lp->alloc)) + segtype_is_pool(create_segtype) ? lp->pool_metadata_extents : lp->region_size, + (segtype_is_thin_volume(create_segtype) || + segtype_is_vdo(create_segtype)) ? lp->virtual_extents : lp->extents, + lp->pvh, lp->alloc, lp->approx_alloc)) { + unlink_lv_from_vg(lv); /* Keep VG consistent and remove LV without any segment */ return_NULL; + } - if (seg_is_thin_pool(lp)) { - first_seg(lv)->zero_new_blocks = lp->zero ? 1 : 0; - first_seg(lv)->chunk_size = lp->chunk_size; - first_seg(lv)->discards = lp->discards; - /* FIXME: use lowwatermark via lvm.conf global for all thinpools ? 
*/ - first_seg(lv)->low_water_mark = 0; - } else if (seg_is_thin_volume(lp)) { - pool_lv = first_seg(lv)->pool_lv; + /* rhbz1269533: allow for 100%FREE allocation to work with "mirror" and a disk log */ + if (segtype_is_mirror(create_segtype) && + lp->log_count && + !vg->free_count && + lv->le_count > 1) + lv_reduce(lv, 1); + + /* Unlock memory if possible */ + memlock_unlock(vg->cmd); - if (!(first_seg(lv)->device_id = - get_free_pool_device_id(first_seg(pool_lv)))) { + if (pool_lv && segtype_is_vdo(create_segtype)) + if (!set_lv_segment_area_lv(first_seg(lv), 0, pool_lv, 0, LV_VDO_POOL)) + return_NULL; + + if (lv_is_cache_pool(lv)) { + if (!cache_set_params(first_seg(lv), + lp->chunk_size, + lp->cache_metadata_format, + lp->cache_mode, + lp->policy_name, + lp->policy_settings)) { stack; goto revert_new_lv; } - - if (!attach_pool_message(first_seg(pool_lv), - DM_THIN_MESSAGE_CREATE_THIN, lv, 0, 0)) { + } else if (lv_is_raid(lv) && !seg_is_any_raid0(first_seg(lv))) { + first_seg(lv)->min_recovery_rate = lp->min_recovery_rate; + first_seg(lv)->max_recovery_rate = lp->max_recovery_rate; + } else if (lv_is_thin_pool(lv)) { + first_seg(lv)->chunk_size = lp->chunk_size; + first_seg(lv)->zero_new_blocks = lp->zero_new_blocks; + first_seg(lv)->discards = lp->discards; + if ((first_seg(lv)->crop_metadata = lp->crop_metadata) == THIN_CROP_METADATA_NO) + lv->status |= LV_CROP_METADATA; + if (!recalculate_pool_chunk_size_with_dev_hints(lv, seg_lv(first_seg(lv), 0), + lp->thin_chunk_size_calc_policy)) { stack; goto revert_new_lv; } + if (lp->error_when_full) + lv->status |= LV_ERROR_WHEN_FULL; + } else if (pool_lv && lv_is_virtual(lv) && /* not yet thin LV */ + (seg = first_seg(lv)) && + seg_is_thin(seg)) { /* going to be a thin volume */ + pool_seg = first_seg(pool_lv); + if (!(seg->device_id = get_free_thin_pool_device_id(pool_seg))) + return_NULL; + seg->transaction_id = pool_seg->transaction_id; + if (origin_lv && lv_is_thin_volume(origin_lv) && + (first_seg(origin_lv)->pool_lv == pool_lv)) { + /* For thin snapshot pool must match */ + if (!attach_pool_lv(seg, pool_lv, origin_lv, NULL, NULL)) + return_NULL; + /* Use the same external origin */ + if (!attach_thin_external_origin(seg, first_seg(origin_lv)->external_lv)) + return_NULL; + } else { + if (!attach_pool_lv(seg, pool_lv, NULL, NULL, NULL)) + return_NULL; + /* If there is an external origin... */ + if (!attach_thin_external_origin(seg, origin_lv)) + return_NULL; + } + + if (!attach_thin_pool_message(pool_seg, DM_THIN_MESSAGE_CREATE_THIN, lv, 0, 0)) + return_NULL; } + if (!thin_pool_check_overprovisioning(lv)) + return_NULL; + /* FIXME Log allocation and attachment should have happened inside lv_extend. */ - if (lp->log_count && - !seg_is_raid(first_seg(lv)) && seg_is_mirrored(first_seg(lv))) { + if (lp->log_count && segtype_is_mirror(create_segtype)) { if (!add_mirror_log(cmd, lv, lp->log_count, first_seg(lv)->region_size, lp->pvh, lp->alloc)) { @@ -4534,11 +9517,11 @@ static struct logical_volume *_lv_create_an_lv(struct volume_group *vg, struct l } } - /* store vg on disk(s) */ - if (!vg_write(vg) || !vg_commit(vg)) - return_NULL; + lv_set_activation_skip(lv, lp->activation_skip & ACTIVATION_SKIP_SET, + lp->activation_skip & ACTIVATION_SKIP_SET_ENABLED); - backup(vg); + if (lp->noautoactivate) + lv->status |= LV_NOAUTOACTIVATE; /* * Check for autoactivation. @@ -4546,184 +9529,407 @@ static struct logical_volume *_lv_create_an_lv(struct volume_group *vg, struct l * it just as if CHANGE_AY was used, CHANGE_AN otherwise. 
*/ if (lp->activate == CHANGE_AAY) - lp->activate = lv_passes_auto_activation_filter(cmd, lv) ? - CHANGE_ALY : CHANGE_ALN; + lp->activate = lv_passes_auto_activation_filter(cmd, lv) + ? CHANGE_ALY : CHANGE_ALN; + + if (lv_activation_skip(lv, lp->activate, lp->activation_skip & ACTIVATION_SKIP_IGNORE)) + lp->activate = CHANGE_AN; + + /* store vg on disk(s) */ + if (!vg_write(vg) || !vg_commit(vg)) + /* Pool created metadata LV, but better avoid recover when vg_write/commit fails */ + return_NULL; if (test_mode()) { - log_verbose("Test mode: Skipping activation and zeroing."); + log_verbose("Test mode: Skipping activation, zeroing and signature wiping."); goto out; } - if (seg_is_thin(lp)) { + if (seg_is_raid(lp) && lp->raidintegrity) { + log_debug("Adding integrity to new LV"); + + if (!lv_add_integrity_to_raid(lv, &lp->integrity_settings, lp->pvh, NULL)) + goto revert_new_lv; + } + + /* Do not scan this LV until properly zeroed/wiped. */ + if (_should_wipe_lv(lp, lv, 0)) + lv->status |= LV_NOSCAN; + + if (lp->temporary) + lv->status |= LV_TEMPORARY; + + if (seg_is_cache(lp)) { + if (vg_is_shared(vg)) { + if (is_change_activating(lp->activate)) { + if (!lv_active_change(cmd, lv, CHANGE_AEY)) { + log_error("Aborting. Failed to activate LV %s.", + display_lvname(lv)); + goto revert_new_lv; + } + } + } + + /* FIXME Support remote exclusive activation? */ + /* Not yet 'cache' LV, it is stripe volume for wiping */ + + else if (is_change_activating(lp->activate) && !activate_lv(cmd, lv)) { + log_error("Aborting. Failed to activate LV %s locally exclusively.", + display_lvname(lv)); + goto revert_new_lv; + } + } else if (lv_is_cache_pool(lv)) { + /* Cache pool cannot be actived and zeroed */ + log_very_verbose("Cache pool is prepared."); + } else if (lv_is_thin_volume(lv)) { /* For snapshot, suspend active thin origin first */ - if (org && lv_is_active(org)) { - if (!pool_below_threshold(first_seg(first_seg(org)->pool_lv))) { - log_error("Cannot create thin snapshot. Pool %s/%s is filled " - "over the autoextend threshold.", - org->vg->name, first_seg(org)->pool_lv->name); - goto revert_new_lv; + if (origin_lv && lv_is_thin_volume(origin_lv) && lv_is_active(origin_lv)) { + if (!(ret = suspend_lv_origin(cmd, origin_lv))) { + log_error("Failed to suspend thin snapshot origin %s.", + display_lvname(origin_lv)); } - if (!suspend_lv_origin(cmd, org)) { - log_error("Failed to suspend thin snapshot origin %s/%s.", - org->vg->name, org->name); - goto revert_new_lv; + /* Note: always proceed with resume_lv() to leave critical_section */ + if (!resume_lv_origin(cmd, origin_lv)) { /* deptree updates thin-pool */ + log_error("Failed to resume thin snapshot origin %s.", + display_lvname(origin_lv)); + if (ret) + /* suspend with message was OK, only resume failed */ + goto revert_new_lv; /* hard to fix things here */ } - if (!resume_lv_origin(cmd, org)) { /* deptree updates thin-pool */ - log_error("Failed to resume thin snapshot origin %s/%s.", - org->vg->name, org->name); + if (!ret) { + /* Pool transaction_id has been incremented for this canceled transaction + * and needs to be restored to the state from this canceled segment. + * TODO: there is low chance actual suspend has failed + */ + struct lv_status_thin_pool *tpstatus; + if (!lv_thin_pool_status(pool_lv, 1, &tpstatus)) + log_error("Aborting. Failed to read transaction_id from thin pool %s.", + display_lvname(pool_lv)); /* Can't even get thin pool transaction id ??? 
*/ + else { + transaction_id = tpstatus->thin_pool->transaction_id; + dm_pool_destroy(tpstatus->mem); + + if ((transaction_id != first_seg(pool_lv)->transaction_id) && + (transaction_id != seg->transaction_id)) + log_warn("WARNING: Metadata for thin pool %s have transaction_id " FMTu64 + ", but active pool has " FMTu64 ".", + display_lvname(pool_lv), seg->transaction_id, transaction_id); + log_debug_metadata("Restoring previous transaction_id " FMTu64 " for thin pool %s.", + seg->transaction_id, display_lvname(pool_lv)); + first_seg(pool_lv)->transaction_id = seg->transaction_id; + first_seg(lv)->device_id = 0; /* no delete of never existing thin device */ + } goto revert_new_lv; } /* At this point remove pool messages, snapshot is active */ - if (!update_pool_lv(first_seg(org)->pool_lv, 0)) { + if (!update_thin_pool_lv(pool_lv, 0)) { stack; - goto deactivate_and_revert_new_lv; + goto revert_new_lv; } - } - if (((lp->activate == CHANGE_AY) || - (lp->activate == CHANGE_AE) || - (lp->activate == CHANGE_ALY))) { - /* At this point send message to kernel thin mda */ - pool_lv = lv_is_thin_pool(lv) ? lv : first_seg(lv)->pool_lv; - if (!update_pool_lv(pool_lv, 1)) { - stack; - goto deactivate_and_revert_new_lv; + } else if (!dm_list_empty(&first_seg(pool_lv)->thin_messages)) { + /* Send message so that table preload knows new thin */ + if (!lv_is_active(pool_lv)) { + /* Avoid multiple thin-pool activations in this case */ + if (thin_pool_was_active < 0) + thin_pool_was_active = 0; + if (!activate_lv(cmd, pool_lv)) { + log_error("Failed to activate thin pool %s.", + display_lvname(pool_lv)); + goto revert_new_lv; + } + if (!lv_is_active(pool_lv)) { + log_error("Cannot activate thin pool %s, perhaps skipped in lvm.conf volume_list?", + display_lvname(pool_lv)); + return 0; + } } - if (!activate_lv_excl(cmd, lv)) { - log_error("Aborting. Failed to activate thin %s.", - lv->name); - goto deactivate_and_revert_new_lv; + /* Keep thin pool active until thin volume is activated */ + if (!update_thin_pool_lv(pool_lv, 1)) { + stack; + goto revert_new_lv; } } + + if (!lv_active_change(cmd, lv, lp->activate)) { + log_error("Failed to activate thin %s.", lv->name); + goto deactivate_and_revert_new_lv; + } + + /* Restore inactive state if needed */ + if (!thin_pool_was_active && + !deactivate_lv(cmd, pool_lv)) { + log_error("Failed to deactivate thin pool %s.", + display_lvname(pool_lv)); + return NULL; + } } else if (lp->snapshot) { - if (!activate_lv_excl(cmd, lv)) { + lv->status |= LV_TEMPORARY; + if (!activate_lv(cmd, lv)) { log_error("Aborting. Failed to activate snapshot " "exception store."); goto revert_new_lv; } - } else if ((lp->activate == CHANGE_AY && !activate_lv(cmd, lv)) || - (lp->activate == CHANGE_AE && !activate_lv_excl(cmd, lv)) || - (lp->activate == CHANGE_ALY && !activate_lv_local(cmd, lv))) { - log_error("Failed to activate new LV."); - if (lp->zero) - goto deactivate_and_revert_new_lv; - return NULL; - } - - if (!seg_is_thin(lp) && !lp->zero && !lp->snapshot) - log_warn("WARNING: \"%s\" not zeroed", lv->name); - else if ((!seg_is_thin(lp) || - (lv_is_thin_volume(lv) && - !first_seg(first_seg(lv)->pool_lv)->zero_new_blocks)) && - !set_lv(cmd, lv, UINT64_C(0), 0)) { - log_error("Aborting. Failed to wipe %s.", - lp->snapshot ? "snapshot exception store" : - "start of new LV"); + lv->status &= ~LV_TEMPORARY; + } else if (seg_is_vdo_pool(lp)) { + lv->status |= LV_TEMPORARY; + if (!activate_lv(cmd, lv)) { + log_error("Aborting. 
Failed to activate temporary " + "volume for VDO pool creation."); + goto revert_new_lv; + } + lv->status &= ~LV_TEMPORARY; + } else if (!lv_active_change(cmd, lv, lp->activate)) { + log_error("Failed to activate new LV %s.", display_lvname(lv)); goto deactivate_and_revert_new_lv; } - if (lp->snapshot && !lp->thin) { - /* Reset permission after zeroing */ - if (!(lp->permission & LVM_WRITE)) - lv->status &= ~LVM_WRITE; - - /* COW area must be deactivated if origin is not active */ - if (!origin_active && !deactivate_lv(cmd, lv)) { - log_error("Aborting. Couldn't deactivate snapshot " - "COW area. Manual intervention required."); - return NULL; + if (_should_wipe_lv(lp, lv, !lp->suppress_zero_warn)) { + if (!wipe_lv(lv, (struct wipe_params) + { + .do_zero = lp->zero, + .do_wipe_signatures = lp->wipe_signatures, + .yes = lp->yes, + .force = lp->force, + .is_metadata = lp->is_metadata, + })) { + log_error("Aborting. Failed to wipe %s.", lp->snapshot + ? "snapshot exception store" : "start of new LV"); + goto deactivate_and_revert_new_lv; } + } - /* A virtual origin must be activated explicitly. */ - if (lp->voriginsize && - (!(org = _create_virtual_origin(cmd, vg, lv->name, - lp->permission, - lp->voriginextents)) || - !activate_lv_excl(cmd, org))) { - log_error("Couldn't create virtual origin for LV %s", - lv->name); - if (org && !lv_remove(org)) - stack; + if (seg_is_vdo_pool(lp)) { + if (!convert_vdo_pool_lv(lv, &lp->vdo_params, &lp->virtual_extents, + 1, lp->vdo_pool_header_size)) { + stack; goto deactivate_and_revert_new_lv; } + if ((lv->status & LV_ACTIVATION_SKIP) && + !deactivate_lv(cmd, lv)) { + log_error("Aborting. Couldn't deactivate VDO LV %s with skipped activation.", + display_lvname(lv)); + return NULL; /* Let's retry on error path */ + } + } else if (seg_is_cache(lp) || (origin_lv && lv_is_cache_pool(lv))) { + /* Finish cache conversion magic */ + if (origin_lv) { + /* Convert origin to cached LV */ + if (!(tmp_lv = lv_cache_create(lv, origin_lv))) { + /* FIXME Do a better revert */ + log_error("Aborting. Leaving cache pool %s and uncached origin volume %s.", + display_lvname(lv), display_lvname(origin_lv)); + return NULL; + } + } else { + if (!(tmp_lv = lv_cache_create(pool_lv, lv))) { + /* 'lv' still keeps created new LV */ + stack; + goto deactivate_and_revert_new_lv; + } + } + lv = tmp_lv; - /* cow LV remains active and becomes snapshot LV */ + if (!cache_set_params(first_seg(lv), + lp->chunk_size, + lp->cache_metadata_format, + lp->cache_mode, + lp->policy_name, + lp->policy_settings)) + return_NULL; /* revert? */ + + if (!lv_update_and_reload(lv)) { + char name[NAME_LEN]; + + log_debug("Reverting created caching layer."); + + tmp_lv = seg_lv(first_seg(lv), 0); /* tmp corigin */ + pool_lv = first_seg(lv)->pool_lv; + + if (!detach_pool_lv(first_seg(lv))) + return_NULL; + if (!remove_layer_from_lv(lv, tmp_lv)) + return_NULL; + if (!lv_remove(tmp_lv)) + return_NULL; + + /* Either we need to preserve existing LV and remove created cache pool LV. + Or we need to preserve existing cache pool LV and remove created new LV. 
*/ + if (origin_lv) + lv = pool_lv; // created cache pool to be reverted as new LV + else { + /* Cut off suffix _cpool from preserved existing cache pool */ + if (!drop_lvname_suffix(name, pool_lv->name, "cpool")) { + /* likely older instance of metadata */ + log_debug("LV %s has no suffix for cachepool (skipping rename).", + display_lvname(pool_lv)); + } else if (!lv_uniq_rename_update(cmd, pool_lv, name, 0)) + return_NULL; + } - if (!vg_add_snapshot(org, lv, NULL, - org->le_count, lp->chunk_size)) { - log_error("Couldn't create snapshot."); goto deactivate_and_revert_new_lv; } + } else if (lp->snapshot) { + /* Deactivate zeroed COW, avoid any race usage */ + if (!deactivate_lv(cmd, lv)) { + log_error("Aborting. Couldn't deactivate snapshot COW area %s.", + display_lvname(lv)); + goto deactivate_and_revert_new_lv; /* Let's retry on error path */ + } - /* store vg on disk(s) */ - if (!vg_write(vg)) - return_NULL; + /* Get in sync with deactivation, before reusing LV as snapshot */ + if (!sync_local_dev_names(lv->vg->cmd)) { + log_error("Failed to sync local devices before creating snapshot using %s.", + display_lvname(lv)); + goto revert_new_lv; + } - if (!suspend_lv(cmd, org)) { - log_error("Failed to suspend origin %s", org->name); - vg_revert(vg); - return NULL; + /* Create zero origin volume for spare snapshot */ + if (lp->virtual_extents && + !(origin_lv = _create_virtual_origin(cmd, vg, lv->name, + (lp->permission & ~LVM_WRITE), + lp->virtual_extents))) + goto revert_new_lv; + + /* Reset permission after zeroing */ + if (!(lp->permission & LVM_WRITE)) + lv->status &= ~LVM_WRITE; + + /* + * COW LV is activated via implicit activation of origin LV + * Only the snapshot origin holds the LV lock in cluster + */ + if (!origin_lv || + !vg_add_snapshot(origin_lv, lv, NULL, + origin_lv->le_count, lp->chunk_size)) { + log_error("Couldn't create snapshot."); + goto deactivate_and_revert_new_lv; } - if (!vg_commit(vg)) - return_NULL; + if (lp->virtual_extents) { + /* Store vg on disk(s) */ + if (!vg_write(vg) || !vg_commit(vg)) + return_NULL; /* Metadata update fails, deep troubles */ - if (!resume_lv(cmd, org)) { - log_error("Problem reactivating origin %s", org->name); - return NULL; + /* + * FIXME We do not actually need snapshot-origin as an active device, + * as virtual origin is already 'hidden' private device without + * vg/lv links. As such it is not supposed to be used by any user. + * Also it would save one dm table entry, but it needs quite a few + * changes in the libdm/lvm2 code base to support it. + */ + + /* Activate spare snapshot once it is a complete LV */ + if (!lv_active_change(cmd, origin_lv, lp->activate)) { + log_error("Failed to activate sparce volume %s.", + display_lvname(origin_lv)); + return NULL; + } + } else if (!lv_update_and_reload(origin_lv)) { + log_error("Aborting. Manual intervention required."); + return NULL; /* FIXME: revert */ } } - /* FIXME out of sequence */ - backup(vg); - out: return lv; deactivate_and_revert_new_lv: + if (!sync_local_dev_names(lv->vg->cmd)) + log_error("Failed to sync local devices before reverting %s.", + display_lvname(lv)); if (!deactivate_lv(cmd, lv)) { - log_error("Unable to deactivate failed new LV. " - "Manual intervention required."); + log_error("Unable to deactivate failed new LV %s. 
" + "Manual intervention required.", display_lvname(lv)); return NULL; } revert_new_lv: + if (!lockd_lv(cmd, lv, "un", LDLV_PERSISTENT)) + log_warn("WARNING: Failed to unlock %s.", display_lvname(lv)); + lockd_free_lv(vg->cmd, vg, lv->name, &lv->lvid.id[1], lv->lock_args); + /* FIXME Better to revert to backup of metadata? */ - if (!lv_remove(lv) || !vg_write(vg) || !vg_commit(vg)) + /* Do not remove anything for create during conversion operation */ + if (!strncmp(cmd->name, "lvconvert", 9) || + !lv_remove(lv) || !vg_write(vg) || !vg_commit(vg)) log_error("Manual intervention may be required to remove " "abandoned LV(s) before retrying."); - else - backup(vg); return NULL; } -int lv_create_single(struct volume_group *vg, - struct lvcreate_params *lp) +struct logical_volume *lv_create_single(struct volume_group *vg, + struct lvcreate_params *lp) { + const struct segment_type *segtype; struct logical_volume *lv; - /* Create thin pool first if necessary */ - if (lp->create_thin_pool) { - if (!seg_is_thin_pool(lp) && - !(lp->segtype = get_segtype_from_string(vg->cmd, "thin-pool"))) - return_0; + /* Create pool first if necessary */ + if (lp->create_pool && !seg_is_pool(lp)) { + segtype = lp->segtype; + if (seg_is_thin_volume(lp)) { + if (!(lp->segtype = get_segtype_from_string(vg->cmd, SEG_TYPE_NAME_THIN_POOL))) + return_NULL; - if (!(lv = _lv_create_an_lv(vg, lp, lp->pool))) - return_0; + /* We want a lockd lock for the new thin pool, but not the thin lv. */ + lp->needs_lockd_init = 1; - if (!lp->thin) - goto out; + if (!(lv = _lv_create_an_lv(vg, lp, lp->pool_name))) + return_NULL; - lp->pool = lv->name; + lp->needs_lockd_init = 0; - if (!(lp->segtype = get_segtype_from_string(vg->cmd, "thin"))) - return_0; + } else if (seg_is_cache(lp)) { + if (!lp->origin_name) { + /* Until we have --pooldatasize we are lost */ + log_error(INTERNAL_ERROR "Unsupported creation of cache and cache pool volume."); + return NULL; + } + /* origin_name is defined -> creates cache LV with new cache pool */ + if (!(lp->segtype = get_segtype_from_string(vg->cmd, SEG_TYPE_NAME_CACHE_POOL))) + return_NULL; + + if (!(lv = _lv_create_an_lv(vg, lp, lp->pool_name))) + return_NULL; + + if (!lv_is_cache(lv)) { + log_error(INTERNAL_ERROR "Logical volume is not cache %s.", + display_lvname(lv)); + return NULL; + } + + /* Convertion via lvcreate */ + log_print_unless_silent("Logical volume %s is now cached.", + display_lvname(lv)); + return lv; + } else if (seg_is_vdo(lp)) { + /* The VDO segment needs VDO pool which is layer above created striped data LV */ + if (!(lp->segtype = get_segtype_from_string(vg->cmd, SEG_TYPE_NAME_VDO_POOL))) + return_NULL; + + /* We want a lockd lock for the new vdo pool, but not the vdo lv. */ + lp->needs_lockd_init = 1; + + /* Use vpool names for vdo-pool */ + if (!(lv = _lv_create_an_lv(vg, lp, lp->pool_name ? : "vpool%d"))) + return_NULL; + + lp->needs_lockd_init = 0; + } else { + log_error(INTERNAL_ERROR "Creation of pool for unsupported segment type %s.", + lp->segtype->name); + return NULL; + } + lp->pool_name = lv->name; + lp->segtype = segtype; } if (!(lv = _lv_create_an_lv(vg, lp, lp->lv_name))) - return_0; + return_NULL; -out: - log_print_unless_silent("Logical volume \"%s\" created", lv->name); + if (lp->temporary) + log_verbose("Temporary logical volume \"%s\" created.", lv->name); + else + log_print_unless_silent("Logical volume \"%s\" created.", lv->name); - return 1; + return lv; } |