diff options
Diffstat (limited to 'src/H5ACpkg.h')
-rw-r--r-- | src/H5ACpkg.h | 506 |
1 files changed, 506 insertions, 0 deletions
diff --git a/src/H5ACpkg.h b/src/H5ACpkg.h new file mode 100644 index 0000000..ea7f0bf --- /dev/null +++ b/src/H5ACpkg.h @@ -0,0 +1,506 @@ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * + * Copyright by The HDF Group. * + * Copyright by the Board of Trustees of the University of Illinois. * + * All rights reserved. * + * * + * This file is part of HDF5. The full HDF5 copyright notice, including * + * terms governing use, modification, and redistribution, is contained in * + * the COPYING file, which can be found at the root of the source code * + * distribution tree, or in https://support.hdfgroup.org/ftp/HDF5/releases. * + * If you do not have access to either file, you may request a copy from * + * help@hdfgroup.org. * + * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +/* + * Programmer: John Mainzer -- 4/19/06 + * + * Purpose: This file contains declarations which are normally visible + * only within the H5AC package (just H5AC.c at present). + * + * Source files outside the H5AC package should include + * H5ACprivate.h instead. + * + * The one exception to this rule is testpar/t_cache.c. The + * test code is easier to write if it can look at H5AC_aux_t. + * Indeed, this is the main reason why this file was created. + * + */ + +#if !(defined H5AC_FRIEND || defined H5AC_MODULE) +#error "Do not include this file outside the H5AC package!" +#endif + +#ifndef _H5ACpkg_H +#define _H5ACpkg_H + +/* Get package's private header */ +#include "H5ACprivate.h" /* Metadata cache */ + + +/* Get needed headers */ +#include "H5Cprivate.h" /* Cache */ +#include "H5FLprivate.h" /* Free Lists */ + +/*****************************/ +/* Package Private Variables */ +/*****************************/ + +/* Declare extern the free list to manage the H5AC_aux_t struct */ +H5FL_EXTERN(H5AC_aux_t); + + +/**************************/ +/* Package Private Macros */ +/**************************/ + +#define H5AC_DEBUG_DIRTY_BYTES_CREATION 0 + +#ifdef H5_HAVE_PARALLEL + +/* the following #defined are used to specify the operation required + * at a sync point. + */ + +#define H5AC_SYNC_POINT_OP__FLUSH_TO_MIN_CLEAN 0 +#define H5AC_SYNC_POINT_OP__FLUSH_CACHE 1 + +#endif /* H5_HAVE_PARALLEL */ + +/*------------------------------------------------------------------------- + * It is a bit difficult to set ranges of allowable values on the + * dirty_bytes_threshold field of H5AC_aux_t. The following are + * probably broader than they should be. + *------------------------------------------------------------------------- + */ + +#define H5AC__MIN_DIRTY_BYTES_THRESHOLD (size_t) \ + (H5C__MIN_MAX_CACHE_SIZE / 2) +#define H5AC__DEFAULT_DIRTY_BYTES_THRESHOLD (256 * 1024) +#define H5AC__MAX_DIRTY_BYTES_THRESHOLD (size_t) \ + (H5C__MAX_MAX_CACHE_SIZE / 4) + + +/**************************************************************************** + * + * structure H5AC_aux_t + * + * While H5AC has become a wrapper for the cache implemented in H5C.c, there + * are some features of the metadata cache that are specific to it, and which + * therefore do not belong in the more generic H5C cache code. + * + * In particular, there is the matter of synchronizing writes from the + * metadata cache to disk in the PHDF5 case. + * + * Prior to this update, the presumption was that all metadata caches would + * write the same data at the same time since all operations modifying + * metadata must be performed collectively. Given this assumption, it was + * safe to allow only the writes from process 0 to actually make it to disk, + * while metadata writes from all other processes were discarded. + * + * Unfortunately, this presumption is in error as operations that read + * metadata need not be collective, but can change the location of dirty + * entries in the metadata cache LRU lists. This can result in the same + * metadata write operation triggering writes from the metadata caches on + * some processes, but not all (causing a hang), or in different sets of + * entries being written from different caches (potentially resulting in + * metadata corruption in the file). + * + * To deal with this issue, I decided to apply a paradigm shift to the way + * metadata is written to disk. + * + * With this set of changes, only the metadata cache on process 0 is able + * to write metadata to disk, although metadata caches on all other + * processes can read metadata from disk as before. + * + * To keep all the other caches from getting plugged up with dirty metadata, + * process 0 periodically broadcasts a list of entries that it has flushed + * since that last notice, and which are currently clean. The other caches + * mark these entries as clean as well, which allows them to evict the + * entries as needed. + * + * One obvious problem in this approach is synchronizing the broadcasts + * and receptions, as different caches may see different amounts of + * activity. + * + * The current solution is for the caches to track the number of bytes + * of newly generated dirty metadata, and to broadcast and receive + * whenever this value exceeds some user specified threshold. + * + * Maintaining this count is easy for all processes not on process 0 -- + * all that is necessary is to add the size of the entry to the total + * whenever there is an insertion, a move of a previously clean entry, + * or whever a previously clean entry is marked dirty in an unprotect. + * + * On process 0, we have to be careful not to count dirty bytes twice. + * If an entry is marked dirty, flushed, and marked dirty again, all + * within a single reporting period, it only th first marking should + * be added to the dirty bytes generated tally, as that is all that + * the other processes will see. + * + * At present, this structure exists to maintain the fields needed to + * implement the above scheme, and thus is only used in the parallel + * case. However, other uses may arise in the future. + * + * Instance of this structure are associated with metadata caches via + * the aux_ptr field of H5C_t (see H5Cpkg.h). The H5AC code is + * responsible for allocating, maintaining, and discarding instances + * of H5AC_aux_t. + * + * The remainder of this header comments documents the individual fields + * of the structure. + * + * JRM - 6/27/05 + * + * Update: When the above was written, I planned to allow the process + * 0 metadata cache to write dirty metadata between sync points. + * However, testing indicated that this allowed occasional + * messages from the future to reach the caches on other processes. + * + * To resolve this, the code was altered to require that all metadata + * writes take place during sync points -- which solved the problem. + * Initially all writes were performed by the process 0 cache. This + * approach was later replaced with a distributed write approach + * in which each process writes a subset of the metadata to be + * written. + * + * After thinking on the matter for a while, I arrived at the + * conclusion that the process 0 cache could be allowed to write + * dirty metadata between sync points if it restricted itself to + * entries that had been dirty at the time of the previous sync point. + * + * To date, there has been no attempt to implement this optimization. + * However, should it be attempted, much of the supporting code + * should still be around. + * + * JRM -- 1/6/15 + * + * magic: Unsigned 32 bit integer always set to + * H5AC__H5AC_AUX_T_MAGIC. This field is used to validate + * pointers to instances of H5AC_aux_t. + * + * mpi_comm: MPI communicator associated with the file for which the + * cache has been created. + * + * mpi_rank: MPI rank of this process within mpi_comm. + * + * mpi_size: Number of processes in mpi_comm. + * + * write_permitted: Boolean flag used to control whether the cache + * is permitted to write to file. + * + * dirty_bytes_threshold: Integer field containing the dirty bytes + * generation threashold. Whenever dirty byte creation + * exceeds this value, the metadata cache on process 0 + * broadcasts a list of the entries it has flushed since + * the last broadcast (or since the beginning of execution) + * and which are currently clean (if they are still in the + * cache) + * + * Similarly, metadata caches on processes other than process + * 0 will attempt to receive a list of clean entries whenever + * the threshold is exceeded. + * + * dirty_bytes: Integer field containing the number of bytes of dirty + * metadata generated since the beginning of the computation, + * or (more typically) since the last clean entries list + * broadcast. This field is reset to zero after each such + * broadcast. + * + * metadata_write_strategy: Integer code indicating how we will be + * writing the metadata. In the first incarnation of + * this code, all writes were done from process 0. This + * field exists to facilitate experiments with other + * strategies. + * + * At present, this field must be set to either + * H5AC_METADATA_WRITE_STRATEGY__PROCESS_0_ONLY or + * H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED. + * + * dirty_bytes_propagations: This field only exists when the + * H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE. + * + * It is used to track the number of times the cleaned list + * has been propagated from process 0 to the other + * processes. + * + * unprotect_dirty_bytes: This field only exists when the + * H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE. + * + * It is used to track the number of dirty bytes created + * via unprotect operations since the last time the cleaned + * list was propagated. + * + * unprotect_dirty_bytes_updates: This field only exists when the + * H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE. + * + * It is used to track the number of times dirty bytes have + * been created via unprotect operations since the last time + * the cleaned list was propagated. + * + * insert_dirty_bytes: This field only exists when the + * H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE. + * + * It is used to track the number of dirty bytes created + * via insert operations since the last time the cleaned + * list was propagated. + * + * insert_dirty_bytes_updates: This field only exists when the + * H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE. + * + * It is used to track the number of times dirty bytes have + * been created via insert operations since the last time + * the cleaned list was propagated. + * + * move_dirty_bytes: This field only exists when the + * H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE. + * + * It is used to track the number of dirty bytes created + * via move operations since the last time the cleaned + * list was propagated. + * + * move_dirty_bytes_updates: This field only exists when the + * H5AC_DEBUG_DIRTY_BYTES_CREATION #define is TRUE. + * + * It is used to track the number of times dirty bytes have + * been created via move operations since the last time + * the cleaned list was propagated. + * + * Things have changed a bit since the following four fields were defined. + * If metadata_write_strategy is H5AC_METADATA_WRITE_STRATEGY__PROCESS_0_ONLY, + * all comments hold as before -- with the caviate that pending further + * coding, the process 0 metadata cache is forbidden to flush entries outside + * of a sync point. + * + * However, for different metadata write strategies, these fields are used + * only to maintain the correct dirty byte count on process zero -- and in + * most if not all cases, this is redundant, as process zero will be barred + * from flushing entries outside of a sync point. + * + * JRM -- 3/16/10 + * + * d_slist_ptr: Pointer to an instance of H5SL_t used to maintain a list + * of entries that have been dirtied since the last time they + * were listed in a clean entries broadcast. This list is + * only maintained by the metadata cache on process 0 -- it + * it used to maintain a view of the dirty entries as seen + * by the other caches, so as to keep the dirty bytes count + * in synchronization with them. + * + * Thus on process 0, the dirty_bytes count is incremented + * only if either + * + * 1) an entry is inserted in the metadata cache, or + * + * 2) a previously clean entry is moved, and it does not + * already appear in the dirty entry list, or + * + * 3) a previously clean entry is unprotected with the + * dirtied flag set and the entry does not already appear + * in the dirty entry list. + * + * Entries are added to the dirty entry list whever they cause + * the dirty bytes count to be increased. They are removed + * when they appear in a clean entries broadcast. Note that + * moves must be reflected in the dirty entry list. + * + * To reitterate, this field is only used on process 0 -- it + * should be NULL on all other processes. + * + * c_slist_ptr: Pointer to an instance of H5SL_t used to maintain a list + * of entries that were dirty, have been flushed + * to disk since the last clean entries broadcast, and are + * still clean. Since only process 0 can write to disk, this + * list only exists on process 0. + * + * In essence, this slist is used to assemble the contents of + * the next clean entries broadcast. The list emptied after + * each broadcast. + * + * The following two fields are used only when metadata_write_strategy + * is H5AC_METADATA_WRITE_STRATEGY__DISTRIBUTED. + * + * candidate_slist_ptr: Pointer to an instance of H5SL_t used by process 0 + * to construct a list of entries to be flushed at this sync + * point. This list is then broadcast to the other processes, + * which then either flush or mark clean all entries on it. + * + * write_done: In the parallel test bed, it is necessary to ensure that + * all writes to the server process from cache 0 complete + * before it enters the barrier call with the other caches. + * + * The write_done callback allows t_cache to do this without + * requiring an ACK on each write. Since these ACKs greatly + * increase the run time on some platforms, this is a + * significant optimization. + * + * This field must be set to NULL when the callback is not + * needed. + * + * Note: This field has been extended for use by all processes + * with the addition of support for the distributed + * metadata write strategy. + * JRM -- 5/9/10 + * + * sync_point_done: In the parallel test bed, it is necessary to verify + * that the expected writes, and only the expected writes, + * have taken place at the end of each sync point. + * + * The sync_point_done callback allows t_cache to perform + * this verification. The field is set to NULL when the + * callback is not needed. + * + * The following field supports the metadata cache image feature. + * + * p0_image_len: unsiged integer containing the length of the metadata cache + * image constructed by MPI process 0. This field should be 0 + * if the value is unknown, or if cache image is not enabled. + * + ****************************************************************************/ + +#ifdef H5_HAVE_PARALLEL + +#define H5AC__H5AC_AUX_T_MAGIC (unsigned)0x00D0A01 + +typedef struct H5AC_aux_t +{ + uint32_t magic; + + MPI_Comm mpi_comm; + + int mpi_rank; + + int mpi_size; + + hbool_t write_permitted; + + size_t dirty_bytes_threshold; + + size_t dirty_bytes; + + int32_t metadata_write_strategy; + +#if H5AC_DEBUG_DIRTY_BYTES_CREATION + + unsigned dirty_bytes_propagations; + + size_t unprotect_dirty_bytes; + unsigned unprotect_dirty_bytes_updates; + + size_t insert_dirty_bytes; + unsigned insert_dirty_bytes_updates; + + size_t move_dirty_bytes; + unsigned move_dirty_bytes_updates; + +#endif /* H5AC_DEBUG_DIRTY_BYTES_CREATION */ + + H5SL_t * d_slist_ptr; + + H5SL_t * c_slist_ptr; + + H5SL_t * candidate_slist_ptr; + + void (* write_done)(void); + + void (* sync_point_done)(unsigned num_writes, + haddr_t * written_entries_tbl); + + unsigned p0_image_len; + +} H5AC_aux_t; /* struct H5AC_aux_t */ +#endif /* H5_HAVE_PARALLEL */ + + +/******************************/ +/* Package Private Prototypes */ +/******************************/ + +#ifdef H5_HAVE_PARALLEL +/* Parallel I/O routines */ +H5_DLL herr_t H5AC__log_deleted_entry(const H5AC_info_t *entry_ptr); +H5_DLL herr_t H5AC__log_dirtied_entry(const H5AC_info_t *entry_ptr); +H5_DLL herr_t H5AC__log_cleaned_entry(const H5AC_info_t *entry_ptr); +H5_DLL herr_t H5AC__log_flushed_entry(H5C_t *cache_ptr, haddr_t addr, + hbool_t was_dirty, unsigned flags); +H5_DLL herr_t H5AC__log_inserted_entry(const H5AC_info_t *entry_ptr); +H5_DLL herr_t H5AC__log_moved_entry(const H5F_t *f, haddr_t old_addr, + haddr_t new_addr); +H5_DLL herr_t H5AC__flush_entries(H5F_t *f, hid_t dxpl_id); +H5_DLL herr_t H5AC__run_sync_point(H5F_t *f, hid_t dxpl_id, int sync_point_op); +H5_DLL herr_t H5AC__set_sync_point_done_callback(H5C_t *cache_ptr, + void (*sync_point_done)(unsigned num_writes, haddr_t *written_entries_tbl)); +H5_DLL herr_t H5AC__set_write_done_callback(H5C_t * cache_ptr, + void (* write_done)(void)); +#endif /* H5_HAVE_PARALLEL */ + +/* Trace file routines */ +H5_DLL herr_t H5AC__close_trace_file(H5AC_t *cache_ptr); +H5_DLL herr_t H5AC__open_trace_file(H5AC_t *cache_ptr, const char *trace_file_name); + +/* Cache logging routines */ +H5_DLL herr_t H5AC__write_create_cache_log_msg(H5AC_t *cache); +H5_DLL herr_t H5AC__write_destroy_cache_log_msg(H5AC_t *cache); +H5_DLL herr_t H5AC__write_evict_cache_log_msg(const H5AC_t *cache, + herr_t fxn_ret_value); +H5_DLL herr_t H5AC__write_expunge_entry_log_msg(const H5AC_t *cache, + haddr_t address, + int type_id, + herr_t fxn_ret_value); +H5_DLL herr_t H5AC__write_flush_cache_log_msg(const H5AC_t *cache, + herr_t fxn_ret_value); +H5_DLL herr_t H5AC__write_insert_entry_log_msg(const H5AC_t *cache, + haddr_t address, + int type_id, + unsigned flags, + size_t size, + herr_t fxn_ret_value); +H5_DLL herr_t H5AC__write_mark_dirty_entry_log_msg(const H5AC_t *cache, + const H5AC_info_t *entry, + herr_t fxn_ret_value); +H5_DLL herr_t H5AC__write_mark_clean_entry_log_msg(const H5AC_t *cache, + const H5AC_info_t *entry, herr_t fxn_ret_value); +H5_DLL herr_t H5AC__write_mark_unserialized_entry_log_msg(const H5AC_t *cache, + const H5AC_info_t *entry, herr_t fxn_ret_value); +H5_DLL herr_t H5AC__write_mark_serialized_entry_log_msg(const H5AC_t *cache, + const H5AC_info_t *entry, herr_t fxn_ret_value); +H5_DLL herr_t H5AC__write_move_entry_log_msg(const H5AC_t *cache, + haddr_t old_addr, + haddr_t new_addr, + int type_id, + herr_t fxn_ret_value); +H5_DLL herr_t H5AC__write_pin_entry_log_msg(const H5AC_t *cache, + const H5AC_info_t *entry, + herr_t fxn_ret_value); +H5_DLL herr_t H5AC__write_create_fd_log_msg(const H5AC_t *cache, + const H5AC_info_t *parent, + const H5AC_info_t *child, + herr_t fxn_ret_value); +H5_DLL herr_t H5AC__write_protect_entry_log_msg(const H5AC_t *cache, + const H5AC_info_t *entry, + unsigned flags, + herr_t fxn_ret_value); +H5_DLL herr_t H5AC__write_resize_entry_log_msg(const H5AC_t *cache, + const H5AC_info_t *entry, + size_t new_size, + herr_t fxn_ret_value); +H5_DLL herr_t H5AC__write_unpin_entry_log_msg(const H5AC_t *cache, + const H5AC_info_t *entry, + herr_t fxn_ret_value); +H5_DLL herr_t H5AC__write_destroy_fd_log_msg(const H5AC_t *cache, + const H5AC_info_t *parent, + const H5AC_info_t *child, + herr_t fxn_ret_value); +H5_DLL herr_t H5AC__write_unprotect_entry_log_msg(const H5AC_t *cache, + const H5AC_info_t *entry, + int type_id, + unsigned flags, + herr_t fxn_ret_value); +H5_DLL herr_t H5AC__write_set_cache_config_log_msg(const H5AC_t *cache, + const H5AC_cache_config_t *config, + herr_t fxn_ret_value); +H5_DLL herr_t H5AC__write_remove_entry_log_msg(const H5AC_t *cache, + const H5AC_info_t *entry, + herr_t fxn_ret_value); + +#endif /* _H5ACpkg_H */ + |