diff options
Diffstat (limited to 'db/rep/rep_verify.c')
-rw-r--r-- | db/rep/rep_verify.c | 536 |
1 files changed, 536 insertions, 0 deletions
diff --git a/db/rep/rep_verify.c b/db/rep/rep_verify.c new file mode 100644 index 000000000..630db3f02 --- /dev/null +++ b/db/rep/rep_verify.c @@ -0,0 +1,536 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2004-2006 + * Oracle Corporation. All rights reserved. + * + * $Id: rep_verify.c,v 12.32 2006/09/07 03:05:26 sue Exp $ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_am.h" +#include "dbinc/log.h" +#include "dbinc/txn.h" + +static int __rep_dorecovery __P((DB_ENV *, DB_LSN *, DB_LSN *)); + +/* + * __rep_verify -- + * Handle a REP_VERIFY message. + * + * PUBLIC: int __rep_verify __P((DB_ENV *, REP_CONTROL *, DBT *, int, time_t)); + */ +int +__rep_verify(dbenv, rp, rec, eid, savetime) + DB_ENV *dbenv; + REP_CONTROL *rp; + DBT *rec; + int eid; + time_t savetime; +{ + DB_LOG *dblp; + DB_LOGC *logc; + DB_LSN lsn; + DB_REP *db_rep; + DBT mylog; + LOG *lp; + REP *rep; + u_int32_t rectype; + int match, ret, t_ret; + + ret = 0; + db_rep = dbenv->rep_handle; + rep = db_rep->region; + dblp = dbenv->lg_handle; + lp = dblp->reginfo.primary; + + if (IS_ZERO_LSN(lp->verify_lsn)) + return (ret); + + if ((ret = __log_cursor(dbenv, &logc)) != 0) + return (ret); + memset(&mylog, 0, sizeof(mylog)); + if ((ret = __log_c_get(logc, &rp->lsn, &mylog, DB_SET)) != 0) + goto err; + match = 0; + memcpy(&rectype, mylog.data, sizeof(rectype)); + if (mylog.size == rec->size && + memcmp(mylog.data, rec->data, rec->size) == 0) + match = 1; + /* + * If we don't have a match, backup to the previous + * identification record and try again. + */ + if (match == 0) { + ZERO_LSN(lsn); + if ((ret = __rep_log_backup(dbenv, rep, logc, &lsn)) == 0) { + MUTEX_LOCK(dbenv, rep->mtx_clientdb); + lp->verify_lsn = lsn; + lp->rcvd_recs = 0; + lp->wait_recs = rep->request_gap; + MUTEX_UNLOCK(dbenv, rep->mtx_clientdb); + (void)__rep_send_message(dbenv, eid, REP_VERIFY_REQ, + &lsn, NULL, 0, DB_REP_ANYWHERE); + } else if (ret == DB_NOTFOUND) { + /* + * We've either run out of records because + * logs have been removed or we've rolled back + * all the way to the beginning. In the latter + * we don't think these sites were ever part of + * the same environment and we'll say so. + * In the former, request internal backup. + */ + if (rp->lsn.file == 1) { + __db_errx(dbenv, + "Client was never part of master's environment"); + ret = DB_REP_JOIN_FAILURE; + } else { + rep->stat.st_outdated++; + + LOG_SYSTEM_LOCK(dbenv); + lsn = lp->lsn; + LOG_SYSTEM_UNLOCK(dbenv); + REP_SYSTEM_LOCK(dbenv); + F_CLR(rep, REP_F_RECOVER_VERIFY); + if (FLD_ISSET(rep->config, REP_C_NOAUTOINIT) || + rep->version == DB_REPVERSION_42) + ret = DB_REP_JOIN_FAILURE; + else { + F_SET(rep, REP_F_RECOVER_UPDATE); + ZERO_LSN(rep->first_lsn); + } + REP_SYSTEM_UNLOCK(dbenv); + if (ret == 0) + (void)__rep_send_message(dbenv, + eid, REP_UPDATE_REQ, NULL, + NULL, 0, DB_REP_ANYWHERE); + } + } + } else + ret = __rep_verify_match(dbenv, &rp->lsn, savetime); + +err: if ((t_ret = __log_c_close(logc)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} + +/* + * __rep_verify_fail -- + * Handle a REP_VERIFY_FAIL message. + * + * PUBLIC: int __rep_verify_fail __P((DB_ENV *, REP_CONTROL *, int)); + */ +int +__rep_verify_fail(dbenv, rp, eid) + DB_ENV *dbenv; + REP_CONTROL *rp; + int eid; +{ + DB_LOG *dblp; + DB_REP *db_rep; + LOG *lp; + REP *rep; + int ret; + + ret = 0; + db_rep = dbenv->rep_handle; + rep = db_rep->region; + dblp = dbenv->lg_handle; + lp = dblp->reginfo.primary; + + /* + * If any recovery flags are set, but not VERIFY, + * then we ignore this message. We are already + * in the middle of updating. + */ + if (F_ISSET(rep, REP_F_RECOVER_MASK) && + !F_ISSET(rep, REP_F_RECOVER_VERIFY)) + return (0); + /* + * Update stats. Reset startup_complete. + */ + rep->stat.st_outdated++; + rep->stat.st_startup_complete = 0; + + MUTEX_LOCK(dbenv, rep->mtx_clientdb); + REP_SYSTEM_LOCK(dbenv); + /* + * We don't want an old or delayed VERIFY_FAIL + * message to throw us into internal initialization + * when we shouldn't be. + * + * Only go into internal initialization if: + * We are set for AUTOINIT mode. + * We are in RECOVER_VERIFY and this LSN == verify_lsn. + * We are not in any RECOVERY and we are expecting + * an LSN that no longer exists on the master. + * Otherwise, ignore this message. + */ + if (FLD_ISSET(rep->config, REP_C_NOAUTOINIT) && + ((F_ISSET(rep, REP_F_RECOVER_VERIFY) && + LOG_COMPARE(&rp->lsn, &lp->verify_lsn) == 0) || + (F_ISSET(rep, REP_F_RECOVER_MASK) == 0 && + LOG_COMPARE(&rp->lsn, &lp->ready_lsn) >= 0))) { + ret = DB_REP_JOIN_FAILURE; + goto unlock; + } + if (((F_ISSET(rep, REP_F_RECOVER_VERIFY)) && + LOG_COMPARE(&rp->lsn, &lp->verify_lsn) == 0) || + (F_ISSET(rep, REP_F_RECOVER_MASK) == 0 && + LOG_COMPARE(&rp->lsn, &lp->ready_lsn) >= 0)) { + F_CLR(rep, REP_F_RECOVER_VERIFY); + F_SET(rep, REP_F_RECOVER_UPDATE); + ZERO_LSN(rep->first_lsn); + lp->wait_recs = rep->request_gap; + REP_SYSTEM_UNLOCK(dbenv); + MUTEX_UNLOCK(dbenv, rep->mtx_clientdb); + (void)__rep_send_message(dbenv, + eid, REP_UPDATE_REQ, NULL, NULL, 0, 0); + } else { +unlock: REP_SYSTEM_UNLOCK(dbenv); + MUTEX_UNLOCK(dbenv, rep->mtx_clientdb); + } + return (ret); +} + +/* + * __rep_verify_req -- + * Handle a REP_VERIFY_REQ message. + * + * PUBLIC: int __rep_verify_req __P((DB_ENV *, REP_CONTROL *, int)); + */ +int +__rep_verify_req(dbenv, rp, eid) + DB_ENV *dbenv; + REP_CONTROL *rp; + int eid; +{ + DB_LOGC *logc; + DB_REP *db_rep; + DBT *d, data_dbt; + REP *rep; + u_int32_t type; + int old, ret; + + ret = 0; + db_rep = dbenv->rep_handle; + rep = db_rep->region; + + type = REP_VERIFY; + if ((ret = __log_cursor(dbenv, &logc)) != 0) + return (ret); + d = &data_dbt; + memset(d, 0, sizeof(data_dbt)); + F_SET(logc, DB_LOG_SILENT_ERR); + ret = __log_c_get(logc, &rp->lsn, d, DB_SET); + /* + * If the LSN was invalid, then we might get a not + * found, we might get an EIO, we could get anything. + * If we get a DB_NOTFOUND, then there is a chance that + * the LSN comes before the first file present in which + * case we need to return a fail so that the client can return + * a DB_OUTDATED. + * + * If we're a client servicing this request and we get a + * NOTFOUND, return it so the caller can rerequest from + * a better source. + */ + if (ret == DB_NOTFOUND) { + if (F_ISSET(rep, REP_F_CLIENT)) + goto notfound; + else if (__log_is_outdated(dbenv, rp->lsn.file, &old) == 0 && + old != 0) + type = REP_VERIFY_FAIL; + } + + if (ret != 0) + d = NULL; + + (void)__rep_send_message(dbenv, eid, type, &rp->lsn, d, 0, 0); +notfound: + ret = __log_c_close(logc); + return (ret); +} + +static int +__rep_dorecovery(dbenv, lsnp, trunclsnp) + DB_ENV *dbenv; + DB_LSN *lsnp, *trunclsnp; +{ + DB_LSN lsn; + DB_REP *db_rep; + DBT mylog; + DB_LOGC *logc; + REP *rep; + int ret, t_ret, update; + u_int32_t rectype, opcode; + __txn_regop_args *txnrec; + __txn_regop_42_args *txn42rec; + + db_rep = dbenv->rep_handle; + rep = db_rep->region; + + /* Figure out if we are backing out any committed transactions. */ + if ((ret = __log_cursor(dbenv, &logc)) != 0) + return (ret); + + memset(&mylog, 0, sizeof(mylog)); + if (F_ISSET(rep, REP_F_RECOVER_LOG)) + update = 1; + else + update = 0; + while (update == 0 && + (ret = __log_c_get(logc, &lsn, &mylog, DB_PREV)) == 0 && + LOG_COMPARE(&lsn, lsnp) > 0) { + memcpy(&rectype, mylog.data, sizeof(rectype)); + if (rectype == DB___txn_regop) { + if (rep->version >= DB_REPVERSION_44) { + if ((ret = __txn_regop_read(dbenv, + mylog.data, &txnrec)) != 0) + goto err; + opcode = txnrec->opcode; + __os_free(dbenv, txnrec); + } else { + if ((ret = __txn_regop_42_read(dbenv, + mylog.data, &txn42rec)) != 0) + goto err; + opcode = txn42rec->opcode; + __os_free(dbenv, txn42rec); + } + if (opcode != TXN_ABORT) + update = 1; + } + } + /* + * Handle if the log_c_get fails. + */ + if (ret != 0) + goto err; + + /* + * If we successfully run recovery, we've opened all the necessary + * files. We are guaranteed to be single-threaded here, so no mutex + * is necessary. + */ + if ((ret = __db_apprec(dbenv, lsnp, trunclsnp, update, 0)) == 0) + F_SET(db_rep, DBREP_OPENFILES); + +err: if ((t_ret = __log_c_close(logc)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __rep_verify_match -- + * We have just received a matching log record during verification. + * Figure out if we're going to need to run recovery. If so, wait until + * everything else has exited the library. If not, set up the world + * correctly and move forward. + * + * PUBLIC: int __rep_verify_match __P((DB_ENV *, DB_LSN *, time_t)); + */ +int +__rep_verify_match(dbenv, reclsnp, savetime) + DB_ENV *dbenv; + DB_LSN *reclsnp; + time_t savetime; +{ + DB_LOG *dblp; + DB_LSN trunclsn; + DB_REP *db_rep; + LOG *lp; + REGENV *renv; + REGINFO *infop; + REP *rep; + int done, master, ret; + u_int32_t unused; + + dblp = dbenv->lg_handle; + db_rep = dbenv->rep_handle; + rep = db_rep->region; + lp = dblp->reginfo.primary; + ret = 0; + infop = dbenv->reginfo; + renv = infop->primary; + + /* + * Check if the savetime is different than our current time stamp. + * If it is, then we're racing with another thread trying to recover + * and we lost. We must give up. + */ + MUTEX_LOCK(dbenv, rep->mtx_clientdb); + done = savetime != renv->rep_timestamp; + if (done) { + MUTEX_UNLOCK(dbenv, rep->mtx_clientdb); + return (0); + } + ZERO_LSN(lp->verify_lsn); + MUTEX_UNLOCK(dbenv, rep->mtx_clientdb); + + /* + * Make sure the world hasn't changed while we tried to get + * the lock. If it hasn't then it's time for us to kick all + * operations out of DB and run recovery. + */ + REP_SYSTEM_LOCK(dbenv); + if (rep->lockout_th != 0 || + (!F_ISSET(rep, REP_F_RECOVER_LOG) && + (F_ISSET(rep, REP_F_READY) || rep->in_recovery != 0))) { + rep->stat.st_msgs_recover++; + goto errunlock; + } + + if ((ret = __rep_lockout_msg(dbenv, rep, 1)) != 0) + goto errunlock; + + if ((ret = __rep_lockout_api(dbenv, rep)) != 0) + goto errunlock; + + /* OK, everyone is out, we can now run recovery. */ + REP_SYSTEM_UNLOCK(dbenv); + + if ((ret = __rep_dorecovery(dbenv, reclsnp, &trunclsn)) != 0) { + REP_SYSTEM_LOCK(dbenv); + rep->lockout_th = 0; + rep->in_recovery = 0; + F_CLR(rep, REP_F_READY); + goto errunlock; + } + + /* + * The log has been truncated (either directly by us or by __db_apprec) + * We want to make sure we're waiting for the LSN at the new end-of-log, + * not some later point. + */ + MUTEX_LOCK(dbenv, rep->mtx_clientdb); + lp->ready_lsn = trunclsn; + ZERO_LSN(lp->waiting_lsn); + ZERO_LSN(lp->max_wait_lsn); + lp->max_perm_lsn = *reclsnp; + lp->wait_recs = 0; + lp->rcvd_recs = 0; + ZERO_LSN(lp->verify_lsn); + + /* + * Discard any log records we have queued; we're about to re-request + * them, and can't trust the ones in the queue. We need to set the + * DB_AM_RECOVER bit in this handle, so that the operation doesn't + * deadlock. + */ + F_SET(db_rep->rep_db, DB_AM_RECOVER); + MUTEX_UNLOCK(dbenv, rep->mtx_clientdb); + ret = __db_truncate(db_rep->rep_db, NULL, &unused); + MUTEX_LOCK(dbenv, rep->mtx_clientdb); + F_CLR(db_rep->rep_db, DB_AM_RECOVER); + + REP_SYSTEM_LOCK(dbenv); + rep->stat.st_log_queued = 0; + rep->in_recovery = 0; + rep->lockout_th = 0; + F_CLR(rep, REP_F_NOARCHIVE | REP_F_RECOVER_MASK); + if (ret != 0) + goto errunlock2; + + /* + * If the master_id is invalid, this means that since + * the last record was sent, somebody declared an + * election and we may not have a master to request + * things of. + * + * This is not an error; when we find a new master, + * we'll re-negotiate where the end of the log is and + * try to bring ourselves up to date again anyway. + * + * !!! + * We cannot assert the election flags though because + * somebody may have declared an election and then + * got an error, thus clearing the election flags + * but we still have an invalid master_id. + */ + master = rep->master_id; + REP_SYSTEM_UNLOCK(dbenv); + if (master == DB_EID_INVALID) { + MUTEX_UNLOCK(dbenv, rep->mtx_clientdb); + ret = 0; + } else { + /* + * We're making an ALL_REQ. But now that we've + * cleared the flags, we're likely receiving new + * log records from the master, resulting in a gap + * immediately. So to avoid multiple data streams, + * set the wait_recs value high now to give the master + * a chance to start sending us these records before + * the gap code re-requests the same gap. Wait_recs + * will get reset once we start receiving these + * records. + */ + lp->wait_recs = rep->max_gap; + MUTEX_UNLOCK(dbenv, rep->mtx_clientdb); + (void)__rep_send_message(dbenv, + master, REP_ALL_REQ, reclsnp, NULL, 0, DB_REP_ANYWHERE); + } + if (0) { +errunlock2: MUTEX_UNLOCK(dbenv, rep->mtx_clientdb); +errunlock: REP_SYSTEM_UNLOCK(dbenv); + } + return (ret); +} + +/* + * __rep_log_backup -- + * + * In the verify handshake, we walk backward looking for + * identification records. Those are the only record types + * we verify and match on. + * + * PUBLIC: int __rep_log_backup __P((DB_ENV *, REP *, DB_LOGC *, DB_LSN *)); + */ +int +__rep_log_backup(dbenv, rep, logc, lsn) + DB_ENV *dbenv; + REP *rep; + DB_LOGC *logc; + DB_LSN *lsn; +{ + DBT mylog; + u_int32_t rectype; + int ret; + + COMPQUIET(dbenv, NULL); + ret = 0; + memset(&mylog, 0, sizeof(mylog)); + while ((ret = __log_c_get(logc, lsn, &mylog, DB_PREV)) == 0) { + /* + * Determine what we look for based on version number. + * Due to the contents of records changing between + * versions we have to match based on criteria of that + * particular version. + */ + memcpy(&rectype, mylog.data, sizeof(rectype)); + /* + * In 4.2, we match anything except ckp, recycle and + * dbreg register. + */ + if (rep->version == DB_REPVERSION_42 && + rectype != DB___txn_ckp && rectype != DB___txn_recycle && + rectype != DB___dbreg_register) + break; + /* + * In 4.3 we only match on checkpoint. + */ + if (rep->version == DB_REPVERSION_43 && + rectype == DB___txn_ckp) + break; + /* + * In 4.4 and beyond we match checkpoint and commit. + */ + if (rep->version >= DB_REPVERSION_44 && + (rectype == DB___txn_ckp || rectype == DB___txn_regop)) + break; + } + return (ret); +} |