1 files changed, 536 insertions, 0 deletions
diff --git a/db/rep/rep_verify.c b/db/rep/rep_verify.c
new file mode 100644
index 000000000..630db3f02
--- /dev/null
+++ b/db/rep/rep_verify.c
@@ -0,0 +1,536 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2004-2006
+ *	Oracle Corporation.  All rights reserved.
+ *
+ * $Id: rep_verify.c,v 12.32 2006/09/07 03:05:26 sue Exp $
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/log.h"
+#include "dbinc/txn.h"
+
+static int __rep_dorecovery __P((DB_ENV *, DB_LSN *, DB_LSN *));
+
+/*
+ * __rep_verify --
+ *	Handle a REP_VERIFY message: compare the sender's record (rec) with
+ *	our own log record at rp->lsn, then either finish verification, back
+ *	up and ask again, request an update, or fail the join.
+ *
+ * PUBLIC: int __rep_verify __P((DB_ENV *, REP_CONTROL *, DBT *, int, time_t));
+ */
+int
+__rep_verify(dbenv, rp, rec, eid, savetime)
+	DB_ENV *dbenv;
+	REP_CONTROL *rp;
+	DBT *rec;
+	int eid;
+	time_t savetime;
+{
+	DB_LOG *dblp;
+	DB_LOGC *logc;
+	DB_LSN lsn;
+	DB_REP *db_rep;
+	DBT mylog;
+	LOG *lp;
+	REP *rep;
+	int match, ret, t_ret;
+
+	ret = 0;
+	db_rep = dbenv->rep_handle;
+	rep = db_rep->region;
+	dblp = dbenv->lg_handle;
+	lp = dblp->reginfo.primary;
+
+	/* A zero verify_lsn means no verify request is outstanding. */
+	if (IS_ZERO_LSN(lp->verify_lsn))
+		return (ret);
+
+	if ((ret = __log_cursor(dbenv, &logc)) != 0)
+		return (ret);
+	memset(&mylog, 0, sizeof(mylog));
+	if ((ret = __log_c_get(logc, &rp->lsn, &mylog, DB_SET)) != 0)
+		goto err;
+	/* The records match only if they are byte-for-byte identical. */
+	match = mylog.size == rec->size &&
+	    memcmp(mylog.data, rec->data, rec->size) == 0;
+	/*
+	 * If we don't have a match, back up to the previous
+	 * identification record and try again.
+	 */
+	if (match == 0) {
+		ZERO_LSN(lsn);
+		if ((ret = __rep_log_backup(dbenv, rep, logc, &lsn)) == 0) {
+			MUTEX_LOCK(dbenv, rep->mtx_clientdb);
+			lp->verify_lsn = lsn;
+			lp->rcvd_recs = 0;
+			lp->wait_recs = rep->request_gap;
+			MUTEX_UNLOCK(dbenv, rep->mtx_clientdb);
+			(void)__rep_send_message(dbenv, eid, REP_VERIFY_REQ,
+			    &lsn, NULL, 0, DB_REP_ANYWHERE);
+		} else if (ret == DB_NOTFOUND) {
+			/*
+			 * Either older logs were removed or we walked
+			 * back to the very start of the log.  The
+			 * latter means the sites never shared an
+			 * environment; the former needs an update.
+			 */
+			if (rp->lsn.file == 1) {
+				__db_errx(dbenv,
+		"Client was never part of master's environment");
+				ret = DB_REP_JOIN_FAILURE;
+			} else {
+				rep->stat.st_outdated++;
+
+				LOG_SYSTEM_LOCK(dbenv);
+				lsn = lp->lsn;
+				LOG_SYSTEM_UNLOCK(dbenv);
+				REP_SYSTEM_LOCK(dbenv);
+				F_CLR(rep, REP_F_RECOVER_VERIFY);
+				if (FLD_ISSET(rep->config, REP_C_NOAUTOINIT) ||
+				    rep->version == DB_REPVERSION_42)
+					ret = DB_REP_JOIN_FAILURE;
+				else {
+					F_SET(rep, REP_F_RECOVER_UPDATE);
+					ZERO_LSN(rep->first_lsn);
+					/* Recoverable: drop DB_NOTFOUND. */
+					ret = 0;
+				}
+				REP_SYSTEM_UNLOCK(dbenv);
+				if (ret == 0)
+					(void)__rep_send_message(dbenv,
+					    eid, REP_UPDATE_REQ, NULL,
+					    NULL, 0, DB_REP_ANYWHERE);
+			}
+		}
+	} else
+		ret = __rep_verify_match(dbenv, &rp->lsn, savetime);
+
+err:	if ((t_ret = __log_c_close(logc)) != 0 && ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+
+/*
+ * __rep_verify_fail --
+ *	Process a REP_VERIFY_FAIL message.  Depending on our recovery state
+ *	and configuration the message is ignored, fails the join, or moves
+ *	us into RECOVER_UPDATE and sends a REP_UPDATE_REQ.
+ *
+ * PUBLIC: int __rep_verify_fail __P((DB_ENV *, REP_CONTROL *, int));
+ */
+int
+__rep_verify_fail(dbenv, rp, eid)
+	DB_ENV *dbenv;
+	REP_CONTROL *rp;
+	int eid;
+{
+	DB_LOG *dblp;
+	DB_REP *db_rep;
+	LOG *lp;
+	REP *rep;
+	int ret, wanted;
+
+	db_rep = dbenv->rep_handle;
+	dblp = dbenv->lg_handle;
+	rep = db_rep->region;
+	lp = dblp->reginfo.primary;
+	ret = 0;
+
+	/*
+	 * Some recovery other than VERIFY is flagged, which means
+	 * we are already in the middle of updating: drop the
+	 * message.
+	 */
+	if (F_ISSET(rep, REP_F_RECOVER_MASK) &&
+	    !F_ISSET(rep, REP_F_RECOVER_VERIFY))
+		return (0);
+
+	/*
+	 * Bump the outdated count and reset startup_complete.
+	 */
+	rep->stat.st_outdated++;
+	rep->stat.st_startup_complete = 0;
+
+	MUTEX_LOCK(dbenv, rep->mtx_clientdb);
+	REP_SYSTEM_LOCK(dbenv);
+	/*
+	 * An old or delayed VERIFY_FAIL must not push us into internal
+	 * initialization by mistake.  The message is acted on only when:
+	 *   - we are in RECOVER_VERIFY and its LSN equals verify_lsn, or
+	 *   - we are in no recovery at all and its LSN is at or beyond
+	 *     ready_lsn (we expect an LSN the master no longer has).
+	 * Anything else is ignored.
+	 */
+	wanted = (F_ISSET(rep, REP_F_RECOVER_VERIFY) &&
+	    LOG_COMPARE(&rp->lsn, &lp->verify_lsn) == 0) ||
+	    (F_ISSET(rep, REP_F_RECOVER_MASK) == 0 &&
+	    LOG_COMPARE(&rp->lsn, &lp->ready_lsn) >= 0);
+	if (!wanted)
+		goto unlock;
+	if (FLD_ISSET(rep->config, REP_C_NOAUTOINIT)) {
+		/* Automatic initialization is turned off: fail the join. */
+		ret = DB_REP_JOIN_FAILURE;
+		goto unlock;
+	}
+
+	/* Trade RECOVER_VERIFY for RECOVER_UPDATE and request an update. */
+	F_CLR(rep, REP_F_RECOVER_VERIFY);
+	F_SET(rep, REP_F_RECOVER_UPDATE);
+	ZERO_LSN(rep->first_lsn);
+	lp->wait_recs = rep->request_gap;
+	REP_SYSTEM_UNLOCK(dbenv);
+	MUTEX_UNLOCK(dbenv, rep->mtx_clientdb);
+	(void)__rep_send_message(dbenv,
+	    eid, REP_UPDATE_REQ, NULL, NULL, 0, 0);
+	return (ret);
+
+unlock:	REP_SYSTEM_UNLOCK(dbenv);
+	MUTEX_UNLOCK(dbenv, rep->mtx_clientdb);
+	return (ret);
+}
+
+/*
+ * __rep_verify_req --
+ *	Handle a REP_VERIFY_REQ message: look up the requested LSN and reply
+ *	with REP_VERIFY, or with REP_VERIFY_FAIL when the log file is
+ *	outdated.  A client that cannot find the record returns DB_NOTFOUND.
+ *
+ * PUBLIC: int __rep_verify_req __P((DB_ENV *, REP_CONTROL *, int));
+ */
+int
+__rep_verify_req(dbenv, rp, eid)
+	DB_ENV *dbenv;
+	REP_CONTROL *rp;
+	int eid;
+{
+	DB_LOGC *logc;
+	DB_REP *db_rep;
+	DBT *d, data_dbt;
+	REP *rep;
+	u_int32_t type;
+	int old, ret, t_ret;
+
+	db_rep = dbenv->rep_handle;
+	rep = db_rep->region;
+
+	type = REP_VERIFY;
+	if ((ret = __log_cursor(dbenv, &logc)) != 0)
+		return (ret);
+	d = &data_dbt;
+	memset(d, 0, sizeof(data_dbt));
+	F_SET(logc, DB_LOG_SILENT_ERR);
+	ret = __log_c_get(logc, &rp->lsn, d, DB_SET);
+	/*
+	 * An invalid LSN may produce DB_NOTFOUND, EIO or anything else.
+	 * On DB_NOTFOUND the LSN may precede the first log file we still
+	 * have, in which case we reply with a fail so that the requester
+	 * can return DB_OUTDATED.
+	 *
+	 * If we are a client servicing this request, return DB_NOTFOUND
+	 * instead so the caller can rerequest from a better source.
+	 */
+	if (ret == DB_NOTFOUND) {
+		if (F_ISSET(rep, REP_F_CLIENT))
+			goto done;
+		if (__log_is_outdated(dbenv, rp->lsn.file, &old) == 0 &&
+		    old != 0)
+			type = REP_VERIFY_FAIL;
+	}
+
+	if (ret != 0)
+		d = NULL;
+
+	(void)__rep_send_message(dbenv, eid, type, &rp->lsn, d, 0, 0);
+	/* The reply is out; from here only a close failure is reported. */
+	ret = 0;
+done:	if ((t_ret = __log_c_close(logc)) != 0 && ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+
+/*
+ * __rep_dorecovery --
+ *	Call __db_apprec for lsnp/trunclsnp.  Unless REP_F_RECOVER_LOG is
+ *	set, first scan the log backward to learn whether any __txn_regop
+ *	record other than an abort lies after lsnp.
+ */
+static int
+__rep_dorecovery(dbenv, lsnp, trunclsnp)
+	DB_ENV *dbenv;
+	DB_LSN *lsnp, *trunclsnp;
+{
+	DB_LOGC *logc;
+	DB_LSN lsn;
+	DB_REP *db_rep;
+	DBT mylog;
+	REP *rep;
+	__txn_regop_42_args *txn42rec;
+	__txn_regop_args *txnrec;
+	u_int32_t opcode, rectype;
+	int ret, t_ret, update;
+
+	db_rep = dbenv->rep_handle;
+	rep = db_rep->region;
+
+	/* Decide whether any committed transaction is being backed out. */
+	if ((ret = __log_cursor(dbenv, &logc)) != 0)
+		return (ret);
+
+	memset(&mylog, 0, sizeof(mylog));
+	update = F_ISSET(rep, REP_F_RECOVER_LOG) ? 1 : 0;
+	while (update == 0) {
+		if ((ret = __log_c_get(logc, &lsn, &mylog, DB_PREV)) != 0)
+			goto err;
+		if (LOG_COMPARE(&lsn, lsnp) <= 0)
+			break;
+		memcpy(&rectype, mylog.data, sizeof(rectype));
+		if (rectype != DB___txn_regop)
+			continue;
+		if (rep->version >= DB_REPVERSION_44) {
+			if ((ret = __txn_regop_read(dbenv,
+			    mylog.data, &txnrec)) != 0)
+				goto err;
+			opcode = txnrec->opcode;
+			__os_free(dbenv, txnrec);
+		} else {
+			if ((ret = __txn_regop_42_read(dbenv,
+			    mylog.data, &txn42rec)) != 0)
+				goto err;
+			opcode = txn42rec->opcode;
+			__os_free(dbenv, txn42rec);
+		}
+		if (opcode != TXN_ABORT)
+			update = 1;
+	}
+
+	/*
+	 * A successful recovery run has opened every file we need.
+	 * Nothing else runs concurrently at this point, so the flag
+	 * is set without taking a mutex.
+	 */
+	if ((ret = __db_apprec(dbenv, lsnp, trunclsnp, update, 0)) == 0)
+		F_SET(db_rep, DBREP_OPENFILES);
+
+err:	if ((t_ret = __log_c_close(logc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __rep_verify_match --
+ *	We have just received a matching log record during verification.
+ * Figure out if we're going to need to run recovery. If so, wait until
+ * everything else has exited the library.  If not, set up the world
+ * correctly and move forward.
+ *
+ * savetime is checked against renv->rep_timestamp; on a mismatch we
+ * return 0 at once.  Otherwise returns 0 or the failing step's error.
+ *
+ * PUBLIC: int __rep_verify_match __P((DB_ENV *, DB_LSN *, time_t));
+ */
+int
+__rep_verify_match(dbenv, reclsnp, savetime)
+	DB_ENV *dbenv;
+	DB_LSN *reclsnp;
+	time_t savetime;
+{
+	DB_LOG *dblp;
+	DB_LSN trunclsn;
+	DB_REP *db_rep;
+	LOG *lp;
+	REGENV *renv;
+	REGINFO *infop;
+	REP *rep;
+	int done, master, ret;
+	u_int32_t unused;
+
+	dblp = dbenv->lg_handle;
+	db_rep = dbenv->rep_handle;
+	rep = db_rep->region;
+	lp = dblp->reginfo.primary;
+	ret = 0;
+	infop = dbenv->reginfo;
+	renv = infop->primary;
+
+	/*
+	 * Check if the savetime is different than our current time stamp.
+	 * If it is, then we're racing with another thread trying to recover
+	 * and we lost.  We must give up.
+	 */
+	MUTEX_LOCK(dbenv, rep->mtx_clientdb);
+	done = savetime != renv->rep_timestamp;
+	if (done) {
+		MUTEX_UNLOCK(dbenv, rep->mtx_clientdb);
+		return (0);
+	}
+	ZERO_LSN(lp->verify_lsn);
+	MUTEX_UNLOCK(dbenv, rep->mtx_clientdb);
+
+	/*
+	 * Make sure the world hasn't changed while we tried to get
+	 * the lock.  If it hasn't then it's time for us to kick all
+	 * operations out of DB and run recovery.
+	 */
+	REP_SYSTEM_LOCK(dbenv);
+	if (rep->lockout_th != 0 ||
+	    (!F_ISSET(rep, REP_F_RECOVER_LOG) &&
+	    (F_ISSET(rep, REP_F_READY) || rep->in_recovery != 0))) {
+		rep->stat.st_msgs_recover++;
+		goto errunlock;
+	}
+
+	if ((ret = __rep_lockout_msg(dbenv, rep, 1)) != 0)
+		goto errunlock;
+
+	if ((ret = __rep_lockout_api(dbenv, rep)) != 0)
+		goto errunlock;
+
+	/* OK, everyone is out, we can now run recovery. */
+	REP_SYSTEM_UNLOCK(dbenv);
+
+	if ((ret = __rep_dorecovery(dbenv, reclsnp, &trunclsn)) != 0) {
+		REP_SYSTEM_LOCK(dbenv);
+		rep->lockout_th = 0;
+		rep->in_recovery = 0;
+		F_CLR(rep, REP_F_READY);
+		goto errunlock;
+	}
+
+	/*
+	 * The log has been truncated (either directly by us or by __db_apprec)
+	 * We want to make sure we're waiting for the LSN at the new end-of-log,
+	 * not some later point.
+	 */
+	MUTEX_LOCK(dbenv, rep->mtx_clientdb);
+	lp->ready_lsn = trunclsn;
+	ZERO_LSN(lp->waiting_lsn);
+	ZERO_LSN(lp->max_wait_lsn);
+	lp->max_perm_lsn = *reclsnp;
+	lp->wait_recs = 0;
+	lp->rcvd_recs = 0;
+	ZERO_LSN(lp->verify_lsn);
+
+	/*
+	 * Discard any log records we have queued;  we're about to re-request
+	 * them, and can't trust the ones in the queue.  We need to set the
+	 * DB_AM_RECOVER bit in this handle, so that the operation doesn't
+	 * deadlock.
+	 */
+	F_SET(db_rep->rep_db, DB_AM_RECOVER);
+	MUTEX_UNLOCK(dbenv, rep->mtx_clientdb);
+	ret = __db_truncate(db_rep->rep_db, NULL, &unused);
+	MUTEX_LOCK(dbenv, rep->mtx_clientdb);
+	F_CLR(db_rep->rep_db, DB_AM_RECOVER);
+
+	REP_SYSTEM_LOCK(dbenv);
+	rep->stat.st_log_queued = 0;
+	rep->in_recovery = 0;
+	rep->lockout_th = 0;
+	F_CLR(rep, REP_F_NOARCHIVE | REP_F_RECOVER_MASK);
+	if (ret != 0)
+		goto errunlock2;
+
+	/*
+	 * If the master_id is invalid, this means that since the last
+	 * record was sent, somebody declared an election and we may not
+	 * have a master to request things of.
+	 *
+	 * This is not an error;  when we find a new master, we'll
+	 * re-negotiate where the end of the log is and try to bring
+	 * ourselves up to date again anyway.
+	 *
+	 * !!!
+	 * We cannot assert the election flags though because somebody may
+	 * have declared an election and then got an error, thus clearing
+	 * the election flags but we still have an invalid master_id.
+	 */
+	master = rep->master_id;
+	REP_SYSTEM_UNLOCK(dbenv);
+	if (master == DB_EID_INVALID) {
+		MUTEX_UNLOCK(dbenv, rep->mtx_clientdb);
+		ret = 0;
+	} else {
+		/*
+		 * We're making an ALL_REQ.  But now that we've cleared the
+		 * flags, we're likely receiving new log records from the
+		 * master, resulting in a gap immediately.  So to avoid
+		 * multiple data streams, set the wait_recs value high now
+		 * to give the master a chance to start sending us these
+		 * records before the gap code re-requests the same gap.
+		 * Wait_recs will get reset once we start receiving them.
+		 */
+		lp->wait_recs = rep->max_gap;
+		MUTEX_UNLOCK(dbenv, rep->mtx_clientdb);
+		(void)__rep_send_message(dbenv,
+		    master, REP_ALL_REQ, reclsnp, NULL, 0, DB_REP_ANYWHERE);
+	}
+	/* Reached only by goto: release the locks still held on early exit. */
+	if (0) {
+errunlock2:	MUTEX_UNLOCK(dbenv, rep->mtx_clientdb);
+errunlock:	REP_SYSTEM_UNLOCK(dbenv);
+	}
+	return (ret);
+}
+
+/*
+ * __rep_log_backup --
+ *
+ * The verify handshake walks the log backward looking for
+ * identification records, the only record types that are
+ * verified and matched on.
+ *
+ * Moves logc backward with DB_PREV until it reaches a record type that
+ * qualifies for rep->version; the caller reads that record's LSN from
+ * *lsn.  Returns 0 on such a record, otherwise the __log_c_get error
+ * (the caller tests for DB_NOTFOUND).
+ *
+ * PUBLIC: int __rep_log_backup __P((DB_ENV *, REP *, DB_LOGC *, DB_LSN *));
+ */
+int
+__rep_log_backup(dbenv, rep, logc, lsn)
+	DB_ENV *dbenv;
+	REP *rep;
+	DB_LOGC *logc;
+	DB_LSN *lsn;
+{
+	DBT mylog;
+	u_int32_t rectype;
+	int found, ret;
+
+	COMPQUIET(dbenv, NULL);
+	memset(&mylog, 0, sizeof(mylog));
+	for (found = 0; found == 0;) {
+		if ((ret = __log_c_get(logc, lsn, &mylog, DB_PREV)) != 0)
+			break;
+
+		/*
+		 * Record contents changed between versions, so what
+		 * counts as a match depends on the version in use.
+		 */
+		memcpy(&rectype, mylog.data, sizeof(rectype));
+
+		/* 4.2: anything except ckp, recycle and dbreg register. */
+		if (rep->version == DB_REPVERSION_42 &&
+		    rectype != DB___txn_ckp && rectype != DB___txn_recycle &&
+		    rectype != DB___dbreg_register)
+			found = 1;
+
+		/* 4.3: checkpoint only. */
+		if (rep->version == DB_REPVERSION_43 &&
+		    rectype == DB___txn_ckp)
+			found = 1;
+
+		/* 4.4 and beyond: checkpoint and commit. */
+		if (rep->version >= DB_REPVERSION_44 &&
+		    (rectype == DB___txn_ckp || rectype == DB___txn_regop))
+			found = 1;
+	}
+	return (ret);
+}