diff options
author | Kim Kibum <kb0929.kim@samsung.com> | 2012-05-21 17:40:46 +0900 |
---|---|---|
committer | Kim Kibum <kb0929.kim@samsung.com> | 2012-05-21 17:40:46 +0900 |
commit | 2e082c838d2ca750f5daac6dcdabecc22dfd4e46 (patch) | |
tree | 01c1dd87d4cc0b62a655c0d768ff695d2d244728 /rep | |
parent | a86e3ca152fb414b376e64c449c201d762e414dd (diff) | |
download | db4-2e082c838d2ca750f5daac6dcdabecc22dfd4e46.tar.gz db4-2e082c838d2ca750f5daac6dcdabecc22dfd4e46.tar.bz2 db4-2e082c838d2ca750f5daac6dcdabecc22dfd4e46.zip |
Upload Tizen:Base source
Diffstat (limited to 'rep')
-rw-r--r-- | rep/mlease.html | 1197 | ||||
-rw-r--r-- | rep/rep.src | 116 | ||||
-rw-r--r-- | rep/rep_auto.c | 679 | ||||
-rw-r--r-- | rep/rep_backup.c | 3379 | ||||
-rw-r--r-- | rep/rep_elect.c | 1353 | ||||
-rw-r--r-- | rep/rep_lease.c | 524 | ||||
-rw-r--r-- | rep/rep_log.c | 872 | ||||
-rw-r--r-- | rep/rep_method.c | 2142 | ||||
-rw-r--r-- | rep/rep_record.c | 2379 | ||||
-rw-r--r-- | rep/rep_region.c | 488 | ||||
-rw-r--r-- | rep/rep_stat.c | 568 | ||||
-rw-r--r-- | rep/rep_stub.c | 391 | ||||
-rw-r--r-- | rep/rep_util.c | 2007 | ||||
-rw-r--r-- | rep/rep_verify.c | 766 |
14 files changed, 16861 insertions, 0 deletions
diff --git a/rep/mlease.html b/rep/mlease.html new file mode 100644 index 0000000..85b0aca --- /dev/null +++ b/rep/mlease.html @@ -0,0 +1,1197 @@ +<!DOCTYPE doctype PUBLIC "-//w3c//dtd html 4.0 transitional//en"> +<html> +<head> + <meta http-equiv="Content-Type" + content="text/html; charset=iso-8859-1"> + <meta name="GENERATOR" + content="Mozilla/4.76 [en] (X11; U; FreeBSD 4.3-RELEASE i386) [Netscape]"> + <title>Master Lease</title> +</head> +<body> +<center> +<h1>Master Leases for Berkeley DB</h1> +</center> +<center><i>Susan LoVerso</i> <br> +<i>sue@sleepycat.com</i> <br> +<i>Rev 1.1</i><br> +<i>2007 Feb 2</i><br> +</center> +<p><br> +</p> +<h2>What are Master Leases?</h2> +A master lease is a mechanism whereby clients grant master-ship rights +to a site and that master, by holding lease rights can provide a +guarantee of durability to a replication group for a given period of +time. By granting a lease to a master, +a client will not participate in an election to elect a new +master until that granted master lease has expired. By holding a +collection of granted leases, a master will be able to supply +authoritative read requests to applications. By holding leases a +read operation on a master can guarantee several things to the +application:<br> +<ol> + <li>Authoritative reads: a guarantee that the data being read by the +application is durable and can never be rolled back.</li> + <li>Freshness: a guarantee that the data being read by the +application <b>at the master</b> is +not stale.</li> + <li>Master viability: a guarantee that a current master with valid +leases will not encounter a duplicate master situation.<br> + </li> +</ol> +<h2>Requirements</h2> +The requirements of DB to support this include:<br> +<ul> + <li>After turning them on, users can choose to ignore them in reads +or not.</li> + <li>We are providing read authority on the master only. 
A +read on a client is equivalent to a read while ignoring leases.</li> + <li>We guarantee that data committed on a master <b>that has been +read by an application on the +master</b> will not be rolled back. Data read on a client or +while ignoring leases <i>or data +successfully updated/committed but not read,</i> +may be rolled back.<br> + </li> + <li>A master will not return successfully from a read operation +unless it holds a +majority of leases unless leases are ignored.</li> + <li>Master leases will remove the possibility of a current/correct +master being "shot down" by DUPMASTER. <b>NOTE: Old/Expired +masters may discover a +later master and return DUPMASTER to the application however.</b><br> + </li> + <li>Any send callback failure must result in premature lease +expiration on the master.<br> + </li> + <li>Users who change the system clock during master leases void the +guarantee and may get undefined behavior. We assume time always +runs forward. <br> + </li> + <li>Clients are forbidden from participating in elections while they +have an outstanding lease granted to another site.</li> + <li>Clients are forbidden from accepting a new master while they have +an outstanding lease granted to another site.</li> + <li>Clients are forbidden from upgrading themselves to master while +they have an outstanding lease granted to another site.</li> + <li>When asked for a lease grant explicitly by the master, the client +cannot grant the lease to the master unless the LSN in the master's +request has been processed by this client.<br> + </li> +</ul> +The requirements of the +application using leases include:<br> +<ul> + <li>Users must implement (Base API users on their own, RepMgr users +via configuration) a majority (or larger) ACK policy. <br> + </li> + <li>The application must use the election mechanism to decide a master. 
+It may not simply declare a site master.</li> + <li>The send callback must return an error if the majority ACK policy +is not met for PERM records.</li> + <li>Users must set the number of sites in the group.</li> + <li>Using leases in a replication group is all-or-none. +Therefore, if a site knows it is using leases, it can assume other +sites are also.<br> + </li> + <li>All applications that care about read guarantees must forward or +perform all reads on the master. Reading on the client means a +read ignoring leases. </li> +</ul> +<p>There are some open questions +remaining.</p> +<ul> + <li>There is one major showstopper issue, see Crashing - Potential +problem near the end of the document. We need a better solution +than the one shown there (writing to disk every time a lease is +granted). Perhaps just documenting that durability means it must be +flushed to disk before success to avoid that situation?<br> + </li> + <li>What about db->join? Users can call join, but the calls +on the join cursor to get the data would be subject to leases and +therefore protected. Ok, this is not an open question.</li> + <li>What about other read-like operations? Clearly <i> +DB->get, DB->pget, DBC->get, +DBC->pget</i> need lease checks. However, other APIs use +keys. <i>DB->key_range</i> +provides an estimate only so it shouldn't need lease checks. <i> +DB->stat</i> provides exact counts +to <i>bt_nkeys</i> and <i>bt_ndata</i> fields. Are those +fields considered authoritative that providing those values implies a +durability guarantee and therefore <i>DB->stat</i> +should be subject to lease verification? <i>DBC->count</i> +provides a count for +the number of data items associated with a key. Is this +authoritative information? This is similar to stat - should it be +subject to lease verification?<br> + </li> + <li>Do we require master lease checks on write operations? I +think lease checks are not needed on write operations. 
+It doesn't +add correctness and adds a lot of complexity (checking leases in put, +del, and cursors, then what about rename, remove, etc).<br> + </li> + <li>Do master leases give an iron-clad guarantee of never rolling +back a transaction? No, but it should mean that a committed transaction +can never be <b>read</b> on a master +unless the lease is valid. A committed transaction on a master +that has never been presented to the application may get rolled back.<br> + </li> + <li>Do we need to quarantine or prevent reads on an ex-master until +sync-up is done? No. A master that is simply downgraded to +client or crashes and reboots is now a client. Reading from that +client is the same as saying Ignore Leases.</li> + <li>What about adding and removing sites while leases are +active? This is SR 14778. A consistent <i>nsites</i> value +is required by master +leases. It isn't +clear to me what a master is +supposed to do if the value of nsites gets smaller while leases are +active. Perhaps it leaves its larger table intact and simply +checks for a smaller number of granted leases?<br> + </li> + <li>Can users turn leases off? No. There is no planned <i>turn +leases off</i> API.</li> + <li>Clock skew will be a percentage. However, the smallest, 1%, +is probably rather large for clock skew. Percentage was chosen +for simplicity and similarity to other APIs. What granularity is +appropriate here?</li> +</ul> +<h2>API Changes</h2> +The API changes that are visible +to the user are fairly minimal. +There are a few API calls they need to make to configure master leases +and then there is the API call to turn them on. There is also a +new flag to existing APIs to allow read operations to ignore leases and +return data that +may be non-durable potentially.<br> +<h3>Lease Timeout<br> +</h3> +There is a new timeout the user +must configure for leases called <b>DB_REP_LEASE_TIMEOUT</b>. +This timeout will be new to +the <i>dbenv->rep_set_timeout</i> method. 
The <b>DB_REP_LEASE_TIMEOUT</b> +has no default and it is required that the user configure a timeout +before they turn on leases (obviously, this timeout need not be set if +leases will not be used). That timeout is the amount of time +the lease is valid on the master and how long it is granted +on the client. This timeout must be the same +value on all sites (like log file size). The timeout used when +refreshing leases is the <b>DB_REP_ACK_TIMEOUT</b> +for RepMgr applications. For Base API applications, lease +refreshes will use the same mechanism as <b>PERM</b> messages and they +should +have no additional burden. This timeout is used for lease +refreshment and is the amount of time a reader will wait to refresh +leases before returning failure to the application from a read +operation.<br> +<br> +This timeout will be both stored +with its original value, and also +converted to a <i>db_timespec</i> +using the <b>DB_TIMEOUT_TO_TIMESPEC</b> +macro and have the clock skew accounted for and stored in the shared +rep structure:<br> +<pre>db_timeout_t lease_timeout;<br>db_timespec lease_duration;<br></pre> +NOTE: By sending the lease refresh during DB operations, we are +forcing/assuming that the operation's process has a replication +transport function set. That is obviously the case for write +operations, but would it be a burden for read processes (on a +master)? I think mostly not, but if we need leases for <i> +DB->stat</i> then we need to +document it as it is certainly possible for an application to have a +separate or dedicated <i>stat</i> +application or attempt to use <i>db_stat</i> +(which will not work if leases must be checked).<br> +<br> +Leases should be checked after the local operation so that we don't +have a window/boundary if we were to check leases first, get +descheduled, then lose our lease and then perform the operation. 
+Do the operation, then check leases before returning to the user.<br> +<h3>Using Leases</h3> +There is a new API that the user must call to tell the system to use +the lease mechanism. The method must be called before the +application calls <i>dbenv->rep_start</i> +or <i>dbenv->repmgr_start</i>. +This new +method is:<br> +<br> +<pre> dbenv->rep_set_lease(DB_ENV *dbenv, u_int32_t clock_scale_factor, u_int32_t flags)<br> +</pre> +The <i>clock_scale_factor</i> +parameter is interpreted as a percentage, greater than 100 (to transmit +a floating point number as an integer to the API) that represents the +maximum skew between any two sites' clocks. That is, a <span + style="font-style: italic;">clock_scale_factor</span> of 150 suggests +that the greatest discrepancy between clocks is that one runs 50% +faster than the others. Both the +master and client sides +compensate for possible clock skew. The master uses the value to +compensate in case the replica has a slow clock and replicas compensate +in case they have a fast clock. This scaling factor will need to +be divided by 100 on all sites to truly represent the percentage for +adjustments made to time values.<br> +<br> +Assume the slowest replica's clock is a factor of <i>clock_scale_factor</i> +slower than the +fastest clock. Using that assumption, if the fastest clock goes +from time t1 to t2 in X +seconds, the slowest clock does it in (<i>clock_scale_factor</i> / 100) +* X seconds.<br> +<br> +The <i>flags</i> parameter is not +currently used.<br> +<br> +When the <i>dbenv->rep_set_lease</i> +method is called, we will set a configuration flag indicating that +leases are turned on:<br> +<b>#define REP_C_LEASE <value></b>. +We will also record the <b>u_int32_t +clock_skew</b> value passed in. The <i>rep_set_lease</i> method +will not allow +calls after <i>rep_start. </i>If +multiple calls are made prior to calling <i>rep_start</i> then later +calls will +overwrite the earlier clock skew value. 
<br> +<br> +We need a new flag to prevent calling <i>rep_set_lease</i> +after <i>rep_start</i>. The +simplest solution would be to reject the call to +<i>rep_set_lease +</i>if<b> +REP_F_CLIENT</b> +or <b>REP_F_MASTER</b> is set. +However that does not work in the cases where a site cleanly closes its +environment and then opens without running recovery. The +replication state will still be set. The prevention will be +implemented as:<br> +<pre>#define REP_F_START_CALLED <some bit value><br></pre> +In __rep_start, at the end:<br> +<pre>if (ret == 0 ) {<br> REP_SYSTEM_LOCK<br> F_SET(rep, REP_F_START_CALLED)<br> REP_SYSTEM_UNLOCK<br>}</pre> +In <i>__rep_env_refresh</i>, if we +are the last reference closing the env (we already check for that):<br> +<pre>F_CLR(rep, REP_F_START_CALLED);</pre> +In order to avoid run-time floating point operations +on <i>db_timespec</i> structures, +when a site is declared as a client or master in <i>rep_start</i> we +will pre-compute the +lease duration based on the integer-based clock skew and the +integer-based lease timeout. A master should set a replica's +lease expiration to the <b>start time of +the sent message + +(lease_timeout / clock_scale_factor)</b> in case the replica has a +slow clock. Replicas extend their leases to <b>received message +time + (lease_timeout * +clock_scale_factor)</b> in case this replica has a fast clock. +Therefore, the computation will be as follows if the site is becoming a +master:<br> +<pre>db_timeout_t tmp;<br>tmp = (db_timeout_t)((double)rep->lease_timeout / ((double)rep->clock_skew / (double)100));<br>rep->lease_duration = DB_TIMEOUT_TO_TIMESPEC(&tmp);<br></pre> +Similarly, on a client the computation is:<br> +<pre>tmp = (db_timeout_t)((double)rep->lease_timeout * ((double)rep->clock_skew / (double)100));<br></pre> +When a site changes state, its lease duration will change based on +whether it is becoming a master or client and it will be recomputed +from the original values. 
Note that these computations, coupled +with the fact that the lease on the master is computed based on the +master's time that it sent the message means that leases on the master +are more conservatively computed than on the clients.<br> +<br> +The <i>dbenv->rep_set_lease</i> +method must be called after <i>dbenv->open</i>, +similar to <i>dbenv->rep_set_config</i>. +The reason is so that we can check that this is a replication +environment and we have access to the replication shared memory region.<br> +<h3>Read Operations<br> +</h3> +Authoritative read operations on the master with leases enabled will +abide by leases by default. We will provide a flag that allows an +operation on a master to ignore leases. <b>All read operations +on a client imply +ignoring leases.</b> If an application wants authoritative reads +they must forward the read requests to the master and it is the +application's responsibility to provide the forwarding. +The consensus was that forcing <span style="font-weight: bold;">DB_IGNORE_LEASE</span> +on client read operations (with leases enabled, obviously) was too +heavy handed. Read operations on the client will ignore leases, +but do no special flag checking.<br> +<br> +The flag will be called <b>DB_IGNORE_LEASE</b> +and it will be a flag that can be OR'd into the DB access method and +cursor operation values. It will be similar to the <b>DB_READ_UNCOMMITTED</b> +flag. 
+<br> +The methods that will +adhere to leases are:<br> +<ul> + <li><i>Db->get</i></li> + <li><i>Db->pget</i></li> + <li><i>Dbc->get</i></li> + <li><i>Dbc->pget</i></li> +</ul> +The code that will check leases for a client reading would look +something +like this, if we decide to become heavy-handed:<br> +<pre>if (IS_REP_CLIENT(dbenv)) {<br> [get to rep structure]<br> if (FLD_ISSET(rep->config, REP_C_LEASE) && !LF_ISSET(DB_IGNORE_LEASE)) {<br> db_err("Read operations must ignore leases or go to master");<br> ret = EINVAL;<br> goto err;<br> }<br>}<br></pre> +On the master, the new code to abide by leases is more complex. +After the call to perform the operation we will check the lease. +In that checking code, the master will see if it has a valid +lease. If so, then all is well. If not, it will try to +refresh the leases. If that refresh attempt results in leases, +all is well. If the refresh attempt does not get leases, then the +master cannot respond to the read as an authority and we return an +error. The new error is called <b>DB_REP_LEASE_EXPIRED</b>. +The location of the master lease check is down after the internal call +to read the data is successful:<br> +<pre>if (IS_REP_MASTER(dbenv) && !LF_ISSET(DB_IGNORE_LEASE)) {<br> [get to rep structure]<br> if (FLD_ISSET(rep->config, REP_C_LEASE) &&<br> (ret = __rep_lease_check(dbenv)) != 0) {<br> /*<br> * We don't hold the lease.<br> */<br> goto err;<br> }<br>}<br></pre> +See below for the details of <i>__rep_lease_check</i>.<br> +<br> +Also note that if leases (or replication) are not configured, then <span + style="font-weight: bold;">DB_IGNORE_LEASE</span> is a no-op. It +is ignored (and won't error) if used when leases are not in +effect. The reason is so that we can generically set that flag in +utility programs like <span style="font-style: italic;">db_dump</span> +that walk the database with a cursor. 
Note that <span + style="font-style: italic;">db_dump</span> is the only utility that +reads with a cursor.<span style="font-style: italic;"><span + style="font-style: italic;"></span></span><br> +<h3><b>Nsites +and Elections</b></h3> +The call to <i>dbenv->rep_set_nsites</i> +must be performed before the call to <i>dbenv->rep_start</i> +or <i>dbenv->repmgr_start</i>. +This document assumes either that <b>SR +14778</b> gets resolved, or assumes that the value of <i>nsites</i> is +immutable. The +master and all clients need to know how many sites and leases are in +the group. Clients need to know for elections. The master +needs to know for the size of the lease table and to know what value a +majority of the group is. <b>[Until +14778 is resolved, the master lease work must assume <i>nsites</i> is +immutable and will +therefore enforce that this is called before <i>rep_start</i> using +the same mechanism +as <i>rep_set_lease</i>.]</b><br> +<br> +Elections and leases need to agree on the number of sites in the +group. Therefore, when leases are in effect on clients, all calls +to <i>dbenv->rep_elect</i> must +set the <i>nsites</i> parameter to +0. The <i>rep_elect</i> code +path will return <b>EINVAL</b> if <b>REP_C_LEASE</b> is set and <i>nsites</i> +is non-0. +<h2>Lease Management</h2> +<h3>Message Changes</h3> +In order for clients to grant leases to the master a new message type +must be added for that purpose. This will be the <b>REP_LEASE_GRANT</b> +message. +Granting leases will be a result of applying a <b>DB_REP_PERMANENT</b> +record and therefore we +do not need any additional message in order for a master to request a +lease grant. 
The <b>REP_LEASE_GRANT</b> +message will pass a structure as its message DBT:<br> +<pre>struct __rep_lease_grant {<br> db_timespec msg_time;<br>#ifdef DIAGNOSTIC<br> db_timespec expire_time;<br>#endif<br>} REP_GRANT_INFO;<br></pre> +In the <b>REP_LEASE_GRANT</b> +message, the client is actually giving the master several pieces of +information. We only need the echoed <i>msg_time</i> in this +structure because +everything else is already sent. The client is really sending the +master:<br> +<ul> + <li>Its EID (parameter to <span style="font-style: italic;">rep_send_message</span> +and <span style="font-style: italic;">rep_process_message</span>)<br> + </li> + <li>The PERM LSN this message acknowledged (sent in the control +message)</li> + <li>Unique identifier echoed back to master (<i>msg_time</i> sent in +message as above)</li> +</ul> +On the client, we always maintain the maximum PERM LSN already in <i>lp->max_perm_lsn</i>. +<h3>Local State Management</h3> +Each client must maintain a <i>db_timespec</i> +timestamp containing the expiration of its granted lease. This +field will be in the replication shared memory structure:<br> +<pre>db_timespec grant_expire;<br></pre> +This timestamp already takes into account the clock skew. All +new fields must be initialized when the region is created. Whenever we +grant our master lease and want to send the <b>REP_LEASE_GRANT</b> +message, this value +will be updated. 
It will be used in the following way: +<pre>db_timespec mytime;<br>DB_LSN perm_lsn;<br>DBT lease_dbt;<br>REP_GRANT_INFO gi;<br><br><br>timespecclear(&mytime);<br>timespecclear(&newgrant);<br>memset(&lease_dbt, 0, sizeof(lease_dbt));<br>memset(&gi, 0, sizeof(gi));<br>__os_gettime(dbenv, &mytime);<br>timespecadd(&mytime, &rep->lease_duration);<br>MUTEX_LOCK(rep->clientdb_mutex);<br>perm_lsn = lp->max_perm_lsn;<br>MUTEX_UNLOCK(rep->clientdb_mutex);<br>REP_SYSTEM_LOCK(dbenv);<br>if (timespeccmp(mytime, rep->grant_expire, >))<br> rep->grant_expire = mytime;<br>gi.msg_time = msg->msg_time;<br>#ifdef DIAGNOSTIC<br>gi.expire_time = rep->grant_expire;<br>#endif<br>lease_dbt.data = &gi;<br>lease_dbt.size = sizeof(gi);<br>REP_SYSTEM_UNLOCK(dbenv);<br>__rep_send_message(dbenv, eid, REP_LEASE_GRANT, &perm_lsn, &lease_dbt, 0, 0);<br></pre> +This updating of the lease grant will occur in the <b>PERM</b> code +path when we have +successfully applied the permanent record.<br> +<h3>Maintaining Leases on the +Master/Rep_start</h3> +The master maintains a lease table that it checks when fulfilling a +read request that is subject to leases. This table is initialized +when a site calls<i> +dbenv->rep_start(DB_MASTER)</i> and the site is undergoing a role +change (i.e. a master making additional calls to <i>dbenv->rep_start(DB_MASTER)</i> +does +not affect an already existing table).<br> +<br> +When a non-master site becomes master, it must do two things related to +leases on a role change. First, a client cannot upgrade to master +while it has an outstanding lease granted to another site. If a +client attempts to do so, an error, <b>EINVAL</b>, +will be returned. The only way this should happen is if the +application simply declares a site master, instead of using +elections. Elections will already wait for leases to expire +before proceeding. (See below.) 
+<br> +<br> +Second, once we are proceeding with becoming a master, the site must +allocate the table it will use to maintain lease information. +This table will be sized based on <i>nsites</i> +and it will be an array of the following structure:<br> +<pre>struct {<br> int eid; /* EID of client site. */<br> db_timespec start_time; /* Unique time ID client echoes back on grants. */<br> db_timespec end_time; /* Master's lease expiration time. */<br> DB_LSN lease_lsn; /* Durable LSN this lease applies to. */<br> u_int32_t flags; /* Unused for now?? */<br>} REP_LEASE_ENTRY;<br></pre> +<h3>Granting Leases</h3> +It is the burden of the application to make sure that all sites in the +group +are using leases, or none are. Therefore, when a client processes +a <b>PERM</b> +log record that arrived from the master, it will grant its lease +automatically if that record is permanent (i.e. <b>DB_REP_ISPERM</b> +is being returned), +and leases are configured. A client will not send a +lease grant when it is processing log records (even <b>PERM</b> +ones) it receives from other clients that use client-to-client +synchronization. The reason is that the master requires a unique +time-of-msg ID (see below) that the client echoes back in its lease +grant and it will not have such an ID from another client.<br> +<br> +The master stores a time-of-msg ID in each message and the client +simply echoes it back to the master. In its lease table, it does +keep the base +time-of-msg for a valid lease. When <b>REP_LEASE_GRANT</b> +message comes in, +the master does a number of things:<br> +<ol> + <li>Pulls the echoed timespec from the client message, into <i>msg_time</i>.<br> + </li> + <li>Finds the entry in its lease table for the client's EID. It +walks the table searching for the ID. EIDs of <span + style="font-weight: bold;">DB_EID_INVALID</span> are +illegal. Either the master will find the entry, or it will find +an empty slot in the table (i.e. 
it is still populating the table with +leases).</li> + <li>If this is a previously unknown site lease, the master +initializes the entry by copying to the <i>eid</i>, <i>start_time, </i>and + <i>lease_lsn</i> fields. The master +also computes the <i>end_time</i> +based on the adjusted <i>rep->lease_duration</i>.</li> + <li>If this is a lease from a previously known site, the master must +perform <i>timespeccmp(&msg_time, +&table[i].start_time, >)</i> and only update the <i>end_time</i> +of the lease when this is +a more recent message. If it is a more recent message, then we +should update +the <i>lease_lsn</i> to the LSN in +the message.</li> + <li>Since lease durations are computed taking the clock skew into +account, clients compute them based on the current time and the master +computes it based on original sending time, for diagnostic purposes +only, I also plan to send the client's expiration time. The +client errs on the side of computing a larger lease expiration time and +the master errs on the side of computing a smaller duration. +Since both are taking the clock skew +into account, the client's ending expiration time should never be +smaller than +the master's computed expiration time or their value for clock skew may +not be correct.<br> + </li> +</ol> +Any log records (new or resent) that originate from the master and +result in <b>DB_REP_ISPERM</b> get an +ack.<br> +<br> +<h3>Refreshing Leases</h3> +Leases get refreshed when a master receives a <b>REP_LEASE_GRANT</b> +message from a client. There are three pieces to lease +refreshment. <br> +<h4>Lazy Lease Refreshing on Read<br> +</h4> +If the master discovers that leases are +expired during the read operation, it attempts to refresh its +collection of lease grants. It does this by calling a new +function <i>__rep_lease_refresh</i>. +This function is very similar to the already-existing function <i>__rep_flush</i>. 
+Basically, to +refresh the lease, the master simply needs to resend the last PERM +record to the clients. The requirements state that when the +application send function returns successfully from sending a PERM +record, the majority of clients have that PERM LSN durable. We +will have a new public DB error return called <b>DB_REP_LEASE_EXPIRED</b> +that will be +returned back to the caller if the master cannot assert its +authority. The code will look something like this:<br> +<pre>/*<br> * Use lp->max_perm_lsn on the master (currently not used on the master)<br> * to keep track of the last PERM record written through the logging system.<br> * need to initialize lp->max_perm_lsn in rep_start on role_chg.<br> */<br>call __rep_send_message on the last PERM record the master wrote, with DB_REP_PERMANENT<br>if failure<br> expire leases<br> return lease expired error to caller<br>else /* success */<br> recheck lease table<br> /*<br> * We need to recheck the lease table because the client<br> * lease grant messages may not be processed yet, or got<br> * lost, or racing with the application's ACK messages or<br> * whatever. <br> */<br> if we have a majority of valid leases<br> return success<br> else<br> return lease expired error to caller <br></pre> +<h4>Ongoing Update Refreshment<br> +</h4> +Second is having the master indicate to +the client it needs to send a lease grant in response to the current +PERM log message. The problem is +that acknowledgements must contain a master-supplied message timestamp +that the client sends back to the master. We need to modify the +structure of the log record messages when leases are configured +so +that when a PERM message is sent, the master sends, and the client +expects, the message timestamp. There are three fairly +straightforward and different implementations to consider.<br> +<ol> + <li>Adding the timestamp to the <b>REP_CONTROL</b> +structure. 
If this option is chosen, then the code trivially +sends back the timestamp in the client's reply. There is no +special processing done by either side with the message contents. +So, on a PERM log record, the master will send a non-zero +timestamp. On a normal log record the timestamp will be zero or +some known invalid value. If the client sees a non-zero +timestamp, it sends a <b>REP_LEASE_GRANT</b> +with the <i>lp->max_perm_lsn</i> +after applying that log record. If it is zero, then the client +does nothing different. The advantage is ease of code. The +disadvantage is that for mixed version systems, the client is now +dealing with different sized control structures. We would have to +retain the old control structure so that during a mixed version group +the (upgraded) clients can use, expect and send old control structures +to the master. This is unfortunate, so let's consider additional +implementations that don't require modifying the control structure.<br> + </li> + <li>Adding a new <b>REPCTL_LEASE</b> +flag to the list of flags for the control structure, but do not change +the control structure fields. When a master wants to send a +message that needs a lease ack, it sets the flag. Additionally, +instead of simply sending a log record DBT as the <i>rec</i> parameter +for replication, we +would send a new structure that had the timestamp first and then the +record (similar to the bulk transfer buffer). The advantage of +this is that the control structure does not change. Disadvantages +include more special-cased code in the normal code path where we have +to check the flag. If the flag is set we have to extract the +timestamp value and massage the incoming data to pass on the real log +record to <i>rep_apply</i>. On +bulk transfer, we would just add the timestamp into the buffer. +On normal transfers, it would incur an additional data copy on the +master side. That is unfortunate. 
Additionally, if this +record needs to be stored in the temp db, we need some way to get it +back again later or <span style="font-style: italic;">rep_apply</span> +would have to extract the timestamp out when it processed the record +(either live or from the temp db).<br> + </li> + <li>Adding a different message type, such as <b>REP_LOG_ACK</b>. +Similarly to <b>REP_LOG_MORE</b> this message would be a +special-case version of a log record. We would extract out the +timestamp and then handle as a normal log record. This +implementation is rejected because it actually would require three new +message types: <b>REP_LOG_ACK, +REP_LOG_ACK_MORE, REP_BULK_LOG_ACK</b>. That is just too ugly +to contemplate.</li> +</ol> +<b>[Slight digression:</b> it occurs +to me while writing about #2 and #3 above, that our implementation of +all of the *_MORE messages could really be implemented with a <b>REPCTL_MORE</b> +flag instead of a +separate message type. We should clean that up and simplify the +messages but not part of master leases. Hmm, taking that thought +process further, we really could get rid of the <b>REP_BULK_*</b> +messages as well if we +added a <b>REPCTL_BULK</b> +flag. I think we should definitely do it for the *_MORE +messages. I am not sure we should do it for bulk because the +structure of the incoming data record is vastly different.]<br> +<br> +Of these options, I believe that modifying the control structure is the +best alternative. The handling of the old structure will be very +isolated to code dealing with old versions and is far less complicated +than injecting the timestamp into the log record DBT and doing a data +copy. Actually, I will likely combine #1 and the flag from #2 +above. I will have the <b>REPCTL_LEASE</b> +flag that indicates a lease grant reply is expected and have the +timestamp in the control structure. 
+Also I will probably add in a spare field or two for future use in the <b>REP_CONTROL</b> +structure.<br> +<h4>Gap processing</h4> +No matter which implementation we choose for ongoing lease refreshment, +gap processing must be considered. The code above assumes the +timestamps will be placed on PERM records only. Normal log +records will not have a timestamp, nor a flag or anything else like +that. However, any log message can fill a gap on a client and +result in the processing of that normal log record to return <b>DB_REP_ISPERM</b> +because later records +were also processed.<br> +<br> +The current implementation should work fine in that case because when +we store the message in the client temp db we store both the control +DBT and the record DBT. Therefore, when a normal record fills a +gap, the later PERM record, when retrieved will look just like it did +when it arrived. The client will have access to the LSN, and the +timestamp, etc. However, it does mean that sending the <b>REP_LEASE_GRANT</b> +message must take +place down in <i>__rep_apply</i> +because that is the only place we have access to the contents of those +stored records with the timestamps.<br> +<br> +There are two logical choices to consider for granting the lease when +processing an update. As we process (either a live record or one +read from the temp db after filling a gap) a PERM message, we send the <b>REP_LEASE_GRANT</b> +message for each +PERM record we successfully apply. Or, second, we keep track of +the largest timestamp of all PERM records we've processed and at the +end of the function after we've applied all records, we send back a +single lease grant with the <i>max_perm_lsn</i> +and a new <i>max_lease_timestamp</i> +value to the master. 
The first is easier to implement, the second +results in possibly slightly fewer messages at the expense of more +bookkeeping on the client.<br> +<br> +A third, more complicated option would be to have the message timestamp +on all records, but grants are only sent on the PERM messages. A +reason to do this is that the later timestamp of a normal log record +would be used as the timestamp sent in the reply and the master would +get a more up to date timestamp value and a longer lease. <br> +<br> +If we change the <span style="font-weight: bold;">REP_CONTROL</span> +structure to include the timestamp, we potentially break or at least +need to revisit the gap processing algorithm. That code assumes +that the control and record elements for the same LSN look the same +each and every time. The code stores the <span + style="font-style: italic;">control</span> DBT as the key and the <span + style="font-style: italic;">rec</span> DBT as the data. We use a +specialized compare function to sort based on the LSN in the control +DBT. With master leases, the same record transmitted by a master +multiple times or client for the same LSN will be different because the +timestamp field will not be the same. Therefore, the client will +end up with duplicate entries in the temp database for the same +LSN. Both solutions (adding the timestamp to <span + style="font-weight: bold;">REP_CONTROL</span> and adding a <span + style="font-weight: bold;">REPCTL_LEASE</span> flag) can yield +duplicate entries. The flag would cause the same record from the +master and client to be different as well.<br> +<h4>Handling Incoming Lease Grants<br> +</h4> +The third piece of lease management is handling the incoming <b>REP_LEASE_GRANT</b> +message on the +master. 
When this message is received, the master must do the +following:<br> +<pre>REP_SYSTEM_LOCK<br>msg_timestamp = cntrl->timestamp;<br>client_lease = __rep_lease_entry(dbenv, client eid)<br>if (client_lease == NULL) {<br> initial lease for this site, DB_ASSERT there is space in the table<br> add this to the table if there is space<br>} else {<br> compare msg_timestamp with client_lease->start_time<br> if (msg_timestamp is more recent && msg_lsn >= lease LSN)<br> update entry in table<br>}<br>REP_SYSTEM_UNLOCK<br></pre> +<h3>Expiring Leases</h3> +Leases can expire in two ways. First they can expire naturally +due to the passage of time. When checking leases, if the current +time is later than the lease entry's <i>end_time</i> +then the lease is expired. Second, they can be forced with a +premature expiration when the application's transport function returns +an error. In the first case, there is nothing to do; in the +second case we need to manipulate the <i>end_time</i> +so that all future lease checks fail. Since the lease <i>start_time</i> +is guaranteed to not be in the future we will have a function <i>__rep_lease_expire</i> +that will:<br> +<pre>REP_SYSTEM_LOCK<br>for each entry in the lease table<br> entry->end_time = entry->start_time;<br>REP_SYSTEM_UNLOCK<br></pre> +Is there a potential race or problem with prematurely expiring +leases? Consider an application that enforces an ALL +acknowledgement policy for PERM records in its transport +callback. There are four clients and three send the PERM ack to +the application. The callback returns an error to the master DB +code. The DB code will now prematurely expire its leases. +However, at approximately the same time the three clients are also +sending their <span style="font-weight: bold;">REP_LEASE_GRANT</span> +messages to the master. There is a race between the master +processing those messages and the thread handling the callback failure +expiring the table. 
This is only an issue if the messages arrive +after the table has been expired.<br> +<br> +Let's assume all three clients send their grants after the master +expires the table. If we accept those grants and then a read +occurs the read will succeed since the master has a majority of leases +even though the callback failed earlier. Is that a problem? +The lease code is using a majority and the application policy is using +some other value. It feels like this should be okay since +the data is held by leases on a majority. Should we consider +having the lease checking threshold be the same as the permanent ack +policy? That is difficult because Base API users implement +whatever they want and DB does not know what it is.<br> +<h3>Checking Leases</h3> +When a read operation on the master completes, the last thing we need +to do is verify the master leases. We've already discussed +refreshing them when they are expired above. We need two things +for a lease to be valid. It must be within the timeframe of the +lease grant and the lease must be valid for the last PERM record +LSN. 
Here is the logic +for checking the validity of leases in <i>__rep_lease_check</i>:<br> +<pre>#define MAX_REFRESH_TRIES 3<br>DB_LSN lease_lsn;<br>REP_LEASE_ENTRY *entry;<br>u_int32_t min_leases, valid_leases;<br>db_timespec cur_time;<br>int ret, tries;<br><br> tries = 0;<br>retry:<br> ret = 0;<br> LOG_SYSTEM_LOCK<br> lease_lsn = lp->lsn;<br> LOG_SYSTEM_UNLOCK<br> REP_SYSTEM_LOCK<br> min_leases = rep->nsites / 2;<br> __os_gettime(dbenv, &cur_time);<br> for (entry = head of table, valid_leases = 0; entry != NULL && valid_leases < min_leases; entry++)<br> if (timespec_cmp(&entry->end_time, &cur_time) >= 0 && log_compare(&entry->lsn, &lease_lsn) == 0)<br> valid_leases++;<br> REP_SYSTEM_UNLOCK<br> if (valid_leases < min_leases) {<br> ret = __rep_lease_refresh(dbenv, ...);<br> /*<br> * If we are successful, we need to recheck the leases because <br> * the lease grant messages may have raced with the PERM<br> * acknowledgement. Give those messages a chance to arrive.<br> */<br> if (ret == 0) {<br> if (tries <= MAX_REFRESH_TRIES) {<br> /*<br> * If we were successful sending, but not successful in racing the<br> * message thread, yield the processor so that message<br> * threads may have a chance to run.<br> */<br> if (tries > 0)<br> /* __os_sleep instead?? */<br> __os_yield();<br> tries++;<br> goto retry;<br> } else<br> ret = DB_REP_LEASE_EXPIRED;<br> }<br> }<br> return (ret);</pre> +If the master has enough valid leases it returns success. If it +does not have enough, it attempts to refresh them. This attempt +may fail if sending the PERM record does not receive sufficient +acks. If we do receive sufficient acknowledgements we may still +find that scheduling of message threads means the master hasn't yet +processed the incoming <b>REP_LEASE_GRANT</b> +messages. We will retry a couple of times (possibly +parameterized) if the master discovers that situation. 
<br> +<h2>Elections</h2> +When a client grants a lease to a master, it gives up the right to +participate in an election until that grant expires. If we are +the master and <i>dbenv->rep_elect</i> +is called, it should return, no matter what, like it does today. +If we are a client and <i>rep_elect</i> +is called special processing takes place when leases are in +effect. First, the easy case is if the lease granted by this +client has already expired, then the client goes directly into the +election as normal. If a valid lease grant is outstanding to a +master, this site cannot participate in an election until that grant +expires. We have at least two options when a site calls the <i>dbenv->rep_elect</i> +API while +leases are in effect.<br> +<ol> + <li>The simplest coding solution for DB would be simply to refuse to +participate in the election if this site has a current lease granted to +a master. We would detect this situation and return EINVAL. +This is correct behavior and trivial to implement. The +disadvantage of this solution is that the application would then be +responsible for repeatedly attempting an election until the lease grant +expired.<br> + </li> + <li>The more satisfying solution is for DB to wait the remaining time +for the grant. If this client hears from the master during that +time the election does not take place and the call to <i>rep_elect</i> +returns with the +information for the current/old master.</li> +</ol> +<h3>Election Code Changes</h3> +The code changes to support leases in the election code are fairly +isolated. First if leases are configured, we must verify the <i>nsites</i> +parameter is set to 0. +Second, in <i>__rep_elect_init</i> +we must not overwrite the value of <i>rep->nsites</i> +for leases because it is controlled by the <i>dbenv->rep_set_nsites</i> +API. +These changes are small and easy to understand.<br> +<br> +The more complicated code will be the client code when it has an +outstanding lease granted. 
The client will wait for the current +lease grant to expire before proceeding with the election. The +client will only do so if it does not hear from the master for the +remainder of the lease grant time. If the client hears from the +master, it returns and does not begin participating in the +election. A new election phase, <b>REP_EPHASE0</b> +will exist so that the call to <i>__rep_wait</i> +can detect if a master responds. The client, while waiting for +the lease grant to expire, will send a <b>REP_MASTER_REQ</b> +message so that the master will respond with a <b>REP_NEWMASTER</b> +message and thus +allow the client to know the master exists. However, it is also +desirable that if the master +replies to the client, the master wants the client to update its lease +grant. <br> +<br> +Recall that the <b>REP_NEWMASTER</b> +message does not result in a lease grant from the client. The +client responds when it processes a PERM record that has the <b>REPCTL_LEASE</b> +flag set in the message +with its lease grant up to the given LSN. Therefore, we want the +client's <b>REP_MASTER_REQ</b> to +yield both the discovery of the existing master and have the master +refresh its leases. The client will also use the <b>REPCTL_LEASE</b> +flag in its <b>REP_MASTER_REQ</b> message to the +master. This flag will serve as the indicator to the master that +it needs to deal with leases and both send the <b>REP_NEWMASTER</b> +message and refresh +the lease.<br> +The code will work as follows:<br> +<pre>if (leases_configured && (my_grant_still_valid || lease_never_granted)) {<br> if (lease_never_granted)<br> wait_time = lease_timeout<br> else<br> wait_time = grant_expiration - current_time<br> F_SET(REP_F_EPHASE0);<br> __rep_send_message(..., REP_MASTER_REQ, ... 
REPCTL_LEASE);<br> ret = __rep_wait(..., REP_F_EPHASE0);<br> if (we found a master)<br> return<br>} /* if we don't return, fall out and proceed with election */<br></pre> +On the master side, the code handling the <b>REP_MASTER_REQ</b> will +do:<br> +<pre>if (I am master) {<br> ...<br> __rep_send_message(REP_NEWMASTER...)<br> if (F_ISSET(rp, REPCTL_LEASE))<br> __rep_lease_refresh(...)<br>}<br></pre> +Other minor implementation details are that <i>__rep_elect_done</i> +must also clear +the <b>REP_F_EPHASE0</b> flag. +We also, obviously, need to define <b>REP_F_EPHASE0</b> +in the list of replication flags. Note that the client's call to <i>__rep_wait</i> +will return upon +receiving the <b>REP_NEWMASTER</b> +message. The client will independently refresh its lease when it +receives the log record from the master's call to refresh the lease.<br> +<br> +Again, similar to what I suggested above, the code could simply assume +global leases are configured, and instead of having the <b>REPCTL_LEASE</b> +flag at all, the master +assumes that it needs to refresh leases because it has them configured, +not because it is specified in the <b>REP_MASTER_REQ</b> +message it is processing. Right now I don't think every possible +<b>REP_MASTER_REQ</b> message should result in a lease grant request.<br> +<h4>Elections and Quiescent Systems</h4> +It is possible that a master is slow or the client is close to its +expiration time, or that the master is quiescent and all leases are +currently expired, but nothing much is going on anyway, yet some client +calls <i>__rep_elect</i> at that +time. In the code above, we will not send the <b>REP_MASTER_REQ</b> +because the lease is +not valid. The client will simply proceed directly to sending the +<b>REP_VOTE1</b> message, throwing all +other clients into an election. The master is still master and +should stay that way. 
Currently in response to a vote message, a +master will broadcast out a <b>REP_NEWMASTER</b> +to assert its mastership. That causes the election to +complete. However, if desired the master may want to proactively +refresh its leases. This situation indicates to me that the +master should choose to refresh leases based on configuration, not a +flag sent from the client. I believe anytime the master asserts +its mastership via sending a <b>REP_NEWMASTER</b> +message that I need to add code to proactively refresh leases at that +time.<br> +<h2>Other Implementation Details</h2> +<h3>Role Changes<br> +</h3> +When a site changes its role via a call to <i>rep_start</i> in either +direction, we +must take action when leases are configured. There are three +types of role changes that all need changes to deal with leases:<br> +<ol> + <li><i>A master downgrading to a +client.</i> When a master downgrades to a client, it can do so +immediately after it has proactively expired all existing leases it +holds. This situation is similar to an error from the send +callback, and it effectively cancels all outstanding leases held on +this site. Note that if this master expires its leases, it does +not have any effect on when the clients' lease grants expire on the +client side. The clients must still wait their full expected +grant time.<br> + </li> + <li><i>A client upgrading to master.</i> +If a client is upgrading to a master but it has an outstanding lease +granted to another site, the code will return an <b>EINVAL</b> +error. This situation +only arises if the application simply declares this site master. +If a site wins an election then the election itself should have waited +long enough for the granted lease to expire and this state should not +arise then.</li> + <li><i>A client finding a new master.</i> +When a client discovers a new and different master, via a <b>REP_NEWMASTER</b> +message then the +client cannot accept that new master until its current lease grant +expires. 
This situation should only occur when a site declares +itself master without an election and that site's lease grant expires +before this client's grant expires. However, it is <b>possible</b> +for this situation to arise +with elections also. If we have 5 sites holding an election and 4 +of those sites have leases expire at about the same time T, and this +site's lease expires at time T+N and the election timeout is < N, +then those 4 sites may hold an election and elect a master without this +site's participation. A client in this situation must call <i>__rep_wait</i> +with the time remaining +on its lease. If the lease is expired after waiting the remaining +time, then the client can accept this new master. If the lease +was refreshed during the waiting period then the client does not accept +this new master and returns.<br> + </li> +</ol> +<h3>DUPMASTER</h3> +A duplicate master situation can occur if an old master becomes +disconnected from the rest of the group, that group elects a new master +and then the partition is resolved. The requirement for master +leases is that this situation will not cause the newly elected, +rightful master to receive the <b>DB_REP_DUPMASTER</b> +return. It is okay for the old master to get that return +value. When a dual master situation exists, the following will +happen:<br> +<ul> + <li><i>On the current master and all +current clients</i> - If the current master receives an update +message or other conflicting message from the old master then that +message will be ignored because the generation number is out of date.</li> + <li><i>On the old master</i> - If +the old master receives an update message from the current master, or +any other message with a later generation from any site, the new +generation number will trigger this site to return <b>DB_REP_DUPMASTER</b>. 
+However, +instead of broadcasting out the <b>REP_DUPMASTER</b> +message to shoot down others as well, this site, if leases are +configured, will call <i>__rep_lease_check</i> +and if they are expired, return the error. It should be +impossible for us to receive a later generation message and still hold +a majority of master leases. Something is seriously wrong and we +will <b>DB_ASSERT</b> this situation +cannot happen.<br> + </li> +</ul> +<h3>Client to Client Synchronization</h3> +One question to ask is how lease grants interact with client-to-client +synchronization. The only answer is that they do not. A client +that is sending log records to another client cannot request the +receiving client refresh its lease with the master. That client +does not have a timestamp it can use for the master and clock skew +makes it meaningless between machines. Therefore, sites that use +client-to-client synchronization will likely see more lease refreshment +during the read path and leases will be refreshed during live updates +only. Of course, if a client supplies log records that fill a +gap, and the later log records stored came from the master in a live +update then the client will respond as per the discussion on Gap +Processing above.<br> +<h2>Interaction Matrix</h2> +If leases are granted (by a client) or held (by a master) what should +the following APIs and messages do?<br> +<br> +Other:<br> +log_archive: Leases do not affect log_archive. OK.<br> +dbenv->close: OK.<br> +crash during lease grant and restart: <b>Potential +problem here. See discussion below</b>.<br> +<br> +Rep Base API method:<br> +rep_elect: Already discussed above. Must wait for lease to expire.<br> +rep_flush: Master only, OK - this will be the basis for refreshing +leases.<br> +rep_get_*: Not affected by leases.<br> +rep_process_message: Generally OK. 
We'll discuss each message +below.<br> +rep_set_config: OK.<br> +rep_set_limit: OK<br> +rep_set_nsites: Must be called before <i>rep_start</i> +and <i>nsites</i> is immutable until +14778 is resolved.<br> +rep_set_priority: OK<br> +rep_set_timeout: OK. Used to set lease timeout.<br> +rep_set_transport: OK.<br> +rep_start(MASTER): Role changes are discussed above. Make sure +duplicate rep_start calls are no-ops for leases.<br> +rep_start(CLIENT): Role changes are discussed above. Make sure +duplicate calls are no-ops for leases.<br> +rep_stat: OK.<br> +rep_sync: Should not be able to happen. Client cannot accept new +master with outstanding lease grant. Add DB_ASSERT here.<br> +<br> +REP_ALIVE: OK.<br> +REP_ALIVE_REQ: OK.<br> +REP_ALL_REQ: OK.<br> +REP_BULK_LOG: OK. Clients check to send ACK.<br> +REP_BULK_PAGE: Should never process one with lease granted. Add +DB_ASSERT.<br> +REP_DUPMASTER: Should never happen, this is what leases are supposed to +prevent. See above.<br> +REP_LOG: OK. Clients check to send ACK.<br> +REP_LOG_MORE: OK. Clients check to send ACK.<br> +REP_LOG_REQ: OK.<br> +REP_MASTER_REQ: OK.<br> +REP_NEWCLIENT: OK.<br> +REP_NEWFILE: OK. Clients check to send ACK.<br> +REP_NEWMASTER: See above.<br> +REP_NEWSITE: OK.<br> +REP_PAGE: OK. Should never process one with lease granted. +Add DB_ASSERT.<br> +REP_PAGE_FAIL: OK. Should never process one with lease +granted. Add DB_ASSERT.<br> +REP_PAGE_MORE: OK. Should never process one with lease +granted. Add DB_ASSERT.<br> +REP_PAGE_REQ: OK.<br> +REP_REREQUEST: OK.<br> +REP_UPDATE: OK. Should never process one with lease +granted. Add DB_ASSERT.<br> +REP_UPDATE_REQ: OK. This is a master-only message.<br> +REP_VERIFY: OK. Should never process one with lease +granted. Add DB_ASSERT.<br> +REP_VERIFY_FAIL: OK. Should never process one with lease +granted. Add DB_ASSERT.<br> +REP_VERIFY_REQ: OK.<br> +REP_VOTE1: OK. See Election discussion above. It is +possible to receive one with a lease granted. 
Client cannot send +one with an outstanding lease however.<br> +REP_VOTE2: OK. See Election discussion above. It is +possible to receive one with a lease granted.<br> +<br> +If the following method or message processing is in progress and a +client wants to grant a lease, what should it do? Let's examine +what this means. The client wanting to grant a lease simply means +it is responding to the receipt of a <b>REP_LOG</b> +(or its variants) message and applying a log record. Therefore, +we need to consider a thread processing a log message racing with these +other actions.<br> +<br> +Other:<br> +log_archive: OK. <br> +dbenv->close: User error. User should not be closing the env +while other threads are using that handle. Should have no effect +if a 2nd dbenv handle to same env is closed.<br> +<br> +Rep Base API method:<br> +rep_elect: See Election discussion above. <i>rep_elect</i> +should wait and may grant +lease while election is in progress.<br> +rep_flush: Should not be called on client.<br> +rep_get_*: OK.<br> +rep_process_message: Generally OK. See handling each message +below.<br> +rep_set_config: OK.<br> +rep_set_limit: OK.<br> +rep_set_nsites: Must be called before <i>rep_start</i> +until 14778 is resolved.<br> +rep_set_priority: OK.<br> +rep_set_timeout: OK.<br> +rep_set_transport: OK.<br> +rep_start(MASTER): OK, can't happen - already protect racing <i>rep_start</i> +and <i>rep_process_message</i>.<br> +rep_start(CLIENT): OK, can't happen - already protect racing <i>rep_start</i> +and <i>rep_process_message</i>.<br> +rep_stat: OK.<br> +rep_sync: Shouldn't happen because client cannot grant leases during +sync-up. Incoming log message ignored.<br> +<br> +REP_ALIVE: OK.<br> +REP_ALIVE_REQ: OK.<br> +REP_ALL_REQ: OK.<br> +REP_BULK_LOG: OK.<br> +REP_BULK_PAGE: OK. Incoming log message ignored during internal +init.<br> +REP_DUPMASTER: Shouldn't happen. 
See DUPMASTER discussion above.<br> +REP_LOG: OK.<br> +REP_LOG_MORE: OK.<br> +REP_LOG_REQ: OK.<br> +REP_MASTER_REQ: OK.<br> +REP_NEWCLIENT: OK.<br> +REP_NEWFILE: OK.<br> +REP_NEWMASTER: See above. If a client accepts a new master +because its lease grant expired, then that master sends a message +requesting the lease grant, this client will not process the log record +if it is in sync-up recovery, or it may after the master switch is +complete and the client doesn't need sync-up recovery. Basically, +just uses existing log record processing/newmaster infrastructure.<br> +REP_NEWSITE: OK.<br> +REP_PAGE: OK. Receiving a log record during internal init PAGE +phase should ignore log record.<br> +REP_PAGE_FAIL: OK.<br> +REP_PAGE_MORE: OK.<br> +REP_PAGE_REQ: OK.<br> +REP_REREQUEST: OK.<br> +REP_UPDATE: OK. Receiving a log record during internal init +should ignore log record.<br> +REP_UPDATE_REQ: OK - master-only message.<br> +REP_VERIFY: OK. Receiving a log record during verify phase +ignores log record.<br> +REP_VERIFY_FAIL: OK.<br> +REP_VERIFY_REQ: OK.<br> +REP_VOTE1: OK. This client is processing someone else's vote when +the lease request comes in. That is fine. We protect our +own election and lease interaction in <i>__rep_elect</i>.<br> +REP_VOTE2: OK.<br> +<h4>Crashing - Potential Problem<br> +</h4> +It appears there is one area where we could have a problem. I +believe that crashes can cause us to break our guarantee on durability, +authoritative reads and inability to elect duplicate masters. +Consider this scenario:<br> +<ol> + <li>A master and 4 clients are all up and running.</li> + <li>The master commits a txn and all 4 clients refresh their lease +grants at time T.</li> + <li>All 4 clients have the txn and log records in the cache. 
+None are flushing to disk.</li> + <li>All 4 clients have responded to the PERM messages as well as +refreshed their lease with the master.</li> + <li>All 4 clients hit the same application coding error and crash +(machine/OS stays up).</li> + <li>Master authoritatively reads data in txn from step 2.</li> + <li>All 4 clients restart the application and run recovery, thus the +txn from step 2 is lost on all clients because it isn't in any logs.<span + style="font-weight: bold;"></span><br> + </li> + <li>A network partition happens and the master is alone on its side.</li> + <li>All 4 clients are on the other side and elect a new master.</li> + <li>Partition resolves itself and we have duplicate masters, where +the former master still holds all valid lease grants.<span + style="font-weight: bold;"></span><br> + </li> +</ol> +Therefore, we have broken both guarantees. In step 6 the data is +really not durable and we've given it to the user. One can argue +that if this is an issue the application had better be syncing somewhere if +they really want durability. However, worse than that is that we +have a legitimate DUPMASTER situation in step 10 where both masters +hold valid leases. The reason is that all lease knowledge is in +the shared memory and that is lost when the app restarts and runs +recovery.<br> +<br> +How can we solve this? The obvious solution is (ugh, yet another) +durable BDB-owned file with some information in it, such as the current +lease expiration time so that rebooting after a crash leaves the +knowledge that the lease was granted. However, writing and +syncing every lease grant on every client out to disk is far too +expensive.<br> +<br> +A second possible solution is to have clients wait a full lease timeout +before entering an election the first time. This solution solves the +DUPMASTER issue, but not the non-authoritative read. This +solution naturally falls out of elections and leases really. 
If a +client has never granted a lease, it should be considered as having to +wait a full lease timeout before entering an election. +Applications already know that leases impact elections and this does +not seem so bad as it is only on the first election.<br> +<br> +Is it sufficient to document that the authoritative read is only as +authoritative as the durability guarantees they make on the sites that +indicate it is permanent? Yes, I believe this is sufficient. If +the application says it is permanent and it really isn't, then the +application is at fault. Believing the application when it +indicates with the PERM response that it is permanent avoids the +authoritative problem. <br> +<h2>Upgrade/Mixed Versions</h2> +Clearly leases cannot be used with mixed version sites since masters +running older releases will not have any knowledge of lease +support. What considerations are needed in the lease code for +mixed versions?<br> +<br> +First if the <b>REP_CONTROL</b> +structure changes, we need to maintain and use an old version of the +structure for talking to older clients and masters. The +implementation of this would be similar to the way we manage for old <b>REP_VOTE_INFO</b> +structures. +Second any new messages need translation table entries added. +Third, if we are assuming global leases then clearly any mixed versions +cannot have leases configured, and leases cannot be used in mixed +version groups. Maintaining two versions of the control structure +is not necessary if we choose a different style of implementation and +don't change the control structure.<br> +<br> +However, then how could an old application both run continuously, +upgrade to the new release and take advantage of leases without taking +down the entire application? I believe it is possible for clients +to be configured for leases but be subject to the master regarding +leases, yet the master code can assume that if it has leases +configured, all client sites do as well. 
In several places above +I suggested that a client could make a choice based on either a new <b>REPCTL_LEASE</b> +flag or simply having +leases turned on locally. If we choose to use the flag, then we +can support leases with mixed versions. The upgraded clients can +configure leases and they simply will not be granted until the old +master is upgraded and sends a PERM message with the flag indicating it +wants a lease grant. The client will not grant a lease until such +time. The clients, while having the leases configured, will not +grant a lease until told to do so and will simply have an expired +lease. Then, when the old master finally upgrades, it too can +configure leases and suddenly all sites are using them. I believe +this should work just fine and I will need to make sure a client's +granting of leases is only in response to the master asking for a +grant. If the master never asks, then the client has them +configured, but doesn't grant them.<br> +<h2>Testing</h2> +Clearly any user-facing API changes will need the equivalent reflection +in the Tcl API for testing, under CONFIG_TEST.<br> +<br> +I am sure the list of tests will grow but off the top of my head:<br> +Basic test: have N sites all configure leases, run some, read on +master, etc.<br> +Refresh test: Perform update on master, sleep until past expiration, +read on master and make sure leases are refreshed/read successful<br> +Error test: Test error conditions (reading on client with leases but no +ignore flag, calling after rep_start, etc)<br> +Read test: Test reading on both client and master both with and without +the IGNORE flag. 
Test that data read with the ignore flag can be +rolled back.<br> +Dupmaster test: Force a DUPMASTER situation and verify that the newer +master cannot get a DUPMASTER error.<br> +Election test: Call election while grant is outstanding and master +exists.<br> +Call election while grant is outstanding and master does not exist.<br> +Call election after expiration on quiescent system with master +existing.<br> +Run with a group where some members have leases configured and others do +not to make sure we get errors instead of dumping core.<br> +<br> +<small><br> +</small> +</body> +</html> diff --git a/rep/rep.src b/rep/rep.src new file mode 100644 index 0000000..0d1664b --- /dev/null +++ b/rep/rep.src @@ -0,0 +1,116 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2001-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +DBPRIVATE +PREFIX __rep + +INCLUDE #include "db_int.h" +INCLUDE #include "dbinc/db_page.h" +INCLUDE #include "dbinc/db_am.h" +INCLUDE #include "dbinc/log.h" +INCLUDE #include "dbinc/mp.h" +INCLUDE #include "dbinc/txn.h" +INCLUDE + +/* + * bulk - message for bulk log records or pages + */ +BEGIN_MSG bulk check_length +ARG len u_int32_t +ARG lsn DB_LSN +ARG bulkdata DBT +END + +/* + * control - replication control message + */ +BEGIN_MSG control check_length +ARG rep_version u_int32_t +ARG log_version u_int32_t +ARG lsn DB_LSN +ARG rectype u_int32_t +ARG gen u_int32_t +ARG msg_sec u_int32_t +ARG msg_nsec u_int32_t +ARG flags u_int32_t +END + +/* + * egen data + */ +BEGIN_MSG egen check_length +ARG egen u_int32_t +END + +/* + * file info + */ +BEGIN_MSG fileinfo alloc check_length version +ARG pgsize u_int32_t +ARG pgno db_pgno_t +ARG max_pgno db_pgno_t +ARG filenum u_int32_t +ARG finfo_flags u_int32_t +ARG type u_int32_t +ARG db_flags u_int32_t +ARG uid DBT +ARG info DBT +END + +/* + * grant info - clients send to masters granting a lease. 
+ */ +BEGIN_MSG grant_info check_length +ARG msg_sec u_int32_t +ARG msg_nsec u_int32_t +END + +/* + * We do not need to do anything with LOG record data. + * It is opaque data to us. + */ + +/* + * log request + */ +BEGIN_MSG logreq check_length +ARG endlsn DB_LSN +END + +/* + * We do not need to do anything with NEWCLIENT/NEWSITE cdata dbt. + * It is user data and the app has to do whatever transformation + * it needs to with its own data. + */ +/* + * newfile version + */ +BEGIN_MSG newfile check_length +ARG version u_int32_t +END + +/* + * update - send update information + */ +BEGIN_MSG update alloc check_length version +ARG first_lsn DB_LSN +ARG first_vers u_int32_t +ARG num_files u_int32_t +END + +/* + * vote info + */ +BEGIN_MSG vote_info check_length +ARG egen u_int32_t +ARG nsites u_int32_t +ARG nvotes u_int32_t +ARG priority u_int32_t +ARG tiebreaker u_int32_t +END + diff --git a/rep/rep_auto.c b/rep/rep_auto.c new file mode 100644 index 0000000..3cb3078 --- /dev/null +++ b/rep/rep_auto.c @@ -0,0 +1,679 @@ +/* Do not edit: automatically built by gen_msg.awk. 
*/ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_am.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" +#include "dbinc/txn.h" + +/* + * PUBLIC: int __rep_bulk_marshal __P((ENV *, __rep_bulk_args *, + * PUBLIC: u_int8_t *, size_t, size_t *)); + */ +int +__rep_bulk_marshal(env, argp, bp, max, lenp) + ENV *env; + __rep_bulk_args *argp; + u_int8_t *bp; + size_t *lenp, max; +{ + u_int8_t *start; + + if (max < __REP_BULK_SIZE + + (size_t)argp->bulkdata.size) + return (ENOMEM); + start = bp; + + DB_HTONL_COPYOUT(env, bp, argp->len); + DB_HTONL_COPYOUT(env, bp, argp->lsn.file); + DB_HTONL_COPYOUT(env, bp, argp->lsn.offset); + DB_HTONL_COPYOUT(env, bp, argp->bulkdata.size); + if (argp->bulkdata.size > 0) { + memcpy(bp, argp->bulkdata.data, argp->bulkdata.size); + bp += argp->bulkdata.size; + } + + *lenp = (size_t)(bp - start); + return (0); +} + +/* + * PUBLIC: int __rep_bulk_unmarshal __P((ENV *, __rep_bulk_args *, + * PUBLIC: u_int8_t *, size_t, u_int8_t **)); + */ +int +__rep_bulk_unmarshal(env, argp, bp, max, nextp) + ENV *env; + __rep_bulk_args *argp; + u_int8_t *bp; + size_t max; + u_int8_t **nextp; +{ + size_t needed; + + needed = __REP_BULK_SIZE; + if (max < needed) + goto too_few; + DB_NTOHL_COPYIN(env, argp->len, bp); + DB_NTOHL_COPYIN(env, argp->lsn.file, bp); + DB_NTOHL_COPYIN(env, argp->lsn.offset, bp); + DB_NTOHL_COPYIN(env, argp->bulkdata.size, bp); + argp->bulkdata.data = bp; + needed += (size_t)argp->bulkdata.size; + if (max < needed) + goto too_few; + bp += argp->bulkdata.size; + + if (nextp != NULL) + *nextp = bp; + return (0); + +too_few: + __db_errx(env, + "Not enough input bytes to fill a __rep_bulk message"); + return (EINVAL); +} + +/* + * PUBLIC: int __rep_control_marshal __P((ENV *, __rep_control_args *, + * PUBLIC: u_int8_t *, size_t, size_t *)); + */ +int +__rep_control_marshal(env, argp, bp, max, lenp) + ENV *env; + __rep_control_args *argp; + u_int8_t *bp; + size_t *lenp, max; +{ + u_int8_t 
*start; + + if (max < __REP_CONTROL_SIZE) + return (ENOMEM); + start = bp; + + DB_HTONL_COPYOUT(env, bp, argp->rep_version); + DB_HTONL_COPYOUT(env, bp, argp->log_version); + DB_HTONL_COPYOUT(env, bp, argp->lsn.file); + DB_HTONL_COPYOUT(env, bp, argp->lsn.offset); + DB_HTONL_COPYOUT(env, bp, argp->rectype); + DB_HTONL_COPYOUT(env, bp, argp->gen); + DB_HTONL_COPYOUT(env, bp, argp->msg_sec); + DB_HTONL_COPYOUT(env, bp, argp->msg_nsec); + DB_HTONL_COPYOUT(env, bp, argp->flags); + + *lenp = (size_t)(bp - start); + return (0); +} + +/* + * PUBLIC: int __rep_control_unmarshal __P((ENV *, + * PUBLIC: __rep_control_args *, u_int8_t *, size_t, u_int8_t **)); + */ +int +__rep_control_unmarshal(env, argp, bp, max, nextp) + ENV *env; + __rep_control_args *argp; + u_int8_t *bp; + size_t max; + u_int8_t **nextp; +{ + if (max < __REP_CONTROL_SIZE) + goto too_few; + DB_NTOHL_COPYIN(env, argp->rep_version, bp); + DB_NTOHL_COPYIN(env, argp->log_version, bp); + DB_NTOHL_COPYIN(env, argp->lsn.file, bp); + DB_NTOHL_COPYIN(env, argp->lsn.offset, bp); + DB_NTOHL_COPYIN(env, argp->rectype, bp); + DB_NTOHL_COPYIN(env, argp->gen, bp); + DB_NTOHL_COPYIN(env, argp->msg_sec, bp); + DB_NTOHL_COPYIN(env, argp->msg_nsec, bp); + DB_NTOHL_COPYIN(env, argp->flags, bp); + + if (nextp != NULL) + *nextp = bp; + return (0); + +too_few: + __db_errx(env, + "Not enough input bytes to fill a __rep_control message"); + return (EINVAL); +} + +/* + * PUBLIC: int __rep_egen_marshal __P((ENV *, __rep_egen_args *, + * PUBLIC: u_int8_t *, size_t, size_t *)); + */ +int +__rep_egen_marshal(env, argp, bp, max, lenp) + ENV *env; + __rep_egen_args *argp; + u_int8_t *bp; + size_t *lenp, max; +{ + u_int8_t *start; + + if (max < __REP_EGEN_SIZE) + return (ENOMEM); + start = bp; + + DB_HTONL_COPYOUT(env, bp, argp->egen); + + *lenp = (size_t)(bp - start); + return (0); +} + +/* + * PUBLIC: int __rep_egen_unmarshal __P((ENV *, __rep_egen_args *, + * PUBLIC: u_int8_t *, size_t, u_int8_t **)); + */ +int 
+__rep_egen_unmarshal(env, argp, bp, max, nextp) + ENV *env; + __rep_egen_args *argp; + u_int8_t *bp; + size_t max; + u_int8_t **nextp; +{ + if (max < __REP_EGEN_SIZE) + goto too_few; + DB_NTOHL_COPYIN(env, argp->egen, bp); + + if (nextp != NULL) + *nextp = bp; + return (0); + +too_few: + __db_errx(env, + "Not enough input bytes to fill a __rep_egen message"); + return (EINVAL); +} + +/* + * PUBLIC: int __rep_fileinfo_marshal __P((ENV *, u_int32_t, + * PUBLIC: __rep_fileinfo_args *, u_int8_t *, size_t, size_t *)); + */ +int +__rep_fileinfo_marshal(env, version, argp, bp, max, lenp) + ENV *env; + u_int32_t version; + __rep_fileinfo_args *argp; + u_int8_t *bp; + size_t *lenp, max; +{ + int copy_only; + u_int8_t *start; + + if (max < __REP_FILEINFO_SIZE + + (size_t)argp->uid.size + + (size_t)argp->info.size) + return (ENOMEM); + start = bp; + + copy_only = 0; + if (version < DB_REPVERSION_47) + copy_only = 1; + if (copy_only) { + memcpy(bp, &argp->pgsize, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_HTONL_COPYOUT(env, bp, argp->pgsize); + if (copy_only) { + memcpy(bp, &argp->pgno, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_HTONL_COPYOUT(env, bp, argp->pgno); + if (copy_only) { + memcpy(bp, &argp->max_pgno, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_HTONL_COPYOUT(env, bp, argp->max_pgno); + if (copy_only) { + memcpy(bp, &argp->filenum, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_HTONL_COPYOUT(env, bp, argp->filenum); + if (copy_only) { + memcpy(bp, &argp->finfo_flags, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_HTONL_COPYOUT(env, bp, argp->finfo_flags); + if (copy_only) { + memcpy(bp, &argp->type, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_HTONL_COPYOUT(env, bp, argp->type); + if (copy_only) { + memcpy(bp, &argp->db_flags, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_HTONL_COPYOUT(env, bp, argp->db_flags); + if (copy_only) { + memcpy(bp, 
&argp->uid.size, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_HTONL_COPYOUT(env, bp, argp->uid.size); + if (argp->uid.size > 0) { + memcpy(bp, argp->uid.data, argp->uid.size); + bp += argp->uid.size; + } + if (copy_only) { + memcpy(bp, &argp->info.size, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_HTONL_COPYOUT(env, bp, argp->info.size); + if (argp->info.size > 0) { + memcpy(bp, argp->info.data, argp->info.size); + bp += argp->info.size; + } + + *lenp = (size_t)(bp - start); + return (0); +} + +/* + * PUBLIC: int __rep_fileinfo_unmarshal __P((ENV *, u_int32_t, + * PUBLIC: __rep_fileinfo_args **, u_int8_t *, size_t, u_int8_t **)); + */ +int +__rep_fileinfo_unmarshal(env, version, argpp, bp, max, nextp) + ENV *env; + u_int32_t version; + __rep_fileinfo_args **argpp; + u_int8_t *bp; + size_t max; + u_int8_t **nextp; +{ + size_t needed; + __rep_fileinfo_args *argp; + int ret; + int copy_only; + + needed = __REP_FILEINFO_SIZE; + if (max < needed) + goto too_few; + if ((ret = __os_malloc(env, sizeof(*argp), &argp)) != 0) + return (ret); + + copy_only = 0; + if (version < DB_REPVERSION_47) + copy_only = 1; + if (copy_only) { + memcpy(&argp->pgsize, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_NTOHL_COPYIN(env, argp->pgsize, bp); + if (copy_only) { + memcpy(&argp->pgno, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_NTOHL_COPYIN(env, argp->pgno, bp); + if (copy_only) { + memcpy(&argp->max_pgno, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_NTOHL_COPYIN(env, argp->max_pgno, bp); + if (copy_only) { + memcpy(&argp->filenum, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_NTOHL_COPYIN(env, argp->filenum, bp); + if (copy_only) { + memcpy(&argp->finfo_flags, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_NTOHL_COPYIN(env, argp->finfo_flags, bp); + if (copy_only) { + memcpy(&argp->type, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + 
DB_NTOHL_COPYIN(env, argp->type, bp); + if (copy_only) { + memcpy(&argp->db_flags, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_NTOHL_COPYIN(env, argp->db_flags, bp); + if (copy_only) { + memcpy(&argp->uid.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_NTOHL_COPYIN(env, argp->uid.size, bp); + argp->uid.data = bp; + needed += (size_t)argp->uid.size; + if (max < needed) + goto too_few; + bp += argp->uid.size; + if (copy_only) { + memcpy(&argp->info.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_NTOHL_COPYIN(env, argp->info.size, bp); + argp->info.data = bp; + needed += (size_t)argp->info.size; + if (max < needed) + goto too_few; + bp += argp->info.size; + + if (nextp != NULL) + *nextp = bp; + *argpp = argp; + return (0); + +too_few: + __db_errx(env, + "Not enough input bytes to fill a __rep_fileinfo message"); + return (EINVAL); +} + +/* + * PUBLIC: int __rep_grant_info_marshal __P((ENV *, + * PUBLIC: __rep_grant_info_args *, u_int8_t *, size_t, size_t *)); + */ +int +__rep_grant_info_marshal(env, argp, bp, max, lenp) + ENV *env; + __rep_grant_info_args *argp; + u_int8_t *bp; + size_t *lenp, max; +{ + u_int8_t *start; + + if (max < __REP_GRANT_INFO_SIZE) + return (ENOMEM); + start = bp; + + DB_HTONL_COPYOUT(env, bp, argp->msg_sec); + DB_HTONL_COPYOUT(env, bp, argp->msg_nsec); + + *lenp = (size_t)(bp - start); + return (0); +} + +/* + * PUBLIC: int __rep_grant_info_unmarshal __P((ENV *, + * PUBLIC: __rep_grant_info_args *, u_int8_t *, size_t, u_int8_t **)); + */ +int +__rep_grant_info_unmarshal(env, argp, bp, max, nextp) + ENV *env; + __rep_grant_info_args *argp; + u_int8_t *bp; + size_t max; + u_int8_t **nextp; +{ + if (max < __REP_GRANT_INFO_SIZE) + goto too_few; + DB_NTOHL_COPYIN(env, argp->msg_sec, bp); + DB_NTOHL_COPYIN(env, argp->msg_nsec, bp); + + if (nextp != NULL) + *nextp = bp; + return (0); + +too_few: + __db_errx(env, + "Not enough input bytes to fill a __rep_grant_info message"); + return 
(EINVAL); +} + +/* + * PUBLIC: int __rep_logreq_marshal __P((ENV *, __rep_logreq_args *, + * PUBLIC: u_int8_t *, size_t, size_t *)); + */ +int +__rep_logreq_marshal(env, argp, bp, max, lenp) + ENV *env; + __rep_logreq_args *argp; + u_int8_t *bp; + size_t *lenp, max; +{ + u_int8_t *start; + + if (max < __REP_LOGREQ_SIZE) + return (ENOMEM); + start = bp; + + DB_HTONL_COPYOUT(env, bp, argp->endlsn.file); + DB_HTONL_COPYOUT(env, bp, argp->endlsn.offset); + + *lenp = (size_t)(bp - start); + return (0); +} + +/* + * PUBLIC: int __rep_logreq_unmarshal __P((ENV *, __rep_logreq_args *, + * PUBLIC: u_int8_t *, size_t, u_int8_t **)); + */ +int +__rep_logreq_unmarshal(env, argp, bp, max, nextp) + ENV *env; + __rep_logreq_args *argp; + u_int8_t *bp; + size_t max; + u_int8_t **nextp; +{ + if (max < __REP_LOGREQ_SIZE) + goto too_few; + DB_NTOHL_COPYIN(env, argp->endlsn.file, bp); + DB_NTOHL_COPYIN(env, argp->endlsn.offset, bp); + + if (nextp != NULL) + *nextp = bp; + return (0); + +too_few: + __db_errx(env, + "Not enough input bytes to fill a __rep_logreq message"); + return (EINVAL); +} + +/* + * PUBLIC: int __rep_newfile_marshal __P((ENV *, __rep_newfile_args *, + * PUBLIC: u_int8_t *, size_t, size_t *)); + */ +int +__rep_newfile_marshal(env, argp, bp, max, lenp) + ENV *env; + __rep_newfile_args *argp; + u_int8_t *bp; + size_t *lenp, max; +{ + u_int8_t *start; + + if (max < __REP_NEWFILE_SIZE) + return (ENOMEM); + start = bp; + + DB_HTONL_COPYOUT(env, bp, argp->version); + + *lenp = (size_t)(bp - start); + return (0); +} + +/* + * PUBLIC: int __rep_newfile_unmarshal __P((ENV *, + * PUBLIC: __rep_newfile_args *, u_int8_t *, size_t, u_int8_t **)); + */ +int +__rep_newfile_unmarshal(env, argp, bp, max, nextp) + ENV *env; + __rep_newfile_args *argp; + u_int8_t *bp; + size_t max; + u_int8_t **nextp; +{ + if (max < __REP_NEWFILE_SIZE) + goto too_few; + DB_NTOHL_COPYIN(env, argp->version, bp); + + if (nextp != NULL) + *nextp = bp; + return (0); + +too_few: + __db_errx(env, + "Not 
enough input bytes to fill a __rep_newfile message"); + return (EINVAL); +} + +/* + * PUBLIC: int __rep_update_marshal __P((ENV *, u_int32_t, + * PUBLIC: __rep_update_args *, u_int8_t *, size_t, size_t *)); + */ +int +__rep_update_marshal(env, version, argp, bp, max, lenp) + ENV *env; + u_int32_t version; + __rep_update_args *argp; + u_int8_t *bp; + size_t *lenp, max; +{ + int copy_only; + u_int8_t *start; + + if (max < __REP_UPDATE_SIZE) + return (ENOMEM); + start = bp; + + copy_only = 0; + if (version < DB_REPVERSION_47) + copy_only = 1; + if (copy_only) { + memcpy(bp, &argp->first_lsn.file, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + memcpy(bp, &argp->first_lsn.offset, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + DB_HTONL_COPYOUT(env, bp, argp->first_lsn.file); + DB_HTONL_COPYOUT(env, bp, argp->first_lsn.offset); + } + if (copy_only) { + memcpy(bp, &argp->first_vers, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_HTONL_COPYOUT(env, bp, argp->first_vers); + if (copy_only) { + memcpy(bp, &argp->num_files, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_HTONL_COPYOUT(env, bp, argp->num_files); + + *lenp = (size_t)(bp - start); + return (0); +} + +/* + * PUBLIC: int __rep_update_unmarshal __P((ENV *, u_int32_t, + * PUBLIC: __rep_update_args **, u_int8_t *, size_t, u_int8_t **)); + */ +int +__rep_update_unmarshal(env, version, argpp, bp, max, nextp) + ENV *env; + u_int32_t version; + __rep_update_args **argpp; + u_int8_t *bp; + size_t max; + u_int8_t **nextp; +{ + __rep_update_args *argp; + int ret; + int copy_only; + + if (max < __REP_UPDATE_SIZE) + goto too_few; + if ((ret = __os_malloc(env, sizeof(*argp), &argp)) != 0) + return (ret); + + copy_only = 0; + if (version < DB_REPVERSION_47) + copy_only = 1; + if (copy_only) { + memcpy(&argp->first_lsn.file, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + memcpy(&argp->first_lsn.offset, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + 
DB_NTOHL_COPYIN(env, argp->first_lsn.file, bp); + DB_NTOHL_COPYIN(env, argp->first_lsn.offset, bp); + } + if (copy_only) { + memcpy(&argp->first_vers, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_NTOHL_COPYIN(env, argp->first_vers, bp); + if (copy_only) { + memcpy(&argp->num_files, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else + DB_NTOHL_COPYIN(env, argp->num_files, bp); + + if (nextp != NULL) + *nextp = bp; + *argpp = argp; + return (0); + +too_few: + __db_errx(env, + "Not enough input bytes to fill a __rep_update message"); + return (EINVAL); +} + +/* + * PUBLIC: int __rep_vote_info_marshal __P((ENV *, + * PUBLIC: __rep_vote_info_args *, u_int8_t *, size_t, size_t *)); + */ +int +__rep_vote_info_marshal(env, argp, bp, max, lenp) + ENV *env; + __rep_vote_info_args *argp; + u_int8_t *bp; + size_t *lenp, max; +{ + u_int8_t *start; + + if (max < __REP_VOTE_INFO_SIZE) + return (ENOMEM); + start = bp; + + DB_HTONL_COPYOUT(env, bp, argp->egen); + DB_HTONL_COPYOUT(env, bp, argp->nsites); + DB_HTONL_COPYOUT(env, bp, argp->nvotes); + DB_HTONL_COPYOUT(env, bp, argp->priority); + DB_HTONL_COPYOUT(env, bp, argp->tiebreaker); + + *lenp = (size_t)(bp - start); + return (0); +} + +/* + * PUBLIC: int __rep_vote_info_unmarshal __P((ENV *, + * PUBLIC: __rep_vote_info_args *, u_int8_t *, size_t, u_int8_t **)); + */ +int +__rep_vote_info_unmarshal(env, argp, bp, max, nextp) + ENV *env; + __rep_vote_info_args *argp; + u_int8_t *bp; + size_t max; + u_int8_t **nextp; +{ + if (max < __REP_VOTE_INFO_SIZE) + goto too_few; + DB_NTOHL_COPYIN(env, argp->egen, bp); + DB_NTOHL_COPYIN(env, argp->nsites, bp); + DB_NTOHL_COPYIN(env, argp->nvotes, bp); + DB_NTOHL_COPYIN(env, argp->priority, bp); + DB_NTOHL_COPYIN(env, argp->tiebreaker, bp); + + if (nextp != NULL) + *nextp = bp; + return (0); + +too_few: + __db_errx(env, + "Not enough input bytes to fill a __rep_vote_info message"); + return (EINVAL); +} + diff --git a/rep/rep_backup.c b/rep/rep_backup.c new file 
mode 100644 index 0000000..e3ab31a --- /dev/null +++ b/rep/rep_backup.c @@ -0,0 +1,3379 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2004-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_am.h" +#include "dbinc/fop.h" +#include "dbinc/lock.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" +#include "dbinc/qam.h" +#include "dbinc/txn.h" + +/* + * Context information needed for buffer management during the building of a + * list of database files present in the environment. When fully built, the + * buffer is in the form of an UPDATE message: a (marshaled) update_args, + * followed by some number of (marshaled) fileinfo_args. + * + * Note that the fileinfo for the first file in the list always appears at + * (constant) offset __REP_UPDATE_SIZE in the buffer. + */ +typedef struct { + u_int8_t *buf; /* Buffer base address. */ + size_t size; /* Total allocated buffer size. */ + u_int8_t *fillptr; /* Pointer to first unused space. */ + u_int32_t count; /* Number of entries currently in list. 
*/ +} FILE_LIST_CTX; +#define FIRST_FILE_PTR(buf) ((buf) + __REP_UPDATE_SIZE) + +static int __rep_check_uid __P((ENV *, FILE_LIST_CTX *, u_int32_t, + u_int8_t *)); +static int __rep_clean_interrupted __P((ENV *)); +static int __rep_cleanup_nimdbs __P((ENV *)); +static int __rep_filedone __P((ENV *, DB_THREAD_INFO *ip, int, + REP *, __rep_fileinfo_args *, u_int32_t)); +static int __rep_find_dbs __P((ENV *, u_int32_t, FILE_LIST_CTX *)); +static int __rep_get_fileinfo __P((ENV *, const char *, + const char *, __rep_fileinfo_args *, u_int8_t *)); +static int __rep_get_file_list __P((ENV *, + DB_FH *, u_int32_t, u_int32_t *, DBT *)); +static int __rep_log_setup __P((ENV *, + REP *, u_int32_t, u_int32_t, DB_LSN *)); +static int __rep_mpf_open __P((ENV *, DB_MPOOLFILE **, + __rep_fileinfo_args *, u_int32_t)); +static int __rep_nextfile __P((ENV *, int, REP *)); +static int __rep_page_gap __P((ENV *, + REP *, __rep_fileinfo_args *, u_int32_t)); +static int __rep_page_sendpages __P((ENV *, DB_THREAD_INFO *, int, + __rep_control_args *, __rep_fileinfo_args *, DB_MPOOLFILE *, DB *)); +static int __rep_queue_filedone __P((ENV *, + DB_THREAD_INFO *, REP *, __rep_fileinfo_args *)); +static int __rep_remove_all __P((ENV *, u_int32_t, DBT *)); +static int __rep_remove_by_list __P((ENV *, u_int32_t, + u_int8_t *, u_int32_t, u_int32_t)); +static int __rep_remove_by_prefix __P((ENV *, const char *, const char *, + size_t, APPNAME)); +static int __rep_remove_file __P((ENV *, u_int8_t *, const char *, + u_int32_t, u_int32_t)); +static int __rep_remove_logs __P((ENV *)); +static int __rep_remove_nimdbs __P((ENV *)); +static int __rep_rollback __P((ENV *, DB_LSN *)); +static int __rep_unlink_by_list __P((ENV *, u_int32_t, + u_int8_t *, u_int32_t, u_int32_t)); +static int __rep_walk_dir __P((ENV *, const char *, u_int32_t, FILE_LIST_CTX*)); +static int __rep_write_page __P((ENV *, + DB_THREAD_INFO *, REP *, __rep_fileinfo_args *)); + +/* + * __rep_update_req - + * Process an update_req 
and send the file information to the client. + * + * PUBLIC: int __rep_update_req __P((ENV *, __rep_control_args *, int)); + */ +int +__rep_update_req(env, rp, eid) + ENV *env; + __rep_control_args *rp; + int eid; +{ + DBT updbt, vdbt; + DB_LOG *dblp; + DB_LOGC *logc; + DB_LSN lsn; + __rep_update_args u_args; + FILE_LIST_CTX context; + size_t updlen; + u_int32_t flag, version; + int ret, t_ret; + + /* + * Start by allocating 1Meg, which ought to be plenty enough to describe + * all databases in the environment. (If it's not, __rep_walk_dir can + * grow the size.) + * + * The data we send looks like this: + * __rep_update_args + * __rep_fileinfo_args + * __rep_fileinfo_args + * ... + */ + dblp = env->lg_handle; + logc = NULL; + if ((ret = __os_calloc(env, 1, MEGABYTE, &context.buf)) != 0) + return (ret); + context.size = MEGABYTE; + context.count = 0; + + /* Reserve space for the update_args, and fill in file info. */ + context.fillptr = FIRST_FILE_PTR(context.buf); + if ((ret = __rep_find_dbs(env, rp->rep_version, &context)) != 0) + goto err; + + /* + * Now get our first LSN. We send the lsn of the first + * non-archivable log file. + */ + flag = DB_SET; + if ((ret = __log_get_stable_lsn(env, &lsn)) != 0) { + if (ret != DB_NOTFOUND) + goto err; + /* + * If ret is DB_NOTFOUND then there is no checkpoint + * in this log, that is okay, just start at the beginning. + */ + ret = 0; + flag = DB_FIRST; + } + + /* + * Now get the version number of the log file of that LSN. + */ + if ((ret = __log_cursor(env, &logc)) != 0) + goto err; + + memset(&vdbt, 0, sizeof(vdbt)); + /* + * Set our log cursor on the LSN we are sending. Or + * to the first LSN if we have no stable LSN. + */ + if ((ret = __logc_get(logc, &lsn, &vdbt, flag)) != 0) { + /* + * We could be racing a fresh master starting up. If we + * have no log records, assume an initial LSN and current + * log version. 
+ */ + if (ret != DB_NOTFOUND) + goto err; + INIT_LSN(lsn); + version = DB_LOGVERSION; + } else { + if ((ret = __logc_version(logc, &version)) != 0) + goto err; + } + /* + * Package up the update information. + */ + u_args.first_lsn = lsn; + u_args.first_vers = version; + u_args.num_files = context.count; + if ((ret = __rep_update_marshal(env, rp->rep_version, + &u_args, context.buf, __REP_UPDATE_SIZE, &updlen)) != 0) + goto err; + DB_ASSERT(env, updlen == __REP_UPDATE_SIZE); + + /* + * We have all the file information now. Send it to the client. + */ + DB_INIT_DBT(updbt, context.buf, context.fillptr - context.buf); + + LOG_SYSTEM_LOCK(env); + lsn = ((LOG *)dblp->reginfo.primary)->lsn; + LOG_SYSTEM_UNLOCK(env); + (void)__rep_send_message( + env, eid, REP_UPDATE, &lsn, &updbt, 0, 0); + +err: __os_free(env, context.buf); + /* + * If we got here because the lower code could not get the page + * lock then we skipped sending the message, but we don't want + * to return an error to the user. + */ + if (ret == DB_REP_PAGELOCKED) + ret = 0; + if (logc != NULL && (t_ret = __logc_close(logc)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} + +/* + * __rep_find_dbs - + * Walk through all the named files/databases including those in the + * environment or data_dirs and those that in named and in-memory. We + * need to open them, gather the necessary information and then close + * them. + * + * May be called either while holding REP_SYSTEM_LOCK or without. + */ +static int +__rep_find_dbs(env, version, context) + ENV *env; + u_int32_t version; + FILE_LIST_CTX *context; +{ + DB_ENV *dbenv; + int ret; + char **ddir, *real_dir; + + dbenv = env->dbenv; + ret = 0; + real_dir = NULL; + + if (dbenv->db_data_dir == NULL) { + /* + * If we don't have a data dir, we have just the + * env home dir. 
+ */ + ret = __rep_walk_dir(env, env->db_home, version, context); + } else { + for (ddir = dbenv->db_data_dir; *ddir != NULL; ++ddir) { + if ((ret = __db_appname(env, + DB_APP_NONE, *ddir, NULL, &real_dir)) != 0) + break; + if ((ret = __rep_walk_dir(env, + real_dir, version, context)) != 0) + break; + __os_free(env, real_dir); + real_dir = NULL; + } + } + + /* Now, collect any in-memory named databases. */ + if (ret == 0) + ret = __rep_walk_dir(env, NULL, version, context); + + if (real_dir != NULL) + __os_free(env, real_dir); + return (ret); +} + +/* + * __rep_walk_dir -- + * + * This is the routine that walks a directory and fills in the structures + * that we use to generate messages to the client telling it what + * files are available. If the directory name is NULL, then we should + * walk the list of in-memory named files. + */ +static int +__rep_walk_dir(env, dir, version, context) + ENV *env; + const char *dir; + u_int32_t version; + FILE_LIST_CTX *context; +{ + __rep_fileinfo_args tmpfp; + size_t avail, len; + int cnt, first_file, i, ret; + u_int8_t uid[DB_FILE_ID_LEN]; + char *file, **names, *subdb; + + if (dir == NULL) { + RPRINT(env, DB_VERB_REP_SYNC, (env, + "Walk_dir: Getting info for in-memory named files")); + if ((ret = __memp_inmemlist(env, &names, &cnt)) != 0) + return (ret); + } else { + RPRINT(env, DB_VERB_REP_SYNC, (env, + "Walk_dir: Getting info for dir: %s", dir)); + if ((ret = __os_dirlist(env, dir, 0, &names, &cnt)) != 0) + return (ret); + } + RPRINT(env, DB_VERB_REP_SYNC, (env, "Walk_dir: Dir %s has %d files", + (dir == NULL) ? "INMEM" : dir, cnt)); + first_file = 1; + for (i = 0; i < cnt; i++) { + RPRINT(env, DB_VERB_REP_SYNC, (env, + "Walk_dir: File %d name: %s", i, names[i])); + /* + * Skip DB-owned files: __db*, DB_CONFIG, log* + */ + if (strncmp(names[i], + DB_REGION_PREFIX, sizeof(DB_REGION_PREFIX) - 1) == 0) { + /* Process partition files: "__dbp.*". 
*/ + if (names[i][sizeof(DB_REGION_PREFIX) - 1] != 'p') + continue; + } + if (strncmp(names[i], "DB_CONFIG", 9) == 0) + continue; + if (strncmp(names[i], "log.", 4) == 0) + continue; + + /* We found a file to process. */ + if (dir == NULL) { + file = NULL; + subdb = names[i]; + } else { + file = names[i]; + subdb = NULL; + } + if ((ret = __rep_get_fileinfo(env, + file, subdb, &tmpfp, uid)) != 0) { + /* + * If we find a file that isn't a database, skip it. + */ + RPRINT(env, DB_VERB_REP_SYNC, (env, + "Walk_dir: File %d %s: returned error %s", + i, names[i], db_strerror(ret))); + if (ret == DB_REP_PAGELOCKED) + goto err; + ret = 0; + continue; + } + RPRINT(env, DB_VERB_REP_SYNC, (env, + "Walk_dir: File %s at 0x%lx: pgsize %lu, max_pgno %lu", + names[i], P_TO_ULONG(context->fillptr), + (u_long)tmpfp.pgsize, (u_long)tmpfp.max_pgno)); + + /* + * On the first time through the loop, check to see if the file + * we're about to add is already on the list. If it is, it must + * have been added in a previous call, and that means the + * directory we're currently scanning has already been scanned + * before. (This can happen if the user called + * env->set_data_dir() more than once for the same directory.) + * If that's the case, we're done: not only is it a waste of + * time to scan the same directory again, but doing so would + * result in the same files appearing in the list more than + * once. + */ + if (first_file && dir != NULL && + (ret = __rep_check_uid(env, context, version, uid)) != 0) { + if (ret == DB_KEYEXIST) + ret = 0; + goto err; + } + first_file = 0; + + /* + * Finally we know that this file is a suitable database file + * that we haven't yet included on our list. 
+ */ + tmpfp.filenum = context->count++; + + DB_SET_DBT(tmpfp.info, names[i], strlen(names[i]) + 1); + DB_SET_DBT(tmpfp.uid, uid, DB_FILE_ID_LEN); +retry: avail = (size_t)(&context->buf[context->size] - + context->fillptr); + ret = __rep_fileinfo_marshal(env, version, + &tmpfp, context->fillptr, avail, &len); + if (ret == ENOMEM) { + /* + * Here, 'len' is the total space in use in the buffer. + */ + len = (size_t)(context->fillptr - context->buf); + context->size *= 2; + + if ((ret = __os_realloc(env, + context->size, &context->buf)) != 0) + goto err; + context->fillptr = context->buf + len; + + /* + * Now that we've reallocated the space, try to + * store it again. + */ + goto retry; + } + /* + * Here, 'len' (still) holds the length of the marshaled + * information about the current file (as filled in by the last + * call to __rep_fileinfo_marshal()). + */ + context->fillptr += len; + } +err: + __os_dirfree(env, names, cnt); + return (ret); +} + +/* + * Check whether the given uid is already present in the list of files being + * built in the context buffer. A return of DB_KEYEXIST means it is. 
+ */ +static int +__rep_check_uid(env, context, version, uid) + ENV *env; + FILE_LIST_CTX *context; + u_int32_t version; + u_int8_t *uid; +{ + __rep_fileinfo_args *rfp; + size_t max; + u_int8_t *fp; + u_int32_t i; + int ret; + + ret = 0; + rfp = NULL; + fp = FIRST_FILE_PTR(context->buf); + for (i = 0; i < context->count; i++) { + max = (size_t)(context->fillptr - fp); + if ((ret = __rep_fileinfo_unmarshal(env, version, + &rfp, fp, max, &fp)) != 0) { + __db_errx(env, "rep_check_uid: Could not malloc"); + goto err; + } + if (memcmp(rfp->uid.data, uid, DB_FILE_ID_LEN) == 0) { + RPRINT(env, DB_VERB_REP_SYNC, (env, + "Check_uid: Found matching file.")); + ret = DB_KEYEXIST; + goto err; + } + __os_free(env, rfp); + rfp = NULL; + } +err: + if (rfp != NULL) + __os_free(env, rfp); + return (ret); + +} + +static int +__rep_get_fileinfo(env, file, subdb, rfp, uid) + ENV *env; + const char *file, *subdb; + __rep_fileinfo_args *rfp; + u_int8_t *uid; +{ + DB *dbp; + DBC *dbc; + DBMETA *dbmeta; + DB_LOCK lk; + DB_MPOOLFILE *mpf; + DB_THREAD_INFO *ip; + DB_TXN *txn; + PAGE *pagep; + int lorder, ret, retry, t_ret; + + dbp = NULL; + dbc = NULL; + pagep = NULL; + mpf = NULL; + txn = NULL; + LOCK_INIT(lk); + + ENV_GET_THREAD_INFO(env, ip); + + /* + * If the meta page is locked, try a few times. If we cannot + * get it, return. + */ + for (retry = 0; retry < REP_META_RETRY; retry++) { + if ((ret = __db_create_internal(&dbp, env, 0)) != 0) + goto err; + if ((ret = __txn_begin(env, NULL, NULL, &txn, + DB_TXN_NOWAIT)) != 0) + goto err; + if ((ret = __db_open(dbp, ip, txn, file, subdb, DB_UNKNOWN, + DB_RDONLY | (F_ISSET(env, ENV_THREAD) ? 
DB_THREAD : 0), + 0, PGNO_BASE_MD)) != 0) { + RPRINT(env, DB_VERB_REP_SYNC, + (env, "get_fileinfo: open error %d", ret)); + (void)__txn_abort(txn); + txn = NULL; + (void)__db_close(dbp, NULL, DB_NOSYNC); + dbp = NULL; + if (ret == DB_LOCK_DEADLOCK || + ret == DB_LOCK_NOTGRANTED) { + __os_yield(env, 1, 0); + RPRINT(env, DB_VERB_REP_SYNC, + (env, "get_fileinfo: Try %d could not get meta lock for open", retry)); + continue; + } else + goto err; + } else + break; + } + if (retry == REP_META_RETRY) { + ret = DB_REP_PAGELOCKED; + goto err; + } + + if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0) + goto err; + /* + * If the meta page is locked, try a few times. If we cannot + * get it, return. + */ + for (retry = 0; retry < REP_META_RETRY; retry++) { + if ((ret = __db_lget(dbc, 0, dbp->meta_pgno, + DB_LOCK_READ, DB_LOCK_NOWAIT, &lk)) != 0) { + if (ret == DB_LOCK_DEADLOCK || + ret == DB_LOCK_NOTGRANTED) { + RPRINT(env, DB_VERB_REP_SYNC, + (env, "get_fileinfo: Try %d could not get meta lock", retry)); + __os_yield(env, 1, 0); + continue; + } else + goto err; + } else + break; + } + if (retry == REP_META_RETRY) { + ret = DB_REP_PAGELOCKED; + goto err; + } + if ((ret = __memp_fget(dbp->mpf, &dbp->meta_pgno, ip, dbc->txn, + 0, &pagep)) != 0) + goto err; + /* + * We have the meta page. Set up our information. + */ + dbmeta = (DBMETA *)pagep; + rfp->pgno = 0; + /* + * Queue is a special-case. We need to set max_pgno to 0 so that + * the client can compute the pages from the meta-data. + */ + if (dbp->type == DB_QUEUE) + rfp->max_pgno = 0; + else + rfp->max_pgno = dbmeta->last_pgno; + rfp->pgsize = dbp->pgsize; + memcpy(uid, dbp->fileid, DB_FILE_ID_LEN); + rfp->type = (u_int32_t)dbp->type; + rfp->db_flags = dbp->flags; + rfp->finfo_flags = 0; + /* + * Send the lorder of this database. 
+ */ + (void)__db_get_lorder(dbp, &lorder); + if (lorder == 1234) + FLD_SET(rfp->finfo_flags, REPINFO_DB_LITTLEENDIAN); + else + FLD_CLR(rfp->finfo_flags, REPINFO_DB_LITTLEENDIAN); + + ret = __memp_fput(dbp->mpf, ip, pagep, dbc->priority); + pagep = NULL; + if ((t_ret = __LPUT(dbc, lk)) != 0 && ret == 0) + ret = t_ret; + if (ret != 0) + goto err; +err: + if ((t_ret = __LPUT(dbc, lk)) != 0 && ret == 0) + ret = t_ret; + if (pagep != NULL && (t_ret = + __memp_fput(mpf, ip, pagep, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0) + ret = t_ret; + if (txn != NULL) + (void)__txn_abort(txn); + if (dbp != NULL && (t_ret = __db_close(dbp, NULL, 0)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} + +/* + * __rep_page_req + * Process a page_req and send the page information to the client. + * + * PUBLIC: int __rep_page_req __P((ENV *, + * PUBLIC: DB_THREAD_INFO *, int, __rep_control_args *, DBT *)); + */ +int +__rep_page_req(env, ip, eid, rp, rec) + ENV *env; + DB_THREAD_INFO *ip; + int eid; + __rep_control_args *rp; + DBT *rec; +{ + __rep_fileinfo_args *msgfp; + DB_MPOOLFILE *mpf; + DB_REP *db_rep; + REP *rep; + int ret, t_ret; + u_int8_t *next; + + db_rep = env->rep_handle; + rep = db_rep->region; + + if ((ret = __rep_fileinfo_unmarshal(env, rp->rep_version, + &msgfp, rec->data, rec->size, &next)) != 0) + return (ret); + + RPRINT(env, DB_VERB_REP_SYNC, + (env, "page_req: file %d page %lu to %lu", + msgfp->filenum, (u_long)msgfp->pgno, (u_long)msgfp->max_pgno)); + + /* + * We need to open the file and then send its pages. + * If we cannot open the file, we send REP_FILE_FAIL. 
+ */ + RPRINT(env, DB_VERB_REP_SYNC, + (env, "page_req: Open %d via mpf_open", msgfp->filenum)); + if ((ret = __rep_mpf_open(env, &mpf, msgfp, 0)) != 0) { + RPRINT(env, DB_VERB_REP_SYNC, + (env, "page_req: Open %d failed", msgfp->filenum)); + if (F_ISSET(rep, REP_F_MASTER)) + (void)__rep_send_message(env, eid, REP_FILE_FAIL, + NULL, rec, 0, 0); + else + ret = DB_NOTFOUND; + goto err; + } + + ret = __rep_page_sendpages(env, ip, eid, rp, msgfp, mpf, NULL); + t_ret = __memp_fclose(mpf, 0); + if (ret == 0 && t_ret != 0) + ret = t_ret; +err: + __os_free(env, msgfp); + return (ret); +} + +static int +__rep_page_sendpages(env, ip, eid, rp, msgfp, mpf, dbp) + ENV *env; + DB_THREAD_INFO *ip; + int eid; + __rep_control_args *rp; + __rep_fileinfo_args *msgfp; + DB_MPOOLFILE *mpf; + DB *dbp; +{ + DB *qdbp; + DBC *qdbc; + DBT lockdbt, msgdbt; + DB_LOCK lock; + DB_LOCKER *locker; + DB_LOCK_ILOCK lock_obj; + DB_LOG *dblp; + DB_LSN lsn; + DB_REP *db_rep; + PAGE *pagep; + REP *rep; + REP_BULK bulk; + REP_THROTTLE repth; + db_pgno_t p; + uintptr_t bulkoff; + size_t len, msgsz; + u_int32_t bulkflags, use_bulk; + int opened, ret, t_ret; + u_int8_t *buf; + + db_rep = env->rep_handle; + rep = db_rep->region; + locker = NULL; + opened = 0; + t_ret = 0; + qdbp = NULL; + qdbc = NULL; + buf = NULL; + bulk.addr = NULL; + use_bulk = FLD_ISSET(rep->config, REP_C_BULK); + if (msgfp->type == (u_int32_t)DB_QUEUE) { + if (dbp == NULL) { + if ((ret = __db_create_internal(&qdbp, env, 0)) != 0) + goto err; + /* + * We need to check whether this is in-memory so that + * we pass the name correctly as either the file or + * the database name. + */ + if ((ret = __db_open(qdbp, ip, NULL, + FLD_ISSET(msgfp->db_flags, DB_AM_INMEM) ? + NULL : msgfp->info.data, + FLD_ISSET(msgfp->db_flags, DB_AM_INMEM) ? + msgfp->info.data : NULL, + DB_UNKNOWN, + DB_RDONLY | (F_ISSET(env, ENV_THREAD) ? 
DB_THREAD : 0), + 0, PGNO_BASE_MD)) != 0) + goto err; + opened = 1; + } else + qdbp = dbp; + if ((ret = __db_cursor(qdbp, ip, NULL, &qdbc, 0)) != 0) + goto err; + } + msgsz = __REP_FILEINFO_SIZE + DB_FILE_ID_LEN + msgfp->pgsize; + if ((ret = __os_calloc(env, 1, msgsz, &buf)) != 0) + goto err; + memset(&msgdbt, 0, sizeof(msgdbt)); + RPRINT(env, DB_VERB_REP_SYNC, + (env, "sendpages: file %d page %lu to %lu", + msgfp->filenum, (u_long)msgfp->pgno, (u_long)msgfp->max_pgno)); + memset(&repth, 0, sizeof(repth)); + /* + * If we're doing bulk transfer, allocate a bulk buffer to put our + * pages in. We still need to initialize the throttle info + * because if we encounter a page larger than our entire bulk + * buffer, we need to send it as a singleton. + * + * Use a local var so that we don't need to worry if someone else + * turns on/off bulk in the middle of our call here. + */ + if (use_bulk && (ret = __rep_bulk_alloc(env, &bulk, eid, + &bulkoff, &bulkflags, REP_BULK_PAGE)) != 0) + goto err; + REP_SYSTEM_LOCK(env); + repth.gbytes = rep->gbytes; + repth.bytes = rep->bytes; + repth.type = REP_PAGE; + repth.data_dbt = &msgdbt; + REP_SYSTEM_UNLOCK(env); + + /* + * Set up locking. + */ + LOCK_INIT(lock); + memset(&lock_obj, 0, sizeof(lock_obj)); + if ((ret = __lock_id(env, NULL, &locker)) != 0) + goto err; + memcpy(lock_obj.fileid, mpf->fileid, DB_FILE_ID_LEN); + lock_obj.type = DB_PAGE_LOCK; + + memset(&lockdbt, 0, sizeof(lockdbt)); + lockdbt.data = &lock_obj; + lockdbt.size = sizeof(lock_obj); + + for (p = msgfp->pgno; p <= msgfp->max_pgno; p++) { + /* + * We're not waiting for the lock, if we cannot get + * the lock for this page, skip it. The gap + * code will rerequest it. + */ + lock_obj.pgno = p; + if ((ret = __lock_get(env, locker, DB_LOCK_NOWAIT, &lockdbt, + DB_LOCK_READ, &lock)) != 0) { + /* + * Continue if we couldn't get the lock. + */ + if (ret == DB_LOCK_DEADLOCK || + ret == DB_LOCK_NOTGRANTED) { + ret = 0; + continue; + } + /* + * Otherwise we have an error. 
+ */ + goto err; + } + if (msgfp->type == (u_int32_t)DB_QUEUE && p != 0) +#ifdef HAVE_QUEUE + ret = __qam_fget(qdbc, &p, DB_MPOOL_CREATE, &pagep); +#else + ret = DB_PAGE_NOTFOUND; +#endif + else + ret = __memp_fget(mpf, &p, ip, NULL, + DB_MPOOL_CREATE, &pagep); + msgfp->pgno = p; + if (ret == DB_PAGE_NOTFOUND) { + ZERO_LSN(lsn); + if (F_ISSET(rep, REP_F_MASTER)) { + ret = 0; + RPRINT(env, DB_VERB_REP_SYNC, (env, + "sendpages: PAGE_FAIL on page %lu", + (u_long)p)); + (void)__rep_send_message(env, eid, + REP_PAGE_FAIL, &lsn, &msgdbt, 0, 0); + } else + ret = DB_NOTFOUND; + goto lockerr; + } else if (ret != 0) + goto lockerr; + else + DB_SET_DBT(msgfp->info, pagep, msgfp->pgsize); + len = 0; + /* + * Send along an indication of the byte order of this mpool + * page. Since mpool always keeps pages in the native byte + * order of the local environment, this is simply my + * environment's byte order. + * + * Since pages can be served from a variety of sites when using + * client-to-client synchronization, the receiving client needs + * to know the byte order of each page independently. + */ + if (F_ISSET(env, ENV_LITTLEENDIAN)) + FLD_SET(msgfp->finfo_flags, REPINFO_PG_LITTLEENDIAN); + else + FLD_CLR(msgfp->finfo_flags, REPINFO_PG_LITTLEENDIAN); + RPRINT(env, DB_VERB_REP_SYNC, (env, + "sendpages: %lu, page lsn [%lu][%lu]", (u_long)p, + (u_long)pagep->lsn.file, (u_long)pagep->lsn.offset)); + ret = __rep_fileinfo_marshal(env, rp->rep_version, + msgfp, buf, msgsz, &len); + if (msgfp->type != (u_int32_t)DB_QUEUE || p == 0) + t_ret = __memp_fput(mpf, + ip, pagep, DB_PRIORITY_UNCHANGED); +#ifdef HAVE_QUEUE + else + /* + * We don't need an #else for HAVE_QUEUE here because if + * we're not compiled with queue, then we're guaranteed + * to have set REP_PAGE_FAIL above. 
+ */ + t_ret = __qam_fput(qdbc, p, pagep, qdbp->priority); +#endif + if (t_ret != 0 && ret == 0) + ret = t_ret; + if ((t_ret = __ENV_LPUT(env, lock)) != 0 && ret == 0) + ret = t_ret; + if (ret != 0) + goto err; + + DB_ASSERT(env, len <= msgsz); + DB_SET_DBT(msgdbt, buf, len); + + dblp = env->lg_handle; + LOG_SYSTEM_LOCK(env); + repth.lsn = ((LOG *)dblp->reginfo.primary)->lsn; + LOG_SYSTEM_UNLOCK(env); + /* + * If we are configured for bulk, try to send this as a bulk + * request. If not configured, or it is too big for bulk + * then just send normally. + */ + if (use_bulk) + ret = __rep_bulk_message(env, &bulk, &repth, + &repth.lsn, &msgdbt, 0); + if (!use_bulk || ret == DB_REP_BULKOVF) + ret = __rep_send_throttle(env, eid, &repth, 0, 0); + RPRINT(env, DB_VERB_REP_SYNC, (env, + "sendpages: %lu, lsn [%lu][%lu]", (u_long)p, + (u_long)repth.lsn.file, (u_long)repth.lsn.offset)); + /* + * If we have REP_PAGE_MORE we need to break this loop. + * Otherwise, with REP_PAGE, we keep going. + */ + if (repth.type == REP_PAGE_MORE || ret != 0) { + /* Ignore send failure, except to break the loop. */ + if (ret == DB_REP_UNAVAIL) + ret = 0; + break; + } + } + + if (0) { +lockerr: if ((t_ret = __ENV_LPUT(env, lock)) != 0 && ret == 0) + ret = t_ret; + } +err: + /* + * We're done, force out whatever remains in the bulk buffer and + * free it. + */ + if (use_bulk && bulk.addr != NULL && + (t_ret = __rep_bulk_free(env, &bulk, 0)) != 0 && ret == 0 && + t_ret != DB_REP_UNAVAIL) + ret = t_ret; + if (qdbc != NULL && (t_ret = __dbc_close(qdbc)) != 0 && ret == 0) + ret = t_ret; + if (opened && (t_ret = __db_close(qdbp, NULL, DB_NOSYNC)) != 0 && + ret == 0) + ret = t_ret; + if (buf != NULL) + __os_free(env, buf); + if (locker != NULL && (t_ret = __lock_id_free(env, + locker)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} + +/* + * __rep_update_setup + * Process and setup with this file information. 
+ * + * PUBLIC: int __rep_update_setup __P((ENV *, int, __rep_control_args *, + * PUBLIC: DBT *, time_t)); + */ +int +__rep_update_setup(env, eid, rp, rec, savetime) + ENV *env; + int eid; + __rep_control_args *rp; + DBT *rec; + time_t savetime; +{ + DB_LOG *dblp; + DB_REP *db_rep; + DB_THREAD_INFO *ip; + LOG *lp; + REGENV *renv; + REGINFO *infop; + REP *rep; + __rep_update_args *rup; + __rep_fileinfo_args *finfo; + DB_LSN verify_lsn; + size_t max; + int found, ret; + u_int32_t count; + u_int8_t *end, *next; + + db_rep = env->rep_handle; + rep = db_rep->region; + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + ret = 0; + + MUTEX_LOCK(env, rep->mtx_clientdb); + verify_lsn = lp->verify_lsn; + MUTEX_UNLOCK(env, rep->mtx_clientdb); + REP_SYSTEM_LOCK(env); + if (!F_ISSET(rep, REP_F_RECOVER_UPDATE) || IN_ELECTION(rep)) { + REP_SYSTEM_UNLOCK(env); + return (0); + } + F_CLR(rep, REP_F_RECOVER_UPDATE); + + if ((ret = __rep_update_unmarshal(env, rp->rep_version, + &rup, rec->data, rec->size, &next)) != 0) + return (ret); + DB_ASSERT(env, next == FIRST_FILE_PTR((u_int8_t*)rec->data)); + end = &((u_int8_t*)rec->data)[rec->size]; + + /* + * If we're doing an abbreviated internal init, it's because we found a + * sync point but we needed to materialize any NIMDBs. However, if we + * now see that there are no NIMDBs we can just skip to verify_match, + * just as we would have done if we had already loaded the NIMDBs. In + * other words, if there are no NIMDBs, then I can trivially say that + * I've already loaded all of them! The whole abbreviated internal init + * turns out not to have been necessary after all. 
+ */ + if (F_ISSET(rep, REP_F_ABBREVIATED)) { + count = rup->num_files; + found = 0; + while (count-- > 0) { + max = (size_t)(end - next); + if ((ret = __rep_fileinfo_unmarshal(env, + rp->rep_version, &finfo, next, max, &next)) != 0) + goto err; + found = FLD_ISSET(finfo->db_flags, DB_AM_INMEM); + __os_free(env, finfo); + if (found) + break; + } + if (!found) { + /* + * Revert to VERIFY state, so that we can pick up where + * we left off, except that from now on (i.e., future + * master changes) we can skip checking for NIMDBs if we + * find a sync point. + */ + F_SET(rep, REP_F_NIMDBS_LOADED | REP_F_RECOVER_VERIFY); + F_CLR(rep, REP_F_ABBREVIATED); + + REP_SYSTEM_UNLOCK(env); + ret = __rep_verify_match(env, &verify_lsn, savetime); + __os_free(env, rup); + return (ret); + } + } + + /* + * We know we're the first to come in here due to the + * REP_F_RECOVER_UPDATE flag. + */ + F_SET(rep, REP_F_RECOVER_PAGE); + /* + * We should not ever be in internal init with a lease granted. + */ + DB_ASSERT(env, + !IS_USING_LEASES(env) || __rep_islease_granted(env) == 0); + + /* + * We do not clear REP_F_READY_* in this code. + * We'll eventually call the normal __rep_verify_match recovery + * code and that will clear all the flags and allow others to + * proceed. We lockout both the messages and API here. + * We lockout messages briefly because we are about to reset + * all our LSNs and we do not want another thread possibly + * using/needing those. We have to lockout the API for + * the duration of internal init. + */ + if ((ret = __rep_lockout_msg(env, rep, 1)) != 0) + goto err; + + if ((ret = __rep_lockout_api(env, rep)) != 0) + goto err; + /* + * We need to update the timestamp and kill any open handles + * on this client. The files are changing completely. 
+ */ + infop = env->reginfo; + renv = infop->primary; + (void)time(&renv->rep_timestamp); + + REP_SYSTEM_UNLOCK(env); + MUTEX_LOCK(env, rep->mtx_clientdb); + __os_gettime(env, &lp->rcvd_ts, 1); + lp->wait_ts = rep->request_gap; + ZERO_LSN(lp->ready_lsn); + ZERO_LSN(lp->verify_lsn); + ZERO_LSN(lp->prev_ckp); + ZERO_LSN(lp->waiting_lsn); + ZERO_LSN(lp->max_wait_lsn); + ZERO_LSN(lp->max_perm_lsn); + if (db_rep->rep_db == NULL) + ret = __rep_client_dbinit(env, 0, REP_DB); + MUTEX_UNLOCK(env, rep->mtx_clientdb); + if (ret != 0) + goto err_nolock; + + /* + * We need to empty out any old log records that might be in the + * temp database. + */ + ENV_GET_THREAD_INFO(env, ip); + if ((ret = __db_truncate(db_rep->rep_db, ip, NULL, &count)) != 0) + goto err_nolock; + rep->stat.st_log_queued = 0; + + REP_SYSTEM_LOCK(env); + if (F_ISSET(rep, REP_F_ABBREVIATED)) { + /* + * For an abbreviated internal init, the place from which we'll + * want to request master's logs after (NIMDB) pages are loaded + * is precisely the sync point we found during VERIFY. We'll + * roll back to there in a moment. + * + * We don't need first_vers, because it's only used with + * __log_newfile, which only happens with non-ABBREVIATED + * internal init. + */ + rep->first_lsn = verify_lsn; + } else { + /* + * We will remove all logs we have so we need to request + * from the master's beginning. 
+ */ + rep->first_lsn = rup->first_lsn; + rep->first_vers = rup->first_vers; + } + rep->last_lsn = rp->lsn; + rep->nfiles = rup->num_files; + + RPRINT(env, DB_VERB_REP_SYNC, + (env, "Update setup for %d files.", rep->nfiles)); + RPRINT(env, DB_VERB_REP_SYNC, + (env, "Update setup: First LSN [%lu][%lu].", + (u_long)rep->first_lsn.file, (u_long)rep->first_lsn.offset)); + RPRINT(env, DB_VERB_REP_SYNC, + (env, "Update setup: Last LSN [%lu][%lu]", + (u_long)rep->last_lsn.file, (u_long)rep->last_lsn.offset)); + + if (rep->nfiles > 0) { + rep->infoversion = rp->rep_version; + rep->originfolen = rep->infolen = + rec->size - __REP_UPDATE_SIZE; + if ((ret = __os_calloc(env, 1, rep->infolen, + &rep->originfo)) != 0) + goto err; + memcpy(rep->originfo, + FIRST_FILE_PTR((u_int8_t*)rec->data), rep->infolen); + rep->nextinfo = rep->originfo; + } + + /* + * Clear the decks to make room for the logs and databases that we will + * request as part of this internal init. For a normal, full internal + * init, that means all logs and databases. For an abbreviated internal + * init, it means only the NIMDBs, and only that portion of the log + * after the sync point. + */ + if (F_ISSET(rep, REP_F_ABBREVIATED)) { + /* + * Note that in order to pare the log back to the sync point, we + * can't just crudely hack it off there. We need to make sure + * that pages in regular databases get rolled back to a state + * consistent with that sync point. So we have to do a real + * recovery step. + */ + if ((ret = __rep_rollback(env, &rep->first_lsn)) != 0) + goto err; + ret = __rep_remove_nimdbs(env); + } else + ret = __rep_remove_all(env, rp->rep_version, rec); + if (ret != 0) + goto err; + F_CLR(rep, REP_F_READY_MSG); + + rep->curfile = 0; + ret = __rep_nextfile(env, eid, rep); + if (ret != 0) + goto err; + + if (0) { +err_nolock: REP_SYSTEM_LOCK(env); + } + +err: /* + * If we get an error, we cannot leave ourselves in the RECOVER_PAGE + * state because we have no file information. 
That also means undo'ing + * the rep_lockout. We need to move back to the RECOVER_UPDATE stage. + * In the non-error path, we will have already cleared READY_MSG, but it + * doesn't hurt to clear it again. + */ + F_CLR(rep, REP_F_READY_MSG); + if (ret != 0) { + if (rep->originfo != NULL) { + __os_free(env, rep->originfo); + rep->originfo = NULL; + } + RPRINT(env, DB_VERB_REP_SYNC, (env, + "Update_setup: Error: Clear PAGE, set UPDATE again. %s", + db_strerror(ret))); + F_CLR(rep, REP_F_RECOVER_PAGE | REP_F_READY_API | + REP_F_READY_OP); + F_SET(rep, REP_F_RECOVER_UPDATE); + } + REP_SYSTEM_UNLOCK(env); + __os_free(env, rup); + return (ret); +} + +/* + * Removes any currently existing NIMDBs. We do this at the beginning of + * abbreviated internal init, when any existing NIMDBs should be intact, so + * walk_dir should produce reliable results. + */ +static int +__rep_remove_nimdbs(env) + ENV *env; +{ + __rep_fileinfo_args *finfo; + FILE_LIST_CTX context; + size_t max; + u_int8_t *fp; + int ret; + + finfo = NULL; + + if ((ret = __os_calloc(env, 1, MEGABYTE, &context.buf)) != 0) + return (ret); + context.size = MEGABYTE; + context.count = 0; + context.fillptr = context.buf; + + /* NB: "NULL" asks walk_dir to consider only in-memory DBs */ + if ((ret = __rep_walk_dir(env, NULL, DB_REPVERSION, &context)) != 0) + goto out; + + if ((ret = __rep_closefiles(env)) != 0) + goto out; + + fp = context.buf; + while (context.count-- > 0) { + max = (size_t)(context.fillptr - fp); + if ((ret = __rep_fileinfo_unmarshal(env, DB_REPVERSION, + &finfo, fp, max, &fp)) != 0) + goto out; + if ((ret = __rep_remove_file(env, finfo->uid.data, + finfo->info.data, finfo->type, finfo->db_flags)) != 0) + goto out; + __os_free(env, finfo); + finfo = NULL; + } + +out: + if (finfo != NULL) + __os_free(env, finfo); + __os_free(env, context.buf); + return (ret); +} + +/* + * Removes all existing logs and databases, at the start of internal init. 
But + * before we do, write a list of the databases onto the init file, so that in + * case we crash in the middle, we'll know how to resume when we restart. + * Finally, also write into the init file the UPDATE message from the master (in + * the "rec" DBT), which includes the (new) list of databases we intend to + * request copies of (again, so that we know what to do if we crash in the + * middle). + * + * For the sake of simplicity, these database lists are in the form of an UPDATE + * message (since we already have the mechanisms in place), even though strictly + * speaking that contains more information than we really need to store. + * + * !!! Must be called with the REP_SYSTEM_LOCK held. + */ +static int +__rep_remove_all(env, msg_version, rec) + ENV *env; + u_int32_t msg_version; + DBT *rec; +{ + FILE_LIST_CTX context; + __rep_fileinfo_args *finfo; + __rep_update_args u_args; + DB_FH *fhp; + DB_REP *db_rep; + REP *rep; + size_t cnt, max, updlen; + u_int32_t bufsz, fvers, mvers, zero; + u_int8_t *fp; + int ret, t_ret; + char *fname; + + finfo = NULL; + fname = NULL; + fhp = NULL; + db_rep = env->rep_handle; + rep = db_rep->region; + + /* + * 1. Get list of databases currently present at this client, which we + * intend to remove. + */ + if ((ret = __os_calloc(env, 1, MEGABYTE, &context.buf)) != 0) + return (ret); + context.size = MEGABYTE; + context.count = 0; + + /* Reserve space for the marshaled update_args. */ + context.fillptr = FIRST_FILE_PTR(context.buf); + + if ((ret = __rep_find_dbs(env, DB_REPVERSION, &context)) != 0) + goto out; + ZERO_LSN(u_args.first_lsn); + u_args.first_vers = 0; + u_args.num_files = context.count; + if ((ret = __rep_update_marshal(env, DB_REPVERSION, + &u_args, context.buf, __REP_UPDATE_SIZE, &updlen)) != 0) + goto out; + DB_ASSERT(env, updlen == __REP_UPDATE_SIZE); + + /* + * 2. 
Before removing anything, safe-store the database list, so that in + * case we crash before we've removed them all, when we restart we + * can clean up what we were doing. Only write database list to + * file if not running in-memory replication. + * + * The original version of the file contains: + * data1 size (4 bytes) + * data1 + * data2 size (possibly) (4 bytes) + * data2 (possibly) + * + * As of 4.7 the file has the following form: + * 0 (4 bytes - to indicate a new style file) + * file version (4 bytes) + * data1 version (4 bytes) + * data1 size (4 bytes) + * data1 + * data2 version (possibly) (4 bytes) + * data2 size (possibly) (4 bytes) + * data2 (possibly) + */ + if (!FLD_ISSET(rep->config, REP_C_INMEM)) { + if ((ret = __db_appname(env, + DB_APP_NONE, REP_INITNAME, NULL, &fname)) != 0) + goto out; + /* Sanity check that the write size fits into 32 bits. */ + DB_ASSERT(env, (size_t)(context.fillptr - context.buf) == + (u_int32_t)(context.fillptr - context.buf)); + bufsz = (u_int32_t)(context.fillptr - context.buf); + + /* + * (Short writes aren't possible, so we don't have to verify + * 'cnt'.) This first list is generated internally, so it is + * always in the form of the current message version. + */ + zero = 0; + fvers = REP_INITVERSION; + mvers = DB_REPVERSION; + if ((ret = __os_open(env, fname, 0, + DB_OSO_CREATE | DB_OSO_TRUNC, DB_MODE_600, &fhp)) != 0 || + (ret = + __os_write(env, fhp, &zero, sizeof(zero), &cnt)) != 0 || + (ret = + __os_write(env, fhp, &fvers, sizeof(fvers), &cnt)) != 0 || + (ret = + __os_write(env, fhp, &mvers, sizeof(mvers), &cnt)) != 0 || + (ret = + __os_write(env, fhp, &bufsz, sizeof(bufsz), &cnt)) != 0 || + (ret = + __os_write(env, fhp, context.buf, bufsz, &cnt)) != 0 || + (ret = __os_fsync(env, fhp)) != 0) { + __db_err(env, ret, "%s", fname); + goto out; + } + } + + /* + * 3. Go ahead and remove logs and databases. The databases get removed + * according to the list we just finished safe-storing. 
+ * + * Clearing NIMDBS_LOADED might not really be necessary, since once + * we've committed to removing all there's no chance of doing an + * abbreviated internal init. This just keeps us honest. + */ + if ((ret = __rep_remove_logs(env)) != 0) + goto out; + if ((ret = __rep_closefiles(env)) != 0) + goto out; + F_CLR(rep, REP_F_NIMDBS_LOADED); + fp = FIRST_FILE_PTR(context.buf); + while (context.count-- > 0) { + max = (size_t)(context.fillptr - fp); + if ((ret = __rep_fileinfo_unmarshal(env, DB_REPVERSION, + &finfo, fp, max, &fp)) != 0) + goto out; + if ((ret = __rep_remove_file(env, finfo->uid.data, + finfo->info.data, finfo->type, finfo->db_flags)) != 0) + goto out; + __os_free(env, finfo); + finfo = NULL; + } + + /* + * 4. Safe-store the (new) list of database files we intend to copy from + * the master (again, so that in case we crash before we're finished + * doing so, we'll have enough information to clean up and start over + * again). This list is the list from the master, so it uses + * the message version. Only write to file if not running + * in-memory replication. + */ + if (!FLD_ISSET(rep->config, REP_C_INMEM)) { + mvers = msg_version; + if ((ret = + __os_write(env, fhp, &mvers, sizeof(mvers), &cnt)) != 0 || + (ret = __os_write(env, fhp, + &rec->size, sizeof(rec->size), &cnt)) != 0 || + (ret = + __os_write(env, fhp, rec->data, rec->size, &cnt)) != 0 || + (ret = __os_fsync(env, fhp)) != 0) { + __db_err(env, ret, "%s", fname); + goto out; + } + } + +out: + if (fhp != NULL && (t_ret = __os_closehandle(env, fhp)) && ret == 0) + ret = t_ret; + if (fname != NULL) + __os_free(env, fname); + if (finfo != NULL) + __os_free(env, finfo); + __os_free(env, context.buf); + return (ret); +} + +/* + * __rep_remove_logs - + * Remove our logs to prepare for internal init. 
+ */ +static int +__rep_remove_logs(env) + ENV *env; +{ + DB_LOG *dblp; + DB_LSN lsn; + LOG *lp; + u_int32_t fnum, lastfile; + int ret; + char *name; + + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + ret = 0; + + /* + * Call memp_sync to flush any pages that might be in the log buffers + * and not on disk before we remove files on disk. If there were no + * dirty pages, the log isn't flushed. Yet the log buffers could still + * be dirty: __log_flush should take care of this rare situation. + */ + if ((ret = __memp_sync_int(env, + NULL, 0, DB_SYNC_CACHE | DB_SYNC_INTERRUPT_OK, NULL, NULL)) != 0) + return (ret); + if ((ret = __log_flush(env, NULL)) != 0) + return (ret); + /* + * Forcibly remove existing log files or reset + * the in-memory log space. + */ + if (lp->db_log_inmemory) { + ZERO_LSN(lsn); + if ((ret = __log_zero(env, &lsn)) != 0) + return (ret); + } else { + lastfile = lp->lsn.file; + for (fnum = 1; fnum <= lastfile; fnum++) { + if ((ret = __log_name(dblp, fnum, &name, NULL, 0)) != 0) + return (ret); + (void)time(&lp->timestamp); + (void)__os_unlink(env, name, 0); + __os_free(env, name); + } + } + return (0); +} + +/* + * Removes a file during internal init. Assumes underlying subsystems are + * active; therefore, this can't be used for internal init crash recovery. + */ +static int +__rep_remove_file(env, uid, name, type, flags) + ENV *env; + u_int8_t *uid; + const char *name; + u_int32_t type, flags; +{ + DB *dbp; +#ifdef HAVE_QUEUE + DB_THREAD_INFO *ip; +#endif + int ret, t_ret; + + dbp = NULL; + + /* + * Calling __fop_remove will both purge any matching + * fileid from mpool and unlink it on disk. + */ +#ifdef HAVE_QUEUE + /* + * Handle queue separately. __fop_remove will not + * remove extent files. Use __qam_remove to remove + * extent files that might exist under this name. Note that + * in-memory queue databases can't have extent files. 
+ */ + if (type == (u_int32_t)DB_QUEUE && !LF_ISSET(DB_AM_INMEM)) { + if ((ret = __db_create_internal(&dbp, env, 0)) != 0) + return (ret); + + /* + * At present, qam_remove expects the passed-in dbp to have a + * locker allocated, and if not, db_open allocates a locker + * which qam_remove then leaks. + * + * TODO: it would be better to avoid cobbling together this + * sequence of low-level operations, if fileops provided some + * API to allow us to remove a database without write-locking + * its handle. + */ + if ((ret = __lock_id(env, NULL, &dbp->locker)) != 0) + goto out; + + ENV_GET_THREAD_INFO(env, ip); + RPRINT(env, DB_VERB_REP_SYNC, + (env, "QAM: Unlink %s via __qam_remove", name)); + if ((ret = __qam_remove(dbp, ip, NULL, name, NULL, 0)) != 0) { + RPRINT(env, DB_VERB_REP_SYNC, + (env, "qam_remove returned %d", ret)); + goto out; + } + } +#else + COMPQUIET(type, 0); +#endif + /* + * We call fop_remove even if we've called qam_remove. + * That will only have removed extent files. Now + * we need to deal with the actual file itself. + */ + if (LF_ISSET(DB_AM_INMEM)) { + if ((ret = __db_create_internal(&dbp, env, 0)) != 0) + return (ret); + MAKE_INMEM(dbp); + F_SET(dbp, DB_AM_RECOVER); /* Skirt locking. */ + ret = __db_inmem_remove(dbp, NULL, name); + } else + ret = __fop_remove(env, NULL, uid, name, NULL, DB_APP_DATA, 0); +#ifdef HAVE_QUEUE +out: +#endif + if (dbp != NULL && + (t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} + +/* + * __rep_bulk_page + * Process a bulk page message. + * + * PUBLIC: int __rep_bulk_page __P((ENV *, + * PUBLIC: DB_THREAD_INFO *, int, __rep_control_args *, DBT *)); + */ +int +__rep_bulk_page(env, ip, eid, rp, rec) + ENV *env; + DB_THREAD_INFO *ip; + int eid; + __rep_control_args *rp; + DBT *rec; +{ + __rep_control_args tmprp; + __rep_bulk_args b_args; + int ret; + u_int8_t *p, *ep; + + /* + * We're going to be modifying the rp LSN contents so make + * our own private copy to play with. 
We need to set the + * rectype to REP_PAGE because we're calling through __rep_page + * to process each page, and lower functions make decisions + * based on the rectypes (for throttling/gap processing) + */ + memcpy(&tmprp, rp, sizeof(tmprp)); + tmprp.rectype = REP_PAGE; + ret = 0; + for (ep = (u_int8_t *)rec->data + rec->size, p = (u_int8_t *)rec->data; + p < ep;) { + /* + * First thing in the buffer is the length. Then the LSN + * of this page, then the page info itself. + */ + if ((ret = __rep_bulk_unmarshal(env, + &b_args, p, rec->size, &p)) != 0) + return (ret); + RPRINT(env, DB_VERB_REP_SYNC, (env, + "rep_bulk_page: Processing LSN [%lu][%lu]", + (u_long)tmprp.lsn.file, (u_long)tmprp.lsn.offset)); + RPRINT(env, DB_VERB_REP_SYNC, (env, + "rep_bulk_page: p %#lx ep %#lx pgrec data %#lx, size %lu (%#lx)", + P_TO_ULONG(p), P_TO_ULONG(ep), + P_TO_ULONG(b_args.bulkdata.data), + (u_long)b_args.bulkdata.size, + (u_long)b_args.bulkdata.size)); + /* + * Now send the page info DBT to the page processing function. + */ + ret = __rep_page(env, ip, eid, &tmprp, &b_args.bulkdata); + RPRINT(env, DB_VERB_REP_SYNC, (env, + "rep_bulk_page: rep_page ret %d", ret)); + + /* + * If this set of pages is already done just return. + */ + if (ret != 0) { + if (ret == DB_REP_PAGEDONE) + ret = 0; + break; + } + } + return (ret); +} + +/* + * __rep_page + * Process a page message. 
+ * + * PUBLIC: int __rep_page __P((ENV *, + * PUBLIC: DB_THREAD_INFO *, int, __rep_control_args *, DBT *)); + */ +int +__rep_page(env, ip, eid, rp, rec) + ENV *env; + DB_THREAD_INFO *ip; + int eid; + __rep_control_args *rp; + DBT *rec; +{ + + DB_REP *db_rep; + DBT key, data; + REP *rep; + __rep_fileinfo_args *msgfp; + db_recno_t recno; + int ret; + + ret = 0; + db_rep = env->rep_handle; + rep = db_rep->region; + + if (!F_ISSET(rep, REP_F_RECOVER_PAGE)) + return (DB_REP_PAGEDONE); + /* + * If we restarted internal init, it is possible to receive + * an old REP_PAGE message, while we're in the current + * stage of recovering pages. Until we have some sort of + * an init generation number, ignore any message that has + * a message LSN that is before this internal init's first_lsn. + */ + if (LOG_COMPARE(&rp->lsn, &rep->first_lsn) < 0) { + RPRINT(env, DB_VERB_REP_SYNC, (env, + "PAGE: Old page: msg LSN [%lu][%lu] first_lsn [%lu][%lu]", + (u_long)rp->lsn.file, (u_long)rp->lsn.offset, + (u_long)rep->first_lsn.file, + (u_long)rep->first_lsn.offset)); + return (DB_REP_PAGEDONE); + } + if ((ret = __rep_fileinfo_unmarshal(env, rp->rep_version, + &msgfp, rec->data, rec->size, NULL)) != 0) + return (ret); + MUTEX_LOCK(env, rep->mtx_clientdb); + REP_SYSTEM_LOCK(env); + /* + * Check if the world changed. + */ + if (!F_ISSET(rep, REP_F_RECOVER_PAGE)) { + ret = DB_REP_PAGEDONE; + goto err; + } + /* + * We should not ever be in internal init with a lease granted. + */ + DB_ASSERT(env, + !IS_USING_LEASES(env) || __rep_islease_granted(env) == 0); + + RPRINT(env, DB_VERB_REP_SYNC, (env, + "PAGE: Received page %lu from file %d", + (u_long)msgfp->pgno, msgfp->filenum)); + /* + * Check if this page is from the file we're expecting. + * This may be an old or delayed page message. + */ + /* + * !!! + * If we allow dbrename/dbremove on the master while a client + * is updating, then we'd have to verify the file's uid here too. 
+ */ + if (msgfp->filenum != rep->curfile) { + RPRINT(env, DB_VERB_REP_SYNC, + (env, "Msg file %d != curfile %d", + msgfp->filenum, rep->curfile)); + ret = DB_REP_PAGEDONE; + goto err; + } + /* + * We want to create/open our dbp to the database + * where we'll keep our page information. + */ + if ((ret = __rep_client_dbinit(env, 1, REP_PG)) != 0) { + RPRINT(env, DB_VERB_REP_SYNC, (env, + "PAGE: Client_dbinit %s", db_strerror(ret))); + goto err; + } + + memset(&key, 0, sizeof(key)); + memset(&data, 0, sizeof(data)); + recno = (db_recno_t)(msgfp->pgno + 1); + key.data = &recno; + key.ulen = key.size = sizeof(db_recno_t); + key.flags = DB_DBT_USERMEM; + + /* + * If we already have this page, then we don't want to bother + * rewriting it into the file. Otherwise, any other error + * we want to return. + */ + ret = __db_put(rep->file_dbp, ip, NULL, &key, &data, DB_NOOVERWRITE); + if (ret == DB_KEYEXIST) { + RPRINT(env, DB_VERB_REP_SYNC, (env, + "PAGE: Received duplicate page %lu from file %d", + (u_long)msgfp->pgno, msgfp->filenum)); + STAT(rep->stat.st_pg_duplicated++); + ret = 0; + goto err; + } + if (ret != 0) + goto err; + + RPRINT(env, DB_VERB_REP_SYNC, (env, + "PAGE: Write page %lu into mpool", (u_long)msgfp->pgno)); + /* + * We put the page in the database file itself. + */ + ret = __rep_write_page(env, ip, rep, msgfp); + if (ret != 0) { + /* + * We got an error storing the page, therefore, we need + * remove this page marker from the page database too. + * !!! + * I'm ignoring errors from the delete because we want to + * return the original error. If we cannot write the page + * and we cannot delete the item we just put, what should + * we do? Panic the env and return DB_RUNRECOVERY? + */ + (void)__db_del(rep->file_dbp, NULL, NULL, &key, 0); + goto err; + } + STAT(rep->stat.st_pg_records++); + rep->npages++; + + /* + * Now check the LSN on the page and save it if it is later + * than the one we have. 
+ */ + if (LOG_COMPARE(&rp->lsn, &rep->last_lsn) > 0) + rep->last_lsn = rp->lsn; + + /* + * We've successfully written the page. Now we need to see if + * we're done with this file. __rep_filedone will check if we + * have all the pages expected and if so, set up for the next + * file and send out a page request for the next file's pages. + */ + ret = __rep_filedone(env, ip, eid, rep, msgfp, rp->rectype); + +err: REP_SYSTEM_UNLOCK(env); + MUTEX_UNLOCK(env, rep->mtx_clientdb); + + __os_free(env, msgfp); + return (ret); +} + +/* + * __rep_page_fail + * Process a page fail message. + * + * PUBLIC: int __rep_page_fail __P((ENV *, + * PUBLIC: DB_THREAD_INFO *, int, __rep_control_args *, DBT *)); + */ +int +__rep_page_fail(env, ip, eid, rp, rec) + ENV *env; + DB_THREAD_INFO *ip; + int eid; + __rep_control_args *rp; + DBT *rec; +{ + + DB_REP *db_rep; + REP *rep; + __rep_fileinfo_args *msgfp, *rfp; + int ret; + + ret = 0; + db_rep = env->rep_handle; + rep = db_rep->region; + + if (!F_ISSET(rep, REP_F_RECOVER_PAGE)) + return (0); + if ((ret = __rep_fileinfo_unmarshal(env, rp->rep_version, + &msgfp, rec->data, rec->size, NULL)) != 0) + return (ret); + /* + * Check if this page is from the file we're expecting. + * This may be an old or delayed page message. + */ + /* + * !!! + * If we allow dbrename/dbremove on the master while a client + * is updating, then we'd have to verify the file's uid here too. + */ + MUTEX_LOCK(env, rep->mtx_clientdb); + REP_SYSTEM_LOCK(env); + /* + * We should not ever be in internal init with a lease granted. + */ + DB_ASSERT(env, + !IS_USING_LEASES(env) || __rep_islease_granted(env) == 0); + + if (msgfp->filenum != rep->curfile) { + RPRINT(env, DB_VERB_REP_SYNC, + (env, "Msg file %d != curfile %d", + msgfp->filenum, rep->curfile)); + goto out; + } + rfp = rep->curinfo; + if (rfp->type != (u_int32_t)DB_QUEUE) + --rfp->max_pgno; + else { + /* + * Queue is special. Pages at the beginning of the queue + * may disappear, as well as at the end. 
Use msgfp->pgno + * to adjust accordingly. + */ + RPRINT(env, DB_VERB_REP_SYNC, (env, + "page_fail: BEFORE page %lu failed. ready %lu, max %lu, npages %d", + (u_long)msgfp->pgno, (u_long)rep->ready_pg, + (u_long)rfp->max_pgno, rep->npages)); + if (msgfp->pgno == rfp->max_pgno) + --rfp->max_pgno; + if (msgfp->pgno >= rep->ready_pg) { + rep->ready_pg = msgfp->pgno + 1; + rep->npages = rep->ready_pg; + } + RPRINT(env, DB_VERB_REP_SYNC, (env, + "page_fail: AFTER page %lu failed. ready %lu, max %lu, npages %d", + (u_long)msgfp->pgno, (u_long)rep->ready_pg, + (u_long)rfp->max_pgno, rep->npages)); + } + + /* + * We've lowered the number of pages expected. It is possible that + * this was the last page we were expecting. Now we need to see if + * we're done with this file. __rep_filedone will check if we have + * all the pages expected and if so, set up for the next file and + * send out a page request for the next file's pages. + */ + ret = __rep_filedone(env, ip, eid, rep, msgfp, REP_PAGE_FAIL); +out: + REP_SYSTEM_UNLOCK(env); + MUTEX_UNLOCK(env, rep->mtx_clientdb); + __os_free(env, msgfp); + return (ret); +} + +/* + * __rep_write_page - + * Write this page into a database. + */ +static int +__rep_write_page(env, ip, rep, msgfp) + ENV *env; + DB_THREAD_INFO *ip; + REP *rep; + __rep_fileinfo_args *msgfp; +{ + DB db; + DBT pgcookie; + DB_MPOOLFILE *mpf; + DB_PGINFO *pginfo; + __rep_fileinfo_args *rfp; + int ret; + void *dst; + + rfp = NULL; + + /* + * If this is the first page we're putting in this database, we need + * to create the mpool file. Otherwise call memp_fget to create the + * page in mpool. Then copy the data to the page, and memp_fput the + * page to give it back to mpool. + * + * We need to create the file, removing any existing file and associate + * the correct file ID with the new one. + */ + rfp = rep->curinfo; + if (rep->file_mpf == NULL) { + if (!FLD_ISSET(rfp->db_flags, DB_AM_INMEM)) { + /* + * Recreate the file on disk. 
We'll be putting + * the data into the file via mpool. + */ + RPRINT(env, DB_VERB_REP_SYNC, (env, + "rep_write_page: Calling fop_create for %s", + (char *)rfp->info.data)); + if ((ret = __fop_create(env, NULL, NULL, + rfp->info.data, NULL, DB_APP_DATA, + env->db_mode, 0)) != 0) + goto err; + } + + if ((ret = + __rep_mpf_open(env, &rep->file_mpf, rep->curinfo, + FLD_ISSET(rfp->db_flags, DB_AM_INMEM) ? + DB_CREATE : 0)) != 0) + goto err; + } + /* + * Handle queue specially. If we're a QUEUE database, we need to + * use the __qam_fget/put calls. We need to use rep->queue_dbc for + * that. That dbp is opened after getting the metapage for the + * queue database. Since the meta-page is always in the queue file, + * we'll use the normal path for that first page. After that we + * can assume the dbp is opened. + */ + if (msgfp->type == (u_int32_t)DB_QUEUE && msgfp->pgno != 0) { +#ifdef HAVE_QUEUE + ret = __qam_fget(rep->queue_dbc, &msgfp->pgno, + DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &dst); +#else + /* + * This always returns an error. + */ + ret = __db_no_queue_am(env); +#endif + } else + ret = __memp_fget(rep->file_mpf, &msgfp->pgno, ip, NULL, + DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &dst); + + if (ret != 0) + goto err; + + /* + * Before writing this page into our local mpool, see if its byte order + * needs to be swapped. When in mpool the page should be in the native + * byte order of our local environment. But the page image we've + * received may be in the opposite order (as indicated in finfo_flags). + */ + if ((F_ISSET(env, ENV_LITTLEENDIAN) && + !FLD_ISSET(msgfp->finfo_flags, REPINFO_PG_LITTLEENDIAN)) || + (!F_ISSET(env, ENV_LITTLEENDIAN) && + FLD_ISSET(msgfp->finfo_flags, REPINFO_PG_LITTLEENDIAN))) { + RPRINT(env, DB_VERB_REP_SYNC, (env, + "write_page: Page %d needs to be swapped", msgfp->pgno)); + /* + * Set up a dbp to pass into the swap functions. 
We need + * only a few things: The environment and any special + * dbp flags and some obvious basics like db type and + * pagesize. Those flags were set back in rep_mpf_open + * and are available in the pgcookie set up with the + * mpoolfile associated with this database. + */ + memset(&db, 0, sizeof(db)); + db.env = env; + db.type = (DBTYPE)msgfp->type; + db.pgsize = msgfp->pgsize; + mpf = rep->file_mpf; + if ((ret = __memp_get_pgcookie(mpf, &pgcookie)) != 0) + goto err; + pginfo = (DB_PGINFO *)pgcookie.data; + db.flags = pginfo->flags; + if ((ret = __db_pageswap(&db, msgfp->info.data, msgfp->pgsize, + NULL, 1)) != 0) + goto err; + } + + memcpy(dst, msgfp->info.data, msgfp->pgsize); +#ifdef HAVE_QUEUE + if (msgfp->type == (u_int32_t)DB_QUEUE && msgfp->pgno != 0) + ret = __qam_fput(rep->queue_dbc, + msgfp->pgno, dst, rep->queue_dbc->priority); + else +#endif + ret = __memp_fput(rep->file_mpf, + ip, dst, rep->file_dbp->priority); + +err: return (ret); +} + +/* + * __rep_page_gap - + * After we've put the page into the database, we need to check if + * we have a page gap and whether we need to request pages. + */ +static int +__rep_page_gap(env, rep, msgfp, type) + ENV *env; + REP *rep; + __rep_fileinfo_args *msgfp; + u_int32_t type; +{ + DBC *dbc; + DBT data, key; + DB_LOG *dblp; + DB_THREAD_INFO *ip; + LOG *lp; + __rep_fileinfo_args *rfp; + db_recno_t recno; + int ret, t_ret; + + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + ret = 0; + dbc = NULL; + + /* + * We've successfully put this page into our file. + * Now we need to account for it and re-request new pages + * if necessary. + */ + /* + * We already hold both the db mutex and rep mutex. + */ + rfp = rep->curinfo; + + /* + * Make sure we're still talking about the same file. + * If not, we're done here. + */ + if (rfp->filenum != msgfp->filenum) { + ret = DB_REP_PAGEDONE; + goto err; + } + + /* + * We have 3 possible states: + * 1. We receive a page we already have accounted for. 
+ * msg pgno < ready pgno + * 2. We receive a page that is beyond a gap. + * msg pgno > ready pgno + * 3. We receive the page we're expecting next. + * msg pgno == ready pgno + */ + /* + * State 1. This can happen once we put our page record into the + * database, but by the time we acquire the mutex other + * threads have already accounted for this page and moved on. + * We just want to return. + */ + if (msgfp->pgno < rep->ready_pg) { + RPRINT(env, DB_VERB_REP_SYNC, (env, + "PAGE_GAP: pgno %lu < ready %lu, waiting %lu", + (u_long)msgfp->pgno, (u_long)rep->ready_pg, + (u_long)rep->waiting_pg)); + goto err; + } + + /* + * State 2. This page is beyond the page we're expecting. + * We need to update waiting_pg if this page is less than + * (earlier) the current waiting_pg. There is nothing + * to do but see if we need to request. + */ + RPRINT(env, DB_VERB_REP_SYNC, (env, + "PAGE_GAP: pgno %lu, max_pg %lu ready %lu, waiting %lu max_wait %lu", + (u_long)msgfp->pgno, (u_long)rfp->max_pgno, (u_long)rep->ready_pg, + (u_long)rep->waiting_pg, (u_long)rep->max_wait_pg)); + if (msgfp->pgno > rep->ready_pg) { + if (rep->waiting_pg == PGNO_INVALID || + msgfp->pgno < rep->waiting_pg) + rep->waiting_pg = msgfp->pgno; + } else { + /* + * We received the page we're expecting. + */ + rep->ready_pg++; + __os_gettime(env, &lp->rcvd_ts, 1); + if (rep->ready_pg == rep->waiting_pg) { + /* + * If we get here we know we just filled a gap. + * Move the cursor to that place and then walk + * forward looking for the next gap, if it exists. + */ + lp->wait_ts = rep->request_gap; + rep->max_wait_pg = PGNO_INVALID; + /* + * We need to walk the recno database looking for the + * next page we need or expect. + */ + memset(&key, 0, sizeof(key)); + memset(&data, 0, sizeof(data)); + ENV_GET_THREAD_INFO(env, ip); + if ((ret = __db_cursor(rep->file_dbp, ip, NULL, + &dbc, 0)) != 0) + goto err; + /* + * Set cursor to the first waiting page. + * Page numbers/record numbers are offset by 1. 
+ */ + recno = (db_recno_t)rep->waiting_pg + 1; + key.data = &recno; + key.ulen = key.size = sizeof(db_recno_t); + key.flags = DB_DBT_USERMEM; + /* + * We know that page is there, this should + * find the record. + */ + ret = __dbc_get(dbc, &key, &data, DB_SET); + if (ret != 0) + goto err; + RPRINT(env, DB_VERB_REP_SYNC, (env, + "PAGE_GAP: Set cursor for ready %lu, waiting %lu", + (u_long)rep->ready_pg, (u_long)rep->waiting_pg)); + } + while (ret == 0 && rep->ready_pg == rep->waiting_pg) { + rep->ready_pg++; + ret = __dbc_get(dbc, &key, &data, DB_NEXT); + /* + * If we get to the end of the list, there are no + * more gaps. Reset waiting_pg. + */ + if (ret == DB_NOTFOUND || ret == DB_KEYEMPTY) { + rep->waiting_pg = PGNO_INVALID; + RPRINT(env, DB_VERB_REP_SYNC, (env, + "PAGE_GAP: Next cursor No next - ready %lu, waiting %lu", + (u_long)rep->ready_pg, + (u_long)rep->waiting_pg)); + break; + } + /* + * Subtract 1 from waiting_pg because record numbers + * are 1-based and pages are 0-based and we added 1 + * into the page number when we put it into the db. + */ + rep->waiting_pg = *(db_pgno_t *)key.data; + rep->waiting_pg--; + RPRINT(env, DB_VERB_REP_SYNC, (env, + "PAGE_GAP: Next cursor ready %lu, waiting %lu", + (u_long)rep->ready_pg, (u_long)rep->waiting_pg)); + } + } + + /* + * If we filled a gap and now have the entire file, there's + * nothing to do. We're done when ready_pg is > max_pgno + * because ready_pg is larger than the last page we received. + */ + if (rep->ready_pg > rfp->max_pgno) + goto err; + + /* + * Check if we need to ask for more pages. + */ + if ((rep->waiting_pg != PGNO_INVALID && + rep->ready_pg != rep->waiting_pg) || type == REP_PAGE_MORE) { + /* + * We got a page but we may still be waiting for more. + * If we got REP_PAGE_MORE we always want to ask for more. + * We need to set rfp->pgno to the current page number + * we will use to ask for more pages. 
+ */ + if (type == REP_PAGE_MORE) + rfp->pgno = msgfp->pgno; + if ((__rep_check_doreq(env, rep) || type == REP_PAGE_MORE) && + ((ret = __rep_pggap_req(env, rep, rfp, + (type == REP_PAGE_MORE) ? REP_GAP_FORCE : 0)) != 0)) + goto err; + } else { + lp->wait_ts = rep->request_gap; + rep->max_wait_pg = PGNO_INVALID; + } + +err: + if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __rep_init_cleanup - + * Clean up internal initialization pieces. + * + * !!! + * Caller must hold client database mutex (mtx_clientdb) and REP_SYSTEM_LOCK. + * + * PUBLIC: int __rep_init_cleanup __P((ENV *, REP *, int)); + */ +int +__rep_init_cleanup(env, rep, force) + ENV *env; + REP *rep; + int force; +{ + DB *queue_dbp; + int ret, t_ret; + + ret = 0; + /* + * 1. Close up the file data pointer we used. + * 2. Close/reset the page database. + * 3. Close/reset the queue database if we're forcing a cleanup. + * 4. Free current file info. + * 5. If we have all files or need to force, free original file info. + */ + if (rep->file_mpf != NULL) { + ret = __memp_fclose(rep->file_mpf, 0); + rep->file_mpf = NULL; + } + if (rep->file_dbp != NULL) { + t_ret = __db_close(rep->file_dbp, NULL, DB_NOSYNC); + rep->file_dbp = NULL; + if (ret == 0) + ret = t_ret; + } + if (force && rep->queue_dbc != NULL) { + queue_dbp = rep->queue_dbc->dbp; + if ((t_ret = __dbc_close(rep->queue_dbc)) != 0 && ret == 0) + ret = t_ret; + rep->queue_dbc = NULL; + if ((t_ret = __db_close(queue_dbp, NULL, DB_NOSYNC)) != 0 && + ret == 0) + ret = t_ret; + } + if (rep->curinfo != NULL) { + __os_free(env, rep->curinfo); + rep->curinfo = NULL; + } + if (IN_INTERNAL_INIT(rep) && force) { + RPRINT(env, DB_VERB_REP_SYNC, + (env, "clean up interrupted internal init")); + t_ret = F_ISSET(rep, REP_F_ABBREVIATED) ? 
+ __rep_cleanup_nimdbs(env) : + __rep_clean_interrupted(env); + if (ret == 0) + ret = t_ret; + + if (rep->originfo != NULL) { + __os_free(env, rep->originfo); + rep->originfo = NULL; + } + } + + return (ret); +} + +/* + * Remove NIMDBs that may have been fully or partially loaded during an + * abbreviated internal init, when the init gets interrupted. At this point, + * we know that any databases we have processed are listed in originfo. + */ +static int +__rep_cleanup_nimdbs(env) + ENV *env; +{ + REP *rep; + DB *dbp; + __rep_fileinfo_args *rfp; + u_int8_t *filelist, *new_fp; + char *namep; + u_int32_t count, filesz, version; + int ret, t_ret; + + /* Use the saved file list from the original UPDATE message. */ + rep = env->rep_handle->region; + version = rep->infoversion; + filelist = rep->originfo; + filesz = rep->originfolen; + count = rep->nfiles; + + ret = 0; + rfp = NULL; + dbp = NULL; + while (count-- > 0) { + if ((ret = __rep_fileinfo_unmarshal(env, version, + &rfp, filelist, filesz, &new_fp)) != 0) + goto out; + filesz -= (u_int32_t)(new_fp - filelist); + filelist = new_fp; + + if (FLD_ISSET(rfp->db_flags, DB_AM_INMEM)) { + namep = rfp->info.data; + + if ((ret = __db_create_internal(&dbp, env, 0)) != 0) + goto out; + MAKE_INMEM(dbp); + F_SET(dbp, DB_AM_RECOVER); /* Skirt locking. */ + + /* + * Some of these "files" (actually NIMDBs) may not exist + * yet, simply because the interrupted abbreviated + * internal init had not yet progressed far enough to + * retrieve them. So ENOENT is an acceptable outcome. + */ + if ((ret = __db_inmem_remove(dbp, NULL, namep)) != 0 && + ret != ENOENT) + goto out; + ret = __db_close(dbp, NULL, DB_NOSYNC); + dbp = NULL; + if (ret != 0) + goto out; + } + + __os_free(env, rfp); + rfp = NULL; + } + +out: + if (rfp != NULL) + __os_free(env, rfp); + if (dbp != NULL && + (t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} + +/* + * Clean up files involved in an interrupted internal init. 
+ */
+static int
+__rep_clean_interrupted(env)
+	ENV *env;
+{
+	REP *rep;
+	LOG *lp;
+	int ret, t_ret;
+
+	rep = env->rep_handle->region;
+
+	/*
+	 * The work, in order:
+	 *	1a. remove the old log files
+	 *	1b. set up an initial log file #1 (only if 1a succeeded)
+	 *	2.  remove the database files
+	 *	3.  remove the "init file" (only if everything above succeeded)
+	 *
+	 * Step 2 is attempted whether or not step 1 worked. The first
+	 * error encountered is the one reported.
+	 */
+
+	/* Step 1a. */
+	ret = __rep_remove_logs(env);
+	if (ret == 0) {
+		/*
+		 * With no logs left, make the environment look like a new
+		 * client that has just started: nothing but a fresh log
+		 * file #1. That file may soon be removed again, which is
+		 * a little wasteful but insignificant in the context of
+		 * an interrupted internal init.
+		 */
+		lp = env->lg_handle->reginfo.primary;
+
+		/* Step 1b. */
+		ret = __rep_log_setup(env,
+		    rep, 1, DB_LOGVERSION, &lp->ready_lsn);
+	}
+
+	/* Step 2. */
+	t_ret = __rep_remove_by_list(env, rep->infoversion,
+	    rep->originfo, rep->originfolen, rep->nfiles);
+	if (ret == 0)
+		ret = t_ret;
+
+	/*
+	 * Step 3. The init file exists to show that some files remain to
+	 * be cleaned up, so it must survive any failure above.
+	 */
+	if (ret != 0)
+		return (ret);
+	return (__rep_remove_init_file(env));
+}
+
+/*
+ * __rep_filedone -
+ *	We need to check if we're done with the current file after
+ *	processing the current page. Stat the database to see if
+ *	we have all the pages. If so, we need to clean up/close
+ *	this one, set up for the next one, and ask for its pages,
+ *	or if this is the last file, request the log records and
+ *	move to the REP_RECOVER_LOG state.
+ */ +static int +__rep_filedone(env, ip, eid, rep, msgfp, type) + ENV *env; + DB_THREAD_INFO *ip; + int eid; + REP *rep; + __rep_fileinfo_args *msgfp; + u_int32_t type; +{ + __rep_fileinfo_args *rfp; + int ret; + + /* + * We've put our page, now we need to do any gap processing + * that might be needed to re-request pages. + */ + ret = __rep_page_gap(env, rep, msgfp, type); + /* + * The world changed while we were doing gap processing. + * We're done here. + */ + if (ret == DB_REP_PAGEDONE) + return (0); + + rfp = rep->curinfo; + /* + * max_pgno is 0-based and npages is 1-based, so we don't have + * all the pages until npages is > max_pgno. + */ + RPRINT(env, DB_VERB_REP_SYNC, + (env, "FILEDONE: have %lu pages. Need %lu.", + (u_long)rep->npages, (u_long)rfp->max_pgno + 1)); + if (rep->npages <= rfp->max_pgno) + return (0); + + /* + * If we're queue and we think we have all the pages for this file, + * we need to do special queue processing. Queue is handled in + * several stages. + */ + if (rfp->type == (u_int32_t)DB_QUEUE && + ((ret = __rep_queue_filedone(env, ip, rep, rfp)) != + DB_REP_PAGEDONE)) + return (ret); + /* + * We have all the pages for this file. Clean up. + */ + if ((ret = __rep_init_cleanup(env, rep, 0)) != 0) + goto err; + + rep->curfile++; + ret = __rep_nextfile(env, eid, rep); +err: + return (ret); +} + +/* + * Starts requesting pages for the next file in the list (if any), or if not, + * proceeds to the next stage: requesting logs. + * + * !!! + * Called with REP_SYSTEM_LOCK held or both clientdb_mutex and REP_SYSTEM, + * though we may drop REP_SYSTEM_LOCK momentarily in order to send + * a LOG_REQ (but not a PAGE_REQ). 
+ */ +static int +__rep_nextfile(env, eid, rep) + ENV *env; + int eid; + REP *rep; +{ + DBT dbt; + __rep_logreq_args lr_args; + int ret; + u_int8_t *buf, *info_ptr, lrbuf[__REP_LOGREQ_SIZE]; + size_t len, msgsz; + + /* + * Always direct the next request to the master (at least nominally), + * regardless of where the current response came from. The application + * can always still redirect it to another client. + */ + if (rep->master_id != DB_EID_INVALID) + eid = rep->master_id; + + while (rep->curfile < rep->nfiles) { + /* Set curinfo to next file and examine it. */ + info_ptr = rep->nextinfo; + if ((ret = __rep_fileinfo_unmarshal(env, + rep->infoversion, &rep->curinfo, + info_ptr, rep->infolen, &rep->nextinfo)) != 0) { + RPRINT(env, DB_VERB_REP_SYNC, (env, + "NEXTINFO: Fileinfo read: %s", db_strerror(ret))); + return (ret); + } + rep->infolen -= (u_int32_t)(rep->nextinfo - info_ptr); + + /* Skip over regular DB's in "abbreviated" internal inits. */ + if (F_ISSET(rep, REP_F_ABBREVIATED) && + !FLD_ISSET(rep->curinfo->db_flags, DB_AM_INMEM)) { + RPRINT(env, DB_VERB_REP_SYNC, (env, + "Skipping file %d in abbreviated internal init", + rep->curinfo->filenum)); + __os_free(env, rep->curinfo); + rep->curinfo = NULL; + rep->curfile++; + continue; + } + + /* Request this file's pages. 
*/ + DB_ASSERT(env, rep->curinfo->pgno == 0); + rep->ready_pg = 0; + rep->npages = 0; + rep->waiting_pg = PGNO_INVALID; + rep->max_wait_pg = PGNO_INVALID; + memset(&dbt, 0, sizeof(dbt)); + RPRINT(env, DB_VERB_REP_SYNC, (env, + "Next file %d: pgsize %lu, maxpg %lu", + rep->curinfo->filenum, (u_long)rep->curinfo->pgsize, + (u_long)rep->curinfo->max_pgno)); + msgsz = __REP_FILEINFO_SIZE + + rep->curinfo->uid.size + rep->curinfo->info.size; + if ((ret = __os_calloc(env, 1, msgsz, &buf)) != 0) + return (ret); + if ((ret = __rep_fileinfo_marshal(env, rep->infoversion, + rep->curinfo, buf, msgsz, &len)) != 0) + return (ret); + DB_INIT_DBT(dbt, buf, len); + (void)__rep_send_message(env, eid, REP_PAGE_REQ, + NULL, &dbt, 0, DB_REP_ANYWHERE); + __os_free(env, buf); + + return (0); + } + + RPRINT(env, DB_VERB_REP_SYNC, (env, + "NEXTFILE: have %d files. RECOVER_LOG now", rep->nfiles)); + /* + * Move to REP_RECOVER_LOG state. + * Request logs. + */ + /* + * We need to do a sync here so that any later opens + * can find the file and file id. We need to do it + * before we clear REP_F_RECOVER_PAGE so that we do not + * try to flush the log. + */ + if ((ret = __memp_sync_int(env, NULL, 0, + DB_SYNC_CACHE | DB_SYNC_INTERRUPT_OK, NULL, NULL)) != 0) + return (ret); + F_CLR(rep, REP_F_RECOVER_PAGE); + F_SET(rep, REP_F_RECOVER_LOG); + memset(&dbt, 0, sizeof(dbt)); + lr_args.endlsn = rep->last_lsn; + if ((ret = __rep_logreq_marshal(env, &lr_args, lrbuf, + __REP_LOGREQ_SIZE, &len)) != 0) + return (ret); + DB_INIT_DBT(dbt, lrbuf, len); + + /* + * Get the logging subsystem ready to receive the first log record we + * are going to ask for. In the case of a normal internal init, this is + * pretty simple, since we only deal in whole log files. In the + * ABBREVIATED case we've already taken care of this, back when we + * processed the UPDATE message, because we had to do it by rolling back + * to a sync point at an arbitrary LSN. 
+ */ + if (!F_ISSET(rep, REP_F_ABBREVIATED) && + (ret = __rep_log_setup(env, rep, + rep->first_lsn.file, rep->first_vers, NULL)) != 0) + return (ret); + RPRINT(env, DB_VERB_REP_SYNC, (env, + "NEXTFILE: LOG_REQ from LSN [%lu][%lu] to [%lu][%lu]", + (u_long)rep->first_lsn.file, (u_long)rep->first_lsn.offset, + (u_long)rep->last_lsn.file, (u_long)rep->last_lsn.offset)); + REP_SYSTEM_UNLOCK(env); + (void)__rep_send_message(env, eid, + REP_LOG_REQ, &rep->first_lsn, &dbt, REPCTL_INIT, DB_REP_ANYWHERE); + REP_SYSTEM_LOCK(env); + return (0); +} + +/* + * Run a recovery, for the purpose of rolling back the client environment to a + * specific sync point, in preparation for doing an abbreviated internal init + * (materializing only NIMDBs, when we already have the on-disk DBs). + * + * REP_SYSTEM_LOCK should be held on entry, and will be held on exit, but we + * drop it momentarily during the call. + */ +static int +__rep_rollback(env, lsnp) + ENV *env; + DB_LSN *lsnp; +{ + DB_LOG *dblp; + DB_REP *db_rep; + LOG *lp; + REP *rep; + DB_THREAD_INFO *ip; + DB_LSN trunclsn; + int ret; + u_int32_t unused; + + db_rep = env->rep_handle; + rep = db_rep->region; + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + ENV_GET_THREAD_INFO(env, ip); + + DB_ASSERT(env, F_ISSET(rep, + REP_F_READY_API | REP_F_READY_MSG | REP_F_READY_OP)); + + REP_SYSTEM_UNLOCK(env); + + if ((ret = __rep_dorecovery(env, lsnp, &trunclsn)) != 0) + goto errlock; + + MUTEX_LOCK(env, rep->mtx_clientdb); + lp->ready_lsn = trunclsn; + ZERO_LSN(lp->waiting_lsn); + ZERO_LSN(lp->max_wait_lsn); + lp->max_perm_lsn = *lsnp; + lp->wait_ts = rep->request_gap; + __os_gettime(env, &lp->rcvd_ts, 1); + ZERO_LSN(lp->verify_lsn); + + if (db_rep->rep_db == NULL && + (ret = __rep_client_dbinit(env, 0, REP_DB)) != 0) { + MUTEX_UNLOCK(env, rep->mtx_clientdb); + goto errlock; + } + + F_SET(db_rep->rep_db, DB_AM_RECOVER); + MUTEX_UNLOCK(env, rep->mtx_clientdb); + ret = __db_truncate(db_rep->rep_db, ip, NULL, &unused); + 
MUTEX_LOCK(env, rep->mtx_clientdb); + F_CLR(db_rep->rep_db, DB_AM_RECOVER); + rep->stat.st_log_queued = 0; + MUTEX_UNLOCK(env, rep->mtx_clientdb); + +errlock: + REP_SYSTEM_LOCK(env); + + return (ret); +} + +/* + * __rep_mpf_open - + * Create and open the mpool file for a database. + * Used by both master and client to bring files into mpool. + */ +static int +__rep_mpf_open(env, mpfp, rfp, flags) + ENV *env; + DB_MPOOLFILE **mpfp; + __rep_fileinfo_args *rfp; + u_int32_t flags; +{ + DB db; + int ret; + + if ((ret = __memp_fcreate(env, mpfp)) != 0) + return (ret); + + /* + * We need a dbp to pass into to __env_mpool. Set up + * only the parts that it needs. + */ + memset(&db, 0, sizeof(db)); + db.env = env; + db.type = (DBTYPE)rfp->type; + db.pgsize = rfp->pgsize; + memcpy(db.fileid, rfp->uid.data, DB_FILE_ID_LEN); + db.flags = rfp->db_flags; + /* We need to make sure the dbp isn't marked open. */ + F_CLR(&db, DB_AM_OPEN_CALLED); + /* + * The byte order of this database may be different from my local native + * byte order. If so, set the swap bit so that the necessary swapping + * will be done during file I/O. + */ + if ((F_ISSET(env, ENV_LITTLEENDIAN) && + !FLD_ISSET(rfp->finfo_flags, REPINFO_DB_LITTLEENDIAN)) || + (!F_ISSET(env, ENV_LITTLEENDIAN) && + FLD_ISSET(rfp->finfo_flags, REPINFO_DB_LITTLEENDIAN))) { + RPRINT(env, DB_VERB_REP_SYNC, (env, + "rep_mpf_open: Different endian database. Set swap bit.")); + F_SET(&db, DB_AM_SWAP); + } else + F_CLR(&db, DB_AM_SWAP); + + db.mpf = *mpfp; + if (F_ISSET(&db, DB_AM_INMEM)) + (void)__memp_set_flags(db.mpf, DB_MPOOL_NOFILE, 1); + if ((ret = __env_mpool(&db, rfp->info.data, flags)) != 0) { + (void)__memp_fclose(db.mpf, 0); + *mpfp = NULL; + } + return (ret); +} + +/* + * __rep_pggap_req - + * Request a page gap. Assumes the caller holds the rep_mutex. 
+ *
+ * PUBLIC: int __rep_pggap_req __P((ENV *, REP *, __rep_fileinfo_args *,
+ * PUBLIC:     u_int32_t));
+ */
+int
+__rep_pggap_req(env, rep, reqfp, gapflags)
+	ENV *env;
+	REP *rep;
+	__rep_fileinfo_args *reqfp;
+	u_int32_t gapflags;
+{
+	DBT max_pg_dbt;
+	__rep_fileinfo_args *tmpfp, t;
+	size_t len, msgsz;
+	u_int32_t flags;
+	int alloc, master, ret;
+	u_int8_t *buf;
+
+	ret = 0;
+	alloc = 0;
+	/*
+	 * There is a window where we have to set REP_RECOVER_PAGE when
+	 * we receive the update information to transition from getting
+	 * file information to getting page information. However, that
+	 * thread does release and then reacquire mutexes. So, we might
+	 * try re-requesting before the original thread can get curinfo
+	 * setup. If curinfo isn't set up there is nothing to do.
+	 *
+	 * The request is built in a scratch fileinfo: a heap copy of
+	 * curinfo when the caller passed no reqfp, otherwise a local
+	 * copy of *reqfp, so the pgno/max_pgno changes made below are
+	 * not written back into the caller's structure.
+	 */
+	if (rep->curinfo == NULL)
+		return (0);
+	if (reqfp == NULL) {
+		if ((ret = __rep_finfo_alloc(env, rep->curinfo, &tmpfp)) != 0)
+			return (ret);
+		alloc = 1;		/* tmpfp must be freed at err. */
+	} else {
+		t = *reqfp;
+		tmpfp = &t;
+	}
+
+	/*
+	 * If we've never requested this page, then
+	 * request everything between it and the first
+	 * page we have. If we have requested this page
+	 * then only request this record, not the entire gap.
+	 */
+	flags = 0;
+	memset(&max_pg_dbt, 0, sizeof(max_pg_dbt));
+	/*
+	 * If this is a PAGE_MORE and we're forcing then we want to
+	 * force the request to ask for the next page after this one.
+	 * Otherwise the request starts at ready_pg.
+	 */
+	if (FLD_ISSET(gapflags, REP_GAP_FORCE))
+		tmpfp->pgno++;
+	else
+		tmpfp->pgno = rep->ready_pg;
+	msgsz = __REP_FILEINFO_SIZE +
+	    tmpfp->uid.size + tmpfp->info.size;
+	if ((ret = __os_calloc(env, 1, msgsz, &buf)) != 0)
+		goto err;
+	if (rep->max_wait_pg == PGNO_INVALID ||
+	    FLD_ISSET(gapflags, REP_GAP_FORCE | REP_GAP_REREQUEST)) {
+		/*
+		 * Request the gap - set max to waiting_pg - 1 or if
+		 * there is no waiting_pg, just ask for one.
+		 * (When forcing or re-requesting with no waiting_pg, the
+		 * request instead runs to the file's max_pgno.)
+		 */
+		if (rep->waiting_pg == PGNO_INVALID) {
+			if (FLD_ISSET(gapflags,
+			    REP_GAP_FORCE | REP_GAP_REREQUEST))
+				rep->max_wait_pg = rep->curinfo->max_pgno;
+			else
+				rep->max_wait_pg = rep->ready_pg;
+		} else {
+			/*
+			 * If we're forcing, and waiting_pg is less than
+			 * the page we want to start this request at, then
+			 * we set max_wait_pg to the max pgno in the file.
+			 */
+			if (FLD_ISSET(gapflags, REP_GAP_FORCE) &&
+			    rep->waiting_pg < tmpfp->pgno)
+				rep->max_wait_pg = rep->curinfo->max_pgno;
+			else
+				rep->max_wait_pg = rep->waiting_pg - 1;
+		}
+		tmpfp->max_pgno = rep->max_wait_pg;
+		/*
+		 * Gap requests are "new" and can go anywhere, unless the
+		 * caller explicitly marked this as a re-request.
+		 */
+		if (FLD_ISSET(gapflags, REP_GAP_REREQUEST))
+			flags = DB_REP_REREQUEST;
+		else
+			flags = DB_REP_ANYWHERE;
+	} else {
+		/*
+		 * Request 1 page - set max to ready_pg.
+		 */
+		rep->max_wait_pg = rep->ready_pg;
+		tmpfp->max_pgno = rep->ready_pg;
+		/*
+		 * If we're dropping to singletons, this is a rerequest.
+		 */
+		flags = DB_REP_REREQUEST;
+	}
+	if ((master = rep->master_id) != DB_EID_INVALID) {
+		STAT(rep->stat.st_pg_requested++);
+		/*
+		 * We need to request the pages, but we need to get the
+		 * new info into rep->finfo. Assert that the sizes never
+		 * change. The only thing this should do is change
+		 * the pgno field. Everything else remains the same.
+		 *
+		 * A marshal failure is returned to the caller through
+		 * ret; the result of the send itself is ignored.
+		 */
+		if ((ret = __rep_fileinfo_marshal(env, rep->infoversion,
+		    tmpfp, buf, msgsz, &len)) == 0) {
+			DB_INIT_DBT(max_pg_dbt, buf, len);
+			DB_ASSERT(env, len == max_pg_dbt.size);
+			(void)__rep_send_message(env, master,
+			    REP_PAGE_REQ, NULL, &max_pg_dbt, 0, flags);
+		}
+	} else		/* No known master: broadcast a request for one. */
+		(void)__rep_send_message(env, DB_EID_BROADCAST,
+		    REP_MASTER_REQ, NULL, NULL, 0, 0);
+
+	__os_free(env, buf);
+err:
+	if (alloc)
+		__os_free(env, tmpfp);
+	return (ret);
+}
+
+/*
+ * __rep_finfo_alloc -
+ *	Allocate and initialize a fileinfo structure.
+ *
+ * PUBLIC: int __rep_finfo_alloc __P((ENV *, __rep_fileinfo_args *,
+ * PUBLIC:    __rep_fileinfo_args **));
+ */
+int
+__rep_finfo_alloc(env, rfpsrc, rfpp)
+	ENV *env;
+	__rep_fileinfo_args *rfpsrc, **rfpp;
+{
+	__rep_fileinfo_args *rfp;
+	size_t size;
+	int ret;
+	void *uidp, *infop;
+
+	/*
+	 * Allocate enough for the structure and the two DBT data areas.
+	 * Everything lives in one allocation, laid out as the structure,
+	 * then the uid bytes, then the info bytes, so a single free of
+	 * the returned pointer releases all of it.
+	 */
+	size = sizeof(__rep_fileinfo_args) + rfpsrc->uid.size +
+	    rfpsrc->info.size;
+	if ((ret = __os_malloc(env, size, &rfp)) != 0)
+		return (ret);
+
+	/*
+	 * Copy the structure itself, and then set the DBT data pointers
+	 * to their space and copy the data itself as well.
+	 */
+	memcpy(rfp, rfpsrc, sizeof(__rep_fileinfo_args));
+	uidp = (u_int8_t *)rfp + sizeof(__rep_fileinfo_args);
+	rfp->uid.data = uidp;
+	memcpy(uidp, rfpsrc->uid.data, rfpsrc->uid.size);
+
+	infop = (u_int8_t *)uidp + rfpsrc->uid.size;
+	rfp->info.data = infop;
+	memcpy(infop, rfpsrc->info.data, rfpsrc->info.size);
+	*rfpp = rfp;
+	return (ret);
+}
+
+/*
+ * __rep_log_setup -
+ *	We know our first LSN and need to reset the log subsystem
+ *	to get our logs set up for the proper file.
+ *
+ *	If lsnp is not NULL and the new log file was created, *lsnp is set
+ *	to the LSN reported by __log_newfile. rep->first_lsn and the txn
+ *	region's last_ckp are reset whether or not __log_newfile succeeded;
+ *	the value returned is the result of __log_newfile.
+ */
+static int
+__rep_log_setup(env, rep, file, version, lsnp)
+	ENV *env;
+	REP *rep;
+	u_int32_t file;
+	u_int32_t version;
+	DB_LSN *lsnp;
+{
+	DB_LOG *dblp;
+	DB_LSN lsn;
+	DB_TXNMGR *mgr;
+	DB_TXNREGION *region;
+	LOG *lp;
+	int ret;
+
+	dblp = env->lg_handle;
+	lp = dblp->reginfo.primary;
+	mgr = env->tx_handle;
+	region = mgr->reginfo.primary;
+
+	/*
+	 * Set up the log starting at the file number of the first LSN we
+	 * need to get from the master.
+	 */
+	LOG_SYSTEM_LOCK(env);
+	if ((ret = __log_newfile(dblp, &lsn, file, version)) == 0 &&
+	    lsnp != NULL)
+		*lsnp = lsn;
+	LOG_SYSTEM_UNLOCK(env);
+
+	/*
+	 * We reset first_lsn to the lp->lsn. We were given the LSN of
+	 * the checkpoint and we now need the LSN for the beginning of
+	 * the file, which __log_newfile conveniently set up for us
+	 * in lp->lsn.
+	 */
+	rep->first_lsn = lp->lsn;
+	TXN_SYSTEM_LOCK(env);
+	ZERO_LSN(region->last_ckp);
+	TXN_SYSTEM_UNLOCK(env);
+	return (ret);
+}
+
+/*
+ * __rep_queue_filedone -
+ *	Determine if we're really done getting the pages for a queue file.
+ *	Queue is handled in several steps.
+ *	1. First we get the meta page only.
+ *	2. We use the meta-page information to figure out first and last
+ *	   page numbers (and if queue wraps, first can be > last).
+ *	3. If first < last, we do a REP_PAGE_REQ for all pages.
+ *	4. If first > last, we REP_PAGE_REQ from first -> max page number.
+ *	   Then we'll ask for page 1 -> last.
+ *
+ * This function can return several things:
+ *	DB_REP_PAGEDONE - if we're done with this file.
+ *	0 - if we're not done with this file.
+ *	error - if we get an error doing some operations.
+ *
+ * This function will open a dbp handle to the queue file. This is needed
+ * by most of the QAM macros. We'll open it on the first pass through
+ * here and we'll close it whenever we decide we're done.
+ *
+ * When built without HAVE_QUEUE the function only returns the result of
+ * __db_no_queue_am.
+ */
+static int
+__rep_queue_filedone(env, ip, rep, rfp)
+	ENV *env;
+	DB_THREAD_INFO *ip;
+	REP *rep;
+	__rep_fileinfo_args *rfp;
+{
+#ifndef HAVE_QUEUE
+	COMPQUIET(ip, NULL);
+	COMPQUIET(rep, NULL);
+	COMPQUIET(rfp, NULL);
+	return (__db_no_queue_am(env));
+#else
+	DB *queue_dbp;
+	db_pgno_t first, last;
+	u_int32_t flags;
+	int empty, ret, t_ret;
+
+	ret = 0;
+	queue_dbp = NULL;
+	if (rep->queue_dbc == NULL) {
+		/*
+		 * We need to do a sync here so that the open
+		 * can find the file and file id.
+		 */
+		if ((ret = __memp_sync_int(env, NULL, 0,
+		    DB_SYNC_CACHE | DB_SYNC_INTERRUPT_OK, NULL, NULL)) != 0)
+			goto out;
+		if ((ret =
+		    __db_create_internal(&queue_dbp, env, 0)) != 0)
+			goto out;
+		flags = DB_NO_AUTO_COMMIT |
+		    (F_ISSET(env, ENV_THREAD) ? DB_THREAD : 0);
+		/*
+		 * We need to check whether this is in-memory so that we pass
+		 * the name correctly as either the file or the database name.
+		 */
+		if ((ret = __db_open(queue_dbp, ip, NULL,
+		    FLD_ISSET(rfp->db_flags, DB_AM_INMEM) ? NULL :
+		    rfp->info.data,
+		    FLD_ISSET(rfp->db_flags, DB_AM_INMEM) ? rfp->info.data :
+		    NULL,
+		    DB_QUEUE, flags, 0, PGNO_BASE_MD)) != 0)
+			goto out;
+
+		if ((ret = __db_cursor(queue_dbp,
+		    ip, NULL, &rep->queue_dbc, 0)) != 0)
+			goto out;
+	} else		/* Later pass: reuse the handle behind the cursor. */
+		queue_dbp = rep->queue_dbc->dbp;
+
+	if ((ret = __queue_pageinfo(queue_dbp,
+	    &first, &last, &empty, 0, 0)) != 0)
+		goto out;
+	RPRINT(env, DB_VERB_REP_SYNC, (env,
+	    "Queue fileinfo: first %lu, last %lu, empty %d",
+	    (u_long)first, (u_long)last, empty));
+	/*
+	 * We can be at the end of 3 possible states.
+	 * 1. We have received the meta-page and now need to get the
+	 *    rest of the pages in the database.
+	 * 2. We have received from first -> max_pgno. We might be done,
+	 *    or we might need to ask for wrapped pages.
+	 * 3. We have received all pages in the file. We're done.
+	 */
+	if (rfp->max_pgno == 0) {
+		/*
+		 * We have just received the meta page. Set up the next
+		 * pages to ask for and check if the file is empty.
+		 */
+		if (empty)
+			goto out;
+		if (first > last) {
+			rfp->max_pgno =
+			    QAM_RECNO_PAGE(rep->queue_dbc->dbp, UINT32_MAX);
+		} else
+			rfp->max_pgno = last;
+		RPRINT(env, DB_VERB_REP_SYNC, (env,
+		    "Queue fileinfo: First req: first %lu, last %lu",
+		    (u_long)first, (u_long)rfp->max_pgno));
+		goto req;
+	} else if (rfp->max_pgno != last) {
+		/*
+		 * If max_pgno != last that means we're dealing with a
+		 * wrapped situation. Request next batch of pages.
+		 * Set npages to 1 because we already have page 0, the
+		 * meta-page, now we need pages 1-max_pgno.
+		 */
+		first = 1;
+		rfp->max_pgno = last;
+		RPRINT(env, DB_VERB_REP_SYNC, (env,
+		    "Queue fileinfo: Wrap req: first %lu, last %lu",
+		    (u_long)first, (u_long)last));
+req:
+		/*
+		 * Since we're simulating a "gap" to resend new PAGE_REQ
+		 * for this file, we need to set waiting page to last + 1
+		 * so that we'll ask for all from ready_pg -> last.
+		 *
+		 * This label is also reached by goto from the first-request
+		 * branch above. The handle and cursor stay open on this
+		 * path; only the out path below closes them.
+		 */
+		rep->npages = first;
+		rep->ready_pg = first;
+		rep->waiting_pg = rfp->max_pgno + 1;
+		rep->max_wait_pg = PGNO_INVALID;
+		ret = __rep_pggap_req(env, rep, rfp, 0);
+		return (ret);
+	}
+	/*
+	 * max_pgno == last
+	 * If we get here, we have all the pages we need.
+	 * Close the dbp and return.
+	 * The same path is taken for an empty queue and for any error
+	 * above; a zero result is converted to DB_REP_PAGEDONE.
+	 */
+out:
+	if (rep->queue_dbc != NULL &&
+	    (t_ret = __dbc_close(rep->queue_dbc)) != 0 && ret == 0)
+		ret = t_ret;
+	rep->queue_dbc = NULL;
+
+	if (queue_dbp != NULL &&
+	    (t_ret = __db_close(queue_dbp, NULL, DB_NOSYNC)) != 0 && ret == 0)
+		ret = t_ret;
+	if (ret == 0)
+		ret = DB_REP_PAGEDONE;
+	return (ret);
+#endif
+}
+
+/*
+ * Remove the internal init file, if this configuration uses one. The
+ * result of the unlink itself is ignored; only a failure to build the
+ * file's path name is reported.
+ *
+ * PUBLIC: int __rep_remove_init_file __P((ENV *));
+ */
+int
+__rep_remove_init_file(env)
+	ENV *env;
+{
+	DB_REP *db_rep;
+	REP *rep;
+	int ret;
+	char *name;
+
+	db_rep = env->rep_handle;
+	rep = db_rep->region;
+
+	/*
+	 * If running in-memory replication, return without any file
+	 * operations.
+	 */
+	if (FLD_ISSET(rep->config, REP_C_INMEM))
+		return (0);
+
+	/* Abbreviated internal init doesn't use an init file. */
+	if (F_ISSET(rep, REP_F_ABBREVIATED))
+		return (0);
+
+	if ((ret = __db_appname(env,
+	    DB_APP_NONE, REP_INITNAME, NULL, &name)) != 0)
+		return (ret);
+	(void)__os_unlink(env, name, 0);
+	__os_free(env, name);
+	return (0);
+}
+
+/*
+ * Checks for the existence of the internal init flag file. If it exists, we
+ * remove all logs and databases, and then remove the flag file. This is
+ * intended to force the internal init to start over again, and thus affords
+ * protection against a client crashing during internal init. This function
+ * must be called before normal recovery in order to be properly effective.
+ *
+ * !!!
+ * This function should only be called during initial set-up of the environment,
+ * before various subsystems are initialized. It doesn't rely on the
+ * subsystems' code having been initialized, and it summarily deletes files "out
+ * from under" them, which might disturb the subsystems if they were up.
+ * + * PUBLIC: int __rep_reset_init __P((ENV *)); + */ +int +__rep_reset_init(env) + ENV *env; +{ + DB_FH *fhp; + __rep_update_args *rup; + DBT dbt; + char *allocated_dir, *dir, *init_name; + size_t cnt; + u_int32_t dbtvers, fvers, zero; + u_int8_t *next; + int ret, t_ret; + + allocated_dir = NULL; + rup = NULL; + dbt.data = NULL; + + if ((ret = __db_appname(env, + DB_APP_NONE, REP_INITNAME, NULL, &init_name)) != 0) + return (ret); + + if ((ret = __os_open( + env, init_name, 0, DB_OSO_RDONLY, DB_MODE_600, &fhp)) != 0) { + if (ret == ENOENT) + ret = 0; + goto out; + } + + RPRINT(env, DB_VERB_REP_SYNC, + (env, "Cleaning up interrupted internal init")); + + /* There are a few possibilities: + * 1. no init file, or less than 1 full file list + * 2. exactly one full file list + * 3. more than one, less then a second full file list + * 4. second file list in full + * + * In cases 2 or 4, we need to remove all logs, and then remove files + * according to the (most recent) file list. (In case 1 or 3, we don't + * have to do anything.) + * + * The __rep_get_file_list function takes care of folding these cases + * into two simple outcomes. + * + * As of 4.7, the first 4 bytes are 0. Read the first 4 bytes now. + * If they are non-zero it means we have an old-style init file. + * Otherwise, pass the file version in to rep_get_file_list. + */ + if ((ret = __os_read(env, fhp, &zero, sizeof(zero), &cnt)) != 0) + goto out; + /* + * If we read successfully, but not enough, then unlink the file. + */ + if (cnt != sizeof(zero)) + goto rm; + if (zero != 0) { + /* + * Old style file. We have to set fvers to the 4.6 + * version of the file and also rewind the file so + * that __rep_get_file_list can read out the length itself. 
+ */ + if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0) + goto out; + fvers = REP_INITVERSION_46; + } else if ((ret = __os_read(env, + fhp, &fvers, sizeof(fvers), &cnt)) != 0) + goto out; + else if (cnt != sizeof(fvers)) + goto rm; + ret = __rep_get_file_list(env, fhp, fvers, &dbtvers, &dbt); + if ((t_ret = __os_closehandle(env, fhp)) != 0 || ret != 0) { + if (ret == 0) + ret = t_ret; + goto out; + } + if (dbt.data == NULL) { + /* + * The init file did not end with an intact file list. Since we + * never start log/db removal without an intact file list + * sync'ed to the init file, this must mean we don't have any + * partial set of files to clean up. So all we need to do is + * remove the init file. + */ + goto rm; + } + + /* Remove all log files. */ + if (env->dbenv->db_log_dir == NULL) + dir = env->db_home; + else { + if ((ret = __db_appname(env, + DB_APP_NONE, env->dbenv->db_log_dir, NULL, &dir)) != 0) + goto out; + allocated_dir = dir; + } + + if ((ret = __rep_remove_by_prefix(env, + dir, LFPREFIX, sizeof(LFPREFIX)-1, DB_APP_LOG)) != 0) + goto out; + + /* + * Remove databases according to the list, and queue extent files by + * searching them out on a walk through the data_dir's. + */ + if ((ret = __rep_update_unmarshal(env, dbtvers, + &rup, dbt.data, dbt.size, &next)) != 0) + goto out; + if ((ret = __rep_unlink_by_list(env, dbtvers, + next, dbt.size, rup->num_files)) != 0) + goto out; + + /* Here, we've established that the file exists. */ +rm: (void)__os_unlink(env, init_name, 0); +out: if (rup != NULL) + __os_free(env, rup); + if (allocated_dir != NULL) + __os_free(env, allocated_dir); + if (dbt.data != NULL) + __os_free(env, dbt.data); + + __os_free(env, init_name); + return (ret); +} + +/* + * Reads the last fully intact file list from the init file. If the file ends + * with a partial list (or is empty), we're not interested in it. Lack of a + * full file list is indicated by a NULL dbt->data. 
On success, the list is + * returned in allocated space, which becomes the responsibility of the caller. + * + * The file format is a u_int32_t buffer length, in native format, followed by + * the file list itself, in the same format as in an UPDATE message (though + * many parts of it in this case are meaningless). + */ +static int +__rep_get_file_list(env, fhp, fvers, dbtvers, dbt) + ENV *env; + DB_FH *fhp; + u_int32_t fvers; + u_int32_t *dbtvers; + DBT *dbt; +{ + u_int32_t length, mvers; + size_t cnt; + int i, ret; + + /* At most 2 file lists: old and new. */ + dbt->data = NULL; + mvers = DB_REPVERSION_46; + length = 0; + for (i = 1; i <= 2; i++) { + if (fvers >= REP_INITVERSION_47) { + if ((ret = __os_read(env, fhp, &mvers, + sizeof(mvers), &cnt)) != 0) + goto err; + if (cnt == 0 && dbt->data != NULL) + break; + if (cnt != sizeof(mvers)) + goto err; + } + if ((ret = __os_read(env, + fhp, &length, sizeof(length), &cnt)) != 0) + goto err; + + /* + * Reaching the end here is fine, if we've been through at least + * once already. + */ + if (cnt == 0 && dbt->data != NULL) + break; + if (cnt != sizeof(length)) + goto err; + + if ((ret = __os_realloc(env, + (size_t)length, &dbt->data)) != 0) + goto err; + + if ((ret = __os_read( + env, fhp, dbt->data, length, &cnt)) != 0 || + cnt != (size_t)length) + goto err; + } + + *dbtvers = mvers; + dbt->size = length; + return (0); + +err: + /* + * Note that it's OK to get here with a zero value in 'ret': it means we + * read less than we expected, and dbt->data == NULL indicates to the + * caller that we don't have an intact list. + */ + if (dbt->data != NULL) + __os_free(env, dbt->data); + dbt->data = NULL; + return (ret); +} + +/* + * Removes every file in a given directory that matches a given prefix. Notice + * how similar this is to __rep_walk_dir. 
+ */ +static int +__rep_remove_by_prefix(env, dir, prefix, pref_len, appname) + ENV *env; + const char *dir; + const char *prefix; + size_t pref_len; + APPNAME appname; /* What kind of name. */ +{ + char *namep, **names; + int cnt, i, ret; + + if ((ret = __os_dirlist(env, dir, 0, &names, &cnt)) != 0) + return (ret); + for (i = 0; i < cnt; i++) { + if (strncmp(names[i], prefix, pref_len) == 0) { + if ((ret = __db_appname(env, + appname, names[i], NULL, &namep)) != 0) + goto out; + (void)__os_unlink(env, namep, 0); + __os_free(env, namep); + } + } +out: __os_dirfree(env, names, cnt); + return (ret); +} + +/* + * Removes database files according to the contents of a list. + * + * This function must support removal either during environment creation, or + * when an internal init is reset in the middle. This means it must work + * regardless of whether underlying subsystems are initialized. However, it may + * assume that databases are not open. That means there is no REP! + */ +static int +__rep_unlink_by_list(env, version, filelist, filesz, count) + ENV *env; + u_int32_t version; + u_int8_t *filelist; + u_int32_t filesz; + u_int32_t count; +{ + DB_ENV *dbenv; + __rep_fileinfo_args *rfp; + char **ddir, *dir, *namep; + u_int8_t *new_fp; + int ret; + + dbenv = env->dbenv; + ret = 0; + rfp = NULL; + while (count-- > 0) { + if ((ret = __rep_fileinfo_unmarshal(env, version, + &rfp, filelist, filesz, &new_fp)) != 0) + goto out; + filesz -= (u_int32_t)(new_fp - filelist); + filelist = new_fp; + + if ((ret = __db_appname(env, + DB_APP_DATA, rfp->info.data, NULL, &namep)) != 0) + goto out; + (void)__os_unlink(env, namep, 0); + __os_free(env, namep); + __os_free(env, rfp); + rfp = NULL; + } + + /* Notice how similar this code is to __rep_find_dbs. 
*/ + if (dbenv->db_data_dir == NULL) + ret = __rep_remove_by_prefix(env, env->db_home, + QUEUE_EXTENT_PREFIX, sizeof(QUEUE_EXTENT_PREFIX) - 1, + DB_APP_DATA); + else { + for (ddir = dbenv->db_data_dir; *ddir != NULL; ++ddir) { + if ((ret = __db_appname(env, + DB_APP_NONE, *ddir, NULL, &dir)) != 0) + break; + ret = __rep_remove_by_prefix(env, dir, + QUEUE_EXTENT_PREFIX, sizeof(QUEUE_EXTENT_PREFIX)-1, + DB_APP_DATA); + __os_free(env, dir); + if (ret != 0) + break; + } + } + +out: + if (rfp != NULL) + __os_free(env, rfp); + return (ret); +} + +static int +__rep_remove_by_list(env, version, filelist, filesz, count) + ENV *env; + u_int32_t version; + u_int8_t *filelist; + u_int32_t filesz; + u_int32_t count; +{ + __rep_fileinfo_args *rfp; + u_int8_t *new_fp; + int ret; + + ret = 0; + rfp = NULL; + while (count-- > 0) { + if ((ret = __rep_fileinfo_unmarshal(env, version, + &rfp, filelist, filesz, &new_fp)) != 0) + break; + filesz -= (u_int32_t)(new_fp - filelist); + filelist = new_fp; + + if ((ret = __rep_remove_file(env, rfp->uid.data, + rfp->info.data, rfp->type, rfp->db_flags)) != 0) { + /* + * If the file already doesn't exist, that's perfectly + * OK. This can easily happen if we're cleaning up an + * interrupted internal init, and we only got part-way + * through the list of files. + */ + if (ret == ENOENT) + ret = 0; + else + break; + } + __os_free(env, rfp); + rfp = NULL; + } + + if (rfp != NULL) + __os_free(env, rfp); + return (ret); +} diff --git a/rep/rep_elect.c b/rep/rep_elect.c new file mode 100644 index 0000000..61f79e4 --- /dev/null +++ b/rep/rep_elect.c @@ -0,0 +1,1353 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2004-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_am.h" +#include "dbinc/log.h" + +/* + * We need to check sites == nsites, not more than half + * like we do in __rep_elect and the VOTE2 code. 
The
+ * reason is that we want to process all the incoming votes
+ * and not short-circuit once we reach more than half.  The
+ * real winner's vote may be in the last half.
+ *
+ * As written, the macro tests sites >= nsites and additionally requires
+ * that the current winner have a priority greater than 0.
+ */
+#define	IS_PHASE1_DONE(rep)						\
+	((rep)->sites >= (rep)->nsites && (rep)->w_priority > 0)
+
+#define	I_HAVE_WON(rep, winner)						\
+	((rep)->votes >= (rep)->nvotes && winner == (rep)->eid)
+
+static void __rep_cmp_vote __P((ENV *, REP *, int, DB_LSN *,
+    u_int32_t, u_int32_t, u_int32_t, u_int32_t));
+static int __rep_elect_init
+	       __P((ENV *, u_int32_t, u_int32_t, int *, u_int32_t *));
+static int __rep_fire_elected __P((ENV *, REP *, u_int32_t));
+static void __rep_elect_master __P((ENV *, REP *));
+static int __rep_grow_sites __P((ENV *, u_int32_t));
+static int __rep_tally __P((ENV *, REP *, int, u_int32_t *, u_int32_t, int));
+static int __rep_wait __P((ENV *, db_timeout_t *, int, u_int32_t, u_int32_t));
+
+/*
+ * __rep_elect_pp --
+ *	Called after master failure to hold/participate in an election for
+ *	a new master.
+ *
+ *	Validates the call and then hands off to __rep_elect_int.  EINVAL is
+ *	returned if the caller is a Replication Manager application, if no
+ *	transport function has been set, or if leases are configured and
+ *	given_nsites is non-zero.
+ *
+ * PUBLIC: int __rep_elect_pp
+ * PUBLIC:     __P((DB_ENV *, u_int32_t, u_int32_t, u_int32_t));
+ */
+int
+__rep_elect_pp(dbenv, given_nsites, nvotes, flags)
+	DB_ENV *dbenv;
+	u_int32_t given_nsites, nvotes;
+	u_int32_t flags;
+{
+	DB_REP *db_rep;
+	ENV *env;
+	int ret;
+
+	env = dbenv->env;
+	db_rep = env->rep_handle;
+	ret = 0;
+
+	ENV_REQUIRES_CONFIG_XX(
+	    env, rep_handle, "DB_ENV->rep_elect", DB_INIT_REP);
+
+	if (APP_IS_REPMGR(env)) {
+		__db_errx(env,
+"DB_ENV->rep_elect: cannot call from Replication Manager application");
+		return (EINVAL);
+	}
+
+	/* We need a transport function because we send messages. */
+	if (db_rep->send == NULL) {
+		__db_errx(env,
+    "DB_ENV->rep_elect: must be called after DB_ENV->rep_set_transport");
+		return (EINVAL);
+	}
+
+	if (IS_USING_LEASES(env) && given_nsites != 0) {
+		__db_errx(env,
+	    "DB_ENV->rep_elect: nsites must be zero if leases configured");
+		return (EINVAL);
+	}
+
+	ret = __rep_elect_int(env, given_nsites, nvotes, flags);
+
+	return (ret);
+}
+
+/*
+ * __rep_elect_int --
+ *	Internal processing to hold/participate in an election for
+ *	a new master after master failure.
+ *
+ *	given_nsites: number of sites; 0 means use rep->config_nsites.
+ *	nvotes: votes required to win; 0 means ELECTION_MAJORITY(nsites).
+ *	flags: not used.
+ *
+ *	Returns EINVAL if the vote threshold is larger than nsites, and
+ *	DB_REP_UNAVAIL if phase 1 ends without enough votes to pick a winner
+ *	or phase 2 times out without this site having won.  If this site is
+ *	elected, the return value is that of __rep_fire_elected.
+ *
+ * PUBLIC: int __rep_elect_int
+ * PUBLIC:     __P((ENV *, u_int32_t, u_int32_t, u_int32_t));
+ */
+int
+__rep_elect_int(env, given_nsites, nvotes, flags)
+	ENV *env;
+	u_int32_t given_nsites, nvotes;
+	u_int32_t flags;
+{
+	DB_LOG *dblp;
+	DB_LSN lsn;
+	DB_REP *db_rep;
+	DB_THREAD_INFO *ip;
+	LOG *lp;
+	REP *rep;
+	int done, elected, full_elect, in_progress, locked, need_req;
+	int ret, send_vote, t_ret;
+	u_int32_t ack, ctlflags, egen, nsites, orig_tally, priority, realpri;
+	u_int32_t repflags, tiebreaker;
+	db_timeout_t last_to, timeout, to;
+
+	COMPQUIET(flags, 0);
+	COMPQUIET(egen, 0);
+
+	db_rep = env->rep_handle;
+	rep = db_rep->region;
+	dblp = env->lg_handle;
+	lp = dblp->reginfo.primary;
+	elected = 0;
+
+	/*
+	 * Specifying 0 for nsites signals us to use the value configured
+	 * previously via rep_set_nsites.  Similarly, if the given nvotes is 0,
+	 * it asks us to compute the value representing a simple majority.
+	 */
+	nsites = given_nsites == 0 ? rep->config_nsites : given_nsites;
+	ack = nvotes == 0 ? ELECTION_MAJORITY(nsites) : nvotes;
+	locked = 0;
+
+	/*
+	 * XXX
+	 * If users give us less than a majority, they run the risk of
+	 * having a network partition.  However, this also allows the
+	 * scenario of master/1 client to elect the client.  Allow
+	 * sub-majority values, but give a warning.
+	 */
+	if (ack <= (nsites / 2)) {
+		__db_errx(env,
+    "DB_ENV->rep_elect:WARNING: nvotes (%d) is sub-majority with nsites (%d)",
+		    nvotes, nsites);
+	}
+
+	if (nsites < ack) {
+		__db_errx(env,
+    "DB_ENV->rep_elect: nvotes (%d) is larger than nsites (%d)",
+		    ack, nsites);
+		return (EINVAL);
+	}
+
+	/*
+	 * Default to the normal timeout unless the user configured
+	 * a full election timeout and we think we need a full election.
+	 */
+	full_elect = 0;
+	timeout = rep->elect_timeout;
+	if (!F_ISSET(rep, REP_F_GROUP_ESTD) && rep->full_elect_timeout != 0) {
+		full_elect = 1;
+		timeout = rep->full_elect_timeout;
+	}
+	realpri = rep->priority;
+
+	RPRINT(env, DB_VERB_REP_ELECT,
+	    (env, "Start election nsites %d, ack %d, priority %d",
+	    nsites, ack, realpri));
+
+	/*
+	 * Special case when having an election while running with
+	 * sites of potentially mixed versions.  We set a bit indicating
+	 * we're an electable site, but set our priority to 0.
+	 * Old sites will never elect us, with 0 priority, but if all
+	 * we have are new sites, then we can elect the best electable
+	 * site of the group.
+	 * Thus 'priority' is this special, possibly-fake, effective
+	 * priority that we'll use for this election, while 'realpri' is our
+	 * real, configured priority, as retrieved from REP region.
+	 */
+	ctlflags = realpri != 0 ? REPCTL_ELECTABLE : 0;
+	ENV_ENTER(env, ip);
+
+	orig_tally = 0;
+	if ((ret = __rep_elect_init(env, nsites, ack,
+	    &in_progress, &orig_tally)) != 0) {
+		if (ret == DB_REP_NEWMASTER)
+			ret = 0;
+		goto err;
+	}
+	/*
+	 * If another thread is in the middle of an election we
+	 * just quietly return and not interfere.
+	 */
+	if (in_progress)
+		goto edone;
+
+	priority = lp->persist.version != DB_LOGVERSION ? 0 : realpri;
+#ifdef	CONFIG_TEST
+	/*
+	 * This allows us to unit test the ELECTABLE flag simply by
+	 * using the priority values.
+	 */
+	if (priority > 0 && priority <= 5) {
+		RPRINT(env, DB_VERB_REP_ELECT, (env,
+	"Artificially setting priority 0 (ELECTABLE) for CONFIG_TEST mode"));
+		DB_ASSERT(env, ctlflags == REPCTL_ELECTABLE);
+		priority = 0;
+	}
+#endif
+	__os_gettime(env, &rep->etime, 1);
+	REP_SYSTEM_LOCK(env);
+	/*
+	 * If leases are configured, wait for them to expire, and
+	 * see if we can discover the master while waiting.
+	 */
+	if (IS_USING_LEASES(env)) {
+		to = __rep_lease_waittime(env);
+		if (to != 0) {
+			F_SET(rep, REP_F_EPHASE0);
+			REP_SYSTEM_UNLOCK(env);
+			(void)__rep_send_message(env, DB_EID_BROADCAST,
+			    REP_MASTER_REQ, NULL, NULL, 0, 0);
+			ret = __rep_wait(env, &to, 0, rep->egen, REP_F_EPHASE0);
+			REP_SYSTEM_LOCK(env);
+			repflags = rep->flags;
+			F_CLR(rep, REP_F_EPHASE0);
+			switch (ret) {
+			/*
+			 * If waiting is successful, our flag is cleared
+			 * and the master responded.  We're done.
+			 */
+			case DB_REP_EGENCHG:
+			case 0:
+				REP_SYSTEM_UNLOCK(env);
+				goto edone;
+			/*
+			 * If we get a timeout, continue with the election.
+			 */
+			case DB_TIMEOUT:
+				/*
+				 * We have waited a full lease timeout.  We
+				 * need to check now under lock to verify that
+				 * the phase was not over and that the client
+				 * did not grant the lease.  If either happened
+				 * between the time the wait finished and we
+				 * reacquired the mutex, we're done.
+				 */
+				if (!FLD_ISSET(repflags, REP_F_EPHASE0) ||
+				    __rep_islease_granted(env) != 0) {
+					ret = 0;
+					REP_SYSTEM_UNLOCK(env);
+					goto edone;
+				}
+				F_SET(rep, REP_F_LEASE_EXPIRED);
+				break;
+			default:
+				goto lockdone;
+			}
+		}
+	}
+	/*
+	 * We need to lockout applying incoming log records during
+	 * the election.  We need to use a special rep_lockout_apply
+	 * instead of rep_lockout_msg because we do not want to
+	 * lockout all incoming messages, like other VOTEs!
+	 */
+	if ((ret = __rep_lockout_apply(env, rep, 0)) != 0)
+		goto lockdone;
+	locked = 1;
+	last_to = to = timeout;
+	REP_SYSTEM_UNLOCK(env);
+restart:
+	/* Generate a randomized tiebreaker value. */
+	__os_unique_id(env, &tiebreaker);
+	LOG_SYSTEM_LOCK(env);
+	lsn = lp->lsn;
+	LOG_SYSTEM_UNLOCK(env);
+	REP_SYSTEM_LOCK(env);
+
+	F_SET(rep, REP_F_EPHASE1 | REP_F_NOARCHIVE);
+	F_CLR(rep, REP_F_TALLY);
+	/*
+	 * We made sure that leases were expired before starting the
+	 * election, but an existing master may be slow in responding.
+	 * If, during lockout, acquiring mutexes, etc, the client has now
+	 * re-granted its lease, we're done - a master exists.
+	 */
+	if (IS_USING_LEASES(env) &&
+	    __rep_islease_granted(env)) {
+		ret = 0;
+		goto lockdone;
+	}
+
+	/*
+	 * If we are in the middle of recovering or internal
+	 * init, we participate, but we set our priority to 0
+	 * and turn off REPCTL_ELECTABLE.  We *cannot* use the
+	 * REP_F_RECOVER_MASK macro because we must explicitly
+	 * exclude REP_F_RECOVER_VERIFY.  If we are in verify
+	 * then that is okay, we can be elected (i.e. we are not
+	 * in an inconsistent state).
+	 */
+	if (F_ISSET(rep, REP_F_READY_API | REP_F_READY_OP |
+	    REP_F_RECOVER_LOG | REP_F_RECOVER_PAGE | REP_F_RECOVER_UPDATE)) {
+		RPRINT(env, DB_VERB_REP_ELECT, (env,
+       "Setting priority 0, unelectable, due to internal init/recovery"));
+		priority = 0;
+		ctlflags = 0;
+	}
+
+	/*
+	 * We are about to participate at this egen.  We must
+	 * write out the next egen before participating in this one
+	 * so that if we crash we can never participate in this egen
+	 * again.
+	 */
+	if ((ret = __rep_write_egen(env, rep, rep->egen + 1)) != 0)
+		goto lockdone;
+
+	/* Tally our own vote */
+	if (__rep_tally(env, rep, rep->eid, &rep->sites, rep->egen, 1) != 0) {
+		ret = EINVAL;
+		goto lockdone;
+	}
+	__rep_cmp_vote(env, rep, rep->eid, &lsn, priority, rep->gen,
+	    tiebreaker, ctlflags);
+
+	RPRINT(env, DB_VERB_REP_ELECT, (env, "Beginning an election"));
+
+	/* Now send vote */
+	send_vote = DB_EID_INVALID;
+	egen = rep->egen;
+	done = IS_PHASE1_DONE(rep);
+	REP_SYSTEM_UNLOCK(env);
+	__rep_send_vote(env, &lsn, nsites, ack, priority, tiebreaker, egen,
+	    DB_EID_BROADCAST, REP_VOTE1, ctlflags);
+	DB_ENV_TEST_RECOVERY(env, DB_TEST_ELECTVOTE1, ret, NULL);
+	if (done) {
+		REP_SYSTEM_LOCK(env);
+		goto vote;
+	}
+	last_to = to;
+	ret = __rep_wait(env, &to, full_elect, egen, REP_F_EPHASE1);
+	switch (ret) {
+		case 0:
+			/* Check if election complete or phase complete. */
+			if (!IN_ELECTION(rep)) {
+				RPRINT(env, DB_VERB_REP_ELECT,
+				    (env, "Ended election phase 1"));
+				goto edone;
+			}
+			goto phase2;
+		case DB_REP_EGENCHG:
+			/*
+			 * Pick up reducing our timeout where we last
+			 * left off.
+			 */
+			if (to > last_to)
+				to = last_to;
+			to = (to * 8) / 10;
+			RPRINT(env, DB_VERB_REP_ELECT, (env,
+"Egen changed while waiting. Now %lu.  New timeout %lu, orig timeout %lu",
+			    (u_long)rep->egen, (u_long)to, (u_long)timeout));
+			/*
+			 * If the egen changed while we were sleeping, that
+			 * means we're probably late to the next election,
+			 * so we'll backoff our timeout so that we don't get
+			 * into an out-of-phase election scenario.
+			 *
+			 * Backoff to 80% of the current timeout.
+			 */
+			goto restart;
+		case DB_TIMEOUT:
+			break;
+		default:
+			goto err;
+	}
+
+	REP_SYSTEM_LOCK(env);
+	/*
+	 * If we got here, we haven't heard from everyone, but we've
+	 * run out of time, so it's time to decide if we have enough
+	 * votes to pick a winner and if so, to send out a vote to
+	 * the winner.
+	 *
+	 * Check the state of the world after reacquiring the mutex.
+	 * See if the election actually finished anyway.
+	 */
+	if (!IN_ELECTION(rep)) {
+		RPRINT(env, DB_VERB_REP_ELECT,
+		    (env, "Ended election after acquiring mutex"));
+		ret = 0;
+		goto lockdone;
+	}
+	/*
+	 * If our egen changed while we were waiting, we need to
+	 * essentially reinitialize our election.
+	 */
+	if (egen != rep->egen) {
+		REP_SYSTEM_UNLOCK(env);
+		RPRINT(env, DB_VERB_REP_ELECT,
+		    (env, "Egen changed from %lu to %lu",
+		    (u_long)egen, (u_long)rep->egen));
+		goto restart;
+	}
+	if (rep->sites >= rep->nvotes) {
+vote:
+		/* We think we've seen enough to cast a vote. */
+		send_vote = rep->winner;
+		/*
+		 * See if we won.  This will make sure we
+		 * don't count ourselves twice if we're racing
+		 * with incoming votes.
+		 */
+		if (rep->winner == rep->eid) {
+			(void)__rep_tally(env, rep, rep->eid, &rep->votes,
+			    egen, 2);
+			RPRINT(env, DB_VERB_REP_ELECT, (env,
+			    "Counted my vote %d", rep->votes));
+		}
+		F_SET(rep, REP_F_EPHASE2);
+		F_CLR(rep, REP_F_EPHASE1);
+	}
+	REP_SYSTEM_UNLOCK(env);
+	if (send_vote == DB_EID_INVALID) {
+		/* We do not have enough votes to elect. */
+		if (rep->sites >= rep->nvotes)
+			__db_errx(env,
+	"No electable site found: recvd %d of %d votes from %d sites",
+			    rep->sites, rep->nvotes, rep->nsites);
+		else
+			__db_errx(env,
+	"Not enough votes to elect: recvd %d of %d from %d sites",
+			    rep->sites, rep->nvotes, rep->nsites);
+		ret = DB_REP_UNAVAIL;
+		goto err;
+	}
+
+	/*
+	 * We have seen enough vote1's.  Now we need to wait
+	 * for all the vote2's.
+	 */
+	if (send_vote != rep->eid) {
+		RPRINT(env, DB_VERB_REP_ELECT, (env, "Sending vote"));
+		__rep_send_vote(env, NULL, 0, 0, 0, 0, egen,
+		    send_vote, REP_VOTE2, 0);
+		/*
+		 * If we are NOT the new master we want to send
+		 * our vote to the winner, and wait longer.  The
+		 * reason is that the winner may be "behind" us
+		 * in the election waiting and if the master is
+		 * down, the winner will wait the full timeout
+		 * and we want to give the winner enough time to
+		 * process all the votes.  Otherwise we could
+		 * incorrectly return DB_REP_UNAVAIL and start a
+		 * new election before the winner can declare
+		 * itself.
+		 */
+		to = to * 2;
+	}
+
+phase2:
+	if (I_HAVE_WON(rep, rep->winner)) {
+		RPRINT(env, DB_VERB_REP_ELECT, (env,
+		    "Skipping phase2 wait: already got %d votes", rep->votes));
+		REP_SYSTEM_LOCK(env);
+		goto i_won;
+	}
+	/*
+	 * Don't set last_to to 'to' here because we may have adjusted
+	 * it above.  If egen changes we want to pick up reducing the
+	 * timeout from the point we were above.
+	 */
+	ret = __rep_wait(env, &to, full_elect, egen, REP_F_EPHASE2);
+	RPRINT(env, DB_VERB_REP_ELECT,
+	    (env, "Ended election phase 2 %d", ret));
+	switch (ret) {
+		case 0:
+			goto edone;
+		case DB_REP_EGENCHG:
+			if (to > last_to)
+				to = last_to;
+			to = (to * 8) / 10;
+			RPRINT(env, DB_VERB_REP_ELECT, (env,
+"While waiting egen changed to %lu.  Phase 2 New timeout %lu, orig timeout %lu",
+			    (u_long)rep->egen,
+			    (u_long)to, (u_long)timeout));
+			goto restart;
+		case DB_TIMEOUT:
+			ret = DB_REP_UNAVAIL;
+			break;
+		default:
+			goto err;
+	}
+	REP_SYSTEM_LOCK(env);
+	/*
+	 * Check the state of the world after reacquiring the mutex.
+	 * See if the election actually finished anyway.
+	 */
+	if (!IN_ELECTION(rep)) {
+		RPRINT(env, DB_VERB_REP_ELECT,
+		    (env, "Ended election phase 2 after acquiring mutex"));
+		ret = 0;
+		goto lockdone;
+	}
+	if (egen != rep->egen) {
+		REP_SYSTEM_UNLOCK(env);
+		RPRINT(env, DB_VERB_REP_ELECT, (env,
+		    "Egen ph2 changed from %lu to %lu",
+		    (u_long)egen, (u_long)rep->egen));
+		goto restart;
+	}
+	RPRINT(env, DB_VERB_REP_ELECT, (env,
+	    "After phase 2: votes %d, nvotes %d, nsites %d",
+	    rep->votes, rep->nvotes, rep->nsites));
+	if (I_HAVE_WON(rep, rep->winner)) {
+i_won:		__rep_elect_master(env, rep);
+		ret = 0;
+		elected = 1;
+	}
+	if (0) {
+err:		REP_SYSTEM_LOCK(env);
+	}
+lockdone:
+	/*
+	 * If we get here because of a non-election error, then we
+	 * did not tally our vote.  The only non-election error is
+	 * from elect_init where we were unable to grow_sites.  In
+	 * that case we do not want to discard all known election info.
+	 *
+	 * The "if (0)" block above exists only to hold the err label:
+	 * code that jumps to err does not hold the REP system lock and
+	 * takes it there, while code that falls through or jumps to
+	 * lockdone already holds it.
+	 */
+	if (ret == 0 || ret == DB_REP_UNAVAIL)
+		__rep_elect_done(env, rep, 0);
+	else if (orig_tally)
+		F_SET(rep, orig_tally);
+
+	/*
+	 * If the election finished elsewhere, we need to clear
+	 * the elect flag anyway.  As with err, jumps to edone arrive
+	 * without the lock and take it inside the "if (0)" block.
+	 */
+	if (0) {
+edone:		REP_SYSTEM_LOCK(env);
+	}
+	F_CLR(rep, REP_F_INREPELECT);
+	if (locked) {
+		need_req = F_ISSET(rep, REP_F_SKIPPED_APPLY) &&
+		    !I_HAVE_WON(rep, rep->winner);
+		F_CLR(rep, REP_F_READY_APPLY | REP_F_SKIPPED_APPLY);
+		REP_SYSTEM_UNLOCK(env);
+		/*
+		 * If we skipped any log records, request them now.
+		 */
+		if (need_req && (t_ret = __rep_resend_req(env, 0)) != 0 &&
+		    ret == 0)
+			ret = t_ret;
+	} else
+		REP_SYSTEM_UNLOCK(env);
+
+	if (elected)
+		ret = __rep_fire_elected(env, rep, egen);
+
+	RPRINT(env, DB_VERB_REP_ELECT, (env,
+	    "Ended election with %d, sites %d, egen %lu, flags 0x%lx",
+	    ret, rep->sites, (u_long)rep->egen, (u_long)rep->flags));
+
+DB_TEST_RECOVERY_LABEL
+	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __rep_vote1 --
+ *	Handle incoming vote1 message on a client.
+ * + * PUBLIC: int __rep_vote1 __P((ENV *, __rep_control_args *, DBT *, int)); + */ +int +__rep_vote1(env, rp, rec, eid) + ENV *env; + __rep_control_args *rp; + DBT *rec; + int eid; +{ + DBT data_dbt; + DB_LOG *dblp; + DB_LSN lsn; + DB_REP *db_rep; + LOG *lp; + REP *rep; + REP_OLD_VOTE_INFO *ovi; + __rep_egen_args egen_arg; + __rep_vote_info_args tmpvi, *vi; + u_int32_t egen; + int elected, inelect, master, ret; + u_int8_t buf[__REP_MAXMSG_SIZE]; + size_t len; + + COMPQUIET(egen, 0); + + elected = ret = 0; + db_rep = env->rep_handle; + rep = db_rep->region; + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + + if (F_ISSET(rep, REP_F_MASTER)) { + RPRINT(env, DB_VERB_REP_ELECT, (env, "Master received vote")); + LOG_SYSTEM_LOCK(env); + lsn = lp->lsn; + LOG_SYSTEM_UNLOCK(env); + (void)__rep_send_message(env, + DB_EID_BROADCAST, REP_NEWMASTER, &lsn, NULL, 0, 0); + return (ret); + } + + /* + * In 4.7 we changed to having fixed sized u_int32_t's from + * non-fixed 'int' fields in the vote structure. + */ + if (rp->rep_version < DB_REPVERSION_47) { + ovi = (REP_OLD_VOTE_INFO *)rec->data; + tmpvi.egen = ovi->egen; + tmpvi.nsites = (u_int32_t)ovi->nsites; + tmpvi.nvotes = (u_int32_t)ovi->nvotes; + tmpvi.priority = (u_int32_t)ovi->priority; + tmpvi.tiebreaker = ovi->tiebreaker; + } else + if ((ret = __rep_vote_info_unmarshal(env, + &tmpvi, rec->data, rec->size, NULL)) != 0) + return (ret); + vi = &tmpvi; + REP_SYSTEM_LOCK(env); + + /* + * If we get a vote from a later election gen, we + * clear everything from the current one, and we'll + * start over by tallying it. If we get an old vote, + * send an ALIVE to the old participant. 
+ */ + RPRINT(env, DB_VERB_REP_ELECT, + (env, "Received vote1 egen %lu, egen %lu", + (u_long)vi->egen, (u_long)rep->egen)); + if (vi->egen < rep->egen) { + RPRINT(env, DB_VERB_REP_ELECT, (env, + "Received old vote %lu, egen %lu, ignoring vote1", + (u_long)vi->egen, (u_long)rep->egen)); + egen_arg.egen = rep->egen; + REP_SYSTEM_UNLOCK(env); + if (rep->version < DB_REPVERSION_47) + DB_INIT_DBT(data_dbt, &egen_arg.egen, + sizeof(egen_arg.egen)); + else { + if ((ret = __rep_egen_marshal(env, + &egen_arg, buf, __REP_EGEN_SIZE, &len)) != 0) + return (ret); + DB_INIT_DBT(data_dbt, buf, len); + } + (void)__rep_send_message(env, + eid, REP_ALIVE, &rp->lsn, &data_dbt, 0, 0); + return (ret); + } + inelect = F_ISSET(rep, REP_F_INREPELECT); + if (vi->egen > rep->egen) { + RPRINT(env, DB_VERB_REP_ELECT, (env, + "Received VOTE1 from egen %lu, my egen %lu; reset", + (u_long)vi->egen, (u_long)rep->egen)); + /* + * Record if we're currently in rep_elect. If so, don't + * return HOLDELECTION because the election thread should + * notice it. However, there is a window where the thread + * could be at the tail end of processing the previous + * election and it would not see this change in egen. + */ + __rep_elect_done(env, rep, 0); + rep->egen = vi->egen; + F_SET(rep, REP_F_EGENUPDATE); + } + + /* + * If this site (sender of the VOTE1) is the first to the party, simply + * initialize values from the message. Otherwise, see if the site knows + * about more sites, and/or requires more votes, than we do. + */ + if (!IN_ELECTION_TALLY(rep)) { + F_SET(rep, REP_F_TALLY); + rep->nsites = vi->nsites; + rep->nvotes = vi->nvotes; + } else { + if (vi->nsites > rep->nsites) + rep->nsites = vi->nsites; + if (vi->nvotes > rep->nvotes) + rep->nvotes = vi->nvotes; + } + + /* + * We are keeping the vote, let's see if that changes our + * count of the number of sites. + */ + if (rep->sites + 1 > rep->nsites) + rep->nsites = rep->sites + 1; + /* + * Ignore vote1's if we're in phase 2. 
+ */ + if (F_ISSET(rep, REP_F_EPHASE2)) { + RPRINT(env, DB_VERB_REP_ELECT, + (env, "In phase 2, ignoring vote1")); + goto err; + } + + /* + * Record this vote. If we get back non-zero, we + * ignore the vote. + */ + if ((ret = __rep_tally(env, rep, eid, &rep->sites, vi->egen, 1)) != 0) { + RPRINT(env, DB_VERB_REP_ELECT, + (env, "Tally returned %d, sites %d", ret, rep->sites)); + ret = 0; + goto err; + } + RPRINT(env, DB_VERB_REP_ELECT, (env, + "Incoming vote: (eid)%d (pri)%lu %s (gen)%lu (egen)%lu [%lu,%lu]", + eid, (u_long)vi->priority, + F_ISSET(rp, REPCTL_ELECTABLE) ? "ELECTABLE" : "", + (u_long)rp->gen, (u_long)vi->egen, + (u_long)rp->lsn.file, (u_long)rp->lsn.offset)); + if (rep->sites > 1) + RPRINT(env, DB_VERB_REP_ELECT, (env, + "Existing vote: (eid)%d (pri)%lu (gen)%lu (sites)%d [%lu,%lu]", + rep->winner, (u_long)rep->w_priority, + (u_long)rep->w_gen, rep->sites, + (u_long)rep->w_lsn.file, + (u_long)rep->w_lsn.offset)); + + __rep_cmp_vote(env, rep, eid, &rp->lsn, vi->priority, + rp->gen, vi->tiebreaker, rp->flags); + /* + * If you get a vote and you're not in an election, we've + * already recorded this vote. But that is all we need + * to do. + */ + if (!IN_ELECTION(rep)) { + RPRINT(env, DB_VERB_REP_ELECT, (env, + "Not in election, but received vote1 0x%x", rep->flags)); + /* + * If we were in the middle of an election and we ended up + * clearing the election out from under the rep_elect caller, + * we want to just return here. + */ + if (inelect) + ret = 0; + else + ret = DB_REP_HOLDELECTION; + goto err; + } + + master = rep->winner; + lsn = rep->w_lsn; + if (IS_PHASE1_DONE(rep)) { + RPRINT(env, DB_VERB_REP_ELECT, (env, "Phase1 election done")); + RPRINT(env, DB_VERB_REP_ELECT, (env, "Voting for %d%s", + master, master == rep->eid ? 
"(self)" : "")); + egen = rep->egen; + F_SET(rep, REP_F_EPHASE2); + F_CLR(rep, REP_F_EPHASE1); + if (master == rep->eid) { + (void)__rep_tally(env, rep, rep->eid, + &rep->votes, egen, 2); + RPRINT(env, DB_VERB_REP_ELECT, (env, + "After phase 1 done: counted vote %d of %d", + rep->votes, rep->nvotes)); + if (I_HAVE_WON(rep, rep->winner)) { + __rep_elect_master(env, rep); + elected = 1; + } + goto err; + } + REP_SYSTEM_UNLOCK(env); + + /* Vote for someone else. */ + __rep_send_vote(env, NULL, 0, 0, 0, 0, egen, + master, REP_VOTE2, 0); + } else +err: REP_SYSTEM_UNLOCK(env); + if (elected) + ret = __rep_fire_elected(env, rep, egen); + return (ret); +} + +/* + * __rep_vote2 -- + * Handle incoming vote2 message on a client. + * + * PUBLIC: int __rep_vote2 __P((ENV *, __rep_control_args *, DBT *, int)); + */ +int +__rep_vote2(env, rp, rec, eid) + ENV *env; + __rep_control_args *rp; + DBT *rec; + int eid; +{ + DB_LOG *dblp; + DB_LSN lsn; + DB_REP *db_rep; + LOG *lp; + REP *rep; + REP_OLD_VOTE_INFO *ovi; + __rep_vote_info_args tmpvi, *vi; + u_int32_t egen; + int ret; + + ret = 0; + db_rep = env->rep_handle; + rep = db_rep->region; + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + + RPRINT(env, DB_VERB_REP_ELECT, (env, "We received a vote%s", + F_ISSET(rep, REP_F_MASTER) ? " (master)" : "")); + if (F_ISSET(rep, REP_F_MASTER)) { + LOG_SYSTEM_LOCK(env); + lsn = lp->lsn; + LOG_SYSTEM_UNLOCK(env); + STAT(rep->stat.st_elections_won++); + (void)__rep_send_message(env, + DB_EID_BROADCAST, REP_NEWMASTER, &lsn, NULL, 0, 0); + if (IS_USING_LEASES(env)) + ret = __rep_lease_refresh(env); + return (ret); + } + + REP_SYSTEM_LOCK(env); + egen = rep->egen; + + /* + * We might be the last to the party and we haven't had + * time to tally all the vote1's, but others have and + * decided we're the winner. So, if we're in the process + * of tallying sites, keep the vote so that when our + * election thread catches up we'll have the votes we + * already received. 
+ */ + /* + * In 4.7 we changed to having fixed sized u_int32_t's from + * non-fixed 'int' fields in the vote structure. + */ + if (rp->rep_version < DB_REPVERSION_47) { + ovi = (REP_OLD_VOTE_INFO *)rec->data; + tmpvi.egen = ovi->egen; + tmpvi.nsites = (u_int32_t)ovi->nsites; + tmpvi.nvotes = (u_int32_t)ovi->nvotes; + tmpvi.priority = (u_int32_t)ovi->priority; + tmpvi.tiebreaker = ovi->tiebreaker; + } else + if ((ret = __rep_vote_info_unmarshal(env, + &tmpvi, rec->data, rec->size, NULL)) != 0) + return (ret); + vi = &tmpvi; + if (!IN_ELECTION_TALLY(rep) && vi->egen >= rep->egen) { + RPRINT(env, DB_VERB_REP_ELECT, (env, + "Not in election gen %lu, at %lu, got vote", + (u_long)vi->egen, (u_long)rep->egen)); + ret = DB_REP_HOLDELECTION; + goto err; + } + + /* + * Record this vote. In a VOTE2, the only valid entry + * in the vote information is the election generation. + * + * There are several things which can go wrong that we + * need to account for: + * 1. If we receive a latent VOTE2 from an earlier election, + * we want to ignore it. + * 2. If we receive a VOTE2 from a site from which we never + * received a VOTE1, we want to record it, because we simply + * may be processing messages out of order or its vote1 got lost, + * but that site got all the votes it needed to send it. + * 3. If we have received a duplicate VOTE2 from this election + * from the same site we want to ignore it. + * 4. If this is from the current election and someone is + * really voting for us, then we finally get to record it. + */ + /* + * Case 1. + */ + if (vi->egen != rep->egen) { + RPRINT(env, DB_VERB_REP_ELECT, + (env, "Bad vote egen %lu. Mine %lu", + (u_long)vi->egen, (u_long)rep->egen)); + ret = 0; + goto err; + } + + /* + * __rep_tally takes care of cases 2, 3 and 4. 
+ */ + if ((ret = __rep_tally(env, rep, eid, &rep->votes, vi->egen, 2)) != 0) { + ret = 0; + goto err; + } + RPRINT(env, DB_VERB_REP_ELECT, (env, "Counted vote %d of %d", + rep->votes, rep->nvotes)); + if (I_HAVE_WON(rep, rep->winner)) { + __rep_elect_master(env, rep); + ret = DB_REP_NEWMASTER; + } + +err: REP_SYSTEM_UNLOCK(env); + if (ret == DB_REP_NEWMASTER) + ret = __rep_fire_elected(env, rep, egen); + return (ret); +} + +/* + * __rep_tally -- + * Handle incoming vote message on a client. Called with the db_rep + * mutex held. This function will return 0 if we successfully tally + * the vote and non-zero if the vote is ignored. This will record + * both VOTE1 and VOTE2 records, depending on which region offset the + * caller passed in. + */ +static int +__rep_tally(env, rep, eid, countp, egen, phase) + ENV *env; + REP *rep; + int eid; + u_int32_t *countp; + u_int32_t egen; + int phase; +{ + REP_VTALLY *tally, *vtp; + u_int32_t i, max_sites; + int ret; + + /* + * The counts are indices, and therefore 0-based. + */ + if ((*countp + 1) > rep->nsites) + max_sites = (*countp + 1); + else + max_sites = rep->nsites; + if (max_sites > rep->asites && + (ret = __rep_grow_sites(env, max_sites)) != 0) { + RPRINT(env, DB_VERB_REP_ELECT, (env, + "Grow sites returned error %d", ret)); + return (ret); + } + if (phase == 1) + tally = R_ADDR(env->reginfo, rep->tally_off); + else + tally = R_ADDR(env->reginfo, rep->v2tally_off); + vtp = &tally[0]; + for (i = 0; i < *countp;) { + /* + * Ignore votes from earlier elections (i.e. we've heard + * from this site in this election, but its vote from an + * earlier election got delayed and we received it now). + * However, if we happened to hear from an earlier vote + * and we recorded it and we're now hearing from a later + * election we want to keep the updated one. Note that + * updating the entry will not increase the count. + * Also ignore votes that are duplicates. 
+ */ + if (vtp->eid == eid) { + RPRINT(env, DB_VERB_REP_ELECT, (env, + "Tally found[%d] (%d, %lu), this vote (%d, %lu)", + i, vtp->eid, (u_long)vtp->egen, + eid, (u_long)egen)); + if (vtp->egen >= egen) + return (1); + else { + vtp->egen = egen; + return (0); + } + } + i++; + vtp = &tally[i]; + } + + /* + * If we get here, we have a new voter we haven't seen before. Tally + * this vote. + */ + RPRINT(env, DB_VERB_REP_ELECT, (env, "Tallying VOTE%d[%d] (%d, %lu)", + phase, i, eid, (u_long)egen)); + + vtp->eid = eid; + vtp->egen = egen; + (*countp)++; + return (0); +} + +/* + * __rep_cmp_vote -- + * Compare incoming vote1 message on a client. Called with the db_rep + * mutex held. + * + */ +static void +__rep_cmp_vote(env, rep, eid, lsnp, priority, gen, tiebreaker, flags) + ENV *env; + REP *rep; + int eid; + DB_LSN *lsnp; + u_int32_t priority; + u_int32_t flags, gen, tiebreaker; +{ + int cmp; + + cmp = LOG_COMPARE(lsnp, &rep->w_lsn); + /* + * If we've seen more than one, compare us to the best so far. + * If we're the first, make ourselves the winner to start. + */ + if (rep->sites > 1 && + (priority != 0 || LF_ISSET(REPCTL_ELECTABLE))) { + /* + * Special case, if we have a mixed version group of sites, + * we set priority to 0, but set the ELECTABLE flag so that + * all sites talking at lower versions can correctly elect. + * If a non-zero priority comes in and current winner is + * zero priority (but was electable), then the non-zero + * site takes precedence no matter what its LSN is. + * + * Then LSN is determinant only if we're comparing + * like-styled version/priorities. I.e. both with + * 0/ELECTABLE priority or both with non-zero priority. + * Then actual priority value if LSNs + * are equal, then tiebreaker if both are equal. 
+ */ + if ((priority != 0 && rep->w_priority == 0) || + (((priority == 0 && rep->w_priority == 0) || + (priority != 0 && rep->w_priority != 0)) && cmp > 0) || + (cmp == 0 && (priority > rep->w_priority || + (priority == rep->w_priority && + (tiebreaker > rep->w_tiebreaker))))) { + RPRINT(env, DB_VERB_REP_ELECT, + (env, "Accepting new vote")); + rep->winner = eid; + rep->w_priority = priority; + rep->w_lsn = *lsnp; + rep->w_gen = gen; + rep->w_tiebreaker = tiebreaker; + } + } else if (rep->sites == 1) { + if (priority != 0 || LF_ISSET(REPCTL_ELECTABLE)) { + /* Make ourselves the winner to start. */ + rep->winner = eid; + rep->w_priority = priority; + rep->w_gen = gen; + rep->w_lsn = *lsnp; + rep->w_tiebreaker = tiebreaker; + } else { + rep->winner = DB_EID_INVALID; + rep->w_priority = 0; + rep->w_gen = 0; + ZERO_LSN(rep->w_lsn); + rep->w_tiebreaker = 0; + } + } +} + +/* + * __rep_elect_init + * Initialize an election. Sets beginp non-zero if the election is + * already in progress; makes it 0 otherwise. + */ +static int +__rep_elect_init(env, nsites, nvotes, beginp, otally) + ENV *env; + u_int32_t nsites, nvotes; + int *beginp; + u_int32_t *otally; +{ + DB_LOG *dblp; + DB_LSN lsn; + DB_REP *db_rep; + LOG *lp; + REP *rep; + int ret; + + db_rep = env->rep_handle; + rep = db_rep->region; + + ret = 0; + + /* We may miscount, as we don't hold the replication mutex here. */ + STAT(rep->stat.st_elections++); + + /* If we are already master; simply broadcast that fact and return. 
*/ + if (F_ISSET(rep, REP_F_MASTER)) { + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + LOG_SYSTEM_LOCK(env); + lsn = lp->lsn; + LOG_SYSTEM_UNLOCK(env); + (void)__rep_send_message(env, + DB_EID_BROADCAST, REP_NEWMASTER, &lsn, NULL, 0, 0); + if (IS_USING_LEASES(env)) + ret = __rep_lease_refresh(env); + STAT(rep->stat.st_elections_won++); + return (DB_REP_NEWMASTER); + } + + REP_SYSTEM_LOCK(env); + if (otally != NULL) + *otally = F_ISSET(rep, REP_F_TALLY); + *beginp = IN_ELECTION(rep) || F_ISSET(rep, REP_F_INREPELECT); + if (!*beginp) { + /* + * Make sure that we always initialize all the election fields + * before putting ourselves in an election state. That means + * issuing calls that can fail (allocation) before setting all + * the variables. + */ + if (nsites > rep->asites && + (ret = __rep_grow_sites(env, nsites)) != 0) + goto err; + DB_ENV_TEST_RECOVERY(env, DB_TEST_ELECTINIT, ret, NULL); + F_SET(rep, REP_F_INREPELECT); + F_CLR(rep, REP_F_EGENUPDATE); + /* + * If we're the first to the party, we simply set initial + * values: pre-existing values would be left over from previous + * election. + */ + if (!IN_ELECTION_TALLY(rep)) { + rep->nsites = nsites; + rep->nvotes = nvotes; + } else { + if (nsites > rep->nsites) + rep->nsites = nsites; + if (nvotes > rep->nvotes) + rep->nvotes = nvotes; + } + } +DB_TEST_RECOVERY_LABEL +err: REP_SYSTEM_UNLOCK(env); + return (ret); +} + +/* + * __rep_elect_master + * Set up for new master from election. Must be called with + * the replication region mutex held. + */ +static void +__rep_elect_master(env, rep) + ENV *env; + REP *rep; +{ + if (F_ISSET(rep, REP_F_MASTERELECT | REP_F_MASTER)) { + /* We've been through here already; avoid double counting. 
*/ + return; + } + + F_SET(rep, REP_F_MASTERELECT); + STAT(rep->stat.st_elections_won++); + + RPRINT(env, DB_VERB_REP_ELECT, (env, + "Got enough votes to win; election done; winner is %d, gen %lu", + rep->master_id, (u_long)rep->gen)); +} + +static int +__rep_fire_elected(env, rep, egen) + ENV *env; + REP *rep; + u_int32_t egen; +{ + REP_EVENT_LOCK(env); + if (rep->notified_egen < egen) { + __rep_fire_event(env, DB_EVENT_REP_ELECTED, NULL); + rep->notified_egen = egen; + } + REP_EVENT_UNLOCK(env); + return (0); +} + +/* + * Compute a sleep interval. Set it to the smaller of .5s or + * timeout/10, making sure we sleep at least 1usec if timeout < 10. + */ +#define SLEEPTIME(timeout) \ + (timeout > 5000000) ? 500000 : ((timeout >= 10) ? timeout / 10 : 1); + +static int +__rep_wait(env, timeoutp, full_elect, egen, flags) + ENV *env; + db_timeout_t *timeoutp; + int full_elect; + u_int32_t egen, flags; +{ + DB_REP *db_rep; + REP *rep; + int done, echg, phase_over, ret; + u_int32_t sleeptime, sleeptotal, timeout; + + db_rep = env->rep_handle; + rep = db_rep->region; + done = echg = phase_over = ret = 0; + + timeout = *timeoutp; + /* + * The user specifies an overall timeout function, but checking + * is cheap and the timeout may be a generous upper bound. + * Sleep repeatedly for the smaller of .5s and timeout/10. + */ + sleeptime = SLEEPTIME(timeout); + sleeptotal = 0; + while (sleeptotal < timeout) { + __os_yield(env, 0, sleeptime); + sleeptotal += sleeptime; + REP_SYSTEM_LOCK(env); + /* + * Check if group membership changed while we were + * sleeping. Specifically we're trying for a full + * election and someone is telling us we're joining + * a previously established replication group. + */ + if (full_elect && F_ISSET(rep, REP_F_GROUP_ESTD)) { + *timeoutp = rep->elect_timeout; + timeout = *timeoutp; + /* + * We adjusted timeout, if we've already waited + * that long, then return as though this phase + * timed out. 
However, we want to give other + * changes a chance to return, so if we both + * found a group and found a new egen, we + * override this return with the egen information. + * If we found a group and our election finished + * then we want to return the election completion. + */ + if (sleeptotal >= timeout) { + done = 1; + ret = DB_TIMEOUT; + } else + sleeptime = SLEEPTIME(timeout); + } + + echg = egen != rep->egen; + phase_over = !F_ISSET(rep, flags); + + /* + * Since we're not clearing out master_id any more, + * we need to do more to detect the difference between + * a new master getting elected and egen changing, + * or a new election starting because the old one + * timed out at another site (which easily happens + * when sites have very different timeout settings). + * + * Detect this by: + * If my phase was over, egen has changed but + * there are still election flags set, or we're + * told our egen was out of date and updated + * then return DB_REP_EGENCHG. + * + * Phase 0 doesn't care about egen, only the phase over. + * + * Otherwise, if my phase is over I want to + * set my idea of the master and return. + */ + if (phase_over && echg && + flags != REP_F_EPHASE0 && + (IN_ELECTION_TALLY(rep) || + F_ISSET(rep, REP_F_EGENUPDATE))) { + done = 1; + F_CLR(rep, REP_F_EGENUPDATE); + ret = DB_REP_EGENCHG; + } else if (phase_over) { + done = 1; + ret = 0; + } + REP_SYSTEM_UNLOCK(env); + + if (done) + return (ret); + } + return (DB_TIMEOUT); +} + +/* + * __rep_grow_sites -- + * Called to allocate more space in the election tally information. + * Called with the rep mutex held. We need to call the region mutex, so + * we need to make sure that we *never* acquire those mutexes in the + * opposite order. 
+ */ +static int +__rep_grow_sites(env, nsites) + ENV *env; + u_int32_t nsites; +{ + REGENV *renv; + REGINFO *infop; + REP *rep; + int ret, *tally; + u_int32_t nalloc; + + rep = env->rep_handle->region; + + /* + * Allocate either twice the current allocation or nsites, + * whichever is more. + */ + nalloc = 2 * rep->asites; + if (nalloc < nsites) + nalloc = nsites; + + infop = env->reginfo; + renv = infop->primary; + MUTEX_LOCK(env, renv->mtx_regenv); + + /* + * We allocate 2 tally regions, one for tallying VOTE1's and + * one for VOTE2's. Always grow them in tandem, because if we + * get more VOTE1's we'll always expect more VOTE2's then too. + */ + if ((ret = __env_alloc(infop, + (size_t)nalloc * sizeof(REP_VTALLY), &tally)) == 0) { + if (rep->tally_off != INVALID_ROFF) + __env_alloc_free( + infop, R_ADDR(infop, rep->tally_off)); + rep->tally_off = R_OFFSET(infop, tally); + if ((ret = __env_alloc(infop, + (size_t)nalloc * sizeof(REP_VTALLY), &tally)) == 0) { + /* Success */ + if (rep->v2tally_off != INVALID_ROFF) + __env_alloc_free(infop, + R_ADDR(infop, rep->v2tally_off)); + rep->v2tally_off = R_OFFSET(infop, tally); + rep->asites = nalloc; + rep->nsites = nsites; + } else { + /* + * We were unable to allocate both. So, we must + * free the first one and reinitialize. If + * v2tally_off is valid, it is from an old + * allocation and we are clearing it all out due + * to the error. + */ + if (rep->v2tally_off != INVALID_ROFF) + __env_alloc_free(infop, + R_ADDR(infop, rep->v2tally_off)); + __env_alloc_free(infop, + R_ADDR(infop, rep->tally_off)); + rep->v2tally_off = rep->tally_off = INVALID_ROFF; + rep->asites = 0; + rep->nsites = 0; + } + } + MUTEX_UNLOCK(env, renv->mtx_regenv); + return (ret); +} diff --git a/rep/rep_lease.c b/rep/rep_lease.c new file mode 100644 index 0000000..a13318e --- /dev/null +++ b/rep/rep_lease.c @@ -0,0 +1,524 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2007-2009 Oracle. All rights reserved. 
+ * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/log.h" + +static void __rep_find_entry __P((ENV *, REP *, int, REP_LEASE_ENTRY **)); + +/* + * __rep_update_grant - + * Update a client's lease grant for this perm record + * and send the grant to the master. Caller must + * hold the mtx_clientdb mutex. Timespec given is in + * host local format. + * + * PUBLIC: int __rep_update_grant __P((ENV *, db_timespec *)); + */ +int +__rep_update_grant(env, ts) + ENV *env; + db_timespec *ts; +{ + DBT lease_dbt; + DB_LOG *dblp; + DB_REP *db_rep; + LOG *lp; + REP *rep; + __rep_grant_info_args gi; + db_timespec mytime; + u_int8_t buf[__REP_GRANT_INFO_SIZE]; + int master, ret; + size_t len; + + db_rep = env->rep_handle; + rep = db_rep->region; + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + timespecclear(&mytime); + + /* + * Get current time, and add in the (skewed) lease duration + * time to send the grant to the master. + */ + __os_gettime(env, &mytime, 1); + timespecadd(&mytime, &rep->lease_duration); + REP_SYSTEM_LOCK(env); + /* + * If we are in an election, we cannot grant the lease. + * We need to check under the region mutex. + */ + if (IN_ELECTION(rep)) { + REP_SYSTEM_UNLOCK(env); + return (0); + } + if (timespeccmp(&mytime, &rep->grant_expire, >)) + rep->grant_expire = mytime; + F_CLR(rep, REP_F_LEASE_EXPIRED); + REP_SYSTEM_UNLOCK(env); + + /* + * Send the LEASE_GRANT message with the current lease grant + * no matter if we've actually extended the lease or not. 
+ */ + gi.msg_sec = (u_int32_t)ts->tv_sec; + gi.msg_nsec = (u_int32_t)ts->tv_nsec; + + if ((ret = __rep_grant_info_marshal(env, &gi, buf, + __REP_GRANT_INFO_SIZE, &len)) != 0) + return (ret); + DB_INIT_DBT(lease_dbt, buf, len); + if ((master = rep->master_id) != DB_EID_INVALID) + (void)__rep_send_message(env, master, REP_LEASE_GRANT, + &lp->max_perm_lsn, &lease_dbt, 0, 0); + return (0); +} + +/* + * __rep_islease_granted - + * Return 0 if this client has no outstanding lease granted. + * Return 1 otherwise. + * Caller must hold the REP_SYSTEM (region) mutex. + * + * PUBLIC: int __rep_islease_granted __P((ENV *)); + */ +int +__rep_islease_granted(env) + ENV *env; +{ + DB_REP *db_rep; + REP *rep; + db_timespec mytime; + + db_rep = env->rep_handle; + rep = db_rep->region; + /* + * Get current time and compare against our granted lease. + */ + timespecclear(&mytime); + __os_gettime(env, &mytime, 1); + + return (timespeccmp(&mytime, &rep->grant_expire, <=) ? 1 : 0); +} + +/* + * __rep_lease_table_alloc - + * Allocate the lease table on a master. Called with rep mutex + * held. We need to acquire the env region mutex, so we need to + * make sure we never acquire those mutexes in the opposite order. + * + * PUBLIC: int __rep_lease_table_alloc __P((ENV *, u_int32_t)); + */ +int +__rep_lease_table_alloc(env, nsites) + ENV *env; + u_int32_t nsites; +{ + REGENV *renv; + REGINFO *infop; + REP *rep; + REP_LEASE_ENTRY *le, *table; + int *lease, ret; + u_int32_t i; + + rep = env->rep_handle->region; + + infop = env->reginfo; + renv = infop->primary; + MUTEX_LOCK(env, renv->mtx_regenv); + /* + * If we have an old table from some other time, free it and + * allocate ourselves a new one that is known to be for + * the right number of sites. 
+ */ + if (rep->lease_off != INVALID_ROFF) { + __env_alloc_free(infop, + R_ADDR(infop, rep->lease_off)); + rep->lease_off = INVALID_ROFF; + } + ret = __env_alloc(infop, (size_t)nsites * sizeof(REP_LEASE_ENTRY), + &lease); + MUTEX_UNLOCK(env, renv->mtx_regenv); + if (ret != 0) + return (ret); + else + rep->lease_off = R_OFFSET(infop, lease); + table = R_ADDR(infop, rep->lease_off); + for (i = 0; i < nsites; i++) { + le = &table[i]; + le->eid = DB_EID_INVALID; + timespecclear(&le->start_time); + timespecclear(&le->end_time); + ZERO_LSN(le->lease_lsn); + } + return (0); +} + +/* + * __rep_lease_grant - + * Handle incoming REP_LEASE_GRANT message on a master. + * + * PUBLIC: int __rep_lease_grant __P((ENV *, __rep_control_args *, DBT *, int)); + */ +int +__rep_lease_grant(env, rp, rec, eid) + ENV *env; + __rep_control_args *rp; + DBT *rec; + int eid; +{ + DB_REP *db_rep; + REP *rep; + __rep_grant_info_args gi; + REP_LEASE_ENTRY *le; + db_timespec msg_time; + int ret; + + db_rep = env->rep_handle; + rep = db_rep->region; + if ((ret = __rep_grant_info_unmarshal(env, + &gi, rec->data, rec->size, NULL)) != 0) + return (ret); + timespecset(&msg_time, gi.msg_sec, gi.msg_nsec); + le = NULL; + + /* + * Get current time, and add in the (skewed) lease duration + * time to send the grant to the master. + */ + REP_SYSTEM_LOCK(env); + __rep_find_entry(env, rep, eid, &le); + /* + * We either get back this site's entry, or an empty entry + * that we need to initialize. + */ + DB_ASSERT(env, le != NULL); + /* + * Update the entry if it is an empty entry or if the new + * lease grant is a later start time than the current one. 
+ */ + RPRINT(env, DB_VERB_REP_LEASE, + (env, "lease_grant: grant msg time %lu %lu", + (u_long)msg_time.tv_sec, (u_long)msg_time.tv_nsec)); + if (le->eid == DB_EID_INVALID || + timespeccmp(&msg_time, &le->start_time, >)) { + le->eid = eid; + le->start_time = msg_time; + le->end_time = le->start_time; + timespecadd(&le->end_time, &rep->lease_duration); + RPRINT(env, DB_VERB_REP_LEASE, (env, + "lease_grant: eid %d, start %lu %lu, end %lu %lu, duration %lu %lu", + le->eid, (u_long)le->start_time.tv_sec, (u_long)le->start_time.tv_nsec, + (u_long)le->end_time.tv_sec, (u_long)le->end_time.tv_nsec, + (u_long)rep->lease_duration.tv_sec, (u_long)rep->lease_duration.tv_nsec)); + /* + * XXX Is this really true? Could we have a lagging + * record that has a later start time, but smaller + * LSN than we have previously seen?? + */ + DB_ASSERT(env, LOG_COMPARE(&rp->lsn, &le->lease_lsn) >= 0); + le->lease_lsn = rp->lsn; + } + REP_SYSTEM_UNLOCK(env); + return (0); +} + +/* + * Find the entry for the given EID. Or the first empty one. + */ +static void +__rep_find_entry(env, rep, eid, lep) + ENV *env; + REP *rep; + int eid; + REP_LEASE_ENTRY **lep; +{ + REGINFO *infop; + REP_LEASE_ENTRY *le, *table; + u_int32_t i; + + infop = env->reginfo; + table = R_ADDR(infop, rep->lease_off); + + for (i = 0; i < rep->nsites; i++) { + le = &table[i]; + /* + * Find either the one that matches the client's + * EID or the first empty one. + */ + if (le->eid == eid || le->eid == DB_EID_INVALID) { + *lep = le; + return; + } + } + return; +} + +/* + * __rep_lease_check - + * Return 0 if this master holds valid leases and can confirm + * its mastership. If leases are expired, an attempt is made + * to refresh the leases. If that fails, then return the + * DB_REP_LEASE_EXPIRED error to the user. No mutexes held. 
+ * + * PUBLIC: int __rep_lease_check __P((ENV *, int)); + */ +int +__rep_lease_check(env, refresh) + ENV *env; + int refresh; +{ + DB_LOG *dblp; + DB_LSN lease_lsn; + DB_REP *db_rep; + LOG *lp; + REGINFO *infop; + REP *rep; + REP_LEASE_ENTRY *le, *table; + db_timespec curtime; + int ret, tries; + u_int32_t i, min_leases, valid_leases; + + infop = env->reginfo; + tries = 0; + db_rep = env->rep_handle; + rep = db_rep->region; + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + LOG_SYSTEM_LOCK(env); + lease_lsn = lp->max_perm_lsn; + LOG_SYSTEM_UNLOCK(env); + +retry: + REP_SYSTEM_LOCK(env); + min_leases = rep->nsites / 2; + ret = 0; + __os_gettime(env, &curtime, 1); + RPRINT(env, DB_VERB_REP_LEASE, (env, + "lease_check: try %d min_leases %lu curtime %lu %lu, maxLSN [%lu][%lu]", + tries, + (u_long)min_leases, (u_long)curtime.tv_sec, + (u_long)curtime.tv_nsec, + (u_long)lease_lsn.file, + (u_long)lease_lsn.offset)); + table = R_ADDR(infop, rep->lease_off); + for (i = 0, valid_leases = 0; + i < rep->nsites && valid_leases < min_leases; i++) { + le = &table[i]; + /* + * Count this lease as valid if: + * - It is a valid entry (has an EID). + * - The lease has not expired. + * - The LSN is up to date. + */ + if (le->eid != DB_EID_INVALID) { + RPRINT(env, DB_VERB_REP_LEASE, (env, + "lease_check: valid %lu eid %d, lease_lsn [%lu][%lu]", + (u_long)valid_leases, le->eid, + (u_long)le->lease_lsn.file, + (u_long)le->lease_lsn.offset)); + RPRINT(env, DB_VERB_REP_LEASE, + (env, "lease_check: endtime %lu %lu", + (u_long)le->end_time.tv_sec, + (u_long)le->end_time.tv_nsec)); + } + if (le->eid != DB_EID_INVALID && + timespeccmp(&le->end_time, &curtime, >=) && + LOG_COMPARE(&le->lease_lsn, &lease_lsn) >= 0) + valid_leases++; + } + REP_SYSTEM_UNLOCK(env); + + /* + * Now see if we have enough. 
+ */ + RPRINT(env, DB_VERB_REP_LEASE, (env, "valid %lu, min %lu", + (u_long)valid_leases, (u_long)min_leases)); + if (valid_leases < min_leases) { + if (!refresh) + ret = DB_REP_LEASE_EXPIRED; + else { + /* + * If we are successful, we need to recheck the leases + * because the lease grant messages may have raced with + * the PERM acknowledgement. Give the grant messages + * a chance to arrive and be processed. + */ + if ((ret = __rep_lease_refresh(env)) == 0) { + if (tries <= LEASE_REFRESH_TRIES) { + /* + * If we were successful sending, but + * not in racing the message threads, + * then yield the processor so that + * the message threads get a chance + * to run. + */ + if (tries > 0) + __os_yield(env, 1, 0); + tries++; + goto retry; + } else + ret = DB_REP_LEASE_EXPIRED; + } + } + } + + if (ret == DB_REP_LEASE_EXPIRED) + RPRINT(env, DB_VERB_REP_LEASE, (env, + "lease_check: Expired. Only %lu valid", + (u_long)valid_leases)); + return (ret); +} + +/* + * __rep_lease_refresh - + * Find the last permanent record and send that out so that it + * forces clients to grant their leases. + * + * If there is no permanent record, this function cannot refresh + * leases. That should not happen because the master should write + * a checkpoint when it starts, if there is no other perm record. + * + * PUBLIC: int __rep_lease_refresh __P((ENV *)); + */ +int +__rep_lease_refresh(env) + ENV *env; +{ + DBT rec; + DB_LOGC *logc; + DB_LSN lsn; + DB_REP *db_rep; + REP *rep; + int ret, t_ret; + + db_rep = env->rep_handle; + rep = db_rep->region; + + if ((ret = __log_cursor(env, &logc)) != 0) + return (ret); + + memset(&rec, 0, sizeof(rec)); + memset(&lsn, 0, sizeof(lsn)); + /* + * Use __rep_log_backup to find the last PERM record. + */ + if ((ret = __rep_log_backup(env, rep, logc, &lsn)) != 0) { + /* + * If there is no PERM record, then we get DB_NOTFOUND. 
+ */ + if (ret == DB_NOTFOUND) + ret = 0; + goto err; + } + + if ((ret = __logc_get(logc, &lsn, &rec, DB_CURRENT)) != 0) + goto err; + + (void)__rep_send_message(env, DB_EID_BROADCAST, REP_LOG, &lsn, + &rec, REPCTL_PERM, 0); + +err: if ((t_ret = __logc_close(logc)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} + +/* + * __rep_lease_expire - + * Proactively expire all leases granted to us. + * Assume the caller holds the REP_SYSTEM (region) mutex. + * + * PUBLIC: int __rep_lease_expire __P((ENV *)); + */ +int +__rep_lease_expire(env) + ENV *env; +{ + DB_REP *db_rep; + REGINFO *infop; + REP *rep; + REP_LEASE_ENTRY *le, *table; + int ret; + u_int32_t i; + + ret = 0; + db_rep = env->rep_handle; + rep = db_rep->region; + infop = env->reginfo; + + if (rep->lease_off != INVALID_ROFF) { + table = R_ADDR(infop, rep->lease_off); + /* + * Expire all leases forcibly. We are guaranteed that the + * start_time for all leases are not in the future. Therefore, + * set the end_time to the start_time. + */ + for (i = 0; i < rep->nsites; i++) { + le = &table[i]; + le->end_time = le->start_time; + } + } + return (ret); +} + +/* + * __rep_lease_waittime - + * Return the amount of time remaining on a granted lease. + * Assume the caller holds the REP_SYSTEM (region) mutex. + * + * PUBLIC: db_timeout_t __rep_lease_waittime __P((ENV *)); + */ +db_timeout_t +__rep_lease_waittime(env) + ENV *env; +{ + DB_REP *db_rep; + REP *rep; + db_timespec exptime, mytime; + db_timeout_t to; + + db_rep = env->rep_handle; + rep = db_rep->region; + exptime = rep->grant_expire; + to = 0; + /* + * If the lease has never been granted, we must wait a full + * lease timeout because we could be freshly rebooted after + * a crash and a lease could be granted from a previous + * incarnation of this client. However, if the lease has never + * been granted, and this client has already waited a full + * lease timeout, we know our lease cannot be granted and there + * is no need to wait again. 
+ */ + RPRINT(env, DB_VERB_REP_LEASE, (env, + "wait_time: grant_expire %lu %lu lease_to %lu", + (u_long)exptime.tv_sec, (u_long)exptime.tv_nsec, + (u_long)rep->lease_timeout)); + if (!timespecisset(&exptime)) { + if (!F_ISSET(rep, REP_F_LEASE_EXPIRED)) + to = rep->lease_timeout; + } else { + __os_gettime(env, &mytime, 1); + RPRINT(env, DB_VERB_REP_LEASE, (env, + "wait_time: mytime %lu %lu, grant_expire %lu %lu", + (u_long)mytime.tv_sec, (u_long)mytime.tv_nsec, + (u_long)exptime.tv_sec, (u_long)exptime.tv_nsec)); + if (timespeccmp(&mytime, &exptime, <=)) { + /* + * If the current time is before the grant expiration + * compute the difference and return remaining grant + * time. + */ + timespecsub(&exptime, &mytime); + DB_TIMESPEC_TO_TIMEOUT(to, &exptime, 1); + } + } + return (to); +} diff --git a/rep/rep_log.c b/rep/rep_log.c new file mode 100644 index 0000000..d413daa --- /dev/null +++ b/rep/rep_log.c @@ -0,0 +1,872 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2004-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/log.h" + +static int __rep_chk_newfile __P((ENV *, DB_LOGC *, REP *, + __rep_control_args *, int)); + +/* + * __rep_allreq -- + * Handle a REP_ALL_REQ message. 
+ * + * PUBLIC: int __rep_allreq __P((ENV *, __rep_control_args *, int)); + */ +int +__rep_allreq(env, rp, eid) + ENV *env; + __rep_control_args *rp; + int eid; +{ + DBT data_dbt, newfiledbt; + DB_LOGC *logc; + DB_LSN log_end, oldfilelsn; + DB_REP *db_rep; + REP *rep; + REP_BULK bulk; + REP_THROTTLE repth; + __rep_newfile_args nf_args; + uintptr_t bulkoff; + u_int32_t bulkflags, end_flag, flags, use_bulk; + int arch_flag, ret, t_ret; + u_int8_t buf[__REP_NEWFILE_SIZE]; + size_t len; + + ret = 0; + db_rep = env->rep_handle; + rep = db_rep->region; + end_flag = 0; + arch_flag = 0; + + if ((ret = __log_cursor(env, &logc)) != 0) + return (ret); + memset(&data_dbt, 0, sizeof(data_dbt)); + /* + * If we're doing bulk transfer, allocate a bulk buffer to put our + * log records in. We still need to initialize the throttle info + * because if we encounter a log record larger than our entire bulk + * buffer, we need to send it as a singleton and also we want to + * support throttling with bulk. + * + * Use a local var so we don't need to worry if someone else turns + * on/off bulk in the middle of our call. + */ + use_bulk = FLD_ISSET(rep->config, REP_C_BULK); + bulk.addr = NULL; + if (use_bulk && (ret = __rep_bulk_alloc(env, &bulk, eid, + &bulkoff, &bulkflags, REP_BULK_LOG)) != 0) + goto err; + memset(&repth, 0, sizeof(repth)); + REP_SYSTEM_LOCK(env); + F_SET(rep, REP_F_NOARCHIVE); + arch_flag = 1; + repth.gbytes = rep->gbytes; + repth.bytes = rep->bytes; + oldfilelsn = repth.lsn = rp->lsn; + repth.type = REP_LOG; + repth.data_dbt = &data_dbt; + REP_SYSTEM_UNLOCK(env); + + /* + * Get the LSN of the end of the log, so that in our reading loop + * (below), we can recognize when we get there, and set the + * REPCTL_LOG_END flag. + */ + if ((ret = __logc_get(logc, &log_end, &data_dbt, DB_LAST)) != 0) { + if (ret == DB_NOTFOUND && F_ISSET(rep, REP_F_MASTER)) + ret = 0; + goto err; + } + + flags = IS_ZERO_LSN(rp->lsn) || + IS_INIT_LSN(rp->lsn) ? 
DB_FIRST : DB_SET; + /* + * We get the first item so that a client servicing requests + * can distinguish between not having the records and reaching + * the end of its log. Return the DB_NOTFOUND if the client + * cannot get the record. Return 0 if we finish the loop and + * sent all that we have. + */ + ret = __logc_get(logc, &repth.lsn, &data_dbt, flags); + /* + * If the client is asking for all records + * because it doesn't have any, and our first + * record is not in the first log file, then + * the client is outdated and needs to get a + * VERIFY_FAIL. + */ + if (ret == 0 && repth.lsn.file != 1 && flags == DB_FIRST) { + if (F_ISSET(rep, REP_F_CLIENT)) + ret = DB_NOTFOUND; + else + (void)__rep_send_message(env, eid, + REP_VERIFY_FAIL, &repth.lsn, NULL, 0, 0); + goto err; + } + /* + * If we got DB_NOTFOUND it could be because the LSN we were + * given is at the end of the log file and we need to switch + * log files. Reinitialize and get the current record when we return. + */ + if (ret == DB_NOTFOUND) { + ret = __rep_chk_newfile(env, logc, rep, rp, eid); + /* + * If we still get DB_NOTFOUND the client gave us a + * bad or unknown LSN. Ignore it if we're the master. + * Any other error is returned. + */ + if (ret == 0) + ret = __logc_get(logc, &repth.lsn, + &data_dbt, DB_CURRENT); + if (ret == DB_NOTFOUND && F_ISSET(rep, REP_F_MASTER)) { + ret = 0; + goto err; + } + if (ret != 0) + goto err; + } + + /* + * For singleton log records, we break when we get a REP_LOG_MORE. + * Or if we're not using throttling, or we are using bulk, we stop + * when we reach the end (i.e. ret != 0). + */ + for (end_flag = 0; + ret == 0 && repth.type != REP_LOG_MORE && end_flag == 0; + ret = __logc_get(logc, &repth.lsn, &data_dbt, DB_NEXT)) { + /* + * If we just changed log files, we need to send the + * version of this log file to the client. 
+ */ + if (repth.lsn.file != oldfilelsn.file) { + if ((ret = __logc_version(logc, &nf_args.version)) != 0) + break; + memset(&newfiledbt, 0, sizeof(newfiledbt)); + if (rep->version < DB_REPVERSION_47) + DB_INIT_DBT(newfiledbt, &nf_args.version, + sizeof(nf_args.version)); + else { + if ((ret = __rep_newfile_marshal(env, &nf_args, + buf, __REP_NEWFILE_SIZE, &len)) != 0) + goto err; + DB_INIT_DBT(newfiledbt, buf, len); + } + (void)__rep_send_message(env, + eid, REP_NEWFILE, &oldfilelsn, &newfiledbt, + REPCTL_RESEND, 0); + } + + /* + * Mark the end of the ALL_REQ response to show that the + * receiving client should now be "caught up" with the + * replication group. If we're the master, then our log end is + * certainly authoritative. If we're another client, only if we + * ourselves have reached STARTUPDONE. + */ + end_flag = (LOG_COMPARE(&repth.lsn, &log_end) >= 0 && + (F_ISSET(rep, REP_F_MASTER) || + rep->stat.st_startup_complete)) ? + REPCTL_LOG_END : 0; + /* + * If we are configured for bulk, try to send this as a bulk + * request. If not configured, or it is too big for bulk + * then just send normally. + */ + if (use_bulk) + ret = __rep_bulk_message(env, &bulk, &repth, + &repth.lsn, &data_dbt, (REPCTL_RESEND | end_flag)); + if (!use_bulk || ret == DB_REP_BULKOVF) + ret = __rep_send_throttle(env, + eid, &repth, 0, end_flag); + if (ret != 0) + break; + /* + * If we are about to change files, then we'll need the + * last LSN in the previous file. Save it here. + */ + oldfilelsn = repth.lsn; + oldfilelsn.offset += logc->len; + } + + if (ret == DB_NOTFOUND || ret == DB_REP_UNAVAIL) + ret = 0; + /* + * We're done, force out whatever remains in the bulk buffer and + * free it. + */ +err: + /* + * We could have raced an unlink from an earlier log_archive + * and the user is removing the files themselves, now. If + * we get an error indicating the log file might no longer + * exist, ignore it. 
+ */ + if (ret == ENOENT) + ret = 0; + if (bulk.addr != NULL && (t_ret = __rep_bulk_free(env, &bulk, + (REPCTL_RESEND | end_flag))) != 0 && ret == 0 && + t_ret != DB_REP_UNAVAIL) + ret = t_ret; + if (arch_flag) { + REP_SYSTEM_LOCK(env); + F_CLR(rep, REP_F_NOARCHIVE); + REP_SYSTEM_UNLOCK(env); + } + if ((t_ret = __logc_close(logc)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} + +/* + * __rep_log -- + * Handle a REP_LOG/REP_LOG_MORE message. + * + * PUBLIC: int __rep_log __P((ENV *, DB_THREAD_INFO *, + * PUBLIC: __rep_control_args *, DBT *, time_t, DB_LSN *)); + */ +int +__rep_log(env, ip, rp, rec, savetime, ret_lsnp) + ENV *env; + DB_THREAD_INFO *ip; + __rep_control_args *rp; + DBT *rec; + time_t savetime; + DB_LSN *ret_lsnp; +{ + DB_LOG *dblp; + DB_LSN last_lsn, lsn; + DB_REP *db_rep; + LOG *lp; + REP *rep; + int is_dup, master, ret; + + is_dup = ret = 0; + db_rep = env->rep_handle; + rep = db_rep->region; + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + + ret = __rep_apply(env, ip, rp, rec, ret_lsnp, &is_dup, &last_lsn); + switch (ret) { + /* + * We're in an internal backup and we've gotten + * all the log we need to run recovery. Do so now. + */ + case DB_REP_LOGREADY: + if ((ret = + __rep_logready(env, rep, savetime, &last_lsn)) != 0) + goto out; + break; + /* + * If we get any of the "normal" returns, we only process + * LOG_MORE if this is not a duplicate record. If the + * record is a duplicate we don't want to handle LOG_MORE + * and request a multiple data stream (or trigger internal + * initialization) since this could be a very old record + * that no longer exists on the master. + */ + case DB_REP_ISPERM: + case DB_REP_NOTPERM: + case 0: + if (is_dup) + goto out; + else + break; + /* + * Any other return (errors), we're done. 
+ */ + default: + goto out; + } + if (rp->rectype == REP_LOG_MORE) { + master = rep->master_id; + + /* + * Keep the cycle from stalling: In case we got the LOG_MORE out + * of order, before some preceding log records, we want to make + * sure our follow-up request resumes from where the LOG_MORE + * said it should. (If the preceding log records never arrive, + * normal gap processing should take care of asking for them.) + * But if we already have this record and/or more, we need to + * ask to resume from what we need. The upshot is we need the + * max of lp->ready_lsn and the lsn from the message. + */ + MUTEX_LOCK(env, rep->mtx_clientdb); + lsn = lp->ready_lsn; + if (LOG_COMPARE(&rp->lsn, &lsn) > 0) + lsn = rp->lsn; + + /* + * If the master_id is invalid, this means that since + * the last record was sent, somebody declared an + * election and we may not have a master to request + * things of. + * + * This is not an error; when we find a new master, + * we'll re-negotiate where the end of the log is and + * try to bring ourselves up to date again anyway. + */ + if (master == DB_EID_INVALID) { + ret = 0; + MUTEX_UNLOCK(env, rep->mtx_clientdb); + goto out; + } + /* + * If we're waiting for records, set the wait_ts + * high so that we avoid re-requesting too soon and + * end up with multiple data streams. + */ + if (IS_ZERO_LSN(lp->waiting_lsn)) + lp->wait_ts = rep->max_gap; + ret = __rep_loggap_req(env, rep, &lsn, REP_GAP_FORCE); + MUTEX_UNLOCK(env, rep->mtx_clientdb); + } +out: + return (ret); +} + +/* + * __rep_bulk_log -- + * Handle a REP_BULK_LOG message. 
+ * + * PUBLIC: int __rep_bulk_log __P((ENV *, DB_THREAD_INFO *, + * PUBLIC: __rep_control_args *, DBT *, time_t, DB_LSN *)); + */ +int +__rep_bulk_log(env, ip, rp, rec, savetime, ret_lsnp) + ENV *env; + DB_THREAD_INFO *ip; + __rep_control_args *rp; + DBT *rec; + time_t savetime; + DB_LSN *ret_lsnp; +{ + DB_LSN last_lsn; + DB_REP *db_rep; + REP *rep; + int ret; + + db_rep = env->rep_handle; + rep = db_rep->region; + + ret = __log_rep_split(env, ip, rp, rec, ret_lsnp, &last_lsn); + switch (ret) { + /* + * We're in an internal backup and we've gotten + * all the log we need to run recovery. Do so now. + */ + case DB_REP_LOGREADY: + ret = __rep_logready(env, rep, savetime, &last_lsn); + break; + /* + * Any other return (errors), we're done. + */ + default: + break; + } + return (ret); +} + +/* + * __rep_logreq -- + * Handle a REP_LOG_REQ message. A non-empty rec carries the ending + * LSN of the requested range of log records. + * + * PUBLIC: int __rep_logreq __P((ENV *, __rep_control_args *, DBT *, int)); + */ +int +__rep_logreq(env, rp, rec, eid) + ENV *env; + __rep_control_args *rp; + DBT *rec; + int eid; +{ + DBT data_dbt, newfiledbt; + DB_LOGC *logc; + DB_LSN firstlsn, lsn, oldfilelsn; + DB_REP *db_rep; + REP *rep; + REP_BULK bulk; + REP_THROTTLE repth; + __rep_logreq_args lr_args; + __rep_newfile_args nf_args; + uintptr_t bulkoff; + u_int32_t bulkflags, use_bulk; + int count, ret, t_ret; + u_int8_t buf[__REP_NEWFILE_SIZE]; + size_t len; + + ret = 0; + db_rep = env->rep_handle; + rep = db_rep->region; + + /* COMPQUIET_LSN is what this is... 
*/ + ZERO_LSN(lr_args.endlsn); + + if (rec != NULL && rec->size != 0) { + if (rp->rep_version < DB_REPVERSION_47) + lr_args.endlsn = *(DB_LSN *)rec->data; + else if ((ret = __rep_logreq_unmarshal(env, &lr_args, + rec->data, rec->size, NULL)) != 0) + return (ret); + RPRINT(env, DB_VERB_REP_MISC, (env, + "[%lu][%lu]: LOG_REQ max lsn: [%lu][%lu]", + (u_long) rp->lsn.file, (u_long)rp->lsn.offset, + (u_long)lr_args.endlsn.file, + (u_long)lr_args.endlsn.offset)); + } + /* + * There are several different cases here. + * 1. We asked logc_get for a particular LSN and got it. + * 2. We asked logc_get for an LSN and it's not found because it is + * beyond the end of a log file and we need a NEWFILE msg. + * and then the record that was requested. + * 3. We asked logc_get for an LSN and it is already archived. + * 4. We asked logc_get for an LSN and it simply doesn't exist, but + * doesn't meet any of those other criteria, in which case + * it's an error (that should never happen on a master). + * + * If we have a valid LSN and the request has a data_dbt with + * it, the sender is asking for a chunk of log records. + * Then we need to send all records up to the LSN in the data dbt. + */ + memset(&data_dbt, 0, sizeof(data_dbt)); + oldfilelsn = lsn = rp->lsn; + if ((ret = __log_cursor(env, &logc)) != 0) + return (ret); + REP_SYSTEM_LOCK(env); + F_SET(rep, REP_F_NOARCHIVE); + REP_SYSTEM_UNLOCK(env); + if ((ret = __logc_get(logc, &lsn, &data_dbt, DB_SET)) == 0) { + /* Case 1 */ + (void)__rep_send_message(env, + eid, REP_LOG, &lsn, &data_dbt, REPCTL_RESEND, 0); + oldfilelsn.offset += logc->len; + } else if (ret == DB_NOTFOUND) { + /* + * If logc_get races with log_archive, it might return + * DB_NOTFOUND. We expect there to be some log record + * that is the first one. Loop until we either get + * a log record or some error. Since we only expect + * to get this racing log_archive, bound it to a few + * tries. 
+ */ + count = 0; + do { + ret = __logc_get(logc, &firstlsn, &data_dbt, DB_FIRST); + count++; + } while (ret == DB_NOTFOUND && count < 10); + if (ret != 0) + goto err; + if (LOG_COMPARE(&firstlsn, &rp->lsn) > 0) { + /* Case 3 */ + if (F_ISSET(rep, REP_F_CLIENT)) { + ret = DB_NOTFOUND; + goto err; + } + (void)__rep_send_message(env, eid, + REP_VERIFY_FAIL, &rp->lsn, NULL, 0, 0); + ret = 0; + goto err; + } + ret = __rep_chk_newfile(env, logc, rep, rp, eid); + if (ret == DB_NOTFOUND) { + /* Case 4 */ + /* + * If we still get DB_NOTFOUND the client gave us an + * unknown LSN, perhaps at the end of the log. Ignore + * it if we're the master. Return DB_NOTFOUND if + * we are the client. + */ + if (F_ISSET(rep, REP_F_MASTER)) { + __db_errx(env, + "Request for LSN [%lu][%lu] not found", + (u_long)rp->lsn.file, + (u_long)rp->lsn.offset); + ret = 0; + goto err; + } else + ret = DB_NOTFOUND; + } + } + + if (ret != 0) + goto err; + + /* + * If the user requested a gap, send the whole thing, while observing + * the limits from rep_set_limit. + * + * If we're doing bulk transfer, allocate a bulk buffer to put our + * log records in. We still need to initialize the throttle info + * because if we encounter a log record larger than our entire bulk + * buffer, we need to send it as a singleton. + * + * Use a local var so we don't need to worry if someone else turns + * on/off bulk in the middle of our call. 
+ */ + use_bulk = FLD_ISSET(rep->config, REP_C_BULK); + if (use_bulk && (ret = __rep_bulk_alloc(env, &bulk, eid, + &bulkoff, &bulkflags, REP_BULK_LOG)) != 0) + goto err; + memset(&repth, 0, sizeof(repth)); + REP_SYSTEM_LOCK(env); + repth.gbytes = rep->gbytes; + repth.bytes = rep->bytes; + repth.type = REP_LOG; + repth.data_dbt = &data_dbt; + REP_SYSTEM_UNLOCK(env); + while (ret == 0 && rec != NULL && rec->size != 0 && + repth.type == REP_LOG) { + if ((ret = + __logc_get(logc, &repth.lsn, &data_dbt, DB_NEXT)) != 0) { + /* + * If we're a client and we only have part of the gap, + * return DB_NOTFOUND so that we send a REREQUEST + * back to the requester and it can ask for more. + */ + if (ret == DB_NOTFOUND && F_ISSET(rep, REP_F_MASTER)) + ret = 0; + break; + } + if (LOG_COMPARE(&repth.lsn, &lr_args.endlsn) >= 0) + break; + if (repth.lsn.file != oldfilelsn.file) { + if ((ret = __logc_version(logc, &nf_args.version)) != 0) + break; + memset(&newfiledbt, 0, sizeof(newfiledbt)); + if (rep->version < DB_REPVERSION_47) + DB_INIT_DBT(newfiledbt, &nf_args.version, + sizeof(nf_args.version)); + else { + if ((ret = __rep_newfile_marshal(env, &nf_args, + buf, __REP_NEWFILE_SIZE, &len)) != 0) + goto err; + DB_INIT_DBT(newfiledbt, buf, len); + } + (void)__rep_send_message(env, + eid, REP_NEWFILE, &oldfilelsn, &newfiledbt, + REPCTL_RESEND, 0); + } + /* + * If we are configured for bulk, try to send this as a bulk + * request. If not configured, or it is too big for bulk + * then just send normally. + */ + if (use_bulk) + ret = __rep_bulk_message(env, &bulk, &repth, + &repth.lsn, &data_dbt, REPCTL_RESEND); + if (!use_bulk || ret == DB_REP_BULKOVF) + ret = __rep_send_throttle(env, eid, &repth, 0, 0); + if (ret != 0) { + /* Ignore send failure, except to break the loop. */ + if (ret == DB_REP_UNAVAIL) + ret = 0; + break; + } + /* + * If we are about to change files, then we'll need the + * last LSN in the previous file. Save it here. 
+ */ + oldfilelsn = repth.lsn; + oldfilelsn.offset += logc->len; + } + + /* + * We're done, force out whatever remains in the bulk buffer and + * free it. + */ + if (use_bulk && (t_ret = __rep_bulk_free(env, &bulk, + REPCTL_RESEND)) != 0 && ret == 0 && + t_ret != DB_REP_UNAVAIL) + ret = t_ret; +err: + /* + * We could have raced an unlink from an earlier log_archive + * and the user is removing the files themselves, now. If + * we get an error indicating the log file might no longer + * exist, ignore it. + */ + if (ret == ENOENT) + ret = 0; + REP_SYSTEM_LOCK(env); + F_CLR(rep, REP_F_NOARCHIVE); + REP_SYSTEM_UNLOCK(env); + if ((t_ret = __logc_close(logc)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} + +/* + * __rep_loggap_req - + * Request a log gap. Assumes the caller holds the REP->mtx_clientdb. + * + * lsnp is the current LSN we're handling. It is used to help decide + * if we ask for a gap or singleton. + * gapflags are flags that may override the algorithm or control the + * processing in some way. + * + * PUBLIC: int __rep_loggap_req __P((ENV *, REP *, DB_LSN *, u_int32_t)); + */ +int +__rep_loggap_req(env, rep, lsnp, gapflags) + ENV *env; + REP *rep; + DB_LSN *lsnp; + u_int32_t gapflags; +{ + DBT max_lsn_dbt, *max_lsn_dbtp; + DB_LOG *dblp; + DB_LSN next_lsn; + LOG *lp; + __rep_logreq_args lr_args; + size_t len; + u_int32_t ctlflags, flags, type; + int master, ret; + u_int8_t buf[__REP_LOGREQ_SIZE]; + + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + if (FLD_ISSET(gapflags, REP_GAP_FORCE)) + next_lsn = *lsnp; + else + next_lsn = lp->ready_lsn; + ctlflags = flags = 0; + type = REP_LOG_REQ; + ret = 0; + + /* + * Check if we need to ask for the gap. + * We ask for the gap if: + * We are forced to with gapflags. + * If max_wait_lsn is ZERO_LSN - we've never asked for + * records before. + * If we asked for a single record and received it. + * + * If we want a gap, but don't have an ending LSN (waiting_lsn) + * send an ALL_REQ. 
This is primarily used by REP_REREQUEST when + * an ALL_REQ was not able to be fulfilled by another client. + */ + if (FLD_ISSET(gapflags, (REP_GAP_FORCE | REP_GAP_REREQUEST)) || + IS_ZERO_LSN(lp->max_wait_lsn) || + (lsnp != NULL && LOG_COMPARE(lsnp, &lp->max_wait_lsn) == 0)) { + lp->max_wait_lsn = lp->waiting_lsn; + /* + * If we are forcing a gap, we need to send a max_wait_lsn + * that may be beyond the current gap/waiting_lsn (but + * it may not be). If we cannot determine any future + * waiting LSN, then it should be zero. If we're in + * internal init, it should be our ending LSN. + */ + if (FLD_ISSET(gapflags, REP_GAP_FORCE)) { + if (LOG_COMPARE(&lp->max_wait_lsn, lsnp) <= 0) { + if (F_ISSET(rep, REP_F_RECOVER_LOG)) { + DB_ASSERT(env, LOG_COMPARE(lsnp, + &rep->last_lsn) <= 0); + lp->max_wait_lsn = rep->last_lsn; + } else + ZERO_LSN(lp->max_wait_lsn); + } + } + if (IS_ZERO_LSN(lp->max_wait_lsn)) + type = REP_ALL_REQ; + memset(&max_lsn_dbt, 0, sizeof(max_lsn_dbt)); + lr_args.endlsn = lp->max_wait_lsn; + if (rep->version < DB_REPVERSION_47) + DB_INIT_DBT(max_lsn_dbt, &lp->max_wait_lsn, + sizeof(DB_LSN)); + else { + if ((ret = __rep_logreq_marshal(env, &lr_args, buf, + __REP_LOGREQ_SIZE, &len)) != 0) + goto err; + DB_INIT_DBT(max_lsn_dbt, buf, len); + } + max_lsn_dbtp = &max_lsn_dbt; + /* + * Gap requests are "new" and can go anywhere, unless + * this is already a re-request. + */ + if (FLD_ISSET(gapflags, REP_GAP_REREQUEST)) + flags = DB_REP_REREQUEST; + else + flags = DB_REP_ANYWHERE; + } else { + max_lsn_dbtp = NULL; + lp->max_wait_lsn = next_lsn; + /* + * If we're dropping to singletons, this is a re-request. 
+ */ + flags = DB_REP_REREQUEST; + } + if ((master = rep->master_id) != DB_EID_INVALID) { + STAT(rep->stat.st_log_requested++); + if (F_ISSET(rep, REP_F_RECOVER_LOG)) + ctlflags = REPCTL_INIT; + (void)__rep_send_message(env, master, + type, &next_lsn, max_lsn_dbtp, ctlflags, flags); + } else + (void)__rep_send_message(env, DB_EID_BROADCAST, + REP_MASTER_REQ, NULL, NULL, 0, 0); +err: + return (ret); +} + +/* + * __rep_logready - + * Handle getting back REP_LOGREADY. Any call to __rep_apply + * can return it. + * + * PUBLIC: int __rep_logready __P((ENV *, REP *, time_t, DB_LSN *)); + */ +int +__rep_logready(env, rep, savetime, last_lsnp) + ENV *env; + REP *rep; + time_t savetime; + DB_LSN *last_lsnp; +{ + int ret; + + if ((ret = __log_flush(env, NULL)) != 0) + goto out; + if ((ret = __rep_verify_match(env, last_lsnp, + savetime)) == 0) { + REP_SYSTEM_LOCK(env); + ZERO_LSN(rep->first_lsn); + + if (rep->originfo != NULL) { + __os_free(env, rep->originfo); + rep->originfo = NULL; + } + + F_CLR(rep, REP_F_RECOVER_LOG); + F_SET(rep, REP_F_NIMDBS_LOADED); + REP_SYSTEM_UNLOCK(env); + } else { +out: __db_errx(env, + "Client initialization failed. Need to manually restore client"); + return (__env_panic(env, ret)); + } + return (ret); + +} + +/* + * __rep_chk_newfile -- + * Determine if getting DB_NOTFOUND is because we're at the + * end of a log file and need to send a NEWFILE message. + * + * This function handles these cases: + * [Case 1 was that we found the record we were looking for - it + * is already handled by the caller.] + * 2. We asked logc_get for an LSN and it's not found because it is + * beyond the end of a log file and we need a NEWFILE msg. + * 3. We asked logc_get for an LSN and it simply doesn't exist, but + * doesn't meet any of those other criteria, in which case + * we return DB_NOTFOUND and the caller decides if it's an error. 
+ * + * This function returns 0 if we had to send a message and the bad + * LSN is dealt with and DB_NOTFOUND if this really is an unknown LSN + * (on a client) and errors if it isn't found on the master. + */ +static int +__rep_chk_newfile(env, logc, rep, rp, eid) + ENV *env; + DB_LOGC *logc; + REP *rep; + __rep_control_args *rp; + int eid; +{ + DBT data_dbt, newfiledbt; + DB_LOG *dblp; + DB_LSN endlsn; + LOG *lp; + __rep_newfile_args nf_args; + int ret; + u_int8_t buf[__REP_NEWFILE_SIZE]; + size_t len; + + ret = 0; + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + memset(&data_dbt, 0, sizeof(data_dbt)); + LOG_SYSTEM_LOCK(env); + endlsn = lp->lsn; + LOG_SYSTEM_UNLOCK(env); + if (endlsn.file > rp->lsn.file) { + /* + * Case 2: + * Need to find the LSN of the last record in + * file lsn.file so that we can send it with + * the NEWFILE call. In order to do that, we + * need to try to get {lsn.file + 1, 0} and + * then backup. + */ + endlsn.file = rp->lsn.file + 1; + endlsn.offset = 0; + if ((ret = __logc_get(logc, + &endlsn, &data_dbt, DB_SET)) != 0 || + (ret = __logc_get(logc, + &endlsn, &data_dbt, DB_PREV)) != 0) { + RPRINT(env, DB_VERB_REP_MISC, (env, + "Unable to get prev of [%lu][%lu]", + (u_long)rp->lsn.file, + (u_long)rp->lsn.offset)); + /* + * We want to push the error back + * to the client so that the client + * does an internal backup. The + * client asked for a log record + * we no longer have and it is + * outdated. + * XXX - This could be optimized by + * having the master perform and + * send a REP_UPDATE message. We + * currently want the client to set + * up its 'update' state prior to + * requesting REP_UPDATE_REQ. + * + * If we're a client servicing a request + * just return DB_NOTFOUND. 
+ */ + if (F_ISSET(rep, REP_F_MASTER)) { + ret = 0; + (void)__rep_send_message(env, eid, + REP_VERIFY_FAIL, &rp->lsn, + NULL, 0, 0); + } else + ret = DB_NOTFOUND; + } else { + endlsn.offset += logc->len; + if ((ret = __logc_version(logc, + &nf_args.version)) == 0) { + memset(&newfiledbt, 0, + sizeof(newfiledbt)); + if (rep->version < DB_REPVERSION_47) + DB_INIT_DBT(newfiledbt, + &nf_args.version, + sizeof(nf_args.version)); + else { + if ((ret = __rep_newfile_marshal(env, + &nf_args, buf, __REP_NEWFILE_SIZE, + &len)) != 0) + return (ret); + DB_INIT_DBT(newfiledbt, buf, len); + } + (void)__rep_send_message(env, eid, + REP_NEWFILE, &endlsn, + &newfiledbt, REPCTL_RESEND, 0); + } + } + } else + ret = DB_NOTFOUND; + + return (ret); +} diff --git a/rep/rep_method.c b/rep/rep_method.c new file mode 100644 index 0000000..fb21f7e --- /dev/null +++ b/rep/rep_method.c @@ -0,0 +1,2142 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2001-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" +#include "dbinc/txn.h" + +static int __rep_abort_prepared __P((ENV *)); +static int __rep_bt_cmp __P((DB *, const DBT *, const DBT *)); +static void __rep_config_map __P((ENV *, u_int32_t *, u_int32_t *)); +static u_int32_t __rep_conv_vers __P((ENV *, u_int32_t)); +static int __rep_restore_prepared __P((ENV *)); + +/* + * __rep_env_create -- + * Replication-specific initialization of the ENV structure. 
+ * + * PUBLIC: int __rep_env_create __P((DB_ENV *)); + */ +int +__rep_env_create(dbenv) + DB_ENV *dbenv; +{ + DB_REP *db_rep; + ENV *env; + int ret; + + env = dbenv->env; + + if ((ret = __os_calloc(env, 1, sizeof(DB_REP), &db_rep)) != 0) + return (ret); + + db_rep->eid = DB_EID_INVALID; + db_rep->bytes = REP_DEFAULT_THROTTLE; + DB_TIMEOUT_TO_TIMESPEC(DB_REP_REQUEST_GAP, &db_rep->request_gap); + DB_TIMEOUT_TO_TIMESPEC(DB_REP_MAX_GAP, &db_rep->max_gap); + db_rep->elect_timeout = 2 * US_PER_SEC; /* 2 seconds */ + db_rep->chkpt_delay = 30 * US_PER_SEC; /* 30 seconds */ + db_rep->my_priority = DB_REP_DEFAULT_PRIORITY; + /* + * Make no clock skew the default. Setting both fields + * to the same non-zero value means no skew. + */ + db_rep->clock_skew = 1; + db_rep->clock_base = 1; + +#ifdef HAVE_REPLICATION_THREADS + if ((ret = __repmgr_env_create(env, db_rep)) != 0) { + __os_free(env, db_rep); + return (ret); + } +#endif + + env->rep_handle = db_rep; + return (0); +} + +/* + * __rep_env_destroy -- + * Replication-specific destruction of the ENV structure. + * + * PUBLIC: void __rep_env_destroy __P((DB_ENV *)); + */ +void +__rep_env_destroy(dbenv) + DB_ENV *dbenv; +{ + ENV *env; + + env = dbenv->env; + + if (env->rep_handle != NULL) { +#ifdef HAVE_REPLICATION_THREADS + __repmgr_env_destroy(env, env->rep_handle); +#endif + __os_free(env, env->rep_handle); + env->rep_handle = NULL; + } +} + +/* + * __rep_get_config -- + * Return the replication subsystem configuration. 
+ * + * PUBLIC: int __rep_get_config __P((DB_ENV *, u_int32_t, int *)); + */ +int +__rep_get_config(dbenv, which, onp) + DB_ENV *dbenv; + u_int32_t which; + int *onp; +{ + DB_REP *db_rep; + ENV *env; + REP *rep; + u_int32_t mapped; + + env = dbenv->env; + +#undef OK_FLAGS +#define OK_FLAGS \ + (DB_REP_CONF_BULK | DB_REP_CONF_DELAYCLIENT | DB_REP_CONF_INMEM | \ + DB_REP_CONF_LEASE | DB_REP_CONF_NOAUTOINIT | DB_REP_CONF_NOWAIT | \ + DB_REPMGR_CONF_2SITE_STRICT) + + if (FLD_ISSET(which, ~OK_FLAGS)) + return (__db_ferr(env, "DB_ENV->rep_get_config", 0)); + + db_rep = env->rep_handle; + ENV_NOT_CONFIGURED( + env, db_rep->region, "DB_ENV->rep_get_config", DB_INIT_REP); + + mapped = 0; + __rep_config_map(env, &which, &mapped); + if (REP_ON(env)) { + rep = db_rep->region; + if (FLD_ISSET(rep->config, mapped)) + *onp = 1; + else + *onp = 0; + } else { + if (FLD_ISSET(db_rep->config, mapped)) + *onp = 1; + else + *onp = 0; + } + return (0); +} + +/* + * __rep_set_config -- + * Configure the replication subsystem. 
+ * + * PUBLIC: int __rep_set_config __P((DB_ENV *, u_int32_t, int)); + */ +int +__rep_set_config(dbenv, which, on) + DB_ENV *dbenv; + u_int32_t which; + int on; +{ + DB_LOG *dblp; + DB_REP *db_rep; + DB_THREAD_INFO *ip; + ENV *env; + LOG *lp; + REP *rep; + REP_BULK bulk; + u_int32_t mapped, orig; + int ret; + + env = dbenv->env; + db_rep = env->rep_handle; + ret = 0; + +#undef OK_FLAGS +#define OK_FLAGS \ + (DB_REP_CONF_BULK | DB_REP_CONF_DELAYCLIENT | DB_REP_CONF_INMEM | \ + DB_REP_CONF_LEASE | DB_REP_CONF_NOAUTOINIT | DB_REP_CONF_NOWAIT | \ + DB_REPMGR_CONF_2SITE_STRICT) + + ENV_NOT_CONFIGURED( + env, db_rep->region, "DB_ENV->rep_set_config", DB_INIT_REP); + + if (FLD_ISSET(which, ~OK_FLAGS)) + return (__db_ferr(env, "DB_ENV->rep_set_config", 0)); + + mapped = 0; + __rep_config_map(env, &which, &mapped); + + if (APP_IS_BASEAPI(env) && FLD_ISSET(mapped, REP_C_2SITE_STRICT)) { + __db_errx(env, "%s %s", "DB_ENV->rep_set_config:", +"cannot configure 2SITE_STRICT from base replication application"); + return (EINVAL); + } + + if (REP_ON(env)) { + ENV_ENTER(env, ip); + + rep = db_rep->region; + /* + * In-memory replication must be called before calling + * env->open. If it is turned on and off before env->open, + * it doesn't matter. Any attempt to turn it on or off after + * env->open is intercepted by this error. + */ + if (FLD_ISSET(mapped, REP_C_INMEM)) { + __db_errx(env, "%s %s", "DB_ENV->rep_set_config:", + "in-memory replication must be configured before DB_ENV->open"); + return (EINVAL); + } + /* + * Leases must be turned on before calling rep_start. + * Leases can never be turned off once they're turned on. 
+ */ + if (FLD_ISSET(mapped, REP_C_LEASE)) { + if (F_ISSET(rep, REP_F_START_CALLED)) { + __db_errx(env, +"DB_ENV->rep_set_config: leases must be configured before DB_ENV->rep_start"); + ret = EINVAL; + } + if (on == 0) { + __db_errx(env, + "DB_ENV->rep_set_config: leases cannot be turned off"); + ret = EINVAL; + } + if (ret != 0) + return (ret); + } + MUTEX_LOCK(env, rep->mtx_clientdb); + REP_SYSTEM_LOCK(env); + orig = rep->config; + if (on) + FLD_SET(rep->config, mapped); + else + FLD_CLR(rep->config, mapped); + + /* + * Bulk transfer requires special processing if it is getting + * toggled. + */ + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + if (FLD_ISSET(rep->config, REP_C_BULK) && + !FLD_ISSET(orig, REP_C_BULK)) + db_rep->bulk = R_ADDR(&dblp->reginfo, lp->bulk_buf); + REP_SYSTEM_UNLOCK(env); + + /* + * If turning bulk off and it was on, send out whatever is in + * the buffer already. + */ + if (FLD_ISSET(orig, REP_C_BULK) && + !FLD_ISSET(rep->config, REP_C_BULK) && lp->bulk_off != 0) { + memset(&bulk, 0, sizeof(bulk)); + if (db_rep->bulk == NULL) + bulk.addr = + R_ADDR(&dblp->reginfo, lp->bulk_buf); + else + bulk.addr = db_rep->bulk; + bulk.offp = &lp->bulk_off; + bulk.len = lp->bulk_len; + bulk.type = REP_BULK_LOG; + bulk.eid = DB_EID_BROADCAST; + bulk.flagsp = &lp->bulk_flags; + ret = __rep_send_bulk(env, &bulk, 0); + } + MUTEX_UNLOCK(env, rep->mtx_clientdb); + + ENV_LEAVE(env, ip); + } else { + if (on) + FLD_SET(db_rep->config, mapped); + else + FLD_CLR(db_rep->config, mapped); + } + /* Configuring 2SITE_STRICT makes this a repmgr application */ + if (ret == 0 && FLD_ISSET(mapped, REP_C_2SITE_STRICT)) + APP_SET_REPMGR(env); + return (ret); +} + +static void +__rep_config_map(env, inflagsp, outflagsp) + ENV *env; + u_int32_t *inflagsp, *outflagsp; +{ + COMPQUIET(env, NULL); + + if (FLD_ISSET(*inflagsp, DB_REP_CONF_BULK)) { + FLD_SET(*outflagsp, REP_C_BULK); + FLD_CLR(*inflagsp, DB_REP_CONF_BULK); + } + if (FLD_ISSET(*inflagsp, 
DB_REP_CONF_DELAYCLIENT)) { + FLD_SET(*outflagsp, REP_C_DELAYCLIENT); + FLD_CLR(*inflagsp, DB_REP_CONF_DELAYCLIENT); + } + if (FLD_ISSET(*inflagsp, DB_REP_CONF_INMEM)) { + FLD_SET(*outflagsp, REP_C_INMEM); + FLD_CLR(*inflagsp, DB_REP_CONF_INMEM); + } + if (FLD_ISSET(*inflagsp, DB_REP_CONF_LEASE)) { + FLD_SET(*outflagsp, REP_C_LEASE); + FLD_CLR(*inflagsp, DB_REP_CONF_LEASE); + } + if (FLD_ISSET(*inflagsp, DB_REP_CONF_NOAUTOINIT)) { + FLD_SET(*outflagsp, REP_C_NOAUTOINIT); + FLD_CLR(*inflagsp, DB_REP_CONF_NOAUTOINIT); + } + if (FLD_ISSET(*inflagsp, DB_REP_CONF_NOWAIT)) { + FLD_SET(*outflagsp, REP_C_NOWAIT); + FLD_CLR(*inflagsp, DB_REP_CONF_NOWAIT); + } + if (FLD_ISSET(*inflagsp, DB_REPMGR_CONF_2SITE_STRICT)) { + FLD_SET(*outflagsp, REP_C_2SITE_STRICT); + FLD_CLR(*inflagsp, DB_REPMGR_CONF_2SITE_STRICT); + } +} + +/* + * __rep_start_pp -- + * Become a master or client, and start sending messages to participate + * in the replication environment. Must be called after the environment + * is open. + * + * PUBLIC: int __rep_start_pp __P((DB_ENV *, DBT *, u_int32_t)); + */ +int +__rep_start_pp(dbenv, dbt, flags) + DB_ENV *dbenv; + DBT *dbt; + u_int32_t flags; +{ + DB_REP *db_rep; + ENV *env; + + env = dbenv->env; + db_rep = env->rep_handle; + + ENV_REQUIRES_CONFIG_XX( + env, rep_handle, "DB_ENV->rep_start", DB_INIT_REP); + + if (APP_IS_REPMGR(env)) { + __db_errx(env, +"DB_ENV->rep_start: cannot call from Replication Manager application"); + return (EINVAL); + } + + switch (LF_ISSET(DB_REP_CLIENT | DB_REP_MASTER)) { + case DB_REP_CLIENT: + case DB_REP_MASTER: + break; + default: + __db_errx(env, + "DB_ENV->rep_start: must specify DB_REP_CLIENT or DB_REP_MASTER"); + return (EINVAL); + } + + /* We need a transport function because we send messages. 
*/ + if (db_rep->send == NULL) { + __db_errx(env, + "DB_ENV->rep_start: must be called after DB_ENV->rep_set_transport"); + return (EINVAL); + } + + return (__rep_start_int(env, dbt, flags)); +} + +/* + * __rep_start_int -- + * Internal processing to become a master or client and start sending + * messages to participate in the replication environment. + * + * We must protect rep_start_int, which may change the world, with the rest + * of the DB library. Each API interface will count itself as it enters + * the library. Rep_start_int checks the following: + * + * rep->msg_th - this is the count of threads currently in rep_process_message + * rep->handle_cnt - number of threads actively using a dbp in library. + * rep->txn_cnt - number of active txns. + * REP_F_READY_* - Replication flag that indicates that we wish to run + * recovery, and want to prohibit new transactions from entering and cause + * existing ones to return immediately (with a DB_LOCK_DEADLOCK error). + * + * There is also the renv->rep_timestamp which is updated whenever significant + * events (i.e., new masters, log rollback, etc). Upon creation, a handle + * is associated with the current timestamp. Each time a handle enters the + * library it must check if the handle timestamp is the same as the one + * stored in the replication region. This prevents the use of handles on + * clients that reference non-existent files whose creation was backed out + * during a synchronizing recovery. 
+ * + * PUBLIC: int __rep_start_int __P((ENV *, DBT *, u_int32_t)); + */ +int +__rep_start_int(env, dbt, flags) + ENV *env; + DBT *dbt; + u_int32_t flags; +{ + DB *dbp; + DB_LOG *dblp; + DB_LOGC *logc; + DB_LSN lsn, perm_lsn; + DB_REP *db_rep; + DB_THREAD_INFO *ip; + DB_TXNREGION *region; + LOG *lp; + REGENV *renv; + REGINFO *infop; + REP *rep; + db_timeout_t tmp; + u_int32_t oldvers, pending_event, repflags, role; + int do_ckp, interrupting, locked, ret, role_chg, start_th, t_ret; + + db_rep = env->rep_handle; + rep = db_rep->region; + infop = env->reginfo; + renv = infop->primary; + interrupting = locked = 0; + pending_event = DB_EVENT_NO_SUCH_EVENT; + role = LF_ISSET(DB_REP_CLIENT | DB_REP_MASTER); + start_th = 0; + do_ckp = 0; + + /* + * If we're using master leases, check that all needed + * setup has been done, including setting the lease timeout. + */ + if (IS_USING_LEASES(env) && rep->lease_timeout == 0) { + __db_errx(env, +"DB_ENV->rep_start: must call DB_ENV->rep_set_timeout for leases first"); + return (EINVAL); + } + + ENV_ENTER(env, ip); + + /* + * In order to correctly check log files for old versions, we + * need to flush the logs. + */ + if ((ret = __log_flush(env, NULL)) != 0) + goto out; + + REP_SYSTEM_LOCK(env); + /* + * We only need one thread to start-up replication, so if + * there is another thread in rep_start, we'll let it finish + * its work and have this thread simply return. Similarly, + * if a thread is in a critical lockout section we return. + */ + if (F_ISSET(rep, REP_F_INREPSTART)) { + /* + * There is already someone in rep_start. Return. + */ + RPRINT(env, DB_VERB_REP_MISC, + (env, "Thread already in rep_start")); + REP_SYSTEM_UNLOCK(env); + goto out; + } else { + F_SET(rep, REP_F_INREPSTART); + start_th = 1; + } + + if (F_ISSET(rep, REP_F_READY_MSG)) { + /* + * There is already someone in msg lockout. Return. 
+ */ + RPRINT(env, DB_VERB_REP_MISC, + (env, "Thread already in msg lockout")); + REP_SYSTEM_UNLOCK(env); + goto out; + } else if ((ret = __rep_lockout_msg(env, rep, 0)) != 0) + goto errunlock; + + /* + * If we are internal init and we try to become master, reject it. + * Our environment databases/logs are in an inconsistent state and + * we cannot become master. + */ + if (IN_INTERNAL_INIT(rep) && role == DB_REP_MASTER) { + __db_errx(env, +"DB_ENV->rep_start: Cannot become master during internal init"); + ret = DB_REP_UNAVAIL; + goto errunlock; + } + + role_chg = (!F_ISSET(rep, REP_F_MASTER) && role == DB_REP_MASTER) || + (!F_ISSET(rep, REP_F_CLIENT) && role == DB_REP_CLIENT); + + /* + * Wait for any active txns or mpool ops to complete, and + * prevent any new ones from occurring, only if we're + * changing roles. + */ + if (role_chg) { + if ((ret = __rep_lockout_api(env, rep)) != 0) + goto errunlock; + locked = 1; + } + + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + if (role == DB_REP_MASTER) { + if (role_chg) { + /* + * If we were previously a client, it's possible we + * could have an interruptible STARTSYNC in progress. + * Interrupt it now, so that it doesn't slow down our + * transition to master, and because its effects aren't + * doing us any good anyway. + */ + (void)__memp_set_config( + env->dbenv, DB_MEMP_SYNC_INTERRUPT, 1); + interrupting = 1; + + /* + * If we're upgrading from having been a client, + * preclose, so that we close our temporary database + * and any files we opened while doing a rep_apply. + * If we don't we can infinitely leak file ids if + * the master crashed with files open (the likely + * case). If we don't close them we can run into + * problems if we try to remove that file or long + * running applications end up with an unbounded + * number of used fileids, each getting written + * on checkpoint. Just close them. + * Then invalidate all files open in the logging + * region. 
These are files open by other processes + * attached to the environment. They must be + * closed by the other processes when they notice + * the change in role. + */ + if ((ret = __rep_preclose(env)) != 0) + goto errunlock; + + rep->gen++; + /* + * There could have been any number of failed + * elections, so jump the gen if we need to now. + */ + if (rep->egen > rep->gen) + rep->gen = rep->egen; + if (IS_USING_LEASES(env) && + !F_ISSET(rep, REP_F_MASTERELECT)) { + __db_errx(env, + "rep_start: Cannot become master without being elected when using leases."); + ret = EINVAL; + goto errunlock; + } + if (F_ISSET(rep, REP_F_MASTERELECT)) { + __rep_elect_done(env, rep, 0); + F_CLR(rep, REP_F_MASTERELECT); + } + if (rep->egen <= rep->gen) + rep->egen = rep->gen + 1; + RPRINT(env, DB_VERB_REP_MISC, (env, + "New master gen %lu, egen %lu", + (u_long)rep->gen, (u_long)rep->egen)); + /* + * If not running in-memory replication, write + * gen file. + */ + if (!FLD_ISSET(rep->config, REP_C_INMEM)) { + if ((ret = __rep_write_gen(env, rep, rep->gen)) + != 0) + goto errunlock; + } else if (!F_ISSET(rep, REP_F_MASTERELECT)) + /* + * Help detect if application has + * ignored our recommendation against + * reappointing same master after a + * crash/reboot when running in-memory + * replication. Doing this allows a + * slight chance of two masters at the + * same generation resulting in client + * crashes. + */ + RPRINT(env, DB_VERB_REP_MISC, (env, + "Appointed new master while running in-memory replication.")); + } + /* + * Set lease duration assuming clients have faster clock. + * Master needs to compensate so that clients do not + * expire their grant while the master thinks it is valid. + */ + if (IS_USING_LEASES(env) && + (role_chg || !IS_REP_STARTED(env))) { + /* + * If we have already granted our lease, we + * cannot become master. 
+ */ + if ((ret = __rep_islease_granted(env))) { + __db_errx(env, + "rep_start: Cannot become master with outstanding lease granted."); + ret = EINVAL; + goto errunlock; + } + /* + * Set max_perm_lsn to last PERM record on master. + */ + if ((ret = __log_cursor(env, &logc)) != 0) + goto errunlock; + ret = __rep_log_backup(env, rep, logc, &perm_lsn); + (void)__logc_close(logc); + /* + * If we found a perm LSN use it. Otherwise, if + * no perm LSN exists, initialize. + */ + if (ret == 0) + lp->max_perm_lsn = perm_lsn; + else if (ret == DB_NOTFOUND) { + /* + * If we have no perm records, we want to + * force (later) a checkpoint to the log. + * By doing this now, we avoid a sticky + * deadlock with a txn. We need a perm + * record for leases, but if the first perm + * record is a txn, that txn cannot commit + * without leases refreshed. A client may + * be in internal init and cannot sync up if + * it needs to read pages the txn holds write + * locks on and we have an impasse. This + * checkpoint will allow leases to be granted + * on this perm record first and that does not + * need any locks. + */ + do_ckp = 1; + INIT_LSN(lp->max_perm_lsn); + } else + goto errunlock; + + /* + * Simply compute the larger ratio for the lease. + */ + tmp = (db_timeout_t)((double)rep->lease_timeout / + ((double)rep->clock_skew / + (double)rep->clock_base)); + DB_TIMEOUT_TO_TIMESPEC(tmp, &rep->lease_duration); + if ((ret = __rep_lease_table_alloc(env, + rep->nsites)) != 0) + goto errunlock; + } + rep->master_id = rep->eid; + STAT(rep->stat.st_master_changes++); + + /* + * Clear out almost everything, and then set MASTER. Leave + * READY_* alone in case we did a lockout above; + * we'll clear it in a moment (below), once we've written + * the txn_recycle into the log. 
+ */ + repflags = F_ISSET(rep, REP_F_INREPSTART | REP_F_READY_API | + REP_F_READY_MSG | REP_F_READY_OP | REP_F_STICKY_MASK); +#ifdef DIAGNOSTIC + if (!F_ISSET(rep, REP_F_GROUP_ESTD)) + RPRINT(env, DB_VERB_REP_MISC, (env, + "Establishing group as master.")); +#endif + FLD_SET(repflags, REP_F_MASTER | + REP_F_GROUP_ESTD | REP_F_NIMDBS_LOADED); + rep->flags = repflags; + + /* + * We're master. Set the versions to the current ones. + */ + oldvers = lp->persist.version; + /* + * If we're moving forward to the current version, we need + * to force the log file to advance and reset the + * recovery table since it contains pointers to old + * recovery functions. + */ + RPRINT(env, DB_VERB_REP_MISC, (env, + "rep_start: Old log version was %lu", (u_long)oldvers)); + if (lp->persist.version != DB_LOGVERSION) { + if ((ret = __env_init_rec(env, DB_LOGVERSION)) != 0) + goto errunlock; + } + rep->version = DB_REPVERSION; + F_CLR(rep, REP_F_READY_MSG); + REP_SYSTEM_UNLOCK(env); + LOG_SYSTEM_LOCK(env); + lsn = lp->lsn; + LOG_SYSTEM_UNLOCK(env); + + /* + * Send the NEWMASTER message first so that clients know + * subsequent messages are coming from the right master. + * We need to perform all actions below no matter what + * regarding errors. + */ + (void)__rep_send_message(env, + DB_EID_BROADCAST, REP_NEWMASTER, &lsn, NULL, 0, 0); + ret = 0; + if (role_chg) { + pending_event = DB_EVENT_REP_MASTER; + /* + * If prepared transactions have not been restored + * look to see if there are any. If there are, + * then mark the open files, otherwise close them. 
+ */ + region = env->tx_handle->reginfo.primary; + if (region->stat.st_nrestores == 0 && + (t_ret = __rep_restore_prepared(env)) != 0 && + ret == 0) + ret = t_ret; + if (region->stat.st_nrestores != 0) { + if ((t_ret = __dbreg_mark_restored(env)) != 0 && + ret == 0) + ret = t_ret; + } else { + ret = __dbreg_invalidate_files(env, 0); + if ((t_ret = __rep_closefiles(env)) != 0 && + ret == 0) + ret = t_ret; + } + if ((t_ret = __txn_recycle_id(env)) != 0 && ret == 0) + ret = t_ret; + REP_SYSTEM_LOCK(env); + F_CLR(rep, REP_F_READY_API | REP_F_READY_OP); + locked = 0; + REP_SYSTEM_UNLOCK(env); + (void)__memp_set_config( + env->dbenv, DB_MEMP_SYNC_INTERRUPT, 0); + interrupting = 0; + /* + * Force a checkpoint if this new master has no + * perm record yet. + */ + if (ret == 0 && do_ckp) + ret = __txn_checkpoint(env, 0, 0, + DB_CKP_INTERNAL | DB_FORCE); + } + } else { + if (role_chg) + rep->master_id = DB_EID_INVALID; + /* + * Zero out "everything" except recovery and tally flags. + */ + repflags = F_ISSET(rep, + REP_F_INREPSTART | REP_F_NOARCHIVE | REP_F_READY_MSG | + REP_F_RECOVER_MASK | REP_F_TALLY | REP_F_STICKY_MASK); + FLD_SET(repflags, REP_F_CLIENT); + if (role_chg) { + if ((ret = __log_get_oldversion(env, &oldvers)) != 0) + goto errunlock; + RPRINT(env, DB_VERB_REP_MISC, (env, + "rep_start: Found old version log %d", oldvers)); + if (oldvers >= DB_LOGVERSION_MIN) { + __log_set_version(env, oldvers); + oldvers = __rep_conv_vers(env, oldvers); + DB_ASSERT( + env, oldvers != DB_REPVERSION_INVALID); + rep->version = oldvers; + } + } + rep->flags = repflags; + /* + * On a client, compute the lease duration on the + * assumption that the client has a fast clock. + * Expire any existing leases we might have held as + * a master. 
+ */ + if (IS_USING_LEASES(env) && + (role_chg || !IS_REP_STARTED(env))) { + if ((ret = __rep_lease_expire(env)) != 0) + goto errunlock; + /* + * Since the master is also compensating on its + * side as well, we're being doubly conservative + * to compensate on the client side. Theoretically, + * this compensation is not necessary, as it is + * effectively doubling the skew compensation. + * But we are making guarantees based on time and + * skews across machines. So we are being extra + * cautious. + */ + tmp = (db_timeout_t)((double)rep->lease_timeout * + ((double)rep->clock_skew / + (double)rep->clock_base)); + DB_TIMEOUT_TO_TIMESPEC(tmp, &rep->lease_duration); + if (rep->lease_off != INVALID_ROFF) { + MUTEX_LOCK(env, renv->mtx_regenv); + __env_alloc_free(infop, + R_ADDR(infop, rep->lease_off)); + MUTEX_UNLOCK(env, renv->mtx_regenv); + rep->lease_off = INVALID_ROFF; + } + } + REP_SYSTEM_UNLOCK(env); + + /* + * Abort any prepared transactions that were restored + * by recovery. We won't be able to create any txns of + * our own until they're resolved, but we can't resolve + * them ourselves; the master has to. If any get + * resolved as commits, we'll redo them when commit + * records come in. Aborts will simply be ignored. + */ + if ((ret = __rep_abort_prepared(env)) != 0) + goto errlock; + + /* + * If we're changing roles we need to init the db. + */ + if (role_chg) { + if ((ret = __db_create_internal(&dbp, env, 0)) != 0) + goto errlock; + /* + * Ignore errors, because if the file doesn't exist, + * this is perfectly OK. + */ + MUTEX_LOCK(env, rep->mtx_clientdb); + (void)__db_remove(dbp, ip, NULL, REPDBNAME, + NULL, DB_FORCE); + MUTEX_UNLOCK(env, rep->mtx_clientdb); + /* + * Set pending_event after calls that can fail. 
+ */ + pending_event = DB_EVENT_REP_CLIENT; + } + REP_SYSTEM_LOCK(env); + F_CLR(rep, REP_F_READY_MSG); + if (locked) { + F_CLR(rep, REP_F_READY_API | REP_F_READY_OP); + locked = 0; + } + REP_SYSTEM_UNLOCK(env); + + if ((role_chg || rep->master_id == DB_EID_INVALID) && + F_ISSET(env, ENV_PRIVATE)) + /* + * If we think we're a new client, and we have a + * private env, set our gen number down to 0. + * Otherwise, we can restart and think + * we're ready to accept a new record (because our + * gen is okay), but really this client needs to + * sync with the master. + */ + rep->gen = 0; + + /* + * Announce ourselves and send out our data. + */ + if ((ret = __dbt_usercopy(env, dbt)) != 0) + goto out; + (void)__rep_send_message(env, + DB_EID_BROADCAST, REP_NEWCLIENT, NULL, dbt, 0, 0); + } + + if (0) { + /* + * We have separate labels for errors. If we're returning an + * error before we've set REP_F_READY_MSG, we use 'err'. If + * we are erroring while holding the region mutex, then we use + * 'errunlock' label. If we error without holding the rep + * mutex we must use 'errlock'. + */ +errlock: REP_SYSTEM_LOCK(env); +errunlock: F_CLR(rep, REP_F_READY_MSG); + if (locked) + F_CLR(rep, REP_F_READY_API | REP_F_READY_OP); + if (interrupting) + (void)__memp_set_config( + env->dbenv, DB_MEMP_SYNC_INTERRUPT, 0); + REP_SYSTEM_UNLOCK(env); + } +out: + if (ret == 0) { + REP_SYSTEM_LOCK(env); + F_SET(rep, REP_F_START_CALLED); + REP_SYSTEM_UNLOCK(env); + } + if (start_th) { + REP_SYSTEM_LOCK(env); + F_CLR(rep, REP_F_INREPSTART); + REP_SYSTEM_UNLOCK(env); + } + if (pending_event != DB_EVENT_NO_SUCH_EVENT) + __rep_fire_event(env, pending_event, NULL); + __dbt_userfree(env, dbt, NULL, NULL); + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __rep_client_dbinit -- + * + * Initialize the LSN database on the client side. This is called from the + * client initialization code. 
The startup flag value indicates if + * this is the first thread/process starting up and therefore should create + * the LSN database. This routine must be called once by each process acting + * as a client. + * + * Assumes caller holds appropriate mutex. + * + * PUBLIC: int __rep_client_dbinit __P((ENV *, int, repdb_t)); + */ +int +__rep_client_dbinit(env, startup, which) + ENV *env; + int startup; + repdb_t which; +{ + DB *dbp, **rdbpp; + DB_REP *db_rep; + DB_THREAD_INFO *ip; + REP *rep; + int ret, t_ret; + u_int32_t flags; + const char *fname, *name, *subdb; + + db_rep = env->rep_handle; + rep = db_rep->region; + dbp = NULL; + + if (which == REP_DB) { + name = REPDBNAME; + rdbpp = &db_rep->rep_db; + } else { + name = REPPAGENAME; + rdbpp = &rep->file_dbp; + } + /* Check if this has already been called on this environment. */ + if (*rdbpp != NULL) + return (0); + + ENV_GET_THREAD_INFO(env, ip); + + /* Set up arguments for __db_remove and __db_open calls. */ + fname = name; + subdb = NULL; + if (FLD_ISSET(rep->config, REP_C_INMEM)) { + fname = NULL; + subdb = name; + } + + if (startup) { + if ((ret = __db_create_internal(&dbp, env, 0)) != 0) + goto err; + /* + * Prevent in-memory database remove from writing to + * non-existent logs. + */ + if (FLD_ISSET(rep->config, REP_C_INMEM)) + (void)__db_set_flags(dbp, DB_TXN_NOT_DURABLE); + /* + * Ignore errors, because if the file doesn't exist, this + * is perfectly OK. + */ + (void)__db_remove(dbp, ip, NULL, fname, subdb, DB_FORCE); + } + + if ((ret = __db_create_internal(&dbp, env, 0)) != 0) + goto err; + if (which == REP_DB && + (ret = __bam_set_bt_compare(dbp, __rep_bt_cmp)) != 0) + goto err; + + /* Don't write log records on the client. */ + if ((ret = __db_set_flags(dbp, DB_TXN_NOT_DURABLE)) != 0) + goto err; + + flags = DB_NO_AUTO_COMMIT | DB_CREATE | + (F_ISSET(env, ENV_THREAD) ? DB_THREAD : 0); + + if ((ret = __db_open(dbp, ip, NULL, fname, subdb, + (which == REP_DB ? 
DB_BTREE : DB_RECNO), + flags, 0, PGNO_BASE_MD)) != 0) + goto err; + + *rdbpp = dbp; + + if (0) { +err: if (dbp != NULL && + (t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0 && ret == 0) + ret = t_ret; + *rdbpp = NULL; + } + + return (ret); +} + +/* + * __rep_bt_cmp -- + * + * Comparison function for the LSN table. We use the entire control + * structure as a key (for simplicity, so we don't have to merge the + * other fields in the control with the data field), but really only + * care about the LSNs. + */ +static int +__rep_bt_cmp(dbp, dbt1, dbt2) + DB *dbp; + const DBT *dbt1, *dbt2; +{ + DB_LSN lsn1, lsn2; + __rep_control_args *rp1, *rp2; + + COMPQUIET(dbp, NULL); + + rp1 = dbt1->data; + rp2 = dbt2->data; + + (void)__ua_memcpy(&lsn1, &rp1->lsn, sizeof(DB_LSN)); + (void)__ua_memcpy(&lsn2, &rp2->lsn, sizeof(DB_LSN)); + + if (lsn1.file > lsn2.file) + return (1); + + if (lsn1.file < lsn2.file) + return (-1); + + if (lsn1.offset > lsn2.offset) + return (1); + + if (lsn1.offset < lsn2.offset) + return (-1); + + return (0); +} + +/* + * __rep_abort_prepared -- + * Abort any prepared transactions that recovery restored. + * + * This is used by clients that have just run recovery, since + * they cannot/should not call txn_recover and handle prepared transactions + * themselves. 
+ */ +static int +__rep_abort_prepared(env) + ENV *env; +{ +#define PREPLISTSIZE 50 + DB_LOG *dblp; + DB_PREPLIST prep[PREPLISTSIZE], *p; + DB_TXNMGR *mgr; + DB_TXNREGION *region; + LOG *lp; + int ret; + u_int32_t count, i; + u_int32_t op; + + mgr = env->tx_handle; + region = mgr->reginfo.primary; + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + + if (region->stat.st_nrestores == 0) + return (0); + + op = DB_FIRST; + do { + if ((ret = __txn_recover(env, + prep, PREPLISTSIZE, &count, op)) != 0) + return (ret); + for (i = 0; i < count; i++) { + p = &prep[i]; + if ((ret = __txn_abort(p->txn)) != 0) + return (ret); + env->rep_handle->region->op_cnt--; + env->rep_handle->region->max_prep_lsn = lp->lsn; + region->stat.st_nrestores--; + } + op = DB_NEXT; + } while (count == PREPLISTSIZE); + + return (0); +} + +/* + * __rep_restore_prepared -- + * Restore to a prepared state any prepared but not yet committed + * transactions. + * + * This performs, in effect, a "mini-recovery"; it is called from + * __rep_start by newly upgraded masters. There may be transactions that an + * old master prepared but did not resolve, which we need to restore to an + * active state. + */ +static int +__rep_restore_prepared(env) + ENV *env; +{ + DBT rec; + DB_LOGC *logc; + DB_LSN ckp_lsn, lsn; + DB_REP *db_rep; + DB_TXNHEAD *txninfo; + REP *rep; + __txn_ckp_args *ckp_args; + __txn_regop_args *regop_args; + __txn_prepare_args *prep_args; + int ret, t_ret; + u_int32_t hi_txn, low_txn, rectype, status, txnid, txnop; + + db_rep = env->rep_handle; + rep = db_rep->region; + if (IS_ZERO_LSN(rep->max_prep_lsn)) { + RPRINT(env, DB_VERB_REP_MISC, + (env, "restore_prep: No prepares. Skip.")); + return (0); + } + txninfo = NULL; + ckp_args = NULL; + prep_args = NULL; + regop_args = NULL; + ZERO_LSN(ckp_lsn); + ZERO_LSN(lsn); + + if ((ret = __log_cursor(env, &logc)) != 0) + return (ret); + + /* + * Get our first LSN to see if the prepared LSN is still + * available. If so, it might be unresolved. 
If not, + * then it is guaranteed to be resolved. + */ + memset(&rec, 0, sizeof(DBT)); + if ((ret = __logc_get(logc, &lsn, &rec, DB_FIRST)) != 0) { + __db_errx(env, "First record not found"); + goto err; + } + /* + * If the max_prep_lsn is no longer available, we're sure + * that txn has been resolved. We're done. + */ + if (rep->max_prep_lsn.file < lsn.file) { + RPRINT(env, DB_VERB_REP_MISC, + (env, "restore_prep: Prepare resolved. Skip")); + ZERO_LSN(rep->max_prep_lsn); + goto done; + } + /* + * We need to consider the set of records between the most recent + * checkpoint LSN and the end of the log; any txn in that + * range, and only txns in that range, could still have been + * active, and thus prepared but not yet committed (PBNYC), + * when the old master died. + * + * Find the most recent checkpoint LSN, and get the record there. + * If there is no checkpoint in the log, start off by getting + * the very first record in the log instead. + */ + if ((ret = __txn_getckp(env, &lsn)) == 0) { + if ((ret = __logc_get(logc, &lsn, &rec, DB_SET)) != 0) { + __db_errx(env, + "Checkpoint record at LSN [%lu][%lu] not found", + (u_long)lsn.file, (u_long)lsn.offset); + goto err; + } + + if ((ret = __txn_ckp_read( + env, rec.data, &ckp_args)) == 0) { + ckp_lsn = ckp_args->ckp_lsn; + __os_free(env, ckp_args); + } + if (ret != 0) { + __db_errx(env, + "Invalid checkpoint record at [%lu][%lu]", + (u_long)lsn.file, (u_long)lsn.offset); + goto err; + } + + if ((ret = __logc_get(logc, &ckp_lsn, &rec, DB_SET)) != 0) { + __db_errx(env, + "Checkpoint LSN record [%lu][%lu] not found", + (u_long)ckp_lsn.file, (u_long)ckp_lsn.offset); + goto err; + } + } else if ((ret = __logc_get(logc, &lsn, &rec, DB_FIRST)) != 0) { + if (ret == DB_NOTFOUND) { + /* An empty log means no PBNYC txns. 
*/ + ret = 0; + goto done; + } + __db_errx(env, "Attempt to get first log record failed"); + goto err; + } + + /* + * We use the same txnlist infrastructure that recovery does; + * it demands an estimate of the high and low txnids for + * initialization. + * + * First, the low txnid. + */ + do { + /* txnid is after rectype, which is a u_int32. */ + LOGCOPY_32(env, &low_txn, + (u_int8_t *)rec.data + sizeof(u_int32_t)); + if (low_txn != 0) + break; + } while ((ret = __logc_get(logc, &lsn, &rec, DB_NEXT)) == 0); + + /* If there are no txns, there are no PBNYC txns. */ + if (ret == DB_NOTFOUND) { + ret = 0; + goto done; + } else if (ret != 0) + goto err; + + /* Now, the high txnid. */ + if ((ret = __logc_get(logc, &lsn, &rec, DB_LAST)) != 0) { + /* + * Note that DB_NOTFOUND is unacceptable here because we + * had to have looked at some log record to get this far. + */ + __db_errx(env, "Final log record not found"); + goto err; + } + do { + /* txnid is after rectype, which is a u_int32. */ + LOGCOPY_32(env, &hi_txn, + (u_int8_t *)rec.data + sizeof(u_int32_t)); + if (hi_txn != 0) + break; + } while ((ret = __logc_get(logc, &lsn, &rec, DB_PREV)) == 0); + if (ret == DB_NOTFOUND) { + ret = 0; + goto done; + } else if (ret != 0) + goto err; + + /* We have a high and low txnid. Initialise the txn list. */ + if ((ret = __db_txnlist_init(env, + NULL, low_txn, hi_txn, NULL, &txninfo)) != 0) + goto err; + + /* + * Now, walk backward from the end of the log to ckp_lsn. Any + * prepares that we hit without first hitting a commit or + * abort belong to PBNYC txns, and we need to apply them and + * restore them to a prepared state. + * + * Note that we wind up applying transactions out of order. + * Since all PBNYC txns still held locks on the old master and + * were isolated, this should be safe. 
+ */ + F_SET(env->lg_handle, DBLOG_RECOVER); + for (ret = __logc_get(logc, &lsn, &rec, DB_LAST); + ret == 0 && LOG_COMPARE(&lsn, &ckp_lsn) > 0; + ret = __logc_get(logc, &lsn, &rec, DB_PREV)) { + LOGCOPY_32(env, &rectype, rec.data); + switch (rectype) { + case DB___txn_regop: + /* + * It's a commit or abort--but we don't care + * which! Just add it to the list of txns + * that are resolved. + */ + if ((ret = __txn_regop_read( + env, rec.data, ®op_args)) != 0) + goto err; + txnid = regop_args->txnp->txnid; + txnop = regop_args->opcode; + __os_free(env, regop_args); + + ret = __db_txnlist_find(env, + txninfo, txnid, &status); + if (ret == DB_NOTFOUND) + ret = __db_txnlist_add(env, txninfo, + txnid, txnop, &lsn); + else if (ret != 0) + goto err; + break; + case DB___txn_prepare: + /* + * It's a prepare. If its not aborted and + * we haven't put the txn on our list yet, it + * hasn't been resolved, so apply and restore it. + */ + if ((ret = __txn_prepare_read( + env, rec.data, &prep_args)) != 0) + goto err; + ret = __db_txnlist_find(env, txninfo, + prep_args->txnp->txnid, &status); + if (ret == DB_NOTFOUND) { + if (prep_args->opcode == TXN_ABORT) + ret = __db_txnlist_add(env, txninfo, + prep_args->txnp->txnid, + prep_args->opcode, &lsn); + else if ((ret = + __rep_process_txn(env, &rec)) == 0) { + /* + * We are guaranteed to be single + * threaded here. We need to + * account for this newly + * instantiated txn in the op_cnt + * so that it is counted when it is + * resolved. + */ + rep->op_cnt++; + ret = __txn_restore_txn(env, + &lsn, prep_args); + } + } else if (ret != 0) + goto err; + __os_free(env, prep_args); + break; + default: + continue; + } + } + + /* It's not an error to have hit the beginning of the log. */ + if (ret == DB_NOTFOUND) + ret = 0; + +done: +err: t_ret = __logc_close(logc); + F_CLR(env->lg_handle, DBLOG_RECOVER); + + if (txninfo != NULL) + __db_txnlist_end(env, txninfo); + + return (ret == 0 ? 
t_ret : ret); +} + +/* + * __rep_get_limit -- + * Get the limit on the amount of data that will be sent during a single + * invocation of __rep_process_message. + * + * PUBLIC: int __rep_get_limit __P((DB_ENV *, u_int32_t *, u_int32_t *)); + */ +int +__rep_get_limit(dbenv, gbytesp, bytesp) + DB_ENV *dbenv; + u_int32_t *gbytesp, *bytesp; +{ + DB_REP *db_rep; + DB_THREAD_INFO *ip; + ENV *env; + REP *rep; + + env = dbenv->env; + db_rep = env->rep_handle; + + ENV_NOT_CONFIGURED( + env, db_rep->region, "DB_ENV->rep_get_limit", DB_INIT_REP); + + if (REP_ON(env)) { + rep = db_rep->region; + ENV_ENTER(env, ip); + REP_SYSTEM_LOCK(env); + if (gbytesp != NULL) + *gbytesp = rep->gbytes; + if (bytesp != NULL) + *bytesp = rep->bytes; + REP_SYSTEM_UNLOCK(env); + ENV_LEAVE(env, ip); + } else { + if (gbytesp != NULL) + *gbytesp = db_rep->gbytes; + if (bytesp != NULL) + *bytesp = db_rep->bytes; + } + + return (0); +} + +/* + * __rep_set_limit -- + * Set a limit on the amount of data that will be sent during a single + * invocation of __rep_process_message. 
+ * + * PUBLIC: int __rep_set_limit __P((DB_ENV *, u_int32_t, u_int32_t)); + */ +int +__rep_set_limit(dbenv, gbytes, bytes) + DB_ENV *dbenv; + u_int32_t gbytes, bytes; +{ + DB_REP *db_rep; + DB_THREAD_INFO *ip; + ENV *env; + REP *rep; + + env = dbenv->env; + db_rep = env->rep_handle; + + ENV_NOT_CONFIGURED( + env, db_rep->region, "DB_ENV->rep_set_limit", DB_INIT_REP); + + if (bytes > GIGABYTE) { + gbytes += bytes / GIGABYTE; + bytes = bytes % GIGABYTE; + } + + if (REP_ON(env)) { + rep = db_rep->region; + ENV_ENTER(env, ip); + REP_SYSTEM_LOCK(env); + rep->gbytes = gbytes; + rep->bytes = bytes; + REP_SYSTEM_UNLOCK(env); + ENV_LEAVE(env, ip); + } else { + db_rep->gbytes = gbytes; + db_rep->bytes = bytes; + } + + return (0); +} + +/* + * PUBLIC: int __rep_set_nsites __P((DB_ENV *, u_int32_t)); + */ +int +__rep_set_nsites(dbenv, n) + DB_ENV *dbenv; + u_int32_t n; +{ + DB_REP *db_rep; + ENV *env; + REP *rep; + + env = dbenv->env; + db_rep = env->rep_handle; + + ENV_NOT_CONFIGURED( + env, db_rep->region, "DB_ENV->rep_set_nsites", DB_INIT_REP); + + if (IS_USING_LEASES(env) && IS_REP_STARTED(env)) { + __db_errx(env, + "DB_ENV->rep_set_nsites: must be called before DB_ENV->rep_start"); + return (EINVAL); + } + + if (REP_ON(env)) { + rep = db_rep->region; + rep->config_nsites = n; + } else + db_rep->config_nsites = n; + return (0); +} + +/* + * PUBLIC: int __rep_get_nsites __P((DB_ENV *, u_int32_t *)); + */ +int +__rep_get_nsites(dbenv, n) + DB_ENV *dbenv; + u_int32_t *n; +{ + DB_REP *db_rep; + ENV *env; + REP *rep; + + env = dbenv->env; + db_rep = env->rep_handle; + + ENV_NOT_CONFIGURED( + env, db_rep->region, "DB_ENV->rep_get_nsites", DB_INIT_REP); + + if (REP_ON(env)) { + rep = db_rep->region; + *n = rep->config_nsites; + } else + *n = db_rep->config_nsites; + + return (0); +} + +/* + * PUBLIC: int __rep_set_priority __P((DB_ENV *, u_int32_t)); + */ +int +__rep_set_priority(dbenv, priority) + DB_ENV *dbenv; + u_int32_t priority; +{ + DB_REP *db_rep; + ENV *env; + REP *rep; 
+ + env = dbenv->env; + db_rep = env->rep_handle; + + ENV_NOT_CONFIGURED( + env, db_rep->region, "DB_ENV->rep_set_priority", DB_INIT_REP); + + if (REP_ON(env)) { + rep = db_rep->region; + rep->priority = priority; + } else + db_rep->my_priority = priority; + return (0); +} + +/* + * PUBLIC: int __rep_get_priority __P((DB_ENV *, u_int32_t *)); + */ +int +__rep_get_priority(dbenv, priority) + DB_ENV *dbenv; + u_int32_t *priority; +{ + DB_REP *db_rep; + ENV *env; + REP *rep; + + env = dbenv->env; + db_rep = env->rep_handle; + + ENV_NOT_CONFIGURED( + env, db_rep->region, "DB_ENV->rep_get_priority", DB_INIT_REP); + + if (REP_ON(env)) { + rep = db_rep->region; + *priority = rep->priority; + } else + *priority = db_rep->my_priority; + return (0); +} + +/* + * PUBLIC: int __rep_set_timeout __P((DB_ENV *, int, db_timeout_t)); + */ +int +__rep_set_timeout(dbenv, which, timeout) + DB_ENV *dbenv; + int which; + db_timeout_t timeout; +{ + DB_REP *db_rep; + ENV *env; + REP *rep; + int repmgr_timeout, ret; + + env = dbenv->env; + db_rep = env->rep_handle; + rep = db_rep->region; + ret = 0; + repmgr_timeout = 0; + + if (which == DB_REP_ACK_TIMEOUT || which == DB_REP_CONNECTION_RETRY || + which == DB_REP_ELECTION_RETRY || + which == DB_REP_HEARTBEAT_MONITOR || + which == DB_REP_HEARTBEAT_SEND) + repmgr_timeout = 1; + + ENV_NOT_CONFIGURED( + env, db_rep->region, "DB_ENV->rep_set_timeout", DB_INIT_REP); + + if (APP_IS_BASEAPI(env) && repmgr_timeout) { + __db_errx(env, "%s %s", "DB_ENV->rep_set_timeout:", +"cannot set Replication Manager timeout from base replication application"); + return (EINVAL); + } + if (which == DB_REP_LEASE_TIMEOUT && IS_REP_STARTED(env)) { + ret = EINVAL; + __db_errx(env, "%s %s", "DB_ENV->rep_set_timeout:", +"lease timeout must be set before DB_ENV->rep_start."); + return (EINVAL); + } + + switch (which) { + case DB_REP_CHECKPOINT_DELAY: + if (REP_ON(env)) + rep->chkpt_delay = timeout; + else + db_rep->chkpt_delay = timeout; + break; + case 
DB_REP_ELECTION_TIMEOUT: + if (REP_ON(env)) + rep->elect_timeout = timeout; + else + db_rep->elect_timeout = timeout; + break; + case DB_REP_FULL_ELECTION_TIMEOUT: + if (REP_ON(env)) + rep->full_elect_timeout = timeout; + else + db_rep->full_elect_timeout = timeout; + break; + case DB_REP_LEASE_TIMEOUT: + if (REP_ON(env)) + rep->lease_timeout = timeout; + else + db_rep->lease_timeout = timeout; + break; +#ifdef HAVE_REPLICATION_THREADS + case DB_REP_ACK_TIMEOUT: + db_rep->ack_timeout = timeout; + break; + case DB_REP_CONNECTION_RETRY: + db_rep->connection_retry_wait = timeout; + break; + case DB_REP_ELECTION_RETRY: + db_rep->election_retry_wait = timeout; + break; + case DB_REP_HEARTBEAT_MONITOR: + db_rep->heartbeat_monitor_timeout = timeout; + break; + case DB_REP_HEARTBEAT_SEND: + db_rep->heartbeat_frequency = timeout; + break; +#endif + default: + __db_errx(env, + "Unknown timeout type argument to DB_ENV->rep_set_timeout"); + ret = EINVAL; + } + + /* Setting a repmgr timeout makes this a repmgr application */ + if (ret == 0 && repmgr_timeout) + APP_SET_REPMGR(env); + return (ret); +} + +/* + * PUBLIC: int __rep_get_timeout __P((DB_ENV *, int, db_timeout_t *)); + */ +int +__rep_get_timeout(dbenv, which, timeout) + DB_ENV *dbenv; + int which; + db_timeout_t *timeout; +{ + DB_REP *db_rep; + ENV *env; + REP *rep; + + env = dbenv->env; + db_rep = env->rep_handle; + rep = db_rep->region; + + ENV_NOT_CONFIGURED( + env, db_rep->region, "DB_ENV->rep_get_timeout", DB_INIT_REP); + + switch (which) { + case DB_REP_CHECKPOINT_DELAY: + *timeout = REP_ON(env) ? + rep->chkpt_delay : db_rep->chkpt_delay; + break; + case DB_REP_ELECTION_TIMEOUT: + *timeout = REP_ON(env) ? + rep->elect_timeout : db_rep->elect_timeout; + break; + case DB_REP_FULL_ELECTION_TIMEOUT: + *timeout = REP_ON(env) ? + rep->full_elect_timeout : db_rep->full_elect_timeout; + break; + case DB_REP_LEASE_TIMEOUT: + *timeout = REP_ON(env) ? 
+ rep->lease_timeout : db_rep->lease_timeout; + break; +#ifdef HAVE_REPLICATION_THREADS + case DB_REP_ACK_TIMEOUT: + *timeout = db_rep->ack_timeout; + break; + case DB_REP_CONNECTION_RETRY: + *timeout = db_rep->connection_retry_wait; + break; + case DB_REP_ELECTION_RETRY: + *timeout = db_rep->election_retry_wait; + break; + case DB_REP_HEARTBEAT_MONITOR: + *timeout = db_rep->heartbeat_monitor_timeout; + break; + case DB_REP_HEARTBEAT_SEND: + *timeout = db_rep->heartbeat_frequency; + break; +#endif + default: + __db_errx(env, + "unknown timeout type argument to DB_ENV->rep_get_timeout"); + return (EINVAL); + } + + return (0); +} + +/* + * __rep_get_request -- + * Get the minimum and maximum number of log records that we wait + * before retransmitting. + * + * PUBLIC: int __rep_get_request + * PUBLIC: __P((DB_ENV *, db_timeout_t *, db_timeout_t *)); + */ +int +__rep_get_request(dbenv, minp, maxp) + DB_ENV *dbenv; + db_timeout_t *minp, *maxp; +{ + DB_REP *db_rep; + DB_THREAD_INFO *ip; + ENV *env; + REP *rep; + + env = dbenv->env; + db_rep = env->rep_handle; + + ENV_NOT_CONFIGURED( + env, db_rep->region, "DB_ENV->rep_get_request", DB_INIT_REP); + + if (REP_ON(env)) { + rep = db_rep->region; + ENV_ENTER(env, ip); + /* + * We acquire the mtx_region or mtx_clientdb mutexes as needed. + */ + REP_SYSTEM_LOCK(env); + if (minp != NULL) + DB_TIMESPEC_TO_TIMEOUT((*minp), &rep->request_gap, 0); + if (maxp != NULL) + DB_TIMESPEC_TO_TIMEOUT((*maxp), &rep->max_gap, 0); + REP_SYSTEM_UNLOCK(env); + ENV_LEAVE(env, ip); + } else { + if (minp != NULL) + DB_TIMESPEC_TO_TIMEOUT((*minp), + &db_rep->request_gap, 0); + if (maxp != NULL) + DB_TIMESPEC_TO_TIMEOUT((*maxp), &db_rep->max_gap, 0); + } + + return (0); +} + +/* + * __rep_set_request -- + * Set the minimum and maximum number of log records that we wait + * before retransmitting. 
+ * + * PUBLIC: int __rep_set_request __P((DB_ENV *, db_timeout_t, db_timeout_t)); + */ +int +__rep_set_request(dbenv, min, max) + DB_ENV *dbenv; + db_timeout_t min, max; +{ + DB_LOG *dblp; + DB_REP *db_rep; + DB_THREAD_INFO *ip; + ENV *env; + LOG *lp; + REP *rep; + + env = dbenv->env; + db_rep = env->rep_handle; + + ENV_NOT_CONFIGURED( + env, db_rep->region, "DB_ENV->rep_set_request", DB_INIT_REP); + + if (min == 0 || max < min) { + __db_errx(env, + "DB_ENV->rep_set_request: Invalid min or max values"); + return (EINVAL); + } + if (REP_ON(env)) { + rep = db_rep->region; + ENV_ENTER(env, ip); + /* + * We acquire the mtx_region or mtx_clientdb mutexes as needed. + */ + REP_SYSTEM_LOCK(env); + DB_TIMEOUT_TO_TIMESPEC(min, &rep->request_gap); + DB_TIMEOUT_TO_TIMESPEC(max, &rep->max_gap); + REP_SYSTEM_UNLOCK(env); + + MUTEX_LOCK(env, rep->mtx_clientdb); + dblp = env->lg_handle; + if (dblp != NULL && (lp = dblp->reginfo.primary) != NULL) { + DB_TIMEOUT_TO_TIMESPEC(min, &lp->wait_ts); + } + MUTEX_UNLOCK(env, rep->mtx_clientdb); + ENV_LEAVE(env, ip); + } else { + DB_TIMEOUT_TO_TIMESPEC(min, &db_rep->request_gap); + DB_TIMEOUT_TO_TIMESPEC(max, &db_rep->max_gap); + } + + return (0); +} + +/* + * __rep_set_transport_pp -- + * Set the transport function for replication. 
+ * + * PUBLIC: int __rep_set_transport_pp __P((DB_ENV *, int, + * PUBLIC: int (*)(DB_ENV *, const DBT *, const DBT *, const DB_LSN *, + * PUBLIC: int, u_int32_t))); + */ +int +__rep_set_transport_pp(dbenv, eid, f_send) + DB_ENV *dbenv; + int eid; + int (*f_send) __P((DB_ENV *, + const DBT *, const DBT *, const DB_LSN *, int, u_int32_t)); +{ + DB_REP *db_rep; + ENV *env; + int ret; + + env = dbenv->env; + db_rep = env->rep_handle; + ret = 0; + + ENV_NOT_CONFIGURED( + env, db_rep->region, "DB_ENV->rep_set_transport", DB_INIT_REP); + + if (APP_IS_REPMGR(env)) { + __db_errx(env, +"DB_ENV->rep_set_transport: cannot call from Replication Manager application"); + return (EINVAL); + } + + if (f_send == NULL) { + __db_errx(env, + "DB_ENV->rep_set_transport: no send function specified"); + return (EINVAL); + } + + if (eid < 0) { + __db_errx(env, + "DB_ENV->rep_set_transport: eid must be greater than or equal to 0"); + return (EINVAL); + } + + if ((ret = __rep_set_transport_int(env, eid, f_send)) == 0) + /* + * Setting a non-repmgr send function makes this a base API + * application. + */ + APP_SET_BASEAPI(env); + + return (ret); +} + +/* + * __rep_set_transport_int -- + * Set the internal values for the transport function for replication. 
+ * + * PUBLIC: int __rep_set_transport_int __P((ENV *, int, + * PUBLIC: int (*)(DB_ENV *, const DBT *, const DBT *, const DB_LSN *, + * PUBLIC: int, u_int32_t))); + */ +int +__rep_set_transport_int(env, eid, f_send) + ENV *env; + int eid; + int (*f_send) __P((DB_ENV *, + const DBT *, const DBT *, const DB_LSN *, int, u_int32_t)); +{ + DB_REP *db_rep; + REP *rep; + + db_rep = env->rep_handle; + db_rep->send = f_send; + if (REP_ON(env)) { + rep = db_rep->region; + rep->eid = eid; + } else + db_rep->eid = eid; + return (0); +} + +/* + * PUBLIC: int __rep_get_clockskew __P((DB_ENV *, u_int32_t *, u_int32_t *)); + */ +int +__rep_get_clockskew(dbenv, fast_clockp, slow_clockp) + DB_ENV *dbenv; + u_int32_t *fast_clockp, *slow_clockp; +{ + DB_REP *db_rep; + DB_THREAD_INFO *ip; + ENV *env; + REP *rep; + + env = dbenv->env; + db_rep = env->rep_handle; + + ENV_NOT_CONFIGURED( + env, db_rep->region, "DB_ENV->rep_get_clockskew", DB_INIT_REP); + + if (REP_ON(env)) { + rep = db_rep->region; + ENV_ENTER(env, ip); + REP_SYSTEM_LOCK(env); + *fast_clockp = rep->clock_skew; + *slow_clockp = rep->clock_base; + REP_SYSTEM_UNLOCK(env); + ENV_LEAVE(env, ip); + } else { + *fast_clockp = db_rep->clock_skew; + *slow_clockp = db_rep->clock_base; + } + + return (0); +} + +/* + * PUBLIC: int __rep_set_clockskew __P((DB_ENV *, u_int32_t, u_int32_t)); + */ +int +__rep_set_clockskew(dbenv, fast_clock, slow_clock) + DB_ENV *dbenv; + u_int32_t fast_clock, slow_clock; +{ + DB_REP *db_rep; + DB_THREAD_INFO *ip; + ENV *env; + REP *rep; + int ret; + + env = dbenv->env; + db_rep = env->rep_handle; + ret = 0; + + ENV_NOT_CONFIGURED( + env, db_rep->region, "DB_ENV->rep_set_clockskew", DB_INIT_REP); + + /* + * Check for valid values. The fast clock should be a larger + * number than the slow clock. We use the slow clock value as + * our base for adjustment - therefore, a 2% difference should + * be fast == 102, slow == 100. Check for values being 0. If + * they are, then set them both to 1 internally. 
+ * + * We will use these numbers to compute the larger ratio to be + * most conservative about the user's intention. + */ + if (fast_clock == 0 || slow_clock == 0) { + /* + * If one value is zero, reject if both aren't zero. + */ + if (slow_clock != 0 || fast_clock != 0) { + __db_errx(env, +"DB_ENV->rep_set_clockskew: Zero only valid for when used for both arguments"); + return (EINVAL); + } + fast_clock = 1; + slow_clock = 1; + } + if (fast_clock < slow_clock) { + __db_errx(env, +"DB_ENV->rep_set_clockskew: slow_clock value is larger than fast_clock_value"); + return (EINVAL); + } + if (REP_ON(env)) { + rep = db_rep->region; + if (IS_REP_STARTED(env)) { + __db_errx(env, + "DB_ENV->rep_set_clockskew: must be called before DB_ENV->rep_start"); + return (EINVAL); + } + ENV_ENTER(env, ip); + REP_SYSTEM_LOCK(env); + rep->clock_skew = fast_clock; + rep->clock_base = slow_clock; + REP_SYSTEM_UNLOCK(env); + ENV_LEAVE(env, ip); + } else { + db_rep->clock_skew = fast_clock; + db_rep->clock_base = slow_clock; + } + return (ret); +} + +/* + * __rep_flush -- + * Re-push the last log record to all clients, in case they've lost + * messages and don't know it. + * + * PUBLIC: int __rep_flush __P((DB_ENV *)); + */ +int +__rep_flush(dbenv) + DB_ENV *dbenv; +{ + DBT rec; + DB_LOGC *logc; + DB_LSN lsn; + DB_REP *db_rep; + DB_THREAD_INFO *ip; + ENV *env; + int ret, t_ret; + + env = dbenv->env; + db_rep = env->rep_handle; + + ENV_REQUIRES_CONFIG_XX( + env, rep_handle, "DB_ENV->rep_flush", DB_INIT_REP); + + /* We need a transport function because we send messages. 
*/ + if (db_rep->send == NULL) { + __db_errx(env, + "DB_ENV->rep_flush: must be called after DB_ENV->rep_set_transport"); + return (EINVAL); + } + + ENV_ENTER(env, ip); + + if ((ret = __log_cursor(env, &logc)) != 0) + return (ret); + + memset(&rec, 0, sizeof(rec)); + memset(&lsn, 0, sizeof(lsn)); + + if ((ret = __logc_get(logc, &lsn, &rec, DB_LAST)) != 0) + goto err; + + (void)__rep_send_message(env, + DB_EID_BROADCAST, REP_LOG, &lsn, &rec, 0, 0); + +err: if ((t_ret = __logc_close(logc)) != 0 && ret == 0) + ret = t_ret; + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __rep_sync -- + * Force a synchronization to occur between this client and the master. + * This is the other half of configuring DELAYCLIENT. + * + * PUBLIC: int __rep_sync __P((DB_ENV *, u_int32_t)); + */ +int +__rep_sync(dbenv, flags) + DB_ENV *dbenv; + u_int32_t flags; +{ + DB_LOG *dblp; + DB_LSN lsn; + DB_REP *db_rep; + DB_THREAD_INFO *ip; + ENV *env; + LOG *lp; + REP *rep; + int master, ret; + u_int32_t repflags, type; + + env = dbenv->env; + db_rep = env->rep_handle; + + COMPQUIET(flags, 0); + + ENV_REQUIRES_CONFIG_XX( + env, rep_handle, "DB_ENV->rep_sync", DB_INIT_REP); + + /* We need a transport function because we send messages. */ + if (db_rep->send == NULL) { + __db_errx(env, + "DB_ENV->rep_sync: must be called after DB_ENV->rep_set_transport"); + return (EINVAL); + } + + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + rep = db_rep->region; + ret = 0; + + ENV_ENTER(env, ip); + + /* + * Simple cases. If we're not in the DELAY state we have nothing + * to do. If we don't know who the master is, send a MASTER_REQ. 
+ */ + MUTEX_LOCK(env, rep->mtx_clientdb); + lsn = lp->verify_lsn; + MUTEX_UNLOCK(env, rep->mtx_clientdb); + REP_SYSTEM_LOCK(env); + master = rep->master_id; + if (master == DB_EID_INVALID) { + REP_SYSTEM_UNLOCK(env); + (void)__rep_send_message(env, DB_EID_BROADCAST, + REP_MASTER_REQ, NULL, NULL, 0, 0); + goto out; + } + /* + * We want to hold the rep mutex to test and then clear the + * DELAY flag. Racing threads in here could otherwise result + * in dual data streams. + */ + if (!F_ISSET(rep, REP_F_DELAY)) { + REP_SYSTEM_UNLOCK(env); + goto out; + } + + DB_ASSERT(env, + !IS_USING_LEASES(env) || __rep_islease_granted(env) == 0); + + /* + * If we get here, we clear the delay flag and kick off a + * synchronization. From this point forward, we will + * synchronize until the next time the master changes. + */ + F_CLR(rep, REP_F_DELAY); + if (IS_ZERO_LSN(lsn) && FLD_ISSET(rep->config, REP_C_NOAUTOINIT)) { + F_CLR(rep, REP_F_NOARCHIVE | REP_F_RECOVER_MASK); + ret = DB_REP_JOIN_FAILURE; + REP_SYSTEM_UNLOCK(env); + goto out; + } + REP_SYSTEM_UNLOCK(env); + /* + * When we set REP_F_DELAY, we set verify_lsn to the real verify lsn if + * we need to verify, or we zeroed it out if this is a client that needs + * internal init. So, send the type of message now that + * __rep_new_master delayed sending. + */ + if (IS_ZERO_LSN(lsn)) { + DB_ASSERT(env, F_ISSET(rep, REP_F_RECOVER_UPDATE)); + type = REP_UPDATE_REQ; + repflags = 0; + } else { + DB_ASSERT(env, F_ISSET(rep, REP_F_RECOVER_VERIFY)); + type = REP_VERIFY_REQ; + repflags = DB_REP_ANYWHERE; + } + (void)__rep_send_message(env, master, type, &lsn, NULL, 0, repflags); + +out: ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __rep_conv_vers -- + * Convert from a log version to the replication message version + * that release used. 
+ */ +static u_int32_t +__rep_conv_vers(env, log_ver) + ENV *env; + u_int32_t log_ver; +{ + COMPQUIET(env, NULL); + + /* + * We can't use a switch statement, some of the DB_LOGVERSION_XX + * constants are the same + */ + if (log_ver == DB_LOGVERSION) + return (DB_REPVERSION); + if (log_ver == DB_LOGVERSION_44) + return (DB_REPVERSION_44); + if (log_ver == DB_LOGVERSION_45) + return (DB_REPVERSION_45); + if (log_ver == DB_LOGVERSION_46) + return (DB_REPVERSION_46); + if (log_ver == DB_LOGVERSION_47) + return (DB_REPVERSION_47); + return (DB_REPVERSION_INVALID); +} diff --git a/rep/rep_record.c b/rep/rep_record.c new file mode 100644 index 0000000..7196ca2 --- /dev/null +++ b/rep/rep_record.c @@ -0,0 +1,2379 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2001-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_am.h" +#include "dbinc/lock.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" +#include "dbinc/txn.h" + +static int __rep_collect_txn __P((ENV *, DB_LSN *, LSN_COLLECTION *)); +static int __rep_do_ckp __P((ENV *, DBT *, __rep_control_args *)); +static int __rep_fire_newmaster __P((ENV *, u_int32_t, int)); +static int __rep_fire_startupdone __P((ENV *, u_int32_t, int)); +static int __rep_getnext __P((ENV *, DB_THREAD_INFO *)); +static int __rep_lsn_cmp __P((const void *, const void *)); +static int __rep_newfile __P((ENV *, __rep_control_args *, DBT *)); +static int __rep_process_rec __P((ENV *, DB_THREAD_INFO *, __rep_control_args *, + DBT *, db_timespec *, DB_LSN *)); +static int __rep_remfirst __P((ENV *, DB_THREAD_INFO *, DBT *, DBT *)); +static int __rep_skip_msg __P((ENV *, REP *, int, u_int32_t)); + +/* Used to consistently designate which messages ought to be received where. 
*/ + +#define MASTER_ONLY(rep, rp) do { \ + if (!F_ISSET(rep, REP_F_MASTER)) { \ + RPRINT(env, DB_VERB_REP_MSGS, \ + (env, "Master record received on client")); \ + REP_PRINT_MESSAGE(env, \ + eid, rp, "rep_process_message", 0); \ + /* Just skip/ignore it. */ \ + ret = 0; \ + goto errlock; \ + } \ +} while (0) + +#define CLIENT_ONLY(rep, rp) do { \ + if (!F_ISSET(rep, REP_F_CLIENT)) { \ + RPRINT(env, DB_VERB_REP_MSGS, \ + (env, "Client record received on master")); \ + /* \ + * Only broadcast DUPMASTER if leases are not \ + * in effect. If I am an old master, using \ + * leases and I get a newer message, my leases \ + * had better all be expired. \ + */ \ + if (IS_USING_LEASES(env)) \ + DB_ASSERT(env, \ + __rep_lease_check(env, 0) == \ + DB_REP_LEASE_EXPIRED); \ + else { \ + REP_PRINT_MESSAGE(env, \ + eid, rp, "rep_process_message", 0); \ + (void)__rep_send_message(env, DB_EID_BROADCAST, \ + REP_DUPMASTER, NULL, NULL, 0, 0); \ + } \ + ret = DB_REP_DUPMASTER; \ + goto errlock; \ + } \ +} while (0) + +/* + * If a client is attempting to service a request it does not have, + * call rep_skip_msg to skip this message and force a rerequest to the + * sender. We don't hold the mutex for the stats and may miscount. 
+ */ +#define CLIENT_REREQ do { \ + if (F_ISSET(rep, REP_F_CLIENT)) { \ + STAT(rep->stat.st_client_svc_req++); \ + if (ret == DB_NOTFOUND) { \ + STAT(rep->stat.st_client_svc_miss++); \ + ret = __rep_skip_msg(env, rep, eid, rp->rectype);\ + } \ + } \ +} while (0) + +#define MASTER_UPDATE(env, renv) do { \ + REP_SYSTEM_LOCK(env); \ + F_SET((renv), DB_REGENV_REPLOCKED); \ + (void)time(&(renv)->op_timestamp); \ + REP_SYSTEM_UNLOCK(env); \ +} while (0) + +#define RECOVERING_SKIP do { \ + if (IS_REP_CLIENT(env) && recovering) { \ + /* Not holding region mutex, may miscount */ \ + STAT(rep->stat.st_msgs_recover++); \ + ret = __rep_skip_msg(env, rep, eid, rp->rectype); \ + goto errlock; \ + } \ +} while (0) + +/* + * If we're recovering the log we only want log records that are in the + * range we need to recover. Otherwise we can end up storing a huge + * number of "new" records, only to truncate the temp database later after + * we run recovery. If we are actively delaying a sync-up, we also skip + * all incoming log records until the application requests sync-up. + */ +#define RECOVERING_LOG_SKIP do { \ + if (F_ISSET(rep, REP_F_DELAY) || \ + rep->master_id == DB_EID_INVALID || \ + (recovering && \ + (!F_ISSET(rep, REP_F_RECOVER_LOG) || \ + LOG_COMPARE(&rp->lsn, &rep->last_lsn) > 0))) { \ + /* Not holding region mutex, may miscount */ \ + STAT(rep->stat.st_msgs_recover++); \ + ret = __rep_skip_msg(env, rep, eid, rp->rectype); \ + goto errlock; \ + } \ +} while (0) + +#define ANYSITE(rep) + +/* + * __rep_process_message_pp -- + * + * This routine takes an incoming message and processes it. + * + * control: contains the control fields from the record + * rec: contains the actual record + * eid: the environment id of the sender of the message; + * ret_lsnp: On DB_REP_ISPERM and DB_REP_NOTPERM returns, contains the + * lsn of the maximum permanent or current not permanent log record + * (respectively). 
+ * + * PUBLIC: int __rep_process_message_pp + * PUBLIC: __P((DB_ENV *, DBT *, DBT *, int, DB_LSN *)); + */ +int +__rep_process_message_pp(dbenv, control, rec, eid, ret_lsnp) + DB_ENV *dbenv; + DBT *control, *rec; + int eid; + DB_LSN *ret_lsnp; +{ + ENV *env; + int ret; + + env = dbenv->env; + ret = 0; + + ENV_REQUIRES_CONFIG_XX( + env, rep_handle, "DB_ENV->rep_process_message", DB_INIT_REP); + + if (APP_IS_REPMGR(env)) { + __db_errx(env, "%s %s", "DB_ENV->rep_process_message:", + "cannot call from Replication Manager application"); + return (EINVAL); + } + + /* Control argument must be non-Null. */ + if (control == NULL || control->size == 0) { + __db_errx(env, + "DB_ENV->rep_process_message: control argument must be specified"); + return (EINVAL); + } + + /* + * Make sure site is a master or a client, which implies that + * replication has been started. + */ + if (!IS_REP_MASTER(env) && !IS_REP_CLIENT(env)) { + __db_errx(env, + "Environment not configured as replication master or client"); + return (EINVAL); + } + + if ((ret = __dbt_usercopy(env, control)) != 0 || + (ret = __dbt_usercopy(env, rec)) != 0) { + __dbt_userfree(env, control, rec, NULL); + __db_errx(env, + "DB_ENV->rep_process_message: error retrieving DBT contents"); + return ret; + } + + ret = __rep_process_message_int(env, control, rec, eid, ret_lsnp); + + return (ret); +} + +/* + * __rep_process_message_int -- + * + * This routine performs the internal steps to process an incoming message. 
+ * + * PUBLIC: int __rep_process_message_int + * PUBLIC: __P((ENV *, DBT *, DBT *, int, DB_LSN *)); + */ +int +__rep_process_message_int(env, control, rec, eid, ret_lsnp) + ENV *env; + DBT *control, *rec; + int eid; + DB_LSN *ret_lsnp; +{ + DBT data_dbt; + DB_LOG *dblp; + DB_LSN last_lsn, lsn; + DB_REP *db_rep; + DB_THREAD_INFO *ip; + LOG *lp; + REGENV *renv; + REGINFO *infop; + REP *rep; + REP_46_CONTROL *rp46; + REP_OLD_CONTROL *orp; + __rep_control_args *rp, tmprp; + __rep_egen_args egen_arg; + size_t len; + u_int32_t gen, rep_version; + int cmp, do_sync, lockout, recovering, ret, t_ret; + time_t savetime; + u_int8_t buf[__REP_MAXMSG_SIZE]; + + ret = 0; + do_sync = 0; + lockout = 0; + db_rep = env->rep_handle; + rep = db_rep->region; + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + infop = env->reginfo; + renv = infop->primary; + /* + * Casting this to REP_OLD_CONTROL is just kind of stylistic: the + * rep_version field of course has to be in the same offset in all + * versions in order for this to work. + * + * We can look at the rep_version unswapped here because if we're + * talking to an old version, it will always be unswapped. If + * we're talking to a new version, the only issue is if it is + * swapped and we take one of the old version conditionals + * incorrectly. The rep_version would need to be very, very + * large for a swapped version to look like a small, older + * version. There is no problem here looking at it unswapped. 
+ */ + rep_version = ((REP_OLD_CONTROL *)control->data)->rep_version; + if (rep_version <= DB_REPVERSION_45) { + orp = (REP_OLD_CONTROL *)control->data; + if (rep_version == DB_REPVERSION_45 && + F_ISSET(orp, REPCTL_INIT_45)) { + F_CLR(orp, REPCTL_INIT_45); + F_SET(orp, REPCTL_INIT); + } + tmprp.rep_version = orp->rep_version; + tmprp.log_version = orp->log_version; + tmprp.lsn = orp->lsn; + tmprp.rectype = orp->rectype; + tmprp.gen = orp->gen; + tmprp.flags = orp->flags; + tmprp.msg_sec = 0; + tmprp.msg_nsec = 0; + } else if (rep_version == DB_REPVERSION_46) { + rp46 = (REP_46_CONTROL *)control->data; + tmprp.rep_version = rp46->rep_version; + tmprp.log_version = rp46->log_version; + tmprp.lsn = rp46->lsn; + tmprp.rectype = rp46->rectype; + tmprp.gen = rp46->gen; + tmprp.flags = rp46->flags; + tmprp.msg_sec = (u_int32_t)rp46->msg_time.tv_sec; + tmprp.msg_nsec = (u_int32_t)rp46->msg_time.tv_nsec; + } else + if ((ret = __rep_control_unmarshal(env, &tmprp, + control->data, control->size, NULL)) != 0) + return (ret); + rp = &tmprp; + if (ret_lsnp != NULL) + ZERO_LSN(*ret_lsnp); + + ENV_ENTER(env, ip); + + REP_PRINT_MESSAGE(env, eid, rp, "rep_process_message", 0); + /* + * Check the version number for both rep and log. If it is + * an old version we support, convert it. Otherwise complain. + */ + if (rp->rep_version < DB_REPVERSION) { + if (rp->rep_version < DB_REPVERSION_MIN) { + __db_errx(env, + "unsupported old replication message version %lu, minimum version %d", + (u_long)rp->rep_version, DB_REPVERSION_MIN); + ret = EINVAL; + goto errlock; + } + RPRINT(env, DB_VERB_REP_MSGS, (env, + "Received record %lu with old rep version %lu", + (u_long)rp->rectype, (u_long)rp->rep_version)); + rp->rectype = __rep_msg_from_old(rp->rep_version, rp->rectype); + DB_ASSERT(env, rp->rectype != REP_INVALID); + /* + * We should have a valid new record type for all the old + * versions. 
+ */ + RPRINT(env, DB_VERB_REP_MSGS, (env, + "Converted to record %lu with old rep version %lu", + (u_long)rp->rectype, (u_long)rp->rep_version)); + } else if (rp->rep_version > DB_REPVERSION) { + __db_errx(env, + "unexpected replication message version %lu, expected %d", + (u_long)rp->rep_version, DB_REPVERSION); + ret = EINVAL; + goto errlock; + } + + if (rp->log_version < DB_LOGVERSION) { + if (rp->log_version < DB_LOGVERSION_MIN) { + __db_errx(env, + "unsupported old replication log version %lu, minimum version %d", + (u_long)rp->log_version, DB_LOGVERSION_MIN); + ret = EINVAL; + goto errlock; + } + RPRINT(env, DB_VERB_REP_MSGS, (env, + "Received record %lu with old log version %lu", + (u_long)rp->rectype, (u_long)rp->log_version)); + } else if (rp->log_version > DB_LOGVERSION) { + __db_errx(env, + "unexpected log record version %lu, expected %d", + (u_long)rp->log_version, DB_LOGVERSION); + ret = EINVAL; + goto errlock; + } + + /* + * Acquire the replication lock. + */ + REP_SYSTEM_LOCK(env); + if (F_ISSET(rep, REP_F_READY_MSG)) { + /* + * If we're racing with a thread in rep_start, then + * just ignore the message and return. + */ + RPRINT(env, DB_VERB_REP_MSGS, (env, + "Racing replication msg lockout, ignore message.")); + if (F_ISSET(rp, REPCTL_PERM)) + ret = DB_REP_IGNORE; + REP_SYSTEM_UNLOCK(env); + /* + * If another client has sent a c2c request to us, it may be a + * long time before it resends the request (due to its dual data + * streams avoidance heuristic); let it know we can't serve the + * request just now. 
+ */ + if (F_ISSET(rep, REP_F_CLIENT) && REP_MSG_REQ(rp->rectype)) { + STAT(rep->stat.st_client_svc_req++); + STAT(rep->stat.st_client_svc_miss++); + (void)__rep_send_message(env, + eid, REP_REREQUEST, NULL, NULL, 0, 0); + } + goto out; + } + rep->msg_th++; + gen = rep->gen; + recovering = F_ISSET(rep, REP_F_RECOVER_MASK); + savetime = renv->rep_timestamp; + + STAT(rep->stat.st_msgs_processed++); + REP_SYSTEM_UNLOCK(env); + + /* + * Check for lease configuration matching. Leases must be + * configured all or none. If I am a client and I receive a + * message requesting a lease, and I'm not using leases, that + * is an error. + */ + if (!IS_USING_LEASES(env) && + (F_ISSET(rp, REPCTL_LEASE) || rp->rectype == REP_LEASE_GRANT)) { + __db_errx(env, + "Inconsistent lease configuration"); + RPRINT(env, DB_VERB_REP_MSGS, (env, + "Client received lease message and not using leases")); + ret = EINVAL; + ret = __env_panic(env, ret); + goto errlock; + } + + /* + * Check for generation number matching. Ignore any old messages + * except requests that are indicative of a new client that needs + * to get in sync. + */ + if (rp->gen < gen && rp->rectype != REP_ALIVE_REQ && + rp->rectype != REP_NEWCLIENT && rp->rectype != REP_MASTER_REQ && + rp->rectype != REP_DUPMASTER && rp->rectype != REP_VOTE1) { + /* + * We don't hold the rep mutex, and could miscount if we race. + */ + STAT(rep->stat.st_msgs_badgen++); + if (F_ISSET(rp, REPCTL_PERM)) + ret = DB_REP_IGNORE; + goto errlock; + } + + if (rp->gen > gen) { + /* + * If I am a master and am out of date with a lower generation + * number, I am in bad shape and should downgrade. + */ + if (F_ISSET(rep, REP_F_MASTER)) { + STAT(rep->stat.st_dupmasters++); + ret = DB_REP_DUPMASTER; + /* + * Only broadcast DUPMASTER if leases are not + * in effect. If I am an old master, using + * leases and I get a newer message, my leases + * had better all be expired. 
+ */ + if (IS_USING_LEASES(env)) + DB_ASSERT(env, + __rep_lease_check(env, 0) == + DB_REP_LEASE_EXPIRED); + else if (rp->rectype != REP_DUPMASTER) + (void)__rep_send_message(env, + DB_EID_BROADCAST, REP_DUPMASTER, + NULL, NULL, 0, 0); + goto errlock; + } + + /* + * I am a client and am out of date. If this is an election, + * or a response from the first site I contacted, then I can + * accept the generation number and participate in future + * elections and communication. Otherwise, I need to hear about + * a new master and sync up. + */ + if (rp->rectype == REP_ALIVE || + rp->rectype == REP_VOTE1 || rp->rectype == REP_VOTE2) { + REP_SYSTEM_LOCK(env); + RPRINT(env, DB_VERB_REP_MSGS, (env, + "Updating gen from %lu to %lu", + (u_long)gen, (u_long)rp->gen)); + rep->master_id = DB_EID_INVALID; + gen = rep->gen = rp->gen; + /* + * Updating of egen will happen when we process the + * message below for each message type. + */ + REP_SYSTEM_UNLOCK(env); + if (rp->rectype == REP_ALIVE) + (void)__rep_send_message(env, + DB_EID_BROADCAST, REP_MASTER_REQ, NULL, + NULL, 0, 0); + } else if (rp->rectype != REP_NEWMASTER) { + /* + * Ignore this message, retransmit if needed. + */ + if (__rep_check_doreq(env, rep)) + (void)__rep_send_message(env, + DB_EID_BROADCAST, REP_MASTER_REQ, + NULL, NULL, 0, 0); + goto errlock; + } + /* + * If you get here, then you're a client and either you're + * in an election or you have a NEWMASTER or an ALIVE message + * whose processing will do the right thing below. + */ + } + + /* + * If the sender is part of an established group, so are we now. 
+ */ + if (F_ISSET(rp, REPCTL_GROUP_ESTD)) { + REP_SYSTEM_LOCK(env); +#ifdef DIAGNOSTIC + if (!F_ISSET(rep, REP_F_GROUP_ESTD)) + RPRINT(env, DB_VERB_REP_MSGS, (env, + "I am now part of an established group")); +#endif + F_SET(rep, REP_F_GROUP_ESTD); + REP_SYSTEM_UNLOCK(env); + } + + /* + * We need to check if we're in recovery and if we are + * then we need to ignore any messages except VERIFY*, VOTE*, + * NEW* and ALIVE_REQ, or backup related messages: UPDATE*, + * PAGE* and FILE*. We need to also accept LOG messages + * if we're copying the log for recovery/backup. + */ + switch (rp->rectype) { + case REP_ALIVE: + /* + * Handle even if we're recovering. + */ + ANYSITE(rep); + if (rp->rep_version < DB_REPVERSION_47) + egen_arg.egen = *(u_int32_t *)rec->data; + else if ((ret = __rep_egen_unmarshal(env, &egen_arg, + rec->data, rec->size, NULL)) != 0) + return (ret); + REP_SYSTEM_LOCK(env); + RPRINT(env, DB_VERB_REP_MSGS, (env, + "Received ALIVE egen of %lu, mine %lu", + (u_long)egen_arg.egen, (u_long)rep->egen)); + if (egen_arg.egen > rep->egen) { + /* + * We're changing egen, need to clear out any old + * election information. We need to set the + * REP_F_EGENUPDATE flag here so that any thread + * waiting in rep_elect/rep_wait can distinguish + * this situation (and restart its election) from + * a current master saying it is still master and + * the egen getting incremented on that path. + */ + __rep_elect_done(env, rep, 0); + rep->egen = egen_arg.egen; + F_SET(rep, REP_F_EGENUPDATE); + } + REP_SYSTEM_UNLOCK(env); + break; + case REP_ALIVE_REQ: + /* + * Handle even if we're recovering. + */ + ANYSITE(rep); + LOG_SYSTEM_LOCK(env); + lsn = lp->lsn; + LOG_SYSTEM_UNLOCK(env); +#ifdef CONFIG_TEST + /* + * Send this first, before the ALIVE message because of the + * way the test suite and messaging is done sequentially. 
+ * In some sequences it is possible to get into a situation + * where the test suite cannot get the later NEWMASTER because + * we break out of the messaging loop too early. + */ + if (F_ISSET(rep, REP_F_MASTER)) + (void)__rep_send_message(env, + DB_EID_BROADCAST, REP_NEWMASTER, &lsn, NULL, 0, 0); +#endif + REP_SYSTEM_LOCK(env); + egen_arg.egen = rep->egen; + REP_SYSTEM_UNLOCK(env); + if (rep->version < DB_REPVERSION_47) + DB_INIT_DBT(data_dbt, &egen_arg.egen, + sizeof(egen_arg.egen)); + else { + if ((ret = __rep_egen_marshal(env, + &egen_arg, buf, __REP_EGEN_SIZE, &len)) != 0) + goto errlock; + DB_INIT_DBT(data_dbt, buf, len); + } + (void)__rep_send_message(env, + eid, REP_ALIVE, &lsn, &data_dbt, 0, 0); + break; + case REP_ALL_REQ: + RECOVERING_SKIP; + ret = __rep_allreq(env, rp, eid); + CLIENT_REREQ; + break; + case REP_BULK_LOG: + RECOVERING_LOG_SKIP; + CLIENT_ONLY(rep, rp); + ret = __rep_bulk_log(env, ip, rp, rec, savetime, ret_lsnp); + break; + case REP_BULK_PAGE: + /* + * Handle even if we're recovering. + */ + CLIENT_ONLY(rep, rp); + ret = __rep_bulk_page(env, ip, eid, rp, rec); + break; + case REP_DUPMASTER: + /* + * Handle even if we're recovering. + */ + if (F_ISSET(rep, REP_F_MASTER)) + ret = DB_REP_DUPMASTER; + break; +#ifdef NOTYET + case REP_FILE: /* TODO */ + CLIENT_ONLY(rep, rp); + break; + case REP_FILE_REQ: + ret = __rep_send_file(env, rec, eid); + break; +#endif + case REP_FILE_FAIL: + /* + * Handle even if we're recovering. + */ + CLIENT_ONLY(rep, rp); + /* + * Clean up any internal init that was in progress. + */ + if (eid == rep->master_id) { + REP_SYSTEM_LOCK(env); + /* + * If we're already locking out messages, give up. + */ + if (F_ISSET(rep, REP_F_READY_MSG)) + goto errhlk; + /* + * Lock out other messages to prevent race + * conditions. + */ + if ((ret = + __rep_lockout_msg(env, rep, 1)) != 0) { + goto errhlk; + } + lockout = 1; + /* + * Need mtx_clientdb to safely clean up + * page database in __rep_init_cleanup(). 
+ */ + REP_SYSTEM_UNLOCK(env); + MUTEX_LOCK(env, rep->mtx_clientdb); + REP_SYSTEM_LOCK(env); + /* + * Clean up internal init if one was in progress. + */ + if (F_ISSET(rep, REP_F_READY_API | REP_F_READY_OP)) { + RPRINT(env, DB_VERB_REP_MSGS, (env, + "FILE_FAIL is cleaning up old internal init")); +#ifdef CONFIG_TEST + STAT(rep->stat.st_filefail_cleanups++); +#endif + ret = __rep_init_cleanup(env, rep, DB_FORCE); + F_CLR(rep, + REP_F_ABBREVIATED | REP_F_RECOVER_MASK); + } + MUTEX_UNLOCK(env, rep->mtx_clientdb); + if (ret != 0) { + RPRINT(env, DB_VERB_REP_MSGS, (env, + "FILE_FAIL error cleaning up internal init: %d", ret)); + goto errhlk; + } + F_CLR(rep, REP_F_READY_MSG); + lockout = 0; + /* + * Restart internal init, setting UPDATE flag and + * zeroing applicable LSNs. + */ + F_SET(rep, REP_F_RECOVER_UPDATE); + ZERO_LSN(rep->first_lsn); + ZERO_LSN(rep->ckp_lsn); + REP_SYSTEM_UNLOCK(env); + (void)__rep_send_message(env, eid, REP_UPDATE_REQ, + NULL, NULL, 0, 0); + } + break; + case REP_LEASE_GRANT: + /* + * Handle even if we're recovering. + */ + MASTER_ONLY(rep, rp); + ret = __rep_lease_grant(env, rp, rec, eid); + break; + case REP_LOG: + case REP_LOG_MORE: + RECOVERING_LOG_SKIP; + CLIENT_ONLY(rep, rp); + ret = __rep_log(env, ip, rp, rec, savetime, ret_lsnp); + break; + case REP_LOG_REQ: + RECOVERING_SKIP; + if (F_ISSET(rp, REPCTL_INIT)) + MASTER_UPDATE(env, renv); + ret = __rep_logreq(env, rp, rec, eid); + CLIENT_REREQ; + break; + case REP_NEWSITE: + /* + * Handle even if we're recovering. + */ + /* We don't hold the rep mutex, and may miscount. */ + STAT(rep->stat.st_newsites++); + + /* This is a rebroadcast; simply tell the application. 
*/ + if (F_ISSET(rep, REP_F_MASTER)) { + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + LOG_SYSTEM_LOCK(env); + lsn = lp->lsn; + LOG_SYSTEM_UNLOCK(env); + (void)__rep_send_message(env, + eid, REP_NEWMASTER, &lsn, NULL, 0, 0); + } + ret = DB_REP_NEWSITE; + break; + case REP_NEWCLIENT: + /* + * Handle even if we're recovering. + */ + /* + * This message was received and should have resulted in the + * application entering the machine ID in its machine table. + * We respond to this with an ALIVE to send relevant information + * to the new client (if we are a master, we'll send a + * NEWMASTER, so we only need to send the ALIVE if we're a + * client). But first, broadcast the new client's record to + * all the clients. + */ + (void)__rep_send_message(env, + DB_EID_BROADCAST, REP_NEWSITE, &rp->lsn, rec, 0, 0); + + ret = DB_REP_NEWSITE; + + if (F_ISSET(rep, REP_F_CLIENT)) { + REP_SYSTEM_LOCK(env); + egen_arg.egen = rep->egen; + + /* + * Clean up any previous master remnants by making + * master_id invalid and cleaning up any internal + * init that was in progress. + */ + if (eid == rep->master_id) { + rep->master_id = DB_EID_INVALID; + + /* + * Already locking out messages, must be + * in sync-up recover or internal init, + * give up. + */ + if (F_ISSET(rep, REP_F_READY_MSG)) + goto errhlk; + + /* + * Lock out other messages to prevent race + * conditions. + */ + if ((t_ret = + __rep_lockout_msg(env, rep, 1)) != 0) { + ret = t_ret; + goto errhlk; + } + lockout = 1; + + /* + * Need mtx_clientdb to safely clean up + * page database in __rep_init_cleanup(). + */ + REP_SYSTEM_UNLOCK(env); + MUTEX_LOCK(env, rep->mtx_clientdb); + REP_SYSTEM_LOCK(env); + + /* + * Clean up internal init if one was in + * progress. 
+ */ + if (F_ISSET(rep, REP_F_READY_API | + REP_F_READY_OP)) { + RPRINT(env, DB_VERB_REP_MSGS, (env, + "NEWCLIENT is cleaning up old internal init for invalid master")); + t_ret = __rep_init_cleanup(env, + rep, DB_FORCE); + F_CLR(rep, REP_F_ABBREVIATED | + REP_F_RECOVER_MASK); + } + MUTEX_UNLOCK(env, rep->mtx_clientdb); + if (t_ret != 0) { + ret = t_ret; + RPRINT(env, DB_VERB_REP_MSGS, (env, + "NEWCLIENT error cleaning up internal init for invalid master: %d", ret)); + goto errhlk; + } + F_CLR(rep, REP_F_READY_MSG); + lockout = 0; + } + REP_SYSTEM_UNLOCK(env); + if (rep->version < DB_REPVERSION_47) + DB_INIT_DBT(data_dbt, &egen_arg.egen, + sizeof(egen_arg.egen)); + else { + if ((ret = __rep_egen_marshal(env, &egen_arg, + buf, __REP_EGEN_SIZE, &len)) != 0) + goto errlock; + DB_INIT_DBT(data_dbt, buf, len); + } + (void)__rep_send_message(env, DB_EID_BROADCAST, + REP_ALIVE, &rp->lsn, &data_dbt, 0, 0); + break; + } + /* FALLTHROUGH */ + case REP_MASTER_REQ: + RECOVERING_SKIP; + if (F_ISSET(rep, REP_F_MASTER)) { + LOG_SYSTEM_LOCK(env); + lsn = lp->lsn; + LOG_SYSTEM_UNLOCK(env); + (void)__rep_send_message(env, + DB_EID_BROADCAST, REP_NEWMASTER, &lsn, NULL, 0, 0); + if (IS_USING_LEASES(env)) + (void)__rep_lease_refresh(env); + } + /* + * If there is no master, then we could get into a state + * where an old client lost the initial ALIVE message and + * is calling an election under an old gen and can + * never get to the current gen. 
+ */ + if (F_ISSET(rep, REP_F_CLIENT) && rp->gen < gen) { + REP_SYSTEM_LOCK(env); + egen_arg.egen = rep->egen; + if (eid == rep->master_id) + rep->master_id = DB_EID_INVALID; + REP_SYSTEM_UNLOCK(env); + if (rep->version < DB_REPVERSION_47) + DB_INIT_DBT(data_dbt, &egen_arg.egen, + sizeof(egen_arg.egen)); + else { + if ((ret = __rep_egen_marshal(env, &egen_arg, + buf, __REP_EGEN_SIZE, &len)) != 0) + goto errlock; + DB_INIT_DBT(data_dbt, buf, len); + } + (void)__rep_send_message(env, eid, + REP_ALIVE, &rp->lsn, &data_dbt, 0, 0); + } + break; + case REP_NEWFILE: + RECOVERING_LOG_SKIP; + CLIENT_ONLY(rep, rp); + ret = __rep_apply(env, + ip, rp, rec, ret_lsnp, NULL, &last_lsn); + if (ret == DB_REP_LOGREADY) + ret = __rep_logready(env, rep, savetime, &last_lsn); + break; + case REP_NEWMASTER: + /* + * Handle even if we're recovering. + */ + ANYSITE(rep); + if (F_ISSET(rep, REP_F_MASTER) && + eid != rep->eid) { + /* We don't hold the rep mutex, and may miscount. */ + STAT(rep->stat.st_dupmasters++); + ret = DB_REP_DUPMASTER; + if (IS_USING_LEASES(env)) + DB_ASSERT(env, + __rep_lease_check(env, 0) == + DB_REP_LEASE_EXPIRED); + else + (void)__rep_send_message(env, + DB_EID_BROADCAST, REP_DUPMASTER, + NULL, NULL, 0, 0); + break; + } + if ((ret = + __rep_new_master(env, rp, eid)) == DB_REP_NEWMASTER) + ret = __rep_fire_newmaster(env, rp->gen, eid); + break; + case REP_PAGE: + case REP_PAGE_MORE: + /* + * Handle even if we're recovering. + */ + CLIENT_ONLY(rep, rp); + ret = __rep_page(env, ip, eid, rp, rec); + if (ret == DB_REP_PAGEDONE) + ret = 0; + break; + case REP_PAGE_FAIL: + /* + * Handle even if we're recovering. + */ + CLIENT_ONLY(rep, rp); + ret = __rep_page_fail(env, ip, eid, rp, rec); + break; + case REP_PAGE_REQ: + RECOVERING_SKIP; + MASTER_UPDATE(env, renv); + ret = __rep_page_req(env, ip, eid, rp, rec); + CLIENT_REREQ; + break; + case REP_REREQUEST: + /* + * Handle even if we're recovering. Don't do a master + * check. 
+ */ + CLIENT_ONLY(rep, rp); + /* + * Don't hold any mutex, may miscount. + */ + STAT(rep->stat.st_client_rerequests++); + ret = __rep_resend_req(env, 1); + break; + case REP_START_SYNC: + RECOVERING_SKIP; + MUTEX_LOCK(env, rep->mtx_clientdb); + cmp = LOG_COMPARE(&rp->lsn, &lp->ready_lsn); + /* + * The comparison needs to be <= because the LSN in + * the message can be the LSN of the first outstanding + * txn, which may be the LSN immediately after the + * previous commit. The ready_lsn is the LSN of the + * next record expected. In that case, the LSNs + * could be equal and the client has the commit and + * wants to sync. [SR #15338] + */ + if (cmp <= 0) { + MUTEX_UNLOCK(env, rep->mtx_clientdb); + do_sync = 1; + } else { + STAT(rep->stat.st_startsync_delayed++); + /* + * There are cases where keeping the first ckp_lsn + * LSN is advantageous and cases where keeping + * a later LSN is better. If random, earlier + * log records are missing, keeping the later + * LSN seems to be better. That is what we'll + * do for now. + */ + if (LOG_COMPARE(&rp->lsn, &rep->ckp_lsn) > 0) + rep->ckp_lsn = rp->lsn; + RPRINT(env, DB_VERB_REP_MSGS, (env, + "Delayed START_SYNC memp_sync due to missing records.")); + RPRINT(env, DB_VERB_REP_MSGS, (env, + "ready LSN [%lu][%lu], ckp_lsn [%lu][%lu]", + (u_long)lp->ready_lsn.file, (u_long)lp->ready_lsn.offset, + (u_long)rep->ckp_lsn.file, (u_long)rep->ckp_lsn.offset)); + MUTEX_UNLOCK(env, rep->mtx_clientdb); + } + break; + case REP_UPDATE: + /* + * Handle even if we're recovering. + */ + CLIENT_ONLY(rep, rp); + ret = __rep_update_setup(env, eid, rp, rec, savetime); + break; + case REP_UPDATE_REQ: + /* + * Handle even if we're recovering. 
+ */ + MASTER_ONLY(rep, rp); + infop = env->reginfo; + renv = infop->primary; + MASTER_UPDATE(env, renv); + ret = __rep_update_req(env, rp, eid); + break; + case REP_VERIFY: + if (recovering) { + MUTEX_LOCK(env, rep->mtx_clientdb); + cmp = LOG_COMPARE(&lp->verify_lsn, &rp->lsn); + MUTEX_UNLOCK(env, rep->mtx_clientdb); + /* + * If this is not the verify record I want, skip it. + */ + if (cmp != 0) { + ret = __rep_skip_msg( + env, rep, eid, rp->rectype); + break; + } + } + CLIENT_ONLY(rep, rp); + ret = __rep_verify(env, rp, rec, eid, savetime); + break; + case REP_VERIFY_FAIL: + /* + * Handle even if we're recovering. + */ + CLIENT_ONLY(rep, rp); + ret = __rep_verify_fail(env, rp); + break; + case REP_VERIFY_REQ: + RECOVERING_SKIP; + ret = __rep_verify_req(env, rp, eid); + CLIENT_REREQ; + break; + case REP_VOTE1: + /* + * Handle even if we're recovering. + */ + ret = __rep_vote1(env, rp, rec, eid); + break; + case REP_VOTE2: + /* + * Handle even if we're recovering. + */ + ret = __rep_vote2(env, rp, rec, eid); + break; + default: + __db_errx(env, + "DB_ENV->rep_process_message: unknown replication message: type %lu", + (u_long)rp->rectype); + ret = EINVAL; + break; + } + +errlock: + REP_SYSTEM_LOCK(env); +errhlk: if (lockout) + F_CLR(rep, REP_F_READY_MSG); + rep->msg_th--; + REP_SYSTEM_UNLOCK(env); + if (do_sync) { + MUTEX_LOCK(env, rep->mtx_ckp); + lsn = rp->lsn; + /* + * This is the REP_START_SYNC sync, and so we permit it to be + * interrupted. 
+ */ + ret = __memp_sync( + env, DB_SYNC_CHECKPOINT | DB_SYNC_INTERRUPT_OK, &lsn); + MUTEX_UNLOCK(env, rep->mtx_ckp); + RPRINT(env, DB_VERB_REP_MSGS, + (env, "ALIVE: Completed sync [%lu][%lu]", + (u_long)lsn.file, (u_long)lsn.offset)); + } +out: + if (ret == 0 && F_ISSET(rp, REPCTL_PERM)) { + if (ret_lsnp != NULL) + *ret_lsnp = rp->lsn; + ret = DB_REP_NOTPERM; + } + __dbt_userfree(env, control, rec, NULL); + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __rep_apply -- + * + * Handle incoming log records on a client, applying when possible and + * entering into the bookkeeping table otherwise. This routine manages + * the state of the incoming message stream -- processing records, via + * __rep_process_rec, when possible and enqueuing in the __db.rep.db + * when necessary. As gaps in the stream are filled in, this is where + * we try to process as much as possible from __db.rep.db to catch up. + * + * PUBLIC: int __rep_apply __P((ENV *, DB_THREAD_INFO *, __rep_control_args *, + * PUBLIC: DBT *, DB_LSN *, int *, DB_LSN *)); + */ +int +__rep_apply(env, ip, rp, rec, ret_lsnp, is_dupp, last_lsnp) + ENV *env; + DB_THREAD_INFO *ip; + __rep_control_args *rp; + DBT *rec; + DB_LSN *ret_lsnp; + int *is_dupp; + DB_LSN *last_lsnp; +{ + DB *dbp; + DBT control_dbt, key_dbt; + DBT rec_dbt; + DB_LOG *dblp; + DB_LSN max_lsn, save_lsn; + DB_REP *db_rep; + LOG *lp; + REP *rep; + db_timespec msg_time, max_ts; + u_int32_t gen, rectype; + int cmp, event, master, newfile_seen, ret, set_apply, t_ret; + + COMPQUIET(gen, 0); + COMPQUIET(master, DB_EID_INVALID); + + db_rep = env->rep_handle; + rep = db_rep->region; + event = ret = set_apply = 0; + memset(&control_dbt, 0, sizeof(control_dbt)); + memset(&rec_dbt, 0, sizeof(rec_dbt)); + ZERO_LSN(max_lsn); + timespecclear(&max_ts); + timespecset(&msg_time, rp->msg_sec, rp->msg_nsec); + cmp = -2; /* OOB value that LOG_COMPARE can't return. */ + + dblp = env->lg_handle; + MUTEX_LOCK(env, rep->mtx_clientdb); + /* + * Lazily open the temp db. 
Always set the startup flag to 0 + * because it was initialized from rep_start. + */ + if (db_rep->rep_db == NULL && + (ret = __rep_client_dbinit(env, 0, REP_DB)) != 0) { + MUTEX_UNLOCK(env, rep->mtx_clientdb); + goto out; + } + dbp = db_rep->rep_db; + lp = dblp->reginfo.primary; + newfile_seen = 0; + REP_SYSTEM_LOCK(env); + if (F_ISSET(rep, REP_F_RECOVER_LOG) && + LOG_COMPARE(&lp->ready_lsn, &rep->first_lsn) < 0) + lp->ready_lsn = rep->first_lsn; + cmp = LOG_COMPARE(&rp->lsn, &lp->ready_lsn); + /* + * If we are going to skip or process any message other + * than a duplicate, make note of it if we're in an + * election so that the election can rerequest proactively. + */ + if (F_ISSET(rep, REP_F_READY_APPLY) && cmp >= 0) + F_SET(rep, REP_F_SKIPPED_APPLY); + + /* + * If we're in the middle of processing a NEWFILE, we've dropped + * the mutex and if this matches it is a duplicate record. We + * do not want this call taking the "matching" code below because + * we may then process later records in the temp db and the + * original NEWFILE may not have the log file ready. It will + * process those temp db items when it completes. + */ + if (F_ISSET(rep, REP_F_NEWFILE) && cmp == 0) + cmp = -1; + + if (cmp == 0) { + /* + * If we are in an election (i.e. we've sent a vote + * with an LSN in it), then we drop the next record + * we're expecting. When we find a master, we'll + * either go into sync, or if it was an existing + * master, rerequest this one record (later records + * are accumulating in the temp db). + * + * We can simply return here, and rep_process_message + * will set NOTPERM if necessary for this record. + */ + if (F_ISSET(rep, REP_F_READY_APPLY)) { + /* + * We will simply return now. All special return + * processing should be ignored because the special + * values are just initialized. Variables like + * max_lsn are still 0. + */ + RPRINT(env, DB_VERB_REP_MISC, (env, + "rep_apply: In election. 
Ignoring [%lu][%lu]", + (u_long)rp->lsn.file, (u_long)rp->lsn.offset)); + REP_SYSTEM_UNLOCK(env); + MUTEX_UNLOCK(env, rep->mtx_clientdb); + goto out; + } + rep->apply_th++; + set_apply = 1; + RPRINT(env, DB_VERB_REP_MISC, (env, + "rep_apply: Set apply_th %d", rep->apply_th)); + REP_SYSTEM_UNLOCK(env); + if (rp->rectype == REP_NEWFILE) + newfile_seen = 1; + if ((ret = __rep_process_rec(env, ip, + rp, rec, &max_ts, &max_lsn)) != 0) + goto err; + /* + * If we get the record we are expecting, reset + * the count of records we've received and are applying + * towards the request interval. + */ + __os_gettime(env, &lp->rcvd_ts, 1); + ZERO_LSN(lp->max_wait_lsn); + + /* + * The __rep_remfirst() and __rep_getnext() functions each open, + * use and then close a cursor on the temp db, each time through + * the loop. Although this may seem excessive, it is necessary + * to avoid locking problems with checkpoints. + */ + while (ret == 0 && + LOG_COMPARE(&lp->ready_lsn, &lp->waiting_lsn) == 0) { + /* + * We just filled in a gap in the log record stream. + * Write subsequent records to the log. + */ +gap_check: + if ((ret = __rep_remfirst(env, ip, + &control_dbt, &rec_dbt)) != 0) + goto err; + + rp = (__rep_control_args *)control_dbt.data; + timespecset(&msg_time, rp->msg_sec, rp->msg_nsec); + rec = &rec_dbt; + if (rp->rectype == REP_NEWFILE) + newfile_seen = 1; + if ((ret = __rep_process_rec(env, ip, + rp, rec, &max_ts, &max_lsn)) != 0) + goto err; + + --rep->stat.st_log_queued; + + /* + * Since we just filled a gap in the log stream, and + * we're writing subsequent records to the log, we want + * to use rcvd_ts and wait_ts so that we will + * request the next gap if we end up with a gap and + * not so recent records in the temp db, but not + * request if recent records are in the temp db and + * likely to arrive on its own shortly. We want to + * avoid requesting the record in that case. Also + * reset max_wait_lsn because the next gap is a + * fresh gap. 
+ */ + lp->rcvd_ts = lp->last_ts; + lp->wait_ts = rep->request_gap; + if ((ret = __rep_getnext(env, ip)) == DB_NOTFOUND) { + __os_gettime(env, &lp->rcvd_ts, 1); + ret = 0; + break; + } else if (ret != 0) + goto err; + } + + /* + * Check if we're at a gap in the table and if so, whether we + * need to ask for any records. + */ + if (!IS_ZERO_LSN(lp->waiting_lsn) && + LOG_COMPARE(&lp->ready_lsn, &lp->waiting_lsn) != 0) { + /* + * We got a record and processed it, but we may + * still be waiting for more records. If we + * filled a gap we keep a count of how many other + * records are in the temp database and if we should + * request the next gap at this time. + */ + if (__rep_check_doreq(env, rep) && (ret = + __rep_loggap_req(env, rep, &rp->lsn, 0)) != 0) + goto err; + } else { + lp->wait_ts = rep->request_gap; + ZERO_LSN(lp->max_wait_lsn); + } + + } else if (cmp > 0) { + /* + * The LSN is higher than the one we were waiting for. + * This record isn't in sequence; add it to the temporary + * database, update waiting_lsn if necessary, and perform + * calculations to determine if we should issue requests + * for new records. + */ + REP_SYSTEM_UNLOCK(env); + memset(&key_dbt, 0, sizeof(key_dbt)); + key_dbt.data = rp; + key_dbt.size = sizeof(*rp); + ret = __db_put(dbp, ip, NULL, &key_dbt, rec, DB_NOOVERWRITE); + if (ret == 0) { + rep->stat.st_log_queued++; + __os_gettime(env, &lp->last_ts, 1); +#ifdef HAVE_STATISTICS + STAT(rep->stat.st_log_queued_total++); + if (rep->stat.st_log_queued_max < + rep->stat.st_log_queued) + rep->stat.st_log_queued_max = + rep->stat.st_log_queued; +#endif + } + + if (ret == DB_KEYEXIST) + ret = 0; + if (ret != 0) + goto done; + + if (IS_ZERO_LSN(lp->waiting_lsn) || + LOG_COMPARE(&rp->lsn, &lp->waiting_lsn) < 0) { + /* + * If this is a new gap, then reset the rcvd_ts so + * that an out-of-order record after an idle period + * does not (likely) immediately rerequest. 
+ */ + if (IS_ZERO_LSN(lp->waiting_lsn)) + __os_gettime(env, &lp->rcvd_ts, 1); + lp->waiting_lsn = rp->lsn; + } + + if (__rep_check_doreq(env, rep) && + (ret = __rep_loggap_req(env, rep, &rp->lsn, 0) != 0)) + goto err; + + /* + * If this is permanent; let the caller know that we have + * not yet written it to disk, but we've accepted it. + */ + if (ret == 0 && F_ISSET(rp, REPCTL_PERM)) { + max_lsn = rp->lsn; + ret = DB_REP_NOTPERM; + } + goto done; + } else { + STAT(rep->stat.st_log_duplicated++); + REP_SYSTEM_UNLOCK(env); + if (is_dupp != NULL) + *is_dupp = 1; + LOGCOPY_32(env, &rectype, rec->data); + if (rectype == DB___txn_regop || rectype == DB___txn_ckp) + max_lsn = lp->max_perm_lsn; + /* + * We check REPCTL_LEASE here, because this client may + * have leases configured but the master may not (especially + * in a mixed version group. If the master has leases + * configured, all clients must also. + */ + if (IS_USING_LEASES(env) && + F_ISSET(rp, REPCTL_LEASE) && + timespecisset(&msg_time)) { + if (timespeccmp(&msg_time, &lp->max_lease_ts, >)) + max_ts = msg_time; + else + max_ts = lp->max_lease_ts; + } + goto done; + } + + /* Check if we need to go back into the table. */ + if (ret == 0 && LOG_COMPARE(&lp->ready_lsn, &lp->waiting_lsn) == 0) + goto gap_check; + +done: +err: /* + * In case of a race, to make sure only one thread can get + * DB_REP_LOGREADY, zero out rep->last_lsn to show that we've gotten to + * this point. + */ + REP_SYSTEM_LOCK(env); + if (ret == 0 && + F_ISSET(rep, REP_F_RECOVER_LOG) && + !IS_ZERO_LSN(rep->last_lsn) && + LOG_COMPARE(&lp->ready_lsn, &rep->last_lsn) >= 0) { + *last_lsnp = max_lsn; + ZERO_LSN(rep->last_lsn); + ZERO_LSN(max_lsn); + ret = DB_REP_LOGREADY; + } + /* + * Only decrement if we were actually applying log records. + * We do not care if we processed a dup record or put one + * in the temp db. 
+ */ + if (set_apply) { + rep->apply_th--; + RPRINT(env, DB_VERB_REP_MISC, (env, + "rep_apply: Decrement apply_th %d [%lu][%lu]", + rep->apply_th, (u_long)lp->ready_lsn.file, + (u_long)lp->ready_lsn.offset)); + } + + if (ret == 0 && !F_ISSET(rep, REP_F_RECOVER_LOG) && + !IS_ZERO_LSN(max_lsn)) { + if (ret_lsnp != NULL) + *ret_lsnp = max_lsn; + ret = DB_REP_ISPERM; + DB_ASSERT(env, LOG_COMPARE(&max_lsn, &lp->max_perm_lsn) >= 0); + lp->max_perm_lsn = max_lsn; + } + + /* + * Start-up is complete when we process (or have already processed) up + * to the end of the replication group's log. In case we miss that + * message, as a back-up, we also recognize start-up completion when we + * actually process a live log record. Having cmp==0 here (with a good + * "ret" value) implies we actually processed the record. + */ + if ((ret == 0 || ret == DB_REP_ISPERM) && + rep->stat.st_startup_complete == 0 && + !F_ISSET(rep, REP_F_RECOVER_LOG) && + ((cmp <= 0 && F_ISSET(rp, REPCTL_LOG_END)) || + (cmp == 0 && !F_ISSET(rp, REPCTL_RESEND)))) { + rep->stat.st_startup_complete = 1; + event = 1; + gen = rep->gen; + master = rep->master_id; + } + REP_SYSTEM_UNLOCK(env); + /* + * If we've processed beyond the needed LSN for a pending + * start sync, start it now. We can compare >= here + * because ready_lsn is the next record we expect. + * Since ckp_lsn can point to the last commit record itself, + * but if it does and ready_lsn == commit (i.e. we haven't + * written the commit yet), we can still start to sync + * because we're guaranteed no additional buffers can + * be dirtied. + */ + if (!IS_ZERO_LSN(rep->ckp_lsn) && + LOG_COMPARE(&lp->ready_lsn, &rep->ckp_lsn) >= 0) { + save_lsn = rep->ckp_lsn; + ZERO_LSN(rep->ckp_lsn); + } else + ZERO_LSN(save_lsn); + + /* + * If this is a perm record, we are using leases, update the lease + * grant. We must hold the clientdb mutex. We must not hold + * the region mutex because rep_update_grant will acquire it. 
+ */ + if (ret == DB_REP_ISPERM && IS_USING_LEASES(env) && + timespecisset(&max_ts)) { + if ((t_ret = __rep_update_grant(env, &max_ts)) != 0) + ret = t_ret; + else if (timespeccmp(&max_ts, &lp->max_lease_ts, >)) + lp->max_lease_ts = max_ts; + } + + MUTEX_UNLOCK(env, rep->mtx_clientdb); + if (!IS_ZERO_LSN(save_lsn)) { + /* + * Now call memp_sync holding only the ckp mutex. + */ + MUTEX_LOCK(env, rep->mtx_ckp); + RPRINT(env, DB_VERB_REP_MISC, (env, + "Starting delayed __memp_sync call [%lu][%lu]", + (u_long)save_lsn.file, (u_long)save_lsn.offset)); + t_ret = __memp_sync(env, + DB_SYNC_CHECKPOINT | DB_SYNC_INTERRUPT_OK, &save_lsn); + MUTEX_UNLOCK(env, rep->mtx_ckp); + } + if (event) { + RPRINT(env, DB_VERB_REP_MISC, (env, + "Start-up is done [%lu][%lu]", + (u_long)rp->lsn.file, (u_long)rp->lsn.offset)); + + if ((t_ret = __rep_fire_startupdone(env, gen, master)) != 0) { + DB_ASSERT(env, ret == 0 || ret == DB_REP_ISPERM); + /* Failure trumps either of those values. */ + ret = t_ret; + goto out; + } + } + if ((ret == 0 || ret == DB_REP_ISPERM) && + newfile_seen && lp->db_log_autoremove) + __log_autoremove(env); + if (control_dbt.data != NULL) + __os_ufree(env, control_dbt.data); + if (rec_dbt.data != NULL) + __os_ufree(env, rec_dbt.data); + +out: + switch (ret) { + case 0: + break; + case DB_REP_ISPERM: + RPRINT(env, DB_VERB_REP_MSGS, + (env, "Returning ISPERM [%lu][%lu], cmp = %d", + (u_long)max_lsn.file, (u_long)max_lsn.offset, cmp)); + break; + case DB_REP_LOGREADY: + RPRINT(env, DB_VERB_REP_MSGS, (env, + "Returning LOGREADY up to [%lu][%lu], cmp = %d", + (u_long)last_lsnp->file, + (u_long)last_lsnp->offset, cmp)); + break; + case DB_REP_NOTPERM: + if (!F_ISSET(rep, REP_F_RECOVER_LOG) && + !IS_ZERO_LSN(max_lsn) && ret_lsnp != NULL) + *ret_lsnp = max_lsn; + + RPRINT(env, DB_VERB_REP_MSGS, + (env, "Returning NOTPERM [%lu][%lu], cmp = %d", + (u_long)max_lsn.file, (u_long)max_lsn.offset, cmp)); + break; + default: + RPRINT(env, DB_VERB_REP_MSGS, + (env, "Returning %d 
[%lu][%lu], cmp = %d", ret, + (u_long)max_lsn.file, (u_long)max_lsn.offset, cmp)); + break; + } + + return (ret); +} + +/* + * __rep_process_txn -- + * + * This is the routine that actually gets a transaction ready for + * processing. + * + * PUBLIC: int __rep_process_txn __P((ENV *, DBT *)); + */ +int +__rep_process_txn(env, rec) + ENV *env; + DBT *rec; +{ + DBT data_dbt, *lock_dbt; + DB_LOCKER *locker; + DB_LOCKREQ req, *lvp; + DB_LOGC *logc; + DB_LSN prev_lsn, *lsnp; + DB_REP *db_rep; + DB_THREAD_INFO *ip; + DB_TXNHEAD *txninfo; + LSN_COLLECTION lc; + REP *rep; + __txn_regop_args *txn_args; + __txn_regop_42_args *txn42_args; + __txn_prepare_args *prep_args; + u_int32_t rectype; + u_int i; + int ret, t_ret; + + db_rep = env->rep_handle; + rep = db_rep->region; + logc = NULL; + txn_args = NULL; + txn42_args = NULL; + prep_args = NULL; + txninfo = NULL; + + ENV_ENTER(env, ip); + memset(&data_dbt, 0, sizeof(data_dbt)); + if (F_ISSET(env, ENV_THREAD)) + F_SET(&data_dbt, DB_DBT_REALLOC); + + /* + * There are two phases: First, we have to traverse backwards through + * the log records gathering the list of all LSNs in the transaction. + * Once we have this information, we can loop through and then apply it. + * + * We may be passed a prepare (if we're restoring a prepare on upgrade) + * instead of a commit (the common case). Check which it is and behave + * appropriately. + */ + LOGCOPY_32(env, &rectype, rec->data); + memset(&lc, 0, sizeof(lc)); + if (rectype == DB___txn_regop) { + /* + * We're the end of a transaction. Make sure this is + * really a commit and not an abort! 
+ */ + if (rep->version >= DB_REPVERSION_44) { + if ((ret = __txn_regop_read( + env, rec->data, &txn_args)) != 0) + return (ret); + if (txn_args->opcode != TXN_COMMIT) { + __os_free(env, txn_args); + return (0); + } + prev_lsn = txn_args->prev_lsn; + lock_dbt = &txn_args->locks; + } else { + if ((ret = __txn_regop_42_read( + env, rec->data, &txn42_args)) != 0) + return (ret); + if (txn42_args->opcode != TXN_COMMIT) { + __os_free(env, txn42_args); + return (0); + } + prev_lsn = txn42_args->prev_lsn; + lock_dbt = &txn42_args->locks; + } + } else { + /* We're a prepare. */ + DB_ASSERT(env, rectype == DB___txn_prepare); + + if ((ret = __txn_prepare_read( + env, rec->data, &prep_args)) != 0) + return (ret); + prev_lsn = prep_args->prev_lsn; + lock_dbt = &prep_args->locks; + } + + /* Get locks. */ + if ((ret = __lock_id(env, NULL, &locker)) != 0) + goto err1; + + if ((ret = + __lock_get_list(env, locker, 0, DB_LOCK_WRITE, lock_dbt)) != 0) + goto err; + + /* Phase 1. Get a list of the LSNs in this transaction, and sort it. */ + if ((ret = __rep_collect_txn(env, &prev_lsn, &lc)) != 0) + goto err; + qsort(lc.array, lc.nlsns, sizeof(DB_LSN), __rep_lsn_cmp); + + /* + * The set of records for a transaction may include dbreg_register + * records. Create a txnlist so that they can keep track of file + * state between records. + */ + if ((ret = __db_txnlist_init(env, ip, 0, 0, NULL, &txninfo)) != 0) + goto err; + + /* Phase 2: Apply updates. 
*/ + if ((ret = __log_cursor(env, &logc)) != 0) + goto err; + for (lsnp = &lc.array[0], i = 0; i < lc.nlsns; i++, lsnp++) { + if ((ret = __logc_get(logc, lsnp, &data_dbt, DB_SET)) != 0) { + __db_errx(env, "failed to read the log at [%lu][%lu]", + (u_long)lsnp->file, (u_long)lsnp->offset); + goto err; + } + if ((ret = __db_dispatch(env, &env->recover_dtab, + &data_dbt, lsnp, DB_TXN_APPLY, txninfo)) != 0) { + __db_errx(env, "transaction failed at [%lu][%lu]", + (u_long)lsnp->file, (u_long)lsnp->offset); + goto err; + } + } + +err: memset(&req, 0, sizeof(req)); + req.op = DB_LOCK_PUT_ALL; + if ((t_ret = + __lock_vec(env, locker, 0, &req, 1, &lvp)) != 0 && ret == 0) + ret = t_ret; + + if ((t_ret = __lock_id_free(env, locker)) != 0 && ret == 0) + ret = t_ret; + +err1: if (txn_args != NULL) + __os_free(env, txn_args); + if (txn42_args != NULL) + __os_free(env, txn42_args); + if (prep_args != NULL) + __os_free(env, prep_args); + if (lc.array != NULL) + __os_free(env, lc.array); + + if (logc != NULL && (t_ret = __logc_close(logc)) != 0 && ret == 0) + ret = t_ret; + + if (txninfo != NULL) + __db_txnlist_end(env, txninfo); + + if (F_ISSET(&data_dbt, DB_DBT_REALLOC) && data_dbt.data != NULL) + __os_ufree(env, data_dbt.data); + +#ifdef HAVE_STATISTICS + if (ret == 0) + /* + * We don't hold the rep mutex, and could miscount if we race. + */ + rep->stat.st_txns_applied++; +#endif + + return (ret); +} + +/* + * __rep_collect_txn + * Recursive function that will let us visit every entry in a transaction + * chain including all child transactions so that we can then apply + * the entire transaction family at once. 
+ */ +static int +__rep_collect_txn(env, lsnp, lc) + ENV *env; + DB_LSN *lsnp; + LSN_COLLECTION *lc; +{ + __txn_child_args *argp; + DB_LOGC *logc; + DB_LSN c_lsn; + DBT data; + u_int32_t rectype; + u_int nalloc; + int ret, t_ret; + + memset(&data, 0, sizeof(data)); + F_SET(&data, DB_DBT_REALLOC); + + if ((ret = __log_cursor(env, &logc)) != 0) + return (ret); + + while (!IS_ZERO_LSN(*lsnp) && + (ret = __logc_get(logc, lsnp, &data, DB_SET)) == 0) { + LOGCOPY_32(env, &rectype, data.data); + if (rectype == DB___txn_child) { + if ((ret = __txn_child_read( + env, data.data, &argp)) != 0) + goto err; + c_lsn = argp->c_lsn; + *lsnp = argp->prev_lsn; + __os_free(env, argp); + ret = __rep_collect_txn(env, &c_lsn, lc); + } else { + if (lc->nalloc < lc->nlsns + 1) { + nalloc = lc->nalloc == 0 ? 20 : lc->nalloc * 2; + if ((ret = __os_realloc(env, + nalloc * sizeof(DB_LSN), &lc->array)) != 0) + goto err; + lc->nalloc = nalloc; + } + lc->array[lc->nlsns++] = *lsnp; + + /* + * Explicitly copy the previous lsn. The record + * starts with a u_int32_t record type, a u_int32_t + * txn id, and then the DB_LSN (prev_lsn) that we + * want. We copy explicitly because we have no idea + * what kind of record this is. + */ + LOGCOPY_TOLSN(env, lsnp, (u_int8_t *)data.data + + sizeof(u_int32_t) + sizeof(u_int32_t)); + } + + if (ret != 0) + goto err; + } + if (ret != 0) + __db_errx(env, "collect failed at: [%lu][%lu]", + (u_long)lsnp->file, (u_long)lsnp->offset); + +err: if ((t_ret = __logc_close(logc)) != 0 && ret == 0) + ret = t_ret; + if (data.data != NULL) + __os_ufree(env, data.data); + return (ret); +} + +/* + * __rep_lsn_cmp -- + * qsort-type-compatible wrapper for LOG_COMPARE. + */ +static int +__rep_lsn_cmp(lsn1, lsn2) + const void *lsn1, *lsn2; +{ + + return (LOG_COMPARE((DB_LSN *)lsn1, (DB_LSN *)lsn2)); +} + +/* + * __rep_newfile -- + * NEWFILE messages have the LSN of the last record in the previous + * log file. 
When applying a NEWFILE message, make sure we haven't already + * swapped files. Assume caller hold mtx_clientdb. + */ +static int +__rep_newfile(env, rp, rec) + ENV *env; + __rep_control_args *rp; + DBT *rec; +{ + DB_LOG *dblp; + DB_LSN tmplsn; + DB_REP *db_rep; + LOG *lp; + REP *rep; + __rep_newfile_args nf_args; + int ret; + + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + db_rep = env->rep_handle; + rep = db_rep->region; + + /* + * If a newfile is already in progress, just ignore. + */ + if (F_ISSET(rep, REP_F_NEWFILE)) + return (0); + if (rp->lsn.file + 1 > lp->ready_lsn.file) { + if (rec == NULL || rec->size == 0) { + RPRINT(env, DB_VERB_REP_MISC, (env, +"rep_newfile: Old-style NEWFILE msg. Use control msg log version: %lu", + (u_long) rp->log_version)); + nf_args.version = rp->log_version; + } else if (rp->rep_version < DB_REPVERSION_47) + nf_args.version = *(u_int32_t *)rec->data; + else if ((ret = __rep_newfile_unmarshal(env, &nf_args, + rec->data, rec->size, NULL)) != 0) + return (ret); + RPRINT(env, DB_VERB_REP_MISC, + (env, "rep_newfile: File %lu vers %lu", + (u_long)rp->lsn.file + 1, (u_long)nf_args.version)); + + /* + * We drop the mtx_clientdb mutex during + * the file operation, and then reacquire it when + * we're done. We avoid colliding with new incoming + * log records because lp->ready_lsn is not getting + * updated and there is no real log record at this + * ready_lsn. We avoid colliding with a duplicate + * NEWFILE message by setting an in-progress flag. + */ + REP_SYSTEM_LOCK(env); + F_SET(rep, REP_F_NEWFILE); + REP_SYSTEM_UNLOCK(env); + MUTEX_UNLOCK(env, rep->mtx_clientdb); + LOG_SYSTEM_LOCK(env); + ret = __log_newfile(dblp, &tmplsn, 0, nf_args.version); + LOG_SYSTEM_UNLOCK(env); + MUTEX_LOCK(env, rep->mtx_clientdb); + REP_SYSTEM_LOCK(env); + F_CLR(rep, REP_F_NEWFILE); + REP_SYSTEM_UNLOCK(env); + if (ret == 0) + lp->ready_lsn = tmplsn; + return (ret); + } else + /* We've already applied this NEWFILE. Just ignore it. 
*/ + return (0); +} + +/* + * __rep_do_ckp -- + * Perform the memp_sync necessary for this checkpoint without holding the + * REP->mtx_clientdb. Callers of this function must hold REP->mtx_clientdb + * and must not be holding the region mutex. + */ +static int +__rep_do_ckp(env, rec, rp) + ENV *env; + DBT *rec; + __rep_control_args *rp; +{ + DB_ENV *dbenv; + __txn_ckp_args *ckp_args; + DB_LSN ckp_lsn; + REP *rep; + int ret; + + dbenv = env->dbenv; + + /* Crack the log record and extract the checkpoint LSN. */ + if ((ret = __txn_ckp_read(env, rec->data, &ckp_args)) != 0) + return (ret); + ckp_lsn = ckp_args->ckp_lsn; + __os_free(env, ckp_args); + + rep = env->rep_handle->region; + + MUTEX_UNLOCK(env, rep->mtx_clientdb); + DB_TEST_WAIT(env, env->test_check); + + /* + * Sync the memory pool. + * + * This is the real PERM lock record/ckp. We cannot return ISPERM + * if we haven't truly completed the checkpoint, so we don't allow + * this call to be interrupted. + * + * We may be overlapping our log record with an in-progress startsync + * of this checkpoint; suppress the max_write settings on any running + * cache-flush operation so it completes quickly. + */ + (void)__memp_set_config(dbenv, DB_MEMP_SUPPRESS_WRITE, 1); + MUTEX_LOCK(env, rep->mtx_ckp); + ret = __memp_sync(env, DB_SYNC_CHECKPOINT, &ckp_lsn); + MUTEX_UNLOCK(env, rep->mtx_ckp); + (void)__memp_set_config(dbenv, DB_MEMP_SUPPRESS_WRITE, 0); + + /* Update the last_ckp in the txn region. 
*/ + if (ret == 0) + ret = __txn_updateckp(env, &rp->lsn); + else { + __db_errx(env, "Error syncing ckp [%lu][%lu]", + (u_long)ckp_lsn.file, (u_long)ckp_lsn.offset); + ret = __env_panic(env, ret); + } + + MUTEX_LOCK(env, rep->mtx_clientdb); + return (ret); +} + +/* + * __rep_remfirst -- + * Remove the first entry from the __db.rep.db + */ +static int +__rep_remfirst(env, ip, cntrl, rec) + ENV *env; + DB_THREAD_INFO *ip; + DBT *cntrl; + DBT *rec; +{ + DB *dbp; + DBC *dbc; + DB_REP *db_rep; + int ret, t_ret; + + db_rep = env->rep_handle; + dbp = db_rep->rep_db; + if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0) + return (ret); + + /* The DBTs need to persist through another call. */ + F_SET(cntrl, DB_DBT_REALLOC); + F_SET(rec, DB_DBT_REALLOC); + if ((ret = __dbc_get(dbc, cntrl, rec, DB_RMW | DB_FIRST)) == 0) + ret = __dbc_del(dbc, 0); + if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __rep_getnext -- + * Get the next record out of the __db.rep.db table. + */ +static int +__rep_getnext(env, ip) + ENV *env; + DB_THREAD_INFO *ip; +{ + DB *dbp; + DBC *dbc; + DBT lsn_dbt, nextrec_dbt; + DB_LOG *dblp; + DB_REP *db_rep; + LOG *lp; + __rep_control_args *rp; + int ret, t_ret; + + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + + db_rep = env->rep_handle; + dbp = db_rep->rep_db; + + if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0) + return (ret); + + /* + * Update waiting_lsn. We need to move it + * forward to the LSN of the next record + * in the queue. + * + * If the next item in the database is a log + * record--the common case--we're not + * interested in its contents, just in its LSN. + * Optimize by doing a partial get of the data item. 
+ */ + memset(&nextrec_dbt, 0, sizeof(nextrec_dbt)); + F_SET(&nextrec_dbt, DB_DBT_PARTIAL); + nextrec_dbt.ulen = nextrec_dbt.dlen = 0; + + memset(&lsn_dbt, 0, sizeof(lsn_dbt)); + ret = __dbc_get(dbc, &lsn_dbt, &nextrec_dbt, DB_FIRST); + if (ret != DB_NOTFOUND && ret != 0) + goto err; + + if (ret == DB_NOTFOUND) { + ZERO_LSN(lp->waiting_lsn); + /* + * Whether or not the current record is + * simple, there's no next one, and + * therefore we haven't got anything + * else to do right now. Break out. + */ + goto err; + } + rp = (__rep_control_args *)lsn_dbt.data; + lp->waiting_lsn = rp->lsn; + +err: if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} + +/* + * __rep_process_rec -- + * + * Given a record in 'rp', process it. In the case of a NEWFILE, that means + * potentially switching files. In the case of a checkpoint, it means doing + * the checkpoint, and in other cases, it means simply writing the record into + * the log. + */ +static int +__rep_process_rec(env, ip, rp, rec, ret_tsp, ret_lsnp) + ENV *env; + DB_THREAD_INFO *ip; + __rep_control_args *rp; + DBT *rec; + db_timespec *ret_tsp; + DB_LSN *ret_lsnp; +{ + DB *dbp; + DBT control_dbt, key_dbt, rec_dbt; + DB_REP *db_rep; + REP *rep; + db_timespec msg_time; + u_int32_t rectype, txnid; + int ret, t_ret; + + db_rep = env->rep_handle; + rep = db_rep->region; + dbp = db_rep->rep_db; + ret = 0; + + if (rp->rectype == REP_NEWFILE) { + ret = __rep_newfile(env, rp, rec); + return (0); + } + + LOGCOPY_32(env, &rectype, rec->data); + memset(&control_dbt, 0, sizeof(control_dbt)); + memset(&rec_dbt, 0, sizeof(rec_dbt)); + timespecset(&msg_time, rp->msg_sec, rp->msg_nsec); + + /* + * We write all records except for checkpoint records here. + * All non-checkpoint records need to appear in the log before + * we take action upon them (i.e., we enforce write-ahead logging). 
+ * However, we can't write the checkpoint record here until the + * data buffers are actually written to disk, else we are creating + * an invalid log -- one that says all data before a certain point + * has been written to disk. + * + * If two threads are both processing the same checkpoint record + * (because, for example, it was resent and the original finally + * arrived), we handle that below by checking for the existence of + * the log record when we add it to the replication database. + * + * Any log records that arrive while we are processing the checkpoint + * are added to the bookkeeping database because ready_lsn is not yet + * updated to point after the checkpoint record. + */ + if (rectype != DB___txn_ckp || F_ISSET(rep, REP_F_RECOVER_LOG)) { + if ((ret = __log_rep_put(env, &rp->lsn, rec, 0)) != 0) + return (ret); + STAT(rep->stat.st_log_records++); + if (F_ISSET(rep, REP_F_RECOVER_LOG)) { + *ret_lsnp = rp->lsn; + goto out; + } + } + + switch (rectype) { + case DB___dbreg_register: + /* + * DB opens occur in the context of a transaction, so we can + * simply handle them when we process the transaction. Closes, + * however, are not transaction-protected, so we have to handle + * them here. + * + * It should be unsafe for the master to do a close of a file + * that was opened in an active transaction, so we should be + * guaranteed to get the ordering right. + * + * !!! + * The txn ID is the second 4-byte field of the log record. + * We should really be calling __dbreg_register_read() and + * working from the __dbreg_register_args structure, but this + * is considerably faster and the order of the fields won't + * change. 
+ */ + LOGCOPY_32(env, &txnid, + (u_int8_t *)rec->data + sizeof(u_int32_t)); + if (txnid == TXN_INVALID) + ret = __db_dispatch(env, &env->recover_dtab, + rec, &rp->lsn, DB_TXN_APPLY, NULL); + break; + case DB___txn_regop: + /* + * If an application is doing app-specific recovery + * and acquires locks while applying a transaction, + * it can deadlock. Any other locks held by this + * thread should have been discarded in the + * __rep_process_txn error path, so if we simply + * retry, we should eventually succeed. + */ + do { + ret = 0; + if (!F_ISSET(db_rep, DBREP_OPENFILES)) { + ret = __txn_openfiles(env, ip, NULL, 1); + F_SET(db_rep, DBREP_OPENFILES); + } + if (ret == 0) + ret = __rep_process_txn(env, rec); + } while (ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED); + + /* Now flush the log unless we're running TXN_NOSYNC. */ + if (ret == 0 && !F_ISSET(env->dbenv, DB_ENV_TXN_NOSYNC)) + ret = __log_flush(env, NULL); + if (ret != 0) { + __db_errx(env, "Error processing txn [%lu][%lu]", + (u_long)rp->lsn.file, (u_long)rp->lsn.offset); + ret = __env_panic(env, ret); + } + *ret_lsnp = rp->lsn; + break; + case DB___txn_prepare: + ret = __log_flush(env, NULL); + /* + * Save the biggest prepared LSN we've seen. + */ + rep->max_prep_lsn = rp->lsn; + RPRINT(env, DB_VERB_REP_MSGS, + (env, "process_rec: prepare at [%lu][%lu]", + (u_long)rep->max_prep_lsn.file, + (u_long)rep->max_prep_lsn.offset)); + break; + case DB___txn_ckp: + /* + * We do not want to hold the REP->mtx_clientdb mutex while + * syncing the mpool, so if we get a checkpoint record we are + * supposed to process, add it to the __db.rep.db, do the + * memp_sync and then go back and process it later, when the + * sync has finished. If this record is already in the table, + * then some other thread will process it, so simply return + * REP_NOTPERM. 
+ */ + memset(&key_dbt, 0, sizeof(key_dbt)); + key_dbt.data = rp; + key_dbt.size = sizeof(*rp); + + /* + * We want to put this record into the tmp DB only if + * it doesn't exist, so use DB_NOOVERWRITE. + */ + ret = __db_put(dbp, ip, NULL, &key_dbt, rec, DB_NOOVERWRITE); + if (ret == DB_KEYEXIST) { + if (ret_lsnp != NULL) + *ret_lsnp = rp->lsn; + ret = DB_REP_NOTPERM; + } + if (ret != 0) + break; + + /* + * Now, do the checkpoint. Regardless of + * whether the checkpoint succeeds or not, + * we need to remove the record we just put + * in the temporary database. If the + * checkpoint failed, return an error. We + * will act like we never received the + * checkpoint. + */ + if ((ret = __rep_do_ckp(env, rec, rp)) == 0) + ret = __log_rep_put(env, &rp->lsn, rec, + DB_LOG_CHKPNT); + if ((t_ret = __rep_remfirst(env, ip, + &control_dbt, &rec_dbt)) != 0 && ret == 0) + ret = t_ret; + /* + * If we're successful putting the log record in the + * log, flush it for a checkpoint. + */ + if (ret == 0) { + *ret_lsnp = rp->lsn; + ret = __log_flush(env, NULL); + } + break; + default: + break; + } + +out: + if (ret == 0 && F_ISSET(rp, REPCTL_PERM)) + *ret_lsnp = rp->lsn; + if (IS_USING_LEASES(env) && + F_ISSET(rp, REPCTL_LEASE)) + *ret_tsp = msg_time; + /* + * Set ret_lsnp before flushing the log because if the + * flush fails, we've still written the record to the + * log and the LSN has been entered. + */ + if (ret == 0 && F_ISSET(rp, REPCTL_FLUSH)) + ret = __log_flush(env, NULL); + if (control_dbt.data != NULL) + __os_ufree(env, control_dbt.data); + if (rec_dbt.data != NULL) + __os_ufree(env, rec_dbt.data); + + return (ret); +} + +/* + * __rep_resend_req -- + * We might have dropped a message, we need to resend our request. + * The request we send is dependent on what recovery state we're in. + * The caller holds no locks. 
+ * + * PUBLIC: int __rep_resend_req __P((ENV *, int)); + */ +int +__rep_resend_req(env, rereq) + ENV *env; + int rereq; +{ + DB_LOG *dblp; + DB_LSN lsn, *lsnp; + DB_REP *db_rep; + LOG *lp; + REP *rep; + int master, ret; + u_int32_t gapflags, msgtype, repflags, sendflags; + + db_rep = env->rep_handle; + rep = db_rep->region; + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + ret = 0; + lsnp = NULL; + msgtype = REP_INVALID; + sendflags = 0; + + repflags = rep->flags; + /* + * If we are delayed we do not rerequest anything. + */ + if (FLD_ISSET(repflags, REP_F_DELAY)) + return (ret); + gapflags = rereq ? REP_GAP_REREQUEST : 0; + + if (FLD_ISSET(repflags, REP_F_RECOVER_VERIFY)) { + MUTEX_LOCK(env, rep->mtx_clientdb); + lsn = lp->verify_lsn; + MUTEX_UNLOCK(env, rep->mtx_clientdb); + if (!IS_ZERO_LSN(lsn)) { + msgtype = REP_VERIFY_REQ; + lsnp = &lsn; + sendflags = DB_REP_REREQUEST; + } + } else if (FLD_ISSET(repflags, REP_F_RECOVER_UPDATE)) { + /* + * UPDATE_REQ only goes to the master. + */ + msgtype = REP_UPDATE_REQ; + } else if (FLD_ISSET(repflags, REP_F_RECOVER_PAGE)) { + REP_SYSTEM_LOCK(env); + ret = __rep_pggap_req(env, rep, NULL, gapflags); + REP_SYSTEM_UNLOCK(env); + } else { + MUTEX_LOCK(env, rep->mtx_clientdb); + ret = __rep_loggap_req(env, rep, NULL, gapflags); + MUTEX_UNLOCK(env, rep->mtx_clientdb); + } + + if (msgtype != REP_INVALID) { + master = rep->master_id; + if (master == DB_EID_INVALID) + (void)__rep_send_message(env, + DB_EID_BROADCAST, REP_MASTER_REQ, NULL, NULL, 0, 0); + else + (void)__rep_send_message(env, + master, msgtype, lsnp, NULL, 0, sendflags); + } + + return (ret); +} + +/* + * __rep_check_doreq -- + * PUBLIC: int __rep_check_doreq __P((ENV *, REP *)); + * + * Check if we need to send another request. If so, compare with + * the request limits the user might have set. This assumes the + * caller holds the REP->mtx_clientdb mutex. Returns 1 if a request + * needs to be made, and 0 if it does not. 
+ */ +int +__rep_check_doreq(env, rep) + ENV *env; + REP *rep; +{ + + DB_LOG *dblp; + LOG *lp; + db_timespec now; + int req; + + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + __os_gettime(env, &now, 1); + timespecsub(&now, &lp->rcvd_ts); + req = timespeccmp(&now, &lp->wait_ts, >=); + if (req) { + /* + * Add wait_ts to itself to double it. + */ + timespecadd(&lp->wait_ts, &lp->wait_ts); + if (timespeccmp(&lp->wait_ts, &rep->max_gap, >)) + lp->wait_ts = rep->max_gap; + __os_gettime(env, &lp->rcvd_ts, 1); + } + return (req); +} + +/* + * __rep_skip_msg - + * + * If we're in recovery we want to skip/ignore the message, but + * we also need to see if we need to re-request any retransmissions. + */ +static int +__rep_skip_msg(env, rep, eid, rectype) + ENV *env; + REP *rep; + int eid; + u_int32_t rectype; +{ + int do_req, ret; + + ret = 0; + /* + * If we have a request message from a client then immediately + * send a REP_REREQUEST back to that client since we're skipping it. + */ + if (F_ISSET(rep, REP_F_CLIENT) && REP_MSG_REQ(rectype)) + do_req = 1; + else { + /* Check for need to retransmit. */ + MUTEX_LOCK(env, rep->mtx_clientdb); + do_req = __rep_check_doreq(env, rep); + MUTEX_UNLOCK(env, rep->mtx_clientdb); + } + /* + * Don't respond to a MASTER_REQ with + * a MASTER_REQ or REREQUEST. + */ + if (do_req && rectype != REP_MASTER_REQ) { + /* + * There are three cases: + * 1. If we don't know who the master is, then send MASTER_REQ. + * 2. If the message we're skipping came from the master, + * then we need to rerequest. + * 3. If the message didn't come from a master (i.e. client + * to client), then send a rerequest back to the sender so + * the sender can rerequest it elsewhere, if we are a client. + */ + if (rep->master_id == DB_EID_INVALID) /* Case 1. */ + (void)__rep_send_message(env, + DB_EID_BROADCAST, REP_MASTER_REQ, NULL, NULL, 0, 0); + else if (eid == rep->master_id) /* Case 2. 
*/ + ret = __rep_resend_req(env, 0); + else if (F_ISSET(rep, REP_F_CLIENT)) /* Case 3. */ + (void)__rep_send_message(env, + eid, REP_REREQUEST, NULL, NULL, 0, 0); + } + return (ret); +} + +static int +__rep_fire_newmaster(env, gen, master) + ENV *env; + u_int32_t gen; + int master; +{ + DB_REP *db_rep; + REP *rep; + + db_rep = env->rep_handle; + rep = db_rep->region; + + REP_EVENT_LOCK(env); + /* + * The firing of this event should be idempotent with respect to a + * particular generation number. + */ + if (rep->newmaster_event_gen < gen) { + __rep_fire_event(env, DB_EVENT_REP_NEWMASTER, &master); + rep->newmaster_event_gen = gen; + } + REP_EVENT_UNLOCK(env); + return (0); +} + +static int +__rep_fire_startupdone(env, gen, master) + ENV *env; + u_int32_t gen; + int master; +{ + DB_REP *db_rep; + REP *rep; + + db_rep = env->rep_handle; + rep = db_rep->region; + + REP_EVENT_LOCK(env); + /* + * Usually NEWMASTER will already have been fired. But if not, fire + * it here now, to ensure the application receives events in the + * expected order. + */ + if (rep->newmaster_event_gen < gen) { + __rep_fire_event(env, DB_EVENT_REP_NEWMASTER, &master); + rep->newmaster_event_gen = gen; + } + + /* + * Caller already ensures that it only tries to fire STARTUPDONE once + * per generation. If we did not want to rely on that, we could add a + * simple boolean flag (to the set of data protected by the mtx_event). + * The precise meaning of that flag would be "STARTUPDONE has been fired + * for the generation value stored in `newmaster_event_gen'". Then the + * more accurate test here would be simply to check that flag, and fire + * the event (and set the flag) if it were not already set. 
+ */ + if (rep->newmaster_event_gen == gen) + __rep_fire_event(env, DB_EVENT_REP_STARTUPDONE, NULL); + REP_EVENT_UNLOCK(env); + return (0); +} diff --git a/rep/rep_region.c b/rep/rep_region.c new file mode 100644 index 0000000..9eacb2c --- /dev/null +++ b/rep/rep_region.c @@ -0,0 +1,488 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2001-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_am.h" +#include "dbinc/log.h" + +static int __rep_egen_init __P((ENV *, REP *)); +static int __rep_gen_init __P((ENV *, REP *)); + +/* + * __rep_open -- + * Initialize the shared memory state for the replication system. + * + * PUBLIC: int __rep_open __P((ENV *)); + */ +int +__rep_open(env) + ENV *env; +{ + DB_REP *db_rep; + REGENV *renv; + REGINFO *infop; + REP *rep; + int ret; + + db_rep = env->rep_handle; + infop = env->reginfo; + renv = infop->primary; + ret = 0; + + if (renv->rep_off == INVALID_ROFF) { + /* Must create the region. */ + if ((ret = __env_alloc(infop, sizeof(REP), &rep)) != 0) + return (ret); + memset(rep, 0, sizeof(*rep)); + + /* + * We have the region; fill in the values. Some values may + * have been configured before we open the region, and those + * are taken from the DB_REP structure. + */ + if ((ret = __mutex_alloc( + env, MTX_REP_REGION, 0, &rep->mtx_region)) != 0) + return (ret); + /* + * Because we have no way to prevent deadlocks and cannot log + * changes made to it, we single-thread access to the client + * bookkeeping database. This is suboptimal, but it only gets + * accessed when messages arrive out-of-order, so it should + * stay small and not be used in a high-performance app. 
+ */ + if ((ret = __mutex_alloc( + env, MTX_REP_DATABASE, 0, &rep->mtx_clientdb)) != 0) + return (ret); + + if ((ret = __mutex_alloc( + env, MTX_REP_CHKPT, 0, &rep->mtx_ckp)) != 0) + return (ret); + + if ((ret = __mutex_alloc( + env, MTX_REP_EVENT, 0, &rep->mtx_event)) != 0) + return (ret); + + rep->newmaster_event_gen = 0; + rep->notified_egen = 0; + rep->lease_off = INVALID_ROFF; + rep->tally_off = INVALID_ROFF; + rep->v2tally_off = INVALID_ROFF; + rep->eid = db_rep->eid; + rep->master_id = DB_EID_INVALID; + rep->gen = 0; + rep->version = DB_REPVERSION; + rep->config = db_rep->config; + if ((ret = __rep_gen_init(env, rep)) != 0) + return (ret); + if ((ret = __rep_egen_init(env, rep)) != 0) + return (ret); + rep->gbytes = db_rep->gbytes; + rep->bytes = db_rep->bytes; + rep->request_gap = db_rep->request_gap; + rep->max_gap = db_rep->max_gap; + rep->config_nsites = db_rep->config_nsites; + rep->elect_timeout = db_rep->elect_timeout; + rep->full_elect_timeout = db_rep->full_elect_timeout; + rep->lease_timeout = db_rep->lease_timeout; + rep->clock_skew = db_rep->clock_skew; + rep->clock_base = db_rep->clock_base; + timespecclear(&rep->lease_duration); + timespecclear(&rep->grant_expire); + rep->chkpt_delay = db_rep->chkpt_delay; + rep->priority = db_rep->my_priority; + + F_SET(rep, REP_F_NOARCHIVE); + + /* Copy application type flags if set before env open. */ + if (F_ISSET(db_rep, DBREP_APP_REPMGR)) + F_SET(rep, REP_F_APP_REPMGR); + if (F_ISSET(db_rep, DBREP_APP_BASEAPI)) + F_SET(rep, REP_F_APP_BASEAPI); + + /* Initialize encapsulating region. */ + renv->rep_off = R_OFFSET(infop, rep); + (void)time(&renv->rep_timestamp); + renv->op_timestamp = 0; + F_CLR(renv, DB_REGENV_REPLOCKED); + +#ifdef HAVE_REPLICATION_THREADS + if ((ret = __repmgr_open(env, rep)) != 0) + return (ret); +#endif + } else { + rep = R_ADDR(infop, renv->rep_off); + /* + * Prevent an application type mismatch between a process + * and the environment it is trying to join. 
+ */ + if ((F_ISSET(db_rep, DBREP_APP_REPMGR) && + F_ISSET(rep, REP_F_APP_BASEAPI)) || + (F_ISSET(db_rep, DBREP_APP_BASEAPI) && + F_ISSET(rep, REP_F_APP_REPMGR))) { + __db_errx(env, +"Application type mismatch for a replication process joining the environment"); + return (EINVAL); + } +#ifdef HAVE_REPLICATION_THREADS + if ((ret = __repmgr_join(env, rep)) != 0) + return (ret); +#endif + } + + db_rep->region = rep; + + return (0); +} + +/* + * __rep_env_refresh -- + * Replication-specific refresh of the ENV structure. + * + * PUBLIC: int __rep_env_refresh __P((ENV *)); + */ +int +__rep_env_refresh(env) + ENV *env; +{ + DB_REP *db_rep; + REGENV *renv; + REGINFO *infop; + REP *rep; + int ret, t_ret; + + db_rep = env->rep_handle; + rep = db_rep->region; + infop = env->reginfo; + renv = infop->primary; + ret = 0; + + /* + * If we are the last reference closing the env, clear our knowledge of + * belonging to a group and that there is a valid handle where + * rep_start had already been called. + */ + if (renv->refcnt == 1) { + F_CLR(rep, REP_F_GROUP_ESTD); + F_CLR(rep, REP_F_START_CALLED); + } + +#ifdef HAVE_REPLICATION_THREADS + ret = __repmgr_env_refresh(env); +#endif + + /* + * If a private region, return the memory to the heap. Not needed for + * filesystem-backed or system shared memory regions, that memory isn't + * owned by any particular process. 
+ */ + if (F_ISSET(env, ENV_PRIVATE)) { + db_rep = env->rep_handle; + if (db_rep->region != NULL) { + ret = __mutex_free(env, &db_rep->region->mtx_region); + if ((t_ret = __mutex_free(env, + &db_rep->region->mtx_clientdb)) != 0 && ret == 0) + ret = t_ret; + if ((t_ret = __mutex_free(env, + &db_rep->region->mtx_ckp)) != 0 && ret == 0) + ret = t_ret; + if ((t_ret = __mutex_free(env, + &db_rep->region->mtx_event)) != 0 && ret == 0) + ret = t_ret; + } + + if (renv->rep_off != INVALID_ROFF) + __env_alloc_free(infop, R_ADDR(infop, renv->rep_off)); + } + + env->rep_handle->region = NULL; + return (ret); +} + +/* + * __rep_close -- + * Shut down all of replication. + * + * PUBLIC: int __rep_env_close __P((ENV *)); + */ +int +__rep_env_close(env) + ENV *env; +{ + int ret, t_ret; + + ret = __rep_preclose(env); + if ((t_ret = __rep_closefiles(env)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} + +/* + * __rep_preclose -- + * If we are a client, shut down our client database and send + * any outstanding bulk buffers. + * + * PUBLIC: int __rep_preclose __P((ENV *)); + */ +int +__rep_preclose(env) + ENV *env; +{ + DB_LOG *dblp; + DB_REP *db_rep; + LOG *lp; + REP_BULK bulk; + int ret; + + ret = 0; + + db_rep = env->rep_handle; + dblp = env->lg_handle; + + /* + * If we have a rep region, we can preclose. Otherwise, return. + * If we're on an error path from env open, we may not have + * a region, even though we have a handle. + */ + if (db_rep == NULL || db_rep->region == NULL) + return (ret); + MUTEX_LOCK(env, db_rep->region->mtx_clientdb); + if (db_rep->rep_db != NULL) { + ret = __db_close(db_rep->rep_db, NULL, DB_NOSYNC); + db_rep->rep_db = NULL; + } + /* + * We could be called early in an env_open error path, so + * only do this if we have a log region set up. + */ + if (dblp == NULL) + goto out; + lp = dblp->reginfo.primary; + /* + * If we have something in the bulk buffer, send anything in it + * if we are able to. 
+ */ + if (lp->bulk_off != 0 && db_rep->send != NULL) { + memset(&bulk, 0, sizeof(bulk)); + bulk.addr = R_ADDR(&dblp->reginfo, lp->bulk_buf); + bulk.offp = &lp->bulk_off; + bulk.len = lp->bulk_len; + bulk.type = REP_BULK_LOG; + bulk.eid = DB_EID_BROADCAST; + bulk.flagsp = &lp->bulk_flags; + /* + * Ignore send errors here. This can be called on the + * env->close path - make a best attempt to send. + */ + (void)__rep_send_bulk(env, &bulk, 0); + } +out: MUTEX_UNLOCK(env, db_rep->region->mtx_clientdb); + return (ret); +} + +/* + * __rep_closefiles -- + * If we were a client and are now a master, close all databases + * we've opened while applying messages as a client. This can + * be called from __env_close and we need to check if the env, + * handles and regions are set up, or not. + * + * PUBLIC: int __rep_closefiles __P((ENV *)); + */ +int +__rep_closefiles(env) + ENV *env; +{ + DB_LOG *dblp; + DB_REP *db_rep; + int ret; + + ret = 0; + + db_rep = env->rep_handle; + dblp = env->lg_handle; + + if (db_rep == NULL || db_rep->region == NULL) + return (ret); + if (dblp == NULL) + return (ret); + if ((ret = __dbreg_close_files(env, 0)) == 0) + F_CLR(db_rep, DBREP_OPENFILES); + + return (ret); +} + +/* + * __rep_egen_init -- + * Initialize the value of egen in the region. Called only from + * __rep_region_init, which is guaranteed to be single-threaded + * as we create the rep region. We set the rep->egen field which + * is normally protected by db_rep->region->mutex. + */ +static int +__rep_egen_init(env, rep) + ENV *env; + REP *rep; +{ + DB_FH *fhp; + int ret; + size_t cnt; + char *p; + + if ((ret = __db_appname(env, + DB_APP_NONE, REP_EGENNAME, NULL, &p)) != 0) + return (ret); + /* + * If the file doesn't exist, create it now and initialize with 1. + */ + if (__os_exists(env, p, NULL) != 0) { + rep->egen = rep->gen + 1; + if ((ret = __rep_write_egen(env, rep, rep->egen)) != 0) + goto err; + } else { + /* + * File exists, open it and read in our egen. 
+ */ + if ((ret = __os_open(env, p, 0, + DB_OSO_RDONLY, DB_MODE_600, &fhp)) != 0) + goto err; + if ((ret = __os_read(env, fhp, &rep->egen, sizeof(u_int32_t), + &cnt)) != 0 || cnt != sizeof(u_int32_t)) + goto err1; + RPRINT(env, DB_VERB_REP_MISC, + (env, "Read in egen %lu", (u_long)rep->egen)); +err1: (void)__os_closehandle(env, fhp); + } +err: __os_free(env, p); + return (ret); +} + +/* + * __rep_write_egen -- + * Write out the egen into the env file. + * + * PUBLIC: int __rep_write_egen __P((ENV *, REP *, u_int32_t)); + */ +int +__rep_write_egen(env, rep, egen) + ENV *env; + REP *rep; + u_int32_t egen; +{ + DB_FH *fhp; + int ret; + size_t cnt; + char *p; + + /* + * If running in-memory replication, return without any file + * operations. + */ + if (FLD_ISSET(rep->config, REP_C_INMEM)) { + return (0); + } + + if ((ret = __db_appname(env, + DB_APP_NONE, REP_EGENNAME, NULL, &p)) != 0) + return (ret); + if ((ret = __os_open( + env, p, 0, DB_OSO_CREATE | DB_OSO_TRUNC, DB_MODE_600, &fhp)) == 0) { + if ((ret = __os_write(env, fhp, &egen, sizeof(u_int32_t), + &cnt)) != 0 || ((ret = __os_fsync(env, fhp)) != 0)) + __db_err(env, ret, "%s", p); + (void)__os_closehandle(env, fhp); + } + __os_free(env, p); + return (ret); +} + +/* + * __rep_gen_init -- + * Initialize the value of gen in the region. Called only from + * __rep_region_init, which is guaranteed to be single-threaded + * as we create the rep region. We set the rep->gen field which + * is normally protected by db_rep->region->mutex. + */ +static int +__rep_gen_init(env, rep) + ENV *env; + REP *rep; +{ + DB_FH *fhp; + int ret; + size_t cnt; + char *p; + + if ((ret = __db_appname(env, + DB_APP_NONE, REP_GENNAME, NULL, &p)) != 0) + return (ret); + /* + * If the file doesn't exist, create it now and initialize with 0. + */ + if (__os_exists(env, p, NULL) != 0) { + rep->gen = 0; + if ((ret = __rep_write_gen(env, rep, rep->gen)) != 0) + goto err; + } else { + /* + * File exists, open it and read in our gen. 
+ */ + if ((ret = __os_open(env, p, 0, + DB_OSO_RDONLY, DB_MODE_600, &fhp)) != 0) + goto err; + if ((ret = __os_read(env, fhp, &rep->gen, sizeof(u_int32_t), + &cnt)) < 0 || cnt == 0) + goto err1; + RPRINT(env, DB_VERB_REP_MISC, (env, "Read in gen %lu", + (u_long)rep->gen)); +err1: (void)__os_closehandle(env, fhp); + } +err: __os_free(env, p); + return (ret); +} + +/* + * __rep_write_gen -- + * Write out the gen into the env file. + * + * PUBLIC: int __rep_write_gen __P((ENV *, REP *, u_int32_t)); + */ +int +__rep_write_gen(env, rep, gen) + ENV *env; + REP *rep; + u_int32_t gen; +{ + DB_FH *fhp; + int ret; + size_t cnt; + char *p; + + /* + * If running in-memory replication, return without any file + * operations. + */ + if (FLD_ISSET(rep->config, REP_C_INMEM)) { + return (0); + } + + if ((ret = __db_appname(env, + DB_APP_NONE, REP_GENNAME, NULL, &p)) != 0) + return (ret); + if ((ret = __os_open( + env, p, 0, DB_OSO_CREATE | DB_OSO_TRUNC, DB_MODE_600, &fhp)) == 0) { + if ((ret = __os_write(env, fhp, &gen, sizeof(u_int32_t), + &cnt)) != 0 || ((ret = __os_fsync(env, fhp)) != 0)) + __db_err(env, ret, "%s", p); + (void)__os_closehandle(env, fhp); + } + __os_free(env, p); + return (ret); +} diff --git a/rep/rep_stat.c b/rep/rep_stat.c new file mode 100644 index 0000000..4a2b93e --- /dev/null +++ b/rep/rep_stat.c @@ -0,0 +1,568 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2001-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_am.h" +#include "dbinc/log.h" + +#ifdef HAVE_STATISTICS +static int __rep_print_all __P((ENV *, u_int32_t)); +static int __rep_print_stats __P((ENV *, u_int32_t)); +static int __rep_stat __P((ENV *, DB_REP_STAT **, u_int32_t)); + +/* + * __rep_stat_pp -- + * ENV->rep_stat pre/post processing. 
+ * + * PUBLIC: int __rep_stat_pp __P((DB_ENV *, DB_REP_STAT **, u_int32_t)); + */ +int +__rep_stat_pp(dbenv, statp, flags) + DB_ENV *dbenv; + DB_REP_STAT **statp; + u_int32_t flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + int ret; + + env = dbenv->env; + + ENV_REQUIRES_CONFIG_XX( + env, rep_handle, "DB_ENV->rep_stat", DB_INIT_REP); + + if ((ret = __db_fchk(env, + "DB_ENV->rep_stat", flags, DB_STAT_CLEAR)) != 0) + return (ret); + + ENV_ENTER(env, ip); + ret = __rep_stat(env, statp, flags); + ENV_LEAVE(env, ip); + + return (ret); +} + +/* + * __rep_stat -- + * ENV->rep_stat. + */ +static int +__rep_stat(env, statp, flags) + ENV *env; + DB_REP_STAT **statp; + u_int32_t flags; +{ + DB_LOG *dblp; + DB_REP *db_rep; + DB_REP_STAT *stats; + LOG *lp; + REP *rep; + u_int32_t startupdone; + uintmax_t queued; + int dolock, ret; + + db_rep = env->rep_handle; + rep = db_rep->region; + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + + *statp = NULL; + + /* Allocate a stat struct to return to the user. */ + if ((ret = __os_umalloc(env, sizeof(DB_REP_STAT), &stats)) != 0) + return (ret); + + /* + * Read without holding the lock. If we are in client recovery, we + * copy just the stats struct so we won't block. We only copy out + * those stats that don't require acquiring any mutex. + */ + dolock = FLD_ISSET(rep->flags, REP_F_RECOVER_MASK) ? 0 : 1; + memcpy(stats, &rep->stat, sizeof(*stats)); + + /* Copy out election stats. */ + if (F_ISSET(rep, REP_F_EPHASE1)) + stats->st_election_status = 1; + else if (F_ISSET(rep, REP_F_EPHASE2)) + stats->st_election_status = 2; + + stats->st_election_nsites = rep->sites; + stats->st_election_cur_winner = rep->winner; + stats->st_election_priority = rep->w_priority; + stats->st_election_gen = rep->w_gen; + stats->st_election_lsn = rep->w_lsn; + stats->st_election_votes = rep->votes; + stats->st_election_nvotes = rep->nvotes; + stats->st_election_tiebreaker = rep->w_tiebreaker; + + /* Copy out other info that's protected by the rep mutex. 
*/ + stats->st_env_id = rep->eid; + stats->st_env_priority = rep->priority; + stats->st_nsites = rep->nsites; + stats->st_master = rep->master_id; + stats->st_gen = rep->gen; + stats->st_egen = rep->egen; + + if (F_ISSET(rep, REP_F_MASTER)) + stats->st_status = DB_REP_MASTER; + else if (F_ISSET(rep, REP_F_CLIENT)) + stats->st_status = DB_REP_CLIENT; + else + stats->st_status = 0; + + if (LF_ISSET(DB_STAT_CLEAR)) { + queued = rep->stat.st_log_queued; + startupdone = rep->stat.st_startup_complete; + memset(&rep->stat, 0, sizeof(rep->stat)); + rep->stat.st_log_queued = rep->stat.st_log_queued_total = + rep->stat.st_log_queued_max = queued; + rep->stat.st_startup_complete = startupdone; + } + + /* + * Log-related replication info is stored in the log system and + * protected by the log region lock. + */ + if (dolock) + MUTEX_LOCK(env, rep->mtx_clientdb); + if (F_ISSET(rep, REP_F_CLIENT)) { + stats->st_next_lsn = lp->ready_lsn; + stats->st_waiting_lsn = lp->waiting_lsn; + stats->st_next_pg = rep->ready_pg; + stats->st_waiting_pg = rep->waiting_pg; + stats->st_max_lease_sec = (u_int32_t)lp->max_lease_ts.tv_sec; + stats->st_max_lease_usec = (u_int32_t) + (lp->max_lease_ts.tv_nsec / NS_PER_US); + } else { + if (F_ISSET(rep, REP_F_MASTER)) { + LOG_SYSTEM_LOCK(env); + stats->st_next_lsn = lp->lsn; + LOG_SYSTEM_UNLOCK(env); + } else + ZERO_LSN(stats->st_next_lsn); + ZERO_LSN(stats->st_waiting_lsn); + stats->st_max_lease_sec = 0; + stats->st_max_lease_usec = 0; + } + stats->st_max_perm_lsn = lp->max_perm_lsn; + if (dolock) + MUTEX_UNLOCK(env, rep->mtx_clientdb); + + *statp = stats; + return (0); +} + +/* + * __rep_stat_print_pp -- + * ENV->rep_stat_print pre/post processing. 
+ * + * PUBLIC: int __rep_stat_print_pp __P((DB_ENV *, u_int32_t)); + */ +int +__rep_stat_print_pp(dbenv, flags) + DB_ENV *dbenv; + u_int32_t flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + int ret; + + env = dbenv->env; + + ENV_REQUIRES_CONFIG_XX( + env, rep_handle, "DB_ENV->rep_stat_print", DB_INIT_REP); + + if ((ret = __db_fchk(env, "DB_ENV->rep_stat_print", + flags, DB_STAT_ALL | DB_STAT_CLEAR)) != 0) + return (ret); + + ENV_ENTER(env, ip); + ret = __rep_stat_print(env, flags); + ENV_LEAVE(env, ip); + + return (ret); +} + +/* + * __rep_stat_print -- + * ENV->rep_stat_print method. + * + * PUBLIC: int __rep_stat_print __P((ENV *, u_int32_t)); + */ +int +__rep_stat_print(env, flags) + ENV *env; + u_int32_t flags; +{ + u_int32_t orig_flags; + int ret; + + orig_flags = flags; + LF_CLR(DB_STAT_CLEAR | DB_STAT_SUBSYSTEM); + if (flags == 0 || LF_ISSET(DB_STAT_ALL)) { + ret = __rep_print_stats(env, orig_flags); + if (flags == 0 || ret != 0) + return (ret); + } + + if (LF_ISSET(DB_STAT_ALL) && + (ret = __rep_print_all(env, orig_flags)) != 0) + return (ret); + + return (0); +} + +/* + * __rep_print_stats -- + * Print out default statistics. + */ +static int +__rep_print_stats(env, flags) + ENV *env; + u_int32_t flags; +{ + DB_REP_STAT *sp; + int is_client, ret; + char *p; + + if ((ret = __rep_stat(env, &sp, flags)) != 0) + return (ret); + + if (LF_ISSET(DB_STAT_ALL)) + __db_msg(env, "Default replication region information:"); + is_client = 0; + switch (sp->st_status) { + case DB_REP_MASTER: + __db_msg(env, + "Environment configured as a replication master"); + break; + case DB_REP_CLIENT: + __db_msg(env, + "Environment configured as a replication client"); + is_client = 1; + break; + default: + __db_msg(env, + "Environment not configured for replication"); + break; + } + + __db_msg(env, "%lu/%lu\t%s", + (u_long)sp->st_next_lsn.file, (u_long)sp->st_next_lsn.offset, + is_client ? 
"Next LSN expected" : "Next LSN to be used"); + __db_msg(env, "%lu/%lu\t%s", + (u_long)sp->st_waiting_lsn.file, (u_long)sp->st_waiting_lsn.offset, + sp->st_waiting_lsn.file == 0 ? + "Not waiting for any missed log records" : + "LSN of first log record we have after missed log records"); + __db_msg(env, "%lu/%lu\t%s", + (u_long)sp->st_max_perm_lsn.file, + (u_long)sp->st_max_perm_lsn.offset, + sp->st_max_perm_lsn.file == 0 ? + "No maximum permanent LSN" : + "Maximum permanent LSN"); + + __db_dl(env, "Next page number expected", (u_long)sp->st_next_pg); + p = sp->st_waiting_pg == PGNO_INVALID ? + "Not waiting for any missed pages" : + "Page number of first page we have after missed pages"; + __db_msg(env, "%lu\t%s", (u_long)sp->st_waiting_pg, p); + __db_dl(env, + "Number of duplicate master conditions originally detected at this site", + (u_long)sp->st_dupmasters); + if (sp->st_env_id != DB_EID_INVALID) + __db_dl(env, "Current environment ID", (u_long)sp->st_env_id); + else + __db_msg(env, "No current environment ID"); + __db_dl(env, + "Current environment priority", (u_long)sp->st_env_priority); + __db_dl(env, "Current generation number", (u_long)sp->st_gen); + __db_dl(env, + "Election generation number for the current or next election", + (u_long)sp->st_egen); + __db_dl(env, "Number of duplicate log records received", + (u_long)sp->st_log_duplicated); + __db_dl(env, "Number of log records currently queued", + (u_long)sp->st_log_queued); + __db_dl(env, "Maximum number of log records ever queued at once", + (u_long)sp->st_log_queued_max); + __db_dl(env, "Total number of log records queued", + (u_long)sp->st_log_queued_total); + __db_dl(env, + "Number of log records received and appended to the log", + (u_long)sp->st_log_records); + __db_dl(env, "Number of log records missed and requested", + (u_long)sp->st_log_requested); + if (sp->st_master != DB_EID_INVALID) + __db_dl(env, "Current master ID", (u_long)sp->st_master); + else + __db_msg(env, "No current master ID"); + 
__db_dl(env, "Number of times the master has changed", + (u_long)sp->st_master_changes); + __db_dl(env, + "Number of messages received with a bad generation number", + (u_long)sp->st_msgs_badgen); + __db_dl(env, "Number of messages received and processed", + (u_long)sp->st_msgs_processed); + __db_dl(env, "Number of messages ignored due to pending recovery", + (u_long)sp->st_msgs_recover); + __db_dl(env, "Number of failed message sends", + (u_long)sp->st_msgs_send_failures); + __db_dl(env, "Number of messages sent", (u_long)sp->st_msgs_sent); + __db_dl(env, + "Number of new site messages received", (u_long)sp->st_newsites); + __db_dl(env, + "Number of environments believed to be in the replication group", + (u_long)sp->st_nsites); + __db_dl(env, "Transmission limited", (u_long)sp->st_nthrottles); + __db_dl(env, "Number of outdated conditions detected", + (u_long)sp->st_outdated); + __db_dl(env, "Number of duplicate page records received", + (u_long)sp->st_pg_duplicated); + __db_dl(env, "Number of page records received and added to databases", + (u_long)sp->st_pg_records); + __db_dl(env, "Number of page records missed and requested", + (u_long)sp->st_pg_requested); + if (sp->st_startup_complete == 0) + __db_msg(env, "Startup incomplete"); + else + __db_msg(env, "Startup complete"); + __db_dl(env, + "Number of transactions applied", (u_long)sp->st_txns_applied); + + __db_dl(env, "Number of startsync messages delayed", + (u_long)sp->st_startsync_delayed); + + __db_dl(env, "Number of elections held", (u_long)sp->st_elections); + __db_dl(env, + "Number of elections won", (u_long)sp->st_elections_won); + + if (sp->st_election_status == 0) { + __db_msg(env, "No election in progress"); + if (sp->st_election_sec > 0 || sp->st_election_usec > 0) + __db_msg(env, + "%lu.%.6lu\tDuration of last election (seconds)", + (u_long)sp->st_election_sec, + (u_long)sp->st_election_usec); + } else { + __db_dl(env, "Current election phase", + (u_long)sp->st_election_status); + __db_dl(env, 
+ "Environment ID of the winner of the current or last election", + (u_long)sp->st_election_cur_winner); + __db_dl(env, + "Master generation number of the winner of the current or last election", + (u_long)sp->st_election_gen); + __db_msg(env, + "%lu/%lu\tMaximum LSN of the winner of the current or last election", + (u_long)sp->st_election_lsn.file, + (u_long)sp->st_election_lsn.offset); + __db_dl(env, + "Number of sites responding to this site during the current election", + (u_long)sp->st_election_nsites); + __db_dl(env, + "Number of votes required in the current or last election", + (u_long)sp->st_election_nvotes); + __db_dl(env, + "Priority of the winner of the current or last election", + (u_long)sp->st_election_priority); + __db_dl(env, + "Tiebreaker value of the winner of the current or last election", + (u_long)sp->st_election_tiebreaker); + __db_dl(env, + "Number of votes received during the current election", + (u_long)sp->st_election_votes); + } + __db_dl(env, "Number of bulk buffer sends triggered by full buffer", + (u_long)sp->st_bulk_fills); + __db_dl(env, "Number of single records exceeding bulk buffer size", + (u_long)sp->st_bulk_overflows); + __db_dl(env, "Number of records added to a bulk buffer", + (u_long)sp->st_bulk_records); + __db_dl(env, "Number of bulk buffers sent", + (u_long)sp->st_bulk_transfers); + __db_dl(env, "Number of re-request messages received", + (u_long)sp->st_client_rerequests); + __db_dl(env, + "Number of request messages this client failed to process", + (u_long)sp->st_client_svc_miss); + __db_dl(env, "Number of request messages received by this client", + (u_long)sp->st_client_svc_req); + if (sp->st_max_lease_sec > 0 || sp->st_max_lease_usec > 0) + __db_msg(env, + "%lu.%.6lu\tDuration of maximum lease (seconds)", + (u_long)sp->st_max_lease_sec, + (u_long)sp->st_max_lease_usec); + + __os_ufree(env, sp); + + return (0); +} + +/* + * __rep_print_all -- + * Display debugging replication region statistics. 
+ */ +static int +__rep_print_all(env, flags) + ENV *env; + u_int32_t flags; +{ + static const FN rep_fn[] = { + { REP_F_ABBREVIATED, "REP_F_ABBREVIATED" }, + { REP_F_APP_BASEAPI, "REP_F_APP_BASEAPI" }, + { REP_F_APP_REPMGR, "REP_F_APP_REPMGR" }, + { REP_F_CLIENT, "REP_F_CLIENT" }, + { REP_F_DELAY, "REP_F_DELAY" }, + { REP_F_EGENUPDATE, "REP_F_EGENUPDATE" }, + { REP_F_EPHASE0, "REP_F_EPHASE0" }, + { REP_F_EPHASE1, "REP_F_EPHASE1" }, + { REP_F_EPHASE2, "REP_F_EPHASE2" }, + { REP_F_GROUP_ESTD, "REP_F_GROUP_ESTD" }, + { REP_F_INREPELECT, "REP_F_INREPELECT" }, + { REP_F_INREPSTART, "REP_F_INREPSTART" }, + { REP_F_LEASE_EXPIRED, "REP_F_LEASE_EXPIRED" }, + { REP_F_MASTER, "REP_F_MASTER" }, + { REP_F_MASTERELECT, "REP_F_MASTERELECT" }, + { REP_F_NEWFILE, "REP_F_NEWFILE" }, + { REP_F_NIMDBS_LOADED, "REP_F_NIMDBS_LOADED" }, + { REP_F_NOARCHIVE, "REP_F_NOARCHIVE" }, + { REP_F_READY_API, "REP_F_READY_API" }, + { REP_F_READY_APPLY, "REP_F_READY_APPLY" }, + { REP_F_READY_MSG, "REP_F_READY_MSG" }, + { REP_F_READY_OP, "REP_F_READY_OP" }, + { REP_F_RECOVER_LOG, "REP_F_RECOVER_LOG" }, + { REP_F_RECOVER_PAGE, "REP_F_RECOVER_PAGE" }, + { REP_F_RECOVER_UPDATE, "REP_F_RECOVER_UPDATE" }, + { REP_F_RECOVER_VERIFY, "REP_F_RECOVER_VERIFY" }, + { REP_F_SKIPPED_APPLY, "REP_F_SKIPPED_APPLY" }, + { REP_F_START_CALLED, "REP_F_START_CALLED" }, + { REP_F_TALLY, "REP_F_TALLY" }, + { 0, NULL } + }; + static const FN dbrep_fn[] = { + { DBREP_APP_BASEAPI, "DBREP_APP_BASEAPI" }, + { DBREP_APP_REPMGR, "DBREP_APP_REPMGR" }, + { DBREP_OPENFILES, "DBREP_OPENFILES" }, + { 0, NULL } + }; + DB_LOG *dblp; + DB_REP *db_rep; + DB_THREAD_INFO *ip; + LOG *lp; + REGENV *renv; + REGINFO *infop; + REP *rep; + char time_buf[CTIME_BUFLEN]; + + db_rep = env->rep_handle; + rep = db_rep->region; + infop = env->reginfo; + renv = infop->primary; + ENV_ENTER(env, ip); + + __db_msg(env, "%s", DB_GLOBAL(db_line)); + __db_msg(env, "DB_REP handle information:"); + + if (db_rep->rep_db == NULL) + STAT_ISSET("Bookkeeping 
database", db_rep->rep_db); + else + (void)__db_stat_print(db_rep->rep_db, ip, flags); + + __db_prflags(env, NULL, db_rep->flags, dbrep_fn, NULL, "\tFlags"); + + __db_msg(env, "%s", DB_GLOBAL(db_line)); + __db_msg(env, "REP handle information:"); + __mutex_print_debug_single(env, + "Replication region mutex", rep->mtx_region, flags); + __mutex_print_debug_single(env, + "Bookkeeping database mutex", rep->mtx_clientdb, flags); + + STAT_LONG("Environment ID", rep->eid); + STAT_LONG("Master environment ID", rep->master_id); + STAT_ULONG("Election generation", rep->egen); + STAT_ULONG("Election generation number", rep->gen); + STAT_LONG("Space allocated for sites", rep->asites); + STAT_LONG("Sites in group", rep->nsites); + STAT_LONG("Votes needed for election", rep->nvotes); + STAT_LONG("Priority in election", rep->priority); + __db_dlbytes(env, "Limit on data sent in a single call", + rep->gbytes, (u_long)0, rep->bytes); + STAT_LONG("Request gap seconds", rep->request_gap.tv_sec); + STAT_LONG("Request gap microseconds", + rep->request_gap.tv_nsec / NS_PER_US); + STAT_LONG("Maximum gap seconds", rep->max_gap.tv_sec); + STAT_LONG("Maximum gap microseconds", + rep->max_gap.tv_nsec / NS_PER_US); + + STAT_ULONG("Callers in rep_proc_msg", rep->msg_th); + STAT_ULONG("Library handle count", rep->handle_cnt); + STAT_ULONG("Multi-step operation count", rep->op_cnt); + __db_msg(env, "%.24s\tRecovery timestamp", + renv->rep_timestamp == 0 ? 
+ "0" : __os_ctime(&renv->rep_timestamp, time_buf)); + + STAT_LONG("Sites heard from", rep->sites); + STAT_LONG("Current winner", rep->winner); + STAT_LONG("Winner priority", rep->w_priority); + STAT_ULONG("Winner generation", rep->w_gen); + STAT_LSN("Winner LSN", &rep->w_lsn); + STAT_LONG("Winner tiebreaker", rep->w_tiebreaker); + STAT_LONG("Votes for this site", rep->votes); + + __db_prflags(env, NULL, rep->flags, rep_fn, NULL, "\tFlags"); + + __db_msg(env, "%s", DB_GLOBAL(db_line)); + __db_msg(env, "LOG replication information:"); + MUTEX_LOCK(env, rep->mtx_clientdb); + dblp = env->lg_handle; + lp = (LOG *)dblp->reginfo.primary; + STAT_LSN("First log record after a gap", &lp->waiting_lsn); + STAT_LSN("Maximum permanent LSN processed", &lp->max_perm_lsn); + STAT_LSN("LSN waiting to verify", &lp->verify_lsn); + STAT_LSN("Maximum LSN requested", &lp->max_wait_lsn); + STAT_LONG("Time to wait before requesting seconds", lp->wait_ts.tv_sec); + STAT_LONG("Time to wait before requesting microseconds", + lp->wait_ts.tv_nsec / NS_PER_US); + STAT_LSN("Next LSN expected", &lp->ready_lsn); + STAT_LONG("Maximum lease timestamp seconds", lp->max_lease_ts.tv_sec); + STAT_LONG("Maximum lease timestamp microseconds", + lp->max_lease_ts.tv_nsec / NS_PER_US); + MUTEX_UNLOCK(env, rep->mtx_clientdb); + ENV_LEAVE(env, ip); + + return (0); +} + +#else /* !HAVE_STATISTICS */ + +int +__rep_stat_pp(dbenv, statp, flags) + DB_ENV *dbenv; + DB_REP_STAT **statp; + u_int32_t flags; +{ + COMPQUIET(statp, NULL); + COMPQUIET(flags, 0); + + return (__db_stat_not_built(dbenv->env)); +} + +int +__rep_stat_print_pp(dbenv, flags) + DB_ENV *dbenv; + u_int32_t flags; +{ + COMPQUIET(flags, 0); + + return (__db_stat_not_built(dbenv->env)); +} +#endif diff --git a/rep/rep_stub.c b/rep/rep_stub.c new file mode 100644 index 0000000..f2f11d8 --- /dev/null +++ b/rep/rep_stub.c @@ -0,0 +1,391 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. 
 * All rights reserved.
 *
 * $Id$
 */

/*
 * NOTE(review): HAVE_REPLICATION is tested before db_config.h is included;
 * presumably the build only compiles this file when replication is
 * disabled -- confirm against the build configuration.
 */
#ifndef HAVE_REPLICATION
#include "db_config.h"

#include "db_int.h"
#include "dbinc/db_page.h"
#include "dbinc/db_am.h"

/*
 * If the library wasn't compiled with replication support, various routines
 * aren't available.  Stub them here, returning an appropriate error.
 *
 * Most stubs silence unused-parameter warnings with COMPQUIET and return
 * the result of __db_norep.  __rep_env_refresh, __rep_noarchive and
 * __rep_open are the exceptions: they return 0 without reporting an error.
 */
static int __db_norep __P((ENV *));

/*
 * __db_norep --
 *	Error when a Berkeley DB build doesn't include replication support.
 *	Reports the condition through __db_errx and returns DB_OPNOTSUP.
 */
static int
__db_norep(env)
	ENV *env;
{
	__db_errx(env,
	    "library build did not include support for replication");
	return (DB_OPNOTSUP);
}

int
__db_rep_enter(dbp, checkgen, checklock, return_now)
	DB *dbp;
	int checkgen, checklock, return_now;
{
	COMPQUIET(checkgen, 0);
	COMPQUIET(checklock, 0);
	COMPQUIET(return_now, 0);
	return (__db_norep(dbp->env));
}

int
__env_rep_enter(env, checklock)
	ENV *env;
	int checklock;
{
	COMPQUIET(checklock, 0);
	return (__db_norep(env));
}

int
__env_db_rep_exit(env)
	ENV *env;
{
	return (__db_norep(env));
}

int
__op_rep_enter(env)
	ENV *env;
{
	return (__db_norep(env));
}

int
__op_rep_exit(env)
	ENV *env;
{
	return (__db_norep(env));
}

int
__rep_bulk_message(env, bulkp, repth, lsnp, dbt, flags)
	ENV *env;
	REP_BULK *bulkp;
	REP_THROTTLE *repth;
	DB_LSN *lsnp;
	const DBT *dbt;
	u_int32_t flags;
{
	COMPQUIET(bulkp, NULL);
	COMPQUIET(repth, NULL);
	COMPQUIET(lsnp, NULL);
	COMPQUIET(dbt, NULL);
	COMPQUIET(flags, 0);
	return (__db_norep(env));
}

/* Succeeds silently: returns 0 instead of the "no replication" error. */
int
__rep_env_refresh(env)
	ENV *env;
{
	COMPQUIET(env, NULL);
	return (0);
}

int
__rep_elect_pp(dbenv, nsites, nvotes, flags)
	DB_ENV *dbenv;
	u_int32_t nsites, nvotes;
	u_int32_t flags;
{
	COMPQUIET(nsites, 0);
	COMPQUIET(nvotes, 0);
	COMPQUIET(flags, 0);
	return (__db_norep(dbenv->env));
}

int
__rep_flush(dbenv)
	DB_ENV *dbenv;
{
	return (__db_norep(dbenv->env));
}

int
__rep_lease_check(env, refresh)
	ENV *env;
	int refresh;
{
	COMPQUIET(refresh, 0);
	return (__db_norep(env));
}

int
__rep_lease_expire(env)
	ENV *env;
{
	return (__db_norep(env));
}

int
__rep_get_clockskew(dbenv, fast_clockp, slow_clockp)
	DB_ENV *dbenv;
	u_int32_t *fast_clockp, *slow_clockp;
{
	COMPQUIET(fast_clockp, NULL);
	COMPQUIET(slow_clockp, NULL);
	return (__db_norep(dbenv->env));
}

int
__rep_set_clockskew(dbenv, fast_clock, slow_clock)
	DB_ENV *dbenv;
	u_int32_t fast_clock, slow_clock;
{
	COMPQUIET(fast_clock, 0);
	COMPQUIET(slow_clock, 0);
	return (__db_norep(dbenv->env));
}

int
__rep_set_nsites(dbenv, n)
	DB_ENV *dbenv;
	u_int32_t n;
{
	COMPQUIET(n, 0);
	return (__db_norep(dbenv->env));
}

int
__rep_get_nsites(dbenv, n)
	DB_ENV *dbenv;
	u_int32_t *n;
{
	COMPQUIET(n, NULL);
	return (__db_norep(dbenv->env));
}

int
__rep_set_priority(dbenv, priority)
	DB_ENV *dbenv;
	u_int32_t priority;
{
	COMPQUIET(priority, 0);
	return (__db_norep(dbenv->env));
}

int
__rep_get_priority(dbenv, priority)
	DB_ENV *dbenv;
	u_int32_t *priority;
{
	COMPQUIET(priority, NULL);
	return (__db_norep(dbenv->env));
}

int
__rep_set_timeout(dbenv, which, timeout)
	DB_ENV *dbenv;
	int which;
	db_timeout_t timeout;
{
	COMPQUIET(which, 0);
	COMPQUIET(timeout, 0);
	return (__db_norep(dbenv->env));
}

int
__rep_get_timeout(dbenv, which, timeout)
	DB_ENV *dbenv;
	int which;
	db_timeout_t *timeout;
{
	COMPQUIET(which, 0);
	COMPQUIET(timeout, NULL);
	return (__db_norep(dbenv->env));
}

int
__rep_get_config(dbenv, which, onp)
	DB_ENV *dbenv;
	u_int32_t which;
	int *onp;
{
	COMPQUIET(which, 0);
	COMPQUIET(onp, NULL);
	return (__db_norep(dbenv->env));
}

int
__rep_set_config(dbenv, which, on)
	DB_ENV *dbenv;
	u_int32_t which;
	int on;
{
	COMPQUIET(which, 0);
	COMPQUIET(on, 0);
	return (__db_norep(dbenv->env));
}

int
__rep_get_limit(dbenv, gbytesp, bytesp)
	DB_ENV *dbenv;
	u_int32_t *gbytesp, *bytesp;
{
	COMPQUIET(gbytesp, NULL);
	COMPQUIET(bytesp, NULL);
	return (__db_norep(dbenv->env));
}

/* Succeeds silently: returns 0 instead of the "no replication" error. */
int
__rep_noarchive(env)
	ENV *env;
{
	COMPQUIET(env, NULL);
	return (0);
}

/* Succeeds silently: returns 0 instead of the "no replication" error. */
int
__rep_open(env)
	ENV *env;
{
	COMPQUIET(env, NULL);
	return (0);
}

int
__rep_preclose(env)
	ENV *env;
{
	return (__db_norep(env));
}

int
__rep_process_message_pp(dbenv, control, rec, eid, ret_lsnp)
	DB_ENV *dbenv;
	DBT *control, *rec;
	int eid;
	DB_LSN *ret_lsnp;
{
	COMPQUIET(control, NULL);
	COMPQUIET(rec, NULL);
	COMPQUIET(eid, 0);
	COMPQUIET(ret_lsnp, NULL);
	return (__db_norep(dbenv->env));
}

int
__rep_send_message(env, eid, rtype, lsnp, dbtp, logflags, repflags)
	ENV *env;
	int eid;
	u_int32_t rtype;
	DB_LSN *lsnp;
	const DBT *dbtp;
	u_int32_t logflags, repflags;
{
	COMPQUIET(eid, 0);
	COMPQUIET(rtype, 0);
	COMPQUIET(lsnp, NULL);
	COMPQUIET(dbtp, NULL);
	COMPQUIET(logflags, 0);
	COMPQUIET(repflags, 0);
	return (__db_norep(env));
}

int
__rep_set_limit(dbenv, gbytes, bytes)
	DB_ENV *dbenv;
	u_int32_t gbytes, bytes;
{
	COMPQUIET(gbytes, 0);
	COMPQUIET(bytes, 0);
	return (__db_norep(dbenv->env));
}

int
__rep_set_transport_pp(dbenv, eid, f_send)
	DB_ENV *dbenv;
	int eid;
	int (*f_send) __P((DB_ENV *, const DBT *, const DBT *, const DB_LSN *,
	    int, u_int32_t));
{
	COMPQUIET(eid, 0);
	COMPQUIET(f_send, NULL);
	return (__db_norep(dbenv->env));
}

int
__rep_set_request(dbenv, min, max)
	DB_ENV *dbenv;
	u_int32_t min, max;
{
	COMPQUIET(min, 0);
	COMPQUIET(max, 0);
	return (__db_norep(dbenv->env));
}

int
__rep_get_request(dbenv, minp, maxp)
	DB_ENV *dbenv;
	u_int32_t *minp, *maxp;
{
	COMPQUIET(minp, NULL);
	COMPQUIET(maxp, NULL);
	return (__db_norep(dbenv->env));
}

int
__rep_start_pp(dbenv, dbt, flags)
	DB_ENV *dbenv;
	DBT *dbt;
	u_int32_t flags;
{
	COMPQUIET(dbt, NULL);
	COMPQUIET(flags, 0);
	return (__db_norep(dbenv->env));
}

int
__rep_stat_pp(dbenv, statp, flags)
	DB_ENV *dbenv;
	DB_REP_STAT **statp;
	u_int32_t flags;
{
	COMPQUIET(statp, NULL);
	COMPQUIET(flags, 0);
	return (__db_norep(dbenv->env));
}

int
__rep_stat_print_pp(dbenv, flags)
	DB_ENV *dbenv;
	u_int32_t flags;
{
	COMPQUIET(flags, 0);
	return (__db_norep(dbenv->env));
}

int
__rep_stat_print(env, flags)
	ENV *env;
	u_int32_t flags;
{
	COMPQUIET(flags, 0);
	return (__db_norep(env));
}

int
__rep_sync(dbenv, flags)
	DB_ENV *dbenv;
	u_int32_t flags;
{
	COMPQUIET(flags, 0);
	return (__db_norep(dbenv->env));
}
#endif /* !HAVE_REPLICATION */
diff --git a/rep/rep_util.c b/rep/rep_util.c
new file mode 100644
index 0000000..8fbf3a0
--- /dev/null
+++ b/rep/rep_util.c
@@ -0,0 +1,2007 @@
/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 2001-2009 Oracle. All rights reserved.
 *
 * $Id$
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/db_page.h"
#include "dbinc/db_am.h"
#include "dbinc/log.h"
#include "dbinc/mp.h"
#include "dbinc/txn.h"

#ifdef REP_DIAGNOSTIC
#include "dbinc/db_page.h"
#include "dbinc/fop.h"
#include "dbinc/btree.h"
#include "dbinc/hash.h"
#include "dbinc/qam.h"
#endif

/*
 * rep_util.c:
 *	Miscellaneous replication-related utility functions, including
 *	those called by other subsystems.
 */

/*
 * TIMESTAMP_CHECK --
 *	If renv->op_timestamp is set and is more than DB_REGENV_TIMEOUT
 *	older than ts, clear DB_REGENV_REPLOCKED and reset the timestamp,
 *	both under the rep system lock.  Arguments are evaluated more than
 *	once, so pass plain variables.
 */
#define TIMESTAMP_CHECK(env, ts, renv) do {				\
	if (renv->op_timestamp != 0 &&					\
	    renv->op_timestamp + DB_REGENV_TIMEOUT < ts) {		\
		REP_SYSTEM_LOCK(env);					\
		F_CLR(renv, DB_REGENV_REPLOCKED);			\
		renv->op_timestamp = 0;					\
		REP_SYSTEM_UNLOCK(env);					\
	}								\
} while (0)

static int __rep_lockout_int __P((ENV *, REP *, u_int32_t *, u_int32_t,
    const char *, u_int32_t));
static int __rep_newmaster_empty __P((ENV *, int));
#ifdef REP_DIAGNOSTIC
static void __rep_print_logmsg __P((ENV *, const DBT *, DB_LSN *));
#endif

/*
 * __rep_bulk_message --
 *	This is a wrapper for putting a record into a bulk buffer.
Since + * we have different bulk buffers, the caller must hand us the information + * we need to put the record into the correct buffer. All bulk buffers + * are protected by the REP->mtx_clientdb. + * + * PUBLIC: int __rep_bulk_message __P((ENV *, REP_BULK *, REP_THROTTLE *, + * PUBLIC: DB_LSN *, const DBT *, u_int32_t)); + */ +int +__rep_bulk_message(env, bulk, repth, lsn, dbt, flags) + ENV *env; + REP_BULK *bulk; + REP_THROTTLE *repth; + DB_LSN *lsn; + const DBT *dbt; + u_int32_t flags; +{ + DB_REP *db_rep; + REP *rep; + __rep_bulk_args b_args; + size_t len; + int ret; + u_int32_t recsize, typemore; + u_int8_t *p; + + db_rep = env->rep_handle; + rep = db_rep->region; + ret = 0; + + /* + * Figure out the total number of bytes needed for this record. + * !!! The marshalling code includes the given len, but also + * puts its own copy of the dbt->size with the DBT portion of + * the record. Account for that here. + */ + recsize = sizeof(len) + dbt->size + sizeof(DB_LSN) + sizeof(dbt->size); + + /* + * If *this* buffer is actively being transmitted, don't wait, + * just return so that it can be sent as a singleton. + */ + MUTEX_LOCK(env, rep->mtx_clientdb); + if (FLD_ISSET(*(bulk->flagsp), BULK_XMIT)) { + MUTEX_UNLOCK(env, rep->mtx_clientdb); + return (DB_REP_BULKOVF); + } + + /* + * If the record is bigger than the buffer entirely, send the + * current buffer and then return DB_REP_BULKOVF so that this + * record is sent as a singleton. Do we have enough info to + * do that here? XXX + */ + if (recsize > bulk->len) { + RPRINT(env, DB_VERB_REP_MSGS, (env, + "bulk_msg: Record %d (0x%x) larger than entire buffer 0x%x", + recsize, recsize, bulk->len)); + STAT(rep->stat.st_bulk_overflows++); + (void)__rep_send_bulk(env, bulk, flags); + /* + * XXX __rep_send_message... + */ + MUTEX_UNLOCK(env, rep->mtx_clientdb); + return (DB_REP_BULKOVF); + } + /* + * If this record doesn't fit, send the current buffer. 
+ * Sending the buffer will reset the offset, but we will + * drop the mutex while sending so we need to keep checking + * if we're racing. + */ + while (recsize + *(bulk->offp) > bulk->len) { + RPRINT(env, DB_VERB_REP_MSGS, (env, + "bulk_msg: Record %lu (%#lx) doesn't fit. Send %lu (%#lx) now.", + (u_long)recsize, (u_long)recsize, + (u_long)bulk->len, (u_long)bulk->len)); + STAT(rep->stat.st_bulk_fills++); + if ((ret = __rep_send_bulk(env, bulk, flags)) != 0) { + MUTEX_UNLOCK(env, rep->mtx_clientdb); + return (ret); + } + } + + /* + * If we're using throttling, see if we are at the throttling + * limit before we do any more work here, by checking if the + * call to rep_send_throttle changed the repth->type to the + * *_MORE message type. If the throttling code hits the limit + * then we're done here. + */ + if (bulk->type == REP_BULK_LOG) + typemore = REP_LOG_MORE; + else + typemore = REP_PAGE_MORE; + if (repth != NULL) { + if ((ret = __rep_send_throttle(env, + bulk->eid, repth, REP_THROTTLE_ONLY, flags)) != 0) { + MUTEX_UNLOCK(env, rep->mtx_clientdb); + return (ret); + } + if (repth->type == typemore) { + RPRINT(env, DB_VERB_REP_MSGS, (env, + "bulk_msg: Record %lu (0x%lx) hit throttle limit.", + (u_long)recsize, (u_long)recsize)); + MUTEX_UNLOCK(env, rep->mtx_clientdb); + return (ret); + } + } + + /* + * Now we own the buffer, and we know our record fits into it. + * The buffer is structured with the len, LSN and then the record. + * Copy the record into the buffer. Then if we need to, + * send the buffer. + */ + p = bulk->addr + *(bulk->offp); + b_args.len = dbt->size; + b_args.lsn = *lsn; + b_args.bulkdata = *dbt; + /* + * If we're the first record, we need to save the first + * LSN in the bulk structure. 
+ */ + if (*(bulk->offp) == 0) + bulk->lsn = *lsn; + if (rep->version < DB_REPVERSION_47) { + len = 0; + memcpy(p, &dbt->size, sizeof(dbt->size)); + p += sizeof(dbt->size); + memcpy(p, lsn, sizeof(DB_LSN)); + p += sizeof(DB_LSN); + memcpy(p, dbt->data, dbt->size); + p += dbt->size; + } else if ((ret = __rep_bulk_marshal(env, &b_args, p, + bulk->len, &len)) != 0) + goto err; + *(bulk->offp) = (uintptr_t)p + (uintptr_t)len - (uintptr_t)bulk->addr; + STAT(rep->stat.st_bulk_records++); + /* + * Send the buffer if it is a perm record or a force. + */ + if (LF_ISSET(REPCTL_PERM)) { + RPRINT(env, DB_VERB_REP_MSGS, (env, + "bulk_msg: Send buffer after copy due to PERM")); + ret = __rep_send_bulk(env, bulk, flags); + } +err: + MUTEX_UNLOCK(env, rep->mtx_clientdb); + return (ret); + +} + +/* + * __rep_send_bulk -- + * This function transmits the bulk buffer given. It assumes the + * caller holds the REP->mtx_clientdb. We may release it and reacquire + * it during this call. We will return with it held. + * + * PUBLIC: int __rep_send_bulk __P((ENV *, REP_BULK *, u_int32_t)); + */ +int +__rep_send_bulk(env, bulkp, ctlflags) + ENV *env; + REP_BULK *bulkp; + u_int32_t ctlflags; +{ + DBT dbt; + DB_REP *db_rep; + REP *rep; + int ret; + + /* + * If the offset is 0, we're done. There is nothing to send. + */ + if (*(bulkp->offp) == 0) + return (0); + + db_rep = env->rep_handle; + rep = db_rep->region; + + /* + * Set that this buffer is being actively transmitted. + */ + FLD_SET(*(bulkp->flagsp), BULK_XMIT); + DB_INIT_DBT(dbt, bulkp->addr, *(bulkp->offp)); + MUTEX_UNLOCK(env, rep->mtx_clientdb); + RPRINT(env, DB_VERB_REP_MSGS, (env, + "send_bulk: Send %d (0x%x) bulk buffer bytes", dbt.size, dbt.size)); + + /* + * Unlocked the mutex and now send the message. 
+ */ + STAT(rep->stat.st_bulk_transfers++); + if ((ret = __rep_send_message(env, + bulkp->eid, bulkp->type, &bulkp->lsn, &dbt, ctlflags, 0)) != 0) + ret = DB_REP_UNAVAIL; + + MUTEX_LOCK(env, rep->mtx_clientdb); + /* + * Ready the buffer for further records. + */ + *(bulkp->offp) = 0; + FLD_CLR(*(bulkp->flagsp), BULK_XMIT); + return (ret); +} + +/* + * __rep_bulk_alloc -- + * This function allocates and initializes an internal bulk buffer. + * This is used by the master when fulfilling a request for a chunk of + * log records or a bunch of pages. + * + * PUBLIC: int __rep_bulk_alloc __P((ENV *, REP_BULK *, int, uintptr_t *, + * PUBLIC: u_int32_t *, u_int32_t)); + */ +int +__rep_bulk_alloc(env, bulkp, eid, offp, flagsp, type) + ENV *env; + REP_BULK *bulkp; + int eid; + uintptr_t *offp; + u_int32_t *flagsp, type; +{ + int ret; + + memset(bulkp, 0, sizeof(REP_BULK)); + *offp = *flagsp = 0; + bulkp->len = MEGABYTE; + if ((ret = __os_malloc(env, bulkp->len, &bulkp->addr)) != 0) + return (ret); + bulkp->offp = offp; + bulkp->type = type; + bulkp->eid = eid; + bulkp->flagsp = flagsp; + return (ret); +} + +/* + * __rep_bulk_free -- + * This function sends the remainder of the bulk buffer and frees it. + * + * PUBLIC: int __rep_bulk_free __P((ENV *, REP_BULK *, u_int32_t)); + */ +int +__rep_bulk_free(env, bulkp, flags) + ENV *env; + REP_BULK *bulkp; + u_int32_t flags; +{ + DB_REP *db_rep; + int ret; + + db_rep = env->rep_handle; + + MUTEX_LOCK(env, db_rep->region->mtx_clientdb); + ret = __rep_send_bulk(env, bulkp, flags); + MUTEX_UNLOCK(env, db_rep->region->mtx_clientdb); + __os_free(env, bulkp->addr); + return (ret); +} + +/* + * __rep_send_message -- + * This is a wrapper for sending a message. It takes care of constructing + * the control structure and calling the user's specified send function. 
+ * + * PUBLIC: int __rep_send_message __P((ENV *, int, + * PUBLIC: u_int32_t, DB_LSN *, const DBT *, u_int32_t, u_int32_t)); + */ +int +__rep_send_message(env, eid, rtype, lsnp, dbt, ctlflags, repflags) + ENV *env; + int eid; + u_int32_t rtype; + DB_LSN *lsnp; + const DBT *dbt; + u_int32_t ctlflags, repflags; +{ + DBT cdbt, scrap_dbt; + DB_ENV *dbenv; + DB_LOG *dblp; + DB_REP *db_rep; + LOG *lp; + REP *rep; + REP_46_CONTROL cntrl46; + REP_OLD_CONTROL ocntrl; + __rep_control_args cntrl; + db_timespec msg_time; + int ret; + u_int32_t myflags; + u_int8_t buf[__REP_CONTROL_SIZE]; + size_t len; + + dbenv = env->dbenv; + db_rep = env->rep_handle; + rep = db_rep->region; + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + ret = 0; + +#if defined(DEBUG_ROP) || defined(DEBUG_WOP) + if (db_rep->send == NULL) + return (0); +#endif + + /* Set up control structure. */ + memset(&cntrl, 0, sizeof(cntrl)); + memset(&ocntrl, 0, sizeof(ocntrl)); + memset(&cntrl46, 0, sizeof(cntrl46)); + if (lsnp == NULL) + ZERO_LSN(cntrl.lsn); + else + cntrl.lsn = *lsnp; + /* + * Set the rectype based on the version we need to speak. + */ + if (rep->version == DB_REPVERSION) + cntrl.rectype = rtype; + else if (rep->version < DB_REPVERSION) { + cntrl.rectype = __rep_msg_to_old(rep->version, rtype); + RPRINT(env, DB_VERB_REP_MSGS, (env, + "rep_send_msg: rtype %lu to version %lu record %lu.", + (u_long)rtype, (u_long)rep->version, + (u_long)cntrl.rectype)); + if (cntrl.rectype == REP_INVALID) + return (ret); + } else { + __db_errx(env, + "rep_send_message: Unknown rep version %lu, my version %lu", + (u_long)rep->version, (u_long)DB_REPVERSION); + return (__env_panic(env, EINVAL)); + } + cntrl.flags = ctlflags; + cntrl.rep_version = rep->version; + cntrl.log_version = lp->persist.version; + cntrl.gen = rep->gen; + + /* Don't assume the send function will be tolerant of NULL records. 
*/ + if (dbt == NULL) { + memset(&scrap_dbt, 0, sizeof(DBT)); + dbt = &scrap_dbt; + } + + /* + * There are several types of records: commit and checkpoint records + * that affect database durability, regular log records that might + * be buffered on the master before being transmitted, and control + * messages which don't require the guarantees of permanency, but + * should not be buffered. + * + * There are request records that can be sent anywhere, and there + * are rerequest records that the app might want to send to the master. + */ + myflags = repflags; + if (FLD_ISSET(ctlflags, REPCTL_PERM)) + myflags |= DB_REP_PERMANENT; + else if (rtype != REP_LOG || FLD_ISSET(ctlflags, REPCTL_RESEND)) + myflags |= DB_REP_NOBUFFER; + + /* + * Let everyone know if we've been in an established group. + */ + if (F_ISSET(rep, REP_F_GROUP_ESTD)) + F_SET(&cntrl, REPCTL_GROUP_ESTD); + + /* + * We're sending messages to some other version. We cannot + * assume DB_REP_ANYWHERE is available. Turn it off. + */ + if (rep->version != DB_REPVERSION) + FLD_CLR(myflags, DB_REP_ANYWHERE); + + /* + * If we are a master sending a perm record, then set the + * REPCTL_LEASE flag to have the client reply. Also set + * the start time that the client will echo back to us. + * + * !!! If we are a master, using leases, we had better not be + * sending to an older version. + */ + if (IS_REP_MASTER(env) && IS_USING_LEASES(env) && + FLD_ISSET(ctlflags, REPCTL_PERM)) { + F_SET(&cntrl, REPCTL_LEASE); + DB_ASSERT(env, rep->version == DB_REPVERSION); + __os_gettime(env, &msg_time, 1); + cntrl.msg_sec = (u_int32_t)msg_time.tv_sec; + cntrl.msg_nsec = (u_int32_t)msg_time.tv_nsec; + } + + REP_PRINT_MESSAGE(env, eid, &cntrl, "rep_send_message", myflags); +#ifdef REP_DIAGNOSTIC + if (FLD_ISSET( + env->dbenv->verbose, DB_VERB_REP_MSGS) && rtype == REP_LOG) + __rep_print_logmsg(env, dbt, lsnp); +#endif + + /* + * If DB_REP_PERMANENT is set, the LSN better be non-zero. 
+ */ + DB_ASSERT(env, !FLD_ISSET(myflags, DB_REP_PERMANENT) || + !IS_ZERO_LSN(cntrl.lsn)); + + /* + * If we're talking to an old version, send an old control structure. + */ + memset(&cdbt, 0, sizeof(cdbt)); + if (rep->version <= DB_REPVERSION_45) { + if (rep->version == DB_REPVERSION_45 && + F_ISSET(&cntrl, REPCTL_INIT)) { + F_CLR(&cntrl, REPCTL_INIT); + F_SET(&cntrl, REPCTL_INIT_45); + } + ocntrl.rep_version = cntrl.rep_version; + ocntrl.log_version = cntrl.log_version; + ocntrl.lsn = cntrl.lsn; + ocntrl.rectype = cntrl.rectype; + ocntrl.gen = cntrl.gen; + ocntrl.flags = cntrl.flags; + cdbt.data = &ocntrl; + cdbt.size = sizeof(ocntrl); + } else if (rep->version == DB_REPVERSION_46) { + cntrl46.rep_version = cntrl.rep_version; + cntrl46.log_version = cntrl.log_version; + cntrl46.lsn = cntrl.lsn; + cntrl46.rectype = cntrl.rectype; + cntrl46.gen = cntrl.gen; + cntrl46.msg_time.tv_sec = (time_t)cntrl.msg_sec; + cntrl46.msg_time.tv_nsec = (long)cntrl.msg_nsec; + cntrl46.flags = cntrl.flags; + cdbt.data = &cntrl46; + cdbt.size = sizeof(cntrl46); + } else { + (void)__rep_control_marshal(env, &cntrl, buf, + __REP_CONTROL_SIZE, &len); + DB_INIT_DBT(cdbt, buf, len); + } + + /* + * We set the LSN above to something valid. Give the master the + * actual LSN so that they can coordinate with permanent records from + * the client if they want to. + * + * !!! Even though we marshalled the control message for transmission, + * give the transport function the real LSN. + */ + ret = db_rep->send(dbenv, &cdbt, dbt, &cntrl.lsn, eid, myflags); + + /* + * We don't hold the rep lock, so this could miscount if we race. + * I don't think it's worth grabbing the mutex for that bit of + * extra accuracy. 
+ */ + if (ret != 0) { + RPRINT(env, DB_VERB_REP_MSGS, (env, + "rep_send_function returned: %d", ret)); +#ifdef HAVE_STATISTICS + rep->stat.st_msgs_send_failures++; + } else + rep->stat.st_msgs_sent++; +#else + } +#endif + return (ret); +} + +#ifdef REP_DIAGNOSTIC +/* + * __rep_print_logmsg -- + * This is a debugging routine for printing out log records that + * we are about to transmit to a client. + */ +static void +__rep_print_logmsg(env, logdbt, lsnp) + ENV *env; + const DBT *logdbt; + DB_LSN *lsnp; +{ + static int first = 1; + static DB_DISTAB dtab; + + if (first) { + first = 0; + + (void)__bam_init_print(env, &dtab); + (void)__crdel_init_print(env, &dtab); + (void)__db_init_print(env, &dtab); + (void)__dbreg_init_print(env, &dtab); + (void)__fop_init_print(env, &dtab); + (void)__ham_init_print(env, &dtab); + (void)__qam_init_print(env, &dtab); + (void)__txn_init_print(env, &dtab); + } + + (void)__db_dispatch( + env, &dtab, (DBT *)logdbt, lsnp, DB_TXN_PRINT, NULL); +} +#endif + +/* + * __rep_new_master -- + * Called after a master election to sync back up with a new master. + * It's possible that we already know of this new master in which case + * we don't need to do anything. + * + * This is written assuming that this message came from the master; we + * need to enforce that in __rep_process_record, but right now, we have + * no way to identify the master. 
+ * + * PUBLIC: int __rep_new_master __P((ENV *, __rep_control_args *, int)); + */ +int +__rep_new_master(env, cntrl, eid) + ENV *env; + __rep_control_args *cntrl; + int eid; +{ + DBT dbt; + DB_LOG *dblp; + DB_LOGC *logc; + DB_LSN first_lsn, lsn; + DB_REP *db_rep; + DB_THREAD_INFO *ip; + LOG *lp; + REGENV *renv; + REGINFO *infop; + REP *rep; + db_timeout_t lease_to; + u_int32_t unused; + int change, do_req, lockout, ret, t_ret; + + db_rep = env->rep_handle; + rep = db_rep->region; + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + ret = 0; + logc = NULL; + lockout = 0; + REP_SYSTEM_LOCK(env); + change = rep->gen != cntrl->gen || rep->master_id != eid; + /* + * If we're hearing from a current or new master, then we + * want to clear EPHASE0 in case this site is waiting to + * hear from the master. + */ + F_CLR(rep, REP_F_EPHASE0); + if (change) { + /* + * If we are already locking out others, we're either + * in the middle of sync-up recovery or internal init + * when this newmaster comes in (we also lockout in + * rep_start, but we cannot be racing that because we + * don't allow rep_proc_msg when rep_start is going on). + * + * We're about to become the client of a new master. Since we + * want to be able to sync with the new master as quickly as + * possible, interrupt any STARTSYNC from the old master. The + * new master may need to rely on acks from us and the old + * STARTSYNC is now irrelevant. + * + * Note that, conveniently, the "lockout" flag defines the + * section of this code path during which both "message lockout" + * and "memp sync interrupt" are in effect. + */ + if (F_ISSET(rep, REP_F_READY_MSG)) + goto lckout; + + if ((ret = __rep_lockout_msg(env, rep, 1)) != 0) + goto errlck; + + (void)__memp_set_config(env->dbenv, DB_MEMP_SYNC_INTERRUPT, 1); + lockout = 1; + /* + * We must wait any remaining lease time before accepting + * this new master. 
This must be after the lockout above + * so that no new message can be processed and re-grant + * the lease out from under us. + */ + if (IS_USING_LEASES(env) && + ((lease_to = __rep_lease_waittime(env)) != 0)) { + REP_SYSTEM_UNLOCK(env); + __os_yield(env, 0, (u_long)lease_to); + REP_SYSTEM_LOCK(env); + F_SET(rep, REP_F_LEASE_EXPIRED); + } + + if ((ret = __env_init_rec(env, cntrl->log_version)) != 0) + goto errlck; + + REP_SYSTEM_UNLOCK(env); + + MUTEX_LOCK(env, rep->mtx_clientdb); + __os_gettime(env, &lp->rcvd_ts, 1); + lp->wait_ts = rep->request_gap; + ZERO_LSN(lp->verify_lsn); + ZERO_LSN(lp->prev_ckp); + ZERO_LSN(lp->waiting_lsn); + ZERO_LSN(lp->max_wait_lsn); + /* + * Open if we need to, in preparation for the truncate + * we'll do in a moment. + */ + if (db_rep->rep_db == NULL && + (ret = __rep_client_dbinit(env, 0, REP_DB)) != 0) { + MUTEX_UNLOCK(env, rep->mtx_clientdb); + goto err; + } + + /* + * If we were in the middle of an internal initialization + * and we've discovered a new master instead, clean up + * our old internal init information. We need to clean + * up any flags and unlock our lockout. + */ + REP_SYSTEM_LOCK(env); + if (F_ISSET(rep, REP_F_READY_API | REP_F_READY_OP)) { + ret = __rep_init_cleanup(env, rep, DB_FORCE); + /* + * Note that if an in-progress internal init was indeed + * "cleaned up", clearing these flags now will allow the + * application to see a completely empty database + * environment for a moment (until the master responds + * to our ALL_REQ). + */ + F_CLR(rep, REP_F_ABBREVIATED | REP_F_RECOVER_MASK); + } + MUTEX_UNLOCK(env, rep->mtx_clientdb); + if (ret != 0) { + /* TODO: consider add'l error recovery steps. */ + goto errlck; + } + ENV_GET_THREAD_INFO(env, ip); + if ((ret = __db_truncate(db_rep->rep_db, ip, NULL, &unused)) + != 0) + goto errlck; + rep->stat.st_log_queued = 0; + + /* + * This needs to be performed under message lockout + * if we're actually changing master. 
+ */ + __rep_elect_done(env, rep, 1); + RPRINT(env, DB_VERB_REP_MISC, (env, + "Updating gen from %lu to %lu from master %d", + (u_long)rep->gen, (u_long)cntrl->gen, eid)); + rep->gen = cntrl->gen; + (void)__rep_write_gen(env, rep, rep->gen); + if (rep->egen <= rep->gen) + rep->egen = rep->gen + 1; + rep->master_id = eid; + STAT(rep->stat.st_master_changes++); + rep->stat.st_startup_complete = 0; + __log_set_version(env, cntrl->log_version); + rep->version = cntrl->rep_version; + RPRINT(env, DB_VERB_REP_MISC, (env, + "egen: %lu. rep version %lu", + (u_long)rep->egen, (u_long)rep->version)); + + /* + * If we're delaying client sync-up, we know we have a + * new/changed master now, set flag indicating we are + * actively delaying. + */ + if (FLD_ISSET(rep->config, REP_C_DELAYCLIENT)) + F_SET(rep, REP_F_DELAY); + F_SET(rep, REP_F_NOARCHIVE | REP_F_RECOVER_VERIFY); + F_CLR(rep, REP_F_READY_MSG); + (void)__memp_set_config(env->dbenv, DB_MEMP_SYNC_INTERRUPT, 0); + lockout = 0; + } else + __rep_elect_done(env, rep, 1); + REP_SYSTEM_UNLOCK(env); + + MUTEX_LOCK(env, rep->mtx_clientdb); + lsn = lp->ready_lsn; + + if (!change) { + ret = 0; + do_req = __rep_check_doreq(env, rep); + MUTEX_UNLOCK(env, rep->mtx_clientdb); + /* + * If there wasn't a change, we might still have some + * catching up or verification to do. + */ + if (do_req && + (F_ISSET(rep, REP_F_RECOVER_MASK) || + LOG_COMPARE(&lsn, &cntrl->lsn) < 0)) { + ret = __rep_resend_req(env, 0); + if (ret != 0) + RPRINT(env, DB_VERB_REP_MISC, (env, + "resend_req ret is %lu", (u_long)ret)); + } + /* + * If we're not in one of the recovery modes, we need to + * clear the NOARCHIVE flag. Elections set NOARCHIVE + * and if we called an election and found the same + * master, we need to clear NOARCHIVE here. 
+ */ + if (!F_ISSET(rep, REP_F_RECOVER_MASK)) { + REP_SYSTEM_LOCK(env); + F_CLR(rep, REP_F_NOARCHIVE); + REP_SYSTEM_UNLOCK(env); + } + return (ret); + } + MUTEX_UNLOCK(env, rep->mtx_clientdb); + + /* + * If the master changed, we need to start the process of + * figuring out what our last valid log record is. However, + * if both the master and we agree that the max LSN is 0,0, + * then there is no recovery to be done. If we are at 0 and + * the master is not, then we just need to request all the log + * records from the master. + */ + if (IS_INIT_LSN(lsn) || IS_ZERO_LSN(lsn)) { + if ((ret = __rep_newmaster_empty(env, eid)) != 0) + goto err; + goto newmaster_complete; + } + + memset(&dbt, 0, sizeof(dbt)); + /* + * If this client is farther ahead on the log file than the master, see + * if there is any overlap in the logs. If not, the client is too + * far ahead of the master and the client will start over. + */ + if (cntrl->lsn.file < lsn.file) { + if ((ret = __log_cursor(env, &logc)) != 0) + goto err; + ret = __logc_get(logc, &first_lsn, &dbt, DB_FIRST); + if ((t_ret = __logc_close(logc)) != 0 && ret == 0) + ret = t_ret; + if (ret == DB_NOTFOUND) + goto notfound; + else if (ret != 0) + goto err; + if (cntrl->lsn.file < first_lsn.file) + goto notfound; + } + if ((ret = __log_cursor(env, &logc)) != 0) + goto err; + ret = __rep_log_backup(env, rep, logc, &lsn); + if ((t_ret = __logc_close(logc)) != 0 && ret == 0) + ret = t_ret; + if (ret == DB_NOTFOUND) + goto notfound; + else if (ret != 0) + goto err; + + /* + * Finally, we have a record to ask for. 
+ */ + MUTEX_LOCK(env, rep->mtx_clientdb); + lp->verify_lsn = lsn; + __os_gettime(env, &lp->rcvd_ts, 1); + lp->wait_ts = rep->request_gap; + MUTEX_UNLOCK(env, rep->mtx_clientdb); + if (!F_ISSET(rep, REP_F_DELAY)) + (void)__rep_send_message(env, + eid, REP_VERIFY_REQ, &lsn, NULL, 0, DB_REP_ANYWHERE); + goto newmaster_complete; + +err: /* + * If we failed, we need to clear the flags we may have set above + * because we're not going to be setting the verify_lsn. + */ + REP_SYSTEM_LOCK(env); +errlck: if (lockout) { + F_CLR(rep, REP_F_READY_MSG); + (void)__memp_set_config(env->dbenv, DB_MEMP_SYNC_INTERRUPT, 0); + } + F_CLR(rep, REP_F_RECOVER_MASK | REP_F_DELAY); +lckout: REP_SYSTEM_UNLOCK(env); + return (ret); + +notfound: + /* + * If we don't have an identification record, we still + * might have some log records but we're discarding them + * to sync up with the master from the start. + * Therefore, truncate our log and treat it as if it + * were empty. In-memory logs can't be completely + * zeroed using __log_vtruncate, so just zero them out. + */ + RPRINT(env, DB_VERB_REP_MISC, + (env, "No commit or ckp found. Truncate log.")); + if (lp->db_log_inmemory) { + ZERO_LSN(lsn); + ret = __log_zero(env, &lsn); + } else { + INIT_LSN(lsn); + ret = __log_vtruncate(env, &lsn, &lsn, NULL); + } + if (ret != 0 && ret != DB_NOTFOUND) + return (ret); + infop = env->reginfo; + renv = infop->primary; + REP_SYSTEM_LOCK(env); + (void)time(&renv->rep_timestamp); + REP_SYSTEM_UNLOCK(env); + if ((ret = __rep_newmaster_empty(env, eid)) != 0) + goto err; +newmaster_complete: + return (DB_REP_NEWMASTER); +} + +/* + * __rep_newmaster_empty + * Handle the case of a NEWMASTER message received when we have an empty + * log. This requires internal init. If we can't do that because of + * NOAUTOINIT, return JOIN_FAILURE. If F_DELAY is in effect, don't even + * consider NOAUTOINIT yet, because they could change it before rep_sync call. 
+ */ +static int +__rep_newmaster_empty(env, eid) + ENV *env; + int eid; +{ + DB_REP *db_rep; + LOG *lp; + REP *rep; + int msg, ret; + + db_rep = env->rep_handle; + rep = db_rep->region; + lp = env->lg_handle->reginfo.primary; + msg = ret = 0; + + MUTEX_LOCK(env, rep->mtx_clientdb); + REP_SYSTEM_LOCK(env); + lp->wait_ts = rep->request_gap; + + /* Usual case is to skip to UPDATE state; we may revise this below. */ + F_CLR(rep, REP_F_RECOVER_VERIFY); + F_SET(rep, REP_F_RECOVER_UPDATE); + + if (F_ISSET(rep, REP_F_DELAY)) { + /* + * Having properly set up wait_ts for later, nothing more to + * do now. + */ + } else if (FLD_ISSET(rep->config, REP_C_NOAUTOINIT)) { + F_CLR(rep, REP_F_NOARCHIVE | REP_F_RECOVER_MASK); + ret = DB_REP_JOIN_FAILURE; + } else { + /* Normal case: neither DELAY nor NOAUTOINIT. */ + msg = 1; + } + REP_SYSTEM_UNLOCK(env); + MUTEX_UNLOCK(env, rep->mtx_clientdb); + + if (msg) + (void)__rep_send_message(env, eid, REP_UPDATE_REQ, + NULL, NULL, 0, 0); + return (ret); +} + +/* + * __rep_noarchive + * Used by log_archive to determine if it is okay to remove + * log files. + * + * PUBLIC: int __rep_noarchive __P((ENV *)); + */ +int +__rep_noarchive(env) + ENV *env; +{ + DB_REP *db_rep; + REGENV *renv; + REGINFO *infop; + REP *rep; + time_t timestamp; + + infop = env->reginfo; + renv = infop->primary; + + /* + * This is tested before REP_ON below because we always need + * to obey if any replication process has disabled archiving. + * Everything is in the environment region that we need here. + */ + if (F_ISSET(renv, DB_REGENV_REPLOCKED)) { + (void)time(&timestamp); + TIMESTAMP_CHECK(env, timestamp, renv); + /* + * Check if we're still locked out after checking + * the timestamp. + */ + if (F_ISSET(renv, DB_REGENV_REPLOCKED)) + return (EINVAL); + } + + if (!REP_ON(env)) + return (0); + + db_rep = env->rep_handle; + rep = db_rep->region; + return (F_ISSET(rep, REP_F_NOARCHIVE) ? 1 : 0); +} + +/* + * __rep_send_vote + * Send this site's vote for the election. 
+ * + * PUBLIC: void __rep_send_vote __P((ENV *, DB_LSN *, u_int32_t, u_int32_t, + * PUBLIC: u_int32_t, u_int32_t, u_int32_t, int, u_int32_t, u_int32_t)); + */ +void +__rep_send_vote(env, lsnp, nsites, nvotes, pri, tie, egen, eid, vtype, flags) + ENV *env; + DB_LSN *lsnp; + int eid; + u_int32_t nsites, nvotes, pri; + u_int32_t flags, egen, tie, vtype; +{ + DB_REP *db_rep; + DBT vote_dbt; + REP *rep; + REP_OLD_VOTE_INFO ovi; + __rep_vote_info_args vi; + u_int8_t buf[__REP_VOTE_INFO_SIZE]; + size_t len; + + db_rep = env->rep_handle; + rep = db_rep->region; + + memset(&vi, 0, sizeof(vi)); + memset(&vote_dbt, 0, sizeof(vote_dbt)); + + /* + * In 4.7 we went to fixed sized fields. They may not be + * the same as the sizes in older versions. + */ + if (rep->version < DB_REPVERSION_47) { + memset(&ovi, 0, sizeof(ovi)); + ovi.egen = egen; + ovi.priority = (int) pri; + ovi.nsites = (int) nsites; + ovi.nvotes = (int) nvotes; + ovi.tiebreaker = tie; + vote_dbt.data = &ovi; + vote_dbt.size = sizeof(ovi); + } else { + vi.egen = egen; + vi.priority = pri; + vi.nsites = nsites; + vi.nvotes = nvotes; + vi.tiebreaker = tie; + (void)__rep_vote_info_marshal(env, &vi, buf, + __REP_VOTE_INFO_SIZE, &len); + DB_INIT_DBT(vote_dbt, buf, len); + } + + (void)__rep_send_message(env, eid, vtype, lsnp, &vote_dbt, flags, 0); +} + +/* + * __rep_elect_done + * Clear all election information for this site. Assumes the + * caller holds the region mutex. + * + * PUBLIC: void __rep_elect_done __P((ENV *, REP *, int)); + */ +void +__rep_elect_done(env, rep, found_master) + ENV *env; + REP *rep; + int found_master; +{ + int inelect; + db_timespec endtime; + + inelect = IN_ELECTION(rep); + F_CLR(rep, REP_F_EPHASE1 | REP_F_EPHASE2 | REP_F_TALLY); + /* + * Finding a master trumps finding a new egen. 
+ */ + if (found_master) + F_CLR(rep, REP_F_EGENUPDATE); + rep->sites = 0; + rep->votes = 0; + if (inelect) { + if (timespecisset(&rep->etime)) { + __os_gettime(env, &endtime, 1); + timespecsub(&endtime, &rep->etime); +#ifdef HAVE_STATISTICS + rep->stat.st_election_sec = (u_int32_t)endtime.tv_sec; + rep->stat.st_election_usec = (u_int32_t) + (endtime.tv_nsec / NS_PER_US); +#endif + RPRINT(env, DB_VERB_REP_ELECT, (env, + "Election finished in %lu.%09lu sec", + (u_long)endtime.tv_sec, (u_long)endtime.tv_nsec)); + timespecclear(&rep->etime); + } + rep->egen++; + } + RPRINT(env, DB_VERB_REP_ELECT, + (env, "Election done; egen %lu", (u_long)rep->egen)); +} + +/* + * __env_rep_enter -- + * + * Check if we are in the middle of replication initialization and/or + * recovery, and if so, disallow operations. If operations are allowed, + * increment handle-counts, so that we do not start recovery while we + * are operating in the library. + * + * PUBLIC: int __env_rep_enter __P((ENV *, int)); + */ +int +__env_rep_enter(env, checklock) + ENV *env; + int checklock; +{ + DB_REP *db_rep; + REGENV *renv; + REGINFO *infop; + REP *rep; + int cnt; + time_t timestamp; + + /* Check if locks have been globally turned off. */ + if (F_ISSET(env->dbenv, DB_ENV_NOLOCKING)) + return (0); + + db_rep = env->rep_handle; + rep = db_rep->region; + + infop = env->reginfo; + renv = infop->primary; + if (checklock && F_ISSET(renv, DB_REGENV_REPLOCKED)) { + (void)time(&timestamp); + TIMESTAMP_CHECK(env, timestamp, renv); + /* + * Check if we're still locked out after checking + * the timestamp. + */ + if (F_ISSET(renv, DB_REGENV_REPLOCKED)) + return (EINVAL); + } + + REP_SYSTEM_LOCK(env); + for (cnt = 0; F_ISSET(rep, REP_F_READY_API);) { + REP_SYSTEM_UNLOCK(env); + /* + * We're spinning - environment may be hung. Check if + * recovery has been initiated. + */ + PANIC_CHECK(env); + if (FLD_ISSET(rep->config, REP_C_NOWAIT)) { + __db_errx(env, + "Operation locked out. 
Waiting for replication lockout to complete"); + return (DB_REP_LOCKOUT); + } + __os_yield(env, 1, 0); + REP_SYSTEM_LOCK(env); + if (++cnt % 60 == 0) + __db_errx(env, + "DB_ENV handle waiting %d minutes for replication lockout to complete", + cnt / 60); + } + rep->handle_cnt++; + REP_SYSTEM_UNLOCK(env); + + return (0); +} + +/* + * __env_db_rep_exit -- + * + * Decrement handle count upon routine exit. + * + * PUBLIC: int __env_db_rep_exit __P((ENV *)); + */ +int +__env_db_rep_exit(env) + ENV *env; +{ + DB_REP *db_rep; + REP *rep; + + /* Check if locks have been globally turned off. */ + if (F_ISSET(env->dbenv, DB_ENV_NOLOCKING)) + return (0); + + db_rep = env->rep_handle; + rep = db_rep->region; + + REP_SYSTEM_LOCK(env); + rep->handle_cnt--; + REP_SYSTEM_UNLOCK(env); + + return (0); +} + +/* + * __db_rep_enter -- + * Called in replicated environments to keep track of in-use handles + * and prevent any concurrent operation during recovery. If checkgen is + * non-zero, then we verify that the dbp has the same handle as the env. + * + * If return_now is non-zero, we'll return DB_DEADLOCK immediately, else we'll + * sleep before returning DB_DEADLOCK. Without the sleep, it is likely + * the application will immediately try again and could reach a retry + * limit before replication has a chance to finish. The sleep increases + * the probability that an application retry will succeed. + * + * Typically calls with txns set return_now so that we return immediately. + * We want to return immediately because we want the txn to abort ASAP + * so that the lockout can proceed. + * + * PUBLIC: int __db_rep_enter __P((DB *, int, int, int)); + */ +int +__db_rep_enter(dbp, checkgen, checklock, return_now) + DB *dbp; + int checkgen, checklock, return_now; +{ + DB_REP *db_rep; + ENV *env; + REGENV *renv; + REGINFO *infop; + REP *rep; + time_t timestamp; + + env = dbp->env; + /* Check if locks have been globally turned off. 
*/ + if (F_ISSET(env->dbenv, DB_ENV_NOLOCKING)) + return (0); + + db_rep = env->rep_handle; + rep = db_rep->region; + infop = env->reginfo; + renv = infop->primary; + + if (checklock && F_ISSET(renv, DB_REGENV_REPLOCKED)) { + (void)time(&timestamp); + TIMESTAMP_CHECK(env, timestamp, renv); + /* + * Check if we're still locked out after checking + * the timestamp. + */ + if (F_ISSET(renv, DB_REGENV_REPLOCKED)) + return (EINVAL); + } + REP_SYSTEM_LOCK(env); + /* + * !!! + * Note, we are checking REP_F_READY_OP, but we are + * incrementing rep->handle_cnt. That seems like a mismatch, + * but the intention is to return DEADLOCK to the application + * which will cause them to abort the txn quickly and allow + * the lockout to proceed. + * + * The correctness of doing this depends on the fact that + * lockout of the API always sets REP_F_READY_OP first. + */ + if (F_ISSET(rep, REP_F_READY_OP)) { + REP_SYSTEM_UNLOCK(env); + if (!return_now) + __os_yield(env, 5, 0); + return (DB_LOCK_DEADLOCK); + } + + if (checkgen && dbp->timestamp != renv->rep_timestamp) { + REP_SYSTEM_UNLOCK(env); + __db_errx(env, "%s %s", + "replication recovery unrolled committed transactions;", + "open DB and DBcursor handles must be closed"); + return (DB_REP_HANDLE_DEAD); + } + rep->handle_cnt++; + REP_SYSTEM_UNLOCK(env); + + return (0); +} + +/* + * __op_rep_enter -- + * + * Check if we are in the middle of replication initialization and/or + * recovery, and if so, disallow new multi-step operations, such as + * transaction and memp gets. If operations are allowed, + * increment the op_cnt, so that we do not start recovery while we have + * active operations. + * + * PUBLIC: int __op_rep_enter __P((ENV *)); + */ +int +__op_rep_enter(env) + ENV *env; +{ + DB_REP *db_rep; + REP *rep; + int cnt; + + /* Check if locks have been globally turned off. 
*/ + if (F_ISSET(env->dbenv, DB_ENV_NOLOCKING)) + return (0); + + db_rep = env->rep_handle; + rep = db_rep->region; + + REP_SYSTEM_LOCK(env); + for (cnt = 0; F_ISSET(rep, REP_F_READY_OP);) { + REP_SYSTEM_UNLOCK(env); + /* + * We're spinning - environment may be hung. Check if + * recovery has been initiated. + */ + PANIC_CHECK(env); + if (FLD_ISSET(rep->config, REP_C_NOWAIT)) { + __db_errx(env, + "Operation locked out. Waiting for replication lockout to complete"); + return (DB_REP_LOCKOUT); + } + __os_yield(env, 5, 0); + cnt += 5; + REP_SYSTEM_LOCK(env); + if (cnt % 60 == 0) + __db_errx(env, + "__op_rep_enter waiting %d minutes for lockout to complete", + cnt / 60); + } + rep->op_cnt++; + REP_SYSTEM_UNLOCK(env); + + return (0); +} + +/* + * __op_rep_exit -- + * + * Decrement op count upon transaction commit/abort/discard or + * memp_fput. + * + * PUBLIC: int __op_rep_exit __P((ENV *)); + */ +int +__op_rep_exit(env) + ENV *env; +{ + DB_REP *db_rep; + REP *rep; + + /* Check if locks have been globally turned off. */ + if (F_ISSET(env->dbenv, DB_ENV_NOLOCKING)) + return (0); + + db_rep = env->rep_handle; + rep = db_rep->region; + + REP_SYSTEM_LOCK(env); + DB_ASSERT(env, rep->op_cnt > 0); + rep->op_cnt--; + REP_SYSTEM_UNLOCK(env); + + return (0); +} + +/* + * __rep_lockout_api -- + * Coordinate with other threads in the library and active txns so + * that we can run single-threaded, for recovery or internal backup. + * Assumes the caller holds the region mutex. + * + * PUBLIC: int __rep_lockout_api __P((ENV *, REP *)); + */ +int +__rep_lockout_api(env, rep) + ENV *env; + REP *rep; +{ + int ret; + + /* + * We must drain long-running operations first. We check + * REP_F_READY_OP in __db_rep_enter in order to allow them + * to abort existing txns quickly. Therefore, we must + * always lockout REP_F_READY_OP first, then REP_F_READY_API. 
+ */ + if ((ret = __rep_lockout_int(env, rep, &rep->op_cnt, 0, + "op_cnt", REP_F_READY_OP)) != 0) + return (ret); + return (__rep_lockout_int(env, rep, &rep->handle_cnt, 0, + "handle_cnt", REP_F_READY_API)); +} + +/* + * __rep_lockout_apply -- + * Coordinate with other threads processing messages so that + * we can run single-threaded and know that no incoming + * message can apply new log records. + * This call should be short-term covering a specific critical + * operation where we need to make sure no new records change + * the log. Currently used to coordinate with elections. + * Assumes the caller holds the region mutex. + * + * PUBLIC: int __rep_lockout_apply __P((ENV *, REP *, u_int32_t)); + */ +int +__rep_lockout_apply(env, rep, apply_th) + ENV *env; + REP *rep; + u_int32_t apply_th; +{ + return (__rep_lockout_int(env, rep, &rep->apply_th, apply_th, + "apply_th", REP_F_READY_APPLY)); +} + +/* + * __rep_lockout_msg -- + * Coordinate with other threads processing messages so that + * we can run single-threaded and know that no incoming + * message can change the world (i.e., like a NEWMASTER message). + * This call should be short-term covering a specific critical + * operation where we need to make sure no new messages arrive + * in the middle and all message threads are out before we start it. + * Assumes the caller holds the region mutex. + * + * PUBLIC: int __rep_lockout_msg __P((ENV *, REP *, u_int32_t)); + */ +int +__rep_lockout_msg(env, rep, msg_th) + ENV *env; + REP *rep; + u_int32_t msg_th; +{ + return (__rep_lockout_int(env, rep, &rep->msg_th, msg_th, + "msg_th", REP_F_READY_MSG)); +} + +/* + * __rep_lockout_int -- + * Internal common code for locking out and coordinating + * with other areas of the code. + * Assumes the caller holds the region mutex. 
+ * + */ +static int +__rep_lockout_int(env, rep, fieldp, field_val, msg, lockout_flag) + ENV *env; + REP *rep; + u_int32_t *fieldp; + const char *msg; + u_int32_t field_val, lockout_flag; +{ + int wait_cnt; + + F_SET(rep, lockout_flag); + for (wait_cnt = 0; *fieldp > field_val;) { + REP_SYSTEM_UNLOCK(env); + /* We're spinning - environment may be hung. Check if + * recovery has been initiated. + */ + PANIC_CHECK(env); + __os_yield(env, 1, 0); +#ifdef DIAGNOSTIC + if (wait_cnt == 5) + __db_errx(env, +"Waiting for %s (%lu) to complete replication lockout", + msg, (u_long)*fieldp); + if (++wait_cnt % 60 == 0) + __db_errx(env, +"Waiting for %s (%lu) to complete replication lockout for %d minutes", + msg, (u_long)*fieldp, wait_cnt / 60); +#endif + REP_SYSTEM_LOCK(env); + } + + COMPQUIET(msg, NULL); + return (0); +} + +/* + * __rep_send_throttle - + * Send a record, throttling if necessary. Callers of this function + * will throttle - breaking out of their loop, if the repth->type field + * changes from the normal message type to the *_MORE message type. + * This function will send the normal type unless throttling gets invoked. + * Then it sets the type field and sends the _MORE message. + * + * Throttling is always only relevant in serving requests, so we always send + * with REPCTL_RESEND. Additional desired flags can be passed in the ctlflags + * argument. + * + * PUBLIC: int __rep_send_throttle __P((ENV *, int, REP_THROTTLE *, + * PUBLIC: u_int32_t, u_int32_t)); + */ +int +__rep_send_throttle(env, eid, repth, flags, ctlflags) + ENV *env; + int eid; + REP_THROTTLE *repth; + u_int32_t ctlflags, flags; +{ + DB_REP *db_rep; + REP *rep; + u_int32_t size, typemore; + int check_limit; + + check_limit = repth->gbytes != 0 || repth->bytes != 0; + /* + * If we only want to do throttle processing and we don't have it + * turned on, return immediately. 
+ */ + if (!check_limit && LF_ISSET(REP_THROTTLE_ONLY)) + return (0); + + db_rep = env->rep_handle; + rep = db_rep->region; + typemore = 0; + if (repth->type == REP_LOG) + typemore = REP_LOG_MORE; + if (repth->type == REP_PAGE) + typemore = REP_PAGE_MORE; + DB_ASSERT(env, typemore != 0); + + /* + * data_dbt.size is only the size of the log + * record; it doesn't count the size of the + * control structure. Factor that in as well + * so we're not off by a lot if our log records + * are small. + */ + size = repth->data_dbt->size + sizeof(__rep_control_args); + if (check_limit) { + while (repth->bytes <= size) { + if (repth->gbytes > 0) { + repth->bytes += GIGABYTE; + --(repth->gbytes); + continue; + } + /* + * We don't hold the rep mutex, + * and may miscount. + */ + STAT(rep->stat.st_nthrottles++); + repth->type = typemore; + goto send; + } + repth->bytes -= size; + } + /* + * Always send if it is typemore, otherwise send only if + * REP_THROTTLE_ONLY is not set. + * + * NOTE: It is the responsibility of the caller to marshal, if + * needed, the data_dbt. This function just sends what it is given. + */ +send: if ((repth->type == typemore || !LF_ISSET(REP_THROTTLE_ONLY)) && + (__rep_send_message(env, eid, repth->type, + &repth->lsn, repth->data_dbt, (REPCTL_RESEND | ctlflags), 0) != 0)) + return (DB_REP_UNAVAIL); + return (0); +} + +/* + * __rep_msg_to_old -- + * Convert current message numbers to old message numbers. + * + * PUBLIC: u_int32_t __rep_msg_to_old __P((u_int32_t, u_int32_t)); + */ +u_int32_t +__rep_msg_to_old(version, rectype) + u_int32_t version, rectype; +{ + /* + * We need to convert from current message numbers to old numbers and + * we need to convert from old numbers to current numbers. Offset by + * one for more readable code. + */ + /* + * Everything for version 0 is invalid, there is no version 0. + */ + static const u_int32_t table[DB_REPVERSION][REP_MAX_MSG+1] = { + /* There is no DB_REPVERSION 0. 
*/ + { REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID }, + /* + * 4.2/DB_REPVERSION 1 no longer supported. + */ + { REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID }, + /* + * 4.3/DB_REPVERSION 2 no longer supported. + */ + { REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID }, + /* + * From 4.7 message number To 4.4/4.5 message number + */ + { REP_INVALID, /* NO message 0 */ + 1, /* REP_ALIVE */ + 2, /* REP_ALIVE_REQ */ + 3, /* REP_ALL_REQ */ + 4, /* REP_BULK_LOG */ + 5, /* REP_BULK_PAGE */ + 6, /* REP_DUPMASTER */ + 7, /* REP_FILE */ + 8, /* REP_FILE_FAIL */ + 9, /* REP_FILE_REQ */ + REP_INVALID, /* REP_LEASE_GRANT */ + 10, /* REP_LOG */ + 11, /* REP_LOG_MORE */ + 12, /* REP_LOG_REQ */ + 13, /* REP_MASTER_REQ */ + 14, /* REP_NEWCLIENT */ + 15, /* REP_NEWFILE */ + 16, /* REP_NEWMASTER */ + 17, /* REP_NEWSITE */ + 18, /* REP_PAGE */ + 19, /* 
REP_PAGE_FAIL */ + 20, /* REP_PAGE_MORE */ + 21, /* REP_PAGE_REQ */ + 22, /* REP_REREQUEST */ + REP_INVALID, /* REP_START_SYNC */ + 23, /* REP_UPDATE */ + 24, /* REP_UPDATE_REQ */ + 25, /* REP_VERIFY */ + 26, /* REP_VERIFY_FAIL */ + 27, /* REP_VERIFY_REQ */ + 28, /* REP_VOTE1 */ + 29 /* REP_VOTE2 */ + }, + /* + * From 4.7 message number To 4.6 message number. There are + * NO message differences between 4.6 and 4.7. The + * control structure changed. + */ + { REP_INVALID, /* NO message 0 */ + 1, /* REP_ALIVE */ + 2, /* REP_ALIVE_REQ */ + 3, /* REP_ALL_REQ */ + 4, /* REP_BULK_LOG */ + 5, /* REP_BULK_PAGE */ + 6, /* REP_DUPMASTER */ + 7, /* REP_FILE */ + 8, /* REP_FILE_FAIL */ + 9, /* REP_FILE_REQ */ + 10, /* REP_LEASE_GRANT */ + 11, /* REP_LOG */ + 12, /* REP_LOG_MORE */ + 13, /* REP_LOG_REQ */ + 14, /* REP_MASTER_REQ */ + 15, /* REP_NEWCLIENT */ + 16, /* REP_NEWFILE */ + 17, /* REP_NEWMASTER */ + 18, /* REP_NEWSITE */ + 19, /* REP_PAGE */ + 20, /* REP_PAGE_FAIL */ + 21, /* REP_PAGE_MORE */ + 22, /* REP_PAGE_REQ */ + 23, /* REP_REREQUEST */ + 24, /* REP_START_SYNC */ + 25, /* REP_UPDATE */ + 26, /* REP_UPDATE_REQ */ + 27, /* REP_VERIFY */ + 28, /* REP_VERIFY_FAIL */ + 29, /* REP_VERIFY_REQ */ + 30, /* REP_VOTE1 */ + 31 /* REP_VOTE2 */ + } + }; + return (table[version][rectype]); +} + +/* + * __rep_msg_from_old -- + * Convert old message numbers to current message numbers. + * + * PUBLIC: u_int32_t __rep_msg_from_old __P((u_int32_t, u_int32_t)); + */ +u_int32_t +__rep_msg_from_old(version, rectype) + u_int32_t version, rectype; +{ + /* + * We need to convert from current message numbers to old numbers and + * we need to convert from old numbers to current numbers. Offset by + * one for more readable code. + */ + /* + * Everything for version 0 is invalid, there is no version 0. + */ + static const u_int32_t table[DB_REPVERSION][REP_MAX_MSG+1] = { + /* There is no DB_REPVERSION 0. 
*/ + { REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID }, + /* + * 4.2/DB_REPVERSION 1 no longer supported. + */ + { REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID }, + /* + * 4.3/DB_REPVERSION 2 no longer supported. + */ + { REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID, + REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID }, + /* + * From 4.4/4.5 message number To 4.7 message number + */ + { REP_INVALID, /* NO message 0 */ + 1, /* 1, REP_ALIVE */ + 2, /* 2, REP_ALIVE_REQ */ + 3, /* 3, REP_ALL_REQ */ + 4, /* 4, REP_BULK_LOG */ + 5, /* 5, REP_BULK_PAGE */ + 6, /* 6, REP_DUPMASTER */ + 7, /* 7, REP_FILE */ + 8, /* 8, REP_FILE_FAIL */ + 9, /* 9, REP_FILE_REQ */ + /* 10, REP_LEASE_GRANT doesn't exist */ + 11, /* 10, REP_LOG */ + 12, /* 11, REP_LOG_MORE */ + 13, /* 12, REP_LOG_REQ */ + 14, /* 13, REP_MASTER_REQ */ + 15, /* 14, REP_NEWCLIENT */ + 16, /* 15, REP_NEWFILE */ + 17, /* 16, REP_NEWMASTER 
*/ + 18, /* 17, REP_NEWSITE */ + 19, /* 18, REP_PAGE */ + 20, /* 19, REP_PAGE_FAIL */ + 21, /* 20, REP_PAGE_MORE */ + 22, /* 21, REP_PAGE_REQ */ + 23, /* 22, REP_REREQUEST */ + /* 24, REP_START_SYNC doesn't exist */ + 25, /* 23, REP_UPDATE */ + 26, /* 24, REP_UPDATE_REQ */ + 27, /* 25, REP_VERIFY */ + 28, /* 26, REP_VERIFY_FAIL */ + 29, /* 27, REP_VERIFY_REQ */ + 30, /* 28, REP_VOTE1 */ + 31, /* 29, REP_VOTE2 */ + REP_INVALID, /* 30, 4.4/4.5 no message */ + REP_INVALID /* 31, 4.4/4.5 no message */ + }, + /* + * From 4.6 message number To 4.7 message number. There are + * NO message differences between 4.6 and 4.7. The + * control structure changed. + */ + { REP_INVALID, /* NO message 0 */ + 1, /* 1, REP_ALIVE */ + 2, /* 2, REP_ALIVE_REQ */ + 3, /* 3, REP_ALL_REQ */ + 4, /* 4, REP_BULK_LOG */ + 5, /* 5, REP_BULK_PAGE */ + 6, /* 6, REP_DUPMASTER */ + 7, /* 7, REP_FILE */ + 8, /* 8, REP_FILE_FAIL */ + 9, /* 9, REP_FILE_REQ */ + 10, /* 10, REP_LEASE_GRANT */ + 11, /* 11, REP_LOG */ + 12, /* 12, REP_LOG_MORE */ + 13, /* 13, REP_LOG_REQ */ + 14, /* 14, REP_MASTER_REQ */ + 15, /* 15, REP_NEWCLIENT */ + 16, /* 16, REP_NEWFILE */ + 17, /* 17, REP_NEWMASTER */ + 18, /* 18, REP_NEWSITE */ + 19, /* 19, REP_PAGE */ + 20, /* 20, REP_PAGE_FAIL */ + 21, /* 21, REP_PAGE_MORE */ + 22, /* 22, REP_PAGE_REQ */ + 23, /* 23, REP_REREQUEST */ + 24, /* 24, REP_START_SYNC */ + 25, /* 25, REP_UPDATE */ + 26, /* 26, REP_UPDATE_REQ */ + 27, /* 27, REP_VERIFY */ + 28, /* 28, REP_VERIFY_FAIL */ + 29, /* 29, REP_VERIFY_REQ */ + 30, /* 30, REP_VOTE1 */ + 31 /* 31, REP_VOTE2 */ + } + }; + return (table[version][rectype]); +} + +/* + * __rep_print -- + * Optionally print a verbose message. + * + * PUBLIC: void __rep_print __P((ENV *, const char *, ...)) + * PUBLIC: __attribute__ ((__format__ (__printf__, 2, 3))); + */ +void +#ifdef STDC_HEADERS +__rep_print(ENV *env, const char *fmt, ...) 
+#else +__rep_print(env, fmt, va_alist) + ENV *env; + const char *fmt; + va_dcl +#endif +{ + va_list ap; + DB_MSGBUF mb; + REP *rep; + db_timespec ts; + pid_t pid; + db_threadid_t tid; + const char *s; + char buf[DB_THREADID_STRLEN]; + + DB_MSGBUF_INIT(&mb); + + s = NULL; + if (env->dbenv->db_errpfx != NULL) + s = env->dbenv->db_errpfx; + else if (REP_ON(env)) { + rep = env->rep_handle->region; + if (F_ISSET(rep, REP_F_CLIENT)) + s = "CLIENT"; + else if (F_ISSET(rep, REP_F_MASTER)) + s = "MASTER"; + } + if (s == NULL) + s = "REP_UNDEF"; + __os_gettime(env, &ts, 1); + __os_id(env->dbenv, &pid, &tid); + __db_msgadd(env, &mb, "[%lu:%lu][%s] %s: ", + (u_long)ts.tv_sec, (u_long)ts.tv_nsec/NS_PER_US, + env->dbenv->thread_id_string(env->dbenv, pid, tid, buf), s); + +#ifdef STDC_HEADERS + va_start(ap, fmt); +#else + va_start(ap); +#endif + __db_msgadd_ap(env, &mb, fmt, ap); + va_end(ap); + + DB_MSGBUF_FLUSH(env, &mb); +} + +/* + * PUBLIC: void __rep_print_message + * PUBLIC: __P((ENV *, int, __rep_control_args *, char *, u_int32_t)); + */ +void +__rep_print_message(env, eid, rp, str, flags) + ENV *env; + int eid; + __rep_control_args *rp; + char *str; + u_int32_t flags; +{ + u_int32_t ctlflags, rectype; + char ftype[64], *type; + + rectype = rp->rectype; + ctlflags = rp->flags; + if (rp->rep_version != DB_REPVERSION) + rectype = __rep_msg_from_old(rp->rep_version, rectype); + switch (rectype) { + case REP_ALIVE: + type = "alive"; + break; + case REP_ALIVE_REQ: + type = "alive_req"; + break; + case REP_ALL_REQ: + type = "all_req"; + break; + case REP_BULK_LOG: + type = "bulk_log"; + break; + case REP_BULK_PAGE: + type = "bulk_page"; + break; + case REP_DUPMASTER: + type = "dupmaster"; + break; + case REP_FILE: + type = "file"; + break; + case REP_FILE_FAIL: + type = "file_fail"; + break; + case REP_FILE_REQ: + type = "file_req"; + break; + case REP_LEASE_GRANT: + type = "lease_grant"; + break; + case REP_LOG: + type = "log"; + break; + case REP_LOG_MORE: + type = 
"log_more"; + break; + case REP_LOG_REQ: + type = "log_req"; + break; + case REP_MASTER_REQ: + type = "master_req"; + break; + case REP_NEWCLIENT: + type = "newclient"; + break; + case REP_NEWFILE: + type = "newfile"; + break; + case REP_NEWMASTER: + type = "newmaster"; + break; + case REP_NEWSITE: + type = "newsite"; + break; + case REP_PAGE: + type = "page"; + break; + case REP_PAGE_FAIL: + type = "page_fail"; + break; + case REP_PAGE_MORE: + type = "page_more"; + break; + case REP_PAGE_REQ: + type = "page_req"; + break; + case REP_REREQUEST: + type = "rerequest"; + break; + case REP_START_SYNC: + type = "start_sync"; + break; + case REP_UPDATE: + type = "update"; + break; + case REP_UPDATE_REQ: + type = "update_req"; + break; + case REP_VERIFY: + type = "verify"; + break; + case REP_VERIFY_FAIL: + type = "verify_fail"; + break; + case REP_VERIFY_REQ: + type = "verify_req"; + break; + case REP_VOTE1: + type = "vote1"; + break; + case REP_VOTE2: + type = "vote2"; + break; + default: + type = "NOTYPE"; + break; + } + + /* + * !!! + * If adding new flags to print out make sure the aggregate + * length cannot overflow the buffer. + */ + ftype[0] = '\0'; + if (LF_ISSET(DB_REP_ANYWHERE)) + (void)strcat(ftype, " any"); /* 4 */ + if (FLD_ISSET(ctlflags, REPCTL_FLUSH)) + (void)strcat(ftype, " flush"); /* 10 */ + /* + * We expect most of the time the messages will indicate + * group membership. Only print if we're not already + * part of a group. 
+ */ + if (!FLD_ISSET(ctlflags, REPCTL_GROUP_ESTD)) + (void)strcat(ftype, " nogroup"); /* 18 */ + if (FLD_ISSET(ctlflags, REPCTL_LEASE)) + (void)strcat(ftype, " lease"); /* 24 */ + if (LF_ISSET(DB_REP_NOBUFFER)) + (void)strcat(ftype, " nobuf"); /* 30 */ + if (FLD_ISSET(ctlflags, REPCTL_PERM)) + (void)strcat(ftype, " perm"); /* 35 */ + if (LF_ISSET(DB_REP_REREQUEST)) + (void)strcat(ftype, " rereq"); /* 41 */ + if (FLD_ISSET(ctlflags, REPCTL_RESEND)) + (void)strcat(ftype, " resend"); /* 48 */ + if (FLD_ISSET(ctlflags, REPCTL_LOG_END)) + (void)strcat(ftype, " logend"); /* 55 */ + RPRINT(env, DB_VERB_REP_MSGS, + (env, + "%s %s: msgv = %lu logv %lu gen = %lu eid %d, type %s, LSN [%lu][%lu] %s", + env->db_home, str, + (u_long)rp->rep_version, (u_long)rp->log_version, (u_long)rp->gen, + eid, type, (u_long)rp->lsn.file, (u_long)rp->lsn.offset, ftype)); + /* + * Make sure the version is close, and not swapped + * here. Check for current version, +/- a little bit. + */ + DB_ASSERT(env, rp->rep_version <= DB_REPVERSION+10); + DB_ASSERT(env, rp->log_version <= DB_LOGVERSION+10); +} + +/* + * PUBLIC: void __rep_fire_event __P((ENV *, u_int32_t, void *)); + */ +void +__rep_fire_event(env, event, info) + ENV *env; + u_int32_t event; + void *info; +{ + int ret; + + /* + * Give repmgr first crack at handling all replication-related events. + * If it can't (or chooses not to) handle the event fully, then pass it + * along to the application. + */ + ret = __repmgr_handle_event(env, event, info); + DB_ASSERT(env, ret == 0 || ret == DB_EVENT_NOT_HANDLED); + + if (ret == DB_EVENT_NOT_HANDLED) + DB_EVENT(env, event, info); +} diff --git a/rep/rep_verify.c b/rep/rep_verify.c new file mode 100644 index 0000000..d90b3aa --- /dev/null +++ b/rep/rep_verify.c @@ -0,0 +1,766 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2004-2009 Oracle. All rights reserved. 
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/log.h"
+#include "dbinc/txn.h"
+
+static int __rep_internal_init __P((ENV *, u_int32_t));
+
+/*
+ * __rep_verify --
+ *	Handle a REP_VERIFY message.
+ *
+ *	Reads the local log record at rp->lsn (or the last record in the log
+ *	when lp->verify_lsn is zero) and compares it byte-for-byte with the
+ *	record supplied in rec.  On a mismatch we back up to an earlier
+ *	checkpoint/commit record with __rep_log_backup and send a new
+ *	REP_VERIFY_REQ; on a match we go on to __rep_verify_match, or to an
+ *	abbreviated internal init if REP_F_NIMDBS_LOADED is not set.
+ *	Returns 0 without doing anything unless REP_F_RECOVER_VERIFY is set.
+ *
+ * PUBLIC: int __rep_verify __P((ENV *, __rep_control_args *, DBT *,
+ * PUBLIC:     int, time_t));
+ */
+int
+__rep_verify(env, rp, rec, eid, savetime)
+	ENV *env;
+	__rep_control_args *rp;
+	DBT *rec;
+	int eid;
+	time_t savetime;
+{
+	DBT mylog;
+	DB_LOG *dblp;
+	DB_LOGC *logc;
+	DB_LSN lsn, prev_ckp;
+	DB_REP *db_rep;
+	LOG *lp;
+	REP *rep;
+	__txn_ckp_args *ckp_args;
+	u_int32_t logflag, rectype;
+	int master, match, ret, t_ret;
+
+	ret = 0;
+	db_rep = env->rep_handle;
+	rep = db_rep->region;
+	dblp = env->lg_handle;
+	lp = dblp->reginfo.primary;
+
+	/* Do nothing if VERIFY flag is not set. */
+	if (!F_ISSET(rep, REP_F_RECOVER_VERIFY))
+		return (ret);
+
+#ifdef DIAGNOSTIC
+	/*
+	 * We should not ever be in internal init with a lease granted.
+	 */
+	if (IS_USING_LEASES(env)) {
+		REP_SYSTEM_LOCK(env);
+		DB_ASSERT(env, __rep_islease_granted(env) == 0);
+		REP_SYSTEM_UNLOCK(env);
+	}
+#endif
+
+	if ((ret = __log_cursor(env, &logc)) != 0)
+		return (ret);
+	memset(&mylog, 0, sizeof(mylog));
+	/* If verify_lsn of ZERO is passed in, get last log. */
+	MUTEX_LOCK(env, rep->mtx_clientdb);
+	logflag = IS_ZERO_LSN(lp->verify_lsn) ? DB_LAST : DB_SET;
+	prev_ckp = lp->prev_ckp;
+	MUTEX_UNLOCK(env, rep->mtx_clientdb);
+	if ((ret = __logc_get(logc, &rp->lsn, &mylog, logflag)) != 0)
+		goto out;
+	match = 0;
+	if (mylog.size == rec->size &&
+	    memcmp(mylog.data, rec->data, rec->size) == 0)
+		match = 1;
+	/*
+	 * If we don't have a match, backup to the previous
+	 * identification record and try again.
+	 */
+	if (match == 0) {
+		master = rep->master_id;
+		/*
+		 * We will eventually roll back over this log record (unless we
+		 * ultimately have to give up and do an internal init). So, if
+		 * it was a checkpoint, make sure we don't end up without any
+		 * checkpoints left in the entire log.
+		 */
+		LOGCOPY_32(env, &rectype, mylog.data);
+		DB_ASSERT(env, ret == 0);
+		if (!lp->db_log_inmemory && rectype == DB___txn_ckp) {
+			if ((ret = __txn_ckp_read(env,
+			    mylog.data, &ckp_args)) != 0)
+				goto out;
+			lsn = ckp_args->last_ckp;
+			__os_free(env, ckp_args);
+			MUTEX_LOCK(env, rep->mtx_clientdb);
+			lp->prev_ckp = lsn;
+			MUTEX_UNLOCK(env, rep->mtx_clientdb);
+			if (IS_ZERO_LSN(lsn)) {
+				/*
+				 * No previous checkpoints? The only way this
+				 * is OK is if we have the entire log, all the
+				 * way back to file #1.
+				 */
+				if ((ret = __logc_get(logc,
+				    &lsn, &mylog, DB_FIRST)) != 0)
+					goto out;
+				if (lsn.file != 1) {
+					ret = __rep_internal_init(env, 0);
+					goto out;
+				}
+
+				/* Restore position of log cursor. */
+				if ((ret = __logc_get(logc,
+				    &rp->lsn, &mylog, DB_SET)) != 0)
+					goto out;
+			}
+		}
+		if ((ret = __rep_log_backup(env, rep, logc, &lsn)) == 0) {
+			MUTEX_LOCK(env, rep->mtx_clientdb);
+			lp->verify_lsn = lsn;
+			__os_gettime(env, &lp->rcvd_ts, 1);
+			lp->wait_ts = rep->request_gap;
+			MUTEX_UNLOCK(env, rep->mtx_clientdb);
+			if (master != DB_EID_INVALID)
+				eid = master;
+			(void)__rep_send_message(env, eid, REP_VERIFY_REQ,
+			    &lsn, NULL, 0, DB_REP_ANYWHERE);
+		} else if (ret == DB_NOTFOUND) {
+			/*
+			 * We've either run out of records because
+			 * logs have been removed or we've rolled back
+			 * all the way to the beginning.
+			 */
+			ret = __rep_internal_init(env, 0);
+		}
+	} else {
+		/*
+		 * We have a match, so we can probably do a simple sync, without
+		 * needing internal init. But first, check for a couple of
+		 * special cases.
+		 */
+
+		if (!lp->db_log_inmemory && !IS_ZERO_LSN(prev_ckp)) {
+			/*
+			 * We previously saw a checkpoint, which means we may
+			 * now be about to roll back over it and lose it. Make
+			 * sure we'll end up still having at least one other
+			 * checkpoint. (Note that if the current record -- the
+			 * one we've just matched -- happens to be a checkpoint,
+			 * then it must be the same as the prev_ckp we're now
+			 * about to try reading. Which means we wouldn't really
+			 * have to read it. But checking for that special case
+			 * doesn't seem worth the trouble.)
+			 */
+			if ((ret = __logc_get(logc,
+			    &prev_ckp, &mylog, DB_SET)) != 0) {
+				if (ret == DB_NOTFOUND)
+					ret = __rep_internal_init(env, 0);
+				goto out;
+			}
+			/*
+			 * We succeeded reading for the prev_ckp, so it's safe
+			 * to fall through to the verify_match.
+			 */
+		}
+		/*
+		 * Mixed version internal init doesn't work with 4.4, so we
+		 * can't load NIMDBs from a very old-version master. So, fib to
+		 * ourselves that they're already loaded, so that we don't try.
+		 */
+		if (rep->version == DB_REPVERSION_44)
+			F_SET(rep, REP_F_NIMDBS_LOADED);
+		if (F_ISSET(rep, REP_F_NIMDBS_LOADED))
+			ret = __rep_verify_match(env, &rp->lsn, savetime);
+		else {
+			/*
+			 * Even though we found a match, we haven't yet loaded
+			 * any NIMDBs, so we have to do an abbreviated internal
+			 * init. We leave lp->verify_lsn set to the matching
+			 * sync point, in case upon eventual examination of the
+			 * UPDATE message it turns out there are no NIMDBs
+			 * (since we can then skip back to a verify_match
+			 * outcome).
+			 */
+			ret = __rep_internal_init(env, REP_F_ABBREVIATED);
+		}
+	}
+
+out:	if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+/*
+ * __rep_internal_init --
+ *	Leave VERIFY state, enter UPDATE state (clearing rep->first_lsn and
+ *	rep->ckp_lsn) and, if a master is known, send it a REP_UPDATE_REQ.
+ *	A non-zero abbrev sets REP_F_ABBREVIATED; zero clears it.
+ *
+ *	Returns DB_REP_JOIN_FAILURE, leaving the recovery flags alone, when
+ *	REP_C_NOAUTOINIT is configured and abbrev is zero; otherwise 0.
+ */
+static int
+__rep_internal_init(env, abbrev)
+	ENV *env;
+	u_int32_t abbrev;
+{
+	REP *rep;
+	int master, ret;
+
+	rep = env->rep_handle->region;
+	REP_SYSTEM_LOCK(env);
+#ifdef HAVE_STATISTICS
+	if (!abbrev)
+		rep->stat.st_outdated++;
+#endif
+
+	/*
+	 * What we call "abbreviated internal init" is really just NIMDB
+	 * materialization, and we always do that even if NOAUTOINIT has been
+	 * configured.
+	 */
+	if (FLD_ISSET(rep->config, REP_C_NOAUTOINIT) && !abbrev)
+		ret = DB_REP_JOIN_FAILURE;
+	else {
+		F_CLR(rep, REP_F_RECOVER_VERIFY);
+		F_SET(rep, REP_F_RECOVER_UPDATE);
+		if (abbrev) {
+			RPRINT(env, DB_VERB_REP_SYNC, (env,
+ "send UPDATE_REQ, merely to check for NIMDB refresh"));
+			F_SET(rep, REP_F_ABBREVIATED);
+		} else
+			F_CLR(rep, REP_F_ABBREVIATED);
+		ZERO_LSN(rep->first_lsn);
+		ZERO_LSN(rep->ckp_lsn);
+		ret = 0;
+	}
+	master = rep->master_id;
+	REP_SYSTEM_UNLOCK(env);
+	if (ret == 0 && master != DB_EID_INVALID)
+		(void)__rep_send_message(env,
+		    master, REP_UPDATE_REQ, NULL, NULL, 0, 0);
+	return (ret);
+}
+
+/*
+ * __rep_verify_fail --
+ *	Handle a REP_VERIFY_FAIL message.
+ *
+ *	Returns 0 when the message is ignored or an internal init has been
+ *	started, DB_REP_JOIN_FAILURE when an internal init is called for but
+ *	REP_C_NOAUTOINIT is configured, or the error from __rep_lockout_msg
+ *	or __rep_init_cleanup.
+ *
+ * PUBLIC: int __rep_verify_fail __P((ENV *, __rep_control_args *));
+ */
+int
+__rep_verify_fail(env, rp)
+	ENV *env;
+	__rep_control_args *rp;
+{
+	DB_LOG *dblp;
+	DB_REP *db_rep;
+	LOG *lp;
+	REP *rep;
+	int clnt_lock_held, lockout, master, ret;
+
+	clnt_lock_held = lockout = 0;
+	ret = 0;
+	db_rep = env->rep_handle;
+	rep = db_rep->region;
+	dblp = env->lg_handle;
+	lp = dblp->reginfo.primary;
+
+	/*
+	 * If any recovery flags are set, but not LOG or VERIFY,
+	 * then we ignore this message. We are already
+	 * in the middle of updating.
+	 */
+	if (F_ISSET(rep, REP_F_RECOVER_MASK) &&
+	    !F_ISSET(rep, REP_F_RECOVER_LOG | REP_F_RECOVER_VERIFY))
+		return (0);
+	REP_SYSTEM_LOCK(env);
+	/*
+	 * We should not ever be in internal init with a lease granted.
+	 */
+	DB_ASSERT(env,
+	    !IS_USING_LEASES(env) || __rep_islease_granted(env) == 0);
+
+	/*
+	 * Clean up old internal init in progress if:
+	 * REP_C_NOAUTOINIT is not configured and
+	 * we are recovering LOG and this LSN is in the range we need.
+	 *
+	 * NOTE(review): the condition below does not test REP_C_NOAUTOINIT;
+	 * that flag is only checked further down, after the cleanup.
+	 */
+	if (F_ISSET(rep, REP_F_RECOVER_LOG) &&
+	    LOG_COMPARE(&rep->first_lsn, &rp->lsn) <= 0 &&
+	    LOG_COMPARE(&rep->last_lsn, &rp->lsn) >= 0) {
+		/*
+		 * Already locking out messages, give up.
+		 */
+		if (F_ISSET(rep, REP_F_READY_MSG))
+			goto unlock;
+
+		/*
+		 * Lock out other messages to prevent race conditions.
+		 */
+		if ((ret = __rep_lockout_msg(env, rep, 1)) != 0)
+			goto unlock;
+		lockout = 1;
+
+		/*
+		 * Clean up internal init if one was in progress.
+		 */
+		if (F_ISSET(rep, REP_F_READY_API | REP_F_READY_OP)) {
+			RPRINT(env, DB_VERB_REP_SYNC, (env,
+ "VERIFY_FAIL is cleaning up old internal init for missing log"));
+			if ((ret =
+			    __rep_init_cleanup(env, rep, DB_FORCE)) != 0) {
+				RPRINT(env, DB_VERB_REP_SYNC, (env,
+ "VERIFY_FAIL error cleaning up internal init for missing log: %d", ret));
+				goto msglck;
+			}
+			F_CLR(rep, REP_F_RECOVER_MASK);
+		}
+		F_CLR(rep, REP_F_READY_MSG);
+		lockout = 0;
+	}
+
+	REP_SYSTEM_UNLOCK(env);
+	MUTEX_LOCK(env, rep->mtx_clientdb);
+	clnt_lock_held = 1;
+	REP_SYSTEM_LOCK(env);
+	/*
+	 * Commence an internal init if:
+	 * We are in VERIFY state and the failing LSN is the one we
+	 * were verifying or
+	 * we're recovering LOG and this LSN is in the range we need or
+	 * we are in normal state (no recovery flags set) and
+	 * the failing LSN is the one we're ready for.
+	 *
+	 * We don't want an old or delayed VERIFY_FAIL message to throw us
+	 * into internal initialization when we shouldn't be.
+	 */
+	if (((F_ISSET(rep, REP_F_RECOVER_VERIFY)) &&
+	    LOG_COMPARE(&rp->lsn, &lp->verify_lsn) == 0) ||
+	    (F_ISSET(rep, REP_F_RECOVER_LOG) &&
+	    LOG_COMPARE(&rep->first_lsn, &rp->lsn) <= 0 &&
+	    LOG_COMPARE(&rep->last_lsn, &rp->lsn) >= 0) ||
+	    (F_ISSET(rep, REP_F_RECOVER_MASK) == 0 &&
+	    LOG_COMPARE(&rp->lsn, &lp->ready_lsn) >= 0)) {
+		/*
+		 * Update stats.
+		 */
+		STAT(rep->stat.st_outdated++);
+
+		/*
+		 * If REP_C_NOAUTOINIT is configured, return
+		 * DB_REP_JOIN_FAILURE instead of doing internal init.
+		 */
+		if (FLD_ISSET(rep->config, REP_C_NOAUTOINIT)) {
+			ret = DB_REP_JOIN_FAILURE;
+			goto unlock;
+		}
+
+		/*
+		 * Do the internal init.
+		 */
+		F_CLR(rep, REP_F_RECOVER_VERIFY);
+		F_SET(rep, REP_F_RECOVER_UPDATE);
+		ZERO_LSN(rep->first_lsn);
+		ZERO_LSN(rep->ckp_lsn);
+		lp->wait_ts = rep->request_gap;
+		master = rep->master_id;
+		REP_SYSTEM_UNLOCK(env);
+		MUTEX_UNLOCK(env, rep->mtx_clientdb);
+		if (master != DB_EID_INVALID)
+			(void)__rep_send_message(env,
+			    master, REP_UPDATE_REQ, NULL, NULL, 0, 0);
+	} else {
+		/*
+		 * Otherwise ignore this message.
+		 *
+		 * The labels below are also the shared exit for the error
+		 * paths above: msglck drops the message lockout if we still
+		 * hold it, unlock releases the region lock and, when held,
+		 * mtx_clientdb.
+		 */
+msglck:		if (lockout)
+			F_CLR(rep, REP_F_READY_MSG);
+unlock:		REP_SYSTEM_UNLOCK(env);
+		if (clnt_lock_held)
+			MUTEX_UNLOCK(env, rep->mtx_clientdb);
+	}
+	return (ret);
+}
+
+/*
+ * __rep_verify_req --
+ *	Handle a REP_VERIFY_REQ message.
+ *
+ *	Looks up rp->lsn in the local log and replies to eid with REP_VERIFY,
+ *	attaching the record only if the read succeeded.  If the read returned
+ *	DB_NOTFOUND and __log_is_outdated reports the file as outdated, the
+ *	reply is REP_VERIFY_FAIL instead.  A client that gets DB_NOTFOUND
+ *	sends nothing and returns DB_NOTFOUND.
+ *
+ * PUBLIC: int __rep_verify_req __P((ENV *, __rep_control_args *, int));
+ */
+int
+__rep_verify_req(env, rp, eid)
+	ENV *env;
+	__rep_control_args *rp;
+	int eid;
+{
+	DBT *d, data_dbt;
+	DB_LOGC *logc;
+	DB_REP *db_rep;
+	REP *rep;
+	u_int32_t type;
+	int old, ret;
+
+	ret = 0;
+	db_rep = env->rep_handle;
+	rep = db_rep->region;
+
+	type = REP_VERIFY;
+	if ((ret = __log_cursor(env, &logc)) != 0)
+		return (ret);
+	d = &data_dbt;
+	memset(d, 0, sizeof(data_dbt));
+	F_SET(logc, DB_LOG_SILENT_ERR);
+	ret = __logc_get(logc, &rp->lsn, d, DB_SET);
+	/*
+	 * If the LSN was invalid, then we might get a DB_NOTFOUND
+	 * we might get an EIO, we could get anything.
+	 * If we get a DB_NOTFOUND, then there is a chance that
+	 * the LSN comes before the first file present in which
+	 * case we need to return a fail so that the client can
+	 * perform an internal init or return a REP_JOIN_FAILURE.
+	 *
+	 * If we're a client servicing this request and we get a
+	 * NOTFOUND, return it so the caller can rerequest from
+	 * a better source.
+	 */
+	if (ret == DB_NOTFOUND) {
+		if (F_ISSET(rep, REP_F_CLIENT)) {
+			(void)__logc_close(logc);
+			return (DB_NOTFOUND);
+		}
+		if (__log_is_outdated(env, rp->lsn.file, &old) == 0 &&
+		    old != 0)
+			type = REP_VERIFY_FAIL;
+	}
+
+	if (ret != 0)
+		d = NULL;
+
+	(void)__rep_send_message(env, eid, type, &rp->lsn, d, 0, 0);
+	return (__logc_close(logc));
+}
+
+/*
+ * __rep_dorecovery --
+ *	Roll the log back to lsnp.  Unless REP_F_RECOVER_LOG is set, first
+ *	scan backward from the end of the log to lsnp: if no commit,
+ *	checkpoint or dbreg_register record is being backed over, the log is
+ *	simply truncated with __log_vtruncate; otherwise __db_apprec is run.
+ *	trunclsnp is handed to whichever of the two is used.
+ *
+ * PUBLIC: int __rep_dorecovery __P((ENV *, DB_LSN *, DB_LSN *));
+ */
+int
+__rep_dorecovery(env, lsnp, trunclsnp)
+	ENV *env;
+	DB_LSN *lsnp, *trunclsnp;
+{
+	DBT mylog;
+	DB_LOGC *logc;
+	DB_LSN last_ckp, lsn;
+	DB_REP *db_rep;
+	DB_THREAD_INFO *ip;
+	REP *rep;
+	int ret, skip_rec, t_ret, update;
+	u_int32_t rectype, opcode;
+	__txn_regop_args *txnrec;
+	__txn_regop_42_args *txn42rec;
+
+	db_rep = env->rep_handle;
+	rep = db_rep->region;
+	ENV_GET_THREAD_INFO(env, ip);
+
+	/* Figure out if we are backing out any committed transactions. */
+	if ((ret = __log_cursor(env, &logc)) != 0)
+		return (ret);
+
+	memset(&mylog, 0, sizeof(mylog));
+	if (F_ISSET(rep, REP_F_RECOVER_LOG)) {
+		/*
+		 * Internal init can never skip recovery.
+		 * Internal init must always update the timestamp and
+		 * force dead handles.
+		 */
+		skip_rec = 0;
+		update = 1;
+	} else {
+		skip_rec = 1;
+		update = 0;
+	}
+	while (update == 0 &&
+	    (ret = __logc_get(logc, &lsn, &mylog, DB_PREV)) == 0 &&
+	    LOG_COMPARE(&lsn, lsnp) > 0) {
+		LOGCOPY_32(env, &rectype, mylog.data);
+		/*
+		 * Find out if we can skip recovery completely. If we
+		 * are backing up over any record a client usually
+		 * cares about, we must run recovery.
+		 *
+		 * Skipping sync-up recovery can be pretty scary!
+		 * Here's why we can do it:
+		 * If a master downgraded to client and is now running
+		 * sync-up to a new master, that old master must have
+		 * waited for any outstanding txns to resolve before
+		 * becoming a client. Also we are in lockout so there
+		 * can be no other operations right now.
+		 *
+		 * If the client wrote a commit record to the log, but
+		 * was descheduled before processing the txn, and then
+		 * a new master was found, we must've let the txn get
+		 * processed because right now we are the only message
+		 * thread allowed to be running.
+		 */
+		DB_ASSERT(env, rep->op_cnt == 0);
+		DB_ASSERT(env, rep->msg_th == 1);
+		if (rectype == DB___txn_regop || rectype == DB___txn_ckp ||
+		    rectype == DB___dbreg_register)
+			skip_rec = 0;
+		if (rectype == DB___txn_regop) {
+			if (rep->version >= DB_REPVERSION_44) {
+				if ((ret = __txn_regop_read(
+				    env, mylog.data, &txnrec)) != 0)
+					goto err;
+				opcode = txnrec->opcode;
+				__os_free(env, txnrec);
+			} else {
+				if ((ret = __txn_regop_42_read(
+				    env, mylog.data, &txn42rec)) != 0)
+					goto err;
+				opcode = txn42rec->opcode;
+				__os_free(env, txn42rec);
+			}
+			if (opcode != TXN_ABORT)
+				update = 1;
+		}
+	}
+	/*
+	 * Handle if the logc_get fails.
+	 */
+	if (ret != 0)
+		goto err;
+
+	/*
+	 * If we successfully run recovery, we've opened all the necessary
+	 * files. We are guaranteed to be single-threaded here, so no mutex
+	 * is necessary.
+	 */
+	if (skip_rec) {
+		if ((ret = __log_get_stable_lsn(env, &last_ckp)) != 0) {
+			if (ret != DB_NOTFOUND)
+				goto err;
+			ZERO_LSN(last_ckp);
+		}
+		RPRINT(env, DB_VERB_REP_SYNC, (env,
+ "Skip sync-up rec. Truncate log to [%lu][%lu], ckp [%lu][%lu]",
+		    (u_long)lsnp->file, (u_long)lsnp->offset,
+		    (u_long)last_ckp.file, (u_long)last_ckp.offset));
+		ret = __log_vtruncate(env, lsnp, &last_ckp, trunclsnp);
+	} else
+		ret = __db_apprec(env, ip, lsnp, trunclsnp, update, 0);
+
+	if (ret != 0)
+		goto err;
+	F_SET(db_rep, DBREP_OPENFILES);
+
+err:	if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __rep_verify_match --
+ *	We have just received a matching log record during verification.
+ *	Figure out if we're going to need to run recovery. If so, wait until
+ *	everything else has exited the library.
If not, set up the world
+ *	correctly and move forward.
+ *
+ *	Returns 0 without acting when savetime no longer equals
+ *	renv->rep_timestamp, or when the REP_F_READY_* flags tested below
+ *	show that a lockout is already in effect.
+ *
+ * PUBLIC: int __rep_verify_match __P((ENV *, DB_LSN *, time_t));
+ */
+int
+__rep_verify_match(env, reclsnp, savetime)
+	ENV *env;
+	DB_LSN *reclsnp;
+	time_t savetime;
+{
+	DB_LOG *dblp;
+	DB_LSN trunclsn;
+	DB_REP *db_rep;
+	DB_THREAD_INFO *ip;
+	LOG *lp;
+	REGENV *renv;
+	REGINFO *infop;
+	REP *rep;
+	int done, master, ret;
+	u_int32_t unused;
+
+	dblp = env->lg_handle;
+	db_rep = env->rep_handle;
+	rep = db_rep->region;
+	lp = dblp->reginfo.primary;
+	ret = 0;
+	infop = env->reginfo;
+	renv = infop->primary;
+	ENV_GET_THREAD_INFO(env, ip);
+
+	/*
+	 * Check if the savetime is different than our current time stamp.
+	 * If it is, then we're racing with another thread trying to recover
+	 * and we lost. We must give up.
+	 */
+	MUTEX_LOCK(env, rep->mtx_clientdb);
+	done = savetime != renv->rep_timestamp;
+	if (done) {
+		MUTEX_UNLOCK(env, rep->mtx_clientdb);
+		return (0);
+	}
+	ZERO_LSN(lp->verify_lsn);
+	MUTEX_UNLOCK(env, rep->mtx_clientdb);
+
+	/*
+	 * Make sure the world hasn't changed while we tried to get
+	 * the lock. If it hasn't then it's time for us to kick all
+	 * operations out of DB and run recovery.
+	 */
+	REP_SYSTEM_LOCK(env);
+	if (F_ISSET(rep, REP_F_READY_MSG) ||
+	    (!F_ISSET(rep, REP_F_RECOVER_LOG) &&
+	    F_ISSET(rep, REP_F_READY_API | REP_F_READY_OP))) {
+		/*
+		 * We lost. The world changed and we should do nothing.
+		 */
+		STAT(rep->stat.st_msgs_recover++);
+		goto errunlock;
+	}
+
+	/*
+	 * Lockout all message threads but ourselves.
+	 */
+	if ((ret = __rep_lockout_msg(env, rep, 1)) != 0)
+		goto errunlock;
+
+	/*
+	 * Lockout the API and wait for operations to complete.
+	 */
+	if ((ret = __rep_lockout_api(env, rep)) != 0)
+		goto errunlock;
+
+	/* OK, everyone is out, we can now run recovery. */
+	REP_SYSTEM_UNLOCK(env);
+
+	if ((ret = __rep_dorecovery(env, reclsnp, &trunclsn)) != 0 ||
+	    (ret = __rep_remove_init_file(env)) != 0) {
+		REP_SYSTEM_LOCK(env);
+		F_CLR(rep, REP_F_READY_API | REP_F_READY_MSG | REP_F_READY_OP);
+		goto errunlock;
+	}
+
+	/*
+	 * The log has been truncated (either directly by us or by __db_apprec)
+	 * We want to make sure we're waiting for the LSN at the new end-of-log,
+	 * not some later point.
+	 */
+	MUTEX_LOCK(env, rep->mtx_clientdb);
+	lp->ready_lsn = trunclsn;
+	ZERO_LSN(lp->waiting_lsn);
+	ZERO_LSN(lp->max_wait_lsn);
+	lp->max_perm_lsn = *reclsnp;
+	lp->wait_ts = rep->request_gap;
+	__os_gettime(env, &lp->rcvd_ts, 1);
+	ZERO_LSN(lp->verify_lsn);
+	ZERO_LSN(lp->prev_ckp);
+
+	/*
+	 * Discard any log records we have queued; we're about to re-request
+	 * them, and can't trust the ones in the queue. We need to set the
+	 * DB_AM_RECOVER bit in this handle, so that the operation doesn't
+	 * deadlock.
+	 *
+	 * NOTE(review): if __rep_client_dbinit fails we leave through out:
+	 * without the F_CLR of REP_F_READY_API | REP_F_READY_MSG |
+	 * REP_F_READY_OP that the __rep_dorecovery failure path above
+	 * performs -- confirm that this is intended.
+	 */
+	if (db_rep->rep_db == NULL &&
+	    (ret = __rep_client_dbinit(env, 0, REP_DB)) != 0) {
+		MUTEX_UNLOCK(env, rep->mtx_clientdb);
+		goto out;
+	}
+
+	F_SET(db_rep->rep_db, DB_AM_RECOVER);
+	MUTEX_UNLOCK(env, rep->mtx_clientdb);
+	ret = __db_truncate(db_rep->rep_db, ip, NULL, &unused);
+	MUTEX_LOCK(env, rep->mtx_clientdb);
+	F_CLR(db_rep->rep_db, DB_AM_RECOVER);
+
+	REP_SYSTEM_LOCK(env);
+	rep->stat.st_log_queued = 0;
+	F_CLR(rep, REP_F_NOARCHIVE | REP_F_RECOVER_MASK | REP_F_READY_MSG);
+	if (ret != 0)
+		goto errunlock2;
+
+	/*
+	 * If the master_id is invalid, this means that since
+	 * the last record was sent, something happened to the
+	 * master and we may not have a master to request
+	 * things of.
+	 *
+	 * This is not an error; when we find a new master,
+	 * we'll re-negotiate where the end of the log is and
+	 * try to bring ourselves up to date again anyway.
+	 */
+	master = rep->master_id;
+	REP_SYSTEM_UNLOCK(env);
+	if (master == DB_EID_INVALID) {
+		MUTEX_UNLOCK(env, rep->mtx_clientdb);
+		ret = 0;
+	} else {
+		/*
+		 * We're making an ALL_REQ. But now that we've
+		 * cleared the flags, we're likely receiving new
+		 * log records from the master, resulting in a gap
+		 * immediately. So to avoid multiple data streams,
+		 * set the wait_ts value high now to give the master
+		 * a chance to start sending us these records before
+		 * the gap code re-requests the same gap. Wait_recs
+		 * will get reset once we start receiving these
+		 * records.
+		 */
+		lp->wait_ts = rep->max_gap;
+		MUTEX_UNLOCK(env, rep->mtx_clientdb);
+		(void)__rep_send_message(env,
+		    master, REP_ALL_REQ, reclsnp, NULL, 0, DB_REP_ANYWHERE);
+	}
+	if (0) {
+errunlock2:	MUTEX_UNLOCK(env, rep->mtx_clientdb);
+errunlock:	REP_SYSTEM_UNLOCK(env);
+	}
+out:	return (ret);
+}
+
+/*
+ * __rep_log_backup --
+ *
+ * In the verify handshake, we walk backward looking for
+ * identification records. Those are the only record types
+ * we verify and match on.
+ *
+ * Steps the caller's cursor backward with DB_PREV.  Returns 0 with *lsn
+ * set by the last __logc_get once a checkpoint or commit record is found
+ * (only when rep->version >= DB_REPVERSION_44); otherwise returns the
+ * non-zero value that ended the walk.
+ *
+ * PUBLIC: int __rep_log_backup __P((ENV *, REP *, DB_LOGC *, DB_LSN *));
+ */
+int
+__rep_log_backup(env, rep, logc, lsn)
+	ENV *env;
+	REP *rep;
+	DB_LOGC *logc;
+	DB_LSN *lsn;
+{
+	DBT mylog;
+	u_int32_t rectype;
+	int ret;
+
+	ret = 0;
+	memset(&mylog, 0, sizeof(mylog));
+	while ((ret = __logc_get(logc, lsn, &mylog, DB_PREV)) == 0) {
+		/*
+		 * Determine what we look for based on version number.
+		 * Due to the contents of records changing between
+		 * versions we have to match based on criteria of that
+		 * particular version.
+		 */
+		LOGCOPY_32(env, &rectype, mylog.data);
+		/*
+		 * In 4.4 and beyond we match checkpoint and commit.
+		 */
+		if (rep->version >= DB_REPVERSION_44 &&
+		    (rectype == DB___txn_ckp || rectype == DB___txn_regop))
+			break;
+	}
+	return (ret);
+} |