summaryrefslogtreecommitdiff
path: root/rep
diff options
context:
space:
mode:
authorKim Kibum <kb0929.kim@samsung.com>2012-05-21 17:40:46 +0900
committerKim Kibum <kb0929.kim@samsung.com>2012-05-21 17:40:46 +0900
commit2e082c838d2ca750f5daac6dcdabecc22dfd4e46 (patch)
tree01c1dd87d4cc0b62a655c0d768ff695d2d244728 /rep
parenta86e3ca152fb414b376e64c449c201d762e414dd (diff)
downloaddb4-2e082c838d2ca750f5daac6dcdabecc22dfd4e46.tar.gz
db4-2e082c838d2ca750f5daac6dcdabecc22dfd4e46.tar.bz2
db4-2e082c838d2ca750f5daac6dcdabecc22dfd4e46.zip
Upload Tizen:Base source
Diffstat (limited to 'rep')
-rw-r--r--rep/mlease.html1197
-rw-r--r--rep/rep.src116
-rw-r--r--rep/rep_auto.c679
-rw-r--r--rep/rep_backup.c3379
-rw-r--r--rep/rep_elect.c1353
-rw-r--r--rep/rep_lease.c524
-rw-r--r--rep/rep_log.c872
-rw-r--r--rep/rep_method.c2142
-rw-r--r--rep/rep_record.c2379
-rw-r--r--rep/rep_region.c488
-rw-r--r--rep/rep_stat.c568
-rw-r--r--rep/rep_stub.c391
-rw-r--r--rep/rep_util.c2007
-rw-r--r--rep/rep_verify.c766
14 files changed, 16861 insertions, 0 deletions
diff --git a/rep/mlease.html b/rep/mlease.html
new file mode 100644
index 0000000..85b0aca
--- /dev/null
+++ b/rep/mlease.html
@@ -0,0 +1,1197 @@
+<!DOCTYPE doctype PUBLIC "-//w3c//dtd html 4.0 transitional//en">
+<html>
+<head>
+ <meta http-equiv="Content-Type"
+ content="text/html; charset=iso-8859-1">
+ <meta name="GENERATOR"
+ content="Mozilla/4.76 [en] (X11; U; FreeBSD 4.3-RELEASE i386) [Netscape]">
+ <title>Master Lease</title>
+</head>
+<body>
+<center>
+<h1>Master Leases for Berkeley DB</h1>
+</center>
+<center><i>Susan LoVerso</i> <br>
+<i>sue@sleepycat.com</i> <br>
+<i>Rev 1.1</i><br>
+<i>2007 Feb 2</i><br>
+</center>
+<p><br>
+</p>
+<h2>What are Master Leases?</h2>
+A master lease is a mechanism whereby clients grant master-ship rights
+to a site and that master, by holding lease rights, can provide a&nbsp;
+guarantee of durability to a replication group for a given period of
+time.&nbsp; By granting a lease to a master,
+a&nbsp; client will not participate in an election to elect a new
+master until that granted master lease has expired.&nbsp; By holding a
+collection of granted leases, a master will be able to supply
+authoritative read requests to applications.&nbsp; By holding leases a
+read operation on a master can guarantee several things to the
+application:<br>
+<ol>
+ <li>Authoritative reads: a guarantee that the data being read by the
+application is durable and can never be rolled back.</li>
+ <li>Freshness: a guarantee that the data being read by the
+application <b>at the master</b> is
+not stale.</li>
+ <li>Master viability: a guarantee that a current master with valid
+leases will not encounter a duplicate master situation.<br>
+ </li>
+</ol>
+<h2>Requirements</h2>
+The requirements of DB to support this include:<br>
+<ul>
+ <li>After turning them on, users can choose to ignore them in reads
+or not.</li>
+ <li>We are providing read authority on the master only.&nbsp; A
+read on a client is equivalent to a read while ignoring leases.</li>
+ <li>We guarantee that data committed on a master <b>that has been
+read by an application on the
+master</b> will not be rolled back.&nbsp; Data read on a client or
+while ignoring leases <i>or data
+successfully updated/committed but not read,</i>
+may be rolled back.<br>
+ </li>
+ <li>A master will not return successfully from a read operation
+unless it holds a
+majority of leases, except when leases are ignored.</li>
+ <li>Master leases will remove the possibility of a current/correct
+master being "shot down" by DUPMASTER.&nbsp; <b>NOTE: Old/Expired
+masters may discover a
+later master and return DUPMASTER to the application however.</b><br>
+ </li>
+ <li>Any send callback failure must result in premature lease
+expiration on the master.<br>
+ </li>
+ <li>Users who change the system clock during master leases void the
+guarantee and may get undefined behavior.&nbsp; We assume time always
+runs forward. <br>
+ </li>
+ <li>Clients are forbidden from participating in elections while they
+have an outstanding lease granted to another site.</li>
+ <li>Clients are forbidden from accepting a new master while they have
+an outstanding lease granted to another site.</li>
+ <li>Clients are forbidden from upgrading themselves to master while
+they have an outstanding lease granted to another site.</li>
+ <li>When asked for a lease grant explicitly by the master, the client
+cannot grant the lease to the master unless the LSN in the master's
+request has been processed by this client.<br>
+ </li>
+</ul>
+The requirements of the
+application using leases include:<br>
+<ul>
+ <li>Users must implement (Base API users on their own, RepMgr users
+via configuration) a majority (or larger) ACK policy. <br>
+ </li>
+ <li>The application must use the election mechanism to decide a master.
+It may not simply declare a site master.</li>
+ <li>The send callback must return an error if the majority ACK policy
+is not met for PERM records.</li>
+ <li>Users must set the number of sites in the group.</li>
+ <li>Using leases in a replication group is all-or-none.&nbsp;
+Therefore, if a site knows it is using leases, it can assume other
+sites are also.<br>
+ </li>
+ <li>All applications that care about read guarantees must forward or
+perform all reads on the master.&nbsp; Reading on the client means a
+read ignoring leases. </li>
+</ul>
+<p>There are some open questions
+remaining.</p>
+<ul>
+ <li>There is one major showstopper issue, see Crashing - Potential
+problem near the end of the document.&nbsp; We need a better solution
+than the one shown there (writing to disk every time a lease is
+granted). Perhaps just documenting that durability means it must be
+flushed to disk before success to avoid that situation?<br>
+ </li>
+ <li>What about db-&gt;join?&nbsp; Users can call join, but the calls
+on the join cursor to get the data would be subject to leases and
+therefore protected.&nbsp; Ok, this is not an open question.</li>
+ <li>What about other read-like operations?&nbsp; Clearly <i>
+DB-&gt;get, DB-&gt;pget, DBC-&gt;get,
+DBC-&gt;pget</i> need lease checks.&nbsp; However, other APIs use
+keys.&nbsp; <i>DB-&gt;key_range</i>
+provides an estimate only so it shouldn't need lease checks. <i>
+DB-&gt;stat</i> provides exact counts
+to <i>bt_nkeys</i> and <i>bt_ndata</i> fields.&nbsp; Are those
+fields considered authoritative, such that providing those values implies a
+durability guarantee and therefore <i>DB-&gt;stat</i>
+should be subject to lease verification?&nbsp; <i>DBC-&gt;count</i>
+provides a count for
+the number of data items associated with a key.&nbsp; Is this
+authoritative information? This is similar to stat - should it be
+subject to lease verification?<br>
+ </li>
+ <li>Do we require master lease checks on write operations?&nbsp; I
+think lease checks are not needed on write operations.&nbsp; It doesn't
+add correctness and adds a lot of complexity (checking leases in put,
+del, and cursors, then what about rename, remove, etc).<br>
+ </li>
+ <li>Do master leases give an iron-clad guarantee of never rolling
+back a transaction? No, but it should mean that a committed transaction
+can never be <b>read</b> on a master
+unless the lease is valid.&nbsp; A committed transaction on a master
+that has never been presented to the application may get rolled back.<br>
+ </li>
+ <li>Do we need to quarantine or prevent reads on an ex-master until
+sync-up is done?&nbsp; No.&nbsp; A master that is simply downgraded to
+client or crashes and reboots is now a client.&nbsp; Reading from that
+client is the same as saying Ignore Leases.</li>
+ <li>What about adding and removing sites while leases are
+active?&nbsp; This is SR 14778.&nbsp; A consistent <i>nsites</i> value
+is required by master
+leases.&nbsp; &nbsp; It isn't
+clear to me what a master is
+supposed to do if the value of nsites gets smaller while leases are
+active.&nbsp; Perhaps it leaves its larger table intact and simply
+checks for a smaller number of granted leases?<br>
+ </li>
+ <li>Can users turn leases off?&nbsp; No.&nbsp; There is no planned <i>turn
+leases off</i> API.</li>
+ <li>Clock skew will be a percentage.&nbsp; However, the smallest, 1%,
+is probably rather large for clock skew.&nbsp; Percentage was chosen
+for simplicity and similarity to other APIs.&nbsp; What granularity is
+appropriate here?</li>
+</ul>
+<h2>API Changes</h2>
+The API changes that are visible
+to the user are fairly minimal.&nbsp;
+There are a few API calls they need to make to configure master leases
+and then there is the API call to turn them on.&nbsp; There is also a
+new flag to existing APIs to allow read operations to ignore leases and
+return data that
+may be non-durable potentially.<br>
+<h3>Lease Timeout<br>
+</h3>
+There is a new timeout the user
+must configure for leases called <b>DB_REP_LEASE_TIMEOUT</b>.&nbsp;
+This timeout will be new to
+the <i>dbenv-&gt;rep_set_timeout</i> method. The <b>DB_REP_LEASE_TIMEOUT</b>
+has no default and it is required that the user configure a timeout
+before they turn on leases (obviously, this timeout need not be set if
+leases will not be used).&nbsp; That timeout is the amount of time
+the lease is valid on the master and how long it is granted
+on the client.&nbsp; This timeout must be the same
+value on all sites (like log file size).&nbsp; The timeout used when
+refreshing leases is the <b>DB_REP_ACK_TIMEOUT</b>
+for RepMgr applications.&nbsp; For Base API applications, lease
+refreshes will use the same mechanism as <b>PERM</b> messages and they
+should
+have no additional burden.&nbsp; This timeout is used for lease
+refreshment and is the amount of time a reader will wait to refresh
+leases before returning failure to the application from a read
+operation.<br>
+<br>
+This timeout will be both stored
+with its original value, and also
+converted to a <i>db_timespec</i>
+using the <b>DB_TIMEOUT_TO_TIMESPEC</b>
+macro and have the clock skew accounted for and stored in the shared
+rep structure:<br>
+<pre>db_timeout_t lease_timeout;<br>db_timespec lease_duration;<br></pre>
+NOTE:&nbsp; By sending the lease refresh during DB operations, we are
+forcing/assuming that the operation's process has a replication
+transport function set.&nbsp; That is obviously the case for write
+operations, but would it be a burden for read processes (on a
+master)?&nbsp; I think mostly not, but if we need leases for <i>
+DB-&gt;stat</i> then we need to
+document it as it is certainly possible for an application to have a
+separate or dedicated <i>stat</i>
+application or attempt to use <i>db_stat</i>
+(which will not work if leases must be checked).<br>
+<br>
+Leases should be checked after the local operation so that we don't
+have a window/boundary if we were to check leases first, get
+descheduled, then lose our lease and then perform the operation.&nbsp;
+Do the operation, then check leases before returning to the user.<br>
+<h3>Using Leases</h3>
+There is a new API that the user must call to tell the system to use
+the lease mechanism.&nbsp; The method must be called before the
+application calls <i>dbenv-&gt;rep_start</i>
+or <i>dbenv-&gt;repmgr_start</i>.
+This new
+method is:<br>
+<br>
+<pre>&nbsp;&nbsp;&nbsp; dbenv-&gt;rep_set_lease(DB_ENV *dbenv, u_int32_t clock_scale_factor, u_int32_t flags)<br>
+</pre>
+The <i>clock_scale_factor</i>
+parameter is interpreted as a percentage, greater than 100 (to transmit
+a floating point number as an integer to the API) that represents the
+maximum skew between any two sites' clocks.&nbsp; That is, a <span
+ style="font-style: italic;">clock_scale_factor</span> of 150 suggests
+that the greatest discrepancy between clocks is that one runs 50%
+faster than the others.&nbsp; Both the
+master and client sides
+compensate for possible clock skew.&nbsp; The master uses the value to
+compensate in case the replica has a slow clock and replicas compensate
+in case they have a fast clock.&nbsp; This scaling factor will need to
+be divided by 100 on all sites to truly represent the percentage for
+adjustments made to time values.<br>
+<br>
+Assume the slowest replica's clock is a factor of <i>clock_scale_factor</i>
+slower than the
+fastest clock.&nbsp; Using that assumption, if the fastest clock goes
+from time t1 to t2 in X
+seconds, the slowest clock does it in (<i>clock_scale_factor</i> / 100)
+* X seconds.<br>
+<br>
+The <i>flags</i> parameter is not
+currently used.<br>
+<br>
+When the <i>dbenv-&gt;rep_set_lease</i>
+method is called, we will set a configuration flag indicating that
+leases are turned on:<br>
+<b>#define REP_C_LEASE &lt;value&gt;</b>.&nbsp;
+We will also record the <b>u_int32_t
+clock_skew</b> value passed in.&nbsp; The <i>rep_set_lease</i> method
+will not allow
+calls after <i>rep_start.&nbsp; </i>If
+multiple calls are made prior to calling <i>rep_start</i> then later
+calls will
+overwrite the earlier clock skew value.&nbsp; <br>
+<br>
+We need a new flag to prevent calling <i>rep_set_lease</i>
+after <i>rep_start</i>.&nbsp; The
+simplest solution would be to reject the call to
+<i>rep_set_lease&nbsp;
+</i>if<b>
+REP_F_CLIENT</b>
+or <b>REP_F_MASTER</b> is set.&nbsp;
+However that does not work in the cases where a site cleanly closes its
+environment and then opens without running recovery.&nbsp; The
+replication state will still be set.&nbsp; The prevention will be
+implemented as:<br>
+<pre>#define REP_F_START_CALLED &lt;some bit value&gt;<br></pre>
+In __rep_start, at the end:<br>
+<pre>if (ret == 0 ) {<br> REP_SYSTEM_LOCK<br> F_SET(rep, REP_F_START_CALLED)<br> REP_SYSTEM_UNLOCK<br>}</pre>
+In <i>__rep_env_refresh</i>, if we
+are the last reference closing the env (we already check for that):<br>
+<pre>F_CLR(rep, REP_F_START_CALLED);</pre>
+In order to avoid run-time floating point operations
+on <i>db_timespec</i> structures,
+when a site is declared as a client or master in <i>rep_start</i> we
+will pre-compute the
+lease duration based on the integer-based clock skew and the
+integer-based lease timeout.&nbsp; A master should set a replica's
+lease expiration to the <b>start time of
+the sent message +
+(lease_timeout / clock_scale_factor)</b> in case the replica has a
+slow clock.&nbsp; Replicas extend their leases to <b>received message
+time + (lease_timeout *
+clock_scale_factor)</b> in case this replica has a fast clock.&nbsp;
+Therefore, the computation will be as follows if the site is becoming a
+master:<br>
+<pre>db_timeout_t tmp;<br>tmp = (db_timeout_t)((double)rep-&gt;lease_timeout / ((double)rep-&gt;clock_skew / (double)100));<br>rep-&gt;lease_duration = DB_TIMEOUT_TO_TIMESPEC(&amp;tmp);<br></pre>
+Similarly, on a client the computation is:<br>
+<pre>tmp = (db_timeout_t)((double)rep-&gt;lease_timeout * ((double)rep-&gt;clock_skew / (double)100));<br></pre>
+When a site changes state, its lease duration will change based on
+whether it is becoming a master or client and it will be recomputed
+from the original values.&nbsp; Note that these computations, coupled
+with the fact that the lease on the master is computed based on the
+master's time that it sent the message means that leases on the master
+are more conservatively computed than on the clients.<br>
+<br>
+The <i>dbenv-&gt;rep_set_lease</i>
+method must be called after <i>dbenv-&gt;open</i>,
+similar to <i>dbenv-&gt;rep_set_config</i>.&nbsp;
+The reason is so that we can check that this is a replication
+environment and we have access to the replication shared memory region.<br>
+<h3>Read Operations<br>
+</h3>
+Authoritative read operations on the master with leases enabled will
+abide by leases by default.&nbsp; We will provide a flag that allows an
+operation on a master to ignore leases.&nbsp; <b>All read operations
+on a client imply
+ignoring leases.</b> If an application wants authoritative reads
+they must forward the read requests to the master and it is the
+application's responsibility to provide the forwarding.
+The consensus was that forcing <span style="font-weight: bold;">DB_IGNORE_LEASE</span>
+on client read operations (with leases enabled, obviously) was too
+heavy handed.&nbsp; Read operations on the client will ignore leases,
+but do no special flag checking.<br>
+<br>
+The flag will be called <b>DB_IGNORE_LEASE</b>
+and it will be a flag that can be OR'd into the DB access method and
+cursor operation values.&nbsp; It will be similar to the <b>DB_READ_UNCOMMITTED</b>
+flag.
+<br>
+The methods that will
+adhere to leases are:<br>
+<ul>
+ <li><i>Db-&gt;get</i></li>
+ <li><i>Db-&gt;pget</i></li>
+ <li><i>Dbc-&gt;get</i></li>
+ <li><i>Dbc-&gt;pget</i></li>
+</ul>
+The code that will check leases for a client reading would look
+something
+like this, if we decide to become heavy-handed:<br>
+<pre>if (IS_REP_CLIENT(dbenv)) {<br> [get to rep structure]<br> if (FLD_ISSET(rep-&gt;config, REP_C_LEASE) &amp;&amp; !LF_ISSET(DB_IGNORE_LEASE)) {<br> db_err("Read operations must ignore leases or go to master");<br> ret = EINVAL;<br> goto err;<br> }<br>}<br></pre>
+On the master, the new code to abide by leases is more complex.&nbsp;
+After the call to perform the operation we will check the lease.&nbsp;
+In that checking code, the master will see if it has a valid
+lease.&nbsp; If so, then all is well.&nbsp; If not, it will try to
+refresh the leases.&nbsp; If that refresh attempt results in leases,
+all is well.&nbsp; If the refresh attempt does not get leases, then the
+master cannot respond to the read as an authority and we return an
+error.&nbsp; The new error is called <b>DB_REP_LEASE_EXPIRED</b>.&nbsp;
+The location of the master lease check is down after the internal call
+to read the data is successful:<br>
+<pre>if (IS_REP_MASTER(dbenv) &amp;&amp; !LF_ISSET(DB_IGNORE_LEASE)) {<br> [get to rep structure]<br> if (FLD_ISSET(rep-&gt;config, REP_C_LEASE) &amp;&amp;<br> (ret = __rep_lease_check(dbenv)) != 0) {<br> /*<br> * We don't hold the lease.<br> */<br> goto err;<br> }<br>}<br></pre>
+See below for the details of <i>__rep_lease_check</i>.<br>
+<br>
+Also note that if leases (or replication) are not configured, then <span
+ style="font-weight: bold;">DB_IGNORE_LEASE</span> is a no-op.&nbsp; It
+is ignored (and won't error) if used when leases are not in
+effect.&nbsp; The reason is so that we can generically set that flag in
+utility programs like <span style="font-style: italic;">db_dump</span>
+that walk the database with a cursor.&nbsp; Note that <span
+ style="font-style: italic;">db_dump</span> is the only utility that
+reads with a cursor.<span style="font-style: italic;"><span
+ style="font-style: italic;"></span></span><br>
+<h3><b>Nsites
+and Elections</b></h3>
+The call to <i>dbenv-&gt;rep_set_nsites</i>
+must be performed before the call to <i>dbenv-&gt;rep_start</i>
+or <i>dbenv-&gt;repmgr_start</i>.&nbsp;
+This document assumes either that <b>SR
+14778</b> gets resolved, or assumes that the value of <i>nsites</i> is
+immutable.&nbsp; The
+master and all clients need to know how many sites and leases are in
+the group.&nbsp; Clients need to know for elections.&nbsp; The master
+needs to know for the size of the lease table and to know what value a
+majority of the group is. <b>[Until
+14778 is resolved, the master lease work must assume <i>nsites</i> is
+immutable and will
+therefore enforce that this is called before <i>rep_start</i> using
+the same mechanism
+as <i>rep_set_lease</i>.]</b><br>
+<br>
+Elections and leases need to agree on the number of sites in the
+group.&nbsp; Therefore, when leases are in effect on clients, all calls
+to <i>dbenv-&gt;rep_elect</i> must
+set the <i>nsites</i> parameter to
+0.&nbsp; The <i>rep_elect</i> code
+path will return <b>EINVAL</b> if <b>REP_C_LEASE</b> is set and <i>nsites</i>
+is non-0.
+<h2>Lease Management</h2>
+<h3>Message Changes</h3>
+In order for clients to grant leases to the master a new message type
+must be added for that purpose.&nbsp; This will be the <b>REP_LEASE_GRANT</b>
+message.&nbsp;
+Granting leases will be a result of applying a <b>DB_REP_PERMANENT</b>
+record and therefore we
+do not need any additional message in order for a master to request a
+lease grant.&nbsp; The <b>REP_LEASE_GRANT</b>
+message will pass a structure as its message DBT:<br>
+<pre>struct __rep_lease_grant {<br> db_timespec msg_time;<br>#ifdef DIAGNOSTIC<br> db_timespec expire_time;<br>#endif<br>} REP_GRANT_INFO;<br></pre>
+In the <b>REP_LEASE_GRANT</b>
+message, the client is actually giving the master several pieces of
+information.&nbsp; We only need the echoed <i>msg_time</i> in this
+structure because
+everything else is already sent.&nbsp; The client is really sending the
+master:<br>
+<ul>
+ <li>Its EID (parameter to <span style="font-style: italic;">rep_send_message</span>
+and <span style="font-style: italic;">rep_process_message</span>)<br>
+ </li>
+ <li>The PERM LSN this message acknowledged (sent in the control
+message)</li>
+ <li>Unique identifier echoed back to master (<i>msg_time</i> sent in
+message as above)</li>
+</ul>
+On the client, we always maintain the maximum PERM LSN already in <i>lp-&gt;max_perm_lsn</i>.&nbsp;
+<h3>Local State Management</h3>
+Each client must maintain a <i>db_timespec</i>
+timestamp containing the expiration of its granted lease.&nbsp; This
+field will be in the replication shared memory structure:<br>
+<pre>db_timespec grant_expire;<br></pre>
+This timestamp already takes into account the clock skew.&nbsp; All
+new fields must be initialized when the region is created. Whenever we
+grant our master lease and want to send the <b>REP_LEASE_GRANT</b>
+message, this value
+will be updated.&nbsp; It will be used in the following way:
+<pre>db_timespec mytime;<br>DB_LSN perm_lsn;<br>DBT lease_dbt;<br>REP_GRANT_INFO gi;<br><br><br>timespecclear(&amp;mytime);<br>timespecclear(&amp;newgrant);<br>memset(&amp;lease_dbt, 0, sizeof(lease_dbt));<br>memset(&amp;gi, 0, sizeof(gi));<br>__os_gettime(dbenv, &amp;mytime);<br>timespecadd(&amp;mytime, &amp;rep-&gt;lease_duration);<br>MUTEX_LOCK(rep-&gt;clientdb_mutex);<br>perm_lsn = lp-&gt;max_perm_lsn;<br>MUTEX_UNLOCK(rep-&gt;clientdb_mutex);<br>REP_SYSTEM_LOCK(dbenv);<br>if (timespeccmp(mytime, rep-&gt;grant_expire, &gt;))<br> rep-&gt;grant_expire = mytime;<br>gi.msg_time = msg-&gt;msg_time;<br>#ifdef DIAGNOSTIC<br>gi.expire_time = rep-&gt;grant_expire;<br>#endif<br>lease_dbt.data = &amp;gi;<br>lease_dbt.size = sizeof(gi);<br>REP_SYSTEM_UNLOCK(dbenv);<br>__rep_send_message(dbenv, eid, REP_LEASE_GRANT, &amp;perm_lsn, &amp;lease_dbt, 0, 0);<br></pre>
+This updating of the lease grant will occur in the <b>PERM</b> code
+path when we have
+successfully applied the permanent record.<br>
+<h3>Maintaining Leases on the
+Master/Rep_start</h3>
+The master maintains a lease table that it checks when fulfilling a
+read request that is subject to leases.&nbsp; This table is initialized
+when a site calls<i>
+dbenv-&gt;rep_start(DB_MASTER)</i> and the site is undergoing a role
+change (i.e. a master making additional calls to <i>dbenv-&gt;rep_start(DB_MASTER)</i>
+does
+not affect an already existing table).<br>
+<br>
+When a non-master site becomes master, it must do two things related to
+leases on a role change.&nbsp; First, a client cannot upgrade to master
+while it has an outstanding lease granted to another site.&nbsp; If a
+client attempts to do so, an error, <b>EINVAL</b>,
+will be returned.&nbsp; The only way this should happen is if the
+application simply declares a site master, instead of using
+elections.&nbsp; Elections will already wait for leases to expire
+before proceeding. (See below.)
+<br>
+<br>
+Second, once we are proceeding with becoming a master, the site must
+allocate the table it will use to maintain lease information.&nbsp;
+This table will be sized based on <i>nsites</i>
+and it will be an array of the following structure:<br>
+<pre>struct {<br> int eid; /* EID of client site. */<br> db_timespec start_time; /* Unique time ID client echoes back on grants. */<br> db_timespec end_time; /* Master's lease expiration time. */<br> DB_LSN lease_lsn; /* Durable LSN this lease applies to. */<br> u_int32_t flags; /* Unused for now?? */<br>} REP_LEASE_ENTRY;<br></pre>
+<h3>Granting Leases</h3>
+It is the burden of the application to make sure that all sites in the
+group
+are using leases, or none are.&nbsp; Therefore, when a client processes
+a <b>PERM</b>
+log record that arrived from the master, it will grant its lease
+automatically if that record is permanent (i.e. <b>DB_REP_ISPERM</b>
+is being returned),
+and leases are configured.&nbsp; A client will not send a
+lease grant when it is processing log records (even <b>PERM</b>
+ones) it receives from other clients that use client-to-client
+synchronization.&nbsp; The reason is that the master requires a unique
+time-of-msg ID (see below) that the client echoes back in its lease
+grant and it will not have such an ID from another client.<br>
+<br>
+The master stores a time-of-msg ID in each message and the client
+simply echoes it back to the master.&nbsp; In its lease table, it does
+keep the base
+time-of-msg for a valid lease.&nbsp; When <b>REP_LEASE_GRANT</b>
+message comes in,
+the master does a number of things:<br>
+<ol>
+ <li>Pulls the echoed timespec from the client message, into <i>msg_time</i>.<br>
+ </li>
+ <li>Finds the entry in its lease table for the client's EID.&nbsp; It
+walks the table searching for the ID.&nbsp; EIDs of <span
+ style="font-weight: bold;">DB_EID_INVALID</span> are
+illegal.&nbsp; Either the master will find the entry, or it will find
+an empty slot in the table (i.e. it is still populating the table with
+leases).</li>
+ <li>If this is a previously unknown site lease, the master
+initializes the entry by copying to the <i>eid</i>, <i>start_time, </i>and
+ <i>lease_lsn</i> fields.&nbsp; The master
+also computes the <i>end_time</i>
+based on the adjusted <i>rep-&gt;lease_duration</i>.</li>
+ <li>If this is a lease from a previously known site, the master must
+perform <i>timespeccmp(&amp;msg_time,
+&amp;table[i].start_time, &gt;)</i> and only update the <i>end_time</i>
+of the lease when this is
+a more recent message.&nbsp; If it is a more recent message, then we
+should update
+the <i>lease_lsn</i> to the LSN in
+the message.</li>
+ <li>Since lease durations are computed taking the clock skew into
+account, clients compute them based on the current time and the master
+computes it based on original sending time, for diagnostic purposes
+only, I also plan to send the client's expiration time.&nbsp; The
+client errs on the side of computing a larger lease expiration time and
+the master errs on the side of computing a smaller duration.&nbsp;
+Since both are taking the clock skew
+into account, the client's ending expiration time should never be
+smaller than
+the master's computed expiration time or their value for clock skew may
+not be correct.<br>
+ </li>
+</ol>
+Any log records (new or resent) that originate from the master and
+result in <b>DB_REP_ISPERM</b> get an
+ack.<br>
+<br>
+<h3>Refreshing Leases</h3>
+Leases get refreshed when a master receives a <b>REP_LEASE_GRANT</b>
+message from a client. There are three pieces to lease
+refreshment.&nbsp; <br>
+<h4>Lazy Lease Refreshing on Read<br>
+</h4>
+If the master discovers that leases are
+expired during the read operation, it attempts to refresh its
+collection of lease grants.&nbsp; It does this by calling a new
+function <i>__rep_lease_refresh</i>.&nbsp;
+This function is very similar to the already-existing function <i>__rep_flush</i>.&nbsp;
+Basically, to
+refresh the lease, the master simply needs to resend the last PERM
+record to the clients.&nbsp; The requirements state that when the
+application send function returns successfully from sending a PERM
+record, the majority of clients have that PERM LSN durable.&nbsp; We
+will have a new public DB error return called <b>DB_REP_LEASE_EXPIRED</b>
+that will be
+returned back to the caller if the master cannot assert its
+authority.&nbsp; The code will look something like this:<br>
+<pre>/*<br> * Use lp-&gt;max_perm_lsn on the master (currently not used on the master)<br> * to keep track of the last PERM record written through the logging system.<br> * need to initialize lp-&gt;max_perm_lsn in rep_start on role_chg.<br> */<br>call __rep_send_message on the last PERM record the master wrote, with DB_REP_PERMANENT<br>if failure<br> expire leases<br> return lease expired error to caller<br>else /* success */<br> recheck lease table<br> /*<br> * We need to recheck the lease table because the client<br> * lease grant messages may not be processed yet, or got<br> * lost, or racing with the application's ACK messages or<br> * whatever. <br> */<br> if we have a majority of valid leases<br> return success<br> else<br> return lease expired error to caller <br></pre>
+<h4>Ongoing Update Refreshment<br>
+</h4>
+Second is having the master indicate to
+the client it needs to send a lease grant in response to the current
+PERM log message.&nbsp; The problem is
+that acknowledgements must contain a master-supplied message timestamp
+that the client sends back to the master.&nbsp; We need to modify the
+structure of the&nbsp; log record messages when leases are configured
+so
+that when a PERM message is sent, the master sends, and the client
+expects, the message timestamp.&nbsp; There are three fairly
+straightforward and different implementations to consider.<br>
+<ol>
+ <li>Adding the timestamp to the <b>REP_CONTROL</b>
+structure.&nbsp; If this option is chosen, then the code trivially
+sends back the timestamp in the client's reply.&nbsp; There is no
+special processing done by either side with the message contents.&nbsp;
+So, on a PERM log record, the master will send a non-zero
+timestamp.&nbsp; On a normal log record the timestamp will be zero or
+some known invalid value.&nbsp; If the client sees a non-zero
+timestamp, it sends a <b>REP_LEASE_GRANT</b>
+with the <i>lp-&gt;max_perm_lsn</i>
+after applying that log record.&nbsp; If it is zero, then the client
+does nothing different.&nbsp; The advantage is ease of code.&nbsp; The
+disadvantage is that for mixed version systems, the client is now
+dealing with different sized control structures.&nbsp; We would have to
+retain the old control structure so that during a mixed version group
+the (upgraded) clients can use, expect and send old control structures
+to the master.&nbsp; This is unfortunate, so let's consider additional
+implementations that don't require modifying the control structure.<br>
+ </li>
+ <li>Adding a new <b>REPCTL_LEASE</b>
+flag to the list of flags for the control structure, but do not change
+the control structure fields.&nbsp; When a master wants to send a
+message that needs a lease ack, it sets the flag.&nbsp; Additionally,
+instead of simply sending a log record DBT as the <i>rec</i> parameter
+for replication, we
+would send a new structure that had the timestamp first and then the
+record (similar to the bulk transfer buffer).&nbsp; The advantage of
+this is that the control structure does not change.&nbsp; Disadvantages
+include more special-cased code in the normal code path where we have
+to check the flag.&nbsp; If the flag is set we have to extract the
+timestamp value and massage the incoming data to pass on the real log
+record to <i>rep_apply</i>.&nbsp; On
+bulk transfer, we would just add the timestamp into the buffer.&nbsp;
+On normal transfers, it would incur an additional data copy on the
+master side.&nbsp; That is unfortunate.&nbsp; Additionally, if this
+record needs to be stored in the temp db, we need some way to get it
+back again later or <span style="font-style: italic;">rep_apply</span>
+would have to extract the timestamp out when it processed the record
+(either live or from the temp db).<br>
+ </li>
+ <li>Adding a different message type, such as <b>REP_LOG_ACK</b>.&nbsp;
+Similarly to <b>REP_LOG_MORE</b> this message would be a
+special-case version of a log record.&nbsp; We would extract out the
+timestamp and then handle as a normal log record.&nbsp; This
+implementation is rejected because it actually would require three new
+message types: <b>REP_LOG_ACK,
+REP_LOG_ACK_MORE, REP_BULK_LOG_ACK</b>.&nbsp; That is just too ugly
+to contemplate.</li>
+</ol>
+<b>[Slight digression:</b> it occurs
+to me while writing about #2 and #3 above, that our implementation of
+all of the *_MORE messages could really be implemented with a <b>REPCTL_MORE</b>
+flag instead of a
+separate message type.&nbsp; We should clean that up and simplify the
+messages, but not as part of master leases. Hmm, taking that thought
+process further, we really could get rid of the <b>REP_BULK_*</b>
+messages as well if we
+added a <b>REPCTL_BULK</b>
+flag.&nbsp; I think we should definitely do it for the *_MORE
+messages.&nbsp; I am not sure we should do it for bulk because the
+structure of the incoming data record is vastly different.]<br>
+<br>
+Of these options, I believe that modifying the control structure is the
+best alternative.&nbsp; The handling of the old structure will be very
+isolated to code dealing with old versions and is far less complicated
+than injecting the timestamp into the log record DBT and doing a data
+copy.&nbsp; Actually, I will likely combine #1 and the flag from #2
+above.&nbsp; I will have the <b>REPCTL_LEASE</b>
+flag that indicates a lease grant reply is expected and have the
+timestamp in the control structure.&nbsp;
+Also I will probably add in a spare field or two for future use in the <b>REP_CONTROL</b>
+structure.<br>
+<h4>Gap processing</h4>
+No matter which implementation we choose for ongoing lease refreshment,
+gap processing must be considered.&nbsp; The code above assumes the
+timestamps will be placed on PERM records only.&nbsp; Normal log
+records will not have a timestamp, nor a flag or anything else like
+that.&nbsp; However, any log message can fill a gap on a client and
+result in the processing of that normal log record to return <b>DB_REP_ISPERM</b>
+because later records
+were also processed.<br>
+<br>
+The current implementation should work fine in that case because when
+we store the message in the client temp db we store both the control
+DBT and the record DBT.&nbsp; Therefore, when a normal record fills a
+gap, the later PERM record, when retrieved will look just like it did
+when it arrived.&nbsp; The client will have access to the LSN, and the
+timestamp, etc.&nbsp; However, it does mean that sending the <b>REP_LEASE_GRANT</b>
+message must take
+place down in <i>__rep_apply</i>
+because that is the only place we have access to the contents of those
+stored records with the timestamps.<br>
+<br>
+There are two logical choices to consider for granting the lease when
+processing an update.&nbsp; As we process (either a live record or one
+read from the temp db after filling a gap) a PERM message, we send the <b>REP_LEASE_GRANT</b>
+message for each
+PERM record we successfully apply.&nbsp; Or, second, we keep track of
+the largest timestamp of all PERM records we've processed and at the
+end of the function after we've applied all records, we send back a
+single lease grant with the <i>max_perm_lsn</i>
+and a new <i>max_lease_timestamp</i>
+value to the master.&nbsp; The first is easier to implement, the second
+results in possibly slightly fewer messages at the expense of more
+bookkeeping on the client.<br>
+<br>
+A third, more complicated option would be to have the message timestamp
+on all records, but grants are only sent on the PERM messages.&nbsp; A
+reason to do this is that the later timestamp of a normal log record
+would be used as the timestamp sent in the reply and the master would
+get a more up to date timestamp value and a longer lease.&nbsp; <br>
+<br>
+If we change the <span style="font-weight: bold;">REP_CONTROL</span>
+structure to include the timestamp, we potentially break or at least
+need to revisit the gap processing algorithm.&nbsp; That code assumes
+that the control and record elements for the same LSN look the same
+each and every time.&nbsp; The code stores the <span
+ style="font-style: italic;">control</span> DBT as the key and the <span
+ style="font-style: italic;">rec</span> DBT as the data.&nbsp; We use a
+specialized compare function to sort based on the LSN in the control
+DBT.&nbsp; With master leases, the same record transmitted multiple
+times by a master or client for the same LSN will be different because the
+timestamp field will not be the same.&nbsp; Therefore, the client will
+end up with duplicate entries in the temp database for the same
+LSN.&nbsp; Both solutions (adding the timestamp to <span
+ style="font-weight: bold;">REP_CONTROL</span> and adding a <span
+ style="font-weight: bold;">REPCTL_LEASE</span> flag) can yield
+duplicate entries.&nbsp; The flag would cause the same record from the
+master and client to be different as well.<br>
+<h4>Handling Incoming Lease Grants<br>
+</h4>
+The third piece of lease management is handling the incoming <b>REP_LEASE_GRANT</b>
+message on the
+master.&nbsp; When this message is received, the master must do the
+following:<br>
+<pre>REP_SYSTEM_LOCK<br>msg_timestamp = cntrl-&gt;timestamp;<br>client_lease = __rep_lease_entry(dbenv, client eid)<br>if (client_lease == NULL) {<br> initial lease for this site, DB_ASSERT there is space in the table<br> add this to the table if there is space<br>} else <br> compare msg_timestamp with client_lease-&gt;start_time<br> if (msg_timestamp is more recent &amp;&amp; msg_lsn &gt;= lease LSN)<br> update entry in table<br>REP_SYSTEM_UNLOCK<br></pre>
+<h3>Expiring Leases</h3>
+Leases can expire in two ways.&nbsp; First they can expire naturally
+due to the passage of time.&nbsp; When checking leases, if the current
+time is later than the lease entry's <i>end_time</i>
+then the lease is expired.&nbsp; Second, they can be forced with a
+premature expiration when the application's transport function returns
+an error.&nbsp; In the first case, there is nothing to do, in the
+second case we need to manipulate the <i>end_time</i>
+so that all future lease checks fail.&nbsp; Since the lease <i>start_time</i>
+is guaranteed to not be in the future we will have a function <i>__rep_lease_expire</i>
+that will:<br>
+<pre>REP_SYSTEM_LOCK<br>for each entry in the lease table<br> entry-&gt;end_time = entry-&gt;start_time;<br>REP_SYSTEM_UNLOCK<br></pre>
+Is there a potential race or problem with prematurely expiring
+leases?&nbsp; Consider an application that enforces an ALL
+acknowledgement policy for PERM records in its transport
+callback.&nbsp; There are four clients and three send the PERM ack to
+the application.&nbsp; The callback returns an error to the master DB
+code.&nbsp; The DB code will now prematurely expire its leases.&nbsp;
+However, at approximately the same time the three clients are also
+sending their <span style="font-weight: bold;">REP_LEASE_GRANT</span>
+messages to the master.&nbsp; There is a race between the master
+processing those messages and the thread handling the callback failure
+expiring the table.&nbsp; This is only an issue if the messages arrive
+after the table has been expired.<br>
+<br>
+Let's assume all three clients send their grants after the master
+expires the table.&nbsp; If we accept those grants and then a read
+occurs the read will succeed since the master has a majority of leases
+even though the callback failed earlier.&nbsp; Is that a problem?&nbsp;
+The lease code is using a majority and the application policy is using
+some other value.&nbsp; It feels like this should be okay since
+the data is held by leases on a majority.&nbsp; Should we consider
+having the lease checking threshold be the same as the permanent ack
+policy?&nbsp; That is difficult because Base API users implement
+whatever they want and DB does not know what it is.<br>
+<h3>Checking Leases</h3>
+When a read operation on the master completes, the last thing we need
+to do is verify the master leases.&nbsp; We've already discussed
+refreshing them when they are expired above.&nbsp; We need two things
+for a lease to be valid.&nbsp; It must be within the timeframe of the
+lease grant and the lease must be valid for the last PERM record
+LSN.&nbsp; Here is the logic
+for checking the validity of leases in <i>__rep_lease_check</i>:<br>
+<pre>#define MAX_REFRESH_TRIES 3<br>DB_LSN lease_lsn;<br>REP_LEASE_ENTRY *entry;<br>u_int32_t min_leases, valid_leases;<br>db_timespec cur_time;<br>int ret, tries;<br><br> tries = 0;<br>retry:<br> ret = 0;<br> LOG_SYSTEM_LOCK<br> lease_lsn = lp-&gt;lsn<br> LOG_SYSTEM_UNLOCK<br> REP_SYSTEM_LOCK<br> min_leases = rep-&gt;nsites / 2;<br> __os_gettime(dbenv, &amp;cur_time);<br> for (entry = head of table, valid_leases = 0; entry != NULL &amp;&amp; valid_leases &lt; min_leases; entry++)<br> if (timespec_cmp(&amp;entry-&gt;end_time, &amp;cur_time) &gt;= 0 &amp;&amp; log_compare(&amp;entry-&gt;lsn, lease_lsn) == 0)<br> valid_leases++;<br> REP_SYSTEM_UNLOCK<br> if (valid_leases &lt; min_leases) {<br> ret =__rep_lease_refresh(dbenv, ...);<br> /*<br> * If we are successful, we need to recheck the leases because <br> * the lease grant messages may have raced with the PERM<br> * acknowledgement. Give those messages a chance to arrive.<br> */<br> if (ret == 0) {<br> if (tries &lt;= MAX_REFRESH_TRIES) {<br> /*<br> * If we were successful sending, but not successful in racing the<br> * message thread, yield the processor so that message<br> * threads may have a chance to run.<br> */<br> if (tries &gt; 0)<br> /* __os_sleep instead?? */<br> __os_yield()<br> tries++;<br> goto retry;<br> } else<br> ret = DB_RET_LEASE_EXPIRED;<br> }<br> }<br> return (ret);</pre>
+If the master has enough valid leases it returns success.&nbsp; If it
+does not have enough, it attempts to refresh them.&nbsp; This attempt
+may fail if sending the PERM record does not receive sufficient
+acks.&nbsp; If we do receive sufficient acknowledgements we may still
+find that scheduling of message threads means the master hasn't yet
+processed the incoming <b>REP_LEASE_GRANT</b>
+messages.&nbsp; We will retry a couple of times (possibly
+parameterized) if the master discovers that situation.&nbsp; <br>
+<h2>Elections</h2>
+When a client grants a lease to a master, it gives up the right to
+participate in an election until that grant expires.&nbsp; If we are
+the master and <i>dbenv-&gt;rep_elect</i>
+is called, it should return, no matter what, like it does today.&nbsp;
+If we are a client and <i>rep_elect</i>
+is called special processing takes place when leases are in
+effect.&nbsp; First, the easy case is if the lease granted by this
+client has already expired, then the client goes directly into the
+election as normal.&nbsp; If a valid lease grant is outstanding to a
+master, this site cannot participate in an election until that grant
+expires.&nbsp; We have at least two options when a site calls the <i>dbenv-&gt;rep_elect</i>
+API while
+leases are in effect.<br>
+<ol>
+ <li>The simplest coding solution for DB would be simply to refuse to
+participate in the election if this site has a current lease granted to
+a master.&nbsp; We would detect this situation and return EINVAL.&nbsp;
+This is correct behavior and trivial to implement.&nbsp; The
+disadvantage of this solution is that the application would then be
+responsible for repeatedly attempting an election until the lease grant
+expired.<br>
+ </li>
+ <li>The more satisfying solution is for DB to wait the remaining time
+for the grant.&nbsp; If this client hears from the master during that
+time the election does not take place and the call to <i>rep_elect</i>
+returns with the
+information for the current/old master.</li>
+</ol>
+<h3>Election Code Changes</h3>
+The code changes to support leases in the election code are fairly
+isolated.&nbsp; First if leases are configured, we must verify the <i>nsites</i>
+parameter is set to 0.&nbsp;
+Second, in <i>__rep_elect_init</i>
+we must not overwrite the value of <i>rep-&gt;nsites</i>
+for leases because it is controlled by the <i>dbenv-&gt;rep_set_nsites</i>
+API.&nbsp;
+These changes are small and easy to understand.<br>
+<br>
+The more complicated code will be the client code when it has an
+outstanding lease granted.&nbsp; The client will wait for the current
+lease grant to expire before proceeding with the election.&nbsp; The
+client will only do so if it does not hear from the master for the
+remainder of the lease grant time.&nbsp; If the client hears from the
+master, it returns and does not begin participating in the
+election.&nbsp; A new election phase, <b>REP_EPHASE0</b>
+will exist so that the call to <i>__rep_wait</i>
+can detect if a master responds.&nbsp; The client, while waiting for
+the lease grant to expire, will send a <b>REP_MASTER_REQ</b>
+message so that the master will respond with a <b>REP_NEWMASTER</b>
+message and thus,
+allow the client to know the master exists.&nbsp; However, it is also
+desirable that if the master
+replies to the client, the master wants the client to update its lease
+grant.&nbsp; <br>
+<br>
+Recall that the <b>REP_NEWMASTER</b>
+message does not result in a lease grant from the client.&nbsp; The
+client responds when it processes a PERM record that has the <b>REPCTL_LEASE</b>
+flag set in the message
+with its lease grant up to the given LSN.&nbsp; Therefore, we want the
+client's <b>REP_MASTER_REQ</b> to
+yield both the discovery of the existing master and have the master
+refresh its leases.&nbsp; The client will also use the <b>REPCTL_LEASE</b>
+flag in its <b>REP_MASTER_REQ</b> message to the
+master.&nbsp; This flag will serve as the indicator to the master that
+it needs to deal with leases and both send the <b>REP_NEWMASTER</b>
+message and refresh
+the lease.<br>
+The code will work as follows:<br>
+<pre>if (leases_configured &amp;&amp; (my_grant_still_valid || lease_never_granted)) {<br> if (lease_never_granted)<br> wait_time = lease_timeout<br> else<br> wait_time = grant_expiration - current_time<br> F_SET(REP_F_EPHASE0);<br> __rep_send_message(..., REP_MASTER_REQ, ... REPCTL_LEASE);<br> ret = __rep_wait(..., REP_F_EPHASE0);<br> if (we found a master)<br> return<br>} /* if we don't return, fall out and proceed with election */<br></pre>
+On the master side, the code handling the <b>REP_MASTER_REQ</b> will
+do:<br>
+<pre>if (I am master) {<br> ...<br> __rep_send_message(REP_NEWMASTER...)<br> if (F_ISSET(rp, REPCTL_LEASE))<br> __rep_lease_refresh(...)<br>}<br></pre>
+Other minor implementation details are that<i> __rep_elect_done</i>
+must also clear
+the <b>REP_F_EPHASE0</b> flag.&nbsp;
+We also, obviously, need to define <b>REP_F_EPHASE0</b>
+in the list of replication flags.&nbsp; Note that the client's call to <i>__rep_wait</i>
+will return upon
+receiving the <b>REP_NEWMASTER</b>
+message.&nbsp; The client will independently refresh its lease when it
+receives the log record from the master's call to refresh the lease.<br>
+<br>
+Again, similar to what I suggested above, the code could simply assume
+global leases are configured, and instead of having the <b>REPCTL_LEASE</b>
+flag at all, the master
+assumes that it needs to refresh leases because it has them configured,
+not because it is specified in the <b>REP_MASTER_REQ</b>
+message it is processing. Right now I don't think every possible
+<b>REP_MASTER_REQ</b> message should result in a lease grant request.<br>
+<h4>Elections and Quiescent Systems</h4>
+It is possible that a master is slow or the client is close to its
+expiration time, or that the master is quiescent and all leases are
+currently expired, but nothing much is going on anyway, yet some client
+calls <i>__rep_elect</i> at that
+time.&nbsp; In the code above, we will not send the <b>REP_MASTER_REQ</b>
+because the lease is
+not valid.&nbsp; The client will simply proceed directly to sending the
+<b>REP_VOTE1</b> message, throwing all
+other clients into an election.&nbsp; The master is still master and
+should stay that way.&nbsp; Currently in response to a vote message, a
+master will broadcast out a <b>REP_NEWMASTER</b>
+to assert its mastership.&nbsp; That causes the election to
+complete.&nbsp; However, if desired the master may want to proactively
+refresh its leases.&nbsp; This situation indicates to me that the
+master should choose to refresh leases based on configuration, not a
+flag sent from the client.&nbsp; I believe anytime the master asserts
+its mastership via sending a <b>REP_NEWMASTER</b>
+message that I need to add code to proactively refresh leases at that
+time.<br>
+<h2>Other Implementation Details</h2>
+<h3>Role Changes<br>
+</h3>
+When a site changes its role via a call to <i>rep_start</i> in either
+direction, we
+must take action when leases are configured.&nbsp; There are three
+types of role changes that all need changes to deal with leases:<br>
+<ol>
+ <li><i>A master downgrading to a
+client.</i> When a master downgrades to a client, it can do so
+immediately after it has proactively expired all existing leases it
+holds.&nbsp; This situation is similar to an error from the send
+callback, and it effectively cancels all outstanding leases held on
+this site.&nbsp; Note that if this master expires its leases, it does
+not have any effect on when the clients' lease grants expire on the
+client side.&nbsp; The clients must still wait their full expected
+grant time.<br>
+ </li>
+ <li><i>A client upgrading to master.</i>
+If a client is upgrading to a master but it has an outstanding lease
+granted to another site, the code will return an <b>EINVAL</b>
+error.&nbsp; This situation
+only arises if the application simply declares this site master.&nbsp;
+If a site wins an election then the election itself should have waited
+long enough for the granted lease to expire and this state should not
+arise then.</li>
+ <li><i>A client finding a new master.</i>
+When a client discovers a new and different master, via a <b>REP_NEWMASTER</b>
+message then the
+client cannot accept that new master until its current lease grant
+expires.&nbsp; This situation should only occur when a site declares
+itself master without an election and that site's lease grant expires
+before this client's grant expires.&nbsp; However, it is <b>possible</b>
+for this situation to arise
+with elections also.&nbsp; If we have 5 sites holding an election and 4
+of those sites have leases expire at about the same time T, and this
+site's lease expires at time T+N and the election timeout is &lt; N,
+then those 4 sites may hold an election and elect a master without this
+site's participation.&nbsp; A client in this situation must call <i>__rep_wait</i>
+with the time remaining
+on its lease.&nbsp; If the lease is expired after waiting the remaining
+time, then the client can accept this new master.&nbsp; If the lease
+was refreshed during the waiting period then the client does not accept
+this new master and returns.<br>
+ </li>
+</ol>
+<h3>DUPMASTER</h3>
+A duplicate master situation can occur if an old master becomes
+disconnected from the rest of the group, that group elects a new master
+and then the partition is resolved.&nbsp; The requirement for master
+leases is that this situation will not cause the newly elected,
+rightful master to receive the <b>DB_REP_DUPMASTER</b>
+return.&nbsp; It is okay for the old master to get that return
+value.&nbsp; When a dual master situation exists, the following will
+happen:<br>
+<ul>
+ <li><i>On the current master and all
+current clients</i> - If the current master receives an update
+message or other conflicting message from the old master then that
+message will be ignored because the generation number is out of date.</li>
+ <li><i>On the old master</i> - If
+the old master receives an update message from the current master, or
+any other message with a later generation from any site, the new
+generation number will trigger this site to return <b>DB_REP_DUPMASTER</b>.&nbsp;
+However,
+instead of broadcasting out the <b>REP_DUPMASTER</b>
+message to shoot down others as well, this site, if leases are
+configured, will call <i>__rep_lease_check</i>
+and if they are expired, return the error.&nbsp; It should be
+impossible for us to receive a later generation message and still hold
+a majority of master leases.&nbsp; Something is seriously wrong and we
+will <b>DB_ASSERT</b> this situation
+cannot happen.<br>
+ </li>
+</ul>
+<h3>Client to Client Synchronization</h3>
+One question to ask is how lease grants interact with client-to-client
+synchronization. The only answer is that they do not.&nbsp; A client
+that is sending log records to another client cannot request the
+receiving client refresh its lease with the master.&nbsp; That client
+does not have a timestamp it can use for the master and clock skew
+makes it meaningless between machines.&nbsp; Therefore, sites that use
+client-to-client synchronization will likely see more lease refreshment
+during the read path and leases will be refreshed during live updates
+only.&nbsp; Of course, if a client supplies log records that fill a
+gap, and the later log records stored came from the master in a live
+update then the client will respond as per the discussion on Gap
+Processing above.<br>
+<h2>Interaction Matrix</h2>
+If leases are granted (by a client) or held (by a master) what should
+the following APIs and messages do?<br>
+<br>
+Other:<br>
+log_archive: Leases do not affect log_archive.&nbsp; OK.<br>
+dbenv-&gt;close: OK.<br>
+crash during lease grant and restart: <b>Potential
+problem here.&nbsp; See discussion below</b>.<br>
+<br>
+Rep Base API method:<br>
+rep_elect: Already discussed above.&nbsp; Must wait for lease to expire.<br>
+rep_flush: Master only, OK - this will be the basis for refreshing
+leases.<br>
+rep_get_*: Not affected by leases.<br>
+rep_process_message: Generally OK.&nbsp; We'll discuss each message
+below.<br>
+rep_set_config: OK.<br>
+rep_set_limit: OK<br>
+rep_set_nsites: Must be called before <i>rep_start</i>
+and <i>nsites</i> is immutable until
+14778 is resolved.<br>
+rep_set_priority: OK<br>
+rep_set_timeout: OK.&nbsp; Used to set lease timeout.<br>
+rep_set_transport: OK.<br>
+rep_start(MASTER): Role changes are discussed above.&nbsp; Make sure
+duplicate rep_start calls are no-ops for leases.<br>
+rep_start(CLIENT): Role changes are discussed above.&nbsp; Make sure
+duplicate calls are no-ops for leases.<br>
+rep_stat: OK.<br>
+rep_sync: Should not be able to happen.&nbsp; Client cannot accept new
+master with outstanding lease grant.&nbsp; Add DB_ASSERT here.<br>
+<br>
+REP_ALIVE: OK.<br>
+REP_ALIVE_REQ: OK.<br>
+REP_ALL_REQ: OK.<br>
+REP_BULK_LOG: OK.&nbsp; Clients check to send ACK.<br>
+REP_BULK_PAGE: Should never process one with lease granted.&nbsp; Add
+DB_ASSERT.<br>
+REP_DUPMASTER: Should never happen, this is what leases are supposed to
+prevent.&nbsp; See above.<br>
+REP_LOG: OK.&nbsp; Clients check to send ACK.<br>
+REP_LOG_MORE: OK.&nbsp; Clients check to send ACK.<br>
+REP_LOG_REQ: OK.<br>
+REP_MASTER_REQ: OK.<br>
+REP_NEWCLIENT: OK.<br>
+REP_NEWFILE: OK.&nbsp; Clients check to send ACK.<br>
+REP_NEWMASTER: See above.<br>
+REP_NEWSITE: OK.<br>
+REP_PAGE: OK.&nbsp; Should never process one with lease granted.&nbsp;
+Add DB_ASSERT.<br>
+REP_PAGE_FAIL:&nbsp; OK.&nbsp; Should never process one with lease
+granted.&nbsp; Add DB_ASSERT.<br>
+REP_PAGE_MORE:&nbsp; OK.&nbsp; Should never process one with lease
+granted.&nbsp; Add DB_ASSERT.<br>
+REP_PAGE_REQ: OK.<br>
+REP_REREQUEST: OK.<br>
+REP_UPDATE: OK.&nbsp; Should never process one with lease
+granted.&nbsp; Add DB_ASSERT.<br>
+REP_UPDATE_REQ: OK.&nbsp; This is a master-only message.<br>
+REP_VERIFY: OK.&nbsp; Should never process one with lease
+granted.&nbsp; Add DB_ASSERT.<br>
+REP_VERIFY_FAIL: OK.&nbsp; Should never process one with lease
+granted.&nbsp; Add DB_ASSERT.<br>
+REP_VERIFY_REQ: OK.<br>
+REP_VOTE1: OK.&nbsp; See Election discussion above.&nbsp; It is
+possible to receive one with a lease granted.&nbsp; Client cannot send
+one with an outstanding lease however.<br>
+REP_VOTE2: OK.&nbsp; See Election discussion above.&nbsp; It is
+possible to receive one with a lease granted.<br>
+<br>
+If the following method or message processing is in progress and a
+client wants to grant a lease, what should it do?&nbsp; Let's examine
+what this means.&nbsp; The client wanting to grant a lease simply means
+it is responding to the receipt of a <b>REP_LOG</b>
+(or its variants) message and applying a log record.&nbsp; Therefore,
+we need to consider a thread processing a log message racing with these
+other actions.<br>
+<br>
+Other:<br>
+log_archive: OK.&nbsp; <br>
+dbenv-&gt;close: User error.&nbsp; User should not be closing the env
+while other threads are using that handle.&nbsp; Should have no effect
+if a 2nd dbenv handle to same env is closed.<br>
+<br>
+Rep Base API method:<br>
+rep_elect: See Election discussion above.&nbsp; <i>rep_elect</i>
+should wait and may grant
+lease while election is in progress.<br>
+rep_flush: Should not be called on client.<br>
+rep_get_*: OK.<br>
+rep_process_message: Generally OK.&nbsp; See handling each message
+below.<br>
+rep_set_config: OK.<br>
+rep_set_limit: OK.<br>
+rep_set_nsites: Must be called before <i>rep_start</i>
+until 14778 is resolved.<br>
+rep_set_priority: OK.<br>
+rep_set_timeout: OK.<br>
+rep_set_transport: OK.<br>
+rep_start(MASTER): OK, can't happen - already protect racing <i>rep_start</i>
+and <i>rep_process_message</i>.<br>
+rep_start(CLIENT): OK, can't happen - already protect racing <i>rep_start</i>
+and <i>rep_process_message</i>.<br>
+rep_stat: OK.<br>
+rep_sync: Shouldn't happen because client cannot grant leases during
+sync-up.&nbsp; Incoming log message ignored.<br>
+<br>
+REP_ALIVE: OK.<br>
+REP_ALIVE_REQ: OK.<br>
+REP_ALL_REQ: OK.<br>
+REP_BULK_LOG: OK.<br>
+REP_BULK_PAGE: OK.&nbsp; Incoming log message ignored during internal
+init.<br>
+REP_DUPMASTER: Shouldn't happen.&nbsp; See DUPMASTER discussion above.<br>
+REP_LOG: OK.<br>
+REP_LOG_MORE: OK.<br>
+REP_LOG_REQ: OK.<br>
+REP_MASTER_REQ: OK.<br>
+REP_NEWCLIENT: OK.<br>
+REP_NEWFILE: OK.<br>
+REP_NEWMASTER: See above.&nbsp; If a client accepts a new master
+because its lease grant expired, then that master sends a message
+requesting the lease grant, this client will not process the log record
+if it is in sync-up recovery, or it may after the master switch is
+complete and the client doesn't need sync-up recovery.&nbsp; Basically,
+just uses existing log record processing/newmaster infrastructure.<br>
+REP_NEWSITE: OK.<br>
+REP_PAGE: OK.&nbsp; Receiving a log record during internal init PAGE
+phase should ignore log record.<br>
+REP_PAGE_FAIL: OK.<br>
+REP_PAGE_MORE: OK.<br>
+REP_PAGE_REQ: OK.<br>
+REP_REREQUEST: OK.<br>
+REP_UPDATE: OK.&nbsp; Receiving a log record during internal init
+should ignore log record.<br>
+REP_UPDATE_REQ: OK - master-only message.<br>
+REP_VERIFY: OK.&nbsp; Receiving a log record during verify phase
+ignores log record.<br>
+REP_VERIFY_FAIL: OK.<br>
+REP_VERIFY_REQ: OK.<br>
+REP_VOTE1: OK.&nbsp; This client is processing someone else's vote when
+the lease request comes in.&nbsp; That is fine.&nbsp; We protect our
+own election and lease interaction in <i>__rep_elect</i>.<br>
+REP_VOTE2: OK.<br>
+<h4>Crashing - Potential Problem<br>
+</h4>
+It appears there is one area where we could have a problem.&nbsp; I
+believe that crashes can cause us to break our guarantee on durability,
+authoritative reads and inability to elect duplicate masters.&nbsp;
+Consider this scenario:<br>
+<ol>
+ <li>A master and 4 clients are all up and running.</li>
+ <li>The master commits a txn and all 4 clients refresh their lease
+grants at time T.</li>
+ <li>All 4 clients have the txn and log records in the cache.&nbsp;
+None are flushing to disk.</li>
+ <li>All 4 clients have responded to the PERM messages as well as
+refreshed their lease with the master.</li>
+ <li>All 4 clients hit the same application coding error and crash
+(machine/OS stays up).</li>
+ <li>Master authoritatively reads data in txn from step 2.</li>
+ <li>All 4 clients restart the application and run recovery, thus the
+txn from step 2 is lost on all clients because it isn't in any logs.<span
+ style="font-weight: bold;"></span><br>
+ </li>
+ <li>A network partition happens and the master is alone on its side.</li>
+ <li>All 4 clients are on the other side and elect a new master.</li>
+ <li>Partition resolves itself and we have duplicate masters, where
+the former master still holds all valid lease grants.<span
+ style="font-weight: bold;"></span><br>
+ </li>
+</ol>
+Therefore, we have broken both guarantees.&nbsp; In step 6 the data is
+really not durable and we've given it to the user.&nbsp; One can argue
+that if this is an issue the application better be syncing somewhere if
+they really want durability.&nbsp; However, worse than that is that we
+have a legitimate DUPMASTER situation in step 10 where both masters
+hold valid leases.&nbsp; The reason is that all lease knowledge is in
+the shared memory and that is lost when the app restarts and runs
+recovery.<br>
+<br>
+How can we solve this?&nbsp; The obvious solution is (ugh, yet another)
+durable BDB-owned file with some information in it, such as the current
+lease expiration time so that rebooting after a crash leaves the
+knowledge that the lease was granted.&nbsp; However, writing and
+syncing every lease grant on every client out to disk is far too
+expensive.<br>
+<br>
+A second possible solution is to have clients wait a full lease timeout
+before entering an election the first time. This solution solves the
+DUPMASTER issue, but not the non-authoritative read.&nbsp; This
+solution naturally falls out of elections and leases really.&nbsp; If a
+client has never granted a lease, it should be considered as having to
+wait a full lease timeout before entering an election.&nbsp;
+Applications already know that leases impact elections and this does
+not seem so bad as it is only on the first election.<br>
+<br>
+Is it sufficient to document that the authoritative read is only as
+authoritative as the durability guarantees they make on the sites that
+indicate it is permanent? Yes, I believe this is sufficient.&nbsp; If
+the application says it is permanent and it really isn't, then the
+application is at fault.&nbsp; Believing the application when it
+indicates with the PERM response that it is permanent avoids the
+authoritative problem.&nbsp; <br>
+<h2>Upgrade/Mixed Versions</h2>
+Clearly leases cannot be used with mixed version sites since masters
+running older releases will not have any knowledge of lease
+support.&nbsp; What considerations are needed in the lease code for
+mixed versions?<br>
+<br>
+First if the <b>REP_CONTROL</b>
+structure changes, we need to maintain and use an old version of the
+structure for talking to older clients and masters.&nbsp; The
+implementation of this would be similar to the way we manage for old <b>REP_VOTE_INFO</b>
+structures.&nbsp;
+Second any new messages need translation table entries added.&nbsp;
+Third, if we are assuming global leases then clearly any mixed versions
+cannot have leases configured, and leases cannot be used in mixed
+version groups.&nbsp; Maintaining two versions of the control structure
+is not necessary if we choose a different style of implementation and
+don't change the control structure.<br>
+<br>
+However, then how could an old application both run continuously,
+upgrade to the new release and take advantage of leases without taking
+down the entire application?&nbsp; I believe it is possible for clients
+to be configured for leases but be subject to the master regarding
+leases, yet the master code can assume that if it has leases
+configured, all client sites do as well.&nbsp; In several places above
+I suggested that a client could make a choice based on either a new <b>REPCTL_LEASE</b>
+flag or simply having
+leases turned on locally.&nbsp; If we choose to use the flag, then we
+can support leases with mixed versions.&nbsp; The upgraded clients can
+configure leases and they simply will not be granted until the old
+master is upgraded and sends a PERM message with the flag indicating it
+wants a lease grant.&nbsp; The client will not grant a lease until such
+time.&nbsp; The clients, while having the leases configured, will not
+grant a lease until told to do so and will simply have an expired
+lease.&nbsp; Then, when the old master finally upgrades, it too can
+configure leases and suddenly all sites are using them.&nbsp; I believe
+this should work just fine and I will need to make sure a client's
+granting of leases is only in response to the master asking for a
+grant.&nbsp; If the master never asks, then the client has them
+configured, but doesn't grant them.<br>
+<h2>Testing</h2>
+Clearly any user-facing API changes will need the equivalent reflection
+in the Tcl API for testing, under CONFIG_TEST.<br>
+<br>
+I am sure the list of tests will grow but off the top of my head:<br>
+Basic test: have N sites all configure leases, run some,&nbsp; read on
+master, etc.<br>
+Refresh test: Perform update on master, sleep until past expiration,
+read on master and make sure leases are refreshed/read successful<br>
+Error test: Test error conditions (reading on client with leases but no
+ignore flag, calling after rep_start, etc)<br>
+Read test: Test reading on both client and master both with and without
+the IGNORE flag.&nbsp; Test that data read with the ignore flag can be
+rolled back.<br>
+Dupmaster test: Force a DUPMASTER situation and verify that the newer
+master cannot get DUPMASTER error.<br>
+Election test: Call election while grant is outstanding and master
+exists.<br>
+Call election while grant is outstanding and master does not exist.<br>
+Call election after expiration on a quiescent system with master
+existing.<br>
+Run with a group where some members have leases configured and others do
+not to make sure we get errors instead of dumping core.<br>
+<br>
+<small><br>
+</small>
+</body>
+</html>
diff --git a/rep/rep.src b/rep/rep.src
new file mode 100644
index 0000000..0d1664b
--- /dev/null
+++ b/rep/rep.src
@@ -0,0 +1,116 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+DBPRIVATE
+PREFIX __rep
+
+INCLUDE #include "db_int.h"
+INCLUDE #include "dbinc/db_page.h"
+INCLUDE #include "dbinc/db_am.h"
+INCLUDE #include "dbinc/log.h"
+INCLUDE #include "dbinc/mp.h"
+INCLUDE #include "dbinc/txn.h"
+INCLUDE
+
+/*
+ * bulk - message for bulk log records or pages
+ */
+BEGIN_MSG bulk check_length
+ARG len u_int32_t
+ARG lsn DB_LSN
+ARG bulkdata DBT
+END
+
+/*
+ * control - replication control message
+ */
+BEGIN_MSG control check_length
+ARG rep_version u_int32_t
+ARG log_version u_int32_t
+ARG lsn DB_LSN
+ARG rectype u_int32_t
+ARG gen u_int32_t
+ARG msg_sec u_int32_t
+ARG msg_nsec u_int32_t
+ARG flags u_int32_t
+END
+
+/*
+ * egen data
+ */
+BEGIN_MSG egen check_length
+ARG egen u_int32_t
+END
+
+/*
+ * file info
+ */
+BEGIN_MSG fileinfo alloc check_length version
+ARG pgsize u_int32_t
+ARG pgno db_pgno_t
+ARG max_pgno db_pgno_t
+ARG filenum u_int32_t
+ARG finfo_flags u_int32_t
+ARG type u_int32_t
+ARG db_flags u_int32_t
+ARG uid DBT
+ARG info DBT
+END
+
+/*
+ * grant info - clients send to masters granting a lease.
+ */
+BEGIN_MSG grant_info check_length
+ARG msg_sec u_int32_t
+ARG msg_nsec u_int32_t
+END
+
+/*
+ * We do not need to do anything with LOG record data.
+ * It is opaque data to us.
+ */
+
+/*
+ * log request
+ */
+BEGIN_MSG logreq check_length
+ARG endlsn DB_LSN
+END
+
+/*
+ * We do not need to do anything with NEWCLIENT/NEWSITE cdata dbt.
+ * It is user data and the app has to do whatever transformation
+ * it needs to with its own data.
+ */
+/*
+ * newfile version
+ */
+BEGIN_MSG newfile check_length
+ARG version u_int32_t
+END
+
+/*
+ * update - send update information
+ */
+BEGIN_MSG update alloc check_length version
+ARG first_lsn DB_LSN
+ARG first_vers u_int32_t
+ARG num_files u_int32_t
+END
+
+/*
+ * vote info
+ */
+BEGIN_MSG vote_info check_length
+ARG egen u_int32_t
+ARG nsites u_int32_t
+ARG nvotes u_int32_t
+ARG priority u_int32_t
+ARG tiebreaker u_int32_t
+END
+
diff --git a/rep/rep_auto.c b/rep/rep_auto.c
new file mode 100644
index 0000000..3cb3078
--- /dev/null
+++ b/rep/rep_auto.c
@@ -0,0 +1,679 @@
+/* Do not edit: automatically built by gen_msg.awk. */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+/*
+ * __rep_bulk_marshal --
+ *	Write the bulk message described by argp into the max-byte buffer at
+ *	bp: the len, lsn.file, lsn.offset and bulkdata.size words, followed by
+ *	the bulkdata bytes themselves.  Returns ENOMEM when max is smaller
+ *	than __REP_BULK_SIZE plus the data size; otherwise stores the number
+ *	of bytes produced in *lenp and returns 0.
+ *
+ * PUBLIC: int __rep_bulk_marshal __P((ENV *, __rep_bulk_args *,
+ * PUBLIC: u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_bulk_marshal(env, argp, bp, max, lenp)
+	ENV *env;
+	__rep_bulk_args *argp;
+	u_int8_t *bp;
+	size_t *lenp, max;
+{
+	u_int8_t *base;
+	size_t required;
+
+	/* Fixed header plus the variable-length payload. */
+	required = __REP_BULK_SIZE
+	    + (size_t)argp->bulkdata.size;
+	if (max < required)
+		return (ENOMEM);
+
+	base = bp;
+	DB_HTONL_COPYOUT(env, bp, argp->len);
+	DB_HTONL_COPYOUT(env, bp, argp->lsn.file);
+	DB_HTONL_COPYOUT(env, bp, argp->lsn.offset);
+	DB_HTONL_COPYOUT(env, bp, argp->bulkdata.size);
+	if (argp->bulkdata.size != 0) {
+		memcpy(bp, argp->bulkdata.data, argp->bulkdata.size);
+		bp += argp->bulkdata.size;
+	}
+
+	*lenp = (size_t)(bp - base);
+	return (0);
+}
+
+/*
+ * __rep_bulk_unmarshal --
+ *	Decode a bulk message from the max bytes at bp into *argp.  The
+ *	bulkdata DBT is not copied: its data pointer is aimed at the payload
+ *	inside the input buffer.  On success, *nextp (if nextp is non-NULL)
+ *	is set to the first byte past the message and 0 is returned.  If the
+ *	input is too short for the fixed header or for the payload length the
+ *	header announces, an error is reported and EINVAL is returned.
+ *
+ * PUBLIC: int __rep_bulk_unmarshal __P((ENV *, __rep_bulk_args *,
+ * PUBLIC: u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_bulk_unmarshal(env, argp, bp, max, nextp)
+	ENV *env;
+	__rep_bulk_args *argp;
+	u_int8_t *bp;
+	size_t max;
+	u_int8_t **nextp;
+{
+	if (max < __REP_BULK_SIZE)
+		goto too_few;
+	DB_NTOHL_COPYIN(env, argp->len, bp);
+	DB_NTOHL_COPYIN(env, argp->lsn.file, bp);
+	DB_NTOHL_COPYIN(env, argp->lsn.offset, bp);
+	DB_NTOHL_COPYIN(env, argp->bulkdata.size, bp);
+	argp->bulkdata.data = bp;
+	/*
+	 * The payload length comes off the wire.  Compare it against the
+	 * bytes that remain after the fixed header instead of adding it to a
+	 * running total: where size_t is 32 bits wide the addition could wrap
+	 * and let an oversized length slip past the check.  The subtraction
+	 * cannot underflow because max >= __REP_BULK_SIZE was verified above.
+	 */
+	if ((size_t)argp->bulkdata.size > max - __REP_BULK_SIZE)
+		goto too_few;
+	bp += argp->bulkdata.size;
+
+	if (nextp != NULL)
+		*nextp = bp;
+	return (0);
+
+too_few:
+	__db_errx(env,
+	    "Not enough input bytes to fill a __rep_bulk message");
+	return (EINVAL);
+}
+
+/*
+ * __rep_control_marshal --
+ *	Write the nine words of a replication control message (rep_version,
+ *	log_version, lsn.file, lsn.offset, rectype, gen, msg_sec, msg_nsec,
+ *	flags) into the max-byte buffer at bp.  Returns ENOMEM if max is
+ *	below __REP_CONTROL_SIZE; otherwise stores the byte count in *lenp
+ *	and returns 0.
+ *
+ * PUBLIC: int __rep_control_marshal __P((ENV *, __rep_control_args *,
+ * PUBLIC: u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_control_marshal(env, argp, bp, max, lenp)
+	ENV *env;
+	__rep_control_args *argp;
+	u_int8_t *bp;
+	size_t *lenp, max;
+{
+	u_int8_t *base;
+
+	base = bp;
+	if (max >= __REP_CONTROL_SIZE) {
+		DB_HTONL_COPYOUT(env, bp, argp->rep_version);
+		DB_HTONL_COPYOUT(env, bp, argp->log_version);
+		DB_HTONL_COPYOUT(env, bp, argp->lsn.file);
+		DB_HTONL_COPYOUT(env, bp, argp->lsn.offset);
+		DB_HTONL_COPYOUT(env, bp, argp->rectype);
+		DB_HTONL_COPYOUT(env, bp, argp->gen);
+		DB_HTONL_COPYOUT(env, bp, argp->msg_sec);
+		DB_HTONL_COPYOUT(env, bp, argp->msg_nsec);
+		DB_HTONL_COPYOUT(env, bp, argp->flags);
+
+		*lenp = (size_t)(bp - base);
+		return (0);
+	}
+	return (ENOMEM);
+}
+
+/*
+ * __rep_control_unmarshal --
+ *	Decode a replication control message from the max bytes at bp into
+ *	*argp.  On success *nextp (if nextp is non-NULL) is set just past the
+ *	message and 0 is returned; if max is below __REP_CONTROL_SIZE an
+ *	error is reported and EINVAL is returned.
+ *
+ * PUBLIC: int __rep_control_unmarshal __P((ENV *,
+ * PUBLIC: __rep_control_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_control_unmarshal(env, argp, bp, max, nextp)
+	ENV *env;
+	__rep_control_args *argp;
+	u_int8_t *bp;
+	size_t max;
+	u_int8_t **nextp;
+{
+	if (max >= __REP_CONTROL_SIZE) {
+		DB_NTOHL_COPYIN(env, argp->rep_version, bp);
+		DB_NTOHL_COPYIN(env, argp->log_version, bp);
+		DB_NTOHL_COPYIN(env, argp->lsn.file, bp);
+		DB_NTOHL_COPYIN(env, argp->lsn.offset, bp);
+		DB_NTOHL_COPYIN(env, argp->rectype, bp);
+		DB_NTOHL_COPYIN(env, argp->gen, bp);
+		DB_NTOHL_COPYIN(env, argp->msg_sec, bp);
+		DB_NTOHL_COPYIN(env, argp->msg_nsec, bp);
+		DB_NTOHL_COPYIN(env, argp->flags, bp);
+
+		if (nextp != NULL)
+			*nextp = bp;
+		return (0);
+	}
+
+	__db_errx(env,
+	    "Not enough input bytes to fill a __rep_control message");
+	return (EINVAL);
+}
+
+/*
+ * __rep_egen_marshal --
+ *	Write the single egen word of argp into the max-byte buffer at bp.
+ *	Returns ENOMEM if max is below __REP_EGEN_SIZE; otherwise stores the
+ *	byte count in *lenp and returns 0.
+ *
+ * PUBLIC: int __rep_egen_marshal __P((ENV *, __rep_egen_args *,
+ * PUBLIC: u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_egen_marshal(env, argp, bp, max, lenp)
+	ENV *env;
+	__rep_egen_args *argp;
+	u_int8_t *bp;
+	size_t *lenp, max;
+{
+	u_int8_t *base;
+
+	base = bp;
+	if (max >= __REP_EGEN_SIZE) {
+		DB_HTONL_COPYOUT(env, bp, argp->egen);
+
+		*lenp = (size_t)(bp - base);
+		return (0);
+	}
+	return (ENOMEM);
+}
+
+/*
+ * __rep_egen_unmarshal --
+ *	Decode the single egen word from the max bytes at bp into *argp.
+ *	On success *nextp (if nextp is non-NULL) is set just past the message
+ *	and 0 is returned; if max is below __REP_EGEN_SIZE an error is
+ *	reported and EINVAL is returned.
+ *
+ * PUBLIC: int __rep_egen_unmarshal __P((ENV *, __rep_egen_args *,
+ * PUBLIC: u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_egen_unmarshal(env, argp, bp, max, nextp)
+	ENV *env;
+	__rep_egen_args *argp;
+	u_int8_t *bp;
+	size_t max;
+	u_int8_t **nextp;
+{
+	if (max >= __REP_EGEN_SIZE) {
+		DB_NTOHL_COPYIN(env, argp->egen, bp);
+
+		if (nextp != NULL)
+			*nextp = bp;
+		return (0);
+	}
+
+	__db_errx(env,
+	    "Not enough input bytes to fill a __rep_egen message");
+	return (EINVAL);
+}
+
+/*
+ * __rep_fileinfo_marshal --
+ *	Write the file-info message described by argp into the max-byte
+ *	buffer at bp: seven fixed words, then the uid length and bytes, then
+ *	the info length and bytes.  For a version older than
+ *	DB_REPVERSION_47 every word is copied out with memcpy exactly as it
+ *	sits in memory; otherwise each word goes through DB_HTONL_COPYOUT.
+ *	Returns ENOMEM when max is smaller than __REP_FILEINFO_SIZE plus both
+ *	payload sizes; otherwise stores the byte count in *lenp and returns 0.
+ *
+ * PUBLIC: int __rep_fileinfo_marshal __P((ENV *, u_int32_t,
+ * PUBLIC: __rep_fileinfo_args *, u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_fileinfo_marshal(env, version, argp, bp, max, lenp)
+	ENV *env;
+	u_int32_t version;
+	__rep_fileinfo_args *argp;
+	u_int8_t *bp;
+	size_t *lenp, max;
+{
+	u_int8_t *base;
+	int raw;
+
+	if (max < __REP_FILEINFO_SIZE
+	    + (size_t)argp->uid.size
+	    + (size_t)argp->info.size)
+		return (ENOMEM);
+	base = bp;
+
+	raw = version < DB_REPVERSION_47 ? 1 : 0;
+
+	/* The seven fixed words, followed by the length of the uid. */
+	if (raw) {
+		memcpy(bp, &argp->pgsize, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+		memcpy(bp, &argp->pgno, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+		memcpy(bp, &argp->max_pgno, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+		memcpy(bp, &argp->filenum, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+		memcpy(bp, &argp->finfo_flags, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+		memcpy(bp, &argp->type, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+		memcpy(bp, &argp->db_flags, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+		memcpy(bp, &argp->uid.size, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+	} else {
+		DB_HTONL_COPYOUT(env, bp, argp->pgsize);
+		DB_HTONL_COPYOUT(env, bp, argp->pgno);
+		DB_HTONL_COPYOUT(env, bp, argp->max_pgno);
+		DB_HTONL_COPYOUT(env, bp, argp->filenum);
+		DB_HTONL_COPYOUT(env, bp, argp->finfo_flags);
+		DB_HTONL_COPYOUT(env, bp, argp->type);
+		DB_HTONL_COPYOUT(env, bp, argp->db_flags);
+		DB_HTONL_COPYOUT(env, bp, argp->uid.size);
+	}
+	if (argp->uid.size != 0) {
+		memcpy(bp, argp->uid.data, argp->uid.size);
+		bp += argp->uid.size;
+	}
+
+	/* The length of the info payload, then the payload. */
+	if (raw) {
+		memcpy(bp, &argp->info.size, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+	} else
+		DB_HTONL_COPYOUT(env, bp, argp->info.size);
+	if (argp->info.size != 0) {
+		memcpy(bp, argp->info.data, argp->info.size);
+		bp += argp->info.size;
+	}
+
+	*lenp = (size_t)(bp - base);
+	return (0);
+}
+
+/*
+ * __rep_fileinfo_unmarshal --
+ *	Decode a file-info message from the max bytes at bp into a newly
+ *	allocated __rep_fileinfo_args, returned through *argpp.  The uid and
+ *	info DBTs are not copied: their data pointers are aimed at the
+ *	payload bytes inside the input buffer.  For a version older than
+ *	DB_REPVERSION_47 each word is copied in with memcpy exactly as it
+ *	appears in the buffer; otherwise each word goes through
+ *	DB_NTOHL_COPYIN.
+ *
+ *	On success *nextp (if nextp is non-NULL) is set just past the message
+ *	and 0 is returned.  An allocation failure returns the __os_malloc
+ *	error.  If the input is too short for the fixed part or for either
+ *	announced payload, the structure is released, an error is reported,
+ *	EINVAL is returned and *argpp is left untouched.
+ *
+ * PUBLIC: int __rep_fileinfo_unmarshal __P((ENV *, u_int32_t,
+ * PUBLIC: __rep_fileinfo_args **, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_fileinfo_unmarshal(env, version, argpp, bp, max, nextp)
+	ENV *env;
+	u_int32_t version;
+	__rep_fileinfo_args **argpp;
+	u_int8_t *bp;
+	size_t max;
+	u_int8_t **nextp;
+{
+	size_t needed;
+	__rep_fileinfo_args *argp;
+	int ret;
+	int copy_only;
+
+	argp = NULL;
+	needed = __REP_FILEINFO_SIZE;
+	if (max < needed)
+		goto too_few;
+	if ((ret = __os_malloc(env, sizeof(*argp), &argp)) != 0)
+		return (ret);
+
+	copy_only = 0;
+	if (version < DB_REPVERSION_47)
+		copy_only = 1;
+
+	/* The seven fixed words, followed by the length of the uid. */
+	if (copy_only) {
+		memcpy(&argp->pgsize, bp, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+		memcpy(&argp->pgno, bp, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+		memcpy(&argp->max_pgno, bp, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+		memcpy(&argp->filenum, bp, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+		memcpy(&argp->finfo_flags, bp, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+		memcpy(&argp->type, bp, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+		memcpy(&argp->db_flags, bp, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+		memcpy(&argp->uid.size, bp, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+	} else {
+		DB_NTOHL_COPYIN(env, argp->pgsize, bp);
+		DB_NTOHL_COPYIN(env, argp->pgno, bp);
+		DB_NTOHL_COPYIN(env, argp->max_pgno, bp);
+		DB_NTOHL_COPYIN(env, argp->filenum, bp);
+		DB_NTOHL_COPYIN(env, argp->finfo_flags, bp);
+		DB_NTOHL_COPYIN(env, argp->type, bp);
+		DB_NTOHL_COPYIN(env, argp->db_flags, bp);
+		DB_NTOHL_COPYIN(env, argp->uid.size, bp);
+	}
+	argp->uid.data = bp;
+	/*
+	 * Payload lengths come off the wire.  Compare each against the bytes
+	 * still unaccounted for (max - needed cannot underflow, as
+	 * max >= needed holds at every check) rather than adding first:
+	 * where size_t is 32 bits wide the sum could wrap and let an
+	 * oversized length slip past the check.
+	 */
+	if ((size_t)argp->uid.size > max - needed)
+		goto too_few;
+	needed += (size_t)argp->uid.size;
+	bp += argp->uid.size;
+
+	/* The length of the info payload, then the payload. */
+	if (copy_only) {
+		memcpy(&argp->info.size, bp, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+	} else
+		DB_NTOHL_COPYIN(env, argp->info.size, bp);
+	argp->info.data = bp;
+	if ((size_t)argp->info.size > max - needed)
+		goto too_few;
+	bp += argp->info.size;
+
+	if (nextp != NULL)
+		*nextp = bp;
+	*argpp = argp;
+	return (0);
+
+too_few:
+	/*
+	 * The caller never sees argp on failure (*argpp is only set on
+	 * success), so it has to be released here or it is leaked.
+	 */
+	if (argp != NULL)
+		__os_free(env, argp);
+	__db_errx(env,
+	    "Not enough input bytes to fill a __rep_fileinfo message");
+	return (EINVAL);
+}
+
+/*
+ * __rep_grant_info_marshal --
+ *	Write the msg_sec and msg_nsec words of a lease grant into the
+ *	max-byte buffer at bp.  Returns ENOMEM if max is below
+ *	__REP_GRANT_INFO_SIZE; otherwise stores the byte count in *lenp and
+ *	returns 0.
+ *
+ * PUBLIC: int __rep_grant_info_marshal __P((ENV *,
+ * PUBLIC: __rep_grant_info_args *, u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_grant_info_marshal(env, argp, bp, max, lenp)
+	ENV *env;
+	__rep_grant_info_args *argp;
+	u_int8_t *bp;
+	size_t *lenp, max;
+{
+	u_int8_t *base;
+
+	base = bp;
+	if (max >= __REP_GRANT_INFO_SIZE) {
+		DB_HTONL_COPYOUT(env, bp, argp->msg_sec);
+		DB_HTONL_COPYOUT(env, bp, argp->msg_nsec);
+
+		*lenp = (size_t)(bp - base);
+		return (0);
+	}
+	return (ENOMEM);
+}
+
+/*
+ * __rep_grant_info_unmarshal --
+ *	Decode the msg_sec and msg_nsec words of a lease grant from the max
+ *	bytes at bp into *argp.  On success *nextp (if nextp is non-NULL) is
+ *	set just past the message and 0 is returned; if max is below
+ *	__REP_GRANT_INFO_SIZE an error is reported and EINVAL is returned.
+ *
+ * PUBLIC: int __rep_grant_info_unmarshal __P((ENV *,
+ * PUBLIC: __rep_grant_info_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_grant_info_unmarshal(env, argp, bp, max, nextp)
+	ENV *env;
+	__rep_grant_info_args *argp;
+	u_int8_t *bp;
+	size_t max;
+	u_int8_t **nextp;
+{
+	if (max >= __REP_GRANT_INFO_SIZE) {
+		DB_NTOHL_COPYIN(env, argp->msg_sec, bp);
+		DB_NTOHL_COPYIN(env, argp->msg_nsec, bp);
+
+		if (nextp != NULL)
+			*nextp = bp;
+		return (0);
+	}
+
+	__db_errx(env,
+	    "Not enough input bytes to fill a __rep_grant_info message");
+	return (EINVAL);
+}
+
+/*
+ * __rep_logreq_marshal --
+ *	Write the endlsn (file, then offset) of a log request into the
+ *	max-byte buffer at bp.  Returns ENOMEM if max is below
+ *	__REP_LOGREQ_SIZE; otherwise stores the byte count in *lenp and
+ *	returns 0.
+ *
+ * PUBLIC: int __rep_logreq_marshal __P((ENV *, __rep_logreq_args *,
+ * PUBLIC: u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_logreq_marshal(env, argp, bp, max, lenp)
+	ENV *env;
+	__rep_logreq_args *argp;
+	u_int8_t *bp;
+	size_t *lenp, max;
+{
+	u_int8_t *base;
+
+	base = bp;
+	if (max >= __REP_LOGREQ_SIZE) {
+		DB_HTONL_COPYOUT(env, bp, argp->endlsn.file);
+		DB_HTONL_COPYOUT(env, bp, argp->endlsn.offset);
+
+		*lenp = (size_t)(bp - base);
+		return (0);
+	}
+	return (ENOMEM);
+}
+
+/*
+ * __rep_logreq_unmarshal --
+ *	Decode the endlsn (file, then offset) of a log request from the max
+ *	bytes at bp into *argp.  On success *nextp (if nextp is non-NULL) is
+ *	set just past the message and 0 is returned; if max is below
+ *	__REP_LOGREQ_SIZE an error is reported and EINVAL is returned.
+ *
+ * PUBLIC: int __rep_logreq_unmarshal __P((ENV *, __rep_logreq_args *,
+ * PUBLIC: u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_logreq_unmarshal(env, argp, bp, max, nextp)
+	ENV *env;
+	__rep_logreq_args *argp;
+	u_int8_t *bp;
+	size_t max;
+	u_int8_t **nextp;
+{
+	if (max >= __REP_LOGREQ_SIZE) {
+		DB_NTOHL_COPYIN(env, argp->endlsn.file, bp);
+		DB_NTOHL_COPYIN(env, argp->endlsn.offset, bp);
+
+		if (nextp != NULL)
+			*nextp = bp;
+		return (0);
+	}
+
+	__db_errx(env,
+	    "Not enough input bytes to fill a __rep_logreq message");
+	return (EINVAL);
+}
+
+/*
+ * __rep_newfile_marshal --
+ *	Write the single version word of a newfile message into the max-byte
+ *	buffer at bp.  Returns ENOMEM if max is below __REP_NEWFILE_SIZE;
+ *	otherwise stores the byte count in *lenp and returns 0.
+ *
+ * PUBLIC: int __rep_newfile_marshal __P((ENV *, __rep_newfile_args *,
+ * PUBLIC: u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_newfile_marshal(env, argp, bp, max, lenp)
+	ENV *env;
+	__rep_newfile_args *argp;
+	u_int8_t *bp;
+	size_t *lenp, max;
+{
+	u_int8_t *base;
+
+	base = bp;
+	if (max >= __REP_NEWFILE_SIZE) {
+		DB_HTONL_COPYOUT(env, bp, argp->version);
+
+		*lenp = (size_t)(bp - base);
+		return (0);
+	}
+	return (ENOMEM);
+}
+
+/*
+ * __rep_newfile_unmarshal --
+ *	Decode the single version word of a newfile message from the max
+ *	bytes at bp into *argp.  On success *nextp (if nextp is non-NULL) is
+ *	set just past the message and 0 is returned; if max is below
+ *	__REP_NEWFILE_SIZE an error is reported and EINVAL is returned.
+ *
+ * PUBLIC: int __rep_newfile_unmarshal __P((ENV *,
+ * PUBLIC: __rep_newfile_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_newfile_unmarshal(env, argp, bp, max, nextp)
+	ENV *env;
+	__rep_newfile_args *argp;
+	u_int8_t *bp;
+	size_t max;
+	u_int8_t **nextp;
+{
+	if (max >= __REP_NEWFILE_SIZE) {
+		DB_NTOHL_COPYIN(env, argp->version, bp);
+
+		if (nextp != NULL)
+			*nextp = bp;
+		return (0);
+	}
+
+	__db_errx(env,
+	    "Not enough input bytes to fill a __rep_newfile message");
+	return (EINVAL);
+}
+
+/*
+ * __rep_update_marshal --
+ *	Write the header of an update message (first_lsn.file,
+ *	first_lsn.offset, first_vers, num_files) into the max-byte buffer at
+ *	bp.  For a version older than DB_REPVERSION_47 the words are copied
+ *	out with memcpy exactly as they sit in memory; otherwise each goes
+ *	through DB_HTONL_COPYOUT.  Returns ENOMEM if max is below
+ *	__REP_UPDATE_SIZE; otherwise stores the byte count in *lenp and
+ *	returns 0.
+ *
+ * PUBLIC: int __rep_update_marshal __P((ENV *, u_int32_t,
+ * PUBLIC: __rep_update_args *, u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_update_marshal(env, version, argp, bp, max, lenp)
+	ENV *env;
+	u_int32_t version;
+	__rep_update_args *argp;
+	u_int8_t *bp;
+	size_t *lenp, max;
+{
+	u_int8_t *base;
+
+	if (max < __REP_UPDATE_SIZE)
+		return (ENOMEM);
+	base = bp;
+
+	if (version < DB_REPVERSION_47) {
+		memcpy(bp, &argp->first_lsn.file, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+		memcpy(bp, &argp->first_lsn.offset, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+		memcpy(bp, &argp->first_vers, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+		memcpy(bp, &argp->num_files, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+	} else {
+		DB_HTONL_COPYOUT(env, bp, argp->first_lsn.file);
+		DB_HTONL_COPYOUT(env, bp, argp->first_lsn.offset);
+		DB_HTONL_COPYOUT(env, bp, argp->first_vers);
+		DB_HTONL_COPYOUT(env, bp, argp->num_files);
+	}
+
+	*lenp = (size_t)(bp - base);
+	return (0);
+}
+
+/*
+ * __rep_update_unmarshal --
+ *	Decode the header of an update message from the max bytes at bp into
+ *	a newly allocated __rep_update_args, returned through *argpp.  For a
+ *	version older than DB_REPVERSION_47 the words are copied in with
+ *	memcpy exactly as they appear in the buffer; otherwise each goes
+ *	through DB_NTOHL_COPYIN.  On success *nextp (if nextp is non-NULL) is
+ *	set just past the header and 0 is returned.  If max is below
+ *	__REP_UPDATE_SIZE an error is reported and EINVAL is returned before
+ *	anything is allocated; an allocation failure returns the __os_malloc
+ *	error.
+ *
+ * PUBLIC: int __rep_update_unmarshal __P((ENV *, u_int32_t,
+ * PUBLIC: __rep_update_args **, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_update_unmarshal(env, version, argpp, bp, max, nextp)
+	ENV *env;
+	u_int32_t version;
+	__rep_update_args **argpp;
+	u_int8_t *bp;
+	size_t max;
+	u_int8_t **nextp;
+{
+	__rep_update_args *upd;
+	int ret;
+
+	if (max < __REP_UPDATE_SIZE) {
+		__db_errx(env,
+		    "Not enough input bytes to fill a __rep_update message");
+		return (EINVAL);
+	}
+	if ((ret = __os_malloc(env, sizeof(*upd), &upd)) != 0)
+		return (ret);
+
+	if (version < DB_REPVERSION_47) {
+		memcpy(&upd->first_lsn.file, bp, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+		memcpy(&upd->first_lsn.offset, bp, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+		memcpy(&upd->first_vers, bp, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+		memcpy(&upd->num_files, bp, sizeof(u_int32_t));
+		bp += sizeof(u_int32_t);
+	} else {
+		DB_NTOHL_COPYIN(env, upd->first_lsn.file, bp);
+		DB_NTOHL_COPYIN(env, upd->first_lsn.offset, bp);
+		DB_NTOHL_COPYIN(env, upd->first_vers, bp);
+		DB_NTOHL_COPYIN(env, upd->num_files, bp);
+	}
+
+	if (nextp != NULL)
+		*nextp = bp;
+	*argpp = upd;
+	return (0);
+}
+
+/*
+ * __rep_vote_info_marshal --
+ *	Write the five words of a vote (egen, nsites, nvotes, priority,
+ *	tiebreaker) into the max-byte buffer at bp.  Returns ENOMEM if max is
+ *	below __REP_VOTE_INFO_SIZE; otherwise stores the byte count in *lenp
+ *	and returns 0.
+ *
+ * PUBLIC: int __rep_vote_info_marshal __P((ENV *,
+ * PUBLIC: __rep_vote_info_args *, u_int8_t *, size_t, size_t *));
+ */
+int
+__rep_vote_info_marshal(env, argp, bp, max, lenp)
+	ENV *env;
+	__rep_vote_info_args *argp;
+	u_int8_t *bp;
+	size_t *lenp, max;
+{
+	u_int8_t *base;
+
+	base = bp;
+	if (max >= __REP_VOTE_INFO_SIZE) {
+		DB_HTONL_COPYOUT(env, bp, argp->egen);
+		DB_HTONL_COPYOUT(env, bp, argp->nsites);
+		DB_HTONL_COPYOUT(env, bp, argp->nvotes);
+		DB_HTONL_COPYOUT(env, bp, argp->priority);
+		DB_HTONL_COPYOUT(env, bp, argp->tiebreaker);
+
+		*lenp = (size_t)(bp - base);
+		return (0);
+	}
+	return (ENOMEM);
+}
+
+/*
+ * __rep_vote_info_unmarshal --
+ *	Decode the five words of a vote (egen, nsites, nvotes, priority,
+ *	tiebreaker) from the max bytes at bp into *argp.  On success *nextp
+ *	(if nextp is non-NULL) is set just past the message and 0 is
+ *	returned; if max is below __REP_VOTE_INFO_SIZE an error is reported
+ *	and EINVAL is returned.
+ *
+ * PUBLIC: int __rep_vote_info_unmarshal __P((ENV *,
+ * PUBLIC: __rep_vote_info_args *, u_int8_t *, size_t, u_int8_t **));
+ */
+int
+__rep_vote_info_unmarshal(env, argp, bp, max, nextp)
+	ENV *env;
+	__rep_vote_info_args *argp;
+	u_int8_t *bp;
+	size_t max;
+	u_int8_t **nextp;
+{
+	if (max >= __REP_VOTE_INFO_SIZE) {
+		DB_NTOHL_COPYIN(env, argp->egen, bp);
+		DB_NTOHL_COPYIN(env, argp->nsites, bp);
+		DB_NTOHL_COPYIN(env, argp->nvotes, bp);
+		DB_NTOHL_COPYIN(env, argp->priority, bp);
+		DB_NTOHL_COPYIN(env, argp->tiebreaker, bp);
+
+		if (nextp != NULL)
+			*nextp = bp;
+		return (0);
+	}
+
+	__db_errx(env,
+	    "Not enough input bytes to fill a __rep_vote_info message");
+	return (EINVAL);
+}
+
diff --git a/rep/rep_backup.c b/rep/rep_backup.c
new file mode 100644
index 0000000..e3ab31a
--- /dev/null
+++ b/rep/rep_backup.c
@@ -0,0 +1,3379 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2004-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/fop.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+/*
+ * Context information needed for buffer management during the building of a
+ * list of database files present in the environment. When fully built, the
+ * buffer is in the form of an UPDATE message: a (marshaled) update_args,
+ * followed by some number of (marshaled) fileinfo_args.
+ *
+ * Note that the fileinfo for the first file in the list always appears at
+ * (constant) offset __REP_UPDATE_SIZE in the buffer.
+ *
+ * The number of bytes currently in use is (fillptr - buf); the space still
+ * available is (buf + size) - fillptr.
+ */
+typedef struct {
+ u_int8_t *buf; /* Buffer base address. */
+ size_t size; /* Total allocated buffer size. */
+ u_int8_t *fillptr; /* Pointer to first unused space. */
+ u_int32_t count; /* Number of entries currently in list. */
+} FILE_LIST_CTX;
+/*
+ * Address of the first marshaled fileinfo in a buffer laid out as described
+ * above, i.e. the byte just past the space reserved for the update_args.
+ */
+#define FIRST_FILE_PTR(buf) ((buf) + __REP_UPDATE_SIZE)
+
+static int __rep_check_uid __P((ENV *, FILE_LIST_CTX *, u_int32_t,
+ u_int8_t *));
+static int __rep_clean_interrupted __P((ENV *));
+static int __rep_cleanup_nimdbs __P((ENV *));
+static int __rep_filedone __P((ENV *, DB_THREAD_INFO *ip, int,
+ REP *, __rep_fileinfo_args *, u_int32_t));
+static int __rep_find_dbs __P((ENV *, u_int32_t, FILE_LIST_CTX *));
+static int __rep_get_fileinfo __P((ENV *, const char *,
+ const char *, __rep_fileinfo_args *, u_int8_t *));
+static int __rep_get_file_list __P((ENV *,
+ DB_FH *, u_int32_t, u_int32_t *, DBT *));
+static int __rep_log_setup __P((ENV *,
+ REP *, u_int32_t, u_int32_t, DB_LSN *));
+static int __rep_mpf_open __P((ENV *, DB_MPOOLFILE **,
+ __rep_fileinfo_args *, u_int32_t));
+static int __rep_nextfile __P((ENV *, int, REP *));
+static int __rep_page_gap __P((ENV *,
+ REP *, __rep_fileinfo_args *, u_int32_t));
+static int __rep_page_sendpages __P((ENV *, DB_THREAD_INFO *, int,
+ __rep_control_args *, __rep_fileinfo_args *, DB_MPOOLFILE *, DB *));
+static int __rep_queue_filedone __P((ENV *,
+ DB_THREAD_INFO *, REP *, __rep_fileinfo_args *));
+static int __rep_remove_all __P((ENV *, u_int32_t, DBT *));
+static int __rep_remove_by_list __P((ENV *, u_int32_t,
+ u_int8_t *, u_int32_t, u_int32_t));
+static int __rep_remove_by_prefix __P((ENV *, const char *, const char *,
+ size_t, APPNAME));
+static int __rep_remove_file __P((ENV *, u_int8_t *, const char *,
+ u_int32_t, u_int32_t));
+static int __rep_remove_logs __P((ENV *));
+static int __rep_remove_nimdbs __P((ENV *));
+static int __rep_rollback __P((ENV *, DB_LSN *));
+static int __rep_unlink_by_list __P((ENV *, u_int32_t,
+ u_int8_t *, u_int32_t, u_int32_t));
+static int __rep_walk_dir __P((ENV *, const char *, u_int32_t, FILE_LIST_CTX*));
+static int __rep_write_page __P((ENV *,
+ DB_THREAD_INFO *, REP *, __rep_fileinfo_args *));
+
+/*
+ * __rep_update_req -
+ *	Answer an update request: build the list of database files in the
+ *	environment, prefix it with the first LSN and log version, and send
+ *	the result to site eid as a REP_UPDATE message.
+ *
+ *	Returns 0 on success, and also when the file list could not be built
+ *	because a page lock was unavailable (DB_REP_PAGELOCKED); in that case
+ *	nothing is sent.  The result of the send itself is ignored.
+ *
+ * PUBLIC: int __rep_update_req __P((ENV *, __rep_control_args *, int));
+ */
+int
+__rep_update_req(env, rp, eid)
+	ENV *env;
+	__rep_control_args *rp;
+	int eid;
+{
+	DBT msg_dbt, rec_dbt;
+	DB_LOG *dblp;
+	DB_LOGC *logc;
+	DB_LSN lsn;
+	FILE_LIST_CTX ctx;
+	__rep_update_args upd;
+	size_t upd_len;
+	u_int32_t get_flag, log_vers;
+	int ret, t_ret;
+
+	logc = NULL;
+	dblp = env->lg_handle;
+
+	/*
+	 * The message is laid out as one __rep_update_args followed by one
+	 * __rep_fileinfo_args per file.  A megabyte is expected to be ample;
+	 * __rep_walk_dir enlarges the buffer if it turns out not to be.
+	 */
+	if ((ret = __os_calloc(env, 1, MEGABYTE, &ctx.buf)) != 0)
+		return (ret);
+	ctx.size = MEGABYTE;
+	ctx.count = 0;
+
+	/* Leave room at the front for the update_args, then list the files. */
+	ctx.fillptr = FIRST_FILE_PTR(ctx.buf);
+	if ((ret = __rep_find_dbs(env, rp->rep_version, &ctx)) != 0)
+		goto err;
+
+	/*
+	 * The first LSN we report is that of the first non-archivable log
+	 * file.  Without a checkpoint in the log (DB_NOTFOUND) there is no
+	 * such LSN, so position on the very first record instead.
+	 */
+	ret = __log_get_stable_lsn(env, &lsn);
+	if (ret == 0)
+		get_flag = DB_SET;
+	else if (ret == DB_NOTFOUND) {
+		get_flag = DB_FIRST;
+		ret = 0;
+	} else
+		goto err;
+
+	/* Look up the version of the log file holding that LSN. */
+	if ((ret = __log_cursor(env, &logc)) != 0)
+		goto err;
+
+	memset(&rec_dbt, 0, sizeof(rec_dbt));
+	ret = __logc_get(logc, &lsn, &rec_dbt, get_flag);
+	if (ret == 0) {
+		if ((ret = __logc_version(logc, &log_vers)) != 0)
+			goto err;
+	} else if (ret == DB_NOTFOUND) {
+		/*
+		 * No log records at all: we may be racing a master that has
+		 * only just started.  Report an initial LSN and the current
+		 * log version.
+		 */
+		INIT_LSN(lsn);
+		log_vers = DB_LOGVERSION;
+	} else
+		goto err;
+
+	/* Fill in the space reserved at the front of the buffer. */
+	upd.first_lsn = lsn;
+	upd.first_vers = log_vers;
+	upd.num_files = ctx.count;
+	if ((ret = __rep_update_marshal(env, rp->rep_version,
+	    &upd, ctx.buf, __REP_UPDATE_SIZE, &upd_len)) != 0)
+		goto err;
+	DB_ASSERT(env, upd_len == __REP_UPDATE_SIZE);
+
+	/* Everything is assembled; ship it, stamped with the log's LSN. */
+	DB_INIT_DBT(msg_dbt, ctx.buf, ctx.fillptr - ctx.buf);
+
+	LOG_SYSTEM_LOCK(env);
+	lsn = ((LOG *)dblp->reginfo.primary)->lsn;
+	LOG_SYSTEM_UNLOCK(env);
+	(void)__rep_send_message(
+	    env, eid, REP_UPDATE, &lsn, &msg_dbt, 0, 0);
+
+err:	__os_free(env, ctx.buf);
+	/*
+	 * A page lock we could not obtain means no message went out, but
+	 * that is not something to report to the user as an error.
+	 */
+	if (ret == DB_REP_PAGELOCKED)
+		ret = 0;
+	if (logc != NULL && (t_ret = __logc_close(logc)) != 0 && ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+
+/*
+ * __rep_find_dbs -
+ *	Add every database to the file list in context: first the files in
+ *	the environment home (when no data directories are configured) or in
+ *	each configured data directory, and then, if that succeeded, the
+ *	named in-memory databases.  Each one is opened, inspected and closed
+ *	again along the way.
+ *
+ * May be called either while holding REP_SYSTEM_LOCK or without.
+ */
+static int
+__rep_find_dbs(env, version, context)
+	ENV *env;
+	u_int32_t version;
+	FILE_LIST_CTX *context;
+{
+	char **dirp, *path;
+	int ret;
+
+	path = NULL;
+	dirp = env->dbenv->db_data_dir;
+
+	if (dirp == NULL)
+		/* No data directories: only the environment home applies. */
+		ret = __rep_walk_dir(env, env->db_home, version, context);
+	else
+		for (ret = 0; *dirp != NULL; ++dirp) {
+			if ((ret = __db_appname(env,
+			    DB_APP_NONE, *dirp, NULL, &path)) != 0)
+				break;
+			if ((ret = __rep_walk_dir(env,
+			    path, version, context)) != 0)
+				break;
+			__os_free(env, path);
+			path = NULL;
+		}
+
+	/* A NULL directory asks for the in-memory named databases. */
+	if (ret == 0)
+		ret = __rep_walk_dir(env, NULL, version, context);
+
+	/* Left over only when the loop above was abandoned early. */
+	if (path != NULL)
+		__os_free(env, path);
+	return (ret);
+}
+
+/*
+ * __rep_walk_dir --
+ *
+ * This is the routine that walks a directory and fills in the structures
+ * that we use to generate messages to the client telling it what
+ * files are available. If the directory name is NULL, then we should
+ * walk the list of in-memory named files.
+ *
+ * Each usable database found is marshaled (in the format selected by
+ * version) at context->fillptr, which is then advanced; context->count is
+ * incremented and context->buf is doubled in size whenever the marshaled
+ * entry does not fit.
+ *
+ * Returns 0 on success, including when the scan stops early because the
+ * first database in the directory is already on the list.  Files for which
+ * __rep_get_fileinfo fails are skipped, except that DB_REP_PAGELOCKED is
+ * returned to the caller.  Errors from listing the names, from
+ * __rep_check_uid and from growing the buffer are also returned.
+ */
+static int
+__rep_walk_dir(env, dir, version, context)
+ ENV *env;
+ const char *dir;
+ u_int32_t version;
+ FILE_LIST_CTX *context;
+{
+ __rep_fileinfo_args tmpfp;
+ size_t avail, len;
+ int cnt, first_file, i, ret;
+ u_int8_t uid[DB_FILE_ID_LEN];
+ char *file, **names, *subdb;
+
+ if (dir == NULL) {
+ RPRINT(env, DB_VERB_REP_SYNC, (env,
+ "Walk_dir: Getting info for in-memory named files"));
+ if ((ret = __memp_inmemlist(env, &names, &cnt)) != 0)
+ return (ret);
+ } else {
+ RPRINT(env, DB_VERB_REP_SYNC, (env,
+ "Walk_dir: Getting info for dir: %s", dir));
+ if ((ret = __os_dirlist(env, dir, 0, &names, &cnt)) != 0)
+ return (ret);
+ }
+ RPRINT(env, DB_VERB_REP_SYNC, (env, "Walk_dir: Dir %s has %d files",
+ (dir == NULL) ? "INMEM" : dir, cnt));
+ first_file = 1;
+ for (i = 0; i < cnt; i++) {
+ RPRINT(env, DB_VERB_REP_SYNC, (env,
+ "Walk_dir: File %d name: %s", i, names[i]));
+ /*
+ * Skip DB-owned files: __db*, DB_CONFIG, log*
+ */
+ if (strncmp(names[i],
+ DB_REGION_PREFIX, sizeof(DB_REGION_PREFIX) - 1) == 0) {
+ /* Process partition files: "__dbp.*". */
+ if (names[i][sizeof(DB_REGION_PREFIX) - 1] != 'p')
+ continue;
+ }
+ if (strncmp(names[i], "DB_CONFIG", 9) == 0)
+ continue;
+ if (strncmp(names[i], "log.", 4) == 0)
+ continue;
+
+ /*
+ * We found a file to process.  An in-memory database is
+ * identified by its name alone, passed as the subdatabase with
+ * no file; an on-disk one is identified by its file name.
+ */
+ if (dir == NULL) {
+ file = NULL;
+ subdb = names[i];
+ } else {
+ file = names[i];
+ subdb = NULL;
+ }
+ if ((ret = __rep_get_fileinfo(env,
+ file, subdb, &tmpfp, uid)) != 0) {
+ /*
+ * If we find a file that isn't a database, skip it.
+ */
+ RPRINT(env, DB_VERB_REP_SYNC, (env,
+ "Walk_dir: File %d %s: returned error %s",
+ i, names[i], db_strerror(ret)));
+ if (ret == DB_REP_PAGELOCKED)
+ goto err;
+ ret = 0;
+ continue;
+ }
+ RPRINT(env, DB_VERB_REP_SYNC, (env,
+ "Walk_dir: File %s at 0x%lx: pgsize %lu, max_pgno %lu",
+ names[i], P_TO_ULONG(context->fillptr),
+ (u_long)tmpfp.pgsize, (u_long)tmpfp.max_pgno));
+
+ /*
+ * On the first time through the loop, check to see if the file
+ * we're about to add is already on the list. If it is, it must
+ * have been added in a previous call, and that means the
+ * directory we're currently scanning has already been scanned
+ * before. (This can happen if the user called
+ * env->set_data_dir() more than once for the same directory.)
+ * If that's the case, we're done: not only is it a waste of
+ * time to scan the same directory again, but doing so would
+ * result in the same files appearing in the list more than
+ * once.
+ */
+ if (first_file && dir != NULL &&
+ (ret = __rep_check_uid(env, context, version, uid)) != 0) {
+ if (ret == DB_KEYEXIST)
+ ret = 0;
+ goto err;
+ }
+ first_file = 0;
+
+ /*
+ * Finally we know that this file is a suitable database file
+ * that we haven't yet included on our list.
+ */
+ tmpfp.filenum = context->count++;
+
+ DB_SET_DBT(tmpfp.info, names[i], strlen(names[i]) + 1);
+ DB_SET_DBT(tmpfp.uid, uid, DB_FILE_ID_LEN);
+retry: avail = (size_t)(&context->buf[context->size] -
+ context->fillptr);
+ ret = __rep_fileinfo_marshal(env, version,
+ &tmpfp, context->fillptr, avail, &len);
+ if (ret == ENOMEM) {
+ /*
+ * Here, 'len' is the total space in use in the buffer.
+ * It is saved as an offset because the reallocation
+ * below may move the buffer, which would leave fillptr
+ * pointing into the old one.
+ */
+ len = (size_t)(context->fillptr - context->buf);
+ context->size *= 2;
+
+ if ((ret = __os_realloc(env,
+ context->size, &context->buf)) != 0)
+ goto err;
+ context->fillptr = context->buf + len;
+
+ /*
+ * Now that we've reallocated the space, try to
+ * store it again.
+ */
+ goto retry;
+ }
+ /*
+ * Here, 'len' (still) holds the length of the marshaled
+ * information about the current file (as filled in by the last
+ * call to __rep_fileinfo_marshal()).
+ */
+ context->fillptr += len;
+ }
+err:
+ /* Reached on success as well: the name list is always released. */
+ __os_dirfree(env, names, cnt);
+ return (ret);
+}
+
+/*
+ * __rep_check_uid --
+ *	Look for uid among the file entries marshaled so far in the context
+ *	buffer.  Each entry is unmarshaled in turn (in the format selected by
+ *	version) and its uid compared over DB_FILE_ID_LEN bytes.
+ *
+ *	Returns DB_KEYEXIST if an entry with the same uid is present, 0 if
+ *	none is, or the error from __rep_fileinfo_unmarshal if an entry could
+ *	not be decoded.
+ */
+static int
+__rep_check_uid(env, context, version, uid)
+	ENV *env;
+	FILE_LIST_CTX *context;
+	u_int32_t version;
+	u_int8_t *uid;
+{
+	__rep_fileinfo_args *entry;
+	size_t remaining;
+	u_int8_t *cursor;
+	u_int32_t n;
+	int matched, ret;
+
+	cursor = FIRST_FILE_PTR(context->buf);
+	for (n = 0; n < context->count; n++) {
+		entry = NULL;
+		remaining = (size_t)(context->fillptr - cursor);
+		if ((ret = __rep_fileinfo_unmarshal(env, version,
+		    &entry, cursor, remaining, &cursor)) != 0) {
+			__db_errx(env, "rep_check_uid: Could not malloc");
+			if (entry != NULL)
+				__os_free(env, entry);
+			return (ret);
+		}
+
+		matched = memcmp(entry->uid.data, uid, DB_FILE_ID_LEN) == 0;
+		if (matched) {
+			RPRINT(env, DB_VERB_REP_SYNC, (env,
+			    "Check_uid: Found matching file."));
+		}
+		/* Each decoded entry is a fresh allocation owned by us. */
+		__os_free(env, entry);
+		if (matched)
+			return (DB_KEYEXIST);
+	}
+	return (0);
+}
+
+/*
+ * __rep_get_fileinfo --
+ *	Open the database named by file/subdb read-only and describe it in
+ *	*rfp: pgno (always 0), max_pgno (the meta page's last_pgno, or 0 for
+ *	a queue database), pgsize, type, db_flags and finfo_flags (with
+ *	REPINFO_DB_LITTLEENDIAN set when the database's lorder is 1234).  The
+ *	database's file id is copied into uid, which must have room for
+ *	DB_FILE_ID_LEN bytes.  The filenum, uid and info members of *rfp are
+ *	not touched here.
+ *
+ *	Both the open and the meta-page read lock are attempted without
+ *	waiting, up to REP_META_RETRY times each; if either still cannot be
+ *	had, DB_REP_PAGELOCKED is returned.  Any other failure returns the
+ *	underlying error.  Everything acquired along the way (page, lock,
+ *	cursor, transaction, handle) is released before returning; the
+ *	transaction is always aborted.
+ */
+static int
+__rep_get_fileinfo(env, file, subdb, rfp, uid)
+	ENV *env;
+	const char *file, *subdb;
+	__rep_fileinfo_args *rfp;
+	u_int8_t *uid;
+{
+	DB *dbp;
+	DBC *dbc;
+	DBMETA *dbmeta;
+	DB_LOCK lk;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	PAGE *pagep;
+	int lorder, ret, retry, t_ret;
+
+	dbp = NULL;
+	dbc = NULL;
+	pagep = NULL;
+	txn = NULL;
+	LOCK_INIT(lk);
+
+	ENV_GET_THREAD_INFO(env, ip);
+
+	/*
+	 * If the meta page is locked, try a few times.  If we cannot
+	 * get it, return.
+	 */
+	for (retry = 0; retry < REP_META_RETRY; retry++) {
+		if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+			goto err;
+		if ((ret = __txn_begin(env, NULL, NULL, &txn,
+		    DB_TXN_NOWAIT)) != 0)
+			goto err;
+		if ((ret = __db_open(dbp, ip, txn, file, subdb, DB_UNKNOWN,
+		    DB_RDONLY | (F_ISSET(env, ENV_THREAD) ? DB_THREAD : 0),
+		    0, PGNO_BASE_MD)) != 0) {
+			RPRINT(env, DB_VERB_REP_SYNC,
+			    (env, "get_fileinfo: open error %d", ret));
+			/* Start over with a fresh handle and transaction. */
+			(void)__txn_abort(txn);
+			txn = NULL;
+			(void)__db_close(dbp, NULL, DB_NOSYNC);
+			dbp = NULL;
+			if (ret == DB_LOCK_DEADLOCK ||
+			    ret == DB_LOCK_NOTGRANTED) {
+				__os_yield(env, 1, 0);
+				RPRINT(env, DB_VERB_REP_SYNC,
+				    (env, "get_fileinfo: Try %d could not get meta lock for open", retry));
+				continue;
+			} else
+				goto err;
+		} else
+			break;
+	}
+	if (retry == REP_META_RETRY) {
+		ret = DB_REP_PAGELOCKED;
+		goto err;
+	}
+
+	if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+		goto err;
+	/*
+	 * If the meta page is locked, try a few times.  If we cannot
+	 * get it, return.
+	 */
+	for (retry = 0; retry < REP_META_RETRY; retry++) {
+		if ((ret = __db_lget(dbc, 0, dbp->meta_pgno,
+		    DB_LOCK_READ, DB_LOCK_NOWAIT, &lk)) != 0) {
+			if (ret == DB_LOCK_DEADLOCK ||
+			    ret == DB_LOCK_NOTGRANTED) {
+				RPRINT(env, DB_VERB_REP_SYNC,
+				    (env, "get_fileinfo: Try %d could not get meta lock", retry));
+				__os_yield(env, 1, 0);
+				continue;
+			} else
+				goto err;
+		} else
+			break;
+	}
+	if (retry == REP_META_RETRY) {
+		ret = DB_REP_PAGELOCKED;
+		goto err;
+	}
+	if ((ret = __memp_fget(dbp->mpf, &dbp->meta_pgno, ip, dbc->txn,
+	    0, &pagep)) != 0)
+		goto err;
+	/*
+	 * We have the meta page.  Set up our information.
+	 */
+	dbmeta = (DBMETA *)pagep;
+	rfp->pgno = 0;
+	/*
+	 * Queue is a special-case.  We need to set max_pgno to 0 so that
+	 * the client can compute the pages from the meta-data.
+	 */
+	if (dbp->type == DB_QUEUE)
+		rfp->max_pgno = 0;
+	else
+		rfp->max_pgno = dbmeta->last_pgno;
+	rfp->pgsize = dbp->pgsize;
+	memcpy(uid, dbp->fileid, DB_FILE_ID_LEN);
+	rfp->type = (u_int32_t)dbp->type;
+	rfp->db_flags = dbp->flags;
+	rfp->finfo_flags = 0;
+	/*
+	 * Send the lorder of this database.
+	 */
+	(void)__db_get_lorder(dbp, &lorder);
+	if (lorder == 1234)
+		FLD_SET(rfp->finfo_flags, REPINFO_DB_LITTLEENDIAN);
+	else
+		FLD_CLR(rfp->finfo_flags, REPINFO_DB_LITTLEENDIAN);
+
+	/*
+	 * Success falls through into the common exit path below, which
+	 * releases each resource exactly once: the page first, then the
+	 * lock, as the success path has always done.
+	 */
+err:
+	/*
+	 * The page, when held, came from the handle's own mpool file; a
+	 * non-NULL pagep implies both dbp and dbc are valid.
+	 */
+	if (pagep != NULL && (t_ret = __memp_fput(dbp->mpf,
+	    ip, pagep, dbc->priority)) != 0 && ret == 0)
+		ret = t_ret;
+	/* The lock can only have been acquired once the cursor existed. */
+	if (dbc != NULL && (t_ret = __LPUT(dbc, lk)) != 0 && ret == 0)
+		ret = t_ret;
+	if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+	if (txn != NULL)
+		(void)__txn_abort(txn);
+	if (dbp != NULL && (t_ret = __db_close(dbp, NULL, 0)) != 0 && ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+
+/*
+ * __rep_page_req
+ * Handle a request for pages: decode the file information carried in
+ * the message, open that file through mpool and send its pages back to
+ * the requesting site.
+ *
+ * PUBLIC: int __rep_page_req __P((ENV *,
+ * PUBLIC: DB_THREAD_INFO *, int, __rep_control_args *, DBT *));
+ */
+int
+__rep_page_req(env, ip, eid, rp, rec)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ int eid;
+ __rep_control_args *rp;
+ DBT *rec;
+{
+ __rep_fileinfo_args *msgfp;
+ DB_MPOOLFILE *mpf;
+ DB_REP *db_rep;
+ REP *rep;
+ int ret, t_ret;
+ u_int8_t *next;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ /* Nothing has been acquired yet, so a decode failure just returns. */
+ ret = __rep_fileinfo_unmarshal(env, rp->rep_version,
+ &msgfp, rec->data, rec->size, &next);
+ if (ret != 0)
+ return (ret);
+
+ RPRINT(env, DB_VERB_REP_SYNC,
+ (env, "page_req: file %d page %lu to %lu",
+ msgfp->filenum, (u_long)msgfp->pgno, (u_long)msgfp->max_pgno));
+
+ /*
+ * The file has to be opened before its pages can be sent. When the
+ * open fails, a master answers with REP_FILE_FAIL; any other site
+ * reports DB_NOTFOUND to its caller instead.
+ */
+ RPRINT(env, DB_VERB_REP_SYNC,
+ (env, "page_req: Open %d via mpf_open", msgfp->filenum));
+ ret = __rep_mpf_open(env, &mpf, msgfp, 0);
+ if (ret == 0) {
+ /* Send, then always close; the first failure wins. */
+ ret = __rep_page_sendpages(env, ip, eid, rp, msgfp, mpf, NULL);
+ if ((t_ret = __memp_fclose(mpf, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ } else {
+ RPRINT(env, DB_VERB_REP_SYNC,
+ (env, "page_req: Open %d failed", msgfp->filenum));
+ if (!F_ISSET(rep, REP_F_MASTER))
+ ret = DB_NOTFOUND;
+ else
+ (void)__rep_send_message(env, eid, REP_FILE_FAIL,
+ NULL, rec, 0, 0);
+ }
+
+ __os_free(env, msgfp);
+ return (ret);
+}
+
+/*
+ * __rep_page_sendpages --
+ * Send pages msgfp->pgno through msgfp->max_pgno of the file open on
+ * mpf to site eid. Pages go out one message at a time, or are collected
+ * in a bulk buffer when REP_C_BULK is configured.
+ *
+ * Each page is read-locked with DB_LOCK_NOWAIT before it is fetched; a
+ * page whose lock cannot be granted is skipped. For a queue database a
+ * DB handle is needed to fetch the non-meta pages: the caller's dbp is
+ * used if one is given, otherwise the database is opened here and closed
+ * again before returning.
+ *
+ * Returns 0 or an error; DB_REP_UNAVAIL from sending is not treated as
+ * an error.
+ */
+static int
+__rep_page_sendpages(env, ip, eid, rp, msgfp, mpf, dbp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ int eid;
+ __rep_control_args *rp;
+ __rep_fileinfo_args *msgfp;
+ DB_MPOOLFILE *mpf;
+ DB *dbp;
+{
+ DB *qdbp;
+ DBC *qdbc;
+ DBT lockdbt, msgdbt;
+ DB_LOCK lock;
+ DB_LOCKER *locker;
+ DB_LOCK_ILOCK lock_obj;
+ DB_LOG *dblp;
+ DB_LSN lsn;
+ DB_REP *db_rep;
+ PAGE *pagep;
+ REP *rep;
+ REP_BULK bulk;
+ REP_THROTTLE repth;
+ db_pgno_t p;
+ uintptr_t bulkoff;
+ size_t len, msgsz;
+ u_int32_t bulkflags, use_bulk;
+ int opened, ret, t_ret;
+ u_int8_t *buf;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ locker = NULL;
+ opened = 0;
+ t_ret = 0;
+ qdbp = NULL;
+ qdbc = NULL;
+ buf = NULL;
+ bulk.addr = NULL;
+ use_bulk = FLD_ISSET(rep->config, REP_C_BULK);
+ if (msgfp->type == (u_int32_t)DB_QUEUE) {
+ if (dbp == NULL) {
+ if ((ret = __db_create_internal(&qdbp, env, 0)) != 0)
+ goto err;
+ /*
+ * We need to check whether this is in-memory so that
+ * we pass the name correctly as either the file or
+ * the database name.
+ */
+ if ((ret = __db_open(qdbp, ip, NULL,
+ FLD_ISSET(msgfp->db_flags, DB_AM_INMEM) ?
+ NULL : msgfp->info.data,
+ FLD_ISSET(msgfp->db_flags, DB_AM_INMEM) ?
+ msgfp->info.data : NULL,
+ DB_UNKNOWN,
+ DB_RDONLY | (F_ISSET(env, ENV_THREAD) ? DB_THREAD : 0),
+ 0, PGNO_BASE_MD)) != 0)
+ goto err;
+ opened = 1;
+ } else
+ qdbp = dbp;
+ if ((ret = __db_cursor(qdbp, ip, NULL, &qdbc, 0)) != 0)
+ goto err;
+ }
+ /*
+ * One buffer, reused for every page: room for the fixed part of a
+ * marshaled fileinfo, a file ID and a single page.
+ */
+ msgsz = __REP_FILEINFO_SIZE + DB_FILE_ID_LEN + msgfp->pgsize;
+ if ((ret = __os_calloc(env, 1, msgsz, &buf)) != 0)
+ goto err;
+ memset(&msgdbt, 0, sizeof(msgdbt));
+ RPRINT(env, DB_VERB_REP_SYNC,
+ (env, "sendpages: file %d page %lu to %lu",
+ msgfp->filenum, (u_long)msgfp->pgno, (u_long)msgfp->max_pgno));
+ memset(&repth, 0, sizeof(repth));
+ /*
+ * If we're doing bulk transfer, allocate a bulk buffer to put our
+ * pages in. We still need to initialize the throttle info
+ * because if we encounter a page larger than our entire bulk
+ * buffer, we need to send it as a singleton.
+ *
+ * Use a local var so that we don't need to worry if someone else
+ * turns on/off bulk in the middle of our call here.
+ */
+ if (use_bulk && (ret = __rep_bulk_alloc(env, &bulk, eid,
+ &bulkoff, &bulkflags, REP_BULK_PAGE)) != 0)
+ goto err;
+ REP_SYSTEM_LOCK(env);
+ repth.gbytes = rep->gbytes;
+ repth.bytes = rep->bytes;
+ repth.type = REP_PAGE;
+ repth.data_dbt = &msgdbt;
+ REP_SYSTEM_UNLOCK(env);
+
+ /*
+ * Set up locking.
+ */
+ LOCK_INIT(lock);
+ memset(&lock_obj, 0, sizeof(lock_obj));
+ if ((ret = __lock_id(env, NULL, &locker)) != 0)
+ goto err;
+ memcpy(lock_obj.fileid, mpf->fileid, DB_FILE_ID_LEN);
+ lock_obj.type = DB_PAGE_LOCK;
+
+ memset(&lockdbt, 0, sizeof(lockdbt));
+ lockdbt.data = &lock_obj;
+ lockdbt.size = sizeof(lock_obj);
+
+ for (p = msgfp->pgno; p <= msgfp->max_pgno; p++) {
+ /*
+ * We're not waiting for the lock, if we cannot get
+ * the lock for this page, skip it. The gap
+ * code will rerequest it.
+ */
+ lock_obj.pgno = p;
+ if ((ret = __lock_get(env, locker, DB_LOCK_NOWAIT, &lockdbt,
+ DB_LOCK_READ, &lock)) != 0) {
+ /*
+ * Continue if we couldn't get the lock.
+ */
+ if (ret == DB_LOCK_DEADLOCK ||
+ ret == DB_LOCK_NOTGRANTED) {
+ ret = 0;
+ continue;
+ }
+ /*
+ * Otherwise we have an error.
+ */
+ goto err;
+ }
+ /*
+ * Queue pages other than page 0 are fetched through the queue
+ * access method; everything else comes straight from mpool.
+ */
+ if (msgfp->type == (u_int32_t)DB_QUEUE && p != 0)
+#ifdef HAVE_QUEUE
+ ret = __qam_fget(qdbc, &p, DB_MPOOL_CREATE, &pagep);
+#else
+ ret = DB_PAGE_NOTFOUND;
+#endif
+ else
+ ret = __memp_fget(mpf, &p, ip, NULL,
+ DB_MPOOL_CREATE, &pagep);
+ msgfp->pgno = p;
+ if (ret == DB_PAGE_NOTFOUND) {
+ ZERO_LSN(lsn);
+ if (F_ISSET(rep, REP_F_MASTER)) {
+ ret = 0;
+ RPRINT(env, DB_VERB_REP_SYNC, (env,
+ "sendpages: PAGE_FAIL on page %lu",
+ (u_long)p));
+ (void)__rep_send_message(env, eid,
+ REP_PAGE_FAIL, &lsn, &msgdbt, 0, 0);
+ } else
+ ret = DB_NOTFOUND;
+ goto lockerr;
+ } else if (ret != 0)
+ goto lockerr;
+ else
+ DB_SET_DBT(msgfp->info, pagep, msgfp->pgsize);
+ len = 0;
+ /*
+ * Send along an indication of the byte order of this mpool
+ * page. Since mpool always keeps pages in the native byte
+ * order of the local environment, this is simply my
+ * environment's byte order.
+ *
+ * Since pages can be served from a variety of sites when using
+ * client-to-client synchronization, the receiving client needs
+ * to know the byte order of each page independently.
+ */
+ if (F_ISSET(env, ENV_LITTLEENDIAN))
+ FLD_SET(msgfp->finfo_flags, REPINFO_PG_LITTLEENDIAN);
+ else
+ FLD_CLR(msgfp->finfo_flags, REPINFO_PG_LITTLEENDIAN);
+ RPRINT(env, DB_VERB_REP_SYNC, (env,
+ "sendpages: %lu, page lsn [%lu][%lu]", (u_long)p,
+ (u_long)pagep->lsn.file, (u_long)pagep->lsn.offset));
+ /*
+ * Marshal the page into buf, then give the page and its lock
+ * back before sending; the first error from any of the three
+ * steps is the one reported.
+ */
+ ret = __rep_fileinfo_marshal(env, rp->rep_version,
+ msgfp, buf, msgsz, &len);
+ if (msgfp->type != (u_int32_t)DB_QUEUE || p == 0)
+ t_ret = __memp_fput(mpf,
+ ip, pagep, DB_PRIORITY_UNCHANGED);
+#ifdef HAVE_QUEUE
+ else
+ /*
+ * We don't need an #else for HAVE_QUEUE here because if
+ * we're not compiled with queue, then we're guaranteed
+ * to have set REP_PAGE_FAIL above.
+ */
+ t_ret = __qam_fput(qdbc, p, pagep, qdbp->priority);
+#endif
+ if (t_ret != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __ENV_LPUT(env, lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+
+ DB_ASSERT(env, len <= msgsz);
+ DB_SET_DBT(msgdbt, buf, len);
+
+ /* Stamp the message with the current end-of-log LSN. */
+ dblp = env->lg_handle;
+ LOG_SYSTEM_LOCK(env);
+ repth.lsn = ((LOG *)dblp->reginfo.primary)->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ /*
+ * If we are configured for bulk, try to send this as a bulk
+ * request. If not configured, or it is too big for bulk
+ * then just send normally.
+ */
+ if (use_bulk)
+ ret = __rep_bulk_message(env, &bulk, &repth,
+ &repth.lsn, &msgdbt, 0);
+ if (!use_bulk || ret == DB_REP_BULKOVF)
+ ret = __rep_send_throttle(env, eid, &repth, 0, 0);
+ RPRINT(env, DB_VERB_REP_SYNC, (env,
+ "sendpages: %lu, lsn [%lu][%lu]", (u_long)p,
+ (u_long)repth.lsn.file, (u_long)repth.lsn.offset));
+ /*
+ * If we have REP_PAGE_MORE we need to break this loop.
+ * Otherwise, with REP_PAGE, we keep going.
+ */
+ if (repth.type == REP_PAGE_MORE || ret != 0) {
+ /* Ignore send failure, except to break the loop. */
+ if (ret == DB_REP_UNAVAIL)
+ ret = 0;
+ break;
+ }
+ }
+
+ /*
+ * The lockerr label is entered only by goto, with the page lock still
+ * held: release it, then fall through to the common cleanup.
+ */
+ if (0) {
+lockerr: if ((t_ret = __ENV_LPUT(env, lock)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+err:
+ /*
+ * We're done, force out whatever remains in the bulk buffer and
+ * free it.
+ */
+ if (use_bulk && bulk.addr != NULL &&
+ (t_ret = __rep_bulk_free(env, &bulk, 0)) != 0 && ret == 0 &&
+ t_ret != DB_REP_UNAVAIL)
+ ret = t_ret;
+ if (qdbc != NULL && (t_ret = __dbc_close(qdbc)) != 0 && ret == 0)
+ ret = t_ret;
+ /* Only close the queue handle if it was opened here, not the caller's. */
+ if (opened && (t_ret = __db_close(qdbp, NULL, DB_NOSYNC)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if (buf != NULL)
+ __os_free(env, buf);
+ if (locker != NULL && (t_ret = __lock_id_free(env,
+ locker)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __rep_update_setup
+ * Process and setup with this file information.
+ *
+ * Handles the UPDATE message (in rec) that starts the page-recovery
+ * phase of internal init. Does nothing and returns 0 unless the site is
+ * in the RECOVER_UPDATE state and not in an election. Otherwise it
+ * moves to RECOVER_PAGE, locks out messages and the API, resets the
+ * log-related LSNs, records the master's file list, removes local logs
+ * and databases (or only the in-memory databases for an abbreviated
+ * init) and requests the first file.
+ *
+ * On any error the site is put back into the RECOVER_UPDATE state and
+ * the error is returned.
+ *
+ * PUBLIC: int __rep_update_setup __P((ENV *, int, __rep_control_args *,
+ * PUBLIC: DBT *, time_t));
+ */
+int
+__rep_update_setup(env, eid, rp, rec, savetime)
+ ENV *env;
+ int eid;
+ __rep_control_args *rp;
+ DBT *rec;
+ time_t savetime;
+{
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ LOG *lp;
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ __rep_update_args *rup;
+ __rep_fileinfo_args *finfo;
+ DB_LSN verify_lsn;
+ size_t max;
+ int found, ret;
+ u_int32_t count;
+ u_int8_t *end, *next;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ ret = 0;
+
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ verify_lsn = lp->verify_lsn;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ REP_SYSTEM_LOCK(env);
+ if (!F_ISSET(rep, REP_F_RECOVER_UPDATE) || IN_ELECTION(rep)) {
+ REP_SYSTEM_UNLOCK(env);
+ return (0);
+ }
+ F_CLR(rep, REP_F_RECOVER_UPDATE);
+
+ if ((ret = __rep_update_unmarshal(env, rp->rep_version,
+ &rup, rec->data, rec->size, &next)) != 0) {
+ /*
+ * We cannot use the err label here because rup has not been
+ * set up and must not be freed. Put back the RECOVER_UPDATE
+ * state we just cleared, as the err path does for every other
+ * failure, and drop the region lock before returning.
+ */
+ F_SET(rep, REP_F_RECOVER_UPDATE);
+ REP_SYSTEM_UNLOCK(env);
+ return (ret);
+ }
+ DB_ASSERT(env, next == FIRST_FILE_PTR((u_int8_t*)rec->data));
+ end = &((u_int8_t*)rec->data)[rec->size];
+
+ /*
+ * If we're doing an abbreviated internal init, it's because we found a
+ * sync point but we needed to materialize any NIMDBs. However, if we
+ * now see that there are no NIMDBs we can just skip to verify_match,
+ * just as we would have done if we had already loaded the NIMDBs. In
+ * other words, if there are no NIMDBs, then I can trivially say that
+ * I've already loaded all of them! The whole abbreviated internal init
+ * turns out not to have been necessary after all.
+ */
+ if (F_ISSET(rep, REP_F_ABBREVIATED)) {
+ count = rup->num_files;
+ found = 0;
+ while (count-- > 0) {
+ max = (size_t)(end - next);
+ if ((ret = __rep_fileinfo_unmarshal(env,
+ rp->rep_version, &finfo, next, max, &next)) != 0)
+ goto err;
+ found = FLD_ISSET(finfo->db_flags, DB_AM_INMEM);
+ __os_free(env, finfo);
+ if (found)
+ break;
+ }
+ if (!found) {
+ /*
+ * Revert to VERIFY state, so that we can pick up where
+ * we left off, except that from now on (i.e., future
+ * master changes) we can skip checking for NIMDBs if we
+ * find a sync point.
+ */
+ F_SET(rep, REP_F_NIMDBS_LOADED | REP_F_RECOVER_VERIFY);
+ F_CLR(rep, REP_F_ABBREVIATED);
+
+ REP_SYSTEM_UNLOCK(env);
+ ret = __rep_verify_match(env, &verify_lsn, savetime);
+ __os_free(env, rup);
+ return (ret);
+ }
+ }
+
+ /*
+ * We know we're the first to come in here due to the
+ * REP_F_RECOVER_UPDATE flag.
+ */
+ F_SET(rep, REP_F_RECOVER_PAGE);
+ /*
+ * We should not ever be in internal init with a lease granted.
+ */
+ DB_ASSERT(env,
+ !IS_USING_LEASES(env) || __rep_islease_granted(env) == 0);
+
+ /*
+ * We do not clear REP_F_READY_* in this code.
+ * We'll eventually call the normal __rep_verify_match recovery
+ * code and that will clear all the flags and allow others to
+ * proceed. We lockout both the messages and API here.
+ * We lockout messages briefly because we are about to reset
+ * all our LSNs and we do not want another thread possibly
+ * using/needing those. We have to lockout the API for
+ * the duration of internal init.
+ */
+ if ((ret = __rep_lockout_msg(env, rep, 1)) != 0)
+ goto err;
+
+ if ((ret = __rep_lockout_api(env, rep)) != 0)
+ goto err;
+ /*
+ * We need to update the timestamp and kill any open handles
+ * on this client. The files are changing completely.
+ */
+ infop = env->reginfo;
+ renv = infop->primary;
+ (void)time(&renv->rep_timestamp);
+
+ REP_SYSTEM_UNLOCK(env);
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ lp->wait_ts = rep->request_gap;
+ ZERO_LSN(lp->ready_lsn);
+ ZERO_LSN(lp->verify_lsn);
+ ZERO_LSN(lp->prev_ckp);
+ ZERO_LSN(lp->waiting_lsn);
+ ZERO_LSN(lp->max_wait_lsn);
+ ZERO_LSN(lp->max_perm_lsn);
+ if (db_rep->rep_db == NULL)
+ ret = __rep_client_dbinit(env, 0, REP_DB);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ /* The region lock is not held here, hence err_nolock rather than err. */
+ if (ret != 0)
+ goto err_nolock;
+
+ /*
+ * We need to empty out any old log records that might be in the
+ * temp database.
+ */
+ ENV_GET_THREAD_INFO(env, ip);
+ if ((ret = __db_truncate(db_rep->rep_db, ip, NULL, &count)) != 0)
+ goto err_nolock;
+ rep->stat.st_log_queued = 0;
+
+ REP_SYSTEM_LOCK(env);
+ if (F_ISSET(rep, REP_F_ABBREVIATED)) {
+ /*
+ * For an abbreviated internal init, the place from which we'll
+ * want to request master's logs after (NIMDB) pages are loaded
+ * is precisely the sync point we found during VERIFY. We'll
+ * roll back to there in a moment.
+ *
+ * We don't need first_vers, because it's only used with
+ * __log_newfile, which only happens with non-ABBREVIATED
+ * internal init.
+ */
+ rep->first_lsn = verify_lsn;
+ } else {
+ /*
+ * We will remove all logs we have so we need to request
+ * from the master's beginning.
+ */
+ rep->first_lsn = rup->first_lsn;
+ rep->first_vers = rup->first_vers;
+ }
+ rep->last_lsn = rp->lsn;
+ rep->nfiles = rup->num_files;
+
+ RPRINT(env, DB_VERB_REP_SYNC,
+ (env, "Update setup for %d files.", rep->nfiles));
+ RPRINT(env, DB_VERB_REP_SYNC,
+ (env, "Update setup: First LSN [%lu][%lu].",
+ (u_long)rep->first_lsn.file, (u_long)rep->first_lsn.offset));
+ RPRINT(env, DB_VERB_REP_SYNC,
+ (env, "Update setup: Last LSN [%lu][%lu]",
+ (u_long)rep->last_lsn.file, (u_long)rep->last_lsn.offset));
+
+ /*
+ * Keep a private copy of the marshaled file list that follows the
+ * update header in the message; nextinfo starts at its beginning.
+ */
+ if (rep->nfiles > 0) {
+ rep->infoversion = rp->rep_version;
+ rep->originfolen = rep->infolen =
+ rec->size - __REP_UPDATE_SIZE;
+ if ((ret = __os_calloc(env, 1, rep->infolen,
+ &rep->originfo)) != 0)
+ goto err;
+ memcpy(rep->originfo,
+ FIRST_FILE_PTR((u_int8_t*)rec->data), rep->infolen);
+ rep->nextinfo = rep->originfo;
+ }
+
+ /*
+ * Clear the decks to make room for the logs and databases that we will
+ * request as part of this internal init. For a normal, full internal
+ * init, that means all logs and databases. For an abbreviated internal
+ * init, it means only the NIMDBs, and only that portion of the log
+ * after the sync point.
+ */
+ if (F_ISSET(rep, REP_F_ABBREVIATED)) {
+ /*
+ * Note that in order to pare the log back to the sync point, we
+ * can't just crudely hack it off there. We need to make sure
+ * that pages in regular databases get rolled back to a state
+ * consistent with that sync point. So we have to do a real
+ * recovery step.
+ */
+ if ((ret = __rep_rollback(env, &rep->first_lsn)) != 0)
+ goto err;
+ ret = __rep_remove_nimdbs(env);
+ } else
+ ret = __rep_remove_all(env, rp->rep_version, rec);
+ if (ret != 0)
+ goto err;
+ F_CLR(rep, REP_F_READY_MSG);
+
+ rep->curfile = 0;
+ ret = __rep_nextfile(env, eid, rep);
+ if (ret != 0)
+ goto err;
+
+ /*
+ * err_nolock is entered only by goto, from places where the region
+ * lock is not held; take it so that err can always release it.
+ */
+ if (0) {
+err_nolock: REP_SYSTEM_LOCK(env);
+ }
+
+err: /*
+ * If we get an error, we cannot leave ourselves in the RECOVER_PAGE
+ * state because we have no file information. That also means undo'ing
+ * the rep_lockout. We need to move back to the RECOVER_UPDATE stage.
+ * In the non-error path, we will have already cleared READY_MSG, but it
+ * doesn't hurt to clear it again.
+ */
+ F_CLR(rep, REP_F_READY_MSG);
+ if (ret != 0) {
+ if (rep->originfo != NULL) {
+ __os_free(env, rep->originfo);
+ rep->originfo = NULL;
+ }
+ RPRINT(env, DB_VERB_REP_SYNC, (env,
+ "Update_setup: Error: Clear PAGE, set UPDATE again. %s",
+ db_strerror(ret)));
+ F_CLR(rep, REP_F_RECOVER_PAGE | REP_F_READY_API |
+ REP_F_READY_OP);
+ F_SET(rep, REP_F_RECOVER_UPDATE);
+ }
+ REP_SYSTEM_UNLOCK(env);
+ __os_free(env, rup);
+ return (ret);
+}
+
+/*
+ * __rep_remove_nimdbs --
+ * Remove every in-memory database (NIMDB) that currently exists. This
+ * runs at the start of an abbreviated internal init, while the existing
+ * NIMDBs are still intact, so the list walk_dir builds can be relied on.
+ */
+static int
+__rep_remove_nimdbs(env)
+ ENV *env;
+{
+ __rep_fileinfo_args *finfo;
+ FILE_LIST_CTX context;
+ size_t max;
+ u_int8_t *fp;
+ int ret;
+
+ finfo = NULL;
+
+ /* Scratch buffer that walk_dir fills with marshaled file entries. */
+ ret = __os_calloc(env, 1, MEGABYTE, &context.buf);
+ if (ret != 0)
+ return (ret);
+ context.fillptr = context.buf;
+ context.size = MEGABYTE;
+ context.count = 0;
+
+ /*
+ * Passing NULL makes walk_dir consider only in-memory databases.
+ * Once the list is built, close our files before removing anything.
+ */
+ if ((ret = __rep_walk_dir(env, NULL, DB_REPVERSION, &context)) != 0 ||
+ (ret = __rep_closefiles(env)) != 0)
+ goto out;
+
+ /* Decode one entry at a time and remove the database it names. */
+ for (fp = context.buf; context.count > 0; context.count--) {
+ max = (size_t)(context.fillptr - fp);
+ ret = __rep_fileinfo_unmarshal(env, DB_REPVERSION,
+ &finfo, fp, max, &fp);
+ if (ret != 0)
+ break;
+ ret = __rep_remove_file(env, finfo->uid.data,
+ finfo->info.data, finfo->type, finfo->db_flags);
+ if (ret != 0)
+ break;
+ __os_free(env, finfo);
+ finfo = NULL;
+ }
+
+out:
+ /* An entry is still allocated here only if the loop stopped early. */
+ if (finfo != NULL)
+ __os_free(env, finfo);
+ __os_free(env, context.buf);
+ return (ret);
+}
+
+/*
+ * Removes all existing logs and databases, at the start of internal init. But
+ * before we do, write a list of the databases onto the init file, so that in
+ * case we crash in the middle, we'll know how to resume when we restart.
+ * Finally, also write into the init file the UPDATE message from the master (in
+ * the "rec" DBT), which includes the (new) list of databases we intend to
+ * request copies of (again, so that we know what to do if we crash in the
+ * middle).
+ *
+ * For the sake of simplicity, these database lists are in the form of an UPDATE
+ * message (since we already have the mechanisms in place), even though strictly
+ * speaking that contains more information than we really need to store.
+ *
+ * msg_version is the message version of "rec"; it is written to the init file
+ * ahead of the master's list. The list generated here is always written with
+ * DB_REPVERSION. Nothing is written to the init file when REP_C_INMEM is
+ * configured.
+ *
+ * Returns 0 on success, or the first error encountered.
+ *
+ * !!! Must be called with the REP_SYSTEM_LOCK held.
+ */
+static int
+__rep_remove_all(env, msg_version, rec)
+ ENV *env;
+ u_int32_t msg_version;
+ DBT *rec;
+{
+ FILE_LIST_CTX context;
+ __rep_fileinfo_args *finfo;
+ __rep_update_args u_args;
+ DB_FH *fhp;
+ DB_REP *db_rep;
+ REP *rep;
+ size_t cnt, max, updlen;
+ u_int32_t bufsz, fvers, mvers, zero;
+ u_int8_t *fp;
+ int ret, t_ret;
+ char *fname;
+
+ finfo = NULL;
+ fname = NULL;
+ fhp = NULL;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ /*
+ * 1. Get list of databases currently present at this client, which we
+ * intend to remove.
+ */
+ if ((ret = __os_calloc(env, 1, MEGABYTE, &context.buf)) != 0)
+ return (ret);
+ context.size = MEGABYTE;
+ context.count = 0;
+
+ /* Reserve space for the marshaled update_args. */
+ context.fillptr = FIRST_FILE_PTR(context.buf);
+
+ if ((ret = __rep_find_dbs(env, DB_REPVERSION, &context)) != 0)
+ goto out;
+ /*
+ * Now that the number of files is known, fill in the update header
+ * at the front of the buffer, in the space reserved above.
+ */
+ ZERO_LSN(u_args.first_lsn);
+ u_args.first_vers = 0;
+ u_args.num_files = context.count;
+ if ((ret = __rep_update_marshal(env, DB_REPVERSION,
+ &u_args, context.buf, __REP_UPDATE_SIZE, &updlen)) != 0)
+ goto out;
+ DB_ASSERT(env, updlen == __REP_UPDATE_SIZE);
+
+ /*
+ * 2. Before removing anything, safe-store the database list, so that in
+ * case we crash before we've removed them all, when we restart we
+ * can clean up what we were doing. Only write database list to
+ * file if not running in-memory replication.
+ *
+ * The original version of the file contains:
+ * data1 size (4 bytes)
+ * data1
+ * data2 size (possibly) (4 bytes)
+ * data2 (possibly)
+ *
+ * As of 4.7 the file has the following form:
+ * 0 (4 bytes - to indicate a new style file)
+ * file version (4 bytes)
+ * data1 version (4 bytes)
+ * data1 size (4 bytes)
+ * data1
+ * data2 version (possibly) (4 bytes)
+ * data2 size (possibly) (4 bytes)
+ * data2 (possibly)
+ */
+ if (!FLD_ISSET(rep->config, REP_C_INMEM)) {
+ if ((ret = __db_appname(env,
+ DB_APP_NONE, REP_INITNAME, NULL, &fname)) != 0)
+ goto out;
+ /* Sanity check that the write size fits into 32 bits. */
+ DB_ASSERT(env, (size_t)(context.fillptr - context.buf) ==
+ (u_int32_t)(context.fillptr - context.buf));
+ bufsz = (u_int32_t)(context.fillptr - context.buf);
+
+ /*
+ * (Short writes aren't possible, so we don't have to verify
+ * 'cnt'.) This first list is generated internally, so it is
+ * always in the form of the current message version.
+ */
+ zero = 0;
+ fvers = REP_INITVERSION;
+ mvers = DB_REPVERSION;
+ if ((ret = __os_open(env, fname, 0,
+ DB_OSO_CREATE | DB_OSO_TRUNC, DB_MODE_600, &fhp)) != 0 ||
+ (ret =
+ __os_write(env, fhp, &zero, sizeof(zero), &cnt)) != 0 ||
+ (ret =
+ __os_write(env, fhp, &fvers, sizeof(fvers), &cnt)) != 0 ||
+ (ret =
+ __os_write(env, fhp, &mvers, sizeof(mvers), &cnt)) != 0 ||
+ (ret =
+ __os_write(env, fhp, &bufsz, sizeof(bufsz), &cnt)) != 0 ||
+ (ret =
+ __os_write(env, fhp, context.buf, bufsz, &cnt)) != 0 ||
+ (ret = __os_fsync(env, fhp)) != 0) {
+ __db_err(env, ret, "%s", fname);
+ goto out;
+ }
+ }
+
+ /*
+ * 3. Go ahead and remove logs and databases. The databases get removed
+ * according to the list we just finished safe-storing.
+ *
+ * Clearing NIMDBS_LOADED might not really be necessary, since once
+ * we've committed to removing all there's no chance of doing an
+ * abbreviated internal init. This just keeps us honest.
+ */
+ if ((ret = __rep_remove_logs(env)) != 0)
+ goto out;
+ if ((ret = __rep_closefiles(env)) != 0)
+ goto out;
+ F_CLR(rep, REP_F_NIMDBS_LOADED);
+ /* Walk the file entries that follow the update header in the buffer. */
+ fp = FIRST_FILE_PTR(context.buf);
+ while (context.count-- > 0) {
+ max = (size_t)(context.fillptr - fp);
+ if ((ret = __rep_fileinfo_unmarshal(env, DB_REPVERSION,
+ &finfo, fp, max, &fp)) != 0)
+ goto out;
+ if ((ret = __rep_remove_file(env, finfo->uid.data,
+ finfo->info.data, finfo->type, finfo->db_flags)) != 0)
+ goto out;
+ __os_free(env, finfo);
+ finfo = NULL;
+ }
+
+ /*
+ * 4. Safe-store the (new) list of database files we intend to copy from
+ * the master (again, so that in case we crash before we're finished
+ * doing so, we'll have enough information to clean up and start over
+ * again). This list is the list from the master, so it uses
+ * the message version. Only write to file if not running
+ * in-memory replication.
+ */
+ if (!FLD_ISSET(rep->config, REP_C_INMEM)) {
+ mvers = msg_version;
+ if ((ret =
+ __os_write(env, fhp, &mvers, sizeof(mvers), &cnt)) != 0 ||
+ (ret = __os_write(env, fhp,
+ &rec->size, sizeof(rec->size), &cnt)) != 0 ||
+ (ret =
+ __os_write(env, fhp, rec->data, rec->size, &cnt)) != 0 ||
+ (ret = __os_fsync(env, fhp)) != 0) {
+ __db_err(env, ret, "%s", fname);
+ goto out;
+ }
+ }
+
+out:
+ /*
+ * Common exit: close the init file if it was opened and release
+ * everything allocated here, keeping the first error in ret.
+ */
+ if (fhp != NULL && (t_ret = __os_closehandle(env, fhp)) && ret == 0)
+ ret = t_ret;
+ if (fname != NULL)
+ __os_free(env, fname);
+ if (finfo != NULL)
+ __os_free(env, finfo);
+ __os_free(env, context.buf);
+ return (ret);
+}
+
+/*
+ * __rep_remove_logs -
+ * Throw away this site's log in preparation for internal init: unlink
+ * the on-disk log files, or reset the log space when logging in memory.
+ */
+static int
+__rep_remove_logs(env)
+ ENV *env;
+{
+ DB_LOG *dblp;
+ DB_LSN lsn;
+ LOG *lp;
+ u_int32_t fnum, lastfile;
+ int ret;
+ char *name;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ /*
+ * Flush first, so nothing is left only in memory when the files on
+ * disk go away. memp_sync does not flush the log when there are no
+ * dirty pages, although the log buffers may still be dirty, so
+ * follow it with an explicit __log_flush to cover that rare case.
+ */
+ ret = __memp_sync_int(env,
+ NULL, 0, DB_SYNC_CACHE | DB_SYNC_INTERRUPT_OK, NULL, NULL);
+ if (ret != 0)
+ return (ret);
+ ret = __log_flush(env, NULL);
+ if (ret != 0)
+ return (ret);
+
+ if (!lp->db_log_inmemory) {
+ /*
+ * On-disk log: forcibly unlink every log file from number 1
+ * through the current one. Unlink failures are ignored.
+ */
+ lastfile = lp->lsn.file;
+ fnum = 1;
+ while (fnum <= lastfile) {
+ ret = __log_name(dblp, fnum, &name, NULL, 0);
+ if (ret != 0)
+ return (ret);
+ (void)time(&lp->timestamp);
+ (void)__os_unlink(env, name, 0);
+ __os_free(env, name);
+ fnum++;
+ }
+ return (0);
+ }
+
+ /* In-memory log: just reset the log space. */
+ ZERO_LSN(lsn);
+ return (__log_zero(env, &lsn));
+}
+
+/*
+ * Removes a file during internal init. Assumes underlying subsystems are
+ * active; therefore, this can't be used for internal init crash recovery.
+ *
+ * uid is the file ID handed to __fop_remove, name is the file (or, for an
+ * in-memory database, the database) name, type is the access method and
+ * flags are the database's flags, of which only DB_AM_INMEM is examined
+ * here (LF_ISSET presumably tests the local "flags" argument -- confirm
+ * against the macro definition).
+ *
+ * Returns 0 on success, or the first error encountered.
+ */
+static int
+__rep_remove_file(env, uid, name, type, flags)
+ ENV *env;
+ u_int8_t *uid;
+ const char *name;
+ u_int32_t type, flags;
+{
+ DB *dbp;
+#ifdef HAVE_QUEUE
+ DB_THREAD_INFO *ip;
+#endif
+ int ret, t_ret;
+
+ dbp = NULL;
+
+ /*
+ * Calling __fop_remove will both purge any matching
+ * fileid from mpool and unlink it on disk.
+ */
+#ifdef HAVE_QUEUE
+ /*
+ * Handle queue separately. __fop_remove will not
+ * remove extent files. Use __qam_remove to remove
+ * extent files that might exist under this name. Note that
+ * in-memory queue databases can't have extent files.
+ */
+ if (type == (u_int32_t)DB_QUEUE && !LF_ISSET(DB_AM_INMEM)) {
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ return (ret);
+
+ /*
+ * At present, qam_remove expects the passed-in dbp to have a
+ * locker allocated, and if not, db_open allocates a locker
+ * which qam_remove then leaks.
+ *
+ * TODO: it would be better to avoid cobbling together this
+ * sequence of low-level operations, if fileops provided some
+ * API to allow us to remove a database without write-locking
+ * its handle.
+ */
+ if ((ret = __lock_id(env, NULL, &dbp->locker)) != 0)
+ goto out;
+
+ ENV_GET_THREAD_INFO(env, ip);
+ RPRINT(env, DB_VERB_REP_SYNC,
+ (env, "QAM: Unlink %s via __qam_remove", name));
+ if ((ret = __qam_remove(dbp, ip, NULL, name, NULL, 0)) != 0) {
+ RPRINT(env, DB_VERB_REP_SYNC,
+ (env, "qam_remove returned %d", ret));
+ goto out;
+ }
+ }
+#else
+ COMPQUIET(type, 0);
+#endif
+ /*
+ * We call fop_remove even if we've called qam_remove.
+ * That will only have removed extent files. Now
+ * we need to deal with the actual file itself.
+ */
+ if (LF_ISSET(DB_AM_INMEM)) {
+ /*
+ * The queue branch above is skipped for in-memory databases,
+ * so dbp is still NULL here and a fresh handle is created.
+ */
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ return (ret);
+ MAKE_INMEM(dbp);
+ F_SET(dbp, DB_AM_RECOVER); /* Skirt locking. */
+ ret = __db_inmem_remove(dbp, NULL, name);
+ } else
+ ret = __fop_remove(env, NULL, uid, name, NULL, DB_APP_DATA, 0);
+#ifdef HAVE_QUEUE
+out:
+#endif
+ /* Close whichever handle was created above, keeping the first error. */
+ if (dbp != NULL &&
+ (t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __rep_bulk_page
+ * Process a bulk page message.
+ *
+ * The message in rec is a sequence of bulk records, each holding one
+ * page message. Every record is unmarshaled and handed to __rep_page
+ * in turn. Processing stops at the first non-zero return from
+ * __rep_page; DB_REP_PAGEDONE is reported to the caller as success.
+ *
+ * PUBLIC: int __rep_bulk_page __P((ENV *,
+ * PUBLIC: DB_THREAD_INFO *, int, __rep_control_args *, DBT *));
+ */
+int
+__rep_bulk_page(env, ip, eid, rp, rec)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ int eid;
+ __rep_control_args *rp;
+ DBT *rec;
+{
+ __rep_control_args tmprp;
+ __rep_bulk_args b_args;
+ int ret;
+ u_int8_t *p, *ep;
+
+ /*
+ * We're going to be modifying the rp LSN contents so make
+ * our own private copy to play with. We need to set the
+ * rectype to REP_PAGE because we're calling through __rep_page
+ * to process each page, and lower functions make decisions
+ * based on the rectypes (for throttling/gap processing)
+ */
+ memcpy(&tmprp, rp, sizeof(tmprp));
+ tmprp.rectype = REP_PAGE;
+ ret = 0;
+ for (ep = (u_int8_t *)rec->data + rec->size, p = (u_int8_t *)rec->data;
+ p < ep;) {
+ /*
+ * First thing in the buffer is the length. Then the LSN
+ * of this page, then the page info itself.
+ *
+ * Only the bytes from p to the end of the message are
+ * available to this record, so that, not the size of the
+ * whole message, is the limit given to the unmarshal routine.
+ */
+ if ((ret = __rep_bulk_unmarshal(env,
+ &b_args, p, (size_t)(ep - p), &p)) != 0)
+ return (ret);
+ RPRINT(env, DB_VERB_REP_SYNC, (env,
+ "rep_bulk_page: Processing LSN [%lu][%lu]",
+ (u_long)tmprp.lsn.file, (u_long)tmprp.lsn.offset));
+ RPRINT(env, DB_VERB_REP_SYNC, (env,
+ "rep_bulk_page: p %#lx ep %#lx pgrec data %#lx, size %lu (%#lx)",
+ P_TO_ULONG(p), P_TO_ULONG(ep),
+ P_TO_ULONG(b_args.bulkdata.data),
+ (u_long)b_args.bulkdata.size,
+ (u_long)b_args.bulkdata.size));
+ /*
+ * Now send the page info DBT to the page processing function.
+ */
+ ret = __rep_page(env, ip, eid, &tmprp, &b_args.bulkdata);
+ RPRINT(env, DB_VERB_REP_SYNC, (env,
+ "rep_bulk_page: rep_page ret %d", ret));
+
+ /*
+ * If this set of pages is already done just return.
+ */
+ if (ret != 0) {
+ if (ret == DB_REP_PAGEDONE)
+ ret = 0;
+ break;
+ }
+ }
+ return (ret);
+}
+
+/*
+ * __rep_page
+ * Process a page message.
+ *
+ * PUBLIC: int __rep_page __P((ENV *,
+ * PUBLIC: DB_THREAD_INFO *, int, __rep_control_args *, DBT *));
+ */
+int
+__rep_page(env, ip, eid, rp, rec)
+	ENV *env;
+	DB_THREAD_INFO *ip;
+	int eid;
+	__rep_control_args *rp;
+	DBT *rec;
+{
+	/*
+	 * Store one page received while recovering pages.
+	 *
+	 * env:	environment handle.
+	 * ip:	calling thread's info, handed to the database calls.
+	 * eid:	environment id passed through to __rep_filedone.
+	 * rp:	message control structure; rep_version, lsn and rectype
+	 *	are the fields used here.
+	 * rec:	marshaled __rep_fileinfo_args describing the page.
+	 *
+	 * Returns DB_REP_PAGEDONE when the message is stale (we are not in
+	 * REP_F_RECOVER_PAGE, its LSN precedes first_lsn, or it is for a
+	 * file other than curfile), 0 for a duplicate page, and otherwise
+	 * an error or the result of __rep_filedone.
+	 */
+	DB_REP *db_rep;
+	DBT key, data;
+	REP *rep;
+	__rep_fileinfo_args *msgfp;
+	db_recno_t recno;
+	int ret;
+
+	ret = 0;
+	db_rep = env->rep_handle;
+	rep = db_rep->region;
+
+	/* Unlocked early-out; the flag is re-checked under the locks below. */
+	if (!F_ISSET(rep, REP_F_RECOVER_PAGE))
+		return (DB_REP_PAGEDONE);
+	/*
+	 * If we restarted internal init, it is possible to receive
+	 * an old REP_PAGE message, while we're in the current
+	 * stage of recovering pages.  Until we have some sort of
+	 * an init generation number, ignore any message that has
+	 * a message LSN that is before this internal init's first_lsn.
+	 */
+	if (LOG_COMPARE(&rp->lsn, &rep->first_lsn) < 0) {
+		RPRINT(env, DB_VERB_REP_SYNC, (env,
+		    "PAGE: Old page: msg LSN [%lu][%lu] first_lsn [%lu][%lu]",
+		    (u_long)rp->lsn.file, (u_long)rp->lsn.offset,
+		    (u_long)rep->first_lsn.file,
+		    (u_long)rep->first_lsn.offset));
+		return (DB_REP_PAGEDONE);
+	}
+	/* msgfp is allocated here and freed on every path through err. */
+	if ((ret = __rep_fileinfo_unmarshal(env, rp->rep_version,
+	    &msgfp, rec->data, rec->size, NULL)) != 0)
+		return (ret);
+	MUTEX_LOCK(env, rep->mtx_clientdb);
+	REP_SYSTEM_LOCK(env);
+	/*
+	 * Check if the world changed.
+	 */
+	if (!F_ISSET(rep, REP_F_RECOVER_PAGE)) {
+		ret = DB_REP_PAGEDONE;
+		goto err;
+	}
+	/*
+	 * We should not ever be in internal init with a lease granted.
+	 */
+	DB_ASSERT(env,
+	    !IS_USING_LEASES(env) || __rep_islease_granted(env) == 0);
+
+	RPRINT(env, DB_VERB_REP_SYNC, (env,
+	    "PAGE: Received page %lu from file %d",
+	    (u_long)msgfp->pgno, msgfp->filenum));
+	/*
+	 * Check if this page is from the file we're expecting.
+	 * This may be an old or delayed page message.
+	 */
+	/*
+	 * !!!
+	 * If we allow dbrename/dbremove on the master while a client
+	 * is updating, then we'd have to verify the file's uid here too.
+	 */
+	if (msgfp->filenum != rep->curfile) {
+		RPRINT(env, DB_VERB_REP_SYNC,
+		    (env, "Msg file %d != curfile %d",
+		    msgfp->filenum, rep->curfile));
+		ret = DB_REP_PAGEDONE;
+		goto err;
+	}
+	/*
+	 * We want to create/open our dbp to the database
+	 * where we'll keep our page information.
+	 */
+	if ((ret = __rep_client_dbinit(env, 1, REP_PG)) != 0) {
+		RPRINT(env, DB_VERB_REP_SYNC, (env,
+		    "PAGE: Client_dbinit %s", db_strerror(ret)));
+		goto err;
+	}
+
+	/*
+	 * The page database is keyed by record number, which is the page
+	 * number plus one.  The data DBT is left empty: the record only
+	 * marks that the page has been seen.
+	 */
+	memset(&key, 0, sizeof(key));
+	memset(&data, 0, sizeof(data));
+	recno = (db_recno_t)(msgfp->pgno + 1);
+	key.data = &recno;
+	key.ulen = key.size = sizeof(db_recno_t);
+	key.flags = DB_DBT_USERMEM;
+
+	/*
+	 * If we already have this page, then we don't want to bother
+	 * rewriting it into the file.  Otherwise, any other error
+	 * we want to return.
+	 */
+	ret = __db_put(rep->file_dbp, ip, NULL, &key, &data, DB_NOOVERWRITE);
+	if (ret == DB_KEYEXIST) {
+		RPRINT(env, DB_VERB_REP_SYNC, (env,
+		    "PAGE: Received duplicate page %lu from file %d",
+		    (u_long)msgfp->pgno, msgfp->filenum));
+		STAT(rep->stat.st_pg_duplicated++);
+		ret = 0;
+		goto err;
+	}
+	if (ret != 0)
+		goto err;
+
+	RPRINT(env, DB_VERB_REP_SYNC, (env,
+	    "PAGE: Write page %lu into mpool", (u_long)msgfp->pgno));
+	/*
+	 * We put the page in the database file itself.
+	 */
+	ret = __rep_write_page(env, ip, rep, msgfp);
+	if (ret != 0) {
+		/*
+		 * We got an error storing the page, therefore, we need
+		 * remove this page marker from the page database too.
+		 * Use the same thread info as the __db_put above so both
+		 * operations on file_dbp are made on behalf of this thread.
+		 * !!!
+		 * I'm ignoring errors from the delete because we want to
+		 * return the original error.  If we cannot write the page
+		 * and we cannot delete the item we just put, what should
+		 * we do?  Panic the env and return DB_RUNRECOVERY?
+		 */
+		(void)__db_del(rep->file_dbp, ip, NULL, &key, 0);
+		goto err;
+	}
+	STAT(rep->stat.st_pg_records++);
+	rep->npages++;
+
+	/*
+	 * Now check the LSN on the page and save it if it is later
+	 * than the one we have.
+	 */
+	if (LOG_COMPARE(&rp->lsn, &rep->last_lsn) > 0)
+		rep->last_lsn = rp->lsn;
+
+	/*
+	 * We've successfully written the page.  Now we need to see if
+	 * we're done with this file.  __rep_filedone will check if we
+	 * have all the pages expected and if so, set up for the next
+	 * file and send out a page request for the next file's pages.
+	 */
+	ret = __rep_filedone(env, ip, eid, rep, msgfp, rp->rectype);
+
+err:	REP_SYSTEM_UNLOCK(env);
+	MUTEX_UNLOCK(env, rep->mtx_clientdb);
+
+	__os_free(env, msgfp);
+	return (ret);
+}
+
+/*
+ * __rep_page_fail
+ * Process a page fail message.
+ *
+ * Lowers the page range expected for the current file (max_pgno, and for
+ * queue files possibly ready_pg and npages as well) and then calls
+ * __rep_filedone to see whether the current file is now complete.
+ *
+ * Returns 0 without changing anything when REP_F_RECOVER_PAGE is not set
+ * or when the message is for a file other than rep->curfile. Otherwise
+ * returns the unmarshal error or the result of __rep_filedone.
+ *
+ * PUBLIC: int __rep_page_fail __P((ENV *,
+ * PUBLIC: DB_THREAD_INFO *, int, __rep_control_args *, DBT *));
+ */
+int
+__rep_page_fail(env, ip, eid, rp, rec)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ int eid;
+ __rep_control_args *rp;
+ DBT *rec;
+{
+
+ DB_REP *db_rep;
+ REP *rep;
+ __rep_fileinfo_args *msgfp, *rfp;
+ int ret;
+
+ ret = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ if (!F_ISSET(rep, REP_F_RECOVER_PAGE))
+ return (0);
+ /* msgfp is allocated by the unmarshal and freed at "out". */
+ if ((ret = __rep_fileinfo_unmarshal(env, rp->rep_version,
+ &msgfp, rec->data, rec->size, NULL)) != 0)
+ return (ret);
+ /*
+ * Check if this page is from the file we're expecting.
+ * This may be an old or delayed page message.
+ */
+ /*
+ * !!!
+ * If we allow dbrename/dbremove on the master while a client
+ * is updating, then we'd have to verify the file's uid here too.
+ */
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ REP_SYSTEM_LOCK(env);
+ /*
+ * We should not ever be in internal init with a lease granted.
+ */
+ DB_ASSERT(env,
+ !IS_USING_LEASES(env) || __rep_islease_granted(env) == 0);
+
+ /* A message for another file is ignored; ret is still 0 here. */
+ if (msgfp->filenum != rep->curfile) {
+ RPRINT(env, DB_VERB_REP_SYNC,
+ (env, "Msg file %d != curfile %d",
+ msgfp->filenum, rep->curfile));
+ goto out;
+ }
+ rfp = rep->curinfo;
+ /* Non-queue files: simply expect one page fewer. */
+ if (rfp->type != (u_int32_t)DB_QUEUE)
+ --rfp->max_pgno;
+ else {
+ /*
+ * Queue is special. Pages at the beginning of the queue
+ * may disappear, as well as at the end. Use msgfp->pgno
+ * to adjust accordingly.
+ */
+ RPRINT(env, DB_VERB_REP_SYNC, (env,
+ "page_fail: BEFORE page %lu failed. ready %lu, max %lu, npages %d",
+ (u_long)msgfp->pgno, (u_long)rep->ready_pg,
+ (u_long)rfp->max_pgno, rep->npages));
+ /* The failed page was the last one expected: shrink the end. */
+ if (msgfp->pgno == rfp->max_pgno)
+ --rfp->max_pgno;
+ /*
+ * The failed page is at or beyond the next page we were ready
+ * for: move ready_pg past it and set npages to match.
+ */
+ if (msgfp->pgno >= rep->ready_pg) {
+ rep->ready_pg = msgfp->pgno + 1;
+ rep->npages = rep->ready_pg;
+ }
+ RPRINT(env, DB_VERB_REP_SYNC, (env,
+ "page_fail: AFTER page %lu failed. ready %lu, max %lu, npages %d",
+ (u_long)msgfp->pgno, (u_long)rep->ready_pg,
+ (u_long)rfp->max_pgno, rep->npages));
+ }
+
+ /*
+ * We've lowered the number of pages expected. It is possible that
+ * this was the last page we were expecting. Now we need to see if
+ * we're done with this file. __rep_filedone will check if we have
+ * all the pages expected and if so, set up for the next file and
+ * send out a page request for the next file's pages.
+ */
+ ret = __rep_filedone(env, ip, eid, rep, msgfp, REP_PAGE_FAIL);
+out:
+ /* Release the locks in the reverse order of acquisition. */
+ REP_SYSTEM_UNLOCK(env);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ __os_free(env, msgfp);
+ return (ret);
+}
+
+/*
+ * __rep_write_page -
+ *	Write this page into a database.
+ *
+ * env:		environment handle.
+ * ip:		calling thread's info, handed to the mpool get/put calls.
+ * rep:		replication region; file_mpf, curinfo, queue_dbc and
+ *		file_dbp are used here.
+ * msgfp:	descriptor of the received page; info.data holds the page
+ *		image, pgsize bytes long.  The image may be byte-swapped in
+ *		place.
+ *
+ * Returns 0 on success, otherwise the first error encountered.
+ */
+static int
+__rep_write_page(env, ip, rep, msgfp)
+	ENV *env;
+	DB_THREAD_INFO *ip;
+	REP *rep;
+	__rep_fileinfo_args *msgfp;
+{
+	DB db;
+	DBT pgcookie;
+	DB_MPOOLFILE *mpf;
+	DB_PGINFO *pginfo;
+	__rep_fileinfo_args *rfp;
+	int ret;
+	void *dst;
+
+	/*
+	 * If this is the first page we're putting in this database, we need
+	 * to create the mpool file.  Otherwise call memp_fget to create the
+	 * page in mpool.  Then copy the data to the page, and memp_fput the
+	 * page to give it back to mpool.
+	 *
+	 * We need to create the file, removing any existing file and associate
+	 * the correct file ID with the new one.
+	 */
+	rfp = rep->curinfo;
+	if (rep->file_mpf == NULL) {
+		if (!FLD_ISSET(rfp->db_flags, DB_AM_INMEM)) {
+			/*
+			 * Recreate the file on disk.  We'll be putting
+			 * the data into the file via mpool.
+			 */
+			RPRINT(env, DB_VERB_REP_SYNC, (env,
+			    "rep_write_page: Calling fop_create for %s",
+			    (char *)rfp->info.data));
+			if ((ret = __fop_create(env, NULL, NULL,
+			    rfp->info.data, NULL, DB_APP_DATA,
+			    env->db_mode, 0)) != 0)
+				goto err;
+		}
+
+		if ((ret =
+		    __rep_mpf_open(env, &rep->file_mpf, rep->curinfo,
+		    FLD_ISSET(rfp->db_flags, DB_AM_INMEM) ?
+		    DB_CREATE : 0)) != 0)
+			goto err;
+	}
+
+	/*
+	 * Before writing this page into our local mpool, see if its byte order
+	 * needs to be swapped.  When in mpool the page should be in the native
+	 * byte order of our local environment.  But the page image we've
+	 * received may be in the opposite order (as indicated in finfo_flags).
+	 *
+	 * The swap works on the received image only, so do it before the
+	 * destination page is fetched: a failure here must not leave a
+	 * page pinned in the cache with nobody to put it back.
+	 */
+	if ((F_ISSET(env, ENV_LITTLEENDIAN) &&
+	    !FLD_ISSET(msgfp->finfo_flags, REPINFO_PG_LITTLEENDIAN)) ||
+	    (!F_ISSET(env, ENV_LITTLEENDIAN) &&
+	    FLD_ISSET(msgfp->finfo_flags, REPINFO_PG_LITTLEENDIAN))) {
+		RPRINT(env, DB_VERB_REP_SYNC, (env,
+		    "write_page: Page %lu needs to be swapped",
+		    (u_long)msgfp->pgno));
+		/*
+		 * Set up a dbp to pass into the swap functions.  We need
+		 * only a few things:  The environment and any special
+		 * dbp flags and some obvious basics like db type and
+		 * pagesize.  Those flags were set back in rep_mpf_open
+		 * and are available in the pgcookie set up with the
+		 * mpoolfile associated with this database.
+		 */
+		memset(&db, 0, sizeof(db));
+		db.env = env;
+		db.type = (DBTYPE)msgfp->type;
+		db.pgsize = msgfp->pgsize;
+		mpf = rep->file_mpf;
+		if ((ret = __memp_get_pgcookie(mpf, &pgcookie)) != 0)
+			goto err;
+		pginfo = (DB_PGINFO *)pgcookie.data;
+		db.flags = pginfo->flags;
+		if ((ret = __db_pageswap(&db, msgfp->info.data, msgfp->pgsize,
+		    NULL, 1)) != 0)
+			goto err;
+	}
+
+	/*
+	 * Handle queue specially.  If we're a QUEUE database, we need to
+	 * use the __qam_fget/put calls.  We need to use rep->queue_dbc for
+	 * that.  That dbp is opened after getting the metapage for the
+	 * queue database.  Since the meta-page is always in the queue file,
+	 * we'll use the normal path for that first page.  After that we
+	 * can assume the dbp is opened.
+	 */
+	if (msgfp->type == (u_int32_t)DB_QUEUE && msgfp->pgno != 0) {
+#ifdef HAVE_QUEUE
+		ret = __qam_fget(rep->queue_dbc, &msgfp->pgno,
+		    DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &dst);
+#else
+		/*
+		 * This always returns an error.
+		 */
+		ret = __db_no_queue_am(env);
+#endif
+	} else
+		ret = __memp_fget(rep->file_mpf, &msgfp->pgno, ip, NULL,
+		    DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &dst);
+
+	if (ret != 0)
+		goto err;
+
+	/*
+	 * From here on nothing can fail before the page is handed back,
+	 * so every successful get is paired with a put.
+	 */
+	memcpy(dst, msgfp->info.data, msgfp->pgsize);
+#ifdef HAVE_QUEUE
+	if (msgfp->type == (u_int32_t)DB_QUEUE && msgfp->pgno != 0)
+		ret = __qam_fput(rep->queue_dbc,
+		    msgfp->pgno, dst, rep->queue_dbc->priority);
+	else
+#endif
+		ret = __memp_fput(rep->file_mpf,
+		    ip, dst, rep->file_dbp->priority);
+
+err:	return (ret);
+}
+
+/*
+ * __rep_page_gap -
+ * After we've put the page into the database, we need to check if
+ * we have a page gap and whether we need to request pages.
+ *
+ * env: environment handle.
+ * rep: replication region; ready_pg, waiting_pg, max_wait_pg and
+ * curinfo are read and updated here.
+ * msgfp: descriptor of the page that was just processed.
+ * type: message type; REP_PAGE_MORE forces a request for more pages.
+ *
+ * Returns DB_REP_PAGEDONE when msgfp is not for the current file,
+ * otherwise 0 or an error from the cursor or request calls.
+ *
+ * NOTE(review): when the cursor walk below runs off the end of the page
+ * database, ret is left as DB_NOTFOUND/DB_KEYEMPTY and can be returned
+ * as-is. The only caller visible in this chunk, __rep_filedone, checks
+ * just for DB_REP_PAGEDONE.
+ */
+static int
+__rep_page_gap(env, rep, msgfp, type)
+ ENV *env;
+ REP *rep;
+ __rep_fileinfo_args *msgfp;
+ u_int32_t type;
+{
+ DBC *dbc;
+ DBT data, key;
+ DB_LOG *dblp;
+ DB_THREAD_INFO *ip;
+ LOG *lp;
+ __rep_fileinfo_args *rfp;
+ db_recno_t recno;
+ int ret, t_ret;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ ret = 0;
+ dbc = NULL;
+
+ /*
+ * We've successfully put this page into our file.
+ * Now we need to account for it and re-request new pages
+ * if necessary.
+ */
+ /*
+ * We already hold both the db mutex and rep mutex.
+ */
+ rfp = rep->curinfo;
+
+ /*
+ * Make sure we're still talking about the same file.
+ * If not, we're done here.
+ */
+ if (rfp->filenum != msgfp->filenum) {
+ ret = DB_REP_PAGEDONE;
+ goto err;
+ }
+
+ /*
+ * We have 3 possible states:
+ * 1. We receive a page we already have accounted for.
+ * msg pgno < ready pgno
+ * 2. We receive a page that is beyond a gap.
+ * msg pgno > ready pgno
+ * 3. We receive the page we're expecting next.
+ * msg pgno == ready pgno
+ */
+ /*
+ * State 1. This can happen once we put our page record into the
+ * database, but by the time we acquire the mutex other
+ * threads have already accounted for this page and moved on.
+ * We just want to return.
+ */
+ if (msgfp->pgno < rep->ready_pg) {
+ RPRINT(env, DB_VERB_REP_SYNC, (env,
+ "PAGE_GAP: pgno %lu < ready %lu, waiting %lu",
+ (u_long)msgfp->pgno, (u_long)rep->ready_pg,
+ (u_long)rep->waiting_pg));
+ goto err;
+ }
+
+ /*
+ * State 2. This page is beyond the page we're expecting.
+ * We need to update waiting_pg if this page is less than
+ * (earlier) the current waiting_pg. There is nothing
+ * to do but see if we need to request.
+ */
+ RPRINT(env, DB_VERB_REP_SYNC, (env,
+ "PAGE_GAP: pgno %lu, max_pg %lu ready %lu, waiting %lu max_wait %lu",
+ (u_long)msgfp->pgno, (u_long)rfp->max_pgno, (u_long)rep->ready_pg,
+ (u_long)rep->waiting_pg, (u_long)rep->max_wait_pg));
+ if (msgfp->pgno > rep->ready_pg) {
+ if (rep->waiting_pg == PGNO_INVALID ||
+ msgfp->pgno < rep->waiting_pg)
+ rep->waiting_pg = msgfp->pgno;
+ } else {
+ /*
+ * We received the page we're expecting.
+ */
+ rep->ready_pg++;
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ if (rep->ready_pg == rep->waiting_pg) {
+ /*
+ * If we get here we know we just filled a gap.
+ * Move the cursor to that place and then walk
+ * forward looking for the next gap, if it exists.
+ */
+ lp->wait_ts = rep->request_gap;
+ rep->max_wait_pg = PGNO_INVALID;
+ /*
+ * We need to walk the recno database looking for the
+ * next page we need or expect.
+ */
+ memset(&key, 0, sizeof(key));
+ memset(&data, 0, sizeof(data));
+ ENV_GET_THREAD_INFO(env, ip);
+ if ((ret = __db_cursor(rep->file_dbp, ip, NULL,
+ &dbc, 0)) != 0)
+ goto err;
+ /*
+ * Set cursor to the first waiting page.
+ * Page numbers/record numbers are offset by 1.
+ */
+ recno = (db_recno_t)rep->waiting_pg + 1;
+ key.data = &recno;
+ key.ulen = key.size = sizeof(db_recno_t);
+ key.flags = DB_DBT_USERMEM;
+ /*
+ * We know that page is there, this should
+ * find the record.
+ */
+ ret = __dbc_get(dbc, &key, &data, DB_SET);
+ if (ret != 0)
+ goto err;
+ RPRINT(env, DB_VERB_REP_SYNC, (env,
+ "PAGE_GAP: Set cursor for ready %lu, waiting %lu",
+ (u_long)rep->ready_pg, (u_long)rep->waiting_pg));
+ }
+ /*
+ * This loop is entered only when the block above ran (the
+ * condition is the same), so dbc is open whenever it is used.
+ * Each pass consumes one page already present and advances
+ * the cursor to find the next page we hold.
+ */
+ while (ret == 0 && rep->ready_pg == rep->waiting_pg) {
+ rep->ready_pg++;
+ ret = __dbc_get(dbc, &key, &data, DB_NEXT);
+ /*
+ * If we get to the end of the list, there are no
+ * more gaps. Reset waiting_pg.
+ */
+ if (ret == DB_NOTFOUND || ret == DB_KEYEMPTY) {
+ rep->waiting_pg = PGNO_INVALID;
+ RPRINT(env, DB_VERB_REP_SYNC, (env,
+ "PAGE_GAP: Next cursor No next - ready %lu, waiting %lu",
+ (u_long)rep->ready_pg,
+ (u_long)rep->waiting_pg));
+ break;
+ }
+ /*
+ * Subtract 1 from waiting_pg because record numbers
+ * are 1-based and pages are 0-based and we added 1
+ * into the page number when we put it into the db.
+ */
+ rep->waiting_pg = *(db_pgno_t *)key.data;
+ rep->waiting_pg--;
+ RPRINT(env, DB_VERB_REP_SYNC, (env,
+ "PAGE_GAP: Next cursor ready %lu, waiting %lu",
+ (u_long)rep->ready_pg, (u_long)rep->waiting_pg));
+ }
+ }
+
+ /*
+ * If we filled a gap and now have the entire file, there's
+ * nothing to do. We're done when ready_pg is > max_pgno
+ * because ready_pg is larger than the last page we received.
+ */
+ if (rep->ready_pg > rfp->max_pgno)
+ goto err;
+
+ /*
+ * Check if we need to ask for more pages.
+ */
+ if ((rep->waiting_pg != PGNO_INVALID &&
+ rep->ready_pg != rep->waiting_pg) || type == REP_PAGE_MORE) {
+ /*
+ * We got a page but we may still be waiting for more.
+ * If we got REP_PAGE_MORE we always want to ask for more.
+ * We need to set rfp->pgno to the current page number
+ * we will use to ask for more pages.
+ */
+ if (type == REP_PAGE_MORE)
+ rfp->pgno = msgfp->pgno;
+ if ((__rep_check_doreq(env, rep) || type == REP_PAGE_MORE) &&
+ ((ret = __rep_pggap_req(env, rep, rfp,
+ (type == REP_PAGE_MORE) ? REP_GAP_FORCE : 0)) != 0))
+ goto err;
+ } else {
+ /* No gap outstanding: reset the wait time and request window. */
+ lp->wait_ts = rep->request_gap;
+ rep->max_wait_pg = PGNO_INVALID;
+ }
+
+err:
+ /* Close the cursor if one was opened; keep the first error. */
+ if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __rep_init_cleanup -
+ *	Release the pieces of state used by internal initialization.
+ *
+ * env:		environment handle.
+ * rep:		replication region whose file_mpf, file_dbp, queue_dbc,
+ *		curinfo and originfo are released and reset to NULL.
+ * force:	non-zero to also close the queue cursor and database and,
+ *		if an internal init is in progress, clean up after it.
+ *
+ * Every step is attempted even if an earlier one failed; the first
+ * non-zero result is what gets returned.
+ *
+ * !!!
+ * Caller must hold client database mutex (mtx_clientdb) and REP_SYSTEM_LOCK.
+ *
+ * PUBLIC: int __rep_init_cleanup __P((ENV *, REP *, int));
+ */
+int
+__rep_init_cleanup(env, rep, force)
+	ENV *env;
+	REP *rep;
+	int force;
+{
+	DB *qdbp;
+	int first_err, rc;
+
+	first_err = 0;
+
+	/* The mpool file the pages were written through. */
+	if (rep->file_mpf != NULL) {
+		first_err = __memp_fclose(rep->file_mpf, 0);
+		rep->file_mpf = NULL;
+	}
+
+	/* The page database. */
+	if (rep->file_dbp != NULL) {
+		rc = __db_close(rep->file_dbp, NULL, DB_NOSYNC);
+		rep->file_dbp = NULL;
+		if (first_err == 0)
+			first_err = rc;
+	}
+
+	/* The queue cursor and its database, only on a forced cleanup. */
+	if (force && rep->queue_dbc != NULL) {
+		/* Grab the dbp first; it is reached through the cursor. */
+		qdbp = rep->queue_dbc->dbp;
+
+		rc = __dbc_close(rep->queue_dbc);
+		if (first_err == 0)
+			first_err = rc;
+		rep->queue_dbc = NULL;
+
+		rc = __db_close(qdbp, NULL, DB_NOSYNC);
+		if (first_err == 0)
+			first_err = rc;
+	}
+
+	/* The current file's info. */
+	if (rep->curinfo != NULL) {
+		__os_free(env, rep->curinfo);
+		rep->curinfo = NULL;
+	}
+
+	/* An interrupted internal init: undo it and drop the file list. */
+	if (IN_INTERNAL_INIT(rep) && force) {
+		RPRINT(env, DB_VERB_REP_SYNC,
+		    (env, "clean up interrupted internal init"));
+		if (F_ISSET(rep, REP_F_ABBREVIATED))
+			rc = __rep_cleanup_nimdbs(env);
+		else
+			rc = __rep_clean_interrupted(env);
+		if (first_err == 0)
+			first_err = rc;
+
+		if (rep->originfo != NULL) {
+			__os_free(env, rep->originfo);
+			rep->originfo = NULL;
+		}
+	}
+
+	return (first_err);
+}
+
+/*
+ * Remove NIMDBs that may have been fully or partially loaded during an
+ * abbreviated internal init, when the init gets interrupted. At this point,
+ * we know that any databases we have processed are listed in originfo.
+ *
+ * Walks the rep->nfiles entries marshaled in rep->originfo and, for each
+ * entry flagged DB_AM_INMEM, removes the in-memory database of that name.
+ * Returns 0 on success or the first error; ENOENT from the remove is
+ * not treated as an error.
+ */
+static int
+__rep_cleanup_nimdbs(env)
+ ENV *env;
+{
+ REP *rep;
+ DB *dbp;
+ __rep_fileinfo_args *rfp;
+ u_int8_t *filelist, *new_fp;
+ char *namep;
+ u_int32_t count, filesz, version;
+ int ret, t_ret;
+
+ /* Use the saved file list from the original UPDATE message. */
+ rep = env->rep_handle->region;
+ version = rep->infoversion;
+ filelist = rep->originfo;
+ filesz = rep->originfolen;
+ count = rep->nfiles;
+
+ ret = 0;
+ rfp = NULL;
+ dbp = NULL;
+ while (count-- > 0) {
+ /*
+ * Decode the next entry; new_fp is set to where the following
+ * entry starts, so advance the list pointer and shrink the
+ * remaining size by the bytes consumed.
+ */
+ if ((ret = __rep_fileinfo_unmarshal(env, version,
+ &rfp, filelist, filesz, &new_fp)) != 0)
+ goto out;
+ filesz -= (u_int32_t)(new_fp - filelist);
+ filelist = new_fp;
+
+ if (FLD_ISSET(rfp->db_flags, DB_AM_INMEM)) {
+ namep = rfp->info.data;
+
+ /* A throwaway handle is needed to issue the remove. */
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ goto out;
+ MAKE_INMEM(dbp);
+ F_SET(dbp, DB_AM_RECOVER); /* Skirt locking. */
+
+ /*
+ * Some of these "files" (actually NIMDBs) may not exist
+ * yet, simply because the interrupted abbreviated
+ * internal init had not yet progressed far enough to
+ * retrieve them. So ENOENT is an acceptable outcome.
+ */
+ if ((ret = __db_inmem_remove(dbp, NULL, namep)) != 0 &&
+ ret != ENOENT)
+ goto out;
+ /*
+ * Clear dbp before checking the close result so the
+ * handle is not closed a second time at "out".
+ */
+ ret = __db_close(dbp, NULL, DB_NOSYNC);
+ dbp = NULL;
+ if (ret != 0)
+ goto out;
+ }
+
+ __os_free(env, rfp);
+ rfp = NULL;
+ }
+
+out:
+ /* Release whatever the failing iteration still held. */
+ if (rfp != NULL)
+ __os_free(env, rfp);
+ if (dbp != NULL &&
+ (t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __rep_clean_interrupted -
+ *	Clean up files involved in an interrupted internal init.
+ *
+ * The work is done in this order:
+ *	1a. remove the old log files
+ *	1b. set up a fresh log file #1 (only if 1a succeeded)
+ *	2.  remove the database files named in the original file list
+ *	3.  remove the "init file" (only if everything above succeeded)
+ *
+ * Step 2 is attempted whether or not step 1 worked.  Returns 0 on
+ * success or the first error encountered.
+ */
+static int
+__rep_clean_interrupted(env)
+	ENV *env;
+{
+	LOG *lp;
+	REP *rep;
+	int list_ret, ret;
+
+	rep = env->rep_handle->region;
+
+	/* Step 1a. */
+	ret = __rep_remove_logs(env);
+	if (ret == 0) {
+		/*
+		 * With no logs left, make the environment look like a new
+		 * client that has just started: nothing but a fresh log
+		 * file #1.  That file may well be removed again shortly,
+		 * which is a little wasteful but of no consequence for an
+		 * interrupted internal init.
+		 */
+		lp = env->lg_handle->reginfo.primary;
+
+		/* Step 1b. */
+		ret = __rep_log_setup(env,
+		    rep, 1, DB_LOGVERSION, &lp->ready_lsn);
+	}
+
+	/* Step 2. */
+	list_ret = __rep_remove_by_list(env, rep->infoversion,
+	    rep->originfo, rep->originfolen, rep->nfiles);
+	if (list_ret != 0 && ret == 0)
+		ret = list_ret;
+
+	/*
+	 * Step 3 is skipped after any failure: the init file exists
+	 * precisely to record that some files remain to be cleaned up.
+	 */
+	if (ret != 0)
+		return (ret);
+	return (__rep_remove_init_file(env));
+}
+
+/*
+ * __rep_filedone -
+ *	Decide whether the current file is complete after a page has
+ *	been processed.  If it is, close out this file, move on to the
+ *	next one and ask for its pages; __rep_nextfile takes care of
+ *	switching to requesting logs once the last file is done.
+ *
+ * env, ip:	environment handle and calling thread's info.
+ * eid:		environment id handed on to __rep_nextfile.
+ * rep:		replication region.
+ * msgfp:	descriptor of the page just processed.
+ * type:	message type, handed on to the gap processing.
+ *
+ * Returns 0 when more pages are still expected or the world changed
+ * during gap processing; otherwise the result of the queue processing,
+ * the cleanup, or __rep_nextfile.
+ */
+static int
+__rep_filedone(env, ip, eid, rep, msgfp, type)
+	ENV *env;
+	DB_THREAD_INFO *ip;
+	int eid;
+	REP *rep;
+	__rep_fileinfo_args *msgfp;
+	u_int32_t type;
+{
+	__rep_fileinfo_args *curfp;
+	int ret;
+
+	/*
+	 * Gap processing comes first, so that any missing pages are
+	 * re-requested.  The only outcome acted on is DB_REP_PAGEDONE,
+	 * which means the world changed underneath us.
+	 */
+	if (__rep_page_gap(env, rep, msgfp, type) == DB_REP_PAGEDONE)
+		return (0);
+
+	curfp = rep->curinfo;
+	/*
+	 * npages counts from 1 while max_pgno counts from 0, so the file
+	 * is complete only once npages exceeds max_pgno.
+	 */
+	RPRINT(env, DB_VERB_REP_SYNC,
+	    (env, "FILEDONE: have %lu pages.  Need %lu.",
+	    (u_long)rep->npages, (u_long)curfp->max_pgno + 1));
+	if (rep->npages <= curfp->max_pgno)
+		return (0);
+
+	/*
+	 * Queue files are fetched in several stages; anything other than
+	 * DB_REP_PAGEDONE from the queue handler is returned as-is.
+	 */
+	if (curfp->type == (u_int32_t)DB_QUEUE) {
+		ret = __rep_queue_filedone(env, ip, rep, curfp);
+		if (ret != DB_REP_PAGEDONE)
+			return (ret);
+	}
+
+	/* All pages are in: release this file's state and move on. */
+	ret = __rep_init_cleanup(env, rep, 0);
+	if (ret != 0)
+		return (ret);
+
+	rep->curfile++;
+	return (__rep_nextfile(env, eid, rep));
+}
+
+/*
+ * __rep_nextfile -
+ *	Starts requesting pages for the next file in the list (if any), or
+ *	if not, proceeds to the next stage: requesting logs.
+ *
+ * env:	environment handle.
+ * eid:	where to send the request if no master is known; replaced by
+ *	rep->master_id when that is valid.
+ * rep:	replication region; curfile, nfiles, nextinfo, infolen and
+ *	curinfo drive the walk over the file list.
+ *
+ * Returns 0 once a request has been sent, otherwise the error from
+ * unmarshaling, allocation, marshaling, the cache sync or the log setup.
+ *
+ * !!!
+ * Called with REP_SYSTEM_LOCK held or both clientdb_mutex and REP_SYSTEM,
+ * though we may drop REP_SYSTEM_LOCK momentarily in order to send
+ * a LOG_REQ (but not a PAGE_REQ).
+ */
+static int
+__rep_nextfile(env, eid, rep)
+	ENV *env;
+	int eid;
+	REP *rep;
+{
+	DBT dbt;
+	__rep_logreq_args lr_args;
+	int ret;
+	u_int8_t *buf, *info_ptr, lrbuf[__REP_LOGREQ_SIZE];
+	size_t len, msgsz;
+
+	/*
+	 * Always direct the next request to the master (at least nominally),
+	 * regardless of where the current response came from.  The application
+	 * can always still redirect it to another client.
+	 */
+	if (rep->master_id != DB_EID_INVALID)
+		eid = rep->master_id;
+
+	while (rep->curfile < rep->nfiles) {
+		/* Set curinfo to next file and examine it. */
+		info_ptr = rep->nextinfo;
+		if ((ret = __rep_fileinfo_unmarshal(env,
+		    rep->infoversion, &rep->curinfo,
+		    info_ptr, rep->infolen, &rep->nextinfo)) != 0) {
+			RPRINT(env, DB_VERB_REP_SYNC, (env,
+			    "NEXTINFO: Fileinfo read: %s", db_strerror(ret)));
+			return (ret);
+		}
+		/* Shrink the remaining length by the bytes just consumed. */
+		rep->infolen -= (u_int32_t)(rep->nextinfo - info_ptr);
+
+		/* Skip over regular DB's in "abbreviated" internal inits. */
+		if (F_ISSET(rep, REP_F_ABBREVIATED) &&
+		    !FLD_ISSET(rep->curinfo->db_flags, DB_AM_INMEM)) {
+			RPRINT(env, DB_VERB_REP_SYNC, (env,
+			    "Skipping file %d in abbreviated internal init",
+			    rep->curinfo->filenum));
+			__os_free(env, rep->curinfo);
+			rep->curinfo = NULL;
+			rep->curfile++;
+			continue;
+		}
+
+		/* Request this file's pages. */
+		DB_ASSERT(env, rep->curinfo->pgno == 0);
+		rep->ready_pg = 0;
+		rep->npages = 0;
+		rep->waiting_pg = PGNO_INVALID;
+		rep->max_wait_pg = PGNO_INVALID;
+		memset(&dbt, 0, sizeof(dbt));
+		RPRINT(env, DB_VERB_REP_SYNC, (env,
+		    "Next file %d: pgsize %lu, maxpg %lu",
+		    rep->curinfo->filenum, (u_long)rep->curinfo->pgsize,
+		    (u_long)rep->curinfo->max_pgno));
+		msgsz = __REP_FILEINFO_SIZE +
+		    rep->curinfo->uid.size + rep->curinfo->info.size;
+		if ((ret = __os_calloc(env, 1, msgsz, &buf)) != 0)
+			return (ret);
+		if ((ret = __rep_fileinfo_marshal(env, rep->infoversion,
+		    rep->curinfo, buf, msgsz, &len)) != 0) {
+			/* Don't leak the message buffer when marshaling fails. */
+			__os_free(env, buf);
+			return (ret);
+		}
+		DB_INIT_DBT(dbt, buf, len);
+		(void)__rep_send_message(env, eid, REP_PAGE_REQ,
+		    NULL, &dbt, 0, DB_REP_ANYWHERE);
+		__os_free(env, buf);
+
+		return (0);
+	}
+
+	RPRINT(env, DB_VERB_REP_SYNC, (env,
+	    "NEXTFILE: have %d files.  RECOVER_LOG now", rep->nfiles));
+	/*
+	 * Move to REP_RECOVER_LOG state.
+	 * Request logs.
+	 */
+	/*
+	 * We need to do a sync here so that any later opens
+	 * can find the file and file id.  We need to do it
+	 * before we clear REP_F_RECOVER_PAGE so that we do not
+	 * try to flush the log.
+	 */
+	if ((ret = __memp_sync_int(env, NULL, 0,
+	    DB_SYNC_CACHE | DB_SYNC_INTERRUPT_OK, NULL, NULL)) != 0)
+		return (ret);
+	F_CLR(rep, REP_F_RECOVER_PAGE);
+	F_SET(rep, REP_F_RECOVER_LOG);
+	memset(&dbt, 0, sizeof(dbt));
+	/* The log request runs up to the latest LSN seen on any page. */
+	lr_args.endlsn = rep->last_lsn;
+	if ((ret = __rep_logreq_marshal(env, &lr_args, lrbuf,
+	    __REP_LOGREQ_SIZE, &len)) != 0)
+		return (ret);
+	DB_INIT_DBT(dbt, lrbuf, len);
+
+	/*
+	 * Get the logging subsystem ready to receive the first log record we
+	 * are going to ask for.  In the case of a normal internal init, this is
+	 * pretty simple, since we only deal in whole log files.  In the
+	 * ABBREVIATED case we've already taken care of this, back when we
+	 * processed the UPDATE message, because we had to do it by rolling back
+	 * to a sync point at an arbitrary LSN.
+	 */
+	if (!F_ISSET(rep, REP_F_ABBREVIATED) &&
+	    (ret = __rep_log_setup(env, rep,
+	    rep->first_lsn.file, rep->first_vers, NULL)) != 0)
+		return (ret);
+	RPRINT(env, DB_VERB_REP_SYNC, (env,
+	    "NEXTFILE: LOG_REQ from LSN [%lu][%lu] to [%lu][%lu]",
+	    (u_long)rep->first_lsn.file, (u_long)rep->first_lsn.offset,
+	    (u_long)rep->last_lsn.file, (u_long)rep->last_lsn.offset));
+	/* REP_SYSTEM_LOCK is dropped only for the duration of the send. */
+	REP_SYSTEM_UNLOCK(env);
+	(void)__rep_send_message(env, eid,
+	    REP_LOG_REQ, &rep->first_lsn, &dbt, REPCTL_INIT, DB_REP_ANYWHERE);
+	REP_SYSTEM_LOCK(env);
+	return (0);
+}
+
+/*
+ * Run a recovery, for the purpose of rolling back the client environment to a
+ * specific sync point, in preparation for doing an abbreviated internal init
+ * (materializing only NIMDBs, when we already have the on-disk DBs).
+ *
+ * env: environment handle.
+ * lsnp: LSN to roll back to; it also becomes lp->max_perm_lsn.
+ *
+ * Returns 0 on success, or the error from __rep_dorecovery,
+ * __rep_client_dbinit or __db_truncate.
+ *
+ * REP_SYSTEM_LOCK should be held on entry, and will be held on exit, but we
+ * drop it momentarily during the call.
+ */
+static int
+__rep_rollback(env, lsnp)
+ ENV *env;
+ DB_LSN *lsnp;
+{
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ DB_THREAD_INFO *ip;
+ DB_LSN trunclsn;
+ int ret;
+ u_int32_t unused;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ ENV_GET_THREAD_INFO(env, ip);
+
+ DB_ASSERT(env, F_ISSET(rep,
+ REP_F_READY_API | REP_F_READY_MSG | REP_F_READY_OP));
+
+ REP_SYSTEM_UNLOCK(env);
+
+ /* trunclsn is filled in by the recovery and used as ready_lsn below. */
+ if ((ret = __rep_dorecovery(env, lsnp, &trunclsn)) != 0)
+ goto errlock;
+
+ /* Reset the log region's replication bookkeeping under mtx_clientdb. */
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ lp->ready_lsn = trunclsn;
+ ZERO_LSN(lp->waiting_lsn);
+ ZERO_LSN(lp->max_wait_lsn);
+ lp->max_perm_lsn = *lsnp;
+ lp->wait_ts = rep->request_gap;
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ ZERO_LSN(lp->verify_lsn);
+
+ if (db_rep->rep_db == NULL &&
+ (ret = __rep_client_dbinit(env, 0, REP_DB)) != 0) {
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ goto errlock;
+ }
+
+ /*
+ * Empty rep_db. DB_AM_RECOVER is set on the handle only for the
+ * duration of the truncate, and mtx_clientdb is released around
+ * the truncate call itself.
+ */
+ F_SET(db_rep->rep_db, DB_AM_RECOVER);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ ret = __db_truncate(db_rep->rep_db, ip, NULL, &unused);
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ F_CLR(db_rep->rep_db, DB_AM_RECOVER);
+ rep->stat.st_log_queued = 0;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+
+errlock:
+ /* Reacquire the lock the caller expects to hold on return. */
+ REP_SYSTEM_LOCK(env);
+
+ return (ret);
+}
+
+/*
+ * __rep_mpf_open -
+ *	Create an mpool file handle for a database and open it.
+ *	Used by both master and client to bring files into mpool.
+ *
+ * env:		environment handle.
+ * mpfp:	receives the new handle; reset to NULL if the open fails.
+ * rfp:		file info supplying type, page size, file id, db flags,
+ *		byte-order flags and the name (info.data).
+ * flags:	passed through to __env_mpool.
+ *
+ * Returns 0 on success or the error from __memp_fcreate/__env_mpool.
+ */
+static int
+__rep_mpf_open(env, mpfp, rfp, flags)
+	ENV *env;
+	DB_MPOOLFILE **mpfp;
+	__rep_fileinfo_args *rfp;
+	u_int32_t flags;
+{
+	DB db;
+	int db_is_le, env_is_le, ret;
+
+	ret = __memp_fcreate(env, mpfp);
+	if (ret != 0)
+		return (ret);
+
+	/*
+	 * __env_mpool wants a dbp.  Build a scratch one on the stack with
+	 * just the fields it needs, making sure it isn't marked open.
+	 */
+	memset(&db, 0, sizeof(db));
+	db.env = env;
+	db.type = (DBTYPE)rfp->type;
+	db.pgsize = rfp->pgsize;
+	memcpy(db.fileid, rfp->uid.data, DB_FILE_ID_LEN);
+	db.flags = rfp->db_flags;
+	F_CLR(&db, DB_AM_OPEN_CALLED);
+
+	/*
+	 * The database may be in a different byte order than this
+	 * environment.  When the two disagree, set the swap bit so the
+	 * necessary swapping is done during file I/O; otherwise make sure
+	 * it is clear.
+	 */
+	env_is_le = F_ISSET(env, ENV_LITTLEENDIAN) ? 1 : 0;
+	db_is_le = FLD_ISSET(rfp->finfo_flags, REPINFO_DB_LITTLEENDIAN) ?
+	    1 : 0;
+	if (env_is_le != db_is_le) {
+		RPRINT(env, DB_VERB_REP_SYNC, (env,
+		    "rep_mpf_open: Different endian database.  Set swap bit."));
+		F_SET(&db, DB_AM_SWAP);
+	} else
+		F_CLR(&db, DB_AM_SWAP);
+
+	db.mpf = *mpfp;
+	if (F_ISSET(&db, DB_AM_INMEM))
+		(void)__memp_set_flags(db.mpf, DB_MPOOL_NOFILE, 1);
+
+	ret = __env_mpool(&db, rfp->info.data, flags);
+	if (ret != 0) {
+		/* Don't hand back a handle that failed to open. */
+		(void)__memp_fclose(db.mpf, 0);
+		*mpfp = NULL;
+	}
+	return (ret);
+}
+
+/*
+ * __rep_pggap_req -
+ * Request a page gap. Assumes the caller holds the rep_mutex.
+ *
+ * env: environment handle.
+ * rep: replication region; ready_pg, waiting_pg, max_wait_pg, curinfo
+ * and master_id are used, and max_wait_pg is updated.
+ * reqfp: file info to base the request on, or NULL to use a copy of
+ * rep->curinfo. The caller's structure is not modified: a local
+ * copy is edited instead.
+ * gapflags: REP_GAP_FORCE and/or REP_GAP_REREQUEST, or 0.
+ *
+ * Returns 0 when there is nothing to do (curinfo not set up yet) or the
+ * request was handled; otherwise an allocation or marshaling error.
+ * Results of __rep_send_message are ignored.
+ *
+ * PUBLIC: int __rep_pggap_req __P((ENV *, REP *, __rep_fileinfo_args *,
+ * PUBLIC: u_int32_t));
+ */
+int
+__rep_pggap_req(env, rep, reqfp, gapflags)
+ ENV *env;
+ REP *rep;
+ __rep_fileinfo_args *reqfp;
+ u_int32_t gapflags;
+{
+ DBT max_pg_dbt;
+ __rep_fileinfo_args *tmpfp, t;
+ size_t len, msgsz;
+ u_int32_t flags;
+ int alloc, master, ret;
+ u_int8_t *buf;
+
+ ret = 0;
+ alloc = 0;
+ /*
+ * There is a window where we have to set REP_RECOVER_PAGE when
+ * we receive the update information to transition from getting
+ * file information to getting page information. However, that
+ * thread does release and then reacquire mutexes. So, we might
+ * try re-requesting before the original thread can get curinfo
+ * setup. If curinfo isn't set up there is nothing to do.
+ */
+ if (rep->curinfo == NULL)
+ return (0);
+ /*
+ * Work on a private copy: a heap copy of curinfo (freed at "err")
+ * when no reqfp was given, otherwise a struct copy on the stack.
+ */
+ if (reqfp == NULL) {
+ if ((ret = __rep_finfo_alloc(env, rep->curinfo, &tmpfp)) != 0)
+ return (ret);
+ alloc = 1;
+ } else {
+ t = *reqfp;
+ tmpfp = &t;
+ }
+
+ /*
+ * If we've never requested this page, then
+ * request everything between it and the first
+ * page we have. If we have requested this page
+ * then only request this record, not the entire gap.
+ */
+ flags = 0;
+ memset(&max_pg_dbt, 0, sizeof(max_pg_dbt));
+ /*
+ * If this is a PAGE_MORE and we're forcing then we want to
+ * force the request to ask for the next page after this one.
+ */
+ if (FLD_ISSET(gapflags, REP_GAP_FORCE))
+ tmpfp->pgno++;
+ else
+ tmpfp->pgno = rep->ready_pg;
+ msgsz = __REP_FILEINFO_SIZE +
+ tmpfp->uid.size + tmpfp->info.size;
+ if ((ret = __os_calloc(env, 1, msgsz, &buf)) != 0)
+ goto err;
+ if (rep->max_wait_pg == PGNO_INVALID ||
+ FLD_ISSET(gapflags, REP_GAP_FORCE | REP_GAP_REREQUEST)) {
+ /*
+ * Request the gap - set max to waiting_pg - 1 or if
+ * there is no waiting_pg, just ask for one.
+ */
+ if (rep->waiting_pg == PGNO_INVALID) {
+ if (FLD_ISSET(gapflags,
+ REP_GAP_FORCE | REP_GAP_REREQUEST))
+ rep->max_wait_pg = rep->curinfo->max_pgno;
+ else
+ rep->max_wait_pg = rep->ready_pg;
+ } else {
+ /*
+ * If we're forcing, and waiting_pg is less than
+ * the page we want to start this request at, then
+ * we set max_wait_pg to the max pgno in the file.
+ */
+ if (FLD_ISSET(gapflags, REP_GAP_FORCE) &&
+ rep->waiting_pg < tmpfp->pgno)
+ rep->max_wait_pg = rep->curinfo->max_pgno;
+ else
+ rep->max_wait_pg = rep->waiting_pg - 1;
+ }
+ tmpfp->max_pgno = rep->max_wait_pg;
+ /*
+ * Gap requests are "new" and can go anywhere.
+ */
+ if (FLD_ISSET(gapflags, REP_GAP_REREQUEST))
+ flags = DB_REP_REREQUEST;
+ else
+ flags = DB_REP_ANYWHERE;
+ } else {
+ /*
+ * Request 1 page - set max to ready_pg.
+ */
+ rep->max_wait_pg = rep->ready_pg;
+ tmpfp->max_pgno = rep->ready_pg;
+ /*
+ * If we're dropping to singletons, this is a rerequest.
+ */
+ flags = DB_REP_REREQUEST;
+ }
+ /*
+ * With a known master, send it the page request; without one,
+ * broadcast a request to find the master instead.
+ */
+ if ((master = rep->master_id) != DB_EID_INVALID) {
+ STAT(rep->stat.st_pg_requested++);
+ /*
+ * We need to request the pages, but we need to get the
+ * new info into rep->finfo. Assert that the sizes never
+ * change. The only thing this should do is change
+ * the pgno field. Everything else remains the same.
+ */
+ if ((ret = __rep_fileinfo_marshal(env, rep->infoversion,
+ tmpfp, buf, msgsz, &len)) == 0) {
+ DB_INIT_DBT(max_pg_dbt, buf, len);
+ DB_ASSERT(env, len == max_pg_dbt.size);
+ (void)__rep_send_message(env, master,
+ REP_PAGE_REQ, NULL, &max_pg_dbt, 0, flags);
+ }
+ } else
+ (void)__rep_send_message(env, DB_EID_BROADCAST,
+ REP_MASTER_REQ, NULL, NULL, 0, 0);
+
+ __os_free(env, buf);
+err:
+ /* Only the heap copy made for a NULL reqfp needs freeing. */
+ if (alloc)
+ __os_free(env, tmpfp);
+ return (ret);
+}
+
+/*
+ * __rep_finfo_alloc -
+ *	Make a self-contained heap copy of a fileinfo structure.
+ *
+ * env:		environment handle.
+ * rfpsrc:	structure to copy; its uid and info DBTs are copied too.
+ * rfpp:	receives the copy, which is a single allocation that the
+ *		caller releases with one __os_free.
+ *
+ * Returns 0 on success or the error from __os_malloc.
+ *
+ * PUBLIC: int __rep_finfo_alloc __P((ENV *, __rep_fileinfo_args *,
+ * PUBLIC:	__rep_fileinfo_args **));
+ */
+int
+__rep_finfo_alloc(env, rfpsrc, rfpp)
+	ENV *env;
+	__rep_fileinfo_args *rfpsrc, **rfpp;
+{
+	__rep_fileinfo_args *copy;
+	u_int8_t *tail;
+	size_t need;
+	int ret;
+
+	/*
+	 * One allocation holds everything, laid out as:
+	 *	[ structure ][ uid bytes ][ info bytes ]
+	 */
+	need = sizeof(__rep_fileinfo_args) +
+	    rfpsrc->uid.size + rfpsrc->info.size;
+	ret = __os_malloc(env, need, &copy);
+	if (ret != 0)
+		return (ret);
+
+	/* The structure itself; its DBT data pointers are fixed up below. */
+	memcpy(copy, rfpsrc, sizeof(__rep_fileinfo_args));
+
+	/* uid data goes right after the structure. */
+	tail = (u_int8_t *)copy + sizeof(__rep_fileinfo_args);
+	copy->uid.data = tail;
+	memcpy(tail, rfpsrc->uid.data, rfpsrc->uid.size);
+
+	/* info data follows the uid data. */
+	tail += rfpsrc->uid.size;
+	copy->info.data = tail;
+	memcpy(tail, rfpsrc->info.data, rfpsrc->info.size);
+
+	*rfpp = copy;
+	return (0);
+}
+
+/*
+ * __rep_log_setup -
+ *	Restart the log at the given file number and version so that it is
+ *	ready for the first log record we need from the master.  The LSN left
+ *	in lp->lsn becomes rep->first_lsn and the transaction region's last
+ *	checkpoint LSN is zeroed.  If lsnp is non-NULL and __log_newfile
+ *	succeeded, the LSN it reported is returned through lsnp.
+ *
+ *	Returns the result of __log_newfile.
+ */
+static int
+__rep_log_setup(env, rep, file, version, lsnp)
+	ENV *env;
+	REP *rep;
+	u_int32_t file;
+	u_int32_t version;
+	DB_LSN *lsnp;
+{
+	DB_LOG *dblp;
+	DB_LSN new_lsn;
+	DB_TXNREGION *txn_region;
+	LOG *lp;
+	int ret;
+
+	dblp = env->lg_handle;
+	lp = dblp->reginfo.primary;
+	txn_region = env->tx_handle->reginfo.primary;
+
+	/*
+	 * Start a new log file at the file number of the first LSN we
+	 * need to get from the master.
+	 */
+	LOG_SYSTEM_LOCK(env);
+	ret = __log_newfile(dblp, &new_lsn, file, version);
+	if (ret == 0 && lsnp != NULL)
+		*lsnp = new_lsn;
+	LOG_SYSTEM_UNLOCK(env);
+
+	/*
+	 * We were handed the LSN of the checkpoint, but what we need is the
+	 * LSN of the beginning of the file.  __log_newfile conveniently left
+	 * that in lp->lsn, so take first_lsn from there.
+	 */
+	rep->first_lsn = lp->lsn;
+
+	TXN_SYSTEM_LOCK(env);
+	ZERO_LSN(txn_region->last_ckp);
+	TXN_SYSTEM_UNLOCK(env);
+
+	return (ret);
+}
+
+/*
+ * __rep_queue_filedone -
+ * Determine if we're really done getting the pages for a queue file.
+ * Queue is handled in several steps.
+ * 1. First we get the meta page only.
+ * 2. We use the meta-page information to figure out first and last
+ * page numbers (and if queue wraps, first can be > last).
+ * 3. If first <= last, we do a REP_PAGE_REQ for all pages.
+ * 4. If first > last, we REP_PAGE_REQ from first -> max page number.
+ * Then we'll ask for page 1 -> last.
+ *
+ * This function can return several things:
+ * DB_REP_PAGEDONE - if we're done with this file (this includes an
+ * empty queue, for which no further pages are requested).
+ * 0 - if we're not done with this file and __rep_pggap_req succeeded
+ * in asking for the next batch of pages.
+ * error - if we get an error doing some operations.
+ *
+ * rfp->max_pgno records how far we have got: 0 means only the meta page
+ * has arrived, a value other than "last" means the wrapped pages are
+ * still outstanding, and "last" means every page has arrived.
+ *
+ * This function will open a dbp handle to the queue file. This is needed
+ * by most of the QAM macros. We'll open it on the first pass through
+ * here and we'll close it whenever we decide we're done. Between calls
+ * the open cursor (and through it the handle) lives in rep->queue_dbc.
+ *
+ * When built without HAVE_QUEUE, this just returns __db_no_queue_am().
+ */
+static int
+__rep_queue_filedone(env, ip, rep, rfp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ REP *rep;
+ __rep_fileinfo_args *rfp;
+{
+#ifndef HAVE_QUEUE
+ COMPQUIET(ip, NULL);
+ COMPQUIET(rep, NULL);
+ COMPQUIET(rfp, NULL);
+ return (__db_no_queue_am(env));
+#else
+ DB *queue_dbp;
+ db_pgno_t first, last;
+ u_int32_t flags;
+ int empty, ret, t_ret;
+
+ ret = 0;
+ queue_dbp = NULL;
+ if (rep->queue_dbc == NULL) {
+ /*
+ * First pass for this file: no cursor yet, so create and
+ * open the handle and cursor now.
+ * We need to do a sync here so that the open
+ * can find the file and file id.
+ */
+ if ((ret = __memp_sync_int(env, NULL, 0,
+ DB_SYNC_CACHE | DB_SYNC_INTERRUPT_OK, NULL, NULL)) != 0)
+ goto out;
+ if ((ret =
+ __db_create_internal(&queue_dbp, env, 0)) != 0)
+ goto out;
+ flags = DB_NO_AUTO_COMMIT |
+ (F_ISSET(env, ENV_THREAD) ? DB_THREAD : 0);
+ /*
+ * We need to check whether this is in-memory so that we pass
+ * the name correctly as either the file or the database name.
+ */
+ if ((ret = __db_open(queue_dbp, ip, NULL,
+ FLD_ISSET(rfp->db_flags, DB_AM_INMEM) ? NULL :
+ rfp->info.data,
+ FLD_ISSET(rfp->db_flags, DB_AM_INMEM) ? rfp->info.data :
+ NULL,
+ DB_QUEUE, flags, 0, PGNO_BASE_MD)) != 0)
+ goto out;
+
+ if ((ret = __db_cursor(queue_dbp,
+ ip, NULL, &rep->queue_dbc, 0)) != 0)
+ goto out;
+ } else
+ queue_dbp = rep->queue_dbc->dbp;
+
+ if ((ret = __queue_pageinfo(queue_dbp,
+ &first, &last, &empty, 0, 0)) != 0)
+ goto out;
+ RPRINT(env, DB_VERB_REP_SYNC, (env,
+ "Queue fileinfo: first %lu, last %lu, empty %d",
+ (u_long)first, (u_long)last, empty));
+ /*
+ * We can be at the end of 3 possible states.
+ * 1. We have received the meta-page and now need to get the
+ * rest of the pages in the database.
+ * 2. We have received from first -> max_pgno. We might be done,
+ * or we might need to ask for wrapped pages.
+ * 3. We have received all pages in the file. We're done.
+ */
+ if (rfp->max_pgno == 0) {
+ /*
+ * We have just received the meta page. Set up the next
+ * pages to ask for and check if the file is empty.
+ * An empty queue goes straight to the cleanup code, which
+ * reports DB_REP_PAGEDONE.
+ */
+ if (empty)
+ goto out;
+ if (first > last) {
+ rfp->max_pgno =
+ QAM_RECNO_PAGE(rep->queue_dbc->dbp, UINT32_MAX);
+ } else
+ rfp->max_pgno = last;
+ RPRINT(env, DB_VERB_REP_SYNC, (env,
+ "Queue fileinfo: First req: first %lu, last %lu",
+ (u_long)first, (u_long)rfp->max_pgno));
+ goto req;
+ } else if (rfp->max_pgno != last) {
+ /*
+ * If max_pgno != last that means we're dealing with a
+ * wrapped situation. Request next batch of pages.
+ * Set npages to 1 (via first) because we already have page 0,
+ * the meta-page, now we need pages 1-max_pgno.
+ */
+ first = 1;
+ rfp->max_pgno = last;
+ RPRINT(env, DB_VERB_REP_SYNC, (env,
+ "Queue fileinfo: Wrap req: first %lu, last %lu",
+ (u_long)first, (u_long)last));
+req:
+ /*
+ * Since we're simulating a "gap" to resend new PAGE_REQ
+ * for this file, we need to set waiting page to last + 1
+ * so that we'll ask for all from ready_pg -> last.
+ * We return from here without closing anything: the cursor
+ * in rep->queue_dbc stays open for the next call.
+ */
+ rep->npages = first;
+ rep->ready_pg = first;
+ rep->waiting_pg = rfp->max_pgno + 1;
+ rep->max_wait_pg = PGNO_INVALID;
+ ret = __rep_pggap_req(env, rep, rfp, 0);
+ return (ret);
+ }
+ /*
+ * max_pgno == last
+ * If we get here, we have all the pages we need.
+ * Close the dbp and return.
+ * Error paths land here too; the first error seen is the one
+ * returned, and DB_REP_PAGEDONE is returned only if nothing failed.
+ */
+out:
+ if (rep->queue_dbc != NULL &&
+ (t_ret = __dbc_close(rep->queue_dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ rep->queue_dbc = NULL;
+
+ if (queue_dbp != NULL &&
+ (t_ret = __db_close(queue_dbp, NULL, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret == 0)
+ ret = DB_REP_PAGEDONE;
+ return (ret);
+#endif
+}
+
+/*
+ * __rep_remove_init_file --
+ *	Best-effort removal of the internal init flag file.  Nothing is done
+ *	when replication runs in-memory or when the internal init is an
+ *	abbreviated one.  A failed unlink is ignored; the only error that can
+ *	be returned is a failure to build the file's path name.
+ *
+ * PUBLIC: int __rep_remove_init_file __P((ENV *));
+ */
+int
+__rep_remove_init_file(env)
+	ENV *env;
+{
+	REP *rep;
+	char *path;
+	int ret;
+
+	rep = env->rep_handle->region;
+
+	/*
+	 * In-memory replication performs no file operations, and an
+	 * abbreviated internal init doesn't use an init file.
+	 */
+	if (FLD_ISSET(rep->config, REP_C_INMEM) ||
+	    F_ISSET(rep, REP_F_ABBREVIATED))
+		return (0);
+
+	ret = __db_appname(env, DB_APP_NONE, REP_INITNAME, NULL, &path);
+	if (ret != 0)
+		return (ret);
+
+	(void)__os_unlink(env, path, 0);
+	__os_free(env, path);
+	return (0);
+}
+
+/*
+ * Checks for the existence of the internal init flag file.  If it exists, we
+ * remove all logs and databases, and then remove the flag file.  This is
+ * intended to force the internal init to start over again, and thus affords
+ * protection against a client crashing during internal init.  This function
+ * must be called before normal recovery in order to be properly effective.
+ *
+ * !!!
+ * This function should only be called during initial set-up of the environment,
+ * before various subsystems are initialized.  It doesn't rely on the
+ * subsystems' code having been initialized, and it summarily deletes files "out
+ * from under" them, which might disturb the subsystems if they were up.
+ *
+ * Returns 0 when there is no init file, when the init file holds no intact
+ * file list (it is then simply removed), or when the cleanup completed;
+ * otherwise returns the error from the step that failed.  The init file
+ * handle is closed on every path, and always before the file is unlinked.
+ *
+ * PUBLIC: int __rep_reset_init __P((ENV *));
+ */
+int
+__rep_reset_init(env)
+	ENV *env;
+{
+	DB_FH *fhp;
+	__rep_update_args *rup;
+	DBT dbt;
+	char *allocated_dir, *dir, *init_name;
+	size_t cnt;
+	u_int32_t dbtvers, fvers, zero;
+	u_int8_t *next;
+	int ret, t_ret;
+
+	allocated_dir = NULL;
+	fhp = NULL;
+	rup = NULL;
+	dbt.data = NULL;
+
+	if ((ret = __db_appname(env,
+	    DB_APP_NONE, REP_INITNAME, NULL, &init_name)) != 0)
+		return (ret);
+
+	if ((ret = __os_open(
+	    env, init_name, 0, DB_OSO_RDONLY, DB_MODE_600, &fhp)) != 0) {
+		/* Nothing was opened, so there is nothing to close below. */
+		fhp = NULL;
+		if (ret == ENOENT)
+			ret = 0;
+		goto out;
+	}
+
+	RPRINT(env, DB_VERB_REP_SYNC,
+	    (env, "Cleaning up interrupted internal init"));
+
+	/* There are a few possibilities:
+	 *   1. no init file, or less than 1 full file list
+	 *   2. exactly one full file list
+	 *   3. more than one, less than a second full file list
+	 *   4. second file list in full
+	 *
+	 * In cases 2 or 4, we need to remove all logs, and then remove files
+	 * according to the (most recent) file list.  (In case 1 or 3, we don't
+	 * have to do anything.)
+	 *
+	 * The __rep_get_file_list function takes care of folding these cases
+	 * into two simple outcomes.
+	 *
+	 * As of 4.7, the first 4 bytes are 0.  Read the first 4 bytes now.
+	 * If they are non-zero it means we have an old-style init file.
+	 * Otherwise, pass the file version in to rep_get_file_list.
+	 */
+	if ((ret = __os_read(env, fhp, &zero, sizeof(zero), &cnt)) != 0)
+		goto out;
+	/*
+	 * If we read successfully, but not enough, then unlink the file.
+	 */
+	if (cnt != sizeof(zero))
+		goto rm;
+	if (zero != 0) {
+		/*
+		 * Old style file.  We have to set fvers to the 4.6
+		 * version of the file and also rewind the file so
+		 * that __rep_get_file_list can read out the length itself.
+		 */
+		if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0)
+			goto out;
+		fvers = REP_INITVERSION_46;
+	} else if ((ret = __os_read(env,
+	    fhp, &fvers, sizeof(fvers), &cnt)) != 0)
+		goto out;
+	else if (cnt != sizeof(fvers))
+		goto rm;
+
+	/*
+	 * We are finished with the handle once the list has been read, so
+	 * close it whether or not the read worked.  A read error takes
+	 * precedence over a close error.
+	 */
+	ret = __rep_get_file_list(env, fhp, fvers, &dbtvers, &dbt);
+	t_ret = __os_closehandle(env, fhp);
+	fhp = NULL;
+	if (t_ret != 0 || ret != 0) {
+		if (ret == 0)
+			ret = t_ret;
+		goto out;
+	}
+	if (dbt.data == NULL) {
+		/*
+		 * The init file did not end with an intact file list.  Since we
+		 * never start log/db removal without an intact file list
+		 * sync'ed to the init file, this must mean we don't have any
+		 * partial set of files to clean up.  So all we need to do is
+		 * remove the init file.
+		 */
+		goto rm;
+	}
+
+	/* Remove all log files. */
+	if (env->dbenv->db_log_dir == NULL)
+		dir = env->db_home;
+	else {
+		if ((ret = __db_appname(env,
+		    DB_APP_NONE, env->dbenv->db_log_dir, NULL, &dir)) != 0)
+			goto out;
+		allocated_dir = dir;
+	}
+
+	if ((ret = __rep_remove_by_prefix(env,
+	    dir, LFPREFIX, sizeof(LFPREFIX)-1, DB_APP_LOG)) != 0)
+		goto out;
+
+	/*
+	 * Remove databases according to the list, and queue extent files by
+	 * searching them out on a walk through the data_dir's.
+	 *
+	 * The fileinfo entries start at 'next', after the update header, so
+	 * only the bytes from there to the end of the buffer are available to
+	 * the per-file unmarshal calls.
+	 */
+	if ((ret = __rep_update_unmarshal(env, dbtvers,
+	    &rup, dbt.data, dbt.size, &next)) != 0)
+		goto out;
+	if ((ret = __rep_unlink_by_list(env, dbtvers, next,
+	    dbt.size - (u_int32_t)(next - (u_int8_t *)dbt.data),
+	    rup->num_files)) != 0)
+		goto out;
+
+	/*
+	 * Here, we've established that the file exists.  The short-read paths
+	 * arrive with the handle still open: close it first, since an open
+	 * file cannot be removed on every platform.
+	 */
+rm:	if (fhp != NULL) {
+		(void)__os_closehandle(env, fhp);
+		fhp = NULL;
+	}
+	(void)__os_unlink(env, init_name, 0);
+
+	/* Error paths that left the handle open (read/seek failures). */
+out:	if (fhp != NULL)
+		(void)__os_closehandle(env, fhp);
+	if (rup != NULL)
+		__os_free(env, rup);
+	if (allocated_dir != NULL)
+		__os_free(env, allocated_dir);
+	if (dbt.data != NULL)
+		__os_free(env, dbt.data);
+
+	__os_free(env, init_name);
+	return (ret);
+}
+
+/*
+ * __rep_get_file_list --
+ *	Fetch the last complete file list stored in the init file.
+ *
+ *	Each stored list is a u_int32_t byte count, in native format, followed
+ *	by that many bytes of file list laid out as in an UPDATE message (though
+ *	many parts of it are meaningless here).  When fvers is at least
+ *	REP_INITVERSION_47, a u_int32_t message version precedes the byte count.
+ *
+ *	A NULL dbt->data on return tells the caller that the file is empty or
+ *	ends with a partial list; the return value may still be 0 in that case.
+ *	Otherwise dbt->data is allocated space that becomes the responsibility
+ *	of the caller, dbt->size is its length, and *dbtvers is the message
+ *	version (DB_REPVERSION_46 when the file carries none).
+ */
+static int
+__rep_get_file_list(env, fhp, fvers, dbtvers, dbt)
+	ENV *env;
+	DB_FH *fhp;
+	u_int32_t fvers;
+	u_int32_t *dbtvers;
+	DBT *dbt;
+{
+	u_int32_t list_len, msg_vers;
+	size_t nread;
+	int pass, ret;
+
+	dbt->data = NULL;
+	list_len = 0;
+	msg_vers = DB_REPVERSION_46;
+
+	/* There are at most two lists in the file: the old and the new. */
+	for (pass = 0; pass < 2; pass++) {
+		if (fvers >= REP_INITVERSION_47) {
+			ret = __os_read(env,
+			    fhp, &msg_vers, sizeof(msg_vers), &nread);
+			if (ret != 0)
+				goto err;
+			/* Hitting the end after one full list is fine. */
+			if (nread == 0 && dbt->data != NULL)
+				break;
+			if (nread != sizeof(msg_vers))
+				goto err;
+		}
+
+		ret = __os_read(env,
+		    fhp, &list_len, sizeof(list_len), &nread);
+		if (ret != 0)
+			goto err;
+		/*
+		 * Likewise, reaching the end here is acceptable as long as
+		 * we already hold a list from an earlier pass.
+		 */
+		if (nread == 0 && dbt->data != NULL)
+			break;
+		if (nread != sizeof(list_len))
+			goto err;
+
+		/* The buffer is reused: a later list replaces an earlier. */
+		ret = __os_realloc(env, (size_t)list_len, &dbt->data);
+		if (ret != 0)
+			goto err;
+
+		ret = __os_read(env, fhp, dbt->data, list_len, &nread);
+		if (ret != 0)
+			goto err;
+		if (nread != (size_t)list_len)
+			goto err;
+	}
+
+	dbt->size = list_len;
+	*dbtvers = msg_vers;
+	return (0);
+
+err:
+	/*
+	 * Arriving here with ret == 0 is legitimate: it means a read came up
+	 * short, and the NULL dbt->data is what tells the caller that there
+	 * is no intact list.
+	 */
+	if (dbt->data != NULL) {
+		__os_free(env, dbt->data);
+		dbt->data = NULL;
+	}
+	return (ret);
+}
+
+/*
+ * __rep_remove_by_prefix --
+ *	Unlink every entry listed in directory 'dir' whose name begins with the
+ *	first pref_len bytes of 'prefix'.  Note that the path handed to
+ *	__os_unlink is built by __db_appname from 'appname' and the entry's
+ *	name.  Unlink failures are ignored; a path-building failure stops the
+ *	walk and is returned.  Notice how similar this is to __rep_walk_dir.
+ */
+static int
+__rep_remove_by_prefix(env, dir, prefix, pref_len, appname)
+	ENV *env;
+	const char *dir;
+	const char *prefix;
+	size_t pref_len;
+	APPNAME appname;		/* What kind of name. */
+{
+	char *path, **entries;
+	int idx, nentries, ret;
+
+	if ((ret = __os_dirlist(env, dir, 0, &entries, &nentries)) != 0)
+		return (ret);
+
+	for (idx = 0; idx < nentries; idx++) {
+		/* Skip anything that doesn't carry the prefix. */
+		if (strncmp(entries[idx], prefix, pref_len) != 0)
+			continue;
+
+		ret = __db_appname(env, appname, entries[idx], NULL, &path);
+		if (ret != 0)
+			break;
+		(void)__os_unlink(env, path, 0);
+		__os_free(env, path);
+	}
+
+	__os_dirfree(env, entries, nentries);
+	return (ret);
+}
+
+/*
+ * __rep_unlink_by_list --
+ *	Removes database files according to the contents of a list.
+ *
+ *	'filelist' points at 'count' marshaled fileinfo entries occupying at
+ *	most 'filesz' bytes.  Each entry's file is unlinked (failures to
+ *	unlink are ignored), after which queue extent files are removed from
+ *	the environment home, or from each configured data directory.
+ *
+ *	This function must support removal either during environment creation,
+ *	or when an internal init is reset in the middle.  This means it must
+ *	work regardless of whether underlying subsystems are initialized.
+ *	However, it may assume that databases are not open.  That means there
+ *	is no REP!
+ */
+static int
+__rep_unlink_by_list(env, version, filelist, filesz, count)
+	ENV *env;
+	u_int32_t version;
+	u_int8_t *filelist;
+	u_int32_t filesz;
+	u_int32_t count;
+{
+	__rep_fileinfo_args *finfo;
+	char **dirp, *dirpath, *path;
+	u_int8_t *next_entry;
+	int ret;
+
+	finfo = NULL;
+	for (ret = 0; count > 0; count--) {
+		/* Decode one entry and step past it in the buffer. */
+		ret = __rep_fileinfo_unmarshal(env, version,
+		    &finfo, filelist, filesz, &next_entry);
+		if (ret != 0)
+			goto out;
+		filesz -= (u_int32_t)(next_entry - filelist);
+		filelist = next_entry;
+
+		ret = __db_appname(env,
+		    DB_APP_DATA, finfo->info.data, NULL, &path);
+		if (ret != 0)
+			goto out;
+		(void)__os_unlink(env, path, 0);
+		__os_free(env, path);
+
+		__os_free(env, finfo);
+		finfo = NULL;
+	}
+
+	/* Notice how similar this code is to __rep_find_dbs. */
+	if (env->dbenv->db_data_dir == NULL) {
+		ret = __rep_remove_by_prefix(env, env->db_home,
+		    QUEUE_EXTENT_PREFIX, sizeof(QUEUE_EXTENT_PREFIX) - 1,
+		    DB_APP_DATA);
+		goto out;
+	}
+
+	/* Sweep each data directory in turn, stopping at the first error. */
+	for (dirp = env->dbenv->db_data_dir;
+	    ret == 0 && *dirp != NULL; ++dirp) {
+		ret = __db_appname(env, DB_APP_NONE, *dirp, NULL, &dirpath);
+		if (ret == 0) {
+			ret = __rep_remove_by_prefix(env, dirpath,
+			    QUEUE_EXTENT_PREFIX,
+			    sizeof(QUEUE_EXTENT_PREFIX)-1, DB_APP_DATA);
+			__os_free(env, dirpath);
+		}
+	}
+
+out:
+	if (finfo != NULL)
+		__os_free(env, finfo);
+	return (ret);
+}
+
+/*
+ * __rep_remove_by_list --
+ *	Walk 'count' marshaled fileinfo entries, occupying at most 'filesz'
+ *	bytes starting at 'filelist', and hand each one to __rep_remove_file.
+ *	A file that is already gone (ENOENT) is not an error.  The walk stops
+ *	at the first other failure, which is returned.
+ */
+static int
+__rep_remove_by_list(env, version, filelist, filesz, count)
+	ENV *env;
+	u_int32_t version;
+	u_int8_t *filelist;
+	u_int32_t filesz;
+	u_int32_t count;
+{
+	__rep_fileinfo_args *finfo;
+	u_int8_t *next_entry;
+	int ret;
+
+	finfo = NULL;
+	for (ret = 0; count > 0; count--) {
+		/* Decode one entry and step past it in the buffer. */
+		ret = __rep_fileinfo_unmarshal(env, version,
+		    &finfo, filelist, filesz, &next_entry);
+		if (ret != 0)
+			break;
+		filesz -= (u_int32_t)(next_entry - filelist);
+		filelist = next_entry;
+
+		/*
+		 * If the file already doesn't exist, that's perfectly OK.
+		 * This can easily happen if we're cleaning up an interrupted
+		 * internal init, and we only got part-way through the list
+		 * of files.
+		 */
+		ret = __rep_remove_file(env, finfo->uid.data,
+		    finfo->info.data, finfo->type, finfo->db_flags);
+		if (ret == ENOENT)
+			ret = 0;
+		if (ret != 0)
+			break;
+
+		__os_free(env, finfo);
+		finfo = NULL;
+	}
+
+	/* An early exit from the loop may leave an entry still allocated. */
+	if (finfo != NULL)
+		__os_free(env, finfo);
+	return (ret);
+}
diff --git a/rep/rep_elect.c b/rep/rep_elect.c
new file mode 100644
index 0000000..61f79e4
--- /dev/null
+++ b/rep/rep_elect.c
@@ -0,0 +1,1353 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2004-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/log.h"
+
+/*
+ * Phase 1 is done once we have heard from every site (sites >= nsites)
+ * and w_priority is non-zero (presumably the priority of the best
+ * candidate seen so far -- confirm against the REP definition).
+ *
+ * We need to check sites against nsites, not more than half
+ * like we do in __rep_elect and the VOTE2 code. The
+ * reason is that we want to process all the incoming votes
+ * and not short-circuit once we reach more than half. The
+ * real winner's vote may be in the last half.
+ */
+#define IS_PHASE1_DONE(rep) \
+ ((rep)->sites >= (rep)->nsites && (rep)->w_priority > 0)
+
+/*
+ * True when the votes counted so far reach the number required and the
+ * given winner is this site.  Both arguments are parenthesized so that
+ * any expression may be passed for either of them.
+ */
+#define	I_HAVE_WON(rep, winner)						\
+	((rep)->votes >= (rep)->nvotes && (winner) == (rep)->eid)
+
+static void __rep_cmp_vote __P((ENV *, REP *, int, DB_LSN *,
+ u_int32_t, u_int32_t, u_int32_t, u_int32_t));
+static int __rep_elect_init
+ __P((ENV *, u_int32_t, u_int32_t, int *, u_int32_t *));
+static int __rep_fire_elected __P((ENV *, REP *, u_int32_t));
+static void __rep_elect_master __P((ENV *, REP *));
+static int __rep_grow_sites __P((ENV *, u_int32_t));
+static int __rep_tally __P((ENV *, REP *, int, u_int32_t *, u_int32_t, int));
+static int __rep_wait __P((ENV *, db_timeout_t *, int, u_int32_t, u_int32_t));
+
+/*
+ * __rep_elect_pp --
+ *	DB_ENV->rep_elect pre/post processing: validate the call, then let
+ *	__rep_elect_int hold/participate in the election for a new master.
+ *
+ *	EINVAL is returned when called from a Replication Manager
+ *	application, when no transport function has been set, or when a
+ *	non-zero nsites is given while leases are configured.
+ *
+ * PUBLIC: int __rep_elect_pp
+ * PUBLIC: __P((DB_ENV *, u_int32_t, u_int32_t, u_int32_t));
+ */
+int
+__rep_elect_pp(dbenv, given_nsites, nvotes, flags)
+	DB_ENV *dbenv;
+	u_int32_t given_nsites, nvotes;
+	u_int32_t flags;
+{
+	ENV *env;
+
+	env = dbenv->env;
+
+	ENV_REQUIRES_CONFIG_XX(
+	    env, rep_handle, "DB_ENV->rep_elect", DB_INIT_REP);
+
+	if (APP_IS_REPMGR(env)) {
+		__db_errx(env,
+"DB_ENV->rep_elect: cannot call from Replication Manager application");
+		return (EINVAL);
+	}
+
+	/* Elections send messages, so a transport function is required. */
+	if (env->rep_handle->send == NULL) {
+		__db_errx(env,
+    "DB_ENV->rep_elect: must be called after DB_ENV->rep_set_transport");
+		return (EINVAL);
+	}
+
+	if (given_nsites != 0 && IS_USING_LEASES(env)) {
+		__db_errx(env,
+	    "DB_ENV->rep_elect: nsites must be zero if leases configured");
+		return (EINVAL);
+	}
+
+	return (__rep_elect_int(env, given_nsites, nvotes, flags));
+}
+
+/*
+ * __rep_elect_int --
+ * Internal processing to hold/participate in an election for
+ * a new master after master failure.
+ *
+ * given_nsites: number of sites in the group; 0 means use the value
+ * in rep->config_nsites.
+ * nvotes: number of votes needed to win; 0 means a simple majority
+ * of nsites.
+ * flags: currently unused.
+ *
+ * Returns EINVAL if nvotes is larger than nsites or if our own vote
+ * cannot be tallied, and DB_REP_UNAVAIL if an election phase ends
+ * without enough votes to elect a master. If this site wins, the
+ * result of __rep_fire_elected is returned.
+ *
+ * PUBLIC: int __rep_elect_int
+ * PUBLIC: __P((ENV *, u_int32_t, u_int32_t, u_int32_t));
+ */
+int
+__rep_elect_int(env, given_nsites, nvotes, flags)
+ ENV *env;
+ u_int32_t given_nsites, nvotes;
+ u_int32_t flags;
+{
+ DB_LOG *dblp;
+ DB_LSN lsn;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ LOG *lp;
+ REP *rep;
+ int done, elected, full_elect, in_progress, locked, need_req;
+ int ret, send_vote, t_ret;
+ u_int32_t ack, ctlflags, egen, nsites, orig_tally, priority, realpri;
+ u_int32_t repflags, tiebreaker;
+ db_timeout_t last_to, timeout, to;
+
+ COMPQUIET(flags, 0);
+ COMPQUIET(egen, 0);
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ elected = 0;
+
+ /*
+ * Specifying 0 for nsites signals us to use the value configured
+ * previously via rep_set_nsites. Similarly, if the given nvotes is 0,
+ * it asks us to compute the value representing a simple majority.
+ */
+ nsites = given_nsites == 0 ? rep->config_nsites : given_nsites;
+ ack = nvotes == 0 ? ELECTION_MAJORITY(nsites) : nvotes;
+ locked = 0;
+
+ /*
+ * XXX
+ * If users give us less than a majority, they run the risk of
+ * having a network partition. However, this also allows the
+ * scenario of master/1 client to elect the client. Allow
+ * sub-majority values, but give a warning.
+ */
+ if (ack <= (nsites / 2)) {
+ __db_errx(env,
+ "DB_ENV->rep_elect:WARNING: nvotes (%d) is sub-majority with nsites (%d)",
+ nvotes, nsites);
+ }
+
+ if (nsites < ack) {
+ __db_errx(env,
+ "DB_ENV->rep_elect: nvotes (%d) is larger than nsites (%d)",
+ ack, nsites);
+ return (EINVAL);
+ }
+
+ /*
+ * Default to the normal timeout unless the user configured
+ * a full election timeout and we think we need a full election.
+ * We ask for a full election when REP_F_GROUP_ESTD is not set.
+ */
+ full_elect = 0;
+ timeout = rep->elect_timeout;
+ if (!F_ISSET(rep, REP_F_GROUP_ESTD) && rep->full_elect_timeout != 0) {
+ full_elect = 1;
+ timeout = rep->full_elect_timeout;
+ }
+ realpri = rep->priority;
+
+ RPRINT(env, DB_VERB_REP_ELECT,
+ (env, "Start election nsites %d, ack %d, priority %d",
+ nsites, ack, realpri));
+
+ /*
+ * Special case when having an election while running with
+ * sites of potentially mixed versions. We set a bit indicating
+ * we're an electable site, but set our priority to 0.
+ * Old sites will never elect us, with 0 priority, but if all
+ * we have are new sites, then we can elect the best electable
+ * site of the group.
+ * Thus 'priority' is this special, possibly-fake, effective
+ * priority that we'll use for this election, while 'realpri' is our
+ * real, configured priority, as retrieved from REP region.
+ */
+ ctlflags = realpri != 0 ? REPCTL_ELECTABLE : 0;
+ ENV_ENTER(env, ip);
+
+ orig_tally = 0;
+ if ((ret = __rep_elect_init(env, nsites, ack,
+ &in_progress, &orig_tally)) != 0) {
+ if (ret == DB_REP_NEWMASTER)
+ ret = 0;
+ goto err;
+ }
+ /*
+ * If another thread is in the middle of an election we
+ * just quietly return and not interfere.
+ */
+ if (in_progress)
+ goto edone;
+
+ priority = lp->persist.version != DB_LOGVERSION ? 0 : realpri;
+#ifdef CONFIG_TEST
+ /*
+ * This allows us to unit test the ELECTABLE flag simply by
+ * using the priority values.
+ */
+ if (priority > 0 && priority <= 5) {
+ RPRINT(env, DB_VERB_REP_ELECT, (env,
+ "Artificially setting priority 0 (ELECTABLE) for CONFIG_TEST mode"));
+ DB_ASSERT(env, ctlflags == REPCTL_ELECTABLE);
+ priority = 0;
+ }
+#endif
+ __os_gettime(env, &rep->etime, 1);
+ REP_SYSTEM_LOCK(env);
+ /*
+ * If leases are configured, wait for them to expire, and
+ * see if we can discover the master while waiting.
+ * The REP lock is dropped for the duration of the wait.
+ */
+ if (IS_USING_LEASES(env)) {
+ to = __rep_lease_waittime(env);
+ if (to != 0) {
+ F_SET(rep, REP_F_EPHASE0);
+ REP_SYSTEM_UNLOCK(env);
+ (void)__rep_send_message(env, DB_EID_BROADCAST,
+ REP_MASTER_REQ, NULL, NULL, 0, 0);
+ ret = __rep_wait(env, &to, 0, rep->egen, REP_F_EPHASE0);
+ REP_SYSTEM_LOCK(env);
+ repflags = rep->flags;
+ F_CLR(rep, REP_F_EPHASE0);
+ switch (ret) {
+ /*
+ * If waiting is successful, our flag is cleared
+ * and the master responded. We're done.
+ */
+ case DB_REP_EGENCHG:
+ case 0:
+ REP_SYSTEM_UNLOCK(env);
+ goto edone;
+ /*
+ * If we get a timeout, continue with the election.
+ */
+ case DB_TIMEOUT:
+ /*
+ * We have waited a full lease timeout. We
+ * need to check now under lock to verify that
+ * the phase was not over and that the client
+ * did not grant the lease. If either happened
+ * between the time the wait finished and we
+ * reacquired the mutex, we're done.
+ */
+ if (!FLD_ISSET(repflags, REP_F_EPHASE0) ||
+ __rep_islease_granted(env) != 0) {
+ ret = 0;
+ REP_SYSTEM_UNLOCK(env);
+ goto edone;
+ }
+ F_SET(rep, REP_F_LEASE_EXPIRED);
+ break;
+ default:
+ goto lockdone;
+ }
+ }
+ }
+ /*
+ * We need to lockout applying incoming log records during
+ * the election. We need to use a special rep_lockout_apply
+ * instead of rep_lockout_msg because we do not want to
+ * lockout all incoming messages, like other VOTEs!
+ * 'locked' remembers that the lockout must be undone on exit.
+ */
+ if ((ret = __rep_lockout_apply(env, rep, 0)) != 0)
+ goto lockdone;
+ locked = 1;
+ last_to = to = timeout;
+ REP_SYSTEM_UNLOCK(env);
+restart:
+ /* Generate a randomized tiebreaker value. */
+ __os_unique_id(env, &tiebreaker);
+ LOG_SYSTEM_LOCK(env);
+ lsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ REP_SYSTEM_LOCK(env);
+
+ F_SET(rep, REP_F_EPHASE1 | REP_F_NOARCHIVE);
+ F_CLR(rep, REP_F_TALLY);
+ /*
+ * We made sure that leases were expired before starting the
+ * election, but an existing master may be slow in responding.
+ * If, during lockout, acquiring mutexes, etc, the client has now
+ * re-granted its lease, we're done - a master exists.
+ */
+ if (IS_USING_LEASES(env) &&
+ __rep_islease_granted(env)) {
+ ret = 0;
+ goto lockdone;
+ }
+
+ /*
+ * If we are in the middle of recovering or internal
+ * init, we participate, but we set our priority to 0
+ * and turn off REPCTL_ELECTABLE. We *cannot* use the
+ * REP_F_RECOVER_MASK macro because we must explicitly
+ * exclude REP_F_RECOVER_VERIFY. If we are in verify
+ * then that is okay, we can be elected (i.e. we are not
+ * in an inconsistent state).
+ */
+ if (F_ISSET(rep, REP_F_READY_API | REP_F_READY_OP |
+ REP_F_RECOVER_LOG | REP_F_RECOVER_PAGE | REP_F_RECOVER_UPDATE)) {
+ RPRINT(env, DB_VERB_REP_ELECT, (env,
+ "Setting priority 0, unelectable, due to internal init/recovery"));
+ priority = 0;
+ ctlflags = 0;
+ }
+
+ /*
+ * We are about to participate at this egen. We must
+ * write out the next egen before participating in this one
+ * so that if we crash we can never participate in this egen
+ * again.
+ */
+ if ((ret = __rep_write_egen(env, rep, rep->egen + 1)) != 0)
+ goto lockdone;
+
+ /* Tally our own vote */
+ if (__rep_tally(env, rep, rep->eid, &rep->sites, rep->egen, 1) != 0) {
+ ret = EINVAL;
+ goto lockdone;
+ }
+ __rep_cmp_vote(env, rep, rep->eid, &lsn, priority, rep->gen,
+ tiebreaker, ctlflags);
+
+ RPRINT(env, DB_VERB_REP_ELECT, (env, "Beginning an election"));
+
+ /* Now send vote */
+ send_vote = DB_EID_INVALID;
+ egen = rep->egen;
+ done = IS_PHASE1_DONE(rep);
+ REP_SYSTEM_UNLOCK(env);
+ __rep_send_vote(env, &lsn, nsites, ack, priority, tiebreaker, egen,
+ DB_EID_BROADCAST, REP_VOTE1, ctlflags);
+ DB_ENV_TEST_RECOVERY(env, DB_TEST_ELECTVOTE1, ret, NULL);
+ if (done) {
+ REP_SYSTEM_LOCK(env);
+ goto vote;
+ }
+ last_to = to;
+ ret = __rep_wait(env, &to, full_elect, egen, REP_F_EPHASE1);
+ switch (ret) {
+ case 0:
+ /* Check if election complete or phase complete. */
+ if (!IN_ELECTION(rep)) {
+ RPRINT(env, DB_VERB_REP_ELECT,
+ (env, "Ended election phase 1"));
+ goto edone;
+ }
+ goto phase2;
+ case DB_REP_EGENCHG:
+ /*
+ * Pick up reducing our timeout where we last
+ * left off.
+ */
+ if (to > last_to)
+ to = last_to;
+ to = (to * 8) / 10;
+ RPRINT(env, DB_VERB_REP_ELECT, (env,
+"Egen changed while waiting. Now %lu. New timeout %lu, orig timeout %lu",
+ (u_long)rep->egen, (u_long)to, (u_long)timeout));
+ /*
+ * If the egen changed while we were sleeping, that
+ * means we're probably late to the next election,
+ * so we'll backoff our timeout so that we don't get
+ * into an out-of-phase election scenario.
+ *
+ * Backoff to 80% of the current timeout.
+ */
+ goto restart;
+ case DB_TIMEOUT:
+ break;
+ default:
+ goto err;
+ }
+
+ REP_SYSTEM_LOCK(env);
+ /*
+ * If we got here, we haven't heard from everyone, but we've
+ * run out of time, so it's time to decide if we have enough
+ * votes to pick a winner and if so, to send out a vote to
+ * the winner.
+ *
+ * Check the state of the world after reacquiring the mutex.
+ * See if the election actually finished anyway.
+ */
+ if (!IN_ELECTION(rep)) {
+ RPRINT(env, DB_VERB_REP_ELECT,
+ (env, "Ended election after acquiring mutex"));
+ ret = 0;
+ goto lockdone;
+ }
+ /*
+ * If our egen changed while we were waiting. We need to
+ * essentially reinitialize our election.
+ */
+ if (egen != rep->egen) {
+ REP_SYSTEM_UNLOCK(env);
+ RPRINT(env, DB_VERB_REP_ELECT,
+ (env, "Egen changed from %lu to %lu",
+ (u_long)egen, (u_long)rep->egen));
+ goto restart;
+ }
+ if (rep->sites >= rep->nvotes) {
+vote:
+ /* We think we've seen enough to cast a vote. */
+ send_vote = rep->winner;
+ /*
+ * See if we won. This will make sure we
+ * don't count ourselves twice if we're racing
+ * with incoming votes.
+ */
+ if (rep->winner == rep->eid) {
+ (void)__rep_tally(env, rep, rep->eid, &rep->votes,
+ egen, 2);
+ RPRINT(env, DB_VERB_REP_ELECT, (env,
+ "Counted my vote %d", rep->votes));
+ }
+ F_SET(rep, REP_F_EPHASE2);
+ F_CLR(rep, REP_F_EPHASE1);
+ }
+ REP_SYSTEM_UNLOCK(env);
+ if (send_vote == DB_EID_INVALID) {
+ /* We do not have enough votes to elect. */
+ if (rep->sites >= rep->nvotes)
+ __db_errx(env,
+ "No electable site found: recvd %d of %d votes from %d sites",
+ rep->sites, rep->nvotes, rep->nsites);
+ else
+ __db_errx(env,
+ "Not enough votes to elect: recvd %d of %d from %d sites",
+ rep->sites, rep->nvotes, rep->nsites);
+ ret = DB_REP_UNAVAIL;
+ goto err;
+ }
+
+ /*
+ * We have seen enough vote1's. Now we need to wait
+ * for all the vote2's.
+ */
+ if (send_vote != rep->eid) {
+ RPRINT(env, DB_VERB_REP_ELECT, (env, "Sending vote"));
+ __rep_send_vote(env, NULL, 0, 0, 0, 0, egen,
+ send_vote, REP_VOTE2, 0);
+ /*
+ * If we are NOT the new master we want to send
+ * our vote to the winner, and wait longer. The
+ * reason is that the winner may be "behind" us
+ * in the election waiting and if the master is
+ * down, the winner will wait the full timeout
+ * and we want to give the winner enough time to
+ * process all the votes. Otherwise we could
+ * incorrectly return DB_REP_UNAVAIL and start a
+ * new election before the winner can declare
+ * itself.
+ */
+ to = to * 2;
+ }
+
+phase2:
+ if (I_HAVE_WON(rep, rep->winner)) {
+ RPRINT(env, DB_VERB_REP_ELECT, (env,
+ "Skipping phase2 wait: already got %d votes", rep->votes));
+ REP_SYSTEM_LOCK(env);
+ goto i_won;
+ }
+ /*
+ * Don't set last_to to 'to' here because we may have adjusted
+ * it above. If egen changes we want to pick up reducing the
+ * timeout from the point we were above.
+ */
+ ret = __rep_wait(env, &to, full_elect, egen, REP_F_EPHASE2);
+ RPRINT(env, DB_VERB_REP_ELECT,
+ (env, "Ended election phase 2 %d", ret));
+ switch (ret) {
+ case 0:
+ goto edone;
+ case DB_REP_EGENCHG:
+ if (to > last_to)
+ to = last_to;
+ to = (to * 8) / 10;
+ RPRINT(env, DB_VERB_REP_ELECT, (env,
+"While waiting egen changed to %lu. Phase 2 New timeout %lu, orig timeout %lu",
+ (u_long)rep->egen,
+ (u_long)to, (u_long)timeout));
+ goto restart;
+ case DB_TIMEOUT:
+ ret = DB_REP_UNAVAIL;
+ break;
+ default:
+ goto err;
+ }
+ REP_SYSTEM_LOCK(env);
+ /*
+ * Check the state of the world after reacquiring the mutex.
+ * See if the election actually finished anyway.
+ */
+ if (!IN_ELECTION(rep)) {
+ RPRINT(env, DB_VERB_REP_ELECT,
+ (env, "Ended election phase 2 after acquiring mutex"));
+ ret = 0;
+ goto lockdone;
+ }
+ if (egen != rep->egen) {
+ REP_SYSTEM_UNLOCK(env);
+ RPRINT(env, DB_VERB_REP_ELECT, (env,
+ "Egen ph2 changed from %lu to %lu",
+ (u_long)egen, (u_long)rep->egen));
+ goto restart;
+ }
+ RPRINT(env, DB_VERB_REP_ELECT, (env,
+ "After phase 2: votes %d, nvotes %d, nsites %d",
+ rep->votes, rep->nvotes, rep->nsites));
+ if (I_HAVE_WON(rep, rep->winner)) {
+i_won: __rep_elect_master(env, rep);
+ ret = 0;
+ elected = 1;
+ }
+ if (0) {
+err: REP_SYSTEM_LOCK(env);
+ }
+lockdone:
+ /*
+ * Every path reaching this point holds the REP system lock: the
+ * "err" label just above exists to take it for the paths that
+ * jump there without it.
+ *
+ * If we get here because of a non-election error, then we
+ * did not tally our vote. The only non-election error is
+ * from elect_init where we were unable to grow_sites. In
+ * that case we do not want to discard all known election info.
+ */
+ if (ret == 0 || ret == DB_REP_UNAVAIL)
+ __rep_elect_done(env, rep, 0);
+ else if (orig_tally)
+ F_SET(rep, orig_tally);
+
+ /*
+ * If the election finished elsewhere, we need to clear
+ * the elect flag anyway. Paths that jump to "edone" arrive
+ * without the REP system lock, so the label takes it first.
+ */
+ if (0) {
+edone: REP_SYSTEM_LOCK(env);
+ }
+ F_CLR(rep, REP_F_INREPELECT);
+ if (locked) {
+ need_req = F_ISSET(rep, REP_F_SKIPPED_APPLY) &&
+ !I_HAVE_WON(rep, rep->winner);
+ F_CLR(rep, REP_F_READY_APPLY | REP_F_SKIPPED_APPLY);
+ REP_SYSTEM_UNLOCK(env);
+ /*
+ * If we skipped any log records, request them now.
+ */
+ if (need_req && (t_ret = __rep_resend_req(env, 0)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ } else
+ REP_SYSTEM_UNLOCK(env);
+
+ if (elected)
+ ret = __rep_fire_elected(env, rep, egen);
+
+ RPRINT(env, DB_VERB_REP_ELECT, (env,
+ "Ended election with %d, sites %d, egen %lu, flags 0x%lx",
+ ret, rep->sites, (u_long)rep->egen, (u_long)rep->flags));
+
+DB_TEST_RECOVERY_LABEL
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __rep_vote1 --
+ *	Handle incoming vote1 message on a client.
+ *
+ *	A site that is already master (REP_F_MASTER) tallies nothing: it
+ *	broadcasts REP_NEWMASTER with its current log LSN and returns 0.
+ *
+ *	Otherwise the vote information is decoded from rec.  Senders older
+ *	than DB_REPVERSION_47 use the REP_OLD_VOTE_INFO layout; newer ones are
+ *	unmarshaled, and an unmarshal failure is returned to the caller before
+ *	the rep system mutex is taken.  Then, under the rep system mutex:
+ *	- A vote whose egen is older than ours is not tallied; the sender is
+ *	  answered with a REP_ALIVE message carrying our egen and we return.
+ *	- A vote whose egen is newer than ours discards the current election
+ *	  state (__rep_elect_done), adopts the new egen and sets
+ *	  REP_F_EGENUPDATE.
+ *	- nsites/nvotes are initialized from the vote when no tally is in
+ *	  progress, otherwise raised to the vote's values when those are
+ *	  larger.
+ *	- Unless we are already in phase 2, the vote is tallied (__rep_tally)
+ *	  and compared with the best vote so far (__rep_cmp_vote).
+ *	- If no election is running here, the result is DB_REP_HOLDELECTION,
+ *	  or 0 when REP_F_INREPELECT was set when the vote arrived.
+ *	- If phase 1 is complete (IS_PHASE1_DONE) we enter phase 2: when we
+ *	  are the winner we count our own vote and, if that wins, fire the
+ *	  elected event; otherwise we send a REP_VOTE2 to the winner.
+ *
+ *	The `err' label is the else-arm of the IS_PHASE1_DONE test, so every
+ *	path that reaches it still holds the rep system mutex and releases it
+ *	exactly once; the VOTE2 path releases it before sending.
+ *
+ * PUBLIC: int __rep_vote1 __P((ENV *, __rep_control_args *, DBT *, int));
+ */
+int
+__rep_vote1(env, rp, rec, eid)
+ ENV *env;
+ __rep_control_args *rp;
+ DBT *rec;
+ int eid;
+{
+ DBT data_dbt;
+ DB_LOG *dblp;
+ DB_LSN lsn;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ REP_OLD_VOTE_INFO *ovi;
+ __rep_egen_args egen_arg;
+ __rep_vote_info_args tmpvi, *vi;
+ u_int32_t egen;
+ int elected, inelect, master, ret;
+ u_int8_t buf[__REP_MAXMSG_SIZE];
+ size_t len;
+
+ COMPQUIET(egen, 0);
+
+ elected = ret = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ if (F_ISSET(rep, REP_F_MASTER)) {
+ RPRINT(env, DB_VERB_REP_ELECT, (env, "Master received vote"));
+ LOG_SYSTEM_LOCK(env);
+ lsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_NEWMASTER, &lsn, NULL, 0, 0);
+ return (ret);
+ }
+
+ /*
+ * In 4.7 we changed to having fixed sized u_int32_t's from
+ * non-fixed 'int' fields in the vote structure.
+ */
+ if (rp->rep_version < DB_REPVERSION_47) {
+ ovi = (REP_OLD_VOTE_INFO *)rec->data;
+ tmpvi.egen = ovi->egen;
+ tmpvi.nsites = (u_int32_t)ovi->nsites;
+ tmpvi.nvotes = (u_int32_t)ovi->nvotes;
+ tmpvi.priority = (u_int32_t)ovi->priority;
+ tmpvi.tiebreaker = ovi->tiebreaker;
+ } else
+ if ((ret = __rep_vote_info_unmarshal(env,
+ &tmpvi, rec->data, rec->size, NULL)) != 0)
+ return (ret);
+ vi = &tmpvi;
+ REP_SYSTEM_LOCK(env);
+
+ /*
+ * If we get a vote from a later election gen, we
+ * clear everything from the current one, and we'll
+ * start over by tallying it. If we get an old vote,
+ * send an ALIVE to the old participant.
+ */
+ RPRINT(env, DB_VERB_REP_ELECT,
+ (env, "Received vote1 egen %lu, egen %lu",
+ (u_long)vi->egen, (u_long)rep->egen));
+ if (vi->egen < rep->egen) {
+ RPRINT(env, DB_VERB_REP_ELECT, (env,
+ "Received old vote %lu, egen %lu, ignoring vote1",
+ (u_long)vi->egen, (u_long)rep->egen));
+ egen_arg.egen = rep->egen;
+ REP_SYSTEM_UNLOCK(env);
+ if (rep->version < DB_REPVERSION_47)
+ DB_INIT_DBT(data_dbt, &egen_arg.egen,
+ sizeof(egen_arg.egen));
+ else {
+ if ((ret = __rep_egen_marshal(env,
+ &egen_arg, buf, __REP_EGEN_SIZE, &len)) != 0)
+ return (ret);
+ DB_INIT_DBT(data_dbt, buf, len);
+ }
+ (void)__rep_send_message(env,
+ eid, REP_ALIVE, &rp->lsn, &data_dbt, 0, 0);
+ return (ret);
+ }
+ inelect = F_ISSET(rep, REP_F_INREPELECT);
+ if (vi->egen > rep->egen) {
+ RPRINT(env, DB_VERB_REP_ELECT, (env,
+ "Received VOTE1 from egen %lu, my egen %lu; reset",
+ (u_long)vi->egen, (u_long)rep->egen));
+ /*
+ * Record if we're currently in rep_elect. If so, don't
+ * return HOLDELECTION because the election thread should
+ * notice it. However, there is a window where the thread
+ * could be at the tail end of processing the previous
+ * election and it would not see this change in egen.
+ */
+ __rep_elect_done(env, rep, 0);
+ rep->egen = vi->egen;
+ F_SET(rep, REP_F_EGENUPDATE);
+ }
+
+ /*
+ * If this site (sender of the VOTE1) is the first to the party, simply
+ * initialize values from the message. Otherwise, see if the site knows
+ * about more sites, and/or requires more votes, than we do.
+ */
+ if (!IN_ELECTION_TALLY(rep)) {
+ F_SET(rep, REP_F_TALLY);
+ rep->nsites = vi->nsites;
+ rep->nvotes = vi->nvotes;
+ } else {
+ if (vi->nsites > rep->nsites)
+ rep->nsites = vi->nsites;
+ if (vi->nvotes > rep->nvotes)
+ rep->nvotes = vi->nvotes;
+ }
+
+ /*
+ * We are keeping the vote, let's see if that changes our
+ * count of the number of sites.
+ */
+ if (rep->sites + 1 > rep->nsites)
+ rep->nsites = rep->sites + 1;
+ /*
+ * Ignore vote1's if we're in phase 2.
+ */
+ if (F_ISSET(rep, REP_F_EPHASE2)) {
+ RPRINT(env, DB_VERB_REP_ELECT,
+ (env, "In phase 2, ignoring vote1"));
+ goto err;
+ }
+
+ /*
+ * Record this vote. If we get back non-zero, we
+ * ignore the vote.
+ */
+ if ((ret = __rep_tally(env, rep, eid, &rep->sites, vi->egen, 1)) != 0) {
+ RPRINT(env, DB_VERB_REP_ELECT,
+ (env, "Tally returned %d, sites %d", ret, rep->sites));
+ ret = 0;
+ goto err;
+ }
+ RPRINT(env, DB_VERB_REP_ELECT, (env,
+ "Incoming vote: (eid)%d (pri)%lu %s (gen)%lu (egen)%lu [%lu,%lu]",
+ eid, (u_long)vi->priority,
+ F_ISSET(rp, REPCTL_ELECTABLE) ? "ELECTABLE" : "",
+ (u_long)rp->gen, (u_long)vi->egen,
+ (u_long)rp->lsn.file, (u_long)rp->lsn.offset));
+ if (rep->sites > 1)
+ RPRINT(env, DB_VERB_REP_ELECT, (env,
+ "Existing vote: (eid)%d (pri)%lu (gen)%lu (sites)%d [%lu,%lu]",
+ rep->winner, (u_long)rep->w_priority,
+ (u_long)rep->w_gen, rep->sites,
+ (u_long)rep->w_lsn.file,
+ (u_long)rep->w_lsn.offset));
+
+ __rep_cmp_vote(env, rep, eid, &rp->lsn, vi->priority,
+ rp->gen, vi->tiebreaker, rp->flags);
+ /*
+ * If you get a vote and you're not in an election, we've
+ * already recorded this vote. But that is all we need
+ * to do.
+ */
+ if (!IN_ELECTION(rep)) {
+ RPRINT(env, DB_VERB_REP_ELECT, (env,
+ "Not in election, but received vote1 0x%x", rep->flags));
+ /*
+ * If we were in the middle of an election and we ended up
+ * clearing the election out from under the rep_elect caller,
+ * we want to just return here.
+ */
+ if (inelect)
+ ret = 0;
+ else
+ ret = DB_REP_HOLDELECTION;
+ goto err;
+ }
+
+ master = rep->winner;
+ lsn = rep->w_lsn;
+ if (IS_PHASE1_DONE(rep)) {
+ RPRINT(env, DB_VERB_REP_ELECT, (env, "Phase1 election done"));
+ RPRINT(env, DB_VERB_REP_ELECT, (env, "Voting for %d%s",
+ master, master == rep->eid ? "(self)" : ""));
+ egen = rep->egen;
+ F_SET(rep, REP_F_EPHASE2);
+ F_CLR(rep, REP_F_EPHASE1);
+ if (master == rep->eid) {
+ (void)__rep_tally(env, rep, rep->eid,
+ &rep->votes, egen, 2);
+ RPRINT(env, DB_VERB_REP_ELECT, (env,
+ "After phase 1 done: counted vote %d of %d",
+ rep->votes, rep->nvotes));
+ if (I_HAVE_WON(rep, rep->winner)) {
+ __rep_elect_master(env, rep);
+ elected = 1;
+ }
+ goto err;
+ }
+ REP_SYSTEM_UNLOCK(env);
+
+ /* Vote for someone else. */
+ __rep_send_vote(env, NULL, 0, 0, 0, 0, egen,
+ master, REP_VOTE2, 0);
+ } else
+err: REP_SYSTEM_UNLOCK(env);
+ if (elected)
+ ret = __rep_fire_elected(env, rep, egen);
+ return (ret);
+}
+
+/*
+ * __rep_vote2 --
+ *	Handle incoming vote2 message on a client.
+ *
+ *	A site that is already master (REP_F_MASTER) broadcasts REP_NEWMASTER
+ *	with its current log LSN, refreshes leases when leases are in use, and
+ *	returns the result of that refresh (0 otherwise).
+ *
+ *	Otherwise, under the rep system mutex, the vote information is decoded
+ *	from rec and the vote is counted in rep->votes via __rep_tally.
+ *
+ *	Returns:
+ *	- DB_REP_HOLDELECTION if no tally is in progress here and the vote is
+ *	  from our egen or a later one;
+ *	- 0 if the vote is from a different egen or __rep_tally ignored it;
+ *	- the result of __rep_fire_elected if this vote made us the winner;
+ *	- the error from __rep_vote_info_unmarshal if rec cannot be decoded.
+ *
+ *	Every exit taken after REP_SYSTEM_LOCK goes through the `err' label so
+ *	that the mutex is always released.
+ *
+ * PUBLIC: int __rep_vote2 __P((ENV *, __rep_control_args *, DBT *, int));
+ */
+int
+__rep_vote2(env, rp, rec, eid)
+ ENV *env;
+ __rep_control_args *rp;
+ DBT *rec;
+ int eid;
+{
+ DB_LOG *dblp;
+ DB_LSN lsn;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ REP_OLD_VOTE_INFO *ovi;
+ __rep_vote_info_args tmpvi, *vi;
+ u_int32_t egen;
+ int ret;
+
+ ret = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ RPRINT(env, DB_VERB_REP_ELECT, (env, "We received a vote%s",
+ F_ISSET(rep, REP_F_MASTER) ? " (master)" : ""));
+ if (F_ISSET(rep, REP_F_MASTER)) {
+ LOG_SYSTEM_LOCK(env);
+ lsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ STAT(rep->stat.st_elections_won++);
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_NEWMASTER, &lsn, NULL, 0, 0);
+ if (IS_USING_LEASES(env))
+ ret = __rep_lease_refresh(env);
+ return (ret);
+ }
+
+ REP_SYSTEM_LOCK(env);
+ egen = rep->egen;
+
+ /*
+ * We might be the last to the party and we haven't had
+ * time to tally all the vote1's, but others have and
+ * decided we're the winner. So, if we're in the process
+ * of tallying sites, keep the vote so that when our
+ * election thread catches up we'll have the votes we
+ * already received.
+ */
+ /*
+ * In 4.7 we changed to having fixed sized u_int32_t's from
+ * non-fixed 'int' fields in the vote structure.
+ */
+ if (rp->rep_version < DB_REPVERSION_47) {
+ ovi = (REP_OLD_VOTE_INFO *)rec->data;
+ tmpvi.egen = ovi->egen;
+ tmpvi.nsites = (u_int32_t)ovi->nsites;
+ tmpvi.nvotes = (u_int32_t)ovi->nvotes;
+ tmpvi.priority = (u_int32_t)ovi->priority;
+ tmpvi.tiebreaker = ovi->tiebreaker;
+ } else
+ if ((ret = __rep_vote_info_unmarshal(env,
+ &tmpvi, rec->data, rec->size, NULL)) != 0)
+ /*
+ * We hold the rep system mutex here: leave through
+ * err so it is released instead of returning directly.
+ */
+ goto err;
+ vi = &tmpvi;
+ if (!IN_ELECTION_TALLY(rep) && vi->egen >= rep->egen) {
+ RPRINT(env, DB_VERB_REP_ELECT, (env,
+ "Not in election gen %lu, at %lu, got vote",
+ (u_long)vi->egen, (u_long)rep->egen));
+ ret = DB_REP_HOLDELECTION;
+ goto err;
+ }
+
+ /*
+ * Record this vote. In a VOTE2, the only valid entry
+ * in the vote information is the election generation.
+ *
+ * There are several things which can go wrong that we
+ * need to account for:
+ * 1. If we receive a latent VOTE2 from an earlier election,
+ * we want to ignore it.
+ * 2. If we receive a VOTE2 from a site from which we never
+ * received a VOTE1, we want to record it, because we simply
+ * may be processing messages out of order or its vote1 got lost,
+ * but that site got all the votes it needed to send it.
+ * 3. If we have received a duplicate VOTE2 from this election
+ * from the same site we want to ignore it.
+ * 4. If this is from the current election and someone is
+ * really voting for us, then we finally get to record it.
+ */
+ /*
+ * Case 1.
+ */
+ if (vi->egen != rep->egen) {
+ RPRINT(env, DB_VERB_REP_ELECT,
+ (env, "Bad vote egen %lu. Mine %lu",
+ (u_long)vi->egen, (u_long)rep->egen));
+ ret = 0;
+ goto err;
+ }
+
+ /*
+ * __rep_tally takes care of cases 2, 3 and 4.
+ */
+ if ((ret = __rep_tally(env, rep, eid, &rep->votes, vi->egen, 2)) != 0) {
+ ret = 0;
+ goto err;
+ }
+ RPRINT(env, DB_VERB_REP_ELECT, (env, "Counted vote %d of %d",
+ rep->votes, rep->nvotes));
+ if (I_HAVE_WON(rep, rep->winner)) {
+ __rep_elect_master(env, rep);
+ ret = DB_REP_NEWMASTER;
+ }
+
+err: REP_SYSTEM_UNLOCK(env);
+ if (ret == DB_REP_NEWMASTER)
+ ret = __rep_fire_elected(env, rep, egen);
+ return (ret);
+}
+
+/*
+ * __rep_tally --
+ *	Record one vote in the tally array selected by `phase': the VOTE1
+ *	array when phase is 1, the VOTE2 array otherwise.  Called with the
+ *	db_rep mutex held.
+ *
+ *	Returns 0 when the vote is accepted: either a first vote from `eid'
+ *	(appended, *countp incremented) or a vote with a newer egen than the
+ *	one on record for `eid' (entry updated in place, count unchanged).
+ *	Returns non-zero when the vote is ignored: 1 for a duplicate or an
+ *	older-egen vote from a site already recorded, or the error returned
+ *	by __rep_grow_sites when the arrays could not be enlarged.
+ */
+static int
+__rep_tally(env, rep, eid, countp, egen, phase)
+	ENV *env;
+	REP *rep;
+	int eid;
+	u_int32_t *countp;
+	u_int32_t egen;
+	int phase;
+{
+	REP_VTALLY *slot, *votes;
+	u_int32_t idx, needed;
+	int ret;
+
+	/*
+	 * The counts are 0-based indices, so we may need room for one more
+	 * entry than the current count, or for nsites entries if that is
+	 * larger.
+	 */
+	needed = rep->nsites;
+	if ((*countp + 1) > rep->nsites)
+		needed = (*countp + 1);
+	if (needed > rep->asites) {
+		ret = __rep_grow_sites(env, needed);
+		if (ret != 0) {
+			RPRINT(env, DB_VERB_REP_ELECT, (env,
+			    "Grow sites returned error %d", ret));
+			return (ret);
+		}
+	}
+
+	/* Pick the array only after any growth, which replaces both. */
+	if (phase != 1)
+		votes = R_ADDR(env->reginfo, rep->v2tally_off);
+	else
+		votes = R_ADDR(env->reginfo, rep->tally_off);
+
+	for (idx = 0; idx < *countp; idx++) {
+		slot = &votes[idx];
+		if (slot->eid != eid)
+			continue;
+		/*
+		 * We have already heard from this site.  A vote that is not
+		 * newer than the recorded one (a delayed vote from an earlier
+		 * election, or a duplicate) is ignored.  A newer one replaces
+		 * the recorded egen without increasing the count.
+		 */
+		RPRINT(env, DB_VERB_REP_ELECT, (env,
+		    "Tally found[%d] (%d, %lu), this vote (%d, %lu)",
+		    idx, slot->eid, (u_long)slot->egen,
+		    eid, (u_long)egen));
+		if (slot->egen >= egen)
+			return (1);
+		slot->egen = egen;
+		return (0);
+	}
+
+	/* A voter we have not seen before: append it after the last entry. */
+	slot = &votes[idx];
+	RPRINT(env, DB_VERB_REP_ELECT, (env, "Tallying VOTE%d[%d] (%d, %lu)",
+	    phase, idx, eid, (u_long)egen));
+
+	slot->eid = eid;
+	slot->egen = egen;
+	(*countp)++;
+	return (0);
+}
+
+/*
+ * __rep_cmp_vote --
+ *	Compare incoming vote1 message on a client. Called with the db_rep
+ *	mutex held.
+ *
+ *	Decides whether the vote described by (eid, lsnp, priority, gen,
+ *	tiebreaker, flags) replaces the current best candidate stored in
+ *	rep->winner and the rep->w_* fields.  A vote is "electable" when its
+ *	priority is non-zero or REPCTL_ELECTABLE is set in flags.
+ *
+ *	- rep->sites > 1 and the vote is electable: the vote becomes the
+ *	  winner if it has non-zero priority and the current winner has zero
+ *	  priority; or both have zero or both have non-zero priority and its
+ *	  LSN is larger; or the LSNs are equal and its priority is higher, or
+ *	  equal with a larger tiebreaker.
+ *	- rep->sites == 1: an electable vote becomes the winner
+ *	  unconditionally; a non-electable one resets the winner to
+ *	  DB_EID_INVALID with zeroed w_* fields.
+ *	- Any other combination leaves the winner fields untouched.
+ */
+static void
+__rep_cmp_vote(env, rep, eid, lsnp, priority, gen, tiebreaker, flags)
+ ENV *env;
+ REP *rep;
+ int eid;
+ DB_LSN *lsnp;
+ u_int32_t priority;
+ u_int32_t flags, gen, tiebreaker;
+{
+ int cmp;
+
+ cmp = LOG_COMPARE(lsnp, &rep->w_lsn);
+ /*
+ * If we've seen more than one, compare us to the best so far.
+ * If we're the first, make ourselves the winner to start.
+ */
+ if (rep->sites > 1 &&
+ (priority != 0 || LF_ISSET(REPCTL_ELECTABLE))) {
+ /*
+ * Special case, if we have a mixed version group of sites,
+ * we set priority to 0, but set the ELECTABLE flag so that
+ * all sites talking at lower versions can correctly elect.
+ * If a non-zero priority comes in and current winner is
+ * zero priority (but was electable), then the non-zero
+ * site takes precedence no matter what its LSN is.
+ *
+ * Then LSN is determinant only if we're comparing
+ * like-styled version/priorities. I.e. both with
+ * 0/ELECTABLE priority or both with non-zero priority.
+ * Then actual priority value if LSNs
+ * are equal, then tiebreaker if both are equal.
+ */
+ if ((priority != 0 && rep->w_priority == 0) ||
+ (((priority == 0 && rep->w_priority == 0) ||
+ (priority != 0 && rep->w_priority != 0)) && cmp > 0) ||
+ (cmp == 0 && (priority > rep->w_priority ||
+ (priority == rep->w_priority &&
+ (tiebreaker > rep->w_tiebreaker))))) {
+ RPRINT(env, DB_VERB_REP_ELECT,
+ (env, "Accepting new vote"));
+ rep->winner = eid;
+ rep->w_priority = priority;
+ rep->w_lsn = *lsnp;
+ rep->w_gen = gen;
+ rep->w_tiebreaker = tiebreaker;
+ }
+ } else if (rep->sites == 1) {
+ if (priority != 0 || LF_ISSET(REPCTL_ELECTABLE)) {
+ /* Make ourselves the winner to start. */
+ rep->winner = eid;
+ rep->w_priority = priority;
+ rep->w_gen = gen;
+ rep->w_lsn = *lsnp;
+ rep->w_tiebreaker = tiebreaker;
+ } else {
+ rep->winner = DB_EID_INVALID;
+ rep->w_priority = 0;
+ rep->w_gen = 0;
+ ZERO_LSN(rep->w_lsn);
+ rep->w_tiebreaker = 0;
+ }
+ }
+}
+
+/*
+ * __rep_elect_init
+ *	Initialize an election. Sets beginp non-zero if the election is
+ *	already in progress; makes it 0 otherwise.
+ *
+ *	If this site is already master, it broadcasts REP_NEWMASTER with its
+ *	current log LSN, refreshes leases when leases are in use, and returns
+ *	DB_REP_NEWMASTER.  On that path neither *beginp nor *otally is
+ *	written, and the value returned by __rep_lease_refresh is not
+ *	propagated to the caller.
+ *
+ *	Otherwise, under the rep system mutex: *otally (when otally is not
+ *	NULL) receives the current REP_F_TALLY state, and *beginp is set as
+ *	described above (an election counts as in progress when
+ *	IN_ELECTION is true or REP_F_INREPELECT is set).  When no election is
+ *	in progress the tally arrays are grown if nsites exceeds the current
+ *	allocation, REP_F_INREPELECT is set, REP_F_EGENUPDATE is cleared, and
+ *	rep->nsites / rep->nvotes are set from the arguments (or only raised
+ *	to them when a tally is already in progress).
+ *
+ *	Returns 0 on success or the error from __rep_grow_sites.
+ *	NOTE(review): DB_ENV_TEST_RECOVERY presumably may also set ret and
+ *	jump to DB_TEST_RECOVERY_LABEL in test builds -- confirm.
+ */
+static int
+__rep_elect_init(env, nsites, nvotes, beginp, otally)
+ ENV *env;
+ u_int32_t nsites, nvotes;
+ int *beginp;
+ u_int32_t *otally;
+{
+ DB_LOG *dblp;
+ DB_LSN lsn;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ ret = 0;
+
+ /* We may miscount, as we don't hold the replication mutex here. */
+ STAT(rep->stat.st_elections++);
+
+ /* If we are already master; simply broadcast that fact and return. */
+ if (F_ISSET(rep, REP_F_MASTER)) {
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ LOG_SYSTEM_LOCK(env);
+ lsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_NEWMASTER, &lsn, NULL, 0, 0);
+ if (IS_USING_LEASES(env))
+ ret = __rep_lease_refresh(env);
+ STAT(rep->stat.st_elections_won++);
+ return (DB_REP_NEWMASTER);
+ }
+
+ REP_SYSTEM_LOCK(env);
+ if (otally != NULL)
+ *otally = F_ISSET(rep, REP_F_TALLY);
+ *beginp = IN_ELECTION(rep) || F_ISSET(rep, REP_F_INREPELECT);
+ if (!*beginp) {
+ /*
+ * Make sure that we always initialize all the election fields
+ * before putting ourselves in an election state. That means
+ * issuing calls that can fail (allocation) before setting all
+ * the variables.
+ */
+ if (nsites > rep->asites &&
+ (ret = __rep_grow_sites(env, nsites)) != 0)
+ goto err;
+ DB_ENV_TEST_RECOVERY(env, DB_TEST_ELECTINIT, ret, NULL);
+ F_SET(rep, REP_F_INREPELECT);
+ F_CLR(rep, REP_F_EGENUPDATE);
+ /*
+ * If we're the first to the party, we simply set initial
+ * values: pre-existing values would be left over from previous
+ * election.
+ */
+ if (!IN_ELECTION_TALLY(rep)) {
+ rep->nsites = nsites;
+ rep->nvotes = nvotes;
+ } else {
+ if (nsites > rep->nsites)
+ rep->nsites = nsites;
+ if (nvotes > rep->nvotes)
+ rep->nvotes = nvotes;
+ }
+ }
+DB_TEST_RECOVERY_LABEL
+err: REP_SYSTEM_UNLOCK(env);
+ return (ret);
+}
+
+/*
+ * __rep_elect_master
+ *	Mark this site as the winner of the election: set REP_F_MASTERELECT
+ *	and count the win.  Must be called with the replication region mutex
+ *	held.  Does nothing if REP_F_MASTERELECT or REP_F_MASTER is already
+ *	set, so a win is never counted twice.
+ */
+static void
+__rep_elect_master(env, rep)
+	ENV *env;
+	REP *rep;
+{
+	/* Only the first trip through here records the win. */
+	if (!F_ISSET(rep, REP_F_MASTERELECT | REP_F_MASTER)) {
+		F_SET(rep, REP_F_MASTERELECT);
+		STAT(rep->stat.st_elections_won++);
+
+		RPRINT(env, DB_VERB_REP_ELECT, (env,
+	"Got enough votes to win; election done; winner is %d, gen %lu",
+		    rep->master_id, (u_long)rep->gen));
+	}
+}
+
+/*
+ * __rep_fire_elected
+ *	Fire DB_EVENT_REP_ELECTED for election generation `egen', unless an
+ *	event has already been fired for this or a later generation
+ *	(tracked in rep->notified_egen).  The check and the update are done
+ *	under the rep event lock.  Always returns 0.
+ */
+static int
+__rep_fire_elected(env, rep, egen)
+	ENV *env;
+	REP *rep;
+	u_int32_t egen;
+{
+	int notify;
+
+	REP_EVENT_LOCK(env);
+	notify = rep->notified_egen < egen;
+	if (notify) {
+		__rep_fire_event(env, DB_EVENT_REP_ELECTED, NULL);
+		rep->notified_egen = egen;
+	}
+	REP_EVENT_UNLOCK(env);
+
+	return (0);
+}
+
+/*
+ * Compute a sleep interval.  Set it to the smaller of .5s or
+ * timeout/10, making sure we sleep at least 1usec if timeout < 10.
+ *
+ * The expansion is a single parenthesized expression with parenthesized
+ * uses of its argument and no trailing semicolon, so it behaves like an
+ * ordinary expression wherever it is used.  The argument is evaluated
+ * more than once; pass a plain variable.
+ */
+#define	SLEEPTIME(timeout)						\
+	((timeout) > 5000000 ? 500000 :					\
+	    ((timeout) >= 10 ? (timeout) / 10 : 1))
+
+/*
+ * __rep_wait
+ *	Wait for the election phase identified by `flags' to finish,
+ *	sleeping in SLEEPTIME-sized slices and re-checking the shared state
+ *	under the rep system mutex after each slice.
+ *
+ *	timeoutp   - in/out: the timeout for this phase.  When full_elect is
+ *	             set and REP_F_GROUP_ESTD is found set, it is replaced by
+ *	             rep->elect_timeout.
+ *	full_elect - non-zero if the caller is attempting a full election.
+ *	egen       - the election generation the caller started with.
+ *	flags      - the phase flag(s) being waited on; the phase is over
+ *	             once none of them is set in rep.
+ *
+ *	Returns:
+ *	- DB_REP_EGENCHG if the phase is over, the egen has changed, flags
+ *	  is not REP_F_EPHASE0, and either a tally is in progress or
+ *	  REP_F_EGENUPDATE is set (which is then cleared);
+ *	- 0 if the phase is over otherwise;
+ *	- DB_TIMEOUT if the total time slept reached the timeout first.
+ */
+static int
+__rep_wait(env, timeoutp, full_elect, egen, flags)
+	ENV *env;
+	db_timeout_t *timeoutp;
+	int full_elect;
+	u_int32_t egen, flags;
+{
+	DB_REP *db_rep;
+	REP *rep;
+	int done, echg, phase_over, ret;
+	u_int32_t sleeptime, sleeptotal, timeout;
+
+	db_rep = env->rep_handle;
+	rep = db_rep->region;
+	done = echg = phase_over = ret = 0;
+
+	timeout = *timeoutp;
+	/*
+	 * The user specifies an overall timeout function, but checking
+	 * is cheap and the timeout may be a generous upper bound.
+	 * Sleep repeatedly for the smaller of .5s and timeout/10.
+	 */
+	sleeptime = SLEEPTIME(timeout);
+	sleeptotal = 0;
+	while (sleeptotal < timeout) {
+		__os_yield(env, 0, sleeptime);
+		sleeptotal += sleeptime;
+		REP_SYSTEM_LOCK(env);
+		/*
+		 * Check if group membership changed while we were
+		 * sleeping.  Specifically we're trying for a full
+		 * election and someone is telling us we're joining
+		 * a previously established replication group.
+		 */
+		if (full_elect && F_ISSET(rep, REP_F_GROUP_ESTD)) {
+			*timeoutp = rep->elect_timeout;
+			timeout = *timeoutp;
+			/*
+			 * We adjusted timeout, if we've already waited
+			 * that long, then return as though this phase
+			 * timed out.  However, we want to give other
+			 * changes a chance to return, so if we both
+			 * found a group and found a new egen, we
+			 * override this return with the egen information.
+			 * If we found a group and our election finished
+			 * then we want to return the election completion.
+			 */
+			if (sleeptotal >= timeout) {
+				done = 1;
+				ret = DB_TIMEOUT;
+			} else
+				sleeptime = SLEEPTIME(timeout);
+		}
+
+		echg = egen != rep->egen;
+		phase_over = !F_ISSET(rep, flags);
+
+		/*
+		 * Since we're not clearing out master_id any more,
+		 * we need to do more to detect the difference between
+		 * a new master getting elected and egen changing,
+		 * or a new election starting because the old one
+		 * timed out at another site (which easily happens
+		 * when sites have very different timeout settings).
+		 *
+		 * Detect this by:
+		 * If my phase was over, egen has changed but
+		 * there are still election flags set, or we're
+		 * told our egen was out of date and updated
+		 * then return DB_REP_EGENCHG.
+		 *
+		 * Phase 0 doesn't care about egen, only the phase over.
+		 *
+		 * Otherwise, if my phase is over I want to
+		 * set my idea of the master and return.
+		 */
+		if (phase_over && echg &&
+		    flags != REP_F_EPHASE0 &&
+		    (IN_ELECTION_TALLY(rep) ||
+		    F_ISSET(rep, REP_F_EGENUPDATE))) {
+			done = 1;
+			F_CLR(rep, REP_F_EGENUPDATE);
+			ret = DB_REP_EGENCHG;
+		} else if (phase_over) {
+			done = 1;
+			ret = 0;
+		}
+		REP_SYSTEM_UNLOCK(env);
+
+		if (done)
+			return (ret);
+	}
+	return (DB_TIMEOUT);
+}
+
+/*
+ * __rep_grow_sites --
+ *	Called to allocate more space in the election tally information.
+ *	Called with the rep mutex held. We need to acquire the env region
+ *	mutex (mtx_regenv), so we need to make sure that we *never* acquire
+ *	those mutexes in the opposite order.
+ *
+ *	The new allocation holds max(2 * rep->asites, nsites) REP_VTALLY
+ *	entries for each of the two tally arrays (VOTE1 and VOTE2).
+ *
+ *	On success both arrays are replaced, rep->asites is set to the new
+ *	allocation size and rep->nsites to nsites; 0 is returned.
+ *	If the first allocation fails nothing is changed and the error is
+ *	returned.  If the second allocation fails, both the old VOTE2 array
+ *	and the new VOTE1 array are freed, both offsets become INVALID_ROFF,
+ *	rep->asites and rep->nsites are reset to 0, and the error is returned.
+ *
+ *	NOTE(review): the old arrays are freed without their entries being
+ *	copied into the new ones, and the new memory is not initialized here;
+ *	confirm that callers do not rely on previously tallied entries
+ *	surviving a grow.
+ */
+static int
+__rep_grow_sites(env, nsites)
+ ENV *env;
+ u_int32_t nsites;
+{
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ int ret, *tally;
+ u_int32_t nalloc;
+
+ rep = env->rep_handle->region;
+
+ /*
+ * Allocate either twice the current allocation or nsites,
+ * whichever is more.
+ */
+ nalloc = 2 * rep->asites;
+ if (nalloc < nsites)
+ nalloc = nsites;
+
+ infop = env->reginfo;
+ renv = infop->primary;
+ MUTEX_LOCK(env, renv->mtx_regenv);
+
+ /*
+ * We allocate 2 tally regions, one for tallying VOTE1's and
+ * one for VOTE2's. Always grow them in tandem, because if we
+ * get more VOTE1's we'll always expect more VOTE2's then too.
+ */
+ if ((ret = __env_alloc(infop,
+ (size_t)nalloc * sizeof(REP_VTALLY), &tally)) == 0) {
+ if (rep->tally_off != INVALID_ROFF)
+ __env_alloc_free(
+ infop, R_ADDR(infop, rep->tally_off));
+ rep->tally_off = R_OFFSET(infop, tally);
+ if ((ret = __env_alloc(infop,
+ (size_t)nalloc * sizeof(REP_VTALLY), &tally)) == 0) {
+ /* Success */
+ if (rep->v2tally_off != INVALID_ROFF)
+ __env_alloc_free(infop,
+ R_ADDR(infop, rep->v2tally_off));
+ rep->v2tally_off = R_OFFSET(infop, tally);
+ rep->asites = nalloc;
+ rep->nsites = nsites;
+ } else {
+ /*
+ * We were unable to allocate both. So, we must
+ * free the first one and reinitialize. If
+ * v2tally_off is valid, it is from an old
+ * allocation and we are clearing it all out due
+ * to the error.
+ */
+ if (rep->v2tally_off != INVALID_ROFF)
+ __env_alloc_free(infop,
+ R_ADDR(infop, rep->v2tally_off));
+ __env_alloc_free(infop,
+ R_ADDR(infop, rep->tally_off));
+ rep->v2tally_off = rep->tally_off = INVALID_ROFF;
+ rep->asites = 0;
+ rep->nsites = 0;
+ }
+ }
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ return (ret);
+}
diff --git a/rep/rep_lease.c b/rep/rep_lease.c
new file mode 100644
index 0000000..a13318e
--- /dev/null
+++ b/rep/rep_lease.c
@@ -0,0 +1,524 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2007-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+
+static void __rep_find_entry __P((ENV *, REP *, int, REP_LEASE_ENTRY **));
+
+/*
+ * __rep_update_grant -
+ *	Extend this client's lease grant for a perm record and report the
+ *	grant to the master.  Caller must hold the mtx_clientdb mutex.
+ *	The timespec `ts' is in host local format; its seconds and
+ *	nanoseconds are what is sent in the grant message.
+ *
+ *	Nothing is granted or sent while an election is in progress.
+ *	Returns 0, or the error from marshaling the grant message.
+ *
+ * PUBLIC: int __rep_update_grant __P((ENV *, db_timespec *));
+ */
+int
+__rep_update_grant(env, ts)
+	ENV *env;
+	db_timespec *ts;
+{
+	DBT grant_dbt;
+	DB_LOG *dblp;
+	LOG *lp;
+	REP *rep;
+	__rep_grant_info_args grant;
+	db_timespec expire;
+	u_int8_t msgbuf[__REP_GRANT_INFO_SIZE];
+	int in_election, master, ret;
+	size_t msglen;
+
+	rep = env->rep_handle->region;
+	dblp = env->lg_handle;
+	lp = dblp->reginfo.primary;
+
+	/*
+	 * The candidate expiration time is the current time plus the
+	 * (skewed) lease duration.
+	 */
+	timespecclear(&expire);
+	__os_gettime(env, &expire, 1);
+	timespecadd(&expire, &rep->lease_duration);
+
+	/*
+	 * The election state must be checked under the region mutex: a
+	 * lease cannot be granted while we are in an election.  Otherwise
+	 * push the expiration forward (never backward) and clear the
+	 * expired flag.
+	 */
+	in_election = 0;
+	REP_SYSTEM_LOCK(env);
+	if (IN_ELECTION(rep))
+		in_election = 1;
+	else {
+		if (timespeccmp(&expire, &rep->grant_expire, >))
+			rep->grant_expire = expire;
+		F_CLR(rep, REP_F_LEASE_EXPIRED);
+	}
+	REP_SYSTEM_UNLOCK(env);
+	if (in_election)
+		return (0);
+
+	/*
+	 * The LEASE_GRANT message goes out whether or not the lease was
+	 * actually extended above.
+	 */
+	grant.msg_sec = (u_int32_t)ts->tv_sec;
+	grant.msg_nsec = (u_int32_t)ts->tv_nsec;
+	ret = __rep_grant_info_marshal(env,
+	    &grant, msgbuf, __REP_GRANT_INFO_SIZE, &msglen);
+	if (ret != 0)
+		return (ret);
+	DB_INIT_DBT(grant_dbt, msgbuf, msglen);
+
+	master = rep->master_id;
+	if (master != DB_EID_INVALID)
+		(void)__rep_send_message(env, master, REP_LEASE_GRANT,
+		    &lp->max_perm_lsn, &grant_dbt, 0, 0);
+	return (0);
+}
+
+/*
+ * __rep_islease_granted -
+ *	Report whether this client still has an outstanding lease grant:
+ *	return 1 if the current time is not past rep->grant_expire, and 0
+ *	otherwise.
+ *	Caller must hold the REP_SYSTEM (region) mutex.
+ *
+ * PUBLIC: int __rep_islease_granted __P((ENV *));
+ */
+int
+__rep_islease_granted(env)
+	ENV *env;
+{
+	REP *rep;
+	db_timespec now;
+
+	rep = env->rep_handle->region;
+
+	/* Compare the current time against the lease we granted. */
+	timespecclear(&now);
+	__os_gettime(env, &now, 1);
+	if (timespeccmp(&now, &rep->grant_expire, <=))
+		return (1);
+	return (0);
+}
+
+/*
+ * __rep_lease_table_alloc -
+ *	Allocate the lease table on a master, sized for nsites entries, and
+ *	mark every entry empty.  Called with the rep mutex held.  The env
+ *	region mutex is acquired here, so those two mutexes must never be
+ *	acquired in the opposite order.
+ *
+ *	Any existing table is freed first.  Returns 0, or the allocation
+ *	error, in which case rep->lease_off is left as INVALID_ROFF.
+ *
+ * PUBLIC: int __rep_lease_table_alloc __P((ENV *, u_int32_t));
+ */
+int
+__rep_lease_table_alloc(env, nsites)
+	ENV *env;
+	u_int32_t nsites;
+{
+	REGENV *renv;
+	REGINFO *infop;
+	REP *rep;
+	REP_LEASE_ENTRY *entry, *table;
+	int *newtab, ret;
+	u_int32_t n;
+
+	rep = env->rep_handle->region;
+	infop = env->reginfo;
+	renv = infop->primary;
+
+	MUTEX_LOCK(env, renv->mtx_regenv);
+	/*
+	 * A table left over from some earlier time is discarded so that the
+	 * new one is known to be sized for the right number of sites.
+	 */
+	if (rep->lease_off != INVALID_ROFF) {
+		__env_alloc_free(infop, R_ADDR(infop, rep->lease_off));
+		rep->lease_off = INVALID_ROFF;
+	}
+	ret = __env_alloc(infop,
+	    (size_t)nsites * sizeof(REP_LEASE_ENTRY), &newtab);
+	MUTEX_UNLOCK(env, renv->mtx_regenv);
+	if (ret != 0)
+		return (ret);
+
+	rep->lease_off = R_OFFSET(infop, newtab);
+	table = R_ADDR(infop, rep->lease_off);
+
+	/* Start with every slot empty. */
+	for (n = 0, entry = table; n < nsites; n++, entry++) {
+		entry->eid = DB_EID_INVALID;
+		timespecclear(&entry->start_time);
+		timespecclear(&entry->end_time);
+		ZERO_LSN(entry->lease_lsn);
+	}
+	return (0);
+}
+
+/*
+ * __rep_lease_grant -
+ *	Handle incoming REP_LEASE_GRANT message on a master.
+ *
+ *	The grant's start time is unmarshaled from rec.  Under the rep system
+ *	mutex, the lease table entry for `eid' (or the first empty entry) is
+ *	updated when it is empty or when the grant's start time is later than
+ *	the recorded one: the entry's end time becomes start time plus
+ *	rep->lease_duration and its LSN becomes rp->lsn.
+ *
+ *	Returns 0, or the error from unmarshaling the grant.  A grant for
+ *	which the table has neither a matching nor an empty entry is dropped
+ *	and 0 is returned.
+ *
+ * PUBLIC: int __rep_lease_grant __P((ENV *, __rep_control_args *, DBT *, int));
+ */
+int
+__rep_lease_grant(env, rp, rec, eid)
+	ENV *env;
+	__rep_control_args *rp;
+	DBT *rec;
+	int eid;
+{
+	DB_REP *db_rep;
+	REP *rep;
+	__rep_grant_info_args gi;
+	REP_LEASE_ENTRY *le;
+	db_timespec msg_time;
+	int ret;
+
+	db_rep = env->rep_handle;
+	rep = db_rep->region;
+	if ((ret = __rep_grant_info_unmarshal(env,
+	    &gi, rec->data, rec->size, NULL)) != 0)
+		return (ret);
+	timespecset(&msg_time, gi.msg_sec, gi.msg_nsec);
+	le = NULL;
+
+	/*
+	 * Look up the lease table entry for the granting site while holding
+	 * the rep system mutex.
+	 */
+	REP_SYSTEM_LOCK(env);
+	__rep_find_entry(env, rep, eid, &le);
+	/*
+	 * We expect to get back this site's entry, or an empty entry
+	 * that we need to initialize.  __rep_find_entry leaves le untouched
+	 * when the table holds neither, so besides asserting we also drop
+	 * the grant rather than dereference a NULL pointer.
+	 * NOTE(review): DB_ASSERT is presumably compiled out of
+	 * non-diagnostic builds -- confirm.
+	 */
+	DB_ASSERT(env, le != NULL);
+	if (le == NULL) {
+		REP_SYSTEM_UNLOCK(env);
+		RPRINT(env, DB_VERB_REP_LEASE, (env,
+		    "lease_grant: no lease table entry for eid %d", eid));
+		return (0);
+	}
+	/*
+	 * Update the entry if it is an empty entry or if the new
+	 * lease grant is a later start time than the current one.
+	 */
+	RPRINT(env, DB_VERB_REP_LEASE,
+	    (env, "lease_grant: grant msg time %lu %lu",
+	    (u_long)msg_time.tv_sec, (u_long)msg_time.tv_nsec));
+	if (le->eid == DB_EID_INVALID ||
+	    timespeccmp(&msg_time, &le->start_time, >)) {
+		le->eid = eid;
+		le->start_time = msg_time;
+		le->end_time = le->start_time;
+		timespecadd(&le->end_time, &rep->lease_duration);
+		RPRINT(env, DB_VERB_REP_LEASE, (env,
+    "lease_grant: eid %d, start %lu %lu, end %lu %lu, duration %lu %lu",
+    le->eid, (u_long)le->start_time.tv_sec, (u_long)le->start_time.tv_nsec,
+    (u_long)le->end_time.tv_sec, (u_long)le->end_time.tv_nsec,
+    (u_long)rep->lease_duration.tv_sec, (u_long)rep->lease_duration.tv_nsec));
+		/*
+		 * XXX Is this really true?  Could we have a lagging
+		 * record that has a later start time, but smaller
+		 * LSN than we have previously seen??
+		 */
+		DB_ASSERT(env, LOG_COMPARE(&rp->lsn, &le->lease_lsn) >= 0);
+		le->lease_lsn = rp->lsn;
+	}
+	REP_SYSTEM_UNLOCK(env);
+	return (0);
+}
+
+/*
+ * __rep_find_entry
+ *	Scan the first rep->nsites slots of the lease table and store in
+ *	*lep the first slot that either belongs to `eid' or is empty
+ *	(DB_EID_INVALID).  If no such slot exists, *lep is left unchanged.
+ */
+static void
+__rep_find_entry(env, rep, eid, lep)
+	ENV *env;
+	REP *rep;
+	int eid;
+	REP_LEASE_ENTRY **lep;
+{
+	REP_LEASE_ENTRY *slot, *table;
+	u_int32_t n;
+
+	table = R_ADDR(env->reginfo, rep->lease_off);
+
+	for (n = 0; n < rep->nsites; n++) {
+		slot = &table[n];
+		/* Skip slots owned by some other site. */
+		if (slot->eid != eid && slot->eid != DB_EID_INVALID)
+			continue;
+		*lep = slot;
+		break;
+	}
+}
+
+/*
+ * __rep_lease_check -
+ *	Return 0 if this master holds valid leases and can confirm
+ *	its mastership. If leases are expired, an attempt is made
+ *	to refresh the leases. If that fails, then return the
+ *	DB_REP_LEASE_EXPIRED error to the user. No mutexes held.
+ *
+ *	A lease table entry counts as valid when it has an EID, its end_time
+ *	is not before the current time, and its lease_lsn is at least the
+ *	max_perm_lsn sampled once on entry.  The scan stops as soon as
+ *	rep->nsites / 2 valid entries have been counted.
+ *
+ *	When too few leases are valid and `refresh' is zero,
+ *	DB_REP_LEASE_EXPIRED is returned at once.  Otherwise
+ *	__rep_lease_refresh is called: an error from it is returned as is;
+ *	on success the table is checked again (yielding the processor before
+ *	every retry but the first) until `tries' exceeds
+ *	LEASE_REFRESH_TRIES, at which point DB_REP_LEASE_EXPIRED is returned.
+ *
+ * PUBLIC: int __rep_lease_check __P((ENV *, int));
+ */
+int
+__rep_lease_check(env, refresh)
+ ENV *env;
+ int refresh;
+{
+ DB_LOG *dblp;
+ DB_LSN lease_lsn;
+ DB_REP *db_rep;
+ LOG *lp;
+ REGINFO *infop;
+ REP *rep;
+ REP_LEASE_ENTRY *le, *table;
+ db_timespec curtime;
+ int ret, tries;
+ u_int32_t i, min_leases, valid_leases;
+
+ infop = env->reginfo;
+ tries = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ LOG_SYSTEM_LOCK(env);
+ lease_lsn = lp->max_perm_lsn;
+ LOG_SYSTEM_UNLOCK(env);
+
+retry:
+ REP_SYSTEM_LOCK(env);
+ min_leases = rep->nsites / 2;
+ ret = 0;
+ __os_gettime(env, &curtime, 1);
+ RPRINT(env, DB_VERB_REP_LEASE, (env,
+ "lease_check: try %d min_leases %lu curtime %lu %lu, maxLSN [%lu][%lu]",
+ tries,
+ (u_long)min_leases, (u_long)curtime.tv_sec,
+ (u_long)curtime.tv_nsec,
+ (u_long)lease_lsn.file,
+ (u_long)lease_lsn.offset));
+ table = R_ADDR(infop, rep->lease_off);
+ for (i = 0, valid_leases = 0;
+ i < rep->nsites && valid_leases < min_leases; i++) {
+ le = &table[i];
+ /*
+ * Count this lease as valid if:
+ * - It is a valid entry (has an EID).
+ * - The lease has not expired.
+ * - The LSN is up to date.
+ */
+ if (le->eid != DB_EID_INVALID) {
+ RPRINT(env, DB_VERB_REP_LEASE, (env,
+ "lease_check: valid %lu eid %d, lease_lsn [%lu][%lu]",
+ (u_long)valid_leases, le->eid,
+ (u_long)le->lease_lsn.file,
+ (u_long)le->lease_lsn.offset));
+ RPRINT(env, DB_VERB_REP_LEASE,
+ (env, "lease_check: endtime %lu %lu",
+ (u_long)le->end_time.tv_sec,
+ (u_long)le->end_time.tv_nsec));
+ }
+ if (le->eid != DB_EID_INVALID &&
+ timespeccmp(&le->end_time, &curtime, >=) &&
+ LOG_COMPARE(&le->lease_lsn, &lease_lsn) >= 0)
+ valid_leases++;
+ }
+ REP_SYSTEM_UNLOCK(env);
+
+ /*
+ * Now see if we have enough.
+ */
+ RPRINT(env, DB_VERB_REP_LEASE, (env, "valid %lu, min %lu",
+ (u_long)valid_leases, (u_long)min_leases));
+ if (valid_leases < min_leases) {
+ if (!refresh)
+ ret = DB_REP_LEASE_EXPIRED;
+ else {
+ /*
+ * If we are successful, we need to recheck the leases
+ * because the lease grant messages may have raced with
+ * the PERM acknowledgement. Give the grant messages
+ * a chance to arrive and be processed.
+ */
+ if ((ret = __rep_lease_refresh(env)) == 0) {
+ if (tries <= LEASE_REFRESH_TRIES) {
+ /*
+ * If we were successful sending, but
+ * not in racing the message threads,
+ * then yield the processor so that
+ * the message threads get a chance
+ * to run.
+ */
+ if (tries > 0)
+ __os_yield(env, 1, 0);
+ tries++;
+ goto retry;
+ } else
+ ret = DB_REP_LEASE_EXPIRED;
+ }
+ }
+ }
+
+ if (ret == DB_REP_LEASE_EXPIRED)
+ RPRINT(env, DB_VERB_REP_LEASE, (env,
+ "lease_check: Expired. Only %lu valid",
+ (u_long)valid_leases));
+ return (ret);
+}
+
+/*
+ * __rep_lease_refresh -
+ *	Locate the last permanent log record and broadcast it again as a
+ *	REP_LOG message flagged REPCTL_PERM, which forces clients to grant
+ *	their leases.
+ *
+ *	Without a permanent record the leases cannot be refreshed; that
+ *	case (DB_NOTFOUND from __rep_log_backup) is treated as success and
+ *	nothing is sent.  It should not happen because the master should
+ *	write a checkpoint when it starts, if there is no other perm record.
+ *
+ *	Returns 0, or the first error from opening the log cursor, finding
+ *	or reading the record, or closing the cursor.
+ *
+ * PUBLIC: int __rep_lease_refresh __P((ENV *));
+ */
+int
+__rep_lease_refresh(env)
+	ENV *env;
+{
+	DBT rec;
+	DB_LOGC *logc;
+	DB_LSN lsn;
+	REP *rep;
+	int ret, t_ret;
+
+	rep = env->rep_handle->region;
+
+	ret = __log_cursor(env, &logc);
+	if (ret != 0)
+		return (ret);
+
+	memset(&rec, 0, sizeof(rec));
+	memset(&lsn, 0, sizeof(lsn));
+
+	/* __rep_log_backup positions us on the last PERM record, if any. */
+	ret = __rep_log_backup(env, rep, logc, &lsn);
+	if (ret == DB_NOTFOUND)
+		ret = 0;
+	else if (ret == 0 &&
+	    (ret = __logc_get(logc, &lsn, &rec, DB_CURRENT)) == 0)
+		(void)__rep_send_message(env, DB_EID_BROADCAST, REP_LOG, &lsn,
+		    &rec, REPCTL_PERM, 0);
+
+	/* The cursor is closed on every path; keep the first error. */
+	t_ret = __logc_close(logc);
+	if (t_ret != 0 && ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+
+/*
+ * __rep_lease_expire -
+ *	Proactively expire all leases granted to us by setting the end time
+ *	of each of the first rep->nsites lease table entries to its start
+ *	time.  Does nothing if no lease table is allocated.  Always
+ *	returns 0.
+ *	Assume the caller holds the REP_SYSTEM (region) mutex.
+ *
+ * PUBLIC: int __rep_lease_expire __P((ENV *));
+ */
+int
+__rep_lease_expire(env)
+	ENV *env;
+{
+	REP *rep;
+	REP_LEASE_ENTRY *table;
+	u_int32_t n;
+
+	rep = env->rep_handle->region;
+	if (rep->lease_off == INVALID_ROFF)
+		return (0);
+
+	/*
+	 * We are guaranteed that no lease's start_time is in the future, so
+	 * an end_time equal to the start_time forcibly expires the lease.
+	 */
+	table = R_ADDR(env->reginfo, rep->lease_off);
+	for (n = 0; n < rep->nsites; n++)
+		table[n].end_time = table[n].start_time;
+	return (0);
+}
+
+/*
+ * __rep_lease_waittime -
+ *	Return the amount of time remaining on a granted lease, or 0 if
+ *	there is nothing left to wait for.
+ *	Assume the caller holds the REP_SYSTEM (region) mutex.
+ *
+ * PUBLIC: db_timeout_t __rep_lease_waittime __P((ENV *));
+ */
+db_timeout_t
+__rep_lease_waittime(env)
+	ENV *env;
+{
+	REP *rep;
+	db_timespec exptime, now;
+	db_timeout_t to;
+
+	rep = env->rep_handle->region;
+	exptime = rep->grant_expire;
+
+	RPRINT(env, DB_VERB_REP_LEASE, (env,
+	    "wait_time: grant_expire %lu %lu lease_to %lu",
+	    (u_long)exptime.tv_sec, (u_long)exptime.tv_nsec,
+	    (u_long)rep->lease_timeout));
+
+	/*
+	 * A lease that has never been granted still requires a full lease
+	 * timeout: we could be freshly rebooted after a crash, and a
+	 * previous incarnation of this client could have granted one.
+	 * Once this client has already waited a full lease timeout
+	 * (REP_F_LEASE_EXPIRED), the lease cannot be outstanding and there
+	 * is no need to wait again.
+	 */
+	if (!timespecisset(&exptime)) {
+		if (F_ISSET(rep, REP_F_LEASE_EXPIRED))
+			return (0);
+		return (rep->lease_timeout);
+	}
+
+	__os_gettime(env, &now, 1);
+	RPRINT(env, DB_VERB_REP_LEASE, (env,
+	    "wait_time: mytime %lu %lu, grant_expire %lu %lu",
+	    (u_long)now.tv_sec, (u_long)now.tv_nsec,
+	    (u_long)exptime.tv_sec, (u_long)exptime.tv_nsec));
+
+	/*
+	 * If the grant has not expired yet, what remains of it is the
+	 * difference between its expiration and the current time.
+	 */
+	to = 0;
+	if (timespeccmp(&now, &exptime, <=)) {
+		timespecsub(&exptime, &now);
+		DB_TIMESPEC_TO_TIMEOUT(to, &exptime, 1);
+	}
+	return (to);
+}
diff --git a/rep/rep_log.c b/rep/rep_log.c
new file mode 100644
index 0000000..d413daa
--- /dev/null
+++ b/rep/rep_log.c
@@ -0,0 +1,872 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2004-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+
+static int __rep_chk_newfile __P((ENV *, DB_LOGC *, REP *,
+ __rep_control_args *, int));
+
+/*
+ * __rep_allreq --
+ * Handle a REP_ALL_REQ message.
+ *
+ * Opens a log cursor and, starting at rp->lsn (or at the first record
+ * in the log when rp->lsn is a zero or init LSN), sends log records to
+ * eid one after another. The loop stops when the cursor runs out of
+ * records, a send fails, repth.type becomes REP_LOG_MORE, or the end
+ * of the log (as sampled at entry) has been flagged with
+ * REPCTL_LOG_END. A REP_NEWFILE message precedes the first record of
+ * each new log file. REP_F_NOARCHIVE is set in the region while the
+ * scan runs and cleared again before returning.
+ *
+ * env: environment handle.
+ * rp: control information of the request; rp->lsn is the start point.
+ * eid: environment id the records are sent to.
+ *
+ * Returns 0 when everything available was sent. After the send loop
+ * DB_NOTFOUND and DB_REP_UNAVAIL count as success, and ENOENT counts
+ * as success on every exit path. A client that cannot supply the
+ * first requested record returns DB_NOTFOUND; any other error is
+ * returned as is.
+ *
+ * PUBLIC: int __rep_allreq __P((ENV *, __rep_control_args *, int));
+ */
+int
+__rep_allreq(env, rp, eid)
+ ENV *env;
+ __rep_control_args *rp;
+ int eid;
+{
+ DBT data_dbt, newfiledbt;
+ DB_LOGC *logc;
+ DB_LSN log_end, oldfilelsn;
+ DB_REP *db_rep;
+ REP *rep;
+ REP_BULK bulk;
+ REP_THROTTLE repth;
+ __rep_newfile_args nf_args;
+ uintptr_t bulkoff;
+ u_int32_t bulkflags, end_flag, flags, use_bulk;
+ int arch_flag, ret, t_ret;
+ u_int8_t buf[__REP_NEWFILE_SIZE];
+ size_t len;
+
+ ret = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ end_flag = 0;
+ arch_flag = 0;
+
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ return (ret);
+ memset(&data_dbt, 0, sizeof(data_dbt));
+ /*
+ * If we're doing bulk transfer, allocate a bulk buffer to put our
+ * log records in. We still need to initialize the throttle info
+ * because if we encounter a log record larger than our entire bulk
+ * buffer, we need to send it as a singleton and also we want to
+ * support throttling with bulk.
+ *
+ * Use a local var so we don't need to worry if someone else turns
+ * on/off bulk in the middle of our call.
+ *
+ * bulk.addr starts out NULL so that the cleanup code at err can
+ * tell whether a bulk buffer was actually allocated.
+ */
+ use_bulk = FLD_ISSET(rep->config, REP_C_BULK);
+ bulk.addr = NULL;
+ if (use_bulk && (ret = __rep_bulk_alloc(env, &bulk, eid,
+ &bulkoff, &bulkflags, REP_BULK_LOG)) != 0)
+ goto err;
+ memset(&repth, 0, sizeof(repth));
+ REP_SYSTEM_LOCK(env);
+ F_SET(rep, REP_F_NOARCHIVE);
+ arch_flag = 1;
+ repth.gbytes = rep->gbytes;
+ repth.bytes = rep->bytes;
+ oldfilelsn = repth.lsn = rp->lsn;
+ repth.type = REP_LOG;
+ repth.data_dbt = &data_dbt;
+ REP_SYSTEM_UNLOCK(env);
+
+ /*
+ * Get the LSN of the end of the log, so that in our reading loop
+ * (below), we can recognize when we get there, and set the
+ * REPCTL_LOG_END flag.
+ */
+ if ((ret = __logc_get(logc, &log_end, &data_dbt, DB_LAST)) != 0) {
+ if (ret == DB_NOTFOUND && F_ISSET(rep, REP_F_MASTER))
+ ret = 0;
+ goto err;
+ }
+
+ flags = IS_ZERO_LSN(rp->lsn) ||
+ IS_INIT_LSN(rp->lsn) ? DB_FIRST : DB_SET;
+ /*
+ * We get the first item so that a client servicing requests
+ * can distinguish between not having the records and reaching
+ * the end of its log. Return the DB_NOTFOUND if the client
+ * cannot get the record. Return 0 if we finish the loop and
+ * sent all that we have.
+ */
+ ret = __logc_get(logc, &repth.lsn, &data_dbt, flags);
+ /*
+ * If the client is asking for all records
+ * because it doesn't have any, and our first
+ * record is not in the first log file, then
+ * the client is outdated and needs to get a
+ * VERIFY_FAIL.
+ */
+ if (ret == 0 && repth.lsn.file != 1 && flags == DB_FIRST) {
+ if (F_ISSET(rep, REP_F_CLIENT))
+ ret = DB_NOTFOUND;
+ else
+ (void)__rep_send_message(env, eid,
+ REP_VERIFY_FAIL, &repth.lsn, NULL, 0, 0);
+ goto err;
+ }
+ /*
+ * If we got DB_NOTFOUND it could be because the LSN we were
+ * given is at the end of the log file and we need to switch
+ * log files. Reinitialize and get the current record when we return.
+ */
+ if (ret == DB_NOTFOUND) {
+ ret = __rep_chk_newfile(env, logc, rep, rp, eid);
+ /*
+ * If we still get DB_NOTFOUND the client gave us a
+ * bad or unknown LSN. Ignore it if we're the master.
+ * Any other error is returned.
+ */
+ if (ret == 0)
+ ret = __logc_get(logc, &repth.lsn,
+ &data_dbt, DB_CURRENT);
+ if (ret == DB_NOTFOUND && F_ISSET(rep, REP_F_MASTER)) {
+ ret = 0;
+ goto err;
+ }
+ if (ret != 0)
+ goto err;
+ }
+
+ /*
+ * For singleton log records, we break when we get a REP_LOG_MORE.
+ * Or if we're not using throttling, or we are using bulk, we stop
+ * when we reach the end (i.e. ret != 0).
+ */
+ for (end_flag = 0;
+ ret == 0 && repth.type != REP_LOG_MORE && end_flag == 0;
+ ret = __logc_get(logc, &repth.lsn, &data_dbt, DB_NEXT)) {
+ /*
+ * If we just changed log files, we need to send the
+ * version of this log file to the client. Sites running
+ * a replication version older than DB_REPVERSION_47 get
+ * the raw version field, newer ones the marshaled form.
+ */
+ if (repth.lsn.file != oldfilelsn.file) {
+ if ((ret = __logc_version(logc, &nf_args.version)) != 0)
+ break;
+ memset(&newfiledbt, 0, sizeof(newfiledbt));
+ if (rep->version < DB_REPVERSION_47)
+ DB_INIT_DBT(newfiledbt, &nf_args.version,
+ sizeof(nf_args.version));
+ else {
+ if ((ret = __rep_newfile_marshal(env, &nf_args,
+ buf, __REP_NEWFILE_SIZE, &len)) != 0)
+ goto err;
+ DB_INIT_DBT(newfiledbt, buf, len);
+ }
+ (void)__rep_send_message(env,
+ eid, REP_NEWFILE, &oldfilelsn, &newfiledbt,
+ REPCTL_RESEND, 0);
+ }
+
+ /*
+ * Mark the end of the ALL_REQ response to show that the
+ * receiving client should now be "caught up" with the
+ * replication group. If we're the master, then our log end is
+ * certainly authoritative. If we're another client, only if we
+ * ourselves have reached STARTUPDONE.
+ */
+ end_flag = (LOG_COMPARE(&repth.lsn, &log_end) >= 0 &&
+ (F_ISSET(rep, REP_F_MASTER) ||
+ rep->stat.st_startup_complete)) ?
+ REPCTL_LOG_END : 0;
+ /*
+ * If we are configured for bulk, try to send this as a bulk
+ * request. If not configured, or it is too big for bulk
+ * then just send normally.
+ */
+ if (use_bulk)
+ ret = __rep_bulk_message(env, &bulk, &repth,
+ &repth.lsn, &data_dbt, (REPCTL_RESEND | end_flag));
+ if (!use_bulk || ret == DB_REP_BULKOVF)
+ ret = __rep_send_throttle(env,
+ eid, &repth, 0, end_flag);
+ if (ret != 0)
+ break;
+ /*
+ * If we are about to change files, then we'll need the
+ * last LSN in the previous file. Save it here.
+ */
+ oldfilelsn = repth.lsn;
+ oldfilelsn.offset += logc->len;
+ }
+
+ if (ret == DB_NOTFOUND || ret == DB_REP_UNAVAIL)
+ ret = 0;
+ /*
+ * We're done, force out whatever remains in the bulk buffer and
+ * free it.
+ */
+err:
+ /*
+ * We could have raced an unlink from an earlier log_archive
+ * and the user is removing the files themselves, now. If
+ * we get an error indicating the log file might no longer
+ * exist, ignore it.
+ */
+ if (ret == ENOENT)
+ ret = 0;
+ if (bulk.addr != NULL && (t_ret = __rep_bulk_free(env, &bulk,
+ (REPCTL_RESEND | end_flag))) != 0 && ret == 0 &&
+ t_ret != DB_REP_UNAVAIL)
+ ret = t_ret;
+ if (arch_flag) {
+ REP_SYSTEM_LOCK(env);
+ F_CLR(rep, REP_F_NOARCHIVE);
+ REP_SYSTEM_UNLOCK(env);
+ }
+ if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __rep_log --
+ * Handle a REP_LOG/REP_LOG_MORE message.
+ *
+ * The record is handed to __rep_apply. If that reports
+ * DB_REP_LOGREADY, __rep_logready is run. For a REP_LOG_MORE message
+ * that was applied and is not a duplicate, a follow-up log request is
+ * issued through __rep_loggap_req with REP_GAP_FORCE, provided a
+ * master is currently known.
+ *
+ * Returns the result of __rep_apply (or of __rep_logready /
+ * __rep_loggap_req when those are invoked); 0 when a REP_LOG_MORE
+ * arrives while no master is known.
+ *
+ * PUBLIC: int __rep_log __P((ENV *, DB_THREAD_INFO *,
+ * PUBLIC: __rep_control_args *, DBT *, time_t, DB_LSN *));
+ */
+int
+__rep_log(env, ip, rp, rec, savetime, ret_lsnp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ __rep_control_args *rp;
+ DBT *rec;
+ time_t savetime;
+ DB_LSN *ret_lsnp;
+{
+ DB_LOG *dblp;
+ DB_LSN last_lsn, lsn;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ int is_dup, master, ret;
+
+ is_dup = ret = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ ret = __rep_apply(env, ip, rp, rec, ret_lsnp, &is_dup, &last_lsn);
+ switch (ret) {
+ /*
+ * We're in an internal backup and we've gotten
+ * all the log we need to run recovery. Do so now.
+ */
+ case DB_REP_LOGREADY:
+ if ((ret =
+ __rep_logready(env, rep, savetime, &last_lsn)) != 0)
+ goto out;
+ break;
+ /*
+ * If we get any of the "normal" returns, we only process
+ * LOG_MORE if this is not a duplicate record. If the
+ * record is a duplicate we don't want to handle LOG_MORE
+ * and request a multiple data stream (or trigger internal
+ * initialization) since this could be a very old record
+ * that no longer exists on the master.
+ */
+ case DB_REP_ISPERM:
+ case DB_REP_NOTPERM:
+ case 0:
+ if (is_dup)
+ goto out;
+ else
+ break;
+ /*
+ * Any other return (errors), we're done.
+ */
+ default:
+ goto out;
+ }
+ if (rp->rectype == REP_LOG_MORE) {
+ master = rep->master_id;
+
+ /*
+ * Keep the cycle from stalling: In case we got the LOG_MORE out
+ * of order, before some preceding log records, we want to make
+ * sure our follow-up request resumes from where the LOG_MORE
+ * said it should. (If the preceding log records never arrive,
+ * normal gap processing should take care of asking for them.)
+ * But if we already have this record and/or more, we need to
+ * ask to resume from what we need. The upshot is we need the
+ * max of lp->ready_lsn and the lsn from the message.
+ */
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ lsn = lp->ready_lsn;
+ if (LOG_COMPARE(&rp->lsn, &lsn) > 0)
+ lsn = rp->lsn;
+
+ /*
+ * If the master_id is invalid, this means that since
+ * the last record was sent, somebody declared an
+ * election and we may not have a master to request
+ * things of.
+ *
+ * This is not an error; when we find a new master,
+ * we'll re-negotiate where the end of the log is and
+ * try to bring ourselves up to date again anyway.
+ */
+ if (master == DB_EID_INVALID) {
+ ret = 0;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ goto out;
+ }
+ /*
+ * If we're waiting for records, set the wait_ts
+ * high so that we avoid re-requesting too soon and
+ * end up with multiple data streams.
+ */
+ if (IS_ZERO_LSN(lp->waiting_lsn))
+ lp->wait_ts = rep->max_gap;
+ ret = __rep_loggap_req(env, rep, &lsn, REP_GAP_FORCE);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ }
+out:
+ return (ret);
+}
+
+/*
+ * __rep_bulk_log --
+ *	Handle a REP_BULK_LOG message: split the bulk buffer into its
+ *	individual log records via __log_rep_split and, if that reports
+ *	DB_REP_LOGREADY, finish up through __rep_logready.
+ *
+ * PUBLIC: int __rep_bulk_log __P((ENV *, DB_THREAD_INFO *,
+ * PUBLIC:     __rep_control_args *, DBT *, time_t, DB_LSN *));
+ */
+int
+__rep_bulk_log(env, ip, rp, rec, savetime, ret_lsnp)
+	ENV *env;
+	DB_THREAD_INFO *ip;
+	__rep_control_args *rp;
+	DBT *rec;
+	time_t savetime;
+	DB_LSN *ret_lsnp;
+{
+	DB_LSN end_lsn;
+	DB_REP *db_rep;
+	REP *rep;
+	int ret;
+
+	db_rep = env->rep_handle;
+	rep = db_rep->region;
+
+	ret = __log_rep_split(env, ip, rp, rec, ret_lsnp, &end_lsn);
+
+	/*
+	 * DB_REP_LOGREADY means we are in an internal backup and now
+	 * hold all the log needed to run recovery, so run it.  Every
+	 * other value, success or error, goes back to the caller as is.
+	 */
+	if (ret == DB_REP_LOGREADY)
+		ret = __rep_logready(env, rep, savetime, &end_lsn);
+
+	return (ret);
+}
+
+/*
+ * __rep_logreq --
+ *	Handle a REP_LOG_REQ message.
+ *
+ *	Sends the record at rp->lsn to eid.  When the request carries a
+ *	data DBT (rec), that DBT holds the ending LSN of a gap and every
+ *	following record below that LSN is sent as well, subject to the
+ *	throttling limits and, if configured, through a bulk buffer.
+ *	REP_F_NOARCHIVE is set in the region while the log is read and
+ *	cleared before returning.
+ *
+ *	env:	environment handle.
+ *	rp:	control information of the request; rp->lsn is the first
+ *		record wanted.
+ *	rec:	optional ending LSN of the gap (raw DB_LSN for senders
+ *		older than DB_REPVERSION_47, marshaled otherwise); may be
+ *		NULL or empty for a single-record request.
+ *	eid:	environment id the records are sent to.
+ *
+ *	Returns 0 on success.  A client that cannot supply the requested
+ *	record returns DB_NOTFOUND; ENOENT is treated as success; other
+ *	errors are returned as is.
+ *
+ * PUBLIC: int __rep_logreq __P((ENV *, __rep_control_args *, DBT *, int));
+ */
+int
+__rep_logreq(env, rp, rec, eid)
+	ENV *env;
+	__rep_control_args *rp;
+	DBT *rec;
+	int eid;
+{
+	DBT data_dbt, newfiledbt;
+	DB_LOGC *logc;
+	DB_LSN firstlsn, lsn, oldfilelsn;
+	DB_REP *db_rep;
+	REP *rep;
+	REP_BULK bulk;
+	REP_THROTTLE repth;
+	__rep_logreq_args lr_args;
+	__rep_newfile_args nf_args;
+	uintptr_t bulkoff;
+	u_int32_t bulkflags, use_bulk;
+	int count, ret, t_ret;
+	u_int8_t buf[__REP_NEWFILE_SIZE];
+	size_t len;
+
+	ret = 0;
+	db_rep = env->rep_handle;
+	rep = db_rep->region;
+
+	/* COMPQUIET_LSN is what this is... */
+	ZERO_LSN(lr_args.endlsn);
+
+	if (rec != NULL && rec->size != 0) {
+		if (rp->rep_version < DB_REPVERSION_47)
+			lr_args.endlsn = *(DB_LSN *)rec->data;
+		else if ((ret = __rep_logreq_unmarshal(env, &lr_args,
+		    rec->data, rec->size, NULL)) != 0)
+			return (ret);
+		RPRINT(env, DB_VERB_REP_MISC, (env,
+		    "[%lu][%lu]: LOG_REQ max lsn: [%lu][%lu]",
+		    (u_long) rp->lsn.file, (u_long)rp->lsn.offset,
+		    (u_long)lr_args.endlsn.file,
+		    (u_long)lr_args.endlsn.offset));
+	}
+	/*
+	 * There are several different cases here.
+	 * 1. We asked logc_get for a particular LSN and got it.
+	 * 2. We asked logc_get for an LSN and it's not found because it is
+	 *	beyond the end of a log file and we need a NEWFILE msg.
+	 *	and then the record that was requested.
+	 * 3. We asked logc_get for an LSN and it is already archived.
+	 * 4. We asked logc_get for an LSN and it simply doesn't exist, but
+	 *	doesn't meet any of those other criteria, in which case
+	 *	it's an error (that should never happen on a master).
+	 *
+	 * If we have a valid LSN and the request has a data_dbt with
+	 * it, the sender is asking for a chunk of log records.
+	 * Then we need to send all records up to the LSN in the data dbt.
+	 */
+	memset(&data_dbt, 0, sizeof(data_dbt));
+	oldfilelsn = lsn = rp->lsn;
+	if ((ret = __log_cursor(env, &logc)) != 0)
+		return (ret);
+	REP_SYSTEM_LOCK(env);
+	F_SET(rep, REP_F_NOARCHIVE);
+	REP_SYSTEM_UNLOCK(env);
+	if ((ret = __logc_get(logc, &lsn, &data_dbt, DB_SET)) == 0) {
+		/* Case 1 */
+		(void)__rep_send_message(env,
+		    eid, REP_LOG, &lsn, &data_dbt, REPCTL_RESEND, 0);
+		oldfilelsn.offset += logc->len;
+	} else if (ret == DB_NOTFOUND) {
+		/*
+		 * If logc_get races with log_archive, it might return
+		 * DB_NOTFOUND.  We expect there to be some log record
+		 * that is the first one.  Loop until we either get
+		 * a log record or some error.  Since we only expect
+		 * to get this racing log_archive, bound it to a few
+		 * tries.
+		 */
+		count = 0;
+		do {
+			ret = __logc_get(logc, &firstlsn, &data_dbt, DB_FIRST);
+			count++;
+		} while (ret == DB_NOTFOUND && count < 10);
+		if (ret != 0)
+			goto err;
+		if (LOG_COMPARE(&firstlsn, &rp->lsn) > 0) {
+			/* Case 3 */
+			if (F_ISSET(rep, REP_F_CLIENT)) {
+				ret = DB_NOTFOUND;
+				goto err;
+			}
+			(void)__rep_send_message(env, eid,
+			    REP_VERIFY_FAIL, &rp->lsn, NULL, 0, 0);
+			ret = 0;
+			goto err;
+		}
+		ret = __rep_chk_newfile(env, logc, rep, rp, eid);
+		if (ret == DB_NOTFOUND) {
+			/* Case 4 */
+			/*
+			 * If we still get DB_NOTFOUND the client gave us an
+			 * unknown LSN, perhaps at the end of the log.  Ignore
+			 * it if we're the master.  Return DB_NOTFOUND if
+			 * we are the client.
+			 */
+			if (F_ISSET(rep, REP_F_MASTER)) {
+				__db_errx(env,
+				    "Request for LSN [%lu][%lu] not found",
+				    (u_long)rp->lsn.file,
+				    (u_long)rp->lsn.offset);
+				ret = 0;
+				goto err;
+			} else
+				ret = DB_NOTFOUND;
+		}
+	}
+
+	if (ret != 0)
+		goto err;
+
+	/*
+	 * If the user requested a gap, send the whole thing, while observing
+	 * the limits from rep_set_limit.
+	 *
+	 * If we're doing bulk transfer, allocate a bulk buffer to put our
+	 * log records in.  We still need to initialize the throttle info
+	 * because if we encounter a log record larger than our entire bulk
+	 * buffer, we need to send it as a singleton.
+	 *
+	 * Use a local var so we don't need to worry if someone else turns
+	 * on/off bulk in the middle of our call.
+	 *
+	 * From here on, a bulk buffer may be held.  Every exit from the
+	 * loop below must therefore fall through to __rep_bulk_free and
+	 * must not jump straight to err, which sits after the free.
+	 */
+	use_bulk = FLD_ISSET(rep->config, REP_C_BULK);
+	if (use_bulk && (ret = __rep_bulk_alloc(env, &bulk, eid,
+	    &bulkoff, &bulkflags, REP_BULK_LOG)) != 0)
+		goto err;
+	memset(&repth, 0, sizeof(repth));
+	REP_SYSTEM_LOCK(env);
+	repth.gbytes = rep->gbytes;
+	repth.bytes = rep->bytes;
+	repth.type = REP_LOG;
+	repth.data_dbt = &data_dbt;
+	REP_SYSTEM_UNLOCK(env);
+	while (ret == 0 && rec != NULL && rec->size != 0 &&
+	    repth.type == REP_LOG) {
+		if ((ret =
+		    __logc_get(logc, &repth.lsn, &data_dbt, DB_NEXT)) != 0) {
+			/*
+			 * If we're a client and we only have part of the gap,
+			 * return DB_NOTFOUND so that we send a REREQUEST
+			 * back to the requester and it can ask for more.
+			 */
+			if (ret == DB_NOTFOUND && F_ISSET(rep, REP_F_MASTER))
+				ret = 0;
+			break;
+		}
+		if (LOG_COMPARE(&repth.lsn, &lr_args.endlsn) >= 0)
+			break;
+		if (repth.lsn.file != oldfilelsn.file) {
+			if ((ret = __logc_version(logc, &nf_args.version)) != 0)
+				break;
+			memset(&newfiledbt, 0, sizeof(newfiledbt));
+			if (rep->version < DB_REPVERSION_47)
+				DB_INIT_DBT(newfiledbt, &nf_args.version,
+				    sizeof(nf_args.version));
+			else {
+				/*
+				 * Leave the loop with break, not a jump
+				 * to err: the bulk buffer allocated above
+				 * still has to be released below.
+				 */
+				if ((ret = __rep_newfile_marshal(env, &nf_args,
+				    buf, __REP_NEWFILE_SIZE, &len)) != 0)
+					break;
+				DB_INIT_DBT(newfiledbt, buf, len);
+			}
+			(void)__rep_send_message(env,
+			    eid, REP_NEWFILE, &oldfilelsn, &newfiledbt,
+			    REPCTL_RESEND, 0);
+		}
+		/*
+		 * If we are configured for bulk, try to send this as a bulk
+		 * request.  If not configured, or it is too big for bulk
+		 * then just send normally.
+		 */
+		if (use_bulk)
+			ret = __rep_bulk_message(env, &bulk, &repth,
+			    &repth.lsn, &data_dbt, REPCTL_RESEND);
+		if (!use_bulk || ret == DB_REP_BULKOVF)
+			ret = __rep_send_throttle(env, eid, &repth, 0, 0);
+		if (ret != 0) {
+			/* Ignore send failure, except to break the loop. */
+			if (ret == DB_REP_UNAVAIL)
+				ret = 0;
+			break;
+		}
+		/*
+		 * If we are about to change files, then we'll need the
+		 * last LSN in the previous file.  Save it here.
+		 */
+		oldfilelsn = repth.lsn;
+		oldfilelsn.offset += logc->len;
+	}
+
+	/*
+	 * We're done, force out whatever remains in the bulk buffer and
+	 * free it.  An error already recorded in ret takes precedence
+	 * over one from the free.
+	 */
+	if (use_bulk && (t_ret = __rep_bulk_free(env, &bulk,
+	    REPCTL_RESEND)) != 0 && ret == 0 &&
+	    t_ret != DB_REP_UNAVAIL)
+		ret = t_ret;
+err:
+	/*
+	 * We could have raced an unlink from an earlier log_archive
+	 * and the user is removing the files themselves, now.  If
+	 * we get an error indicating the log file might no longer
+	 * exist, ignore it.
+	 */
+	if (ret == ENOENT)
+		ret = 0;
+	REP_SYSTEM_LOCK(env);
+	F_CLR(rep, REP_F_NOARCHIVE);
+	REP_SYSTEM_UNLOCK(env);
+	if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+
+/*
+ * __rep_loggap_req -
+ * Request a log gap. Assumes the caller holds the REP->mtx_clientdb.
+ *
+ * lsnp is the current LSN we're handling. It is used to help decide
+ * if we ask for a gap or singleton.
+ * gapflags are flags that may override the algorithm or control the
+ * processing in some way.
+ *
+ * With REP_GAP_FORCE the request starts at *lsnp (so lsnp must not be
+ * NULL in that case); otherwise it starts at lp->ready_lsn. The
+ * function updates lp->max_wait_lsn to record what was asked for.
+ * The request goes to the current master; if no master is known a
+ * REP_MASTER_REQ is broadcast instead.
+ *
+ * Returns 0 unless marshaling the ending LSN fails; the results of
+ * the message sends themselves are ignored.
+ *
+ * PUBLIC: int __rep_loggap_req __P((ENV *, REP *, DB_LSN *, u_int32_t));
+ */
+int
+__rep_loggap_req(env, rep, lsnp, gapflags)
+ ENV *env;
+ REP *rep;
+ DB_LSN *lsnp;
+ u_int32_t gapflags;
+{
+ DBT max_lsn_dbt, *max_lsn_dbtp;
+ DB_LOG *dblp;
+ DB_LSN next_lsn;
+ LOG *lp;
+ __rep_logreq_args lr_args;
+ size_t len;
+ u_int32_t ctlflags, flags, type;
+ int master, ret;
+ u_int8_t buf[__REP_LOGREQ_SIZE];
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ if (FLD_ISSET(gapflags, REP_GAP_FORCE))
+ next_lsn = *lsnp;
+ else
+ next_lsn = lp->ready_lsn;
+ ctlflags = flags = 0;
+ type = REP_LOG_REQ;
+ ret = 0;
+
+ /*
+ * Check if we need to ask for the gap.
+ * We ask for the gap if:
+ * We are forced to with gapflags.
+ * If max_wait_lsn is ZERO_LSN - we've never asked for
+ * records before.
+ * If we asked for a single record and received it.
+ *
+ * If we want a gap, but don't have an ending LSN (waiting_lsn)
+ * send an ALL_REQ. This is primarily used by REP_REREQUEST when
+ * an ALL_REQ was not able to be fulfilled by another client.
+ */
+ if (FLD_ISSET(gapflags, (REP_GAP_FORCE | REP_GAP_REREQUEST)) ||
+ IS_ZERO_LSN(lp->max_wait_lsn) ||
+ (lsnp != NULL && LOG_COMPARE(lsnp, &lp->max_wait_lsn) == 0)) {
+ lp->max_wait_lsn = lp->waiting_lsn;
+ /*
+ * If we are forcing a gap, we need to send a max_wait_lsn
+ * that may be beyond the current gap/waiting_lsn (but
+ * it may not be). If we cannot determine any future
+ * waiting LSN, then it should be zero. If we're in
+ * internal init, it should be our ending LSN.
+ */
+ if (FLD_ISSET(gapflags, REP_GAP_FORCE)) {
+ if (LOG_COMPARE(&lp->max_wait_lsn, lsnp) <= 0) {
+ if (F_ISSET(rep, REP_F_RECOVER_LOG)) {
+ DB_ASSERT(env, LOG_COMPARE(lsnp,
+ &rep->last_lsn) <= 0);
+ lp->max_wait_lsn = rep->last_lsn;
+ } else
+ ZERO_LSN(lp->max_wait_lsn);
+ }
+ }
+ if (IS_ZERO_LSN(lp->max_wait_lsn))
+ type = REP_ALL_REQ;
+ memset(&max_lsn_dbt, 0, sizeof(max_lsn_dbt));
+ lr_args.endlsn = lp->max_wait_lsn;
+ if (rep->version < DB_REPVERSION_47)
+ DB_INIT_DBT(max_lsn_dbt, &lp->max_wait_lsn,
+ sizeof(DB_LSN));
+ else {
+ if ((ret = __rep_logreq_marshal(env, &lr_args, buf,
+ __REP_LOGREQ_SIZE, &len)) != 0)
+ goto err;
+ DB_INIT_DBT(max_lsn_dbt, buf, len);
+ }
+ max_lsn_dbtp = &max_lsn_dbt;
+ /*
+ * Gap requests are "new" and can go anywhere, unless
+ * this is already a re-request.
+ */
+ if (FLD_ISSET(gapflags, REP_GAP_REREQUEST))
+ flags = DB_REP_REREQUEST;
+ else
+ flags = DB_REP_ANYWHERE;
+ } else {
+ max_lsn_dbtp = NULL;
+ lp->max_wait_lsn = next_lsn;
+ /*
+ * If we're dropping to singletons, this is a re-request.
+ */
+ flags = DB_REP_REREQUEST;
+ }
+ if ((master = rep->master_id) != DB_EID_INVALID) {
+ STAT(rep->stat.st_log_requested++);
+ if (F_ISSET(rep, REP_F_RECOVER_LOG))
+ ctlflags = REPCTL_INIT;
+ (void)__rep_send_message(env, master,
+ type, &next_lsn, max_lsn_dbtp, ctlflags, flags);
+ } else
+ (void)__rep_send_message(env, DB_EID_BROADCAST,
+ REP_MASTER_REQ, NULL, NULL, 0, 0);
+err:
+ return (ret);
+}
+
+/*
+ * __rep_logready -
+ *	React to DB_REP_LOGREADY, which any call to __rep_apply can
+ *	return: flush the log, run __rep_verify_match against the last
+ *	LSN and, when both succeed, leave the log-recovery state.  If
+ *	either step fails the environment is panicked.
+ *
+ * PUBLIC: int __rep_logready __P((ENV *, REP *, time_t, DB_LSN *));
+ */
+int
+__rep_logready(env, rep, savetime, last_lsnp)
+	ENV *env;
+	REP *rep;
+	time_t savetime;
+	DB_LSN *last_lsnp;
+{
+	int ret;
+
+	/* The match is only attempted once the flush has succeeded. */
+	ret = __log_flush(env, NULL);
+	if (ret == 0)
+		ret = __rep_verify_match(env, last_lsnp, savetime);
+
+	if (ret != 0) {
+		__db_errx(env,
+	"Client initialization failed. Need to manually restore client");
+		return (__env_panic(env, ret));
+	}
+
+	REP_SYSTEM_LOCK(env);
+	ZERO_LSN(rep->first_lsn);
+
+	if (rep->originfo != NULL) {
+		__os_free(env, rep->originfo);
+		rep->originfo = NULL;
+	}
+
+	F_CLR(rep, REP_F_RECOVER_LOG);
+	F_SET(rep, REP_F_NIMDBS_LOADED);
+	REP_SYSTEM_UNLOCK(env);
+
+	return (ret);
+}
+
+/*
+ * __rep_chk_newfile --
+ *	Work out whether a DB_NOTFOUND from the log cursor simply means
+ *	the requested LSN lies past the end of its log file, in which
+ *	case the requester needs a NEWFILE message.
+ *
+ *	The cases handled here (the caller deals with the case where the
+ *	record was found):
+ *	- The LSN is beyond the end of a log file and a later file
+ *	  exists: send REP_NEWFILE carrying the end of the requested
+ *	  file and return 0.
+ *	- The end of the requested file cannot be read: on a master send
+ *	  REP_VERIFY_FAIL and return 0; on a client return DB_NOTFOUND.
+ *	- There is no later log file: return DB_NOTFOUND and let the
+ *	  caller decide whether that is an error.
+ *
+ *	Errors from __logc_version or __rep_newfile_marshal are returned
+ *	as is.
+ */
+static int
+__rep_chk_newfile(env, logc, rep, rp, eid)
+	ENV *env;
+	DB_LOGC *logc;
+	REP *rep;
+	__rep_control_args *rp;
+	int eid;
+{
+	DBT newfile_dbt, rec_dbt;
+	DB_LOG *dblp;
+	DB_LSN last_lsn;
+	LOG *lp;
+	__rep_newfile_args nf_args;
+	int ret;
+	u_int8_t buf[__REP_NEWFILE_SIZE];
+	size_t len;
+
+	dblp = env->lg_handle;
+	lp = dblp->reginfo.primary;
+	memset(&rec_dbt, 0, sizeof(rec_dbt));
+
+	/* Sample the current end of the log under the log region lock. */
+	LOG_SYSTEM_LOCK(env);
+	last_lsn = lp->lsn;
+	LOG_SYSTEM_UNLOCK(env);
+
+	/* No later log file exists, so this is not a file switch. */
+	if (last_lsn.file <= rp->lsn.file)
+		return (DB_NOTFOUND);
+
+	/*
+	 * To learn the LSN of the last record in the requested file,
+	 * position on {file + 1, 0} and then step back one record.
+	 */
+	last_lsn.file = rp->lsn.file + 1;
+	last_lsn.offset = 0;
+	ret = __logc_get(logc, &last_lsn, &rec_dbt, DB_SET);
+	if (ret == 0)
+		ret = __logc_get(logc, &last_lsn, &rec_dbt, DB_PREV);
+	if (ret != 0) {
+		RPRINT(env, DB_VERB_REP_MISC, (env,
+		    "Unable to get prev of [%lu][%lu]",
+		    (u_long)rp->lsn.file,
+		    (u_long)rp->lsn.offset));
+		/*
+		 * The requester asked for a log record we no longer
+		 * have, so it is outdated.  A client servicing the
+		 * request just reports DB_NOTFOUND.  A master pushes
+		 * the failure back so that the requester performs an
+		 * internal backup.
+		 * XXX - This could be optimized by having the master
+		 * perform and send a REP_UPDATE message.  We currently
+		 * want the client to set up its 'update' state prior
+		 * to requesting REP_UPDATE_REQ.
+		 */
+		if (!F_ISSET(rep, REP_F_MASTER))
+			return (DB_NOTFOUND);
+		(void)__rep_send_message(env, eid,
+		    REP_VERIFY_FAIL, &rp->lsn, NULL, 0, 0);
+		return (0);
+	}
+
+	/* The LSN sent with NEWFILE is one past the last record. */
+	last_lsn.offset += logc->len;
+	if ((ret = __logc_version(logc, &nf_args.version)) != 0)
+		return (ret);
+
+	/*
+	 * Sites older than DB_REPVERSION_47 take the raw version field;
+	 * newer ones take the marshaled form.
+	 */
+	memset(&newfile_dbt, 0, sizeof(newfile_dbt));
+	if (rep->version < DB_REPVERSION_47) {
+		DB_INIT_DBT(newfile_dbt,
+		    &nf_args.version, sizeof(nf_args.version));
+	} else {
+		if ((ret = __rep_newfile_marshal(env,
+		    &nf_args, buf, __REP_NEWFILE_SIZE, &len)) != 0)
+			return (ret);
+		DB_INIT_DBT(newfile_dbt, buf, len);
+	}
+	(void)__rep_send_message(env, eid,
+	    REP_NEWFILE, &last_lsn, &newfile_dbt, REPCTL_RESEND, 0);
+
+	return (0);
+}
diff --git a/rep/rep_method.c b/rep/rep_method.c
new file mode 100644
index 0000000..fb21f7e
--- /dev/null
+++ b/rep/rep_method.c
@@ -0,0 +1,2142 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __rep_abort_prepared __P((ENV *));
+static int __rep_bt_cmp __P((DB *, const DBT *, const DBT *));
+static void __rep_config_map __P((ENV *, u_int32_t *, u_int32_t *));
+static u_int32_t __rep_conv_vers __P((ENV *, u_int32_t));
+static int __rep_restore_prepared __P((ENV *));
+
+/*
+ * __rep_env_create --
+ *	Allocate the per-environment replication handle, fill in its
+ *	defaults and attach it to the ENV.
+ *
+ * PUBLIC: int __rep_env_create __P((DB_ENV *));
+ */
+int
+__rep_env_create(dbenv)
+	DB_ENV *dbenv;
+{
+	DB_REP *db_rep;
+	ENV *env;
+	int ret;
+
+	env = dbenv->env;
+
+	ret = __os_calloc(env, 1, sizeof(DB_REP), &db_rep);
+	if (ret != 0)
+		return (ret);
+
+	/* Identity and election defaults. */
+	db_rep->eid = DB_EID_INVALID;
+	db_rep->my_priority = DB_REP_DEFAULT_PRIORITY;
+	db_rep->elect_timeout = 2 * US_PER_SEC;		/* 2 seconds */
+
+	/* Transfer throttling and re-request timing. */
+	db_rep->bytes = REP_DEFAULT_THROTTLE;
+	DB_TIMEOUT_TO_TIMESPEC(DB_REP_REQUEST_GAP, &db_rep->request_gap);
+	DB_TIMEOUT_TO_TIMESPEC(DB_REP_MAX_GAP, &db_rep->max_gap);
+	db_rep->chkpt_delay = 30 * US_PER_SEC;		/* 30 seconds */
+
+	/*
+	 * By default there is no clock skew, which is expressed by
+	 * giving both fields the same non-zero value.
+	 */
+	db_rep->clock_base = 1;
+	db_rep->clock_skew = 1;
+
+#ifdef HAVE_REPLICATION_THREADS
+	ret = __repmgr_env_create(env, db_rep);
+	if (ret != 0) {
+		__os_free(env, db_rep);
+		return (ret);
+	}
+#endif
+
+	env->rep_handle = db_rep;
+	return (0);
+}
+
+/*
+ * __rep_env_destroy --
+ *	Release the replication handle attached to the ENV, if there is
+ *	one, and clear the ENV's reference to it.
+ *
+ * PUBLIC: void __rep_env_destroy __P((DB_ENV *));
+ */
+void
+__rep_env_destroy(dbenv)
+	DB_ENV *dbenv;
+{
+	DB_REP *db_rep;
+	ENV *env;
+
+	env = dbenv->env;
+	db_rep = env->rep_handle;
+
+	/* Nothing was ever attached. */
+	if (db_rep == NULL)
+		return;
+
+#ifdef HAVE_REPLICATION_THREADS
+	__repmgr_env_destroy(env, db_rep);
+#endif
+	__os_free(env, db_rep);
+	env->rep_handle = NULL;
+}
+
+/*
+ * __rep_get_config --
+ *	Report whether a replication configuration option is turned on.
+ *	*onp is set to 1 if any of the requested options is set in the
+ *	active configuration and to 0 otherwise.  The shared region's
+ *	configuration is consulted once replication is on; before that
+ *	the handle's private copy is used.
+ *
+ * PUBLIC: int __rep_get_config __P((DB_ENV *, u_int32_t, int *));
+ */
+int
+__rep_get_config(dbenv, which, onp)
+	DB_ENV *dbenv;
+	u_int32_t which;
+	int *onp;
+{
+	DB_REP *db_rep;
+	ENV *env;
+	u_int32_t active, mapped;
+
+	env = dbenv->env;
+
+#undef	OK_FLAGS
+#define	OK_FLAGS							\
+	(DB_REP_CONF_BULK | DB_REP_CONF_DELAYCLIENT | DB_REP_CONF_INMEM | \
+	DB_REP_CONF_LEASE | DB_REP_CONF_NOAUTOINIT | DB_REP_CONF_NOWAIT | \
+	DB_REPMGR_CONF_2SITE_STRICT)
+
+	/* Refuse anything outside the known option bits. */
+	if (FLD_ISSET(which, ~OK_FLAGS))
+		return (__db_ferr(env, "DB_ENV->rep_get_config", 0));
+
+	db_rep = env->rep_handle;
+	ENV_NOT_CONFIGURED(
+	    env, db_rep->region, "DB_ENV->rep_get_config", DB_INIT_REP);
+
+	/* Translate the public option bits into the internal ones. */
+	mapped = 0;
+	__rep_config_map(env, &which, &mapped);
+
+	active = REP_ON(env) ? db_rep->region->config : db_rep->config;
+	*onp = FLD_ISSET(active, mapped) ? 1 : 0;
+
+	return (0);
+}
+
+/*
+ * __rep_set_config --
+ *	Configure the replication subsystem.
+ *
+ *	dbenv:	environment handle.
+ *	which:	public DB_REP_CONF_* / DB_REPMGR_CONF_* option bits.
+ *	on:	non-zero to turn the options on, zero to turn them off.
+ *
+ *	Before replication is on, the change is recorded in the handle's
+ *	private configuration.  Once it is on, the shared region's
+ *	configuration is updated under mtx_clientdb and the region mutex;
+ *	turning bulk transfer off sends out whatever the bulk buffer
+ *	still holds.
+ *
+ *	Returns 0 on success and EINVAL for an invalid request:
+ *	2SITE_STRICT from a base replication application, in-memory
+ *	replication after replication is on, or leases configured after
+ *	rep_start or being turned off.  A failure of __rep_send_bulk is
+ *	returned as is.
+ *
+ * PUBLIC: int __rep_set_config __P((DB_ENV *, u_int32_t, int));
+ */
+int
+__rep_set_config(dbenv, which, on)
+	DB_ENV *dbenv;
+	u_int32_t which;
+	int on;
+{
+	DB_LOG *dblp;
+	DB_REP *db_rep;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	LOG *lp;
+	REP *rep;
+	REP_BULK bulk;
+	u_int32_t mapped, orig;
+	int ret;
+
+	env = dbenv->env;
+	db_rep = env->rep_handle;
+	ret = 0;
+
+#undef	OK_FLAGS
+#define	OK_FLAGS							\
+	(DB_REP_CONF_BULK | DB_REP_CONF_DELAYCLIENT | DB_REP_CONF_INMEM | \
+	DB_REP_CONF_LEASE | DB_REP_CONF_NOAUTOINIT | DB_REP_CONF_NOWAIT | \
+	DB_REPMGR_CONF_2SITE_STRICT)
+
+	ENV_NOT_CONFIGURED(
+	    env, db_rep->region, "DB_ENV->rep_set_config", DB_INIT_REP);
+
+	if (FLD_ISSET(which, ~OK_FLAGS))
+		return (__db_ferr(env, "DB_ENV->rep_set_config", 0));
+
+	mapped = 0;
+	__rep_config_map(env, &which, &mapped);
+
+	if (APP_IS_BASEAPI(env) && FLD_ISSET(mapped, REP_C_2SITE_STRICT)) {
+		__db_errx(env, "%s %s", "DB_ENV->rep_set_config:",
+"cannot configure 2SITE_STRICT from base replication application");
+		return (EINVAL);
+	}
+
+	if (REP_ON(env)) {
+		/*
+		 * Every return below this point must be preceded by
+		 * ENV_LEAVE so that the ENV_ENTER here is balanced.
+		 */
+		ENV_ENTER(env, ip);
+
+		rep = db_rep->region;
+		/*
+		 * In-memory replication must be called before calling
+		 * env->open.  If it is turned on and off before env->open,
+		 * it doesn't matter.  Any attempt to turn it on or off after
+		 * env->open is intercepted by this error.
+		 */
+		if (FLD_ISSET(mapped, REP_C_INMEM)) {
+			__db_errx(env, "%s %s", "DB_ENV->rep_set_config:",
+	"in-memory replication must be configured before DB_ENV->open");
+			ENV_LEAVE(env, ip);
+			return (EINVAL);
+		}
+		/*
+		 * Leases must be turned on before calling rep_start.
+		 * Leases can never be turned off once they're turned on.
+		 * Both problems are reported when both apply.
+		 */
+		if (FLD_ISSET(mapped, REP_C_LEASE)) {
+			if (F_ISSET(rep, REP_F_START_CALLED)) {
+				__db_errx(env,
+"DB_ENV->rep_set_config: leases must be configured before DB_ENV->rep_start");
+				ret = EINVAL;
+			}
+			if (on == 0) {
+				__db_errx(env,
+	"DB_ENV->rep_set_config: leases cannot be turned off");
+				ret = EINVAL;
+			}
+			if (ret != 0) {
+				ENV_LEAVE(env, ip);
+				return (ret);
+			}
+		}
+		MUTEX_LOCK(env, rep->mtx_clientdb);
+		REP_SYSTEM_LOCK(env);
+		orig = rep->config;
+		if (on)
+			FLD_SET(rep->config, mapped);
+		else
+			FLD_CLR(rep->config, mapped);
+
+		/*
+		 * Bulk transfer requires special processing if it is getting
+		 * toggled.
+		 */
+		dblp = env->lg_handle;
+		lp = dblp->reginfo.primary;
+		if (FLD_ISSET(rep->config, REP_C_BULK) &&
+		    !FLD_ISSET(orig, REP_C_BULK))
+			db_rep->bulk = R_ADDR(&dblp->reginfo, lp->bulk_buf);
+		REP_SYSTEM_UNLOCK(env);
+
+		/*
+		 * If turning bulk off and it was on, send out whatever is in
+		 * the buffer already.
+		 */
+		if (FLD_ISSET(orig, REP_C_BULK) &&
+		    !FLD_ISSET(rep->config, REP_C_BULK) && lp->bulk_off != 0) {
+			memset(&bulk, 0, sizeof(bulk));
+			if (db_rep->bulk == NULL)
+				bulk.addr =
+				    R_ADDR(&dblp->reginfo, lp->bulk_buf);
+			else
+				bulk.addr = db_rep->bulk;
+			bulk.offp = &lp->bulk_off;
+			bulk.len = lp->bulk_len;
+			bulk.type = REP_BULK_LOG;
+			bulk.eid = DB_EID_BROADCAST;
+			bulk.flagsp = &lp->bulk_flags;
+			ret = __rep_send_bulk(env, &bulk, 0);
+		}
+		MUTEX_UNLOCK(env, rep->mtx_clientdb);
+
+		ENV_LEAVE(env, ip);
+	} else {
+		if (on)
+			FLD_SET(db_rep->config, mapped);
+		else
+			FLD_CLR(db_rep->config, mapped);
+	}
+	/* Configuring 2SITE_STRICT makes this a repmgr application */
+	if (ret == 0 && FLD_ISSET(mapped, REP_C_2SITE_STRICT))
+		APP_SET_REPMGR(env);
+	return (ret);
+}
+
+/*
+ * __rep_config_map --
+ *	Translate public replication option bits into their internal
+ *	REP_C_* counterparts.  Each recognized bit found in *inflagsp is
+ *	set in *outflagsp and cleared from *inflagsp.
+ */
+static void
+__rep_config_map(env, inflagsp, outflagsp)
+	ENV *env;
+	u_int32_t *inflagsp, *outflagsp;
+{
+	static const struct {
+		u_int32_t api_flag;	/* Public option bit. */
+		u_int32_t rep_flag;	/* Internal REP_C_* bit. */
+	} flag_map[] = {
+		{ DB_REP_CONF_BULK,		REP_C_BULK },
+		{ DB_REP_CONF_DELAYCLIENT,	REP_C_DELAYCLIENT },
+		{ DB_REP_CONF_INMEM,		REP_C_INMEM },
+		{ DB_REP_CONF_LEASE,		REP_C_LEASE },
+		{ DB_REP_CONF_NOAUTOINIT,	REP_C_NOAUTOINIT },
+		{ DB_REP_CONF_NOWAIT,		REP_C_NOWAIT },
+		{ DB_REPMGR_CONF_2SITE_STRICT,	REP_C_2SITE_STRICT }
+	};
+	size_t i;
+
+	COMPQUIET(env, NULL);
+
+	for (i = 0; i < sizeof(flag_map) / sizeof(flag_map[0]); i++) {
+		if (!FLD_ISSET(*inflagsp, flag_map[i].api_flag))
+			continue;
+		FLD_SET(*outflagsp, flag_map[i].rep_flag);
+		FLD_CLR(*inflagsp, flag_map[i].api_flag);
+	}
+}
+
+/*
+ * __rep_start_pp --
+ *	Public entry point for DB_ENV->rep_start.  Validates the call (the
+ *	environment must be configured for replication, the application must
+ *	not be a Replication Manager application, exactly one of
+ *	DB_REP_CLIENT or DB_REP_MASTER must be given, and a transport
+ *	function must already be installed) and then hands off to
+ *	__rep_start_int.  Returns EINVAL when a check fails.
+ *
+ * PUBLIC: int __rep_start_pp __P((DB_ENV *, DBT *, u_int32_t));
+ */
+int
+__rep_start_pp(dbenv, dbt, flags)
+	DB_ENV *dbenv;
+	DBT *dbt;
+	u_int32_t flags;
+{
+	DB_REP *db_rep;
+	ENV *env;
+	u_int32_t role;
+
+	env = dbenv->env;
+	db_rep = env->rep_handle;
+
+	ENV_REQUIRES_CONFIG_XX(
+	    env, rep_handle, "DB_ENV->rep_start", DB_INIT_REP);
+
+	if (APP_IS_REPMGR(env)) {
+		__db_errx(env,
+"DB_ENV->rep_start: cannot call from Replication Manager application");
+		return (EINVAL);
+	}
+
+	/* Neither bit set, or both bits set, is a usage error. */
+	role = LF_ISSET(DB_REP_CLIENT | DB_REP_MASTER);
+	if (role != DB_REP_CLIENT && role != DB_REP_MASTER) {
+		__db_errx(env,
+	"DB_ENV->rep_start: must specify DB_REP_CLIENT or DB_REP_MASTER");
+		return (EINVAL);
+	}
+
+	/* Starting replication sends messages, so a transport is required. */
+	if (db_rep->send == NULL) {
+		__db_errx(env,
+    "DB_ENV->rep_start: must be called after DB_ENV->rep_set_transport");
+		return (EINVAL);
+	}
+
+	return (__rep_start_int(env, dbt, flags));
+}
+
+/*
+ * __rep_start_int --
+ * Internal processing to become a master or client and start sending
+ * messages to participate in the replication environment.
+ *
+ * We must protect rep_start_int, which may change the world, with the rest
+ * of the DB library. Each API interface will count itself as it enters
+ * the library. Rep_start_int checks the following:
+ *
+ * rep->msg_th - this is the count of threads currently in rep_process_message
+ * rep->handle_cnt - number of threads actively using a dbp in library.
+ * rep->txn_cnt - number of active txns.
+ * REP_F_READY_* - Replication flag that indicates that we wish to run
+ * recovery, and want to prohibit new transactions from entering and cause
+ * existing ones to return immediately (with a DB_LOCK_DEADLOCK error).
+ *
+ * There is also the renv->rep_timestamp which is updated whenever significant
+ * events occur (e.g., new masters, log rollback, etc). Upon creation, a handle
+ * is associated with the current timestamp. Each time a handle enters the
+ * library it must check if the handle timestamp is the same as the one
+ * stored in the replication region. This prevents the use of handles on
+ * clients that reference non-existent files whose creation was backed out
+ * during a synchronizing recovery.
+ *
+ * Arguments:
+ * env - the environment being started.
+ * dbt - caller data that is copied with __dbt_usercopy and broadcast in
+ * the REP_NEWCLIENT message on the client path; on every path it is
+ * handed to __dbt_userfree before returning.
+ * flags - the role is taken from the DB_REP_CLIENT | DB_REP_MASTER bits;
+ * any value other than DB_REP_MASTER takes the client path.
+ *
+ * Returns 0 on success. If another thread is already in rep_start, or a
+ * message lockout is already in progress, the function returns without
+ * changing roles and with ret still 0. Returns EINVAL for lease
+ * configuration problems, DB_REP_UNAVAIL when asked to become master
+ * during internal init, or the error of whichever helper failed.
+ *
+ * PUBLIC: int __rep_start_int __P((ENV *, DBT *, u_int32_t));
+ */
+int
+__rep_start_int(env, dbt, flags)
+ ENV *env;
+ DBT *dbt;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_LOG *dblp;
+ DB_LOGC *logc;
+ DB_LSN lsn, perm_lsn;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ DB_TXNREGION *region;
+ LOG *lp;
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ db_timeout_t tmp;
+ u_int32_t oldvers, pending_event, repflags, role;
+ int do_ckp, interrupting, locked, ret, role_chg, start_th, t_ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ infop = env->reginfo;
+ renv = infop->primary;
+ interrupting = locked = 0;
+ pending_event = DB_EVENT_NO_SUCH_EVENT;
+ role = LF_ISSET(DB_REP_CLIENT | DB_REP_MASTER);
+ start_th = 0;
+ do_ckp = 0;
+
+ /*
+ * If we're using master leases, check that all needed
+ * setup has been done, including setting the lease timeout.
+ */
+ if (IS_USING_LEASES(env) && rep->lease_timeout == 0) {
+ __db_errx(env,
+"DB_ENV->rep_start: must call DB_ENV->rep_set_timeout for leases first");
+ return (EINVAL);
+ }
+
+ ENV_ENTER(env, ip);
+
+ /*
+ * In order to correctly check log files for old versions, we
+ * need to flush the logs.
+ */
+ if ((ret = __log_flush(env, NULL)) != 0)
+ goto out;
+
+ REP_SYSTEM_LOCK(env);
+ /*
+ * We only need one thread to start-up replication, so if
+ * there is another thread in rep_start, we'll let it finish
+ * its work and have this thread simply return. Similarly,
+ * if a thread is in a critical lockout section we return.
+ */
+ if (F_ISSET(rep, REP_F_INREPSTART)) {
+ /*
+ * There is already someone in rep_start. Return.
+ * ret is still 0 here, so the caller sees success and the
+ * 'out' code below still sets REP_F_START_CALLED.
+ */
+ RPRINT(env, DB_VERB_REP_MISC,
+ (env, "Thread already in rep_start"));
+ REP_SYSTEM_UNLOCK(env);
+ goto out;
+ } else {
+ F_SET(rep, REP_F_INREPSTART);
+ start_th = 1;
+ }
+
+ if (F_ISSET(rep, REP_F_READY_MSG)) {
+ /*
+ * There is already someone in msg lockout. Return.
+ * As above, ret is 0; start_th is set, so 'out' clears
+ * REP_F_INREPSTART on the way back.
+ */
+ RPRINT(env, DB_VERB_REP_MISC,
+ (env, "Thread already in msg lockout"));
+ REP_SYSTEM_UNLOCK(env);
+ goto out;
+ } else if ((ret = __rep_lockout_msg(env, rep, 0)) != 0)
+ goto errunlock;
+
+ /*
+ * If we are internal init and we try to become master, reject it.
+ * Our environment databases/logs are in an inconsistent state and
+ * we cannot become master.
+ */
+ if (IN_INTERNAL_INIT(rep) && role == DB_REP_MASTER) {
+ __db_errx(env,
+"DB_ENV->rep_start: Cannot become master during internal init");
+ ret = DB_REP_UNAVAIL;
+ goto errunlock;
+ }
+
+ role_chg = (!F_ISSET(rep, REP_F_MASTER) && role == DB_REP_MASTER) ||
+ (!F_ISSET(rep, REP_F_CLIENT) && role == DB_REP_CLIENT);
+
+ /*
+ * Wait for any active txns or mpool ops to complete, and
+ * prevent any new ones from occurring, only if we're
+ * changing roles.
+ */
+ if (role_chg) {
+ if ((ret = __rep_lockout_api(env, rep)) != 0)
+ goto errunlock;
+ locked = 1;
+ }
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ if (role == DB_REP_MASTER) {
+ if (role_chg) {
+ /*
+ * If we were previously a client, it's possible we
+ * could have an interruptible STARTSYNC in progress.
+ * Interrupt it now, so that it doesn't slow down our
+ * transition to master, and because its effects aren't
+ * doing us any good anyway.
+ */
+ (void)__memp_set_config(
+ env->dbenv, DB_MEMP_SYNC_INTERRUPT, 1);
+ interrupting = 1;
+
+ /*
+ * If we're upgrading from having been a client,
+ * preclose, so that we close our temporary database
+ * and any files we opened while doing a rep_apply.
+ * If we don't we can infinitely leak file ids if
+ * the master crashed with files open (the likely
+ * case). If we don't close them we can run into
+ * problems if we try to remove that file or long
+ * running applications end up with an unbounded
+ * number of used fileids, each getting written
+ * on checkpoint. Just close them.
+ * Then invalidate all files open in the logging
+ * region. These are files open by other processes
+ * attached to the environment. They must be
+ * closed by the other processes when they notice
+ * the change in role.
+ */
+ if ((ret = __rep_preclose(env)) != 0)
+ goto errunlock;
+
+ rep->gen++;
+ /*
+ * There could have been any number of failed
+ * elections, so jump the gen if we need to now.
+ */
+ if (rep->egen > rep->gen)
+ rep->gen = rep->egen;
+ if (IS_USING_LEASES(env) &&
+ !F_ISSET(rep, REP_F_MASTERELECT)) {
+ __db_errx(env,
+ "rep_start: Cannot become master without being elected when using leases.");
+ ret = EINVAL;
+ goto errunlock;
+ }
+ if (F_ISSET(rep, REP_F_MASTERELECT)) {
+ __rep_elect_done(env, rep, 0);
+ F_CLR(rep, REP_F_MASTERELECT);
+ }
+ if (rep->egen <= rep->gen)
+ rep->egen = rep->gen + 1;
+ RPRINT(env, DB_VERB_REP_MISC, (env,
+ "New master gen %lu, egen %lu",
+ (u_long)rep->gen, (u_long)rep->egen));
+ /*
+ * If not running in-memory replication, write
+ * gen file.
+ */
+ if (!FLD_ISSET(rep->config, REP_C_INMEM)) {
+ if ((ret = __rep_write_gen(env, rep, rep->gen))
+ != 0)
+ goto errunlock;
+ } else if (!F_ISSET(rep, REP_F_MASTERELECT))
+ /*
+ * Help detect if application has
+ * ignored our recommendation against
+ * reappointing same master after a
+ * crash/reboot when running in-memory
+ * replication. Doing this allows a
+ * slight chance of two masters at the
+ * same generation resulting in client
+ * crashes.
+ *
+ * NOTE(review): REP_F_MASTERELECT was
+ * cleared a few lines above whenever it
+ * was set, so this test looks like it is
+ * always true here and the message is
+ * printed for elected masters as well --
+ * confirm whether that is intended.
+ */
+ RPRINT(env, DB_VERB_REP_MISC, (env,
+ "Appointed new master while running in-memory replication."));
+ }
+ /*
+ * Set lease duration assuming clients have faster clock.
+ * Master needs to compensate so that clients do not
+ * expire their grant while the master thinks it is valid.
+ */
+ if (IS_USING_LEASES(env) &&
+ (role_chg || !IS_REP_STARTED(env))) {
+ /*
+ * If we have already granted our lease, we
+ * cannot become master.
+ */
+ if ((ret = __rep_islease_granted(env))) {
+ __db_errx(env,
+ "rep_start: Cannot become master with outstanding lease granted.");
+ ret = EINVAL;
+ goto errunlock;
+ }
+ /*
+ * Set max_perm_lsn to last PERM record on master.
+ */
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ goto errunlock;
+ ret = __rep_log_backup(env, rep, logc, &perm_lsn);
+ (void)__logc_close(logc);
+ /*
+ * If we found a perm LSN use it. Otherwise, if
+ * no perm LSN exists, initialize.
+ */
+ if (ret == 0)
+ lp->max_perm_lsn = perm_lsn;
+ else if (ret == DB_NOTFOUND) {
+ /*
+ * If we have no perm records, we want to
+ * force (later) a checkpoint to the log.
+ * By doing this now, we avoid a sticky
+ * deadlock with a txn. We need a perm
+ * record for leases, but if the first perm
+ * record is a txn, that txn cannot commit
+ * without leases refreshed. A client may
+ * be in internal init and cannot sync up if
+ * it needs to read pages the txn holds write
+ * locks on and we have an impasse. This
+ * checkpoint will allow leases to be granted
+ * on this perm record first and that does not
+ * need any locks.
+ */
+ do_ckp = 1;
+ INIT_LSN(lp->max_perm_lsn);
+ } else
+ goto errunlock;
+
+ /*
+ * Simply compute the larger ratio for the lease.
+ * On the master the timeout is divided by
+ * clock_skew / clock_base.
+ */
+ tmp = (db_timeout_t)((double)rep->lease_timeout /
+ ((double)rep->clock_skew /
+ (double)rep->clock_base));
+ DB_TIMEOUT_TO_TIMESPEC(tmp, &rep->lease_duration);
+ if ((ret = __rep_lease_table_alloc(env,
+ rep->nsites)) != 0)
+ goto errunlock;
+ }
+ rep->master_id = rep->eid;
+ STAT(rep->stat.st_master_changes++);
+
+ /*
+ * Clear out almost everything, and then set MASTER. Leave
+ * READY_* alone in case we did a lockout above;
+ * we'll clear it in a moment (below), once we've written
+ * the txn_recycle into the log.
+ */
+ repflags = F_ISSET(rep, REP_F_INREPSTART | REP_F_READY_API |
+ REP_F_READY_MSG | REP_F_READY_OP | REP_F_STICKY_MASK);
+#ifdef DIAGNOSTIC
+ if (!F_ISSET(rep, REP_F_GROUP_ESTD))
+ RPRINT(env, DB_VERB_REP_MISC, (env,
+ "Establishing group as master."));
+#endif
+ FLD_SET(repflags, REP_F_MASTER |
+ REP_F_GROUP_ESTD | REP_F_NIMDBS_LOADED);
+ rep->flags = repflags;
+
+ /*
+ * We're master. Set the versions to the current ones.
+ */
+ oldvers = lp->persist.version;
+ /*
+ * If we're moving forward to the current version, we need
+ * to force the log file to advance and reset the
+ * recovery table since it contains pointers to old
+ * recovery functions.
+ */
+ RPRINT(env, DB_VERB_REP_MISC, (env,
+ "rep_start: Old log version was %lu", (u_long)oldvers));
+ if (lp->persist.version != DB_LOGVERSION) {
+ if ((ret = __env_init_rec(env, DB_LOGVERSION)) != 0)
+ goto errunlock;
+ }
+ rep->version = DB_REPVERSION;
+ F_CLR(rep, REP_F_READY_MSG);
+ REP_SYSTEM_UNLOCK(env);
+ LOG_SYSTEM_LOCK(env);
+ lsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+
+ /*
+ * Send the NEWMASTER message first so that clients know
+ * subsequent messages are coming from the right master.
+ * We need to perform all actions below no matter what
+ * regarding errors. That is why the steps below collect
+ * the first failure in ret via t_ret instead of jumping to
+ * an error label.
+ */
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_NEWMASTER, &lsn, NULL, 0, 0);
+ ret = 0;
+ if (role_chg) {
+ pending_event = DB_EVENT_REP_MASTER;
+ /*
+ * If prepared transactions have not been restored
+ * look to see if there are any. If there are,
+ * then mark the open files, otherwise close them.
+ */
+ region = env->tx_handle->reginfo.primary;
+ if (region->stat.st_nrestores == 0 &&
+ (t_ret = __rep_restore_prepared(env)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if (region->stat.st_nrestores != 0) {
+ if ((t_ret = __dbreg_mark_restored(env)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ } else {
+ ret = __dbreg_invalidate_files(env, 0);
+ if ((t_ret = __rep_closefiles(env)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+ if ((t_ret = __txn_recycle_id(env)) != 0 && ret == 0)
+ ret = t_ret;
+ REP_SYSTEM_LOCK(env);
+ F_CLR(rep, REP_F_READY_API | REP_F_READY_OP);
+ locked = 0;
+ REP_SYSTEM_UNLOCK(env);
+ (void)__memp_set_config(
+ env->dbenv, DB_MEMP_SYNC_INTERRUPT, 0);
+ interrupting = 0;
+ /*
+ * Force a checkpoint if this new master has no
+ * perm record yet.
+ */
+ if (ret == 0 && do_ckp)
+ ret = __txn_checkpoint(env, 0, 0,
+ DB_CKP_INTERNAL | DB_FORCE);
+ }
+ } else {
+ if (role_chg)
+ rep->master_id = DB_EID_INVALID;
+ /*
+ * Zero out "everything" except recovery and tally flags.
+ */
+ repflags = F_ISSET(rep,
+ REP_F_INREPSTART | REP_F_NOARCHIVE | REP_F_READY_MSG |
+ REP_F_RECOVER_MASK | REP_F_TALLY | REP_F_STICKY_MASK);
+ FLD_SET(repflags, REP_F_CLIENT);
+ if (role_chg) {
+ if ((ret = __log_get_oldversion(env, &oldvers)) != 0)
+ goto errunlock;
+ RPRINT(env, DB_VERB_REP_MISC, (env,
+ "rep_start: Found old version log %d", oldvers));
+ if (oldvers >= DB_LOGVERSION_MIN) {
+ __log_set_version(env, oldvers);
+ oldvers = __rep_conv_vers(env, oldvers);
+ DB_ASSERT(
+ env, oldvers != DB_REPVERSION_INVALID);
+ rep->version = oldvers;
+ }
+ }
+ rep->flags = repflags;
+ /*
+ * On a client, compute the lease duration on the
+ * assumption that the client has a fast clock.
+ * Expire any existing leases we might have held as
+ * a master.
+ */
+ if (IS_USING_LEASES(env) &&
+ (role_chg || !IS_REP_STARTED(env))) {
+ if ((ret = __rep_lease_expire(env)) != 0)
+ goto errunlock;
+ /*
+ * Since the master is also compensating on its
+ * side as well, we're being doubly conservative
+ * to compensate on the client side. Theoretically,
+ * this compensation is not necessary, as it is
+ * effectively doubling the skew compensation.
+ * But we are making guarantees based on time and
+ * skews across machines. So we are being extra
+ * cautious.
+ */
+ tmp = (db_timeout_t)((double)rep->lease_timeout *
+ ((double)rep->clock_skew /
+ (double)rep->clock_base));
+ DB_TIMEOUT_TO_TIMESPEC(tmp, &rep->lease_duration);
+ if (rep->lease_off != INVALID_ROFF) {
+ MUTEX_LOCK(env, renv->mtx_regenv);
+ __env_alloc_free(infop,
+ R_ADDR(infop, rep->lease_off));
+ MUTEX_UNLOCK(env, renv->mtx_regenv);
+ rep->lease_off = INVALID_ROFF;
+ }
+ }
+ REP_SYSTEM_UNLOCK(env);
+
+ /*
+ * Abort any prepared transactions that were restored
+ * by recovery. We won't be able to create any txns of
+ * our own until they're resolved, but we can't resolve
+ * them ourselves; the master has to. If any get
+ * resolved as commits, we'll redo them when commit
+ * records come in. Aborts will simply be ignored.
+ */
+ if ((ret = __rep_abort_prepared(env)) != 0)
+ goto errlock;
+
+ /*
+ * If we're changing roles we need to init the db.
+ */
+ if (role_chg) {
+ if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+ goto errlock;
+ /*
+ * Ignore errors, because if the file doesn't exist,
+ * this is perfectly OK.
+ */
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ (void)__db_remove(dbp, ip, NULL, REPDBNAME,
+ NULL, DB_FORCE);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ /*
+ * Set pending_event after calls that can fail.
+ */
+ pending_event = DB_EVENT_REP_CLIENT;
+ }
+ REP_SYSTEM_LOCK(env);
+ F_CLR(rep, REP_F_READY_MSG);
+ if (locked) {
+ F_CLR(rep, REP_F_READY_API | REP_F_READY_OP);
+ locked = 0;
+ }
+ REP_SYSTEM_UNLOCK(env);
+
+ if ((role_chg || rep->master_id == DB_EID_INVALID) &&
+ F_ISSET(env, ENV_PRIVATE))
+ /*
+ * If we think we're a new client, and we have a
+ * private env, set our gen number down to 0.
+ * Otherwise, we can restart and think
+ * we're ready to accept a new record (because our
+ * gen is okay), but really this client needs to
+ * sync with the master.
+ */
+ rep->gen = 0;
+
+ /*
+ * Announce ourselves and send out our data.
+ */
+ if ((ret = __dbt_usercopy(env, dbt)) != 0)
+ goto out;
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_NEWCLIENT, NULL, dbt, 0, 0);
+ }
+
+ if (0) {
+ /*
+ * We have separate labels for errors. If we're returning an
+ * error before we've set REP_F_READY_MSG (or after it has
+ * already been cleared again), we use 'out'. If
+ * we are erroring while holding the region mutex, then we use
+ * 'errunlock' label. If we error without holding the rep
+ * mutex we must use 'errlock'.
+ */
+errlock: REP_SYSTEM_LOCK(env);
+errunlock: F_CLR(rep, REP_F_READY_MSG);
+ if (locked)
+ F_CLR(rep, REP_F_READY_API | REP_F_READY_OP);
+ if (interrupting)
+ (void)__memp_set_config(
+ env->dbenv, DB_MEMP_SYNC_INTERRUPT, 0);
+ REP_SYSTEM_UNLOCK(env);
+ }
+out:
+ if (ret == 0) {
+ REP_SYSTEM_LOCK(env);
+ F_SET(rep, REP_F_START_CALLED);
+ REP_SYSTEM_UNLOCK(env);
+ }
+ if (start_th) {
+ REP_SYSTEM_LOCK(env);
+ F_CLR(rep, REP_F_INREPSTART);
+ REP_SYSTEM_UNLOCK(env);
+ }
+ if (pending_event != DB_EVENT_NO_SUCH_EVENT)
+ __rep_fire_event(env, pending_event, NULL);
+ __dbt_userfree(env, dbt, NULL, NULL);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __rep_client_dbinit --
+ *
+ * Open one of the client-side internal databases and cache its handle:
+ * REP_DB selects the LSN database (REPDBNAME, a btree ordered by
+ * __rep_bt_cmp, handle kept in db_rep->rep_db); any other value selects the
+ * page database (REPPAGENAME, a recno, handle kept in rep->file_dbp).  When
+ * the handle is already cached the call is a no-op.  A non-zero startup
+ * means this is the first thread/process starting up, so any stale copy of
+ * the database is removed before it is created again.  With in-memory
+ * replication the database is opened as a named in-memory database (no
+ * file name, the name is passed as the subdatabase).  This routine must be
+ * called once by each process acting as a client.
+ *
+ * Returns 0 on success; on failure the partially built handle is closed,
+ * the cached slot is reset to NULL and the error is returned.
+ *
+ * Assumes caller holds appropriate mutex.
+ *
+ * PUBLIC: int __rep_client_dbinit __P((ENV *, int, repdb_t));
+ */
+int
+__rep_client_dbinit(env, startup, which)
+	ENV *env;
+	int startup;
+	repdb_t which;
+{
+	DB *dbp, **rdbpp;
+	DB_REP *db_rep;
+	DB_THREAD_INFO *ip;
+	REP *rep;
+	const char *fname, *name, *subdb;
+	u_int32_t flags;
+	int ret, t_ret;
+
+	db_rep = env->rep_handle;
+	rep = db_rep->region;
+	dbp = NULL;
+
+	/* Select the database name and the slot that caches its handle. */
+	name = which == REP_DB ? REPDBNAME : REPPAGENAME;
+	rdbpp = which == REP_DB ? &db_rep->rep_db : &rep->file_dbp;
+
+	/* Nothing to do if this environment already opened it. */
+	if (*rdbpp != NULL)
+		return (0);
+
+	ENV_GET_THREAD_INFO(env, ip);
+
+	/* Arguments shared by the __db_remove and __db_open calls. */
+	if (FLD_ISSET(rep->config, REP_C_INMEM)) {
+		fname = NULL;
+		subdb = name;
+	} else {
+		fname = name;
+		subdb = NULL;
+	}
+
+	if (startup) {
+		if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+			goto err;
+		/*
+		 * An in-memory database has no log to write to, so keep the
+		 * remove from trying to log.
+		 */
+		if (FLD_ISSET(rep->config, REP_C_INMEM))
+			(void)__db_set_flags(dbp, DB_TXN_NOT_DURABLE);
+		/*
+		 * A failure here is fine: it just means there was nothing
+		 * to remove.  A fresh handle is created below for the open.
+		 */
+		(void)__db_remove(dbp, ip, NULL, fname, subdb, DB_FORCE);
+	}
+
+	if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+		goto err;
+
+	/* Only the LSN database needs the LSN ordering. */
+	if (which == REP_DB &&
+	    (ret = __bam_set_bt_compare(dbp, __rep_bt_cmp)) != 0)
+		goto err;
+
+	/* Don't write log records on the client. */
+	if ((ret = __db_set_flags(dbp, DB_TXN_NOT_DURABLE)) != 0)
+		goto err;
+
+	flags = DB_NO_AUTO_COMMIT | DB_CREATE;
+	if (F_ISSET(env, ENV_THREAD))
+		flags |= DB_THREAD;
+
+	ret = __db_open(dbp, ip, NULL, fname, subdb,
+	    which == REP_DB ? DB_BTREE : DB_RECNO, flags, 0, PGNO_BASE_MD);
+	if (ret != 0)
+		goto err;
+
+	*rdbpp = dbp;
+	return (0);
+
+err:	if (dbp != NULL &&
+	    (t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0 && ret == 0)
+		ret = t_ret;
+	*rdbpp = NULL;
+	return (ret);
+}
+
+/*
+ * __rep_bt_cmp --
+ *
+ * Btree comparison function for the LSN table.  The keys are whole
+ * replication control structures (kept whole for simplicity, so the other
+ * control fields need not be merged into the data item), but only the LSN
+ * inside each one takes part in the ordering: file number first, then
+ * offset.  Returns -1, 0 or 1.
+ */
+static int
+__rep_bt_cmp(dbp, dbt1, dbt2)
+	DB *dbp;
+	const DBT *dbt1, *dbt2;
+{
+	DB_LSN left, right;
+	__rep_control_args *ctl1, *ctl2;
+
+	COMPQUIET(dbp, NULL);
+
+	ctl1 = dbt1->data;
+	ctl2 = dbt2->data;
+
+	/* Copy the LSNs out through __ua_memcpy rather than reading in place. */
+	(void)__ua_memcpy(&left, &ctl1->lsn, sizeof(DB_LSN));
+	(void)__ua_memcpy(&right, &ctl2->lsn, sizeof(DB_LSN));
+
+	if (left.file != right.file)
+		return (left.file > right.file ? 1 : -1);
+	if (left.offset != right.offset)
+		return (left.offset > right.offset ? 1 : -1);
+	return (0);
+}
+
+/*
+ * __rep_abort_prepared --
+ *	Abort any prepared transactions that recovery restored.
+ *
+ *	This is used by clients that have just run recovery, since
+ *	they cannot/should not call txn_recover and handle prepared transactions
+ *	themselves.  The restored transactions are fetched in batches of
+ *	PREPLISTSIZE; for every one aborted, the replication op count and the
+ *	restore count are decremented and max_prep_lsn is set to the current
+ *	log LSN.  Returns 0, or the first error from __txn_recover or
+ *	__txn_abort.
+ */
+static int
+__rep_abort_prepared(env)
+	ENV *env;
+{
+#define	PREPLISTSIZE	50
+	DB_LOG *dblp;
+	DB_PREPLIST prep[PREPLISTSIZE], *p;
+	DB_TXNMGR *mgr;
+	DB_TXNREGION *region;
+	LOG *lp;
+	REP *rep;
+	int ret;
+	u_int32_t count, op;
+
+	mgr = env->tx_handle;
+	region = mgr->reginfo.primary;
+	dblp = env->lg_handle;
+	lp = dblp->reginfo.primary;
+
+	/* Recovery restored nothing: nothing to abort. */
+	if (region->stat.st_nrestores == 0)
+		return (0);
+
+	rep = env->rep_handle->region;
+
+	for (op = DB_FIRST;; op = DB_NEXT) {
+		if ((ret = __txn_recover(env,
+		    prep, PREPLISTSIZE, &count, op)) != 0)
+			return (ret);
+
+		for (p = prep; p < prep + count; p++) {
+			if ((ret = __txn_abort(p->txn)) != 0)
+				return (ret);
+			rep->op_cnt--;
+			rep->max_prep_lsn = lp->lsn;
+			region->stat.st_nrestores--;
+		}
+
+		/* A short batch means the list has been exhausted. */
+		if (count != PREPLISTSIZE)
+			break;
+	}
+
+	return (0);
+}
+
+/*
+ * __rep_restore_prepared --
+ * Restore to a prepared state any prepared but not yet committed
+ * transactions.
+ *
+ * This performs, in effect, a "mini-recovery"; it is called from
+ * __rep_start by newly upgraded masters. There may be transactions that an
+ * old master prepared but did not resolve, which we need to restore to an
+ * active state.
+ *
+ * The work is skipped when rep->max_prep_lsn is zero, or when the log file
+ * holding max_prep_lsn is older than the first record still in the log.
+ * Otherwise the log is scanned backward from its end to the most recent
+ * checkpoint's ckp_lsn, and every prepare that is not preceded (in that
+ * backward scan) by a commit/abort for the same txn is re-applied and
+ * restored.
+ *
+ * Returns 0 on success (including "nothing to restore"), otherwise the
+ * first error encountered; a log cursor close failure is reported only if
+ * nothing else failed.
+ */
+static int
+__rep_restore_prepared(env)
+ ENV *env;
+{
+ DBT rec;
+ DB_LOGC *logc;
+ DB_LSN ckp_lsn, lsn;
+ DB_REP *db_rep;
+ DB_TXNHEAD *txninfo;
+ REP *rep;
+ __txn_ckp_args *ckp_args;
+ __txn_regop_args *regop_args;
+ __txn_prepare_args *prep_args;
+ int ret, t_ret;
+ u_int32_t hi_txn, low_txn, rectype, status, txnid, txnop;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ if (IS_ZERO_LSN(rep->max_prep_lsn)) {
+ RPRINT(env, DB_VERB_REP_MISC,
+ (env, "restore_prep: No prepares. Skip."));
+ return (0);
+ }
+ txninfo = NULL;
+ ckp_args = NULL;
+ prep_args = NULL;
+ regop_args = NULL;
+ ZERO_LSN(ckp_lsn);
+ ZERO_LSN(lsn);
+
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ return (ret);
+
+ /*
+ * Get our first LSN to see if the prepared LSN is still
+ * available. If so, it might be unresolved. If not,
+ * then it is guaranteed to be resolved.
+ */
+ memset(&rec, 0, sizeof(DBT));
+ if ((ret = __logc_get(logc, &lsn, &rec, DB_FIRST)) != 0) {
+ __db_errx(env, "First record not found");
+ goto err;
+ }
+ /*
+ * If the max_prep_lsn is no longer available, we're sure
+ * that txn has been resolved. We're done.
+ */
+ if (rep->max_prep_lsn.file < lsn.file) {
+ RPRINT(env, DB_VERB_REP_MISC,
+ (env, "restore_prep: Prepare resolved. Skip"));
+ ZERO_LSN(rep->max_prep_lsn);
+ goto done;
+ }
+ /*
+ * We need to consider the set of records between the most recent
+ * checkpoint LSN and the end of the log; any txn in that
+ * range, and only txns in that range, could still have been
+ * active, and thus prepared but not yet committed (PBNYC),
+ * when the old master died.
+ *
+ * Find the most recent checkpoint LSN, and get the record there.
+ * If there is no checkpoint in the log, start off by getting
+ * the very first record in the log instead.
+ */
+ if ((ret = __txn_getckp(env, &lsn)) == 0) {
+ if ((ret = __logc_get(logc, &lsn, &rec, DB_SET)) != 0) {
+ __db_errx(env,
+ "Checkpoint record at LSN [%lu][%lu] not found",
+ (u_long)lsn.file, (u_long)lsn.offset);
+ goto err;
+ }
+
+ if ((ret = __txn_ckp_read(
+ env, rec.data, &ckp_args)) == 0) {
+ ckp_lsn = ckp_args->ckp_lsn;
+ __os_free(env, ckp_args);
+ }
+ if (ret != 0) {
+ __db_errx(env,
+ "Invalid checkpoint record at [%lu][%lu]",
+ (u_long)lsn.file, (u_long)lsn.offset);
+ goto err;
+ }
+
+ if ((ret = __logc_get(logc, &ckp_lsn, &rec, DB_SET)) != 0) {
+ __db_errx(env,
+ "Checkpoint LSN record [%lu][%lu] not found",
+ (u_long)ckp_lsn.file, (u_long)ckp_lsn.offset);
+ goto err;
+ }
+ } else if ((ret = __logc_get(logc, &lsn, &rec, DB_FIRST)) != 0) {
+ if (ret == DB_NOTFOUND) {
+ /* An empty log means no PBNYC txns. */
+ ret = 0;
+ goto done;
+ }
+ __db_errx(env, "Attempt to get first log record failed");
+ goto err;
+ }
+
+ /*
+ * We use the same txnlist infrastructure that recovery does;
+ * it demands an estimate of the high and low txnids for
+ * initialization.
+ *
+ * First, the low txnid.
+ */
+ do {
+ /* txnid is after rectype, which is a u_int32. */
+ LOGCOPY_32(env, &low_txn,
+ (u_int8_t *)rec.data + sizeof(u_int32_t));
+ if (low_txn != 0)
+ break;
+ } while ((ret = __logc_get(logc, &lsn, &rec, DB_NEXT)) == 0);
+
+ /* If there are no txns, there are no PBNYC txns. */
+ if (ret == DB_NOTFOUND) {
+ ret = 0;
+ goto done;
+ } else if (ret != 0)
+ goto err;
+
+ /* Now, the high txnid. */
+ if ((ret = __logc_get(logc, &lsn, &rec, DB_LAST)) != 0) {
+ /*
+ * Note that DB_NOTFOUND is unacceptable here because we
+ * had to have looked at some log record to get this far.
+ */
+ __db_errx(env, "Final log record not found");
+ goto err;
+ }
+ do {
+ /* txnid is after rectype, which is a u_int32. */
+ LOGCOPY_32(env, &hi_txn,
+ (u_int8_t *)rec.data + sizeof(u_int32_t));
+ if (hi_txn != 0)
+ break;
+ } while ((ret = __logc_get(logc, &lsn, &rec, DB_PREV)) == 0);
+ if (ret == DB_NOTFOUND) {
+ ret = 0;
+ goto done;
+ } else if (ret != 0)
+ goto err;
+
+ /* We have a high and low txnid. Initialise the txn list. */
+ if ((ret = __db_txnlist_init(env,
+ NULL, low_txn, hi_txn, NULL, &txninfo)) != 0)
+ goto err;
+
+ /*
+ * Now, walk backward from the end of the log to ckp_lsn. Any
+ * prepares that we hit without first hitting a commit or
+ * abort belong to PBNYC txns, and we need to apply them and
+ * restore them to a prepared state.
+ *
+ * Note that we wind up applying transactions out of order.
+ * Since all PBNYC txns still held locks on the old master and
+ * were isolated, this should be safe.
+ *
+ * A non-zero ret left by any case below ends the loop through
+ * its "ret == 0" condition.
+ */
+ F_SET(env->lg_handle, DBLOG_RECOVER);
+ for (ret = __logc_get(logc, &lsn, &rec, DB_LAST);
+ ret == 0 && LOG_COMPARE(&lsn, &ckp_lsn) > 0;
+ ret = __logc_get(logc, &lsn, &rec, DB_PREV)) {
+ LOGCOPY_32(env, &rectype, rec.data);
+ switch (rectype) {
+ case DB___txn_regop:
+ /*
+ * It's a commit or abort--but we don't care
+ * which! Just add it to the list of txns
+ * that are resolved.
+ */
+ if ((ret = __txn_regop_read(
+ env, rec.data, &regop_args)) != 0)
+ goto err;
+ txnid = regop_args->txnp->txnid;
+ txnop = regop_args->opcode;
+ __os_free(env, regop_args);
+
+ ret = __db_txnlist_find(env,
+ txninfo, txnid, &status);
+ if (ret == DB_NOTFOUND)
+ ret = __db_txnlist_add(env, txninfo,
+ txnid, txnop, &lsn);
+ else if (ret != 0)
+ goto err;
+ break;
+ case DB___txn_prepare:
+ /*
+ * It's a prepare. If it's not aborted and
+ * we haven't put the txn on our list yet, it
+ * hasn't been resolved, so apply and restore it.
+ */
+ if ((ret = __txn_prepare_read(
+ env, rec.data, &prep_args)) != 0)
+ goto err;
+ ret = __db_txnlist_find(env, txninfo,
+ prep_args->txnp->txnid, &status);
+ if (ret == DB_NOTFOUND) {
+ if (prep_args->opcode == TXN_ABORT)
+ ret = __db_txnlist_add(env, txninfo,
+ prep_args->txnp->txnid,
+ prep_args->opcode, &lsn);
+ else if ((ret =
+ __rep_process_txn(env, &rec)) == 0) {
+ /*
+ * We are guaranteed to be single
+ * threaded here. We need to
+ * account for this newly
+ * instantiated txn in the op_cnt
+ * so that it is counted when it is
+ * resolved.
+ */
+ rep->op_cnt++;
+ ret = __txn_restore_txn(env,
+ &lsn, prep_args);
+ }
+ } else if (ret != 0) {
+ /*
+ * The lookup itself failed. Release the
+ * decoded prepare record before bailing
+ * out; the error path does not free it.
+ */
+ __os_free(env, prep_args);
+ goto err;
+ }
+ __os_free(env, prep_args);
+ break;
+ default:
+ continue;
+ }
+ }
+
+ /* It's not an error to have hit the beginning of the log. */
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+
+done:
+err: t_ret = __logc_close(logc);
+ F_CLR(env->lg_handle, DBLOG_RECOVER);
+
+ if (txninfo != NULL)
+ __db_txnlist_end(env, txninfo);
+
+ return (ret == 0 ? t_ret : ret);
+}
+
+/*
+ * __rep_get_limit --
+ *	Report the limit on the amount of data that will be sent during a
+ *	single invocation of __rep_process_message, as a gigabytes/bytes
+ *	pair.  The values come from the shared replication region (read under
+ *	the region lock) when replication is on, and from the local handle
+ *	otherwise.  Either output pointer may be NULL.
+ *
+ * PUBLIC: int __rep_get_limit __P((DB_ENV *, u_int32_t *, u_int32_t *));
+ */
+int
+__rep_get_limit(dbenv, gbytesp, bytesp)
+	DB_ENV *dbenv;
+	u_int32_t *gbytesp, *bytesp;
+{
+	DB_REP *db_rep;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	REP *rep;
+	u_int32_t cur_bytes, cur_gbytes;
+
+	env = dbenv->env;
+	db_rep = env->rep_handle;
+
+	ENV_NOT_CONFIGURED(
+	    env, db_rep->region, "DB_ENV->rep_get_limit", DB_INIT_REP);
+
+	if (REP_ON(env)) {
+		rep = db_rep->region;
+		ENV_ENTER(env, ip);
+		REP_SYSTEM_LOCK(env);
+		cur_gbytes = rep->gbytes;
+		cur_bytes = rep->bytes;
+		REP_SYSTEM_UNLOCK(env);
+		ENV_LEAVE(env, ip);
+	} else {
+		cur_gbytes = db_rep->gbytes;
+		cur_bytes = db_rep->bytes;
+	}
+
+	if (gbytesp != NULL)
+		*gbytesp = cur_gbytes;
+	if (bytesp != NULL)
+		*bytesp = cur_bytes;
+
+	return (0);
+}
+
+/*
+ * __rep_set_limit --
+ *	Set a limit on the amount of data that will be sent during a single
+ *	invocation of __rep_process_message.  A byte count larger than
+ *	GIGABYTE is folded into the gigabyte count first.  The pair is stored
+ *	in the shared replication region (under the region lock) when
+ *	replication is on, and in the local handle otherwise.
+ *
+ * PUBLIC: int __rep_set_limit __P((DB_ENV *, u_int32_t, u_int32_t));
+ */
+int
+__rep_set_limit(dbenv, gbytes, bytes)
+	DB_ENV *dbenv;
+	u_int32_t gbytes, bytes;
+{
+	DB_REP *db_rep;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	REP *rep;
+
+	env = dbenv->env;
+	db_rep = env->rep_handle;
+
+	ENV_NOT_CONFIGURED(
+	    env, db_rep->region, "DB_ENV->rep_set_limit", DB_INIT_REP);
+
+	/* Carry whole gigabytes out of the byte count. */
+	if (bytes > GIGABYTE) {
+		gbytes += bytes / GIGABYTE;
+		bytes %= GIGABYTE;
+	}
+
+	if (!REP_ON(env)) {
+		db_rep->gbytes = gbytes;
+		db_rep->bytes = bytes;
+		return (0);
+	}
+
+	rep = db_rep->region;
+	ENV_ENTER(env, ip);
+	REP_SYSTEM_LOCK(env);
+	rep->gbytes = gbytes;
+	rep->bytes = bytes;
+	REP_SYSTEM_UNLOCK(env);
+	ENV_LEAVE(env, ip);
+
+	return (0);
+}
+
+/*
+ * __rep_set_nsites --
+ *	Record the configured number of sites in the replication group.  When
+ *	master leases are in use the value may not be changed once
+ *	replication has been started (EINVAL).  The value is stored in the
+ *	shared region when replication is on, and in the local handle
+ *	otherwise.
+ *
+ * PUBLIC: int __rep_set_nsites __P((DB_ENV *, u_int32_t));
+ */
+int
+__rep_set_nsites(dbenv, n)
+	DB_ENV *dbenv;
+	u_int32_t n;
+{
+	DB_REP *db_rep;
+	ENV *env;
+
+	env = dbenv->env;
+	db_rep = env->rep_handle;
+
+	ENV_NOT_CONFIGURED(
+	    env, db_rep->region, "DB_ENV->rep_set_nsites", DB_INIT_REP);
+
+	if (IS_USING_LEASES(env) && IS_REP_STARTED(env)) {
+		__db_errx(env,
+	"DB_ENV->rep_set_nsites: must be called before DB_ENV->rep_start");
+		return (EINVAL);
+	}
+
+	if (!REP_ON(env)) {
+		db_rep->config_nsites = n;
+		return (0);
+	}
+
+	db_rep->region->config_nsites = n;
+	return (0);
+}
+
+/*
+ * __rep_get_nsites --
+ *	Return, through n, the configured number of sites in the replication
+ *	group: the shared region's value when replication is on, the local
+ *	handle's value otherwise.
+ *
+ * PUBLIC: int __rep_get_nsites __P((DB_ENV *, u_int32_t *));
+ */
+int
+__rep_get_nsites(dbenv, n)
+	DB_ENV *dbenv;
+	u_int32_t *n;
+{
+	DB_REP *db_rep;
+	ENV *env;
+
+	env = dbenv->env;
+	db_rep = env->rep_handle;
+
+	ENV_NOT_CONFIGURED(
+	    env, db_rep->region, "DB_ENV->rep_get_nsites", DB_INIT_REP);
+
+	if (!REP_ON(env)) {
+		*n = db_rep->config_nsites;
+		return (0);
+	}
+
+	*n = db_rep->region->config_nsites;
+	return (0);
+}
+
+/*
+ * __rep_set_priority --
+ *	Record this site's priority: in the shared region when replication
+ *	is on, in the local handle otherwise.
+ *
+ * PUBLIC: int __rep_set_priority __P((DB_ENV *, u_int32_t));
+ */
+int
+__rep_set_priority(dbenv, priority)
+	DB_ENV *dbenv;
+	u_int32_t priority;
+{
+	DB_REP *db_rep;
+	ENV *env;
+
+	env = dbenv->env;
+	db_rep = env->rep_handle;
+
+	ENV_NOT_CONFIGURED(
+	    env, db_rep->region, "DB_ENV->rep_set_priority", DB_INIT_REP);
+
+	if (!REP_ON(env)) {
+		db_rep->my_priority = priority;
+		return (0);
+	}
+
+	db_rep->region->priority = priority;
+	return (0);
+}
+
+/*
+ * __rep_get_priority --
+ *	Return, through priority, this site's priority: the shared region's
+ *	value when replication is on, the local handle's value otherwise.
+ *
+ * PUBLIC: int __rep_get_priority __P((DB_ENV *, u_int32_t *));
+ */
+int
+__rep_get_priority(dbenv, priority)
+	DB_ENV *dbenv;
+	u_int32_t *priority;
+{
+	DB_REP *db_rep;
+	ENV *env;
+
+	env = dbenv->env;
+	db_rep = env->rep_handle;
+
+	ENV_NOT_CONFIGURED(
+	    env, db_rep->region, "DB_ENV->rep_get_priority", DB_INIT_REP);
+
+	if (!REP_ON(env)) {
+		*priority = db_rep->my_priority;
+		return (0);
+	}
+
+	*priority = db_rep->region->priority;
+	return (0);
+}
+
+/*
+ * __rep_set_timeout --
+ *	Set the replication timeout selected by "which".  The checkpoint
+ *	delay, election, full-election and lease timeouts are stored in the
+ *	shared region when replication is on and in the local handle
+ *	otherwise.  The ack, connection-retry, election-retry and heartbeat
+ *	values are Replication Manager timeouts: they are only recognized
+ *	when built with HAVE_REPLICATION_THREADS, are always stored in the
+ *	local handle, are rejected for base-API applications, and setting one
+ *	marks the application as a Replication Manager application.  The
+ *	lease timeout cannot be changed after replication has started.
+ *	Returns 0, or EINVAL for a rejected or unknown timeout type.
+ *
+ * PUBLIC: int __rep_set_timeout __P((DB_ENV *, int, db_timeout_t));
+ */
+int
+__rep_set_timeout(dbenv, which, timeout)
+	DB_ENV *dbenv;
+	int which;
+	db_timeout_t timeout;
+{
+	DB_REP *db_rep;
+	ENV *env;
+	REP *rep;
+	int in_region, is_repmgr;
+
+	env = dbenv->env;
+	db_rep = env->rep_handle;
+	rep = db_rep->region;
+
+	/* Classify the request: is it a Replication Manager timeout? */
+	switch (which) {
+	case DB_REP_ACK_TIMEOUT:
+	case DB_REP_CONNECTION_RETRY:
+	case DB_REP_ELECTION_RETRY:
+	case DB_REP_HEARTBEAT_MONITOR:
+	case DB_REP_HEARTBEAT_SEND:
+		is_repmgr = 1;
+		break;
+	default:
+		is_repmgr = 0;
+		break;
+	}
+
+	ENV_NOT_CONFIGURED(
+	    env, db_rep->region, "DB_ENV->rep_set_timeout", DB_INIT_REP);
+
+	if (APP_IS_BASEAPI(env) && is_repmgr) {
+		__db_errx(env, "%s %s", "DB_ENV->rep_set_timeout:",
+"cannot set Replication Manager timeout from base replication application");
+		return (EINVAL);
+	}
+	if (which == DB_REP_LEASE_TIMEOUT && IS_REP_STARTED(env)) {
+		__db_errx(env, "%s %s", "DB_ENV->rep_set_timeout:",
+"lease timeout must be set before DB_ENV->rep_start.");
+		return (EINVAL);
+	}
+
+	in_region = REP_ON(env);
+
+	switch (which) {
+	case DB_REP_CHECKPOINT_DELAY:
+		if (in_region)
+			rep->chkpt_delay = timeout;
+		else
+			db_rep->chkpt_delay = timeout;
+		break;
+	case DB_REP_ELECTION_TIMEOUT:
+		if (in_region)
+			rep->elect_timeout = timeout;
+		else
+			db_rep->elect_timeout = timeout;
+		break;
+	case DB_REP_FULL_ELECTION_TIMEOUT:
+		if (in_region)
+			rep->full_elect_timeout = timeout;
+		else
+			db_rep->full_elect_timeout = timeout;
+		break;
+	case DB_REP_LEASE_TIMEOUT:
+		if (in_region)
+			rep->lease_timeout = timeout;
+		else
+			db_rep->lease_timeout = timeout;
+		break;
+#ifdef HAVE_REPLICATION_THREADS
+	case DB_REP_ACK_TIMEOUT:
+		db_rep->ack_timeout = timeout;
+		break;
+	case DB_REP_CONNECTION_RETRY:
+		db_rep->connection_retry_wait = timeout;
+		break;
+	case DB_REP_ELECTION_RETRY:
+		db_rep->election_retry_wait = timeout;
+		break;
+	case DB_REP_HEARTBEAT_MONITOR:
+		db_rep->heartbeat_monitor_timeout = timeout;
+		break;
+	case DB_REP_HEARTBEAT_SEND:
+		db_rep->heartbeat_frequency = timeout;
+		break;
+#endif
+	default:
+		__db_errx(env,
+	    "Unknown timeout type argument to DB_ENV->rep_set_timeout");
+		return (EINVAL);
+	}
+
+	/* Setting a repmgr timeout makes this a repmgr application */
+	if (is_repmgr)
+		APP_SET_REPMGR(env);
+	return (0);
+}
+
+/*
+ * __rep_get_timeout --
+ *	Return through "timeout" the value of the timeout selected by
+ *	"which".  The checkpoint-delay, election, full-election and lease
+ *	timeouts are read from the replication region when REP_ON(env) is
+ *	true and from the DB_REP handle otherwise; the Replication Manager
+ *	timeouts (available only with HAVE_REPLICATION_THREADS) are always
+ *	read from the handle.
+ *
+ *	Returns 0 on success, EINVAL for an unknown "which".
+ *
+ * PUBLIC: int __rep_get_timeout __P((DB_ENV *, int, db_timeout_t *));
+ */
+int
+__rep_get_timeout(dbenv, which, timeout)
+	DB_ENV *dbenv;
+	int which;
+	db_timeout_t *timeout;
+{
+	DB_REP *db_rep;
+	ENV *env;
+	REP *rep;
+
+	env = dbenv->env;
+	db_rep = env->rep_handle;
+	rep = db_rep->region;
+
+	ENV_NOT_CONFIGURED(
+	    env, db_rep->region, "DB_ENV->rep_get_timeout", DB_INIT_REP);
+
+	switch (which) {
+	case DB_REP_CHECKPOINT_DELAY:
+		if (REP_ON(env))
+			*timeout = rep->chkpt_delay;
+		else
+			*timeout = db_rep->chkpt_delay;
+		break;
+	case DB_REP_ELECTION_TIMEOUT:
+		if (REP_ON(env))
+			*timeout = rep->elect_timeout;
+		else
+			*timeout = db_rep->elect_timeout;
+		break;
+	case DB_REP_FULL_ELECTION_TIMEOUT:
+		if (REP_ON(env))
+			*timeout = rep->full_elect_timeout;
+		else
+			*timeout = db_rep->full_elect_timeout;
+		break;
+	case DB_REP_LEASE_TIMEOUT:
+		if (REP_ON(env))
+			*timeout = rep->lease_timeout;
+		else
+			*timeout = db_rep->lease_timeout;
+		break;
+#ifdef HAVE_REPLICATION_THREADS
+	case DB_REP_ACK_TIMEOUT:
+		*timeout = db_rep->ack_timeout;
+		break;
+	case DB_REP_CONNECTION_RETRY:
+		*timeout = db_rep->connection_retry_wait;
+		break;
+	case DB_REP_ELECTION_RETRY:
+		*timeout = db_rep->election_retry_wait;
+		break;
+	case DB_REP_HEARTBEAT_MONITOR:
+		*timeout = db_rep->heartbeat_monitor_timeout;
+		break;
+	case DB_REP_HEARTBEAT_SEND:
+		*timeout = db_rep->heartbeat_frequency;
+		break;
+#endif
+	default:
+		__db_errx(env,
+	    "unknown timeout type argument to DB_ENV->rep_get_timeout");
+		return (EINVAL);
+	}
+
+	return (0);
+}
+
+/*
+ * __rep_get_request --
+ *	Get the minimum (request_gap) and maximum (max_gap) amounts of time
+ *	that we wait before retransmitting.  The values are kept as
+ *	timespecs and converted to db_timeout_t for the caller.  Either
+ *	output pointer may be NULL, in which case that value is skipped.
+ *
+ *	When REP_ON(env) is true the values are read from the replication
+ *	region under REP_SYSTEM_LOCK; otherwise they come from the DB_REP
+ *	handle.  Always returns 0 once the ENV_NOT_CONFIGURED check passes.
+ *
+ * PUBLIC: int __rep_get_request
+ * PUBLIC: __P((DB_ENV *, db_timeout_t *, db_timeout_t *));
+ */
+int
+__rep_get_request(dbenv, minp, maxp)
+	DB_ENV *dbenv;
+	db_timeout_t *minp, *maxp;
+{
+	DB_REP *db_rep;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	REP *rep;
+
+	env = dbenv->env;
+	db_rep = env->rep_handle;
+
+	ENV_NOT_CONFIGURED(
+	    env, db_rep->region, "DB_ENV->rep_get_request", DB_INIT_REP);
+
+	/* Replication not on: no locking, read the handle's copies. */
+	if (!REP_ON(env)) {
+		if (minp != NULL)
+			DB_TIMESPEC_TO_TIMEOUT((*minp),
+			    &db_rep->request_gap, 0);
+		if (maxp != NULL)
+			DB_TIMESPEC_TO_TIMEOUT((*maxp), &db_rep->max_gap, 0);
+		return (0);
+	}
+
+	rep = db_rep->region;
+	ENV_ENTER(env, ip);
+	/*
+	 * We acquire the mtx_region or mtx_clientdb mutexes as needed.
+	 */
+	REP_SYSTEM_LOCK(env);
+	if (minp != NULL)
+		DB_TIMESPEC_TO_TIMEOUT((*minp), &rep->request_gap, 0);
+	if (maxp != NULL)
+		DB_TIMESPEC_TO_TIMEOUT((*maxp), &rep->max_gap, 0);
+	REP_SYSTEM_UNLOCK(env);
+	ENV_LEAVE(env, ip);
+
+	return (0);
+}
+
+/*
+ * __rep_set_request --
+ *	Set the minimum and maximum amounts of time that we wait before
+ *	retransmitting.  "min" must be non-zero and must not exceed "max";
+ *	otherwise EINVAL is returned.
+ *
+ *	When REP_ON(env) is true the region's request_gap and max_gap are
+ *	updated under REP_SYSTEM_LOCK, and then, under mtx_clientdb, the log
+ *	region's wait_ts is set from "min" if a log region is present.
+ *	Otherwise only the DB_REP handle's copies are updated.
+ *
+ * PUBLIC: int __rep_set_request __P((DB_ENV *, db_timeout_t, db_timeout_t));
+ */
+int
+__rep_set_request(dbenv, min, max)
+	DB_ENV *dbenv;
+	db_timeout_t min, max;
+{
+	DB_LOG *dblp;
+	DB_REP *db_rep;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	LOG *lp;
+	REP *rep;
+
+	env = dbenv->env;
+	db_rep = env->rep_handle;
+
+	ENV_NOT_CONFIGURED(
+	    env, db_rep->region, "DB_ENV->rep_set_request", DB_INIT_REP);
+
+	if (min == 0 || min > max) {
+		__db_errx(env,
+		    "DB_ENV->rep_set_request: Invalid min or max values");
+		return (EINVAL);
+	}
+
+	/* Replication not on: just remember the values in the handle. */
+	if (!REP_ON(env)) {
+		DB_TIMEOUT_TO_TIMESPEC(min, &db_rep->request_gap);
+		DB_TIMEOUT_TO_TIMESPEC(max, &db_rep->max_gap);
+		return (0);
+	}
+
+	rep = db_rep->region;
+	ENV_ENTER(env, ip);
+	/*
+	 * We acquire the mtx_region or mtx_clientdb mutexes as needed.
+	 */
+	REP_SYSTEM_LOCK(env);
+	DB_TIMEOUT_TO_TIMESPEC(min, &rep->request_gap);
+	DB_TIMEOUT_TO_TIMESPEC(max, &rep->max_gap);
+	REP_SYSTEM_UNLOCK(env);
+
+	MUTEX_LOCK(env, rep->mtx_clientdb);
+	dblp = env->lg_handle;
+	lp = dblp == NULL ? NULL : dblp->reginfo.primary;
+	if (lp != NULL) {
+		DB_TIMEOUT_TO_TIMESPEC(min, &lp->wait_ts);
+	}
+	MUTEX_UNLOCK(env, rep->mtx_clientdb);
+	ENV_LEAVE(env, ip);
+
+	return (0);
+}
+
+/*
+ * __rep_set_transport_pp --
+ *	Set the transport function for replication.  Validates the call and
+ *	then hands off to __rep_set_transport_int().
+ *
+ *	Returns EINVAL if called from a Replication Manager application, if
+ *	no send function is given, or if eid is negative.  On success the
+ *	application is marked as a base API application.
+ *
+ * PUBLIC: int __rep_set_transport_pp __P((DB_ENV *, int,
+ * PUBLIC: int (*)(DB_ENV *, const DBT *, const DBT *, const DB_LSN *,
+ * PUBLIC: int, u_int32_t)));
+ */
+int
+__rep_set_transport_pp(dbenv, eid, f_send)
+	DB_ENV *dbenv;
+	int eid;
+	int (*f_send) __P((DB_ENV *,
+	    const DBT *, const DBT *, const DB_LSN *, int, u_int32_t));
+{
+	DB_REP *db_rep;
+	ENV *env;
+	int ret;
+
+	env = dbenv->env;
+	db_rep = env->rep_handle;
+
+	ENV_NOT_CONFIGURED(
+	    env, db_rep->region, "DB_ENV->rep_set_transport", DB_INIT_REP);
+
+	if (APP_IS_REPMGR(env)) {
+		__db_errx(env,
+"DB_ENV->rep_set_transport: cannot call from Replication Manager application");
+		return (EINVAL);
+	}
+
+	if (f_send == NULL) {
+		__db_errx(env,
+	    "DB_ENV->rep_set_transport: no send function specified");
+		return (EINVAL);
+	}
+
+	if (eid < 0) {
+		__db_errx(env,
+    "DB_ENV->rep_set_transport: eid must be greater than or equal to 0");
+		return (EINVAL);
+	}
+
+	ret = __rep_set_transport_int(env, eid, f_send);
+	if (ret == 0) {
+		/*
+		 * Setting a non-repmgr send function makes this a base API
+		 * application.
+		 */
+		APP_SET_BASEAPI(env);
+	}
+
+	return (ret);
+}
+
+/*
+ * __rep_set_transport_int --
+ *	Set the internal values for the transport function for replication.
+ *	The send callback is always stored in the DB_REP handle; the eid is
+ *	stored in the replication region when REP_ON(env) is true and in the
+ *	handle otherwise.  Always returns 0.
+ *
+ * PUBLIC: int __rep_set_transport_int __P((ENV *, int,
+ * PUBLIC: int (*)(DB_ENV *, const DBT *, const DBT *, const DB_LSN *,
+ * PUBLIC: int, u_int32_t)));
+ */
+int
+__rep_set_transport_int(env, eid, f_send)
+	ENV *env;
+	int eid;
+	int (*f_send) __P((DB_ENV *,
+	    const DBT *, const DBT *, const DB_LSN *, int, u_int32_t));
+{
+	DB_REP *db_rep;
+	REP *rep;
+
+	db_rep = env->rep_handle;
+	db_rep->send = f_send;
+
+	if (!REP_ON(env)) {
+		db_rep->eid = eid;
+		return (0);
+	}
+
+	rep = db_rep->region;
+	rep->eid = eid;
+	return (0);
+}
+
+/*
+ * __rep_get_clockskew --
+ *	Return the stored clock skew pair: clock_skew through fast_clockp
+ *	and clock_base through slow_clockp.  When REP_ON(env) is true the
+ *	pair is read from the replication region under REP_SYSTEM_LOCK;
+ *	otherwise it is read from the DB_REP handle.  Always returns 0 once
+ *	the ENV_NOT_CONFIGURED check has passed.
+ *
+ * PUBLIC: int __rep_get_clockskew __P((DB_ENV *, u_int32_t *, u_int32_t *));
+ */
+int
+__rep_get_clockskew(dbenv, fast_clockp, slow_clockp)
+	DB_ENV *dbenv;
+	u_int32_t *fast_clockp, *slow_clockp;
+{
+	DB_REP *db_rep;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	REP *rep;
+
+	env = dbenv->env;
+	db_rep = env->rep_handle;
+
+	ENV_NOT_CONFIGURED(
+	    env, db_rep->region, "DB_ENV->rep_get_clockskew", DB_INIT_REP);
+
+	/* Replication not on: no locking, read the handle's copies. */
+	if (!REP_ON(env)) {
+		*fast_clockp = db_rep->clock_skew;
+		*slow_clockp = db_rep->clock_base;
+		return (0);
+	}
+
+	rep = db_rep->region;
+	ENV_ENTER(env, ip);
+	REP_SYSTEM_LOCK(env);
+	*fast_clockp = rep->clock_skew;
+	*slow_clockp = rep->clock_base;
+	REP_SYSTEM_UNLOCK(env);
+	ENV_LEAVE(env, ip);
+
+	return (0);
+}
+
+/*
+ * __rep_set_clockskew --
+ *	Store the fast/slow clock pair as clock_skew/clock_base.
+ *
+ *	Returns EINVAL if exactly one of the two values is zero, if
+ *	fast_clock is smaller than slow_clock, or if REP_ON(env) is true and
+ *	replication has already been started.  Otherwise returns 0.
+ *
+ * PUBLIC: int __rep_set_clockskew __P((DB_ENV *, u_int32_t, u_int32_t));
+ */
+int
+__rep_set_clockskew(dbenv, fast_clock, slow_clock)
+	DB_ENV *dbenv;
+	u_int32_t fast_clock, slow_clock;
+{
+	DB_REP *db_rep;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	REP *rep;
+
+	env = dbenv->env;
+	db_rep = env->rep_handle;
+
+	ENV_NOT_CONFIGURED(
+	    env, db_rep->region, "DB_ENV->rep_set_clockskew", DB_INIT_REP);
+
+	/*
+	 * Check for valid values.  The fast clock should be a larger
+	 * number than the slow clock.  We use the slow clock value as
+	 * our base for adjustment - therefore, a 2% difference should
+	 * be fast == 102, slow == 100.
+	 *
+	 * Zero is accepted only when given for both arguments, in which
+	 * case both are set to 1 internally.
+	 *
+	 * We will use these numbers to compute the larger ratio to be
+	 * most conservative about the user's intention.
+	 */
+	if (fast_clock == 0 && slow_clock == 0) {
+		fast_clock = 1;
+		slow_clock = 1;
+	} else if (fast_clock == 0 || slow_clock == 0) {
+		__db_errx(env,
+"DB_ENV->rep_set_clockskew: Zero only valid for when used for both arguments");
+		return (EINVAL);
+	}
+	if (fast_clock < slow_clock) {
+		__db_errx(env,
+"DB_ENV->rep_set_clockskew: slow_clock value is larger than fast_clock_value");
+		return (EINVAL);
+	}
+
+	/* Replication not on: just remember the values in the handle. */
+	if (!REP_ON(env)) {
+		db_rep->clock_skew = fast_clock;
+		db_rep->clock_base = slow_clock;
+		return (0);
+	}
+
+	rep = db_rep->region;
+	if (IS_REP_STARTED(env)) {
+		__db_errx(env,
+    "DB_ENV->rep_set_clockskew: must be called before DB_ENV->rep_start");
+		return (EINVAL);
+	}
+	ENV_ENTER(env, ip);
+	REP_SYSTEM_LOCK(env);
+	rep->clock_skew = fast_clock;
+	rep->clock_base = slow_clock;
+	REP_SYSTEM_UNLOCK(env);
+	ENV_LEAVE(env, ip);
+
+	return (0);
+}
+
+/*
+ * __rep_flush --
+ *	Re-push the last log record to all clients, in case they've lost
+ *	messages and don't know it.
+ *
+ *	Returns EINVAL when no transport (send) function has been set.
+ *	Otherwise returns 0, or the first error from opening the log cursor,
+ *	reading the last record, or closing the cursor.  The result of the
+ *	broadcast itself is deliberately ignored.
+ *
+ * PUBLIC: int __rep_flush __P((DB_ENV *));
+ */
+int
+__rep_flush(dbenv)
+	DB_ENV *dbenv;
+{
+	DBT rec;
+	DB_LOGC *logc;
+	DB_LSN lsn;
+	DB_REP *db_rep;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int ret, t_ret;
+
+	env = dbenv->env;
+	db_rep = env->rep_handle;
+
+	ENV_REQUIRES_CONFIG_XX(
+	    env, rep_handle, "DB_ENV->rep_flush", DB_INIT_REP);
+
+	/* We need a transport function because we send messages. */
+	if (db_rep->send == NULL) {
+		__db_errx(env,
+    "DB_ENV->rep_flush: must be called after DB_ENV->rep_set_transport");
+		return (EINVAL);
+	}
+
+	ENV_ENTER(env, ip);
+
+	/*
+	 * From here on every exit must pass through ENV_LEAVE to balance
+	 * the ENV_ENTER above, so a failure to open the cursor jumps to
+	 * "out" rather than returning directly.  There is no cursor to
+	 * close in that case, which is why it skips the "err" label.
+	 */
+	if ((ret = __log_cursor(env, &logc)) != 0)
+		goto out;
+
+	memset(&rec, 0, sizeof(rec));
+	memset(&lsn, 0, sizeof(lsn));
+
+	if ((ret = __logc_get(logc, &lsn, &rec, DB_LAST)) != 0)
+		goto err;
+
+	(void)__rep_send_message(env,
+	    DB_EID_BROADCAST, REP_LOG, &lsn, &rec, 0, 0);
+
+	/* Keep the first error seen; a close failure only counts if ret is 0. */
+err:	if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+		ret = t_ret;
+out:	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __rep_sync --
+ * Force a synchronization to occur between this client and the master.
+ * This is the other half of configuring DELAYCLIENT.
+ *
+ * The flags argument is not examined; it is only passed to COMPQUIET.
+ *
+ * Returns EINVAL if no transport (send) function has been set, and
+ * DB_REP_JOIN_FAILURE if the saved verify LSN is zero while
+ * REP_C_NOAUTOINIT is configured. Otherwise returns 0, including the
+ * cases where there is nothing to do (no known master, or REP_F_DELAY
+ * not set). Results of __rep_send_message are ignored.
+ *
+ * PUBLIC: int __rep_sync __P((DB_ENV *, u_int32_t));
+ */
+int
+__rep_sync(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ DB_LOG *dblp;
+ DB_LSN lsn;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ LOG *lp;
+ REP *rep;
+ int master, ret;
+ u_int32_t repflags, type;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+
+ COMPQUIET(flags, 0);
+
+ ENV_REQUIRES_CONFIG_XX(
+ env, rep_handle, "DB_ENV->rep_sync", DB_INIT_REP);
+
+ /* We need a transport function because we send messages. */
+ if (db_rep->send == NULL) {
+ __db_errx(env,
+ "DB_ENV->rep_sync: must be called after DB_ENV->rep_set_transport");
+ return (EINVAL);
+ }
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ rep = db_rep->region;
+ ret = 0;
+
+ ENV_ENTER(env, ip);
+
+ /*
+ * Simple cases. If we're not in the DELAY state we have nothing
+ * to do. If we don't know who the master is, send a MASTER_REQ.
+ *
+ * The verify LSN is snapshotted under mtx_clientdb first; that mutex
+ * is released before the rep system lock is taken, so the two are
+ * never held together here.
+ */
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ lsn = lp->verify_lsn;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ REP_SYSTEM_LOCK(env);
+ master = rep->master_id;
+ if (master == DB_EID_INVALID) {
+ REP_SYSTEM_UNLOCK(env);
+ (void)__rep_send_message(env, DB_EID_BROADCAST,
+ REP_MASTER_REQ, NULL, NULL, 0, 0);
+ goto out;
+ }
+ /*
+ * We want to hold the rep mutex to test and then clear the
+ * DELAY flag. Racing threads in here could otherwise result
+ * in dual data streams.
+ */
+ if (!F_ISSET(rep, REP_F_DELAY)) {
+ REP_SYSTEM_UNLOCK(env);
+ goto out;
+ }
+
+ DB_ASSERT(env,
+ !IS_USING_LEASES(env) || __rep_islease_granted(env) == 0);
+
+ /*
+ * If we get here, we clear the delay flag and kick off a
+ * synchronization. From this point forward, we will
+ * synchronize until the next time the master changes.
+ */
+ F_CLR(rep, REP_F_DELAY);
+ /*
+ * A zero LSN with REP_C_NOAUTOINIT configured is reported as a join
+ * failure, after clearing the no-archive and recovery flags.
+ */
+ if (IS_ZERO_LSN(lsn) && FLD_ISSET(rep->config, REP_C_NOAUTOINIT)) {
+ F_CLR(rep, REP_F_NOARCHIVE | REP_F_RECOVER_MASK);
+ ret = DB_REP_JOIN_FAILURE;
+ REP_SYSTEM_UNLOCK(env);
+ goto out;
+ }
+ REP_SYSTEM_UNLOCK(env);
+ /*
+ * When we set REP_F_DELAY, we set verify_lsn to the real verify lsn if
+ * we need to verify, or we zeroed it out if this is a client that needs
+ * internal init. So, send the type of message now that
+ * __rep_new_master delayed sending.
+ */
+ if (IS_ZERO_LSN(lsn)) {
+ DB_ASSERT(env, F_ISSET(rep, REP_F_RECOVER_UPDATE));
+ type = REP_UPDATE_REQ;
+ repflags = 0;
+ } else {
+ DB_ASSERT(env, F_ISSET(rep, REP_F_RECOVER_VERIFY));
+ type = REP_VERIFY_REQ;
+ repflags = DB_REP_ANYWHERE;
+ }
+ (void)__rep_send_message(env, master, type, &lsn, NULL, 0, repflags);
+
+ /* Every path after ENV_ENTER leaves through here. */
+out: ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __rep_conv_vers --
+ *	Convert from a log version to the replication message version
+ *	that release used.  Returns DB_REPVERSION_INVALID when the log
+ *	version matches none of the known versions.
+ */
+static u_int32_t
+__rep_conv_vers(env, log_ver)
+	ENV *env;
+	u_int32_t log_ver;
+{
+	/*
+	 * Log version to replication version pairs, searched in order so
+	 * that the first match wins.  We can't use a switch statement, some
+	 * of the DB_LOGVERSION_XX constants are the same.
+	 */
+	static const struct {
+		u_int32_t log_version;
+		u_int32_t rep_version;
+	} vers_map[] = {
+		{ DB_LOGVERSION, DB_REPVERSION },
+		{ DB_LOGVERSION_44, DB_REPVERSION_44 },
+		{ DB_LOGVERSION_45, DB_REPVERSION_45 },
+		{ DB_LOGVERSION_46, DB_REPVERSION_46 },
+		{ DB_LOGVERSION_47, DB_REPVERSION_47 }
+	};
+	size_t i;
+
+	COMPQUIET(env, NULL);
+
+	for (i = 0; i < sizeof(vers_map) / sizeof(vers_map[0]); i++)
+		if (log_ver == vers_map[i].log_version)
+			return (vers_map[i].rep_version);
+	return (DB_REPVERSION_INVALID);
+}
diff --git a/rep/rep_record.c b/rep/rep_record.c
new file mode 100644
index 0000000..7196ca2
--- /dev/null
+++ b/rep/rep_record.c
@@ -0,0 +1,2379 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __rep_collect_txn __P((ENV *, DB_LSN *, LSN_COLLECTION *));
+static int __rep_do_ckp __P((ENV *, DBT *, __rep_control_args *));
+static int __rep_fire_newmaster __P((ENV *, u_int32_t, int));
+static int __rep_fire_startupdone __P((ENV *, u_int32_t, int));
+static int __rep_getnext __P((ENV *, DB_THREAD_INFO *));
+static int __rep_lsn_cmp __P((const void *, const void *));
+static int __rep_newfile __P((ENV *, __rep_control_args *, DBT *));
+static int __rep_process_rec __P((ENV *, DB_THREAD_INFO *, __rep_control_args *,
+ DBT *, db_timespec *, DB_LSN *));
+static int __rep_remfirst __P((ENV *, DB_THREAD_INFO *, DBT *, DBT *));
+static int __rep_skip_msg __P((ENV *, REP *, int, u_int32_t));
+
+/* Used to consistently designate which messages ought to be received where. */
+
+/*
+ * MASTER_ONLY --
+ * Guard for message types that only a master should handle. If
+ * REP_F_MASTER is not set on rep, the message is printed, ret is set to 0
+ * and control jumps to the errlock label, so the message is ignored.
+ * The expansion uses env, eid and ret from the enclosing scope and needs
+ * an errlock label there.
+ */
+#define MASTER_ONLY(rep, rp) do { \
+ if (!F_ISSET(rep, REP_F_MASTER)) { \
+ RPRINT(env, DB_VERB_REP_MSGS, \
+ (env, "Master record received on client")); \
+ REP_PRINT_MESSAGE(env, \
+ eid, rp, "rep_process_message", 0); \
+ /* Just skip/ignore it. */ \
+ ret = 0; \
+ goto errlock; \
+ } \
+} while (0)
+
+/*
+ * CLIENT_ONLY --
+ * Guard for message types that only a client should handle. If
+ * REP_F_CLIENT is not set on rep, ret is set to DB_REP_DUPMASTER and
+ * control jumps to the errlock label; when leases are not in use a
+ * REP_DUPMASTER message is broadcast first.
+ * The expansion uses env, eid and ret from the enclosing scope and needs
+ * an errlock label there.
+ */
+#define CLIENT_ONLY(rep, rp) do { \
+ if (!F_ISSET(rep, REP_F_CLIENT)) { \
+ RPRINT(env, DB_VERB_REP_MSGS, \
+ (env, "Client record received on master")); \
+ /* \
+ * Only broadcast DUPMASTER if leases are not \
+ * in effect. If I am an old master, using \
+ * leases and I get a newer message, my leases \
+ * had better all be expired. \
+ */ \
+ if (IS_USING_LEASES(env)) \
+ DB_ASSERT(env, \
+ __rep_lease_check(env, 0) == \
+ DB_REP_LEASE_EXPIRED); \
+ else { \
+ REP_PRINT_MESSAGE(env, \
+ eid, rp, "rep_process_message", 0); \
+ (void)__rep_send_message(env, DB_EID_BROADCAST, \
+ REP_DUPMASTER, NULL, NULL, 0, 0); \
+ } \
+ ret = DB_REP_DUPMASTER; \
+ goto errlock; \
+ } \
+} while (0)
+
+/*
+ * If a client is attempting to service a request it does not have,
+ * call rep_skip_msg to skip this message and force a rerequest to the
+ * sender. We don't hold the mutex for the stats and may miscount.
+ *
+ * Takes no arguments: the expansion reads rep, env, eid and rp from the
+ * enclosing scope, and reads and may overwrite ret. It only acts when
+ * REP_F_CLIENT is set, and only replaces ret when ret is DB_NOTFOUND.
+ */
+#define CLIENT_REREQ do { \
+ if (F_ISSET(rep, REP_F_CLIENT)) { \
+ STAT(rep->stat.st_client_svc_req++); \
+ if (ret == DB_NOTFOUND) { \
+ STAT(rep->stat.st_client_svc_miss++); \
+ ret = __rep_skip_msg(env, rep, eid, rp->rectype);\
+ } \
+ } \
+} while (0)
+
+/*
+ * MASTER_UPDATE --
+ * While holding the rep system lock, set DB_REGENV_REPLOCKED on renv and
+ * store the current time() in renv->op_timestamp.
+ */
+#define MASTER_UPDATE(env, renv) do { \
+ REP_SYSTEM_LOCK(env); \
+ F_SET((renv), DB_REGENV_REPLOCKED); \
+ (void)time(&(renv)->op_timestamp); \
+ REP_SYSTEM_UNLOCK(env); \
+} while (0)
+
+/*
+ * RECOVERING_SKIP --
+ * If this site is a client and the "recovering" variable is non-zero,
+ * count the message, skip it via __rep_skip_msg (whose result becomes
+ * ret) and jump to the errlock label.
+ * Takes no arguments: the expansion uses env, rep, eid, rp, recovering
+ * and ret from the enclosing scope and needs an errlock label there.
+ */
+#define RECOVERING_SKIP do { \
+ if (IS_REP_CLIENT(env) && recovering) { \
+ /* Not holding region mutex, may miscount */ \
+ STAT(rep->stat.st_msgs_recover++); \
+ ret = __rep_skip_msg(env, rep, eid, rp->rectype); \
+ goto errlock; \
+ } \
+} while (0)
+
+/*
+ * If we're recovering the log we only want log records that are in the
+ * range we need to recover. Otherwise we can end up storing a huge
+ * number of "new" records, only to truncate the temp database later after
+ * we run recovery. If we are actively delaying a sync-up, we also skip
+ * all incoming log records until the application requests sync-up.
+ *
+ * Records are also skipped while the master is unknown (master_id is
+ * DB_EID_INVALID). When skipping, the result of __rep_skip_msg becomes
+ * ret and control jumps to the errlock label.
+ * Takes no arguments: the expansion uses env, rep, eid, rp, recovering
+ * and ret from the enclosing scope and needs an errlock label there.
+ */
+#define RECOVERING_LOG_SKIP do { \
+ if (F_ISSET(rep, REP_F_DELAY) || \
+ rep->master_id == DB_EID_INVALID || \
+ (recovering && \
+ (!F_ISSET(rep, REP_F_RECOVER_LOG) || \
+ LOG_COMPARE(&rp->lsn, &rep->last_lsn) > 0))) { \
+ /* Not holding region mutex, may miscount */ \
+ STAT(rep->stat.st_msgs_recover++); \
+ ret = __rep_skip_msg(env, rep, eid, rp->rectype); \
+ goto errlock; \
+ } \
+} while (0)
+
+/*
+ * ANYSITE --
+ * Expands to nothing. It is a marker, alongside MASTER_ONLY and
+ * CLIENT_ONLY, for message types that are handled on any site.
+ */
+#define ANYSITE(rep)
+
+/*
+ * __rep_process_message_pp --
+ *
+ * This routine takes an incoming message and processes it.  It validates
+ * the call, copies in the caller's DBTs, and hands the message to
+ * __rep_process_message_int().
+ *
+ * control: contains the control fields from the record
+ * rec: contains the actual record
+ * eid: the environment id of the sender of the message;
+ * ret_lsnp: On DB_REP_ISPERM and DB_REP_NOTPERM returns, contains the
+ * lsn of the maximum permanent or current not permanent log record
+ * (respectively).
+ *
+ * Returns EINVAL if called from a Replication Manager application, if
+ * control is NULL or empty, or if the environment is neither a replication
+ * master nor a client.  Returns the __dbt_usercopy() error if the DBT
+ * contents cannot be retrieved.  Otherwise returns the result of
+ * __rep_process_message_int().
+ *
+ * PUBLIC: int __rep_process_message_pp
+ * PUBLIC: __P((DB_ENV *, DBT *, DBT *, int, DB_LSN *));
+ */
+int
+__rep_process_message_pp(dbenv, control, rec, eid, ret_lsnp)
+	DB_ENV *dbenv;
+	DBT *control, *rec;
+	int eid;
+	DB_LSN *ret_lsnp;
+{
+	ENV *env;
+	int ret;
+
+	env = dbenv->env;
+	ret = 0;
+
+	ENV_REQUIRES_CONFIG_XX(
+	    env, rep_handle, "DB_ENV->rep_process_message", DB_INIT_REP);
+
+	if (APP_IS_REPMGR(env)) {
+		__db_errx(env, "%s %s", "DB_ENV->rep_process_message:",
+		    "cannot call from Replication Manager application");
+		return (EINVAL);
+	}
+
+	/* Control argument must be non-Null. */
+	if (control == NULL || control->size == 0) {
+		__db_errx(env,
+	"DB_ENV->rep_process_message: control argument must be specified");
+		return (EINVAL);
+	}
+
+	/*
+	 * Make sure site is a master or a client, which implies that
+	 * replication has been started.
+	 */
+	if (!IS_REP_MASTER(env) && !IS_REP_CLIENT(env)) {
+		__db_errx(env,
+	"Environment not configured as replication master or client");
+		return (EINVAL);
+	}
+
+	if ((ret = __dbt_usercopy(env, control)) != 0 ||
+	    (ret = __dbt_usercopy(env, rec)) != 0) {
+		__dbt_userfree(env, control, rec, NULL);
+		__db_errx(env,
+	"DB_ENV->rep_process_message: error retrieving DBT contents");
+		return (ret);
+	}
+
+	ret = __rep_process_message_int(env, control, rec, eid, ret_lsnp);
+
+	/*
+	 * Release whatever __dbt_usercopy() set up above.  The failure path
+	 * already does this; the normal path must as well, or the copies
+	 * are never released.
+	 */
+	__dbt_userfree(env, control, rec, NULL);
+
+	return (ret);
+}
+
+/*
+ * __rep_process_message_int --
+ *
+ * This routine performs the internal steps to process an incoming message.
+ *
+ * PUBLIC: int __rep_process_message_int
+ * PUBLIC: __P((ENV *, DBT *, DBT *, int, DB_LSN *));
+ */
+int
+__rep_process_message_int(env, control, rec, eid, ret_lsnp)
+ ENV *env;
+ DBT *control, *rec;
+ int eid;
+ DB_LSN *ret_lsnp;
+{
+ DBT data_dbt;
+ DB_LOG *dblp;
+ DB_LSN last_lsn, lsn;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ LOG *lp;
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ REP_46_CONTROL *rp46;
+ REP_OLD_CONTROL *orp;
+ __rep_control_args *rp, tmprp;
+ __rep_egen_args egen_arg;
+ size_t len;
+ u_int32_t gen, rep_version;
+ int cmp, do_sync, lockout, recovering, ret, t_ret;
+ time_t savetime;
+ u_int8_t buf[__REP_MAXMSG_SIZE];
+
+ ret = 0;
+ do_sync = 0;
+ lockout = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ infop = env->reginfo;
+ renv = infop->primary;
+ /*
+ * Casting this to REP_OLD_CONTROL is just kind of stylistic: the
+ * rep_version field of course has to be in the same offset in all
+ * versions in order for this to work.
+ *
+ * We can look at the rep_version unswapped here because if we're
+ * talking to an old version, it will always be unswapped. If
+ * we're talking to a new version, the only issue is if it is
+ * swapped and we take one of the old version conditionals
+ * incorrectly. The rep_version would need to be very, very
+ * large for a swapped version to look like a small, older
+ * version. There is no problem here looking at it unswapped.
+ */
+ rep_version = ((REP_OLD_CONTROL *)control->data)->rep_version;
+ if (rep_version <= DB_REPVERSION_45) {
+ orp = (REP_OLD_CONTROL *)control->data;
+ if (rep_version == DB_REPVERSION_45 &&
+ F_ISSET(orp, REPCTL_INIT_45)) {
+ F_CLR(orp, REPCTL_INIT_45);
+ F_SET(orp, REPCTL_INIT);
+ }
+ tmprp.rep_version = orp->rep_version;
+ tmprp.log_version = orp->log_version;
+ tmprp.lsn = orp->lsn;
+ tmprp.rectype = orp->rectype;
+ tmprp.gen = orp->gen;
+ tmprp.flags = orp->flags;
+ tmprp.msg_sec = 0;
+ tmprp.msg_nsec = 0;
+ } else if (rep_version == DB_REPVERSION_46) {
+ rp46 = (REP_46_CONTROL *)control->data;
+ tmprp.rep_version = rp46->rep_version;
+ tmprp.log_version = rp46->log_version;
+ tmprp.lsn = rp46->lsn;
+ tmprp.rectype = rp46->rectype;
+ tmprp.gen = rp46->gen;
+ tmprp.flags = rp46->flags;
+ tmprp.msg_sec = (u_int32_t)rp46->msg_time.tv_sec;
+ tmprp.msg_nsec = (u_int32_t)rp46->msg_time.tv_nsec;
+ } else
+ if ((ret = __rep_control_unmarshal(env, &tmprp,
+ control->data, control->size, NULL)) != 0)
+ return (ret);
+ rp = &tmprp;
+ if (ret_lsnp != NULL)
+ ZERO_LSN(*ret_lsnp);
+
+ ENV_ENTER(env, ip);
+
+ REP_PRINT_MESSAGE(env, eid, rp, "rep_process_message", 0);
+ /*
+ * Check the version number for both rep and log. If it is
+ * an old version we support, convert it. Otherwise complain.
+ */
+ if (rp->rep_version < DB_REPVERSION) {
+ if (rp->rep_version < DB_REPVERSION_MIN) {
+ __db_errx(env,
+ "unsupported old replication message version %lu, minimum version %d",
+ (u_long)rp->rep_version, DB_REPVERSION_MIN);
+ ret = EINVAL;
+ goto errlock;
+ }
+ RPRINT(env, DB_VERB_REP_MSGS, (env,
+ "Received record %lu with old rep version %lu",
+ (u_long)rp->rectype, (u_long)rp->rep_version));
+ rp->rectype = __rep_msg_from_old(rp->rep_version, rp->rectype);
+ DB_ASSERT(env, rp->rectype != REP_INVALID);
+ /*
+ * We should have a valid new record type for all the old
+ * versions.
+ */
+ RPRINT(env, DB_VERB_REP_MSGS, (env,
+ "Converted to record %lu with old rep version %lu",
+ (u_long)rp->rectype, (u_long)rp->rep_version));
+ } else if (rp->rep_version > DB_REPVERSION) {
+ __db_errx(env,
+ "unexpected replication message version %lu, expected %d",
+ (u_long)rp->rep_version, DB_REPVERSION);
+ ret = EINVAL;
+ goto errlock;
+ }
+
+ if (rp->log_version < DB_LOGVERSION) {
+ if (rp->log_version < DB_LOGVERSION_MIN) {
+ __db_errx(env,
+ "unsupported old replication log version %lu, minimum version %d",
+ (u_long)rp->log_version, DB_LOGVERSION_MIN);
+ ret = EINVAL;
+ goto errlock;
+ }
+ RPRINT(env, DB_VERB_REP_MSGS, (env,
+ "Received record %lu with old log version %lu",
+ (u_long)rp->rectype, (u_long)rp->log_version));
+ } else if (rp->log_version > DB_LOGVERSION) {
+ __db_errx(env,
+ "unexpected log record version %lu, expected %d",
+ (u_long)rp->log_version, DB_LOGVERSION);
+ ret = EINVAL;
+ goto errlock;
+ }
+
+ /*
+ * Acquire the replication lock.
+ */
+ REP_SYSTEM_LOCK(env);
+ if (F_ISSET(rep, REP_F_READY_MSG)) {
+ /*
+ * If we're racing with a thread in rep_start, then
+ * just ignore the message and return.
+ */
+ RPRINT(env, DB_VERB_REP_MSGS, (env,
+ "Racing replication msg lockout, ignore message."));
+ if (F_ISSET(rp, REPCTL_PERM))
+ ret = DB_REP_IGNORE;
+ REP_SYSTEM_UNLOCK(env);
+ /*
+ * If another client has sent a c2c request to us, it may be a
+ * long time before it resends the request (due to its dual data
+ * streams avoidance heuristic); let it know we can't serve the
+ * request just now.
+ */
+ if (F_ISSET(rep, REP_F_CLIENT) && REP_MSG_REQ(rp->rectype)) {
+ STAT(rep->stat.st_client_svc_req++);
+ STAT(rep->stat.st_client_svc_miss++);
+ (void)__rep_send_message(env,
+ eid, REP_REREQUEST, NULL, NULL, 0, 0);
+ }
+ goto out;
+ }
+ rep->msg_th++;
+ gen = rep->gen;
+ recovering = F_ISSET(rep, REP_F_RECOVER_MASK);
+ savetime = renv->rep_timestamp;
+
+ STAT(rep->stat.st_msgs_processed++);
+ REP_SYSTEM_UNLOCK(env);
+
+ /*
+ * Check for lease configuration matching. Leases must be
+ * configured all or none. If I am a client and I receive a
+ * message requesting a lease, and I'm not using leases, that
+ * is an error.
+ */
+ if (!IS_USING_LEASES(env) &&
+ (F_ISSET(rp, REPCTL_LEASE) || rp->rectype == REP_LEASE_GRANT)) {
+ __db_errx(env,
+ "Inconsistent lease configuration");
+ RPRINT(env, DB_VERB_REP_MSGS, (env,
+ "Client received lease message and not using leases"));
+ ret = EINVAL;
+ ret = __env_panic(env, ret);
+ goto errlock;
+ }
+
+ /*
+ * Check for generation number matching. Ignore any old messages
+ * except requests that are indicative of a new client that needs
+ * to get in sync.
+ */
+ if (rp->gen < gen && rp->rectype != REP_ALIVE_REQ &&
+ rp->rectype != REP_NEWCLIENT && rp->rectype != REP_MASTER_REQ &&
+ rp->rectype != REP_DUPMASTER && rp->rectype != REP_VOTE1) {
+ /*
+ * We don't hold the rep mutex, and could miscount if we race.
+ */
+ STAT(rep->stat.st_msgs_badgen++);
+ if (F_ISSET(rp, REPCTL_PERM))
+ ret = DB_REP_IGNORE;
+ goto errlock;
+ }
+
+ if (rp->gen > gen) {
+ /*
+ * If I am a master and am out of date with a lower generation
+ * number, I am in bad shape and should downgrade.
+ */
+ if (F_ISSET(rep, REP_F_MASTER)) {
+ STAT(rep->stat.st_dupmasters++);
+ ret = DB_REP_DUPMASTER;
+ /*
+ * Only broadcast DUPMASTER if leases are not
+ * in effect. If I am an old master, using
+ * leases and I get a newer message, my leases
+ * had better all be expired.
+ */
+ if (IS_USING_LEASES(env))
+ DB_ASSERT(env,
+ __rep_lease_check(env, 0) ==
+ DB_REP_LEASE_EXPIRED);
+ else if (rp->rectype != REP_DUPMASTER)
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_DUPMASTER,
+ NULL, NULL, 0, 0);
+ goto errlock;
+ }
+
+ /*
+ * I am a client and am out of date. If this is an election,
+ * or a response from the first site I contacted, then I can
+ * accept the generation number and participate in future
+ * elections and communication. Otherwise, I need to hear about
+ * a new master and sync up.
+ */
+ if (rp->rectype == REP_ALIVE ||
+ rp->rectype == REP_VOTE1 || rp->rectype == REP_VOTE2) {
+ REP_SYSTEM_LOCK(env);
+ RPRINT(env, DB_VERB_REP_MSGS, (env,
+ "Updating gen from %lu to %lu",
+ (u_long)gen, (u_long)rp->gen));
+ rep->master_id = DB_EID_INVALID;
+ gen = rep->gen = rp->gen;
+ /*
+ * Updating of egen will happen when we process the
+ * message below for each message type.
+ */
+ REP_SYSTEM_UNLOCK(env);
+ if (rp->rectype == REP_ALIVE)
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_MASTER_REQ, NULL,
+ NULL, 0, 0);
+ } else if (rp->rectype != REP_NEWMASTER) {
+ /*
+ * Ignore this message, retransmit if needed.
+ */
+ if (__rep_check_doreq(env, rep))
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_MASTER_REQ,
+ NULL, NULL, 0, 0);
+ goto errlock;
+ }
+ /*
+ * If you get here, then you're a client and either you're
+ * in an election or you have a NEWMASTER or an ALIVE message
+ * whose processing will do the right thing below.
+ */
+ }
+
+ /*
+ * If the sender is part of an established group, so are we now.
+ */
+ if (F_ISSET(rp, REPCTL_GROUP_ESTD)) {
+ REP_SYSTEM_LOCK(env);
+#ifdef DIAGNOSTIC
+ if (!F_ISSET(rep, REP_F_GROUP_ESTD))
+ RPRINT(env, DB_VERB_REP_MSGS, (env,
+ "I am now part of an established group"));
+#endif
+ F_SET(rep, REP_F_GROUP_ESTD);
+ REP_SYSTEM_UNLOCK(env);
+ }
+
+ /*
+ * We need to check if we're in recovery and if we are
+ * then we need to ignore any messages except VERIFY*, VOTE*,
+ * NEW* and ALIVE_REQ, or backup related messages: UPDATE*,
+ * PAGE* and FILE*. We need to also accept LOG messages
+ * if we're copying the log for recovery/backup.
+ */
+ switch (rp->rectype) {
+ case REP_ALIVE:
+ /*
+ * Handle even if we're recovering.
+ */
+ ANYSITE(rep);
+ if (rp->rep_version < DB_REPVERSION_47)
+ egen_arg.egen = *(u_int32_t *)rec->data;
+ else if ((ret = __rep_egen_unmarshal(env, &egen_arg,
+ rec->data, rec->size, NULL)) != 0)
+ return (ret);
+ REP_SYSTEM_LOCK(env);
+ RPRINT(env, DB_VERB_REP_MSGS, (env,
+ "Received ALIVE egen of %lu, mine %lu",
+ (u_long)egen_arg.egen, (u_long)rep->egen));
+ if (egen_arg.egen > rep->egen) {
+ /*
+ * We're changing egen, need to clear out any old
+ * election information. We need to set the
+ * REP_F_EGENUPDATE flag here so that any thread
+ * waiting in rep_elect/rep_wait can distinguish
+ * this situation (and restart its election) from
+ * a current master saying it is still master and
+ * the egen getting incremented on that path.
+ */
+ __rep_elect_done(env, rep, 0);
+ rep->egen = egen_arg.egen;
+ F_SET(rep, REP_F_EGENUPDATE);
+ }
+ REP_SYSTEM_UNLOCK(env);
+ break;
+ case REP_ALIVE_REQ:
+ /*
+ * Handle even if we're recovering.
+ */
+ ANYSITE(rep);
+ LOG_SYSTEM_LOCK(env);
+ lsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+#ifdef CONFIG_TEST
+ /*
+ * Send this first, before the ALIVE message because of the
+ * way the test suite and messaging is done sequentially.
+ * In some sequences it is possible to get into a situation
+ * where the test suite cannot get the later NEWMASTER because
+ * we break out of the messaging loop too early.
+ */
+ if (F_ISSET(rep, REP_F_MASTER))
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_NEWMASTER, &lsn, NULL, 0, 0);
+#endif
+ REP_SYSTEM_LOCK(env);
+ egen_arg.egen = rep->egen;
+ REP_SYSTEM_UNLOCK(env);
+ if (rep->version < DB_REPVERSION_47)
+ DB_INIT_DBT(data_dbt, &egen_arg.egen,
+ sizeof(egen_arg.egen));
+ else {
+ if ((ret = __rep_egen_marshal(env,
+ &egen_arg, buf, __REP_EGEN_SIZE, &len)) != 0)
+ goto errlock;
+ DB_INIT_DBT(data_dbt, buf, len);
+ }
+ (void)__rep_send_message(env,
+ eid, REP_ALIVE, &lsn, &data_dbt, 0, 0);
+ break;
+ case REP_ALL_REQ:
+ RECOVERING_SKIP;
+ ret = __rep_allreq(env, rp, eid);
+ CLIENT_REREQ;
+ break;
+ case REP_BULK_LOG:
+ RECOVERING_LOG_SKIP;
+ CLIENT_ONLY(rep, rp);
+ ret = __rep_bulk_log(env, ip, rp, rec, savetime, ret_lsnp);
+ break;
+ case REP_BULK_PAGE:
+ /*
+ * Handle even if we're recovering.
+ */
+ CLIENT_ONLY(rep, rp);
+ ret = __rep_bulk_page(env, ip, eid, rp, rec);
+ break;
+ case REP_DUPMASTER:
+ /*
+ * Handle even if we're recovering.
+ */
+ if (F_ISSET(rep, REP_F_MASTER))
+ ret = DB_REP_DUPMASTER;
+ break;
+#ifdef NOTYET
+ case REP_FILE: /* TODO */
+ CLIENT_ONLY(rep, rp);
+ break;
+ case REP_FILE_REQ:
+ ret = __rep_send_file(env, rec, eid);
+ break;
+#endif
+ case REP_FILE_FAIL:
+ /*
+ * Handle even if we're recovering.
+ */
+ CLIENT_ONLY(rep, rp);
+ /*
+ * Clean up any internal init that was in progress.
+ */
+ if (eid == rep->master_id) {
+ REP_SYSTEM_LOCK(env);
+ /*
+ * If we're already locking out messages, give up.
+ */
+ if (F_ISSET(rep, REP_F_READY_MSG))
+ goto errhlk;
+ /*
+ * Lock out other messages to prevent race
+ * conditions.
+ */
+ if ((ret =
+ __rep_lockout_msg(env, rep, 1)) != 0) {
+ goto errhlk;
+ }
+ lockout = 1;
+ /*
+ * Need mtx_clientdb to safely clean up
+ * page database in __rep_init_cleanup().
+ */
+ REP_SYSTEM_UNLOCK(env);
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ REP_SYSTEM_LOCK(env);
+ /*
+ * Clean up internal init if one was in progress.
+ */
+ if (F_ISSET(rep, REP_F_READY_API | REP_F_READY_OP)) {
+ RPRINT(env, DB_VERB_REP_MSGS, (env,
+ "FILE_FAIL is cleaning up old internal init"));
+#ifdef CONFIG_TEST
+ STAT(rep->stat.st_filefail_cleanups++);
+#endif
+ ret = __rep_init_cleanup(env, rep, DB_FORCE);
+ F_CLR(rep,
+ REP_F_ABBREVIATED | REP_F_RECOVER_MASK);
+ }
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ if (ret != 0) {
+ RPRINT(env, DB_VERB_REP_MSGS, (env,
+ "FILE_FAIL error cleaning up internal init: %d", ret));
+ goto errhlk;
+ }
+ F_CLR(rep, REP_F_READY_MSG);
+ lockout = 0;
+ /*
+ * Restart internal init, setting UPDATE flag and
+ * zeroing applicable LSNs.
+ */
+ F_SET(rep, REP_F_RECOVER_UPDATE);
+ ZERO_LSN(rep->first_lsn);
+ ZERO_LSN(rep->ckp_lsn);
+ REP_SYSTEM_UNLOCK(env);
+ (void)__rep_send_message(env, eid, REP_UPDATE_REQ,
+ NULL, NULL, 0, 0);
+ }
+ break;
+ case REP_LEASE_GRANT:
+ /*
+ * Handle even if we're recovering.
+ */
+ MASTER_ONLY(rep, rp);
+ ret = __rep_lease_grant(env, rp, rec, eid);
+ break;
+ case REP_LOG:
+ case REP_LOG_MORE:
+ RECOVERING_LOG_SKIP;
+ CLIENT_ONLY(rep, rp);
+ ret = __rep_log(env, ip, rp, rec, savetime, ret_lsnp);
+ break;
+ case REP_LOG_REQ:
+ RECOVERING_SKIP;
+ if (F_ISSET(rp, REPCTL_INIT))
+ MASTER_UPDATE(env, renv);
+ ret = __rep_logreq(env, rp, rec, eid);
+ CLIENT_REREQ;
+ break;
+ case REP_NEWSITE:
+ /*
+ * Handle even if we're recovering.
+ */
+ /* We don't hold the rep mutex, and may miscount. */
+ STAT(rep->stat.st_newsites++);
+
+ /* This is a rebroadcast; simply tell the application. */
+ if (F_ISSET(rep, REP_F_MASTER)) {
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ LOG_SYSTEM_LOCK(env);
+ lsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ (void)__rep_send_message(env,
+ eid, REP_NEWMASTER, &lsn, NULL, 0, 0);
+ }
+ ret = DB_REP_NEWSITE;
+ break;
+ case REP_NEWCLIENT:
+ /*
+ * Handle even if we're recovering.
+ */
+ /*
+ * This message was received and should have resulted in the
+ * application entering the machine ID in its machine table.
+ * We respond to this with an ALIVE to send relevant information
+ * to the new client (if we are a master, we'll send a
+ * NEWMASTER, so we only need to send the ALIVE if we're a
+ * client). But first, broadcast the new client's record to
+ * all the clients.
+ */
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_NEWSITE, &rp->lsn, rec, 0, 0);
+
+ ret = DB_REP_NEWSITE;
+
+ if (F_ISSET(rep, REP_F_CLIENT)) {
+ REP_SYSTEM_LOCK(env);
+ egen_arg.egen = rep->egen;
+
+ /*
+ * Clean up any previous master remnants by making
+ * master_id invalid and cleaning up any internal
+ * init that was in progress.
+ */
+ if (eid == rep->master_id) {
+ rep->master_id = DB_EID_INVALID;
+
+ /*
+ * Already locking out messages, must be
+ * in sync-up recover or internal init,
+ * give up.
+ */
+ if (F_ISSET(rep, REP_F_READY_MSG))
+ goto errhlk;
+
+ /*
+ * Lock out other messages to prevent race
+ * conditions.
+ */
+ if ((t_ret =
+ __rep_lockout_msg(env, rep, 1)) != 0) {
+ ret = t_ret;
+ goto errhlk;
+ }
+ lockout = 1;
+
+ /*
+ * Need mtx_clientdb to safely clean up
+ * page database in __rep_init_cleanup().
+ */
+ REP_SYSTEM_UNLOCK(env);
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ REP_SYSTEM_LOCK(env);
+
+ /*
+ * Clean up internal init if one was in
+ * progress.
+ */
+ if (F_ISSET(rep, REP_F_READY_API |
+ REP_F_READY_OP)) {
+ RPRINT(env, DB_VERB_REP_MSGS, (env,
+ "NEWCLIENT is cleaning up old internal init for invalid master"));
+ t_ret = __rep_init_cleanup(env,
+ rep, DB_FORCE);
+ F_CLR(rep, REP_F_ABBREVIATED |
+ REP_F_RECOVER_MASK);
+ }
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ if (t_ret != 0) {
+ ret = t_ret;
+ RPRINT(env, DB_VERB_REP_MSGS, (env,
+ "NEWCLIENT error cleaning up internal init for invalid master: %d", ret));
+ goto errhlk;
+ }
+ F_CLR(rep, REP_F_READY_MSG);
+ lockout = 0;
+ }
+ REP_SYSTEM_UNLOCK(env);
+ if (rep->version < DB_REPVERSION_47)
+ DB_INIT_DBT(data_dbt, &egen_arg.egen,
+ sizeof(egen_arg.egen));
+ else {
+ if ((ret = __rep_egen_marshal(env, &egen_arg,
+ buf, __REP_EGEN_SIZE, &len)) != 0)
+ goto errlock;
+ DB_INIT_DBT(data_dbt, buf, len);
+ }
+ (void)__rep_send_message(env, DB_EID_BROADCAST,
+ REP_ALIVE, &rp->lsn, &data_dbt, 0, 0);
+ break;
+ }
+ /* FALLTHROUGH */
+ case REP_MASTER_REQ:
+ RECOVERING_SKIP;
+ if (F_ISSET(rep, REP_F_MASTER)) {
+ LOG_SYSTEM_LOCK(env);
+ lsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_NEWMASTER, &lsn, NULL, 0, 0);
+ if (IS_USING_LEASES(env))
+ (void)__rep_lease_refresh(env);
+ }
+ /*
+ * If there is no master, then we could get into a state
+ * where an old client lost the initial ALIVE message and
+ * is calling an election under an old gen and can
+ * never get to the current gen.
+ */
+ if (F_ISSET(rep, REP_F_CLIENT) && rp->gen < gen) {
+ REP_SYSTEM_LOCK(env);
+ egen_arg.egen = rep->egen;
+ if (eid == rep->master_id)
+ rep->master_id = DB_EID_INVALID;
+ REP_SYSTEM_UNLOCK(env);
+ if (rep->version < DB_REPVERSION_47)
+ DB_INIT_DBT(data_dbt, &egen_arg.egen,
+ sizeof(egen_arg.egen));
+ else {
+ if ((ret = __rep_egen_marshal(env, &egen_arg,
+ buf, __REP_EGEN_SIZE, &len)) != 0)
+ goto errlock;
+ DB_INIT_DBT(data_dbt, buf, len);
+ }
+ (void)__rep_send_message(env, eid,
+ REP_ALIVE, &rp->lsn, &data_dbt, 0, 0);
+ }
+ break;
+ case REP_NEWFILE:
+ RECOVERING_LOG_SKIP;
+ CLIENT_ONLY(rep, rp);
+ ret = __rep_apply(env,
+ ip, rp, rec, ret_lsnp, NULL, &last_lsn);
+ if (ret == DB_REP_LOGREADY)
+ ret = __rep_logready(env, rep, savetime, &last_lsn);
+ break;
+ case REP_NEWMASTER:
+ /*
+ * Handle even if we're recovering.
+ */
+ ANYSITE(rep);
+ if (F_ISSET(rep, REP_F_MASTER) &&
+ eid != rep->eid) {
+ /* We don't hold the rep mutex, and may miscount. */
+ STAT(rep->stat.st_dupmasters++);
+ ret = DB_REP_DUPMASTER;
+ if (IS_USING_LEASES(env))
+ DB_ASSERT(env,
+ __rep_lease_check(env, 0) ==
+ DB_REP_LEASE_EXPIRED);
+ else
+ (void)__rep_send_message(env,
+ DB_EID_BROADCAST, REP_DUPMASTER,
+ NULL, NULL, 0, 0);
+ break;
+ }
+ if ((ret =
+ __rep_new_master(env, rp, eid)) == DB_REP_NEWMASTER)
+ ret = __rep_fire_newmaster(env, rp->gen, eid);
+ break;
+ case REP_PAGE:
+ case REP_PAGE_MORE:
+ /*
+ * Handle even if we're recovering.
+ */
+ CLIENT_ONLY(rep, rp);
+ ret = __rep_page(env, ip, eid, rp, rec);
+ if (ret == DB_REP_PAGEDONE)
+ ret = 0;
+ break;
+ case REP_PAGE_FAIL:
+ /*
+ * Handle even if we're recovering.
+ */
+ CLIENT_ONLY(rep, rp);
+ ret = __rep_page_fail(env, ip, eid, rp, rec);
+ break;
+ case REP_PAGE_REQ:
+ RECOVERING_SKIP;
+ MASTER_UPDATE(env, renv);
+ ret = __rep_page_req(env, ip, eid, rp, rec);
+ CLIENT_REREQ;
+ break;
+ case REP_REREQUEST:
+ /*
+ * Handle even if we're recovering. Don't do a master
+ * check.
+ */
+ CLIENT_ONLY(rep, rp);
+ /*
+ * Don't hold any mutex, may miscount.
+ */
+ STAT(rep->stat.st_client_rerequests++);
+ ret = __rep_resend_req(env, 1);
+ break;
+ case REP_START_SYNC:
+ RECOVERING_SKIP;
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ cmp = LOG_COMPARE(&rp->lsn, &lp->ready_lsn);
+ /*
+ * The comparison needs to be <= because the LSN in
+ * the message can be the LSN of the first outstanding
+ * txn, which may be the LSN immediately after the
+ * previous commit. The ready_lsn is the LSN of the
+ * next record expected. In that case, the LSNs
+ * could be equal and the client has the commit and
+ * wants to sync. [SR #15338]
+ */
+ if (cmp <= 0) {
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ do_sync = 1;
+ } else {
+ STAT(rep->stat.st_startsync_delayed++);
+ /*
+ * There are cases where keeping the first ckp_lsn
+ * LSN is advantageous and cases where keeping
+ * a later LSN is better. If random, earlier
+ * log records are missing, keeping the later
+ * LSN seems to be better. That is what we'll
+ * do for now.
+ */
+ if (LOG_COMPARE(&rp->lsn, &rep->ckp_lsn) > 0)
+ rep->ckp_lsn = rp->lsn;
+ RPRINT(env, DB_VERB_REP_MSGS, (env,
+ "Delayed START_SYNC memp_sync due to missing records."));
+ RPRINT(env, DB_VERB_REP_MSGS, (env,
+ "ready LSN [%lu][%lu], ckp_lsn [%lu][%lu]",
+ (u_long)lp->ready_lsn.file, (u_long)lp->ready_lsn.offset,
+ (u_long)rep->ckp_lsn.file, (u_long)rep->ckp_lsn.offset));
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ }
+ break;
+ case REP_UPDATE:
+ /*
+ * Handle even if we're recovering.
+ */
+ CLIENT_ONLY(rep, rp);
+ ret = __rep_update_setup(env, eid, rp, rec, savetime);
+ break;
+ case REP_UPDATE_REQ:
+ /*
+ * Handle even if we're recovering.
+ */
+ MASTER_ONLY(rep, rp);
+ infop = env->reginfo;
+ renv = infop->primary;
+ MASTER_UPDATE(env, renv);
+ ret = __rep_update_req(env, rp, eid);
+ break;
+ case REP_VERIFY:
+ if (recovering) {
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ cmp = LOG_COMPARE(&lp->verify_lsn, &rp->lsn);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ /*
+ * If this is not the verify record I want, skip it.
+ */
+ if (cmp != 0) {
+ ret = __rep_skip_msg(
+ env, rep, eid, rp->rectype);
+ break;
+ }
+ }
+ CLIENT_ONLY(rep, rp);
+ ret = __rep_verify(env, rp, rec, eid, savetime);
+ break;
+ case REP_VERIFY_FAIL:
+ /*
+ * Handle even if we're recovering.
+ */
+ CLIENT_ONLY(rep, rp);
+ ret = __rep_verify_fail(env, rp);
+ break;
+ case REP_VERIFY_REQ:
+ RECOVERING_SKIP;
+ ret = __rep_verify_req(env, rp, eid);
+ CLIENT_REREQ;
+ break;
+ case REP_VOTE1:
+ /*
+ * Handle even if we're recovering.
+ */
+ ret = __rep_vote1(env, rp, rec, eid);
+ break;
+ case REP_VOTE2:
+ /*
+ * Handle even if we're recovering.
+ */
+ ret = __rep_vote2(env, rp, rec, eid);
+ break;
+ default:
+ __db_errx(env,
+ "DB_ENV->rep_process_message: unknown replication message: type %lu",
+ (u_long)rp->rectype);
+ ret = EINVAL;
+ break;
+ }
+
+errlock:
+ REP_SYSTEM_LOCK(env);
+errhlk: if (lockout)
+ F_CLR(rep, REP_F_READY_MSG);
+ rep->msg_th--;
+ REP_SYSTEM_UNLOCK(env);
+ if (do_sync) {
+ MUTEX_LOCK(env, rep->mtx_ckp);
+ lsn = rp->lsn;
+ /*
+ * This is the REP_START_SYNC sync, and so we permit it to be
+ * interrupted.
+ */
+ ret = __memp_sync(
+ env, DB_SYNC_CHECKPOINT | DB_SYNC_INTERRUPT_OK, &lsn);
+ MUTEX_UNLOCK(env, rep->mtx_ckp);
+ RPRINT(env, DB_VERB_REP_MSGS,
+ (env, "ALIVE: Completed sync [%lu][%lu]",
+ (u_long)lsn.file, (u_long)lsn.offset));
+ }
+out:
+ if (ret == 0 && F_ISSET(rp, REPCTL_PERM)) {
+ if (ret_lsnp != NULL)
+ *ret_lsnp = rp->lsn;
+ ret = DB_REP_NOTPERM;
+ }
+ __dbt_userfree(env, control, rec, NULL);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __rep_apply --
+ *
+ * Handle incoming log records on a client, applying when possible and
+ * entering into the bookkeeping table otherwise. This routine manages
+ * the state of the incoming message stream -- processing records, via
+ * __rep_process_rec, when possible and enqueuing in the __db.rep.db
+ * when necessary. As gaps in the stream are filled in, this is where
+ * we try to process as much as possible from __db.rep.db to catch up.
+ *
+ * The record's LSN (rp->lsn) is compared against lp->ready_lsn:
+ *	equal	the record is the one we expect; process it and then drain
+ *		any immediately following records from the temp db.
+ *	greater	the record is out of order; queue it in the temp db and
+ *		possibly request the missing records.
+ *	less	the record is a duplicate.
+ *
+ * ret_lsnp, if non-NULL, receives the maximum permanent LSN when
+ * DB_REP_ISPERM is returned (and, for a queued permanent record, when
+ * DB_REP_NOTPERM is returned outside of log recovery).
+ * is_dupp, if non-NULL, is set to 1 when the record is a duplicate.
+ * last_lsnp receives the maximum LSN when DB_REP_LOGREADY is returned;
+ * it is dereferenced unconditionally on that path.
+ *
+ * Returns 0, DB_REP_ISPERM, DB_REP_NOTPERM, DB_REP_LOGREADY or the
+ * error returned by a failing callee.
+ *
+ * PUBLIC: int __rep_apply __P((ENV *, DB_THREAD_INFO *, __rep_control_args *,
+ * PUBLIC: DBT *, DB_LSN *, int *, DB_LSN *));
+ */
+int
+__rep_apply(env, ip, rp, rec, ret_lsnp, is_dupp, last_lsnp)
+	ENV *env;
+	DB_THREAD_INFO *ip;
+	__rep_control_args *rp;
+	DBT *rec;
+	DB_LSN *ret_lsnp;
+	int *is_dupp;
+	DB_LSN *last_lsnp;
+{
+	DB *dbp;
+	DBT control_dbt, key_dbt;
+	DBT rec_dbt;
+	DB_LOG *dblp;
+	DB_LSN max_lsn, save_lsn;
+	DB_REP *db_rep;
+	LOG *lp;
+	REP *rep;
+	db_timespec msg_time, max_ts;
+	u_int32_t gen, rectype;
+	int cmp, event, master, newfile_seen, ret, set_apply, t_ret;
+
+	COMPQUIET(gen, 0);
+	COMPQUIET(master, DB_EID_INVALID);
+
+	db_rep = env->rep_handle;
+	rep = db_rep->region;
+	event = ret = set_apply = 0;
+	memset(&control_dbt, 0, sizeof(control_dbt));
+	memset(&rec_dbt, 0, sizeof(rec_dbt));
+	ZERO_LSN(max_lsn);
+	timespecclear(&max_ts);
+	timespecset(&msg_time, rp->msg_sec, rp->msg_nsec);
+	cmp = -2;		/* OOB value that LOG_COMPARE can't return. */
+
+	dblp = env->lg_handle;
+	MUTEX_LOCK(env, rep->mtx_clientdb);
+	/*
+	 * Lazily open the temp db. Always set the startup flag to 0
+	 * because it was initialized from rep_start.
+	 */
+	if (db_rep->rep_db == NULL &&
+	    (ret = __rep_client_dbinit(env, 0, REP_DB)) != 0) {
+		MUTEX_UNLOCK(env, rep->mtx_clientdb);
+		goto out;
+	}
+	dbp = db_rep->rep_db;
+	lp = dblp->reginfo.primary;
+	newfile_seen = 0;
+	REP_SYSTEM_LOCK(env);
+	if (F_ISSET(rep, REP_F_RECOVER_LOG) &&
+	    LOG_COMPARE(&lp->ready_lsn, &rep->first_lsn) < 0)
+		lp->ready_lsn = rep->first_lsn;
+	cmp = LOG_COMPARE(&rp->lsn, &lp->ready_lsn);
+	/*
+	 * If we are going to skip or process any message other
+	 * than a duplicate, make note of it if we're in an
+	 * election so that the election can rerequest proactively.
+	 */
+	if (F_ISSET(rep, REP_F_READY_APPLY) && cmp >= 0)
+		F_SET(rep, REP_F_SKIPPED_APPLY);
+
+	/*
+	 * If we're in the middle of processing a NEWFILE, we've dropped
+	 * the mutex and if this matches it is a duplicate record. We
+	 * do not want this call taking the "matching" code below because
+	 * we may then process later records in the temp db and the
+	 * original NEWFILE may not have the log file ready. It will
+	 * process those temp db items when it completes.
+	 */
+	if (F_ISSET(rep, REP_F_NEWFILE) && cmp == 0)
+		cmp = -1;
+
+	if (cmp == 0) {
+		/*
+		 * If we are in an election (i.e. we've sent a vote
+		 * with an LSN in it), then we drop the next record
+		 * we're expecting. When we find a master, we'll
+		 * either go into sync, or if it was an existing
+		 * master, rerequest this one record (later records
+		 * are accumulating in the temp db).
+		 *
+		 * We can simply return here, and rep_process_message
+		 * will set NOTPERM if necessary for this record.
+		 */
+		if (F_ISSET(rep, REP_F_READY_APPLY)) {
+			/*
+			 * We will simply return now. All special return
+			 * processing should be ignored because the special
+			 * values are just initialized. Variables like
+			 * max_lsn are still 0.
+			 */
+			RPRINT(env, DB_VERB_REP_MISC, (env,
+			    "rep_apply: In election. Ignoring [%lu][%lu]",
+			    (u_long)rp->lsn.file, (u_long)rp->lsn.offset));
+			REP_SYSTEM_UNLOCK(env);
+			MUTEX_UNLOCK(env, rep->mtx_clientdb);
+			goto out;
+		}
+		rep->apply_th++;
+		set_apply = 1;
+		RPRINT(env, DB_VERB_REP_MISC, (env,
+		    "rep_apply: Set apply_th %d", rep->apply_th));
+		REP_SYSTEM_UNLOCK(env);
+		if (rp->rectype == REP_NEWFILE)
+			newfile_seen = 1;
+		if ((ret = __rep_process_rec(env, ip,
+		    rp, rec, &max_ts, &max_lsn)) != 0)
+			goto err;
+		/*
+		 * If we get the record we are expecting, reset
+		 * the count of records we've received and are applying
+		 * towards the request interval.
+		 */
+		__os_gettime(env, &lp->rcvd_ts, 1);
+		ZERO_LSN(lp->max_wait_lsn);
+
+		/*
+		 * The __rep_remfirst() and __rep_getnext() functions each open,
+		 * use and then close a cursor on the temp db, each time through
+		 * the loop. Although this may seem excessive, it is necessary
+		 * to avoid locking problems with checkpoints.
+		 */
+		while (ret == 0 &&
+		    LOG_COMPARE(&lp->ready_lsn, &lp->waiting_lsn) == 0) {
+			/*
+			 * We just filled in a gap in the log record stream.
+			 * Write subsequent records to the log.
+			 */
+gap_check:
+			if ((ret = __rep_remfirst(env, ip,
+			    &control_dbt, &rec_dbt)) != 0)
+				goto err;
+
+			/*
+			 * From here on rp and rec refer to the record just
+			 * pulled out of the temp db, not the caller's message.
+			 */
+			rp = (__rep_control_args *)control_dbt.data;
+			timespecset(&msg_time, rp->msg_sec, rp->msg_nsec);
+			rec = &rec_dbt;
+			if (rp->rectype == REP_NEWFILE)
+				newfile_seen = 1;
+			if ((ret = __rep_process_rec(env, ip,
+			    rp, rec, &max_ts, &max_lsn)) != 0)
+				goto err;
+
+			--rep->stat.st_log_queued;
+
+			/*
+			 * Since we just filled a gap in the log stream, and
+			 * we're writing subsequent records to the log, we want
+			 * to use rcvd_ts and wait_ts so that we will
+			 * request the next gap if we end up with a gap and
+			 * not so recent records in the temp db, but not
+			 * request if recent records are in the temp db and
+			 * likely to arrive on its own shortly. We want to
+			 * avoid requesting the record in that case. Also
+			 * reset max_wait_lsn because the next gap is a
+			 * fresh gap.
+			 */
+			lp->rcvd_ts = lp->last_ts;
+			lp->wait_ts = rep->request_gap;
+			if ((ret = __rep_getnext(env, ip)) == DB_NOTFOUND) {
+				__os_gettime(env, &lp->rcvd_ts, 1);
+				ret = 0;
+				break;
+			} else if (ret != 0)
+				goto err;
+		}
+
+		/*
+		 * Check if we're at a gap in the table and if so, whether we
+		 * need to ask for any records.
+		 */
+		if (!IS_ZERO_LSN(lp->waiting_lsn) &&
+		    LOG_COMPARE(&lp->ready_lsn, &lp->waiting_lsn) != 0) {
+			/*
+			 * We got a record and processed it, but we may
+			 * still be waiting for more records. If we
+			 * filled a gap we keep a count of how many other
+			 * records are in the temp database and if we should
+			 * request the next gap at this time.
+			 */
+			if (__rep_check_doreq(env, rep) && (ret =
+			    __rep_loggap_req(env, rep, &rp->lsn, 0)) != 0)
+				goto err;
+		} else {
+			lp->wait_ts = rep->request_gap;
+			ZERO_LSN(lp->max_wait_lsn);
+		}
+
+	} else if (cmp > 0) {
+		/*
+		 * The LSN is higher than the one we were waiting for.
+		 * This record isn't in sequence; add it to the temporary
+		 * database, update waiting_lsn if necessary, and perform
+		 * calculations to determine if we should issue requests
+		 * for new records.
+		 */
+		REP_SYSTEM_UNLOCK(env);
+		memset(&key_dbt, 0, sizeof(key_dbt));
+		key_dbt.data = rp;
+		key_dbt.size = sizeof(*rp);
+		ret = __db_put(dbp, ip, NULL, &key_dbt, rec, DB_NOOVERWRITE);
+		if (ret == 0) {
+			rep->stat.st_log_queued++;
+			__os_gettime(env, &lp->last_ts, 1);
+#ifdef HAVE_STATISTICS
+			STAT(rep->stat.st_log_queued_total++);
+			if (rep->stat.st_log_queued_max <
+			    rep->stat.st_log_queued)
+				rep->stat.st_log_queued_max =
+				    rep->stat.st_log_queued;
+#endif
+		}
+
+		/* A record that is already queued is not an error. */
+		if (ret == DB_KEYEXIST)
+			ret = 0;
+		if (ret != 0)
+			goto done;
+
+		if (IS_ZERO_LSN(lp->waiting_lsn) ||
+		    LOG_COMPARE(&rp->lsn, &lp->waiting_lsn) < 0) {
+			/*
+			 * If this is a new gap, then reset the rcvd_ts so
+			 * that an out-of-order record after an idle period
+			 * does not (likely) immediately rerequest.
+			 */
+			if (IS_ZERO_LSN(lp->waiting_lsn))
+				__os_gettime(env, &lp->rcvd_ts, 1);
+			lp->waiting_lsn = rp->lsn;
+		}
+
+		/*
+		 * The assignment must be parenthesized on its own so that
+		 * ret holds the callee's error code rather than the result
+		 * of the "!= 0" comparison.
+		 */
+		if (__rep_check_doreq(env, rep) &&
+		    (ret = __rep_loggap_req(env, rep, &rp->lsn, 0)) != 0)
+			goto err;
+
+		/*
+		 * If this is permanent; let the caller know that we have
+		 * not yet written it to disk, but we've accepted it.
+		 */
+		if (ret == 0 && F_ISSET(rp, REPCTL_PERM)) {
+			max_lsn = rp->lsn;
+			ret = DB_REP_NOTPERM;
+		}
+		goto done;
+	} else {
+		STAT(rep->stat.st_log_duplicated++);
+		REP_SYSTEM_UNLOCK(env);
+		if (is_dupp != NULL)
+			*is_dupp = 1;
+		LOGCOPY_32(env, &rectype, rec->data);
+		if (rectype == DB___txn_regop || rectype == DB___txn_ckp)
+			max_lsn = lp->max_perm_lsn;
+		/*
+		 * We check REPCTL_LEASE here, because this client may
+		 * have leases configured but the master may not (especially
+		 * in a mixed version group). If the master has leases
+		 * configured, all clients must also.
+		 */
+		if (IS_USING_LEASES(env) &&
+		    F_ISSET(rp, REPCTL_LEASE) &&
+		    timespecisset(&msg_time)) {
+			if (timespeccmp(&msg_time, &lp->max_lease_ts, >))
+				max_ts = msg_time;
+			else
+				max_ts = lp->max_lease_ts;
+		}
+		goto done;
+	}
+
+	/* Check if we need to go back into the table. */
+	if (ret == 0 && LOG_COMPARE(&lp->ready_lsn, &lp->waiting_lsn) == 0)
+		goto gap_check;
+
+done:
+err:	/*
+	 * In case of a race, to make sure only one thread can get
+	 * DB_REP_LOGREADY, zero out rep->last_lsn to show that we've gotten to
+	 * this point.
+	 */
+	REP_SYSTEM_LOCK(env);
+	if (ret == 0 &&
+	    F_ISSET(rep, REP_F_RECOVER_LOG) &&
+	    !IS_ZERO_LSN(rep->last_lsn) &&
+	    LOG_COMPARE(&lp->ready_lsn, &rep->last_lsn) >= 0) {
+		*last_lsnp = max_lsn;
+		ZERO_LSN(rep->last_lsn);
+		ZERO_LSN(max_lsn);
+		ret = DB_REP_LOGREADY;
+	}
+	/*
+	 * Only decrement if we were actually applying log records.
+	 * We do not care if we processed a dup record or put one
+	 * in the temp db.
+	 */
+	if (set_apply) {
+		rep->apply_th--;
+		RPRINT(env, DB_VERB_REP_MISC, (env,
+		    "rep_apply: Decrement apply_th %d [%lu][%lu]",
+		    rep->apply_th, (u_long)lp->ready_lsn.file,
+		    (u_long)lp->ready_lsn.offset));
+	}
+
+	if (ret == 0 && !F_ISSET(rep, REP_F_RECOVER_LOG) &&
+	    !IS_ZERO_LSN(max_lsn)) {
+		if (ret_lsnp != NULL)
+			*ret_lsnp = max_lsn;
+		ret = DB_REP_ISPERM;
+		DB_ASSERT(env, LOG_COMPARE(&max_lsn, &lp->max_perm_lsn) >= 0);
+		lp->max_perm_lsn = max_lsn;
+	}
+
+	/*
+	 * Start-up is complete when we process (or have already processed) up
+	 * to the end of the replication group's log. In case we miss that
+	 * message, as a back-up, we also recognize start-up completion when we
+	 * actually process a live log record. Having cmp==0 here (with a good
+	 * "ret" value) implies we actually processed the record.
+	 */
+	if ((ret == 0 || ret == DB_REP_ISPERM) &&
+	    rep->stat.st_startup_complete == 0 &&
+	    !F_ISSET(rep, REP_F_RECOVER_LOG) &&
+	    ((cmp <= 0 && F_ISSET(rp, REPCTL_LOG_END)) ||
+	    (cmp == 0 && !F_ISSET(rp, REPCTL_RESEND)))) {
+		rep->stat.st_startup_complete = 1;
+		event = 1;
+		gen = rep->gen;
+		master = rep->master_id;
+	}
+	REP_SYSTEM_UNLOCK(env);
+	/*
+	 * If we've processed beyond the needed LSN for a pending
+	 * start sync, start it now. We can compare >= here
+	 * because ready_lsn is the next record we expect.
+	 * Since ckp_lsn can point to the last commit record itself,
+	 * but if it does and ready_lsn == commit (i.e. we haven't
+	 * written the commit yet), we can still start to sync
+	 * because we're guaranteed no additional buffers can
+	 * be dirtied.
+	 */
+	if (!IS_ZERO_LSN(rep->ckp_lsn) &&
+	    LOG_COMPARE(&lp->ready_lsn, &rep->ckp_lsn) >= 0) {
+		save_lsn = rep->ckp_lsn;
+		ZERO_LSN(rep->ckp_lsn);
+	} else
+		ZERO_LSN(save_lsn);
+
+	/*
+	 * If this is a perm record, we are using leases, update the lease
+	 * grant. We must hold the clientdb mutex. We must not hold
+	 * the region mutex because rep_update_grant will acquire it.
+	 */
+	if (ret == DB_REP_ISPERM && IS_USING_LEASES(env) &&
+	    timespecisset(&max_ts)) {
+		if ((t_ret = __rep_update_grant(env, &max_ts)) != 0)
+			ret = t_ret;
+		else if (timespeccmp(&max_ts, &lp->max_lease_ts, >))
+			lp->max_lease_ts = max_ts;
+	}
+
+	MUTEX_UNLOCK(env, rep->mtx_clientdb);
+	if (!IS_ZERO_LSN(save_lsn)) {
+		/*
+		 * Now call memp_sync holding only the ckp mutex.
+		 */
+		MUTEX_LOCK(env, rep->mtx_ckp);
+		RPRINT(env, DB_VERB_REP_MISC, (env,
+		    "Starting delayed __memp_sync call [%lu][%lu]",
+		    (u_long)save_lsn.file, (u_long)save_lsn.offset));
+		t_ret = __memp_sync(env,
+		    DB_SYNC_CHECKPOINT | DB_SYNC_INTERRUPT_OK, &save_lsn);
+		MUTEX_UNLOCK(env, rep->mtx_ckp);
+	}
+	if (event) {
+		RPRINT(env, DB_VERB_REP_MISC, (env,
+		    "Start-up is done [%lu][%lu]",
+		    (u_long)rp->lsn.file, (u_long)rp->lsn.offset));
+
+		if ((t_ret = __rep_fire_startupdone(env, gen, master)) != 0) {
+			DB_ASSERT(env, ret == 0 || ret == DB_REP_ISPERM);
+			/*
+			 * Failure trumps either of those values. Fall
+			 * through instead of jumping to "out" so that the
+			 * DBTs filled in by __rep_remfirst are still freed;
+			 * the autoremove step below is skipped because ret
+			 * is now an error.
+			 */
+			ret = t_ret;
+		}
+	}
+	if ((ret == 0 || ret == DB_REP_ISPERM) &&
+	    newfile_seen && lp->db_log_autoremove)
+		__log_autoremove(env);
+	if (control_dbt.data != NULL)
+		__os_ufree(env, control_dbt.data);
+	if (rec_dbt.data != NULL)
+		__os_ufree(env, rec_dbt.data);
+
+out:
+	switch (ret) {
+	case 0:
+		break;
+	case DB_REP_ISPERM:
+		RPRINT(env, DB_VERB_REP_MSGS,
+		    (env, "Returning ISPERM [%lu][%lu], cmp = %d",
+		    (u_long)max_lsn.file, (u_long)max_lsn.offset, cmp));
+		break;
+	case DB_REP_LOGREADY:
+		RPRINT(env, DB_VERB_REP_MSGS, (env,
+		    "Returning LOGREADY up to [%lu][%lu], cmp = %d",
+		    (u_long)last_lsnp->file,
+		    (u_long)last_lsnp->offset, cmp));
+		break;
+	case DB_REP_NOTPERM:
+		if (!F_ISSET(rep, REP_F_RECOVER_LOG) &&
+		    !IS_ZERO_LSN(max_lsn) && ret_lsnp != NULL)
+			*ret_lsnp = max_lsn;
+
+		RPRINT(env, DB_VERB_REP_MSGS,
+		    (env, "Returning NOTPERM [%lu][%lu], cmp = %d",
+		    (u_long)max_lsn.file, (u_long)max_lsn.offset, cmp));
+		break;
+	default:
+		RPRINT(env, DB_VERB_REP_MSGS,
+		    (env, "Returning %d [%lu][%lu], cmp = %d", ret,
+		    (u_long)max_lsn.file, (u_long)max_lsn.offset, cmp));
+		break;
+	}
+
+	return (ret);
+}
+
+/*
+ * __rep_process_txn --
+ *
+ * This is the routine that actually gets a transaction ready for
+ * processing.
+ *
+ * rec holds the log record that ends the transaction: a txn_regop
+ * (commit) record or a txn_prepare record. The routine acquires the
+ * locks listed in that record, collects and sorts every LSN belonging
+ * to the transaction, and dispatches each log record with DB_TXN_APPLY.
+ *
+ * Returns 0 on success (including the case of a regop record whose
+ * opcode is not TXN_COMMIT, which is ignored), otherwise the first
+ * error encountered.
+ *
+ * PUBLIC: int __rep_process_txn __P((ENV *, DBT *));
+ */
+int
+__rep_process_txn(env, rec)
+ ENV *env;
+ DBT *rec;
+{
+ DBT data_dbt, *lock_dbt;
+ DB_LOCKER *locker;
+ DB_LOCKREQ req, *lvp;
+ DB_LOGC *logc;
+ DB_LSN prev_lsn, *lsnp;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ DB_TXNHEAD *txninfo;
+ LSN_COLLECTION lc;
+ REP *rep;
+ __txn_regop_args *txn_args;
+ __txn_regop_42_args *txn42_args;
+ __txn_prepare_args *prep_args;
+ u_int32_t rectype;
+ u_int i;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ /*
+ * Everything the cleanup code at err1 tests is initialized to NULL
+ * up front so that it only releases what was actually acquired.
+ */
+ logc = NULL;
+ txn_args = NULL;
+ txn42_args = NULL;
+ prep_args = NULL;
+ txninfo = NULL;
+
+ /*
+ * NOTE(review): no matching ENV_LEAVE is visible on any return path
+ * of this function; presumably the caller owns the thread state --
+ * confirm.
+ */
+ ENV_ENTER(env, ip);
+ memset(&data_dbt, 0, sizeof(data_dbt));
+ if (F_ISSET(env, ENV_THREAD))
+ F_SET(&data_dbt, DB_DBT_REALLOC);
+
+ /*
+ * There are two phases: First, we have to traverse backwards through
+ * the log records gathering the list of all LSNs in the transaction.
+ * Once we have this information, we can loop through and then apply it.
+ *
+ * We may be passed a prepare (if we're restoring a prepare on upgrade)
+ * instead of a commit (the common case). Check which it is and behave
+ * appropriately.
+ */
+ LOGCOPY_32(env, &rectype, rec->data);
+ memset(&lc, 0, sizeof(lc));
+ if (rectype == DB___txn_regop) {
+ /*
+ * We're the end of a transaction. Make sure this is
+ * really a commit and not an abort!
+ */
+ /* The regop record layout depends on the replication version. */
+ if (rep->version >= DB_REPVERSION_44) {
+ if ((ret = __txn_regop_read(
+ env, rec->data, &txn_args)) != 0)
+ return (ret);
+ if (txn_args->opcode != TXN_COMMIT) {
+ __os_free(env, txn_args);
+ return (0);
+ }
+ prev_lsn = txn_args->prev_lsn;
+ lock_dbt = &txn_args->locks;
+ } else {
+ if ((ret = __txn_regop_42_read(
+ env, rec->data, &txn42_args)) != 0)
+ return (ret);
+ if (txn42_args->opcode != TXN_COMMIT) {
+ __os_free(env, txn42_args);
+ return (0);
+ }
+ prev_lsn = txn42_args->prev_lsn;
+ lock_dbt = &txn42_args->locks;
+ }
+ } else {
+ /* We're a prepare. */
+ DB_ASSERT(env, rectype == DB___txn_prepare);
+
+ if ((ret = __txn_prepare_read(
+ env, rec->data, &prep_args)) != 0)
+ return (ret);
+ prev_lsn = prep_args->prev_lsn;
+ lock_dbt = &prep_args->locks;
+ }
+
+ /* Get locks. */
+ if ((ret = __lock_id(env, NULL, &locker)) != 0)
+ goto err1;
+
+ if ((ret =
+ __lock_get_list(env, locker, 0, DB_LOCK_WRITE, lock_dbt)) != 0)
+ goto err;
+
+ /* Phase 1. Get a list of the LSNs in this transaction, and sort it. */
+ if ((ret = __rep_collect_txn(env, &prev_lsn, &lc)) != 0)
+ goto err;
+ qsort(lc.array, lc.nlsns, sizeof(DB_LSN), __rep_lsn_cmp);
+
+ /*
+ * The set of records for a transaction may include dbreg_register
+ * records. Create a txnlist so that they can keep track of file
+ * state between records.
+ */
+ if ((ret = __db_txnlist_init(env, ip, 0, 0, NULL, &txninfo)) != 0)
+ goto err;
+
+ /* Phase 2: Apply updates. */
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ goto err;
+ for (lsnp = &lc.array[0], i = 0; i < lc.nlsns; i++, lsnp++) {
+ if ((ret = __logc_get(logc, lsnp, &data_dbt, DB_SET)) != 0) {
+ __db_errx(env, "failed to read the log at [%lu][%lu]",
+ (u_long)lsnp->file, (u_long)lsnp->offset);
+ goto err;
+ }
+ if ((ret = __db_dispatch(env, &env->recover_dtab,
+ &data_dbt, lsnp, DB_TXN_APPLY, txninfo)) != 0) {
+ __db_errx(env, "transaction failed at [%lu][%lu]",
+ (u_long)lsnp->file, (u_long)lsnp->offset);
+ goto err;
+ }
+ }
+
+ /*
+ * err: release every lock held by the locker and free the locker id;
+ * the first error seen is preserved in ret.
+ * err1: entered directly when __lock_id failed, so there is no locker
+ * to release; only memory, the log cursor and the txnlist are freed.
+ */
+err: memset(&req, 0, sizeof(req));
+ req.op = DB_LOCK_PUT_ALL;
+ if ((t_ret =
+ __lock_vec(env, locker, 0, &req, 1, &lvp)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = __lock_id_free(env, locker)) != 0 && ret == 0)
+ ret = t_ret;
+
+err1: if (txn_args != NULL)
+ __os_free(env, txn_args);
+ if (txn42_args != NULL)
+ __os_free(env, txn42_args);
+ if (prep_args != NULL)
+ __os_free(env, prep_args);
+ if (lc.array != NULL)
+ __os_free(env, lc.array);
+
+ if (logc != NULL && (t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (txninfo != NULL)
+ __db_txnlist_end(env, txninfo);
+
+ /* The buffer is only freed here when DB_DBT_REALLOC was set above. */
+ if (F_ISSET(&data_dbt, DB_DBT_REALLOC) && data_dbt.data != NULL)
+ __os_ufree(env, data_dbt.data);
+
+#ifdef HAVE_STATISTICS
+ if (ret == 0)
+ /*
+ * We don't hold the rep mutex, and could miscount if we race.
+ */
+ rep->stat.st_txns_applied++;
+#endif
+
+ return (ret);
+}
+
+/*
+ * __rep_collect_txn
+ *	Walk a transaction's log-record chain backwards starting at *lsnp,
+ * appending the LSN of every record to the collection "lc". Whenever a
+ * txn_child record is met, the child's own chain is collected by a
+ * recursive call before the walk continues with the parent's previous
+ * record, so the whole transaction family ends up in "lc".
+ *
+ * *lsnp is advanced as the chain is walked. Returns 0 on success or
+ * the first error encountered.
+ */
+static int
+__rep_collect_txn(env, lsnp, lc)
+	ENV *env;
+	DB_LSN *lsnp;
+	LSN_COLLECTION *lc;
+{
+	__txn_child_args *child;
+	DB_LOGC *logc;
+	DB_LSN child_lsn;
+	DBT rec_dbt;
+	u_int32_t rectype;
+	u_int grown;
+	int ret, t_ret;
+
+	memset(&rec_dbt, 0, sizeof(rec_dbt));
+	F_SET(&rec_dbt, DB_DBT_REALLOC);
+
+	if ((ret = __log_cursor(env, &logc)) != 0)
+		return (ret);
+
+	for (;;) {
+		/* A zero LSN marks the start of the chain: we are done. */
+		if (IS_ZERO_LSN(*lsnp))
+			break;
+
+		/* Only a failed log read is reported with a message. */
+		if ((ret = __logc_get(logc, lsnp, &rec_dbt, DB_SET)) != 0) {
+			__db_errx(env, "collect failed at: [%lu][%lu]",
+			    (u_long)lsnp->file, (u_long)lsnp->offset);
+			break;
+		}
+
+		LOGCOPY_32(env, &rectype, rec_dbt.data);
+		if (rectype != DB___txn_child) {
+			/* Grow the array (20 slots, then doubling) if full. */
+			if (lc->nalloc < lc->nlsns + 1) {
+				grown = lc->nalloc == 0 ? 20 : lc->nalloc * 2;
+				if ((ret = __os_realloc(env,
+				    grown * sizeof(DB_LSN), &lc->array)) != 0)
+					break;
+				lc->nalloc = grown;
+			}
+			lc->array[lc->nlsns++] = *lsnp;
+
+			/*
+			 * Explicitly copy the previous lsn. The record
+			 * starts with a u_int32_t record type, a u_int32_t
+			 * txn id, and then the DB_LSN (prev_lsn) that we
+			 * want. We copy explicitly because we have no idea
+			 * what kind of record this is.
+			 */
+			LOGCOPY_TOLSN(env, lsnp, (u_int8_t *)rec_dbt.data +
+			    sizeof(u_int32_t) + sizeof(u_int32_t));
+			continue;
+		}
+
+		/*
+		 * A child record: remember where the parent continues,
+		 * then collect the child's chain.
+		 */
+		if ((ret = __txn_child_read(
+		    env, rec_dbt.data, &child)) != 0)
+			break;
+		child_lsn = child->c_lsn;
+		*lsnp = child->prev_lsn;
+		__os_free(env, child);
+		if ((ret = __rep_collect_txn(env, &child_lsn, lc)) != 0)
+			break;
+	}
+
+	if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+		ret = t_ret;
+	if (rec_dbt.data != NULL)
+		__os_ufree(env, rec_dbt.data);
+	return (ret);
+}
+
+/*
+ * __rep_lsn_cmp --
+ *	Comparison callback with the signature qsort expects; both
+ * arguments are treated as pointers to DB_LSN and ordered by LOG_COMPARE.
+ */
+static int
+__rep_lsn_cmp(lsn1, lsn2)
+	const void *lsn1, *lsn2;
+{
+	DB_LSN *left, *right;
+
+	left = (DB_LSN *)lsn1;
+	right = (DB_LSN *)lsn2;
+
+	return (LOG_COMPARE(left, right));
+}
+
+/*
+ * __rep_newfile --
+ * NEWFILE messages have the LSN of the last record in the previous
+ * log file. When applying a NEWFILE message, make sure we haven't already
+ * swapped files. Assumes the caller holds mtx_clientdb; the mutex is
+ * dropped while the log file is switched and reacquired before returning.
+ *
+ * Returns 0 if the message is ignored (a NEWFILE is already in progress
+ * or this one has already been applied), otherwise the result of
+ * unmarshalling the message or of __log_newfile.
+ */
+static int
+__rep_newfile(env, rp, rec)
+ ENV *env;
+ __rep_control_args *rp;
+ DBT *rec;
+{
+ DB_LOG *dblp;
+ DB_LSN tmplsn;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ __rep_newfile_args nf_args;
+ int ret;
+
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ /*
+ * If a newfile is already in progress, just ignore.
+ */
+ if (F_ISSET(rep, REP_F_NEWFILE))
+ return (0);
+ if (rp->lsn.file + 1 > lp->ready_lsn.file) {
+ /*
+ * Determine the log version for the new file: from the control
+ * message when there is no record payload, as a raw u_int32_t
+ * for senders older than DB_REPVERSION_47, otherwise by
+ * unmarshalling the payload.
+ */
+ if (rec == NULL || rec->size == 0) {
+ RPRINT(env, DB_VERB_REP_MISC, (env,
+"rep_newfile: Old-style NEWFILE msg. Use control msg log version: %lu",
+ (u_long) rp->log_version));
+ nf_args.version = rp->log_version;
+ } else if (rp->rep_version < DB_REPVERSION_47)
+ /*
+ * NOTE(review): assumes rec->data is suitably aligned and
+ * at least sizeof(u_int32_t) bytes -- confirm.
+ */
+ nf_args.version = *(u_int32_t *)rec->data;
+ else if ((ret = __rep_newfile_unmarshal(env, &nf_args,
+ rec->data, rec->size, NULL)) != 0)
+ return (ret);
+ RPRINT(env, DB_VERB_REP_MISC,
+ (env, "rep_newfile: File %lu vers %lu",
+ (u_long)rp->lsn.file + 1, (u_long)nf_args.version));
+
+ /*
+ * We drop the mtx_clientdb mutex during
+ * the file operation, and then reacquire it when
+ * we're done. We avoid colliding with new incoming
+ * log records because lp->ready_lsn is not getting
+ * updated and there is no real log record at this
+ * ready_lsn. We avoid colliding with a duplicate
+ * NEWFILE message by setting an in-progress flag.
+ */
+ REP_SYSTEM_LOCK(env);
+ F_SET(rep, REP_F_NEWFILE);
+ REP_SYSTEM_UNLOCK(env);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ LOG_SYSTEM_LOCK(env);
+ ret = __log_newfile(dblp, &tmplsn, 0, nf_args.version);
+ LOG_SYSTEM_UNLOCK(env);
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ REP_SYSTEM_LOCK(env);
+ F_CLR(rep, REP_F_NEWFILE);
+ REP_SYSTEM_UNLOCK(env);
+ /* Only advance ready_lsn when the file switch succeeded. */
+ if (ret == 0)
+ lp->ready_lsn = tmplsn;
+ return (ret);
+ } else
+ /* We've already applied this NEWFILE. Just ignore it. */
+ return (0);
+}
+
+/*
+ * __rep_do_ckp --
+ *	Run the memp_sync a checkpoint record requires, with
+ * REP->mtx_clientdb released for the duration of the sync.  Callers must
+ * hold REP->mtx_clientdb on entry and must not hold the region mutex.
+ * On the paths that reach the end of the function the mutex is held again.
+ */
+static int
+__rep_do_ckp(env, rec, rp)
+	ENV *env;
+	DBT *rec;
+	__rep_control_args *rp;
+{
+	DB_ENV *dbenv;
+	DB_LSN ckp_lsn;
+	REP *rep;
+	__txn_ckp_args *ckp_args;
+	int ret;
+
+	dbenv = env->dbenv;
+	rep = env->rep_handle->region;
+
+	/* Decode the record; all we need from it is the checkpoint LSN. */
+	ret = __txn_ckp_read(env, rec->data, &ckp_args);
+	if (ret != 0)
+		return (ret);
+	ckp_lsn = ckp_args->ckp_lsn;
+	__os_free(env, ckp_args);
+
+	MUTEX_UNLOCK(env, rep->mtx_clientdb);
+	DB_TEST_WAIT(env, env->test_check);
+
+	/*
+	 * Flush the memory pool.
+	 *
+	 * ISPERM must not be returned for a checkpoint that has not truly
+	 * completed, so this sync is not allowed to be interrupted.
+	 *
+	 * A startsync for this same checkpoint may still be running; the
+	 * max_write throttling is suppressed for the duration so that any
+	 * in-flight cache flush finishes quickly.
+	 */
+	(void)__memp_set_config(dbenv, DB_MEMP_SUPPRESS_WRITE, 1);
+	MUTEX_LOCK(env, rep->mtx_ckp);
+	ret = __memp_sync(env, DB_SYNC_CHECKPOINT, &ckp_lsn);
+	MUTEX_UNLOCK(env, rep->mtx_ckp);
+	(void)__memp_set_config(dbenv, DB_MEMP_SUPPRESS_WRITE, 0);
+
+	if (ret != 0) {
+		/* A failed sync is fatal for the environment. */
+		__db_errx(env, "Error syncing ckp [%lu][%lu]",
+		    (u_long)ckp_lsn.file, (u_long)ckp_lsn.offset);
+		ret = __env_panic(env, ret);
+	} else
+		/* Record the new last_ckp in the txn region. */
+		ret = __txn_updateckp(env, &rp->lsn);
+
+	MUTEX_LOCK(env, rep->mtx_clientdb);
+	return (ret);
+}
+
+/*
+ * __rep_remfirst --
+ *	Fetch the first entry of the __db.rep.db into 'cntrl'/'rec' and
+ *	delete it.  Both DBTs are flagged DB_DBT_REALLOC so their contents
+ *	outlive the cursor.  The first error seen is returned; the cursor
+ *	is always closed.
+ */
+static int
+__rep_remfirst(env, ip, cntrl, rec)
+	ENV *env;
+	DB_THREAD_INFO *ip;
+	DBT *cntrl;
+	DBT *rec;
+{
+	DB *dbp;
+	DBC *dbc;
+	DB_REP *db_rep;
+	int close_ret, ret;
+
+	db_rep = env->rep_handle;
+	dbp = db_rep->rep_db;
+
+	ret = __db_cursor(dbp, ip, NULL, &dbc, 0);
+	if (ret != 0)
+		return (ret);
+
+	/* The DBTs need to persist through another call. */
+	F_SET(cntrl, DB_DBT_REALLOC);
+	F_SET(rec, DB_DBT_REALLOC);
+
+	ret = __dbc_get(dbc, cntrl, rec, DB_RMW | DB_FIRST);
+	if (ret == 0)
+		ret = __dbc_del(dbc, 0);
+
+	close_ret = __dbc_close(dbc);
+	if (ret == 0 && close_ret != 0)
+		ret = close_ret;
+
+	return (ret);
+}
+
+/*
+ * __rep_getnext --
+ *	Look at the first record in the __db.rep.db table and move
+ *	lp->waiting_lsn to its LSN, or zero waiting_lsn if the table is
+ *	empty.  DB_NOTFOUND is passed back to the caller in the empty case.
+ */
+static int
+__rep_getnext(env, ip)
+	ENV *env;
+	DB_THREAD_INFO *ip;
+{
+	DB *dbp;
+	DBC *dbc;
+	DBT lsn_dbt, nextrec_dbt;
+	DB_LOG *dblp;
+	DB_REP *db_rep;
+	LOG *lp;
+	__rep_control_args *rp;
+	int close_ret, ret;
+
+	db_rep = env->rep_handle;
+	dbp = db_rep->rep_db;
+	dblp = env->lg_handle;
+	lp = dblp->reginfo.primary;
+
+	ret = __db_cursor(dbp, ip, NULL, &dbc, 0);
+	if (ret != 0)
+		return (ret);
+
+	/*
+	 * Only the key (the control structure holding the LSN) is of
+	 * interest.  The data item is fetched as a zero-length partial
+	 * get so the record body is not copied.
+	 */
+	memset(&lsn_dbt, 0, sizeof(lsn_dbt));
+	memset(&nextrec_dbt, 0, sizeof(nextrec_dbt));
+	F_SET(&nextrec_dbt, DB_DBT_PARTIAL);
+	nextrec_dbt.ulen = nextrec_dbt.dlen = 0;
+
+	ret = __dbc_get(dbc, &lsn_dbt, &nextrec_dbt, DB_FIRST);
+	if (ret == 0) {
+		/* Advance waiting_lsn to the next queued record. */
+		rp = (__rep_control_args *)lsn_dbt.data;
+		lp->waiting_lsn = rp->lsn;
+	} else if (ret == DB_NOTFOUND) {
+		/* Nothing is queued, so there is nothing to wait for. */
+		ZERO_LSN(lp->waiting_lsn);
+	}
+
+	close_ret = __dbc_close(dbc);
+	if (ret == 0 && close_ret != 0)
+		ret = close_ret;
+	return (ret);
+}
+
+/*
+ * __rep_process_rec --
+ *
+ * Given a record in 'rp', process it. In the case of a NEWFILE, that means
+ * potentially switching files. In the case of a checkpoint, it means doing
+ * the checkpoint, and in other cases, it means simply writing the record into
+ * the log.
+ *
+ * Parameters:
+ * env - environment handle.
+ * ip - thread info, passed through to __txn_openfiles, __db_put and
+ * __rep_remfirst.
+ * rp - control structure of the message (rectype, lsn, flags, timestamp).
+ * rec - the message payload; for non-NEWFILE messages its first 4 bytes
+ * are read as the log record type.
+ * ret_tsp - set to the message timestamp when leases are in use and the
+ * control flags contain REPCTL_LEASE.
+ * ret_lsnp - set to rp->lsn when in REP_F_RECOVER_LOG, after a txn_regop,
+ * after a checkpoint that completed or was already queued, and
+ * whenever the function succeeds with REPCTL_PERM set.
+ *
+ * Returns 0, DB_REP_NOTPERM (checkpoint already present in the temporary
+ * database), or an error from the logging/dispatch calls.
+ *
+ * NOTE(review): in the NEWFILE case the result of __rep_newfile is stored in
+ * 'ret' and then 0 is returned unconditionally, so a failed file switch is
+ * not reported to the caller -- confirm this is intentional.
+ * NOTE(review): ret_lsnp is checked for NULL only in the DB_KEYEXIST branch
+ * and dereferenced unconditionally elsewhere -- presumably callers always
+ * pass a valid pointer; confirm.
+ */
+static int
+__rep_process_rec(env, ip, rp, rec, ret_tsp, ret_lsnp)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ __rep_control_args *rp;
+ DBT *rec;
+ db_timespec *ret_tsp;
+ DB_LSN *ret_lsnp;
+{
+ DB *dbp;
+ DBT control_dbt, key_dbt, rec_dbt;
+ DB_REP *db_rep;
+ REP *rep;
+ db_timespec msg_time;
+ u_int32_t rectype, txnid;
+ int ret, t_ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dbp = db_rep->rep_db;
+ ret = 0;
+
+ if (rp->rectype == REP_NEWFILE) {
+ ret = __rep_newfile(env, rp, rec);
+ return (0);
+ }
+
+ LOGCOPY_32(env, &rectype, rec->data);
+ memset(&control_dbt, 0, sizeof(control_dbt));
+ memset(&rec_dbt, 0, sizeof(rec_dbt));
+ timespecset(&msg_time, rp->msg_sec, rp->msg_nsec);
+
+ /*
+ * We write all records except for checkpoint records here.
+ * All non-checkpoint records need to appear in the log before
+ * we take action upon them (i.e., we enforce write-ahead logging).
+ * However, we can't write the checkpoint record here until the
+ * data buffers are actually written to disk, else we are creating
+ * an invalid log -- one that says all data before a certain point
+ * has been written to disk.
+ *
+ * If two threads are both processing the same checkpoint record
+ * (because, for example, it was resent and the original finally
+ * arrived), we handle that below by checking for the existence of
+ * the log record when we add it to the replication database.
+ *
+ * Any log records that arrive while we are processing the checkpoint
+ * are added to the bookkeeping database because ready_lsn is not yet
+ * updated to point after the checkpoint record.
+ *
+ * While REP_F_RECOVER_LOG is set every record, checkpoints included,
+ * is only written to the log; the switch below is skipped entirely.
+ */
+ if (rectype != DB___txn_ckp || F_ISSET(rep, REP_F_RECOVER_LOG)) {
+ if ((ret = __log_rep_put(env, &rp->lsn, rec, 0)) != 0)
+ return (ret);
+ STAT(rep->stat.st_log_records++);
+ if (F_ISSET(rep, REP_F_RECOVER_LOG)) {
+ *ret_lsnp = rp->lsn;
+ goto out;
+ }
+ }
+
+ switch (rectype) {
+ case DB___dbreg_register:
+ /*
+ * DB opens occur in the context of a transaction, so we can
+ * simply handle them when we process the transaction. Closes,
+ * however, are not transaction-protected, so we have to handle
+ * them here.
+ *
+ * It should be unsafe for the master to do a close of a file
+ * that was opened in an active transaction, so we should be
+ * guaranteed to get the ordering right.
+ *
+ * !!!
+ * The txn ID is the second 4-byte field of the log record.
+ * We should really be calling __dbreg_register_read() and
+ * working from the __dbreg_register_args structure, but this
+ * is considerably faster and the order of the fields won't
+ * change.
+ */
+ LOGCOPY_32(env, &txnid,
+ (u_int8_t *)rec->data + sizeof(u_int32_t));
+ if (txnid == TXN_INVALID)
+ ret = __db_dispatch(env, &env->recover_dtab,
+ rec, &rp->lsn, DB_TXN_APPLY, NULL);
+ break;
+ case DB___txn_regop:
+ /*
+ * If an application is doing app-specific recovery
+ * and acquires locks while applying a transaction,
+ * it can deadlock. Any other locks held by this
+ * thread should have been discarded in the
+ * __rep_process_txn error path, so if we simply
+ * retry, we should eventually succeed.
+ *
+ * DBREP_OPENFILES is set after the first __txn_openfiles
+ * attempt whether or not it succeeded, so a retry of this
+ * loop does not call __txn_openfiles again.
+ */
+ do {
+ ret = 0;
+ if (!F_ISSET(db_rep, DBREP_OPENFILES)) {
+ ret = __txn_openfiles(env, ip, NULL, 1);
+ F_SET(db_rep, DBREP_OPENFILES);
+ }
+ if (ret == 0)
+ ret = __rep_process_txn(env, rec);
+ } while (ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED);
+
+ /* Now flush the log unless we're running TXN_NOSYNC. */
+ if (ret == 0 && !F_ISSET(env->dbenv, DB_ENV_TXN_NOSYNC))
+ ret = __log_flush(env, NULL);
+ if (ret != 0) {
+ __db_errx(env, "Error processing txn [%lu][%lu]",
+ (u_long)rp->lsn.file, (u_long)rp->lsn.offset);
+ ret = __env_panic(env, ret);
+ }
+ *ret_lsnp = rp->lsn;
+ break;
+ case DB___txn_prepare:
+ ret = __log_flush(env, NULL);
+ /*
+ * Save the biggest prepared LSN we've seen.
+ * The assignment happens whether or not the flush above
+ * succeeded.
+ */
+ rep->max_prep_lsn = rp->lsn;
+ RPRINT(env, DB_VERB_REP_MSGS,
+ (env, "process_rec: prepare at [%lu][%lu]",
+ (u_long)rep->max_prep_lsn.file,
+ (u_long)rep->max_prep_lsn.offset));
+ break;
+ case DB___txn_ckp:
+ /*
+ * We do not want to hold the REP->mtx_clientdb mutex while
+ * syncing the mpool, so if we get a checkpoint record we are
+ * supposed to process, add it to the __db.rep.db, do the
+ * memp_sync and then go back and process it later, when the
+ * sync has finished. If this record is already in the table,
+ * then some other thread will process it, so simply return
+ * REP_NOTPERM.
+ */
+ memset(&key_dbt, 0, sizeof(key_dbt));
+ key_dbt.data = rp;
+ key_dbt.size = sizeof(*rp);
+
+ /*
+ * We want to put this record into the tmp DB only if
+ * it doesn't exist, so use DB_NOOVERWRITE.
+ */
+ ret = __db_put(dbp, ip, NULL, &key_dbt, rec, DB_NOOVERWRITE);
+ if (ret == DB_KEYEXIST) {
+ if (ret_lsnp != NULL)
+ *ret_lsnp = rp->lsn;
+ ret = DB_REP_NOTPERM;
+ }
+ if (ret != 0)
+ break;
+
+ /*
+ * Now, do the checkpoint. Regardless of
+ * whether the checkpoint succeeds or not,
+ * we need to remove the record we just put
+ * in the temporary database. If the
+ * checkpoint failed, return an error. We
+ * will act like we never received the
+ * checkpoint.
+ *
+ * The removed entry is read into control_dbt/rec_dbt, whose
+ * buffers are released at the end of this function.
+ */
+ if ((ret = __rep_do_ckp(env, rec, rp)) == 0)
+ ret = __log_rep_put(env, &rp->lsn, rec,
+ DB_LOG_CHKPNT);
+ if ((t_ret = __rep_remfirst(env, ip,
+ &control_dbt, &rec_dbt)) != 0 && ret == 0)
+ ret = t_ret;
+ /*
+ * If we're successful putting the log record in the
+ * log, flush it for a checkpoint.
+ */
+ if (ret == 0) {
+ *ret_lsnp = rp->lsn;
+ ret = __log_flush(env, NULL);
+ }
+ break;
+ default:
+ break;
+ }
+
+out:
+ if (ret == 0 && F_ISSET(rp, REPCTL_PERM))
+ *ret_lsnp = rp->lsn;
+ if (IS_USING_LEASES(env) &&
+ F_ISSET(rp, REPCTL_LEASE))
+ *ret_tsp = msg_time;
+ /*
+ * Set ret_lsnp before flushing the log because if the
+ * flush fails, we've still written the record to the
+ * log and the LSN has been entered.
+ */
+ if (ret == 0 && F_ISSET(rp, REPCTL_FLUSH))
+ ret = __log_flush(env, NULL);
+ if (control_dbt.data != NULL)
+ __os_ufree(env, control_dbt.data);
+ if (rec_dbt.data != NULL)
+ __os_ufree(env, rec_dbt.data);
+
+ return (ret);
+}
+
+/*
+ * __rep_resend_req --
+ *	A message may have been dropped, so repeat our outstanding request.
+ *	Which request is sent depends on the recovery state recorded in the
+ *	rep flags.  The caller holds no locks.
+ *
+ *	'rereq' non-zero marks page/log gap requests as re-requests.
+ *	Returns 0 except when a page or log gap request fails.
+ *
+ * PUBLIC: int __rep_resend_req __P((ENV *, int));
+ */
+int
+__rep_resend_req(env, rereq)
+	ENV *env;
+	int rereq;
+{
+	DB_LOG *dblp;
+	DB_LSN lsn, *lsnp;
+	DB_REP *db_rep;
+	LOG *lp;
+	REP *rep;
+	int master, ret;
+	u_int32_t gapflags, msgtype, repflags, sendflags;
+
+	db_rep = env->rep_handle;
+	rep = db_rep->region;
+	dblp = env->lg_handle;
+	lp = dblp->reginfo.primary;
+
+	/* Work from a single snapshot of the flags. */
+	repflags = rep->flags;
+
+	/* A delayed client does not re-request anything. */
+	if (FLD_ISSET(repflags, REP_F_DELAY))
+		return (0);
+	gapflags = rereq ? REP_GAP_REREQUEST : 0;
+
+	if (FLD_ISSET(repflags, REP_F_RECOVER_VERIFY)) {
+		MUTEX_LOCK(env, rep->mtx_clientdb);
+		lsn = lp->verify_lsn;
+		MUTEX_UNLOCK(env, rep->mtx_clientdb);
+		/* No verify LSN outstanding means nothing to resend. */
+		if (IS_ZERO_LSN(lsn))
+			return (0);
+		msgtype = REP_VERIFY_REQ;
+		lsnp = &lsn;
+		sendflags = DB_REP_REREQUEST;
+	} else if (FLD_ISSET(repflags, REP_F_RECOVER_UPDATE)) {
+		/* UPDATE_REQ only goes to the master. */
+		msgtype = REP_UPDATE_REQ;
+		lsnp = NULL;
+		sendflags = 0;
+	} else if (FLD_ISSET(repflags, REP_F_RECOVER_PAGE)) {
+		/* The page gap helper sends its own request. */
+		REP_SYSTEM_LOCK(env);
+		ret = __rep_pggap_req(env, rep, NULL, gapflags);
+		REP_SYSTEM_UNLOCK(env);
+		return (ret);
+	} else {
+		/* Normal operation: ask for the missing log records. */
+		MUTEX_LOCK(env, rep->mtx_clientdb);
+		ret = __rep_loggap_req(env, rep, NULL, gapflags);
+		MUTEX_UNLOCK(env, rep->mtx_clientdb);
+		return (ret);
+	}
+
+	/*
+	 * Verify and update requests are addressed to the master; if we
+	 * do not know who that is, ask the group instead.
+	 */
+	master = rep->master_id;
+	if (master == DB_EID_INVALID)
+		(void)__rep_send_message(env,
+		    DB_EID_BROADCAST, REP_MASTER_REQ, NULL, NULL, 0, 0);
+	else
+		(void)__rep_send_message(env,
+		    master, msgtype, lsnp, NULL, 0, sendflags);
+
+	return (0);
+}
+
+/*
+ * __rep_check_doreq --
+ * PUBLIC: int __rep_check_doreq __P((ENV *, REP *));
+ *
+ *	Decide whether another request should be sent.  The time elapsed
+ *	since lp->rcvd_ts is compared with the current wait interval
+ *	lp->wait_ts; when it has been reached the interval is doubled
+ *	(capped at rep->max_gap) and rcvd_ts is reset to now.  The caller
+ *	is assumed to hold REP->mtx_clientdb.  Returns non-zero if a
+ *	request needs to be made and 0 otherwise.
+ */
+int
+__rep_check_doreq(env, rep)
+	ENV *env;
+	REP *rep;
+{
+	DB_LOG *dblp;
+	LOG *lp;
+	db_timespec elapsed;
+	int req;
+
+	dblp = env->lg_handle;
+	lp = dblp->reginfo.primary;
+
+	/* elapsed = now - time of last receipt. */
+	__os_gettime(env, &elapsed, 1);
+	timespecsub(&elapsed, &lp->rcvd_ts);
+
+	req = timespeccmp(&elapsed, &lp->wait_ts, >=);
+	if (!req)
+		return (req);
+
+	/* Back off: adding wait_ts to itself doubles it. */
+	timespecadd(&lp->wait_ts, &lp->wait_ts);
+	if (timespeccmp(&lp->wait_ts, &rep->max_gap, >))
+		lp->wait_ts = rep->max_gap;
+	__os_gettime(env, &lp->rcvd_ts, 1);
+
+	return (req);
+}
+
+/*
+ * __rep_skip_msg -
+ *
+ *	Called for a message that is being skipped/ignored (for example
+ *	while in recovery).  Decides whether the skip should trigger a
+ *	re-request and, if so, to whom it is sent.  Returns 0, or the
+ *	result of __rep_resend_req when that path is taken.
+ */
+static int
+__rep_skip_msg(env, rep, eid, rectype)
+	ENV *env;
+	REP *rep;
+	int eid;
+	u_int32_t rectype;
+{
+	int do_req;
+
+	if (F_ISSET(rep, REP_F_CLIENT) && REP_MSG_REQ(rectype)) {
+		/*
+		 * A client skipping a request message always answers
+		 * right away so the requester can look elsewhere.
+		 */
+		do_req = 1;
+	} else {
+		/* Otherwise the request timers decide. */
+		MUTEX_LOCK(env, rep->mtx_clientdb);
+		do_req = __rep_check_doreq(env, rep);
+		MUTEX_UNLOCK(env, rep->mtx_clientdb);
+	}
+
+	/* Never answer a MASTER_REQ with a MASTER_REQ or REREQUEST. */
+	if (!do_req || rectype == REP_MASTER_REQ)
+		return (0);
+
+	/* Case 1: the master is unknown, so ask the group who it is. */
+	if (rep->master_id == DB_EID_INVALID) {
+		(void)__rep_send_message(env,
+		    DB_EID_BROADCAST, REP_MASTER_REQ, NULL, NULL, 0, 0);
+		return (0);
+	}
+
+	/* Case 2: the skipped message came from the master; re-request. */
+	if (eid == rep->master_id)
+		return (__rep_resend_req(env, 0));
+
+	/*
+	 * Case 3: client-to-client traffic.  If we are a client, tell the
+	 * sender to re-request elsewhere.
+	 */
+	if (F_ISSET(rep, REP_F_CLIENT))
+		(void)__rep_send_message(env,
+		    eid, REP_REREQUEST, NULL, NULL, 0, 0);
+	return (0);
+}
+
+/*
+ * __rep_fire_newmaster --
+ *	Deliver DB_EVENT_REP_NEWMASTER for generation 'gen' unless an event
+ *	for that generation (or a later one) has already been delivered.
+ *	Always returns 0.
+ */
+static int
+__rep_fire_newmaster(env, gen, master)
+	ENV *env;
+	u_int32_t gen;
+	int master;
+{
+	DB_REP *db_rep;
+	REP *rep;
+
+	db_rep = env->rep_handle;
+	rep = db_rep->region;
+
+	REP_EVENT_LOCK(env);
+	/*
+	 * Firing must be idempotent per generation number, so only a
+	 * generation newer than the last one recorded triggers the event.
+	 */
+	if (gen > rep->newmaster_event_gen) {
+		__rep_fire_event(env, DB_EVENT_REP_NEWMASTER, &master);
+		rep->newmaster_event_gen = gen;
+	}
+	REP_EVENT_UNLOCK(env);
+
+	return (0);
+}
+
+/*
+ * __rep_fire_startupdone --
+ *	Deliver DB_EVENT_REP_STARTUPDONE for generation 'gen', first
+ *	delivering DB_EVENT_REP_NEWMASTER if that has not yet been fired
+ *	for this generation.  Always returns 0.
+ */
+static int
+__rep_fire_startupdone(env, gen, master)
+	ENV *env;
+	u_int32_t gen;
+	int master;
+{
+	DB_REP *db_rep;
+	REP *rep;
+
+	db_rep = env->rep_handle;
+	rep = db_rep->region;
+
+	REP_EVENT_LOCK(env);
+
+	/*
+	 * NEWMASTER has normally been fired by now.  When it has not,
+	 * fire it first so the application sees the events in the
+	 * expected order.
+	 */
+	if (gen > rep->newmaster_event_gen) {
+		__rep_fire_event(env, DB_EVENT_REP_NEWMASTER, &master);
+		rep->newmaster_event_gen = gen;
+	}
+
+	/*
+	 * The caller is relied upon to attempt STARTUPDONE only once per
+	 * generation.  Removing that reliance would take a boolean under
+	 * mtx_event meaning "STARTUPDONE has been fired for the generation
+	 * stored in `newmaster_event_gen'", tested and set here in place
+	 * of the generation comparison.
+	 */
+	if (gen == rep->newmaster_event_gen)
+		__rep_fire_event(env, DB_EVENT_REP_STARTUPDONE, NULL);
+
+	REP_EVENT_UNLOCK(env);
+	return (0);
+}
diff --git a/rep/rep_region.c b/rep/rep_region.c
new file mode 100644
index 0000000..9eacb2c
--- /dev/null
+++ b/rep/rep_region.c
@@ -0,0 +1,488 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/log.h"
+
+static int __rep_egen_init __P((ENV *, REP *));
+static int __rep_gen_init __P((ENV *, REP *));
+
+/*
+ * __rep_open --
+ * Initialize the shared memory state for the replication system.
+ *
+ * If the environment region has no replication area yet (renv->rep_off is
+ * INVALID_ROFF) a REP structure is allocated, zeroed, given its mutexes and
+ * filled in from the DB_REP handle; otherwise the existing REP structure is
+ * located through renv->rep_off and checked for an application type
+ * mismatch (EINVAL). On success db_rep->region points at the REP structure
+ * and 0 is returned.
+ *
+ * Ordering in the creation path matters: rep->config is copied before
+ * __rep_gen_init runs (the gen/egen write helpers consult rep->config for
+ * REP_C_INMEM), and __rep_gen_init runs before __rep_egen_init (which derives
+ * egen from rep->gen when no egen file exists).
+ *
+ * NOTE(review): the error returns in the creation path do not release the
+ * REP allocation or any mutexes already allocated, and renv->rep_off is
+ * still INVALID_ROFF at that point -- confirm the caller cleans up.
+ *
+ * PUBLIC: int __rep_open __P((ENV *));
+ */
+int
+__rep_open(env)
+ ENV *env;
+{
+ DB_REP *db_rep;
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+ infop = env->reginfo;
+ renv = infop->primary;
+ ret = 0;
+
+ if (renv->rep_off == INVALID_ROFF) {
+ /* Must create the region. */
+ if ((ret = __env_alloc(infop, sizeof(REP), &rep)) != 0)
+ return (ret);
+ memset(rep, 0, sizeof(*rep));
+
+ /*
+ * We have the region; fill in the values. Some values may
+ * have been configured before we open the region, and those
+ * are taken from the DB_REP structure.
+ */
+ if ((ret = __mutex_alloc(
+ env, MTX_REP_REGION, 0, &rep->mtx_region)) != 0)
+ return (ret);
+ /*
+ * Because we have no way to prevent deadlocks and cannot log
+ * changes made to it, we single-thread access to the client
+ * bookkeeping database. This is suboptimal, but it only gets
+ * accessed when messages arrive out-of-order, so it should
+ * stay small and not be used in a high-performance app.
+ */
+ if ((ret = __mutex_alloc(
+ env, MTX_REP_DATABASE, 0, &rep->mtx_clientdb)) != 0)
+ return (ret);
+
+ if ((ret = __mutex_alloc(
+ env, MTX_REP_CHKPT, 0, &rep->mtx_ckp)) != 0)
+ return (ret);
+
+ if ((ret = __mutex_alloc(
+ env, MTX_REP_EVENT, 0, &rep->mtx_event)) != 0)
+ return (ret);
+
+ rep->newmaster_event_gen = 0;
+ rep->notified_egen = 0;
+ rep->lease_off = INVALID_ROFF;
+ rep->tally_off = INVALID_ROFF;
+ rep->v2tally_off = INVALID_ROFF;
+ rep->eid = db_rep->eid;
+ rep->master_id = DB_EID_INVALID;
+ rep->gen = 0;
+ rep->version = DB_REPVERSION;
+ rep->config = db_rep->config;
+ if ((ret = __rep_gen_init(env, rep)) != 0)
+ return (ret);
+ if ((ret = __rep_egen_init(env, rep)) != 0)
+ return (ret);
+ rep->gbytes = db_rep->gbytes;
+ rep->bytes = db_rep->bytes;
+ rep->request_gap = db_rep->request_gap;
+ rep->max_gap = db_rep->max_gap;
+ rep->config_nsites = db_rep->config_nsites;
+ rep->elect_timeout = db_rep->elect_timeout;
+ rep->full_elect_timeout = db_rep->full_elect_timeout;
+ rep->lease_timeout = db_rep->lease_timeout;
+ rep->clock_skew = db_rep->clock_skew;
+ rep->clock_base = db_rep->clock_base;
+ timespecclear(&rep->lease_duration);
+ timespecclear(&rep->grant_expire);
+ rep->chkpt_delay = db_rep->chkpt_delay;
+ rep->priority = db_rep->my_priority;
+
+ F_SET(rep, REP_F_NOARCHIVE);
+
+ /* Copy application type flags if set before env open. */
+ if (F_ISSET(db_rep, DBREP_APP_REPMGR))
+ F_SET(rep, REP_F_APP_REPMGR);
+ if (F_ISSET(db_rep, DBREP_APP_BASEAPI))
+ F_SET(rep, REP_F_APP_BASEAPI);
+
+ /*
+ * Initialize encapsulating region. Publishing rep_off here is
+ * what makes later callers take the join path below.
+ */
+ renv->rep_off = R_OFFSET(infop, rep);
+ (void)time(&renv->rep_timestamp);
+ renv->op_timestamp = 0;
+ F_CLR(renv, DB_REGENV_REPLOCKED);
+
+#ifdef HAVE_REPLICATION_THREADS
+ if ((ret = __repmgr_open(env, rep)) != 0)
+ return (ret);
+#endif
+ } else {
+ rep = R_ADDR(infop, renv->rep_off);
+ /*
+ * Prevent an application type mismatch between a process
+ * and the environment it is trying to join.
+ */
+ if ((F_ISSET(db_rep, DBREP_APP_REPMGR) &&
+ F_ISSET(rep, REP_F_APP_BASEAPI)) ||
+ (F_ISSET(db_rep, DBREP_APP_BASEAPI) &&
+ F_ISSET(rep, REP_F_APP_REPMGR))) {
+ __db_errx(env,
+"Application type mismatch for a replication process joining the environment");
+ return (EINVAL);
+ }
+#ifdef HAVE_REPLICATION_THREADS
+ if ((ret = __repmgr_join(env, rep)) != 0)
+ return (ret);
+#endif
+ }
+
+ db_rep->region = rep;
+
+ return (0);
+}
+
+/*
+ * __rep_env_refresh --
+ *	Replication-specific refresh of the ENV structure.
+ *
+ *	When this is the last reference to the environment the
+ *	group-established and start-called flags are cleared.  For a
+ *	private environment the replication mutexes and the REP structure
+ *	are released.  The handle's region pointer is always reset.
+ *
+ *	Returns 0 or the first error encountered; every cleanup step is
+ *	still attempted after an earlier failure.
+ *
+ * PUBLIC: int __rep_env_refresh __P((ENV *));
+ */
+int
+__rep_env_refresh(env)
+	ENV *env;
+{
+	DB_REP *db_rep;
+	REGENV *renv;
+	REGINFO *infop;
+	REP *rep;
+	int ret, t_ret;
+
+	db_rep = env->rep_handle;
+	rep = db_rep->region;
+	infop = env->reginfo;
+	renv = infop->primary;
+	ret = 0;
+
+	/*
+	 * If we are the last reference closing the env, clear our knowledge of
+	 * belonging to a group and that there is a valid handle where
+	 * rep_start had already been called.
+	 */
+	if (renv->refcnt == 1) {
+		F_CLR(rep, REP_F_GROUP_ESTD);
+		F_CLR(rep, REP_F_START_CALLED);
+	}
+
+#ifdef HAVE_REPLICATION_THREADS
+	ret = __repmgr_env_refresh(env);
+#endif
+
+	/*
+	 * If a private region, return the memory to the heap.  Not needed for
+	 * filesystem-backed or system shared memory regions, that memory isn't
+	 * owned by any particular process.
+	 */
+	if (F_ISSET(env, ENV_PRIVATE)) {
+		if (db_rep->region != NULL) {
+			/*
+			 * Every free below uses the first-error-wins
+			 * pattern so that a failure reported by
+			 * __repmgr_env_refresh above is not overwritten
+			 * by a later success.
+			 */
+			if ((t_ret = __mutex_free(env,
+			    &db_rep->region->mtx_region)) != 0 && ret == 0)
+				ret = t_ret;
+			if ((t_ret = __mutex_free(env,
+			    &db_rep->region->mtx_clientdb)) != 0 && ret == 0)
+				ret = t_ret;
+			if ((t_ret = __mutex_free(env,
+			    &db_rep->region->mtx_ckp)) != 0 && ret == 0)
+				ret = t_ret;
+			if ((t_ret = __mutex_free(env,
+			    &db_rep->region->mtx_event)) != 0 && ret == 0)
+				ret = t_ret;
+		}
+
+		if (renv->rep_off != INVALID_ROFF)
+			__env_alloc_free(infop, R_ADDR(infop, renv->rep_off));
+	}
+
+	env->rep_handle->region = NULL;
+	return (ret);
+}
+
+/*
+ * __rep_env_close --
+ *	Shut down all of replication: run the preclose step, then close
+ *	the files opened while applying messages.  Both steps always run;
+ *	the first error is the one returned.
+ *
+ * PUBLIC: int __rep_env_close __P((ENV *));
+ */
+int
+__rep_env_close(env)
+	ENV *env;
+{
+	int closefiles_ret, ret;
+
+	ret = __rep_preclose(env);
+	closefiles_ret = __rep_closefiles(env);
+	if (ret == 0 && closefiles_ret != 0)
+		ret = closefiles_ret;
+	return (ret);
+}
+
+/*
+ * __rep_preclose --
+ *	Close the client bookkeeping database, if open, and make a
+ *	best-effort attempt to send whatever is sitting in the bulk log
+ *	buffer.  Returns the result of closing the database (0 if there
+ *	was none); send errors are ignored.
+ *
+ * PUBLIC: int __rep_preclose __P((ENV *));
+ */
+int
+__rep_preclose(env)
+	ENV *env;
+{
+	DB_LOG *dblp;
+	DB_REP *db_rep;
+	LOG *lp;
+	REP_BULK bulk;
+	int ret;
+
+	/*
+	 * On an error path out of env open there may be a handle but no
+	 * rep region.  Without a region there is nothing to preclose.
+	 */
+	db_rep = env->rep_handle;
+	if (db_rep == NULL || db_rep->region == NULL)
+		return (0);
+
+	dblp = env->lg_handle;
+	ret = 0;
+
+	MUTEX_LOCK(env, db_rep->region->mtx_clientdb);
+
+	if (db_rep->rep_db != NULL) {
+		ret = __db_close(db_rep->rep_db, NULL, DB_NOSYNC);
+		db_rep->rep_db = NULL;
+	}
+
+	/*
+	 * The log region may not exist either if we are called early in
+	 * an env_open error path, so only look at the bulk buffer when
+	 * there is a log handle.
+	 */
+	if (dblp != NULL) {
+		lp = dblp->reginfo.primary;
+		/* Send buffered bulk data if there is any and we can. */
+		if (lp->bulk_off != 0 && db_rep->send != NULL) {
+			memset(&bulk, 0, sizeof(bulk));
+			bulk.addr = R_ADDR(&dblp->reginfo, lp->bulk_buf);
+			bulk.offp = &lp->bulk_off;
+			bulk.len = lp->bulk_len;
+			bulk.type = REP_BULK_LOG;
+			bulk.eid = DB_EID_BROADCAST;
+			bulk.flagsp = &lp->bulk_flags;
+			/*
+			 * This can run on the env->close path, so a
+			 * failed send is deliberately not reported.
+			 */
+			(void)__rep_send_bulk(env, &bulk, 0);
+		}
+	}
+
+	MUTEX_UNLOCK(env, db_rep->region->mtx_clientdb);
+	return (ret);
+}
+
+/*
+ * __rep_closefiles --
+ *	Close the databases opened while applying messages as a client
+ *	(for instance when a client has become master).  This can be
+ *	reached from __env_close, so the rep handle, rep region and log
+ *	handle are each checked before anything is done.  On success the
+ *	DBREP_OPENFILES flag is cleared.
+ *
+ * PUBLIC: int __rep_closefiles __P((ENV *));
+ */
+int
+__rep_closefiles(env)
+	ENV *env;
+{
+	DB_REP *db_rep;
+	int ret;
+
+	db_rep = env->rep_handle;
+
+	/* Nothing to do unless replication and logging are both set up. */
+	if (db_rep == NULL || db_rep->region == NULL ||
+	    env->lg_handle == NULL)
+		return (0);
+
+	ret = __dbreg_close_files(env, 0);
+	if (ret == 0)
+		F_CLR(db_rep, DBREP_OPENFILES);
+
+	return (ret);
+}
+
+/*
+ * __rep_egen_init --
+ *	Set the initial value of rep->egen.  Called from __rep_open while
+ *	the rep region is being created, which is expected to be
+ *	single-threaded; rep->egen is normally protected by the region
+ *	mutex.  If the egen file does not exist, egen becomes gen + 1 and
+ *	is written out; otherwise egen is read from the file.
+ */
+static int
+__rep_egen_init(env, rep)
+	ENV *env;
+	REP *rep;
+{
+	DB_FH *fhp;
+	int ret;
+	size_t nread;
+	char *path;
+
+	ret = __db_appname(env, DB_APP_NONE, REP_EGENNAME, NULL, &path);
+	if (ret != 0)
+		return (ret);
+
+	if (__os_exists(env, path, NULL) != 0) {
+		/* No file yet: start one past the current generation. */
+		rep->egen = rep->gen + 1;
+		ret = __rep_write_egen(env, rep, rep->egen);
+	} else if ((ret = __os_open(env, path, 0,
+	    DB_OSO_RDONLY, DB_MODE_600, &fhp)) == 0) {
+		/* File exists: read the stored egen straight into place. */
+		ret = __os_read(env,
+		    fhp, &rep->egen, sizeof(u_int32_t), &nread);
+		if (ret == 0 && nread == sizeof(u_int32_t)) {
+			RPRINT(env, DB_VERB_REP_MISC,
+			    (env, "Read in egen %lu", (u_long)rep->egen));
+		}
+		(void)__os_closehandle(env, fhp);
+	}
+
+	__os_free(env, path);
+	return (ret);
+}
+
+/*
+ * __rep_write_egen --
+ *	Persist 'egen' in the egen file of the environment: the file is
+ *	created or truncated, the value written and the file synced.  A
+ *	no-op returning 0 when in-memory replication is configured.
+ *
+ * PUBLIC: int __rep_write_egen __P((ENV *, REP *, u_int32_t));
+ */
+int
+__rep_write_egen(env, rep, egen)
+	ENV *env;
+	REP *rep;
+	u_int32_t egen;
+{
+	DB_FH *fhp;
+	int ret;
+	size_t nwritten;
+	char *path;
+
+	/* In-memory replication keeps nothing on disk. */
+	if (FLD_ISSET(rep->config, REP_C_INMEM))
+		return (0);
+
+	ret = __db_appname(env, DB_APP_NONE, REP_EGENNAME, NULL, &path);
+	if (ret != 0)
+		return (ret);
+
+	ret = __os_open(env,
+	    path, 0, DB_OSO_CREATE | DB_OSO_TRUNC, DB_MODE_600, &fhp);
+	if (ret == 0) {
+		ret = __os_write(env,
+		    fhp, &egen, sizeof(u_int32_t), &nwritten);
+		if (ret == 0)
+			ret = __os_fsync(env, fhp);
+		/* Report a failed write or sync against the file name. */
+		if (ret != 0)
+			__db_err(env, ret, "%s", path);
+		(void)__os_closehandle(env, fhp);
+	}
+
+	__os_free(env, path);
+	return (ret);
+}
+
+/*
+ * __rep_gen_init --
+ *	Set the initial value of rep->gen.  Called from __rep_open while
+ *	the rep region is being created, which is expected to be
+ *	single-threaded; rep->gen is normally protected by the region
+ *	mutex.  If the gen file does not exist, gen is set to 0 and written
+ *	out; otherwise gen is read from the file.
+ *
+ *	Returns 0 or the error from locating, writing, opening or reading
+ *	the file.
+ *
+ *	NOTE(review): as in __rep_egen_init, a short read is not turned
+ *	into an error return; it only suppresses the "Read in gen"
+ *	diagnostic.  Confirm whether a truncated gen file should fail the
+ *	open instead.
+ */
+static int
+__rep_gen_init(env, rep)
+	ENV *env;
+	REP *rep;
+{
+	DB_FH *fhp;
+	int ret;
+	size_t cnt;
+	char *p;
+
+	if ((ret = __db_appname(env,
+	    DB_APP_NONE, REP_GENNAME, NULL, &p)) != 0)
+		return (ret);
+	/*
+	 * If the file doesn't exist, create it now and initialize with 0.
+	 */
+	if (__os_exists(env, p, NULL) != 0) {
+		rep->gen = 0;
+		if ((ret = __rep_write_gen(env, rep, rep->gen)) != 0)
+			goto err;
+	} else {
+		/*
+		 * File exists, open it and read in our gen.
+		 */
+		if ((ret = __os_open(env, p, 0,
+		    DB_OSO_RDONLY, DB_MODE_600, &fhp)) != 0)
+			goto err;
+		/*
+		 * Use the same test as __rep_egen_init: any non-zero
+		 * return is a failure, and only a full u_int32_t counts
+		 * as a successful read.
+		 */
+		if ((ret = __os_read(env, fhp, &rep->gen, sizeof(u_int32_t),
+		    &cnt)) != 0 || cnt != sizeof(u_int32_t))
+			goto err1;
+		RPRINT(env, DB_VERB_REP_MISC, (env, "Read in gen %lu",
+		    (u_long)rep->gen));
+err1:		(void)__os_closehandle(env, fhp);
+	}
+err:	__os_free(env, p);
+	return (ret);
+}
+
+/*
+ * __rep_write_gen --
+ *	Persist 'gen' in the gen file of the environment: the file is
+ *	created or truncated, the value written and the file synced.  A
+ *	no-op returning 0 when in-memory replication is configured.
+ *
+ * PUBLIC: int __rep_write_gen __P((ENV *, REP *, u_int32_t));
+ */
+int
+__rep_write_gen(env, rep, gen)
+	ENV *env;
+	REP *rep;
+	u_int32_t gen;
+{
+	DB_FH *fhp;
+	int ret;
+	size_t nwritten;
+	char *path;
+
+	/* In-memory replication keeps nothing on disk. */
+	if (FLD_ISSET(rep->config, REP_C_INMEM))
+		return (0);
+
+	ret = __db_appname(env, DB_APP_NONE, REP_GENNAME, NULL, &path);
+	if (ret != 0)
+		return (ret);
+
+	ret = __os_open(env,
+	    path, 0, DB_OSO_CREATE | DB_OSO_TRUNC, DB_MODE_600, &fhp);
+	if (ret == 0) {
+		ret = __os_write(env,
+		    fhp, &gen, sizeof(u_int32_t), &nwritten);
+		if (ret == 0)
+			ret = __os_fsync(env, fhp);
+		/* Report a failed write or sync against the file name. */
+		if (ret != 0)
+			__db_err(env, ret, "%s", path);
+		(void)__os_closehandle(env, fhp);
+	}
+
+	__os_free(env, path);
+	return (ret);
+}
diff --git a/rep/rep_stat.c b/rep/rep_stat.c
new file mode 100644
index 0000000..4a2b93e
--- /dev/null
+++ b/rep/rep_stat.c
@@ -0,0 +1,568 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/log.h"
+
+#ifdef HAVE_STATISTICS
+static int __rep_print_all __P((ENV *, u_int32_t));
+static int __rep_print_stats __P((ENV *, u_int32_t));
+static int __rep_stat __P((ENV *, DB_REP_STAT **, u_int32_t));
+
+/*
+ * __rep_stat_pp --
+ *	ENV->rep_stat pre/post processing: check that replication is
+ *	configured and that only DB_STAT_CLEAR is passed, then run
+ *	__rep_stat between ENV_ENTER and ENV_LEAVE.
+ *
+ * PUBLIC: int __rep_stat_pp __P((DB_ENV *, DB_REP_STAT **, u_int32_t));
+ */
+int
+__rep_stat_pp(dbenv, statp, flags)
+	DB_ENV *dbenv;
+	DB_REP_STAT **statp;
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int ret;
+
+	env = dbenv->env;
+
+	ENV_REQUIRES_CONFIG_XX(
+	    env, rep_handle, "DB_ENV->rep_stat", DB_INIT_REP);
+
+	/* DB_STAT_CLEAR is the only flag this method accepts. */
+	ret = __db_fchk(env, "DB_ENV->rep_stat", flags, DB_STAT_CLEAR);
+	if (ret == 0) {
+		ENV_ENTER(env, ip);
+		ret = __rep_stat(env, statp, flags);
+		ENV_LEAVE(env, ip);
+	}
+
+	return (ret);
+}
+
+/*
+ * __rep_stat --
+ * ENV->rep_stat.
+ *
+ * Allocates a DB_REP_STAT with __os_umalloc, fills it from the rep
+ * region and the log region, and returns it through *statp. If the
+ * allocation fails, *statp is left NULL and the error is returned;
+ * otherwise the return value is 0. When DB_STAT_CLEAR is set in flags
+ * the region's counters are reset after the snapshot has been copied.
+ */
+static int
+__rep_stat(env, statp, flags)
+ ENV *env;
+ DB_REP_STAT **statp;
+ u_int32_t flags;
+{
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ DB_REP_STAT *stats;
+ LOG *lp;
+ REP *rep;
+ u_int32_t startupdone;
+ uintmax_t queued;
+ int dolock, ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ *statp = NULL;
+
+ /* Allocate a stat struct to return to the user. */
+ if ((ret = __os_umalloc(env, sizeof(DB_REP_STAT), &stats)) != 0)
+ return (ret);
+
+ /*
+ * The rep region is read here without holding its mutex. If any
+ * REP_F_RECOVER_MASK flag is set (client recovery) dolock is 0 and we
+ * also skip mtx_clientdb further down, so this call won't block. We
+ * start from a raw copy of the region's stat struct.
+ */
+ dolock = FLD_ISSET(rep->flags, REP_F_RECOVER_MASK) ? 0 : 1;
+ memcpy(stats, &rep->stat, sizeof(*stats));
+
+ /*
+ * Copy out election stats. st_election_status keeps the value copied
+ * from rep->stat unless one of the phase flags below is set.
+ */
+ if (F_ISSET(rep, REP_F_EPHASE1))
+ stats->st_election_status = 1;
+ else if (F_ISSET(rep, REP_F_EPHASE2))
+ stats->st_election_status = 2;
+
+ stats->st_election_nsites = rep->sites;
+ stats->st_election_cur_winner = rep->winner;
+ stats->st_election_priority = rep->w_priority;
+ stats->st_election_gen = rep->w_gen;
+ stats->st_election_lsn = rep->w_lsn;
+ stats->st_election_votes = rep->votes;
+ stats->st_election_nvotes = rep->nvotes;
+ stats->st_election_tiebreaker = rep->w_tiebreaker;
+
+ /*
+ * Copy out other info that's normally protected by the rep mutex;
+ * the mutex is not taken here.
+ */
+ stats->st_env_id = rep->eid;
+ stats->st_env_priority = rep->priority;
+ stats->st_nsites = rep->nsites;
+ stats->st_master = rep->master_id;
+ stats->st_gen = rep->gen;
+ stats->st_egen = rep->egen;
+
+ if (F_ISSET(rep, REP_F_MASTER))
+ stats->st_status = DB_REP_MASTER;
+ else if (F_ISSET(rep, REP_F_CLIENT))
+ stats->st_status = DB_REP_CLIENT;
+ else
+ stats->st_status = 0;
+
+ if (LF_ISSET(DB_STAT_CLEAR)) { /* zero the region counters, but keep
+ * the current queue depth (which also seeds the max and total) and
+ * the startup-complete state */
+ queued = rep->stat.st_log_queued;
+ startupdone = rep->stat.st_startup_complete;
+ memset(&rep->stat, 0, sizeof(rep->stat));
+ rep->stat.st_log_queued = rep->stat.st_log_queued_total =
+ rep->stat.st_log_queued_max = queued;
+ rep->stat.st_startup_complete = startupdone;
+ }
+
+ /*
+ * Log-related replication info is stored in the log system. It is
+ * read under mtx_clientdb (unless dolock is 0, see above); on a
+ * master the log region lock is taken as well to read lp->lsn.
+ */
+ if (dolock)
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ if (F_ISSET(rep, REP_F_CLIENT)) {
+ stats->st_next_lsn = lp->ready_lsn;
+ stats->st_waiting_lsn = lp->waiting_lsn;
+ stats->st_next_pg = rep->ready_pg;
+ stats->st_waiting_pg = rep->waiting_pg;
+ stats->st_max_lease_sec = (u_int32_t)lp->max_lease_ts.tv_sec;
+ stats->st_max_lease_usec = (u_int32_t)
+ (lp->max_lease_ts.tv_nsec / NS_PER_US);
+ } else {
+ if (F_ISSET(rep, REP_F_MASTER)) {
+ LOG_SYSTEM_LOCK(env);
+ stats->st_next_lsn = lp->lsn;
+ LOG_SYSTEM_UNLOCK(env);
+ } else
+ ZERO_LSN(stats->st_next_lsn);
+ ZERO_LSN(stats->st_waiting_lsn);
+ stats->st_max_lease_sec = 0;
+ stats->st_max_lease_usec = 0;
+ }
+ stats->st_max_perm_lsn = lp->max_perm_lsn;
+ if (dolock)
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+
+ *statp = stats;
+ return (0);
+}
+
+/*
+ * __rep_stat_print_pp --
+ *	ENV->rep_stat_print pre/post processing: check that replication is
+ *	configured and that only DB_STAT_ALL/DB_STAT_CLEAR are passed, then
+ *	run __rep_stat_print between ENV_ENTER and ENV_LEAVE.
+ *
+ * PUBLIC: int __rep_stat_print_pp __P((DB_ENV *, u_int32_t));
+ */
+int
+__rep_stat_print_pp(dbenv, flags)
+	DB_ENV *dbenv;
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int ret;
+
+	env = dbenv->env;
+
+	ENV_REQUIRES_CONFIG_XX(
+	    env, rep_handle, "DB_ENV->rep_stat_print", DB_INIT_REP);
+
+	/* Reject anything other than DB_STAT_ALL and DB_STAT_CLEAR. */
+	ret = __db_fchk(env,
+	    "DB_ENV->rep_stat_print", flags, DB_STAT_ALL | DB_STAT_CLEAR);
+	if (ret == 0) {
+		ENV_ENTER(env, ip);
+		ret = __rep_stat_print(env, flags);
+		ENV_LEAVE(env, ip);
+	}
+
+	return (ret);
+}
+
+/*
+ * __rep_stat_print --
+ *	ENV->rep_stat_print method.  With no selecting flags only the
+ *	default statistics are printed; with DB_STAT_ALL the default
+ *	statistics are followed by the debugging dump.  The callee always
+ *	receives the caller's original flags.
+ *
+ * PUBLIC: int __rep_stat_print __P((ENV *, u_int32_t));
+ */
+int
+__rep_stat_print(env, flags)
+	ENV *env;
+	u_int32_t flags;
+{
+	u_int32_t caller_flags;
+	int ret;
+
+	/* DB_STAT_CLEAR and DB_STAT_SUBSYSTEM don't select what to print. */
+	caller_flags = flags;
+	LF_CLR(DB_STAT_CLEAR | DB_STAT_SUBSYSTEM);
+
+	/* Nothing selected: default statistics only. */
+	if (flags == 0)
+		return (__rep_print_stats(env, caller_flags));
+
+	/* Something other than DB_STAT_ALL: nothing to print here. */
+	if (!LF_ISSET(DB_STAT_ALL))
+		return (0);
+
+	ret = __rep_print_stats(env, caller_flags);
+	if (ret != 0)
+		return (ret);
+	return (__rep_print_all(env, caller_flags));
+}
+
+/*
+ * __rep_print_stats --
+ * Print out default statistics.
+ *
+ * Takes a snapshot through __rep_stat, passing flags along unchanged (so
+ * a DB_STAT_CLEAR in flags also resets the region counters), prints one
+ * line per statistic with __db_msg/__db_dl, and frees the snapshot with
+ * __os_ufree. Returns the __rep_stat error if the snapshot cannot be
+ * taken, otherwise 0.
+ */
+static int
+__rep_print_stats(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ DB_REP_STAT *sp;
+ int is_client, ret;
+ char *p;
+
+ if ((ret = __rep_stat(env, &sp, flags)) != 0)
+ return (ret);
+
+ if (LF_ISSET(DB_STAT_ALL))
+ __db_msg(env, "Default replication region information:");
+ is_client = 0;
+ switch (sp->st_status) {
+ case DB_REP_MASTER:
+ __db_msg(env,
+ "Environment configured as a replication master");
+ break;
+ case DB_REP_CLIENT:
+ __db_msg(env,
+ "Environment configured as a replication client");
+ is_client = 1;
+ break;
+ default:
+ __db_msg(env,
+ "Environment not configured for replication");
+ break;
+ }
+
+ __db_msg(env, "%lu/%lu\t%s",
+ (u_long)sp->st_next_lsn.file, (u_long)sp->st_next_lsn.offset,
+ is_client ? "Next LSN expected" : "Next LSN to be used");
+ __db_msg(env, "%lu/%lu\t%s",
+ (u_long)sp->st_waiting_lsn.file, (u_long)sp->st_waiting_lsn.offset,
+ sp->st_waiting_lsn.file == 0 ?
+ "Not waiting for any missed log records" :
+ "LSN of first log record we have after missed log records");
+ __db_msg(env, "%lu/%lu\t%s",
+ (u_long)sp->st_max_perm_lsn.file,
+ (u_long)sp->st_max_perm_lsn.offset,
+ sp->st_max_perm_lsn.file == 0 ?
+ "No maximum permanent LSN" :
+ "Maximum permanent LSN");
+
+ __db_dl(env, "Next page number expected", (u_long)sp->st_next_pg);
+ p = sp->st_waiting_pg == PGNO_INVALID ?
+ "Not waiting for any missed pages" :
+ "Page number of first page we have after missed pages";
+ __db_msg(env, "%lu\t%s", (u_long)sp->st_waiting_pg, p);
+ __db_dl(env,
+ "Number of duplicate master conditions originally detected at this site",
+ (u_long)sp->st_dupmasters);
+ if (sp->st_env_id != DB_EID_INVALID)
+ __db_dl(env, "Current environment ID", (u_long)sp->st_env_id);
+ else
+ __db_msg(env, "No current environment ID");
+ __db_dl(env,
+ "Current environment priority", (u_long)sp->st_env_priority);
+ __db_dl(env, "Current generation number", (u_long)sp->st_gen);
+ __db_dl(env,
+ "Election generation number for the current or next election",
+ (u_long)sp->st_egen);
+ __db_dl(env, "Number of duplicate log records received",
+ (u_long)sp->st_log_duplicated);
+ __db_dl(env, "Number of log records currently queued",
+ (u_long)sp->st_log_queued);
+ __db_dl(env, "Maximum number of log records ever queued at once",
+ (u_long)sp->st_log_queued_max);
+ __db_dl(env, "Total number of log records queued",
+ (u_long)sp->st_log_queued_total);
+ __db_dl(env,
+ "Number of log records received and appended to the log",
+ (u_long)sp->st_log_records);
+ __db_dl(env, "Number of log records missed and requested",
+ (u_long)sp->st_log_requested);
+ if (sp->st_master != DB_EID_INVALID)
+ __db_dl(env, "Current master ID", (u_long)sp->st_master);
+ else
+ __db_msg(env, "No current master ID");
+ __db_dl(env, "Number of times the master has changed",
+ (u_long)sp->st_master_changes);
+ __db_dl(env,
+ "Number of messages received with a bad generation number",
+ (u_long)sp->st_msgs_badgen);
+ __db_dl(env, "Number of messages received and processed",
+ (u_long)sp->st_msgs_processed);
+ __db_dl(env, "Number of messages ignored due to pending recovery",
+ (u_long)sp->st_msgs_recover);
+ __db_dl(env, "Number of failed message sends",
+ (u_long)sp->st_msgs_send_failures);
+ __db_dl(env, "Number of messages sent", (u_long)sp->st_msgs_sent);
+ __db_dl(env,
+ "Number of new site messages received", (u_long)sp->st_newsites);
+ __db_dl(env,
+ "Number of environments believed to be in the replication group",
+ (u_long)sp->st_nsites);
+ __db_dl(env, "Transmission limited", (u_long)sp->st_nthrottles);
+ __db_dl(env, "Number of outdated conditions detected",
+ (u_long)sp->st_outdated);
+ __db_dl(env, "Number of duplicate page records received",
+ (u_long)sp->st_pg_duplicated);
+ __db_dl(env, "Number of page records received and added to databases",
+ (u_long)sp->st_pg_records);
+ __db_dl(env, "Number of page records missed and requested",
+ (u_long)sp->st_pg_requested);
+ if (sp->st_startup_complete == 0)
+ __db_msg(env, "Startup incomplete");
+ else
+ __db_msg(env, "Startup complete");
+ __db_dl(env,
+ "Number of transactions applied", (u_long)sp->st_txns_applied);
+
+ __db_dl(env, "Number of startsync messages delayed",
+ (u_long)sp->st_startsync_delayed);
+
+ __db_dl(env, "Number of elections held", (u_long)sp->st_elections);
+ __db_dl(env,
+ "Number of elections won", (u_long)sp->st_elections_won);
+
+ if (sp->st_election_status == 0) { /* the per-election detail below is
+ * printed only while st_election_status is non-zero */
+ __db_msg(env, "No election in progress");
+ if (sp->st_election_sec > 0 || sp->st_election_usec > 0)
+ __db_msg(env,
+ "%lu.%.6lu\tDuration of last election (seconds)",
+ (u_long)sp->st_election_sec,
+ (u_long)sp->st_election_usec);
+ } else {
+ __db_dl(env, "Current election phase",
+ (u_long)sp->st_election_status);
+ __db_dl(env,
+ "Environment ID of the winner of the current or last election",
+ (u_long)sp->st_election_cur_winner);
+ __db_dl(env,
+ "Master generation number of the winner of the current or last election",
+ (u_long)sp->st_election_gen);
+ __db_msg(env,
+ "%lu/%lu\tMaximum LSN of the winner of the current or last election",
+ (u_long)sp->st_election_lsn.file,
+ (u_long)sp->st_election_lsn.offset);
+ __db_dl(env,
+ "Number of sites responding to this site during the current election",
+ (u_long)sp->st_election_nsites);
+ __db_dl(env,
+ "Number of votes required in the current or last election",
+ (u_long)sp->st_election_nvotes);
+ __db_dl(env,
+ "Priority of the winner of the current or last election",
+ (u_long)sp->st_election_priority);
+ __db_dl(env,
+ "Tiebreaker value of the winner of the current or last election",
+ (u_long)sp->st_election_tiebreaker);
+ __db_dl(env,
+ "Number of votes received during the current election",
+ (u_long)sp->st_election_votes);
+ }
+ __db_dl(env, "Number of bulk buffer sends triggered by full buffer",
+ (u_long)sp->st_bulk_fills);
+ __db_dl(env, "Number of single records exceeding bulk buffer size",
+ (u_long)sp->st_bulk_overflows);
+ __db_dl(env, "Number of records added to a bulk buffer",
+ (u_long)sp->st_bulk_records);
+ __db_dl(env, "Number of bulk buffers sent",
+ (u_long)sp->st_bulk_transfers);
+ __db_dl(env, "Number of re-request messages received",
+ (u_long)sp->st_client_rerequests);
+ __db_dl(env,
+ "Number of request messages this client failed to process",
+ (u_long)sp->st_client_svc_miss);
+ __db_dl(env, "Number of request messages received by this client",
+ (u_long)sp->st_client_svc_req);
+ if (sp->st_max_lease_sec > 0 || sp->st_max_lease_usec > 0)
+ __db_msg(env,
+ "%lu.%.6lu\tDuration of maximum lease (seconds)",
+ (u_long)sp->st_max_lease_sec,
+ (u_long)sp->st_max_lease_usec);
+
+ __os_ufree(env, sp); /* release the snapshot obtained from __rep_stat */
+
+ return (0);
+}
+
+/*
+ * __rep_print_all --
+ * Display debugging replication region statistics.
+ *
+ * Prints three sections: the DB_REP handle (bookkeeping database and
+ * handle flags), the REP region (mutexes, IDs, generations, election
+ * state, region flags) and the replication fields kept in the log
+ * region, which are read while holding mtx_clientdb. Always returns 0.
+ */
+static int
+__rep_print_all(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ static const FN rep_fn[] = {
+ { REP_F_ABBREVIATED, "REP_F_ABBREVIATED" },
+ { REP_F_APP_BASEAPI, "REP_F_APP_BASEAPI" },
+ { REP_F_APP_REPMGR, "REP_F_APP_REPMGR" },
+ { REP_F_CLIENT, "REP_F_CLIENT" },
+ { REP_F_DELAY, "REP_F_DELAY" },
+ { REP_F_EGENUPDATE, "REP_F_EGENUPDATE" },
+ { REP_F_EPHASE0, "REP_F_EPHASE0" },
+ { REP_F_EPHASE1, "REP_F_EPHASE1" },
+ { REP_F_EPHASE2, "REP_F_EPHASE2" },
+ { REP_F_GROUP_ESTD, "REP_F_GROUP_ESTD" },
+ { REP_F_INREPELECT, "REP_F_INREPELECT" },
+ { REP_F_INREPSTART, "REP_F_INREPSTART" },
+ { REP_F_LEASE_EXPIRED, "REP_F_LEASE_EXPIRED" },
+ { REP_F_MASTER, "REP_F_MASTER" },
+ { REP_F_MASTERELECT, "REP_F_MASTERELECT" },
+ { REP_F_NEWFILE, "REP_F_NEWFILE" },
+ { REP_F_NIMDBS_LOADED, "REP_F_NIMDBS_LOADED" },
+ { REP_F_NOARCHIVE, "REP_F_NOARCHIVE" },
+ { REP_F_READY_API, "REP_F_READY_API" },
+ { REP_F_READY_APPLY, "REP_F_READY_APPLY" },
+ { REP_F_READY_MSG, "REP_F_READY_MSG" },
+ { REP_F_READY_OP, "REP_F_READY_OP" },
+ { REP_F_RECOVER_LOG, "REP_F_RECOVER_LOG" },
+ { REP_F_RECOVER_PAGE, "REP_F_RECOVER_PAGE" },
+ { REP_F_RECOVER_UPDATE, "REP_F_RECOVER_UPDATE" },
+ { REP_F_RECOVER_VERIFY, "REP_F_RECOVER_VERIFY" },
+ { REP_F_SKIPPED_APPLY, "REP_F_SKIPPED_APPLY" },
+ { REP_F_START_CALLED, "REP_F_START_CALLED" },
+ { REP_F_TALLY, "REP_F_TALLY" },
+ { 0, NULL }
+ };
+ static const FN dbrep_fn[] = {
+ { DBREP_APP_BASEAPI, "DBREP_APP_BASEAPI" },
+ { DBREP_APP_REPMGR, "DBREP_APP_REPMGR" },
+ { DBREP_OPENFILES, "DBREP_OPENFILES" },
+ { 0, NULL }
+ };
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ LOG *lp;
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ char time_buf[CTIME_BUFLEN];
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ infop = env->reginfo;
+ renv = infop->primary;
+ ENV_ENTER(env, ip);
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "DB_REP handle information:");
+
+ if (db_rep->rep_db == NULL)
+ STAT_ISSET("Bookkeeping database", db_rep->rep_db);
+ else
+ (void)__db_stat_print(db_rep->rep_db, ip, flags);
+
+ __db_prflags(env, NULL, db_rep->flags, dbrep_fn, NULL, "\tFlags");
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "REP handle information:");
+ __mutex_print_debug_single(env,
+ "Replication region mutex", rep->mtx_region, flags);
+ __mutex_print_debug_single(env,
+ "Bookkeeping database mutex", rep->mtx_clientdb, flags);
+
+ STAT_LONG("Environment ID", rep->eid);
+ STAT_LONG("Master environment ID", rep->master_id);
+ STAT_ULONG("Election generation", rep->egen);
+ STAT_ULONG("Election generation number", rep->gen); /* NOTE(review):
+ * the label says "Election generation number" but the value printed
+ * is rep->gen, not rep->egen -- confirm the wording is intended */
+ STAT_LONG("Space allocated for sites", rep->asites);
+ STAT_LONG("Sites in group", rep->nsites);
+ STAT_LONG("Votes needed for election", rep->nvotes);
+ STAT_LONG("Priority in election", rep->priority);
+ __db_dlbytes(env, "Limit on data sent in a single call",
+ rep->gbytes, (u_long)0, rep->bytes);
+ STAT_LONG("Request gap seconds", rep->request_gap.tv_sec);
+ STAT_LONG("Request gap microseconds",
+ rep->request_gap.tv_nsec / NS_PER_US);
+ STAT_LONG("Maximum gap seconds", rep->max_gap.tv_sec);
+ STAT_LONG("Maximum gap microseconds",
+ rep->max_gap.tv_nsec / NS_PER_US);
+
+ STAT_ULONG("Callers in rep_proc_msg", rep->msg_th);
+ STAT_ULONG("Library handle count", rep->handle_cnt);
+ STAT_ULONG("Multi-step operation count", rep->op_cnt);
+ __db_msg(env, "%.24s\tRecovery timestamp",
+ renv->rep_timestamp == 0 ?
+ "0" : __os_ctime(&renv->rep_timestamp, time_buf));
+
+ STAT_LONG("Sites heard from", rep->sites);
+ STAT_LONG("Current winner", rep->winner);
+ STAT_LONG("Winner priority", rep->w_priority);
+ STAT_ULONG("Winner generation", rep->w_gen);
+ STAT_LSN("Winner LSN", &rep->w_lsn);
+ STAT_LONG("Winner tiebreaker", rep->w_tiebreaker);
+ STAT_LONG("Votes for this site", rep->votes);
+
+ __db_prflags(env, NULL, rep->flags, rep_fn, NULL, "\tFlags");
+
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "LOG replication information:");
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ dblp = env->lg_handle;
+ lp = (LOG *)dblp->reginfo.primary;
+ STAT_LSN("First log record after a gap", &lp->waiting_lsn);
+ STAT_LSN("Maximum permanent LSN processed", &lp->max_perm_lsn);
+ STAT_LSN("LSN waiting to verify", &lp->verify_lsn);
+ STAT_LSN("Maximum LSN requested", &lp->max_wait_lsn);
+ STAT_LONG("Time to wait before requesting seconds", lp->wait_ts.tv_sec);
+ STAT_LONG("Time to wait before requesting microseconds",
+ lp->wait_ts.tv_nsec / NS_PER_US);
+ STAT_LSN("Next LSN expected", &lp->ready_lsn);
+ STAT_LONG("Maximum lease timestamp seconds", lp->max_lease_ts.tv_sec);
+ STAT_LONG("Maximum lease timestamp microseconds",
+ lp->max_lease_ts.tv_nsec / NS_PER_US);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ ENV_LEAVE(env, ip);
+
+ return (0);
+}
+
+#else /* !HAVE_STATISTICS */
+
+int
+__rep_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_REP_STAT **statp;
+ u_int32_t flags;
+{
+ COMPQUIET(statp, NULL);
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env)); /* !HAVE_STATISTICS build: no stats are gathered, defer to __db_stat_not_built */
+}
+
+int
+__rep_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbenv->env)); /* !HAVE_STATISTICS build: nothing to print, defer to __db_stat_not_built */
+}
+#endif
diff --git a/rep/rep_stub.c b/rep/rep_stub.c
new file mode 100644
index 0000000..f2f11d8
--- /dev/null
+++ b/rep/rep_stub.c
@@ -0,0 +1,391 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef HAVE_REPLICATION
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+
+/*
+ * If the library wasn't compiled with replication support, various routines
+ * aren't available. Stub them here, returning an appropriate error.
+ */
+static int __db_norep __P((ENV *));
+
+/*
+ * __db_norep --
+ * Error when a Berkeley DB build doesn't include replication support.
+ *
+ * Reports the problem through __db_errx and returns DB_OPNOTSUP; the
+ * stubs in this file use it as their return value.
+ */
+static int
+__db_norep(env)
+ ENV *env;
+{
+ __db_errx(env,
+ "library build did not include support for replication");
+ return (DB_OPNOTSUP);
+}
+
+int
+__db_rep_enter(dbp, checkgen, checklock, return_now)
+ DB *dbp;
+ int checkgen, checklock, return_now;
+{
+ COMPQUIET(checkgen, 0);
+ COMPQUIET(checklock, 0);
+ COMPQUIET(return_now, 0);
+ return (__db_norep(dbp->env)); /* stub: no replication in this build, always DB_OPNOTSUP */
+}
+
+int
+__env_rep_enter(env, checklock)
+ ENV *env;
+ int checklock;
+{
+ COMPQUIET(checklock, 0);
+ return (__db_norep(env)); /* stub: no replication in this build, always DB_OPNOTSUP */
+}
+
+int
+__env_db_rep_exit(env)
+ ENV *env;
+{
+ return (__db_norep(env)); /* stub: no replication in this build, always DB_OPNOTSUP */
+}
+
+int
+__op_rep_enter(env)
+ ENV *env;
+{
+ return (__db_norep(env)); /* stub: no replication in this build, always DB_OPNOTSUP */
+}
+
+int
+__op_rep_exit(env)
+ ENV *env;
+{
+ return (__db_norep(env)); /* stub: no replication in this build, always DB_OPNOTSUP */
+}
+
+int
+__rep_bulk_message(env, bulkp, repth, lsnp, dbt, flags)
+ ENV *env;
+ REP_BULK *bulkp;
+ REP_THROTTLE *repth;
+ DB_LSN *lsnp;
+ const DBT *dbt;
+ u_int32_t flags;
+{
+ COMPQUIET(bulkp, NULL);
+ COMPQUIET(repth, NULL);
+ COMPQUIET(lsnp, NULL);
+ COMPQUIET(dbt, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_norep(env)); /* stub: no replication in this build, always DB_OPNOTSUP */
+}
+
+int
+__rep_env_refresh(env)
+ ENV *env;
+{
+ COMPQUIET(env, NULL);
+ return (0); /* stub: no replication in this build, succeed as a no-op */
+}
+
+int
+__rep_elect_pp(dbenv, nsites, nvotes, flags)
+ DB_ENV *dbenv;
+ u_int32_t nsites, nvotes;
+ u_int32_t flags;
+{
+ COMPQUIET(nsites, 0);
+ COMPQUIET(nvotes, 0);
+ COMPQUIET(flags, 0);
+ return (__db_norep(dbenv->env)); /* stub: no replication in this build, always DB_OPNOTSUP */
+}
+
+int
+__rep_flush(dbenv)
+ DB_ENV *dbenv;
+{
+ return (__db_norep(dbenv->env)); /* stub: no replication in this build, always DB_OPNOTSUP */
+}
+
+int
+__rep_lease_check(env, refresh)
+ ENV *env;
+ int refresh;
+{
+ COMPQUIET(refresh, 0);
+ return (__db_norep(env)); /* stub: no replication in this build, always DB_OPNOTSUP */
+}
+
+int
+__rep_lease_expire(env)
+ ENV *env;
+{
+ return (__db_norep(env)); /* stub: no replication in this build, always DB_OPNOTSUP */
+}
+
+int
+__rep_get_clockskew(dbenv, fast_clockp, slow_clockp)
+ DB_ENV *dbenv;
+ u_int32_t *fast_clockp, *slow_clockp;
+{
+ COMPQUIET(fast_clockp, NULL);
+ COMPQUIET(slow_clockp, NULL);
+ return (__db_norep(dbenv->env)); /* stub: no replication in this build, always DB_OPNOTSUP; the output pointers are never written */
+}
+
+int
+__rep_set_clockskew(dbenv, fast_clock, slow_clock)
+ DB_ENV *dbenv;
+ u_int32_t fast_clock, slow_clock;
+{
+ COMPQUIET(fast_clock, 0);
+ COMPQUIET(slow_clock, 0);
+ return (__db_norep(dbenv->env)); /* stub: no replication in this build, always DB_OPNOTSUP */
+}
+
+int
+__rep_set_nsites(dbenv, n)
+ DB_ENV *dbenv;
+ u_int32_t n;
+{
+ COMPQUIET(n, 0);
+ return (__db_norep(dbenv->env)); /* stub: no replication in this build, always DB_OPNOTSUP */
+}
+
+int
+__rep_get_nsites(dbenv, n)
+ DB_ENV *dbenv;
+ u_int32_t *n;
+{
+ COMPQUIET(n, NULL);
+ return (__db_norep(dbenv->env)); /* stub: no replication in this build, always DB_OPNOTSUP; *n is never written */
+}
+
+int
+__rep_set_priority(dbenv, priority)
+ DB_ENV *dbenv;
+ u_int32_t priority;
+{
+ COMPQUIET(priority, 0);
+ return (__db_norep(dbenv->env)); /* stub: no replication in this build, always DB_OPNOTSUP */
+}
+
+int
+__rep_get_priority(dbenv, priority)
+ DB_ENV *dbenv;
+ u_int32_t *priority;
+{
+ COMPQUIET(priority, NULL);
+ return (__db_norep(dbenv->env)); /* stub: no replication in this build, always DB_OPNOTSUP; *priority is never written */
+}
+
+int
+__rep_set_timeout(dbenv, which, timeout)
+ DB_ENV *dbenv;
+ int which;
+ db_timeout_t timeout;
+{
+ COMPQUIET(which, 0);
+ COMPQUIET(timeout, 0);
+ return (__db_norep(dbenv->env)); /* stub: no replication in this build, always DB_OPNOTSUP */
+}
+
+int
+__rep_get_timeout(dbenv, which, timeout)
+ DB_ENV *dbenv;
+ int which;
+ db_timeout_t *timeout;
+{
+ COMPQUIET(which, 0);
+ COMPQUIET(timeout, NULL);
+ return (__db_norep(dbenv->env)); /* stub: no replication in this build, always DB_OPNOTSUP; *timeout is never written */
+}
+
+int
+__rep_get_config(dbenv, which, onp)
+ DB_ENV *dbenv;
+ u_int32_t which;
+ int *onp;
+{
+ COMPQUIET(which, 0);
+ COMPQUIET(onp, NULL);
+ return (__db_norep(dbenv->env)); /* stub: no replication in this build, always DB_OPNOTSUP; *onp is never written */
+}
+
+int
+__rep_set_config(dbenv, which, on)
+ DB_ENV *dbenv;
+ u_int32_t which;
+ int on;
+{
+ COMPQUIET(which, 0);
+ COMPQUIET(on, 0);
+ return (__db_norep(dbenv->env)); /* stub: no replication in this build, always DB_OPNOTSUP */
+}
+
+int
+__rep_get_limit(dbenv, gbytesp, bytesp)
+ DB_ENV *dbenv;
+ u_int32_t *gbytesp, *bytesp;
+{
+ COMPQUIET(gbytesp, NULL);
+ COMPQUIET(bytesp, NULL);
+ return (__db_norep(dbenv->env)); /* stub: no replication in this build, always DB_OPNOTSUP; the output pointers are never written */
+}
+
+int
+__rep_noarchive(env)
+ ENV *env;
+{
+ COMPQUIET(env, NULL);
+ return (0); /* stub: no replication in this build, so this always answers 0 */
+}
+
+int
+__rep_open(env)
+ ENV *env;
+{
+ COMPQUIET(env, NULL);
+ return (0); /* stub: no replication in this build, succeed as a no-op */
+}
+
+int
+__rep_preclose(env)
+ ENV *env;
+{
+ return (__db_norep(env)); /* stub: no replication in this build, always DB_OPNOTSUP (unlike the __rep_open stub, which returns 0) */
+}
+
+int
+__rep_process_message_pp(dbenv, control, rec, eid, ret_lsnp)
+ DB_ENV *dbenv;
+ DBT *control, *rec;
+ int eid;
+ DB_LSN *ret_lsnp;
+{
+ COMPQUIET(control, NULL);
+ COMPQUIET(rec, NULL);
+ COMPQUIET(eid, 0);
+ COMPQUIET(ret_lsnp, NULL);
+ return (__db_norep(dbenv->env)); /* stub: no replication in this build, always DB_OPNOTSUP; *ret_lsnp is never written */
+}
+
+int
+__rep_send_message(env, eid, rtype, lsnp, dbtp, logflags, repflags)
+ ENV *env;
+ int eid;
+ u_int32_t rtype;
+ DB_LSN *lsnp;
+ const DBT *dbtp;
+ u_int32_t logflags, repflags;
+{
+ COMPQUIET(eid, 0);
+ COMPQUIET(rtype, 0);
+ COMPQUIET(lsnp, NULL);
+ COMPQUIET(dbtp, NULL);
+ COMPQUIET(logflags, 0);
+ COMPQUIET(repflags, 0);
+ return (__db_norep(env)); /* stub: no replication in this build, always DB_OPNOTSUP */
+}
+
+int
+__rep_set_limit(dbenv, gbytes, bytes)
+ DB_ENV *dbenv;
+ u_int32_t gbytes, bytes;
+{
+ COMPQUIET(gbytes, 0);
+ COMPQUIET(bytes, 0);
+ return (__db_norep(dbenv->env)); /* stub: no replication in this build, always DB_OPNOTSUP */
+}
+
+int
+__rep_set_transport_pp(dbenv, eid, f_send)
+ DB_ENV *dbenv;
+ int eid;
+ int (*f_send) __P((DB_ENV *, const DBT *, const DBT *, const DB_LSN *,
+ int, u_int32_t));
+{
+ COMPQUIET(eid, 0);
+ COMPQUIET(f_send, NULL);
+ return (__db_norep(dbenv->env)); /* stub: no replication in this build, always DB_OPNOTSUP; f_send is not recorded */
+}
+
+int
+__rep_set_request(dbenv, min, max)
+ DB_ENV *dbenv;
+ u_int32_t min, max;
+{
+ COMPQUIET(min, 0);
+ COMPQUIET(max, 0);
+ return (__db_norep(dbenv->env)); /* stub: no replication in this build, always DB_OPNOTSUP */
+}
+
+int
+__rep_get_request(dbenv, minp, maxp)
+ DB_ENV *dbenv;
+ u_int32_t *minp, *maxp;
+{
+ COMPQUIET(minp, NULL);
+ COMPQUIET(maxp, NULL);
+ return (__db_norep(dbenv->env)); /* stub: no replication in this build, always DB_OPNOTSUP; the output pointers are never written */
+}
+
+int
+__rep_start_pp(dbenv, dbt, flags)
+ DB_ENV *dbenv;
+ DBT *dbt;
+ u_int32_t flags;
+{
+ COMPQUIET(dbt, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_norep(dbenv->env)); /* stub: no replication in this build, always DB_OPNOTSUP */
+}
+
+int
+__rep_stat_pp(dbenv, statp, flags)
+ DB_ENV *dbenv;
+ DB_REP_STAT **statp;
+ u_int32_t flags;
+{
+ COMPQUIET(statp, NULL);
+ COMPQUIET(flags, 0);
+ return (__db_norep(dbenv->env)); /* stub: no replication in this build, always DB_OPNOTSUP; *statp is never written */
+}
+
+int
+__rep_stat_print_pp(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+ return (__db_norep(dbenv->env)); /* stub: no replication in this build, always DB_OPNOTSUP */
+}
+
+int
+__rep_stat_print(env, flags)
+ ENV *env;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+ return (__db_norep(env)); /* stub: no replication in this build, always DB_OPNOTSUP */
+}
+
+int
+__rep_sync(dbenv, flags)
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+ return (__db_norep(dbenv->env)); /* stub: no replication in this build, always DB_OPNOTSUP */
+}
+#endif /* !HAVE_REPLICATION */
diff --git a/rep/rep_util.c b/rep/rep_util.c
new file mode 100644
index 0000000..8fbf3a0
--- /dev/null
+++ b/rep/rep_util.c
@@ -0,0 +1,2007 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+#ifdef REP_DIAGNOSTIC
+#include "dbinc/db_page.h"
+#include "dbinc/fop.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/qam.h"
+#endif
+
+/*
+ * rep_util.c:
+ * Miscellaneous replication-related utility functions, including
+ * those called by other subsystems.
+ */
+/*
+ * TIMESTAMP_CHECK --
+ *	If the region's op_timestamp is set and lies more than
+ *	DB_REGENV_TIMEOUT before ts, clear DB_REGENV_REPLOCKED and reset the
+ *	timestamp while holding the rep system lock.  The timestamp test
+ *	itself is made before the lock is taken.
+ *
+ *	The renv and ts arguments are parenthesized so that any expression
+ *	may be passed; renv is evaluated more than once, so it must not
+ *	have side effects.
+ */
+#define	TIMESTAMP_CHECK(env, ts, renv) do {				\
+	if ((renv)->op_timestamp != 0 &&				\
+	    (renv)->op_timestamp + DB_REGENV_TIMEOUT < (ts)) {		\
+		REP_SYSTEM_LOCK(env);					\
+		F_CLR((renv), DB_REGENV_REPLOCKED);			\
+		(renv)->op_timestamp = 0;				\
+		REP_SYSTEM_UNLOCK(env);					\
+	}								\
+} while (0)
+
+static int __rep_lockout_int __P((ENV *, REP *, u_int32_t *, u_int32_t,
+ const char *, u_int32_t));
+static int __rep_newmaster_empty __P((ENV *, int));
+#ifdef REP_DIAGNOSTIC
+static void __rep_print_logmsg __P((ENV *, const DBT *, DB_LSN *));
+#endif
+
+/*
+ * __rep_bulk_message --
+ * This is a wrapper for putting a record into a bulk buffer. Since
+ * we have different bulk buffers, the caller must hand us the information
+ * we need to put the record into the correct buffer. All bulk buffers
+ * are protected by the REP->mtx_clientdb.
+ *
+ * Returns 0 when the record was copied into the buffer, or when the
+ * throttle limit was reached -- in that case the record is NOT copied
+ * and repth->type holds the *_MORE message type. Returns
+ * DB_REP_BULKOVF when the buffer is currently being transmitted or the
+ * record is larger than the whole buffer, so that the caller can send
+ * the record as a singleton. Otherwise returns the error from
+ * __rep_send_bulk, __rep_send_throttle or __rep_bulk_marshal.
+ * REP->mtx_clientdb is taken here and released on every return path.
+ *
+ * PUBLIC: int __rep_bulk_message __P((ENV *, REP_BULK *, REP_THROTTLE *,
+ * PUBLIC: DB_LSN *, const DBT *, u_int32_t));
+ */
+int
+__rep_bulk_message(env, bulk, repth, lsn, dbt, flags)
+ ENV *env;
+ REP_BULK *bulk;
+ REP_THROTTLE *repth;
+ DB_LSN *lsn;
+ const DBT *dbt;
+ u_int32_t flags;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ __rep_bulk_args b_args;
+ size_t len;
+ int ret;
+ u_int32_t recsize, typemore;
+ u_int8_t *p;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ ret = 0;
+
+ /*
+ * Figure out the total number of bytes needed for this record.
+ * !!! The marshalling code includes the given len, but also
+ * puts its own copy of the dbt->size with the DBT portion of
+ * the record. Account for that here.
+ */
+ recsize = sizeof(len) + dbt->size + sizeof(DB_LSN) + sizeof(dbt->size);
+
+ /*
+ * If *this* buffer is actively being transmitted, don't wait,
+ * just return so that it can be sent as a singleton.
+ */
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ if (FLD_ISSET(*(bulk->flagsp), BULK_XMIT)) {
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ return (DB_REP_BULKOVF);
+ }
+
+ /*
+ * If the record is bigger than the buffer entirely, send the
+ * current buffer and then return DB_REP_BULKOVF so that this
+ * record is sent as a singleton. Do we have enough info to
+ * do that here? XXX
+ *
+ * The result of flushing the current buffer is deliberately
+ * ignored on this path.
+ * NOTE(review): the message below formats recsize (u_int32_t) with
+ * %d/%x, while the later messages cast to u_long and use %lu --
+ * confirm the argument types match the format.
+ */
+ if (recsize > bulk->len) {
+ RPRINT(env, DB_VERB_REP_MSGS, (env,
+ "bulk_msg: Record %d (0x%x) larger than entire buffer 0x%x",
+ recsize, recsize, bulk->len));
+ STAT(rep->stat.st_bulk_overflows++);
+ (void)__rep_send_bulk(env, bulk, flags);
+ /*
+ * XXX __rep_send_message...
+ */
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ return (DB_REP_BULKOVF);
+ }
+ /*
+ * If this record doesn't fit, send the current buffer.
+ * Sending the buffer will reset the offset, but we will
+ * drop the mutex while sending so we need to keep checking
+ * if we're racing.
+ */
+ while (recsize + *(bulk->offp) > bulk->len) {
+ RPRINT(env, DB_VERB_REP_MSGS, (env,
+ "bulk_msg: Record %lu (%#lx) doesn't fit. Send %lu (%#lx) now.",
+ (u_long)recsize, (u_long)recsize,
+ (u_long)bulk->len, (u_long)bulk->len));
+ STAT(rep->stat.st_bulk_fills++);
+ if ((ret = __rep_send_bulk(env, bulk, flags)) != 0) {
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ return (ret);
+ }
+ }
+
+ /*
+ * If we're using throttling, see if we are at the throttling
+ * limit before we do any more work here, by checking if the
+ * call to rep_send_throttle changed the repth->type to the
+ * *_MORE message type. If the throttling code hits the limit
+ * then we're done here: we return with ret == 0 and the record
+ * has not been placed in the buffer.
+ */
+ if (bulk->type == REP_BULK_LOG)
+ typemore = REP_LOG_MORE;
+ else
+ typemore = REP_PAGE_MORE;
+ if (repth != NULL) {
+ if ((ret = __rep_send_throttle(env,
+ bulk->eid, repth, REP_THROTTLE_ONLY, flags)) != 0) {
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ return (ret);
+ }
+ if (repth->type == typemore) {
+ RPRINT(env, DB_VERB_REP_MSGS, (env,
+ "bulk_msg: Record %lu (0x%lx) hit throttle limit.",
+ (u_long)recsize, (u_long)recsize));
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ return (ret);
+ }
+ }
+
+ /*
+ * Now we own the buffer, and we know our record fits into it.
+ * The buffer is structured with the len, LSN and then the record.
+ * Copy the record into the buffer. Then if we need to,
+ * send the buffer.
+ */
+ p = bulk->addr + *(bulk->offp);
+ b_args.len = dbt->size;
+ b_args.lsn = *lsn;
+ b_args.bulkdata = *dbt;
+ /*
+ * If we're the first record, we need to save the first
+ * LSN in the bulk structure.
+ */
+ if (*(bulk->offp) == 0)
+ bulk->lsn = *lsn;
+ if (rep->version < DB_REPVERSION_47) { /* older version: pack size,
+ * LSN and data by hand, advancing p; len stays 0 so the offset
+ * computed below is simply the new p */
+ len = 0;
+ memcpy(p, &dbt->size, sizeof(dbt->size));
+ p += sizeof(dbt->size);
+ memcpy(p, lsn, sizeof(DB_LSN));
+ p += sizeof(DB_LSN);
+ memcpy(p, dbt->data, dbt->size);
+ p += dbt->size;
+ } else if ((ret = __rep_bulk_marshal(env, &b_args, p,
+ bulk->len, &len)) != 0)
+ goto err;
+ *(bulk->offp) = (uintptr_t)p + (uintptr_t)len - (uintptr_t)bulk->addr;
+ STAT(rep->stat.st_bulk_records++);
+ /*
+ * Send the buffer if it is a perm record or a force.
+ */
+ if (LF_ISSET(REPCTL_PERM)) {
+ RPRINT(env, DB_VERB_REP_MSGS, (env,
+ "bulk_msg: Send buffer after copy due to PERM"));
+ ret = __rep_send_bulk(env, bulk, flags);
+ }
+err:
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ return (ret);
+
+}
+
+/*
+ * __rep_send_bulk --
+ * This function transmits the bulk buffer given. It assumes the
+ * caller holds the REP->mtx_clientdb. We may release it and reacquire
+ * it during this call. We will return with it held.
+ *
+ * Returns 0 if the buffer is empty (nothing is sent and the mutex is
+ * never dropped) or if __rep_send_message succeeded; any send failure
+ * is reported as DB_REP_UNAVAIL. The buffer offset is reset and
+ * BULK_XMIT cleared whether or not the send worked, so the buffered
+ * records are discarded on failure.
+ *
+ * PUBLIC: int __rep_send_bulk __P((ENV *, REP_BULK *, u_int32_t));
+ */
+int
+__rep_send_bulk(env, bulkp, ctlflags)
+ ENV *env;
+ REP_BULK *bulkp;
+ u_int32_t ctlflags;
+{
+ DBT dbt;
+ DB_REP *db_rep;
+ REP *rep;
+ int ret;
+
+ /*
+ * If the offset is 0, we're done. There is nothing to send.
+ */
+ if (*(bulkp->offp) == 0)
+ return (0);
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+
+ /*
+ * Set that this buffer is being actively transmitted. The flag is
+ * set, and the DBT describing the buffered bytes built, before the
+ * mutex is released.
+ */
+ FLD_SET(*(bulkp->flagsp), BULK_XMIT);
+ DB_INIT_DBT(dbt, bulkp->addr, *(bulkp->offp));
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ RPRINT(env, DB_VERB_REP_MSGS, (env,
+ "send_bulk: Send %d (0x%x) bulk buffer bytes", dbt.size, dbt.size));
+
+ /*
+ * Unlocked the mutex and now send the message. The transfer
+ * counter is bumped without the mutex held.
+ */
+ STAT(rep->stat.st_bulk_transfers++);
+ if ((ret = __rep_send_message(env,
+ bulkp->eid, bulkp->type, &bulkp->lsn, &dbt, ctlflags, 0)) != 0)
+ ret = DB_REP_UNAVAIL;
+
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ /*
+ * Ready the buffer for further records.
+ */
+ *(bulkp->offp) = 0;
+ FLD_CLR(*(bulkp->flagsp), BULK_XMIT);
+ return (ret);
+}
+
+/*
+ * __rep_bulk_alloc --
+ *	Set up an internal bulk buffer: clear the REP_BULK, zero the
+ *	caller's offset and flag words, and allocate MEGABYTE bytes of
+ *	buffer space.  The offset/flag pointers, message type and target
+ *	eid are recorded in the REP_BULK only once the allocation has
+ *	succeeded.  Used by the master when fulfilling a request for a
+ *	chunk of log records or a bunch of pages.
+ *
+ * PUBLIC: int __rep_bulk_alloc __P((ENV *, REP_BULK *, int, uintptr_t *,
+ * PUBLIC:    u_int32_t *, u_int32_t));
+ */
+int
+__rep_bulk_alloc(env, bulkp, eid, offp, flagsp, type)
+	ENV *env;
+	REP_BULK *bulkp;
+	int eid;
+	uintptr_t *offp;
+	u_int32_t *flagsp, type;
+{
+	int ret;
+
+	memset(bulkp, 0, sizeof(*bulkp));
+	*flagsp = 0;
+	*offp = 0;
+
+	bulkp->len = MEGABYTE;
+	ret = __os_malloc(env, bulkp->len, &bulkp->addr);
+	if (ret == 0) {
+		bulkp->offp = offp;
+		bulkp->type = type;
+		bulkp->eid = eid;
+		bulkp->flagsp = flagsp;
+	}
+	return (ret);
+}
+
+/*
+ * __rep_bulk_free --
+ *	This function sends the remainder of the bulk buffer and frees it.
+ *	The buffer is freed whether or not the final send succeeds, and the
+ *	descriptor's buffer pointer is cleared so that a stale descriptor
+ *	cannot be used to reach freed memory.
+ *
+ *	Returns the result of __rep_send_bulk.
+ *
+ * PUBLIC: int __rep_bulk_free __P((ENV *, REP_BULK *, u_int32_t));
+ */
+int
+__rep_bulk_free(env, bulkp, flags)
+	ENV *env;
+	REP_BULK *bulkp;
+	u_int32_t flags;
+{
+	DB_REP *db_rep;
+	int ret;
+
+	db_rep = env->rep_handle;
+
+	/* __rep_send_bulk expects its caller to hold mtx_clientdb. */
+	MUTEX_LOCK(env, db_rep->region->mtx_clientdb);
+	ret = __rep_send_bulk(env, bulkp, flags);
+	MUTEX_UNLOCK(env, db_rep->region->mtx_clientdb);
+
+	__os_free(env, bulkp->addr);
+	/* Don't leave a dangling pointer behind in the caller's structure. */
+	bulkp->addr = NULL;
+	return (ret);
+}
+
+/*
+ * __rep_send_message --
+ * This is a wrapper for sending a message. It takes care of constructing
+ * the control structure and calling the user's specified send function.
+ *
+ * eid is handed unchanged to the send function. rtype is the current
+ * message type; when rep->version is older than DB_REPVERSION it is first
+ * translated with __rep_msg_to_old. lsnp and dbt may each be NULL: a NULL
+ * lsnp yields a zero LSN in the control structure and a NULL dbt is
+ * replaced by an empty DBT. ctlflags become the control structure's flags;
+ * repflags seed the flags passed to the send function.
+ *
+ * Returns 0 without sending when the translated type is REP_INVALID, the
+ * result of __env_panic when rep->version is newer than DB_REPVERSION, and
+ * otherwise whatever the send function returns.
+ *
+ * PUBLIC: int __rep_send_message __P((ENV *, int,
+ * PUBLIC: u_int32_t, DB_LSN *, const DBT *, u_int32_t, u_int32_t));
+ */
+int
+__rep_send_message(env, eid, rtype, lsnp, dbt, ctlflags, repflags)
+ ENV *env;
+ int eid;
+ u_int32_t rtype;
+ DB_LSN *lsnp;
+ const DBT *dbt;
+ u_int32_t ctlflags, repflags;
+{
+ DBT cdbt, scrap_dbt;
+ DB_ENV *dbenv;
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ REP_46_CONTROL cntrl46;
+ REP_OLD_CONTROL ocntrl;
+ __rep_control_args cntrl;
+ db_timespec msg_time;
+ int ret;
+ u_int32_t myflags;
+ u_int8_t buf[__REP_CONTROL_SIZE];
+ size_t len;
+
+ dbenv = env->dbenv;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ ret = 0;
+
+#if defined(DEBUG_ROP) || defined(DEBUG_WOP)
+ if (db_rep->send == NULL)
+ return (0);
+#endif
+
+ /* Set up control structure. */
+ memset(&cntrl, 0, sizeof(cntrl));
+ memset(&ocntrl, 0, sizeof(ocntrl));
+ memset(&cntrl46, 0, sizeof(cntrl46));
+ if (lsnp == NULL)
+ ZERO_LSN(cntrl.lsn);
+ else
+ cntrl.lsn = *lsnp;
+ /*
+ * Set the rectype based on the version we need to speak.
+ */
+ if (rep->version == DB_REPVERSION)
+ cntrl.rectype = rtype;
+ else if (rep->version < DB_REPVERSION) {
+ cntrl.rectype = __rep_msg_to_old(rep->version, rtype);
+ RPRINT(env, DB_VERB_REP_MSGS, (env,
+ "rep_send_msg: rtype %lu to version %lu record %lu.",
+ (u_long)rtype, (u_long)rep->version,
+ (u_long)cntrl.rectype));
+ if (cntrl.rectype == REP_INVALID)
+ return (ret); /* ret is still 0 here; nothing is sent. */
+ } else {
+ __db_errx(env,
+ "rep_send_message: Unknown rep version %lu, my version %lu",
+ (u_long)rep->version, (u_long)DB_REPVERSION);
+ return (__env_panic(env, EINVAL));
+ }
+ cntrl.flags = ctlflags;
+ cntrl.rep_version = rep->version;
+ cntrl.log_version = lp->persist.version;
+ cntrl.gen = rep->gen;
+
+ /* Don't assume the send function will be tolerant of NULL records. */
+ if (dbt == NULL) {
+ memset(&scrap_dbt, 0, sizeof(DBT));
+ dbt = &scrap_dbt;
+ }
+
+ /*
+ * There are several types of records: commit and checkpoint records
+ * that affect database durability, regular log records that might
+ * be buffered on the master before being transmitted, and control
+ * messages which don't require the guarantees of permanency, but
+ * should not be buffered.
+ *
+ * There are request records that can be sent anywhere, and there
+ * are rerequest records that the app might want to send to the master.
+ *
+ * In terms of the flags computed below: REPCTL_PERM selects
+ * DB_REP_PERMANENT; otherwise anything that is not a plain REP_LOG,
+ * or that carries REPCTL_RESEND, gets DB_REP_NOBUFFER.
+ */
+ myflags = repflags;
+ if (FLD_ISSET(ctlflags, REPCTL_PERM))
+ myflags |= DB_REP_PERMANENT;
+ else if (rtype != REP_LOG || FLD_ISSET(ctlflags, REPCTL_RESEND))
+ myflags |= DB_REP_NOBUFFER;
+
+ /*
+ * Let everyone know if we've been in an established group.
+ */
+ if (F_ISSET(rep, REP_F_GROUP_ESTD))
+ F_SET(&cntrl, REPCTL_GROUP_ESTD);
+
+ /*
+ * We're sending messages to some other version. We cannot
+ * assume DB_REP_ANYWHERE is available. Turn it off.
+ */
+ if (rep->version != DB_REPVERSION)
+ FLD_CLR(myflags, DB_REP_ANYWHERE);
+
+ /*
+ * If we are a master sending a perm record, then set the
+ * REPCTL_LEASE flag to have the client reply. Also set
+ * the start time that the client will echo back to us.
+ *
+ * !!! If we are a master, using leases, we had better not be
+ * sending to an older version.
+ */
+ if (IS_REP_MASTER(env) && IS_USING_LEASES(env) &&
+ FLD_ISSET(ctlflags, REPCTL_PERM)) {
+ F_SET(&cntrl, REPCTL_LEASE);
+ DB_ASSERT(env, rep->version == DB_REPVERSION);
+ __os_gettime(env, &msg_time, 1);
+ cntrl.msg_sec = (u_int32_t)msg_time.tv_sec;
+ cntrl.msg_nsec = (u_int32_t)msg_time.tv_nsec;
+ }
+
+ REP_PRINT_MESSAGE(env, eid, &cntrl, "rep_send_message", myflags);
+#ifdef REP_DIAGNOSTIC
+ if (FLD_ISSET(
+ env->dbenv->verbose, DB_VERB_REP_MSGS) && rtype == REP_LOG)
+ __rep_print_logmsg(env, dbt, lsnp);
+#endif
+
+ /*
+ * If DB_REP_PERMANENT is set, the LSN better be non-zero.
+ */
+ DB_ASSERT(env, !FLD_ISSET(myflags, DB_REP_PERMANENT) ||
+ !IS_ZERO_LSN(cntrl.lsn));
+
+ /*
+ * If we're talking to an old version, send an old control structure.
+ * The two older layouts are sent as the raw structures; only the
+ * current layout goes through __rep_control_marshal.
+ */
+ memset(&cdbt, 0, sizeof(cdbt));
+ if (rep->version <= DB_REPVERSION_45) {
+ if (rep->version == DB_REPVERSION_45 &&
+ F_ISSET(&cntrl, REPCTL_INIT)) {
+ F_CLR(&cntrl, REPCTL_INIT);
+ F_SET(&cntrl, REPCTL_INIT_45);
+ }
+ ocntrl.rep_version = cntrl.rep_version;
+ ocntrl.log_version = cntrl.log_version;
+ ocntrl.lsn = cntrl.lsn;
+ ocntrl.rectype = cntrl.rectype;
+ ocntrl.gen = cntrl.gen;
+ ocntrl.flags = cntrl.flags;
+ cdbt.data = &ocntrl;
+ cdbt.size = sizeof(ocntrl);
+ } else if (rep->version == DB_REPVERSION_46) {
+ cntrl46.rep_version = cntrl.rep_version;
+ cntrl46.log_version = cntrl.log_version;
+ cntrl46.lsn = cntrl.lsn;
+ cntrl46.rectype = cntrl.rectype;
+ cntrl46.gen = cntrl.gen;
+ cntrl46.msg_time.tv_sec = (time_t)cntrl.msg_sec;
+ cntrl46.msg_time.tv_nsec = (long)cntrl.msg_nsec;
+ cntrl46.flags = cntrl.flags;
+ cdbt.data = &cntrl46;
+ cdbt.size = sizeof(cntrl46);
+ } else {
+ (void)__rep_control_marshal(env, &cntrl, buf,
+ __REP_CONTROL_SIZE, &len);
+ DB_INIT_DBT(cdbt, buf, len);
+ }
+
+ /*
+ * We set the LSN above to something valid. Give the master the
+ * actual LSN so that they can coordinate with permanent records from
+ * the client if they want to.
+ *
+ * !!! Even though we marshalled the control message for transmission,
+ * give the transport function the real LSN.
+ */
+ ret = db_rep->send(dbenv, &cdbt, dbt, &cntrl.lsn, eid, myflags);
+
+ /*
+ * We don't hold the rep lock, so this could miscount if we race.
+ * I don't think it's worth grabbing the mutex for that bit of
+ * extra accuracy.
+ *
+ * Note the preprocessor layout below: with HAVE_STATISTICS the
+ * failure branch is closed by "} else" and the success counter is
+ * the else body; without it the branch is closed by the lone "}".
+ */
+ if (ret != 0) {
+ RPRINT(env, DB_VERB_REP_MSGS, (env,
+ "rep_send_function returned: %d", ret));
+#ifdef HAVE_STATISTICS
+ rep->stat.st_msgs_send_failures++;
+ } else
+ rep->stat.st_msgs_sent++;
+#else
+ }
+#endif
+ return (ret);
+}
+
+#ifdef REP_DIAGNOSTIC
+/*
+ * __rep_print_logmsg --
+ *	Debugging aid: dump a log record that we are about to transmit to
+ *	a client by running it through the print dispatch table.
+ */
+static void
+__rep_print_logmsg(env, logdbt, lsnp)
+	ENV *env;
+	const DBT *logdbt;
+	DB_LSN *lsnp;
+{
+	static DB_DISTAB dtab;
+	static int tab_ready = 0;
+
+	/* The dispatch table is filled in on the first call only. */
+	if (!tab_ready) {
+		tab_ready = 1;
+
+		(void)__bam_init_print(env, &dtab);
+		(void)__crdel_init_print(env, &dtab);
+		(void)__db_init_print(env, &dtab);
+		(void)__dbreg_init_print(env, &dtab);
+		(void)__fop_init_print(env, &dtab);
+		(void)__ham_init_print(env, &dtab);
+		(void)__qam_init_print(env, &dtab);
+		(void)__txn_init_print(env, &dtab);
+	}
+
+	/* The dispatch interface wants a non-const DBT. */
+	(void)__db_dispatch(env,
+	    &dtab, (DBT *)logdbt, lsnp, DB_TXN_PRINT, NULL);
+}
+#endif
+
+/*
+ * __rep_new_master --
+ * Called after a master election to sync back up with a new master.
+ * It's possible that we already know of this new master in which case
+ * we don't need to do anything.
+ *
+ * This is written assuming that this message came from the master; we
+ * need to enforce that in __rep_process_record, but right now, we have
+ * no way to identify the master.
+ *
+ * cntrl is the control structure of the message that announced the
+ * master and eid is the id it arrived from; a "change" means either the
+ * generation or the master id differs from what the region records.
+ *
+ * Returns DB_REP_NEWMASTER once sync-up with a changed master has been
+ * started, 0 if a message lockout was already in effect when the change
+ * was noticed, the result of any resend request when nothing changed,
+ * or the error from whichever step failed.
+ *
+ * PUBLIC: int __rep_new_master __P((ENV *, __rep_control_args *, int));
+ */
+int
+__rep_new_master(env, cntrl, eid)
+ ENV *env;
+ __rep_control_args *cntrl;
+ int eid;
+{
+ DBT dbt;
+ DB_LOG *dblp;
+ DB_LOGC *logc;
+ DB_LSN first_lsn, lsn;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ LOG *lp;
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ db_timeout_t lease_to;
+ u_int32_t unused;
+ int change, do_req, lockout, ret, t_ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+ ret = 0;
+ logc = NULL;
+ lockout = 0;
+ REP_SYSTEM_LOCK(env);
+ change = rep->gen != cntrl->gen || rep->master_id != eid;
+ /*
+ * If we're hearing from a current or new master, then we
+ * want to clear EPHASE0 in case this site is waiting to
+ * hear from the master.
+ */
+ F_CLR(rep, REP_F_EPHASE0);
+ if (change) {
+ /*
+ * If we are already locking out others, we're either
+ * in the middle of sync-up recovery or internal init
+ * when this newmaster comes in (we also lockout in
+ * rep_start, but we cannot be racing that because we
+ * don't allow rep_proc_msg when rep_start is going on).
+ *
+ * We're about to become the client of a new master. Since we
+ * want to be able to sync with the new master as quickly as
+ * possible, interrupt any STARTSYNC from the old master. The
+ * new master may need to rely on acks from us and the old
+ * STARTSYNC is now irrelevant.
+ *
+ * Note that, conveniently, the "lockout" flag defines the
+ * section of this code path during which both "message lockout"
+ * and "memp sync interrupt" are in effect.
+ */
+ if (F_ISSET(rep, REP_F_READY_MSG))
+ goto lckout; /* ret is still 0 here. */
+
+ if ((ret = __rep_lockout_msg(env, rep, 1)) != 0)
+ goto errlck;
+
+ (void)__memp_set_config(env->dbenv, DB_MEMP_SYNC_INTERRUPT, 1);
+ lockout = 1;
+ /*
+ * We must wait any remaining lease time before accepting
+ * this new master. This must be after the lockout above
+ * so that no new message can be processed and re-grant
+ * the lease out from under us.
+ */
+ if (IS_USING_LEASES(env) &&
+ ((lease_to = __rep_lease_waittime(env)) != 0)) {
+ REP_SYSTEM_UNLOCK(env);
+ __os_yield(env, 0, (u_long)lease_to);
+ REP_SYSTEM_LOCK(env);
+ F_SET(rep, REP_F_LEASE_EXPIRED);
+ }
+
+ if ((ret = __env_init_rec(env, cntrl->log_version)) != 0)
+ goto errlck;
+
+ REP_SYSTEM_UNLOCK(env);
+
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ lp->wait_ts = rep->request_gap;
+ ZERO_LSN(lp->verify_lsn);
+ ZERO_LSN(lp->prev_ckp);
+ ZERO_LSN(lp->waiting_lsn);
+ ZERO_LSN(lp->max_wait_lsn);
+ /*
+ * Open if we need to, in preparation for the truncate
+ * we'll do in a moment.
+ */
+ if (db_rep->rep_db == NULL &&
+ (ret = __rep_client_dbinit(env, 0, REP_DB)) != 0) {
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ goto err; /* Region mutex not held: use err, not errlck. */
+ }
+
+ /*
+ * If we were in the middle of an internal initialization
+ * and we've discovered a new master instead, clean up
+ * our old internal init information. We need to clean
+ * up any flags and unlock our lockout.
+ */
+ REP_SYSTEM_LOCK(env);
+ if (F_ISSET(rep, REP_F_READY_API | REP_F_READY_OP)) {
+ ret = __rep_init_cleanup(env, rep, DB_FORCE);
+ /*
+ * Note that if an in-progress internal init was indeed
+ * "cleaned up", clearing these flags now will allow the
+ * application to see a completely empty database
+ * environment for a moment (until the master responds
+ * to our ALL_REQ).
+ */
+ F_CLR(rep, REP_F_ABBREVIATED | REP_F_RECOVER_MASK);
+ }
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ if (ret != 0) {
+ /* TODO: consider add'l error recovery steps. */
+ goto errlck;
+ }
+ ENV_GET_THREAD_INFO(env, ip);
+ if ((ret = __db_truncate(db_rep->rep_db, ip, NULL, &unused))
+ != 0)
+ goto errlck;
+ rep->stat.st_log_queued = 0;
+
+ /*
+ * This needs to be performed under message lockout
+ * if we're actually changing master.
+ */
+ __rep_elect_done(env, rep, 1);
+ RPRINT(env, DB_VERB_REP_MISC, (env,
+ "Updating gen from %lu to %lu from master %d",
+ (u_long)rep->gen, (u_long)cntrl->gen, eid));
+ rep->gen = cntrl->gen;
+ (void)__rep_write_gen(env, rep, rep->gen);
+ if (rep->egen <= rep->gen)
+ rep->egen = rep->gen + 1;
+ rep->master_id = eid;
+ STAT(rep->stat.st_master_changes++);
+ rep->stat.st_startup_complete = 0;
+ __log_set_version(env, cntrl->log_version);
+ rep->version = cntrl->rep_version;
+ RPRINT(env, DB_VERB_REP_MISC, (env,
+ "egen: %lu. rep version %lu",
+ (u_long)rep->egen, (u_long)rep->version));
+
+ /*
+ * If we're delaying client sync-up, we know we have a
+ * new/changed master now, set flag indicating we are
+ * actively delaying.
+ */
+ if (FLD_ISSET(rep->config, REP_C_DELAYCLIENT))
+ F_SET(rep, REP_F_DELAY);
+ F_SET(rep, REP_F_NOARCHIVE | REP_F_RECOVER_VERIFY);
+ F_CLR(rep, REP_F_READY_MSG);
+ (void)__memp_set_config(env->dbenv, DB_MEMP_SYNC_INTERRUPT, 0);
+ lockout = 0;
+ } else
+ __rep_elect_done(env, rep, 1);
+ REP_SYSTEM_UNLOCK(env);
+
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ lsn = lp->ready_lsn;
+
+ if (!change) {
+ ret = 0;
+ do_req = __rep_check_doreq(env, rep);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ /*
+ * If there wasn't a change, we might still have some
+ * catching up or verification to do.
+ */
+ if (do_req &&
+ (F_ISSET(rep, REP_F_RECOVER_MASK) ||
+ LOG_COMPARE(&lsn, &cntrl->lsn) < 0)) {
+ ret = __rep_resend_req(env, 0);
+ if (ret != 0)
+ RPRINT(env, DB_VERB_REP_MISC, (env,
+ "resend_req ret is %lu", (u_long)ret));
+ }
+ /*
+ * If we're not in one of the recovery modes, we need to
+ * clear the NOARCHIVE flag. Elections set NOARCHIVE
+ * and if we called an election and found the same
+ * master, we need to clear NOARCHIVE here.
+ */
+ if (!F_ISSET(rep, REP_F_RECOVER_MASK)) {
+ REP_SYSTEM_LOCK(env);
+ F_CLR(rep, REP_F_NOARCHIVE);
+ REP_SYSTEM_UNLOCK(env);
+ }
+ return (ret);
+ }
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+
+ /*
+ * If the master changed, we need to start the process of
+ * figuring out what our last valid log record is. However,
+ * if both the master and we agree that the max LSN is 0,0,
+ * then there is no recovery to be done. If we are at 0 and
+ * the master is not, then we just need to request all the log
+ * records from the master.
+ */
+ if (IS_INIT_LSN(lsn) || IS_ZERO_LSN(lsn)) {
+ if ((ret = __rep_newmaster_empty(env, eid)) != 0)
+ goto err;
+ goto newmaster_complete;
+ }
+
+ memset(&dbt, 0, sizeof(dbt));
+ /*
+ * If this client is farther ahead on the log file than the master, see
+ * if there is any overlap in the logs. If not, the client is too
+ * far ahead of the master and the client will start over.
+ */
+ if (cntrl->lsn.file < lsn.file) {
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ goto err;
+ ret = __logc_get(logc, &first_lsn, &dbt, DB_FIRST);
+ if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret == DB_NOTFOUND)
+ goto notfound;
+ else if (ret != 0)
+ goto err;
+ if (cntrl->lsn.file < first_lsn.file)
+ goto notfound;
+ }
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ goto err;
+ ret = __rep_log_backup(env, rep, logc, &lsn);
+ if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret == DB_NOTFOUND)
+ goto notfound;
+ else if (ret != 0)
+ goto err;
+
+ /*
+ * Finally, we have a record to ask for.
+ */
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ lp->verify_lsn = lsn;
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ lp->wait_ts = rep->request_gap;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ if (!F_ISSET(rep, REP_F_DELAY))
+ (void)__rep_send_message(env,
+ eid, REP_VERIFY_REQ, &lsn, NULL, 0, DB_REP_ANYWHERE);
+ goto newmaster_complete;
+
+err: /*
+ * If we failed, we need to clear the flags we may have set above
+ * because we're not going to be setting the verify_lsn.
+ *
+ * The three labels nest: err is entered without the region mutex
+ * and takes it, errlck is entered with it held, and lckout only
+ * releases it.
+ */
+ REP_SYSTEM_LOCK(env);
+errlck: if (lockout) {
+ F_CLR(rep, REP_F_READY_MSG);
+ (void)__memp_set_config(env->dbenv, DB_MEMP_SYNC_INTERRUPT, 0);
+ }
+ F_CLR(rep, REP_F_RECOVER_MASK | REP_F_DELAY);
+lckout: REP_SYSTEM_UNLOCK(env);
+ return (ret);
+
+notfound:
+ /*
+ * If we don't have an identification record, we still
+ * might have some log records but we're discarding them
+ * to sync up with the master from the start.
+ * Therefore, truncate our log and treat it as if it
+ * were empty. In-memory logs can't be completely
+ * zeroed using __log_vtruncate, so just zero them out.
+ */
+ RPRINT(env, DB_VERB_REP_MISC,
+ (env, "No commit or ckp found. Truncate log."));
+ if (lp->db_log_inmemory) {
+ ZERO_LSN(lsn);
+ ret = __log_zero(env, &lsn);
+ } else {
+ INIT_LSN(lsn);
+ ret = __log_vtruncate(env, &lsn, &lsn, NULL);
+ }
+ if (ret != 0 && ret != DB_NOTFOUND)
+ return (ret); /* NOTE(review): skips the flag cleanup at err: -- confirm intended. */
+ infop = env->reginfo;
+ renv = infop->primary;
+ REP_SYSTEM_LOCK(env);
+ (void)time(&renv->rep_timestamp);
+ REP_SYSTEM_UNLOCK(env);
+ if ((ret = __rep_newmaster_empty(env, eid)) != 0)
+ goto err;
+newmaster_complete:
+ return (DB_REP_NEWMASTER);
+}
+
+/*
+ * __rep_newmaster_empty
+ *	A NEWMASTER message arrived while our log is empty, which calls for
+ *	internal init.  Move from the VERIFY to the UPDATE recovery state
+ *	and then, depending on configuration:
+ *	- F_DELAY in effect: do nothing more now; NOAUTOINIT is not even
+ *	  looked at yet, because it could change before the rep_sync call.
+ *	- NOAUTOINIT configured: undo the recovery/no-archive flags and
+ *	  return DB_REP_JOIN_FAILURE.
+ *	- otherwise: ask the master for an update and return 0.
+ */
+static int
+__rep_newmaster_empty(env, eid)
+	ENV *env;
+	int eid;
+{
+	LOG *lp;
+	REP *rep;
+	int ret, want_update;
+
+	rep = env->rep_handle->region;
+	lp = env->lg_handle->reginfo.primary;
+	ret = 0;
+	want_update = 0;
+
+	MUTEX_LOCK(env, rep->mtx_clientdb);
+	REP_SYSTEM_LOCK(env);
+	lp->wait_ts = rep->request_gap;
+
+	/* Assume we will skip straight to UPDATE; revised just below. */
+	F_CLR(rep, REP_F_RECOVER_VERIFY);
+	F_SET(rep, REP_F_RECOVER_UPDATE);
+
+	/* When delaying, wait_ts is already set up and that is all we do. */
+	if (!F_ISSET(rep, REP_F_DELAY)) {
+		if (FLD_ISSET(rep->config, REP_C_NOAUTOINIT)) {
+			F_CLR(rep, REP_F_NOARCHIVE | REP_F_RECOVER_MASK);
+			ret = DB_REP_JOIN_FAILURE;
+		} else
+			want_update = 1;
+	}
+	REP_SYSTEM_UNLOCK(env);
+	MUTEX_UNLOCK(env, rep->mtx_clientdb);
+
+	/* The request itself goes out only after both mutexes are dropped. */
+	if (!want_update)
+		return (ret);
+	(void)__rep_send_message(env, eid, REP_UPDATE_REQ,
+	    NULL, NULL, 0, 0);
+	return (ret);
+}
+
+/*
+ * __rep_noarchive
+ *	Tell log_archive whether removing log files is currently off limits.
+ *	Returns EINVAL while the environment region is replication-locked,
+ *	1 if the replication region has REP_F_NOARCHIVE set, and 0 otherwise
+ *	(including when replication is not on).
+ *
+ * PUBLIC: int __rep_noarchive __P((ENV *));
+ */
+int
+__rep_noarchive(env)
+	ENV *env;
+{
+	REGENV *renv;
+	REP *rep;
+	time_t now;
+
+	renv = env->reginfo->primary;
+
+	/*
+	 * The region-level lock is examined before REP_ON because any
+	 * replication process that disabled archiving must be obeyed, and
+	 * everything needed for the check lives in the environment region.
+	 */
+	if (F_ISSET(renv, DB_REGENV_REPLOCKED)) {
+		(void)time(&now);
+		TIMESTAMP_CHECK(env, now, renv);
+		/* The timestamp check may have lifted the lock; look again. */
+		if (F_ISSET(renv, DB_REGENV_REPLOCKED))
+			return (EINVAL);
+	}
+
+	if (!REP_ON(env))
+		return (0);
+
+	rep = env->rep_handle->region;
+	if (F_ISSET(rep, REP_F_NOARCHIVE))
+		return (1);
+	return (0);
+}
+
+/*
+ * __rep_send_vote
+ *	Package this site's election information and send it as a message
+ *	of type vtype to eid.  Sites speaking a version older than
+ *	DB_REPVERSION_47 are sent the old vote structure as-is; everyone
+ *	else gets the marshalled fixed-size form introduced in 4.7.
+ *
+ * PUBLIC: void __rep_send_vote __P((ENV *, DB_LSN *, u_int32_t, u_int32_t,
+ * PUBLIC: u_int32_t, u_int32_t, u_int32_t, int, u_int32_t, u_int32_t));
+ */
+void
+__rep_send_vote(env, lsnp, nsites, nvotes, pri, tie, egen, eid, vtype, flags)
+	ENV *env;
+	DB_LSN *lsnp;
+	u_int32_t nsites, nvotes, pri, tie, egen;
+	int eid;
+	u_int32_t vtype, flags;
+{
+	DBT vote_dbt;
+	REP *rep;
+	REP_OLD_VOTE_INFO ovi;
+	__rep_vote_info_args vi;
+	size_t len;
+	u_int8_t buf[__REP_VOTE_INFO_SIZE];
+
+	rep = env->rep_handle->region;
+	memset(&vote_dbt, 0, sizeof(vote_dbt));
+
+	if (rep->version >= DB_REPVERSION_47) {
+		/* Current format: fixed sized fields, marshalled into buf. */
+		memset(&vi, 0, sizeof(vi));
+		vi.egen = egen;
+		vi.nsites = nsites;
+		vi.nvotes = nvotes;
+		vi.priority = pri;
+		vi.tiebreaker = tie;
+		(void)__rep_vote_info_marshal(env, &vi, buf,
+		    __REP_VOTE_INFO_SIZE, &len);
+		DB_INIT_DBT(vote_dbt, buf, len);
+	} else {
+		/* Old format: field sizes may differ, so convert each one. */
+		memset(&ovi, 0, sizeof(ovi));
+		ovi.egen = egen;
+		ovi.nsites = (int) nsites;
+		ovi.nvotes = (int) nvotes;
+		ovi.priority = (int) pri;
+		ovi.tiebreaker = tie;
+		vote_dbt.data = &ovi;
+		vote_dbt.size = sizeof(ovi);
+	}
+
+	(void)__rep_send_message(env, eid, vtype, lsnp, &vote_dbt, flags, 0);
+}
+
+/*
+ * __rep_elect_done
+ *	Wipe this site's election state: the phase and tally flags and the
+ *	site and vote counts.  If an election was actually in progress,
+ *	also record how long it took and advance egen.  Assumes the caller
+ *	holds the region mutex.
+ *
+ * PUBLIC: void __rep_elect_done __P((ENV *, REP *, int));
+ */
+void
+__rep_elect_done(env, rep, found_master)
+	ENV *env;
+	REP *rep;
+	int found_master;
+{
+	db_timespec elapsed;
+	int was_electing;
+
+	/* Sample this first: the flags it looks at are cleared below. */
+	was_electing = IN_ELECTION(rep);
+
+	rep->votes = 0;
+	rep->sites = 0;
+	F_CLR(rep, REP_F_EPHASE1 | REP_F_EPHASE2 | REP_F_TALLY);
+	/* Finding a master trumps finding a new egen. */
+	if (found_master)
+		F_CLR(rep, REP_F_EGENUPDATE);
+
+	if (was_electing) {
+		if (timespecisset(&rep->etime)) {
+			/* elapsed = now - election start time. */
+			__os_gettime(env, &elapsed, 1);
+			timespecsub(&elapsed, &rep->etime);
+#ifdef HAVE_STATISTICS
+			rep->stat.st_election_sec = (u_int32_t)elapsed.tv_sec;
+			rep->stat.st_election_usec = (u_int32_t)
+			    (elapsed.tv_nsec / NS_PER_US);
+#endif
+			RPRINT(env, DB_VERB_REP_ELECT, (env,
+			    "Election finished in %lu.%09lu sec",
+			    (u_long)elapsed.tv_sec, (u_long)elapsed.tv_nsec));
+			timespecclear(&rep->etime);
+		}
+		++rep->egen;
+	}
+
+	RPRINT(env, DB_VERB_REP_ELECT,
+	    (env, "Election done; egen %lu", (u_long)rep->egen));
+}
+
+/*
+ * __env_rep_enter --
+ *
+ *	Gate for entering the library while replication initialization
+ *	and/or recovery may be under way.  While the API is locked out we
+ *	either wait or, if REP_C_NOWAIT is configured, fail with
+ *	DB_REP_LOCKOUT.  Once allowed in, the handle count is incremented so
+ *	that recovery does not start while we are operating in the library.
+ *	If checklock is set and the environment region is replication-locked
+ *	(even after the timestamp check), EINVAL is returned.
+ *
+ * PUBLIC: int __env_rep_enter __P((ENV *, int));
+ */
+int
+__env_rep_enter(env, checklock)
+	ENV *env;
+	int checklock;
+{
+	REGENV *renv;
+	REP *rep;
+	time_t now;
+	int tries;
+
+	/* Nothing to coordinate if locks have been globally turned off. */
+	if (F_ISSET(env->dbenv, DB_ENV_NOLOCKING))
+		return (0);
+
+	rep = env->rep_handle->region;
+	renv = env->reginfo->primary;
+
+	if (checklock && F_ISSET(renv, DB_REGENV_REPLOCKED)) {
+		(void)time(&now);
+		TIMESTAMP_CHECK(env, now, renv);
+		/* Still locked out after checking the timestamp? */
+		if (F_ISSET(renv, DB_REGENV_REPLOCKED))
+			return (EINVAL);
+	}
+
+	REP_SYSTEM_LOCK(env);
+	tries = 0;
+	while (F_ISSET(rep, REP_F_READY_API)) {
+		REP_SYSTEM_UNLOCK(env);
+		/*
+		 * We're spinning, so the environment may be hung.  Check
+		 * whether recovery has been initiated.
+		 */
+		PANIC_CHECK(env);
+		if (FLD_ISSET(rep->config, REP_C_NOWAIT)) {
+			__db_errx(env,
+    "Operation locked out. Waiting for replication lockout to complete");
+			return (DB_REP_LOCKOUT);
+		}
+		__os_yield(env, 1, 0);
+		REP_SYSTEM_LOCK(env);
+		tries++;
+		if (tries % 60 == 0)
+			__db_errx(env,
+    "DB_ENV handle waiting %d minutes for replication lockout to complete",
+			    tries / 60);
+	}
+	rep->handle_cnt++;
+	REP_SYSTEM_UNLOCK(env);
+
+	return (0);
+}
+
+/*
+ * __env_db_rep_exit --
+ *
+ *	Drop the handle count, under the region mutex, when a routine
+ *	leaves the library.  A no-op returning 0 when locking is globally
+ *	disabled.
+ *
+ * PUBLIC: int __env_db_rep_exit __P((ENV *));
+ */
+int
+__env_db_rep_exit(env)
+	ENV *env;
+{
+	REP *rep;
+
+	/* With locking turned off the count is not maintained. */
+	if (!F_ISSET(env->dbenv, DB_ENV_NOLOCKING)) {
+		rep = env->rep_handle->region;
+
+		REP_SYSTEM_LOCK(env);
+		rep->handle_cnt--;
+		REP_SYSTEM_UNLOCK(env);
+	}
+
+	return (0);
+}
+
+/*
+ * __db_rep_enter --
+ *	Called in replicated environments to keep track of in-use handles
+ *	and prevent any concurrent operation during recovery.  If checkgen
+ *	is non-zero, the dbp's timestamp must match the environment's
+ *	replication timestamp, otherwise DB_REP_HANDLE_DEAD is returned.
+ *
+ *	While operations are locked out we return DB_LOCK_DEADLOCK: at once
+ *	if return_now is non-zero, after a sleep otherwise.  Without the
+ *	sleep the application would likely retry immediately and could hit
+ *	a retry limit before replication has a chance to finish; the sleep
+ *	makes a successful retry more probable.
+ *
+ *	Callers with txns typically set return_now, because we want the txn
+ *	to abort ASAP so that the lockout can proceed.
+ *
+ * PUBLIC: int __db_rep_enter __P((DB *, int, int, int));
+ */
+int
+__db_rep_enter(dbp, checkgen, checklock, return_now)
+	DB *dbp;
+	int checkgen, checklock, return_now;
+{
+	ENV *env;
+	REGENV *renv;
+	REP *rep;
+	time_t now;
+
+	env = dbp->env;
+	/* Nothing to coordinate if locks have been globally turned off. */
+	if (F_ISSET(env->dbenv, DB_ENV_NOLOCKING))
+		return (0);
+
+	rep = env->rep_handle->region;
+	renv = env->reginfo->primary;
+
+	if (checklock && F_ISSET(renv, DB_REGENV_REPLOCKED)) {
+		(void)time(&now);
+		TIMESTAMP_CHECK(env, now, renv);
+		/* Still locked out after checking the timestamp? */
+		if (F_ISSET(renv, DB_REGENV_REPLOCKED))
+			return (EINVAL);
+	}
+
+	REP_SYSTEM_LOCK(env);
+	/*
+	 * !!!
+	 * We test REP_F_READY_OP although the counter we maintain is
+	 * rep->handle_cnt.  The apparent mismatch is deliberate: returning
+	 * DEADLOCK makes the application abort its txn quickly, which lets
+	 * the lockout proceed.
+	 *
+	 * This is only correct because lockout of the API always sets
+	 * REP_F_READY_OP first.
+	 */
+	if (F_ISSET(rep, REP_F_READY_OP)) {
+		REP_SYSTEM_UNLOCK(env);
+		if (return_now == 0)
+			__os_yield(env, 5, 0);
+		return (DB_LOCK_DEADLOCK);
+	}
+
+	/* Normal case: the handle is current (or we were told not to care). */
+	if (!checkgen || dbp->timestamp == renv->rep_timestamp) {
+		rep->handle_cnt++;
+		REP_SYSTEM_UNLOCK(env);
+		return (0);
+	}
+
+	REP_SYSTEM_UNLOCK(env);
+	__db_errx(env, "%s %s",
+	    "replication recovery unrolled committed transactions;",
+	    "open DB and DBcursor handles must be closed");
+	return (DB_REP_HANDLE_DEAD);
+}
+
+/*
+ * __op_rep_enter --
+ *
+ *	Gate for starting a new multi-step operation, such as a transaction
+ *	or a memp get, while replication initialization and/or recovery may
+ *	be under way.  While operations are locked out we either wait or,
+ *	if REP_C_NOWAIT is configured, fail with DB_REP_LOCKOUT.  Once
+ *	allowed in, op_cnt is incremented so that recovery does not start
+ *	while we have active operations.
+ *
+ * PUBLIC: int __op_rep_enter __P((ENV *));
+ */
+int
+__op_rep_enter(env)
+	ENV *env;
+{
+	REP *rep;
+	int waited;
+
+	/* Nothing to coordinate if locks have been globally turned off. */
+	if (F_ISSET(env->dbenv, DB_ENV_NOLOCKING))
+		return (0);
+
+	rep = env->rep_handle->region;
+
+	REP_SYSTEM_LOCK(env);
+	waited = 0;
+	while (F_ISSET(rep, REP_F_READY_OP)) {
+		REP_SYSTEM_UNLOCK(env);
+		/*
+		 * We're spinning, so the environment may be hung.  Check
+		 * whether recovery has been initiated.
+		 */
+		PANIC_CHECK(env);
+		if (FLD_ISSET(rep->config, REP_C_NOWAIT)) {
+			__db_errx(env,
+    "Operation locked out. Waiting for replication lockout to complete");
+			return (DB_REP_LOCKOUT);
+		}
+		/* The counter advances by the same 5 we hand to the yield. */
+		__os_yield(env, 5, 0);
+		REP_SYSTEM_LOCK(env);
+		waited += 5;
+		if (waited % 60 == 0)
+			__db_errx(env,
+	"__op_rep_enter waiting %d minutes for lockout to complete",
+			    waited / 60);
+	}
+	rep->op_cnt++;
+	REP_SYSTEM_UNLOCK(env);
+
+	return (0);
+}
+
+/*
+ * __op_rep_exit --
+ *
+ *	Drop the operation count, under the region mutex, upon transaction
+ *	commit/abort/discard or memp_fput.  A no-op returning 0 when locking
+ *	is globally disabled.
+ *
+ * PUBLIC: int __op_rep_exit __P((ENV *));
+ */
+int
+__op_rep_exit(env)
+	ENV *env;
+{
+	REP *rep;
+
+	/* With locking turned off the count is not maintained. */
+	if (!F_ISSET(env->dbenv, DB_ENV_NOLOCKING)) {
+		rep = env->rep_handle->region;
+
+		REP_SYSTEM_LOCK(env);
+		/* Every exit must pair with an earlier enter. */
+		DB_ASSERT(env, rep->op_cnt > 0);
+		rep->op_cnt--;
+		REP_SYSTEM_UNLOCK(env);
+	}
+
+	return (0);
+}
+
+/*
+ * __rep_lockout_api --
+ *	Coordinate with other threads in the library and with active txns
+ *	so that we can run single-threaded, for recovery or internal backup.
+ *	Assumes the caller holds the region mutex.
+ *
+ * PUBLIC: int __rep_lockout_api __P((ENV *, REP *));
+ */
+int
+__rep_lockout_api(env, rep)
+	ENV *env;
+	REP *rep;
+{
+	int ret;
+
+	/*
+	 * Long-running operations have to be drained first.  Because
+	 * __db_rep_enter tests REP_F_READY_OP in order to let existing txns
+	 * abort quickly, the order is fixed: REP_F_READY_OP is always
+	 * locked out before REP_F_READY_API.
+	 */
+	ret = __rep_lockout_int(env, rep, &rep->op_cnt, 0,
+	    "op_cnt", REP_F_READY_OP);
+	if (ret == 0)
+		ret = __rep_lockout_int(env, rep, &rep->handle_cnt, 0,
+		    "handle_cnt", REP_F_READY_API);
+	return (ret);
+}
+
+/*
+ * __rep_lockout_apply --
+ *	Coordinate with the other message-processing threads so that we
+ *	run single-threaded and no incoming message can apply new log
+ *	records.  Meant to be held briefly, around a specific critical
+ *	operation during which the log must not change; currently used to
+ *	coordinate with elections.  apply_th is the count of appliers that
+ *	may remain.  Assumes the caller holds the region mutex.
+ *
+ * PUBLIC: int __rep_lockout_apply __P((ENV *, REP *, u_int32_t));
+ */
+int
+__rep_lockout_apply(env, rep, apply_th)
+	ENV *env;
+	REP *rep;
+	u_int32_t apply_th;
+{
+	int ret;
+
+	ret = __rep_lockout_int(env,
+	    rep, &rep->apply_th, apply_th, "apply_th", REP_F_READY_APPLY);
+	return (ret);
+}
+
+/*
+ * __rep_lockout_msg --
+ *	Coordinate with the other message-processing threads so that we
+ *	run single-threaded and no incoming message (a NEWMASTER, say) can
+ *	change the world underneath us.  Meant to be held briefly, around a
+ *	specific critical operation: no new messages may arrive in the
+ *	middle, and all message threads beyond msg_th must be out before it
+ *	starts.  Assumes the caller holds the region mutex.
+ *
+ * PUBLIC: int __rep_lockout_msg __P((ENV *, REP *, u_int32_t));
+ */
+int
+__rep_lockout_msg(env, rep, msg_th)
+	ENV *env;
+	REP *rep;
+	u_int32_t msg_th;
+{
+	int ret;
+
+	ret = __rep_lockout_int(env,
+	    rep, &rep->msg_th, msg_th, "msg_th", REP_F_READY_MSG);
+	return (ret);
+}
+
+/*
+ * __rep_lockout_int --
+ *	Shared implementation of the lockout routines: raise lockout_flag,
+ *	then wait, dropping and retaking the region mutex around each
+ *	yield, until the counter at fieldp has fallen to field_val or
+ *	below.  msg names the counter in diagnostic output.
+ *	Assumes the caller holds the region mutex.
+ */
+static int
+__rep_lockout_int(env, rep, fieldp, field_val, msg, lockout_flag)
+	ENV *env;
+	REP *rep;
+	u_int32_t *fieldp;
+	const char *msg;
+	u_int32_t field_val, lockout_flag;
+{
+	int wait_cnt;
+
+	F_SET(rep, lockout_flag);
+
+	wait_cnt = 0;
+	while (*fieldp > field_val) {
+		REP_SYSTEM_UNLOCK(env);
+		/*
+		 * We're spinning, so the environment may be hung.  Check
+		 * whether recovery has been initiated.
+		 */
+		PANIC_CHECK(env);
+		__os_yield(env, 1, 0);
+#ifdef DIAGNOSTIC
+		/* One early notice, then one per 60 passes. */
+		if (wait_cnt == 5)
+			__db_errx(env,
+"Waiting for %s (%lu) to complete replication lockout",
+			    msg, (u_long)*fieldp);
+		if (++wait_cnt % 60 == 0)
+			__db_errx(env,
+"Waiting for %s (%lu) to complete replication lockout for %d minutes",
+			    msg, (u_long)*fieldp, wait_cnt / 60);
+#endif
+		REP_SYSTEM_LOCK(env);
+	}
+
+	/* msg is otherwise unused in non-DIAGNOSTIC builds. */
+	COMPQUIET(msg, NULL);
+	return (0);
+}
+
+/*
+ * __rep_send_throttle -
+ *	Send a record, throttling if necessary.  The normal message type
+ *	is sent unless the byte budget in repth runs out; in that case the
+ *	repth->type field is switched to the matching *_MORE type and that
+ *	message is sent instead.  Callers notice the changed type and break
+ *	out of their loop.
+ *
+ *	With REP_THROTTLE_ONLY in flags only the accounting is done and
+ *	nothing is sent unless throttling kicks in.
+ *
+ *	Throttling is only relevant when serving requests, so REPCTL_RESEND
+ *	is always sent; further control flags may be passed in ctlflags.
+ *
+ *	Returns 0, or DB_REP_UNAVAIL if the send failed.
+ *
+ * PUBLIC: int __rep_send_throttle __P((ENV *, int, REP_THROTTLE *,
+ * PUBLIC: u_int32_t, u_int32_t));
+ */
+int
+__rep_send_throttle(env, eid, repth, flags, ctlflags)
+	ENV *env;
+	int eid;
+	REP_THROTTLE *repth;
+	u_int32_t ctlflags, flags;
+{
+	DB_REP *db_rep;
+	REP *rep;
+	u_int32_t more_type, need;
+	int limited;
+
+	limited = repth->gbytes != 0 || repth->bytes != 0;
+	/*
+	 * Asked to do throttle processing only, with no limit configured:
+	 * there is nothing to do.
+	 */
+	if (LF_ISSET(REP_THROTTLE_ONLY) && !limited)
+		return (0);
+
+	db_rep = env->rep_handle;
+	rep = db_rep->region;
+
+	if (repth->type == REP_LOG)
+		more_type = REP_LOG_MORE;
+	else if (repth->type == REP_PAGE)
+		more_type = REP_PAGE_MORE;
+	else
+		more_type = 0;
+	DB_ASSERT(env, more_type != 0);
+
+	/*
+	 * data_dbt.size covers the record alone.  Count the control
+	 * structure too, so that we're not off by a lot when records are
+	 * small.
+	 */
+	need = repth->data_dbt->size + sizeof(__rep_control_args);
+	if (limited) {
+		/* Convert whole gigabytes into bytes for as long as needed. */
+		while (repth->bytes <= need && repth->gbytes > 0) {
+			repth->bytes += GIGABYTE;
+			--(repth->gbytes);
+		}
+		if (repth->bytes <= need) {
+			/*
+			 * Budget exhausted.  We don't hold the rep mutex,
+			 * and may miscount.
+			 */
+			STAT(rep->stat.st_nthrottles++);
+			repth->type = more_type;
+		} else
+			repth->bytes -= need;
+	}
+
+	/*
+	 * A *_MORE message always goes out; anything else goes out only
+	 * when REP_THROTTLE_ONLY is not set.
+	 *
+	 * NOTE: It is the responsibility of the caller to marshal, if
+	 * needed, the data_dbt.  This function just sends what it is given.
+	 */
+	if (repth->type != more_type && LF_ISSET(REP_THROTTLE_ONLY))
+		return (0);
+	if (__rep_send_message(env, eid, repth->type,
+	    &repth->lsn, repth->data_dbt, (REPCTL_RESEND | ctlflags), 0) != 0)
+		return (DB_REP_UNAVAIL);
+	return (0);
+}
+
+/*
+ * __rep_msg_to_old --
+ *	Convert current message numbers to old message numbers.
+ *
+ *	version is the replication protocol version of the peer and rectype
+ *	is a current message number.  Returns the number the same message has
+ *	in that older version, or REP_INVALID if the message does not exist
+ *	there, the version is unsupported, or either argument is outside the
+ *	conversion table.
+ *
+ * PUBLIC: u_int32_t __rep_msg_to_old __P((u_int32_t, u_int32_t));
+ */
+u_int32_t
+__rep_msg_to_old(version, rectype)
+	u_int32_t version, rectype;
+{
+	/*
+	 * One row per older protocol version, one column per current message
+	 * number.  Rows are REP_MAX_MSG+1 wide so that a message number can
+	 * be used directly as the column index; column 0 is never a valid
+	 * message.  Everything for version 0 is invalid, there is no
+	 * version 0.
+	 */
+	static const u_int32_t table[DB_REPVERSION][REP_MAX_MSG+1] = {
+	/* There is no DB_REPVERSION 0. */
+	{   REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID },
+	/*
+	 * 4.2/DB_REPVERSION 1 no longer supported.
+	 */
+	{   REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID },
+	/*
+	 * 4.3/DB_REPVERSION 2 no longer supported.
+	 */
+	{   REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID },
+	/*
+	 * From 4.7 message number To 4.4/4.5 message number
+	 */
+	{   REP_INVALID,	/* NO message 0 */
+	    1,			/* REP_ALIVE */
+	    2,			/* REP_ALIVE_REQ */
+	    3,			/* REP_ALL_REQ */
+	    4,			/* REP_BULK_LOG */
+	    5,			/* REP_BULK_PAGE */
+	    6,			/* REP_DUPMASTER */
+	    7,			/* REP_FILE */
+	    8,			/* REP_FILE_FAIL */
+	    9,			/* REP_FILE_REQ */
+	    REP_INVALID,	/* REP_LEASE_GRANT */
+	    10,			/* REP_LOG */
+	    11,			/* REP_LOG_MORE */
+	    12,			/* REP_LOG_REQ */
+	    13,			/* REP_MASTER_REQ */
+	    14,			/* REP_NEWCLIENT */
+	    15,			/* REP_NEWFILE */
+	    16,			/* REP_NEWMASTER */
+	    17,			/* REP_NEWSITE */
+	    18,			/* REP_PAGE */
+	    19,			/* REP_PAGE_FAIL */
+	    20,			/* REP_PAGE_MORE */
+	    21,			/* REP_PAGE_REQ */
+	    22,			/* REP_REREQUEST */
+	    REP_INVALID,	/* REP_START_SYNC */
+	    23,			/* REP_UPDATE */
+	    24,			/* REP_UPDATE_REQ */
+	    25,			/* REP_VERIFY */
+	    26,			/* REP_VERIFY_FAIL */
+	    27,			/* REP_VERIFY_REQ */
+	    28,			/* REP_VOTE1 */
+	    29			/* REP_VOTE2 */
+	},
+	/*
+	 * From 4.7 message number To 4.6 message number.  There are
+	 * NO message differences between 4.6 and 4.7.  The
+	 * control structure changed.
+	 */
+	{   REP_INVALID,	/* NO message 0 */
+	    1,			/* REP_ALIVE */
+	    2,			/* REP_ALIVE_REQ */
+	    3,			/* REP_ALL_REQ */
+	    4,			/* REP_BULK_LOG */
+	    5,			/* REP_BULK_PAGE */
+	    6,			/* REP_DUPMASTER */
+	    7,			/* REP_FILE */
+	    8,			/* REP_FILE_FAIL */
+	    9,			/* REP_FILE_REQ */
+	    10,			/* REP_LEASE_GRANT */
+	    11,			/* REP_LOG */
+	    12,			/* REP_LOG_MORE */
+	    13,			/* REP_LOG_REQ */
+	    14,			/* REP_MASTER_REQ */
+	    15,			/* REP_NEWCLIENT */
+	    16,			/* REP_NEWFILE */
+	    17,			/* REP_NEWMASTER */
+	    18,			/* REP_NEWSITE */
+	    19,			/* REP_PAGE */
+	    20,			/* REP_PAGE_FAIL */
+	    21,			/* REP_PAGE_MORE */
+	    22,			/* REP_PAGE_REQ */
+	    23,			/* REP_REREQUEST */
+	    24,			/* REP_START_SYNC */
+	    25,			/* REP_UPDATE */
+	    26,			/* REP_UPDATE_REQ */
+	    27,			/* REP_VERIFY */
+	    28,			/* REP_VERIFY_FAIL */
+	    29,			/* REP_VERIFY_REQ */
+	    30,			/* REP_VOTE1 */
+	    31			/* REP_VOTE2 */
+	}
+	};
+
+	/*
+	 * The table has no row for the current or any later version and no
+	 * column beyond REP_MAX_MSG.  Both arguments can originate from a
+	 * message header, so reject anything that would index outside it.
+	 */
+	if (version >= DB_REPVERSION || rectype > REP_MAX_MSG)
+		return (REP_INVALID);
+	return (table[version][rectype]);
+}
+
+/*
+ * __rep_msg_from_old --
+ *	Convert old message numbers to current message numbers.
+ *
+ *	version is the replication protocol version of the peer and rectype
+ *	is a message number as used by that version.  Returns the current
+ *	number of the same message, or REP_INVALID if that version has no
+ *	such message, the version is unsupported, or either argument is
+ *	outside the conversion table.
+ *
+ * PUBLIC: u_int32_t __rep_msg_from_old __P((u_int32_t, u_int32_t));
+ */
+u_int32_t
+__rep_msg_from_old(version, rectype)
+	u_int32_t version, rectype;
+{
+	/*
+	 * One row per older protocol version, one column per old message
+	 * number.  Rows are REP_MAX_MSG+1 wide so that a message number can
+	 * be used directly as the column index; column 0 is never a valid
+	 * message.  Everything for version 0 is invalid, there is no
+	 * version 0.
+	 *
+	 * In the per-entry comments the first number is the old message
+	 * number (the column) and the value is the current message number.
+	 */
+	static const u_int32_t table[DB_REPVERSION][REP_MAX_MSG+1] = {
+	/* There is no DB_REPVERSION 0. */
+	{   REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID },
+	/*
+	 * 4.2/DB_REPVERSION 1 no longer supported.
+	 */
+	{   REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID },
+	/*
+	 * 4.3/DB_REPVERSION 2 no longer supported.
+	 */
+	{   REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID,
+	    REP_INVALID, REP_INVALID, REP_INVALID, REP_INVALID },
+	/*
+	 * From 4.4/4.5 message number To 4.7 message number
+	 */
+	{   REP_INVALID,	/* NO message 0 */
+	    1,			/* 1, REP_ALIVE */
+	    2,			/* 2, REP_ALIVE_REQ */
+	    3,			/* 3, REP_ALL_REQ */
+	    4,			/* 4, REP_BULK_LOG */
+	    5,			/* 5, REP_BULK_PAGE */
+	    6,			/* 6, REP_DUPMASTER */
+	    7,			/* 7, REP_FILE */
+	    8,			/* 8, REP_FILE_FAIL */
+	    9,			/* 9, REP_FILE_REQ */
+	    /* 10, REP_LEASE_GRANT doesn't exist */
+	    11,			/* 10, REP_LOG */
+	    12,			/* 11, REP_LOG_MORE */
+	    13,			/* 12, REP_LOG_REQ */
+	    14,			/* 13, REP_MASTER_REQ */
+	    15,			/* 14, REP_NEWCLIENT */
+	    16,			/* 15, REP_NEWFILE */
+	    17,			/* 16, REP_NEWMASTER */
+	    18,			/* 17, REP_NEWSITE */
+	    19,			/* 18, REP_PAGE */
+	    20,			/* 19, REP_PAGE_FAIL */
+	    21,			/* 20, REP_PAGE_MORE */
+	    22,			/* 21, REP_PAGE_REQ */
+	    23,			/* 22, REP_REREQUEST */
+	    /* 24, REP_START_SYNC doesn't exist */
+	    25,			/* 23, REP_UPDATE */
+	    26,			/* 24, REP_UPDATE_REQ */
+	    27,			/* 25, REP_VERIFY */
+	    28,			/* 26, REP_VERIFY_FAIL */
+	    29,			/* 27, REP_VERIFY_REQ */
+	    30,			/* 28, REP_VOTE1 */
+	    31,			/* 29, REP_VOTE2 */
+	    REP_INVALID,	/* 30, 4.4/4.5 no message */
+	    REP_INVALID		/* 31, 4.4/4.5 no message */
+	},
+	/*
+	 * From 4.6 message number To 4.7 message number.  There are
+	 * NO message differences between 4.6 and 4.7.  The
+	 * control structure changed.
+	 */
+	{   REP_INVALID,	/* NO message 0 */
+	    1,			/* 1, REP_ALIVE */
+	    2,			/* 2, REP_ALIVE_REQ */
+	    3,			/* 3, REP_ALL_REQ */
+	    4,			/* 4, REP_BULK_LOG */
+	    5,			/* 5, REP_BULK_PAGE */
+	    6,			/* 6, REP_DUPMASTER */
+	    7,			/* 7, REP_FILE */
+	    8,			/* 8, REP_FILE_FAIL */
+	    9,			/* 9, REP_FILE_REQ */
+	    10,			/* 10, REP_LEASE_GRANT */
+	    11,			/* 11, REP_LOG */
+	    12,			/* 12, REP_LOG_MORE */
+	    13,			/* 13, REP_LOG_REQ */
+	    14,			/* 14, REP_MASTER_REQ */
+	    15,			/* 15, REP_NEWCLIENT */
+	    16,			/* 16, REP_NEWFILE */
+	    17,			/* 17, REP_NEWMASTER */
+	    18,			/* 18, REP_NEWSITE */
+	    19,			/* 19, REP_PAGE */
+	    20,			/* 20, REP_PAGE_FAIL */
+	    21,			/* 21, REP_PAGE_MORE */
+	    22,			/* 22, REP_PAGE_REQ */
+	    23,			/* 23, REP_REREQUEST */
+	    24,			/* 24, REP_START_SYNC */
+	    25,			/* 25, REP_UPDATE */
+	    26,			/* 26, REP_UPDATE_REQ */
+	    27,			/* 27, REP_VERIFY */
+	    28,			/* 28, REP_VERIFY_FAIL */
+	    29,			/* 29, REP_VERIFY_REQ */
+	    30,			/* 30, REP_VOTE1 */
+	    31			/* 31, REP_VOTE2 */
+	}
+	};
+
+	/*
+	 * The table has no row for the current or any later version and no
+	 * column beyond REP_MAX_MSG.  Both arguments can originate from a
+	 * message header, so reject anything that would index outside it.
+	 */
+	if (version >= DB_REPVERSION || rectype > REP_MAX_MSG)
+		return (REP_INVALID);
+	return (table[version][rectype]);
+}
+
+/*
+ * __rep_print --
+ * Optionally print a verbose message.
+ *
+ * Builds one message in a DB_MSGBUF and flushes it. The message starts
+ * with a bracketed timestamp (tv_sec, then tv_nsec divided by NS_PER_US),
+ * a bracketed thread id string obtained from the thread_id_string method
+ * of the DB_ENV, and a label, followed by the printf-style text described
+ * by fmt and the variable arguments.
+ *
+ * The label is db_errpfx when the application has set one. Otherwise,
+ * if REP_ON(env) holds, it is CLIENT or MASTER according to the
+ * REP_F_CLIENT and REP_F_MASTER flags of the replication region. In all
+ * remaining cases it is REP_UNDEF.
+ *
+ * The definition is written twice, once as a prototype for compilers
+ * with STDC_HEADERS and once in old-style varargs form (va_alist, va_dcl).
+ *
+ * PUBLIC: void __rep_print __P((ENV *, const char *, ...))
+ * PUBLIC: __attribute__ ((__format__ (__printf__, 2, 3)));
+ */
+void
+#ifdef STDC_HEADERS
+__rep_print(ENV *env, const char *fmt, ...)
+#else
+__rep_print(env, fmt, va_alist)
+ ENV *env;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ va_list ap;
+ DB_MSGBUF mb;
+ REP *rep;
+ db_timespec ts;
+ pid_t pid;
+ db_threadid_t tid;
+ const char *s;
+ char buf[DB_THREADID_STRLEN];
+
+ DB_MSGBUF_INIT(&mb);
+
+ s = NULL;
+ if (env->dbenv->db_errpfx != NULL)
+ s = env->dbenv->db_errpfx;
+ else if (REP_ON(env)) {
+ rep = env->rep_handle->region;
+ if (F_ISSET(rep, REP_F_CLIENT))
+ s = "CLIENT";
+ else if (F_ISSET(rep, REP_F_MASTER))
+ s = "MASTER";
+ }
+ if (s == NULL)
+ s = "REP_UNDEF";
+ __os_gettime(env, &ts, 1);
+ __os_id(env->dbenv, &pid, &tid);
+ __db_msgadd(env, &mb, "[%lu:%lu][%s] %s: ",
+ (u_long)ts.tv_sec, (u_long)ts.tv_nsec/NS_PER_US,
+ env->dbenv->thread_id_string(env->dbenv, pid, tid, buf), s);
+
+#ifdef STDC_HEADERS
+ va_start(ap, fmt);
+#else
+ va_start(ap);
+#endif
+ __db_msgadd_ap(env, &mb, fmt, ap);
+ va_end(ap);
+
+ DB_MSGBUF_FLUSH(env, &mb);
+}
+
+/*
+ * __rep_print_message --
+ *	Emit (through RPRINT, under DB_VERB_REP_MSGS) a one-line description
+ *	of a replication message: versions, generation, eid, message type
+ *	name, LSN and a list of the flags that are set.
+ *
+ *	rp is the control structure of the message; its rectype is in the
+ *	numbering of rp->rep_version and is converted to the current
+ *	numbering before being named.  str is a caller-supplied tag printed
+ *	after the environment home.  flags are the DB_REP_* send flags and
+ *	are only used to build the flag list.
+ *
+ * PUBLIC: void __rep_print_message
+ * PUBLIC: __P((ENV *, int, __rep_control_args *, char *, u_int32_t));
+ */
+void
+__rep_print_message(env, eid, rp, str, flags)
+	ENV *env;
+	int eid;
+	__rep_control_args *rp;
+	char *str;
+	u_int32_t flags;
+{
+	u_int32_t ctlflags, rectype;
+	char ftype[64], *type;
+
+	rectype = rp->rectype;
+	ctlflags = rp->flags;
+	/*
+	 * Translate an old-version message number to the current one.  The
+	 * version and type may not have been validated yet (see the asserts
+	 * at the end of this function), so only consult the conversion table
+	 * for values it actually covers; anything else is shown as NOTYPE.
+	 */
+	if (rp->rep_version != DB_REPVERSION) {
+		if (rp->rep_version < DB_REPVERSION &&
+		    rectype <= REP_MAX_MSG)
+			rectype =
+			    __rep_msg_from_old(rp->rep_version, rectype);
+		else
+			rectype = REP_INVALID;
+	}
+	switch (rectype) {
+	case REP_ALIVE:
+		type = "alive";
+		break;
+	case REP_ALIVE_REQ:
+		type = "alive_req";
+		break;
+	case REP_ALL_REQ:
+		type = "all_req";
+		break;
+	case REP_BULK_LOG:
+		type = "bulk_log";
+		break;
+	case REP_BULK_PAGE:
+		type = "bulk_page";
+		break;
+	case REP_DUPMASTER:
+		type = "dupmaster";
+		break;
+	case REP_FILE:
+		type = "file";
+		break;
+	case REP_FILE_FAIL:
+		type = "file_fail";
+		break;
+	case REP_FILE_REQ:
+		type = "file_req";
+		break;
+	case REP_LEASE_GRANT:
+		type = "lease_grant";
+		break;
+	case REP_LOG:
+		type = "log";
+		break;
+	case REP_LOG_MORE:
+		type = "log_more";
+		break;
+	case REP_LOG_REQ:
+		type = "log_req";
+		break;
+	case REP_MASTER_REQ:
+		type = "master_req";
+		break;
+	case REP_NEWCLIENT:
+		type = "newclient";
+		break;
+	case REP_NEWFILE:
+		type = "newfile";
+		break;
+	case REP_NEWMASTER:
+		type = "newmaster";
+		break;
+	case REP_NEWSITE:
+		type = "newsite";
+		break;
+	case REP_PAGE:
+		type = "page";
+		break;
+	case REP_PAGE_FAIL:
+		type = "page_fail";
+		break;
+	case REP_PAGE_MORE:
+		type = "page_more";
+		break;
+	case REP_PAGE_REQ:
+		type = "page_req";
+		break;
+	case REP_REREQUEST:
+		type = "rerequest";
+		break;
+	case REP_START_SYNC:
+		type = "start_sync";
+		break;
+	case REP_UPDATE:
+		type = "update";
+		break;
+	case REP_UPDATE_REQ:
+		type = "update_req";
+		break;
+	case REP_VERIFY:
+		type = "verify";
+		break;
+	case REP_VERIFY_FAIL:
+		type = "verify_fail";
+		break;
+	case REP_VERIFY_REQ:
+		type = "verify_req";
+		break;
+	case REP_VOTE1:
+		type = "vote1";
+		break;
+	case REP_VOTE2:
+		type = "vote2";
+		break;
+	default:
+		type = "NOTYPE";
+		break;
+	}
+
+	/*
+	 * !!!
+	 * If adding new flags to print out make sure the aggregate
+	 * length cannot overflow the buffer.  The number after each
+	 * strcat is the running worst-case length.
+	 */
+	ftype[0] = '\0';
+	if (LF_ISSET(DB_REP_ANYWHERE))
+		(void)strcat(ftype, " any");		/* 4 */
+	if (FLD_ISSET(ctlflags, REPCTL_FLUSH))
+		(void)strcat(ftype, " flush");		/* 10 */
+	/*
+	 * We expect most of the time the messages will indicate
+	 * group membership.  Only print if we're not already
+	 * part of a group.
+	 */
+	if (!FLD_ISSET(ctlflags, REPCTL_GROUP_ESTD))
+		(void)strcat(ftype, " nogroup");	/* 18 */
+	if (FLD_ISSET(ctlflags, REPCTL_LEASE))
+		(void)strcat(ftype, " lease");		/* 24 */
+	if (LF_ISSET(DB_REP_NOBUFFER))
+		(void)strcat(ftype, " nobuf");		/* 30 */
+	if (FLD_ISSET(ctlflags, REPCTL_PERM))
+		(void)strcat(ftype, " perm");		/* 35 */
+	if (LF_ISSET(DB_REP_REREQUEST))
+		(void)strcat(ftype, " rereq");		/* 41 */
+	if (FLD_ISSET(ctlflags, REPCTL_RESEND))
+		(void)strcat(ftype, " resend");		/* 48 */
+	if (FLD_ISSET(ctlflags, REPCTL_LOG_END))
+		(void)strcat(ftype, " logend");		/* 55 */
+	RPRINT(env, DB_VERB_REP_MSGS,
+	    (env,
+    "%s %s: msgv = %lu logv %lu gen = %lu eid %d, type %s, LSN [%lu][%lu] %s",
+	    env->db_home, str,
+	    (u_long)rp->rep_version, (u_long)rp->log_version, (u_long)rp->gen,
+	    eid, type, (u_long)rp->lsn.file, (u_long)rp->lsn.offset, ftype));
+	/*
+	 * Make sure the version is close, and not swapped
+	 * here.  Check for current version,  +/- a little bit.
+	 */
+	DB_ASSERT(env, rp->rep_version <= DB_REPVERSION+10);
+	DB_ASSERT(env, rp->log_version <= DB_LOGVERSION+10);
+}
+
+/*
+ * __rep_fire_event --
+ *	Deliver a replication-related event.  The replication manager is
+ *	offered the event first; only if it reports DB_EVENT_NOT_HANDLED is
+ *	the event passed on to the application through DB_EVENT.
+ *
+ * PUBLIC: void __rep_fire_event __P((ENV *, u_int32_t, void *));
+ */
+void
+__rep_fire_event(env, event, info)
+	ENV *env;
+	u_int32_t event;
+	void *info;
+{
+	int status;
+
+	/* repmgr may consume the event entirely or decline it. */
+	status = __repmgr_handle_event(env, event, info);
+	DB_ASSERT(env, status == 0 || status == DB_EVENT_NOT_HANDLED);
+	if (status != DB_EVENT_NOT_HANDLED)
+		return;
+
+	/* Declined (or not fully handled): hand it to the application. */
+	DB_EVENT(env, event, info);
+}
diff --git a/rep/rep_verify.c b/rep/rep_verify.c
new file mode 100644
index 0000000..d90b3aa
--- /dev/null
+++ b/rep/rep_verify.c
@@ -0,0 +1,766 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2004-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/log.h"
+#include "dbinc/txn.h"
+
+static int __rep_internal_init __P((ENV *, u_int32_t));
+
+/*
+ * __rep_verify --
+ * Handle a REP_VERIFY message.
+ *
+ * Compares the log record carried in the message (rec) with the local log
+ * record read through rp->lsn with DB_SET, or with the last local record
+ * (DB_LAST) when lp->verify_lsn is zero.
+ *
+ * On a mismatch, backs up with __rep_log_backup and sends REP_VERIFY_REQ
+ * for the LSN it yields (to rep->master_id when that is valid, otherwise
+ * to eid), or calls __rep_internal_init if the backup returns
+ * DB_NOTFOUND. On a match, calls __rep_verify_match with savetime, or
+ * an abbreviated __rep_internal_init when REP_F_NIMDBS_LOADED is not set.
+ *
+ * Returns 0 immediately if REP_F_RECOVER_VERIFY is not set. Otherwise
+ * returns the first error seen, or 0; an error from closing the log cursor
+ * is reported only when nothing failed before it.
+ *
+ * PUBLIC: int __rep_verify __P((ENV *, __rep_control_args *, DBT *,
+ * PUBLIC: int, time_t));
+ */
+int
+__rep_verify(env, rp, rec, eid, savetime)
+ ENV *env;
+ __rep_control_args *rp;
+ DBT *rec;
+ int eid;
+ time_t savetime;
+{
+ DBT mylog;
+ DB_LOG *dblp;
+ DB_LOGC *logc;
+ DB_LSN lsn, prev_ckp;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ __txn_ckp_args *ckp_args;
+ u_int32_t logflag, rectype;
+ int master, match, ret, t_ret;
+
+ ret = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ /* Do nothing if the REP_F_RECOVER_VERIFY flag is not set. */
+ if (!F_ISSET(rep, REP_F_RECOVER_VERIFY))
+ return (ret);
+
+#ifdef DIAGNOSTIC
+ /*
+ * We should not ever be in internal init with a lease granted.
+ */
+ if (IS_USING_LEASES(env)) {
+ REP_SYSTEM_LOCK(env);
+ DB_ASSERT(env, __rep_islease_granted(env) == 0);
+ REP_SYSTEM_UNLOCK(env);
+ }
+#endif
+
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ return (ret);
+ memset(&mylog, 0, sizeof(mylog));
+ /*
+ * If lp->verify_lsn is ZERO, read the last log record instead of
+ * the one at rp->lsn. prev_ckp is copied under the same mutex and
+ * used further down, in the matching case.
+ */
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ logflag = IS_ZERO_LSN(lp->verify_lsn) ? DB_LAST : DB_SET;
+ prev_ckp = lp->prev_ckp;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ if ((ret = __logc_get(logc, &rp->lsn, &mylog, logflag)) != 0)
+ goto out;
+ match = 0;
+ if (mylog.size == rec->size &&
+ memcmp(mylog.data, rec->data, rec->size) == 0)
+ match = 1;
+ /*
+ * If we don't have a match, back up to the previous
+ * identification record and try again.
+ */
+ if (match == 0) {
+ master = rep->master_id;
+ /*
+ * We will eventually roll back over this log record (unless we
+ * ultimately have to give up and do an internal init). So, if
+ * it was a checkpoint, make sure we don't end up without any
+ * checkpoints left in the entire log.
+ */
+ LOGCOPY_32(env, &rectype, mylog.data);
+ DB_ASSERT(env, ret == 0);
+ if (!lp->db_log_inmemory && rectype == DB___txn_ckp) {
+ if ((ret = __txn_ckp_read(env,
+ mylog.data, &ckp_args)) != 0)
+ goto out;
+ lsn = ckp_args->last_ckp;
+ __os_free(env, ckp_args);
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ lp->prev_ckp = lsn;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ if (IS_ZERO_LSN(lsn)) {
+ /*
+ * No previous checkpoints? The only way this
+ * is OK is if we have the entire log, all the
+ * way back to file #1.
+ */
+ if ((ret = __logc_get(logc,
+ &lsn, &mylog, DB_FIRST)) != 0)
+ goto out;
+ if (lsn.file != 1) {
+ ret = __rep_internal_init(env, 0);
+ goto out;
+ }
+
+ /* Restore position of log cursor. */
+ if ((ret = __logc_get(logc,
+ &rp->lsn, &mylog, DB_SET)) != 0)
+ goto out;
+ }
+ }
+ if ((ret = __rep_log_backup(env, rep, logc, &lsn)) == 0) {
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ lp->verify_lsn = lsn;
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ lp->wait_ts = rep->request_gap;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ if (master != DB_EID_INVALID)
+ eid = master;
+ (void)__rep_send_message(env, eid, REP_VERIFY_REQ,
+ &lsn, NULL, 0, DB_REP_ANYWHERE);
+ } else if (ret == DB_NOTFOUND) {
+ /*
+ * We've either run out of records because
+ * logs have been removed or we've rolled back
+ * all the way to the beginning.
+ */
+ ret = __rep_internal_init(env, 0);
+ }
+ } else {
+ /*
+ * We have a match, so we can probably do a simple sync, without
+ * needing internal init. But first, check for a couple of
+ * special cases.
+ */
+
+ if (!lp->db_log_inmemory && !IS_ZERO_LSN(prev_ckp)) {
+ /*
+ * We previously saw a checkpoint, which means we may
+ * now be about to roll back over it and lose it. Make
+ * sure we'll end up still having at least one other
+ * checkpoint. (Note that if the current record -- the
+ * one we've just matched -- happens to be a checkpoint,
+ * then it must be the same as the prev_ckp we're now
+ * about to try reading. Which means we wouldn't really
+ * have to read it. But checking for that special case
+ * doesn't seem worth the trouble.)
+ */
+ if ((ret = __logc_get(logc,
+ &prev_ckp, &mylog, DB_SET)) != 0) {
+ if (ret == DB_NOTFOUND)
+ ret = __rep_internal_init(env, 0);
+ goto out;
+ }
+ /*
+ * We succeeded reading for the prev_ckp, so it's safe
+ * to fall through to the verify_match.
+ */
+ }
+ /*
+ * Mixed version internal init doesn't work with 4.4, so we
+ * can't load NIMDBs from a very old-version master. So, fib to
+ * ourselves that they're already loaded, so that we don't try.
+ */
+ if (rep->version == DB_REPVERSION_44)
+ F_SET(rep, REP_F_NIMDBS_LOADED);
+ if (F_ISSET(rep, REP_F_NIMDBS_LOADED))
+ ret = __rep_verify_match(env, &rp->lsn, savetime);
+ else {
+ /*
+ * Even though we found a match, we haven't yet loaded
+ * any NIMDBs, so we have to do an abbreviated internal
+ * init. We leave lp->verify_lsn set to the matching
+ * sync point, in case upon eventual examination of the
+ * UPDATE message it turns out there are no NIMDBs
+ * (since we can then skip back to a verify_match
+ * outcome).
+ */
+ ret = __rep_internal_init(env, REP_F_ABBREVIATED);
+ }
+ }
+
+out: if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __rep_internal_init --
+ *	Switch this site from the VERIFY recovery state to the UPDATE state
+ *	and ask the master for an update.  abbrev is zero for a full
+ *	internal init and non-zero for the abbreviated form.
+ *
+ *	Returns 0, or DB_REP_JOIN_FAILURE when a full internal init is needed
+ *	but REP_C_NOAUTOINIT is configured.
+ */
+static int
+__rep_internal_init(env, abbrev)
+	ENV *env;
+	u_int32_t abbrev;
+{
+	REP *rep;
+	int full_init, master, ret;
+
+	full_init = (abbrev == 0);
+	rep = env->rep_handle->region;
+
+	REP_SYSTEM_LOCK(env);
+#ifdef HAVE_STATISTICS
+	/* Only a full internal init counts as being outdated. */
+	if (full_init)
+		rep->stat.st_outdated++;
+#endif
+
+	ret = 0;
+	if (full_init && FLD_ISSET(rep->config, REP_C_NOAUTOINIT))
+		ret = DB_REP_JOIN_FAILURE;
+	else {
+		/*
+		 * What we call "abbreviated internal init" is really just
+		 * NIMDB materialization, and we always do that even if
+		 * NOAUTOINIT has been configured; that is why the test
+		 * above applies to the full form only.
+		 */
+		F_CLR(rep, REP_F_RECOVER_VERIFY);
+		F_SET(rep, REP_F_RECOVER_UPDATE);
+		if (full_init)
+			F_CLR(rep, REP_F_ABBREVIATED);
+		else {
+			RPRINT(env, DB_VERB_REP_SYNC, (env,
+			 "send UPDATE_REQ, merely to check for NIMDB refresh"));
+			F_SET(rep, REP_F_ABBREVIATED);
+		}
+		ZERO_LSN(rep->first_lsn);
+		ZERO_LSN(rep->ckp_lsn);
+	}
+	/* Sample the master id while the region is still locked. */
+	master = rep->master_id;
+	REP_SYSTEM_UNLOCK(env);
+
+	if (ret != 0)
+		return (ret);
+	if (master != DB_EID_INVALID)
+		(void)__rep_send_message(env,
+		    master, REP_UPDATE_REQ, NULL, NULL, 0, 0);
+	return (0);
+}
+
+/*
+ * __rep_verify_fail --
+ * Handle a REP_VERIFY_FAIL message.
+ *
+ * rp->lsn is the failing LSN. Depending on the recovery flags in the
+ * rep region, this function first cleans up an internal init that is
+ * already in progress, and then either starts a new internal init (by
+ * moving to REP_F_RECOVER_UPDATE and sending REP_UPDATE_REQ to the
+ * master) or ignores the message.
+ *
+ * Returns 0, DB_REP_JOIN_FAILURE when an internal init is called for but
+ * REP_C_NOAUTOINIT is configured, or an error from __rep_lockout_msg or
+ * __rep_init_cleanup.
+ *
+ * PUBLIC: int __rep_verify_fail __P((ENV *, __rep_control_args *));
+ */
+int
+__rep_verify_fail(env, rp)
+ ENV *env;
+ __rep_control_args *rp;
+{
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ LOG *lp;
+ REP *rep;
+ int clnt_lock_held, lockout, master, ret;
+
+ clnt_lock_held = lockout = 0;
+ ret = 0;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ dblp = env->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ /*
+ * If any recovery flags are set, but not LOG or VERIFY,
+ * then we ignore this message. We are already
+ * in the middle of updating.
+ */
+ if (F_ISSET(rep, REP_F_RECOVER_MASK) &&
+ !F_ISSET(rep, REP_F_RECOVER_LOG | REP_F_RECOVER_VERIFY))
+ return (0);
+ REP_SYSTEM_LOCK(env);
+ /*
+ * We should not ever be in internal init with a lease granted.
+ */
+ DB_ASSERT(env,
+ !IS_USING_LEASES(env) || __rep_islease_granted(env) == 0);
+
+ /*
+ * Clean up old internal init in progress if:
+ * REP_C_NOAUTOINIT is not configured and
+ * we are recovering LOG and this LSN is in the range we need.
+ *
+ * NOTE(review): the condition below does not test REP_C_NOAUTOINIT;
+ * that setting is only checked further down, before a new internal
+ * init is started. Confirm which of the two is intended.
+ */
+ if (F_ISSET(rep, REP_F_RECOVER_LOG) &&
+ LOG_COMPARE(&rep->first_lsn, &rp->lsn) <= 0 &&
+ LOG_COMPARE(&rep->last_lsn, &rp->lsn) >= 0) {
+ /*
+ * Already locking out messages, give up.
+ */
+ if (F_ISSET(rep, REP_F_READY_MSG))
+ goto unlock;
+
+ /*
+ * Lock out other messages to prevent race conditions.
+ */
+ if ((ret = __rep_lockout_msg(env, rep, 1)) != 0)
+ goto unlock;
+ lockout = 1;
+
+ /*
+ * Clean up internal init if one was in progress.
+ */
+ if (F_ISSET(rep, REP_F_READY_API | REP_F_READY_OP)) {
+ RPRINT(env, DB_VERB_REP_SYNC, (env,
+ "VERIFY_FAIL is cleaning up old internal init for missing log"));
+ if ((ret =
+ __rep_init_cleanup(env, rep, DB_FORCE)) != 0) {
+ RPRINT(env, DB_VERB_REP_SYNC, (env,
+ "VERIFY_FAIL error cleaning up internal init for missing log: %d", ret));
+ goto msglck;
+ }
+ F_CLR(rep, REP_F_RECOVER_MASK);
+ }
+ F_CLR(rep, REP_F_READY_MSG);
+ lockout = 0;
+ }
+
+ REP_SYSTEM_UNLOCK(env);
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ clnt_lock_held = 1;
+ REP_SYSTEM_LOCK(env);
+ /*
+ * The system lock was dropped and retaken just above so that
+ * mtx_clientdb is acquired before it; both are held from here on.
+ *
+ * Commence an internal init if:
+ * We are in VERIFY state and the failing LSN is the one we
+ * were verifying or
+ * we're recovering LOG and this LSN is in the range we need or
+ * we are in normal state (no recovery flags set) and
+ * the failing LSN is the one we're ready for.
+ *
+ * We don't want an old or delayed VERIFY_FAIL message to throw us
+ * into internal initialization when we shouldn't be.
+ */
+ if (((F_ISSET(rep, REP_F_RECOVER_VERIFY)) &&
+ LOG_COMPARE(&rp->lsn, &lp->verify_lsn) == 0) ||
+ (F_ISSET(rep, REP_F_RECOVER_LOG) &&
+ LOG_COMPARE(&rep->first_lsn, &rp->lsn) <= 0 &&
+ LOG_COMPARE(&rep->last_lsn, &rp->lsn) >= 0) ||
+ (F_ISSET(rep, REP_F_RECOVER_MASK) == 0 &&
+ LOG_COMPARE(&rp->lsn, &lp->ready_lsn) >= 0)) {
+ /*
+ * Update stats.
+ */
+ STAT(rep->stat.st_outdated++);
+
+ /*
+ * If REP_C_NOAUTOINIT is configured, return
+ * DB_REP_JOIN_FAILURE instead of doing internal init.
+ */
+ if (FLD_ISSET(rep->config, REP_C_NOAUTOINIT)) {
+ ret = DB_REP_JOIN_FAILURE;
+ goto unlock;
+ }
+
+ /*
+ * Do the internal init. Both locks are released before the
+ * REP_UPDATE_REQ is sent, and the request is skipped when
+ * there is no valid master id.
+ */
+ F_CLR(rep, REP_F_RECOVER_VERIFY);
+ F_SET(rep, REP_F_RECOVER_UPDATE);
+ ZERO_LSN(rep->first_lsn);
+ ZERO_LSN(rep->ckp_lsn);
+ lp->wait_ts = rep->request_gap;
+ master = rep->master_id;
+ REP_SYSTEM_UNLOCK(env);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ if (master != DB_EID_INVALID)
+ (void)__rep_send_message(env,
+ master, REP_UPDATE_REQ, NULL, NULL, 0, 0);
+ } else {
+ /*
+ * Otherwise ignore this message.
+ *
+ * The msglck and unlock labels below are also the exit path
+ * for the error and give-up cases earlier in the function:
+ * they undo the message lockout if it is still in effect and
+ * release whichever locks are held.
+ */
+msglck: if (lockout)
+ F_CLR(rep, REP_F_READY_MSG);
+unlock: REP_SYSTEM_UNLOCK(env);
+ if (clnt_lock_held)
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ }
+ return (ret);
+}
+
+/*
+ * __rep_verify_req --
+ *	Handle a REP_VERIFY_REQ message.
+ *
+ *	Looks up the log record at rp->lsn and answers the requester (eid)
+ *	with REP_VERIFY carrying that record, with REP_VERIFY carrying no
+ *	data if the record could not be read, or with REP_VERIFY_FAIL if the
+ *	record was not found and its log file is reported as outdated.
+ *
+ *	Returns DB_NOTFOUND, without replying, when this site is a client and
+ *	the record was not found; otherwise the result of closing the log
+ *	cursor.
+ *
+ * PUBLIC: int __rep_verify_req __P((ENV *, __rep_control_args *, int));
+ */
+int
+__rep_verify_req(env, rp, eid)
+	ENV *env;
+	__rep_control_args *rp;
+	int eid;
+{
+	DBT rec, *payload;
+	DB_LOGC *logc;
+	REP *rep;
+	u_int32_t reply;
+	int old, ret;
+
+	rep = env->rep_handle->region;
+
+	if ((ret = __log_cursor(env, &logc)) != 0)
+		return (ret);
+	memset(&rec, 0, sizeof(rec));
+	F_SET(logc, DB_LOG_SILENT_ERR);
+
+	/* The normal answer: REP_VERIFY with the record we found. */
+	reply = REP_VERIFY;
+	payload = &rec;
+
+	/*
+	 * If the LSN was invalid, then we might get a DB_NOTFOUND,
+	 * we might get an EIO, we could get anything.  Whatever the
+	 * failure, there is no record to send back.
+	 */
+	if ((ret = __logc_get(logc, &rp->lsn, &rec, DB_SET)) != 0) {
+		payload = NULL;
+		if (ret == DB_NOTFOUND) {
+			/*
+			 * If we're a client servicing this request,
+			 * return the NOTFOUND so the caller can rerequest
+			 * from a better source.
+			 */
+			if (F_ISSET(rep, REP_F_CLIENT)) {
+				(void)__logc_close(logc);
+				return (DB_NOTFOUND);
+			}
+			/*
+			 * There is a chance that the LSN comes before the
+			 * first file present, in which case we need to
+			 * return a fail so that the client can perform an
+			 * internal init or return a REP_JOIN_FAILURE.
+			 */
+			if (__log_is_outdated(env,
+			    rp->lsn.file, &old) == 0 && old != 0)
+				reply = REP_VERIFY_FAIL;
+		}
+	}
+
+	(void)__rep_send_message(env, eid, reply, &rp->lsn, payload, 0, 0);
+	return (__logc_close(logc));
+}
+
+/*
+ * __rep_dorecovery --
+ * Bring the local log back to lsnp, either by running recovery
+ * (__db_apprec) or, when that can be skipped, by truncating the log
+ * (__log_vtruncate). trunclsnp is handed to whichever of the two is
+ * called.
+ *
+ * When REP_F_RECOVER_LOG is set, recovery always runs and is told to
+ * update. Otherwise the log is read backward with DB_PREV over the
+ * records whose LSN is greater than lsnp: recovery runs if a txn_regop,
+ * txn_ckp or dbreg_register record is seen, and update is set if a
+ * txn_regop with an opcode other than TXN_ABORT is seen.
+ *
+ * On success DBREP_OPENFILES is set on the DB_REP handle. Returns 0 or
+ * the first error; a log cursor close error is reported only when nothing
+ * failed before it.
+ *
+ * PUBLIC: int __rep_dorecovery __P((ENV *, DB_LSN *, DB_LSN *));
+ */
+int
+__rep_dorecovery(env, lsnp, trunclsnp)
+ ENV *env;
+ DB_LSN *lsnp, *trunclsnp;
+{
+ DBT mylog;
+ DB_LOGC *logc;
+ DB_LSN last_ckp, lsn;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ REP *rep;
+ int ret, skip_rec, t_ret, update;
+ u_int32_t rectype, opcode;
+ __txn_regop_args *txnrec;
+ __txn_regop_42_args *txn42rec;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ ENV_GET_THREAD_INFO(env, ip);
+
+ /* Figure out if we are backing out any committed transactions. */
+ if ((ret = __log_cursor(env, &logc)) != 0)
+ return (ret);
+
+ memset(&mylog, 0, sizeof(mylog));
+ if (F_ISSET(rep, REP_F_RECOVER_LOG)) {
+ /*
+ * Internal init can never skip recovery.
+ * Internal init must always update the timestamp and
+ * force dead handles.
+ *
+ * With update already set, the scan loop below does not
+ * execute at all and ret keeps the 0 from __log_cursor.
+ */
+ skip_rec = 0;
+ update = 1;
+ } else {
+ skip_rec = 1;
+ update = 0;
+ }
+ while (update == 0 &&
+ (ret = __logc_get(logc, &lsn, &mylog, DB_PREV)) == 0 &&
+ LOG_COMPARE(&lsn, lsnp) > 0) {
+ LOGCOPY_32(env, &rectype, mylog.data);
+ /*
+ * Find out if we can skip recovery completely. If we
+ * are backing up over any record a client usually
+ * cares about, we must run recovery.
+ *
+ * Skipping sync-up recovery can be pretty scary!
+ * Here's why we can do it:
+ * If a master downgraded to client and is now running
+ * sync-up to a new master, that old master must have
+ * waited for any outstanding txns to resolve before
+ * becoming a client. Also we are in lockout so there
+ * can be no other operations right now.
+ *
+ * If the client wrote a commit record to the log, but
+ * was descheduled before processing the txn, and then
+ * a new master was found, we must've let the txn get
+ * processed because right now we are the only message
+ * thread allowed to be running.
+ */
+ DB_ASSERT(env, rep->op_cnt == 0);
+ DB_ASSERT(env, rep->msg_th == 1);
+ if (rectype == DB___txn_regop || rectype == DB___txn_ckp ||
+ rectype == DB___dbreg_register)
+ skip_rec = 0;
+ if (rectype == DB___txn_regop) {
+ if (rep->version >= DB_REPVERSION_44) {
+ if ((ret = __txn_regop_read(
+ env, mylog.data, &txnrec)) != 0)
+ goto err;
+ opcode = txnrec->opcode;
+ __os_free(env, txnrec);
+ } else {
+ if ((ret = __txn_regop_42_read(
+ env, mylog.data, &txn42rec)) != 0)
+ goto err;
+ opcode = txn42rec->opcode;
+ __os_free(env, txn42rec);
+ }
+ if (opcode != TXN_ABORT)
+ update = 1;
+ }
+ }
+ /*
+ * Handle if the logc_get fails. Any non-zero result, whatever
+ * its value, ends the function with that error.
+ */
+ if (ret != 0)
+ goto err;
+
+ /*
+ * If we successfully run recovery, we've opened all the necessary
+ * files. We are guaranteed to be single-threaded here, so no mutex
+ * is necessary.
+ *
+ * When recovery is skipped, the log is truncated to lsnp instead.
+ * The checkpoint LSN passed along comes from __log_get_stable_lsn,
+ * or is zero if that call returns DB_NOTFOUND.
+ */
+ if (skip_rec) {
+ if ((ret = __log_get_stable_lsn(env, &last_ckp)) != 0) {
+ if (ret != DB_NOTFOUND)
+ goto err;
+ ZERO_LSN(last_ckp);
+ }
+ RPRINT(env, DB_VERB_REP_SYNC, (env,
+ "Skip sync-up rec. Truncate log to [%lu][%lu], ckp [%lu][%lu]",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (u_long)last_ckp.file, (u_long)last_ckp.offset));
+ ret = __log_vtruncate(env, lsnp, &last_ckp, trunclsnp);
+ } else
+ ret = __db_apprec(env, ip, lsnp, trunclsnp, update, 0);
+
+ if (ret != 0)
+ goto err;
+ F_SET(db_rep, DBREP_OPENFILES);
+
+err: if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __rep_verify_match --
+ * We have just received a matching log record during verification.
+ * Figure out if we're going to need to run recovery. If so, wait until
+ * everything else has exited the library. If not, set up the world
+ * correctly and move forward.
+ *
+ * PUBLIC: int __rep_verify_match __P((ENV *, DB_LSN *, time_t));
+ */
+int
+__rep_verify_match(env, reclsnp, savetime)
+ ENV *env;
+ DB_LSN *reclsnp;
+ time_t savetime;
+{
+ DB_LOG *dblp;
+ DB_LSN trunclsn;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ LOG *lp;
+ REGENV *renv;
+ REGINFO *infop;
+ REP *rep;
+ int done, master, ret;
+ u_int32_t unused;
+ /*
+ * Outline of this function:
+ * 1. Return 0 if savetime no longer equals renv->rep_timestamp.
+ * 2. Under the region lock, return 0 if the lockout flags show the
+ * world changed; otherwise lock out message threads and the API.
+ * 3. Without the region lock, run __rep_dorecovery on reclsnp and
+ * remove the init file.
+ * 4. Under mtx_clientdb, reset the log region's wait/ready state and
+ * truncate the queued-record database (rep_db).
+ * 5. Clear the recovery flags and, if a master is known, send it a
+ * REP_ALL_REQ carrying reclsnp.
+ *
+ * reclsnp is handed to __rep_dorecovery, stored as lp->max_perm_lsn
+ * and sent with the REP_ALL_REQ. savetime is only compared against
+ * renv->rep_timestamp.
+ *
+ * Returns 0 on success and also when either race check is lost;
+ * otherwise the first non-zero value returned by one of the helpers.
+ * The result of __rep_send_message is deliberately ignored.
+ */
+ dblp = env->lg_handle;
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ lp = dblp->reginfo.primary;
+ ret = 0;
+ infop = env->reginfo;
+ renv = infop->primary;
+ ENV_GET_THREAD_INFO(env, ip);
+
+ /*
+ * Check if the savetime is different than our current time stamp.
+ * If it is, then we're racing with another thread trying to recover
+ * and we lost. We must give up.
+ *
+ * Giving up means returning 0 without modifying anything. If the
+ * time stamps match, lp->verify_lsn is zeroed before mtx_clientdb
+ * is released.
+ */
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ done = savetime != renv->rep_timestamp;
+ if (done) {
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ return (0);
+ }
+ ZERO_LSN(lp->verify_lsn);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+
+ /*
+ * Make sure the world hasn't changed while we tried to get
+ * the lock. If it hasn't then it's time for us to kick all
+ * operations out of DB and run recovery.
+ *
+ * "Changed" means: REP_F_READY_MSG is set, or REP_F_RECOVER_LOG is
+ * clear while REP_F_READY_API or REP_F_READY_OP is set.
+ */
+ REP_SYSTEM_LOCK(env);
+ if (F_ISSET(rep, REP_F_READY_MSG) ||
+ (!F_ISSET(rep, REP_F_RECOVER_LOG) &&
+ F_ISSET(rep, REP_F_READY_API | REP_F_READY_OP))) {
+ /*
+ * We lost. The world changed and we should do nothing.
+ *
+ * ret is still 0 at this point, so after the STAT() update
+ * the function unlocks the region and returns 0.
+ */
+ STAT(rep->stat.st_msgs_recover++);
+ goto errunlock;
+ }
+
+ /*
+ * Lockout all message threads but ourselves.
+ */
+ if ((ret = __rep_lockout_msg(env, rep, 1)) != 0)
+ goto errunlock;
+
+ /*
+ * Lockout the API and wait for operations to complete.
+ *
+ * NOTE(review): if this call fails we leave through errunlock
+ * without undoing whatever __rep_lockout_msg set just above
+ * (presumably REP_F_READY_MSG). Confirm that __rep_lockout_api or
+ * the caller restores it; the recovery-failure path below clears the
+ * READY flags explicitly.
+ */
+ if ((ret = __rep_lockout_api(env, rep)) != 0)
+ goto errunlock;
+
+ /*
+ * OK, everyone is out, we can now run recovery.
+ *
+ * The region lock is dropped for the duration of __rep_dorecovery
+ * and __rep_remove_init_file. If either fails, the region lock is
+ * re-taken and all three READY lockout flags are cleared before
+ * exiting through errunlock.
+ */
+ REP_SYSTEM_UNLOCK(env);
+
+ if ((ret = __rep_dorecovery(env, reclsnp, &trunclsn)) != 0 ||
+ (ret = __rep_remove_init_file(env)) != 0) {
+ REP_SYSTEM_LOCK(env);
+ F_CLR(rep, REP_F_READY_API | REP_F_READY_MSG | REP_F_READY_OP);
+ goto errunlock;
+ }
+
+ /*
+ * The log has been truncated (either directly by us or by __db_apprec)
+ * We want to make sure we're waiting for the LSN at the new end-of-log,
+ * not some later point.
+ *
+ * Under mtx_clientdb: ready_lsn becomes the LSN that
+ * __rep_dorecovery returned in trunclsn, the waiting/max-wait/verify
+ * LSNs and prev_ckp are zeroed, max_perm_lsn is set to *reclsnp,
+ * wait_ts is reset to rep->request_gap and rcvd_ts is refreshed via
+ * __os_gettime.
+ */
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ lp->ready_lsn = trunclsn;
+ ZERO_LSN(lp->waiting_lsn);
+ ZERO_LSN(lp->max_wait_lsn);
+ lp->max_perm_lsn = *reclsnp;
+ lp->wait_ts = rep->request_gap;
+ __os_gettime(env, &lp->rcvd_ts, 1);
+ ZERO_LSN(lp->verify_lsn);
+ ZERO_LSN(lp->prev_ckp);
+
+ /*
+ * Discard any log records we have queued; we're about to re-request
+ * them, and can't trust the ones in the queue. We need to set the
+ * DB_AM_RECOVER bit in this handle, so that the operation doesn't
+ * deadlock.
+ *
+ * rep_db is opened here via __rep_client_dbinit if it is not open
+ * yet. mtx_clientdb is released around the __db_truncate call and
+ * re-acquired before DB_AM_RECOVER is cleared again.
+ *
+ * NOTE(review): when __rep_client_dbinit fails we jump straight to
+ * out. mtx_clientdb is released, but the lockout flags taken earlier
+ * are not cleared on this path, unlike the recovery-failure path
+ * above. Verify that this is intentional.
+ */
+ if (db_rep->rep_db == NULL &&
+ (ret = __rep_client_dbinit(env, 0, REP_DB)) != 0) {
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ goto out;
+ }
+
+ F_SET(db_rep->rep_db, DB_AM_RECOVER);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ ret = __db_truncate(db_rep->rep_db, ip, NULL, &unused);
+ MUTEX_LOCK(env, rep->mtx_clientdb);
+ F_CLR(db_rep->rep_db, DB_AM_RECOVER);
+ /*
+ * Take the region lock while still holding mtx_clientdb. The
+ * st_log_queued counter and the REP_F_NOARCHIVE, REP_F_RECOVER_MASK
+ * and REP_F_READY_MSG flags are reset whether or not the truncate
+ * succeeded; only afterwards is its return value checked, and
+ * errunlock2 releases both locks.
+ */
+ REP_SYSTEM_LOCK(env);
+ rep->stat.st_log_queued = 0;
+ F_CLR(rep, REP_F_NOARCHIVE | REP_F_RECOVER_MASK | REP_F_READY_MSG);
+ if (ret != 0)
+ goto errunlock2;
+
+ /*
+ * If the master_id is invalid, this means that since
+ * the last record was sent, something happened to the
+ * master and we may not have a master to request
+ * things of.
+ *
+ * This is not an error; when we find a new master,
+ * we'll re-negotiate where the end of the log is and
+ * try to bring ourselves up to date again anyway.
+ *
+ * master_id is sampled under the region lock, which is then
+ * released. mtx_clientdb is still held here and is released
+ * separately in each branch below.
+ */
+ master = rep->master_id;
+ REP_SYSTEM_UNLOCK(env);
+ if (master == DB_EID_INVALID) {
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ ret = 0;
+ } else {
+ /*
+ * We're making an ALL_REQ. But now that we've
+ * cleared the flags, we're likely receiving new
+ * log records from the master, resulting in a gap
+ * immediately. So to avoid multiple data streams,
+ * set the wait_ts value high now to give the master
+ * a chance to start sending us these records before
+ * the gap code re-requests the same gap. Wait_recs
+ * will get reset once we start receiving these
+ * records.
+ *
+ * The send happens after mtx_clientdb is released and its
+ * return value is discarded, so ret stays 0 on this path.
+ */
+ lp->wait_ts = rep->max_gap;
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ (void)__rep_send_message(env,
+ master, REP_ALL_REQ, reclsnp, NULL, 0, DB_REP_ANYWHERE);
+ }
+ if (0) { /* Error exits only: entered by goto, never by fall-through. */
+errunlock2: MUTEX_UNLOCK(env, rep->mtx_clientdb);
+errunlock: REP_SYSTEM_UNLOCK(env);
+ }
+out: return (ret);
+}
+
+/*
+ * __rep_log_backup --
+ *
+ * Step the log cursor backward, one record at a time, until it lands on
+ * an identification record usable by the verify handshake. For
+ * replication version 4.4 and later that is a checkpoint or a commit
+ * record. For older versions no record type is accepted, so the walk
+ * only ends when __logc_get returns non-zero.
+ *
+ * Returns 0 when a matching record was found, otherwise the non-zero
+ * value returned by __logc_get.
+ *
+ * PUBLIC: int __rep_log_backup __P((ENV *, REP *, DB_LOGC *, DB_LSN *));
+ */
+int
+__rep_log_backup(env, rep, logc, lsn)
+	ENV *env;
+	REP *rep;
+	DB_LOGC *logc;
+	DB_LSN *lsn;
+{
+	DBT rec;
+	u_int32_t type;
+	int ret;
+
+	memset(&rec, 0, sizeof(rec));
+	for (;;) {
+		/* Move to the previous record; any failure ends the walk. */
+		if ((ret = __logc_get(logc, lsn, &rec, DB_PREV)) != 0)
+			break;
+
+		/* Pull the record type out of the front of the record data. */
+		LOGCOPY_32(env, &type, rec.data);
+
+		/*
+		 * Record contents changed between versions, so the match
+		 * criteria are version specific. Only 4.4 and beyond have
+		 * any: checkpoint and commit records.
+		 */
+		if (rep->version < DB_REPVERSION_44)
+			continue;
+		if (type == DB___txn_ckp || type == DB___txn_regop)
+			break;
+	}
+	return (ret);
+}