summaryrefslogtreecommitdiff
path: root/db/os_windows
diff options
context:
space:
mode:
Diffstat (limited to 'db/os_windows')
-rw-r--r--db/os_windows/os_abs.c31
-rw-r--r--db/os_windows/os_clock.c30
-rw-r--r--db/os_windows/os_config.c118
-rw-r--r--db/os_windows/os_dir.c104
-rw-r--r--db/os_windows/os_errno.c405
-rw-r--r--db/os_windows/os_fid.c147
-rw-r--r--db/os_windows/os_flock.c69
-rw-r--r--db/os_windows/os_fsync.c38
-rw-r--r--db/os_windows/os_getenv.c97
-rw-r--r--db/os_windows/os_handle.c119
-rw-r--r--db/os_windows/os_map.c309
-rw-r--r--db/os_windows/os_open.c186
-rw-r--r--db/os_windows/os_rename.c70
-rw-r--r--db/os_windows/os_rw.c186
-rw-r--r--db/os_windows/os_seek.c55
-rw-r--r--db/os_windows/os_sleep.c34
-rw-r--r--db/os_windows/os_spin.c38
-rw-r--r--db/os_windows/os_stat.c84
-rw-r--r--db/os_windows/os_truncate.c90
-rw-r--r--db/os_windows/os_unlink.c109
-rw-r--r--db/os_windows/os_yield.c27
21 files changed, 2346 insertions, 0 deletions
diff --git a/db/os_windows/os_abs.c b/db/os_windows/os_abs.c
new file mode 100644
index 000000000..e4ace7bfb
--- /dev/null
+++ b/db/os_windows/os_abs.c
@@ -0,0 +1,31 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997-2006
+ * Oracle Corporation. All rights reserved.
+ *
+ * $Id: os_abs.c,v 12.3 2006/08/24 14:46:21 bostic Exp $
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_abspath --
+ *	Return non-zero if a path is an absolute path, zero otherwise.
+ *
+ *	An optional drive specification ("C:") is skipped; what remains is
+ *	absolute if it starts with either '/' or '\'.
+ */
+int
+__os_abspath(path)
+	const char *path;
+{
+	/*
+	 * !!!
+	 * Check for drive specifications, e.g., "C:".  In addition, the path
+	 * separator used by the win32 DB (PATH_SEPARATOR) is \; look for both
+	 * / and \ since these are user-input paths.
+	 *
+	 * The character is converted to unsigned char before it is handed to
+	 * isalpha(): passing a negative value (any byte >= 0x80 when char is
+	 * signed) is undefined behavior.
+	 */
+	if (isalpha((unsigned char)path[0]) && path[1] == ':')
+		path += 2;
+	return (path[0] == '/' || path[0] == '\\');
+}
diff --git a/db/os_windows/os_clock.c b/db/os_windows/os_clock.c
new file mode 100644
index 000000000..2a0203237
--- /dev/null
+++ b/db/os_windows/os_clock.c
@@ -0,0 +1,30 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001-2006
+ * Oracle Corporation. All rights reserved.
+ *
+ * $Id: os_clock.c,v 12.6 2006/08/24 14:46:21 bostic Exp $
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_clock --
+ *	Report the current time of day, split into whole seconds and
+ *	microseconds.  Either output pointer may be NULL when the caller is
+ *	not interested in that part.  The microsecond value is derived from
+ *	the millisecond field returned by _ftime.
+ */
+void
+__os_clock(dbenv, secsp, usecsp)
+	DB_ENV *dbenv;
+	u_int32_t *secsp, *usecsp;	/* Seconds and microseconds. */
+{
+	struct _timeb tod;
+
+	_ftime(&tod);
+
+	if (secsp != NULL) {
+		*secsp = (u_int32_t)tod.time;
+	}
+	if (usecsp != NULL) {
+		*usecsp = tod.millitm * 1000;
+	}
+}
diff --git a/db/os_windows/os_config.c b/db/os_windows/os_config.c
new file mode 100644
index 000000000..51fc1b3ad
--- /dev/null
+++ b/db/os_windows/os_config.c
@@ -0,0 +1,118 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999-2006
+ * Oracle Corporation. All rights reserved.
+ *
+ * $Id: os_config.c,v 12.6 2006/08/24 14:46:21 bostic Exp $
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_is_winnt --
+ *	Return 1 if Windows/NT, otherwise 0.
+ *
+ *	The answer is taken from the high bit of GetVersion(): clear means
+ *	NT.  It is computed on the first call and remembered in a function
+ *	static so that GetVersion() is not called again.
+ *
+ * PUBLIC: int __os_is_winnt __P((void));
+ */
+int
+__os_is_winnt()
+{
+	static int cached_is_nt = -1;		/* -1: not yet determined. */
+
+	if (cached_is_nt == -1)
+		cached_is_nt = (GetVersion() & 0x80000000) == 0 ? 1 : 0;
+
+	return (cached_is_nt);
+}
+
+/*
+ * __os_fs_notzero --
+ *	Return 1 if allocated filesystem blocks are not zeroed.
+ *
+ *	Returns 0 on Windows/NT, and 1 on non-NT systems and on the On-Time
+ *	RTTarget-32 environment.  The result is cached after the first call.
+ */
+int
+__os_fs_notzero()
+{
+	static int __os_notzero = -1;
+	OSVERSIONINFO osvi;
+
+	/*
+	 * Windows/NT zero-fills pages that were never explicitly written to
+	 * the file.  Note however that this is *NOT* documented.  In fact, the
+	 * Win32 documentation makes it clear that there are no guarantees that
+	 * uninitialized bytes will be zeroed:
+	 *
+	 *   If the file is extended, the contents of the file between the old
+	 *   EOF position and the new position are not defined.
+	 *
+	 * Experiments confirm that NT/2K/XP all zero fill for both NTFS and
+	 * FAT32.  Cygwin also relies on this behavior.  This is the relevant
+	 * comment from Cygwin:
+	 *
+	 *    Oops, this is the bug case - Win95 uses whatever is on the disk
+	 *    instead of some known (safe) value, so we must seek back and fill
+	 *    in the gap with zeros. - DJ
+	 *    Note: this bug doesn't happen on NT4, even though the
+	 *    documentation for WriteFile() says that it *may* happen on any OS.
+	 *
+	 * We're making a bet, here, but we made it a long time ago and haven't
+	 * yet seen any evidence that it was wrong.
+	 *
+	 * Windows 95/98 and On-Time give random garbage, and that breaks
+	 * Berkeley DB.
+	 *
+	 * The value of __os_notzero is computed only once, and cached to
+	 * avoid the overhead of repeated calls to GetVersionEx().
+	 */
+	if (__os_notzero == -1) {
+		if (__os_is_winnt()) {
+			/*
+			 * Start with an empty service-pack string: should
+			 * GetVersionEx fail without filling in the structure,
+			 * the comparison below then reads a valid empty
+			 * string instead of indeterminate stack memory, and
+			 * the system is treated as plain Windows/NT.
+			 */
+			osvi.szCSDVersion[0] = _T('\0');
+			osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
+			(void)GetVersionEx(&osvi);
+			if (_tcscmp(osvi.szCSDVersion, _T("RTTarget-32")) == 0)
+				__os_notzero = 1;	/* On-Time */
+			else
+				__os_notzero = 0;	/* Windows/NT */
+		} else
+			__os_notzero = 1;		/* Not Windows/NT */
+	}
+	return (__os_notzero);
+}
+
+/*
+ * __os_support_direct_io --
+ * Check to see if we support direct I/O.
+ *
+ * This Windows implementation unconditionally returns 1.
+ */
+int
+__os_support_direct_io()
+{
+ return (1);
+}
+
+/*
+ * __os_support_db_register --
+ * Return 1 if the system supports DB_REGISTER.
+ *
+ * The answer is whatever __os_is_winnt() reports: 1 on Windows/NT,
+ * 0 otherwise.
+ */
+int
+__os_support_db_register()
+{
+ return (__os_is_winnt());
+}
+
+/*
+ * __os_support_replication --
+ * Return 1 if the system supports replication.
+ *
+ * The answer is whatever __os_is_winnt() reports: 1 on Windows/NT,
+ * 0 otherwise.
+ */
+int
+__os_support_replication()
+{
+ return (__os_is_winnt());
+}
diff --git a/db/os_windows/os_dir.c b/db/os_windows/os_dir.c
new file mode 100644
index 000000000..4e47fa771
--- /dev/null
+++ b/db/os_windows/os_dir.c
@@ -0,0 +1,104 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997-2006
+ * Oracle Corporation. All rights reserved.
+ *
+ * $Id: os_dir.c,v 12.8 2006/08/24 14:46:21 bostic Exp $
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_dirlist --
+ *	Return a list of the files in a directory.
+ *
+ *	Entries that are themselves directories are skipped.  On success, 0 is
+ *	returned, *namesp is set to an allocated array of allocated file names
+ *	(NULL if no file was found) and *cntp to the number of names; the
+ *	list is released with __os_dirfree.  On failure a non-zero error is
+ *	returned and neither *namesp nor *cntp is modified.
+ */
+int
+__os_dirlist(dbenv, dir, namesp, cntp)
+	DB_ENV *dbenv;
+	const char *dir;
+	char ***namesp;
+	int *cntp;
+{
+	HANDLE dirhandle;
+	WIN32_FIND_DATA fdata;
+	int arraysz, cnt, ret;
+	char **names, *onename;
+	_TCHAR tfilespec[DB_MAXPATHLEN + 1];
+	_TCHAR *tdir;
+
+	TO_TSTRING(dbenv, dir, tdir, ret);
+	if (ret != 0)
+		return (ret);
+
+	/*
+	 * Build the "<dir><separator>*" search pattern.  _sntprintf does not
+	 * NUL-terminate the output when it truncates, so terminate the spare
+	 * last element of the buffer explicitly.
+	 */
+	(void)_sntprintf(tfilespec, DB_MAXPATHLEN,
+	    _T("%s%hc*"), tdir, PATH_SEPARATOR[0]);
+	tfilespec[DB_MAXPATHLEN] = _T('\0');
+
+	/*
+	 * The converted directory name is only needed to build the pattern.
+	 * Release it here, before any system call whose error we report, so
+	 * that it is not leaked when FindFirstFile fails.
+	 */
+	FREE_STRING(dbenv, tdir);
+
+	if ((dirhandle =
+	    FindFirstFile(tfilespec, &fdata)) == INVALID_HANDLE_VALUE)
+		return (__os_posix_err(__os_get_syserr()));
+
+	names = NULL;
+	arraysz = cnt = ret = 0;
+	for (;;) {
+		if ((fdata.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) == 0) {
+			/* Grow the name array 100 slots at a time. */
+			if (cnt >= arraysz) {
+				arraysz += 100;
+				if ((ret = __os_realloc(dbenv,
+				    arraysz * sizeof(names[0]), &names)) != 0)
+					goto err;
+			}
+			/*
+			 * FROM_TSTRING doesn't necessarily allocate new
+			 * memory, so we must do that explicitly.
+			 * Unfortunately, when compiled with UNICODE, we'll
+			 * copy twice.
+			 */
+			FROM_TSTRING(dbenv, fdata.cFileName, onename, ret);
+			if (ret != 0)
+				goto err;
+			ret = __os_strdup(dbenv, onename, &names[cnt]);
+			FREE_STRING(dbenv, onename);
+			if (ret != 0)
+				goto err;
+			cnt++;
+		}
+		if (!FindNextFile(dirhandle, &fdata)) {
+			/* Running out of entries is the normal loop exit. */
+			if (GetLastError() == ERROR_NO_MORE_FILES)
+				break;
+			else {
+				ret = __os_posix_err(__os_get_syserr());
+				goto err;
+			}
+		}
+	}
+
+	/* A FindClose failure is only reported if nothing failed earlier. */
+err:	if (!FindClose(dirhandle) && ret == 0)
+		ret = __os_posix_err(__os_get_syserr());
+
+	if (ret == 0) {
+		*namesp = names;
+		*cntp = cnt;
+	} else if (names != NULL)
+		__os_dirfree(dbenv, names, cnt);
+
+	return (ret);
+}
+
+/*
+ * __os_dirfree --
+ *	Release a name list: every one of the cnt names, last to first, and
+ *	then the array that held them.
+ */
+void
+__os_dirfree(dbenv, names, cnt)
+	DB_ENV *dbenv;
+	char **names;
+	int cnt;
+{
+	int remaining;
+
+	for (remaining = cnt; remaining > 0; remaining--)
+		__os_free(dbenv, names[remaining - 1]);
+
+	__os_free(dbenv, names);
+}
diff --git a/db/os_windows/os_errno.c b/db/os_windows/os_errno.c
new file mode 100644
index 000000000..14bdab6c5
--- /dev/null
+++ b/db/os_windows/os_errno.c
@@ -0,0 +1,405 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999-2006
+ * Oracle Corporation. All rights reserved.
+ *
+ * $Id: os_errno.c,v 12.10 2006/09/19 14:14:13 mjc Exp $
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_get_errno_ret_zero --
+ * Return the last system error, including an error of zero.
+ *
+ * This is the current value of the C library errno, returned as is;
+ * errno is not modified.
+ */
+int
+__os_get_errno_ret_zero()
+{
+ /* This routine must be able to return the same value repeatedly. */
+ return (errno);
+}
+
+/*
+ * System calls have been seen to fail without setting errno.  To cope with
+ * that, this routine never reports "no error": when errno is zero it is
+ * first set to EAGAIN.  Consequently it may only be called once a failure is
+ * already known; its return value cannot be used to detect one.
+ *
+ * __os_get_errno --
+ *	Return the last ANSI C "errno" value or EAGAIN if the last error
+ *	is zero.
+ */
+int
+__os_get_errno()
+{
+	/*
+	 * Repeated calls must yield the same value, which is why the
+	 * substitute is stored back into errno rather than just returned.
+	 */
+	if (errno != 0)
+		return (errno);
+
+	__os_set_errno(EAGAIN);
+	return (errno);
+}
+
+/*
+ * __os_get_neterr --
+ *	Return the last Winsock error.  If the last error is zero,
+ *	ERROR_RETRY is recorded as the last Winsock error and returned
+ *	instead, so that the same value is seen on repeated calls.
+ *
+ * PUBLIC: int __os_get_neterr __P((void));
+ */
+int
+__os_get_neterr()
+{
+	int last;
+
+	if ((last = WSAGetLastError()) != 0)
+		return (last);
+
+	/* No error recorded: substitute ERROR_RETRY and make it stick. */
+	last = ERROR_RETRY;
+	WSASetLastError(last);
+	return (last);
+}
+
+/*
+ * __os_get_syserr --
+ *	Return the last Win32 error.  If the last error is zero, ERROR_RETRY
+ *	is recorded as the thread's last error and returned instead, so that
+ *	the same value is seen on repeated calls.
+ */
+int
+__os_get_syserr()
+{
+	int last;
+
+	if ((last = GetLastError()) != 0)
+		return (last);
+
+	/* No error recorded: substitute ERROR_RETRY and make it stick. */
+	last = ERROR_RETRY;
+	SetLastError(last);
+	return (last);
+}
+
+/*
+ * __os_set_errno --
+ *	Set the value of errno.
+ *
+ *	Non-negative values are stored unchanged.  Negative values are
+ *	replaced: DB_RUNRECOVERY becomes EFAULT, anything else EINVAL.
+ */
+void
+__os_set_errno(evalue)
+	int evalue;
+{
+	/*
+	 * This routine is called by the compatibility interfaces (DB 1.85,
+	 * dbm and hsearch).  Force values > 0, that is, not one of DB 2.X
+	 * and later's public error returns.  If something bad has happened,
+	 * default to EFAULT -- a nasty return.  Otherwise, default to EINVAL.
+	 * As the compatibility APIs aren't included on Windows, the Windows
+	 * version of this routine doesn't need this behavior.
+	 */
+	if (evalue >= 0)
+		errno = evalue;
+	else if (evalue == DB_RUNRECOVERY)
+		errno = EFAULT;
+	else
+		errno = EINVAL;
+}
+
+/*
+ * __os_strerror --
+ *	Return a string associated with the system error.
+ *
+ *	The message for the Win32 error code "error" is written to buf, which
+ *	holds len bytes, and buf is returned.  The result is always
+ *	NUL-terminated when len is at least 1; if the system has no message
+ *	for the code, "Unknown error" (truncated to fit) is stored instead.
+ *	With a NULL buf or a zero len nothing is written.
+ */
+char *
+__os_strerror(error, buf, len)
+	int error;
+	char *buf;
+	size_t len;
+{
+	static const char unknown[] = "Unknown error";
+	size_t i;
+
+	DB_ASSERT(NULL, error != 0);
+
+	/* Without room for even a terminator there is nothing to do. */
+	if (buf == NULL || len == 0)
+		return (buf);
+
+	/*
+	 * Explicitly call FormatMessageA, since we want to receive a char
+	 * string back, not a tchar string.
+	 *
+	 * FORMAT_MESSAGE_IGNORE_INSERTS is required because no argument array
+	 * is supplied: without it, a system message containing insert
+	 * sequences such as %1 would make the call fail or read through the
+	 * NULL argument pointer.
+	 */
+	if (FormatMessageA(
+	    FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+	    NULL, (DWORD)error, 0, buf, (DWORD)(len - 1), NULL) == 0) {
+		/*
+		 * A zero return means no message was produced and the buffer
+		 * content is not usable; fall back to a fixed string.
+		 */
+		for (i = 0; i < len - 1 && unknown[i] != '\0'; i++)
+			buf[i] = unknown[i];
+		buf[i] = '\0';
+	}
+	buf[len - 1] = '\0';
+
+	return (buf);
+}
+
+/*
+ * __os_posix_err --
+ * Convert a system error to a POSIX error.
+ *
+ * The argument is a Win32 or Winsock error code. Zero is returned
+ * unchanged. Codes listed in the first switch (file, memory and locking
+ * errors) are translated directly. Winsock codes are handled by the
+ * second switch: where the matching POSIX constant is not defined at
+ * compile time, that case breaks out of the switch and the function
+ * returns the default, EFAULT. Any code listed in neither switch also
+ * yields EFAULT. Two Winsock codes map to DB_OPNOTSUP rather than to
+ * an errno value.
+ */
+int
+__os_posix_err(error)
+ int error;
+{
+ /* Handle calls on successful returns. */
+ if (error == 0)
+ return (0);
+
+ /*
+ * Translate the Windows error codes we care about.
+ */
+ switch (error) {
+ case ERROR_FILE_NOT_FOUND:
+ case ERROR_INVALID_DRIVE:
+ case ERROR_PATH_NOT_FOUND:
+ return (ENOENT);
+
+ case ERROR_NO_MORE_FILES:
+ case ERROR_TOO_MANY_OPEN_FILES:
+ return (EMFILE);
+
+ case ERROR_ACCESS_DENIED:
+ return (EPERM);
+
+ case ERROR_INVALID_HANDLE:
+ return (EBADF);
+
+ case ERROR_NOT_ENOUGH_MEMORY:
+ return (ENOMEM);
+
+ case ERROR_DISK_FULL:
+ return (ENOSPC);
+
+ case ERROR_ARENA_TRASHED:
+ case ERROR_BAD_COMMAND:
+ case ERROR_BAD_ENVIRONMENT:
+ case ERROR_BAD_FORMAT:
+ case ERROR_GEN_FAILURE:
+ case ERROR_INVALID_ACCESS:
+ case ERROR_INVALID_BLOCK:
+ case ERROR_INVALID_DATA:
+ case ERROR_READ_FAULT:
+ case ERROR_WRITE_FAULT:
+ return (EFAULT);
+
+ case ERROR_ALREADY_EXISTS:
+ case ERROR_FILE_EXISTS:
+ return (EEXIST);
+
+ case ERROR_NOT_SAME_DEVICE:
+ return (EXDEV);
+
+ case ERROR_WRITE_PROTECT:
+ return (EACCES);
+
+ case ERROR_LOCK_FAILED:
+ case ERROR_LOCK_VIOLATION:
+ case ERROR_NOT_READY:
+ case ERROR_SHARING_VIOLATION:
+ return (EBUSY);
+
+ /*
+ * ERROR_RETRY is the value __os_get_syserr and __os_get_neterr
+ * substitute when no error was recorded.
+ */
+ case ERROR_RETRY:
+ return (EINTR);
+ }
+
+ /*
+ * Translate the Windows socket error codes.
+ *
+ * Each "#else break" leaves the switch for the EFAULT default below
+ * when the POSIX constant is unavailable.
+ */
+ switch (error) {
+ case WSAEADDRINUSE:
+#ifdef EADDRINUSE
+ return (EADDRINUSE);
+#else
+ break;
+#endif
+ case WSAEADDRNOTAVAIL:
+#ifdef EADDRNOTAVAIL
+ return (EADDRNOTAVAIL);
+#else
+ break;
+#endif
+ case WSAEAFNOSUPPORT:
+#ifdef EAFNOSUPPORT
+ return (EAFNOSUPPORT);
+#else
+ break;
+#endif
+ case WSAEALREADY:
+#ifdef EALREADY
+ return (EALREADY);
+#else
+ break;
+#endif
+ case WSAEBADF:
+ return (EBADF);
+ case WSAECONNABORTED:
+#ifdef ECONNABORTED
+ return (ECONNABORTED);
+#else
+ break;
+#endif
+ case WSAECONNREFUSED:
+#ifdef ECONNREFUSED
+ return (ECONNREFUSED);
+#else
+ break;
+#endif
+ case WSAECONNRESET:
+#ifdef ECONNRESET
+ return (ECONNRESET);
+#else
+ break;
+#endif
+ case WSAEDESTADDRREQ:
+#ifdef EDESTADDRREQ
+ return (EDESTADDRREQ);
+#else
+ break;
+#endif
+ case WSAEFAULT:
+ return (EFAULT);
+ case WSAEHOSTDOWN:
+#ifdef EHOSTDOWN
+ return (EHOSTDOWN);
+#else
+ break;
+#endif
+ case WSAEHOSTUNREACH:
+#ifdef EHOSTUNREACH
+ return (EHOSTUNREACH);
+#else
+ break;
+#endif
+ case WSAEINPROGRESS:
+#ifdef EINPROGRESS
+ return (EINPROGRESS);
+#else
+ break;
+#endif
+ case WSAEINTR:
+ return (EINTR);
+ case WSAEINVAL:
+ return (EINVAL);
+ case WSAEISCONN:
+#ifdef EISCONN
+ return (EISCONN);
+#else
+ break;
+#endif
+ case WSAELOOP:
+#ifdef ELOOP
+ return (ELOOP);
+#else
+ break;
+#endif
+ case WSAEMFILE:
+ return (EMFILE);
+ case WSAEMSGSIZE:
+#ifdef EMSGSIZE
+ return (EMSGSIZE);
+#else
+ break;
+#endif
+ case WSAENAMETOOLONG:
+ return (ENAMETOOLONG);
+ case WSAENETDOWN:
+#ifdef ENETDOWN
+ return (ENETDOWN);
+#else
+ break;
+#endif
+ case WSAENETRESET:
+#ifdef ENETRESET
+ return (ENETRESET);
+#else
+ break;
+#endif
+ case WSAENETUNREACH:
+#ifdef ENETUNREACH
+ return (ENETUNREACH);
+#else
+ break;
+#endif
+ case WSAENOBUFS:
+#ifdef ENOBUFS
+ return (ENOBUFS);
+#else
+ break;
+#endif
+ case WSAENOPROTOOPT:
+#ifdef ENOPROTOOPT
+ return (ENOPROTOOPT);
+#else
+ break;
+#endif
+ case WSAENOTCONN:
+#ifdef ENOTCONN
+ return (ENOTCONN);
+#else
+ break;
+#endif
+ case WSANOTINITIALISED:
+ return (EAGAIN);
+ case WSAENOTSOCK:
+#ifdef ENOTSOCK
+ return (ENOTSOCK);
+#else
+ break;
+#endif
+ case WSAEOPNOTSUPP:
+ return (DB_OPNOTSUP);
+ case WSAEPFNOSUPPORT:
+#ifdef EPFNOSUPPORT
+ return (EPFNOSUPPORT);
+#else
+ break;
+#endif
+ case WSAEPROTONOSUPPORT:
+#ifdef EPROTONOSUPPORT
+ return (EPROTONOSUPPORT);
+#else
+ break;
+#endif
+ case WSAEPROTOTYPE:
+#ifdef EPROTOTYPE
+ return (EPROTOTYPE);
+#else
+ break;
+#endif
+ case WSAESHUTDOWN:
+#ifdef ESHUTDOWN
+ return (ESHUTDOWN);
+#else
+ break;
+#endif
+ case WSAESOCKTNOSUPPORT:
+#ifdef ESOCKTNOSUPPORT
+ return (ESOCKTNOSUPPORT);
+#else
+ break;
+#endif
+ case WSAETIMEDOUT:
+#ifdef ETIMEDOUT
+ return (ETIMEDOUT);
+#else
+ break;
+#endif
+ case WSAETOOMANYREFS:
+#ifdef ETOOMANYREFS
+ return (ETOOMANYREFS);
+#else
+ break;
+#endif
+ /* Unlike the other cases, this one falls back to EAGAIN, not EFAULT. */
+ case WSAEWOULDBLOCK:
+#ifdef EWOULDBLOCK
+ return (EWOULDBLOCK);
+#else
+ return (EAGAIN);
+#endif
+ case WSAHOST_NOT_FOUND:
+#ifdef EHOSTUNREACH
+ return (EHOSTUNREACH);
+#else
+ break;
+#endif
+ case WSASYSNOTREADY:
+ return (EAGAIN);
+ case WSATRY_AGAIN:
+ return (EAGAIN);
+ case WSAVERNOTSUPPORTED:
+ return (DB_OPNOTSUP);
+ case WSAEACCES:
+ return (EACCES);
+ }
+
+ /*
+ * EFAULT is the default if we don't have a translation.
+ */
+ return (EFAULT);
+}
diff --git a/db/os_windows/os_fid.c b/db/os_windows/os_fid.c
new file mode 100644
index 000000000..9fa2a57c9
--- /dev/null
+++ b/db/os_windows/os_fid.c
@@ -0,0 +1,147 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2006
+ * Oracle Corporation. All rights reserved.
+ *
+ * $Id: os_fid.c,v 12.9 2006/08/24 14:46:21 bostic Exp $
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+#define SERIAL_INIT 0
+static u_int32_t fid_serial = SERIAL_INIT;
+
+/*
+ * __os_fileid --
+ *	Return a unique identifier for a file.
+ *
+ *	The DB_FILE_ID_LEN-byte buffer at fidp is cleared and then filled, in
+ *	host byte order, with the file's 64-bit index (low word first),
+ *	followed either by the volume serial number (unique_okay == 0), or by
+ *	two time-derived words and the per-process serial number
+ *	(unique_okay != 0).  Returns 0 on success, otherwise the error from
+ *	opening or querying the file.
+ */
+int
+__os_fileid(dbenv, fname, unique_okay, fidp)
+	DB_ENV *dbenv;
+	const char *fname;
+	int unique_okay;
+	u_int8_t *fidp;
+{
+	/*
+	 * The documentation for GetFileInformationByHandle() states that the
+	 * inode-type numbers are not constant between processes.  Actually,
+	 * they are, they're the NTFS MFT indexes.  So, this works on NTFS,
+	 * but perhaps not on other platforms, and perhaps not over a network.
+	 * Can't think of a better solution right now.
+	 */
+	BY_HANDLE_FILE_INFORMATION finfo;
+	DB_FH *fhp;
+	SYSTEMTIME now;
+	BOOL ok;
+	pid_t pid;
+	u_int32_t word;
+	int ret;
+
+	DB_ASSERT(dbenv, fname != NULL);
+
+	/* Start from an all-zero identifier. */
+	memset(fidp, 0, DB_FILE_ID_LEN);
+
+	/*
+	 * Initialize/increment the serial number we use to help avoid
+	 * fileid collisions.  Note that we don't bother with locking;
+	 * it's unpleasant to do from down in here, and if we race on
+	 * this no real harm will be done, since the finished fileid
+	 * has so many other components.
+	 *
+	 * We use the bottom 32-bits of the process ID, hoping they
+	 * are more random than the top 32-bits (should we be on a
+	 * machine with 64-bit process IDs).
+	 *
+	 * We increment by 100000 on each call as a simple way of
+	 * randomizing; simply incrementing seems potentially less useful
+	 * if pids are also simply incremented, since this is process-local
+	 * and we may be one of a set of processes starting up.  100000
+	 * pushes us out of pid space on most platforms, and has few
+	 * interesting properties in base 2.
+	 */
+	if (fid_serial != SERIAL_INIT)
+		fid_serial += 100000;
+	else {
+		__os_id(dbenv, &pid, NULL);
+		fid_serial = pid;
+	}
+
+	/*
+	 * We are handed a name, not a handle, so the file has to be opened
+	 * before it can be queried.  Failing to open it is fatal here.
+	 */
+	ret = __os_open(dbenv, fname, DB_OSO_RDONLY, _S_IREAD, &fhp);
+	if (ret != 0)
+		return (ret);
+
+	/*
+	 * Query the file, remembering the system error before the handle is
+	 * closed, and close the handle whether or not the query worked.
+	 */
+	ok = GetFileInformationByHandle(fhp->handle, &finfo);
+	if (ok == FALSE)
+		ret = __os_get_syserr();
+	(void)__os_closehandle(dbenv, fhp);
+	if (ok == FALSE)
+		return (__os_posix_err(ret));
+
+	/*
+	 * We want the three 32-bit words which tell us the volume ID and
+	 * the file ID.  We make a crude attempt to copy the bytes over to
+	 * the callers buffer.
+	 *
+	 * We don't worry about byte sexing or the actual variable sizes.
+	 *
+	 * When this routine is called from the DB access methods, it's only
+	 * called once -- whatever ID is generated when a database is created
+	 * is stored in the database file's metadata, and that is what is
+	 * saved in the mpool region's information to uniquely identify the
+	 * file.
+	 *
+	 * When called from the mpool layer this routine will be called each
+	 * time a new thread of control wants to share the file, which makes
+	 * things tougher.  As far as byte sexing goes, since the mpool region
+	 * lives on a single host, there's no issue of that -- the entire
+	 * region is byte sex dependent.  As far as variable sizes go, we make
+	 * the simplifying assumption that 32-bit and 64-bit processes will
+	 * get the same 32-bit values if we truncate any returned 64-bit value
+	 * to a 32-bit value.
+	 */
+	word = (u_int32_t)finfo.nFileIndexLow;
+	memcpy(fidp, &word, sizeof(u_int32_t));
+	fidp += sizeof(u_int32_t);
+
+	word = (u_int32_t)finfo.nFileIndexHigh;
+	memcpy(fidp, &word, sizeof(u_int32_t));
+	fidp += sizeof(u_int32_t);
+
+	if (!unique_okay) {
+		word = (u_int32_t)finfo.dwVolumeSerialNumber;
+		memcpy(fidp, &word, sizeof(u_int32_t));
+		fidp += sizeof(u_int32_t);
+		return (0);
+	}
+
+	/*
+	 * Use the system time to try to get a unique value
+	 * within this process.  A millisecond counter
+	 * overflows 32 bits in about 49 days.  So we use 8
+	 * bytes, and don't bother with the volume ID, which
+	 * is not very useful for our purposes.
+	 */
+	GetSystemTime(&now);
+
+	/* Months elapsed since January 1900. */
+	word = (now.wYear - 1900) * 12 + (now.wMonth - 1);
+	memcpy(fidp, &word, sizeof(u_int32_t));
+	fidp += sizeof(u_int32_t);
+
+	/* Milliseconds elapsed since the start of the current month. */
+	word = ((((now.wDay - 1) * 24 + now.wHour) * 60 +
+	    now.wMinute) * 60 + now.wSecond) * 1000 +
+	    now.wMilliseconds;
+	memcpy(fidp, &word, sizeof(u_int32_t));
+	fidp += sizeof(u_int32_t);
+
+	memcpy(fidp, &fid_serial, sizeof(u_int32_t));
+	fidp += sizeof(u_int32_t);
+
+	return (0);
+}
diff --git a/db/os_windows/os_flock.c b/db/os_windows/os_flock.c
new file mode 100644
index 000000000..c08193891
--- /dev/null
+++ b/db/os_windows/os_flock.c
@@ -0,0 +1,69 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997-2006
+ * Oracle Corporation. All rights reserved.
+ *
+ * $Id: os_flock.c,v 1.12 2006/08/24 14:46:21 bostic Exp $
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_fdlock --
+ *	Acquire/release a lock on a byte in a file.
+ *
+ *	offset	- file offset identifying the byte to lock or unlock
+ *	acquire	- non-zero to acquire the lock, zero to release it
+ *	nowait	- when acquiring, non-zero to fail immediately instead of
+ *		  waiting for a conflicting lock to be released
+ *
+ *	Returns 0 on success, otherwise the system error translated by
+ *	__os_posix_err.
+ */
+int
+__os_fdlock(dbenv, fhp, offset, acquire, nowait)
+	DB_ENV *dbenv;
+	DB_FH *fhp;
+	int acquire, nowait;
+	off_t offset;
+{
+	int ret;
+	u_int64_t mapped;
+	DWORD low, high;
+	OVERLAPPED over;
+
+	DB_ASSERT(dbenv,
+	    F_ISSET(fhp, DB_FH_OPENED) && fhp->handle != INVALID_HANDLE_VALUE);
+
+	/*
+	 * Windows file locking interferes with read/write operations, so we
+	 * map the ranges to an area past the end of the file.
+	 *
+	 * The mapped position is computed in an unsigned 64-bit variable
+	 * rather than stored back into the (signed, possibly 32-bit) off_t:
+	 * that avoids an implementation-defined conversion of a huge value
+	 * to a signed type and an invalid 32-bit shift of a narrow off_t.
+	 */
+	DB_ASSERT(dbenv, (u_int64_t)offset < (u_int64_t)INT64_MAX);
+	mapped = UINT64_MAX - (u_int64_t)offset;
+	low = (DWORD)(mapped & 0xffffffff);
+	high = (DWORD)(mapped >> 32);
+
+	if (acquire) {
+		if (nowait)
+			RETRY_CHK_EINTR_ONLY(
+			    !LockFile(fhp->handle, low, high, 1, 0), ret);
+		else if (__os_is_winnt()) {
+			memset(&over, 0, sizeof(over));
+			over.Offset = low;
+			over.OffsetHigh = high;
+			RETRY_CHK_EINTR_ONLY(
+			    !LockFileEx(fhp->handle, LOCKFILE_EXCLUSIVE_LOCK,
+			    0, 1, 0, &over),
+			    ret);
+		} else {
+			/* Windows 9x/ME doesn't support a blocking call. */
+			for (;;) {
+				RETRY_CHK_EINTR_ONLY(
+				    !LockFile(fhp->handle, low, high, 1, 0),
+				    ret);
+				/*
+				 * Keep polling while the byte is held by
+				 * someone else.  That condition is reported
+				 * as ERROR_LOCK_VIOLATION, which
+				 * __os_posix_err translates to EBUSY, so
+				 * testing for EAGAIN alone would never wait.
+				 */
+				if (ret != ERROR_LOCK_VIOLATION &&
+				    __os_posix_err(ret) != EAGAIN)
+					break;
+				__os_sleep(dbenv, 1, 0);
+			}
+		}
+	} else
+		RETRY_CHK_EINTR_ONLY(
+		    !UnlockFile(fhp->handle, low, high, 1, 0), ret);
+
+	return (__os_posix_err(ret));
+}
diff --git a/db/os_windows/os_fsync.c b/db/os_windows/os_fsync.c
new file mode 100644
index 000000000..050d68e55
--- /dev/null
+++ b/db/os_windows/os_fsync.c
@@ -0,0 +1,38 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997-2006
+ * Oracle Corporation. All rights reserved.
+ *
+ * $Id: os_fsync.c,v 12.7 2006/08/24 14:46:21 bostic Exp $
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_fsync --
+ *	Flush a file's buffered data to disk with FlushFileBuffers.
+ *
+ *	Returns 0 on success, and also when the handle is flagged
+ *	DB_FH_NOSYNC, in which case nothing is flushed.  On failure the
+ *	system error is reported through __db_syserr and returned translated
+ *	by __os_posix_err.
+ */
+int
+__os_fsync(dbenv, fhp)
+	DB_ENV *dbenv;
+	DB_FH *fhp;
+{
+	int ret;
+
+	ret = 0;
+
+	/* Handles marked as never needing a sync are left alone. */
+	if (!F_ISSET(fhp, DB_FH_NOSYNC)) {
+		RETRY_CHK((!FlushFileBuffers(fhp->handle)), ret);
+		if (ret != 0) {
+			__db_syserr(dbenv, ret, "FlushFileBuffers");
+			ret = __os_posix_err(ret);
+		}
+	}
+
+	return (ret);
+}
diff --git a/db/os_windows/os_getenv.c b/db/os_windows/os_getenv.c
new file mode 100644
index 000000000..a42dbd677
--- /dev/null
+++ b/db/os_windows/os_getenv.c
@@ -0,0 +1,97 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997-2006
+ * Oracle Corporation. All rights reserved.
+ *
+ * $Id: os_getenv.c,v 1.4 2006/08/24 14:46:21 bostic Exp $
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_getenv --
+ *	Retrieve an environment variable.
+ *
+ *	name	- name of the variable
+ *	bpp	- on entry, *bpp points to the caller's buffer
+ *	buflen	- size of that buffer in bytes
+ *
+ *	If there's a value and the buffer is large enough:
+ *		the value is copied to *bpp, 0 is returned
+ *	If there's a value and the buffer is too short:
+ *		*bpp is set to NULL, an error is logged, EINVAL is returned
+ *	If there's no value:
+ *		*bpp is set to NULL, 0 is returned
+ *	Any other failure returns a non-zero error.
+ */
+int
+__os_getenv(dbenv, name, bpp, buflen)
+	DB_ENV *dbenv;
+	const char *name;
+	char **bpp;
+	size_t buflen;
+{
+	_TCHAR *tname, tbuf[1024];
+	DWORD nchars, tbuf_chars;
+	int free_p, ret, syserr;
+	char *p;
+
+	free_p = 0;
+
+	/* Try the C runtime's view of the environment first. */
+	if ((p = getenv(name)) != NULL) {
+		if (strlen(p) < buflen) {
+			(void)strcpy(*bpp, p);
+			return (0);
+		}
+		goto small_buf;
+	}
+
+	TO_TSTRING(dbenv, name, tname, ret);
+	if (ret != 0)
+		return (ret);
+
+	/*
+	 * The declared size of the tbuf buffer limits the maximum environment
+	 * variable size in Berkeley DB on Windows.  If that's too small, or if
+	 * we need to get rid of large allocations on the BDB stack, we should
+	 * malloc the tbuf memory.
+	 *
+	 * GetEnvironmentVariable takes the buffer size in characters, not
+	 * bytes; passing sizeof(tbuf) would overstate it by a factor of two
+	 * in UNICODE builds and allow a write past the end of tbuf.
+	 */
+	tbuf_chars = (DWORD)(sizeof(tbuf) / sizeof(tbuf[0]));
+	nchars = GetEnvironmentVariable(tname, tbuf, tbuf_chars);
+
+	/*
+	 * Pick up the system error before releasing tname, so that the
+	 * release cannot disturb the thread's last-error value.
+	 */
+	syserr = nchars == 0 ? __os_get_syserr() : 0;
+	FREE_STRING(dbenv, tname);
+
+	/*
+	 * If GetEnvironmentVariable succeeds, the return value is the number
+	 * of characters stored in the buffer pointed to by lpBuffer, not
+	 * including the terminating null character.  If the buffer is not
+	 * large enough to hold the data, the return value is the buffer size,
+	 * in characters, required to hold the string and its terminating null
+	 * character.  If GetEnvironmentVariable fails, the return value is
+	 * zero.  If the specified environment variable was not found in the
+	 * environment block, GetLastError returns ERROR_ENVVAR_NOT_FOUND.
+	 */
+	if (nchars == 0) {
+		if (syserr == ERROR_ENVVAR_NOT_FOUND) {
+			*bpp = NULL;
+			return (0);
+		}
+		__db_syserr(dbenv, syserr, "GetEnvironmentVariable");
+		return (__os_posix_err(syserr));
+	}
+
+	/* The value did not fit in tbuf; p is still NULL at this point. */
+	if (nchars > tbuf_chars)
+		goto small_buf;
+
+	FROM_TSTRING(dbenv, tbuf, p, ret);
+	if (ret != 0)
+		return (ret);
+	free_p = 1;
+	if (strlen(p) < buflen) {
+		(void)strcpy(*bpp, p);
+		FREE_STRING(dbenv, p);
+		return (0);
+	}
+
+small_buf:
+	*bpp = NULL;
+	/*
+	 * The value is only printed when we actually have it: p is NULL when
+	 * the value was too long to be fetched, and a converted value must
+	 * not be released until after it has been printed.
+	 */
+	__db_errx(dbenv,
+	    "%s: buffer too small to hold environment variable %s",
+	    name, p == NULL ? "<unavailable>" : p);
+	if (free_p)
+		FREE_STRING(dbenv, p);
+	return (EINVAL);
+}
diff --git a/db/os_windows/os_handle.c b/db/os_windows/os_handle.c
new file mode 100644
index 000000000..c03a5ecd8
--- /dev/null
+++ b/db/os_windows/os_handle.c
@@ -0,0 +1,119 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998-2006
+ * Oracle Corporation. All rights reserved.
+ *
+ * $Id: os_handle.c,v 12.10 2006/09/05 15:02:31 mjc Exp $
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_openhandle --
+ *	Open a file, using POSIX 1003.1 open flags.
+ *
+ *	On success 0 is returned and *fhpp refers to a newly allocated file
+ *	handle whose fd field holds the descriptor returned by _open.  On
+ *	failure a non-zero error is returned, the handle is released and
+ *	*fhpp is set to NULL.
+ */
+int
+__os_openhandle(dbenv, name, flags, mode, fhpp)
+	DB_ENV *dbenv;
+	const char *name;
+	int flags, mode;
+	DB_FH **fhpp;
+{
+	DB_FH *fhp;
+	int ret, nrepeat, retries;
+
+	if ((ret = __os_calloc(dbenv, 1, sizeof(DB_FH), fhpp)) != 0)
+		return (ret);
+	fhp = *fhpp;
+
+	/*
+	 * __os_closehandle tells an unused Win32 handle from a used one by
+	 * comparing against INVALID_HANDLE_VALUE, which is not the all-zero
+	 * value __os_calloc leaves behind.  Mark both handles unused, so that
+	 * a file opened here is closed through its descriptor and no attempt
+	 * is made to CloseHandle() a zero handle.
+	 */
+	fhp->handle = fhp->trunc_handle = INVALID_HANDLE_VALUE;
+
+	retries = 0;
+	for (nrepeat = 1; nrepeat < 4; ++nrepeat) {
+		ret = 0;
+		fhp->fd = _open(name, flags, mode);
+
+		if (fhp->fd != -1) {
+			F_SET(fhp, DB_FH_OPENED);
+			break;
+		}
+
+		switch (ret = __os_posix_err(__os_get_syserr())) {
+		case EMFILE:
+		case ENFILE:
+		case ENOSPC:
+			/*
+			 * If it's a "temporary" error, we retry up to 3 times,
+			 * waiting up to 12 seconds.  While it's not a problem
+			 * if we can't open a database, an inability to open a
+			 * log file is cause for serious dismay.
+			 */
+			__os_sleep(dbenv, nrepeat * 2, 0);
+			break;
+		case EAGAIN:
+		case EBUSY:
+		case EINTR:
+			/*
+			 * If an EAGAIN, EBUSY or EINTR, retry immediately for
+			 * DB_RETRY times.
+			 */
+			if (++retries < DB_RETRY)
+				--nrepeat;
+			break;
+		default:
+			/*
+			 * Any other error is permanent: trying again cannot
+			 * help, so give up at once.
+			 */
+			goto done;
+		}
+	}
+
+done:	if (ret != 0) {
+		(void)__os_closehandle(dbenv, fhp);
+		*fhpp = NULL;
+	}
+
+	return (ret);
+}
+
+/*
+ * __os_closehandle --
+ *	Close a file and release its handle structure.
+ *
+ *	The structure is always freed.  If the file was opened, it is closed
+ *	(through its Win32 handle when there is one, otherwise through its
+ *	descriptor), any truncate handle is closed too, and the file is
+ *	unlinked if it is flagged DB_FH_UNLINK.  Returns 0, or the first close
+ *	error translated by __os_posix_err.
+ */
+int
+__os_closehandle(dbenv, fhp)
+	DB_ENV *dbenv;
+	DB_FH *fhp;
+{
+	int ret, trunc_ret;
+
+	ret = 0;
+
+	/* A handle that was never opened only needs to be freed. */
+	if (!F_ISSET(fhp, DB_FH_OPENED))
+		goto done;
+
+	if (fhp->handle == INVALID_HANDLE_VALUE)
+		RETRY_CHK((_close(fhp->fd)), ret);
+	else
+		RETRY_CHK((!CloseHandle(fhp->handle)), ret);
+
+	if (fhp->trunc_handle != INVALID_HANDLE_VALUE) {
+		RETRY_CHK((!CloseHandle(fhp->trunc_handle)), trunc_ret);
+		/* Only the first failure is kept. */
+		if (ret == 0 && trunc_ret != 0)
+			ret = trunc_ret;
+	}
+
+	if (ret != 0) {
+		__db_syserr(dbenv, ret, "CloseHandle");
+		ret = __os_posix_err(ret);
+	}
+
+	/* Unlink the file if we haven't already done so. */
+	if (F_ISSET(fhp, DB_FH_UNLINK)) {
+		(void)__os_unlink(dbenv, fhp->name);
+		__os_free(dbenv, fhp->name);
+	}
+
+done:	__os_free(dbenv, fhp);
+
+	return (ret);
+}
diff --git a/db/os_windows/os_map.c b/db/os_windows/os_map.c
new file mode 100644
index 000000000..e254ea429
--- /dev/null
+++ b/db/os_windows/os_map.c
@@ -0,0 +1,309 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2006
+ * Oracle Corporation. All rights reserved.
+ *
+ * $Id: os_map.c,v 12.8 2006/08/24 14:46:21 bostic Exp $
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+static int __os_map
+ __P((DB_ENV *, char *, REGINFO *, DB_FH *, size_t, int, int, int, void **));
+static int __os_unique_name __P((_TCHAR *, HANDLE, _TCHAR *, size_t));
+
+/*
+ * __os_r_sysattach --
+ *	Create/join a shared memory region.
+ *
+ *	Opens the region's backing file, maps it through __os_map and stores
+ *	the mapped address in infop->addr.  Returns 0 or an error code.
+ */
+int
+__os_r_sysattach(dbenv, infop, rp)
+	DB_ENV *dbenv;
+	REGINFO *infop;
+	REGION *rp;
+{
+	DB_FH *fhp;
+	u_int32_t oflags;
+	int is_system, ret;
+
+	/*
+	 * Try to open/create the file.  Our caller has already serialized
+	 * threads/processes that are simultaneously creating the region, so
+	 * no ordering is needed here.
+	 */
+	oflags = F_ISSET(infop, REGION_CREATE_OK) ? DB_OSO_CREATE : 0;
+	ret = __os_open(dbenv, infop->name, oflags, dbenv->db_mode, &fhp);
+	if (ret != 0) {
+		__db_err(dbenv, ret, "%s", infop->name);
+		return (ret);
+	}
+
+	/*
+	 * On Windows/9X, files opened by multiple processes do not share
+	 * data correctly, so DB_SYSTEM_MEM is implied for any application
+	 * that does not specify DB_PRIVATE.
+	 */
+	is_system = 0;
+	if (F_ISSET(dbenv, DB_ENV_SYSTEM_MEM))
+		is_system = 1;
+	else if (!F_ISSET(dbenv, DB_ENV_PRIVATE) && __os_is_winnt() == 0)
+		is_system = 1;
+
+	/*
+	 * Map the file in.  For an in-system-memory region, set a segment
+	 * ID (never used again) so the calling code writes the REGENV_REF
+	 * structure to the primary environment file.
+	 */
+	ret = __os_map(dbenv, infop->name, infop, fhp, rp->size,
+	    1, is_system, 0, &infop->addr);
+	if (ret == 0 && is_system)
+		rp->segid = 1;
+
+	/* The file handle is closed here regardless of the map result. */
+	(void)__os_closehandle(dbenv, fhp);
+
+	return (ret);
+}
+
+/*
+ * __os_r_sysdetach --
+ *	Detach from a shared memory region.
+ *
+ *	Closes the saved mapping handle (if any), unmaps the view and, when
+ *	destroy is set and DB_ENV_SYSTEM_MEM is not, removes the backing
+ *	file.  Returns the first error encountered, or 0.
+ */
+int
+__os_r_sysdetach(dbenv, infop, destroy)
+	DB_ENV *dbenv;
+	REGINFO *infop;
+	int destroy;
+{
+	int ret, unlink_ret;
+
+	/* Release the mapping handle saved when the region was mapped. */
+	if (infop->wnt_handle != NULL) {
+		(void)CloseHandle(infop->wnt_handle);
+		infop->wnt_handle = NULL;
+	}
+
+	ret = 0;
+	if (!UnmapViewOfFile(infop->addr)) {
+		ret = __os_get_syserr();
+		if (ret != 0) {
+			__db_syserr(dbenv, ret, "UnmapViewOfFile");
+			ret = __os_posix_err(ret);
+		}
+	}
+
+	/* Only file-backed regions being destroyed have a file to remove. */
+	if (F_ISSET(dbenv, DB_ENV_SYSTEM_MEM) || !destroy)
+		return (ret);
+
+	if (F_ISSET(dbenv, DB_ENV_OVERWRITE))
+		(void)__db_file_multi_write(dbenv, infop->name);
+	unlink_ret = __os_unlink(dbenv, infop->name);
+	if (unlink_ret != 0 && ret == 0)
+		ret = unlink_ret;
+
+	return (ret);
+}
+
+/*
+ * __os_mapfile --
+ * Map in a shared memory file.
+ *
+ * Thin wrapper around __os_map for ordinary (non-region) files: it
+ * passes a NULL REGINFO and is_region/is_system of 0, so the mapping
+ * is created directly on fhp's file handle.  On success the mapped
+ * address is stored in *addr; returns whatever __os_map returns.
+ */
+int
+__os_mapfile(dbenv, path, fhp, len, is_rdonly, addr)
+ DB_ENV *dbenv;
+ char *path;
+ DB_FH *fhp;
+ int is_rdonly;
+ size_t len;
+ void **addr;
+{
+ return (__os_map(dbenv, path, NULL, fhp, len, 0, 0, is_rdonly, addr));
+}
+
+/*
+ * __os_unmapfile --
+ *	Unmap the shared memory file.
+ *
+ *	Returns 0 on success, or the POSIX-mapped system error if
+ *	UnmapViewOfFile fails.  The dbenv and len arguments are not used.
+ */
+int
+__os_unmapfile(dbenv, addr, len)
+	DB_ENV *dbenv;
+	void *addr;
+	size_t len;
+{
+	if (UnmapViewOfFile(addr))
+		return (0);
+
+	return (__os_posix_err(__os_get_syserr()));
+}
+
+/*
+ * __os_unique_name --
+ *	Create a unique identifying name from a pathname (may be absolute or
+ *	relative) and/or a file descriptor.
+ *
+ *	The name returned must be unique (different files map to different
+ *	names), and repeatable (same files, map to same names).  It's not
+ *	so easy to do by name.  Should handle not only:
+ *
+ *		foo.bar  ==  ./foo.bar  ==  c:/whatever_path/foo.bar
+ *
+ *	but also understand that:
+ *
+ *		foo.bar  ==  Foo.Bar	(FAT file system)
+ *		foo.bar  !=  Foo.Bar	(NTFS)
+ *
+ *	The best solution is to use the file index, found in the file
+ *	information structure (similar to UNIX inode #).
+ *
+ *	When a file is deleted, its file index may be reused,
+ *	but if the unique name has not gone from its namespace,
+ *	we may get a conflict.  So to ensure some tie in to the
+ *	original pathname, we also use the creation time and the
+ *	file basename.  This is not a perfect system, but it
+ *	should work for all but anomalous test cases.
+ *
+ *	orig_path:	 pathname the basename is taken from.
+ *	hfile:		 open handle queried for volume/index/creation time.
+ *	result_path:	 output buffer for the generated name.
+ *	result_path_len: passed unchanged to _sntprintf as the buffer count.
+ *
+ *	Returns 0, or a POSIX-mapped error if the file information cannot
+ *	be read.  NOTE(review): _sntprintf is documented not to
+ *	NUL-terminate on truncation; callers should terminate the buffer.
+ */
+static int
+__os_unique_name(orig_path, hfile, result_path, result_path_len)
+	_TCHAR *orig_path, *result_path;
+	HANDLE hfile;
+	size_t result_path_len;
+{
+	BY_HANDLE_FILE_INFORMATION fileinfo;
+	_TCHAR *basename, *p;
+
+	/*
+	 * In Windows, pathname components are delimited by '/' or '\', and
+	 * if neither is present, we need to strip off leading drive letter
+	 * (e.g. c:foo.txt).
+	 */
+	basename = _tcsrchr(orig_path, '/');
+	p = _tcsrchr(orig_path, '\\');
+	if (basename == NULL || (p != NULL && p > basename))
+		basename = p;
+	if (basename == NULL)
+		basename = _tcsrchr(orig_path, ':');
+
+	/* Step past the delimiter, or use the whole path if none found. */
+	if (basename == NULL)
+		basename = orig_path;
+	else
+		basename++;
+
+	if (!GetFileInformationByHandle(hfile, &fileinfo))
+		return (__os_posix_err(__os_get_syserr()));
+
+	/*
+	 * Include both halves of the creation time.  Formatting the high
+	 * half twice (as was done previously) discards the low 32 bits,
+	 * which are the part that actually distinguishes files created
+	 * close together in time.
+	 */
+	(void)_sntprintf(result_path, result_path_len,
+	    _T("__db_shmem.%8.8lx.%8.8lx.%8.8lx.%8.8lx.%8.8lx.%s"),
+	    fileinfo.dwVolumeSerialNumber,
+	    fileinfo.nFileIndexHigh,
+	    fileinfo.nFileIndexLow,
+	    fileinfo.ftCreationTime.dwHighDateTime,
+	    fileinfo.ftCreationTime.dwLowDateTime,
+	    basename);
+
+	return (0);
+}
+
+/*
+ * __os_map --
+ *	The mmap(2) function for Windows.
+ *
+ *	dbenv:		environment, used for error reporting.
+ *	path:		pathname of the file being mapped.
+ *	infop:		region information, or NULL for a plain file mapping.
+ *	fhp:		open handle for the file.
+ *	len:		number of bytes to map.
+ *	is_region:	non-zero when mapping an environment region.
+ *	is_system:	non-zero when the region lives in system memory.
+ *	is_rdonly:	non-zero for a read-only mapping.
+ *	addr:		on success, set to the mapped address.
+ *
+ *	Returns 0 on success.  Failure to create/open the mapping or to map
+ *	the view panics the environment and returns __db_panic's result.
+ */
+static int
+__os_map(dbenv, path, infop, fhp, len, is_region, is_system, is_rdonly, addr)
+	DB_ENV *dbenv;
+	REGINFO *infop;
+	char *path;
+	DB_FH *fhp;
+	int is_region, is_system, is_rdonly;
+	size_t len;
+	void **addr;
+{
+	HANDLE hMemory;
+	int ret, use_pagefile;
+	_TCHAR *tpath, shmem_name[DB_MAXPATHLEN];
+	void *pMemory;
+
+	ret = 0;
+	if (infop != NULL)
+		infop->wnt_handle = NULL;
+
+	use_pagefile = is_region && is_system;
+
+	/*
+	 * If creating a region in system space, get a matching name in the
+	 * paging file namespace.
+	 */
+	if (use_pagefile) {
+		TO_TSTRING(dbenv, path, tpath, ret);
+		if (ret != 0)
+			return (ret);
+		/*
+		 * Pass the buffer size in characters, not bytes: the size is
+		 * used as a _sntprintf count, and in UNICODE builds the byte
+		 * size is twice the number of elements in the array.
+		 */
+		ret = __os_unique_name(tpath, fhp->handle, shmem_name,
+		    sizeof(shmem_name) / sizeof(shmem_name[0]));
+		FREE_STRING(dbenv, tpath);
+		if (ret != 0)
+			return (ret);
+		/* _sntprintf does not NUL-terminate a truncated result. */
+		shmem_name[DB_MAXPATHLEN - 1] = _T('\0');
+	}
+
+	/*
+	 * XXX
+	 * DB: We have not implemented copy-on-write here.
+	 *
+	 * If this is a region in system memory, we try to open it using the
+	 * OpenFileMapping() first, and only call CreateFileMapping() if we're
+	 * really creating the section.  There are two reasons:
+	 *
+	 * 1) We only create the mapping if we have newly created the region.
+	 *    This avoids a long-running problem caused by Windows reference
+	 *    counting, where regions that are closed by all processes are
+	 *    deleted.  It turns out that just checking for a zeroed region
+	 *    is not good enough. See [#4882] and [#7127] for the details.
+	 *
+	 * 2) CreateFileMapping seems to mess up making the commit charge to
+	 *    the process. It thinks, incorrectly, that when we want to join a
+	 *    previously existing section, that it should make a commit charge
+	 *    for the whole section.  In fact, there is no new committed memory
+	 *    whatever.  The call can fail if there is insufficient memory free
+	 *    to handle the erroneous commit charge.  So, we find that the
+	 *    bogus commit is not made if we call OpenFileMapping.
+	 */
+	hMemory = NULL;
+	if (use_pagefile) {
+		hMemory = OpenFileMapping(
+		    is_rdonly ? FILE_MAP_READ : FILE_MAP_ALL_ACCESS,
+		    0, shmem_name);
+
+		if (hMemory == NULL && F_ISSET(infop, REGION_CREATE_OK))
+			hMemory = CreateFileMapping((HANDLE)-1, 0,
+			    is_rdonly ? PAGE_READONLY : PAGE_READWRITE,
+			    0, (DWORD)len, shmem_name);
+	} else
+		hMemory = CreateFileMapping(fhp->handle, 0,
+		    is_rdonly ? PAGE_READONLY : PAGE_READWRITE,
+		    0, (DWORD)len, NULL);
+
+	if (hMemory == NULL) {
+		ret = __os_get_syserr();
+		__db_syserr(dbenv, ret, "OpenFileMapping");
+		return (__db_panic(dbenv, __os_posix_err(ret)));
+	}
+
+	pMemory = MapViewOfFile(hMemory,
+	    (is_rdonly ? FILE_MAP_READ : FILE_MAP_ALL_ACCESS), 0, 0, len);
+	if (pMemory == NULL) {
+		/*
+		 * Capture the error before closing the mapping handle, which
+		 * would otherwise be leaked on this path.
+		 */
+		ret = __os_get_syserr();
+		(void)CloseHandle(hMemory);
+		__db_syserr(dbenv, ret, "MapViewOfFile");
+		return (__db_panic(dbenv, __os_posix_err(ret)));
+	}
+
+	/*
+	 * XXX
+	 * It turns out that the kernel object underlying the named section
+	 * is reference counted, but that the call to MapViewOfFile() above
+	 * does NOT increment the reference count! So, if we close the handle
+	 * here, the kernel deletes the object from the kernel namespace.
+	 * When a second process comes along to join the region, the kernel
+	 * happily creates a new object with the same name, but completely
+	 * different identity. The two processes then have distinct isolated
+	 * mapped sections, not at all what was wanted. Not closing the handle
+	 * here fixes this problem.  We carry the handle around in the region
+	 * structure so we can close it when unmap is called.
+	 */
+	if (use_pagefile && infop != NULL)
+		infop->wnt_handle = hMemory;
+	else
+		CloseHandle(hMemory);
+
+	*addr = pMemory;
+	return (ret);
+}
diff --git a/db/os_windows/os_open.c b/db/os_windows/os_open.c
new file mode 100644
index 000000000..9346722ae
--- /dev/null
+++ b/db/os_windows/os_open.c
@@ -0,0 +1,186 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997-2006
+ * Oracle Corporation. All rights reserved.
+ *
+ * $Id: os_open.c,v 12.16 2006/09/12 01:49:36 mjc Exp $
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_open --
+ *	Open a file descriptor.
+ *
+ *	Convenience wrapper for __os_open_extend with a page size of 0.
+ *	On success *fhpp refers to the new handle; returns 0 or an error.
+ */
+int
+__os_open(dbenv, name, flags, mode, fhpp)
+	DB_ENV *dbenv;
+	const char *name;
+	u_int32_t flags;
+	int mode;
+	DB_FH **fhpp;
+{
+	return (__os_open_extend(dbenv, name, 0, flags, mode, fhpp));
+}
+
+/*
+ * __os_open_extend --
+ *	Open a file descriptor (including page size and log size information).
+ *
+ *	dbenv:		environment, used for allocation and error reporting.
+ *	name:		pathname of the file to open.
+ *	page_size:	page size; when non-zero and DB_OSO_DIRECT is set,
+ *			used to decide whether unbuffered I/O is possible.
+ *	flags:		DB_OSO_* flags (validated against OKFLAGS).
+ *	mode:		currently ignored, see below.
+ *	fhpp:		on success, set to the newly allocated handle.
+ *
+ *	Returns 0 on success, otherwise an error code; on failure no handle
+ *	is returned and everything acquired here is released.
+ */
+int
+__os_open_extend(dbenv, name, page_size, flags, mode, fhpp)
+	DB_ENV *dbenv;
+	const char *name;
+	u_int32_t page_size, flags;
+	int mode;
+	DB_FH **fhpp;
+{
+	DB_FH *fhp;
+	DWORD cluster_size, sector_size, free_clusters, total_clusters;
+	int access, attr, createflag, nrepeat, ret, share;
+	_TCHAR *drive, *tname;
+	_TCHAR dbuf[4]; /* <letter><colon><slash><nul> */
+
+	fhp = NULL;
+	tname = NULL;
+
+#define	OKFLAGS								\
+	(DB_OSO_ABSMODE | DB_OSO_CREATE | DB_OSO_DIRECT | DB_OSO_DSYNC |\
+	DB_OSO_EXCL | DB_OSO_RDONLY | DB_OSO_REGION |	DB_OSO_SEQ |	\
+	DB_OSO_TEMP | DB_OSO_TRUNC)
+	if ((ret = __db_fchk(dbenv, "__os_open", flags, OKFLAGS)) != 0)
+		return (ret);
+
+	TO_TSTRING(dbenv, name, tname, ret);
+	if (ret != 0)
+		goto err;
+
+	if ((ret = __os_calloc(dbenv, 1, sizeof(DB_FH), &fhp)) != 0)
+		goto err;
+
+	/*
+	 * Use the Windows/32 CreateFile interface so that we can
+	 * play magic games with files to get data flush effects similar to
+	 * the POSIX O_DSYNC flag.
+	 *
+	 * !!!
+	 * We currently ignore the 'mode' argument.  It would be possible
+	 * to construct a set of security attributes that we could pass to
+	 * CreateFile that would accurately represents the mode.  In worst
+	 * case, this would require looking up user and all group names and
+	 * creating an entry for each.  Alternatively, we could call the
+	 * _chmod (partial emulation) function after file creation, although
+	 * this leaves us with an obvious race.  However, these efforts are
+	 * largely meaningless on FAT, the most common file system, which
+	 * only has a "readable" and "writeable" flag, applying to all users.
+	 */
+	access = GENERIC_READ;
+	if (!LF_ISSET(DB_OSO_RDONLY))
+		access |= GENERIC_WRITE;
+
+	share = FILE_SHARE_READ | FILE_SHARE_WRITE;
+	if (__os_is_winnt())
+		share |= FILE_SHARE_DELETE;
+	attr = FILE_ATTRIBUTE_NORMAL;
+
+	/*
+	 * Reproduce POSIX 1003.1 semantics: if O_CREAT and O_EXCL are both
+	 * specified, fail, returning EEXIST, unless we create the file.
+	 */
+	if (LF_ISSET(DB_OSO_CREATE) && LF_ISSET(DB_OSO_EXCL))
+		createflag = CREATE_NEW;	/* create only if !exist*/
+	else if (!LF_ISSET(DB_OSO_CREATE) && LF_ISSET(DB_OSO_TRUNC))
+		createflag = TRUNCATE_EXISTING; /* truncate, fail if !exist */
+	else if (LF_ISSET(DB_OSO_TRUNC))
+		createflag = CREATE_ALWAYS;	/* create and truncate */
+	else if (LF_ISSET(DB_OSO_CREATE))
+		createflag = OPEN_ALWAYS;	/* open or create */
+	else
+		createflag = OPEN_EXISTING;	/* open only if existing */
+
+	if (LF_ISSET(DB_OSO_DSYNC)) {
+		F_SET(fhp, DB_FH_NOSYNC);
+		attr |= FILE_FLAG_WRITE_THROUGH;
+	}
+
+	if (LF_ISSET(DB_OSO_SEQ))
+		attr |= FILE_FLAG_SEQUENTIAL_SCAN;
+	else
+		attr |= FILE_FLAG_RANDOM_ACCESS;
+
+	if (LF_ISSET(DB_OSO_TEMP))
+		attr |= FILE_FLAG_DELETE_ON_CLOSE;
+
+	/*
+	 * We can turn filesystem buffering off if the page size is a
+	 * multiple of the disk's sector size. To find the sector size,
+	 * we call GetDiskFreeSpace, which expects a drive name like "d:\\"
+	 * or NULL for the current disk (i.e., a relative path)
+	 */
+	if (LF_ISSET(DB_OSO_DIRECT) && page_size != 0 && name[0] != '\0') {
+		if (name[1] == ':') {
+			drive = dbuf;
+			/* The count is in characters, not bytes. */
+			_sntprintf(dbuf, sizeof(dbuf) / sizeof(dbuf[0]),
+			    _T("%c:\\"), tname[0]);
+		} else
+			drive = NULL;
+
+		/*
+		 * We ignore all results except sectorsize, but some versions
+		 * of Windows require that the parameters are non-NULL.
+		 */
+		if (GetDiskFreeSpace(drive, &cluster_size,
+		    &sector_size, &free_clusters, &total_clusters) &&
+		    page_size % sector_size == 0)
+			attr |= FILE_FLAG_NO_BUFFERING;
+	}
+
+	fhp->handle = fhp->trunc_handle = INVALID_HANDLE_VALUE;
+	for (nrepeat = 1;; ++nrepeat) {
+		if (fhp->handle == INVALID_HANDLE_VALUE)
+			fhp->handle = CreateFile(
+			    tname, access, share, NULL, createflag, attr, 0);
+
+		/*
+		 * Windows does not provide truncate directly.  There is no
+		 * safe way to use a handle for truncate concurrently with
+		 * reads or writes.  To deal with this, we open a second handle
+		 * used just for truncating.
+		 */
+		if (fhp->handle != INVALID_HANDLE_VALUE &&
+		    !LF_ISSET(DB_OSO_RDONLY | DB_OSO_TEMP) &&
+		    fhp->trunc_handle == INVALID_HANDLE_VALUE)
+			fhp->trunc_handle = CreateFile(
+			    tname, access, share, NULL, OPEN_EXISTING, attr, 0);
+
+		if (fhp->handle == INVALID_HANDLE_VALUE ||
+		    (!LF_ISSET(DB_OSO_RDONLY | DB_OSO_TEMP) &&
+		    fhp->trunc_handle == INVALID_HANDLE_VALUE)) {
+			/*
+			 * If it's a "temporary" error, we retry up to 3 times,
+			 * waiting up to 12 seconds.  While it's not a problem
+			 * if we can't open a database, an inability to open a
+			 * log file is cause for serious dismay.
+			 */
+			ret = __os_posix_err(__os_get_syserr());
+			if ((ret != ENFILE && ret != EMFILE && ret != ENOSPC) ||
+			    nrepeat > 3)
+				goto err;
+
+			__os_sleep(dbenv, nrepeat * 2, 0);
+		} else
+			break;
+	}
+
+	FREE_STRING(dbenv, tname);
+
+	F_SET(fhp, DB_FH_OPENED);
+	*fhpp = fhp;
+	return (0);
+
+err:	FREE_STRING(dbenv, tname);
+	if (fhp != NULL) {
+		/*
+		 * The primary handle may have been opened even though the
+		 * truncate handle could not be.  __os_closehandle only closes
+		 * handles on a DB_FH flagged as opened, so flag it when there
+		 * is a valid handle; otherwise that handle would be leaked.
+		 * Every path that reaches here with a non-NULL fhp has already
+		 * initialized both handle fields.
+		 */
+		if (fhp->handle != INVALID_HANDLE_VALUE)
+			F_SET(fhp, DB_FH_OPENED);
+		(void)__os_closehandle(dbenv, fhp);
+	}
+	return (ret);
+}
diff --git a/db/os_windows/os_rename.c b/db/os_windows/os_rename.c
new file mode 100644
index 000000000..a7bdfac2b
--- /dev/null
+++ b/db/os_windows/os_rename.c
@@ -0,0 +1,70 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997-2006
+ * Oracle Corporation. All rights reserved.
+ *
+ * $Id: os_rename.c,v 12.6 2006/08/24 14:46:22 bostic Exp $
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_rename --
+ *	Rename a file.
+ *
+ *	If the plain rename fails because the destination exists, the
+ *	destination is replaced.  When silent is zero, a failure is reported
+ *	through __db_syserr.  Returns 0 or a POSIX-mapped error.
+ */
+int
+__os_rename(dbenv, oldname, newname, silent)
+	DB_ENV *dbenv;
+	const char *oldname, *newname;
+	u_int32_t silent;
+{
+	_TCHAR *tsrc, *tdst;
+	int ret;
+
+	TO_TSTRING(dbenv, oldname, tsrc, ret);
+	if (ret != 0)
+		return (ret);
+	TO_TSTRING(dbenv, newname, tdst, ret);
+	if (ret != 0) {
+		FREE_STRING(dbenv, tsrc);
+		return (ret);
+	}
+
+	/* First attempt: a simple move, which fails if the target exists. */
+	if (!MoveFile(tsrc, tdst))
+		ret = __os_get_syserr();
+
+	if (__os_posix_err(ret) == EEXIST) {
+		ret = 0;
+		if (!__os_is_winnt()) {
+			/*
+			 * There is no MoveFileEx for Win9x/Me, so do the best
+			 * we can: delete the target and retry.  The MoveFile
+			 * call above would have succeeded if oldname and
+			 * newname referred to the same file, so that case
+			 * needs no check here.
+			 */
+			(void)DeleteFile(tdst);
+			if (!MoveFile(tsrc, tdst))
+				ret = __os_get_syserr();
+		} else if (!MoveFileEx(
+		    tsrc, tdst, MOVEFILE_REPLACE_EXISTING))
+			ret = __os_get_syserr();
+	}
+
+	FREE_STRING(dbenv, tdst);
+	FREE_STRING(dbenv, tsrc);
+
+	if (ret == 0)
+		return (0);
+
+	if (silent == 0)
+		__db_syserr(
+		    dbenv, ret, "MoveFileEx %s %s", oldname, newname);
+
+	return (__os_posix_err(ret));
+}
diff --git a/db/os_windows/os_rw.c b/db/os_windows/os_rw.c
new file mode 100644
index 000000000..2d98a0f28
--- /dev/null
+++ b/db/os_windows/os_rw.c
@@ -0,0 +1,186 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997-2006
+ * Oracle Corporation. All rights reserved.
+ *
+ * $Id: os_rw.c,v 12.15 2006/08/24 14:46:22 bostic Exp $
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_io --
+ * Do an I/O.
+ *
+ * Reads or writes (op is DB_IO_READ or DB_IO_WRITE) io_len bytes between
+ * buf and the file.  On Windows NT a single positioned ReadFile/WriteFile
+ * is tried first; if that call fails or transfers a short count, the
+ * operation is redone on the "slow" path, which seeks and then calls
+ * __os_read/__os_write while holding the handle's mutex.  On success
+ * *niop is set to the number of bytes transferred.
+ */
+int
+__os_io(dbenv, op, fhp, pgno, pgsize, relative, io_len, buf, niop)
+ DB_ENV *dbenv;
+ int op;
+ DB_FH *fhp;
+ db_pgno_t pgno;
+ u_int32_t pgsize, relative, io_len;
+ u_int8_t *buf;
+ size_t *niop;
+{
+ int ret;
+
+ if (__os_is_winnt()) {
+ ULONG64 off;
+ OVERLAPPED over;
+ DWORD nbytes;
+ /*
+ * Fast-path offset: "relative" when it is non-zero, otherwise
+ * pgsize * pgno.  (The slow path seeks via __os_seek instead.)
+ */
+ if ((off = relative) == 0)
+ off = (ULONG64)pgsize * pgno;
+ /* Split the 64-bit offset across the OVERLAPPED fields. */
+ over.Offset = (DWORD)(off & 0xffffffff);
+ over.OffsetHigh = (DWORD)(off >> 32);
+ over.hEvent = 0; /* we don't want asynchronous notifications */
+
+ switch (op) {
+ case DB_IO_READ:
+ if (!ReadFile(fhp->handle,
+ buf, (DWORD)io_len, &nbytes, &over))
+ goto slow;
+ break;
+ case DB_IO_WRITE:
+#ifdef HAVE_FILESYSTEM_NOTZERO
+ /* Writes go through the slow path on such filesystems. */
+ if (__os_fs_notzero())
+ goto slow;
+#endif
+ if (!WriteFile(fhp->handle,
+ buf, (DWORD)io_len, &nbytes, &over))
+ goto slow;
+ break;
+ }
+ /* Done only if the full length transferred; else fall through. */
+ if (nbytes == io_len) {
+ *niop = (size_t)nbytes;
+ return (0);
+ }
+ }
+
+ /* Slow path: seek + read/write under the file handle mutex. */
+slow: MUTEX_LOCK(dbenv, fhp->mtx_fh);
+
+ if ((ret = __os_seek(dbenv, fhp, pgno, pgsize, relative)) != 0)
+ goto err;
+
+ switch (op) {
+ case DB_IO_READ:
+ ret = __os_read(dbenv, fhp, buf, io_len, niop);
+ break;
+ case DB_IO_WRITE:
+ ret = __os_write(dbenv, fhp, buf, io_len, niop);
+ break;
+ }
+
+err: MUTEX_UNLOCK(dbenv, fhp->mtx_fh);
+
+ return (ret);
+}
+
+/*
+ * __os_read --
+ *	Read from a file handle.
+ *
+ *	Keeps calling ReadFile until len bytes have been read, a read
+ *	returns zero bytes, or an error occurs.  *nrp is set to the number
+ *	of bytes actually read.  Returns 0 or a POSIX-mapped error.
+ */
+int
+__os_read(dbenv, fhp, addr, len, nrp)
+	DB_ENV *dbenv;
+	DB_FH *fhp;
+	void *addr;
+	size_t len;
+	size_t *nrp;
+{
+	size_t offset, nr;
+	DWORD count;
+	int ret;
+	u_int8_t *taddr;
+
+	ret = 0;
+	taddr = addr;
+	offset = 0;
+	while (offset < len) {
+		RETRY_CHK((!ReadFile(fhp->handle,
+		    taddr, (DWORD)(len - offset), &count, NULL)), ret);
+		/* Stop on error or when no more data is available. */
+		if (ret != 0 || count == 0)
+			break;
+
+		/* Advance past the bytes just read. */
+		nr = (size_t)count;
+		taddr += nr;
+		offset += nr;
+	}
+
+	/* taddr only moves past completed reads, so this is the total. */
+	*nrp = taddr - (u_int8_t *)addr;
+
+	if (ret != 0) {
+		__db_syserr(dbenv, ret, "read: 0x%lx, %lu",
+		    P_TO_ULONG(taddr), (u_long)len - offset);
+		ret = __os_posix_err(ret);
+	}
+	return (ret);
+}
+
+/*
+ * __os_write --
+ * Write to a file handle.
+ *
+ * When built with HAVE_FILESYSTEM_NOTZERO and __os_fs_notzero() reports
+ * true, __os_zerofill is called first and its error, if any, is
+ * returned.  The write itself is delegated to __os_physwrite, whose
+ * return value and *nwp result are passed through unchanged.
+ */
+int
+__os_write(dbenv, fhp, addr, len, nwp)
+ DB_ENV *dbenv;
+ DB_FH *fhp;
+ void *addr;
+ size_t len;
+ size_t *nwp;
+{
+ int ret;
+
+#ifdef HAVE_FILESYSTEM_NOTZERO
+ /* Zero-fill as necessary. */
+ if (__os_fs_notzero() && (ret = __os_zerofill(dbenv, fhp)) != 0)
+ return (ret);
+#endif
+ return (__os_physwrite(dbenv, fhp, addr, len, nwp));
+}
+
+/*
+ * __os_physwrite --
+ *	Physical write to a file handle.
+ *
+ *	Keeps calling WriteFile until len bytes from addr have been written
+ *	or an error occurs.  *nwp is set to the number of bytes actually
+ *	written: len on success, the count completed before the failure
+ *	otherwise.  Returns 0 or a POSIX-mapped error; a failure also raises
+ *	the DB_EVENT_WRITE_FAILED event.
+ */
+int
+__os_physwrite(dbenv, fhp, addr, len, nwp)
+	DB_ENV *dbenv;
+	DB_FH *fhp;
+	void *addr;
+	size_t len;
+	size_t *nwp;
+{
+	size_t offset, nw;
+	DWORD count;
+	int ret;
+	u_int8_t *taddr;
+
+	/*
+	 * Make a last "panic" check.  Imagine a thread of control running in
+	 * Berkeley DB, going to sleep.  Another thread of control decides to
+	 * run recovery because the environment is broken.  The first thing
+	 * recovery does is panic the existing environment, but we only check
+	 * the panic flag when crossing the public API.  If the sleeping thread
+	 * wakes up and writes something, we could have two threads of control
+	 * writing the log files at the same time.  So, before writing, make a
+	 * last panic check.  Obviously, there's still a window, but it's very,
+	 * very small.
+	 */
+	PANIC_CHECK(dbenv);
+
+	ret = 0;
+	for (taddr = addr,
+	    offset = 0; offset < len; taddr += nw, offset += nw) {
+		RETRY_CHK((!WriteFile(fhp->handle,
+		    taddr, (DWORD)(len - offset), &count, NULL)), ret);
+		if (ret != 0)
+			break;
+		nw = (size_t)count;
+	}
+
+	/*
+	 * Report what was really written.  The loop only ends normally once
+	 * offset has reached len, so this equals len on success; on failure
+	 * it is the number of bytes written before the error rather than a
+	 * claim that the whole buffer went out.
+	 */
+	*nwp = offset;
+	if (ret != 0) {
+		__db_syserr(dbenv, ret, "write: %#lx, %lu",
+		    P_TO_ULONG(taddr), (u_long)len - offset);
+		ret = __os_posix_err(ret);
+
+		DB_EVENT(dbenv, DB_EVENT_WRITE_FAILED, NULL);
+	}
+	return (ret);
+}
diff --git a/db/os_windows/os_seek.c b/db/os_windows/os_seek.c
new file mode 100644
index 000000000..1a2131879
--- /dev/null
+++ b/db/os_windows/os_seek.c
@@ -0,0 +1,55 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997-2006
+ * Oracle Corporation. All rights reserved.
+ *
+ * $Id: os_seek.c,v 12.8 2006/08/24 14:46:22 bostic Exp $
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_seek --
+ *	Seek to a page/byte offset in the file.
+ *
+ *	The target is (pgsize * pgno) + relative, measured from the start
+ *	of the file.  On success the position is recorded in fhp.  Returns
+ *	0 or a POSIX-mapped error.
+ */
+int
+__os_seek(dbenv, fhp, pgno, pgsize, relative)
+	DB_ENV *dbenv;
+	DB_FH *fhp;
+	db_pgno_t pgno;
+	u_int32_t pgsize;
+	u_int32_t relative;
+{
+	/*
+	 * SetFilePointer takes the offset as separate low and high halves,
+	 * so overlay them on the 64-bit value.
+	 */
+	union {
+		__int64 bigint;
+		struct {
+			unsigned long low;
+			long high;
+		};
+	} pos;
+	off_t offset;
+	int ret;
+
+	offset = (off_t)pgsize * pgno + relative;
+	pos.bigint = offset;
+
+	if (SetFilePointer(fhp->handle,
+	    pos.low, &pos.high, FILE_BEGIN) == (DWORD)-1)
+		ret = __os_get_syserr();
+	else
+		ret = 0;
+
+	if (ret != 0) {
+		__db_syserr(dbenv, ret,
+		    "seek: %lu: (%lu * %lu) + %lu", (u_long)offset,
+		    (u_long)pgno, (u_long)pgsize, (u_long)relative);
+		return (__os_posix_err(ret));
+	}
+
+	/* Remember where the handle is now positioned. */
+	fhp->pgsize = pgsize;
+	fhp->pgno = pgno;
+	fhp->offset = relative;
+
+	return (0);
+}
diff --git a/db/os_windows/os_sleep.c b/db/os_windows/os_sleep.c
new file mode 100644
index 000000000..f3709fb1a
--- /dev/null
+++ b/db/os_windows/os_sleep.c
@@ -0,0 +1,34 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997-2006
+ * Oracle Corporation. All rights reserved.
+ *
+ * $Id: os_sleep.c,v 12.4 2006/08/24 14:46:22 bostic Exp $
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_sleep --
+ *	Yield the processor for a period of time.
+ *
+ *	The interval is secs seconds plus usecs microseconds; usecs may be
+ *	one million or more.  The sleep is requested in milliseconds, so
+ *	any sub-millisecond remainder is dropped.
+ */
+void
+__os_sleep(dbenv, secs, usecs)
+	DB_ENV *dbenv;
+	u_long secs, usecs;		/* Seconds and microseconds. */
+{
+	u_long msecs;
+
+	COMPQUIET(dbenv, NULL);
+
+	/* Don't require that the values be normalized. */
+	secs += usecs / 1000000;
+	usecs %= 1000000;
+
+	msecs = secs * 1000 + usecs / 1000;
+
+	/*
+	 * It's important that we yield the processor here so that other
+	 * processes or threads are permitted to run.
+	 */
+	Sleep(msecs);
+}
diff --git a/db/os_windows/os_spin.c b/db/os_windows/os_spin.c
new file mode 100644
index 000000000..5d5a23a28
--- /dev/null
+++ b/db/os_windows/os_spin.c
@@ -0,0 +1,38 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997-2006
+ * Oracle Corporation. All rights reserved.
+ *
+ * $Id: os_spin.c,v 12.6 2006/08/24 14:46:22 bostic Exp $
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_spin --
+ *	Return the number of default spins before blocking.
+ *
+ *	The result is 50 spins per processor on multiprocessor machines and
+ *	1 otherwise.  The dbenv argument is not used.
+ */
+u_int32_t
+__os_spin(dbenv)
+	DB_ENV *dbenv;
+{
+	SYSTEM_INFO si;
+
+	/* Get the number of processors */
+	GetSystemInfo(&si);
+
+	/*
+	 * Spin 50 times per processor -- we have anecdotal evidence that this
+	 * is a reasonable value.
+	 */
+	return (si.dwNumberOfProcessors > 1 ?
+	    50 * si.dwNumberOfProcessors : 1);
+}
diff --git a/db/os_windows/os_stat.c b/db/os_windows/os_stat.c
new file mode 100644
index 000000000..cd018b83f
--- /dev/null
+++ b/db/os_windows/os_stat.c
@@ -0,0 +1,84 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997-2006
+ * Oracle Corporation. All rights reserved.
+ *
+ * $Id: os_stat.c,v 12.9 2006/08/24 14:46:22 bostic Exp $
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_exists --
+ *	Return if the file exists.
+ *
+ *	Returns 0 when the path's attributes can be read, otherwise a
+ *	POSIX-mapped error.  If isdirp is not NULL it is set, on success,
+ *	to a non-zero value when the path is a directory and 0 otherwise.
+ */
+int
+__os_exists(dbenv, path, isdirp)
+	DB_ENV *dbenv;
+	const char *path;
+	int *isdirp;
+{
+	DWORD attrs;
+	_TCHAR *tpath;
+	int ret;
+
+	TO_TSTRING(dbenv, path, tpath, ret);
+	if (ret != 0)
+		return (ret);
+
+	RETRY_CHK(
+	    ((attrs = GetFileAttributes(tpath)) == (DWORD)-1 ? 1 : 0), ret);
+
+	if (ret != 0)
+		ret = __os_posix_err(ret);
+	else if (isdirp != NULL)
+		*isdirp = (attrs & FILE_ATTRIBUTE_DIRECTORY);
+
+	FREE_STRING(dbenv, tpath);
+	return (ret);
+}
+
+/*
+ * __os_ioinfo --
+ *	Return file size and I/O size; abstracted to make it easier
+ *	to replace.
+ *
+ *	The size is split into whole megabytes (*mbytesp) and the remaining
+ *	bytes (*bytesp); *iosizep receives DB_DEF_IOSIZE.  Any of the three
+ *	output pointers may be NULL.  The path argument is not used; the
+ *	information comes from fhp's handle.
+ */
+int
+__os_ioinfo(dbenv, path, fhp, mbytesp, bytesp, iosizep)
+	DB_ENV *dbenv;
+	const char *path;
+	DB_FH *fhp;
+	u_int32_t *mbytesp, *bytesp, *iosizep;
+{
+	BY_HANDLE_FILE_INFORMATION info;
+	unsigned __int64 nbytes;
+	int ret;
+
+	RETRY_CHK((!GetFileInformationByHandle(fhp->handle, &info)), ret);
+	if (ret != 0) {
+		__db_syserr(dbenv, ret, "GetFileInformationByHandle");
+		return (__os_posix_err(ret));
+	}
+
+	/* Combine the two 32-bit halves into the 64-bit file size. */
+	nbytes = (unsigned __int64)info.nFileSizeHigh << 32;
+	nbytes += info.nFileSizeLow;
+
+	/* Return the size of the file. */
+	if (mbytesp != NULL)
+		*mbytesp = (u_int32_t)(nbytes / MEGABYTE);
+	if (bytesp != NULL)
+		*bytesp = (u_int32_t)(nbytes % MEGABYTE);
+
+	/*
+	 * The filesystem I/O size is not easily available.  In particular,
+	 * the values returned by GetDiskFreeSpace() are not very helpful
+	 * (NTFS volumes often report 512B clusters, which are too small to
+	 * be a useful default).
+	 */
+	if (iosizep != NULL)
+		*iosizep = DB_DEF_IOSIZE;
+
+	return (0);
+}
diff --git a/db/os_windows/os_truncate.c b/db/os_windows/os_truncate.c
new file mode 100644
index 000000000..936d080ef
--- /dev/null
+++ b/db/os_windows/os_truncate.c
@@ -0,0 +1,90 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2004-2006
+ * Oracle Corporation. All rights reserved.
+ *
+ * $Id: os_truncate.c,v 12.12 2006/09/05 15:30:18 mjc Exp $
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_truncate --
+ *	Truncate the file.
+ *
+ *	Sets the end of the file to pgsize * pgno bytes using the handle
+ *	reserved for truncation (fhp->trunc_handle).  Returns 0 or a
+ *	POSIX-mapped error.
+ */
+int
+__os_truncate(dbenv, fhp, pgno, pgsize)
+	DB_ENV *dbenv;
+	DB_FH *fhp;
+	db_pgno_t pgno;
+	u_int32_t pgsize;
+{
+	/* Yes, this really is how Microsoft have designed their API */
+	union {
+		__int64 bigint;
+		struct {
+			unsigned long low;
+			long high;
+		};
+	} off;
+	off_t offset;
+	int ret;
+
+	ret = 0;
+	offset = (off_t)pgsize * pgno;
+
+#ifdef HAVE_FILESYSTEM_NOTZERO
+	/*
+	 * If the filesystem doesn't zero fill, it isn't safe to extend the
+	 * file, or we end up with junk blocks.  Just return in that case.
+	 */
+	if (__os_fs_notzero()) {
+		off_t stat_offset;
+		u_int32_t mbytes, bytes;
+
+		/* Stat the file. */
+		if ((ret =
+		    __os_ioinfo(dbenv, NULL, fhp, &mbytes, &bytes, NULL)) != 0)
+			return (ret);
+		stat_offset = (off_t)mbytes * MEGABYTE + bytes;
+
+		if (offset > stat_offset)
+			return (0);
+	}
+#endif
+
+	/*
+	 * Windows doesn't provide truncate directly.  Instead, it has
+	 * SetEndOfFile, which truncates to the current position.  To
+	 * deal with that, we open a duplicate file handle for truncating.
+	 *
+	 * We want to retry the truncate call, which involves a SetFilePointer
+	 * and a SetEndOfFile, but there are several complications:
+	 *
+	 * 1) since the Windows API deals in 32-bit values, it's possible that
+	 *    the return from SetFilePointer (the low 32-bits) is
+	 *    INVALID_SET_FILE_POINTER even when the call has succeeded.  So we
+	 *    have to also check whether GetLastError() returns NO_ERROR.
+	 *
+	 * 2) when it returns, SetFilePointer overwrites the high bits of the
+	 *    offset, so if we need to retry, we have to reset the offset each
+	 *    time.
+	 *
+	 * We can't switch to SetFilePointerEx, which knows about 64-bit
+	 * offsets, because it isn't supported on Win9x/ME.
+	 */
+	RETRY_CHK((off.bigint = (__int64)pgsize * pgno,
+	    (SetFilePointer(fhp->trunc_handle, off.low, &off.high, FILE_BEGIN)
+	    == INVALID_SET_FILE_POINTER && GetLastError() != NO_ERROR) ||
+	    !SetEndOfFile(fhp->trunc_handle)), ret);
+
+	if (ret != 0) {
+		/*
+		 * Cast explicitly so the argument matches %lu, as the other
+		 * diagnostics in this layer do.
+		 */
+		__db_syserr(dbenv, ret, "SetFilePointer: %lu", (u_long)offset);
+		ret = __os_posix_err(ret);
+	}
+
+	return (ret);
+}
diff --git a/db/os_windows/os_unlink.c b/db/os_windows/os_unlink.c
new file mode 100644
index 000000000..d6a7359c2
--- /dev/null
+++ b/db/os_windows/os_unlink.c
@@ -0,0 +1,109 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997-2006
+ * Oracle Corporation. All rights reserved.
+ *
+ * $Id: os_unlink.c,v 12.15 2006/08/24 14:46:22 bostic Exp $
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_region_unlink --
+ *	Remove a shared memory object file.
+ *
+ *	When the environment has DB_ENV_OVERWRITE set, the file contents
+ *	are overwritten first.  Returns the result of __os_unlink.
+ */
+int
+__os_region_unlink(dbenv, path)
+	DB_ENV *dbenv;
+	const char *path;
+{
+	int ret;
+
+	/* Overwriting is best-effort; its result is deliberately ignored. */
+	if (F_ISSET(dbenv, DB_ENV_OVERWRITE))
+		(void)__db_file_multi_write(dbenv, path);
+
+	ret = __os_unlink(dbenv, path);
+	return (ret);
+}
+
+/*
+ * __os_unlink --
+ *	Remove a file.
+ *
+ *	On Windows NT the file is first renamed to a unique temporary name
+ *	and then removed, so that a new file with the original name can be
+ *	created immediately.  Returns 0 on success or a POSIX-mapped error;
+ *	ENOENT is returned but not reported.
+ */
+int
+__os_unlink(dbenv, path)
+	DB_ENV *dbenv;
+	const char *path;
+{
+	HANDLE h;
+	_TCHAR *tpath, *orig_tpath, buf[DB_MAXPATHLEN];
+	u_int32_t id;
+	int ret, t_ret;
+
+	TO_TSTRING(dbenv, path, tpath, ret);
+	if (ret != 0)
+		return (ret);
+	orig_tpath = tpath;
+
+	/*
+	 * Windows NT and its descendents allow removal of open files, but the
+	 * DeleteFile Win32 system call isn't equivalent to a POSIX unlink.
+	 * Firstly, it only succeeds if FILE_SHARE_DELETE is set when the file
+	 * is opened.  Secondly, it leaves the file in a "zombie" state, where
+	 * it can't be opened again, but a new file with the same name can't be
+	 * created either.
+	 *
+	 * Since we depend on being able to recreate files (during recovery,
+	 * say), we have to first rename the file, and then delete it.  It
+	 * still hangs around, but with a name we don't care about.  The rename
+	 * will fail if the file doesn't exist, which isn't a problem, but if
+	 * it fails for some other reason, we need to know about it or a
+	 * subsequent open may fail for no apparent reason.
+	 */
+	if (__os_is_winnt()) {
+		__os_unique_id(dbenv, &id);
+		_sntprintf(buf, DB_MAXPATHLEN, _T("%s.del.%010u"), tpath, id);
+		/* _sntprintf does not NUL-terminate a truncated result. */
+		buf[DB_MAXPATHLEN - 1] = _T('\0');
+		if (MoveFile(tpath, buf))
+			tpath = buf;
+		else {
+			ret = __os_get_syserr();
+			if (__os_posix_err(ret) != ENOENT)
+				__db_err(dbenv, ret,
+				    "MoveFile: rename %s to temporary file",
+				    path);
+		}
+
+		/*
+		 * Try removing the file using the delete-on-close flag.  This
+		 * plays nicer with files that are still open than DeleteFile.
+		 */
+		h = CreateFile(tpath, 0, FILE_SHARE_READ, NULL, OPEN_EXISTING,
+		    FILE_FLAG_DELETE_ON_CLOSE, 0);
+		if (h != INVALID_HANDLE_VALUE) {
+			(void)CloseHandle (h);
+			if (GetFileAttributes(tpath) ==
+			    INVALID_FILE_ATTRIBUTES) {
+				/*
+				 * The file is gone.  Discard any error left
+				 * over from a failed rename above so that a
+				 * successful removal isn't reported as a
+				 * failure.
+				 */
+				ret = 0;
+				goto skipdel;
+			}
+		}
+	}
+
+	RETRY_CHK((!DeleteFile(tpath)), ret);
+
+skipdel:
+	FREE_STRING(dbenv, orig_tpath);
+
+	/*
+	 * XXX
+	 * We shouldn't be testing for an errno of ENOENT here, but ENOENT
+	 * signals that a file is missing, and we attempt to unlink things
+	 * (such as v. 2.x environment regions, in DB_ENV->remove) that we
+	 * are expecting not to be there.  Reporting errors in these cases
+	 * is annoying.
+	 */
+	if (ret != 0) {
+		if ((t_ret = __os_posix_err(ret)) != ENOENT)
+			__db_syserr(dbenv, ret, "DeleteFile: %s", path);
+		ret = t_ret;
+	}
+
+	return (ret);
+}
diff --git a/db/os_windows/os_yield.c b/db/os_windows/os_yield.c
new file mode 100644
index 000000000..200633cc7
--- /dev/null
+++ b/db/os_windows/os_yield.c
@@ -0,0 +1,27 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997-2006
+ * Oracle Corporation. All rights reserved.
+ *
+ * $Id: os_yield.c,v 12.7 2006/08/24 14:46:22 bostic Exp $
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+
+/*
+ * __os_yield --
+ *	Yield the processor.
+ *
+ *	The dbenv argument is not used.
+ */
+void
+__os_yield(dbenv)
+	DB_ENV *dbenv;
+{
+	/* Quiet unused-parameter warnings, as __os_sleep does. */
+	COMPQUIET(dbenv, NULL);
+
+	/*
+	 * The call to Sleep(0) is specified by MSDN to yield the current
+	 * thread's time slice to another thread of equal or greater priority.
+	 */
+	Sleep(0);
+}