summaryrefslogtreecommitdiff
path: root/src/invlib.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/invlib.c')
-rw-r--r--src/invlib.c1186
1 files changed, 1186 insertions, 0 deletions
diff --git a/src/invlib.c b/src/invlib.c
new file mode 100644
index 0000000..872ca38
--- /dev/null
+++ b/src/invlib.c
@@ -0,0 +1,1186 @@
+/*===========================================================================
+ Copyright (c) 1998-2000, The Santa Cruz Operation
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ *Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+
+ *Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+ *Neither name of The Santa Cruz Operation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS
+ IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION)
+ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ DAMAGE.
+ =========================================================================*/
+
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#if SHARE
+#include <sys/types.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#define ERR -1
+#endif
+#include "invlib.h"
+#include "global.h"
+
+#include <assert.h>
+
+#define DEBUG 0 /* debugging code and realloc messages */
+#define BLOCKSIZE 2 * BUFSIZ /* logical block size */
+#define POSTINC 10000 /* posting buffer size increment */
+#define SEP ' ' /* sorted posting field separator */
+#define SETINC 100 /* posting set size increment */
+#define STATS 0 /* print statistics */
+#define SUPERINC 10000 /* super index size increment */
+#define TERMMAX 512 /* term max size */
+#define FMTVERSION 1 /* inverted index format version */
+#define ZIPFSIZE 200 /* zipf curve size */
+
+static char const rcsid[] = "$Id: invlib.c,v 1.21 2012/07/10 20:01:40 nhorman Exp $";
+
+#if DEBUG
+/* FIXME HBB 20010705: nowhere in the source is `invbreak' ever set to
+ * a value other than the (silent) initialization to zero. Pretty
+ * useless, that looks */
+int invbreak;
+#endif
+
+static int boolready(void);
+static int invnewterm(void);
+static void invstep(INVCONTROL *invcntl);
+static void invcannotalloc(unsigned n);
+static void invcannotopen(char *file);
+static void invcannotwrite(char *file);
+
+#if STATS
+int showzipf; /* show postings per term distribution */
+#endif
+
+static POSTING *item, *enditem, *item1 = NULL, *item2 = NULL;
+static unsigned setsize1, setsize2;
+static long numitems, totterm, zerolong;
+static char *indexfile, *postingfile;
+static FILE *outfile, *fpost;
+static unsigned supersize = SUPERINC, supintsize;
+static unsigned int numpost, numlogblk, amtused, nextpost;
+static unsigned int lastinblk, numinvitems;
+static POSTING *POST, *postptr;
+static unsigned long *SUPINT, *supint, nextsupfing;
+static char *SUPFING, *supfing;
+static char thisterm[TERMMAX];
+typedef union logicalblk {
+ long invblk[BLOCKSIZE / sizeof(long)];
+ char chrblk[BLOCKSIZE];
+} t_logicalblk;
+static t_logicalblk logicalblk;
+
+#if DEBUG || STATS
+static long totpost;
+#endif
+
+#if STATS
+static int zipf[ZIPFSIZE + 1];
+#endif
+
+long
+invmake(char *invname, char *invpost, FILE *infile)
+{
+ unsigned char *s;
+ long num;
+ int i;
+ long fileindex = 0; /* initialze, to avoid warning */
+ unsigned postsize = POSTINC * sizeof(POSTING);
+ unsigned long *intptr;
+ char line[TERMMAX];
+ long tlong;
+ PARAM param;
+ POSTING posting;
+ char temp[BLOCKSIZE];
+#if STATS
+ int j;
+ unsigned maxtermlen = 0;
+#endif
+ /* output file */
+ if ((outfile = vpfopen(invname, "w+b")) == NULL) {
+ invcannotopen(invname);
+ return(0);
+ }
+ indexfile = invname;
+ fseek(outfile, BUFSIZ, SEEK_SET);
+
+ /* posting file */
+ if ((fpost = vpfopen(invpost, "wb")) == NULL) {
+ invcannotopen(invpost);
+ return(0);
+ }
+ postingfile = invpost;
+ nextpost = 0;
+ /* get space for the postings list */
+ if ((POST = malloc(postsize)) == NULL) {
+ invcannotalloc(postsize);
+ return(0);
+ }
+ postptr = POST;
+ /* get space for the superfinger (superindex) */
+ if ((SUPFING = malloc(supersize)) == NULL) {
+ invcannotalloc(supersize);
+ return(0);
+ }
+ supfing = SUPFING;
+ /* FIXME HBB: magic number alert (40) */
+ supintsize = supersize / 40;
+ /* also for the superfinger index */
+ if ((SUPINT = malloc(supintsize * sizeof(long))) == NULL) {
+ invcannotalloc(supintsize * sizeof(long));
+ return(0);
+ }
+ supint = SUPINT;
+ supint++; /* leave first term open for a count */
+ /* initialize using an empty term */
+ strcpy(thisterm, "");
+ *supint++ = 0;
+ *supfing++ = ' ';
+ *supfing++ = '\0';
+ nextsupfing = 2;
+#if DEBUG || STATS
+ totpost = 0L;
+#endif
+ totterm = 0L;
+ numpost = 1;
+
+ /* set up as though a block had come and gone, i.e., set up for new block */
+ /* 3 longs needed for: numinvitems, next block, and previous block */
+ amtused = 3 * sizeof(long);
+ numinvitems = 0;
+ numlogblk = 0;
+ lastinblk = sizeof(t_logicalblk);
+
+ /* now loop as long as more to read (till eof) */
+ while (fgets(line, TERMMAX, infile) != NULL) {
+#if DEBUG || STATS
+ ++totpost;
+#endif
+ s = strchr(line, SEP);
+ if (s != NULL) {
+ *s = '\0';
+ }
+ else {
+ continue;
+ }
+#if STATS
+ if ((i = strlen(line)) > maxtermlen) {
+ maxtermlen = i;
+ }
+#endif
+#if DEBUG
+ printf("%ld: %s ", totpost, line);
+ fflush(stdout);
+#endif
+ if (strcmp(thisterm, line) == 0) {
+ if (postptr + 10 > POST + postsize / sizeof(POSTING)) {
+ i = postptr - POST;
+ postsize += POSTINC * sizeof(POSTING);
+ if ((POST = realloc(POST, postsize)) == NULL) {
+ invcannotalloc(postsize);
+ return(0);
+ }
+ postptr = i + POST;
+#if DEBUG
+ printf("reallocated post space to %u, totpost=%ld\n",
+ postsize, totpost);
+#endif
+ }
+ numpost++;
+ } else {
+ /* have a new term */
+ if (!invnewterm()) {
+ return(0);
+ }
+ strcpy(thisterm, line);
+ numpost = 1;
+ postptr = POST;
+ fileindex = 0;
+ }
+ /* get the new posting */
+ num = *++s - '!';
+ i = 1;
+ do {
+ num = BASE * num + *++s - '!';
+ } while (++i < PRECISION);
+ posting.lineoffset = num;
+ while (++fileindex < nsrcfiles && num > srcoffset[fileindex]) {
+ ;
+ }
+ posting.fileindex = --fileindex;
+ posting.type = *++s;
+ ++s;
+ if (*s != '\n') {
+ num = *++s - '!';
+ while (*++s != '\n') {
+ num = BASE * num + *s - '!';
+ }
+ posting.fcnoffset = num;
+ }
+ else {
+ posting.fcnoffset = 0;
+ }
+ *postptr++ = posting;
+#if DEBUG
+ printf("%ld %ld %ld %ld\n", posting.fileindex,
+ posting.fcnoffset, posting.lineoffset, posting.type);
+ fflush(stdout);
+#endif
+ }
+ if (!invnewterm()) {
+ return(0);
+ }
+ /* now clean up final block */
+ logicalblk.invblk[0] = numinvitems;
+ /* loops pointer around to start */
+ logicalblk.invblk[1] = 0;
+ logicalblk.invblk[2] = numlogblk - 1;
+ if (fwrite(&logicalblk, sizeof(t_logicalblk), 1, outfile) == 0) {
+ goto cannotwrite;
+ }
+ numlogblk++;
+ /* write out block to save space. what in it doesn't matter */
+ if (fwrite(&logicalblk, sizeof(t_logicalblk), 1, outfile) == 0) {
+ goto cannotwrite;
+ }
+ /* finish up the super finger */
+ *SUPINT = numlogblk;
+ /* add to the offsets the size of the offset pointers */
+ intptr = (SUPINT + 1);
+ i = (char *)supint - (char *)SUPINT;
+ while (intptr < supint)
+ *intptr++ += i;
+ /* write out the offsets (1 for the N at start) and the super finger */
+ if (fwrite(SUPINT, sizeof(*SUPINT), numlogblk + 1, outfile) == 0 ||
+ fwrite(SUPFING, 1, supfing - SUPFING, outfile) == 0) {
+ goto cannotwrite;
+ }
+ /* save the size for reference later */
+ nextsupfing = sizeof(long) + sizeof(long) * numlogblk + (supfing - SUPFING);
+ /* make sure the file ends at a logical block boundary. This is
+ necessary for invinsert to correctly create extended blocks
+ */
+ i = nextsupfing % sizeof(t_logicalblk);
+ /* write out junk to fill log blk */
+ if (fwrite(temp, sizeof(t_logicalblk) - i, 1, outfile) == 0 ||
+ fflush(outfile) == EOF) { /* rewind doesn't check for write failure */
+ goto cannotwrite;
+ }
+ /* write the control area */
+ rewind(outfile);
+ param.version = FMTVERSION;
+ param.filestat = 0;
+ param.sizeblk = sizeof(t_logicalblk);
+ param.startbyte = (numlogblk + 1) * sizeof(t_logicalblk) + BUFSIZ;;
+ param.supsize = nextsupfing;
+ param.cntlsize = BUFSIZ;
+ param.share = 0;
+ if (fwrite(&param, sizeof(param), 1, outfile) == 0) {
+ goto cannotwrite;
+ }
+ for (i = 0; i < 10; i++) /* for future use */
+ if (fwrite(&zerolong, sizeof(zerolong), 1, outfile) == 0) {
+ goto cannotwrite;
+ }
+
+ /* make first block loop backwards to last block */
+ if (fflush(outfile) == EOF) { /* fseek doesn't check for write failure */
+ goto cannotwrite;
+ }
+ /* get to second word first block */
+ fseek(outfile, BUFSIZ + 2 * sizeof(long), SEEK_SET);
+ tlong = numlogblk - 1;
+ if (fwrite(&tlong, sizeof(tlong), 1, outfile) == 0 ||
+ fclose(outfile) == EOF) {
+ cannotwrite:
+ invcannotwrite(invname);
+ return(0);
+ }
+ if (fclose(fpost) == EOF) {
+ invcannotwrite(postingfile);
+ return(0);
+ }
+ --totterm; /* don't count null term */
+#if STATS
+ printf("logical blocks = %d, postings = %ld, terms = %ld, max term length = %d\n",
+ numlogblk, totpost, totterm, maxtermlen);
+ if (showzipf) {
+ printf("\n************* ZIPF curve ****************\n");
+ for (j = ZIPFSIZE; j > 1; j--)
+ if (zipf[j])
+ break;
+ for (i = 1; i < j; ++i) {
+ printf("%3d -%6d ", i, zipf[i]);
+ if (i % 6 == 0) putchar('\n');
+ }
+ printf(">%d-%6d\n", ZIPFSIZE, zipf[0]);
+ }
+#endif
+ /* free all malloc'd memory */
+ free(POST);
+ free(SUPFING);
+ free(SUPINT);
+ return(totterm);
+}
+
+/* add a term to the data base */
+
+static int
+invnewterm(void)
+{
+ int backupflag, i, j, holditems, gooditems, howfar;
+ unsigned int maxback, len, numwilluse, wdlen;
+ char *tptr, *tptr3;
+
+ union {
+ unsigned long packword[2];
+ ENTRY e;
+ } iteminfo;
+
+ gooditems = 0; /* initialize, to avoid warning */
+ totterm++;
+#if STATS
+ /* keep zipfian info on the distribution */
+ if (numpost <= ZIPFSIZE)
+ zipf[numpost]++;
+ else
+ zipf[0]++;
+#endif
+ len = strlen(thisterm);
+ /* length of term rounded up to long boundary */
+ wdlen = (len + (sizeof(long) - 1)) / sizeof(long);
+ /* each term needs 2 longs for its iteminfo and
+ * 1 long for its offset */
+ numwilluse = (wdlen + 3) * sizeof(long);
+ /* new block if at least 1 item in block */
+ if (numinvitems && numwilluse + amtused > sizeof(t_logicalblk)) {
+ /* set up new block */
+ if (supfing + 500 > SUPFING + supersize) {
+ i = supfing - SUPFING;
+ supersize += 20000;
+ if ((SUPFING = (char *)realloc(SUPFING, supersize)) == NULL) {
+ invcannotalloc(supersize);
+ return(0);
+ }
+ supfing = i + SUPFING;
+#if DEBUG
+ printf("reallocated superfinger space to %d, totpost=%ld\n",
+ supersize, totpost);
+#endif
+ }
+ /* check that room for the offset as well */
+ /* FIXME HBB: magic number alert (10) */
+ if ((numlogblk + 10) > supintsize) {
+ i = supint - SUPINT;
+ supintsize += SUPERINC;
+ if ((SUPINT = realloc(SUPINT, supintsize * sizeof(long))) == NULL) {
+ invcannotalloc(supintsize * sizeof(long));
+ return(0);
+ }
+ supint = i + SUPINT;
+#if DEBUG
+ printf("reallocated superfinger offset to %d, totpost = %ld\n",
+ supintsize * sizeof(long), totpost);
+#endif
+ }
+ /* See if backup is efficatious */
+ backupflag = 0;
+ maxback = (int) strlen(thisterm) / 10;
+ holditems = numinvitems;
+ if (maxback > numinvitems)
+ maxback = numinvitems - 2;
+ howfar = 0;
+ while (maxback-- > 1) {
+ howfar++;
+ iteminfo.packword[0] =
+ logicalblk.invblk[--holditems * 2 + (sizeof(long) - 1)];
+ if ((i = iteminfo.e.size / 10) < maxback) {
+ maxback = i;
+ backupflag = howfar;
+ gooditems = holditems;
+ }
+ }
+ /* see if backup will occur */
+ if (backupflag) {
+ numinvitems = gooditems;
+ }
+ logicalblk.invblk[0] = numinvitems;
+ /* set forward pointer pointing to next */
+ logicalblk.invblk[1] = numlogblk + 1;
+ /* set back pointer to last block */
+ logicalblk.invblk[2] = numlogblk - 1;
+ if (fwrite(logicalblk.chrblk, 1, sizeof(t_logicalblk), outfile) == 0) {
+ invcannotwrite(indexfile);
+ return(0);
+ }
+ /* 3 longs needed for: numinvitems, next block, and previous block */
+ amtused = 3 * sizeof(long);
+ numlogblk++;
+ /* check if had to back up, if so do it */
+ if (backupflag) {
+ char *tptr2;
+
+ /* find out where the end of the new block is */
+ iteminfo.packword[0] = logicalblk.invblk[numinvitems*2+1];
+ tptr3 = logicalblk.chrblk + iteminfo.e.offset;
+ /* move the index for this block */
+ for (i = 3; i <= (backupflag * 2 + 2); i++)
+ logicalblk.invblk[i] = logicalblk.invblk[numinvitems*2+i];
+ /* move the word into the super index */
+ iteminfo.packword[0] = logicalblk.invblk[3];
+ iteminfo.packword[1] = logicalblk.invblk[4];
+ tptr2 = logicalblk.chrblk + iteminfo.e.offset;
+ strncpy(supfing, tptr2, (int) iteminfo.e.size);
+ *(supfing + iteminfo.e.size) = '\0';
+#if DEBUG
+ printf("backup %d at term=%s to term=%s\n",
+ backupflag, thisterm, supfing);
+#endif
+ *supint++ = nextsupfing;
+ nextsupfing += strlen(supfing) + 1;
+ supfing += strlen(supfing) + 1;
+ /* now fix up the logical block */
+ tptr = logicalblk.chrblk + lastinblk;
+ lastinblk = sizeof(t_logicalblk);
+ tptr2 = logicalblk.chrblk + lastinblk;
+ j = tptr3 - tptr;
+ while (tptr3 > tptr)
+ *--tptr2 = *--tptr3;
+ lastinblk -= j;
+ amtused += ((2 * sizeof(long)) * backupflag + j);
+ for (i = 3; i < (backupflag * 2 + 2); i += 2) {
+ iteminfo.packword[0] = logicalblk.invblk[i];
+ iteminfo.e.offset += (tptr2 - tptr3);
+ logicalblk.invblk[i] = iteminfo.packword[0];
+ }
+ numinvitems = backupflag;
+ } else { /* no backup needed */
+ numinvitems = 0;
+ lastinblk = sizeof(t_logicalblk);
+ /* add new term to superindex */
+ strcpy(supfing, thisterm);
+ supfing += strlen(thisterm) + 1;
+ *supint++ = nextsupfing;
+ nextsupfing += strlen(thisterm) + 1;
+ }
+ }
+ /* HBB 20010501: Fixed bug by replacing magic number '8' by
+ * what it actually represents. */
+ lastinblk -= (numwilluse - 2 * sizeof(long));
+ iteminfo.e.offset = lastinblk;
+ iteminfo.e.size = len;
+ iteminfo.e.space = 0;
+ iteminfo.e.post = numpost;
+ strncpy(logicalblk.chrblk + lastinblk, thisterm, len);
+ amtused += numwilluse;
+ logicalblk.invblk[(lastinblk/sizeof(long))+wdlen] = nextpost;
+ if ((i = postptr - POST) > 0) {
+ if (fwrite(POST, sizeof(POSTING), i, fpost) == 0) {
+ invcannotwrite(postingfile);
+ return(0);
+ }
+ nextpost += i * sizeof(POSTING);
+ }
+ logicalblk.invblk[3+2*numinvitems++] = iteminfo.packword[0];
+ logicalblk.invblk[2+2*numinvitems] = iteminfo.packword[1];
+ return(1);
+}
+
+/*
+ * If 'invname' ends with the 'from' substring, it is replaced inline with the
+ * 'to' substring (which must be of the exact same length), and the function
+ * returns 0. Otherwise, returns -1.
+ */
+
+static int
+invflipname(char * invname, const char *from, const char *to)
+{
+ char *temp, *i = NULL;
+
+ assert(strlen(from) == strlen(to));
+
+ temp = invname - 1;
+ while( (temp = strstr(temp + 1, from)))
+ i = temp;
+ if (!i || i[strlen(from)] != '\0')
+ return -1;
+ while(*to)
+ *i++ = *to++;
+ return 0;
+}
+
+int
+invopen(INVCONTROL *invcntl, char *invname, char *invpost, int stat)
+{
+ int read_index;
+
+ if ((invcntl->invfile = vpfopen(invname, ((stat == 0) ? "rb" : "r+b"))) == NULL) {
+ /* If db created without '-f', but now invoked with '-f cscope.out',
+ * we need to check for 'cscope.in.out', rather than 'cscope.out.in':
+ * I.e, hack around our own violation of the inverse db naming convention */
+ if (!invflipname(invname, INVNAME2, INVNAME)) {
+ if ((invcntl->invfile = vpfopen(invname, ((stat == 0) ? "rb" : "r+b"))))
+ goto openedinvname;
+ invflipname(invname, INVNAME, INVNAME2); /* change back for err msg */
+ }
+ /* more silliness: if you create the db with '-f cscope', then try to open
+ * it without '-f cscope', you'll fail unless we check for 'cscope.out.in'
+ * here. */
+ else if (!invflipname(invname, INVNAME, INVNAME2)) {
+ if ((invcntl->invfile = vpfopen(invname, ((stat == 0) ? "rb" : "r+b"))))
+ goto openedinvname;
+ invflipname(invname, INVNAME2, INVNAME); /* change back for err msg */
+ }
+ invcannotopen(invname);
+ return(-1);
+ }
+openedinvname:
+ if (fread(&invcntl->param, sizeof(invcntl->param), 1, invcntl->invfile) == 0) {
+ fprintf(stderr, "%s: empty inverted file\n", argv0);
+ goto closeinv;
+ }
+ if (invcntl->param.version != FMTVERSION) {
+ fprintf(stderr, "%s: cannot read old index format; use -U option to force database to rebuild\n", argv0);
+ goto closeinv;
+ }
+ assert(invcntl->param.sizeblk == sizeof(t_logicalblk));
+
+ if (stat == 0 && invcntl->param.filestat == INVALONE) {
+ fprintf(stderr, "%s: inverted file is locked\n", argv0);
+ goto closeinv;
+ }
+ if ((invcntl->postfile = vpfopen(invpost, ((stat == 0) ? "rb" : "r+b"))) == NULL) {
+ /* exact same naming convention hacks as above for invname */
+ if (!invflipname(invpost, INVPOST2, INVPOST)) {
+ if ((invcntl->postfile = vpfopen(invpost, ((stat == 0) ? "rb" : "r+b"))))
+ goto openedinvpost;
+ invflipname(invpost, INVPOST, INVPOST2); /* change back for err msg */
+ } else if (!invflipname(invpost, INVPOST, INVPOST2)) {
+ if ((invcntl->postfile = vpfopen(invpost,((stat == 0)?"rb":"r+b"))))
+ goto openedinvpost;
+ invflipname(invpost, INVPOST2, INVPOST); /* change back for err msg */
+ }
+ invcannotopen(invpost);
+ goto closeinv;
+ }
+openedinvpost:
+ /* allocate core for a logical block */
+ if ((invcntl->logblk = malloc((unsigned) invcntl->param.sizeblk)) == NULL) {
+ invcannotalloc((unsigned) invcntl->param.sizeblk);
+ goto closeboth;
+ }
+ /* allocate for and read in superfinger */
+ read_index = 1;
+ invcntl->iindex = NULL;
+#if SHARE
+ if (invcntl->param.share == 1) {
+ key_t shm_key;
+ struct shmid_ds shm_buf;
+ int shm_id;
+
+ /* see if the shared segment exists */
+ shm_key = ftok(invname, 2);
+ shm_id = shmget(shm_key, 0, 0);
+ /* Failure simply means (hopefully) that segment doesn't exists */
+ if (shm_id == -1) {
+ /* Have to give general write permission due to AMdahl not having protected segments */
+ shm_id = shmget(shm_key, invcntl->param.supsize + sizeof(long), IPC_CREAT | 0666);
+ if (shm_id == -1)
+ perror("Could not create shared memory segment");
+ } else
+ read_index = 0;
+
+ if (shm_id != -1) {
+ invcntl->iindex = shmat(shm_id, 0, ((read_index) ? 0 : SHM_RDONLY));
+ if (invcntl->iindex == (char *)ERR) {
+ fprintf(stderr, "%s: shared memory link failed\n", argv0);
+ invcntl->iindex = NULL;
+ read_index = 1;
+ }
+ }
+ }
+#endif
+ if (invcntl->iindex == NULL)
+ /* FIXME HBB: magic number alert (4) */
+ invcntl->iindex = malloc((unsigned) invcntl->param.supsize
+ + 4 *sizeof(long));
+ if (invcntl->iindex == NULL) {
+ invcannotalloc((unsigned) invcntl->param.supsize);
+ free(invcntl->logblk);
+ goto closeboth;
+ }
+ if (read_index) {
+ fseek(invcntl->invfile, invcntl->param.startbyte, SEEK_SET);
+ fread(invcntl->iindex, (int) invcntl->param.supsize, 1,
+ invcntl->invfile);
+ }
+ invcntl->numblk = -1;
+ if (boolready() == -1) {
+ closeboth:
+ fclose(invcntl->postfile);
+ closeinv:
+ fclose(invcntl->invfile);
+ return(-1);
+ }
+ /* write back out the control block if anything changed */
+ invcntl->param.filestat = stat;
+ if (stat > invcntl->param.filestat ) {
+ rewind(invcntl->invfile);
+ fwrite(&invcntl->param, sizeof(invcntl->param), 1, invcntl->invfile);
+ }
+ return(1);
+}
+
+/** invclose must be called to wrap things up and deallocate core **/
+void
+invclose(INVCONTROL *invcntl)
+{
+ /* write out the control block in case anything changed */
+ if (invcntl->param.filestat > 0) {
+ invcntl->param.filestat = 0;
+ rewind(invcntl->invfile);
+ fwrite(&invcntl->param, 1,
+ sizeof(invcntl->param), invcntl->invfile);
+ }
+ if (invcntl->param.filestat == INVALONE) {
+ /* write out the super finger */
+ fseek(invcntl->invfile, invcntl->param.startbyte, SEEK_SET);
+ fwrite(invcntl->iindex, 1,
+ (int) invcntl->param.supsize, invcntl->invfile);
+ }
+ fclose(invcntl->invfile);
+ fclose(invcntl->postfile);
+#if SHARE
+ if (invcntl->param.share > 0) {
+ shmdt(invcntl->iindex);
+ invcntl->iindex = NULL;
+ }
+#endif
+ if (invcntl->iindex != NULL)
+ free(invcntl->iindex);
+ free(invcntl->logblk);
+}
+
+/** invstep steps the inverted file forward one item **/
+static void
+invstep(INVCONTROL *invcntl)
+{
+ if (invcntl->keypnt < (invcntl->logblk->invblk[0] - 1)) {
+ invcntl->keypnt++;
+ return;
+ }
+
+ /* move forward a block else wrap */
+ invcntl->numblk = invcntl->logblk->invblk[1]; /* was: *(int *)(invcntl->logblk + sizeof(long))*/
+
+ /* now read in the block */
+ fseek(invcntl->invfile,
+ invcntl->numblk*invcntl->param.sizeblk + invcntl->param.cntlsize,
+ SEEK_SET);
+ fread(invcntl->logblk, (int) invcntl->param.sizeblk, 1,
+ invcntl->invfile);
+ invcntl->keypnt = 0;
+}
+
+/** invforward moves forward one term in the inverted file **/
+int
+invforward(INVCONTROL *invcntl)
+{
+ invstep(invcntl);
+ /* skip things with 0 postings */
+ /* FIXME HBB: magic number alert! (3) */
+ while (((ENTRY * )(invcntl->logblk->invblk + 3) + invcntl->keypnt)->post == 0) {
+ invstep(invcntl);
+ }
+ /* Check for having wrapped - reached start of inverted file! */
+ if ((invcntl->numblk == 0) && (invcntl->keypnt == 0))
+ return(0);
+ return(1);
+}
+
+/** invterm gets the present term from the present logical block **/
+long
+invterm(INVCONTROL *invcntl, char *term)
+{
+ ENTRY * entryptr;
+
+ /* FIXME HBB: magic number alert! (3) */
+ entryptr = (ENTRY *)(invcntl->logblk->invblk + 3) + invcntl->keypnt;
+ strncpy(term, invcntl->logblk->chrblk + entryptr->offset,
+ (int) entryptr->size);
+ *(term + entryptr->size) = '\0';
+ return(entryptr->post);
+}
+
+/** invfind searches for an individual item in the inverted file **/
+long
+invfind(INVCONTROL *invcntl, char *searchterm) /* term being searched for */
+{
+ int imid, ilow, ihigh;
+ long num;
+ int i;
+ unsigned long *intptr, *intptr2;
+ ENTRY *entryptr;
+
+ /* make sure it is initialized via invready */
+ if (invcntl->invfile == 0)
+ return(-1L);
+
+ /* now search for the appropriate finger block */
+ intptr = (unsigned long *)invcntl->iindex;
+
+ ilow = 0;
+ ihigh = *intptr++ - 1;
+ while (ilow <= ihigh) {
+ imid = (ilow + ihigh) / 2;
+ intptr2 = intptr + imid;
+ i = strcmp(searchterm, (invcntl->iindex + *intptr2));
+ if (i < 0)
+ ihigh = imid - 1;
+ else if (i > 0)
+ ilow = ++imid;
+ else {
+ ilow = imid + 1;
+ break;
+ }
+ }
+ /* be careful about case where searchterm is after last in this block */
+ imid = (ilow) ? ilow - 1 : 0;
+
+ /* fetch the appropriate logical block if not in core */
+ /* note always fetch it if the file is busy */
+ if ((imid != invcntl->numblk) || (invcntl->param.filestat >= INVBUSY)) {
+ fseek(invcntl->invfile,
+ (imid*invcntl->param.sizeblk) + invcntl->param.cntlsize,
+ SEEK_SET);
+ invcntl->numblk = imid;
+ fread(invcntl->logblk, (int)invcntl->param.sizeblk, 1,
+ invcntl->invfile);
+ }
+
+srch_ext:
+ /* now find the term in this block. tricky this */
+ intptr = (unsigned long *) invcntl->logblk->invblk;
+
+ ilow = 0;
+ ihigh = *intptr - 1;
+ intptr += 3;
+ num = 0;
+ while (ilow <= ihigh) {
+ imid = (ilow + ihigh) / 2;
+ entryptr = (ENTRY *)intptr + imid;
+ i = strncmp(searchterm, invcntl->logblk->chrblk + entryptr->offset,
+ (int) entryptr->size );
+ if (i == 0)
+ i = strlen(searchterm) - entryptr->size;
+ if (i < 0)
+ ihigh = imid - 1;
+ else if (i > 0)
+ ilow = ++imid;
+ else {
+ num = entryptr->post;
+ break;
+ }
+ }
+ /* be careful about case where searchterm is after last in this block */
+ if (imid >= invcntl->logblk->invblk[0]) {
+ invcntl->keypnt = invcntl->logblk->invblk[0];
+ invstep(invcntl);
+ /* note if this happens the term could be in extended block */
+ if (invcntl->param.startbyte < invcntl->numblk * invcntl->param.sizeblk)
+ goto srch_ext;
+ } else
+ invcntl->keypnt = imid;
+ return(num);
+}
+
+#if DEBUG
+
+/** invdump dumps the block the term parameter is in **/
+void
+invdump(INVCONTROL *invcntl, char *term)
+{
+ long i, j, n, *longptr;
+ ENTRY * entryptr;
+ char temp[512], *ptr;
+
+ /* dump superindex if term is "-" */
+ if (*term == '-') {
+ j = atoi(term + 1);
+ longptr = (long *)invcntl->iindex;
+ n = *longptr++;
+ printf("Superindex dump, num blocks=%ld\n", n);
+ longptr += j;
+ while ((longptr <= ((long *)invcntl->iindex) + n) && invbreak == 0) {
+ printf("%2ld %6ld %s\n", j++, *longptr, invcntl->iindex + *longptr);
+ longptr++;
+ }
+ return;
+ } else if (*term == '#') {
+ j = atoi(term + 1);
+ /* fetch the appropriate logical block */
+ invcntl->numblk = j;
+ fseek(invcntl->invfile,
+ (j * invcntl->param.sizeblk) + invcntl->param.cntlsize,
+ SEEK_SET);
+ fread(invcntl->logblk, (int) invcntl->param.sizeblk, 1,
+ invcntl->invfile);
+ } else
+ i = abs((int) invfind(invcntl, term));
+ longptr = invcntl->logblk->invblk;
+ n = *longptr++;
+ printf("Entry term to invdump=%s, postings=%ld, forwrd ptr=%ld, back ptr=%ld\n"
+ , term, i, *(longptr), *(longptr + 1));
+ /* FIXME HBB: magic number alert! (3) */
+ entryptr = (ENTRY *) (invcntl->logblk->invblk + 3);
+ printf("%ld terms in this block, block=%ld\n", n, invcntl->numblk);
+ printf("\tterm\t\t\tposts\tsize\toffset\tspace\t1st word\n");
+ for (j = 0; j < n && invbreak == 0; j++) {
+ ptr = invcntl->logblk->chrblk + entryptr->offset;
+ strncpy(temp, ptr, (int) entryptr->size);
+ temp[entryptr->size] = '\0';
+ ptr += (sizeof(long) * (long)((entryptr->size + (sizeof(long) - 1)) / sizeof(long)));
+ printf("%2ld %-24s\t%5ld\t%3d\t%d\t%d\t%ld\n", j, temp, entryptr->post,
+ entryptr->size, entryptr->offset, entryptr->space,
+ *(long *)ptr);
+ entryptr++;
+ }
+}
+#endif
+
+static int
+boolready(void)
+{
+ numitems = 0;
+ if (item1 != NULL)
+ free(item1);
+ setsize1 = SETINC;
+ if ((item1 = malloc(SETINC * sizeof(POSTING))) == NULL) {
+ invcannotalloc(SETINC);
+ return(-1);
+ }
+ if (item2 != NULL)
+ free(item2);
+ setsize2 = SETINC;
+ if ((item2 = malloc(SETINC * sizeof(POSTING))) == NULL) {
+ invcannotalloc(SETINC);
+ return(-1);
+ }
+ item = item1;
+ enditem = item;
+ return(0);
+}
+
+void
+boolclear(void)
+{
+ numitems = 0;
+ item = item1;
+ enditem = item;
+}
+
+POSTING *
+boolfile(INVCONTROL *invcntl, long *num, int boolarg)
+{
+ ENTRY *entryptr;
+ FILE *file;
+ void *ptr;
+ unsigned long *ptr2;
+ POSTING *newitem = NULL; /* initialize, to avoid warning */
+ POSTING posting;
+ unsigned u;
+ POSTING *newsetp = NULL, *set1p;
+ long newsetc, set1c, set2c;
+
+ /* FIXME HBB: magic number alert! (3) */
+ entryptr = (ENTRY *) (invcntl->logblk->invblk + 3) + invcntl->keypnt;
+ ptr = invcntl->logblk->chrblk + entryptr->offset;
+ ptr2 = ((unsigned long *) ptr) + (entryptr->size + (sizeof(long) - 1)) / sizeof(long);
+ *num = entryptr->post;
+ switch (boolarg) {
+ case BOOL_OR:
+ case NOT:
+ if (*num == 0) {
+ *num = numitems;
+ return(item);
+ }
+ }
+ /* make room for the new set */
+ u = 0;
+ switch (boolarg) {
+ case AND:
+ case NOT:
+ newsetp = item;
+ break;
+
+ case BOOL_OR:
+ u = enditem - item;
+ /* FALLTHROUGH */
+ case REVERSENOT:
+ u += *num;
+ if (item == item2) {
+ if (u > setsize1) {
+ u += SETINC;
+ if ((item1 = realloc(
+ item1, u * sizeof(POSTING))) == NULL) {
+ goto cannotalloc;
+ }
+ setsize1 = u;
+ }
+ newitem = item1;
+ }
+ else {
+ if (u > setsize2) {
+ u += SETINC;
+ if ((item2 = realloc(
+ item2, u * sizeof(POSTING))) == NULL) {
+ cannotalloc:
+ invcannotalloc(u * sizeof(POSTING));
+ boolready();
+ *num = -1;
+ return(NULL);
+ }
+ setsize2 = u;
+ }
+ newitem = item2;
+ }
+#if 0 /* this write is only need by commented-out code later */
+ set1p = item;
+#endif
+ newsetp = newitem;
+ }
+ file = invcntl->postfile;
+ fseek(file, *ptr2, SEEK_SET);
+ fread(&posting, sizeof(posting), 1, file);
+ newsetc = 0;
+ switch (boolarg) {
+ case BOOL_OR:
+ /* while something in both sets */
+ set1p = item;
+ newsetp = newitem;
+ for (set1c = 0, set2c = 0;
+ set1c < numitems && set2c < *num; newsetc++) {
+ if (set1p->lineoffset < posting.lineoffset) {
+ *newsetp++ = *set1p++;
+ set1c++;
+ }
+ else if (set1p->lineoffset > posting.lineoffset) {
+ *newsetp++ = posting;
+ fread(&posting, (int) sizeof(posting), 1, file);
+ set2c++;
+ }
+ else if (set1p->type < posting.type) {
+ *newsetp++ = *set1p++;
+ set1c++;
+ }
+ else if (set1p->type > posting.type) {
+ *newsetp++ = posting;
+ fread(&posting, (int) sizeof(posting), 1, file);
+ set2c++;
+ }
+ else { /* identical postings */
+ *newsetp++ = *set1p++;
+ set1c++;
+ fread(&posting, (int) sizeof(posting), 1, file);
+ set2c++;
+ }
+ }
+ /* find out what ran out and move the rest in */
+ if (set1c < numitems) {
+ newsetc += numitems - set1c;
+ while (set1c++ < numitems) {
+ *newsetp++ = *set1p++;
+ }
+ } else {
+ while (set2c++ < *num) {
+ *newsetp++ = posting;
+ newsetc++;
+ fread(&posting, (int) sizeof(posting), 1, file);
+ }
+ }
+ item = newitem;
+ break; /* end of BOOL_OR */
+#if 0
+ case AND:
+ for (set1c = 0, set2c = 0; set1c < numitems && set2c < *num; ) {
+ if (set1p->lineoffset < posting.lineoffset) {
+ set1p++;
+ set1c++;
+ }
+ else if (set1p->lineoffset > posting.lineoffset) {
+ fread(&posting, (int) sizeof(posting), 1, file);
+ set2c++;
+ }
+ else if (set1p->type < posting.type) {
+ *set1p++;
+ set1c++;
+ }
+ else if (set1p->type > posting.type) {
+ fread(&posting, (int) sizeof(posting), 1, file);
+ set2c++;
+ }
+ else { /* identical postings */
+ *newsetp++ = *set1p++;
+ newsetc++;
+ set1c++;
+ fread(&posting, (int) sizeof(posting), 1, file);
+ set2c++;
+ }
+ }
+ break; /* end of AND */
+
+ case NOT:
+ for (set1c = 0, set2c = 0; set1c < numitems && set2c < *num; ) {
+ if (set1p->lineoffset < posting.lineoffset) {
+ *newsetp++ = *set1p++;
+ newsetc++;
+ set1c++;
+ }
+ else if (set1p->lineoffset > posting.lineoffset) {
+ fread(&posting, (int) sizeof(posting), 1, file);
+ set2c++;
+ }
+ else if (set1p->type < posting.type) {
+ *newsetp++ = *set1p++;
+ newsetc++;
+ set1c++;
+ }
+ else if (set1p->type > posting.type) {
+ fread(&posting, (int) sizeof(posting), 1, file);
+ set2c++;
+ }
+ else { /* identical postings */
+ set1c++;
+ set1p++;
+ fread(&posting, (int) sizeof(posting), 1, file);
+ set2c++;
+ }
+ }
+ newsetc += numitems - set1c;
+ while (set1c++ < numitems) {
+ *newsetp++ = *set1p++;
+ }
+ break; /* end of NOT */
+
+ case REVERSENOT: /* core NOT incoming set */
+ for (set1c = 0, set2c = 0; set1c < numitems && set2c < *num; ) {
+ if (set1p->lineoffset < posting.lineoffset) {
+ set1p++;
+ set1c++;
+ }
+ else if (set1p->lineoffset > posting.lineoffset) {
+ *newsetp++ = posting;
+ fread(&posting, (int) sizeof(posting), 1, file);
+ set2c++;
+ }
+ else if (set1p->type < posting.type) {
+ set1p++;
+ set1c++;
+ }
+ else if (set1p->type > posting.type) {
+ *newsetp++ = posting;
+ fread(&posting, (int) sizeof(posting), 1, file);
+ set2c++;
+ }
+ else { /* identical postings */
+ set1c++;
+ set1p++;
+ fread(&posting, (int) sizeof(posting), 1, file);
+ set2c++;
+ }
+ }
+ while (set2c++ < *num) {
+ *newsetp++ = posting;
+ newsetc++;
+ fread(&posting, (int) sizeof(posting), 1, file);
+ }
+ item = newitem;
+ break; /* end of REVERSENOT */
+#endif
+ }
+ numitems = newsetc;
+ *num = newsetc;
+ enditem = (POSTING *) newsetp;
+ return((POSTING *) item);
+}
+
+#if 0
+POSTING *
+boolsave(int clear) /* flag about whether to clear core */
+{
+ int i;
+ POSTING *ptr;
+ POSTING *oldstuff, *newstuff;
+
+ if (numitems == 0) {
+ if (clear)
+ boolclear();
+ return(NULL);
+ }
+ /* if clear then give them what we have and use boolready to realloc */
+ if (clear) {
+ ptr = item;
+ /* free up the space we didn't give them */
+ if (item == item1)
+ item1 = NULL;
+ else
+ item2 = NULL;
+ boolready();
+ return(ptr);
+ }
+ i = (enditem - item) * sizeof(POSTING) + 100;
+ if ((ptr = malloc(i))r == NULL) {
+ invcannotalloc(i);
+ return(ptr);
+ }
+ /* move present set into place */
+ oldstuff = item;
+ newstuff = ptr;
+ while (oldstuff < enditem)
+ *newstuff++ = *oldstuff++;
+ return(ptr);
+}
+#endif
+
+static void
+invcannotalloc(unsigned n)
+{
+ fprintf(stderr, "%s: cannot allocate %u bytes\n", argv0, n);
+}
+
+static void
+invcannotopen(char *file)
+{
+ fprintf(stderr, "%s: cannot open file %s\n", argv0, file);
+}
+
+static void
+invcannotwrite(char *file)
+{
+ perror(argv0); /* must be first to preserve errno */
+ fprintf(stderr, "%s: write to file %s failed\n", argv0, file);
+}