1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
|
/*-
* See the file LICENSE for redistribution information.
*
* Copyright (c) 1997,2007 Oracle. All rights reserved.
*
* $Id: bt_recno.c,v 12.35 2007/05/17 15:14:46 bostic Exp $
*/
#include "db_config.h"
#include "db_int.h"
#include "dbinc/db_page.h"
#include "dbinc/btree.h"
#include "dbinc/lock.h"
#include "dbinc/mp.h"
static int __ram_add __P((DBC *, db_recno_t *, DBT *, u_int32_t, u_int32_t));
static int __ram_source __P((DB *));
static int __ram_sread __P((DBC *, db_recno_t));
static int __ram_update __P((DBC *, db_recno_t, int));
/*
* In recno, there are two meanings to the on-page "deleted" flag. If we're
* re-numbering records, it means the record was implicitly created. We skip
* over implicitly created records if doing a cursor "next" or "prev", and
* return DB_KEYEMPTY if they're explicitly requested.. If not re-numbering
* records, it means that the record was implicitly created, or was deleted.
* We skip over implicitly created or deleted records if doing a cursor "next"
* or "prev", and return DB_KEYEMPTY if they're explicitly requested.
*
* If we're re-numbering records, then we have to detect in the cursor that
* a record was deleted, and adjust the cursor as necessary on the next get.
* If we're not re-numbering records, then we can detect that a record has
* been deleted by looking at the actual on-page record, so we completely
* ignore the cursor's delete flag. This is different from the B+tree code.
* It also maintains whether the cursor references a deleted record in the
* cursor, and it doesn't always check the on-page value.
*/
#define CD_SET(cp) { \
if (F_ISSET(cp, C_RENUMBER)) \
F_SET(cp, C_DELETED); \
}
#define CD_CLR(cp) { \
if (F_ISSET(cp, C_RENUMBER)) { \
F_CLR(cp, C_DELETED); \
cp->order = INVALID_ORDER; \
} \
}
#define CD_ISSET(cp) \
(F_ISSET(cp, C_RENUMBER) && F_ISSET(cp, C_DELETED) ? 1 : 0)
/*
* Macros for comparing the ordering of two cursors.
* cp1 comes before cp2 iff one of the following holds:
* cp1's recno is less than cp2's recno
* recnos are equal, both deleted, and cp1's order is less than cp2's
* recnos are equal, cp1 deleted, and cp2 not deleted
*/
#define C_LESSTHAN(cp1, cp2) \
(((cp1)->recno < (cp2)->recno) || \
(((cp1)->recno == (cp2)->recno) && \
((CD_ISSET((cp1)) && CD_ISSET((cp2)) && (cp1)->order < (cp2)->order) || \
(CD_ISSET((cp1)) && !CD_ISSET((cp2))))))
/*
* cp1 is equal to cp2 iff their recnos and delete flags are identical,
* and if the delete flag is set their orders are also identical.
*/
#define C_EQUAL(cp1, cp2) \
((cp1)->recno == (cp2)->recno && CD_ISSET((cp1)) == CD_ISSET((cp2)) && \
(!CD_ISSET((cp1)) || (cp1)->order == (cp2)->order))
/*
* Do we need to log the current cursor adjustment?
*/
#define CURADJ_LOG(dbc) \
(DBC_LOGGING((dbc)) && (dbc)->txn != NULL && (dbc)->txn->parent != NULL)
/*
* After a search, copy the found page into the cursor, discarding any
* currently held lock.
*/
#define STACK_TO_CURSOR(cp, ret) { \
int __t_ret; \
(cp)->page = (cp)->csp->page; \
(cp)->pgno = (cp)->csp->page->pgno; \
(cp)->indx = (cp)->csp->indx; \
if ((__t_ret = __TLPUT(dbc, (cp)->lock)) != 0 && (ret) == 0) \
ret = __t_ret; \
(cp)->lock = (cp)->csp->lock; \
(cp)->lock_mode = (cp)->csp->lock_mode; \
}
/*
* __ram_open --
* Recno open function.
*
* PUBLIC: int __ram_open __P((DB *,
* PUBLIC: DB_TXN *, const char *, db_pgno_t, u_int32_t));
*/
int
__ram_open(dbp, txn, name, base_pgno, flags)
DB *dbp;
DB_TXN *txn;
const char *name;
db_pgno_t base_pgno;
u_int32_t flags;
{
BTREE *t;
DBC *dbc;
int ret, t_ret;
COMPQUIET(name, NULL);
t = dbp->bt_internal;
/* Start up the tree. */
if ((ret = __bam_read_root(dbp, txn, base_pgno, flags)) != 0)
return (ret);
/*
* If the user specified a source tree, open it and map it in.
*
* !!!
* We don't complain if the user specified transactions or threads.
* It's possible to make it work, but you'd better know what you're
* doing!
*/
if (t->re_source != NULL && (ret = __ram_source(dbp)) != 0)
return (ret);
/* If we're snapshotting an underlying source file, do it now. */
if (F_ISSET(dbp, DB_AM_SNAPSHOT)) {
/* Allocate a cursor. */
if ((ret = __db_cursor(dbp, NULL, &dbc, 0)) != 0)
return (ret);
/* Do the snapshot. */
if ((ret = __ram_update(dbc,
DB_MAX_RECORDS, 0)) != 0 && ret == DB_NOTFOUND)
ret = 0;
/* Discard the cursor. */
if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
ret = t_ret;
}
return (ret);
}
/*
* __ram_append --
* Recno append function.
*
* PUBLIC: int __ram_append __P((DBC *, DBT *, DBT *));
*/
int
__ram_append(dbc, key, data)
DBC *dbc;
DBT *key, *data;
{
BTREE_CURSOR *cp;
int ret;
cp = (BTREE_CURSOR *)dbc->internal;
/*
* Make sure we've read in all of the backing source file. If
* we found the record or it simply didn't exist, add the
* user's record.
*/
ret = __ram_update(dbc, DB_MAX_RECORDS, 0);
if (ret == 0 || ret == DB_NOTFOUND)
ret = __ram_add(dbc, &cp->recno, data, DB_APPEND, 0);
/* Return the record number. */
if (ret == 0 && key != NULL)
ret = __db_retcopy(dbc->dbp->dbenv, key, &cp->recno,
sizeof(cp->recno), &dbc->rkey->data, &dbc->rkey->ulen);
return (ret);
}
/*
* __ramc_del --
* Recno DBC->del function.
*
* PUBLIC: int __ramc_del __P((DBC *));
*/
int
__ramc_del(dbc)
DBC *dbc;
{
BKEYDATA bk;
BTREE *t;
BTREE_CURSOR *cp;
DB *dbp;
DB_LSN lsn;
DBT hdr, data;
int exact, nc, ret, stack, t_ret;
dbp = dbc->dbp;
cp = (BTREE_CURSOR *)dbc->internal;
t = dbp->bt_internal;
stack = 0;
/*
* The semantics of cursors during delete are as follows: in
* non-renumbering recnos, records are replaced with a marker
* containing a delete flag. If the record referenced by this cursor
* has already been deleted, we will detect that as part of the delete
* operation, and fail.
*
* In renumbering recnos, cursors which represent deleted items
* are flagged with the C_DELETED flag, and it is an error to
* call c_del a second time without an intervening cursor motion.
*/
if (CD_ISSET(cp))
return (DB_KEYEMPTY);
/* Search the tree for the key; delete only deletes exact matches. */
if ((ret = __bam_rsearch(dbc, &cp->recno, SR_DELETE, 1, &exact)) != 0)
goto err;
if (!exact) {
ret = DB_NOTFOUND;
goto err;
}
stack = 1;
/* Copy the page into the cursor. */
if ((ret = __memp_dirty(dbp->mpf,
&cp->csp->page, dbc->txn, dbc->priority, 0)) != 0)
goto err;
STACK_TO_CURSOR(cp, ret);
if (ret != 0)
goto err;
/*
* If re-numbering records, the on-page deleted flag can only mean
* that this record was implicitly created. Applications aren't
* permitted to delete records they never created, return an error.
*
* If not re-numbering records, the on-page deleted flag means that
* this record was implicitly created, or, was deleted at some time.
* The former is an error because applications aren't permitted to
* delete records they never created, the latter is an error because
* if the record was "deleted", we could never have found it.
*/
if (B_DISSET(GET_BKEYDATA(dbp, cp->page, cp->indx)->type)) {
ret = DB_KEYEMPTY;
goto err;
}
if (F_ISSET(cp, C_RENUMBER)) {
/* Delete the item, adjust the counts, adjust the cursors. */
if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0)
goto err;
if ((ret = __bam_adjust(dbc, -1)) != 0)
goto err;
if ((ret = __ram_ca(dbc, CA_DELETE, &nc)) != 0)
goto err;
if (nc > 0 &&
CURADJ_LOG(dbc) && (ret = __bam_rcuradj_log(dbp, dbc->txn,
&lsn, 0, CA_DELETE, cp->root, cp->recno, cp->order)) != 0)
goto err;
/*
* If the page is empty, delete it.
*
* We never delete a root page. First, root pages of primary
* databases never go away, recno or otherwise. However, if
* it's the root page of an off-page duplicates database, then
* it can be deleted. We don't delete it here because we have
* no way of telling the primary database page holder (e.g.,
* the hash access method) that its page element should cleaned
* up because the underlying tree is gone. So, we keep the page
* around until the last cursor referencing the empty tree is
* are closed, and then clean it up.
*/
if (NUM_ENT(cp->page) == 0 && PGNO(cp->page) != cp->root) {
/*
* We want to delete a single item out of the last page
* that we're not deleting.
*/
ret = __bam_dpages(dbc, 0, 0);
/*
* Regardless of the return from __bam_dpages, it will
* discard our stack and pinned page.
*/
stack = 0;
cp->page = NULL;
}
} else {
/* Use a delete/put pair to replace the record with a marker. */
if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0)
goto err;
B_TSET_DELETED(bk.type, B_KEYDATA);
bk.len = 0;
DB_INIT_DBT(hdr, &bk, SSZA(BKEYDATA, data));
DB_INIT_DBT(data, "", 0);
if ((ret = __db_pitem(dbc,
cp->page, cp->indx, BKEYDATA_SIZE(0), &hdr, &data)) != 0)
goto err;
}
t->re_modified = 1;
err: if (stack && (t_ret = __bam_stkrel(dbc, STK_CLRDBC)) != 0 && ret == 0)
ret = t_ret;
return (ret);
}
/*
* __ramc_get --
* Recno DBC->get function.
*
* PUBLIC: int __ramc_get
* PUBLIC: __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
*/
int
__ramc_get(dbc, key, data, flags, pgnop)
DBC *dbc;
DBT *key, *data;
u_int32_t flags;
db_pgno_t *pgnop;
{
BTREE_CURSOR *cp;
DB *dbp;
int cmp, exact, ret;
COMPQUIET(pgnop, NULL);
dbp = dbc->dbp;
cp = (BTREE_CURSOR *)dbc->internal;
LF_CLR(DB_MULTIPLE|DB_MULTIPLE_KEY);
retry: switch (flags) {
case DB_CURRENT:
/*
* If we're using mutable records and the deleted flag is
* set, the cursor is pointing at a nonexistent record;
* return an error.
*/
if (CD_ISSET(cp))
return (DB_KEYEMPTY);
break;
case DB_NEXT_DUP:
/*
* If we're not in an off-page dup set, we know there's no
* next duplicate since recnos don't have them. If we
* are in an off-page dup set, the next item assuredly is
* a dup, so we set flags to DB_NEXT and keep going.
*/
if (!F_ISSET(dbc, DBC_OPD))
return (DB_NOTFOUND);
/* FALLTHROUGH */
case DB_NEXT_NODUP:
/*
* Recno databases don't have duplicates, set flags to DB_NEXT
* and keep going.
*/
/* FALLTHROUGH */
case DB_NEXT:
flags = DB_NEXT;
/*
* If record numbers are mutable: if we just deleted a record,
* we have to avoid incrementing the record number so that we
* return the right record by virtue of renumbering the tree.
*/
if (CD_ISSET(cp)) {
/*
* Clear the flag, we've moved off the deleted record.
*/
CD_CLR(cp);
break;
}
if (cp->recno != RECNO_OOB) {
++cp->recno;
break;
}
/* FALLTHROUGH */
case DB_FIRST:
flags = DB_NEXT;
cp->recno = 1;
break;
case DB_PREV_DUP:
/*
* If we're not in an off-page dup set, we know there's no
* previous duplicate since recnos don't have them. If we
* are in an off-page dup set, the previous item assuredly
* is a dup, so we set flags to DB_PREV and keep going.
*/
if (!F_ISSET(dbc, DBC_OPD))
return (DB_NOTFOUND);
/* FALLTHROUGH */
case DB_PREV_NODUP:
/*
* Recno databases don't have duplicates, set flags to DB_PREV
* and keep going.
*/
/* FALLTHROUGH */
case DB_PREV:
flags = DB_PREV;
if (cp->recno != RECNO_OOB) {
if (cp->recno == 1) {
ret = DB_NOTFOUND;
goto err;
}
--cp->recno;
break;
}
/* FALLTHROUGH */
case DB_LAST:
flags = DB_PREV;
if (((ret = __ram_update(dbc,
DB_MAX_RECORDS, 0)) != 0) && ret != DB_NOTFOUND)
goto err;
if ((ret = __bam_nrecs(dbc, &cp->recno)) != 0)
goto err;
if (cp->recno == 0) {
ret = DB_NOTFOUND;
goto err;
}
break;
case DB_GET_BOTHC:
/*
* If we're doing a join and these are offpage dups,
* we want to keep searching forward from after the
* current cursor position. Increment the recno by 1,
* then proceed as for a DB_SET.
*
* Otherwise, we know there are no additional matching
* data, as recnos don't have dups. return DB_NOTFOUND.
*/
if (F_ISSET(dbc, DBC_OPD)) {
cp->recno++;
break;
}
ret = DB_NOTFOUND;
goto err;
/* NOTREACHED */
case DB_GET_BOTH:
case DB_GET_BOTH_RANGE:
/*
* If we're searching a set of off-page dups, we start
* a new linear search from the first record. Otherwise,
* we compare the single data item associated with the
* requested record for a match.
*/
if (F_ISSET(dbc, DBC_OPD)) {
cp->recno = 1;
break;
}
/* FALLTHROUGH */
case DB_SET:
case DB_SET_RANGE:
if ((ret = __ram_getno(dbc, key, &cp->recno, 0)) != 0)
goto err;
break;
default:
ret = __db_unknown_flag(dbp->dbenv, "__ramc_get", flags);
goto err;
}
/*
* For DB_PREV, DB_LAST, DB_SET and DB_SET_RANGE, we have already
* called __ram_update() to make sure sufficient records have been
* read from the backing source file. Do it now for DB_CURRENT (if
* the current record was deleted we may need more records from the
* backing file for a DB_CURRENT operation), DB_FIRST and DB_NEXT.
* (We don't have to test for flags == DB_FIRST, because the switch
* statement above re-set flags to DB_NEXT in that case.)
*/
if ((flags == DB_NEXT || flags == DB_CURRENT) && ((ret =
__ram_update(dbc, cp->recno, 0)) != 0) && ret != DB_NOTFOUND)
goto err;
for (;; ++cp->recno) {
/* Search the tree for the record. */
if ((ret = __bam_rsearch(dbc, &cp->recno,
F_ISSET(dbc, DBC_RMW) ? SR_FIND_WR : SR_FIND,
1, &exact)) != 0)
goto err;
if (!exact) {
ret = DB_NOTFOUND;
goto err;
}
/* Copy the page into the cursor. */
STACK_TO_CURSOR(cp, ret);
if (ret != 0)
goto err;
/*
* If re-numbering records, the on-page deleted flag means this
* record was implicitly created. If not re-numbering records,
* the on-page deleted flag means this record was implicitly
* created, or, it was deleted at some time. Regardless, we
* skip such records if doing cursor next/prev operations or
* walking through off-page duplicates, and fail if they were
* requested explicitly by the application.
*/
if (B_DISSET(GET_BKEYDATA(dbp, cp->page, cp->indx)->type))
switch (flags) {
case DB_NEXT:
case DB_PREV:
(void)__bam_stkrel(dbc, STK_CLRDBC);
goto retry;
case DB_GET_BOTH:
case DB_GET_BOTH_RANGE:
/*
* If we're an OPD tree, we don't care about
* matching a record number on a DB_GET_BOTH
* -- everything belongs to the same tree. A
* normal recno should give up and return
* DB_NOTFOUND if the matching recno is deleted.
*/
if (F_ISSET(dbc, DBC_OPD)) {
(void)__bam_stkrel(dbc, STK_CLRDBC);
continue;
}
ret = DB_NOTFOUND;
goto err;
default:
ret = DB_KEYEMPTY;
goto err;
}
if (flags == DB_GET_BOTH ||
flags == DB_GET_BOTHC || flags == DB_GET_BOTH_RANGE) {
if ((ret = __bam_cmp(dbp, dbc->txn, data,
cp->page, cp->indx, __bam_defcmp, &cmp)) != 0)
return (ret);
if (cmp == 0)
break;
if (!F_ISSET(dbc, DBC_OPD)) {
ret = DB_NOTFOUND;
goto err;
}
(void)__bam_stkrel(dbc, STK_CLRDBC);
} else
break;
}
/* Return the key if the user didn't give us one. */
if (!F_ISSET(dbc, DBC_OPD) && !F_ISSET(key, DB_DBT_ISSET)) {
ret = __db_retcopy(dbp->dbenv,
key, &cp->recno, sizeof(cp->recno),
&dbc->rkey->data, &dbc->rkey->ulen);
F_SET(key, DB_DBT_ISSET);
}
/* The cursor was reset, no further delete adjustment is necessary. */
err: CD_CLR(cp);
return (ret);
}
/*
* __ramc_put --
* Recno DBC->put function.
*
* PUBLIC: int __ramc_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
*/
int
__ramc_put(dbc, key, data, flags, pgnop)
DBC *dbc;
DBT *key, *data;
u_int32_t flags;
db_pgno_t *pgnop;
{
BTREE_CURSOR *cp;
DB *dbp;
DB_ENV *dbenv;
DB_LSN lsn;
u_int32_t iiflags;
int exact, nc, ret, t_ret;
void *arg;
COMPQUIET(pgnop, NULL);
dbp = dbc->dbp;
dbenv = dbp->dbenv;
cp = (BTREE_CURSOR *)dbc->internal;
/*
* DB_KEYFIRST and DB_KEYLAST mean different things if they're
* used in an off-page duplicate tree. If we're an off-page
* duplicate tree, they really mean "put at the beginning of the
* tree" and "put at the end of the tree" respectively, so translate
* them to something else.
*/
if (F_ISSET(dbc, DBC_OPD))
switch (flags) {
case DB_KEYFIRST:
cp->recno = 1;
flags = DB_BEFORE;
break;
case DB_KEYLAST:
if ((ret = __ram_add(dbc,
&cp->recno, data, DB_APPEND, 0)) != 0)
return (ret);
if (CURADJ_LOG(dbc) &&
(ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0,
CA_ICURRENT, cp->root, cp->recno, cp->order)) != 0)
return (ret);
return (0);
default:
break;
}
/*
* Handle normal DB_KEYFIRST/DB_KEYLAST; for a recno, which has
* no duplicates, these are identical and mean "put the given
* datum at the given recno".
*/
if (flags == DB_KEYFIRST ||
flags == DB_KEYLAST || flags == DB_NOOVERWRITE) {
ret = __ram_getno(dbc, key, &cp->recno, 1);
if (ret == 0 || ret == DB_NOTFOUND)
ret = __ram_add(dbc, &cp->recno, data, flags, 0);
return (ret);
}
/*
* If we're putting with a cursor that's marked C_DELETED, we need to
* take special care; the cursor doesn't "really" reference the item
* corresponding to its current recno, but instead is "between" that
* record and the current one. Translate the actual insert into
* DB_BEFORE, and let the __ram_ca work out the gory details of what
* should wind up pointing where.
*/
if (CD_ISSET(cp))
iiflags = DB_BEFORE;
else
iiflags = flags;
split: if ((ret = __bam_rsearch(dbc, &cp->recno, SR_INSERT, 1, &exact)) != 0)
goto err;
/*
* An inexact match is okay; it just means we're one record past the
* end, which is reasonable if we're marked deleted.
*/
DB_ASSERT(dbenv, exact || CD_ISSET(cp));
/* Copy the page into the cursor. */
STACK_TO_CURSOR(cp, ret);
if (ret != 0)
goto err;
ret = __bam_iitem(dbc, key, data, iiflags, 0);
t_ret = __bam_stkrel(dbc, STK_CLRDBC);
if (t_ret != 0 && (ret == 0 || ret == DB_NEEDSPLIT))
ret = t_ret;
else if (ret == DB_NEEDSPLIT) {
arg = &cp->recno;
if ((ret = __bam_split(dbc, arg, NULL)) != 0)
goto err;
goto split;
}
if (ret != 0)
goto err;
switch (flags) { /* Adjust the cursors. */
case DB_AFTER:
if ((ret = __ram_ca(dbc, CA_IAFTER, &nc)) != 0)
goto err;
/*
* We only need to adjust this cursor forward if we truly added
* the item after the current recno, rather than remapping it
* to DB_BEFORE.
*/
if (iiflags == DB_AFTER)
++cp->recno;
/* Only log if __ram_ca found any relevant cursors. */
if (nc > 0 && CURADJ_LOG(dbc) &&
(ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0, CA_IAFTER,
cp->root, cp->recno, cp->order)) != 0)
goto err;
break;
case DB_BEFORE:
if ((ret = __ram_ca(dbc, CA_IBEFORE, &nc)) != 0)
goto err;
--cp->recno;
/* Only log if __ram_ca found any relevant cursors. */
if (nc > 0 && CURADJ_LOG(dbc) &&
(ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0, CA_IBEFORE,
cp->root, cp->recno, cp->order)) != 0)
goto err;
break;
case DB_CURRENT:
/*
* We only need to do an adjustment if we actually
* added an item, which we only would have done if the
* cursor was marked deleted.
*/
if (!CD_ISSET(cp))
break;
/* Only log if __ram_ca found any relevant cursors. */
if ((ret = __ram_ca(dbc, CA_ICURRENT, &nc)) != 0)
goto err;
if (nc > 0 && CURADJ_LOG(dbc) &&
(ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0,
CA_ICURRENT, cp->root, cp->recno, cp->order)) != 0)
goto err;
break;
default:
break;
}
/* Return the key if we've created a new record. */
if (!F_ISSET(dbc, DBC_OPD) &&
(flags == DB_AFTER || flags == DB_BEFORE) && key != NULL)
ret = __db_retcopy(dbenv, key, &cp->recno,
sizeof(cp->recno), &dbc->rkey->data, &dbc->rkey->ulen);
/* The cursor was reset, no further delete adjustment is necessary. */
err: CD_CLR(cp);
return (ret);
}
/*
* __ram_ca --
* Adjust cursors. Returns the number of relevant cursors.
*
* PUBLIC: int __ram_ca __P((DBC *, ca_recno_arg, int *));
*/
int
__ram_ca(dbc_arg, op, foundp)
DBC *dbc_arg;
ca_recno_arg op;
int *foundp;
{
BTREE_CURSOR *cp, *cp_arg;
DB *dbp, *ldbp;
DB_ENV *dbenv;
DBC *dbc;
db_recno_t recno;
u_int32_t order;
int adjusted, found;
dbp = dbc_arg->dbp;
dbenv = dbp->dbenv;
cp_arg = (BTREE_CURSOR *)dbc_arg->internal;
recno = cp_arg->recno;
/*
* It only makes sense to adjust cursors if we're a renumbering
* recno; we should only be called if this is one.
*/
DB_ASSERT(dbenv, F_ISSET(cp_arg, C_RENUMBER));
MUTEX_LOCK(dbenv, dbenv->mtx_dblist);
/*
* Adjust the cursors. See the comment in __bam_ca_delete().
*
* If we're doing a delete, we need to find the highest
* order of any cursor currently pointing at this item,
* so we can assign a higher order to the newly deleted
* cursor. Unfortunately, this requires a second pass through
* the cursor list.
*/
if (op == CA_DELETE) {
FIND_FIRST_DB_MATCH(dbenv, dbp, ldbp);
for (order = 1;
ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
ldbp = TAILQ_NEXT(ldbp, dblistlinks)) {
MUTEX_LOCK(dbenv, dbp->mutex);
TAILQ_FOREACH(dbc, &ldbp->active_queue, links) {
cp = (BTREE_CURSOR *)dbc->internal;
if (cp_arg->root == cp->root &&
recno == cp->recno && CD_ISSET(cp) &&
order <= cp->order &&
!MVCC_SKIP_CURADJ(dbc, cp->root))
order = cp->order + 1;
}
MUTEX_UNLOCK(dbenv, dbp->mutex);
}
} else
order = INVALID_ORDER;
/* Now go through and do the actual adjustments. */
FIND_FIRST_DB_MATCH(dbenv, dbp, ldbp);
for (found = 0;
ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
ldbp = TAILQ_NEXT(ldbp, dblistlinks)) {
MUTEX_LOCK(dbenv, dbp->mutex);
TAILQ_FOREACH(dbc, &ldbp->active_queue, links) {
cp = (BTREE_CURSOR *)dbc->internal;
if (cp_arg->root != cp->root ||
MVCC_SKIP_CURADJ(dbc, cp->root))
continue;
++found;
adjusted = 0;
switch (op) {
case CA_DELETE:
if (recno < cp->recno) {
--cp->recno;
/*
* If the adjustment made them equal,
* we have to merge the orders.
*/
if (recno == cp->recno && CD_ISSET(cp))
cp->order += order;
} else if (recno == cp->recno &&
!CD_ISSET(cp)) {
CD_SET(cp);
cp->order = order;
}
break;
case CA_IBEFORE:
/*
* IBEFORE is just like IAFTER, except that we
* adjust cursors on the current record too.
*/
if (C_EQUAL(cp_arg, cp)) {
++cp->recno;
adjusted = 1;
}
goto iafter;
case CA_ICURRENT:
/*
* If the original cursor wasn't deleted, we
* just did a replacement and so there's no
* need to adjust anything--we shouldn't have
* gotten this far. Otherwise, we behave
* much like an IAFTER, except that all
* cursors pointing to the current item get
* marked undeleted and point to the new
* item.
*/
DB_ASSERT(dbenv, CD_ISSET(cp_arg));
if (C_EQUAL(cp_arg, cp)) {
CD_CLR(cp);
break;
}
/* FALLTHROUGH */
case CA_IAFTER:
iafter: if (!adjusted && C_LESSTHAN(cp_arg, cp)) {
++cp->recno;
adjusted = 1;
}
if (recno == cp->recno && adjusted)
/*
* If we've moved this cursor's recno,
* split its order number--i.e.,
* decrement it by enough so that
* the lowest cursor moved has order 1.
* cp_arg->order is the split point,
* so decrement by one less than that.
*/
cp->order -= (cp_arg->order - 1);
break;
}
}
MUTEX_UNLOCK(dbp->dbenv, dbp->mutex);
}
MUTEX_UNLOCK(dbenv, dbenv->mtx_dblist);
if (foundp != NULL)
*foundp = found;
return (0);
}
/*
* __ram_getno --
* Check the user's record number, and make sure we've seen it.
*
* PUBLIC: int __ram_getno __P((DBC *, const DBT *, db_recno_t *, int));
*/
int
__ram_getno(dbc, key, rep, can_create)
DBC *dbc;
const DBT *key;
db_recno_t *rep;
int can_create;
{
DB *dbp;
db_recno_t recno;
dbp = dbc->dbp;
/* Check the user's record number. */
if ((recno = *(db_recno_t *)key->data) == 0) {
__db_errx(dbp->dbenv, "illegal record number of 0");
return (EINVAL);
}
if (rep != NULL)
*rep = recno;
/*
* Btree can neither create records nor read them in. Recno can
* do both, see if we can find the record.
*/
return (dbc->dbtype == DB_RECNO ?
__ram_update(dbc, recno, can_create) : 0);
}
/*
* __ram_update --
* Ensure the tree has records up to and including the specified one.
*/
static int
__ram_update(dbc, recno, can_create)
DBC *dbc;
db_recno_t recno;
int can_create;
{
BTREE *t;
DB *dbp;
DBT *rdata;
db_recno_t nrecs;
int ret;
dbp = dbc->dbp;
t = dbp->bt_internal;
/*
* If we can't create records and we've read the entire backing input
* file, we're done.
*/
if (!can_create && t->re_eof)
return (0);
/*
* If we haven't seen this record yet, try to get it from the original
* file.
*/
if ((ret = __bam_nrecs(dbc, &nrecs)) != 0)
return (ret);
if (!t->re_eof && recno > nrecs) {
if ((ret = __ram_sread(dbc, recno)) != 0 && ret != DB_NOTFOUND)
return (ret);
if ((ret = __bam_nrecs(dbc, &nrecs)) != 0)
return (ret);
}
/*
* If we can create records, create empty ones up to the requested
* record.
*/
if (!can_create || recno <= nrecs + 1)
return (0);
rdata = &dbc->my_rdata;
rdata->flags = 0;
rdata->size = 0;
while (recno > ++nrecs)
if ((ret = __ram_add(dbc,
&nrecs, rdata, 0, BI_DELETED)) != 0)
return (ret);
return (0);
}
/*
* __ram_source --
* Load information about the backing file.
*/
static int
__ram_source(dbp)
DB *dbp;
{
DB_ENV *dbenv;
BTREE *t;
char *source;
int ret;
dbenv = dbp->dbenv;
t = dbp->bt_internal;
/* Find the real name, and swap out the one we had before. */
if ((ret = __db_appname(dbenv,
DB_APP_DATA, t->re_source, 0, NULL, &source)) != 0)
return (ret);
__os_free(dbenv, t->re_source);
t->re_source = source;
/*
* !!!
* It's possible that the backing source file is read-only. We don't
* much care other than we'll complain if there are any modifications
* when it comes time to write the database back to the source.
*/
if ((t->re_fp = fopen(t->re_source, "rb")) == NULL) {
ret = __os_get_errno();
__db_err(dbenv, ret, "%s", t->re_source);
return (ret);
}
t->re_eof = 0;
return (0);
}
/*
* __ram_writeback --
* Rewrite the backing file.
*
* PUBLIC: int __ram_writeback __P((DB *));
*/
int
__ram_writeback(dbp)
DB *dbp;
{
BTREE *t;
DB_ENV *dbenv;
DBC *dbc;
DBT key, data;
FILE *fp;
db_recno_t keyno;
int ret, t_ret;
u_int8_t delim, *pad;
t = dbp->bt_internal;
dbenv = dbp->dbenv;
fp = NULL;
pad = NULL;
/* If the file wasn't modified, we're done. */
if (!t->re_modified)
return (0);
/* If there's no backing source file, we're done. */
if (t->re_source == NULL) {
t->re_modified = 0;
return (0);
}
/*
* We step through the records, writing each one out. Use the record
* number and the dbp->get() function, instead of a cursor, so we find
* and write out "deleted" or non-existent records. The DB handle may
* be threaded, so allocate memory as we go.
*/
memset(&key, 0, sizeof(key));
key.size = sizeof(db_recno_t);
key.data = &keyno;
memset(&data, 0, sizeof(data));
F_SET(&data, DB_DBT_REALLOC);
/* Allocate a cursor. */
if ((ret = __db_cursor(dbp, NULL, &dbc, 0)) != 0)
return (ret);
/*
* Read any remaining records into the tree.
*
* !!!
* This is why we can't support transactions when applications specify
* backing (re_source) files. At this point we have to read in the
* rest of the records from the file so that we can write all of the
* records back out again, which could modify a page for which we'd
* have to log changes and which we don't have locked. This could be
* partially fixed by taking a snapshot of the entire file during the
* DB->open as DB->open is transaction protected. But, if a checkpoint
* occurs then, the part of the log holding the copy of the file could
* be discarded, and that would make it impossible to recover in the
* face of disaster. This could all probably be fixed, but it would
* require transaction protecting the backing source file.
*
* XXX
* This could be made to work now that we have transactions protecting
* file operations. Margo has specifically asked for the privilege of
* doing this work.
*/
if ((ret =
__ram_update(dbc, DB_MAX_RECORDS, 0)) != 0 && ret != DB_NOTFOUND)
goto err;
/*
* Close any existing file handle and re-open the file, truncating it.
*/
if (t->re_fp != NULL) {
if (fclose(t->re_fp) != 0) {
ret = __os_get_errno();
__db_err(dbenv, ret, "%s", t->re_source);
goto err;
}
t->re_fp = NULL;
}
if ((fp = fopen(t->re_source, "wb")) == NULL) {
ret = __os_get_errno();
__db_err(dbenv, ret, "%s", t->re_source);
goto err;
}
/*
* We'll need the delimiter if we're doing variable-length records,
* and the pad character if we're doing fixed-length records.
*/
delim = t->re_delim;
for (keyno = 1;; ++keyno) {
switch (ret = __db_get(dbp, NULL, &key, &data, 0)) {
case 0:
if (data.size != 0 &&
fwrite(data.data, 1, data.size, fp) != data.size)
goto write_err;
break;
case DB_KEYEMPTY:
if (F_ISSET(dbp, DB_AM_FIXEDLEN)) {
if (pad == NULL) {
if ((ret = __os_malloc(
dbenv, t->re_len, &pad)) != 0)
goto err;
memset(pad, t->re_pad, t->re_len);
}
if (fwrite(pad, 1, t->re_len, fp) != t->re_len)
goto write_err;
}
break;
case DB_NOTFOUND:
ret = 0;
goto done;
default:
goto err;
}
if (!F_ISSET(dbp, DB_AM_FIXEDLEN) &&
fwrite(&delim, 1, 1, fp) != 1) {
write_err: ret = __os_get_errno();
__db_err(dbenv, ret,
"%s: write failed to backing file", t->re_source);
goto err;
}
}
err:
done: /* Close the file descriptor. */
if (fp != NULL && fclose(fp) != 0) {
t_ret = __os_get_errno();
__db_err(dbenv, t_ret, "%s", t->re_source);
if (ret == 0)
ret = t_ret;
}
/* Discard the cursor. */
if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
ret = t_ret;
/* Discard memory allocated to hold the data items. */
if (data.data != NULL)
__os_ufree(dbenv, data.data);
if (pad != NULL)
__os_free(dbenv, pad);
if (ret == 0)
t->re_modified = 0;
return (ret);
}
/*
* __ram_sread --
* Read records from a source file.
*/
static int
__ram_sread(dbc, top)
DBC *dbc;
db_recno_t top;
{
BTREE *t;
DB *dbp;
DBT data, *rdata;
db_recno_t recno;
size_t len;
int ch, ret, was_modified;
t = dbc->dbp->bt_internal;
dbp = dbc->dbp;
was_modified = t->re_modified;
if ((ret = __bam_nrecs(dbc, &recno)) != 0)
return (ret);
/*
* Use the record key return memory, it's only a short-term use.
* The record data return memory is used by __bam_iitem, which
* we'll indirectly call, so use the key so as not to collide.
*/
len = F_ISSET(dbp, DB_AM_FIXEDLEN) ? t->re_len : 256;
rdata = &dbc->my_rkey;
if (rdata->ulen < len) {
if ((ret = __os_realloc(
dbp->dbenv, len, &rdata->data)) != 0) {
rdata->ulen = 0;
rdata->data = NULL;
return (ret);
}
rdata->ulen = (u_int32_t)len;
}
memset(&data, 0, sizeof(data));
while (recno < top) {
data.data = rdata->data;
data.size = 0;
if (F_ISSET(dbp, DB_AM_FIXEDLEN))
for (len = t->re_len; len > 0; --len) {
if ((ch = fgetc(t->re_fp)) == EOF) {
if (data.size == 0)
goto eof;
break;
}
((u_int8_t *)data.data)[data.size++] = ch;
}
else
for (;;) {
if ((ch = fgetc(t->re_fp)) == EOF) {
if (data.size == 0)
goto eof;
break;
}
if (ch == t->re_delim)
break;
((u_int8_t *)data.data)[data.size++] = ch;
if (data.size == rdata->ulen) {
if ((ret = __os_realloc(dbp->dbenv,
rdata->ulen *= 2,
&rdata->data)) != 0) {
rdata->ulen = 0;
rdata->data = NULL;
return (ret);
} else
data.data = rdata->data;
}
}
/*
* Another process may have read this record from the input
* file and stored it into the database already, in which
* case we don't need to repeat that operation. We detect
* this by checking if the last record we've read is greater
* or equal to the number of records in the database.
*/
if (t->re_last >= recno) {
++recno;
if ((ret = __ram_add(dbc, &recno, &data, 0, 0)) != 0)
goto err;
}
++t->re_last;
}
if (0) {
eof: t->re_eof = 1;
ret = DB_NOTFOUND;
}
err: if (!was_modified)
t->re_modified = 0;
return (ret);
}
/*
* __ram_add --
* Add records into the tree.
*/
static int
__ram_add(dbc, recnop, data, flags, bi_flags)
DBC *dbc;
db_recno_t *recnop;
DBT *data;
u_int32_t flags, bi_flags;
{
BTREE_CURSOR *cp;
int exact, ret, stack, t_ret;
cp = (BTREE_CURSOR *)dbc->internal;
retry: /* Find the slot for insertion. */
if ((ret = __bam_rsearch(dbc, recnop,
SR_INSERT | (flags == DB_APPEND ? SR_APPEND : 0), 1, &exact)) != 0)
return (ret);
stack = 1;
/* Copy the page into the cursor. */
STACK_TO_CURSOR(cp, ret);
if (ret != 0)
goto err;
if (exact && flags == DB_NOOVERWRITE && !CD_ISSET(cp) &&
!B_DISSET(GET_BKEYDATA(dbc->dbp, cp->page, cp->indx)->type)) {
ret = DB_KEYEXIST;
goto err;
}
/*
* The application may modify the data based on the selected record
* number.
*/
if (flags == DB_APPEND && dbc->dbp->db_append_recno != NULL &&
(ret = dbc->dbp->db_append_recno(dbc->dbp, data, *recnop)) != 0)
goto err;
/*
* Select the arguments for __bam_iitem() and do the insert. If the
* key is an exact match, or we're replacing the data item with a
* new data item, replace the current item. If the key isn't an exact
* match, we're inserting a new key/data pair, before the search
* location.
*/
switch (ret = __bam_iitem(dbc,
NULL, data, exact ? DB_CURRENT : DB_BEFORE, bi_flags)) {
case 0:
/*
* Don't adjust anything.
*
* If we inserted a record, no cursors need adjusting because
* the only new record it's possible to insert is at the very
* end of the tree. The necessary adjustments to the internal
* page counts were made by __bam_iitem().
*
* If we overwrote a record, no cursors need adjusting because
* future DBcursor->get calls will simply return the underlying
* record (there's no adjustment made for the DB_CURRENT flag
* when a cursor get operation immediately follows a cursor
* delete operation, and the normal adjustment for the DB_NEXT
* flag is still correct).
*/
break;
case DB_NEEDSPLIT:
/* Discard the stack of pages and split the page. */
(void)__bam_stkrel(dbc, STK_CLRDBC);
stack = 0;
if ((ret = __bam_split(dbc, recnop, NULL)) != 0)
goto err;
goto retry;
/* NOTREACHED */
default:
goto err;
}
err: if (stack && (t_ret = __bam_stkrel(dbc, STK_CLRDBC)) != 0 && ret == 0)
ret = t_ret;
return (ret);
}
|