jens@0
|
1 |
/*
|
jens@0
|
2 |
* Index.cpp
|
jens@0
|
3 |
* Ottoman
|
jens@0
|
4 |
*
|
jens@0
|
5 |
* Created by Jens Alfke on 8/27/09.
|
jens@0
|
6 |
* Copyright 2009 Jens Alfke. All rights reserved.
|
jens@0
|
7 |
* BSD-Licensed: See the file "LICENSE.txt" for details.
|
jens@0
|
8 |
*/
|
jens@0
|
9 |
|
jens@0
|
10 |
#include "Index.h"
|
jens@0
|
11 |
#include "Chunk.h"
|
jens@0
|
12 |
#include "File.h"
|
jens@0
|
13 |
#include "Dictionary.h"
|
jens@0
|
14 |
#include <assert.h>
|
jens@0
|
15 |
#include <errno.h>
|
jens@0
|
16 |
#include <algorithm>
|
jens@0
|
17 |
#include <math.h>
|
jens@0
|
18 |
|
jbm@8
|
19 |
#define __STDC_LIMIT_MACROS /* I <3 you, C99 */
|
jbm@8
|
20 |
# define UINT32_MAX (4294967295U)
|
jbm@8
|
21 |
|
jbm@8
|
22 |
#include <stdint.h>
|
jbm@8
|
23 |
|
jbm@8
|
24 |
|
jens@0
|
25 |
namespace Mooseyard {
|
jens@0
|
26 |
|
jens@0
|
27 |
// Quadratic probing results in fewer bucket tests on lookup, but makes the hash tables
|
jens@0
|
28 |
// significantly larger, which increases the file size. In practice a smaller table size seems
|
jens@0
|
29 |
// to be a bigger win since disk I/O is slow.
|
jens@0
|
30 |
#define PROBE_QUADRATIC 0
|
jens@0
|
31 |
|
jens@0
|
32 |
// Similarly use a higher max load than for memory-resident tables, to save space.
|
jens@2
|
33 |
static const float kMaxLoadFactor = 0.85f;
|
jens@0
|
34 |
|
jens@0
|
35 |
static const uint32_t kIndexMagicNumber = 0x54378543;
|
jens@0
|
36 |
|
jens@0
|
37 |
|
jens@0
|
38 |
/** An index (hash-table) entry as stored in the memory-mapped file. */
|
jens@0
|
39 |
class Index::Entry {
|
jens@0
|
40 |
public:
|
jens@0
|
41 |
explicit Entry() :_hash(0), _valuePosition(0) { }
|
jens@0
|
42 |
explicit Entry (HashCode h, FilePosition p)
|
jens@0
|
43 |
:_hash(h), _valuePosition(p) { }
|
jens@0
|
44 |
LittleEndian<HashCode> hash() const {return _hash;}
|
jens@0
|
45 |
bool hasHash (LittleEndian<HashCode> h) const {return _hash == h;}
|
jens@0
|
46 |
|
jens@0
|
47 |
const KeyValueChunk* value (const File *file) const {
|
jens@0
|
48 |
return (const KeyValueChunk*) file->mappedPosition(_valuePosition);
|
jens@0
|
49 |
}
|
jens@0
|
50 |
FilePosition valuePosition() const {return _valuePosition;}
|
jens@0
|
51 |
|
jens@0
|
52 |
|
jens@0
|
53 |
bool isAvailable() const {return _valuePosition == 0;} // false if deleted
|
jens@0
|
54 |
bool hasValue() const {return _valuePosition > 1;} // false if deleted
|
jens@0
|
55 |
|
jens@0
|
56 |
void markDeleted() {_valuePosition = kDeletedPosition;}
|
jens@0
|
57 |
void markEmpty() {_valuePosition = 0;}
|
jens@0
|
58 |
|
jens@0
|
59 |
|
jens@0
|
60 |
static const FilePosition kDeletedPosition = 1;
|
jens@0
|
61 |
|
jens@0
|
62 |
private:
|
jens@0
|
63 |
LittleEndian<HashCode> _hash;
|
jens@0
|
64 |
LittleEndian<FilePosition> _valuePosition;
|
jens@0
|
65 |
};
|
jens@0
|
66 |
|
jens@0
|
67 |
|
jens@0
|
68 |
#pragma mark -
|
jens@0
|
69 |
#pragma mark CREATION:
|
jens@0
|
70 |
|
jens@0
|
71 |
Index::Index (uint32_t itsTableSize)
|
jens@0
|
72 |
:Chunk(sizeof(Index) + itsTableSize*sizeof(Index::Entry), kChunkType),
|
jens@0
|
73 |
_magicNumber(kIndexMagicNumber),
|
jens@0
|
74 |
_count(0),
|
jens@0
|
75 |
_tableSize(itsTableSize)
|
jens@0
|
76 |
{ }
|
jens@0
|
77 |
|
jens@0
|
78 |
Index* Index::create (int capacity) {
|
jens@0
|
79 |
#if PROBE_QUADRATIC
|
jens@0
|
80 |
int tableSize = Dictionary::choosePowerOfTwoSize(capacity);
|
jens@0
|
81 |
#else
|
jens@0
|
82 |
int tableSize = Dictionary::chooseAnyOldSize(capacity, kMaxLoadFactor);
|
jens@0
|
83 |
#endif
|
jens@0
|
84 |
size_t size = sizeof(Index) + tableSize*sizeof(Index::Entry);
|
jens@0
|
85 |
void* storage = ::operator new(size);
|
jens@0
|
86 |
memset(storage,0,size);
|
jens@0
|
87 |
return new(storage) Index(tableSize);
|
jens@0
|
88 |
}
|
jens@0
|
89 |
|
jens@0
|
90 |
const Index* Index::indexMappedAt (const void* ptr) {
|
jens@0
|
91 |
const Index *index = (const Index*)ptr;
|
jens@0
|
92 |
index->validate();
|
jens@0
|
93 |
return index;
|
jens@0
|
94 |
}
|
jens@0
|
95 |
|
jens@0
|
96 |
void Index::validate() const throw(File::Error) {
|
jens@0
|
97 |
if (_magicNumber != kIndexMagicNumber)
|
jens@0
|
98 |
throw File::Error(ERANGE, "Bad Index (wrong magic number)");
|
jens@0
|
99 |
if (_tableSize < 2 || _count > _tableSize)
|
jens@0
|
100 |
throw File::Error(ERANGE, "Bad Index (wrong count or tableSize)");
|
jens@0
|
101 |
#if PROBE_QUADRATIC
|
jens@0
|
102 |
if ((_tableSize & (_tableSize-1)) != 0)
|
jens@0
|
103 |
throw File::Error(ERANGE, "Bad Index (size not power of 2)");
|
jens@0
|
104 |
#endif
|
jens@0
|
105 |
if (size() != sizeof(Index)+_tableSize*sizeof(Index::Entry))
|
jens@0
|
106 |
throw File::Error(ERANGE, "Bad Index (wrong size)");
|
jens@0
|
107 |
}
|
jens@0
|
108 |
|
jens@0
|
109 |
void Index::validateEntries (const File *file) const throw(File::Error) {
|
jens@0
|
110 |
validate();
|
jens@0
|
111 |
uint32_t size = _tableSize;
|
jens@0
|
112 |
uint32_t count = 0;
|
jens@0
|
113 |
const Entry *entry = table(0);
|
jens@0
|
114 |
for (uint32_t i=0; i<size; i++, entry++) {
|
jens@0
|
115 |
if (entry->hasValue()) {
|
jens@0
|
116 |
count++;
|
jens@0
|
117 |
const KeyValueChunk *chunk = entry->value(file);
|
jens@0
|
118 |
if ((void*)chunk->end() > (void*)this)
|
jens@0
|
119 |
throw File::Error(ERANGE, "Bad Index entry (points beyond index)");
|
jens@0
|
120 |
chunk->validate();
|
jens@0
|
121 |
if (entry->hash() != Key::computeHash(chunk->key()))
|
jens@0
|
122 |
throw File::Error(ERANGE, "Bad Index entry (wrong hash code)");
|
jens@0
|
123 |
}
|
jens@0
|
124 |
}
|
jens@0
|
125 |
if (count != _count)
|
jens@0
|
126 |
throw File::Error(ERANGE, "Bad Index (inconsistent count)");
|
jens@0
|
127 |
}
|
jens@0
|
128 |
|
jens@0
|
129 |
|
jens@0
|
130 |
#pragma mark -
|
jens@0
|
131 |
#pragma mark LOOKUP:
|
jens@0
|
132 |
|
jens@0
|
133 |
const Index::Entry* Index::table (int index) const {
|
jens@0
|
134 |
return &((const Index::Entry*)&_table)[index];
|
jens@0
|
135 |
}
|
jens@0
|
136 |
|
jens@0
|
137 |
Index::Entry* Index::table (int index) {
|
jens@0
|
138 |
return &((Index::Entry*)&_table)[index];
|
jens@0
|
139 |
}
|
jens@0
|
140 |
|
jens@0
|
141 |
|
jens@0
|
142 |
const KeyValueChunk* Index::_find (File *file, Key key) const {
|
jens@0
|
143 |
int index = key.hash % _tableSize;
|
jens@0
|
144 |
LittleEndian<HashCode> lhash = key.hash;
|
jens@0
|
145 |
#if PROBE_QUADRATIC
|
jens@0
|
146 |
int probe = 0;
|
jens@0
|
147 |
#endif
|
jens@0
|
148 |
for(;;) {
|
jens@0
|
149 |
const Index::Entry *found = table(index);
|
jens@0
|
150 |
if (found->isAvailable()) {
|
jens@0
|
151 |
return NULL;
|
jens@0
|
152 |
} else if (found->hasHash(lhash)) {
|
jens@0
|
153 |
const KeyValueChunk *value = found->value(file);
|
jens@0
|
154 |
if (value->hasKey(key)) {
|
jens@0
|
155 |
return value;
|
jens@0
|
156 |
}
|
jens@0
|
157 |
}
|
jens@0
|
158 |
#if PROBE_QUADRATIC
|
jens@0
|
159 |
index = (index + ++probe) % _tableSize;
|
jens@0
|
160 |
#else
|
jens@0
|
161 |
index = (index + 1) % _tableSize;
|
jens@0
|
162 |
#endif
|
jens@0
|
163 |
}
|
jens@0
|
164 |
}
|
jens@0
|
165 |
|
jens@0
|
166 |
Blob Index::get (File *file, Key key) const {
|
jens@0
|
167 |
const KeyValueChunk *kv = _find(file, key);
|
jens@0
|
168 |
return kv ?kv->value() :Blob();
|
jens@0
|
169 |
}
|
jens@0
|
170 |
|
jens@0
|
171 |
|
jens@0
|
172 |
// Common implementation of put and delete. (If deleting, valuePosition will be kDeletedPosition.)
|
jens@0
|
173 |
// While searching, entries past maxPosition should be considered newly-added and not compared,
|
jens@0
|
174 |
// since they're not in the memory-mapped part of the file yet.
|
jens@0
|
175 |
// Returns true if the entry was added/removed.
|
jens@0
|
176 |
bool Index::_put (File *file, Key key, FilePosition valuePosition, FilePosition maxPosition) {
|
jens@0
|
177 |
bool result = (valuePosition <= Entry::kDeletedPosition);
|
jens@0
|
178 |
int index = key.hash % _tableSize;
|
jens@0
|
179 |
Index::Entry entry(key.hash, valuePosition);
|
jens@0
|
180 |
#if PROBE_QUADRATIC
|
jens@0
|
181 |
int probe = 0;
|
jens@0
|
182 |
#endif
|
jens@0
|
183 |
Index::Entry *found;
|
jens@0
|
184 |
for(;;) {
|
jens@0
|
185 |
found = table(index);
|
jens@0
|
186 |
if (!found->hasValue()) {
|
jens@0
|
187 |
if (entry.hasValue()) {
|
jens@0
|
188 |
result = true;
|
jens@0
|
189 |
break; // it's a put and we found an empty entry
|
jens@0
|
190 |
} else if (found->isAvailable()) {
|
jens@0
|
191 |
return false; // it's a delete but the key wasn't found
|
jens@0
|
192 |
}
|
jens@0
|
193 |
}
|
jens@0
|
194 |
else if (found->hasHash(entry.hash())) {
|
jens@0
|
195 |
if (!file)
|
jens@0
|
196 |
break; // no file given, so assume no duplicate keys
|
jens@0
|
197 |
else if (found->valuePosition() < maxPosition && found->value(file)->hasKey(key))
|
jens@0
|
198 |
break; // found existing (not newly-inserted) entry with the matching key
|
jens@0
|
199 |
/*else
|
jens@0
|
200 |
printf("** hash collision! hash=%u\n", key.hash);*/
|
jens@0
|
201 |
}
|
jens@0
|
202 |
#if PROBE_QUADRATIC
|
jens@0
|
203 |
index = (index + ++probe) % _tableSize;
|
jens@0
|
204 |
#else
|
jens@0
|
205 |
index = (index + 1) % _tableSize;
|
jens@0
|
206 |
#endif
|
jens@0
|
207 |
}
|
jens@0
|
208 |
// Finally found where to put it:
|
jens@0
|
209 |
*found = entry;
|
jens@0
|
210 |
return result;
|
jens@0
|
211 |
}
|
jens@0
|
212 |
|
jens@0
|
213 |
bool Index::put (File *file, Key key, FilePosition valuePosition, FilePosition maxPosition) {
|
jens@0
|
214 |
bool added = _put(file, key, valuePosition, maxPosition);
|
jens@0
|
215 |
if (added)
|
jens@0
|
216 |
++_count;
|
jens@0
|
217 |
return added;
|
jens@0
|
218 |
}
|
jens@0
|
219 |
|
jens@0
|
220 |
bool Index::remove (File *file, Key key, FilePosition maxPosition) {
|
jens@0
|
221 |
bool removed = _put(file, key, Index::Entry::kDeletedPosition, maxPosition);
|
jens@0
|
222 |
if (removed) {
|
jens@0
|
223 |
assert(_count > 0);
|
jens@0
|
224 |
--_count;
|
jens@0
|
225 |
}
|
jens@0
|
226 |
return removed;
|
jens@0
|
227 |
}
|
jens@0
|
228 |
|
jens@0
|
229 |
void Index::removeAll () {
|
jens@0
|
230 |
memset(table(0), 0, _tableSize*sizeof(Entry));
|
jens@0
|
231 |
_count = 0;
|
jens@0
|
232 |
}
|
jens@0
|
233 |
|
jens@0
|
234 |
|
jens@0
|
235 |
void Index::copyFrom (File *file, const Index *index) {
|
jens@0
|
236 |
const Entry *src = index->table(0);
|
jens@0
|
237 |
if (index->_tableSize == _tableSize) {
|
jens@0
|
238 |
memcpy(table(0), src, _tableSize*sizeof(Entry));
|
jens@0
|
239 |
} else {
|
jens@0
|
240 |
removeAll();
|
jens@0
|
241 |
for (uint32_t i=0; i<index->_tableSize; i++,src++)
|
jens@0
|
242 |
if (src->hasValue())
|
jens@0
|
243 |
_put(NULL, Key(NULL,0,src->hash()), src->valuePosition(), UINT32_MAX);
|
jens@0
|
244 |
}
|
jens@0
|
245 |
_count = index->count();
|
jens@0
|
246 |
}
|
jens@0
|
247 |
|
jens@0
|
248 |
|
jens@0
|
249 |
|
jens@0
|
250 |
Index::Iterator::Iterator (File *file, const Index *index)
|
jens@0
|
251 |
:_file(file),
|
jens@0
|
252 |
_current(index->table(-1)),
|
jens@0
|
253 |
_end(index->table(index->_tableSize))
|
jens@0
|
254 |
{
|
jens@0
|
255 |
next();
|
jens@0
|
256 |
}
|
jens@0
|
257 |
|
jens@0
|
258 |
bool Index::Iterator::hasValue() const {return _current != NULL;}
|
jens@0
|
259 |
Key Index::Iterator::key() const {return Key(_current->value(_file)->key(), _current->hash());}
|
jens@0
|
260 |
Blob Index::Iterator::value() const {return _current->value(_file)->value();}
|
jens@0
|
261 |
|
jens@0
|
262 |
bool Index::Iterator::next() {
|
jens@0
|
263 |
if (_current) {
|
jens@0
|
264 |
while (++_current < _end) {
|
jens@0
|
265 |
if (_current->hasValue())
|
jens@0
|
266 |
return true;
|
jens@0
|
267 |
}
|
jens@0
|
268 |
_current = NULL;
|
jens@0
|
269 |
}
|
jens@0
|
270 |
return false;
|
jens@0
|
271 |
}
|
jens@0
|
272 |
|
jens@0
|
273 |
FilePosition Index::Iterator::valuePosition() {
|
jens@0
|
274 |
return _current ?_current->valuePosition() :0;
|
jens@0
|
275 |
}
|
jens@0
|
276 |
}
|