1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/src/Index.cpp Sun Sep 20 15:14:12 2009 -0700
1.3 @@ -0,0 +1,270 @@
1.4 +/*
1.5 + * Index.cpp
1.6 + * Ottoman
1.7 + *
1.8 + * Created by Jens Alfke on 8/27/09.
1.9 + * Copyright 2009 Jens Alfke. All rights reserved.
1.10 + * BSD-Licensed: See the file "LICENSE.txt" for details.
1.11 + */
1.12 +
1.13 +#include "Index.h"
1.14 +#include "Chunk.h"
1.15 +#include "File.h"
1.16 +#include "Dictionary.h"
1.17 +#include <assert.h>
1.18 +#include <errno.h>
1.19 +#include <algorithm>
1.20 +#include <math.h>
1.21 +
1.22 +namespace Mooseyard {
1.23 +
1.24 + // Quadratic probing results in fewer bucket tests on lookup, but makes the hash tables
1.25 + // significantly larger, which increases the file size. In practice a smaller table size seems
1.26 + // to be a bigger win since disk I/O is slow.
1.27 + #define PROBE_QUADRATIC 0
1.28 +
1.29 + // Similarly use a higher max load than for memory-resident tables, to save space.
1.30 + static const float kMaxLoadFactor = 0.85;
1.31 +
1.32 + static const uint32_t kIndexMagicNumber = 0x54378543;
1.33 +
1.34 +
1.35 + /** An index (hash-table) entry as stored in the memory-mapped file. */
1.36 + class Index::Entry {
1.37 + public:
1.38 + explicit Entry() :_hash(0), _valuePosition(0) { }
1.39 + explicit Entry (HashCode h, FilePosition p)
1.40 + :_hash(h), _valuePosition(p) { }
1.41 + LittleEndian<HashCode> hash() const {return _hash;}
1.42 + bool hasHash (LittleEndian<HashCode> h) const {return _hash == h;}
1.43 +
1.44 + const KeyValueChunk* value (const File *file) const {
1.45 + return (const KeyValueChunk*) file->mappedPosition(_valuePosition);
1.46 + }
1.47 + FilePosition valuePosition() const {return _valuePosition;}
1.48 +
1.49 +
1.50 + bool isAvailable() const {return _valuePosition == 0;} // false if deleted
1.51 + bool hasValue() const {return _valuePosition > 1;} // false if deleted
1.52 +
1.53 + void markDeleted() {_valuePosition = kDeletedPosition;}
1.54 + void markEmpty() {_valuePosition = 0;}
1.55 +
1.56 +
1.57 + static const FilePosition kDeletedPosition = 1;
1.58 +
1.59 + private:
1.60 + LittleEndian<HashCode> _hash;
1.61 + LittleEndian<FilePosition> _valuePosition;
1.62 + };
1.63 +
1.64 +
1.65 +#pragma mark -
1.66 +#pragma mark CREATION:
1.67 +
1.68 + Index::Index (uint32_t itsTableSize)
1.69 + :Chunk(sizeof(Index) + itsTableSize*sizeof(Index::Entry), kChunkType),
1.70 + _magicNumber(kIndexMagicNumber),
1.71 + _count(0),
1.72 + _tableSize(itsTableSize)
1.73 + { }
1.74 +
1.75 + Index* Index::create (int capacity) {
1.76 +#if PROBE_QUADRATIC
1.77 + int tableSize = Dictionary::choosePowerOfTwoSize(capacity);
1.78 +#else
1.79 + int tableSize = Dictionary::chooseAnyOldSize(capacity, kMaxLoadFactor);
1.80 +#endif
1.81 + size_t size = sizeof(Index) + tableSize*sizeof(Index::Entry);
1.82 + void* storage = ::operator new(size);
1.83 + memset(storage,0,size);
1.84 + return new(storage) Index(tableSize);
1.85 + }
1.86 +
1.87 + const Index* Index::indexMappedAt (const void* ptr) {
1.88 + const Index *index = (const Index*)ptr;
1.89 + index->validate();
1.90 + return index;
1.91 + }
1.92 +
1.93 + void Index::validate() const throw(File::Error) {
1.94 + if (_magicNumber != kIndexMagicNumber)
1.95 + throw File::Error(ERANGE, "Bad Index (wrong magic number)");
1.96 + if (_tableSize < 2 || _count > _tableSize)
1.97 + throw File::Error(ERANGE, "Bad Index (wrong count or tableSize)");
1.98 +#if PROBE_QUADRATIC
1.99 + if ((_tableSize & (_tableSize-1)) != 0)
1.100 + throw File::Error(ERANGE, "Bad Index (size not power of 2)");
1.101 +#endif
1.102 + if (size() != sizeof(Index)+_tableSize*sizeof(Index::Entry))
1.103 + throw File::Error(ERANGE, "Bad Index (wrong size)");
1.104 + }
1.105 +
1.106 + void Index::validateEntries (const File *file) const throw(File::Error) {
1.107 + validate();
1.108 + uint32_t size = _tableSize;
1.109 + uint32_t count = 0;
1.110 + const Entry *entry = table(0);
1.111 + for (uint32_t i=0; i<size; i++, entry++) {
1.112 + if (entry->hasValue()) {
1.113 + count++;
1.114 + const KeyValueChunk *chunk = entry->value(file);
1.115 + if ((void*)chunk->end() > (void*)this)
1.116 + throw File::Error(ERANGE, "Bad Index entry (points beyond index)");
1.117 + chunk->validate();
1.118 + if (entry->hash() != Key::computeHash(chunk->key()))
1.119 + throw File::Error(ERANGE, "Bad Index entry (wrong hash code)");
1.120 + }
1.121 + }
1.122 + if (count != _count)
1.123 + throw File::Error(ERANGE, "Bad Index (inconsistent count)");
1.124 + }
1.125 +
1.126 +
1.127 +#pragma mark -
1.128 +#pragma mark LOOKUP:
1.129 +
1.130 + const Index::Entry* Index::table (int index) const {
1.131 + return &((const Index::Entry*)&_table)[index];
1.132 + }
1.133 +
1.134 + Index::Entry* Index::table (int index) {
1.135 + return &((Index::Entry*)&_table)[index];
1.136 + }
1.137 +
1.138 +
1.139 + const KeyValueChunk* Index::_find (File *file, Key key) const {
1.140 + int index = key.hash % _tableSize;
1.141 + LittleEndian<HashCode> lhash = key.hash;
1.142 +#if PROBE_QUADRATIC
1.143 + int probe = 0;
1.144 +#endif
1.145 + for(;;) {
1.146 + const Index::Entry *found = table(index);
1.147 + if (found->isAvailable()) {
1.148 + return NULL;
1.149 + } else if (found->hasHash(lhash)) {
1.150 + const KeyValueChunk *value = found->value(file);
1.151 + if (value->hasKey(key)) {
1.152 + return value;
1.153 + }
1.154 + }
1.155 +#if PROBE_QUADRATIC
1.156 + index = (index + ++probe) % _tableSize;
1.157 +#else
1.158 + index = (index + 1) % _tableSize;
1.159 +#endif
1.160 + }
1.161 + }
1.162 +
1.163 + Blob Index::get (File *file, Key key) const {
1.164 + const KeyValueChunk *kv = _find(file, key);
1.165 + return kv ?kv->value() :Blob();
1.166 + }
1.167 +
1.168 +
1.169 + // Common implementation of put and delete. (If deleting, valuePosition will be kDeletedPosition.)
1.170 + // While searching, entries past maxPosition should be considered newly-added and not compared,
1.171 + // since they're not in the memory-mapped part of the file yet.
1.172 + // Returns true if the entry was added/removed.
1.173 + bool Index::_put (File *file, Key key, FilePosition valuePosition, FilePosition maxPosition) {
1.174 + bool result = (valuePosition <= Entry::kDeletedPosition);
1.175 + int index = key.hash % _tableSize;
1.176 + Index::Entry entry(key.hash, valuePosition);
1.177 +#if PROBE_QUADRATIC
1.178 + int probe = 0;
1.179 +#endif
1.180 + Index::Entry *found;
1.181 + for(;;) {
1.182 + found = table(index);
1.183 + if (!found->hasValue()) {
1.184 + if (entry.hasValue()) {
1.185 + result = true;
1.186 + break; // it's a put and we found an empty entry
1.187 + } else if (found->isAvailable()) {
1.188 + return false; // it's a delete but the key wasn't found
1.189 + }
1.190 + }
1.191 + else if (found->hasHash(entry.hash())) {
1.192 + if (!file)
1.193 + break; // no file given, so assume no duplicate keys
1.194 + else if (found->valuePosition() < maxPosition && found->value(file)->hasKey(key))
1.195 + break; // found existing (not newly-inserted) entry with the matching key
1.196 + /*else
1.197 + printf("** hash collision! hash=%u\n", key.hash);*/
1.198 + }
1.199 +#if PROBE_QUADRATIC
1.200 + index = (index + ++probe) % _tableSize;
1.201 +#else
1.202 + index = (index + 1) % _tableSize;
1.203 +#endif
1.204 + }
1.205 + // Finally found where to put it:
1.206 + *found = entry;
1.207 + return result;
1.208 + }
1.209 +
1.210 + bool Index::put (File *file, Key key, FilePosition valuePosition, FilePosition maxPosition) {
1.211 + bool added = _put(file, key, valuePosition, maxPosition);
1.212 + if (added)
1.213 + ++_count;
1.214 + return added;
1.215 + }
1.216 +
1.217 + bool Index::remove (File *file, Key key, FilePosition maxPosition) {
1.218 + bool removed = _put(file, key, Index::Entry::kDeletedPosition, maxPosition);
1.219 + if (removed) {
1.220 + assert(_count > 0);
1.221 + --_count;
1.222 + }
1.223 + return removed;
1.224 + }
1.225 +
1.226 + void Index::removeAll () {
1.227 + memset(table(0), 0, _tableSize*sizeof(Entry));
1.228 + _count = 0;
1.229 + }
1.230 +
1.231 +
1.232 + void Index::copyFrom (File *file, const Index *index) {
1.233 + const Entry *src = index->table(0);
1.234 + if (index->_tableSize == _tableSize) {
1.235 + memcpy(table(0), src, _tableSize*sizeof(Entry));
1.236 + } else {
1.237 + removeAll();
1.238 + for (uint32_t i=0; i<index->_tableSize; i++,src++)
1.239 + if (src->hasValue())
1.240 + _put(NULL, Key(NULL,0,src->hash()), src->valuePosition(), UINT32_MAX);
1.241 + }
1.242 + _count = index->count();
1.243 + }
1.244 +
1.245 +
1.246 +
1.247 + Index::Iterator::Iterator (File *file, const Index *index)
1.248 + :_file(file),
1.249 + _current(index->table(-1)),
1.250 + _end(index->table(index->_tableSize))
1.251 + {
1.252 + next();
1.253 + }
1.254 +
1.255 + bool Index::Iterator::hasValue() const {return _current != NULL;}
1.256 + Key Index::Iterator::key() const {return Key(_current->value(_file)->key(), _current->hash());}
1.257 + Blob Index::Iterator::value() const {return _current->value(_file)->value();}
1.258 +
1.259 + bool Index::Iterator::next() {
1.260 + if (_current) {
1.261 + while (++_current < _end) {
1.262 + if (_current->hasValue())
1.263 + return true;
1.264 + }
1.265 + _current = NULL;
1.266 + }
1.267 + return false;
1.268 + }
1.269 +
1.270 + FilePosition Index::Iterator::valuePosition() {
1.271 + return _current ?_current->valuePosition() :0;
1.272 + }
1.273 +}