diff -r 000000000000 -r 31a43d94cc26 src/Index.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/Index.cpp Sun Sep 20 15:14:12 2009 -0700 @@ -0,0 +1,270 @@ +/* + * Index.cpp + * Ottoman + * + * Created by Jens Alfke on 8/27/09. + * Copyright 2009 Jens Alfke. All rights reserved. + * BSD-Licensed: See the file "LICENSE.txt" for details. + */ + +#include "Index.h" +#include "Chunk.h" +#include "File.h" +#include "Dictionary.h" +#include +#include +#include +#include + +namespace Mooseyard { + + // Quadratic probing results in fewer bucket tests on lookup, but makes the hash tables + // significantly larger, which increases the file size. In practice a smaller table size seems + // to be a bigger win since disk I/O is slow. + #define PROBE_QUADRATIC 0 + + // Similarly use a higher max load than for memory-resident tables, to save space. + static const float kMaxLoadFactor = 0.85; + + static const uint32_t kIndexMagicNumber = 0x54378543; + + + /** An index (hash-table) entry as stored in the memory-mapped file. */ + class Index::Entry { + public: + explicit Entry() :_hash(0), _valuePosition(0) { } + explicit Entry (HashCode h, FilePosition p) + :_hash(h), _valuePosition(p) { } + LittleEndian hash() const {return _hash;} + bool hasHash (LittleEndian h) const {return _hash == h;} + + const KeyValueChunk* value (const File *file) const { + return (const KeyValueChunk*) file->mappedPosition(_valuePosition); + } + FilePosition valuePosition() const {return _valuePosition;} + + + bool isAvailable() const {return _valuePosition == 0;} // false if deleted + bool hasValue() const {return _valuePosition > 1;} // false if deleted + + void markDeleted() {_valuePosition = kDeletedPosition;} + void markEmpty() {_valuePosition = 0;} + + + static const FilePosition kDeletedPosition = 1; + + private: + LittleEndian _hash; + LittleEndian _valuePosition; + }; + + +#pragma mark - +#pragma mark CREATION: + + Index::Index (uint32_t itsTableSize) + :Chunk(sizeof(Index) + itsTableSize*sizeof(Index::Entry), kChunkType), + _magicNumber(kIndexMagicNumber), + _count(0), + _tableSize(itsTableSize) + { } + + Index* Index::create (int capacity) { +#if PROBE_QUADRATIC + int tableSize = Dictionary::choosePowerOfTwoSize(capacity); +#else + int tableSize = Dictionary::chooseAnyOldSize(capacity, kMaxLoadFactor); +#endif + size_t size = sizeof(Index) + tableSize*sizeof(Index::Entry); + void* storage = ::operator new(size); + memset(storage,0,size); + return new(storage) Index(tableSize); + } + + const Index* Index::indexMappedAt (const void* ptr) { + const Index *index = (const Index*)ptr; + index->validate(); + return index; + } + + void Index::validate() const throw(File::Error) { + if (_magicNumber != kIndexMagicNumber) + throw File::Error(ERANGE, "Bad Index (wrong magic number)"); + if (_tableSize < 2 || _count > _tableSize) + throw File::Error(ERANGE, "Bad Index (wrong count or tableSize)"); +#if PROBE_QUADRATIC + if ((_tableSize & (_tableSize-1)) != 0) + throw File::Error(ERANGE, "Bad Index (size not power of 2)"); +#endif + if (size() != sizeof(Index)+_tableSize*sizeof(Index::Entry)) + throw File::Error(ERANGE, "Bad Index (wrong size)"); + } + + void Index::validateEntries (const File *file) const throw(File::Error) { + validate(); + uint32_t size = _tableSize; + uint32_t count = 0; + const Entry *entry = table(0); + for (uint32_t i=0; ihasValue()) { + count++; + const KeyValueChunk *chunk = entry->value(file); + if ((void*)chunk->end() > (void*)this) + throw File::Error(ERANGE, "Bad Index entry (points beyond index)"); + chunk->validate(); + if (entry->hash() != Key::computeHash(chunk->key())) + throw File::Error(ERANGE, "Bad Index entry (wrong hash code)"); + } + } + if (count != _count) + throw File::Error(ERANGE, "Bad Index (inconsistent count)"); + } + + +#pragma mark - +#pragma mark LOOKUP: + + const Index::Entry* Index::table (int index) const { + return &((const Index::Entry*)&_table)[index]; + } + + Index::Entry* Index::table (int index) { + return &((Index::Entry*)&_table)[index]; + } + + + const KeyValueChunk* Index::_find (File *file, Key key) const { + int index = key.hash % _tableSize; + LittleEndian lhash = key.hash; +#if PROBE_QUADRATIC + int probe = 0; +#endif + for(;;) { + const Index::Entry *found = table(index); + if (found->isAvailable()) { + return NULL; + } else if (found->hasHash(lhash)) { + const KeyValueChunk *value = found->value(file); + if (value->hasKey(key)) { + return value; + } + } +#if PROBE_QUADRATIC + index = (index + ++probe) % _tableSize; +#else + index = (index + 1) % _tableSize; +#endif + } + } + + Blob Index::get (File *file, Key key) const { + const KeyValueChunk *kv = _find(file, key); + return kv ?kv->value() :Blob(); + } + + + // Common implementation of put and delete. (If deleting, valuePosition will be kDeletedPosition.) + // While searching, entries past maxPosition should be considered newly-added and not compared, + // since they're not in the memory-mapped part of the file yet. + // Returns true if the entry was added/removed. + bool Index::_put (File *file, Key key, FilePosition valuePosition, FilePosition maxPosition) { + bool result = (valuePosition <= Entry::kDeletedPosition); + int index = key.hash % _tableSize; + Index::Entry entry(key.hash, valuePosition); +#if PROBE_QUADRATIC + int probe = 0; +#endif + Index::Entry *found; + for(;;) { + found = table(index); + if (!found->hasValue()) { + if (entry.hasValue()) { + result = true; + break; // it's a put and we found an empty entry + } else if (found->isAvailable()) { + return false; // it's a delete but the key wasn't found + } + } + else if (found->hasHash(entry.hash())) { + if (!file) + break; // no file given, so assume no duplicate keys + else if (found->valuePosition() < maxPosition && found->value(file)->hasKey(key)) + break; // found existing (not newly-inserted) entry with the matching key + /*else + printf("** hash collision! hash=%u\n", key.hash);*/ + } +#if PROBE_QUADRATIC + index = (index + ++probe) % _tableSize; +#else + index = (index + 1) % _tableSize; +#endif + } + // Finally found where to put it: + *found = entry; + return result; + } + + bool Index::put (File *file, Key key, FilePosition valuePosition, FilePosition maxPosition) { + bool added = _put(file, key, valuePosition, maxPosition); + if (added) + ++_count; + return added; + } + + bool Index::remove (File *file, Key key, FilePosition maxPosition) { + bool removed = _put(file, key, Index::Entry::kDeletedPosition, maxPosition); + if (removed) { + assert(_count > 0); + --_count; + } + return removed; + } + + void Index::removeAll () { + memset(table(0), 0, _tableSize*sizeof(Entry)); + _count = 0; + } + + + void Index::copyFrom (File *file, const Index *index) { + const Entry *src = index->table(0); + if (index->_tableSize == _tableSize) { + memcpy(table(0), src, _tableSize*sizeof(Entry)); + } else { + removeAll(); + for (uint32_t i=0; i_tableSize; i++,src++) + if (src->hasValue()) + _put(NULL, Key(NULL,0,src->hash()), src->valuePosition(), UINT32_MAX); + } + _count = index->count(); + } + + + + Index::Iterator::Iterator (File *file, const Index *index) + :_file(file), + _current(index->table(-1)), + _end(index->table(index->_tableSize)) + { + next(); + } + + bool Index::Iterator::hasValue() const {return _current != NULL;} + Key Index::Iterator::key() const {return Key(_current->value(_file)->key(), _current->hash());} + Blob Index::Iterator::value() const {return _current->value(_file)->value();} + + bool Index::Iterator::next() { + if (_current) { + while (++_current < _end) { + if (_current->hasValue()) + return true; + } + _current = NULL; + } + return false; + } + + FilePosition Index::Iterator::valuePosition() { + return _current ?_current->valuePosition() :0; + } +}