jens@0: /* jens@0: * Index.cpp jens@0: * Ottoman jens@0: * jens@0: * Created by Jens Alfke on 8/27/09. jens@0: * Copyright 2009 Jens Alfke. All rights reserved. jens@0: * BSD-Licensed: See the file "LICENSE.txt" for details. jens@0: */ jens@0: jens@0: #include "Index.h" jens@0: #include "Chunk.h" jens@0: #include "File.h" jens@0: #include "Dictionary.h" jens@0: #include jens@0: #include jens@0: #include jens@0: #include jbm@8: #include jbm@8: jbm@8: jens@0: namespace Mooseyard { jens@0: jens@0: // Quadratic probing results in fewer bucket tests on lookup, but makes the hash tables jens@0: // significantly larger, which increases the file size. In practice a smaller table size seems jens@0: // to be a bigger win since disk I/O is slow. jens@0: #define PROBE_QUADRATIC 0 jens@0: jens@0: // Similarly use a higher max load than for memory-resident tables, to save space. jens@2: static const float kMaxLoadFactor = 0.85f; jens@0: jens@0: static const uint32_t kIndexMagicNumber = 0x54378543; jens@0: jens@0: jens@0: /** An index (hash-table) entry as stored in the memory-mapped file. */ jens@0: class Index::Entry { jens@0: public: jens@0: explicit Entry() :_hash(0), _valuePosition(0) { } jens@0: explicit Entry (HashCode h, FilePosition p) jens@0: :_hash(h), _valuePosition(p) { } jens@0: LittleEndian hash() const {return _hash;} jens@0: bool hasHash (LittleEndian h) const {return _hash == h;} jens@0: jens@0: const KeyValueChunk* value (const File *file) const { jens@0: return (const KeyValueChunk*) file->mappedPosition(_valuePosition); jens@0: } jens@0: FilePosition valuePosition() const {return _valuePosition;} jens@0: jens@0: jens@0: bool isAvailable() const {return _valuePosition == 0;} // false if deleted jens@0: bool hasValue() const {return _valuePosition > 1;} // false if deleted jens@0: jens@0: void markDeleted() {_valuePosition = kDeletedPosition;} jens@0: void markEmpty() {_valuePosition = 0;} jens@0: jens@0: jens@0: static const FilePosition kDeletedPosition = 1; jens@0: jens@0: private: jens@0: LittleEndian _hash; jens@0: LittleEndian _valuePosition; jens@0: }; jens@0: jens@0: jens@0: #pragma mark - jens@0: #pragma mark CREATION: jens@0: jens@0: Index::Index (uint32_t itsTableSize) jens@0: :Chunk(sizeof(Index) + itsTableSize*sizeof(Index::Entry), kChunkType), jens@0: _magicNumber(kIndexMagicNumber), jens@0: _count(0), jens@0: _tableSize(itsTableSize) jens@0: { } jens@0: jens@0: Index* Index::create (int capacity) { jens@0: #if PROBE_QUADRATIC jens@0: int tableSize = Dictionary::choosePowerOfTwoSize(capacity); jens@0: #else jens@0: int tableSize = Dictionary::chooseAnyOldSize(capacity, kMaxLoadFactor); jens@0: #endif jens@0: size_t size = sizeof(Index) + tableSize*sizeof(Index::Entry); jens@0: void* storage = ::operator new(size); jens@0: memset(storage,0,size); jens@0: return new(storage) Index(tableSize); jens@0: } jens@0: jens@0: const Index* Index::indexMappedAt (const void* ptr) { jens@0: const Index *index = (const Index*)ptr; jens@0: index->validate(); jens@0: return index; jens@0: } jens@0: jens@0: void Index::validate() const throw(File::Error) { jens@0: if (_magicNumber != kIndexMagicNumber) jens@0: throw File::Error(ERANGE, "Bad Index (wrong magic number)"); jens@0: if (_tableSize < 2 || _count > _tableSize) jens@0: throw File::Error(ERANGE, "Bad Index (wrong count or tableSize)"); jens@0: #if PROBE_QUADRATIC jens@0: if ((_tableSize & (_tableSize-1)) != 0) jens@0: throw File::Error(ERANGE, "Bad Index (size not power of 2)"); jens@0: #endif jens@0: if (size() != sizeof(Index)+_tableSize*sizeof(Index::Entry)) jens@0: throw File::Error(ERANGE, "Bad Index (wrong size)"); jens@0: } jens@0: jens@0: void Index::validateEntries (const File *file) const throw(File::Error) { jens@0: validate(); jens@0: uint32_t size = _tableSize; jens@0: uint32_t count = 0; jens@0: const Entry *entry = table(0); jens@0: for (uint32_t i=0; ihasValue()) { jens@0: count++; jens@0: const KeyValueChunk *chunk = entry->value(file); jens@0: if ((void*)chunk->end() > (void*)this) jens@0: throw File::Error(ERANGE, "Bad Index entry (points beyond index)"); jens@0: chunk->validate(); jens@0: if (entry->hash() != Key::computeHash(chunk->key())) jens@0: throw File::Error(ERANGE, "Bad Index entry (wrong hash code)"); jens@0: } jens@0: } jens@0: if (count != _count) jens@0: throw File::Error(ERANGE, "Bad Index (inconsistent count)"); jens@0: } jens@0: jens@0: jens@0: #pragma mark - jens@0: #pragma mark LOOKUP: jens@0: jens@0: const Index::Entry* Index::table (int index) const { jens@0: return &((const Index::Entry*)&_table)[index]; jens@0: } jens@0: jens@0: Index::Entry* Index::table (int index) { jens@0: return &((Index::Entry*)&_table)[index]; jens@0: } jens@0: jens@0: jens@0: const KeyValueChunk* Index::_find (File *file, Key key) const { jens@0: int index = key.hash % _tableSize; jens@0: LittleEndian lhash = key.hash; jens@0: #if PROBE_QUADRATIC jens@0: int probe = 0; jens@0: #endif jens@0: for(;;) { jens@0: const Index::Entry *found = table(index); jens@0: if (found->isAvailable()) { jens@0: return NULL; jens@0: } else if (found->hasHash(lhash)) { jens@0: const KeyValueChunk *value = found->value(file); jens@0: if (value->hasKey(key)) { jens@0: return value; jens@0: } jens@0: } jens@0: #if PROBE_QUADRATIC jens@0: index = (index + ++probe) % _tableSize; jens@0: #else jens@0: index = (index + 1) % _tableSize; jens@0: #endif jens@0: } jens@0: } jens@0: jens@0: Blob Index::get (File *file, Key key) const { jens@0: const KeyValueChunk *kv = _find(file, key); jens@0: return kv ?kv->value() :Blob(); jens@0: } jens@0: jens@0: jens@0: // Common implementation of put and delete. (If deleting, valuePosition will be kDeletedPosition.) jens@0: // While searching, entries past maxPosition should be considered newly-added and not compared, jens@0: // since they're not in the memory-mapped part of the file yet. jens@0: // Returns true if the entry was added/removed. jens@0: bool Index::_put (File *file, Key key, FilePosition valuePosition, FilePosition maxPosition) { jens@0: bool result = (valuePosition <= Entry::kDeletedPosition); jens@0: int index = key.hash % _tableSize; jens@0: Index::Entry entry(key.hash, valuePosition); jens@0: #if PROBE_QUADRATIC jens@0: int probe = 0; jens@0: #endif jens@0: Index::Entry *found; jens@0: for(;;) { jens@0: found = table(index); jens@0: if (!found->hasValue()) { jens@0: if (entry.hasValue()) { jens@0: result = true; jens@0: break; // it's a put and we found an empty entry jens@0: } else if (found->isAvailable()) { jens@0: return false; // it's a delete but the key wasn't found jens@0: } jens@0: } jens@0: else if (found->hasHash(entry.hash())) { jens@0: if (!file) jens@0: break; // no file given, so assume no duplicate keys jens@0: else if (found->valuePosition() < maxPosition && found->value(file)->hasKey(key)) jens@0: break; // found existing (not newly-inserted) entry with the matching key jens@0: /*else jens@0: printf("** hash collision! hash=%u\n", key.hash);*/ jens@0: } jens@0: #if PROBE_QUADRATIC jens@0: index = (index + ++probe) % _tableSize; jens@0: #else jens@0: index = (index + 1) % _tableSize; jens@0: #endif jens@0: } jens@0: // Finally found where to put it: jens@0: *found = entry; jens@0: return result; jens@0: } jens@0: jens@0: bool Index::put (File *file, Key key, FilePosition valuePosition, FilePosition maxPosition) { jens@0: bool added = _put(file, key, valuePosition, maxPosition); jens@0: if (added) jens@0: ++_count; jens@0: return added; jens@0: } jens@0: jens@0: bool Index::remove (File *file, Key key, FilePosition maxPosition) { jens@0: bool removed = _put(file, key, Index::Entry::kDeletedPosition, maxPosition); jens@0: if (removed) { jens@0: assert(_count > 0); jens@0: --_count; jens@0: } jens@0: return removed; jens@0: } jens@0: jens@0: void Index::removeAll () { jens@0: memset(table(0), 0, _tableSize*sizeof(Entry)); jens@0: _count = 0; jens@0: } jens@0: jens@0: jens@0: void Index::copyFrom (File *file, const Index *index) { jens@0: const Entry *src = index->table(0); jens@0: if (index->_tableSize == _tableSize) { jens@0: memcpy(table(0), src, _tableSize*sizeof(Entry)); jens@0: } else { jens@0: removeAll(); jens@0: for (uint32_t i=0; i_tableSize; i++,src++) jens@0: if (src->hasValue()) jens@0: _put(NULL, Key(NULL,0,src->hash()), src->valuePosition(), UINT32_MAX); jens@0: } jens@0: _count = index->count(); jens@0: } jens@0: jens@0: jens@0: jens@0: Index::Iterator::Iterator (File *file, const Index *index) jens@0: :_file(file), jens@0: _current(index->table(-1)), jens@0: _end(index->table(index->_tableSize)) jens@0: { jens@0: next(); jens@0: } jens@0: jens@0: bool Index::Iterator::hasValue() const {return _current != NULL;} jens@0: Key Index::Iterator::key() const {return Key(_current->value(_file)->key(), _current->hash());} jens@0: Blob Index::Iterator::value() const {return _current->value(_file)->value();} jens@0: jens@0: bool Index::Iterator::next() { jens@0: if (_current) { jens@0: while (++_current < _end) { jens@0: if (_current->hasValue()) jens@0: return true; jens@0: } jens@0: _current = NULL; jens@0: } jens@0: return false; jens@0: } jens@0: jens@0: FilePosition Index::Iterator::valuePosition() { jens@0: return _current ?_current->valuePosition() :0; jens@0: } jens@0: }