jens@0: /*
jens@0:  *  Index.cpp
jens@0:  *  Ottoman
jens@0:  *
jens@0:  *  Created by Jens Alfke on 8/27/09.
jens@0:  *  Copyright 2009 Jens Alfke. All rights reserved.
jens@0:  *  BSD-Licensed: See the file "LICENSE.txt" for details.
jens@0:  */
jens@0: 
jens@0: #include "Index.h"
jens@0: #include "Chunk.h"
jens@0: #include "File.h"
jens@0: #include "Dictionary.h"
jens@0: #include <assert.h>
jens@0: #include <errno.h>
jens@0: #include <algorithm>
jens@0: #include <math.h>
jens@0: 
jens@0: namespace Mooseyard {
jens@0:         
jens@0:     // Quadratic probing results in fewer bucket tests on lookup, but makes the hash tables
jens@0:     // significantly larger, which increases the file size. In practice a smaller table size seems
jens@0:     // to be a bigger win since disk I/O is slow.
jens@0:     #define PROBE_QUADRATIC 0
jens@0: 
jens@0:     // Similarly use a higher max load than for memory-resident tables, to save space.
jens@2:     static const float kMaxLoadFactor = 0.85f;
jens@0: 
jens@0:     static const uint32_t kIndexMagicNumber = 0x54378543;
jens@0:     
jens@0: 
jens@0:     /** An index (hash-table) entry as stored in the memory-mapped file. */
jens@0:     class Index::Entry {
jens@0:     public:
jens@0:         explicit Entry()                        :_hash(0), _valuePosition(0) { }
jens@0:         explicit Entry (HashCode h, FilePosition p)
jens@0:                                                 :_hash(h), _valuePosition(p) { }
jens@0:         LittleEndian<HashCode> hash() const       {return _hash;}
jens@0:         bool hasHash (LittleEndian<HashCode> h) const {return _hash == h;}
jens@0:         
jens@0:         const KeyValueChunk* value (const File *file) const {
jens@0:             return (const KeyValueChunk*) file->mappedPosition(_valuePosition);
jens@0:         }
jens@0:         FilePosition valuePosition() const      {return _valuePosition;}
jens@0:         
jens@0:         
jens@0:         bool isAvailable() const                {return _valuePosition == 0;}   // false if deleted
jens@0:         bool hasValue() const                   {return _valuePosition > 1;}    // false if deleted
jens@0:         
jens@0:         void markDeleted()                      {_valuePosition = kDeletedPosition;}
jens@0:         void markEmpty()                        {_valuePosition = 0;}
jens@0:         
jens@0:         
jens@0:         static const FilePosition kDeletedPosition = 1;
jens@0:                 
jens@0:     private:
jens@0:         LittleEndian<HashCode> _hash;
jens@0:         LittleEndian<FilePosition> _valuePosition;
jens@0:     };
jens@0:         
jens@0:     
jens@0: #pragma mark -
jens@0: #pragma mark CREATION:
jens@0:         
jens@0:     Index::Index (uint32_t itsTableSize)
jens@0:         :Chunk(sizeof(Index) + itsTableSize*sizeof(Index::Entry), kChunkType),
jens@0:          _magicNumber(kIndexMagicNumber),
jens@0:          _count(0),
jens@0:          _tableSize(itsTableSize)
jens@0:     { }
jens@0:     
jens@0:     Index* Index::create (int capacity) {
jens@0: #if PROBE_QUADRATIC
jens@0:         int tableSize = Dictionary::choosePowerOfTwoSize(capacity);
jens@0: #else
jens@0:         int tableSize = Dictionary::chooseAnyOldSize(capacity, kMaxLoadFactor);
jens@0: #endif
jens@0:         size_t size = sizeof(Index) + tableSize*sizeof(Index::Entry);
jens@0:         void* storage = ::operator new(size);
jens@0:         memset(storage,0,size);
jens@0:         return new(storage) Index(tableSize);
jens@0:     }
jens@0:     
jens@0:     const Index* Index::indexMappedAt (const void* ptr) {
jens@0:         const Index *index = (const Index*)ptr;
jens@0:         index->validate();
jens@0:         return index;
jens@0:     }
jens@0:     
jens@0:     void Index::validate() const  throw(File::Error) {
jens@0:         if (_magicNumber != kIndexMagicNumber)
jens@0:             throw File::Error(ERANGE, "Bad Index (wrong magic number)");
jens@0:         if (_tableSize < 2 || _count > _tableSize)
jens@0:             throw File::Error(ERANGE, "Bad Index (wrong count or tableSize)");
jens@0: #if PROBE_QUADRATIC
jens@0:         if ((_tableSize & (_tableSize-1)) != 0)
jens@0:             throw File::Error(ERANGE, "Bad Index (size not power of 2)");
jens@0: #endif
jens@0:         if (size() != sizeof(Index)+_tableSize*sizeof(Index::Entry))
jens@0:             throw File::Error(ERANGE, "Bad Index (wrong size)");
jens@0:     }
jens@0:     
jens@0:     void Index::validateEntries (const File *file) const  throw(File::Error) {
jens@0:         validate();
jens@0:         uint32_t size = _tableSize;
jens@0:         uint32_t count = 0;
jens@0:         const Entry *entry = table(0);
jens@0:         for (uint32_t i=0; i<size; i++, entry++) {
jens@0:             if (entry->hasValue()) {
jens@0:                 count++;
jens@0:                 const KeyValueChunk *chunk = entry->value(file);
jens@0:                 if ((void*)chunk->end() > (void*)this)
jens@0:                     throw File::Error(ERANGE, "Bad Index entry (points beyond index)");
jens@0:                 chunk->validate();
jens@0:                 if (entry->hash() != Key::computeHash(chunk->key()))
jens@0:                     throw File::Error(ERANGE, "Bad Index entry (wrong hash code)");
jens@0:             }
jens@0:         }
jens@0:         if (count != _count)
jens@0:             throw File::Error(ERANGE, "Bad Index (inconsistent count)");
jens@0:     }
jens@0:     
jens@0:     
jens@0: #pragma mark -
jens@0: #pragma mark LOOKUP:
jens@0:     
jens@0:     const Index::Entry* Index::table (int index) const {
jens@0:         return &((const Index::Entry*)&_table)[index];
jens@0:     }
jens@0:     
jens@0:     Index::Entry* Index::table (int index) {
jens@0:         return &((Index::Entry*)&_table)[index];
jens@0:     }
jens@0:     
jens@0:         
jens@0:     const KeyValueChunk* Index::_find (File *file, Key key) const {
jens@0:         int index = key.hash % _tableSize;
jens@0:         LittleEndian<HashCode> lhash = key.hash;
jens@0: #if PROBE_QUADRATIC
jens@0:         int probe = 0;
jens@0: #endif
jens@0:         for(;;) {
jens@0:             const Index::Entry *found = table(index);
jens@0:             if (found->isAvailable()) {
jens@0:                 return NULL;
jens@0:             } else if (found->hasHash(lhash)) {
jens@0:                 const KeyValueChunk *value = found->value(file);
jens@0:                 if (value->hasKey(key)) {
jens@0:                     return value;
jens@0:                 }
jens@0:             }
jens@0: #if PROBE_QUADRATIC
jens@0:             index = (index + ++probe) % _tableSize;
jens@0: #else
jens@0:             index = (index + 1) % _tableSize;
jens@0: #endif
jens@0:         }
jens@0:     }
jens@0:     
jens@0:     Blob Index::get (File *file, Key key) const {
jens@0:         const KeyValueChunk *kv = _find(file, key);
jens@0:         return kv ?kv->value() :Blob();
jens@0:     }
jens@0:     
jens@0: 
jens@0:     // Common implementation of put and delete. (If deleting, valuePosition will be kDeletedPosition.)
jens@0:     // While searching, entries past maxPosition should be considered newly-added and not compared,
jens@0:     // since they're not in the memory-mapped part of the file yet.
jens@0:     // Returns true if the entry was added/removed.
jens@0:     bool Index::_put (File *file, Key key, FilePosition valuePosition, FilePosition maxPosition) {
jens@0:         bool result = (valuePosition <= Entry::kDeletedPosition);
jens@0:         int index = key.hash % _tableSize;
jens@0:         Index::Entry entry(key.hash, valuePosition);
jens@0: #if PROBE_QUADRATIC
jens@0:         int probe = 0;
jens@0: #endif
jens@0:         Index::Entry *found;
jens@0:         for(;;) {
jens@0:             found = table(index);
jens@0:             if (!found->hasValue()) {
jens@0:                 if (entry.hasValue()) {
jens@0:                     result = true;
jens@0:                     break;                  // it's a put and we found an empty entry
jens@0:                 } else if (found->isAvailable()) {
jens@0:                     return false;           // it's a delete but the key wasn't found
jens@0:                 }
jens@0:             }
jens@0:             else if (found->hasHash(entry.hash())) {
jens@0:                 if (!file)
jens@0:                     break;                  // no file given, so assume no duplicate keys
jens@0:                 else if (found->valuePosition() < maxPosition && found->value(file)->hasKey(key))
jens@0:                     break;                  // found existing (not newly-inserted) entry with the matching key
jens@0:                 /*else
jens@0:                     printf("** hash collision! hash=%u\n", key.hash);*/
jens@0:             }
jens@0: #if PROBE_QUADRATIC
jens@0:             index = (index + ++probe) % _tableSize;
jens@0: #else
jens@0:             index = (index + 1) % _tableSize;
jens@0: #endif
jens@0:         }
jens@0:         // Finally found where to put it:
jens@0:         *found = entry;
jens@0:         return result;
jens@0:     }
jens@0:         
jens@0:     bool Index::put (File *file, Key key, FilePosition valuePosition, FilePosition maxPosition) {
jens@0:         bool added = _put(file, key, valuePosition, maxPosition);
jens@0:         if (added)
jens@0:             ++_count;
jens@0:         return added;
jens@0:     }
jens@0:     
jens@0:     bool Index::remove (File *file, Key key, FilePosition maxPosition) {
jens@0:         bool removed = _put(file, key, Index::Entry::kDeletedPosition, maxPosition);
jens@0:         if (removed) {
jens@0:             assert(_count > 0);
jens@0:             --_count;
jens@0:         }
jens@0:         return removed;
jens@0:     }
jens@0:     
jens@0:     void Index::removeAll () {
jens@0:         memset(table(0), 0, _tableSize*sizeof(Entry));
jens@0:         _count = 0;
jens@0:     }
jens@0:     
jens@0:     
jens@0:     void Index::copyFrom (File *file, const Index *index) {
jens@0:         const Entry *src = index->table(0);
jens@0:         if (index->_tableSize == _tableSize) {
jens@0:             memcpy(table(0), src, _tableSize*sizeof(Entry));
jens@0:         } else {
jens@0:             removeAll();
jens@0:             for (uint32_t i=0; i<index->_tableSize; i++,src++)
jens@0:                 if (src->hasValue())
jens@0:                     _put(NULL, Key(NULL,0,src->hash()), src->valuePosition(), UINT32_MAX);
jens@0:         }
jens@0:         _count = index->count();
jens@0:     }        
jens@0:     
jens@0: 
jens@0: 
jens@0:     Index::Iterator::Iterator (File *file, const Index *index)
jens@0:         :_file(file),
jens@0:          _current(index->table(-1)),
jens@0:          _end(index->table(index->_tableSize))
jens@0:     {
jens@0:         next();
jens@0:     }
jens@0:     
jens@0:     bool Index::Iterator::hasValue() const  {return _current != NULL;}
jens@0:     Key Index::Iterator::key() const        {return Key(_current->value(_file)->key(), _current->hash());}
jens@0:     Blob Index::Iterator::value() const     {return _current->value(_file)->value();}
jens@0:     
jens@0:     bool Index::Iterator::next() {
jens@0:         if (_current) {
jens@0:             while (++_current < _end) {
jens@0:                 if (_current->hasValue())
jens@0:                     return true;
jens@0:             }
jens@0:             _current = NULL;
jens@0:         }
jens@0:         return false;
jens@0:     }
jens@0:     
jens@0:     FilePosition Index::Iterator::valuePosition() {
jens@0:         return _current ?_current->valuePosition() :0;
jens@0:     }
jens@0: }