src/Index.cpp
author Jens Alfke <jens@mooseyard.com>
Sun Sep 20 15:14:12 2009 -0700 (2009-09-20)
changeset 0 31a43d94cc26
child 2 851de24ecb61
permissions -rw-r--r--
First official checkin.
     1 /*
     2  *  Index.cpp
     3  *  Ottoman
     4  *
     5  *  Created by Jens Alfke on 8/27/09.
     6  *  Copyright 2009 Jens Alfke. All rights reserved.
     7  *  BSD-Licensed: See the file "LICENSE.txt" for details.
     8  */
     9 
    10 #include "Index.h"
    11 #include "Chunk.h"
    12 #include "File.h"
    13 #include "Dictionary.h"
    14 #include <assert.h>
    15 #include <errno.h>
    16 #include <algorithm>
    17 #include <math.h>
    18 
    19 namespace Mooseyard {
    20         
    21     // Quadratic probing results in fewer bucket tests on lookup, but makes the hash tables
    22     // significantly larger, which increases the file size. In practice a smaller table size seems
    23     // to be a bigger win since disk I/O is slow.
    24     #define PROBE_QUADRATIC 0
    25 
    26     // Similarly use a higher max load than for memory-resident tables, to save space.
    27     static const float kMaxLoadFactor = 0.85;
    28 
    29     static const uint32_t kIndexMagicNumber = 0x54378543;
    30     
    31 
    32     /** An index (hash-table) entry as stored in the memory-mapped file. */
    33     class Index::Entry {
    34     public:
    35         explicit Entry()                        :_hash(0), _valuePosition(0) { }
    36         explicit Entry (HashCode h, FilePosition p)
    37                                                 :_hash(h), _valuePosition(p) { }
    38         LittleEndian<HashCode> hash() const       {return _hash;}
    39         bool hasHash (LittleEndian<HashCode> h) const {return _hash == h;}
    40         
    41         const KeyValueChunk* value (const File *file) const {
    42             return (const KeyValueChunk*) file->mappedPosition(_valuePosition);
    43         }
    44         FilePosition valuePosition() const      {return _valuePosition;}
    45         
    46         
    47         bool isAvailable() const                {return _valuePosition == 0;}   // false if deleted
    48         bool hasValue() const                   {return _valuePosition > 1;}    // false if deleted
    49         
    50         void markDeleted()                      {_valuePosition = kDeletedPosition;}
    51         void markEmpty()                        {_valuePosition = 0;}
    52         
    53         
    54         static const FilePosition kDeletedPosition = 1;
    55                 
    56     private:
    57         LittleEndian<HashCode> _hash;
    58         LittleEndian<FilePosition> _valuePosition;
    59     };
    60         
    61     
    62 #pragma mark -
    63 #pragma mark CREATION:
    64         
    65     Index::Index (uint32_t itsTableSize)
    66         :Chunk(sizeof(Index) + itsTableSize*sizeof(Index::Entry), kChunkType),
    67          _magicNumber(kIndexMagicNumber),
    68          _count(0),
    69          _tableSize(itsTableSize)
    70     { }
    71     
    72     Index* Index::create (int capacity) {
    73 #if PROBE_QUADRATIC
    74         int tableSize = Dictionary::choosePowerOfTwoSize(capacity);
    75 #else
    76         int tableSize = Dictionary::chooseAnyOldSize(capacity, kMaxLoadFactor);
    77 #endif
    78         size_t size = sizeof(Index) + tableSize*sizeof(Index::Entry);
    79         void* storage = ::operator new(size);
    80         memset(storage,0,size);
    81         return new(storage) Index(tableSize);
    82     }
    83     
    84     const Index* Index::indexMappedAt (const void* ptr) {
    85         const Index *index = (const Index*)ptr;
    86         index->validate();
    87         return index;
    88     }
    89     
    90     void Index::validate() const  throw(File::Error) {
    91         if (_magicNumber != kIndexMagicNumber)
    92             throw File::Error(ERANGE, "Bad Index (wrong magic number)");
    93         if (_tableSize < 2 || _count > _tableSize)
    94             throw File::Error(ERANGE, "Bad Index (wrong count or tableSize)");
    95 #if PROBE_QUADRATIC
    96         if ((_tableSize & (_tableSize-1)) != 0)
    97             throw File::Error(ERANGE, "Bad Index (size not power of 2)");
    98 #endif
    99         if (size() != sizeof(Index)+_tableSize*sizeof(Index::Entry))
   100             throw File::Error(ERANGE, "Bad Index (wrong size)");
   101     }
   102     
   103     void Index::validateEntries (const File *file) const  throw(File::Error) {
   104         validate();
   105         uint32_t size = _tableSize;
   106         uint32_t count = 0;
   107         const Entry *entry = table(0);
   108         for (uint32_t i=0; i<size; i++, entry++) {
   109             if (entry->hasValue()) {
   110                 count++;
   111                 const KeyValueChunk *chunk = entry->value(file);
   112                 if ((void*)chunk->end() > (void*)this)
   113                     throw File::Error(ERANGE, "Bad Index entry (points beyond index)");
   114                 chunk->validate();
   115                 if (entry->hash() != Key::computeHash(chunk->key()))
   116                     throw File::Error(ERANGE, "Bad Index entry (wrong hash code)");
   117             }
   118         }
   119         if (count != _count)
   120             throw File::Error(ERANGE, "Bad Index (inconsistent count)");
   121     }
   122     
   123     
   124 #pragma mark -
   125 #pragma mark LOOKUP:
   126     
   127     const Index::Entry* Index::table (int index) const {
   128         return &((const Index::Entry*)&_table)[index];
   129     }
   130     
   131     Index::Entry* Index::table (int index) {
   132         return &((Index::Entry*)&_table)[index];
   133     }
   134     
   135         
   136     const KeyValueChunk* Index::_find (File *file, Key key) const {
   137         int index = key.hash % _tableSize;
   138         LittleEndian<HashCode> lhash = key.hash;
   139 #if PROBE_QUADRATIC
   140         int probe = 0;
   141 #endif
   142         for(;;) {
   143             const Index::Entry *found = table(index);
   144             if (found->isAvailable()) {
   145                 return NULL;
   146             } else if (found->hasHash(lhash)) {
   147                 const KeyValueChunk *value = found->value(file);
   148                 if (value->hasKey(key)) {
   149                     return value;
   150                 }
   151             }
   152 #if PROBE_QUADRATIC
   153             index = (index + ++probe) % _tableSize;
   154 #else
   155             index = (index + 1) % _tableSize;
   156 #endif
   157         }
   158     }
   159     
   160     Blob Index::get (File *file, Key key) const {
   161         const KeyValueChunk *kv = _find(file, key);
   162         return kv ?kv->value() :Blob();
   163     }
   164     
   165 
   166     // Common implementation of put and delete. (If deleting, valuePosition will be kDeletedPosition.)
   167     // While searching, entries past maxPosition should be considered newly-added and not compared,
   168     // since they're not in the memory-mapped part of the file yet.
   169     // Returns true if the entry was added/removed.
   170     bool Index::_put (File *file, Key key, FilePosition valuePosition, FilePosition maxPosition) {
   171         bool result = (valuePosition <= Entry::kDeletedPosition);
   172         int index = key.hash % _tableSize;
   173         Index::Entry entry(key.hash, valuePosition);
   174 #if PROBE_QUADRATIC
   175         int probe = 0;
   176 #endif
   177         Index::Entry *found;
   178         for(;;) {
   179             found = table(index);
   180             if (!found->hasValue()) {
   181                 if (entry.hasValue()) {
   182                     result = true;
   183                     break;                  // it's a put and we found an empty entry
   184                 } else if (found->isAvailable()) {
   185                     return false;           // it's a delete but the key wasn't found
   186                 }
   187             }
   188             else if (found->hasHash(entry.hash())) {
   189                 if (!file)
   190                     break;                  // no file given, so assume no duplicate keys
   191                 else if (found->valuePosition() < maxPosition && found->value(file)->hasKey(key))
   192                     break;                  // found existing (not newly-inserted) entry with the matching key
   193                 /*else
   194                     printf("** hash collision! hash=%u\n", key.hash);*/
   195             }
   196 #if PROBE_QUADRATIC
   197             index = (index + ++probe) % _tableSize;
   198 #else
   199             index = (index + 1) % _tableSize;
   200 #endif
   201         }
   202         // Finally found where to put it:
   203         *found = entry;
   204         return result;
   205     }
   206         
   207     bool Index::put (File *file, Key key, FilePosition valuePosition, FilePosition maxPosition) {
   208         bool added = _put(file, key, valuePosition, maxPosition);
   209         if (added)
   210             ++_count;
   211         return added;
   212     }
   213     
   214     bool Index::remove (File *file, Key key, FilePosition maxPosition) {
   215         bool removed = _put(file, key, Index::Entry::kDeletedPosition, maxPosition);
   216         if (removed) {
   217             assert(_count > 0);
   218             --_count;
   219         }
   220         return removed;
   221     }
   222     
   223     void Index::removeAll () {
   224         memset(table(0), 0, _tableSize*sizeof(Entry));
   225         _count = 0;
   226     }
   227     
   228     
   229     void Index::copyFrom (File *file, const Index *index) {
   230         const Entry *src = index->table(0);
   231         if (index->_tableSize == _tableSize) {
   232             memcpy(table(0), src, _tableSize*sizeof(Entry));
   233         } else {
   234             removeAll();
   235             for (uint32_t i=0; i<index->_tableSize; i++,src++)
   236                 if (src->hasValue())
   237                     _put(NULL, Key(NULL,0,src->hash()), src->valuePosition(), UINT32_MAX);
   238         }
   239         _count = index->count();
   240     }        
   241     
   242 
   243 
   244     Index::Iterator::Iterator (File *file, const Index *index)
   245         :_file(file),
   246          _current(index->table(-1)),
   247          _end(index->table(index->_tableSize))
   248     {
   249         next();
   250     }
   251     
   252     bool Index::Iterator::hasValue() const  {return _current != NULL;}
   253     Key Index::Iterator::key() const        {return Key(_current->value(_file)->key(), _current->hash());}
   254     Blob Index::Iterator::value() const     {return _current->value(_file)->value();}
   255     
   256     bool Index::Iterator::next() {
   257         if (_current) {
   258             while (++_current < _end) {
   259                 if (_current->hasValue())
   260                     return true;
   261             }
   262             _current = NULL;
   263         }
   264         return false;
   265     }
   266     
   267     FilePosition Index::Iterator::valuePosition() {
   268         return _current ?_current->valuePosition() :0;
   269     }
   270 }