src/Index.cpp
author Jens Alfke <jens@mooseyard.com>
Tue Sep 29 15:46:42 2009 -0700 (2009-09-29)
changeset 9 629f61203db1
parent 8 21a6c17f4e3e
permissions -rw-r--r--
* Merged in jbm's changes for Linux compatibility, fixing Mac compatibility in the process :)
* Fixed two regressions having to do with the _previousTrailerPosition in VersionDictionary.cpp.
* Made sure RTTI is enabled in the OttomanTest target because gtest requires it.
     1 /*
     2  *  Index.cpp
     3  *  Ottoman
     4  *
     5  *  Created by Jens Alfke on 8/27/09.
     6  *  Copyright 2009 Jens Alfke. All rights reserved.
     7  *  BSD-Licensed: See the file "LICENSE.txt" for details.
     8  */
     9 
    10 #include "Index.h"
    11 #include "Chunk.h"
    12 #include "File.h"
    13 #include "Dictionary.h"
    14 #include <assert.h>
    15 #include <errno.h>
    16 #include <algorithm>
    17 #include <math.h>
    18 #include <stdint.h>
    19 
    20 
    21 namespace Mooseyard {
    22         
    23     // Quadratic probing results in fewer bucket tests on lookup, but makes the hash tables
    24     // significantly larger, which increases the file size. In practice a smaller table size seems
    25     // to be a bigger win since disk I/O is slow.
    26     #define PROBE_QUADRATIC 0
    27 
    28     // Similarly use a higher max load than for memory-resident tables, to save space.
    29     static const float kMaxLoadFactor = 0.85f;
    30 
    31     static const uint32_t kIndexMagicNumber = 0x54378543;
    32     
    33 
    34     /** An index (hash-table) entry as stored in the memory-mapped file. */
    35     class Index::Entry {
    36     public:
    37         explicit Entry()                        :_hash(0), _valuePosition(0) { }
    38         explicit Entry (HashCode h, FilePosition p)
    39                                                 :_hash(h), _valuePosition(p) { }
    40         LittleEndian<HashCode> hash() const       {return _hash;}
    41         bool hasHash (LittleEndian<HashCode> h) const {return _hash == h;}
    42         
    43         const KeyValueChunk* value (const File *file) const {
    44             return (const KeyValueChunk*) file->mappedPosition(_valuePosition);
    45         }
    46         FilePosition valuePosition() const      {return _valuePosition;}
    47         
    48         
    49         bool isAvailable() const                {return _valuePosition == 0;}   // false if deleted
    50         bool hasValue() const                   {return _valuePosition > 1;}    // false if deleted
    51         
    52         void markDeleted()                      {_valuePosition = kDeletedPosition;}
    53         void markEmpty()                        {_valuePosition = 0;}
    54         
    55         
    56         static const FilePosition kDeletedPosition = 1;
    57                 
    58     private:
    59         LittleEndian<HashCode> _hash;
    60         LittleEndian<FilePosition> _valuePosition;
    61     };
    62         
    63     
    64 #pragma mark -
    65 #pragma mark CREATION:
    66         
    67     Index::Index (uint32_t itsTableSize)
    68         :Chunk(sizeof(Index) + itsTableSize*sizeof(Index::Entry), kChunkType),
    69          _magicNumber(kIndexMagicNumber),
    70          _count(0),
    71          _tableSize(itsTableSize)
    72     { }
    73     
    74     Index* Index::create (int capacity) {
    75 #if PROBE_QUADRATIC
    76         int tableSize = Dictionary::choosePowerOfTwoSize(capacity);
    77 #else
    78         int tableSize = Dictionary::chooseAnyOldSize(capacity, kMaxLoadFactor);
    79 #endif
    80         size_t size = sizeof(Index) + tableSize*sizeof(Index::Entry);
    81         void* storage = ::operator new(size);
    82         memset(storage,0,size);
    83         return new(storage) Index(tableSize);
    84     }
    85     
    86     const Index* Index::indexMappedAt (const void* ptr) {
    87         const Index *index = (const Index*)ptr;
    88         index->validate();
    89         return index;
    90     }
    91     
    92     void Index::validate() const  throw(File::Error) {
    93         if (_magicNumber != kIndexMagicNumber)
    94             throw File::Error(ERANGE, "Bad Index (wrong magic number)");
    95         if (_tableSize < 2 || _count > _tableSize)
    96             throw File::Error(ERANGE, "Bad Index (wrong count or tableSize)");
    97 #if PROBE_QUADRATIC
    98         if ((_tableSize & (_tableSize-1)) != 0)
    99             throw File::Error(ERANGE, "Bad Index (size not power of 2)");
   100 #endif
   101         if (size() != sizeof(Index)+_tableSize*sizeof(Index::Entry))
   102             throw File::Error(ERANGE, "Bad Index (wrong size)");
   103     }
   104     
   105     void Index::validateEntries (const File *file) const  throw(File::Error) {
   106         validate();
   107         uint32_t size = _tableSize;
   108         uint32_t count = 0;
   109         const Entry *entry = table(0);
   110         for (uint32_t i=0; i<size; i++, entry++) {
   111             if (entry->hasValue()) {
   112                 count++;
   113                 const KeyValueChunk *chunk = entry->value(file);
   114                 if ((void*)chunk->end() > (void*)this)
   115                     throw File::Error(ERANGE, "Bad Index entry (points beyond index)");
   116                 chunk->validate();
   117                 if (entry->hash() != Key::computeHash(chunk->key()))
   118                     throw File::Error(ERANGE, "Bad Index entry (wrong hash code)");
   119             }
   120         }
   121         if (count != _count)
   122             throw File::Error(ERANGE, "Bad Index (inconsistent count)");
   123     }
   124     
   125     
   126 #pragma mark -
   127 #pragma mark LOOKUP:
   128     
   129     const Index::Entry* Index::table (int index) const {
   130         return &((const Index::Entry*)&_table)[index];
   131     }
   132     
   133     Index::Entry* Index::table (int index) {
   134         return &((Index::Entry*)&_table)[index];
   135     }
   136     
   137         
   138     const KeyValueChunk* Index::_find (File *file, Key key) const {
   139         int index = key.hash % _tableSize;
   140         LittleEndian<HashCode> lhash = key.hash;
   141 #if PROBE_QUADRATIC
   142         int probe = 0;
   143 #endif
   144         for(;;) {
   145             const Index::Entry *found = table(index);
   146             if (found->isAvailable()) {
   147                 return NULL;
   148             } else if (found->hasHash(lhash)) {
   149                 const KeyValueChunk *value = found->value(file);
   150                 if (value->hasKey(key)) {
   151                     return value;
   152                 }
   153             }
   154 #if PROBE_QUADRATIC
   155             index = (index + ++probe) % _tableSize;
   156 #else
   157             index = (index + 1) % _tableSize;
   158 #endif
   159         }
   160     }
   161     
   162     Blob Index::get (File *file, Key key) const {
   163         const KeyValueChunk *kv = _find(file, key);
   164         return kv ?kv->value() :Blob();
   165     }
   166     
   167 
   168     // Common implementation of put and delete. (If deleting, valuePosition will be kDeletedPosition.)
   169     // While searching, entries past maxPosition should be considered newly-added and not compared,
   170     // since they're not in the memory-mapped part of the file yet.
   171     // Returns true if the entry was added/removed.
   172     bool Index::_put (File *file, Key key, FilePosition valuePosition, FilePosition maxPosition) {
   173         bool result = (valuePosition <= Entry::kDeletedPosition);
   174         int index = key.hash % _tableSize;
   175         Index::Entry entry(key.hash, valuePosition);
   176 #if PROBE_QUADRATIC
   177         int probe = 0;
   178 #endif
   179         Index::Entry *found;
   180         for(;;) {
   181             found = table(index);
   182             if (!found->hasValue()) {
   183                 if (entry.hasValue()) {
   184                     result = true;
   185                     break;                  // it's a put and we found an empty entry
   186                 } else if (found->isAvailable()) {
   187                     return false;           // it's a delete but the key wasn't found
   188                 }
   189             }
   190             else if (found->hasHash(entry.hash())) {
   191                 if (!file)
   192                     break;                  // no file given, so assume no duplicate keys
   193                 else if (found->valuePosition() < maxPosition && found->value(file)->hasKey(key))
   194                     break;                  // found existing (not newly-inserted) entry with the matching key
   195                 /*else
   196                     printf("** hash collision! hash=%u\n", key.hash);*/
   197             }
   198 #if PROBE_QUADRATIC
   199             index = (index + ++probe) % _tableSize;
   200 #else
   201             index = (index + 1) % _tableSize;
   202 #endif
   203         }
   204         // Finally found where to put it:
   205         *found = entry;
   206         return result;
   207     }
   208         
   209     bool Index::put (File *file, Key key, FilePosition valuePosition, FilePosition maxPosition) {
   210         bool added = _put(file, key, valuePosition, maxPosition);
   211         if (added)
   212             ++_count;
   213         return added;
   214     }
   215     
   216     bool Index::remove (File *file, Key key, FilePosition maxPosition) {
   217         bool removed = _put(file, key, Index::Entry::kDeletedPosition, maxPosition);
   218         if (removed) {
   219             assert(_count > 0);
   220             --_count;
   221         }
   222         return removed;
   223     }
   224     
   225     void Index::removeAll () {
   226         memset(table(0), 0, _tableSize*sizeof(Entry));
   227         _count = 0;
   228     }
   229     
   230     
   231     void Index::copyFrom (File *file, const Index *index) {
   232         const Entry *src = index->table(0);
   233         if (index->_tableSize == _tableSize) {
   234             memcpy(table(0), src, _tableSize*sizeof(Entry));
   235         } else {
   236             removeAll();
   237             for (uint32_t i=0; i<index->_tableSize; i++,src++)
   238                 if (src->hasValue())
   239                     _put(NULL, Key(NULL,0,src->hash()), src->valuePosition(), UINT32_MAX);
   240         }
   241         _count = index->count();
   242     }        
   243     
   244 
   245 
   246     Index::Iterator::Iterator (File *file, const Index *index)
   247         :_file(file),
   248          _current(index->table(-1)),
   249          _end(index->table(index->_tableSize))
   250     {
   251         next();
   252     }
   253     
   254     bool Index::Iterator::hasValue() const  {return _current != NULL;}
   255     Key Index::Iterator::key() const        {return Key(_current->value(_file)->key(), _current->hash());}
   256     Blob Index::Iterator::value() const     {return _current->value(_file)->value();}
   257     
   258     bool Index::Iterator::next() {
   259         if (_current) {
   260             while (++_current < _end) {
   261                 if (_current->hasValue())
   262                     return true;
   263             }
   264             _current = NULL;
   265         }
   266         return false;
   267     }
   268     
   269     FilePosition Index::Iterator::valuePosition() {
   270         return _current ?_current->valuePosition() :0;
   271     }
   272 }