src/Index.cpp
author jbm@humility
Mon Sep 28 23:39:08 2009 -0700 (2009-09-28)
changeset 8 21a6c17f4e3e
parent 2 851de24ecb61
child 9 629f61203db1
permissions -rw-r--r--
jbm: it compiles... but bombs in unit tests
     1 /*
     2  *  Index.cpp
     3  *  Ottoman
     4  *
     5  *  Created by Jens Alfke on 8/27/09.
     6  *  Copyright 2009 Jens Alfke. All rights reserved.
     7  *  BSD-Licensed: See the file "LICENSE.txt" for details.
     8  */
     9 
    10 #include "Index.h"
    11 #include "Chunk.h"
    12 #include "File.h"
    13 #include "Dictionary.h"
    14 #include <assert.h>
    15 #include <errno.h>
    16 #include <algorithm>
    17 #include <math.h>
    18 
    19 #define __STDC_LIMIT_MACROS /* I <3 you, C99 */
    20 # define UINT32_MAX             (4294967295U)
    21  
    22 #include <stdint.h>
    23 
    24 
    25 namespace Mooseyard {
    26         
    27     // Quadratic probing results in fewer bucket tests on lookup, but makes the hash tables
    28     // significantly larger, which increases the file size. In practice a smaller table size seems
    29     // to be a bigger win since disk I/O is slow.
    30     #define PROBE_QUADRATIC 0
    31 
    32     // Similarly use a higher max load than for memory-resident tables, to save space.
    33     static const float kMaxLoadFactor = 0.85f;
    34 
    35     static const uint32_t kIndexMagicNumber = 0x54378543;
    36     
    37 
    38     /** An index (hash-table) entry as stored in the memory-mapped file. */
    39     class Index::Entry {
    40     public:
    41         explicit Entry()                        :_hash(0), _valuePosition(0) { }
    42         explicit Entry (HashCode h, FilePosition p)
    43                                                 :_hash(h), _valuePosition(p) { }
    44         LittleEndian<HashCode> hash() const       {return _hash;}
    45         bool hasHash (LittleEndian<HashCode> h) const {return _hash == h;}
    46         
    47         const KeyValueChunk* value (const File *file) const {
    48             return (const KeyValueChunk*) file->mappedPosition(_valuePosition);
    49         }
    50         FilePosition valuePosition() const      {return _valuePosition;}
    51         
    52         
    53         bool isAvailable() const                {return _valuePosition == 0;}   // false if deleted
    54         bool hasValue() const                   {return _valuePosition > 1;}    // false if deleted
    55         
    56         void markDeleted()                      {_valuePosition = kDeletedPosition;}
    57         void markEmpty()                        {_valuePosition = 0;}
    58         
    59         
    60         static const FilePosition kDeletedPosition = 1;
    61                 
    62     private:
    63         LittleEndian<HashCode> _hash;
    64         LittleEndian<FilePosition> _valuePosition;
    65     };
    66         
    67     
    68 #pragma mark -
    69 #pragma mark CREATION:
    70         
    71     Index::Index (uint32_t itsTableSize)
    72         :Chunk(sizeof(Index) + itsTableSize*sizeof(Index::Entry), kChunkType),
    73          _magicNumber(kIndexMagicNumber),
    74          _count(0),
    75          _tableSize(itsTableSize)
    76     { }
    77     
    78     Index* Index::create (int capacity) {
    79 #if PROBE_QUADRATIC
    80         int tableSize = Dictionary::choosePowerOfTwoSize(capacity);
    81 #else
    82         int tableSize = Dictionary::chooseAnyOldSize(capacity, kMaxLoadFactor);
    83 #endif
    84         size_t size = sizeof(Index) + tableSize*sizeof(Index::Entry);
    85         void* storage = ::operator new(size);
    86         memset(storage,0,size);
    87         return new(storage) Index(tableSize);
    88     }
    89     
    90     const Index* Index::indexMappedAt (const void* ptr) {
    91         const Index *index = (const Index*)ptr;
    92         index->validate();
    93         return index;
    94     }
    95     
    96     void Index::validate() const  throw(File::Error) {
    97         if (_magicNumber != kIndexMagicNumber)
    98             throw File::Error(ERANGE, "Bad Index (wrong magic number)");
    99         if (_tableSize < 2 || _count > _tableSize)
   100             throw File::Error(ERANGE, "Bad Index (wrong count or tableSize)");
   101 #if PROBE_QUADRATIC
   102         if ((_tableSize & (_tableSize-1)) != 0)
   103             throw File::Error(ERANGE, "Bad Index (size not power of 2)");
   104 #endif
   105         if (size() != sizeof(Index)+_tableSize*sizeof(Index::Entry))
   106             throw File::Error(ERANGE, "Bad Index (wrong size)");
   107     }
   108     
   109     void Index::validateEntries (const File *file) const  throw(File::Error) {
   110         validate();
   111         uint32_t size = _tableSize;
   112         uint32_t count = 0;
   113         const Entry *entry = table(0);
   114         for (uint32_t i=0; i<size; i++, entry++) {
   115             if (entry->hasValue()) {
   116                 count++;
   117                 const KeyValueChunk *chunk = entry->value(file);
   118                 if ((void*)chunk->end() > (void*)this)
   119                     throw File::Error(ERANGE, "Bad Index entry (points beyond index)");
   120                 chunk->validate();
   121                 if (entry->hash() != Key::computeHash(chunk->key()))
   122                     throw File::Error(ERANGE, "Bad Index entry (wrong hash code)");
   123             }
   124         }
   125         if (count != _count)
   126             throw File::Error(ERANGE, "Bad Index (inconsistent count)");
   127     }
   128     
   129     
   130 #pragma mark -
   131 #pragma mark LOOKUP:
   132     
   133     const Index::Entry* Index::table (int index) const {
   134         return &((const Index::Entry*)&_table)[index];
   135     }
   136     
   137     Index::Entry* Index::table (int index) {
   138         return &((Index::Entry*)&_table)[index];
   139     }
   140     
   141         
   142     const KeyValueChunk* Index::_find (File *file, Key key) const {
   143         int index = key.hash % _tableSize;
   144         LittleEndian<HashCode> lhash = key.hash;
   145 #if PROBE_QUADRATIC
   146         int probe = 0;
   147 #endif
   148         for(;;) {
   149             const Index::Entry *found = table(index);
   150             if (found->isAvailable()) {
   151                 return NULL;
   152             } else if (found->hasHash(lhash)) {
   153                 const KeyValueChunk *value = found->value(file);
   154                 if (value->hasKey(key)) {
   155                     return value;
   156                 }
   157             }
   158 #if PROBE_QUADRATIC
   159             index = (index + ++probe) % _tableSize;
   160 #else
   161             index = (index + 1) % _tableSize;
   162 #endif
   163         }
   164     }
   165     
   166     Blob Index::get (File *file, Key key) const {
   167         const KeyValueChunk *kv = _find(file, key);
   168         return kv ?kv->value() :Blob();
   169     }
   170     
   171 
   172     // Common implementation of put and delete. (If deleting, valuePosition will be kDeletedPosition.)
   173     // While searching, entries past maxPosition should be considered newly-added and not compared,
   174     // since they're not in the memory-mapped part of the file yet.
   175     // Returns true if the entry was added/removed.
   176     bool Index::_put (File *file, Key key, FilePosition valuePosition, FilePosition maxPosition) {
   177         bool result = (valuePosition <= Entry::kDeletedPosition);
   178         int index = key.hash % _tableSize;
   179         Index::Entry entry(key.hash, valuePosition);
   180 #if PROBE_QUADRATIC
   181         int probe = 0;
   182 #endif
   183         Index::Entry *found;
   184         for(;;) {
   185             found = table(index);
   186             if (!found->hasValue()) {
   187                 if (entry.hasValue()) {
   188                     result = true;
   189                     break;                  // it's a put and we found an empty entry
   190                 } else if (found->isAvailable()) {
   191                     return false;           // it's a delete but the key wasn't found
   192                 }
   193             }
   194             else if (found->hasHash(entry.hash())) {
   195                 if (!file)
   196                     break;                  // no file given, so assume no duplicate keys
   197                 else if (found->valuePosition() < maxPosition && found->value(file)->hasKey(key))
   198                     break;                  // found existing (not newly-inserted) entry with the matching key
   199                 /*else
   200                     printf("** hash collision! hash=%u\n", key.hash);*/
   201             }
   202 #if PROBE_QUADRATIC
   203             index = (index + ++probe) % _tableSize;
   204 #else
   205             index = (index + 1) % _tableSize;
   206 #endif
   207         }
   208         // Finally found where to put it:
   209         *found = entry;
   210         return result;
   211     }
   212         
   213     bool Index::put (File *file, Key key, FilePosition valuePosition, FilePosition maxPosition) {
   214         bool added = _put(file, key, valuePosition, maxPosition);
   215         if (added)
   216             ++_count;
   217         return added;
   218     }
   219     
   220     bool Index::remove (File *file, Key key, FilePosition maxPosition) {
   221         bool removed = _put(file, key, Index::Entry::kDeletedPosition, maxPosition);
   222         if (removed) {
   223             assert(_count > 0);
   224             --_count;
   225         }
   226         return removed;
   227     }
   228     
   229     void Index::removeAll () {
   230         memset(table(0), 0, _tableSize*sizeof(Entry));
   231         _count = 0;
   232     }
   233     
   234     
   235     void Index::copyFrom (File *file, const Index *index) {
   236         const Entry *src = index->table(0);
   237         if (index->_tableSize == _tableSize) {
   238             memcpy(table(0), src, _tableSize*sizeof(Entry));
   239         } else {
   240             removeAll();
   241             for (uint32_t i=0; i<index->_tableSize; i++,src++)
   242                 if (src->hasValue())
   243                     _put(NULL, Key(NULL,0,src->hash()), src->valuePosition(), UINT32_MAX);
   244         }
   245         _count = index->count();
   246     }        
   247     
   248 
   249 
   250     Index::Iterator::Iterator (File *file, const Index *index)
   251         :_file(file),
   252          _current(index->table(-1)),
   253          _end(index->table(index->_tableSize))
   254     {
   255         next();
   256     }
   257     
   258     bool Index::Iterator::hasValue() const  {return _current != NULL;}
   259     Key Index::Iterator::key() const        {return Key(_current->value(_file)->key(), _current->hash());}
   260     Blob Index::Iterator::value() const     {return _current->value(_file)->value();}
   261     
   262     bool Index::Iterator::next() {
   263         if (_current) {
   264             while (++_current < _end) {
   265                 if (_current->hasValue())
   266                     return true;
   267             }
   268             _current = NULL;
   269         }
   270         return false;
   271     }
   272     
   273     FilePosition Index::Iterator::valuePosition() {
   274         return _current ?_current->valuePosition() :0;
   275     }
   276 }