src/Index.cpp
author Jens Alfke <jens@mooseyard.com>
Tue Sep 29 15:46:42 2009 -0700 (2009-09-29)
changeset 9 629f61203db1
parent 8 21a6c17f4e3e
permissions -rw-r--r--
* Merged in jbm's changes for Linux compatibility, fixing Mac compatibility in the process :)
* Fixed two regressions having to do with the _previousTrailerPosition in VersionDictionary.cpp.
* Made sure RTTI is enabled in the OttomanTest target because gtest requires it.
jens@0
     1
/*
jens@0
     2
 *  Index.cpp
jens@0
     3
 *  Ottoman
jens@0
     4
 *
jens@0
     5
 *  Created by Jens Alfke on 8/27/09.
jens@0
     6
 *  Copyright 2009 Jens Alfke. All rights reserved.
jens@0
     7
 *  BSD-Licensed: See the file "LICENSE.txt" for details.
jens@0
     8
 */
jens@0
     9
jens@0
    10
#include "Index.h"
jens@0
    11
#include "Chunk.h"
jens@0
    12
#include "File.h"
jens@0
    13
#include "Dictionary.h"
jens@0
    14
#include <assert.h>
jens@0
    15
#include <errno.h>
jens@0
    16
#include <algorithm>
jens@0
    17
#include <math.h>
jbm@8
    18
#include <stdint.h>
jbm@8
    19
jbm@8
    20
jens@0
    21
namespace Mooseyard {
jens@0
    22
        
jens@0
    23
    // Quadratic probing results in fewer bucket tests on lookup, but makes the hash tables
jens@0
    24
    // significantly larger, which increases the file size. In practice a smaller table size seems
jens@0
    25
    // to be a bigger win since disk I/O is slow.
jens@0
    26
    #define PROBE_QUADRATIC 0
jens@0
    27
jens@0
    28
    // Similarly use a higher max load than for memory-resident tables, to save space.
jens@2
    29
    static const float kMaxLoadFactor = 0.85f;
jens@0
    30
jens@0
    31
    static const uint32_t kIndexMagicNumber = 0x54378543;
jens@0
    32
    
jens@0
    33
jens@0
    34
    /** An index (hash-table) entry as stored in the memory-mapped file. */
jens@0
    35
    class Index::Entry {
jens@0
    36
    public:
jens@0
    37
        explicit Entry()                        :_hash(0), _valuePosition(0) { }
jens@0
    38
        explicit Entry (HashCode h, FilePosition p)
jens@0
    39
                                                :_hash(h), _valuePosition(p) { }
jens@0
    40
        LittleEndian<HashCode> hash() const       {return _hash;}
jens@0
    41
        bool hasHash (LittleEndian<HashCode> h) const {return _hash == h;}
jens@0
    42
        
jens@0
    43
        const KeyValueChunk* value (const File *file) const {
jens@0
    44
            return (const KeyValueChunk*) file->mappedPosition(_valuePosition);
jens@0
    45
        }
jens@0
    46
        FilePosition valuePosition() const      {return _valuePosition;}
jens@0
    47
        
jens@0
    48
        
jens@0
    49
        bool isAvailable() const                {return _valuePosition == 0;}   // false if deleted
jens@0
    50
        bool hasValue() const                   {return _valuePosition > 1;}    // false if deleted
jens@0
    51
        
jens@0
    52
        void markDeleted()                      {_valuePosition = kDeletedPosition;}
jens@0
    53
        void markEmpty()                        {_valuePosition = 0;}
jens@0
    54
        
jens@0
    55
        
jens@0
    56
        static const FilePosition kDeletedPosition = 1;
jens@0
    57
                
jens@0
    58
    private:
jens@0
    59
        LittleEndian<HashCode> _hash;
jens@0
    60
        LittleEndian<FilePosition> _valuePosition;
jens@0
    61
    };
jens@0
    62
        
jens@0
    63
    
jens@0
    64
#pragma mark -
jens@0
    65
#pragma mark CREATION:
jens@0
    66
        
jens@0
    67
    Index::Index (uint32_t itsTableSize)
jens@0
    68
        :Chunk(sizeof(Index) + itsTableSize*sizeof(Index::Entry), kChunkType),
jens@0
    69
         _magicNumber(kIndexMagicNumber),
jens@0
    70
         _count(0),
jens@0
    71
         _tableSize(itsTableSize)
jens@0
    72
    { }
jens@0
    73
    
jens@0
    74
    Index* Index::create (int capacity) {
jens@0
    75
#if PROBE_QUADRATIC
jens@0
    76
        int tableSize = Dictionary::choosePowerOfTwoSize(capacity);
jens@0
    77
#else
jens@0
    78
        int tableSize = Dictionary::chooseAnyOldSize(capacity, kMaxLoadFactor);
jens@0
    79
#endif
jens@0
    80
        size_t size = sizeof(Index) + tableSize*sizeof(Index::Entry);
jens@0
    81
        void* storage = ::operator new(size);
jens@0
    82
        memset(storage,0,size);
jens@0
    83
        return new(storage) Index(tableSize);
jens@0
    84
    }
jens@0
    85
    
jens@0
    86
    const Index* Index::indexMappedAt (const void* ptr) {
jens@0
    87
        const Index *index = (const Index*)ptr;
jens@0
    88
        index->validate();
jens@0
    89
        return index;
jens@0
    90
    }
jens@0
    91
    
jens@0
    92
    void Index::validate() const  throw(File::Error) {
jens@0
    93
        if (_magicNumber != kIndexMagicNumber)
jens@0
    94
            throw File::Error(ERANGE, "Bad Index (wrong magic number)");
jens@0
    95
        if (_tableSize < 2 || _count > _tableSize)
jens@0
    96
            throw File::Error(ERANGE, "Bad Index (wrong count or tableSize)");
jens@0
    97
#if PROBE_QUADRATIC
jens@0
    98
        if ((_tableSize & (_tableSize-1)) != 0)
jens@0
    99
            throw File::Error(ERANGE, "Bad Index (size not power of 2)");
jens@0
   100
#endif
jens@0
   101
        if (size() != sizeof(Index)+_tableSize*sizeof(Index::Entry))
jens@0
   102
            throw File::Error(ERANGE, "Bad Index (wrong size)");
jens@0
   103
    }
jens@0
   104
    
jens@0
   105
    void Index::validateEntries (const File *file) const  throw(File::Error) {
jens@0
   106
        validate();
jens@0
   107
        uint32_t size = _tableSize;
jens@0
   108
        uint32_t count = 0;
jens@0
   109
        const Entry *entry = table(0);
jens@0
   110
        for (uint32_t i=0; i<size; i++, entry++) {
jens@0
   111
            if (entry->hasValue()) {
jens@0
   112
                count++;
jens@0
   113
                const KeyValueChunk *chunk = entry->value(file);
jens@0
   114
                if ((void*)chunk->end() > (void*)this)
jens@0
   115
                    throw File::Error(ERANGE, "Bad Index entry (points beyond index)");
jens@0
   116
                chunk->validate();
jens@0
   117
                if (entry->hash() != Key::computeHash(chunk->key()))
jens@0
   118
                    throw File::Error(ERANGE, "Bad Index entry (wrong hash code)");
jens@0
   119
            }
jens@0
   120
        }
jens@0
   121
        if (count != _count)
jens@0
   122
            throw File::Error(ERANGE, "Bad Index (inconsistent count)");
jens@0
   123
    }
jens@0
   124
    
jens@0
   125
    
jens@0
   126
#pragma mark -
jens@0
   127
#pragma mark LOOKUP:
jens@0
   128
    
jens@0
   129
    const Index::Entry* Index::table (int index) const {
jens@0
   130
        return &((const Index::Entry*)&_table)[index];
jens@0
   131
    }
jens@0
   132
    
jens@0
   133
    Index::Entry* Index::table (int index) {
jens@0
   134
        return &((Index::Entry*)&_table)[index];
jens@0
   135
    }
jens@0
   136
    
jens@0
   137
        
jens@0
   138
    const KeyValueChunk* Index::_find (File *file, Key key) const {
jens@0
   139
        int index = key.hash % _tableSize;
jens@0
   140
        LittleEndian<HashCode> lhash = key.hash;
jens@0
   141
#if PROBE_QUADRATIC
jens@0
   142
        int probe = 0;
jens@0
   143
#endif
jens@0
   144
        for(;;) {
jens@0
   145
            const Index::Entry *found = table(index);
jens@0
   146
            if (found->isAvailable()) {
jens@0
   147
                return NULL;
jens@0
   148
            } else if (found->hasHash(lhash)) {
jens@0
   149
                const KeyValueChunk *value = found->value(file);
jens@0
   150
                if (value->hasKey(key)) {
jens@0
   151
                    return value;
jens@0
   152
                }
jens@0
   153
            }
jens@0
   154
#if PROBE_QUADRATIC
jens@0
   155
            index = (index + ++probe) % _tableSize;
jens@0
   156
#else
jens@0
   157
            index = (index + 1) % _tableSize;
jens@0
   158
#endif
jens@0
   159
        }
jens@0
   160
    }
jens@0
   161
    
jens@0
   162
    Blob Index::get (File *file, Key key) const {
jens@0
   163
        const KeyValueChunk *kv = _find(file, key);
jens@0
   164
        return kv ?kv->value() :Blob();
jens@0
   165
    }
jens@0
   166
    
jens@0
   167
jens@0
   168
    // Common implementation of put and delete. (If deleting, valuePosition will be kDeletedPosition.)
jens@0
   169
    // While searching, entries past maxPosition should be considered newly-added and not compared,
jens@0
   170
    // since they're not in the memory-mapped part of the file yet.
jens@0
   171
    // Returns true if the entry was added/removed.
jens@0
   172
    bool Index::_put (File *file, Key key, FilePosition valuePosition, FilePosition maxPosition) {
jens@0
   173
        bool result = (valuePosition <= Entry::kDeletedPosition);
jens@0
   174
        int index = key.hash % _tableSize;
jens@0
   175
        Index::Entry entry(key.hash, valuePosition);
jens@0
   176
#if PROBE_QUADRATIC
jens@0
   177
        int probe = 0;
jens@0
   178
#endif
jens@0
   179
        Index::Entry *found;
jens@0
   180
        for(;;) {
jens@0
   181
            found = table(index);
jens@0
   182
            if (!found->hasValue()) {
jens@0
   183
                if (entry.hasValue()) {
jens@0
   184
                    result = true;
jens@0
   185
                    break;                  // it's a put and we found an empty entry
jens@0
   186
                } else if (found->isAvailable()) {
jens@0
   187
                    return false;           // it's a delete but the key wasn't found
jens@0
   188
                }
jens@0
   189
            }
jens@0
   190
            else if (found->hasHash(entry.hash())) {
jens@0
   191
                if (!file)
jens@0
   192
                    break;                  // no file given, so assume no duplicate keys
jens@0
   193
                else if (found->valuePosition() < maxPosition && found->value(file)->hasKey(key))
jens@0
   194
                    break;                  // found existing (not newly-inserted) entry with the matching key
jens@0
   195
                /*else
jens@0
   196
                    printf("** hash collision! hash=%u\n", key.hash);*/
jens@0
   197
            }
jens@0
   198
#if PROBE_QUADRATIC
jens@0
   199
            index = (index + ++probe) % _tableSize;
jens@0
   200
#else
jens@0
   201
            index = (index + 1) % _tableSize;
jens@0
   202
#endif
jens@0
   203
        }
jens@0
   204
        // Finally found where to put it:
jens@0
   205
        *found = entry;
jens@0
   206
        return result;
jens@0
   207
    }
jens@0
   208
        
jens@0
   209
    bool Index::put (File *file, Key key, FilePosition valuePosition, FilePosition maxPosition) {
jens@0
   210
        bool added = _put(file, key, valuePosition, maxPosition);
jens@0
   211
        if (added)
jens@0
   212
            ++_count;
jens@0
   213
        return added;
jens@0
   214
    }
jens@0
   215
    
jens@0
   216
    bool Index::remove (File *file, Key key, FilePosition maxPosition) {
jens@0
   217
        bool removed = _put(file, key, Index::Entry::kDeletedPosition, maxPosition);
jens@0
   218
        if (removed) {
jens@0
   219
            assert(_count > 0);
jens@0
   220
            --_count;
jens@0
   221
        }
jens@0
   222
        return removed;
jens@0
   223
    }
jens@0
   224
    
jens@0
   225
    void Index::removeAll () {
jens@0
   226
        memset(table(0), 0, _tableSize*sizeof(Entry));
jens@0
   227
        _count = 0;
jens@0
   228
    }
jens@0
   229
    
jens@0
   230
    
jens@0
   231
    void Index::copyFrom (File *file, const Index *index) {
jens@0
   232
        const Entry *src = index->table(0);
jens@0
   233
        if (index->_tableSize == _tableSize) {
jens@0
   234
            memcpy(table(0), src, _tableSize*sizeof(Entry));
jens@0
   235
        } else {
jens@0
   236
            removeAll();
jens@0
   237
            for (uint32_t i=0; i<index->_tableSize; i++,src++)
jens@0
   238
                if (src->hasValue())
jens@0
   239
                    _put(NULL, Key(NULL,0,src->hash()), src->valuePosition(), UINT32_MAX);
jens@0
   240
        }
jens@0
   241
        _count = index->count();
jens@0
   242
    }        
jens@0
   243
    
jens@0
   244
jens@0
   245
jens@0
   246
    Index::Iterator::Iterator (File *file, const Index *index)
jens@0
   247
        :_file(file),
jens@0
   248
         _current(index->table(-1)),
jens@0
   249
         _end(index->table(index->_tableSize))
jens@0
   250
    {
jens@0
   251
        next();
jens@0
   252
    }
jens@0
   253
    
jens@0
   254
    bool Index::Iterator::hasValue() const  {return _current != NULL;}
jens@0
   255
    Key Index::Iterator::key() const        {return Key(_current->value(_file)->key(), _current->hash());}
jens@0
   256
    Blob Index::Iterator::value() const     {return _current->value(_file)->value();}
jens@0
   257
    
jens@0
   258
    bool Index::Iterator::next() {
jens@0
   259
        if (_current) {
jens@0
   260
            while (++_current < _end) {
jens@0
   261
                if (_current->hasValue())
jens@0
   262
                    return true;
jens@0
   263
            }
jens@0
   264
            _current = NULL;
jens@0
   265
        }
jens@0
   266
        return false;
jens@0
   267
    }
jens@0
   268
    
jens@0
   269
    FilePosition Index::Iterator::valuePosition() {
jens@0
   270
        return _current ?_current->valuePosition() :0;
jens@0
   271
    }
jens@0
   272
}