src/Index.cpp
author Jens Alfke <jens@mooseyard.com>
Sun Sep 20 21:25:47 2009 -0700 (2009-09-20)
changeset 2 851de24ecb61
parent 0 31a43d94cc26
child 8 21a6c17f4e3e
permissions -rw-r--r--
Added library target, changed some build settings, and fixed 64-to-32-bit conversion warnings.
jens@0
     1
/*
jens@0
     2
 *  Index.cpp
jens@0
     3
 *  Ottoman
jens@0
     4
 *
jens@0
     5
 *  Created by Jens Alfke on 8/27/09.
jens@0
     6
 *  Copyright 2009 Jens Alfke. All rights reserved.
jens@0
     7
 *  BSD-Licensed: See the file "LICENSE.txt" for details.
jens@0
     8
 */
jens@0
     9
jens@0
    10
#include "Index.h"
jens@0
    11
#include "Chunk.h"
jens@0
    12
#include "File.h"
jens@0
    13
#include "Dictionary.h"
jens@0
    14
#include <assert.h>
jens@0
    15
#include <errno.h>
jens@0
    16
#include <algorithm>
jens@0
    17
#include <math.h>
jens@0
    18
jens@0
    19
namespace Mooseyard {
jens@0
    20
        
jens@0
    21
    // Quadratic probing results in fewer bucket tests on lookup, but makes the hash tables
jens@0
    22
    // significantly larger, which increases the file size. In practice a smaller table size seems
jens@0
    23
    // to be a bigger win since disk I/O is slow.
jens@0
    24
    #define PROBE_QUADRATIC 0
jens@0
    25
jens@0
    26
    // Similarly use a higher max load than for memory-resident tables, to save space.
jens@2
    27
    static const float kMaxLoadFactor = 0.85f;
jens@0
    28
jens@0
    29
    static const uint32_t kIndexMagicNumber = 0x54378543;
jens@0
    30
    
jens@0
    31
jens@0
    32
    /** An index (hash-table) entry as stored in the memory-mapped file. */
jens@0
    33
    class Index::Entry {
jens@0
    34
    public:
jens@0
    35
        explicit Entry()                        :_hash(0), _valuePosition(0) { }
jens@0
    36
        explicit Entry (HashCode h, FilePosition p)
jens@0
    37
                                                :_hash(h), _valuePosition(p) { }
jens@0
    38
        LittleEndian<HashCode> hash() const       {return _hash;}
jens@0
    39
        bool hasHash (LittleEndian<HashCode> h) const {return _hash == h;}
jens@0
    40
        
jens@0
    41
        const KeyValueChunk* value (const File *file) const {
jens@0
    42
            return (const KeyValueChunk*) file->mappedPosition(_valuePosition);
jens@0
    43
        }
jens@0
    44
        FilePosition valuePosition() const      {return _valuePosition;}
jens@0
    45
        
jens@0
    46
        
jens@0
    47
        bool isAvailable() const                {return _valuePosition == 0;}   // false if deleted
jens@0
    48
        bool hasValue() const                   {return _valuePosition > 1;}    // false if deleted
jens@0
    49
        
jens@0
    50
        void markDeleted()                      {_valuePosition = kDeletedPosition;}
jens@0
    51
        void markEmpty()                        {_valuePosition = 0;}
jens@0
    52
        
jens@0
    53
        
jens@0
    54
        static const FilePosition kDeletedPosition = 1;
jens@0
    55
                
jens@0
    56
    private:
jens@0
    57
        LittleEndian<HashCode> _hash;
jens@0
    58
        LittleEndian<FilePosition> _valuePosition;
jens@0
    59
    };
jens@0
    60
        
jens@0
    61
    
jens@0
    62
#pragma mark -
jens@0
    63
#pragma mark CREATION:
jens@0
    64
        
jens@0
    65
    Index::Index (uint32_t itsTableSize)
jens@0
    66
        :Chunk(sizeof(Index) + itsTableSize*sizeof(Index::Entry), kChunkType),
jens@0
    67
         _magicNumber(kIndexMagicNumber),
jens@0
    68
         _count(0),
jens@0
    69
         _tableSize(itsTableSize)
jens@0
    70
    { }
jens@0
    71
    
jens@0
    72
    Index* Index::create (int capacity) {
jens@0
    73
#if PROBE_QUADRATIC
jens@0
    74
        int tableSize = Dictionary::choosePowerOfTwoSize(capacity);
jens@0
    75
#else
jens@0
    76
        int tableSize = Dictionary::chooseAnyOldSize(capacity, kMaxLoadFactor);
jens@0
    77
#endif
jens@0
    78
        size_t size = sizeof(Index) + tableSize*sizeof(Index::Entry);
jens@0
    79
        void* storage = ::operator new(size);
jens@0
    80
        memset(storage,0,size);
jens@0
    81
        return new(storage) Index(tableSize);
jens@0
    82
    }
jens@0
    83
    
jens@0
    84
    const Index* Index::indexMappedAt (const void* ptr) {
jens@0
    85
        const Index *index = (const Index*)ptr;
jens@0
    86
        index->validate();
jens@0
    87
        return index;
jens@0
    88
    }
jens@0
    89
    
jens@0
    90
    void Index::validate() const  throw(File::Error) {
jens@0
    91
        if (_magicNumber != kIndexMagicNumber)
jens@0
    92
            throw File::Error(ERANGE, "Bad Index (wrong magic number)");
jens@0
    93
        if (_tableSize < 2 || _count > _tableSize)
jens@0
    94
            throw File::Error(ERANGE, "Bad Index (wrong count or tableSize)");
jens@0
    95
#if PROBE_QUADRATIC
jens@0
    96
        if ((_tableSize & (_tableSize-1)) != 0)
jens@0
    97
            throw File::Error(ERANGE, "Bad Index (size not power of 2)");
jens@0
    98
#endif
jens@0
    99
        if (size() != sizeof(Index)+_tableSize*sizeof(Index::Entry))
jens@0
   100
            throw File::Error(ERANGE, "Bad Index (wrong size)");
jens@0
   101
    }
jens@0
   102
    
jens@0
   103
    void Index::validateEntries (const File *file) const  throw(File::Error) {
jens@0
   104
        validate();
jens@0
   105
        uint32_t size = _tableSize;
jens@0
   106
        uint32_t count = 0;
jens@0
   107
        const Entry *entry = table(0);
jens@0
   108
        for (uint32_t i=0; i<size; i++, entry++) {
jens@0
   109
            if (entry->hasValue()) {
jens@0
   110
                count++;
jens@0
   111
                const KeyValueChunk *chunk = entry->value(file);
jens@0
   112
                if ((void*)chunk->end() > (void*)this)
jens@0
   113
                    throw File::Error(ERANGE, "Bad Index entry (points beyond index)");
jens@0
   114
                chunk->validate();
jens@0
   115
                if (entry->hash() != Key::computeHash(chunk->key()))
jens@0
   116
                    throw File::Error(ERANGE, "Bad Index entry (wrong hash code)");
jens@0
   117
            }
jens@0
   118
        }
jens@0
   119
        if (count != _count)
jens@0
   120
            throw File::Error(ERANGE, "Bad Index (inconsistent count)");
jens@0
   121
    }
jens@0
   122
    
jens@0
   123
    
jens@0
   124
#pragma mark -
jens@0
   125
#pragma mark LOOKUP:
jens@0
   126
    
jens@0
   127
    const Index::Entry* Index::table (int index) const {
jens@0
   128
        return &((const Index::Entry*)&_table)[index];
jens@0
   129
    }
jens@0
   130
    
jens@0
   131
    Index::Entry* Index::table (int index) {
jens@0
   132
        return &((Index::Entry*)&_table)[index];
jens@0
   133
    }
jens@0
   134
    
jens@0
   135
        
jens@0
   136
    const KeyValueChunk* Index::_find (File *file, Key key) const {
jens@0
   137
        int index = key.hash % _tableSize;
jens@0
   138
        LittleEndian<HashCode> lhash = key.hash;
jens@0
   139
#if PROBE_QUADRATIC
jens@0
   140
        int probe = 0;
jens@0
   141
#endif
jens@0
   142
        for(;;) {
jens@0
   143
            const Index::Entry *found = table(index);
jens@0
   144
            if (found->isAvailable()) {
jens@0
   145
                return NULL;
jens@0
   146
            } else if (found->hasHash(lhash)) {
jens@0
   147
                const KeyValueChunk *value = found->value(file);
jens@0
   148
                if (value->hasKey(key)) {
jens@0
   149
                    return value;
jens@0
   150
                }
jens@0
   151
            }
jens@0
   152
#if PROBE_QUADRATIC
jens@0
   153
            index = (index + ++probe) % _tableSize;
jens@0
   154
#else
jens@0
   155
            index = (index + 1) % _tableSize;
jens@0
   156
#endif
jens@0
   157
        }
jens@0
   158
    }
jens@0
   159
    
jens@0
   160
    Blob Index::get (File *file, Key key) const {
jens@0
   161
        const KeyValueChunk *kv = _find(file, key);
jens@0
   162
        return kv ?kv->value() :Blob();
jens@0
   163
    }
jens@0
   164
    
jens@0
   165
jens@0
   166
    // Common implementation of put and delete. (If deleting, valuePosition will be kDeletedPosition.)
jens@0
   167
    // While searching, entries past maxPosition should be considered newly-added and not compared,
jens@0
   168
    // since they're not in the memory-mapped part of the file yet.
jens@0
   169
    // Returns true if the entry was added/removed.
jens@0
   170
    bool Index::_put (File *file, Key key, FilePosition valuePosition, FilePosition maxPosition) {
jens@0
   171
        bool result = (valuePosition <= Entry::kDeletedPosition);
jens@0
   172
        int index = key.hash % _tableSize;
jens@0
   173
        Index::Entry entry(key.hash, valuePosition);
jens@0
   174
#if PROBE_QUADRATIC
jens@0
   175
        int probe = 0;
jens@0
   176
#endif
jens@0
   177
        Index::Entry *found;
jens@0
   178
        for(;;) {
jens@0
   179
            found = table(index);
jens@0
   180
            if (!found->hasValue()) {
jens@0
   181
                if (entry.hasValue()) {
jens@0
   182
                    result = true;
jens@0
   183
                    break;                  // it's a put and we found an empty entry
jens@0
   184
                } else if (found->isAvailable()) {
jens@0
   185
                    return false;           // it's a delete but the key wasn't found
jens@0
   186
                }
jens@0
   187
            }
jens@0
   188
            else if (found->hasHash(entry.hash())) {
jens@0
   189
                if (!file)
jens@0
   190
                    break;                  // no file given, so assume no duplicate keys
jens@0
   191
                else if (found->valuePosition() < maxPosition && found->value(file)->hasKey(key))
jens@0
   192
                    break;                  // found existing (not newly-inserted) entry with the matching key
jens@0
   193
                /*else
jens@0
   194
                    printf("** hash collision! hash=%u\n", key.hash);*/
jens@0
   195
            }
jens@0
   196
#if PROBE_QUADRATIC
jens@0
   197
            index = (index + ++probe) % _tableSize;
jens@0
   198
#else
jens@0
   199
            index = (index + 1) % _tableSize;
jens@0
   200
#endif
jens@0
   201
        }
jens@0
   202
        // Finally found where to put it:
jens@0
   203
        *found = entry;
jens@0
   204
        return result;
jens@0
   205
    }
jens@0
   206
        
jens@0
   207
    bool Index::put (File *file, Key key, FilePosition valuePosition, FilePosition maxPosition) {
jens@0
   208
        bool added = _put(file, key, valuePosition, maxPosition);
jens@0
   209
        if (added)
jens@0
   210
            ++_count;
jens@0
   211
        return added;
jens@0
   212
    }
jens@0
   213
    
jens@0
   214
    bool Index::remove (File *file, Key key, FilePosition maxPosition) {
jens@0
   215
        bool removed = _put(file, key, Index::Entry::kDeletedPosition, maxPosition);
jens@0
   216
        if (removed) {
jens@0
   217
            assert(_count > 0);
jens@0
   218
            --_count;
jens@0
   219
        }
jens@0
   220
        return removed;
jens@0
   221
    }
jens@0
   222
    
jens@0
   223
    void Index::removeAll () {
jens@0
   224
        memset(table(0), 0, _tableSize*sizeof(Entry));
jens@0
   225
        _count = 0;
jens@0
   226
    }
jens@0
   227
    
jens@0
   228
    
jens@0
   229
    void Index::copyFrom (File *file, const Index *index) {
jens@0
   230
        const Entry *src = index->table(0);
jens@0
   231
        if (index->_tableSize == _tableSize) {
jens@0
   232
            memcpy(table(0), src, _tableSize*sizeof(Entry));
jens@0
   233
        } else {
jens@0
   234
            removeAll();
jens@0
   235
            for (uint32_t i=0; i<index->_tableSize; i++,src++)
jens@0
   236
                if (src->hasValue())
jens@0
   237
                    _put(NULL, Key(NULL,0,src->hash()), src->valuePosition(), UINT32_MAX);
jens@0
   238
        }
jens@0
   239
        _count = index->count();
jens@0
   240
    }        
jens@0
   241
    
jens@0
   242
jens@0
   243
jens@0
   244
    Index::Iterator::Iterator (File *file, const Index *index)
jens@0
   245
        :_file(file),
jens@0
   246
         _current(index->table(-1)),
jens@0
   247
         _end(index->table(index->_tableSize))
jens@0
   248
    {
jens@0
   249
        next();
jens@0
   250
    }
jens@0
   251
    
jens@0
   252
    bool Index::Iterator::hasValue() const  {return _current != NULL;}
jens@0
   253
    Key Index::Iterator::key() const        {return Key(_current->value(_file)->key(), _current->hash());}
jens@0
   254
    Blob Index::Iterator::value() const     {return _current->value(_file)->value();}
jens@0
   255
    
jens@0
   256
    bool Index::Iterator::next() {
jens@0
   257
        if (_current) {
jens@0
   258
            while (++_current < _end) {
jens@0
   259
                if (_current->hasValue())
jens@0
   260
                    return true;
jens@0
   261
            }
jens@0
   262
            _current = NULL;
jens@0
   263
        }
jens@0
   264
        return false;
jens@0
   265
    }
jens@0
   266
    
jens@0
   267
    FilePosition Index::Iterator::valuePosition() {
jens@0
   268
        return _current ?_current->valuePosition() :0;
jens@0
   269
    }
jens@0
   270
}