src/Index.cpp
author jbm@humility
Mon Sep 28 23:39:08 2009 -0700 (2009-09-28)
changeset 8 21a6c17f4e3e
parent 2 851de24ecb61
child 9 629f61203db1
permissions -rw-r--r--
jbm: it compiles... but bombs in unit tests
jens@0
     1
/*
jens@0
     2
 *  Index.cpp
jens@0
     3
 *  Ottoman
jens@0
     4
 *
jens@0
     5
 *  Created by Jens Alfke on 8/27/09.
jens@0
     6
 *  Copyright 2009 Jens Alfke. All rights reserved.
jens@0
     7
 *  BSD-Licensed: See the file "LICENSE.txt" for details.
jens@0
     8
 */
jens@0
     9
jens@0
    10
#include "Index.h"
jens@0
    11
#include "Chunk.h"
jens@0
    12
#include "File.h"
jens@0
    13
#include "Dictionary.h"
jens@0
    14
#include <assert.h>
jens@0
    15
#include <errno.h>
jens@0
    16
#include <algorithm>
jens@0
    17
#include <math.h>
jens@0
    18
jbm@8
    19
#define __STDC_LIMIT_MACROS /* I <3 you, C99 */
jbm@8
    20
# define UINT32_MAX             (4294967295U)
jbm@8
    21
 
jbm@8
    22
#include <stdint.h>
jbm@8
    23
jbm@8
    24
jens@0
    25
namespace Mooseyard {
jens@0
    26
        
jens@0
    27
    // Quadratic probing results in fewer bucket tests on lookup, but makes the hash tables
jens@0
    28
    // significantly larger, which increases the file size. In practice a smaller table size seems
jens@0
    29
    // to be a bigger win since disk I/O is slow.
jens@0
    30
    #define PROBE_QUADRATIC 0
jens@0
    31
jens@0
    32
    // Similarly use a higher max load than for memory-resident tables, to save space.
jens@2
    33
    static const float kMaxLoadFactor = 0.85f;
jens@0
    34
jens@0
    35
    static const uint32_t kIndexMagicNumber = 0x54378543;
jens@0
    36
    
jens@0
    37
jens@0
    38
    /** An index (hash-table) entry as stored in the memory-mapped file. */
jens@0
    39
    class Index::Entry {
jens@0
    40
    public:
jens@0
    41
        explicit Entry()                        :_hash(0), _valuePosition(0) { }
jens@0
    42
        explicit Entry (HashCode h, FilePosition p)
jens@0
    43
                                                :_hash(h), _valuePosition(p) { }
jens@0
    44
        LittleEndian<HashCode> hash() const       {return _hash;}
jens@0
    45
        bool hasHash (LittleEndian<HashCode> h) const {return _hash == h;}
jens@0
    46
        
jens@0
    47
        const KeyValueChunk* value (const File *file) const {
jens@0
    48
            return (const KeyValueChunk*) file->mappedPosition(_valuePosition);
jens@0
    49
        }
jens@0
    50
        FilePosition valuePosition() const      {return _valuePosition;}
jens@0
    51
        
jens@0
    52
        
jens@0
    53
        bool isAvailable() const                {return _valuePosition == 0;}   // false if deleted
jens@0
    54
        bool hasValue() const                   {return _valuePosition > 1;}    // false if deleted
jens@0
    55
        
jens@0
    56
        void markDeleted()                      {_valuePosition = kDeletedPosition;}
jens@0
    57
        void markEmpty()                        {_valuePosition = 0;}
jens@0
    58
        
jens@0
    59
        
jens@0
    60
        static const FilePosition kDeletedPosition = 1;
jens@0
    61
                
jens@0
    62
    private:
jens@0
    63
        LittleEndian<HashCode> _hash;
jens@0
    64
        LittleEndian<FilePosition> _valuePosition;
jens@0
    65
    };
jens@0
    66
        
jens@0
    67
    
jens@0
    68
#pragma mark -
jens@0
    69
#pragma mark CREATION:
jens@0
    70
        
jens@0
    71
    Index::Index (uint32_t itsTableSize)
jens@0
    72
        :Chunk(sizeof(Index) + itsTableSize*sizeof(Index::Entry), kChunkType),
jens@0
    73
         _magicNumber(kIndexMagicNumber),
jens@0
    74
         _count(0),
jens@0
    75
         _tableSize(itsTableSize)
jens@0
    76
    { }
jens@0
    77
    
jens@0
    78
    Index* Index::create (int capacity) {
jens@0
    79
#if PROBE_QUADRATIC
jens@0
    80
        int tableSize = Dictionary::choosePowerOfTwoSize(capacity);
jens@0
    81
#else
jens@0
    82
        int tableSize = Dictionary::chooseAnyOldSize(capacity, kMaxLoadFactor);
jens@0
    83
#endif
jens@0
    84
        size_t size = sizeof(Index) + tableSize*sizeof(Index::Entry);
jens@0
    85
        void* storage = ::operator new(size);
jens@0
    86
        memset(storage,0,size);
jens@0
    87
        return new(storage) Index(tableSize);
jens@0
    88
    }
jens@0
    89
    
jens@0
    90
    const Index* Index::indexMappedAt (const void* ptr) {
jens@0
    91
        const Index *index = (const Index*)ptr;
jens@0
    92
        index->validate();
jens@0
    93
        return index;
jens@0
    94
    }
jens@0
    95
    
jens@0
    96
    void Index::validate() const  throw(File::Error) {
jens@0
    97
        if (_magicNumber != kIndexMagicNumber)
jens@0
    98
            throw File::Error(ERANGE, "Bad Index (wrong magic number)");
jens@0
    99
        if (_tableSize < 2 || _count > _tableSize)
jens@0
   100
            throw File::Error(ERANGE, "Bad Index (wrong count or tableSize)");
jens@0
   101
#if PROBE_QUADRATIC
jens@0
   102
        if ((_tableSize & (_tableSize-1)) != 0)
jens@0
   103
            throw File::Error(ERANGE, "Bad Index (size not power of 2)");
jens@0
   104
#endif
jens@0
   105
        if (size() != sizeof(Index)+_tableSize*sizeof(Index::Entry))
jens@0
   106
            throw File::Error(ERANGE, "Bad Index (wrong size)");
jens@0
   107
    }
jens@0
   108
    
jens@0
   109
    void Index::validateEntries (const File *file) const  throw(File::Error) {
jens@0
   110
        validate();
jens@0
   111
        uint32_t size = _tableSize;
jens@0
   112
        uint32_t count = 0;
jens@0
   113
        const Entry *entry = table(0);
jens@0
   114
        for (uint32_t i=0; i<size; i++, entry++) {
jens@0
   115
            if (entry->hasValue()) {
jens@0
   116
                count++;
jens@0
   117
                const KeyValueChunk *chunk = entry->value(file);
jens@0
   118
                if ((void*)chunk->end() > (void*)this)
jens@0
   119
                    throw File::Error(ERANGE, "Bad Index entry (points beyond index)");
jens@0
   120
                chunk->validate();
jens@0
   121
                if (entry->hash() != Key::computeHash(chunk->key()))
jens@0
   122
                    throw File::Error(ERANGE, "Bad Index entry (wrong hash code)");
jens@0
   123
            }
jens@0
   124
        }
jens@0
   125
        if (count != _count)
jens@0
   126
            throw File::Error(ERANGE, "Bad Index (inconsistent count)");
jens@0
   127
    }
jens@0
   128
    
jens@0
   129
    
jens@0
   130
#pragma mark -
jens@0
   131
#pragma mark LOOKUP:
jens@0
   132
    
jens@0
   133
    const Index::Entry* Index::table (int index) const {
jens@0
   134
        return &((const Index::Entry*)&_table)[index];
jens@0
   135
    }
jens@0
   136
    
jens@0
   137
    Index::Entry* Index::table (int index) {
jens@0
   138
        return &((Index::Entry*)&_table)[index];
jens@0
   139
    }
jens@0
   140
    
jens@0
   141
        
jens@0
   142
    const KeyValueChunk* Index::_find (File *file, Key key) const {
jens@0
   143
        int index = key.hash % _tableSize;
jens@0
   144
        LittleEndian<HashCode> lhash = key.hash;
jens@0
   145
#if PROBE_QUADRATIC
jens@0
   146
        int probe = 0;
jens@0
   147
#endif
jens@0
   148
        for(;;) {
jens@0
   149
            const Index::Entry *found = table(index);
jens@0
   150
            if (found->isAvailable()) {
jens@0
   151
                return NULL;
jens@0
   152
            } else if (found->hasHash(lhash)) {
jens@0
   153
                const KeyValueChunk *value = found->value(file);
jens@0
   154
                if (value->hasKey(key)) {
jens@0
   155
                    return value;
jens@0
   156
                }
jens@0
   157
            }
jens@0
   158
#if PROBE_QUADRATIC
jens@0
   159
            index = (index + ++probe) % _tableSize;
jens@0
   160
#else
jens@0
   161
            index = (index + 1) % _tableSize;
jens@0
   162
#endif
jens@0
   163
        }
jens@0
   164
    }
jens@0
   165
    
jens@0
   166
    Blob Index::get (File *file, Key key) const {
jens@0
   167
        const KeyValueChunk *kv = _find(file, key);
jens@0
   168
        return kv ?kv->value() :Blob();
jens@0
   169
    }
jens@0
   170
    
jens@0
   171
jens@0
   172
    // Common implementation of put and delete. (If deleting, valuePosition will be kDeletedPosition.)
jens@0
   173
    // While searching, entries past maxPosition should be considered newly-added and not compared,
jens@0
   174
    // since they're not in the memory-mapped part of the file yet.
jens@0
   175
    // Returns true if the entry was added/removed.
jens@0
   176
    bool Index::_put (File *file, Key key, FilePosition valuePosition, FilePosition maxPosition) {
jens@0
   177
        bool result = (valuePosition <= Entry::kDeletedPosition);
jens@0
   178
        int index = key.hash % _tableSize;
jens@0
   179
        Index::Entry entry(key.hash, valuePosition);
jens@0
   180
#if PROBE_QUADRATIC
jens@0
   181
        int probe = 0;
jens@0
   182
#endif
jens@0
   183
        Index::Entry *found;
jens@0
   184
        for(;;) {
jens@0
   185
            found = table(index);
jens@0
   186
            if (!found->hasValue()) {
jens@0
   187
                if (entry.hasValue()) {
jens@0
   188
                    result = true;
jens@0
   189
                    break;                  // it's a put and we found an empty entry
jens@0
   190
                } else if (found->isAvailable()) {
jens@0
   191
                    return false;           // it's a delete but the key wasn't found
jens@0
   192
                }
jens@0
   193
            }
jens@0
   194
            else if (found->hasHash(entry.hash())) {
jens@0
   195
                if (!file)
jens@0
   196
                    break;                  // no file given, so assume no duplicate keys
jens@0
   197
                else if (found->valuePosition() < maxPosition && found->value(file)->hasKey(key))
jens@0
   198
                    break;                  // found existing (not newly-inserted) entry with the matching key
jens@0
   199
                /*else
jens@0
   200
                    printf("** hash collision! hash=%u\n", key.hash);*/
jens@0
   201
            }
jens@0
   202
#if PROBE_QUADRATIC
jens@0
   203
            index = (index + ++probe) % _tableSize;
jens@0
   204
#else
jens@0
   205
            index = (index + 1) % _tableSize;
jens@0
   206
#endif
jens@0
   207
        }
jens@0
   208
        // Finally found where to put it:
jens@0
   209
        *found = entry;
jens@0
   210
        return result;
jens@0
   211
    }
jens@0
   212
        
jens@0
   213
    bool Index::put (File *file, Key key, FilePosition valuePosition, FilePosition maxPosition) {
jens@0
   214
        bool added = _put(file, key, valuePosition, maxPosition);
jens@0
   215
        if (added)
jens@0
   216
            ++_count;
jens@0
   217
        return added;
jens@0
   218
    }
jens@0
   219
    
jens@0
   220
    bool Index::remove (File *file, Key key, FilePosition maxPosition) {
jens@0
   221
        bool removed = _put(file, key, Index::Entry::kDeletedPosition, maxPosition);
jens@0
   222
        if (removed) {
jens@0
   223
            assert(_count > 0);
jens@0
   224
            --_count;
jens@0
   225
        }
jens@0
   226
        return removed;
jens@0
   227
    }
jens@0
   228
    
jens@0
   229
    void Index::removeAll () {
jens@0
   230
        memset(table(0), 0, _tableSize*sizeof(Entry));
jens@0
   231
        _count = 0;
jens@0
   232
    }
jens@0
   233
    
jens@0
   234
    
jens@0
   235
    void Index::copyFrom (File *file, const Index *index) {
jens@0
   236
        const Entry *src = index->table(0);
jens@0
   237
        if (index->_tableSize == _tableSize) {
jens@0
   238
            memcpy(table(0), src, _tableSize*sizeof(Entry));
jens@0
   239
        } else {
jens@0
   240
            removeAll();
jens@0
   241
            for (uint32_t i=0; i<index->_tableSize; i++,src++)
jens@0
   242
                if (src->hasValue())
jens@0
   243
                    _put(NULL, Key(NULL,0,src->hash()), src->valuePosition(), UINT32_MAX);
jens@0
   244
        }
jens@0
   245
        _count = index->count();
jens@0
   246
    }        
jens@0
   247
    
jens@0
   248
jens@0
   249
jens@0
   250
    Index::Iterator::Iterator (File *file, const Index *index)
jens@0
   251
        :_file(file),
jens@0
   252
         _current(index->table(-1)),
jens@0
   253
         _end(index->table(index->_tableSize))
jens@0
   254
    {
jens@0
   255
        next();
jens@0
   256
    }
jens@0
   257
    
jens@0
   258
    bool Index::Iterator::hasValue() const  {return _current != NULL;}
jens@0
   259
    Key Index::Iterator::key() const        {return Key(_current->value(_file)->key(), _current->hash());}
jens@0
   260
    Blob Index::Iterator::value() const     {return _current->value(_file)->value();}
jens@0
   261
    
jens@0
   262
    bool Index::Iterator::next() {
jens@0
   263
        if (_current) {
jens@0
   264
            while (++_current < _end) {
jens@0
   265
                if (_current->hasValue())
jens@0
   266
                    return true;
jens@0
   267
            }
jens@0
   268
            _current = NULL;
jens@0
   269
        }
jens@0
   270
        return false;
jens@0
   271
    }
jens@0
   272
    
jens@0
   273
    FilePosition Index::Iterator::valuePosition() {
jens@0
   274
        return _current ?_current->valuePosition() :0;
jens@0
   275
    }
jens@0
   276
}