src/Index.cpp
changeset 0 31a43d94cc26
child 2 851de24ecb61
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/src/Index.cpp	Sun Sep 20 15:14:12 2009 -0700
     1.3 @@ -0,0 +1,270 @@
     1.4 +/*
     1.5 + *  Index.cpp
     1.6 + *  Ottoman
     1.7 + *
     1.8 + *  Created by Jens Alfke on 8/27/09.
     1.9 + *  Copyright 2009 Jens Alfke. All rights reserved.
    1.10 + *  BSD-Licensed: See the file "LICENSE.txt" for details.
    1.11 + */
    1.12 +
    1.13 +#include "Index.h"
    1.14 +#include "Chunk.h"
    1.15 +#include "File.h"
    1.16 +#include "Dictionary.h"
    1.17 +#include <assert.h>
    1.18 +#include <errno.h>
    1.19 +#include <algorithm>
    1.20 +#include <math.h>
    1.21 +
    1.22 +namespace Mooseyard {
    1.23 +        
    1.24 +    // Quadratic probing results in fewer bucket tests on lookup, but makes the hash tables
    1.25 +    // significantly larger, which increases the file size. In practice a smaller table size seems
    1.26 +    // to be a bigger win since disk I/O is slow.
    1.27 +    #define PROBE_QUADRATIC 0
    1.28 +
    1.29 +    // Similarly use a higher max load than for memory-resident tables, to save space.
    1.30 +    static const float kMaxLoadFactor = 0.85;
    1.31 +
    1.32 +    static const uint32_t kIndexMagicNumber = 0x54378543;
    1.33 +    
    1.34 +
    1.35 +    /** An index (hash-table) entry as stored in the memory-mapped file. */
    1.36 +    class Index::Entry {
    1.37 +    public:
    1.38 +        explicit Entry()                        :_hash(0), _valuePosition(0) { }
    1.39 +        explicit Entry (HashCode h, FilePosition p)
    1.40 +                                                :_hash(h), _valuePosition(p) { }
    1.41 +        LittleEndian<HashCode> hash() const       {return _hash;}
    1.42 +        bool hasHash (LittleEndian<HashCode> h) const {return _hash == h;}
    1.43 +        
    1.44 +        const KeyValueChunk* value (const File *file) const {
    1.45 +            return (const KeyValueChunk*) file->mappedPosition(_valuePosition);
    1.46 +        }
    1.47 +        FilePosition valuePosition() const      {return _valuePosition;}
    1.48 +        
    1.49 +        
    1.50 +        bool isAvailable() const                {return _valuePosition == 0;}   // false if deleted
    1.51 +        bool hasValue() const                   {return _valuePosition > 1;}    // false if deleted
    1.52 +        
    1.53 +        void markDeleted()                      {_valuePosition = kDeletedPosition;}
    1.54 +        void markEmpty()                        {_valuePosition = 0;}
    1.55 +        
    1.56 +        
    1.57 +        static const FilePosition kDeletedPosition = 1;
    1.58 +                
    1.59 +    private:
    1.60 +        LittleEndian<HashCode> _hash;
    1.61 +        LittleEndian<FilePosition> _valuePosition;
    1.62 +    };
    1.63 +        
    1.64 +    
    1.65 +#pragma mark -
    1.66 +#pragma mark CREATION:
    1.67 +        
    1.68 +    Index::Index (uint32_t itsTableSize)
    1.69 +        :Chunk(sizeof(Index) + itsTableSize*sizeof(Index::Entry), kChunkType),
    1.70 +         _magicNumber(kIndexMagicNumber),
    1.71 +         _count(0),
    1.72 +         _tableSize(itsTableSize)
    1.73 +    { }
    1.74 +    
    1.75 +    Index* Index::create (int capacity) {
    1.76 +#if PROBE_QUADRATIC
    1.77 +        int tableSize = Dictionary::choosePowerOfTwoSize(capacity);
    1.78 +#else
    1.79 +        int tableSize = Dictionary::chooseAnyOldSize(capacity, kMaxLoadFactor);
    1.80 +#endif
    1.81 +        size_t size = sizeof(Index) + tableSize*sizeof(Index::Entry);
    1.82 +        void* storage = ::operator new(size);
    1.83 +        memset(storage,0,size);
    1.84 +        return new(storage) Index(tableSize);
    1.85 +    }
    1.86 +    
    1.87 +    const Index* Index::indexMappedAt (const void* ptr) {
    1.88 +        const Index *index = (const Index*)ptr;
    1.89 +        index->validate();
    1.90 +        return index;
    1.91 +    }
    1.92 +    
    1.93 +    void Index::validate() const  throw(File::Error) {
    1.94 +        if (_magicNumber != kIndexMagicNumber)
    1.95 +            throw File::Error(ERANGE, "Bad Index (wrong magic number)");
    1.96 +        if (_tableSize < 2 || _count > _tableSize)
    1.97 +            throw File::Error(ERANGE, "Bad Index (wrong count or tableSize)");
    1.98 +#if PROBE_QUADRATIC
    1.99 +        if ((_tableSize & (_tableSize-1)) != 0)
   1.100 +            throw File::Error(ERANGE, "Bad Index (size not power of 2)");
   1.101 +#endif
   1.102 +        if (size() != sizeof(Index)+_tableSize*sizeof(Index::Entry))
   1.103 +            throw File::Error(ERANGE, "Bad Index (wrong size)");
   1.104 +    }
   1.105 +    
   1.106 +    void Index::validateEntries (const File *file) const  throw(File::Error) {
   1.107 +        validate();
   1.108 +        uint32_t size = _tableSize;
   1.109 +        uint32_t count = 0;
   1.110 +        const Entry *entry = table(0);
   1.111 +        for (uint32_t i=0; i<size; i++, entry++) {
   1.112 +            if (entry->hasValue()) {
   1.113 +                count++;
   1.114 +                const KeyValueChunk *chunk = entry->value(file);
   1.115 +                if ((void*)chunk->end() > (void*)this)
   1.116 +                    throw File::Error(ERANGE, "Bad Index entry (points beyond index)");
   1.117 +                chunk->validate();
   1.118 +                if (entry->hash() != Key::computeHash(chunk->key()))
   1.119 +                    throw File::Error(ERANGE, "Bad Index entry (wrong hash code)");
   1.120 +            }
   1.121 +        }
   1.122 +        if (count != _count)
   1.123 +            throw File::Error(ERANGE, "Bad Index (inconsistent count)");
   1.124 +    }
   1.125 +    
   1.126 +    
   1.127 +#pragma mark -
   1.128 +#pragma mark LOOKUP:
   1.129 +    
   1.130 +    const Index::Entry* Index::table (int index) const {
   1.131 +        return &((const Index::Entry*)&_table)[index];
   1.132 +    }
   1.133 +    
   1.134 +    Index::Entry* Index::table (int index) {
   1.135 +        return &((Index::Entry*)&_table)[index];
   1.136 +    }
   1.137 +    
   1.138 +        
   1.139 +    const KeyValueChunk* Index::_find (File *file, Key key) const {
   1.140 +        int index = key.hash % _tableSize;
   1.141 +        LittleEndian<HashCode> lhash = key.hash;
   1.142 +#if PROBE_QUADRATIC
   1.143 +        int probe = 0;
   1.144 +#endif
   1.145 +        for(;;) {
   1.146 +            const Index::Entry *found = table(index);
   1.147 +            if (found->isAvailable()) {
   1.148 +                return NULL;
   1.149 +            } else if (found->hasHash(lhash)) {
   1.150 +                const KeyValueChunk *value = found->value(file);
   1.151 +                if (value->hasKey(key)) {
   1.152 +                    return value;
   1.153 +                }
   1.154 +            }
   1.155 +#if PROBE_QUADRATIC
   1.156 +            index = (index + ++probe) % _tableSize;
   1.157 +#else
   1.158 +            index = (index + 1) % _tableSize;
   1.159 +#endif
   1.160 +        }
   1.161 +    }
   1.162 +    
   1.163 +    Blob Index::get (File *file, Key key) const {
   1.164 +        const KeyValueChunk *kv = _find(file, key);
   1.165 +        return kv ?kv->value() :Blob();
   1.166 +    }
   1.167 +    
   1.168 +
   1.169 +    // Common implementation of put and delete. (If deleting, valuePosition will be kDeletedPosition.)
   1.170 +    // While searching, entries past maxPosition should be considered newly-added and not compared,
   1.171 +    // since they're not in the memory-mapped part of the file yet.
   1.172 +    // Returns true if the entry was added/removed.
   1.173 +    bool Index::_put (File *file, Key key, FilePosition valuePosition, FilePosition maxPosition) {
   1.174 +        bool result = (valuePosition <= Entry::kDeletedPosition);
   1.175 +        int index = key.hash % _tableSize;
   1.176 +        Index::Entry entry(key.hash, valuePosition);
   1.177 +#if PROBE_QUADRATIC
   1.178 +        int probe = 0;
   1.179 +#endif
   1.180 +        Index::Entry *found;
   1.181 +        for(;;) {
   1.182 +            found = table(index);
   1.183 +            if (!found->hasValue()) {
   1.184 +                if (entry.hasValue()) {
   1.185 +                    result = true;
   1.186 +                    break;                  // it's a put and we found an empty entry
   1.187 +                } else if (found->isAvailable()) {
   1.188 +                    return false;           // it's a delete but the key wasn't found
   1.189 +                }
   1.190 +            }
   1.191 +            else if (found->hasHash(entry.hash())) {
   1.192 +                if (!file)
   1.193 +                    break;                  // no file given, so assume no duplicate keys
   1.194 +                else if (found->valuePosition() < maxPosition && found->value(file)->hasKey(key))
   1.195 +                    break;                  // found existing (not newly-inserted) entry with the matching key
   1.196 +                /*else
   1.197 +                    printf("** hash collision! hash=%u\n", key.hash);*/
   1.198 +            }
   1.199 +#if PROBE_QUADRATIC
   1.200 +            index = (index + ++probe) % _tableSize;
   1.201 +#else
   1.202 +            index = (index + 1) % _tableSize;
   1.203 +#endif
   1.204 +        }
   1.205 +        // Finally found where to put it:
   1.206 +        *found = entry;
   1.207 +        return result;
   1.208 +    }
   1.209 +        
   1.210 +    bool Index::put (File *file, Key key, FilePosition valuePosition, FilePosition maxPosition) {
   1.211 +        bool added = _put(file, key, valuePosition, maxPosition);
   1.212 +        if (added)
   1.213 +            ++_count;
   1.214 +        return added;
   1.215 +    }
   1.216 +    
   1.217 +    bool Index::remove (File *file, Key key, FilePosition maxPosition) {
   1.218 +        bool removed = _put(file, key, Index::Entry::kDeletedPosition, maxPosition);
   1.219 +        if (removed) {
   1.220 +            assert(_count > 0);
   1.221 +            --_count;
   1.222 +        }
   1.223 +        return removed;
   1.224 +    }
   1.225 +    
   1.226 +    void Index::removeAll () {
   1.227 +        memset(table(0), 0, _tableSize*sizeof(Entry));
   1.228 +        _count = 0;
   1.229 +    }
   1.230 +    
   1.231 +    
   1.232 +    void Index::copyFrom (File *file, const Index *index) {
   1.233 +        const Entry *src = index->table(0);
   1.234 +        if (index->_tableSize == _tableSize) {
   1.235 +            memcpy(table(0), src, _tableSize*sizeof(Entry));
   1.236 +        } else {
   1.237 +            removeAll();
   1.238 +            for (uint32_t i=0; i<index->_tableSize; i++,src++)
   1.239 +                if (src->hasValue())
   1.240 +                    _put(NULL, Key(NULL,0,src->hash()), src->valuePosition(), UINT32_MAX);
   1.241 +        }
   1.242 +        _count = index->count();
   1.243 +    }        
   1.244 +    
   1.245 +
   1.246 +
   1.247 +    Index::Iterator::Iterator (File *file, const Index *index)
   1.248 +        :_file(file),
   1.249 +         _current(index->table(-1)),
   1.250 +         _end(index->table(index->_tableSize))
   1.251 +    {
   1.252 +        next();
   1.253 +    }
   1.254 +    
   1.255 +    bool Index::Iterator::hasValue() const  {return _current != NULL;}
   1.256 +    Key Index::Iterator::key() const        {return Key(_current->value(_file)->key(), _current->hash());}
   1.257 +    Blob Index::Iterator::value() const     {return _current->value(_file)->value();}
   1.258 +    
   1.259 +    bool Index::Iterator::next() {
   1.260 +        if (_current) {
   1.261 +            while (++_current < _end) {
   1.262 +                if (_current->hasValue())
   1.263 +                    return true;
   1.264 +            }
   1.265 +            _current = NULL;
   1.266 +        }
   1.267 +        return false;
   1.268 +    }
   1.269 +    
   1.270 +    FilePosition Index::Iterator::valuePosition() {
   1.271 +        return _current ?_current->valuePosition() :0;
   1.272 +    }
   1.273 +}