Ottoman: src/Index.cpp@21a6c17f4e3e (annotated)

jens@0	1	/*
jens@0	2	* Index.cpp
jens@0	3	* Ottoman
jens@0	4	*
jens@0	5	* Created by Jens Alfke on 8/27/09.
jens@0	6	* Copyright 2009 Jens Alfke. All rights reserved.
jens@0	7	* BSD-Licensed: See the file "LICENSE.txt" for details.
jens@0	8	*/
jens@0	9
jens@0	10	#include "Index.h"
jens@0	11	#include "Chunk.h"
jens@0	12	#include "File.h"
jens@0	13	#include "Dictionary.h"
jens@0	14	#include <assert.h>
jens@0	15	#include <errno.h>
jens@0	16	#include <algorithm>
jens@0	17	#include <math.h>
jens@0	18
jbm@8	19	#define __STDC_LIMIT_MACROS /* I <3 you, C99 */
jbm@8	20	# define UINT32_MAX (4294967295U)
jbm@8	21
jbm@8	22	#include <stdint.h>
jbm@8	23
jbm@8	24
jens@0	25	namespace Mooseyard {
jens@0	26
jens@0	27	// Quadratic probing results in fewer bucket tests on lookup, but makes the hash tables
jens@0	28	// significantly larger, which increases the file size. In practice a smaller table size seems
jens@0	29	// to be a bigger win since disk I/O is slow.
jens@0	30	#define PROBE_QUADRATIC 0
jens@0	31
jens@0	32	// Similarly use a higher max load than for memory-resident tables, to save space.
jens@2	33	static const float kMaxLoadFactor = 0.85f;
jens@0	34
jens@0	35	static const uint32_t kIndexMagicNumber = 0x54378543;
jens@0	36
jens@0	37
jens@0	38	/** An index (hash-table) entry as stored in the memory-mapped file. */
jens@0	39	class Index::Entry {
jens@0	40	public:
jens@0	41	explicit Entry() :_hash(0), _valuePosition(0) { }
jens@0	42	explicit Entry (HashCode h, FilePosition p)
jens@0	43	:_hash(h), _valuePosition(p) { }
jens@0	44	LittleEndian<HashCode> hash() const {return _hash;}
jens@0	45	bool hasHash (LittleEndian<HashCode> h) const {return _hash == h;}
jens@0	46
jens@0	47	const KeyValueChunk* value (const File *file) const {
jens@0	48	return (const KeyValueChunk*) file->mappedPosition(_valuePosition);
jens@0	49	}
jens@0	50	FilePosition valuePosition() const {return _valuePosition;}
jens@0	51
jens@0	52
jens@0	53	bool isAvailable() const {return _valuePosition == 0;} // false if deleted
jens@0	54	bool hasValue() const {return _valuePosition > 1;} // false if deleted
jens@0	55
jens@0	56	void markDeleted() {_valuePosition = kDeletedPosition;}
jens@0	57	void markEmpty() {_valuePosition = 0;}
jens@0	58
jens@0	59
jens@0	60	static const FilePosition kDeletedPosition = 1;
jens@0	61
jens@0	62	private:
jens@0	63	LittleEndian<HashCode> _hash;
jens@0	64	LittleEndian<FilePosition> _valuePosition;
jens@0	65	};
jens@0	66
jens@0	67
jens@0	68	#pragma mark -
jens@0	69	#pragma mark CREATION:
jens@0	70
jens@0	71	Index::Index (uint32_t itsTableSize)
jens@0	72	:Chunk(sizeof(Index) + itsTableSize*sizeof(Index::Entry), kChunkType),
jens@0	73	_magicNumber(kIndexMagicNumber),
jens@0	74	_count(0),
jens@0	75	_tableSize(itsTableSize)
jens@0	76	{ }
jens@0	77
jens@0	78	Index* Index::create (int capacity) {
jens@0	79	#if PROBE_QUADRATIC
jens@0	80	int tableSize = Dictionary::choosePowerOfTwoSize(capacity);
jens@0	81	#else
jens@0	82	int tableSize = Dictionary::chooseAnyOldSize(capacity, kMaxLoadFactor);
jens@0	83	#endif
jens@0	84	size_t size = sizeof(Index) + tableSize*sizeof(Index::Entry);
jens@0	85	void* storage = ::operator new(size);
jens@0	86	memset(storage,0,size);
jens@0	87	return new(storage) Index(tableSize);
jens@0	88	}
jens@0	89
jens@0	90	const Index* Index::indexMappedAt (const void* ptr) {
jens@0	91	const Index index = (const Index)ptr;
jens@0	92	index->validate();
jens@0	93	return index;
jens@0	94	}
jens@0	95
jens@0	96	void Index::validate() const throw(File::Error) {
jens@0	97	if (_magicNumber != kIndexMagicNumber)
jens@0	98	throw File::Error(ERANGE, "Bad Index (wrong magic number)");
jens@0	99	if (_tableSize < 2 \|\| _count > _tableSize)
jens@0	100	throw File::Error(ERANGE, "Bad Index (wrong count or tableSize)");
jens@0	101	#if PROBE_QUADRATIC
jens@0	102	if ((_tableSize & (_tableSize-1)) != 0)
jens@0	103	throw File::Error(ERANGE, "Bad Index (size not power of 2)");
jens@0	104	#endif
jens@0	105	if (size() != sizeof(Index)+_tableSize*sizeof(Index::Entry))
jens@0	106	throw File::Error(ERANGE, "Bad Index (wrong size)");
jens@0	107	}
jens@0	108
jens@0	109	void Index::validateEntries (const File *file) const throw(File::Error) {
jens@0	110	validate();
jens@0	111	uint32_t size = _tableSize;
jens@0	112	uint32_t count = 0;
jens@0	113	const Entry *entry = table(0);
jens@0	114	for (uint32_t i=0; i<size; i++, entry++) {
jens@0	115	if (entry->hasValue()) {
jens@0	116	count++;
jens@0	117	const KeyValueChunk *chunk = entry->value(file);
jens@0	118	if ((void)chunk->end() > (void)this)
jens@0	119	throw File::Error(ERANGE, "Bad Index entry (points beyond index)");
jens@0	120	chunk->validate();
jens@0	121	if (entry->hash() != Key::computeHash(chunk->key()))
jens@0	122	throw File::Error(ERANGE, "Bad Index entry (wrong hash code)");
jens@0	123	}
jens@0	124	}
jens@0	125	if (count != _count)
jens@0	126	throw File::Error(ERANGE, "Bad Index (inconsistent count)");
jens@0	127	}
jens@0	128
jens@0	129
jens@0	130	#pragma mark -
jens@0	131	#pragma mark LOOKUP:
jens@0	132
jens@0	133	const Index::Entry* Index::table (int index) const {
jens@0	134	return &((const Index::Entry*)&_table)[index];
jens@0	135	}
jens@0	136
jens@0	137	Index::Entry* Index::table (int index) {
jens@0	138	return &((Index::Entry*)&_table)[index];
jens@0	139	}
jens@0	140
jens@0	141
jens@0	142	const KeyValueChunk* Index::_find (File *file, Key key) const {
jens@0	143	int index = key.hash % _tableSize;
jens@0	144	LittleEndian<HashCode> lhash = key.hash;
jens@0	145	#if PROBE_QUADRATIC
jens@0	146	int probe = 0;
jens@0	147	#endif
jens@0	148	for(;;) {
jens@0	149	const Index::Entry *found = table(index);
jens@0	150	if (found->isAvailable()) {
jens@0	151	return NULL;
jens@0	152	} else if (found->hasHash(lhash)) {
jens@0	153	const KeyValueChunk *value = found->value(file);
jens@0	154	if (value->hasKey(key)) {
jens@0	155	return value;
jens@0	156	}
jens@0	157	}
jens@0	158	#if PROBE_QUADRATIC
jens@0	159	index = (index + ++probe) % _tableSize;
jens@0	160	#else
jens@0	161	index = (index + 1) % _tableSize;
jens@0	162	#endif
jens@0	163	}
jens@0	164	}
jens@0	165
jens@0	166	Blob Index::get (File *file, Key key) const {
jens@0	167	const KeyValueChunk *kv = _find(file, key);
jens@0	168	return kv ?kv->value() :Blob();
jens@0	169	}
jens@0	170
jens@0	171
jens@0	172	// Common implementation of put and delete. (If deleting, valuePosition will be kDeletedPosition.)
jens@0	173	// While searching, entries past maxPosition should be considered newly-added and not compared,
jens@0	174	// since they're not in the memory-mapped part of the file yet.
jens@0	175	// Returns true if the entry was added/removed.
jens@0	176	bool Index::_put (File *file, Key key, FilePosition valuePosition, FilePosition maxPosition) {
jens@0	177	bool result = (valuePosition <= Entry::kDeletedPosition);
jens@0	178	int index = key.hash % _tableSize;
jens@0	179	Index::Entry entry(key.hash, valuePosition);
jens@0	180	#if PROBE_QUADRATIC
jens@0	181	int probe = 0;
jens@0	182	#endif
jens@0	183	Index::Entry *found;
jens@0	184	for(;;) {
jens@0	185	found = table(index);
jens@0	186	if (!found->hasValue()) {
jens@0	187	if (entry.hasValue()) {
jens@0	188	result = true;
jens@0	189	break; // it's a put and we found an empty entry
jens@0	190	} else if (found->isAvailable()) {
jens@0	191	return false; // it's a delete but the key wasn't found
jens@0	192	}
jens@0	193	}
jens@0	194	else if (found->hasHash(entry.hash())) {
jens@0	195	if (!file)
jens@0	196	break; // no file given, so assume no duplicate keys
jens@0	197	else if (found->valuePosition() < maxPosition && found->value(file)->hasKey(key))
jens@0	198	break; // found existing (not newly-inserted) entry with the matching key
jens@0	199	/*else
jens@0	200	printf("** hash collision! hash=%u\n", key.hash);*/
jens@0	201	}
jens@0	202	#if PROBE_QUADRATIC
jens@0	203	index = (index + ++probe) % _tableSize;
jens@0	204	#else
jens@0	205	index = (index + 1) % _tableSize;
jens@0	206	#endif
jens@0	207	}
jens@0	208	// Finally found where to put it:
jens@0	209	*found = entry;
jens@0	210	return result;
jens@0	211	}
jens@0	212
jens@0	213	bool Index::put (File *file, Key key, FilePosition valuePosition, FilePosition maxPosition) {
jens@0	214	bool added = _put(file, key, valuePosition, maxPosition);
jens@0	215	if (added)
jens@0	216	++_count;
jens@0	217	return added;
jens@0	218	}
jens@0	219
jens@0	220	bool Index::remove (File *file, Key key, FilePosition maxPosition) {
jens@0	221	bool removed = _put(file, key, Index::Entry::kDeletedPosition, maxPosition);
jens@0	222	if (removed) {
jens@0	223	assert(_count > 0);
jens@0	224	--_count;
jens@0	225	}
jens@0	226	return removed;
jens@0	227	}
jens@0	228
jens@0	229	void Index::removeAll () {
jens@0	230	memset(table(0), 0, _tableSize*sizeof(Entry));
jens@0	231	_count = 0;
jens@0	232	}
jens@0	233
jens@0	234
jens@0	235	void Index::copyFrom (File file, const Index index) {
jens@0	236	const Entry *src = index->table(0);
jens@0	237	if (index->_tableSize == _tableSize) {
jens@0	238	memcpy(table(0), src, _tableSize*sizeof(Entry));
jens@0	239	} else {
jens@0	240	removeAll();
jens@0	241	for (uint32_t i=0; i<index->_tableSize; i++,src++)
jens@0	242	if (src->hasValue())
jens@0	243	_put(NULL, Key(NULL,0,src->hash()), src->valuePosition(), UINT32_MAX);
jens@0	244	}
jens@0	245	_count = index->count();
jens@0	246	}
jens@0	247
jens@0	248
jens@0	249
jens@0	250	Index::Iterator::Iterator (File file, const Index index)
jens@0	251	:_file(file),
jens@0	252	_current(index->table(-1)),
jens@0	253	_end(index->table(index->_tableSize))
jens@0	254	{
jens@0	255	next();
jens@0	256	}
jens@0	257
jens@0	258	bool Index::Iterator::hasValue() const {return _current != NULL;}
jens@0	259	Key Index::Iterator::key() const {return Key(_current->value(_file)->key(), _current->hash());}
jens@0	260	Blob Index::Iterator::value() const {return _current->value(_file)->value();}
jens@0	261
jens@0	262	bool Index::Iterator::next() {
jens@0	263	if (_current) {
jens@0	264	while (++_current < _end) {
jens@0	265	if (_current->hasValue())
jens@0	266	return true;
jens@0	267	}
jens@0	268	_current = NULL;
jens@0	269	}
jens@0	270	return false;
jens@0	271	}
jens@0	272
jens@0	273	FilePosition Index::Iterator::valuePosition() {
jens@0	274	return _current ?_current->valuePosition() :0;
jens@0	275	}
jens@0	276	}

author	jbm@humility
	Mon Sep 28 23:39:08 2009 -0700 (2009-09-28)
changeset 8	21a6c17f4e3e
parent 2	851de24ecb61
child 9	629f61203db1
permissions	-rw-r--r--