* Copyright 2001-2017, Axel Dörfler, axeld@pinc-software.de.
* This file may be used under the terms of the MIT License.
*
* Roughly based on 'btlib' written by Marcus J. Ranum - it shares
* no code but achieves binary compatibility with the on disk format.
*/
#include "BPlusTree.h"
#include <file_systems/QueryParserUtils.h>
#include "Debug.h"
#include "Utility.h"
#if !_BOOT_MODE
# include "Inode.h"
#else
# include "Stream.h"
# define Inode BFS::Stream
# define strerror(x) "error message unavailable"
namespace BFS {
#endif
on disk structure.
*/
struct duplicate_array {
off_t count;
off_t values[0];
inline bool IsEmpty() const
{
return count == 0;
}
inline int32 Count() const
{
return (int32)BFS_ENDIAN_TO_HOST_INT64(count);
}
inline off_t ValueAt(uint32 index) const
{
return BFS_ENDIAN_TO_HOST_INT64(values[index]);
}
inline void SetValueAt(uint32 index, off_t value)
{
values[index] = HOST_ENDIAN_TO_BFS_INT64(value);
}
inline int32 Find(off_t value) const
{
int32 i;
return _FindInternal(value, i) ? i : -1;
}
void Insert(off_t value);
bool Remove(off_t value);
private:
bool _FindInternal(off_t value, int32& index) const;
} _PACKED;
#ifdef DEBUG
class NodeChecker {
public:
NodeChecker(const bplustree_node* node, int32 nodeSize, const char* text)
:
fNode(node),
fSize(nodeSize),
fText(text)
{
Check("integrity check failed on construction.");
}
~NodeChecker()
{
Check("integrity check failed on destruction.");
}
void
Check(const char* message)
{
if (fNode->CheckIntegrity(fSize) != B_OK) {
dprintf("%s: %s\n", fText, message);
DEBUGGER(("NodeChecker integrity check failed!"));
}
}
private:
const bplustree_node* fNode;
int32 fSize;
const char* fText;
};
#endif
#if !_BOOT_MODE
class BitmapArray {
public:
BitmapArray(size_t numBits);
~BitmapArray();
status_t InitCheck() const;
bool IsSet(size_t index) const;
void Set(size_t index, bool set);
size_t CountSet() const { return fCountSet; }
private:
uint8* fBitmap;
size_t fSize;
size_t fCountSet;
};
struct TreeCheck {
TreeCheck(BPlusTree* tree)
:
fLevelCount(0),
fFreeCount(0),
fNodeSize(tree->NodeSize()),
fMaxLevels(tree->fHeader.MaxNumberOfLevels()),
fFoundErrors(0),
fVisited(tree->Stream()->Size() / tree->NodeSize()),
fVisitedFragment(tree->Stream()->Size() / tree->NodeSize())
{
fPreviousOffsets = (off_t*)malloc(
sizeof(off_t) * tree->fHeader.MaxNumberOfLevels());
if (fPreviousOffsets != NULL) {
for (size_t i = 0; i < fMaxLevels; i++)
fPreviousOffsets[i] = BPLUSTREE_NULL;
}
}
~TreeCheck()
{
free(fPreviousOffsets);
}
status_t InitCheck() const
{
if (fPreviousOffsets == NULL)
return B_NO_MEMORY;
status_t status = fVisited.InitCheck();
if (status != B_OK)
return status;
return fVisitedFragment.InitCheck();
}
bool Visited(off_t offset) const
{
return fVisited.IsSet(offset / fNodeSize);
}
void SetVisited(off_t offset)
{
fVisited.Set(offset / fNodeSize, true);
}
size_t VisitedCount() const
{
return fVisited.CountSet();
}
bool VisitedFragment(off_t offset) const
{
return fVisitedFragment.IsSet(offset / fNodeSize);
}
void SetVisitedFragment(off_t offset)
{
fVisitedFragment.Set(offset / fNodeSize, true);
}
uint32 MaxLevels() const
{
return fLevelCount;
}
void SetLevel(uint32 level)
{
if (fLevelCount < level)
fLevelCount = level;
}
off_t PreviousOffset(uint32 level)
{
return fPreviousOffsets[level];
}
void SetPreviousOffset(uint32 level, off_t offset)
{
fPreviousOffsets[level] = offset;
}
void FoundError()
{
fFoundErrors++;
}
bool ErrorsFound()
{
return fFoundErrors != 0;
}
private:
uint32 fLevelCount;
uint32 fFreeCount;
uint32 fNodeSize;
uint32 fMaxLevels;
uint32 fFoundErrors;
BitmapArray fVisited;
BitmapArray fVisitedFragment;
off_t* fPreviousOffsets;
};
void
CachedNode::UnsetUnchanged(Transaction& transaction)
{
if (fTree == NULL || fTree->fStream == NULL)
return;
if (fNode != NULL) {
void* cache = fTree->fStream->GetVolume()->BlockCache();
block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID());
block_cache_put(cache, fBlockNumber);
fBlock = NULL;
fNode = NULL;
}
}
#endif
void
CachedNode::Unset()
{
if (fTree == NULL || fTree->fStream == NULL)
return;
if (fNode != NULL) {
#if !_BOOT_MODE
if (fWritable && fOffset == 0) {
memcpy(&fTree->fHeader, fNode, sizeof(bplustree_header));
}
block_cache_put(fTree->fStream->GetVolume()->BlockCache(),
fBlockNumber);
fBlock = NULL;
#endif
fNode = NULL;
}
}
const bplustree_node*
CachedNode::SetTo(off_t offset, bool check)
{
const bplustree_node* node;
if (SetTo(offset, &node, check) == B_OK)
return node;
return NULL;
}
status_t
CachedNode::SetTo(off_t offset, const bplustree_node** _node, bool check)
{
if (fTree == NULL || fTree->fStream == NULL)
RETURN_ERROR(B_BAD_VALUE);
if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize
|| offset <= 0
|| (offset % fTree->fNodeSize) != 0) {
Unset();
RETURN_ERROR(B_BAD_VALUE);
}
if (InternalSetTo(NULL, offset) != NULL && check) {
if (!fTree->fHeader.CheckNode(fNode)) {
FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %"
B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset,
fBlockNumber, fTree->fStream->ID()));
Unset();
return B_BAD_DATA;
}
}
*_node = fNode;
return B_OK;
}
#if !_BOOT_MODE
bplustree_node*
CachedNode::SetToWritable(Transaction& transaction, off_t offset, bool check)
{
if (fTree == NULL || fTree->fStream == NULL) {
REPORT_ERROR(B_BAD_VALUE);
return NULL;
}
if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize
|| offset <= 0
|| (offset % fTree->fNodeSize) != 0) {
Unset();
return NULL;
}
if (InternalSetTo(&transaction, offset) != NULL && check) {
if (!fTree->fHeader.CheckNode(fNode)) {
FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %"
B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset,
fBlockNumber, fTree->fStream->ID()));
Unset();
return NULL;
}
}
return fNode;
}
bplustree_node*
CachedNode::MakeWritable(Transaction& transaction)
{
if (fNode == NULL)
return NULL;
if (block_cache_make_writable(transaction.GetVolume()->BlockCache(),
fBlockNumber, transaction.ID()) == B_OK) {
return fNode;
}
return NULL;
}
#endif
const bplustree_header*
CachedNode::SetToHeader()
{
if (fTree == NULL || fTree->fStream == NULL) {
REPORT_ERROR(B_BAD_VALUE);
return NULL;
}
InternalSetTo(NULL, 0LL);
return (bplustree_header*)fNode;
}
#if !_BOOT_MODE
bplustree_header*
CachedNode::SetToWritableHeader(Transaction& transaction)
{
if (fTree == NULL || fTree->fStream == NULL) {
REPORT_ERROR(B_BAD_VALUE);
return NULL;
}
InternalSetTo(&transaction, 0LL);
if (fNode != NULL && !fTree->fInTransaction) {
transaction.AddListener(fTree);
fTree->fInTransaction = true;
if (!transaction.GetVolume()->IsInitializing()
&& fTree->fStream != transaction.GetVolume()->IndicesNode()) {
acquire_vnode(transaction.GetVolume()->FSVolume(),
fTree->fStream->ID());
}
}
return (bplustree_header*)fNode;
}
#endif
bplustree_node*
CachedNode::InternalSetTo(Transaction* transaction, off_t offset)
{
off_t fileOffset;
block_run run;
if (offset >= fTree->fStream->Size()
|| fTree->fStream->FindBlockRun(offset, run, fileOffset) != B_OK) {
Unset();
return NULL;
}
#if !_BOOT_MODE
Volume* volume = fTree->fStream->GetVolume();
#else
Volume* volume = &fTree->fStream->GetVolume();
#endif
const int32 blockOffset = (offset - fileOffset) / volume->BlockSize();
const off_t newBlockNumber = volume->ToBlock(run) + blockOffset;
uint8* block = NULL;
if (transaction == NULL && fBlock != NULL && fBlockNumber == newBlockNumber) {
block = fBlock;
} else {
#if !_BOOT_MODE
if (fBlock != NULL)
Unset();
if (transaction != NULL) {
block = fBlock = (uint8*)block_cache_get_writable(volume->BlockCache(),
newBlockNumber, transaction->ID());
fWritable = true;
} else {
block = fBlock = (uint8*)block_cache_get(volume->BlockCache(), newBlockNumber);
fWritable = false;
}
#else
if (fBlock == NULL) {
fBlock = (uint8*)malloc(volume->BlockSize());
if (fBlock == NULL)
return NULL;
}
if (read_pos(volume->Device(), newBlockNumber << volume->BlockShift(),
fBlock, volume->BlockSize()) == (ssize_t)volume->BlockSize()) {
block = fBlock;
}
fWritable = false;
#endif
}
fOffset = offset;
fBlockNumber = newBlockNumber;
if (block != NULL) {
fNode = (bplustree_node*)(block + offset
- (fileOffset + (blockOffset << volume->BlockShift())));
} else {
fNode = NULL;
REPORT_ERROR(B_IO_ERROR);
}
return fNode;
}
#if !_BOOT_MODE
status_t
CachedNode::Free(Transaction& transaction, off_t offset)
{
if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL)
RETURN_ERROR(B_BAD_VALUE);
CachedNode cached(fTree);
bplustree_header* header = cached.SetToWritableHeader(transaction);
if (header == NULL)
return B_IO_ERROR;
#if 0
off_t lastOffset = header->MaximumSize() - fTree->fNodeSize;
if (offset == lastOffset) {
status_t status = fTree->fStream->SetFileSize(transaction, lastOffset);
if (status != B_OK)
return status;
header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset);
return B_OK;
}
#endif
fNode->left_link = header->free_node_pointer;
fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);
header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset);
return B_OK;
}
status_t
CachedNode::Allocate(Transaction& transaction, bplustree_node** _node,
off_t* _offset)
{
if (fTree == NULL || fTree->fStream == NULL)
RETURN_ERROR(B_BAD_VALUE);
Unset();
bplustree_header* header;
status_t status;
if (SetToWritable(transaction, fTree->fHeader.FreeNode(), false) != NULL) {
CachedNode cached(fTree);
header = cached.SetToWritableHeader(transaction);
if (header == NULL)
return B_IO_ERROR;
*_offset = header->FreeNode();
*_node = fNode;
header->free_node_pointer = fNode->left_link;
fNode->Initialize();
return B_OK;
}
Inode* stream = fTree->fStream;
if ((status = stream->Append(transaction, fTree->fNodeSize)) != B_OK)
return status;
CachedNode cached(fTree);
header = cached.SetToWritableHeader(transaction);
if (header == NULL)
return B_IO_ERROR;
off_t offset = header->MaximumSize();
header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
+ fTree->fNodeSize);
cached.Unset();
if (SetToWritable(transaction, offset, false) == NULL)
RETURN_ERROR(B_ERROR);
fNode->Initialize();
*_offset = offset;
*_node = fNode;
return B_OK;
}
BPlusTree::BPlusTree(Transaction& transaction, Inode* stream, int32 nodeSize)
:
fStream(NULL),
fInTransaction(false)
{
mutex_init(&fIteratorLock, "bfs b+tree iterator");
SetTo(transaction, stream);
}
#endif
BPlusTree::BPlusTree(Inode* stream)
:
fStream(NULL),
fInTransaction(false)
{
#if !_BOOT_MODE
mutex_init(&fIteratorLock, "bfs b+tree iterator");
#endif
SetTo(stream);
}
BPlusTree::BPlusTree()
:
fStream(NULL),
fNodeSize(BPLUSTREE_NODE_SIZE),
fAllowDuplicates(true),
fInTransaction(false),
fStatus(B_NO_INIT)
{
#if !_BOOT_MODE
mutex_init(&fIteratorLock, "bfs b+tree iterator");
#endif
}
BPlusTree::~BPlusTree()
{
#if !_BOOT_MODE
mutex_lock(&fIteratorLock);
SinglyLinkedList<TreeIterator>::ConstIterator iterator
= fIterators.GetIterator();
while (iterator.HasNext())
iterator.Next()->Stop();
mutex_destroy(&fIteratorLock);
ASSERT(!fInTransaction);
#endif
}
#if !_BOOT_MODE
status_t
BPlusTree::SetTo(Transaction& transaction, Inode* stream, int32 nodeSize)
{
fStream = stream;
CachedNode cached(this);
bplustree_header* header = cached.SetToWritableHeader(transaction);
if (header == NULL) {
fStatus = stream->SetFileSize(transaction, nodeSize * 2);
if (fStatus != B_OK)
RETURN_ERROR(fStatus);
header = cached.SetToWritableHeader(transaction);
if (header == NULL)
RETURN_ERROR(fStatus = B_ERROR);
}
fAllowDuplicates = stream->IsIndex()
|| (stream->Mode() & S_ALLOW_DUPS) != 0;
fNodeSize = nodeSize;
header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC);
header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize);
header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode()));
header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize);
header->free_node_pointer
= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2);
cached.Unset();
cached.SetToWritable(transaction, fHeader.RootNode(), false);
if (cached.Node() == NULL)
RETURN_ERROR(B_IO_ERROR);
cached.Node()->Initialize();
return fStatus = B_OK;
}
#endif
status_t
BPlusTree::SetTo(Inode* stream)
{
if (stream == NULL)
RETURN_ERROR(fStatus = B_BAD_VALUE);
fStream = stream;
CachedNode cached(this);
const bplustree_header* header = cached.SetToHeader();
if (header != NULL)
memcpy(&fHeader, header, sizeof(bplustree_header));
else
RETURN_ERROR(fStatus = B_IO_ERROR);
if (fHeader.MaximumSize() != stream->Size()) {
dprintf("B+tree header size %" B_PRIdOFF " doesn't fit file size %"
B_PRIdOFF "!\n", fHeader.MaximumSize(), stream->Size());
}
if (!fHeader.IsValid()) {
#ifdef DEBUG
dump_bplustree_header(&fHeader);
dump_block((const char*)&fHeader, 128);
#endif
RETURN_ERROR(fStatus = B_BAD_DATA);
}
fNodeSize = fHeader.NodeSize();
static const uint32 kToMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX,
S_LONG_LONG_INDEX, S_ULONG_LONG_INDEX, S_FLOAT_INDEX,
S_DOUBLE_INDEX};
uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX
| S_UINT_INDEX | S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX
| S_FLOAT_INDEX | S_DOUBLE_INDEX);
if (fHeader.DataType() > BPLUSTREE_DOUBLE_TYPE
|| ((stream->Mode() & S_INDEX_DIR) != 0
&& kToMode[fHeader.DataType()] != mode)
|| !stream->IsContainer()) {
D( dump_bplustree_header(&fHeader);
dump_inode(&stream->Node());
);
RETURN_ERROR(fStatus = B_BAD_TYPE);
}
fAllowDuplicates = stream->IsIndex()
|| (stream->Mode() & S_ALLOW_DUPS) != 0;
cached.SetTo(fHeader.RootNode());
RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA);
}
status_t
BPlusTree::InitCheck()
{
return fStatus;
}
#if !_BOOT_MODE
status_t
BPlusTree::Validate(bool repair, bool& _errorsFound)
{
TreeCheck check(this);
if (check.InitCheck() != B_OK)
return B_NO_MEMORY;
check.SetVisited(0);
CachedNode cached(this);
off_t freeOffset = fHeader.FreeNode();
while (freeOffset > 0) {
const bplustree_node* node;
status_t status = cached.SetTo(freeOffset, &node, false);
if (status != B_OK) {
if (status == B_IO_ERROR)
return B_IO_ERROR;
dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF " could "
"not be read: %s\n", fStream->ID(), freeOffset,
strerror(status));
check.FoundError();
break;
}
if (check.Visited(freeOffset)) {
dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF
" circular!\n", fStream->ID(), freeOffset);
check.FoundError();
break;
}
check.SetVisited(freeOffset);
if (node->OverflowLink() != BPLUSTREE_FREE) {
dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF
" misses free mark!\n", fStream->ID(), freeOffset);
}
freeOffset = node->LeftLink();
}
const bplustree_node* root;
status_t status = cached.SetTo(fHeader.RootNode(), &root, true);
if (status != B_OK)
return status;
status = _ValidateChildren(check, 0, fHeader.RootNode(), NULL, 0, root);
if (check.ErrorsFound())
_errorsFound = true;
if (status != B_OK)
return status;
if (check.MaxLevels() + 1 != fHeader.MaxNumberOfLevels()) {
dprintf("inode %" B_PRIdOFF ": found %" B_PRIu32 " max levels, "
"declared %" B_PRIu32 "!\n", fStream->ID(), check.MaxLevels(),
fHeader.MaxNumberOfLevels());
}
if ((off_t)check.VisitedCount() != fHeader.MaximumSize() / fNodeSize) {
dprintf("inode %" B_PRIdOFF ": visited %" B_PRIuSIZE " from %" B_PRIdOFF
" nodes.\n", fStream->ID(), check.VisitedCount(),
fHeader.MaximumSize() / fNodeSize);
}
return B_OK;
}
status_t
BPlusTree::MakeEmpty()
{
Transaction transaction(fStream->GetVolume(), fStream->BlockNumber());
fStream->WriteLockInTransaction(transaction);
CachedNode cached(this);
bplustree_header* header = cached.SetToWritableHeader(transaction);
if (header == NULL)
return B_IO_ERROR;
header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(NodeSize());
if (fStream->Size() > (off_t)NodeSize() * 2)
header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(2 * NodeSize());
else {
header->free_node_pointer
= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
}
bplustree_node* node = cached.SetToWritable(transaction, NodeSize(), false);
if (node == NULL)
return B_IO_ERROR;
node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
node->right_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
node->all_key_count = 0;
node->all_key_length = 0;
for (off_t offset = 2 * NodeSize(); offset < fStream->Size();
offset += NodeSize()) {
bplustree_node* node = cached.SetToWritable(transaction, offset, false);
if (node == NULL) {
dprintf("--> could not open %" B_PRIdOFF "\n", offset);
return B_IO_ERROR;
}
if (offset < fStream->Size() - (off_t)NodeSize())
node->left_link = HOST_ENDIAN_TO_BFS_INT64(offset + NodeSize());
else
node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);
if (offset % (1024 * 1024) == 0) {
transaction.Done();
transaction.Start(fStream->GetVolume(), fStream->BlockNumber());
fStream->WriteLockInTransaction(transaction);
}
}
return transaction.Done();
}
int32
BPlusTree::TypeCodeToKeyType(type_code code)
{
switch (code) {
case B_STRING_TYPE:
return BPLUSTREE_STRING_TYPE;
case B_SSIZE_T_TYPE:
case B_INT32_TYPE:
return BPLUSTREE_INT32_TYPE;
case B_SIZE_T_TYPE:
case B_UINT32_TYPE:
return BPLUSTREE_UINT32_TYPE;
case B_OFF_T_TYPE:
case B_INT64_TYPE:
return BPLUSTREE_INT64_TYPE;
case B_UINT64_TYPE:
return BPLUSTREE_UINT64_TYPE;
case B_FLOAT_TYPE:
return BPLUSTREE_FLOAT_TYPE;
case B_DOUBLE_TYPE:
return BPLUSTREE_DOUBLE_TYPE;
}
return -1;
}
int32
BPlusTree::ModeToKeyType(mode_t mode)
{
switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
| S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) {
case S_INT_INDEX:
return BPLUSTREE_INT32_TYPE;
case S_UINT_INDEX:
return BPLUSTREE_UINT32_TYPE;
case S_LONG_LONG_INDEX:
return BPLUSTREE_INT64_TYPE;
case S_ULONG_LONG_INDEX:
return BPLUSTREE_UINT64_TYPE;
case S_FLOAT_INDEX:
return BPLUSTREE_FLOAT_TYPE;
case S_DOUBLE_INDEX:
return BPLUSTREE_DOUBLE_TYPE;
case S_STR_INDEX:
default:
return BPLUSTREE_STRING_TYPE;
}
}
#endif
#if !_BOOT_MODE
void
BPlusTree::TransactionDone(bool success)
{
if (!success) {
CachedNode cached(this);
const bplustree_header* header = cached.SetToHeader();
if (header != NULL)
memcpy(&fHeader, header, sizeof(bplustree_header));
}
}
void
BPlusTree::RemovedFromTransaction()
{
fInTransaction = false;
if (!fStream->GetVolume()->IsInitializing() && fStream != fStream->GetVolume()->IndicesNode())
put_vnode(fStream->GetVolume()->FSVolume(), fStream->ID());
}
void
BPlusTree::_UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex,
uint16 splitAt, int8 change)
{
MutexLocker _(fIteratorLock);
SinglyLinkedList<TreeIterator>::ConstIterator iterator
= fIterators.GetIterator();
while (iterator.HasNext())
iterator.Next()->Update(offset, nextOffset, keyIndex, splitAt, change);
}
void
BPlusTree::_AddIterator(TreeIterator* iterator)
{
MutexLocker _(fIteratorLock);
fIterators.Add(iterator);
}
void
BPlusTree::_RemoveIterator(TreeIterator* iterator)
{
MutexLocker _(fIteratorLock);
fIterators.Remove(iterator);
}
#endif
int32
BPlusTree::_CompareKeys(const void* key1, int keyLength1, const void* key2,
int keyLength2)
{
type_code type = 0;
switch (fHeader.DataType()) {
case BPLUSTREE_STRING_TYPE:
type = B_STRING_TYPE;
break;
case BPLUSTREE_INT32_TYPE:
type = B_INT32_TYPE;
break;
case BPLUSTREE_UINT32_TYPE:
type = B_UINT32_TYPE;
break;
case BPLUSTREE_INT64_TYPE:
type = B_INT64_TYPE;
break;
case BPLUSTREE_UINT64_TYPE:
type = B_UINT64_TYPE;
break;
case BPLUSTREE_FLOAT_TYPE:
type = B_FLOAT_TYPE;
break;
case BPLUSTREE_DOUBLE_TYPE:
type = B_DOUBLE_TYPE;
break;
}
return QueryParser::compareKeys(type, key1, keyLength1, key2, keyLength2);
}
status_t
BPlusTree::_FindKey(const bplustree_node* node, const uint8* key,
uint16 keyLength, uint16* _index, off_t* _next)
{
#ifdef DEBUG
NodeChecker checker(node, fNodeSize, "find");
#endif
if (node->all_key_count == 0) {
if (_index)
*_index = 0;
if (_next)
*_next = node->OverflowLink();
return B_ENTRY_NOT_FOUND;
}
Unaligned<off_t>* values = node->Values();
int16 saveIndex = -1;
for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) {
uint16 i = (first + last) >> 1;
uint16 searchLength = 0;
uint8* searchKey = node->KeyAt(i, &searchLength);
if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16)
> (uint8*)node + fNodeSize
|| searchLength > BPLUSTREE_MAX_KEY_LENGTH) {
#if !_BOOT_MODE
fStream->GetVolume()->Panic();
#endif
RETURN_ERROR(B_BAD_DATA);
}
int32 cmp = _CompareKeys(key, keyLength, searchKey, searchLength);
if (cmp < 0) {
last = i - 1;
saveIndex = i;
} else if (cmp > 0) {
saveIndex = first = i + 1;
} else {
if (_index)
*_index = i;
if (_next)
*_next = BFS_ENDIAN_TO_HOST_INT64(values[i]);
return B_OK;
}
}
if (_index)
*_index = saveIndex;
if (_next) {
if (saveIndex == node->NumKeys())
*_next = node->OverflowLink();
else
*_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]);
}
return B_ENTRY_NOT_FOUND;
}
#if !_BOOT_MODE
following the key, from the root node to the leaf node that could
or should contain that key.
*/
status_t
BPlusTree::_SeekDown(Stack<node_and_key>& stack, const uint8* key,
uint16 keyLength)
{
node_and_key nodeAndKey;
nodeAndKey.nodeOffset = fHeader.RootNode();
CachedNode cached(this);
const bplustree_node* node;
while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
if (node->OverflowLink() == BPLUSTREE_NULL) {
nodeAndKey.keyIndex = 0;
stack.Push(nodeAndKey);
return B_OK;
}
off_t nextOffset;
status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex,
&nextOffset);
if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
RETURN_ERROR(B_ERROR);
if ((uint32)stack.CountItems() > fHeader.MaxNumberOfLevels()) {
dprintf("BPlusTree::_SeekDown() node walked too deep.\n");
break;
}
stack.Push(nodeAndKey);
nodeAndKey.nodeOffset = nextOffset;
}
FATAL(("BPlusTree::_SeekDown() could not open node %" B_PRIdOFF ", inode %"
B_PRIdOFF "\n", nodeAndKey.nodeOffset, fStream->ID()));
return B_ERROR;
}
The CachedNode will be set to the writable fragment on success.
*/
status_t
BPlusTree::_FindFreeDuplicateFragment(Transaction& transaction,
const bplustree_node* node, CachedNode& cached,
off_t* _offset, bplustree_node** _fragment, uint32* _index)
{
Unaligned<off_t>* values = node->Values();
for (int32 i = 0; i < node->NumKeys(); i++) {
off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]);
if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT)
continue;
const bplustree_node* fragment = cached.SetTo(
bplustree_node::FragmentOffset(value), false);
if (fragment == NULL) {
FATAL(("Could not get duplicate fragment at %" B_PRIdOFF ", inode %"
B_PRIdOFF "\n", value, fStream->ID()));
continue;
}
uint32 num = bplustree_node::MaxFragments(fNodeSize);
for (uint32 j = 0; j < num; j++) {
duplicate_array* array = fragment->FragmentAt(j);
if (array->IsEmpty()) {
*_fragment = cached.MakeWritable(transaction);
if (*_fragment == NULL)
return B_IO_ERROR;
*_offset = bplustree_node::FragmentOffset(value);
*_index = j;
return B_OK;
}
}
}
return B_ENTRY_NOT_FOUND;
}
status_t
BPlusTree::_InsertDuplicate(Transaction& transaction, CachedNode& cached,
const bplustree_node* node, uint16 index, off_t value)
{
CachedNode cachedDuplicate(this);
Unaligned<off_t>* values = node->Values();
off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
status_t status;
off_t offset;
if (bplustree_node::IsDuplicate(oldValue)) {
if (bplustree_node::LinkType(oldValue)
== BPLUSTREE_DUPLICATE_FRAGMENT) {
bplustree_node* duplicate = cachedDuplicate.SetToWritable(
transaction, bplustree_node::FragmentOffset(oldValue), false);
if (duplicate == NULL)
return B_IO_ERROR;
duplicate_array* array = duplicate->FragmentAt(
bplustree_node::FragmentIndex(oldValue));
int32 arrayCount = array->Count();
if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount < 1) {
FATAL(("_InsertDuplicate: Invalid array[%d] size in fragment "
"%" B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
(int)bplustree_node::FragmentIndex(oldValue),
bplustree_node::FragmentOffset(oldValue), arrayCount,
fStream->ID()));
return B_BAD_DATA;
}
if (arrayCount < NUM_FRAGMENT_VALUES) {
array->Insert(value);
} else {
if (duplicate->FragmentsUsed(fNodeSize) < 2) {
offset = bplustree_node::FragmentOffset(oldValue);
memmove(duplicate->DuplicateArray(), array,
(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
duplicate->left_link = duplicate->right_link
= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
array = duplicate->DuplicateArray();
array->Insert(value);
} else {
cachedDuplicate.UnsetUnchanged(transaction);
bplustree_node* newDuplicate;
status = cachedDuplicate.Allocate(transaction,
&newDuplicate, &offset);
if (status != B_OK)
RETURN_ERROR(status);
newDuplicate->overflow_link = array->count;
memcpy(&newDuplicate->all_key_count, &array->values[0],
array->Count() * sizeof(off_t));
memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
array = newDuplicate->DuplicateArray();
array->Insert(value);
}
if (cached.MakeWritable(transaction) == NULL)
return B_IO_ERROR;
values[index]
= HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
BPLUSTREE_DUPLICATE_NODE, offset));
}
return B_OK;
}
duplicate_array* array;
int32 arrayCount;
const bplustree_node* duplicate;
off_t duplicateOffset;
do {
duplicateOffset = bplustree_node::FragmentOffset(oldValue);
duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
if (duplicate == NULL)
return B_IO_ERROR;
array = duplicate->DuplicateArray();
arrayCount =array->Count();
if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) {
FATAL(("_InsertDuplicate: Invalid array size in duplicate %"
B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
duplicateOffset, arrayCount, fStream->ID()));
return B_BAD_DATA;
}
} while (arrayCount >= NUM_DUPLICATE_VALUES
&& (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL);
bplustree_node* writableDuplicate
= cachedDuplicate.MakeWritable(transaction);
if (writableDuplicate == NULL)
return B_IO_ERROR;
if (arrayCount < NUM_DUPLICATE_VALUES) {
array = writableDuplicate->DuplicateArray();
array->Insert(value);
} else {
bplustree_node* newDuplicate;
status = cachedDuplicate.Allocate(transaction, &newDuplicate,
&offset);
if (status != B_OK)
RETURN_ERROR(status);
writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset);
newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset);
array = newDuplicate->DuplicateArray();
array->count = 0;
array->Insert(value);
}
return B_OK;
}
uint32 fragmentIndex = 0;
bplustree_node* fragment;
if (_FindFreeDuplicateFragment(transaction, node, cachedDuplicate,
&offset, &fragment, &fragmentIndex) != B_OK) {
status = cachedDuplicate.Allocate(transaction, &fragment, &offset);
if (status != B_OK)
RETURN_ERROR(status);
memset(fragment, 0, fNodeSize);
}
duplicate_array* array = fragment->FragmentAt(fragmentIndex);
array->Insert(oldValue);
array->Insert(value);
if (cached.MakeWritable(transaction) == NULL)
return B_IO_ERROR;
values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex));
return B_OK;
}
void
BPlusTree::_InsertKey(bplustree_node* node, uint16 index, uint8* key,
uint16 keyLength, off_t value)
{
if (index > node->NumKeys())
return;
Unaligned<off_t>* values = node->Values();
Unaligned<uint16>* keyLengths = node->KeyLengths();
uint8* keys = node->Keys();
node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1);
node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength()
+ keyLength);
Unaligned<off_t>* newValues = node->Values();
Unaligned<uint16>* newKeyLengths = node->KeyLengths();
memmove(newValues + index + 1, values + index,
sizeof(off_t) * (node->NumKeys() - 1 - index));
memmove(newValues, values, sizeof(off_t) * index);
newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value);
for (uint16 i = node->NumKeys(); i-- > index + 1;) {
newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength);
}
memmove(newKeyLengths, keyLengths, sizeof(uint16) * index);
int32 keyStart;
newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength
+ (keyStart = index > 0
? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0));
uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]);
int32 size = node->AllKeyLength() - length;
if (size > 0)
memmove(keys + length, keys + length - keyLength, size);
memcpy(keys + keyStart, key, keyLength);
}
\a other. It also takes care to create a new overflow link if the node
to split is an index node.
*/
status_t
BPlusTree::_SplitNode(bplustree_node* node, off_t nodeOffset,
bplustree_node* other, off_t otherOffset, uint16* _keyIndex, uint8* key,
uint16* _keyLength, off_t* _value)
{
if (*_keyIndex > node->NumKeys() + 1)
return B_BAD_VALUE;
Unaligned<uint16>* inKeyLengths = node->KeyLengths();
Unaligned<off_t>* inKeyValues = node->Values();
uint8* inKeys = node->Keys();
uint8* outKeys = other->Keys();
int32 keyIndex = *_keyIndex;
if (keyIndex > node->NumKeys()) {
FATAL(("key index out of bounds: %d, num keys: %u, inode %" B_PRIdOFF
"\n", (int)keyIndex, node->NumKeys(), fStream->ID()));
return B_BAD_VALUE;
}
int32 bytes = 0, bytesBefore = 0, bytesAfter = 0;
size_t size = fNodeSize >> 1;
int32 out, in;
size_t keyLengths = 0;
for (in = out = 0; in < node->NumKeys() + 1;) {
keyLengths = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]);
if (in == keyIndex && !bytes) {
bytes = *_keyLength;
bytesBefore = in > 0
? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
} else {
if (keyIndex < out)
bytesAfter = keyLengths - bytesBefore;
in++;
}
out++;
if (key_align(sizeof(bplustree_node) + bytes + keyLengths)
+ out * (sizeof(uint16) + sizeof(off_t)) >= size) {
break;
}
}
if (keyIndex >= out && in > 0)
bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]);
else if (keyIndex + 1 < out)
bytesAfter = keyLengths - bytesBefore;
if (bytesBefore < 0 || bytesAfter < 0)
return B_BAD_DATA;
other->left_link = node->left_link;
other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset);
other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
+ bytesAfter);
other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);
Unaligned<uint16>* outKeyLengths = other->KeyLengths();
Unaligned<off_t>* outKeyValues = other->Values();
int32 keys = out > keyIndex ? keyIndex : out;
if (bytesBefore) {
memcpy(outKeys, inKeys, bytesBefore);
memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16));
memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t));
}
if (bytes) {
memcpy(outKeys + bytesBefore, key, bytes);
outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
if (bytesAfter) {
memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore,
bytesAfter);
keys = out - keyIndex - 1;
for (int32 i = 0;i < keys;i++) {
outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16(
BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i])
+ bytes);
}
memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex,
keys * sizeof(off_t));
}
}
if (in != out)
keyIndex--;
int32 total = bytesBefore + bytesAfter;
uint8* newKey = NULL;
uint16 newLength = 0;
bool newAllocated = false;
if (node->OverflowLink() != BPLUSTREE_NULL) {
if (in == keyIndex) {
newKey = key;
newLength = *_keyLength;
other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value);
keyIndex--;
} else {
uint8* droppedKey = node->KeyAt(in, &newLength);
if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16)
> (uint8*)node + fNodeSize
|| newLength > BPLUSTREE_MAX_KEY_LENGTH) {
fStream->GetVolume()->Panic();
RETURN_ERROR(B_BAD_DATA);
}
newKey = (uint8*)malloc(newLength);
if (newKey == NULL)
return B_NO_MEMORY;
newAllocated = true;
memcpy(newKey, droppedKey, newLength);
other->overflow_link = inKeyValues[in];
total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]);
}
}
bytesBefore = bytesAfter = bytes = 0;
out = 0;
int32 skip = in;
while (in < node->NumKeys() + 1) {
if (in == keyIndex && !bytes) {
bytesBefore = in > skip
? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
bytes = *_keyLength;
out++;
} else {
if (in < node->NumKeys()) {
inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total);
if (bytes) {
inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes);
bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
- bytesBefore - bytes;
}
out++;
}
in++;
}
}
if (keyIndex < skip)
bytesBefore = node->AllKeyLength() - total;
if (bytesBefore < 0 || bytesAfter < 0) {
if (newAllocated)
free(newKey);
return B_BAD_DATA;
}
node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
+ bytesAfter);
node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);
outKeyLengths = node->KeyLengths();
outKeyValues = node->Values();
keys = keyIndex <= skip ? out : keyIndex - skip;
keyIndex -= skip;
in = out - keyIndex - 1;
if (bytesBefore)
memmove(inKeys, inKeys + total, bytesBefore);
if (bytesAfter) {
memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore,
bytesAfter);
}
if (bytesBefore)
memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16));
if (bytesAfter) {
memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex,
in * sizeof(uint16));
}
if (bytesBefore)
memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t));
if (bytesAfter) {
memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex,
in * sizeof(off_t));
}
if (bytes) {
memcpy(inKeys + bytesBefore, key, bytes);
outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
}
if (newKey == NULL)
newKey = other->KeyAt(other->NumKeys() - 1, &newLength);
memcpy(key, newKey, newLength);
*_keyLength = newLength;
*_value = otherOffset;
if (newAllocated)
free(newKey);
return B_OK;
}
all be part of the \a transaction.
You need to have the inode write locked.
*/
status_t
BPlusTree::Insert(Transaction& transaction, const uint8* key, uint16 keyLength,
off_t value)
{
if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
RETURN_ERROR(B_BAD_VALUE);
#ifdef DEBUG
if (value < 0)
panic("tried to insert invalid value %" B_PRId64 "!\n", value);
#endif
ASSERT_WRITE_LOCKED_INODE(fStream);
Stack<node_and_key> stack;
if (_SeekDown(stack, key, keyLength) != B_OK)
RETURN_ERROR(B_ERROR);
uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH];
memcpy(keyBuffer, key, keyLength);
node_and_key nodeAndKey;
const bplustree_node* node;
CachedNode cached(this);
while (stack.Pop(&nodeAndKey)
&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
#ifdef DEBUG
NodeChecker checker(node, fNodeSize, "insert");
#endif
if (node->IsLeaf()) {
status_t status = _FindKey(node, key, keyLength,
&nodeAndKey.keyIndex);
if (status == B_OK) {
if (fAllowDuplicates) {
status = _InsertDuplicate(transaction, cached, node,
nodeAndKey.keyIndex, value);
if (status != B_OK)
RETURN_ERROR(status);
return B_OK;
}
return B_NAME_IN_USE;
}
}
bplustree_node* writableNode = cached.MakeWritable(transaction);
if (writableNode == NULL)
return B_IO_ERROR;
if (int32(key_align(sizeof(bplustree_node)
+ writableNode->AllKeyLength() + keyLength)
+ (writableNode->NumKeys() + 1) * (sizeof(uint16)
+ sizeof(off_t))) < fNodeSize) {
_InsertKey(writableNode, nodeAndKey.keyIndex,
keyBuffer, keyLength, value);
_UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL,
nodeAndKey.keyIndex, 0, 1);
return B_OK;
} else {
CachedNode cachedNewRoot(this);
CachedNode cachedOther(this);
off_t newRoot = BPLUSTREE_NULL;
if (nodeAndKey.nodeOffset == fHeader.RootNode()) {
bplustree_node* root;
status_t status = cachedNewRoot.Allocate(transaction, &root,
&newRoot);
if (status != B_OK) {
RETURN_ERROR(status);
}
}
bplustree_node* other;
off_t otherOffset;
status_t status = cachedOther.Allocate(transaction, &other,
&otherOffset);
if (status != B_OK) {
cachedNewRoot.Free(transaction, newRoot);
RETURN_ERROR(status);
}
if (_SplitNode(writableNode, nodeAndKey.nodeOffset, other,
otherOffset, &nodeAndKey.keyIndex, keyBuffer, &keyLength,
&value) != B_OK) {
cachedOther.Free(transaction, otherOffset);
cachedNewRoot.Free(transaction, newRoot);
RETURN_ERROR(B_ERROR);
}
#ifdef DEBUG
checker.Check("insert split");
NodeChecker otherChecker(other, fNodeSize, "insert split other");
#endif
_UpdateIterators(nodeAndKey.nodeOffset, otherOffset,
nodeAndKey.keyIndex, writableNode->NumKeys(), 1);
if ((other = cachedOther.SetToWritable(transaction,
other->LeftLink())) != NULL) {
other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
}
if (newRoot != BPLUSTREE_NULL) {
bplustree_node* root = cachedNewRoot.Node();
_InsertKey(root, 0, keyBuffer, keyLength,
writableNode->LeftLink());
root->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
nodeAndKey.nodeOffset);
CachedNode cached(this);
bplustree_header* header
= cached.SetToWritableHeader(transaction);
if (header == NULL)
return B_IO_ERROR;
header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot);
header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(
header->MaxNumberOfLevels() + 1);
return B_OK;
}
}
}
RETURN_ERROR(B_ERROR);
}
It's part of the private tree interface.
*/
status_t
BPlusTree::_RemoveDuplicate(Transaction& transaction,
const bplustree_node* node, CachedNode& cached, uint16 index,
off_t value)
{
Unaligned<off_t>* values = node->Values();
off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
CachedNode cachedDuplicate(this);
off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue);
bplustree_node* duplicate = cachedDuplicate.SetToWritable(transaction,
duplicateOffset, false);
if (duplicate == NULL)
return B_IO_ERROR;
if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
duplicate_array* array = duplicate->FragmentAt(
bplustree_node::FragmentIndex(oldValue));
int32 arrayCount = array->Count();
if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount <= 1) {
FATAL(("_RemoveDuplicate: Invalid array[%d] size in fragment %"
B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
(int)bplustree_node::FragmentIndex(oldValue), duplicateOffset,
arrayCount, fStream->ID()));
return B_BAD_DATA;
}
if (!array->Remove(value)) {
FATAL(("Oh no, value %" B_PRIdOFF " not found in fragments of node "
"%" B_PRIdOFF "..., inode %" B_PRIdOFF "\n", value,
duplicateOffset, fStream->ID()));
return B_ENTRY_NOT_FOUND;
}
if (--arrayCount == 1) {
if (cached.MakeWritable(transaction) == NULL)
return B_IO_ERROR;
values[index] = array->values[0];
if (duplicate->FragmentsUsed(fNodeSize) == 1) {
status_t status = cachedDuplicate.Free(transaction,
duplicateOffset);
if (status != B_OK)
return status;
} else
array->count = 0;
}
return B_OK;
}
duplicate_array* array = NULL;
int32 arrayCount = 0;
if (duplicate->LeftLink() != BPLUSTREE_NULL) {
FATAL(("invalid duplicate node: first left link points to %" B_PRIdOFF
", inode %" B_PRIdOFF "!\n", duplicate->LeftLink(), fStream->ID()));
return B_BAD_DATA;
}
while (duplicate != NULL) {
array = duplicate->DuplicateArray();
arrayCount = array->Count();
if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) {
FATAL(("_RemoveDuplicate: Invalid array size in duplicate %"
B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
duplicateOffset, arrayCount, fStream->ID()));
return B_BAD_DATA;
}
if (array->Remove(value)) {
arrayCount--;
break;
}
if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL)
RETURN_ERROR(B_ENTRY_NOT_FOUND);
cachedDuplicate.UnsetUnchanged(transaction);
duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset,
false);
}
if (duplicate == NULL)
RETURN_ERROR(B_IO_ERROR);
while (true) {
off_t left = duplicate->LeftLink();
off_t right = duplicate->RightLink();
bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL;
if ((isLast && arrayCount == 1) || arrayCount == 0) {
if (left == BPLUSTREE_NULL) {
if (cached.MakeWritable(transaction) == NULL)
return B_IO_ERROR;
if (arrayCount == 1) {
values[index] = array->values[0];
} else {
values[index] = HOST_ENDIAN_TO_BFS_INT64(
bplustree_node::MakeLink(
BPLUSTREE_DUPLICATE_NODE, right));
}
}
status_t status = cachedDuplicate.Free(transaction,
duplicateOffset);
if (status != B_OK)
return status;
if (left != BPLUSTREE_NULL
&& (duplicate = cachedDuplicate.SetToWritable(transaction, left,
false)) != NULL) {
duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right);
array = duplicate->DuplicateArray();
arrayCount = array->Count();
if (right == BPLUSTREE_NULL
&& duplicate->LeftLink() == BPLUSTREE_NULL
&& arrayCount <= NUM_FRAGMENT_VALUES) {
duplicateOffset = left;
continue;
}
}
if (right != BPLUSTREE_NULL
&& (duplicate = cachedDuplicate.SetToWritable(transaction,
right, false)) != NULL) {
duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left);
array = duplicate->DuplicateArray();
arrayCount = array->Count();
if (left == BPLUSTREE_NULL
&& duplicate->RightLink() == BPLUSTREE_NULL
&& arrayCount <= NUM_FRAGMENT_VALUES) {
duplicateOffset = right;
continue;
}
}
return B_OK;
}
if (isLast && arrayCount <= NUM_FRAGMENT_VALUES) {
CachedNode cachedOther(this);
bplustree_node* fragment = NULL;
uint32 fragmentIndex = 0;
off_t offset;
if (_FindFreeDuplicateFragment(transaction, node, cachedOther,
&offset, &fragment, &fragmentIndex) == B_OK) {
duplicate_array* target = fragment->FragmentAt(fragmentIndex);
memcpy(target, array,
(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
cachedDuplicate.Free(transaction, duplicateOffset);
duplicateOffset = offset;
} else {
memmove(duplicate, array,
(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
memset((off_t*)duplicate + NUM_FRAGMENT_VALUES + 1, 0,
fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
}
if (cached.MakeWritable(transaction) == NULL)
return B_IO_ERROR;
values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex));
}
return B_OK;
}
}
Since it has to get the key from the node anyway (to obtain it's
pointer), it's not needed to pass the key & its length, although
the calling method (BPlusTree::Remove()) have this data.
*/
void
BPlusTree::_RemoveKey(bplustree_node* node, uint16 index)
{
if (index > node->NumKeys() && node->NumKeys() > 0) {
FATAL(("Asked me to remove key outer limits: %u, inode %" B_PRIdOFF
"\n", index, fStream->ID()));
return;
}
Unaligned<off_t>* values = node->Values();
if (!node->IsLeaf() && index == node->NumKeys())
node->overflow_link = values[--index];
uint16 length = 0;
uint8* key = node->KeyAt(index, &length);
if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8*)node + fNodeSize
|| length > BPLUSTREE_MAX_KEY_LENGTH) {
FATAL(("Key length to long: %s, %u inode %" B_PRIdOFF "\n", key, length,
fStream->ID()));
fStream->GetVolume()->Panic();
return;
}
Unaligned<uint16>* keyLengths = node->KeyLengths();
uint8* keys = node->Keys();
node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1);
node->all_key_length = HOST_ENDIAN_TO_BFS_INT64(
node->AllKeyLength() - length);
Unaligned<off_t>* newValues = node->Values();
Unaligned<uint16>* newKeyLengths = node->KeyLengths();
memmove(key, key + length, node->AllKeyLength() - (key - keys));
if (index > 0 && newKeyLengths != keyLengths)
memmove(newKeyLengths, keyLengths, index * sizeof(uint16));
for (uint16 i = index; i < node->NumKeys(); i++) {
newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length);
}
if (index > 0)
memmove(newValues, values, index * sizeof(off_t));
if (node->NumKeys() > index) {
memmove(newValues + index, values + index + 1,
(node->NumKeys() - index) * sizeof(off_t));
}
}
for trees which allow duplicates, so you may safely ignore it.
It's not an optional parameter, so at least you have to think about it.
You need to have the inode write locked.
*/
status_t
BPlusTree::Remove(Transaction& transaction, const uint8* key, uint16 keyLength,
off_t value)
{
if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
RETURN_ERROR(B_BAD_VALUE);
ASSERT_WRITE_LOCKED_INODE(fStream);
Stack<node_and_key> stack;
if (_SeekDown(stack, key, keyLength) != B_OK)
RETURN_ERROR(B_ERROR);
node_and_key nodeAndKey;
const bplustree_node* node;
CachedNode cached(this);
while (stack.Pop(&nodeAndKey)
&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
#ifdef DEBUG
NodeChecker checker(node, fNodeSize, "remove");
#endif
if (node->IsLeaf()) {
status_t status = _FindKey(node, key, keyLength,
&nodeAndKey.keyIndex);
if (status != B_OK)
RETURN_ERROR(status);
if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64(
node->Values()[nodeAndKey.keyIndex]))) {
if (fAllowDuplicates) {
return _RemoveDuplicate(transaction, node, cached,
nodeAndKey.keyIndex, value);
}
FATAL(("dupliate node found where no duplicates are "
"allowed, inode %" B_PRIdOFF "!\n", fStream->ID()));
RETURN_ERROR(B_ERROR);
} else {
if (node->Values()[nodeAndKey.keyIndex] != value)
return B_ENTRY_NOT_FOUND;
off_t next = node->RightLink() == BPLUSTREE_NULL
? BPLUSTREE_FREE : node->RightLink();
_UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1
? next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0 , -1);
}
}
bplustree_node* writableNode = cached.MakeWritable(transaction);
if (writableNode == NULL)
return B_IO_ERROR;
if (nodeAndKey.nodeOffset == fHeader.RootNode()
&& (node->NumKeys() == 0
|| (node->NumKeys() == 1 && node->IsLeaf()))) {
writableNode->overflow_link
= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
writableNode->all_key_count = 0;
writableNode->all_key_length = 0;
if (fHeader.MaxNumberOfLevels() != 1) {
CachedNode cached(this);
bplustree_header* header
= cached.SetToWritableHeader(transaction);
if (header == NULL)
return B_IO_ERROR;
header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
}
return B_OK;
}
if (writableNode->NumKeys() > 1
|| (!writableNode->IsLeaf() && writableNode->NumKeys() == 1)) {
_RemoveKey(writableNode, nodeAndKey.keyIndex);
return B_OK;
}
CachedNode otherCached(this);
bplustree_node* other = otherCached.SetToWritable(transaction,
writableNode->LeftLink());
if (other != NULL)
other->right_link = writableNode->right_link;
if ((other = otherCached.SetToWritable(transaction, node->RightLink()))
!= NULL) {
other->left_link = writableNode->left_link;
}
cached.Free(transaction, nodeAndKey.nodeOffset);
}
RETURN_ERROR(B_ERROR);
}
Returns B_OK if the key could be found and its value replaced,
B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors
to indicate that something went terribly wrong.
Note that this doesn't work with duplicates - it will just
return B_BAD_TYPE if you call this function on a tree where
duplicates are allowed.
You need to have the inode write locked.
*/
status_t
BPlusTree::Replace(Transaction& transaction, const uint8* key, uint16 keyLength,
off_t value)
{
if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
|| key == NULL)
RETURN_ERROR(B_BAD_VALUE);
if (fAllowDuplicates)
RETURN_ERROR(B_BAD_TYPE);
ASSERT_WRITE_LOCKED_INODE(fStream);
off_t nodeOffset = fHeader.RootNode();
CachedNode cached(this);
const bplustree_node* node;
while ((node = cached.SetTo(nodeOffset)) != NULL) {
uint16 keyIndex = 0;
off_t nextOffset;
status_t status = _FindKey(node, key, keyLength, &keyIndex,
&nextOffset);
if (node->OverflowLink() == BPLUSTREE_NULL) {
if (status == B_OK) {
bplustree_node* writableNode = cached.MakeWritable(transaction);
if (writableNode != NULL) {
writableNode->Values()[keyIndex]
= HOST_ENDIAN_TO_BFS_INT64(value);
} else
status = B_IO_ERROR;
}
return status;
} else if (nextOffset == nodeOffset)
RETURN_ERROR(B_ERROR);
nodeOffset = nextOffset;
}
RETURN_ERROR(B_ERROR);
}
#endif
if successful.
It's very similar to BPlusTree::SeekDown(), but doesn't fill a stack
while it descends the tree.
Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND if not.
It can also return other errors to indicate that something went wrong.
Note that this doesn't work with duplicates - it will just return
B_BAD_TYPE if you call this function on a tree where duplicates are
allowed.
You need to have the inode read or write locked.
*/
status_t
BPlusTree::Find(const uint8* key, uint16 keyLength, off_t* _value)
{
if (key == NULL || keyLength < BPLUSTREE_MIN_KEY_LENGTH)
RETURN_ERROR(B_BAD_VALUE);
if (keyLength > BPLUSTREE_MAX_KEY_LENGTH)
RETURN_ERROR(B_NAME_TOO_LONG);
if (fAllowDuplicates)
RETURN_ERROR(B_BAD_TYPE);
#if !_BOOT_MODE
ASSERT_READ_LOCKED_INODE(fStream);
#endif
off_t nodeOffset = fHeader.RootNode();
CachedNode cached(this);
const bplustree_node* node;
#ifdef DEBUG
int32 levels = 0;
#endif
while ((node = cached.SetTo(nodeOffset)) != NULL) {
uint16 keyIndex = 0;
off_t nextOffset;
status_t status = _FindKey(node, key, keyLength, &keyIndex,
&nextOffset);
#ifdef DEBUG
levels++;
#endif
if (node->OverflowLink() == BPLUSTREE_NULL) {
if (status == B_OK && _value != NULL)
*_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]);
#ifdef DEBUG
if (levels != (int32)fHeader.MaxNumberOfLevels())
DEBUGGER(("levels don't match"));
#endif
return status;
} else if (nextOffset == nodeOffset)
RETURN_ERROR(B_ERROR);
nodeOffset = nextOffset;
}
FATAL(("b+tree node at %" B_PRIdOFF " could not be loaded, inode %"
B_PRIdOFF "\n", nodeOffset, fStream->ID()));
RETURN_ERROR(B_ERROR);
}
#if !_BOOT_MODE
status_t
BPlusTree::_ValidateChildren(TreeCheck& check, uint32 level, off_t offset,
const uint8* largestKey, uint16 largestKeyLength,
const bplustree_node* parent)
{
if (parent->CheckIntegrity(fNodeSize) != B_OK) {
dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " integrity check "
"failed!\n", fStream->ID(), offset);
check.FoundError();
return B_OK;
}
if (level >= fHeader.MaxNumberOfLevels()) {
dprintf("inode %" B_PRIdOFF ": maximum level surpassed at %" B_PRIdOFF
"!\n", fStream->ID(), offset);
check.FoundError();
return B_OK;
}
check.SetLevel(level);
if (check.Visited(offset)) {
dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " already visited!\n",
fStream->ID(), offset);
check.FoundError();
return B_OK;
}
check.SetVisited(offset);
uint32 count = parent->NumKeys();
Unaligned<off_t>* values = parent->Values();
off_t lastOffset = check.PreviousOffset(level);
CachedNode cached(this);
for (uint32 i = 0; i < count; i++) {
uint16 keyLength;
uint8* key = parent->KeyAt(i, &keyLength);
if (largestKey != NULL) {
int result = _CompareKeys(key, keyLength, largestKey,
largestKeyLength);
if (result > 0 || (result == 0 && i != count - 1)) {
dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " key %"
B_PRIu32 " larger than it should!\n",
fStream->ID(), offset, i);
check.FoundError();
}
}
off_t childOffset = BFS_ENDIAN_TO_HOST_INT64(values[i]);
if (bplustree_node::IsDuplicate(childOffset)) {
off_t duplicateOffset = bplustree_node::FragmentOffset(childOffset);
off_t lastDuplicateOffset = BPLUSTREE_NULL;
while (duplicateOffset != BPLUSTREE_NULL) {
const bplustree_node* node;
status_t status = cached.SetTo(duplicateOffset, &node, false);
if (status != B_OK) {
if (status == B_IO_ERROR)
return B_IO_ERROR;
dprintf("inode %" B_PRIdOFF ": duplicate node at %"
B_PRIdOFF " could not be read: %s\n", fStream->ID(),
duplicateOffset, strerror(status));
check.FoundError();
break;
}
bool isFragmentNode = bplustree_node::LinkType(childOffset)
== BPLUSTREE_DUPLICATE_FRAGMENT;
bool isKnownFragment = isFragmentNode
&& check.VisitedFragment(duplicateOffset);
if (!isKnownFragment && check.Visited(duplicateOffset)) {
dprintf("inode %" B_PRIdOFF ": %s node at %"
B_PRIdOFF " already visited, referenced from %"
B_PRIdOFF "!\n", fStream->ID(),
isFragmentNode ? "fragment" : "duplicate",
duplicateOffset, offset);
check.FoundError();
break;
}
if (!check.Visited(duplicateOffset))
check.SetVisited(duplicateOffset);
if (!isKnownFragment && isFragmentNode)
check.SetVisitedFragment(duplicateOffset);
duplicate_array* array;
int32 minSize;
int32 maxSize;
if (isFragmentNode) {
array = node->FragmentAt(
bplustree_node::FragmentIndex(childOffset));
minSize = 2;
maxSize = NUM_FRAGMENT_VALUES;
} else {
array = node->DuplicateArray();
minSize = 1;
maxSize = NUM_DUPLICATE_VALUES;
}
int32 arrayCount = array->Count();
if (arrayCount < minSize || arrayCount > maxSize) {
dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF
" has invalid array size %" B_PRId32 "!\n",
fStream->ID(), duplicateOffset, arrayCount);
check.FoundError();
} else {
for (int32 j = 0; j < arrayCount; j++) {
if (!fStream->GetVolume()->IsValidInodeBlock(
array->ValueAt(j))) {
dprintf("inode %" B_PRIdOFF ": duplicate at %"
B_PRIdOFF " contains invalid block %" B_PRIdOFF
" at %" B_PRId32 "!\n", fStream->ID(),
duplicateOffset, array->ValueAt(j), j);
check.FoundError();
break;
}
}
}
if (isFragmentNode)
break;
if (node->LeftLink() != lastDuplicateOffset) {
dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF
" has wrong left link %" B_PRIdOFF ", expected %"
B_PRIdOFF "!\n", fStream->ID(), duplicateOffset,
node->LeftLink(), lastDuplicateOffset);
check.FoundError();
}
lastDuplicateOffset = duplicateOffset;
duplicateOffset = node->RightLink();
}
} else if (!parent->IsLeaf()) {
off_t nextOffset = parent->OverflowLink();
if (i < count - 1)
nextOffset = BFS_ENDIAN_TO_HOST_INT64(values[i + 1]);
if (i == 0 && lastOffset != BPLUSTREE_NULL) {
const bplustree_node* previous = cached.SetTo(lastOffset, true);
if (previous == NULL)
return B_IO_ERROR;
if (previous->RightLink() != childOffset) {
dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
"wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF
"!\n", fStream->ID(), lastOffset, previous->RightLink(),
childOffset);
check.FoundError();
}
}
status_t status = _ValidateChild(check, cached, level, childOffset,
lastOffset, nextOffset, key, keyLength);
if (status != B_OK)
return status;
} else if (!fStream->GetVolume()->IsValidInodeBlock(childOffset)) {
dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " contains "
"invalid block %" B_PRIdOFF " at %" B_PRId32 "!\n",
fStream->ID(), offset, childOffset, i);
check.FoundError();
}
lastOffset = childOffset;
}
if (parent->OverflowLink() != BPLUSTREE_NULL) {
off_t childOffset = parent->OverflowLink();
status_t status = _ValidateChild(check, cached, level, childOffset,
lastOffset, 0, NULL, 0);
if (status != B_OK)
return status;
lastOffset = childOffset;
}
check.SetPreviousOffset(level, lastOffset);
return B_OK;
}
status_t
BPlusTree::_ValidateChild(TreeCheck& check, CachedNode& cached, uint32 level,
off_t offset, off_t lastOffset, off_t nextOffset,
const uint8* key, uint16 keyLength)
{
const bplustree_node* node;
status_t status = cached.SetTo(offset, &node, true);
if (status != B_OK) {
if (status == B_IO_ERROR)
return B_IO_ERROR;
dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " could not be "
"read: %s\n", fStream->ID(), offset, strerror(status));
check.FoundError();
return B_OK;
}
if (node->LeftLink() != lastOffset) {
dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
"wrong left link %" B_PRIdOFF ", expected %" B_PRIdOFF
"!\n", fStream->ID(), offset, node->LeftLink(), lastOffset);
check.FoundError();
}
if (nextOffset != 0 && node->RightLink() != nextOffset) {
dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
"wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF
"!\n", fStream->ID(), offset, node->RightLink(), nextOffset);
check.FoundError();
}
return _ValidateChildren(check, level + 1, offset, key, keyLength, node);
}
#endif
TreeIterator::TreeIterator(BPlusTree* tree)
:
fTree(tree),
fCurrentNodeOffset(BPLUSTREE_NULL)
{
#if !_BOOT_MODE
tree->_AddIterator(this);
#endif
}
TreeIterator::~TreeIterator()
{
#if !_BOOT_MODE
if (fTree)
fTree->_RemoveIterator(this);
#endif
}
status_t
TreeIterator::Goto(int8 to)
{
if (fTree == NULL || fTree->fStream == NULL)
RETURN_ERROR(B_BAD_VALUE);
#if !_BOOT_MODE
InodeReadLocker locker(fTree->fStream);
#endif
off_t nodeOffset = fTree->fHeader.RootNode();
CachedNode cached(fTree);
const bplustree_node* node;
while ((node = cached.SetTo(nodeOffset)) != NULL) {
if (node->OverflowLink() == BPLUSTREE_NULL) {
fCurrentNodeOffset = nodeOffset;
fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys();
fDuplicateNode = BPLUSTREE_NULL;
return B_OK;
}
off_t nextOffset;
if (to == BPLUSTREE_END || node->all_key_count == 0)
nextOffset = node->OverflowLink();
else {
if (node->AllKeyLength() > fTree->fNodeSize
|| (addr_t)node->Values() > (addr_t)node + fTree->fNodeSize
- 8 * node->NumKeys())
RETURN_ERROR(B_ERROR);
nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]);
}
if (nextOffset == nodeOffset)
break;
nodeOffset = nextOffset;
}
FATAL(("%s fails\n", __FUNCTION__));
RETURN_ERROR(B_ERROR);
}
When it iterates through duplicates, the "key" is only updated for the
first entry - if you need to know when this happens, use the "duplicate"
parameter which is 0 for no duplicate, 1 for the first, and 2 for all
the other duplicates.
That's not too nice, but saves the 256 bytes that would be needed to
store the last key - if this will ever become an issue, it will be
easy to change.
The other advantage of this is, that the queries can skip all duplicates
at once when they are not relevant to them.
*/
status_t
TreeIterator::Traverse(int8 direction, void* key, uint16* keyLength,
uint16 maxLength, off_t* value, uint16* duplicate)
{
if (fTree == NULL)
return B_INTERRUPTED;
bool forward = direction == BPLUSTREE_FORWARD;
if (fCurrentNodeOffset == BPLUSTREE_NULL
&& Goto(forward ? BPLUSTREE_BEGIN : BPLUSTREE_END) != B_OK)
RETURN_ERROR(B_ERROR);
if (fCurrentNodeOffset == BPLUSTREE_FREE)
return B_ENTRY_NOT_FOUND;
#if !_BOOT_MODE
InodeReadLocker locker(fTree->fStream);
#endif
CachedNode cached(fTree);
const bplustree_node* node;
if (fDuplicateNode != BPLUSTREE_NULL) {
if (!fIsFragment || fDuplicate < fNumDuplicates) {
node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
false);
} else
node = NULL;
if (node != NULL) {
if (!fIsFragment && fDuplicate >= fNumDuplicates) {
fDuplicateNode = node->RightLink();
if (fDuplicateNode != BPLUSTREE_NULL
&& (node = cached.SetTo(fDuplicateNode, false)) != NULL) {
fNumDuplicates = node->CountDuplicates(fDuplicateNode,
false);
fDuplicate = 0;
}
}
if (fDuplicate < fNumDuplicates) {
*value = node->DuplicateAt(fDuplicateNode, fIsFragment,
fDuplicate++);
if (duplicate)
*duplicate = 2;
return B_OK;
}
}
fDuplicateNode = BPLUSTREE_NULL;
}
off_t savedNodeOffset = fCurrentNodeOffset;
int32 savedKey = fCurrentKey;
if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL)
RETURN_ERROR(B_ERROR);
if (duplicate)
*duplicate = 0;
fCurrentKey += direction;
while ((forward && fCurrentKey >= node->NumKeys())
|| (!forward && fCurrentKey < 0)) {
fCurrentNodeOffset = forward ? node->RightLink() : node->LeftLink();
if (fCurrentNodeOffset != BPLUSTREE_NULL) {
node = cached.SetTo(fCurrentNodeOffset);
if (!node)
RETURN_ERROR(B_ERROR);
fCurrentKey = forward ? 0 : node->NumKeys() - 1;
} else {
fCurrentNodeOffset = savedNodeOffset;
fCurrentKey = savedKey;
return B_ENTRY_NOT_FOUND;
}
}
if (node->all_key_count == 0)
RETURN_ERROR(B_ERROR);
uint16 length = 0;
uint8* keyStart = node->KeyAt(fCurrentKey, &length);
if (keyStart + length + sizeof(off_t) + sizeof(uint16)
> (uint8*)node + fTree->fNodeSize
|| length > BPLUSTREE_MAX_KEY_LENGTH) {
#if !_BOOT_MODE
fTree->fStream->GetVolume()->Panic();
#endif
RETURN_ERROR(B_BAD_DATA);
}
bool needsTermination = fTree->fHeader.DataType() == BPLUSTREE_STRING_TYPE;
if (length + (needsTermination ? 1 : 0) > maxLength) {
if (!needsTermination || maxLength < INODE_FILE_NAME_LENGTH) {
fCurrentNodeOffset = savedNodeOffset;
fCurrentKey = savedKey;
return B_BUFFER_OVERFLOW;
}
length = maxLength - 1;
}
memcpy(key, keyStart, length);
if (needsTermination)
((char*)key)[length] = '\0';
*keyLength = length;
off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]);
uint8 type = bplustree_node::LinkType(offset);
if (type == BPLUSTREE_DUPLICATE_FRAGMENT
|| type == BPLUSTREE_DUPLICATE_NODE) {
fDuplicateNode = offset;
node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
false);
if (node == NULL)
RETURN_ERROR(B_ERROR);
fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;
fNumDuplicates = node->CountDuplicates(offset, fIsFragment);
if (fNumDuplicates) {
offset = node->DuplicateAt(offset, fIsFragment, 0);
fDuplicate = 1;
if (duplicate)
*duplicate = 1;
} else {
fDuplicateNode = BPLUSTREE_NULL;
offset = 0;
}
}
*value = offset;
return B_OK;
}
sets the current position in the iterator, regardless of if the
key could be found or not.
*/
status_t
TreeIterator::Find(const uint8* key, uint16 keyLength)
{
if (fTree == NULL)
return B_INTERRUPTED;
if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
|| key == NULL)
RETURN_ERROR(B_BAD_VALUE);
#if !_BOOT_MODE
InodeReadLocker locker(fTree->fStream);
#endif
off_t nodeOffset = fTree->fHeader.RootNode();
CachedNode cached(fTree);
const bplustree_node* node;
while ((node = cached.SetTo(nodeOffset)) != NULL) {
uint16 keyIndex = 0;
off_t nextOffset = 0;
status_t status = fTree->_FindKey(node, key, keyLength, &keyIndex,
&nextOffset);
if (node->OverflowLink() == BPLUSTREE_NULL) {
fCurrentNodeOffset = nodeOffset;
fCurrentKey = keyIndex - 1;
fDuplicateNode = BPLUSTREE_NULL;
return status;
} else if (nextOffset == nodeOffset)
RETURN_ERROR(B_ERROR);
nodeOffset = nextOffset;
}
RETURN_ERROR(B_ERROR);
}
void
TreeIterator::SkipDuplicates()
{
fDuplicateNode = BPLUSTREE_NULL;
}
void
TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex,
uint16 splitAt, int8 change)
{
if (offset != fCurrentNodeOffset)
return;
if (nextOffset != BPLUSTREE_NULL) {
fCurrentNodeOffset = nextOffset;
if (splitAt <= fCurrentKey) {
fCurrentKey -= splitAt;
keyIndex -= splitAt;
}
}
if (keyIndex <= fCurrentKey)
fCurrentKey += change;
}
void
TreeIterator::Stop()
{
fTree = NULL;
}
#ifdef DEBUG
void
TreeIterator::Dump()
{
__out("TreeIterator at %p:\n", this);
__out("\tfTree = %p\n", fTree);
__out("\tfCurrentNodeOffset = %" B_PRId64 "\n", fCurrentNodeOffset);
__out("\tfCurrentKey = %d\n", (int)fCurrentKey);
__out("\tfDuplicateNode = %" B_PRId64 " (%" B_PRId64 ", 0x%" B_PRIx64 ")\n",
bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode,
fDuplicateNode);
__out("\tfDuplicate = %u\n", fDuplicate);
__out("\tfNumDuplicates = %u\n", fNumDuplicates);
__out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false");
}
#endif
bool
bplustree_header::IsValid() const
{
return Magic() == BPLUSTREE_MAGIC
&& (RootNode() % NodeSize()) == 0
&& IsValidLink(RootNode())
&& IsValidLink(FreeNode());
}
void
bplustree_node::Initialize()
{
left_link = right_link = overflow_link
= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
all_key_count = 0;
all_key_length = 0;
}
uint8*
bplustree_node::KeyAt(int32 index, uint16* keyLength) const
{
if (index < 0 || index > NumKeys())
return NULL;
uint8* keyStart = Keys();
Unaligned<uint16>* keyLengths = KeyLengths();
*keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index])
- (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0);
if (index > 0)
keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]);
return keyStart;
}
uint8
bplustree_node::CountDuplicates(off_t offset, bool isFragment) const
{
if (isFragment) {
uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff);
return ((off_t*)this)[fragment];
}
return OverflowLink();
}
off_t
bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const
{
uint32 start;
if (isFragment)
start = 8 * ((uint64)offset & 0x3ff);
else
start = 2;
return ((off_t*)this)[start + 1 + index];
}
used fragment count; at least, it can only count to two: it returns
0, if there is no fragment used, 1 if there is only one fragment
used, and 2 if there are at least 2 fragments used.
*/
uint32
bplustree_node::FragmentsUsed(uint32 nodeSize) const
{
uint32 used = 0;
for (uint32 i = 0; i < MaxFragments(nodeSize); i++) {
duplicate_array* array = FragmentAt(i);
if (array->Count() > 0 && ++used > 1)
return used;
}
return used;
}
status_t
bplustree_node::CheckIntegrity(uint32 nodeSize) const
{
if (NumKeys() > nodeSize || AllKeyLength() > nodeSize)
DEBUGGER(("invalid node: key/length count"));
for (int32 i = 0; i < NumKeys(); i++) {
uint16 length = 0;
uint8* key = KeyAt(i, &length);
if (key + length + sizeof(off_t) + sizeof(uint16)
> (uint8*)this + nodeSize
|| length > BPLUSTREE_MAX_KEY_LENGTH) {
dprintf("invalid node %p, key %d: keys corrupted\n", this, (int)i);
return B_BAD_DATA;
}
if (Values()[i] == -1) {
dprintf("invalid node %p, value %d: %" B_PRIdOFF ": values "
"corrupted\n", this, (int)i, Values()[i].value);
return B_BAD_DATA;
}
}
return B_OK;
}
#if !_BOOT_MODE
BitmapArray::BitmapArray(size_t numBits)
{
fSize = (numBits + 7) / 8;
fBitmap = (uint8*)calloc(fSize, 1);
fCountSet = 0;
}
BitmapArray::~BitmapArray()
{
free(fBitmap);
}
status_t
BitmapArray::InitCheck() const
{
return fBitmap != NULL ? B_OK : B_NO_MEMORY;
}
bool
BitmapArray::IsSet(size_t index) const
{
uint32 byteIndex = index / 8;
if (byteIndex >= fSize)
return false;
return (fBitmap[byteIndex] & (1UL << (index & 0x7))) != 0;
}
void
BitmapArray::Set(size_t index, bool set)
{
uint32 byteIndex = index / 8;
if (byteIndex >= fSize)
return;
if (set) {
fBitmap[byteIndex] |= 1UL << (index & 0x7);
fCountSet++;
} else {
fBitmap[byteIndex] &= ~(1UL << (index & 0x7));
fCountSet--;
}
}
#endif
bool
duplicate_array::_FindInternal(off_t value, int32& index) const
{
int32 min = 0, max = Count() - 1;
off_t cmp;
while (min <= max) {
index = (min + max) / 2;
cmp = ValueAt(index) - value;
if (cmp < 0)
min = index + 1;
else if (cmp > 0)
max = index - 1;
else
return true;
}
return false;
}
void
duplicate_array::Insert(off_t value)
{
int32 size = Count();
int32 i = 0;
if (size > 8 ) {
if (!_FindInternal(value, i) && ValueAt(i) <= value)
i++;
} else {
for (i = 0; i < size; i++) {
if (ValueAt(i) > value)
break;
}
}
memmove(&values[i + 1], &values[i], (size - i) * sizeof(off_t));
values[i] = HOST_ENDIAN_TO_BFS_INT64(value);
count = HOST_ENDIAN_TO_BFS_INT64(size + 1);
}
bool
duplicate_array::Remove(off_t value)
{
int32 index = Find(value);
if (index == -1)
return false;
int32 newSize = Count() - 1;
memmove(&values[index], &values[index + 1],
(newSize - index) * sizeof(off_t));
count = HOST_ENDIAN_TO_BFS_INT64(newSize);
return true;
}
#if _BOOT_MODE
}
#endif