all repos

rss-tools @ master

get rss feed from sources that(i need and) dont provide one

rss-tools/vendor/go.etcd.io/bbolt/node.go (view raw)

Oleksandr Smirnov Oleksandr Smirnov
olexsmir@gmail.com
we're vendoring now, 7 days ago
1
package bbolt
2
3
import (
4
	"bytes"
5
	"fmt"
6
	"sort"
7
8
	"go.etcd.io/bbolt/internal/common"
9
)
10
11
// node represents an in-memory, deserialized page.
12
type node struct {
13
	bucket     *Bucket
14
	isLeaf     bool
15
	unbalanced bool
16
	spilled    bool
17
	key        []byte
18
	pgid       common.Pgid
19
	parent     *node
20
	children   nodes
21
	inodes     common.Inodes
22
}
23
24
// root returns the top-level node this node is attached to.
25
func (n *node) root() *node {
26
	if n.parent == nil {
27
		return n
28
	}
29
	return n.parent.root()
30
}
31
32
// minKeys returns the minimum number of inodes this node should have.
33
func (n *node) minKeys() int {
34
	if n.isLeaf {
35
		return 1
36
	}
37
	return 2
38
}
39
40
// size returns the size of the node after serialization.
41
func (n *node) size() int {
42
	sz, elsz := common.PageHeaderSize, n.pageElementSize()
43
	for i := 0; i < len(n.inodes); i++ {
44
		item := &n.inodes[i]
45
		sz += elsz + uintptr(len(item.Key())) + uintptr(len(item.Value()))
46
	}
47
	return int(sz)
48
}
49
50
// sizeLessThan returns true if the node is less than a given size.
51
// This is an optimization to avoid calculating a large node when we only need
52
// to know if it fits inside a certain page size.
53
func (n *node) sizeLessThan(v uintptr) bool {
54
	sz, elsz := common.PageHeaderSize, n.pageElementSize()
55
	for i := 0; i < len(n.inodes); i++ {
56
		item := &n.inodes[i]
57
		sz += elsz + uintptr(len(item.Key())) + uintptr(len(item.Value()))
58
		if sz >= v {
59
			return false
60
		}
61
	}
62
	return true
63
}
64
65
// pageElementSize returns the size of each page element based on the type of node.
66
func (n *node) pageElementSize() uintptr {
67
	if n.isLeaf {
68
		return common.LeafPageElementSize
69
	}
70
	return common.BranchPageElementSize
71
}
72
73
// childAt returns the child node at a given index.
74
func (n *node) childAt(index int) *node {
75
	if n.isLeaf {
76
		panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index))
77
	}
78
	return n.bucket.node(n.inodes[index].Pgid(), n)
79
}
80
81
// childIndex returns the index of a given child node.
82
func (n *node) childIndex(child *node) int {
83
	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].Key(), child.key) != -1 })
84
	return index
85
}
86
87
// numChildren returns the number of children.
88
func (n *node) numChildren() int {
89
	return len(n.inodes)
90
}
91
92
// nextSibling returns the next node with the same parent.
93
func (n *node) nextSibling() *node {
94
	if n.parent == nil {
95
		return nil
96
	}
97
	index := n.parent.childIndex(n)
98
	if index >= n.parent.numChildren()-1 {
99
		return nil
100
	}
101
	return n.parent.childAt(index + 1)
102
}
103
104
// prevSibling returns the previous node with the same parent.
105
func (n *node) prevSibling() *node {
106
	if n.parent == nil {
107
		return nil
108
	}
109
	index := n.parent.childIndex(n)
110
	if index == 0 {
111
		return nil
112
	}
113
	return n.parent.childAt(index - 1)
114
}
115
116
// put inserts a key/value.
117
func (n *node) put(oldKey, newKey, value []byte, pgId common.Pgid, flags uint32) {
118
	if pgId >= n.bucket.tx.meta.Pgid() {
119
		panic(fmt.Sprintf("pgId (%d) above high water mark (%d)", pgId, n.bucket.tx.meta.Pgid()))
120
	} else if len(oldKey) <= 0 {
121
		panic("put: zero-length old key")
122
	} else if len(newKey) <= 0 {
123
		panic("put: zero-length new key")
124
	}
125
126
	// Find insertion index.
127
	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].Key(), oldKey) != -1 })
128
129
	// Add capacity and shift nodes if we don't have an exact match and need to insert.
130
	exact := len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].Key(), oldKey)
131
	if !exact {
132
		n.inodes = append(n.inodes, common.Inode{})
133
		copy(n.inodes[index+1:], n.inodes[index:])
134
	}
135
136
	inode := &n.inodes[index]
137
	inode.SetFlags(flags)
138
	inode.SetKey(newKey)
139
	inode.SetValue(value)
140
	inode.SetPgid(pgId)
141
	common.Assert(len(inode.Key()) > 0, "put: zero-length inode key")
142
}
143
144
// del removes a key from the node.
145
func (n *node) del(key []byte) {
146
	// Find index of key.
147
	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].Key(), key) != -1 })
148
149
	// Exit if the key isn't found.
150
	if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].Key(), key) {
151
		return
152
	}
153
154
	// Delete inode from the node.
155
	n.inodes = append(n.inodes[:index], n.inodes[index+1:]...)
156
157
	// Mark the node as needing rebalancing.
158
	n.unbalanced = true
159
}
160
161
// read initializes the node from a page.
162
func (n *node) read(p *common.Page) {
163
	n.pgid = p.Id()
164
	n.isLeaf = p.IsLeafPage()
165
	n.inodes = common.ReadInodeFromPage(p)
166
167
	// Save first key, so we can find the node in the parent when we spill.
168
	if len(n.inodes) > 0 {
169
		n.key = n.inodes[0].Key()
170
		common.Assert(len(n.key) > 0, "read: zero-length node key")
171
	} else {
172
		n.key = nil
173
	}
174
}
175
176
// write writes the items onto one or more pages.
177
// The page should have p.id (might be 0 for meta or bucket-inline page) and p.overflow set
178
// and the rest should be zeroed.
179
func (n *node) write(p *common.Page) {
180
	common.Assert(p.Count() == 0 && p.Flags() == 0, "node cannot be written into a not empty page")
181
182
	// Initialize page.
183
	if n.isLeaf {
184
		p.SetFlags(common.LeafPageFlag)
185
	} else {
186
		p.SetFlags(common.BranchPageFlag)
187
	}
188
189
	if len(n.inodes) >= 0xFFFF {
190
		panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.Id()))
191
	}
192
	p.SetCount(uint16(len(n.inodes)))
193
194
	// Stop here if there are no items to write.
195
	if p.Count() == 0 {
196
		return
197
	}
198
199
	common.WriteInodeToPage(n.inodes, p)
200
201
	// DEBUG ONLY: n.dump()
202
}
203
204
// split breaks up a node into multiple smaller nodes, if appropriate.
205
// This should only be called from the spill() function.
206
func (n *node) split(pageSize uintptr) []*node {
207
	var nodes []*node
208
209
	node := n
210
	for {
211
		// Split node into two.
212
		a, b := node.splitTwo(pageSize)
213
		nodes = append(nodes, a)
214
215
		// If we can't split then exit the loop.
216
		if b == nil {
217
			break
218
		}
219
220
		// Set node to b so it gets split on the next iteration.
221
		node = b
222
	}
223
224
	return nodes
225
}
226
227
// splitTwo breaks up a node into two smaller nodes, if appropriate.
228
// This should only be called from the split() function.
229
func (n *node) splitTwo(pageSize uintptr) (*node, *node) {
230
	// Ignore the split if the page doesn't have at least enough nodes for
231
	// two pages or if the nodes can fit in a single page.
232
	if len(n.inodes) <= (common.MinKeysPerPage*2) || n.sizeLessThan(pageSize) {
233
		return n, nil
234
	}
235
236
	// Determine the threshold before starting a new node.
237
	var fillPercent = n.bucket.FillPercent
238
	if fillPercent < minFillPercent {
239
		fillPercent = minFillPercent
240
	} else if fillPercent > maxFillPercent {
241
		fillPercent = maxFillPercent
242
	}
243
	threshold := int(float64(pageSize) * fillPercent)
244
245
	// Determine split position and sizes of the two pages.
246
	splitIndex, _ := n.splitIndex(threshold)
247
248
	// Split node into two separate nodes.
249
	// If there's no parent then we'll need to create one.
250
	if n.parent == nil {
251
		n.parent = &node{bucket: n.bucket, children: []*node{n}}
252
	}
253
254
	// Create a new node and add it to the parent.
255
	next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
256
	n.parent.children = append(n.parent.children, next)
257
258
	// Split inodes across two nodes.
259
	next.inodes = n.inodes[splitIndex:]
260
	n.inodes = n.inodes[:splitIndex]
261
262
	// Update the statistics.
263
	n.bucket.tx.stats.IncSplit(1)
264
265
	return n, next
266
}
267
268
// splitIndex finds the position where a page will fill a given threshold.
269
// It returns the index as well as the size of the first page.
270
// This is only be called from split().
271
func (n *node) splitIndex(threshold int) (index, sz uintptr) {
272
	sz = common.PageHeaderSize
273
274
	// Loop until we only have the minimum number of keys required for the second page.
275
	for i := 0; i < len(n.inodes)-common.MinKeysPerPage; i++ {
276
		index = uintptr(i)
277
		inode := n.inodes[i]
278
		elsize := n.pageElementSize() + uintptr(len(inode.Key())) + uintptr(len(inode.Value()))
279
280
		// If we have at least the minimum number of keys and adding another
281
		// node would put us over the threshold then exit and return.
282
		if index >= common.MinKeysPerPage && sz+elsize > uintptr(threshold) {
283
			break
284
		}
285
286
		// Add the element size to the total size.
287
		sz += elsize
288
	}
289
290
	return
291
}
292
293
// spill writes the nodes to dirty pages and splits nodes as it goes.
294
// Returns an error if dirty pages cannot be allocated.
295
func (n *node) spill() error {
296
	var tx = n.bucket.tx
297
	if n.spilled {
298
		return nil
299
	}
300
301
	// Spill child nodes first. Child nodes can materialize sibling nodes in
302
	// the case of split-merge so we cannot use a range loop. We have to check
303
	// the children size on every loop iteration.
304
	sort.Sort(n.children)
305
	for i := 0; i < len(n.children); i++ {
306
		if err := n.children[i].spill(); err != nil {
307
			return err
308
		}
309
	}
310
311
	// We no longer need the child list because it's only used for spill tracking.
312
	n.children = nil
313
314
	// Split nodes into appropriate sizes. The first node will always be n.
315
	var nodes = n.split(uintptr(tx.db.pageSize))
316
	for _, node := range nodes {
317
		// Add node's page to the freelist if it's not new.
318
		if node.pgid > 0 {
319
			tx.db.freelist.Free(tx.meta.Txid(), tx.page(node.pgid))
320
			node.pgid = 0
321
		}
322
323
		// Allocate contiguous space for the node.
324
		p, err := tx.allocate((node.size() + tx.db.pageSize - 1) / tx.db.pageSize)
325
		if err != nil {
326
			return err
327
		}
328
329
		// Write the node.
330
		if p.Id() >= tx.meta.Pgid() {
331
			panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.Id(), tx.meta.Pgid()))
332
		}
333
		node.pgid = p.Id()
334
		node.write(p)
335
		node.spilled = true
336
337
		// Insert into parent inodes.
338
		if node.parent != nil {
339
			var key = node.key
340
			if key == nil {
341
				key = node.inodes[0].Key()
342
			}
343
344
			node.parent.put(key, node.inodes[0].Key(), nil, node.pgid, 0)
345
			node.key = node.inodes[0].Key()
346
			common.Assert(len(node.key) > 0, "spill: zero-length node key")
347
		}
348
349
		// Update the statistics.
350
		tx.stats.IncSpill(1)
351
	}
352
353
	// If the root node split and created a new root then we need to spill that
354
	// as well. We'll clear out the children to make sure it doesn't try to respill.
355
	if n.parent != nil && n.parent.pgid == 0 {
356
		n.children = nil
357
		return n.parent.spill()
358
	}
359
360
	return nil
361
}
362
363
// rebalance attempts to combine the node with sibling nodes if the node fill
364
// size is below a threshold or if there are not enough keys.
365
func (n *node) rebalance() {
366
	if !n.unbalanced {
367
		return
368
	}
369
	n.unbalanced = false
370
371
	// Update statistics.
372
	n.bucket.tx.stats.IncRebalance(1)
373
374
	// Ignore if node is above threshold (25% when FillPercent is set to DefaultFillPercent) and has enough keys.
375
	var threshold = int(float64(n.bucket.tx.db.pageSize)*n.bucket.FillPercent) / 2
376
	if n.size() > threshold && len(n.inodes) > n.minKeys() {
377
		return
378
	}
379
380
	// Root node has special handling.
381
	if n.parent == nil {
382
		// If root node is a branch and only has one node then collapse it.
383
		if !n.isLeaf && len(n.inodes) == 1 {
384
			// Move root's child up.
385
			child := n.bucket.node(n.inodes[0].Pgid(), n)
386
			n.isLeaf = child.isLeaf
387
			n.inodes = child.inodes[:]
388
			n.children = child.children
389
390
			// Reparent all child nodes being moved.
391
			for _, inode := range n.inodes {
392
				if child, ok := n.bucket.nodes[inode.Pgid()]; ok {
393
					child.parent = n
394
				}
395
			}
396
397
			// Remove old child.
398
			child.parent = nil
399
			delete(n.bucket.nodes, child.pgid)
400
			child.free()
401
		}
402
403
		return
404
	}
405
406
	// If node has no keys then just remove it.
407
	if n.numChildren() == 0 {
408
		n.parent.del(n.key)
409
		n.parent.removeChild(n)
410
		delete(n.bucket.nodes, n.pgid)
411
		n.free()
412
		n.parent.rebalance()
413
		return
414
	}
415
416
	common.Assert(n.parent.numChildren() > 1, "parent must have at least 2 children")
417
418
	// Merge with right sibling if idx == 0, otherwise left sibling.
419
	var leftNode, rightNode *node
420
	var useNextSibling = n.parent.childIndex(n) == 0
421
	if useNextSibling {
422
		leftNode = n
423
		rightNode = n.nextSibling()
424
	} else {
425
		leftNode = n.prevSibling()
426
		rightNode = n
427
	}
428
429
	// If both nodes are too small then merge them.
430
	// Reparent all child nodes being moved.
431
	for _, inode := range rightNode.inodes {
432
		if child, ok := n.bucket.nodes[inode.Pgid()]; ok {
433
			child.parent.removeChild(child)
434
			child.parent = leftNode
435
			child.parent.children = append(child.parent.children, child)
436
		}
437
	}
438
439
	// Copy over inodes from right node to left node and remove right node.
440
	leftNode.inodes = append(leftNode.inodes, rightNode.inodes...)
441
	n.parent.del(rightNode.key)
442
	n.parent.removeChild(rightNode)
443
	delete(n.bucket.nodes, rightNode.pgid)
444
	rightNode.free()
445
446
	// Either this node or the sibling node was deleted from the parent so rebalance it.
447
	n.parent.rebalance()
448
}
449
450
// removes a node from the list of in-memory children.
451
// This does not affect the inodes.
452
func (n *node) removeChild(target *node) {
453
	for i, child := range n.children {
454
		if child == target {
455
			n.children = append(n.children[:i], n.children[i+1:]...)
456
			return
457
		}
458
	}
459
}
460
461
// dereference causes the node to copy all its inode key/value references to heap memory.
462
// This is required when the mmap is reallocated so inodes are not pointing to stale data.
463
func (n *node) dereference() {
464
	if n.key != nil {
465
		key := make([]byte, len(n.key))
466
		copy(key, n.key)
467
		n.key = key
468
		common.Assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node")
469
	}
470
471
	for i := range n.inodes {
472
		inode := &n.inodes[i]
473
474
		key := make([]byte, len(inode.Key()))
475
		copy(key, inode.Key())
476
		inode.SetKey(key)
477
		common.Assert(len(inode.Key()) > 0, "dereference: zero-length inode key")
478
479
		value := make([]byte, len(inode.Value()))
480
		copy(value, inode.Value())
481
		inode.SetValue(value)
482
	}
483
484
	// Recursively dereference children.
485
	for _, child := range n.children {
486
		child.dereference()
487
	}
488
489
	// Update statistics.
490
	n.bucket.tx.stats.IncNodeDeref(1)
491
}
492
493
// free adds the node's underlying page to the freelist.
494
func (n *node) free() {
495
	if n.pgid != 0 {
496
		n.bucket.tx.db.freelist.Free(n.bucket.tx.meta.Txid(), n.bucket.tx.page(n.pgid))
497
		n.pgid = 0
498
	}
499
}
500
501
// dump writes the contents of the node to STDERR for debugging purposes.
502
/*
503
func (n *node) dump() {
504
	// Write node header.
505
	var typ = "branch"
506
	if n.isLeaf {
507
		typ = "leaf"
508
	}
509
	warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes))
510
511
	// Write out abbreviated version of each item.
512
	for _, item := range n.inodes {
513
		if n.isLeaf {
514
			if item.flags&bucketLeafFlag != 0 {
515
				bucket := (*bucket)(unsafe.Pointer(&item.value[0]))
516
				warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root)
517
			} else {
518
				warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4))
519
			}
520
		} else {
521
			warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid)
522
		}
523
	}
524
	warn("")
525
}
526
*/
527
528
func compareKeys(left, right []byte) int {
529
	return bytes.Compare(left, right)
530
}
531
532
type nodes []*node
533
534
func (s nodes) Len() int      { return len(s) }
535
func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
536
func (s nodes) Less(i, j int) bool {
537
	return bytes.Compare(s[i].inodes[0].Key(), s[j].inodes[0].Key()) == -1
538
}