rss-tools: vendor/go.etcd.io/bbolt/node.go (master)

1

package bbolt

2

3

import (

4

	"bytes"

5

	"fmt"

6

	"sort"

7

8

	"go.etcd.io/bbolt/internal/common"

9

10

11

// node represents an in-memory, deserialized page.

12

type node struct {

13

	bucket     *Bucket

14

	isLeaf     bool

15

	unbalanced bool

16

	spilled    bool

17

	key        []byte

18

	pgid       common.Pgid

19

	parent     *node

20

	children   nodes

21

	inodes     common.Inodes

22

23

24

// root returns the top-level node this node is attached to.

25

func (n *node) root() *node {

26

	if n.parent == nil {

27

		return n

28

29

	return n.parent.root()

30

31

32

// minKeys returns the minimum number of inodes this node should have.

33

func (n *node) minKeys() int {

34

	if n.isLeaf {

35

		return 1

36

37

	return 2

38

39

40

// size returns the size of the node after serialization.

41

func (n *node) size() int {

42

	sz, elsz := common.PageHeaderSize, n.pageElementSize()

43

	for i := 0; i < len(n.inodes); i++ {

44

		item := &n.inodes[i]

45

		sz += elsz + uintptr(len(item.Key())) + uintptr(len(item.Value()))

46

47

	return int(sz)

48

49

50

// sizeLessThan returns true if the node is less than a given size.

51

// This is an optimization to avoid calculating a large node when we only need

52

// to know if it fits inside a certain page size.

53

func (n *node) sizeLessThan(v uintptr) bool {

54

	sz, elsz := common.PageHeaderSize, n.pageElementSize()

55

	for i := 0; i < len(n.inodes); i++ {

56

		item := &n.inodes[i]

57

		sz += elsz + uintptr(len(item.Key())) + uintptr(len(item.Value()))

58

		if sz >= v {

59

			return false

60

61

62

	return true

63

64

65

// pageElementSize returns the size of each page element based on the type of node.

66

func (n *node) pageElementSize() uintptr {

67

	if n.isLeaf {

68

		return common.LeafPageElementSize

69

70

	return common.BranchPageElementSize

71

72

73

// childAt returns the child node at a given index.

74

func (n *node) childAt(index int) *node {

75

	if n.isLeaf {

76

		panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index))

77

78

	return n.bucket.node(n.inodes[index].Pgid(), n)

79

80

81

// childIndex returns the index of a given child node.

82

func (n *node) childIndex(child *node) int {

83

	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].Key(), child.key) != -1 })

84

	return index

85

86

87

// numChildren returns the number of children.

88

func (n *node) numChildren() int {

89

	return len(n.inodes)

90

91

92

// nextSibling returns the next node with the same parent.

93

func (n *node) nextSibling() *node {

94

	if n.parent == nil {

95

		return nil

96

97

	index := n.parent.childIndex(n)

98

	if index >= n.parent.numChildren()-1 {

99

		return nil

100

101

	return n.parent.childAt(index + 1)

102

103

104

// prevSibling returns the previous node with the same parent.

105

func (n *node) prevSibling() *node {

106

	if n.parent == nil {

107

		return nil

108

109

	index := n.parent.childIndex(n)

110

	if index == 0 {

111

		return nil

112

113

	return n.parent.childAt(index - 1)

114

115

116

// put inserts a key/value.

117

func (n *node) put(oldKey, newKey, value []byte, pgId common.Pgid, flags uint32) {

118

	if pgId >= n.bucket.tx.meta.Pgid() {

119

		panic(fmt.Sprintf("pgId (%d) above high water mark (%d)", pgId, n.bucket.tx.meta.Pgid()))

120

	} else if len(oldKey) <= 0 {

121

		panic("put: zero-length old key")

122

	} else if len(newKey) <= 0 {

123

		panic("put: zero-length new key")

124

125

126

	// Find insertion index.

127

	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].Key(), oldKey) != -1 })

128

129

	// Add capacity and shift nodes if we don't have an exact match and need to insert.

130

	exact := len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].Key(), oldKey)

131

	if !exact {

132

		n.inodes = append(n.inodes, common.Inode{})

133

		copy(n.inodes[index+1:], n.inodes[index:])

134

135

136

	inode := &n.inodes[index]

137

	inode.SetFlags(flags)

138

	inode.SetKey(newKey)

139

	inode.SetValue(value)

140

	inode.SetPgid(pgId)

141

	common.Assert(len(inode.Key()) > 0, "put: zero-length inode key")

142

143

144

// del removes a key from the node.

145

func (n *node) del(key []byte) {

146

	// Find index of key.

147

	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].Key(), key) != -1 })

148

149

	// Exit if the key isn't found.

150

	if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].Key(), key) {

151

		return

152

153

154

	// Delete inode from the node.

155

	n.inodes = append(n.inodes[:index], n.inodes[index+1:]...)

156

157

	// Mark the node as needing rebalancing.

158

	n.unbalanced = true

159

160

161

// read initializes the node from a page.

162

func (n *node) read(p *common.Page) {

163

	n.pgid = p.Id()

164

	n.isLeaf = p.IsLeafPage()

165

	n.inodes = common.ReadInodeFromPage(p)

166

167

	// Save first key, so we can find the node in the parent when we spill.

168

	if len(n.inodes) > 0 {

169

		n.key = n.inodes[0].Key()

170

		common.Assert(len(n.key) > 0, "read: zero-length node key")

171

	} else {

172

		n.key = nil

173

174

175

176

// write writes the items onto one or more pages.

177

// The page should have p.id (might be 0 for meta or bucket-inline page) and p.overflow set

178

// and the rest should be zeroed.

179

func (n *node) write(p *common.Page) {

180

	common.Assert(p.Count() == 0 && p.Flags() == 0, "node cannot be written into a not empty page")

181

182

	// Initialize page.

183

	if n.isLeaf {

184

		p.SetFlags(common.LeafPageFlag)

185

	} else {

186

		p.SetFlags(common.BranchPageFlag)

187

188

189

	if len(n.inodes) >= 0xFFFF {

190

		panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.Id()))

191

192

	p.SetCount(uint16(len(n.inodes)))

193

194

	// Stop here if there are no items to write.

195

	if p.Count() == 0 {

196

		return

197

198

199

	common.WriteInodeToPage(n.inodes, p)

200

201

	// DEBUG ONLY: n.dump()

202

203

204

// split breaks up a node into multiple smaller nodes, if appropriate.

205

// This should only be called from the spill() function.

206

func (n *node) split(pageSize uintptr) []*node {

207

	var nodes []*node

208

209

	node := n

210

	for {

211

		// Split node into two.

212

		a, b := node.splitTwo(pageSize)

213

		nodes = append(nodes, a)

214

215

		// If we can't split then exit the loop.

216

		if b == nil {

217

			break

218

219

220

		// Set node to b so it gets split on the next iteration.

221

		node = b

222

223

224

	return nodes

225

226

227

// splitTwo breaks up a node into two smaller nodes, if appropriate.

228

// This should only be called from the split() function.

229

func (n *node) splitTwo(pageSize uintptr) (*node, *node) {

230

	// Ignore the split if the page doesn't have at least enough nodes for

231

	// two pages or if the nodes can fit in a single page.

232

	if len(n.inodes) <= (common.MinKeysPerPage*2) || n.sizeLessThan(pageSize) {

233

		return n, nil

234

235

236

	// Determine the threshold before starting a new node.

237

	var fillPercent = n.bucket.FillPercent

238

	if fillPercent < minFillPercent {

239

		fillPercent = minFillPercent

240

	} else if fillPercent > maxFillPercent {

241

		fillPercent = maxFillPercent

242

243

	threshold := int(float64(pageSize) * fillPercent)

244

245

	// Determine split position and sizes of the two pages.

246

	splitIndex, _ := n.splitIndex(threshold)

247

248

	// Split node into two separate nodes.

249

	// If there's no parent then we'll need to create one.

250

	if n.parent == nil {

251

		n.parent = &node{bucket: n.bucket, children: []*node{n}}

252

253

254

	// Create a new node and add it to the parent.

255

	next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}

256

	n.parent.children = append(n.parent.children, next)

257

258

	// Split inodes across two nodes.

259

	next.inodes = n.inodes[splitIndex:]

260

	n.inodes = n.inodes[:splitIndex]

261

262

	// Update the statistics.

263

	n.bucket.tx.stats.IncSplit(1)

264

265

	return n, next

266

267

268

// splitIndex finds the position where a page will fill a given threshold.

269

// It returns the index as well as the size of the first page.

270

// This is only be called from split().

271

func (n *node) splitIndex(threshold int) (index, sz uintptr) {

272

	sz = common.PageHeaderSize

273

274

	// Loop until we only have the minimum number of keys required for the second page.

275

	for i := 0; i < len(n.inodes)-common.MinKeysPerPage; i++ {

276

		index = uintptr(i)

277

		inode := n.inodes[i]

278

		elsize := n.pageElementSize() + uintptr(len(inode.Key())) + uintptr(len(inode.Value()))

279

280

		// If we have at least the minimum number of keys and adding another

281

		// node would put us over the threshold then exit and return.

282

		if index >= common.MinKeysPerPage && sz+elsize > uintptr(threshold) {

283

			break

284

285

286

		// Add the element size to the total size.

287

		sz += elsize

288

289

290

	return

291

292

293

// spill writes the nodes to dirty pages and splits nodes as it goes.

294

// Returns an error if dirty pages cannot be allocated.

295

func (n *node) spill() error {

296

	var tx = n.bucket.tx

297

	if n.spilled {

298

		return nil

299

300

301

	// Spill child nodes first. Child nodes can materialize sibling nodes in

302

	// the case of split-merge so we cannot use a range loop. We have to check

303

	// the children size on every loop iteration.

304

	sort.Sort(n.children)

305

	for i := 0; i < len(n.children); i++ {

306

		if err := n.children[i].spill(); err != nil {

307

			return err

308

309

310

311

	// We no longer need the child list because it's only used for spill tracking.

312

	n.children = nil

313

314

	// Split nodes into appropriate sizes. The first node will always be n.

315

	var nodes = n.split(uintptr(tx.db.pageSize))

316

	for _, node := range nodes {

317

		// Add node's page to the freelist if it's not new.

318

		if node.pgid > 0 {

319

			tx.db.freelist.Free(tx.meta.Txid(), tx.page(node.pgid))

320

			node.pgid = 0

321

322

323

		// Allocate contiguous space for the node.

324

		p, err := tx.allocate((node.size() + tx.db.pageSize - 1) / tx.db.pageSize)

325

		if err != nil {

326

			return err

327

328

329

		// Write the node.

330

		if p.Id() >= tx.meta.Pgid() {

331

			panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.Id(), tx.meta.Pgid()))

332

333

		node.pgid = p.Id()

334

		node.write(p)

335

		node.spilled = true

336

337

		// Insert into parent inodes.

338

		if node.parent != nil {

339

			var key = node.key

340

			if key == nil {

341

				key = node.inodes[0].Key()

342

343

344

			node.parent.put(key, node.inodes[0].Key(), nil, node.pgid, 0)

345

			node.key = node.inodes[0].Key()

346

			common.Assert(len(node.key) > 0, "spill: zero-length node key")

347

348

349

		// Update the statistics.

350

		tx.stats.IncSpill(1)

351

352

353

	// If the root node split and created a new root then we need to spill that

354

	// as well. We'll clear out the children to make sure it doesn't try to respill.

355

	if n.parent != nil && n.parent.pgid == 0 {

356

		n.children = nil

357

		return n.parent.spill()

358

359

360

	return nil

361

362

363

// rebalance attempts to combine the node with sibling nodes if the node fill

364

// size is below a threshold or if there are not enough keys.

365

func (n *node) rebalance() {

366

	if !n.unbalanced {

367

		return

368

369

	n.unbalanced = false

370

371

	// Update statistics.

372

	n.bucket.tx.stats.IncRebalance(1)

373

374

	// Ignore if node is above threshold (25% when FillPercent is set to DefaultFillPercent) and has enough keys.

375

	var threshold = int(float64(n.bucket.tx.db.pageSize)*n.bucket.FillPercent) / 2

376

	if n.size() > threshold && len(n.inodes) > n.minKeys() {

377

		return

378

379

380

	// Root node has special handling.

381

	if n.parent == nil {

382

		// If root node is a branch and only has one node then collapse it.

383

		if !n.isLeaf && len(n.inodes) == 1 {

384

			// Move root's child up.

385

			child := n.bucket.node(n.inodes[0].Pgid(), n)

386

			n.isLeaf = child.isLeaf

387

			n.inodes = child.inodes[:]

388

			n.children = child.children

389

390

			// Reparent all child nodes being moved.

391

			for _, inode := range n.inodes {

392

				if child, ok := n.bucket.nodes[inode.Pgid()]; ok {

393

					child.parent = n

394

395

396

397

			// Remove old child.

398

			child.parent = nil

399

			delete(n.bucket.nodes, child.pgid)

400

			child.free()

401

402

403

		return

404

405

406

	// If node has no keys then just remove it.

407

	if n.numChildren() == 0 {

408

		n.parent.del(n.key)

409

		n.parent.removeChild(n)

410

		delete(n.bucket.nodes, n.pgid)

411

		n.free()

412

		n.parent.rebalance()

413

		return

414

415

416

	common.Assert(n.parent.numChildren() > 1, "parent must have at least 2 children")

417

418

	// Merge with right sibling if idx == 0, otherwise left sibling.

419

	var leftNode, rightNode *node

420

	var useNextSibling = n.parent.childIndex(n) == 0

421

	if useNextSibling {

422

		leftNode = n

423

		rightNode = n.nextSibling()

424

	} else {

425

		leftNode = n.prevSibling()

426

		rightNode = n

427

428

429

	// If both nodes are too small then merge them.

430

	// Reparent all child nodes being moved.

431

	for _, inode := range rightNode.inodes {

432

		if child, ok := n.bucket.nodes[inode.Pgid()]; ok {

433

			child.parent.removeChild(child)

434

			child.parent = leftNode

435

			child.parent.children = append(child.parent.children, child)

436

437

438

439

	// Copy over inodes from right node to left node and remove right node.

440

	leftNode.inodes = append(leftNode.inodes, rightNode.inodes...)

441

	n.parent.del(rightNode.key)

442

	n.parent.removeChild(rightNode)

443

	delete(n.bucket.nodes, rightNode.pgid)

444

	rightNode.free()

445

446

	// Either this node or the sibling node was deleted from the parent so rebalance it.

447

	n.parent.rebalance()

448

449

450

// removes a node from the list of in-memory children.

451

// This does not affect the inodes.

452

func (n *node) removeChild(target *node) {

453

	for i, child := range n.children {

454

		if child == target {

455

			n.children = append(n.children[:i], n.children[i+1:]...)

456

			return

457

458

459

460

461

// dereference causes the node to copy all its inode key/value references to heap memory.

462

// This is required when the mmap is reallocated so inodes are not pointing to stale data.

463

func (n *node) dereference() {

464

	if n.key != nil {

465

		key := make([]byte, len(n.key))

466

		copy(key, n.key)

467

		n.key = key

468

		common.Assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node")

469

470

471

	for i := range n.inodes {

472

		inode := &n.inodes[i]

473

474

		key := make([]byte, len(inode.Key()))

475

		copy(key, inode.Key())

476

		inode.SetKey(key)

477

		common.Assert(len(inode.Key()) > 0, "dereference: zero-length inode key")

478

479

		value := make([]byte, len(inode.Value()))

480

		copy(value, inode.Value())

481

		inode.SetValue(value)

482

483

484

	// Recursively dereference children.

485

	for _, child := range n.children {

486

		child.dereference()

487

488

489

	// Update statistics.

490

	n.bucket.tx.stats.IncNodeDeref(1)

491

492

493

// free adds the node's underlying page to the freelist.

494

func (n *node) free() {

495

	if n.pgid != 0 {

496

		n.bucket.tx.db.freelist.Free(n.bucket.tx.meta.Txid(), n.bucket.tx.page(n.pgid))

497

		n.pgid = 0

498

499

500

501

// dump writes the contents of the node to STDERR for debugging purposes.

502

/*

503

func (n *node) dump() {

504

	// Write node header.

505

	var typ = "branch"

506

	if n.isLeaf {

507

		typ = "leaf"

508

509

	warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes))

510

511

	// Write out abbreviated version of each item.

512

	for _, item := range n.inodes {

513

		if n.isLeaf {

514

			if item.flags&bucketLeafFlag != 0 {

515

				bucket := (*bucket)(unsafe.Pointer(&item.value[0]))

516

				warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root)

517

			} else {

518

				warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4))

519

520

		} else {

521

			warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid)

522

523

524

	warn("")

525

526

*/

527

528

func compareKeys(left, right []byte) int {

529

	return bytes.Compare(left, right)

530

531

532

type nodes []*node

533

534

func (s nodes) Len() int      { return len(s) }

535

func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] }

536

func (s nodes) Less(i, j int) bool {

537

	return bytes.Compare(s[i].inodes[0].Key(), s[j].inodes[0].Key()) == -1

538

1	package bbolt
2
3	import (
4	"bytes"
5	"fmt"
6	"sort"
7
8	"go.etcd.io/bbolt/internal/common"
9	)
10
11	// node represents an in-memory, deserialized page.
12	type node struct {
13	bucket *Bucket
14	isLeaf bool
15	unbalanced bool
16	spilled bool
17	key []byte
18	pgid common.Pgid
19	parent *node
20	children nodes
21	inodes common.Inodes
22	}
23
24	// root returns the top-level node this node is attached to.
25	func (n node) root() node {
26	if n.parent == nil {
27	return n
28	}
29	return n.parent.root()
30	}
31
32	// minKeys returns the minimum number of inodes this node should have.
33	func (n *node) minKeys() int {
34	if n.isLeaf {
35	return 1
36	}
37	return 2
38	}
39
40	// size returns the size of the node after serialization.
41	func (n *node) size() int {
42	sz, elsz := common.PageHeaderSize, n.pageElementSize()
43	for i := 0; i < len(n.inodes); i++ {
44	item := &n.inodes[i]
45	sz += elsz + uintptr(len(item.Key())) + uintptr(len(item.Value()))
46	}
47	return int(sz)
48	}
49
50	// sizeLessThan returns true if the node is less than a given size.
51	// This is an optimization to avoid calculating a large node when we only need
52	// to know if it fits inside a certain page size.
53	func (n *node) sizeLessThan(v uintptr) bool {
54	sz, elsz := common.PageHeaderSize, n.pageElementSize()
55	for i := 0; i < len(n.inodes); i++ {
56	item := &n.inodes[i]
57	sz += elsz + uintptr(len(item.Key())) + uintptr(len(item.Value()))
58	if sz >= v {
59	return false
60	}
61	}
62	return true
63	}
64
65	// pageElementSize returns the size of each page element based on the type of node.
66	func (n *node) pageElementSize() uintptr {
67	if n.isLeaf {
68	return common.LeafPageElementSize
69	}
70	return common.BranchPageElementSize
71	}
72
73	// childAt returns the child node at a given index.
74	func (n node) childAt(index int) node {
75	if n.isLeaf {
76	panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index))
77	}
78	return n.bucket.node(n.inodes[index].Pgid(), n)
79	}
80
81	// childIndex returns the index of a given child node.
82	func (n node) childIndex(child node) int {
83	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].Key(), child.key) != -1 })
84	return index
85	}
86
87	// numChildren returns the number of children.
88	func (n *node) numChildren() int {
89	return len(n.inodes)
90	}
91
92	// nextSibling returns the next node with the same parent.
93	func (n node) nextSibling() node {
94	if n.parent == nil {
95	return nil
96	}
97	index := n.parent.childIndex(n)
98	if index >= n.parent.numChildren()-1 {
99	return nil
100	}
101	return n.parent.childAt(index + 1)
102	}
103
104	// prevSibling returns the previous node with the same parent.
105	func (n node) prevSibling() node {
106	if n.parent == nil {
107	return nil
108	}
109	index := n.parent.childIndex(n)
110	if index == 0 {
111	return nil
112	}
113	return n.parent.childAt(index - 1)
114	}
115
116	// put inserts a key/value.
117	func (n *node) put(oldKey, newKey, value []byte, pgId common.Pgid, flags uint32) {
118	if pgId >= n.bucket.tx.meta.Pgid() {
119	panic(fmt.Sprintf("pgId (%d) above high water mark (%d)", pgId, n.bucket.tx.meta.Pgid()))
120	} else if len(oldKey) <= 0 {
121	panic("put: zero-length old key")
122	} else if len(newKey) <= 0 {
123	panic("put: zero-length new key")
124	}
125
126	// Find insertion index.
127	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].Key(), oldKey) != -1 })
128
129	// Add capacity and shift nodes if we don't have an exact match and need to insert.
130	exact := len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].Key(), oldKey)
131	if !exact {
132	n.inodes = append(n.inodes, common.Inode{})
133	copy(n.inodes[index+1:], n.inodes[index:])
134	}
135
136	inode := &n.inodes[index]
137	inode.SetFlags(flags)
138	inode.SetKey(newKey)
139	inode.SetValue(value)
140	inode.SetPgid(pgId)
141	common.Assert(len(inode.Key()) > 0, "put: zero-length inode key")
142	}
143
144	// del removes a key from the node.
145	func (n *node) del(key []byte) {
146	// Find index of key.
147	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].Key(), key) != -1 })
148
149	// Exit if the key isn't found.
150	if index >= len(n.inodes) \|\| !bytes.Equal(n.inodes[index].Key(), key) {
151	return
152	}
153
154	// Delete inode from the node.
155	n.inodes = append(n.inodes[:index], n.inodes[index+1:]...)
156
157	// Mark the node as needing rebalancing.
158	n.unbalanced = true
159	}
160
161	// read initializes the node from a page.
162	func (n node) read(p common.Page) {
163	n.pgid = p.Id()
164	n.isLeaf = p.IsLeafPage()
165	n.inodes = common.ReadInodeFromPage(p)
166
167	// Save first key, so we can find the node in the parent when we spill.
168	if len(n.inodes) > 0 {
169	n.key = n.inodes[0].Key()
170	common.Assert(len(n.key) > 0, "read: zero-length node key")
171	} else {
172	n.key = nil
173	}
174	}
175
176	// write writes the items onto one or more pages.
177	// The page should have p.id (might be 0 for meta or bucket-inline page) and p.overflow set
178	// and the rest should be zeroed.
179	func (n node) write(p common.Page) {
180	common.Assert(p.Count() == 0 && p.Flags() == 0, "node cannot be written into a not empty page")
181
182	// Initialize page.
183	if n.isLeaf {
184	p.SetFlags(common.LeafPageFlag)
185	} else {
186	p.SetFlags(common.BranchPageFlag)
187	}
188
189	if len(n.inodes) >= 0xFFFF {
190	panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.Id()))
191	}
192	p.SetCount(uint16(len(n.inodes)))
193
194	// Stop here if there are no items to write.
195	if p.Count() == 0 {
196	return
197	}
198
199	common.WriteInodeToPage(n.inodes, p)
200
201	// DEBUG ONLY: n.dump()
202	}
203
204	// split breaks up a node into multiple smaller nodes, if appropriate.
205	// This should only be called from the spill() function.
206	func (n node) split(pageSize uintptr) []node {
207	var nodes []*node
208
209	node := n
210	for {
211	// Split node into two.
212	a, b := node.splitTwo(pageSize)
213	nodes = append(nodes, a)
214
215	// If we can't split then exit the loop.
216	if b == nil {
217	break
218	}
219
220	// Set node to b so it gets split on the next iteration.
221	node = b
222	}
223
224	return nodes
225	}
226
227	// splitTwo breaks up a node into two smaller nodes, if appropriate.
228	// This should only be called from the split() function.
229	func (n node) splitTwo(pageSize uintptr) (node, *node) {
230	// Ignore the split if the page doesn't have at least enough nodes for
231	// two pages or if the nodes can fit in a single page.
232	if len(n.inodes) <= (common.MinKeysPerPage*2) \|\| n.sizeLessThan(pageSize) {
233	return n, nil
234	}
235
236	// Determine the threshold before starting a new node.
237	var fillPercent = n.bucket.FillPercent
238	if fillPercent < minFillPercent {
239	fillPercent = minFillPercent
240	} else if fillPercent > maxFillPercent {
241	fillPercent = maxFillPercent
242	}
243	threshold := int(float64(pageSize) * fillPercent)
244
245	// Determine split position and sizes of the two pages.
246	splitIndex, _ := n.splitIndex(threshold)
247
248	// Split node into two separate nodes.
249	// If there's no parent then we'll need to create one.
250	if n.parent == nil {
251	n.parent = &node{bucket: n.bucket, children: []*node{n}}
252	}
253
254	// Create a new node and add it to the parent.
255	next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
256	n.parent.children = append(n.parent.children, next)
257
258	// Split inodes across two nodes.
259	next.inodes = n.inodes[splitIndex:]
260	n.inodes = n.inodes[:splitIndex]
261
262	// Update the statistics.
263	n.bucket.tx.stats.IncSplit(1)
264
265	return n, next
266	}
267
268	// splitIndex finds the position where a page will fill a given threshold.
269	// It returns the index as well as the size of the first page.
270	// This is only be called from split().
271	func (n *node) splitIndex(threshold int) (index, sz uintptr) {
272	sz = common.PageHeaderSize
273
274	// Loop until we only have the minimum number of keys required for the second page.
275	for i := 0; i < len(n.inodes)-common.MinKeysPerPage; i++ {
276	index = uintptr(i)
277	inode := n.inodes[i]
278	elsize := n.pageElementSize() + uintptr(len(inode.Key())) + uintptr(len(inode.Value()))
279
280	// If we have at least the minimum number of keys and adding another
281	// node would put us over the threshold then exit and return.
282	if index >= common.MinKeysPerPage && sz+elsize > uintptr(threshold) {
283	break
284	}
285
286	// Add the element size to the total size.
287	sz += elsize
288	}
289
290	return
291	}
292
293	// spill writes the nodes to dirty pages and splits nodes as it goes.
294	// Returns an error if dirty pages cannot be allocated.
295	func (n *node) spill() error {
296	var tx = n.bucket.tx
297	if n.spilled {
298	return nil
299	}
300
301	// Spill child nodes first. Child nodes can materialize sibling nodes in
302	// the case of split-merge so we cannot use a range loop. We have to check
303	// the children size on every loop iteration.
304	sort.Sort(n.children)
305	for i := 0; i < len(n.children); i++ {
306	if err := n.children[i].spill(); err != nil {
307	return err
308	}
309	}
310
311	// We no longer need the child list because it's only used for spill tracking.
312	n.children = nil
313
314	// Split nodes into appropriate sizes. The first node will always be n.
315	var nodes = n.split(uintptr(tx.db.pageSize))
316	for _, node := range nodes {
317	// Add node's page to the freelist if it's not new.
318	if node.pgid > 0 {
319	tx.db.freelist.Free(tx.meta.Txid(), tx.page(node.pgid))
320	node.pgid = 0
321	}
322
323	// Allocate contiguous space for the node.
324	p, err := tx.allocate((node.size() + tx.db.pageSize - 1) / tx.db.pageSize)
325	if err != nil {
326	return err
327	}
328
329	// Write the node.
330	if p.Id() >= tx.meta.Pgid() {
331	panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.Id(), tx.meta.Pgid()))
332	}
333	node.pgid = p.Id()
334	node.write(p)
335	node.spilled = true
336
337	// Insert into parent inodes.
338	if node.parent != nil {
339	var key = node.key
340	if key == nil {
341	key = node.inodes[0].Key()
342	}
343
344	node.parent.put(key, node.inodes[0].Key(), nil, node.pgid, 0)
345	node.key = node.inodes[0].Key()
346	common.Assert(len(node.key) > 0, "spill: zero-length node key")
347	}
348
349	// Update the statistics.
350	tx.stats.IncSpill(1)
351	}
352
353	// If the root node split and created a new root then we need to spill that
354	// as well. We'll clear out the children to make sure it doesn't try to respill.
355	if n.parent != nil && n.parent.pgid == 0 {
356	n.children = nil
357	return n.parent.spill()
358	}
359
360	return nil
361	}
362
363	// rebalance attempts to combine the node with sibling nodes if the node fill
364	// size is below a threshold or if there are not enough keys.
365	func (n *node) rebalance() {
366	if !n.unbalanced {
367	return
368	}
369	n.unbalanced = false
370
371	// Update statistics.
372	n.bucket.tx.stats.IncRebalance(1)
373
374	// Ignore if node is above threshold (25% when FillPercent is set to DefaultFillPercent) and has enough keys.
375	var threshold = int(float64(n.bucket.tx.db.pageSize)*n.bucket.FillPercent) / 2
376	if n.size() > threshold && len(n.inodes) > n.minKeys() {
377	return
378	}
379
380	// Root node has special handling.
381	if n.parent == nil {
382	// If root node is a branch and only has one node then collapse it.
383	if !n.isLeaf && len(n.inodes) == 1 {
384	// Move root's child up.
385	child := n.bucket.node(n.inodes[0].Pgid(), n)
386	n.isLeaf = child.isLeaf
387	n.inodes = child.inodes[:]
388	n.children = child.children
389
390	// Reparent all child nodes being moved.
391	for _, inode := range n.inodes {
392	if child, ok := n.bucket.nodes[inode.Pgid()]; ok {
393	child.parent = n
394	}
395	}
396
397	// Remove old child.
398	child.parent = nil
399	delete(n.bucket.nodes, child.pgid)
400	child.free()
401	}
402
403	return
404	}
405
406	// If node has no keys then just remove it.
407	if n.numChildren() == 0 {
408	n.parent.del(n.key)
409	n.parent.removeChild(n)
410	delete(n.bucket.nodes, n.pgid)
411	n.free()
412	n.parent.rebalance()
413	return
414	}
415
416	common.Assert(n.parent.numChildren() > 1, "parent must have at least 2 children")
417
418	// Merge with right sibling if idx == 0, otherwise left sibling.
419	var leftNode, rightNode *node
420	var useNextSibling = n.parent.childIndex(n) == 0
421	if useNextSibling {
422	leftNode = n
423	rightNode = n.nextSibling()
424	} else {
425	leftNode = n.prevSibling()
426	rightNode = n
427	}
428
429	// If both nodes are too small then merge them.
430	// Reparent all child nodes being moved.
431	for _, inode := range rightNode.inodes {
432	if child, ok := n.bucket.nodes[inode.Pgid()]; ok {
433	child.parent.removeChild(child)
434	child.parent = leftNode
435	child.parent.children = append(child.parent.children, child)
436	}
437	}
438
439	// Copy over inodes from right node to left node and remove right node.
440	leftNode.inodes = append(leftNode.inodes, rightNode.inodes...)
441	n.parent.del(rightNode.key)
442	n.parent.removeChild(rightNode)
443	delete(n.bucket.nodes, rightNode.pgid)
444	rightNode.free()
445
446	// Either this node or the sibling node was deleted from the parent so rebalance it.
447	n.parent.rebalance()
448	}
449
450	// removes a node from the list of in-memory children.
451	// This does not affect the inodes.
452	func (n node) removeChild(target node) {
453	for i, child := range n.children {
454	if child == target {
455	n.children = append(n.children[:i], n.children[i+1:]...)
456	return
457	}
458	}
459	}
460
461	// dereference causes the node to copy all its inode key/value references to heap memory.
462	// This is required when the mmap is reallocated so inodes are not pointing to stale data.
463	func (n *node) dereference() {
464	if n.key != nil {
465	key := make([]byte, len(n.key))
466	copy(key, n.key)
467	n.key = key
468	common.Assert(n.pgid == 0 \|\| len(n.key) > 0, "dereference: zero-length node key on existing node")
469	}
470
471	for i := range n.inodes {
472	inode := &n.inodes[i]
473
474	key := make([]byte, len(inode.Key()))
475	copy(key, inode.Key())
476	inode.SetKey(key)
477	common.Assert(len(inode.Key()) > 0, "dereference: zero-length inode key")
478
479	value := make([]byte, len(inode.Value()))
480	copy(value, inode.Value())
481	inode.SetValue(value)
482	}
483
484	// Recursively dereference children.
485	for _, child := range n.children {
486	child.dereference()
487	}
488
489	// Update statistics.
490	n.bucket.tx.stats.IncNodeDeref(1)
491	}
492
493	// free adds the node's underlying page to the freelist.
494	func (n *node) free() {
495	if n.pgid != 0 {
496	n.bucket.tx.db.freelist.Free(n.bucket.tx.meta.Txid(), n.bucket.tx.page(n.pgid))
497	n.pgid = 0
498	}
499	}
500
501	// dump writes the contents of the node to STDERR for debugging purposes.
502	/*
503	func (n *node) dump() {
504	// Write node header.
505	var typ = "branch"
506	if n.isLeaf {
507	typ = "leaf"
508	}
509	warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes))
510
511	// Write out abbreviated version of each item.
512	for _, item := range n.inodes {
513	if n.isLeaf {
514	if item.flags&bucketLeafFlag != 0 {
515	bucket := (*bucket)(unsafe.Pointer(&item.value[0]))
516	warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root)
517	} else {
518	warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4))
519	}
520	} else {
521	warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid)
522	}
523	}
524	warn("")
525	}
526	*/
527
528	func compareKeys(left, right []byte) int {
529	return bytes.Compare(left, right)
530	}
531
532	type nodes []*node
533
534	func (s nodes) Len() int { return len(s) }
535	func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
536	func (s nodes) Less(i, j int) bool {
537	return bytes.Compare(s[i].inodes[0].Key(), s[j].inodes[0].Key()) == -1
538	}